Case Studies

Project: 1

Group: 3

Group Members:
 - Muhammad Raafey Tariq (231806)
 - Farrukh Ahmed (230614)
 - Amirreza Khamehchin Khiabani (230891)
 - Aymane Hachcham (236392)


Requirements:
 - numpy==1.24.2
 - matplotlib==3.7.1
 - seaborn==0.12.2
 - pandas==2.0.0
 - openpyxl==3.1.2

Installation Commands (One-time only)
 - pip install pandas==2.0.0
 - pip install numpy==1.24.2
 - pip install seaborn==0.12.2
 - pip install matplotlib==3.7.1
 - pip install openpyxl==3.1.2

Imports and Libraries

In [1]:
import pandas as pd
import numpy as np
import itertools
import pprint
import random

# used for the graphs
import seaborn as sns

import os
sns.set(font_scale = 1.2)

# used for plotting
from matplotlib import pyplot as plt
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

from helper_functions import *


import matplotlib

# setting font to 'Times New Roman'
matplotlib.rcParams["font.family"] = "Times New Roman"
matplotlib.rcParams.update({'font.size': 16})
%matplotlib inline

Global Variables and Constants

Importing Data

In [2]:
data_df = read_data()

  warn("""Cannot parse header or footer so it will be ignored""")


Reformating Columns to Correct Data Types and dropping nans

In [3]:
# variables that are dropped 
to_filter = ["id", "zeit", "postleitzahl", "gemeinde", "bezirk", "geburtsjahr", "terminal"]
data_df, cat_feat_list, num_feat_list = format_variables(data_df, to_filter=to_filter, drop_values=True)

# one hot encoding cat variables to prep data for Decision Tree
# ordinal variables and nominal are treated the same in trees, but need to be careful in Lin models

encoded_data_df = encode_data(data_df, cat_feat_list, num_feat_list)
encoded_train_set, encoded_test_set = train_test_split(encoded_data_df, test_size=0.3)

In [4]:
print("Size of training data: ", len(encoded_train_set))
print("Size of testing data: ", len(encoded_test_set))
print("Features used: ", data_df.columns)

Size of training data:  10381
Size of testing data:  4450
Features used:  Index(['bundesland', 'befinden', 'geschlecht', 'raucher', 'blutzucker_bekannt',
       'cholesterin_bekannt', 'in_behandlung', 'schaetzwert_bp_sys',
       'schaetzwert_by_dia', 'messwert_bp_sys', 'messwert_bp_dia', 'age'],
      dtype='object')


https://scikit-learn.org/stable/modules/tree.html#tree

 - scikit-learn uses an optimized version of the CART algorithm, does not support categorical variables
 - BIC cannot be computed as it depends on likelihood, cannot compute that for RegressionTree as it does not assume a conditional dist of data

In [5]:
# using self evaluated sys bp for analysis
target = "messwert_bp_sys"

# splitting targets from predictors
X_train, Y_train = separate_target(encoded_train_set, target)
X_test, Y_test = separate_target(encoded_test_set, target)

In [6]:
# fitting base model for DecisonTreeRegressor using all available features and default parameters

train_results_tree_base, test_results_tree_base, model_tree_base = fit_model(X_train, Y_train, X_test, Y_test,
                                                                              "DecisionTreeRegressor", params={"criterion" : "squared_error"})

In [7]:
# fitting base model for RandomForestRegressor using all available features and default parameters

train_results_rf_base, test_results_rf_base, model_rf_base = fit_model(X_train, Y_train, X_test, Y_test,
                                                                              "DecisionTreeRegressorRandomForest", {"criterion" : "squared_error",
                                                                                                                    "n_estimators": 100})

In [8]:
# finding the best set of parameters to use by finetuning RegTree using CV, fine-tuning is done on whole dataset

parameters= {"splitter":["best","random"],
            "max_depth" : list(np.arange(1, 25, 5, dtype=int)),
           "min_samples_leaf":list(np.arange(1, 100, 5, dtype=int)),
           "min_weight_fraction_leaf":list(np.arange(0, 1, 1.0, dtype=float)),
           "max_features":list(np.arange(1, len(X_train.columns), 1, dtype=int))
           }

model = DecisionTreeRegressor()
X_train_full, Y_train_full = separate_target(encoded_data_df, target)
tuning_model = GridSearchCV(model, param_grid=parameters, scoring='neg_mean_squared_error',cv=10,verbose=0)
tuning_model.fit(X_train_full, Y_train_full)

AttributeError: 'Series' object has no attribute 'columns'

In [None]:
tuning_model.best_params_

{'max_depth': 11,
 'max_features': 'auto',
 'min_samples_leaf': 66,
 'min_weight_fraction_leaf': 0.0,
 'splitter': 'best'}

In [None]:
# fitting fine_tuned model for DecisonTreeRegressor using all available features and fine_tuned parameters
model_params = tuning_model.best_params_.copy()
model_params["criterion"] = "squared_error"

train_results_tree_fine, test_results_tree_fine, model_tree_fine = fit_model(X_train, Y_train, X_test, Y_test,
                                                                              "DecisionTreeRegressor",
                                                                              model_params)



In [None]:
# fitting fine_tuned model for RandomForestRegressor using all available features and fine_tuned parameters
model_params = tuning_model.best_params_.copy()
model_params["criterion"] = "squared_error"
del model_params["splitter"]
model_params["n_estimators"] = 100

train_results_rf_fine, test_results_rf_fine, model_rf_fine = fit_model(X_train, Y_train, X_test, Y_test,
                                                                              "DecisionTreeRegressorRandomForest",
                                                                              model_params)

  warn(


In [None]:
# setting criterion for best subset selection
BEST_SUBSET_CRITERION = "mse"

In [None]:
# using best subset selection with default parameters for DecisionTreeRegressor

model_type = "DecisionTreeRegressor"
model_params = {}
model_params["criterion"] = "squared_error"
criterion = BEST_SUBSET_CRITERION
features = list(X_train.columns)

model_tree_base_best, train_results_tree_base_best, test_results_tree_base_best = best_subset_selection(features, criterion, X_train, Y_train, X_test, Y_test,
                                                     model_type, model_params, 1)


Best Model: 
Features:  ['messwert_bp_dia']
Train Results:  {'r_2': 0.4541610866864263, 'adjusted_r_2': 0.4541084959827638, 'mse': 203.65770861220201}
Test Results:  {'r_2': 0.4359605417058491, 'adjusted_r_2': 0.43583373427367866, 'mse': 204.76227967460264}


In [None]:
# using best subset selection with default parameters for DecisionTreeRegressorRandomForest
model_params = {}
model_params["criterion"] = "squared_error"
model_params["n_estimators"] = 100
model_type = "DecisionTreeRegressorRandomForest"
criterion = BEST_SUBSET_CRITERION
features = list(X_train.columns)
model_rf_base_best, train_results_rf_base_best, test_results_rf_base_best = best_subset_selection(features, criterion, X_train, Y_train, X_test, Y_test,
                                                     model_type, model_params, 1)


Best Model: 
Features:  ['befinden_2', 'befinden_3', 'befinden_4', 'befinden_5', 'geschlecht_m', 'raucher_True', 'blutzucker_bekannt_True', 'cholesterin_bekannt_True', 'in_behandlung_True', 'schaetzwert_bp_sys', 'schaetzwert_by_dia', 'messwert_bp_dia', 'age']
Train Results:  {'r_2': 0.9286951251316238, 'adjusted_r_2': 0.9286057103179565, 'mse': 26.604529421358432}
Test Results:  {'r_2': 0.5004704238132951, 'adjusted_r_2': 0.49900651838263077, 'mse': 181.34336752648866}


In [None]:
# using best subset selection with finetuned parameters for DecisionTreeRegressor

model_type = "DecisionTreeRegressor"
model_params = tuning_model.best_params_.copy()
model_params["criterion"] = "squared_error"

criterion = BEST_SUBSET_CRITERION
features = list(X_train.columns)

model_tree_fine_best, train_results_tree_fine_best, test_results_tree_fine_best = best_subset_selection(features, criterion, X_train, Y_train, X_test, Y_test,
                                                     model_type, model_params, 1)




Best Model: 
Features:  ['befinden_2', 'befinden_3', 'befinden_4', 'befinden_5', 'geschlecht_m', 'raucher_True', 'blutzucker_bekannt_True', 'cholesterin_bekannt_True', 'in_behandlung_True', 'schaetzwert_bp_sys', 'schaetzwert_by_dia', 'messwert_bp_dia', 'age']
Train Results:  {'r_2': 0.5534004172368694, 'adjusted_r_2': 0.5528403907512978, 'mse': 166.63056714033448}
Test Results:  {'r_2': 0.5217378392616026, 'adjusted_r_2': 0.5203362594397813, 'mse': 173.6226940772364}




In [None]:
# using best subset selection with finetuned parameters for DecisionTreeRegressorRF

model_type = "DecisionTreeRegressorRandomForest"
model_params = tuning_model.best_params_.copy()
del model_params["splitter"]
model_params["n_estimators"] = 100
criterion = BEST_SUBSET_CRITERION
features = list(X_train.columns)
model_rf_fine_best, train_results_rf_fine_best, test_results_rf_fine_best = best_subset_selection(features, criterion, X_train, Y_train, X_test, Y_test,
                                                     model_type, model_params, 1)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


: 

: 

: 

In [None]:
train_result_list = [train_results_tree_base, train_results_tree_fine, train_results_tree_base_best, train_results_tree_fine_best,
                     train_results_rf_base, train_results_rf_fine, train_results_rf_base_best, train_results_rf_fine_best]

test_result_list = [test_results_tree_base, test_results_tree_fine, test_results_tree_base_best, test_results_tree_fine_best,
                     test_results_rf_base, test_results_rf_fine, test_results_rf_base_best, test_results_rf_fine_best]

model_names = ["Tree (Base)", "Tree (Fine-tuned)", "Tree (Best Subset + Base)", "Tree (Best Subset + Fine-tuned)",
               "RF (Base)", "RF (Fine-tuned)", "RF (Best Subset + Base)", "RF (Best Subset + Fine-tuned)"]

tab = tabularize_model_metrics(train_result_list, test_result_list, model_names)
round(tab, 3)

Unnamed: 0,Model,Train Mean Sq Error,Test Mean Sq Error,Train R2,Test R2,Train Adjusted R2,Test Adjusted R2
0,Tree (Base),1.154,347.731,0.997,0.026,0.997,0.022
1,Tree (Fine-tuned),166.921,172.812,0.556,0.516,0.555,0.514
2,Tree (Best Subset + Base),205.374,201.392,0.453,0.436,0.453,0.436
3,Tree (Best Subset + Fine-tuned),166.935,172.676,0.556,0.517,0.555,0.516
4,RF (Base),26.939,177.558,0.928,0.503,0.928,0.5
5,RF (Fine-tuned),166.037,168.384,0.558,0.529,0.557,0.526
6,RF (Best Subset + Base),27.014,176.854,0.928,0.505,0.928,0.503
7,RF (Best Subset + Fine-tuned),165.777,168.136,0.559,0.529,0.558,0.528


In [None]:
print(round(tab, 2).to_latex())

In [None]:
# using self evaluated dia bp for analysis
target = "messwert_bp_dia"

# splitting targets from predictors
X_train, Y_train = separate_target(encoded_train_set, target)
X_test, Y_test = separate_target(encoded_test_set, target)

In [None]:
# fitting base model for DecisonTreeRegressor using all available features and default parameters

train_results_tree_base, test_results_tree_base, model_tree_base = fit_model(X_train, Y_train, X_test, Y_test,
                                                                              "DecisionTreeRegressor", {"criterion" : "squared_error"})

In [None]:
# fitting base model for RandomForestRegressor using all available features and default parameters

train_results_rf_base, test_results_rf_base, model_rf_base = fit_model(X_train, Y_train, X_test, Y_test,
                                                                              "DecisionTreeRegressorRandomForest", {"criterion" : "squared_error",
                                                                                                                    "n_estimators": 100})

In [None]:
# finding the best set of parameters to use by finetuning RegTree using CV, fine-tuning is done on whole dataset

parameters= {"splitter":["best","random"],
            "max_depth" : list(np.arange(1, 25, 5, dtype=int)),
           "min_samples_leaf":list(np.arange(1, 100, 5, dtype=int)),
           "min_weight_fraction_leaf":list(np.arange(0, 1, 1.0, dtype=float)),
           "max_features":["auto","log2","sqrt", None]
           }

model = DecisionTreeRegressor()
X_train_full, Y_train_full = separate_target(encoded_data_df, target)
tuning_model = GridSearchCV(model, param_grid=parameters, scoring='neg_mean_squared_error',cv=10,verbose=0)
tuning_model.fit(X_train_full, Y_train_full)



In [None]:
tuning_model.best_params_

{'max_depth': 6,
 'max_features': 'auto',
 'min_samples_leaf': 11,
 'min_weight_fraction_leaf': 0.0,
 'splitter': 'best'}

In [None]:
# fitting fine_tuned model for DecisonTreeRegressor using all available features and fine_tuned parameters
model_params = tuning_model.best_params_.copy()
model_params["criterion"] = "squared_error"

train_results_tree_fine, test_results_tree_fine, model_tree_fine = fit_model(X_train, Y_train, X_test, Y_test,
                                                                              "DecisionTreeRegressor",
                                                                              model_params)



In [None]:
# fitting fine_tuned model for RandomForestRegressor using all available features and fine_tuned parameters
model_params = tuning_model.best_params_.copy()
model_params["criterion"] = "squared_error"
del model_params["splitter"]
model_params["n_estimators"] = 100

train_results_rf_fine, test_results_rf_fine, model_rf_fine = fit_model(X_train, Y_train, X_test, Y_test,
                                                                              "DecisionTreeRegressorRandomForest",
                                                                              model_params)

  warn(


In [None]:
# using best subset selection with default parameters for DecisionTreeRegressor

model_type = "DecisionTreeRegressor"
model_params = {}
model_params["criterion"] = "squared_error"
criterion = BEST_SUBSET_CRITERION
features = list(X_train.columns)

model_tree_base_best, train_results_tree_base_best, test_results_tree_base_best = best_subset_selection(features, criterion, X_train, Y_train, X_test, Y_test,
                                                     model_type, model_params, 1)


Best Model: 
Features:  ['messwert_bp_sys']
Train Results:  {'r_2': 0.4557584733221004, 'adjusted_r_2': 0.4557060365240777, 'mse': 108.50237373660838}
Test Results:  {'r_2': 0.41894513594414684, 'adjusted_r_2': 0.41881450310600477, 'mse': 119.0388826949106}


In [None]:
# using best subset selection with default parameters for DecisionTreeRegressorRandomForest
model_params = {}
model_params["criterion"] = "squared_error"
model_params["n_estimators"] = 100
model_type = "DecisionTreeRegressorRandomForest"
criterion = BEST_SUBSET_CRITERION
features = list(X_train.columns)
model_rf_base_best, train_results_rf_base_best, test_results_rf_base_best = best_subset_selection(features, criterion, X_train, Y_train, X_test, Y_test,
                                                     model_type, model_params, 1)


Best Model: 
Features:  ['bundesland_Niederösterreich', 'bundesland_Oberösterreich', 'bundesland_Salzburg', 'bundesland_Steiermark', 'bundesland_Tirol', 'bundesland_Vorarlberg', 'bundesland_Wien', 'bundesland_not_applicable', 'befinden_2', 'befinden_3', 'befinden_4', 'befinden_5', 'geschlecht_m', 'raucher_True', 'blutzucker_bekannt_True', 'cholesterin_bekannt_True', 'in_behandlung_True', 'schaetzwert_bp_sys', 'schaetzwert_by_dia', 'messwert_bp_sys', 'age']
Train Results:  {'r_2': 0.9177953466808948, 'adjusted_r_2': 0.9176286995412384, 'mse': 16.38867962127534}
Test Results:  {'r_2': 0.4534494751507061, 'adjusted_r_2': 0.45085743336619055, 'mse': 111.97008723107949}


In [None]:
# using best subset selection with finetuned parameters for DecisionTreeRegressor

model_type = "DecisionTreeRegressor"
model_params = tuning_model.best_params_.copy()
model_params["criterion"] = "squared_error"
criterion = BEST_SUBSET_CRITERION
features = list(X_train.columns)

model_tree_fine_best, train_results_tree_fine_best, test_results_tree_fine_best = best_subset_selection(features, criterion, X_train, Y_train, X_test, Y_test,
                                                     model_type, model_params, 1)




Best Model: 
Features:  ['messwert_bp_sys', 'age']
Train Results:  {'r_2': 0.49074099002359783, 'adjusted_r_2': 0.4906428479904553, 'mse': 101.5281060349829}
Test Results:  {'r_2': 0.4693512193110686, 'adjusted_r_2': 0.4691125645862254, 'mse': 108.71234691282974}




In [None]:
# using best subset selection with finetuned parameters for DecisionTreeRegressorRF

model_type = "DecisionTreeRegressorRandomForest"
model_params = tuning_model.best_params_.copy()
del model_params["splitter"]
model_params["n_estimators"] = 100
criterion = BEST_SUBSET_CRITERION
features = list(X_train.columns)
model_rf_fine_best, train_results_rf_fine_best, test_results_rf_fine_best = best_subset_selection(features, criterion, X_train, Y_train, X_test, Y_test,
                                                     model_type, model_params, 1)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(



Best Model: 
Features:  ['befinden_4', 'befinden_5', 'geschlecht_m', 'raucher_True', 'blutzucker_bekannt_True', 'cholesterin_bekannt_True', 'in_behandlung_True', 'schaetzwert_bp_sys', 'schaetzwert_by_dia', 'messwert_bp_sys', 'age']
Train Results:  {'r_2': 0.5128867678904906, 'adjusted_r_2': 0.5123700116407843, 'mse': 97.11302679347621}
Test Results:  {'r_2': 0.486084737619665, 'adjusted_r_2': 0.484810950353738, 'mse': 105.2842036405988}


In [None]:
train_result_list = [train_results_tree_base, train_results_tree_fine, train_results_tree_base_best, train_results_tree_fine_best,
                     train_results_rf_base, train_results_rf_fine, train_results_rf_base_best, train_results_rf_fine_best]

test_result_list = [test_results_tree_base, test_results_tree_fine, test_results_tree_base_best, test_results_tree_fine_best,
                     test_results_rf_base, test_results_rf_fine, test_results_rf_base_best, test_results_rf_fine_best]

model_names = ["Tree (Base)", "Tree (Fine-tuned)", "Tree (Best Subset + Base)", "Tree (Best Subset + Fine-tuned)",
               "RF (Base)", "RF (Fine-tuned)", "RF (Best Subset + Base)", "RF (Best Subset + Fine-tuned)"]

tab = tabularize_model_metrics(train_result_list, test_result_list, model_names)
round(tab, 3)

Unnamed: 0,Model,Train Mean Sq Error,Test Mean Sq Error,Train R2,Test R2,Train Adjusted R2,Test Adjusted R2
0,Tree (Base),0.566,222.177,0.997,-0.084,0.997,-0.09
1,Tree (Fine-tuned),100.502,109.778,0.496,0.464,0.495,0.461
2,Tree (Best Subset + Base),108.502,119.039,0.456,0.419,0.456,0.419
3,Tree (Best Subset + Fine-tuned),101.528,108.712,0.491,0.469,0.491,0.469
4,RF (Base),16.275,112.481,0.918,0.451,0.918,0.448
5,RF (Fine-tuned),96.993,105.532,0.513,0.485,0.512,0.482
6,RF (Best Subset + Base),16.389,111.97,0.918,0.453,0.918,0.451
7,RF (Best Subset + Fine-tuned),97.113,105.284,0.513,0.486,0.512,0.485


In [None]:
print(round(tab, 2).to_latex())

\begin{tabular}{llrrrrrr}
\toprule
 & Model & Train Mean Sq Error & Test Mean Sq Error & Train R2 & Test R2 & Train Adjusted R2 & Test Adjusted R2 \\
\midrule
0 & Tree (Base) & 0.565552 & 222.176635 & 0.997163 & -0.084493 & 0.997157 & -0.089882 \\
1 & Tree (Fine-tuned) & 100.502283 & 109.778167 & 0.495886 & 0.464149 & 0.494816 & 0.461486 \\
2 & Tree (Best Subset + Base) & 108.502374 & 119.038883 & 0.455758 & 0.418945 & 0.455706 & 0.418815 \\
3 & Tree (Best Subset + Fine-tuned) & 101.528106 & 108.712347 & 0.490741 & 0.469351 & 0.490643 & 0.469113 \\
4 & RF (Base) & 16.275489 & 112.480810 & 0.918363 & 0.450957 & 0.918190 & 0.448228 \\
5 & RF (Fine-tuned) & 96.993445 & 105.531992 & 0.513487 & 0.484875 & 0.512453 & 0.482315 \\
6 & RF (Best Subset + Base) & 16.388680 & 111.970087 & 0.917795 & 0.453449 & 0.917629 & 0.450857 \\
7 & RF (Best Subset + Fine-tuned) & 97.113027 & 105.284204 & 0.512887 & 0.486085 & 0.512370 & 0.484811 \\
\bottomrule
\end{tabular}

