Case Studies

Project: 1

Group: 3

Group Members:
 - Muhammad Raafey Tariq (231806)
 - Farrukh Ahmed (230614)
 - Amirreza Khamehchin Khiabani (230891)
 - Aymane Hachcham (236392)


Requirements:
 - numpy==1.24.2
 - matplotlib==3.7.1
 - seaborn==0.12.2
 - pandas==2.0.0
 - openpyxl==3.1.2

Installation Commands (One-time only)
 - pip install pandas==2.0.0
 - pip install numpy==1.24.2
 - pip install seaborn==0.12.2
 - pip install matplotlib==3.7.1
 - pip install openpyxl==3.1.2

Imports and Libraries

In [1]:
import pandas as pd
import numpy as np
import itertools
import pprint
import random

# used for the graphs
import seaborn as sns

import os
sns.set(font_scale = 1.2)

# used for plotting
from matplotlib import pyplot as plt
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

from helper_functions import *


import matplotlib

# setting font to 'Times New Roman'
matplotlib.rcParams["font.family"] = "Times New Roman"
matplotlib.rcParams.update({'font.size': 16})
%matplotlib inline

Global Variables and Constants

Importing Data

In [2]:
data_df = read_data()

  warn("""Cannot parse header or footer so it will be ignored""")


Reformating Columns to Correct Data Types and dropping nans

In [3]:
# variables that are dropped 
to_filter = ["id", "zeit", "postleitzahl", "gemeinde", "bezirk", "geburtsjahr", "schaetzwert_bp_sys", "schaetzwert_by_dia", "terminal"]
data_df, cat_feat_list, num_feat_list = format_variables(data_df, to_filter=to_filter, drop_values=True)

# one hot encoding cat variables to prep data for Decision Tree
# ordinal variables and nominal are treated the same in trees, but need to be careful in Lin models

encoded_data_df = encode_data(data_df, cat_feat_list, num_feat_list)
encoded_train_set, encoded_test_set = train_test_split(encoded_data_df, test_size=0.3)

In [4]:
print("Size of training data: ", len(encoded_train_set))
print("Size of testing data: ", len(encoded_test_set))
print("Features used: ", data_df.columns)

Size of training data:  10381
Size of testing data:  4450
Features used:  Index(['bundesland', 'befinden', 'geschlecht', 'raucher', 'blutzucker_bekannt',
       'cholesterin_bekannt', 'in_behandlung', 'schaetzwert_bp_sys',
       'schaetzwert_by_dia', 'messwert_bp_sys', 'messwert_bp_dia', 'age'],
      dtype='object')


https://scikit-learn.org/stable/modules/tree.html#tree

 - scikit-learn uses an optimized version of the CART algorithm, does not support categorical variables
 - BIC cannot be computed as it depends on likelihood, cannot compute that for RegressionTree as it does not assume a conditional dist of data

In [5]:
# using self evaluated sys bp for analysis
target = "messwert_bp_sys"

# splitting targets from predictors
X_train, Y_train = separate_target(encoded_train_set, target)
X_test, Y_test = separate_target(encoded_test_set, target)

In [6]:
# fitting base model for DecisonTreeRegressor using all available features and default parameters

train_results_tree_base, test_results_tree_base, model_tree_base = fit_model(X_train, Y_train, X_test, Y_test,
                                                                              "DecisionTreeRegressor", params={"criterion" : "squared_error"})

In [7]:
# fitting base model for RandomForestRegressor using all available features and default parameters

train_results_rf_base, test_results_rf_base, model_rf_base = fit_model(X_train, Y_train, X_test, Y_test,
                                                                              "DecisionTreeRegressorRandomForest", {"criterion" : "squared_error",
                                                                                                                    "n_estimators": 100})

In [8]:
# finding the best set of parameters to use by finetuning RegTree using CV, fine-tuning is done on whole dataset

parameters= {"splitter":["best","random"],
            "max_depth" : list(np.arange(1, 25, 5, dtype=int)),
           "min_samples_leaf":list(np.arange(1, 100, 5, dtype=int)),
           "min_weight_fraction_leaf":list(np.arange(0, 1, 1.0, dtype=float)),
           "max_features":list(np.arange(1, len(X_train.columns), 1, dtype=int))
           }

model = DecisionTreeRegressor()
X_train_full, Y_train_full = separate_target(encoded_data_df, target)
tuning_model = GridSearchCV(model, param_grid=parameters, scoring='neg_mean_squared_error',cv=10,verbose=0)
tuning_model.fit(X_train_full, Y_train_full)

In [9]:
tuning_model.best_params_

{'max_depth': 11,
 'max_features': 20,
 'min_samples_leaf': 66,
 'min_weight_fraction_leaf': 0.0,
 'splitter': 'best'}

In [10]:
# fitting fine_tuned model for DecisonTreeRegressor using all available features and fine_tuned parameters
model_params = tuning_model.best_params_.copy()
model_params["criterion"] = "squared_error"

train_results_tree_fine, test_results_tree_fine, model_tree_fine = fit_model(X_train, Y_train, X_test, Y_test,
                                                                              "DecisionTreeRegressor",
                                                                              model_params)

In [11]:
# fitting fine_tuned model for RandomForestRegressor using all available features and fine_tuned parameters
model_params = tuning_model.best_params_.copy()
model_params["criterion"] = "squared_error"
del model_params["splitter"]
model_params["n_estimators"] = 100

train_results_rf_fine, test_results_rf_fine, model_rf_fine = fit_model(X_train, Y_train, X_test, Y_test,
                                                                              "DecisionTreeRegressorRandomForest",
                                                                              model_params)

In [17]:
train_result_list = [train_results_tree_base, train_results_tree_fine, train_results_rf_base, train_results_rf_fine]

test_result_list = [test_results_tree_base, test_results_tree_fine, test_results_rf_base, test_results_rf_fine]

model_names = ["Tree (Base)", "Tree (Fine-tuned)",
               "RF (Base)", "RF (Fine-tuned)"]

tab = tabularize_model_metrics(train_result_list, test_result_list, model_names)
round(tab, 3)

Unnamed: 0,Model,Train Mean Sq Error,Test Mean Sq Error,Train R2,Test R2,Train Adjusted R2,Test Adjusted R2
0,Tree (Base),1.236,358.492,0.997,0.041,0.997,0.037
1,Tree (Fine-tuned),162.762,186.525,0.558,0.501,0.557,0.499
2,Tree (Best Subset + Base),201.705,210.619,0.452,0.437,0.452,0.437
3,Tree (Best Subset + Fine-tuned),163.631,185.668,0.556,0.504,0.556,0.503
4,RF (Base),25.893,190.671,0.93,0.49,0.93,0.488
5,RF (Fine-tuned),161.505,177.671,0.562,0.525,0.561,0.523
6,RF (Best Subset + Base),25.826,190.375,0.93,0.491,0.93,0.489
7,RF (Best Subset + Fine-tuned),161.37,177.851,0.562,0.524,0.561,0.523


In [18]:
print(round(tab, 2).to_latex())

\begin{tabular}{llrrrrrr}
\toprule
 & Model & Train Mean Sq Error & Test Mean Sq Error & Train R2 & Test R2 & Train Adjusted R2 & Test Adjusted R2 \\
\midrule
0 & Tree (Base) & 1.240000 & 358.490000 & 1.000000 & 0.040000 & 1.000000 & 0.040000 \\
1 & Tree (Fine-tuned) & 162.760000 & 186.530000 & 0.560000 & 0.500000 & 0.560000 & 0.500000 \\
2 & Tree (Best Subset + Base) & 201.700000 & 210.620000 & 0.450000 & 0.440000 & 0.450000 & 0.440000 \\
3 & Tree (Best Subset + Fine-tuned) & 163.630000 & 185.670000 & 0.560000 & 0.500000 & 0.560000 & 0.500000 \\
4 & RF (Base) & 25.890000 & 190.670000 & 0.930000 & 0.490000 & 0.930000 & 0.490000 \\
5 & RF (Fine-tuned) & 161.510000 & 177.670000 & 0.560000 & 0.520000 & 0.560000 & 0.520000 \\
6 & RF (Best Subset + Base) & 25.830000 & 190.380000 & 0.930000 & 0.490000 & 0.930000 & 0.490000 \\
7 & RF (Best Subset + Fine-tuned) & 161.370000 & 177.850000 & 0.560000 & 0.520000 & 0.560000 & 0.520000 \\
\bottomrule
\end{tabular}



In [19]:
# using self evaluated dia bp for analysis
target = "messwert_bp_dia"

# splitting targets from predictors
X_train, Y_train = separate_target(encoded_train_set, target)
X_test, Y_test = separate_target(encoded_test_set, target)

In [20]:
# fitting base model for DecisonTreeRegressor using all available features and default parameters

train_results_tree_base, test_results_tree_base, model_tree_base = fit_model(X_train, Y_train, X_test, Y_test,
                                                                              "DecisionTreeRegressor", {"criterion" : "squared_error"})

In [21]:
# fitting base model for RandomForestRegressor using all available features and default parameters

train_results_rf_base, test_results_rf_base, model_rf_base = fit_model(X_train, Y_train, X_test, Y_test,
                                                                              "DecisionTreeRegressorRandomForest", {"criterion" : "squared_error",
                                                                                                                    "n_estimators": 100})

In [22]:
# finding the best set of parameters to use by finetuning RegTree using CV, fine-tuning is done on whole dataset

parameters= {"splitter":["best","random"],
            "max_depth" : list(np.arange(1, 25, 5, dtype=int)),
           "min_samples_leaf":list(np.arange(1, 100, 5, dtype=int)),
           "min_weight_fraction_leaf":list(np.arange(0, 1, 1.0, dtype=float)),
           "max_features":list(np.arange(1, len(X_train.columns), 1, dtype=int))
           }

model = DecisionTreeRegressor()
X_train_full, Y_train_full = separate_target(encoded_data_df, target)
tuning_model = GridSearchCV(model, param_grid=parameters, scoring='neg_mean_squared_error',cv=10,verbose=0)
tuning_model.fit(X_train_full, Y_train_full)

In [23]:
tuning_model.best_params_

{'max_depth': 6,
 'max_features': 21,
 'min_samples_leaf': 11,
 'min_weight_fraction_leaf': 0.0,
 'splitter': 'best'}

In [24]:
# fitting fine_tuned model for DecisonTreeRegressor using all available features and fine_tuned parameters
model_params = tuning_model.best_params_.copy()
model_params["criterion"] = "squared_error"

train_results_tree_fine, test_results_tree_fine, model_tree_fine = fit_model(X_train, Y_train, X_test, Y_test,
                                                                              "DecisionTreeRegressor",
                                                                              model_params)

In [25]:
# fitting fine_tuned model for RandomForestRegressor using all available features and fine_tuned parameters
model_params = tuning_model.best_params_.copy()
model_params["criterion"] = "squared_error"
del model_params["splitter"]
model_params["n_estimators"] = 100

train_results_rf_fine, test_results_rf_fine, model_rf_fine = fit_model(X_train, Y_train, X_test, Y_test,
                                                                              "DecisionTreeRegressorRandomForest",
                                                                              model_params)

In [30]:
train_result_list = [train_results_tree_base, train_results_tree_fine, train_results_rf_base, train_results_rf_fine]

test_result_list = [test_results_tree_base, test_results_tree_fine, test_results_rf_base, test_results_rf_fine]

model_names = ["Tree (Base)", "Tree (Fine-tuned)", "RF (Base)", "RF (Fine-tuned)"]

tab = tabularize_model_metrics(train_result_list, test_result_list, model_names)
round(tab, 3)

Unnamed: 0,Model,Train Mean Sq Error,Test Mean Sq Error,Train R2,Test R2,Train Adjusted R2,Test Adjusted R2
0,Tree (Base),0.492,217.605,0.998,-0.104,0.998,-0.11
1,Tree (Fine-tuned),100.353,105.504,0.505,0.465,0.504,0.462
2,Tree (Best Subset + Base),109.183,117.472,0.461,0.404,0.461,0.404
3,Tree (Best Subset + Fine-tuned),100.327,105.647,0.505,0.464,0.504,0.463
4,RF (Base),15.95,114.534,0.921,0.419,0.921,0.416
5,RF (Fine-tuned),97.315,103.097,0.52,0.477,0.519,0.474
6,RF (Best Subset + Base),16.046,114.819,0.921,0.417,0.921,0.415
7,RF (Best Subset + Fine-tuned),97.226,103.153,0.52,0.477,0.519,0.475


In [31]:
print(round(tab, 2).to_latex())

\begin{tabular}{llrrrrrr}
\toprule
 & Model & Train Mean Sq Error & Test Mean Sq Error & Train R2 & Test R2 & Train Adjusted R2 & Test Adjusted R2 \\
\midrule
0 & Tree (Base) & 0.490000 & 217.600000 & 1.000000 & -0.100000 & 1.000000 & -0.110000 \\
1 & Tree (Fine-tuned) & 100.350000 & 105.500000 & 0.500000 & 0.460000 & 0.500000 & 0.460000 \\
2 & Tree (Best Subset + Base) & 109.180000 & 117.470000 & 0.460000 & 0.400000 & 0.460000 & 0.400000 \\
3 & Tree (Best Subset + Fine-tuned) & 100.330000 & 105.650000 & 0.500000 & 0.460000 & 0.500000 & 0.460000 \\
4 & RF (Base) & 15.950000 & 114.530000 & 0.920000 & 0.420000 & 0.920000 & 0.420000 \\
5 & RF (Fine-tuned) & 97.310000 & 103.100000 & 0.520000 & 0.480000 & 0.520000 & 0.470000 \\
6 & RF (Best Subset + Base) & 16.050000 & 114.820000 & 0.920000 & 0.420000 & 0.920000 & 0.410000 \\
7 & RF (Best Subset + Fine-tuned) & 97.230000 & 103.150000 & 0.520000 & 0.480000 & 0.520000 & 0.470000 \\
\bottomrule
\end{tabular}

