Case Studies

Project: 1

Group: 3

Group Members:
 - Muhammad Raafey Tariq (231806)
 - Farrukh Ahmed (230614)
 - Amirreza Khamehchin Khiabani (230891)
 - Aymane Hachcham (236392)


Imports and Libraries

In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from helper_functions import *

Global Variables and Constants

Importing Data

In [10]:
data_df = read_data()

  warn("""Cannot parse header or footer so it will be ignored""")


Reformating Columns to Correct Data Types and dropping nans

In [11]:
# variables that are dropped 
to_filter = ["id", "time", "postal_code", "municipality", "district", "year_of_birth", "self_eval_bp_sys", "self_eval_bp_dia"]
data_df, cat_feat_list, num_feat_list = format_variables(data_df, to_filter=to_filter, drop_values=True)

# one hot encoding cat variables to prep data for Decision Tree
# ordinal variables and nominal are treated the same in trees, but need to be careful in Lin models

encoded_data_df = encode_data(data_df, cat_feat_list, num_feat_list)
encoded_train_set, encoded_test_set = train_test_split(encoded_data_df, test_size=0.3)

In [12]:
print("Size of training data: ", len(encoded_train_set))
print("Size of testing data: ", len(encoded_test_set))
print("Features used: ", data_df.columns)

Size of training data:  10404
Size of testing data:  4460
Features used:  Index(['terminal', 'federal_state', 'felt_health_condition', 'gender',
       'is_smoker', 'is_diabetic', 'has_cholestrol', 'in_treatment',
       'measured_bp_sys', 'measured_bp_dia', 'age', 'month', 'hour', 'day',
       'temp', 'humidity', 'temp_min', 'temp_max'],
      dtype='object')


https://scikit-learn.org/stable/modules/tree.html#tree

 - scikit-learn uses an optimized version of the CART algorithm, does not support categorical variables
 - BIC cannot be computed as it depends on likelihood, cannot compute that for RegressionTree as it does not assume a conditional dist of data

In [13]:
# using self evaluated sys bp for analysis
target = "measured_bp_sys"

# splitting targets from predictors
X_train, Y_train = separate_target(encoded_train_set, target)
X_test, Y_test = separate_target(encoded_test_set, target)

In [14]:
# fitting base model for DecisonTreeRegressor using all available features and default parameters

train_results_tree_base, test_results_tree_base, model_tree_base = fit_model(X_train, Y_train, X_test, Y_test,
                                                                              "DecisionTreeRegressor", params={"criterion" : "squared_error"})

In [15]:
# fitting base model for RandomForestRegressor using all available features and default parameters

train_results_rf_base, test_results_rf_base, model_rf_base = fit_model(X_train, Y_train, X_test, Y_test,
                                                                              "DecisionTreeRegressorRandomForest", {"criterion" : "squared_error",
                                                                                                                    "n_estimators": 100})

In [16]:
# finding the best set of parameters to use by finetuning RegTree using CV, fine-tuning is done on whole dataset

parameters= {"splitter":["best"],
            "max_depth" : [1,  5, 10, 15, 20, 25],
           "min_samples_leaf":[ 1,  5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100],
           "max_features":list(np.arange(1, len(X_train.columns), 1, dtype=int))
           }

model = DecisionTreeRegressor()
X_train_full, Y_train_full = separate_target(encoded_data_df, target)
tuning_model = GridSearchCV(model, param_grid=parameters, scoring='neg_mean_squared_error',cv=10,verbose=0)
tuning_model.fit(X_train_full, Y_train_full)

In [17]:
tuning_model.best_params_

{'max_depth': 10,
 'max_features': 40,
 'min_samples_leaf': 70,
 'splitter': 'best'}

In [18]:
# fitting fine_tuned model for DecisonTreeRegressor using all available features and fine_tuned parameters
model_params = tuning_model.best_params_.copy()
model_params["criterion"] = "squared_error"

train_results_tree_fine, test_results_tree_fine, model_tree_fine = fit_model(X_train, Y_train, X_test, Y_test,
                                                                              "DecisionTreeRegressor",
                                                                              model_params)

In [19]:
# fitting fine_tuned model for RandomForestRegressor using all available features and fine_tuned parameters
model_params = tuning_model.best_params_.copy()
model_params["criterion"] = "squared_error"
del model_params["splitter"]
model_params["n_estimators"] = 100

train_results_rf_fine, test_results_rf_fine, model_rf_fine = fit_model(X_train, Y_train, X_test, Y_test,
                                                                              "DecisionTreeRegressorRandomForest",
                                                                              model_params)

In [20]:
train_result_list = [train_results_tree_base, train_results_tree_fine, train_results_rf_base, train_results_rf_fine]

test_result_list = [test_results_tree_base, test_results_tree_fine, test_results_rf_base, test_results_rf_fine]

model_names = ["Tree (Base)", "Tree (Fine-tuned)",
               "RF (Base)", "RF (Fine-tuned)"]

tab = tabularize_model_metrics(train_result_list, test_result_list, model_names)
round(tab, 3)

Unnamed: 0,Model,Train Mean Sq Error,Test Mean Sq Error,Train R2,Test R2,Train Adjusted R2,Test Adjusted R2
0,Tree (Base),0.037,333.933,1.0,0.097,1.0,0.088
1,Tree (Fine-tuned),164.7,180.695,0.555,0.511,0.553,0.507
2,RF (Base),24.384,172.039,0.934,0.535,0.934,0.53
3,RF (Fine-tuned),161.026,172.187,0.565,0.534,0.563,0.53


In [21]:
print(round(tab, 2).to_latex())

\begin{tabular}{llrrrrrr}
\toprule
 & Model & Train Mean Sq Error & Test Mean Sq Error & Train R2 & Test R2 & Train Adjusted R2 & Test Adjusted R2 \\
\midrule
0 & Tree (Base) & 0.040000 & 333.930000 & 1.000000 & 0.100000 & 1.000000 & 0.090000 \\
1 & Tree (Fine-tuned) & 164.700000 & 180.690000 & 0.560000 & 0.510000 & 0.550000 & 0.510000 \\
2 & RF (Base) & 24.380000 & 172.040000 & 0.930000 & 0.530000 & 0.930000 & 0.530000 \\
3 & RF (Fine-tuned) & 161.030000 & 172.190000 & 0.570000 & 0.530000 & 0.560000 & 0.530000 \\
\bottomrule
\end{tabular}



In [22]:
# using self evaluated dia bp for analysis
target = "measured_bp_dia"

# splitting targets from predictors
X_train, Y_train = separate_target(encoded_train_set, target)
X_test, Y_test = separate_target(encoded_test_set, target)

In [23]:
# fitting base model for DecisonTreeRegressor using all available features and default parameters

train_results_tree_base, test_results_tree_base, model_tree_base = fit_model(X_train, Y_train, X_test, Y_test,
                                                                              "DecisionTreeRegressor", {"criterion" : "squared_error"})

In [24]:
# fitting base model for RandomForestRegressor using all available features and default parameters

train_results_rf_base, test_results_rf_base, model_rf_base = fit_model(X_train, Y_train, X_test, Y_test,
                                                                              "DecisionTreeRegressorRandomForest", {"criterion" : "squared_error",
                                                                                                                    "n_estimators": 100})

In [25]:
# finding the best set of parameters to use by finetuning RegTree using CV, fine-tuning is done on whole dataset

parameters= {"splitter":["best"],
            "max_depth" : [1,  5, 10, 15, 20, 25],
           "min_samples_leaf":[ 1,  5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100],
           "max_features":list(np.arange(1, len(X_train.columns), 1, dtype=int))
           }

model = DecisionTreeRegressor()
X_train_full, Y_train_full = separate_target(encoded_data_df, target)
tuning_model = GridSearchCV(model, param_grid=parameters, scoring='neg_mean_squared_error',cv=10,verbose=0)
tuning_model.fit(X_train_full, Y_train_full)

In [26]:
tuning_model.best_params_

{'max_depth': 25,
 'max_features': 39,
 'min_samples_leaf': 90,
 'splitter': 'best'}

In [27]:
# fitting fine_tuned model for DecisonTreeRegressor using all available features and fine_tuned parameters
model_params = tuning_model.best_params_.copy()
model_params["criterion"] = "squared_error"

train_results_tree_fine, test_results_tree_fine, model_tree_fine = fit_model(X_train, Y_train, X_test, Y_test,
                                                                              "DecisionTreeRegressor",
                                                                              model_params)

In [28]:
# fitting fine_tuned model for RandomForestRegressor using all available features and fine_tuned parameters
model_params = tuning_model.best_params_.copy()
model_params["criterion"] = "squared_error"
del model_params["splitter"]
model_params["n_estimators"] = 100

train_results_rf_fine, test_results_rf_fine, model_rf_fine = fit_model(X_train, Y_train, X_test, Y_test,
                                                                              "DecisionTreeRegressorRandomForest",
                                                                              model_params)

In [29]:
train_result_list = [train_results_tree_base, train_results_tree_fine, train_results_rf_base, train_results_rf_fine]

test_result_list = [test_results_tree_base, test_results_tree_fine, test_results_rf_base, test_results_rf_fine]

model_names = ["Tree (Base)", "Tree (Fine-tuned)", "RF (Base)", "RF (Fine-tuned)"]

tab = tabularize_model_metrics(train_result_list, test_result_list, model_names)
round(tab, 3)

Unnamed: 0,Model,Train Mean Sq Error,Test Mean Sq Error,Train R2,Test R2,Train Adjusted R2,Test Adjusted R2
0,Tree (Base),0.179,218.97,0.999,-0.046,0.999,-0.056
1,Tree (Fine-tuned),101.538,115.911,0.486,0.446,0.484,0.441
2,RF (Base),14.936,109.182,0.924,0.479,0.924,0.474
3,RF (Fine-tuned),100.137,112.034,0.493,0.465,0.491,0.46


In [30]:
print(round(tab, 2).to_latex())

\begin{tabular}{llrrrrrr}
\toprule
 & Model & Train Mean Sq Error & Test Mean Sq Error & Train R2 & Test R2 & Train Adjusted R2 & Test Adjusted R2 \\
\midrule
0 & Tree (Base) & 0.180000 & 218.970000 & 1.000000 & -0.050000 & 1.000000 & -0.060000 \\
1 & Tree (Fine-tuned) & 101.540000 & 115.910000 & 0.490000 & 0.450000 & 0.480000 & 0.440000 \\
2 & RF (Base) & 14.940000 & 109.180000 & 0.920000 & 0.480000 & 0.920000 & 0.470000 \\
3 & RF (Fine-tuned) & 100.140000 & 112.030000 & 0.490000 & 0.460000 & 0.490000 & 0.460000 \\
\bottomrule
\end{tabular}

