<a href="https://colab.research.google.com/github/czarinagluna/vaccination-status-classification/blob/main/GridSearch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Search for Optimal Parameters of Classification Models
Hyperparameter tuning for *IV. Classification Models* in the [Main Notebook](https://github.com/czarinagluna/vaccination-status-classification/blob/main/main.ipynb)
***
Authors: [Czarina Luna](https://www.linkedin.com/in/czarinaluna/), Weston Shuken, Justin Sohn

In [1]:
import pandas as pd
# Read the feature table and the label table, join them on the respondent key,
# and discard the key once the join is done.
features = pd.read_csv('data/training_set_features.csv')
labels = pd.read_csv('data/training_set_labels.csv')
data = features.merge(labels, on='respondent_id').drop(columns='respondent_id')

In [2]:
from sklearn.model_selection import train_test_split

# Target is the `h1n1_vaccine` column; every other column of `data` stays in X.
y = data['h1n1_vaccine']
X = data.drop(columns=['h1n1_vaccine'])

# Seeded, stratified hold-out split; no test size is given, so scikit-learn's default applies.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=112221)

In [3]:
from imblearn.over_sampling import RandomOverSampler

# Randomly resample the minority class of the training split (seeded for repeatability).
# NOTE(review): the resampled rows are later handed to 5-fold cross-validated searches;
# duplicated rows can then fall on both the fitting and the validation side of a fold,
# which would inflate those CV scores -- confirm this is intended.
oversample = RandomOverSampler(random_state=112221, sampling_strategy='minority')
X_train_res, y_train_res = oversample.fit_resample(X_train, y_train)

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Numeric columns: median imputation (with missing-value indicator columns), then standardisation.
numeric_transformer = Pipeline(steps=[
    ('num_imputer', SimpleImputer(add_indicator=True, strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: most-frequent imputation, then dense one-hot encoding;
# categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('cat_imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False)),
])

# Columns are selected by position, so these lists depend on the column order of X.
num_features = [*range(21), 31, 32]
cat_features = [*range(21, 31), 33, 34, 35]

# Any column not listed above is passed through unchanged.
preprocessor = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features),
    ],
)

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier

# One untuned, identically seeded estimator per model family, looked up by name in grid_search.
default_models = {
    label: {'classifier': estimator_cls(random_state=112221)}
    for label, estimator_cls in (
        ('LogisticRegression', LogisticRegression),
        ('DecisionTree', DecisionTreeClassifier),
        ('RandomForest', RandomForestClassifier),
        ('ExtraTrees', ExtraTreesClassifier),
        ('GradientBoost', GradientBoostingClassifier),
    )
}

In [6]:
import time

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

# Best parameters of every search, keyed by the `name` given to grid_search.
tuned_params = {}

def grid_search(params, name, scoring='accuracy', X_train=X_train, y_train=y_train,
                cv=5, n_jobs=None, sampler=None):
    '''
    Performs a cross-validated grid search for each model in `params`, prints the best
    cross validation score and parameters, and saves the best parameters in `tuned_params`.

    Parameters:
        params: dict mapping a key of `default_models` to a GridSearchCV `param_grid`
            whose entries address the pipeline step named 'classifier'.
        name: key under which the best parameters are saved in `tuned_params`.
            The same key is written for every model in `params`, so pass one model
            per call if each result must be kept.
        scoring: metric handed to GridSearchCV.
        X_train, y_train: data to search on. The defaults are the objects bound to the
            global X_train / y_train at the moment this cell was executed.
        cv: cross-validation setting handed to GridSearchCV (5 folds by default).
        n_jobs: parallel jobs handed to GridSearchCV (None keeps its default).
        sampler: optional imbalanced-learn sampler (for example RandomOverSampler).
            When given, it is inserted between the preprocessor and the classifier in an
            imbalanced-learn Pipeline, which applies samplers during fit only, so each
            validation fold is scored on data that was not resampled.

    Returns:
        None. Results are printed and stored in `tuned_params`.
    '''
    for model, grid in params.items():
        print(f'Running... {model} GridSearch')
        print(f'Time Started: {time.asctime()}')

        steps = [('col_transformer', preprocessor),
                 ('classifier', default_models[model]['classifier'])]
        if sampler is None:
            pipe = Pipeline(steps=steps)
        else:
            # scikit-learn's Pipeline cannot hold a sampler; imbalanced-learn's can.
            from imblearn.pipeline import Pipeline as ImbPipeline
            steps.insert(1, ('sampler', sampler))
            pipe = ImbPipeline(steps=steps)

        gridsearch = GridSearchCV(estimator=pipe, param_grid=grid, scoring=scoring,
                                  cv=cv, n_jobs=n_jobs)
        gridsearch.fit(X_train, y_train)

        print(f'Time Finished: {time.asctime()}\n')
        print(f'Best cross validation score: {gridsearch.best_score_ :.2%}')
        print(f'Optimal parameters: {gridsearch.best_params_}')

        # Candidates that raise during cross validation are scored NaN by GridSearchCV
        # instead of stopping the search; report them so an invalid grid is noticed
        # even when warnings are silenced. (NaN is the only value unequal to itself.)
        mean_scores = gridsearch.cv_results_['mean_test_score']
        n_failed = sum(1 for score in mean_scores if score != score)
        if n_failed:
            print(f'Warning: {n_failed} of {len(mean_scores)} parameter combinations '
                  f'failed during fitting or scoring')

        tuned_params[name] = gridsearch.best_params_
        
import warnings

# Ignore every warning raised from here on (presumably to keep the search logs short).
warnings.filterwarnings(action='ignore')

In [7]:
# First logistic-regression grid. Only solver/penalty pairs that LogisticRegression
# accepts are listed: 'lbfgs' does not support the 'l1' or 'elasticnet' penalties, and
# 'elasticnet' needs 'saga' together with an explicit l1_ratio. Crossing all penalties
# with all solvers made those candidates fail on every fold, so they only cost time.
# To search 'elasticnet', add a sub-grid that also sets 'classifier__l1_ratio'.
params_lr1 = {'LogisticRegression': [
    {
        'classifier__penalty': ['l2'],
        'classifier__C': [0.01, 0.1, 1],
        'classifier__fit_intercept': [True, False],
        'classifier__solver': ['lbfgs'],
    },
    {
        'classifier__penalty': ['l1', 'l2'],
        'classifier__C': [0.01, 0.1, 1],
        'classifier__fit_intercept': [True, False],
        'classifier__solver': ['saga'],
    },
]}

grid_search(params_lr1, name='LogisticRegression')

Running... LogisticRegression GridSearch
Time Started: Fri Apr 22 11:06:55 2022
Time Finished: Fri Apr 22 11:14:11 2022

Best cross validation score: 86.76%
Optimal parameters: {'classifier__C': 1, 'classifier__fit_intercept': False, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}


In [8]:
# Same grid as above, searched on the oversampled training data.
grid_search(
    params_lr1,
    name='LogisticRegression_o',
    X_train=X_train_res,
    y_train=y_train_res,
)

Running... LogisticRegression GridSearch
Time Started: Fri Apr 22 11:14:11 2022
Time Finished: Fri Apr 22 11:26:58 2022

Best cross validation score: 82.12%
Optimal parameters: {'classifier__C': 1, 'classifier__fit_intercept': True, 'classifier__penalty': 'l2', 'classifier__solver': 'saga'}


In [9]:
# Second logistic-regression grid (larger C values). Only solver/penalty pairs that
# LogisticRegression accepts are listed: 'lbfgs' does not support the 'l1' or
# 'elasticnet' penalties, and 'elasticnet' needs 'saga' together with an explicit
# l1_ratio. Crossing all penalties with all solvers made those candidates fail on
# every fold, so they only cost time.
# To search 'elasticnet', add a sub-grid that also sets 'classifier__l1_ratio'.
params_lr2 = {'LogisticRegression': [
    {
        'classifier__penalty': ['l2'],
        'classifier__C': [1, 10, 100],
        'classifier__fit_intercept': [True, False],
        'classifier__solver': ['lbfgs'],
    },
    {
        'classifier__penalty': ['l1', 'l2'],
        'classifier__C': [1, 10, 100],
        'classifier__fit_intercept': [True, False],
        'classifier__solver': ['saga'],
    },
]}

grid_search(params_lr2, name='LogisticRegression2')

Running... LogisticRegression GridSearch
Time Started: Fri Apr 22 11:26:58 2022
Time Finished: Fri Apr 22 11:33:49 2022

Best cross validation score: 86.76%
Optimal parameters: {'classifier__C': 1, 'classifier__fit_intercept': False, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}


In [10]:
# Second logistic-regression grid, searched on the oversampled training data.
grid_search(
    params_lr2,
    name='LogisticRegression2_o',
    X_train=X_train_res,
    y_train=y_train_res,
)

Running... LogisticRegression GridSearch
Time Started: Fri Apr 22 11:33:49 2022
Time Finished: Fri Apr 22 11:47:15 2022

Best cross validation score: 82.12%
Optimal parameters: {'classifier__C': 10, 'classifier__fit_intercept': False, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}


In [11]:
# Second logistic-regression grid again, this time ranked by precision instead of accuracy.
grid_search(
    params_lr2,
    name='LogisticRegression2_p',
    scoring='precision',
)

Running... LogisticRegression GridSearch
Time Started: Fri Apr 22 11:47:15 2022
Time Finished: Fri Apr 22 11:54:05 2022

Best cross validation score: 74.54%
Optimal parameters: {'classifier__C': 1, 'classifier__fit_intercept': True, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}


In [12]:
# Keep the parameters chosen by the precision-scored run of the second
# logistic-regression grid and persist them with IPython's %store magic
# (another session can restore them with `%store -r lr_best_params`).
lr_best_params = tuned_params['LogisticRegression2_p']
%store lr_best_params

Stored 'lr_best_params' (dict)


In [13]:
# First decision-tree grid: split criterion, splitter, depth cap and the two
# minimum-sample limits.
params_dt1 = {
    'DecisionTree': [
        {
            'classifier__criterion': ['gini', 'entropy'],
            'classifier__splitter': ['best', 'random'],
            'classifier__max_depth': [None, 1, 2, 5],
            'classifier__min_samples_split': [2, 3, 5],
            'classifier__min_samples_leaf': [1, 2, 5],
        },
    ],
}

grid_search(params_dt1, name='DecisionTree')

Running... DecisionTree GridSearch
Time Started: Fri Apr 22 11:54:05 2022
Time Finished: Fri Apr 22 12:09:53 2022

Best cross validation score: 86.00%
Optimal parameters: {'classifier__criterion': 'entropy', 'classifier__max_depth': 5, 'classifier__min_samples_leaf': 5, 'classifier__min_samples_split': 2, 'classifier__splitter': 'best'}


In [14]:
# First decision-tree grid, searched on the oversampled training data.
grid_search(
    params_dt1,
    name='DecisionTree_o',
    X_train=X_train_res,
    y_train=y_train_res,
)

Running... DecisionTree GridSearch
Time Started: Fri Apr 22 12:09:53 2022
Time Finished: Fri Apr 22 12:45:59 2022

Best cross validation score: 92.51%
Optimal parameters: {'classifier__criterion': 'entropy', 'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__splitter': 'best'}


In [15]:
# Second decision-tree grid (deeper trees).
# min_samples_split no longer lists 1: as an integer it must be at least 2, so every
# candidate using 1 failed to fit and a third of the grid was wasted.
params_dt2 = {'DecisionTree': [{
    'classifier__criterion':['gini', 'entropy'],
    'classifier__splitter':['best', 'random'],
    'classifier__max_depth':[5, 10, 15],
    'classifier__min_samples_split': [2, 10],
    'classifier__min_samples_leaf': [1, 3, 5],
}]}

grid_search(params_dt2, name='DecisionTree2')

Running... DecisionTree GridSearch
Time Started: Fri Apr 22 12:45:59 2022
Time Finished: Fri Apr 22 12:57:20 2022

Best cross validation score: 86.00%
Optimal parameters: {'classifier__criterion': 'entropy', 'classifier__max_depth': 5, 'classifier__min_samples_leaf': 5, 'classifier__min_samples_split': 2, 'classifier__splitter': 'best'}


In [16]:
# Second decision-tree grid, searched on the oversampled training data.
grid_search(
    params_dt2,
    name='DecisionTree2_o',
    X_train=X_train_res,
    y_train=y_train_res,
)

Running... DecisionTree GridSearch
Time Started: Fri Apr 22 12:57:20 2022
Time Finished: Fri Apr 22 13:22:56 2022

Best cross validation score: 89.05%
Optimal parameters: {'classifier__criterion': 'gini', 'classifier__max_depth': 15, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__splitter': 'best'}


In [17]:
# First decision-tree grid on the oversampled training data, ranked by precision.
grid_search(
    params_dt1,
    name='DecisionTree_op',
    scoring='precision',
    X_train=X_train_res,
    y_train=y_train_res,
)

Running... DecisionTree GridSearch
Time Started: Fri Apr 22 13:22:56 2022
Time Finished: Fri Apr 22 14:00:00 2022

Best cross validation score: 87.59%
Optimal parameters: {'classifier__criterion': 'entropy', 'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__splitter': 'best'}


In [18]:
# Keep the parameters chosen by the accuracy-scored run of the first decision-tree
# grid on the oversampled training data and persist them with IPython's %store magic
# (another session can restore them with `%store -r dt_best_params`).
dt_best_params = tuned_params['DecisionTree_o']
%store dt_best_params

Stored 'dt_best_params' (dict)


In [19]:
# First random-forest grid: forest size, split criterion, depth cap and the two
# minimum-sample limits.
params_rf1 = {
    'RandomForest': [
        {
            'classifier__n_estimators': [10, 30, 100],
            'classifier__criterion': ['gini', 'entropy'],
            'classifier__max_depth': [None, 1, 2, 5],
            'classifier__min_samples_split': [2, 3, 5],
            'classifier__min_samples_leaf': [1, 2, 5],
        },
    ],
}

grid_search(params_rf1, name='RandomForest')

Running... RandomForest GridSearch
Time Started: Fri Apr 22 14:00:00 2022
Time Finished: Fri Apr 22 14:33:43 2022

Best cross validation score: 86.47%
Optimal parameters: {'classifier__criterion': 'gini', 'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}


In [20]:
# First random-forest grid, searched on the oversampled training data.
grid_search(
    params_rf1,
    name='RandomForest_o',
    X_train=X_train_res,
    y_train=y_train_res,
)

Running... RandomForest GridSearch
Time Started: Fri Apr 22 14:33:44 2022
Time Finished: Fri Apr 22 15:54:23 2022

Best cross validation score: 95.88%
Optimal parameters: {'classifier__criterion': 'gini', 'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 30}


In [21]:
# Second random-forest grid (larger forests).
# min_samples_split no longer lists 1: as an integer it must be at least 2, so every
# candidate using 1 failed to fit and half of the grid was wasted.
params_rf2 = {'RandomForest': [{
    'classifier__n_estimators':[30, 100, 150],
    'classifier__criterion':['gini', 'entropy'],
    'classifier__max_depth':[None, 5, 10],
    'classifier__min_samples_split':[2],
    'classifier__min_samples_leaf':[1, 10]
}]}

grid_search(params_rf2, name='RandomForest2')

Running... RandomForest GridSearch
Time Started: Fri Apr 22 15:54:24 2022
Time Finished: Fri Apr 22 16:07:37 2022

Best cross validation score: 86.47%
Optimal parameters: {'classifier__criterion': 'gini', 'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}


In [22]:
# First random-forest grid on the oversampled training data, ranked by precision.
grid_search(
    params_rf1,
    name='RandomForest_op',
    scoring='precision',
    X_train=X_train_res,
    y_train=y_train_res,
)

Running... RandomForest GridSearch
Time Started: Fri Apr 22 16:07:37 2022
Time Finished: Fri Apr 22 17:18:38 2022

Best cross validation score: 93.11%
Optimal parameters: {'classifier__criterion': 'gini', 'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 30}


In [23]:
# Keep the parameters chosen by the precision-scored run of the first random-forest
# grid on the oversampled training data and persist them with IPython's %store magic
# (another session can restore them with `%store -r rf_best_params`).
rf_best_params = tuned_params['RandomForest_op']
%store rf_best_params

Stored 'rf_best_params' (dict)


In [24]:
# First extra-trees grid: ensemble size, split criterion, depth cap and the two
# minimum-sample limits.
params_et1 = {
    'ExtraTrees': [
        {
            'classifier__n_estimators': [10, 30, 100],
            'classifier__criterion': ['gini', 'entropy'],
            'classifier__max_depth': [None, 1, 2, 5],
            'classifier__min_samples_split': [2, 3, 5],
            'classifier__min_samples_leaf': [1, 2, 5],
        },
    ],
}

grid_search(params_et1, name='ExtraTrees')

Running... ExtraTrees GridSearch
Time Started: Fri Apr 22 17:18:38 2022
Time Finished: Fri Apr 22 17:58:38 2022

Best cross validation score: 86.57%
Optimal parameters: {'classifier__criterion': 'entropy', 'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}


In [25]:
# First extra-trees grid, searched on the oversampled training data.
grid_search(
    params_et1,
    name='ExtraTrees_o',
    X_train=X_train_res,
    y_train=y_train_res,
)

Running... ExtraTrees GridSearch
Time Started: Fri Apr 22 17:58:38 2022
Time Finished: Fri Apr 22 19:17:37 2022

Best cross validation score: 97.06%
Optimal parameters: {'classifier__criterion': 'gini', 'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 30}


In [26]:
# Second extra-trees grid.
params_et2 = {'ExtraTrees': [{
    'classifier__n_estimators':[30, 50, 100],
    'classifier__criterion':['gini', 'entropy'],
    'classifier__max_depth':[None, 3, 10],
    'classifier__min_samples_split':[2, 5, 10],
    'classifier__min_samples_leaf':[1, 3, 10]
}]}

# Saved as 'ExtraTrees2' (matching the other second-grid runs) so it does not
# overwrite the first grid's result stored under 'ExtraTrees' in tuned_params.
grid_search(params_et2, name='ExtraTrees2')

Running... ExtraTrees GridSearch
Time Started: Fri Apr 22 19:17:37 2022
Time Finished: Fri Apr 22 19:54:32 2022

Best cross validation score: 86.60%
Optimal parameters: {'classifier__criterion': 'entropy', 'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 100}


In [27]:
# Second extra-trees grid, searched on the oversampled training data.
grid_search(
    params_et2,
    name='ExtraTrees2_o',
    X_train=X_train_res,
    y_train=y_train_res,
)

Running... ExtraTrees GridSearch
Time Started: Fri Apr 22 19:54:32 2022
Time Finished: Fri Apr 22 21:06:25 2022

Best cross validation score: 97.06%
Optimal parameters: {'classifier__criterion': 'gini', 'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 30}


In [28]:
# Second extra-trees grid on the oversampled training data, ranked by precision.
grid_search(
    params_et2,
    name='ExtraTrees2_op',
    scoring='precision',
    X_train=X_train_res,
    y_train=y_train_res,
)

Running... ExtraTrees GridSearch
Time Started: Fri Apr 22 21:06:25 2022
Time Finished: Fri Apr 22 22:21:31 2022

Best cross validation score: 95.32%
Optimal parameters: {'classifier__criterion': 'gini', 'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 30}


In [29]:
# Keep the parameters chosen by the precision-scored run of the second extra-trees
# grid on the oversampled training data and persist them with IPython's %store magic
# (another session can restore them with `%store -r et_best_params`).
et_best_params = tuned_params['ExtraTrees2_op']
%store et_best_params

Stored 'et_best_params' (dict)


In [30]:
# First gradient-boosting grid: loss, number of stages, split criterion, depth cap
# and the two minimum-sample limits.
# NOTE(review): newer scikit-learn releases renamed the 'deviance' loss to 'log_loss';
# check the installed version before re-running.
params_gb1 = {
    'GradientBoost': [
        {
            'classifier__loss': ['deviance', 'exponential'],
            'classifier__n_estimators': [10, 30, 100],
            'classifier__criterion': ['friedman_mse', 'squared_error'],
            'classifier__max_depth': [None, 1, 2, 5],
            'classifier__min_samples_split': [2, 3, 5],
            'classifier__min_samples_leaf': [1, 2, 5],
        },
    ],
}

grid_search(params_gb1, name='GradientBoost')

Running... GradientBoost GridSearch
Time Started: Fri Apr 22 22:21:31 2022
Time Finished: Sat Apr 23 02:03:14 2022

Best cross validation score: 87.19%
Optimal parameters: {'classifier__criterion': 'friedman_mse', 'classifier__loss': 'exponential', 'classifier__max_depth': 5, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}


In [31]:
# First gradient-boosting grid, searched on the oversampled training data.
grid_search(
    params_gb1,
    name='GradientBoost_o',
    X_train=X_train_res,
    y_train=y_train_res,
)

Running... GradientBoost GridSearch
Time Started: Sat Apr 23 02:03:14 2022
Time Finished: Sat Apr 23 08:11:11 2022

Best cross validation score: 95.54%
Optimal parameters: {'classifier__criterion': 'friedman_mse', 'classifier__loss': 'deviance', 'classifier__max_depth': None, 'classifier__min_samples_leaf': 5, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}


In [32]:
# Second gradient-boosting grid (more boosting stages).
# min_samples_split no longer lists 1: as an integer it must be at least 2, so every
# candidate using 1 failed to fit and a third of this long-running grid was wasted.
# NOTE(review): newer scikit-learn releases renamed the 'deviance' loss to 'log_loss';
# check the installed version before re-running.
params_gb2 = {'GradientBoost': [{
    'classifier__loss': ['deviance', 'exponential'],
    'classifier__n_estimators':[100, 150, 300],
    'classifier__criterion':['friedman_mse'],
    'classifier__max_depth':[None, 3, 5],
    'classifier__min_samples_split':[2, 5],
    'classifier__min_samples_leaf':[1, 3, 5]
}]}

grid_search(params_gb2, name='GradientBoost2')

Running... GradientBoost GridSearch
Time Started: Sat Apr 23 09:00:18 2022
Time Finished: Sat Apr 23 18:14:55 2022

Best cross validation score: 87.19%
Optimal parameters: {'classifier__criterion': 'friedman_mse', 'classifier__loss': 'exponential', 'classifier__max_depth': 5, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}


In [33]:
# First gradient-boosting grid on the oversampled training data, ranked by precision.
grid_search(
    params_gb1,
    name='GradientBoost_op',
    scoring='precision',
    X_train=X_train_res,
    y_train=y_train_res,
)

Running... GradientBoost GridSearch
Time Started: Sat Apr 23 20:03:11 2022
Time Finished: Sun Apr 24 03:13:43 2022

Best cross validation score: 92.48%
Optimal parameters: {'classifier__criterion': 'friedman_mse', 'classifier__loss': 'deviance', 'classifier__max_depth': None, 'classifier__min_samples_leaf': 5, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}


In [34]:
# Keep the parameters chosen by the accuracy-scored run of the first gradient-boosting
# grid on the oversampled training data and persist them with IPython's %store magic
# (another session can restore them with `%store -r gb_best_params`).
gb_best_params = tuned_params['GradientBoost_o']
%store gb_best_params

Stored 'gb_best_params' (dict)


# Contact
Feel free to contact me with any questions, and connect with me on [LinkedIn](https://www.linkedin.com/in/czarinagluna).