In [1]:
# import necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import pickle

from sklearn.model_selection import GridSearchCV, StratifiedKFold

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
# get training and test data
X_train = pd.read_csv('X_train.csv')
X_train.index = X_train['Unnamed: 0']
X_train = X_train.drop(['Unnamed: 0'], axis=1)
X_train.index.name = ''

y_train = pd.read_csv('y_train.csv')
y_train.index = y_train['Unnamed: 0']
y_train = y_train.drop(['Unnamed: 0'], axis=1)
y_train.index.name = ''

X_test = pd.read_csv('X_test.csv')
X_test.index = X_test['Unnamed: 0']
X_test = X_test.drop(['Unnamed: 0'], axis=1)
X_test.index.name = ''

y_test = pd.read_csv('y_test.csv')
y_test.index = y_test['Unnamed: 0']
y_test = y_test.drop(['Unnamed: 0'], axis=1)
y_test.index.name = ''

I selected 5 models including: Random Forest, Gradient Boosting, Bagging with XGBoost, Bagging with Decision Tree and Adaptive Boosting with Decision Tree. Then I utilized grid search with StratifiedKFold cross validation to tune model and find the best hyperparameters. I recorded best hyperparameters and the corresponding score (AUC of the ROC curve) for each model type to compare in the next model selection process.

## Model 1: Random Forest

First, we used Random Forest. This is a tree-based ensemble model, and could avoid the results that have low bias but high variance. However, Random Forest requires a large number of predictors to choose from at each split, and also it would incur heavy computation. The best Random Forest model is `RandomForestClassifier(criterion='entropy', max_depth=10, n_estimators=70)` and the corresponding AUC is 0.836.

In [3]:
# perform grid search with Random Forest
pg_rf = {'n_estimators': [50,60,70,80,90],
         'criterion': ['gini','entropy'],
         'max_depth':[10,20,30,40,50]}
model_rf = GridSearchCV(
           RandomForestClassifier(), 
           pg_rf, 
           cv=StratifiedKFold(3, shuffle=True, random_state=1234), 
           scoring='roc_auc', 
           verbose=5,
           n_jobs=-1).fit(
           X_train,y_train)  

Fitting 3 folds for each of 50 candidates, totalling 150 fits


In [4]:
# record best estimator and best score
print(model_rf.best_estimator_)
print(model_rf.best_params_)
print(model_rf.best_score_)

RandomForestClassifier(criterion='entropy', max_depth=10, n_estimators=70)
{'criterion': 'entropy', 'max_depth': 10, 'n_estimators': 70}
0.8360940603417045


## Model 2: Gradient Boosting

Second, I used Gradient Boosting. This is a very robust algorithm combining Gradient descent and Boosting. The word ‘gradient’ implies that we can have two or more derivatives of the same function. Gradient Boosting has three main components: additive model, loss function and a weak learner. The best Gradient Boosting model is `GradientBoostingClassifier(max_depth=5, n_estimators=100)` and the corresponding AUC is 0.838.

In [5]:
# perform grid search with Gradient Boosting
pg_gb = {'n_estimators': [100,300,500,800,1200],
         'max_depth': [None,5,10,15]}
model_gb = GridSearchCV(
           GradientBoostingClassifier(), 
           pg_gb, 
           cv=StratifiedKFold(3, shuffle=True, random_state=1234), 
           scoring='roc_auc', 
           verbose=5,
           n_jobs=-1).fit(
           X_train,y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [6]:
# record best estimator and best score
print(model_gb.best_estimator_)
print(model_gb.best_params_)
print(model_gb.best_score_)

GradientBoostingClassifier(max_depth=5)
{'max_depth': 5, 'n_estimators': 100}
0.8376538895981197


## Model 3: Bagging with XGBoost

Next, I used Bagging method with the XGBoost classifier as the base estimator. This is an alternative of Random Forest with a lighter computational pressure. A bagged tree is constructed in a similar fashion to Random Forest. The primary difference is that in a bagged model, all attributes are evaluated at each split in each tree. The best Bagging with XGBoost model is `BaggingClassifier(base_estimator=XGBClassifier(eval_metric='mlogloss', gamma=0, max_depth=5, min_child_weight=3), max_features=9, n_estimators=100)` and the corresponding AUC is 0.830.

In [16]:
# perform grid search with Bagging with XGBoost
pg_xgb = {'n_estimators': [100,300,500],
          'max_features': [1,5,7,9]}
model_xgb = GridSearchCV(
            BaggingClassifier(base_estimator=XGBClassifier(eval_metric='mlogloss',
                                                           gamma=0,
                                                           max_depth=5,
                                                           min_child_weight=3),
                                                           random_state=1234),
            pg_xgb, 
            cv=StratifiedKFold(3, shuffle=True, random_state=1234), 
            scoring='roc_auc', 
            verbose=5,
            n_jobs=-1).fit(
            X_train,y_train) 

Fitting 3 folds for each of 12 candidates, totalling 36 fits


In [17]:
# record best estimator and best score
print(model_xgb.best_estimator_)
print(model_xgb.best_params_)
print(model_xgb.best_score_)

BaggingClassifier(base_estimator=XGBClassifier(base_score=None, booster=None,
                                               colsample_bylevel=None,
                                               colsample_bynode=None,
                                               colsample_bytree=None,
                                               enable_categorical=False,
                                               eval_metric='mlogloss', gamma=0,
                                               gpu_id=None,
                                               importance_type=None,
                                               interaction_constraints=None,
                                               learning_rate=None,
                                               max_delta_step=None, max_depth=5,
                                               min_child_weight=3, missing=nan,
                                               monotone_constraints=None,
                                               n_e

## Model 4: Bagging with Decision Tree

After that, I utilized the Bagging classifier with classic Decision Tree model as the base estimator. This is a reasonable choice for our binary classification problem since the interpretation of a tree model is relatively transparent and straightforward. The best Bagging with Decision Tree model is `BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=5), max_features=15, n_estimators=1200)` and the corresponding AUC is 0.834.

In [9]:
# perform grid search with Bagging with Decision Tree
pg_dt = {'n_estimators':[100,120,200,300,500,800,1200],
         'max_features': [1,3,5,7,9,12,15,17,25]}
model_dt = GridSearchCV(
           BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=5),random_state=1234), 
           pg_dt, 
           cv=StratifiedKFold(3, shuffle=True, random_state=1234), 
           scoring='roc_auc', 
           verbose=5,
           n_jobs=-1).fit(
           X_train,y_train) 

Fitting 3 folds for each of 63 candidates, totalling 189 fits


In [10]:
# record best estimator and best score
print(model_dt.best_estimator_)
print(model_dt.best_params_)
print(model_dt.best_score_)

BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=5),
                  max_features=15, n_estimators=1200, random_state=1234)
{'max_features': 15, 'n_estimators': 1200}
0.8337653522569836


## Model 5: Adaptive Boosting with Decision Tree

Finally, I used Adaptive Boosting with Decision Tree as the base estimator. This ensemble method automatically adjusts its parameters to the data based on the actual performance in the current iteration. Both the weights for re-weighting the data and the weights for the final aggregation are re-computed iteratively. This method usually leads to an improved performance compared to the classification by one tree or other single base-learner. The best Adaptive Boosting with Decision Tree model is `AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=5),learning_rate=0.01, n_estimators=300)` and the corresponding AUC is 0.837.

In [11]:
# perform grid search with Adaptive Boosting with Decision Tree
pg_ada = {'n_estimators': [100,120,200,300,500,800,1200],
          'learning_rate': [1e-4,1e-2,1e0,1e1,1e2]}
model_ada = GridSearchCV(
            AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=5),random_state=1234), 
            pg_ada, 
            cv=StratifiedKFold(3, shuffle=True, random_state=1234), 
            scoring='roc_auc', 
            verbose=5,
            n_jobs=-1).fit(
            X_train,y_train) 

Fitting 3 folds for each of 35 candidates, totalling 105 fits


In [12]:
# record best estimator and best score
print(model_ada.best_estimator_)
print(model_ada.best_params_)
print(model_ada.best_score_)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=5),
                   learning_rate=0.01, n_estimators=300, random_state=1234)
{'learning_rate': 0.01, 'n_estimators': 300}
0.8368010736998991


I created a dataframe and compared AUC of each model type from `StratifiedKFold` cross validation in the above process. The model performance result is: Gradient Boosting (0.8377) > Adaptive Boosting with Decision Tree (0.8368) > Random Forest (0.8361) > Bagging with Decision Tree (0.8338) > Bagging with XGBoost (0.8299). However, the ranking based on best score from cross validation might change slightly in each running of the code. Therefore, I pickled all 5 tuned models out and compared them on the same validation set.

In [18]:
# compare best score
data = [[model_rf.best_score_],[model_gb.best_score_],[model_xgb.best_score_], 
        [model_dt.best_score_],[model_ada.best_score_]]
df_compare = pd.DataFrame(data, columns=['Best Score (AUC)'],
                          index=['Random Forest','Gradient Boosting','Bagging with XGBoost',
                                 'Bagging with Decision Tree','Adaptive Boosting with Decision Tree'])
df_compare

Unnamed: 0,Best Score (AUC)
Random Forest,0.836094
Gradient Boosting,0.837654
Bagging with XGBoost,0.829905
Bagging with Decision Tree,0.833765
Adaptive Boosting with Decision Tree,0.836801


In [19]:
# pickle tuned models out
with open('model_rf.pickle', 'wb') as file:
    pickle.dump(model_rf.best_estimator_, file)
with open('model_gb.pickle', 'wb') as file:
    pickle.dump(model_gb.best_estimator_, file)
with open('model_xgb.pickle', 'wb') as file:
    pickle.dump(model_xgb.best_estimator_, file)
with open('model_dt.pickle', 'wb') as file:
    pickle.dump(model_dt.best_estimator_, file)
with open('model_ada.pickle', 'wb') as file:
    pickle.dump(model_ada.best_estimator_, file)