<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Baseline-model" data-toc-modified-id="Baseline-model-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Baseline model</a></span></li><li><span><a href="#Tuning-the-parameters" data-toc-modified-id="Tuning-the-parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Tuning the parameters</a></span></li></ul></div>

In [48]:
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
raw_data = pd.read_csv((
    "https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
    "master/ESS_practice_data/ESSdata_Thinkful.csv")).dropna()

raw_data.head()

Unnamed: 0,cntry,idno,year,tvtot,ppltrst,pplfair,pplhlp,happy,sclmeet,sclact,gndr,agea,partner
0,CH,5.0,6,3.0,3.0,10.0,5.0,8.0,5.0,4.0,2.0,60.0,1.0
1,CH,25.0,6,6.0,5.0,7.0,5.0,9.0,3.0,2.0,2.0,59.0,1.0
2,CH,26.0,6,1.0,8.0,8.0,8.0,7.0,6.0,3.0,1.0,24.0,2.0
3,CH,28.0,6,4.0,6.0,6.0,7.0,10.0,6.0,2.0,2.0,64.0,1.0
4,CH,29.0,6,5.0,6.0,7.0,5.0,8.0,7.0,2.0,2.0,55.0,1.0


In [35]:
from sklearn.model_selection import train_test_split, cross_val_score

Y = raw_data['partner'] - 1

X = raw_data.loc[:, ~raw_data.columns.isin(['partner'])]
X = pd.concat([X, pd.get_dummies(raw_data['cntry'])], axis = 1)
X.drop(columns = ['cntry', 'idno', 'year'], inplace = True)


X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 13, stratify = Y)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((5702, 15), (2445, 15), (5702,), (2445,))

## Baseline model

In [49]:
clf = GradientBoostingClassifier(loss = 'deviance')
clf.fit(X_train, Y_train)

baseline_scores = cross_val_score(clf, X_train, Y_train, cv = 10)
np.mean(baseline_scores), baseline_scores

(0.7541149721940578,
 array([0.7793345 , 0.76532399, 0.7754386 , 0.74385965, 0.75087719,
        0.7245614 , 0.76666667, 0.75789474, 0.74561404, 0.73157895]))

In [50]:
from sklearn.metrics import confusion_matrix, classification_report

confusion_matrix(clf.predict(X_test), Y_test, normalize='all')

array([[0.54764826, 0.18527607],
       [0.06748466, 0.199591  ]])

In [51]:
print(classification_report(clf.predict(X_test), Y_test))

              precision    recall  f1-score   support

         0.0       0.89      0.75      0.81      1792
         1.0       0.52      0.75      0.61       653

    accuracy                           0.75      2445
   macro avg       0.70      0.75      0.71      2445
weighted avg       0.79      0.75      0.76      2445



In [52]:
feature_imp = dict(zip(X.columns, clf.feature_importances_))
sorted(feature_imp.items(), key = lambda x: x[1], reverse=True)

[('agea', 0.7143541367440654),
 ('happy', 0.120345874533066),
 ('sclmeet', 0.040090510900615606),
 ('tvtot', 0.02310324559318136),
 ('gndr', 0.02166964125831519),
 ('sclact', 0.016779868766134742),
 ('ES', 0.016161566273129813),
 ('pplfair', 0.013003443814382733),
 ('pplhlp', 0.009353250938840476),
 ('ppltrst', 0.008209275399738123),
 ('NO', 0.00641751708168316),
 ('CZ', 0.004837202095715181),
 ('SE', 0.0035833330329723186),
 ('CH', 0.0016667345510890756),
 ('DE', 0.0004243990170709372)]

## Tuning the parameters

We start by creating a grid.

In [54]:
from sklearn.model_selection import GridSearchCV

param_grid = {'loss' : ['deviance', 'exponential'], 
              'learning_rate': [0.01, 0.1, 1, 10], 
              'n_estimators': [10, 100, 1000, 2500],
              'min_samples_split': [2, 4, 8, 16], 
              'max_features': [2, 4, 6],
              'subsample': [0.01, 0.1, 0.5, 1, 2, 5]}
              

grid = GridSearchCV(GradientBoostingClassifier(), param_grid, refit=True, verbose=1, n_jobs=-1)

# fitting the model for grid search
grid.fit(X_train, Y_train)

Fitting 5 folds for each of 2304 candidates, totalling 11520 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 236 tasks      | elapsed:   31.7s
[Parallel(n_jobs=-1)]: Done 486 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 836 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 1286 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 1836 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 2486 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 3236 tasks      | elapsed:  9.4min
[Parallel(n_jobs=-1)]: Done 4086 tasks      | elapsed: 11.8min
[Parallel(n_jobs=-1)]: Done 5036 tasks      | elapsed: 14.4min
[Parallel(n_jobs=-1)]: Done 6086 tasks      | elapsed: 17.3min
[Parallel(n_jobs=-1)]: Done 7236 tasks      | elapsed: 20.4min
[Parallel(n_jobs=-1)]: Done 8486 tasks      | elapsed: 23.8min
[Parallel(n_jobs=-1)]: Done 9836 tasks      | elapsed: 27.4min
[Parallel(n_jobs=-1)]: Done 11286 tasks      

GridSearchCV(cv=None, error_score=nan,
             estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                  criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_n...
                 

In [55]:
print(grid.best_estimator_) 

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.01, loss='exponential', max_depth=3,
                           max_features=4, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=1000,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=0.1, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)


In [56]:
print(classification_report(Y_test, grid.predict(X_test)))

              precision    recall  f1-score   support

         0.0       0.75      0.90      0.82      1504
         1.0       0.77      0.51      0.61       941

    accuracy                           0.75      2445
   macro avg       0.76      0.71      0.72      2445
weighted avg       0.75      0.75      0.74      2445

