# Modelling

In [1]:
# Model Check
# Oversample Minority
# shuffle predictors and targets such that predictors point to new targets
# Use dataset without smoteen & tomek

In [4]:
import pandas as pd
pd.set_option('max_colwidth',100)
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import seaborn as sns
from sklearn import preprocessing as pp
import pickle
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

  return f(*args, **kwds)


In [5]:
# Logistic Regression
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier
from sklearn.model_selection import cross_val_predict, GridSearchCV, train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

  return f(*args, **kwds)


In [6]:
X_st_df = pd.read_pickle('./couple_data_predictors')
y_st_df = pd.read_pickle('./couple_data_target')

In [7]:
# Scale the resampled features
Xs_st = StandardScaler().fit_transform(X_st_df)
y_st = y_st_df.values.ravel()

In [8]:
# Training and Test set
X_train, X_test, y_train, y_test = train_test_split(Xs_st, y_st, test_size=0.33, random_state=42)

### GridSearch Logistic Regression

In [13]:
# Gridsearch for Ridge and Lasso Logistic Regression, optimize C

parameters = {
    'penalty':['l1','l2'],
    'solver':['liblinear'],
    'C':np.logspace(-5,0,100)
}

print ("GRID SEARCH:")
lr_grid_search = GridSearchCV(LogisticRegression(), parameters, cv=10, verbose=0)
lr_grid_search.fit(X_train, y_train)
print ("Best parameters set:")
print('f1 score:', lr_grid_search.best_score_)
lr_best_parameters = lr_grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print ("\t%s: %r" % (param_name, lr_best_parameters[param_name]))
print ("-----------------------------------------")
print ("-----------------------------------------")
print ("GRIDSEARCHCV LOGREG RESULT:")
clf = lr_grid_search.best_estimator_
lr_gs_predicted = clf.predict(X_test)
print(metrics.classification_report(y_test, lr_gs_predicted, labels=[1,0], target_names=['break up','still together']))

GRID SEARCH:
Best parameters set:
f1 score: 0.871253405995
	C: 0.019179102616724886
	penalty: 'l2'
	solver: 'liblinear'
-----------------------------------------
-----------------------------------------
GRIDSEARCHCV LOGREG RESULT:
                precision    recall  f1-score   support

      break up       0.88      0.84      0.86       367
still together       0.84      0.88      0.86       357

   avg / total       0.86      0.86      0.86       724



In [14]:
# Get top 10 coefficients
lr_gs_coef = pd.DataFrame({
    'coef':clf.coef_.ravel(),
    'mag':np.abs(clf.coef_.ravel()),
    'pred':X_st_df.columns
})
lr_gs_coef.sort_values(by=['mag'], ascending=False, inplace=True)
lr_gs_coef.head(10)

Unnamed: 0,coef,mag,pred
33,-0.660917,0.660917,coresident[T.Yes]
31,-0.482298,0.482298,married[T.married]
32,-0.404847,0.404847,parental_approval[T.approve]
9,-0.345266,0.345266,parent_alive[T.neither father nor mother are alive]
43,-0.296333,0.296333,couple_relig_comb[T.Protestant or oth Christian_Protestant or oth Christian]
52,-0.268042,0.268042,how_long_relationship
50,-0.255254,0.255254,how_long_ago_first_romantic
18,-0.233371,0.233371,q24_church[T.Yes]
51,-0.227258,0.227258,how_long_ago_first_cohab
30,-0.205329,0.205329,met_through_as_neighbors[T.met through or as neighbors]


### GridSearch SGDClassifier

In [15]:
# Gridsearch SGDclassifier with log loss and optimal learning rate

sgd_parameters = {
    'learning_rate': ['optimal'],
    'loss':['log','hinge'],
    'penalty': ['l1','l2','elasticnet'],
    'alpha': np.logspace(-10,5,100),
    'l1_ratio': np.logspace(-1,0,20)
}

print ("GRID SEARCH:")
sgd_grid_search = GridSearchCV(SGDClassifier(max_iter=10000, tol=0.0001), sgd_parameters, cv=10, verbose=0)
sgd_grid_search.fit(X_train, y_train)
print ("Best parameters set:")
print('f1 score:', sgd_grid_search.best_score_)
sgd_best_parameters = sgd_grid_search.best_estimator_.get_params()
for param_name in sorted(sgd_parameters.keys()):
    print ("\t%s: %r" % (param_name, sgd_best_parameters[param_name]))
print ("-----------------------------------------")
print ("-----------------------------------------")
print ("GRIDSEARCHCV SGDCLASSIFIER RESULT:")
sgd_clf = sgd_grid_search.best_estimator_
sgd_predicted = sgd_clf.predict(X_test)
print(metrics.classification_report(y_test, sgd_predicted, labels=[1,0], target_names=['break up','still together']))

GRID SEARCH:
Best parameters set:
f1 score: 0.873978201635
	alpha: 0.0053366992312063122
	l1_ratio: 0.10000000000000001
	learning_rate: 'optimal'
	loss: 'log'
	penalty: 'elasticnet'
-----------------------------------------
-----------------------------------------
GRIDSEARCHCV SGDCLASSIFIER RESULT:
                precision    recall  f1-score   support

      break up       0.88      0.84      0.86       367
still together       0.84      0.88      0.86       357

   avg / total       0.86      0.86      0.86       724



In [16]:
# Get top 10 coefficients
sgd_coef = pd.DataFrame({
    'coef':sgd_clf.coef_.ravel(),
    'mag':np.abs(sgd_clf.coef_.ravel()),
    'pred':X_st_df.columns
})
sgd_coef.sort_values(by=['mag'], ascending=False, inplace=True)
sgd_coef.head(10)

Unnamed: 0,coef,mag,pred
33,-0.984743,0.984743,coresident[T.Yes]
32,-0.59515,0.59515,parental_approval[T.approve]
9,-0.563936,0.563936,parent_alive[T.neither father nor mother are alive]
31,-0.553029,0.553029,married[T.married]
51,-0.516062,0.516062,how_long_ago_first_cohab
43,-0.460647,0.460647,couple_relig_comb[T.Protestant or oth Christian_Protestant or oth Christian]
37,-0.377442,0.377442,couple_politic_view_comb[T.republican_other]
18,-0.335849,0.335849,q24_church[T.Yes]
52,-0.326403,0.326403,how_long_relationship
56,-0.30194,0.30194,partner_yrsed


### GridSearch XGBoost Classifier

In [17]:
# Reference link: https://www.kaggle.com/phunter/xgboost-with-gridsearchcv
# Credit to Shize's R code and the python re-implementation

xgb_model = xgb.XGBClassifier()

#brute force scan for all parameters, here are the tricks
#usually max_depth is 6,7,8
#learning rate is around 0.05, but small changes may make big diff
#tuning min_child_weight subsample colsample_bytree can have 
#much fun of fighting against overfit 
#n_estimators is how many round of boosting
#finally, ensemble xgboost with multiple seeds may reduce variance
parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['binary:logistic'],
              'learning_rate': [0.05], #so called `eta` value
              'max_depth': [6,7,8],
              'min_child_weight': [11],
              'silent': [1],
              'subsample': [0.8],
              'colsample_bytree': [0.7],
              'n_estimators': [1000], #number of trees, change it to 1000 for better results
              'missing':[-999],
              'seed': [65]}

xgb_clf = GridSearchCV(xgb_model, parameters, n_jobs=5, scoring='f1', verbose=0, refit=True, cv=10)

xgb_clf.fit(X_train, y_train)

best_parameters, score, _ = max(xgb_clf.grid_scores_, key=lambda x: x[1])
print('f1 score:', score)
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))
    
print ("-----------------------------------------")
print ("-----------------------------------------")
print ("GRIDSEARCHCV XGBCLASSIFIER RESULT:")
xgb_predicted = xgb_clf.best_estimator_.predict(X_test)
print(metrics.classification_report(y_test, xgb_predicted, labels=[1,0], target_names=['break up','still together']))

f1 score: 0.877142802118
colsample_bytree: 0.7
learning_rate: 0.05
max_depth: 6
min_child_weight: 11
missing: -999
n_estimators: 1000
nthread: 4
objective: 'binary:logistic'
seed: 65
silent: 1
subsample: 0.8
-----------------------------------------
-----------------------------------------
GRIDSEARCHCV XGBCLASSIFIER RESULT:
                precision    recall  f1-score   support

      break up       0.88      0.86      0.87       367
still together       0.86      0.88      0.87       357

   avg / total       0.87      0.87      0.87       724





In [18]:
# Get top 10 coefficients
xgb_featimpt = pd.DataFrame({
    'importance':xgb_clf.best_estimator_.feature_importances_,
    'pred':X_st_df.columns
})
xgb_featimpt = xgb_featimpt[xgb_featimpt.importance > 0]
xgb_featimpt.sort_values(by=['importance'], ascending=False, inplace=True)
xgb_featimpt.head(10)

Unnamed: 0,importance,pred
48,0.106005,distancemoved_10mi
50,0.088425,how_long_ago_first_romantic
49,0.08651,how_long_ago_first_met
53,0.083203,age_difference
51,0.075544,how_long_ago_first_cohab
56,0.063011,partner_yrsed
58,0.055875,respondent_mom_yrsed
55,0.047346,respondent_yrsed
52,0.042646,how_long_relationship
32,0.039164,parental_approval[T.approve]


In [19]:
print(metrics.accuracy_score(y_test, xgb_predicted))

0.867403314917


In [20]:
xgb_featimpt = pd.DataFrame({
    'importance':xgb_clf.best_estimator_.feature_importances_,
    'pred':X_st_df.columns
})
xgb_featimpt[xgb_featimpt.importance <= 0]

Unnamed: 0,importance,pred
0,0.0,Intercept
6,0.0,both_parents_knew_before_met[T.yes]
7,0.0,parent_alive[T.father only]
11,0.0,fam_intro_partner[T.yes]
13,0.0,colleague_intro_partner[T.yes]
16,0.0,q24_school[T.Yes]
17,0.0,q24_college[T.Yes]
18,0.0,q24_church[T.Yes]
19,0.0,q24_vol_org[T.Yes]
20,0.0,q24_customer[T.Yes]


In [9]:
# Try TPOT
from tpot import TPOTClassifier

In [10]:
pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,
                                    random_state=42, verbosity=2, n_jobs=-1)

  return f(*args, **kwds)


In [11]:
pipeline_optimizer.fit(X_train, y_train)
print(pipeline_optimizer.score(X_test, y_test))
# pipeline_optimizer.export('tpot_exported_pipeline.py')

Optimization Progress:  33%|███▎      | 40/120 [00:48<01:21,  1.02s/pipeline]

Generation 1 - Current best internal CV score: 0.8699235858727054


Optimization Progress:  50%|█████     | 60/120 [01:05<00:39,  1.52pipeline/s]

Generation 2 - Current best internal CV score: 0.8712841300903923


Optimization Progress:  67%|██████▋   | 80/120 [01:32<01:45,  2.64s/pipeline]

Generation 3 - Current best internal CV score: 0.8712841300903923


Optimization Progress:  83%|████████▎ | 100/120 [02:18<00:48,  2.41s/pipeline]

Generation 4 - Current best internal CV score: 0.8746948094306216


                                                                              

Generation 5 - Current best internal CV score: 0.8746948094306216

Best pipeline: GradientBoostingClassifier(CombineDFs(input_matrix, input_matrix), learning_rate=0.1, max_depth=6, max_features=0.85, min_samples_leaf=1, min_samples_split=13, n_estimators=100, subsample=0.6)
0.875690607735


In [14]:
yhat = pipeline_optimizer.predict(X_test)
print(metrics.classification_report(y_test, yhat, labels=[1,0], target_names=['break up','still together']))

                precision    recall  f1-score   support

      break up       0.88      0.88      0.88       367
still together       0.87      0.87      0.87       357

   avg / total       0.88      0.88      0.88       724



### Best Models from TPOT
> * Gradient Boosting Classifier - **precision: 0.88** | **recall: 0.88** | **f1-score: 0.88**
* Extra Trees Classifier - **precision: 0.87** | **recall: 0.87** | **f1-score: 0.87**

In [15]:
extratreesclassifier = pipeline_optimizer.fitted_pipeline_.steps[-1][1]

In [17]:
pipeline_optimizer.fitted_pipeline_

Pipeline(memory=None,
     steps=[('featureunion', FeatureUnion(n_jobs=1,
       transformer_list=[('functiontransformer-1', FunctionTransformer(accept_sparse=False,
          func=<function copy at 0x7ff6406b57b8>, inv_kw_args=None,
          inverse_func=None, kw_args=None, pass_y='deprecated',
          validate=True)), ('...auto', random_state=None,
              subsample=0.6000000000000001, verbose=0, warm_start=False))])

In [16]:
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
dot_data = StringIO()
export_graphviz(extratreesclassifier, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())

NotFittedError: This GradientBoostingClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.