In [1]:
# Model Check
# Oversample Minority
# Shuffle Predictors and Targets such that predictors point to new targets
# Use dataset without Smoteen & Tomek

In [1]:
import pandas as pd
import numpy as np
pd.set_option('max_colwidth',100)
import matplotlib.pyplot as plt
import scipy as sp
import seaborn as sns
from sklearn import preprocessing as pp
import pickle
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier
from sklearn.model_selection import cross_val_predict, GridSearchCV, train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

In [3]:
# pred = pd.read_pickle('./couple_data_without_resample_lasso_feature_selection_predictors')
pred = pd.read_pickle('./couple_data_without_resample_lasso_feature_selection_predictors')
target = pd.read_pickle('./couple_data_without_resample_target')

In [4]:
X_train, X_test, y_train, y_test = train_test_split(pred.values, target.values.ravel(), test_size=0.3, random_state=42)

### GridSearch Logistic Regression

In [13]:
# Gridsearch for Ridge and Lasso Logistic Regression, optimize C

parameters = {
    'penalty':['l1','l2'],
    'solver':['liblinear'],
    'C':np.logspace(-5,0,100)
}

print ("GRID SEARCH:")
lr_grid_search = GridSearchCV(LogisticRegression(), parameters, cv=5, verbose=1, n_jobs=-1, scoring='roc_auc')
lr_grid_search.fit(X_train, y_train)
print ("Best parameters set:")
lr_best_parameters = lr_grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print ("\t%s: %r" % (param_name, lr_best_parameters[param_name]))
print ("-----------------------------------------")
print ("-----------------------------------------")
print ("GRIDSEARCHCV LOGREG RESULT:")
clf = lr_grid_search.best_estimator_
lr_gs_predicted = clf.predict(X_test)
print(metrics.classification_report(y_test, lr_gs_predicted, labels=[1,0], target_names=['break up','still together']))

GRID SEARCH:
Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Best parameters set:
	C: 0.021544346900318846
	penalty: 'l2'
	solver: 'liblinear'
-----------------------------------------
-----------------------------------------
GRIDSEARCHCV LOGREG RESULT:
                precision    recall  f1-score   support

      break up       0.71      0.63      0.67       153
still together       0.84      0.88      0.86       342

   avg / total       0.80      0.81      0.80       495



[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    1.8s finished


In [14]:
# Get top 10 coefficients
lr_gs_coef = pd.DataFrame({
    'coef':clf.coef_.ravel(),
    'mag':np.abs(clf.coef_.ravel()),
    'pred':pred.columns
})
lr_gs_coef.sort_values(by=['mag'], ascending=False, inplace=True)

In [15]:
# Lasso LogReg factors that contribute to couple breakup
lr_gs_coef[lr_gs_coef.coef > 0]

Unnamed: 0,coef,mag,pred
12,0.170742,0.170742,couple_race_comb[T.other_other]
10,0.141614,0.141614,couple_relig_comb[T.Protestant or oth Christian_other]
0,0.126065,0.126065,hhinc[T.low_hhinc]
15,0.107548,0.107548,marriage_count_gender[T.married before_male]
11,0.070987,0.070987,couple_race_comb[T.NH white_other]
13,0.069624,0.069624,gender_internet_access[T.yes_female]
6,0.057908,0.057908,met_through_friends[T.meet through friends]
1,0.040186,0.040186,parent_alive[T.father only]


In [16]:
# Race in Gender Race can either be white or non-white

In [17]:
# Lasso LogReg factors that contribute to couple staying together
lr_gs_coef[lr_gs_coef.coef < 0]

Unnamed: 0,coef,mag,pred
9,-0.674038,0.674038,coresident[T.Yes]
7,-0.35986,0.35986,married[T.married]
19,-0.335256,0.335256,how_long_relationship
8,-0.282591,0.282591,parental_approval[T.approve]
3,-0.25741,0.25741,parent_alive[T.neither father nor mother are alive]
18,-0.238178,0.238178,how_long_ago_first_cohab
17,-0.148573,0.148573,relatives_seen_per_month
14,-0.107532,0.107532,hhhead_gender[T.yes_male]
2,-0.097515,0.097515,parent_alive[T.mother only]
4,-0.065731,0.065731,met_partner_work[T.yes]


### GridSearch SGDClassifier

In [20]:
# Gridsearch SGDclassifier with log loss and optimal learning rate
sgd_parameters = {
    'learning_rate': ['optimal'],
    'loss':['log','hinge'],
    'penalty': ['l1','l2','elasticnet'],
    'alpha': np.logspace(-10,5,100),
    'l1_ratio': np.logspace(-1,0,20)
}

print ("GRID SEARCH:")
sgd_grid_search = GridSearchCV(SGDClassifier(max_iter=10000, tol=0.0001), sgd_parameters, cv=5, verbose=1, scoring='roc_auc')
sgd_grid_search.fit(X_train, y_train)
print ("Best parameters set:")
sgd_best_parameters = sgd_grid_search.best_estimator_.get_params()
for param_name in sorted(sgd_parameters.keys()):
    print ("\t%s: %r" % (param_name, sgd_best_parameters[param_name]))
print ("-----------------------------------------")
print ("-----------------------------------------")
print ("GRIDSEARCHCV SGDCLASSIFIER RESULT:")
sgd_clf = sgd_grid_search.best_estimator_
sgd_predicted = sgd_clf.predict(X_test)
print(metrics.classification_report(y_test, sgd_predicted, labels=[1,0], target_names=['break up','still together']))

GRID SEARCH:
Fitting 5 folds for each of 12000 candidates, totalling 60000 fits


[Parallel(n_jobs=1)]: Done 60000 out of 60000 | elapsed:  4.3min finished


Best parameters set:
	alpha: 0.043287612810830614
	l1_ratio: 0.14384498882876628
	learning_rate: 'optimal'
	loss: 'hinge'
	penalty: 'l2'
-----------------------------------------
-----------------------------------------
GRIDSEARCHCV SGDCLASSIFIER RESULT:
                precision    recall  f1-score   support

      break up       0.72      0.61      0.66       153
still together       0.84      0.89      0.86       342

   avg / total       0.80      0.81      0.80       495



In [21]:
sgd_coef = pd.DataFrame({
    'coef':sgd_clf.coef_.ravel(),
    'mag':np.abs(sgd_clf.coef_.ravel()),
    'pred':pred.columns
})
sgd_coef.sort_values(by=['mag'], ascending=False, inplace=True)

In [22]:
# SGD factors that contribute to couple breakup
sgd_coef[sgd_coef.coef > 0]

Unnamed: 0,coef,mag,pred
16,0.056738,0.056738,marriage_count_gender[T.never married_male]
15,0.053156,0.053156,marriage_count_gender[T.married before_male]
13,0.049358,0.049358,gender_internet_access[T.yes_female]
12,0.046371,0.046371,couple_race_comb[T.other_other]
11,0.024308,0.024308,couple_race_comb[T.NH white_other]
0,0.016767,0.016767,hhinc[T.low_hhinc]
1,0.014774,0.014774,parent_alive[T.father only]
10,0.009392,0.009392,couple_relig_comb[T.Protestant or oth Christian_other]
6,0.00752,0.00752,met_through_friends[T.meet through friends]
5,0.00338,0.00338,self_intro_partner[T.yes]


In [23]:
# SGD factors that contribute to couple staying together
sgd_coef[sgd_coef.coef < 0]

Unnamed: 0,coef,mag,pred
9,-0.822612,0.822612,coresident[T.Yes]
19,-0.151826,0.151826,how_long_relationship
7,-0.109738,0.109738,married[T.married]
3,-0.104791,0.104791,parent_alive[T.neither father nor mother are alive]
8,-0.104656,0.104656,parental_approval[T.approve]
17,-0.051492,0.051492,relatives_seen_per_month
2,-0.035884,0.035884,parent_alive[T.mother only]
18,-0.034176,0.034176,how_long_ago_first_cohab
14,-0.031736,0.031736,hhhead_gender[T.yes_male]
4,-0.006187,0.006187,met_partner_work[T.yes]


In [33]:
np.min(np.logspace(-5,0,10))

1.0000000000000001e-05

### GridSearch XGBoost Classifier

In [25]:
# Reference link: https://www.kaggle.com/phunter/xgboost-with-gridsearchcv
# Credit to Shize's R code and the python re-implementation

xgb_model = xgb.XGBClassifier()

#brute force scan for all parameters, here are the tricks
#usually max_depth is 6,7,8
#learning rate is around 0.05, but small changes may make big diff
#tuning min_child_weight subsample colsample_bytree can have 
#much fun of fighting against overfit 
#n_estimators is how many round of boosting
#finally, ensemble xgboost with multiple seeds may reduce variance
parameters = {'nthread': [4], #when use hyperthread, xgboost may become slower
              'objective': ['binary:logistic'],
              'learning_rate': [0.05], #so called `eta` value
              'max_depth': [6,7,8],
              'min_child_weight': [11],
              'gamma': [0,0.1],
              'subsample': [0.5],
              'colsample_bytree': [0.5],
              'silent': [1],
              'scale_pos_weight': [1],
              'n_estimators': [1000], #number of trees, change it to 1000 for better results
              'missing': [-999],
              'reg_alpha': np.logspace(-5,0,10),
              'reg_lambda': np.logspace(-5,0,10),
              'seed': [65]}

xgb_clf = GridSearchCV(xgb_model, parameters, n_jobs=-1, verbose=1, cv=5, scoring='roc_auc')

xgb_clf.fit(X_train, y_train)

best_parameters = xgb_clf.best_estimator_.get_params()
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))
    
print ("-----------------------------------------")
print ("-----------------------------------------")
print ("GRIDSEARCHCV XGBCLASSIFIER RESULT:")
xgb_predicted = xgb_clf.best_estimator_.predict(X_test)
print(metrics.classification_report(y_test, xgb_predicted, labels=[1,0], target_names=['break up','still together']))

Fitting 5 folds for each of 600 candidates, totalling 3000 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   26.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 11.5min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 16.5min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed: 22.1min
[Parallel(n_jobs=-1)]: Done 3000 out of 3000 | elapsed: 26.9min finished


base_score: 0.5
booster: 'gbtree'
colsample_bylevel: 1
colsample_bytree: 0.5
gamma: 0.1
learning_rate: 0.05
max_delta_step: 0
max_depth: 6
min_child_weight: 11
missing: -999
n_estimators: 1000
n_jobs: 1
nthread: 4
objective: 'binary:logistic'
random_state: 0
reg_alpha: 1.0
reg_lambda: 1.0
scale_pos_weight: 1
seed: 65
silent: 1
subsample: 0.5
-----------------------------------------
-----------------------------------------
GRIDSEARCHCV XGBCLASSIFIER RESULT:
                precision    recall  f1-score   support

      break up       0.67      0.61      0.64       153
still together       0.83      0.87      0.85       342

   avg / total       0.78      0.79      0.78       495



In [26]:
xgb_featimpt = pd.DataFrame({
    'importance':xgb_clf.best_estimator_.feature_importances_,
    'pred':pred.columns
})
xgb_featimpt = xgb_featimpt[xgb_featimpt.importance > 0]
xgb_featimpt.sort_values(by=['importance'], ascending=False, inplace=True)
xgb_featimpt.head(10)

Unnamed: 0,importance,pred
19,0.204359,how_long_relationship
18,0.147505,how_long_ago_first_cohab
17,0.138661,relatives_seen_per_month
5,0.065698,self_intro_partner[T.yes]
8,0.056222,parental_approval[T.approve]
9,0.052432,coresident[T.Yes]
13,0.052116,gender_internet_access[T.yes_female]
0,0.049589,hhinc[T.low_hhinc]
7,0.047694,married[T.married]
14,0.041061,hhhead_gender[T.yes_male]


In [5]:
# Try TPOT
from tpot import TPOTClassifier

In [16]:
pipeline_optimizer = TPOTClassifier(scoring='roc_auc', cv=5, random_state=42, verbosity=2, n_jobs=-1, \
                                    generations=10)

In [17]:
pipeline_optimizer.fit(X_train, y_train)
print(pipeline_optimizer.score(X_test, y_test))

Optimization Progress:   0%|          | 0/1100 [00:00<?, ?pipeline/s]Process ForkPoolWorker-1125:
Process ForkPoolWorker-1124:
Process ForkPoolWorker-1121:
Process ForkPoolWorker-1122:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/david.yan/anaconda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Users/david.yan/anaconda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Users/david.yan/anaconda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Users/david.yan/anaconda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Users/david.yan/anaconda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/david.yan/anaconda/lib/python3.6/multiprocessing/process.



TPOT closed prematurely. Will use the current best pipeline.


RuntimeError: A pipeline has not yet been optimized. Please call fit() first.

In [15]:
yhat = pipeline_optimizer.predict(X_test)
print(metrics.classification_report(y_test, yhat, labels=[1,0], target_names=['break up','still together']))

                precision    recall  f1-score   support

      break up       0.66      0.67      0.67       153
still together       0.85      0.85      0.85       342

   avg / total       0.79      0.79      0.79       495



In [14]:
pipeline_optimizer.export('./tpot_exported_pipeline_1.py')

True

In [65]:
from sklearn.svm import SVC

In [66]:
svc_params = {
    'C':np.logspace(-3, 2, 10),
    'gamma':np.logspace(-5, 2, 10),
    'kernel':['linear','rbf']
}

svc_gs = GridSearchCV(SVC(max_iter=10000, tol=0.0001), svc_params, cv=5, verbose=1, n_jobs=-1)
svc_gs.fit(X_train, y_train)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 21.4min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'C': array([  1.00000e-03,   3.59381e-03,   1.29155e-02,   4.64159e-02,
         1.66810e-01,   5.99484e-01,   2.15443e+00,   7.74264e+00,
         2.78256e+01,   1.00000e+02]), 'gamma': array([  1.00000e-05,   5.99484e-05,   3.59381e-04,   2.15443e-03,
         1.29155e-02,   7.74264e-02,   4.64159e-01,   2.78256e+00,
         1.66810e+01,   1.00000e+02]), 'kernel': ['linear', 'rbf']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [67]:
best_parameters = svc_gs.best_estimator_.get_params()
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

C: 7.7426368268112773
cache_size: 200
class_weight: None
coef0: 0.0
decision_function_shape: 'ovr'
degree: 3
gamma: 0.01291549665014884
kernel: 'rbf'
max_iter: -1
probability: False
random_state: None
shrinking: True
tol: 0.001
verbose: False


In [68]:
yhat = svc_gs.best_estimator_.predict(X_test)
print(metrics.classification_report(y_test, yhat, labels=[1,0], target_names=['break up','still together']))

                precision    recall  f1-score   support

      break up       0.68      0.58      0.62       153
still together       0.82      0.88      0.85       342

   avg / total       0.78      0.79      0.78       495



In [69]:
from sklearn.neighbors import KNeighborsClassifier

knn_params = {
    'n_neighbors':range(1,51),
    'weights':['distance','uniform']
}

knn_gs = GridSearchCV(KNeighborsClassifier(), knn_params, cv=5, verbose=1, n_jobs=-1)
knn_gs.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   26.0s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_neighbors': range(1, 51), 'weights': ['distance', 'uniform']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [31]:
best_parameters = knn_gs.best_estimator_.get_params()
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

algorithm: 'auto'
leaf_size: 30
metric: 'minkowski'
metric_params: None
n_jobs: 1
n_neighbors: 25
p: 2
weights: 'uniform'


In [32]:
yhat = knn_gs.best_estimator_.predict(X_test)
print(metrics.classification_report(y_test, yhat, labels=[1,0], target_names=['break up','still together']))

                precision    recall  f1-score   support

      break up       0.70      0.28      0.40       172
still together       0.74      0.94      0.83       372

   avg / total       0.73      0.73      0.69       544



In [None]:
# Check whether why TPOT did not churn out LOGREG as best model
# Wrap up with Dashboard and explaination