In [96]:
# Model Check
# Oversample Minority
# Shuffle Predictors and Targets such that predictors point to new targets
# Use dataset without Smoteen & Tomek

In [97]:
import pandas as pd
import numpy as np
pd.set_option('max_colwidth',100)
import matplotlib.pyplot as plt
import scipy as sp
import seaborn as sns
from sklearn import preprocessing as pp
import pickle
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [98]:
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier
from sklearn.model_selection import cross_val_predict, GridSearchCV, train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

In [99]:
# pred = pd.read_pickle('./couple_data_without_resample_lasso_feature_selection_predictors')
pred = pd.read_pickle('./couple_data_without_resample_lasso_feature_selection_predictors')
target = pd.read_pickle('./couple_data_without_resample_target')

In [100]:
X_train, X_test, y_train, y_test = train_test_split(pred.values, target.values.ravel(), test_size=0.3, random_state=42)

In [101]:
1 - target.mean()

relationship_outcome_6yrs    0.694596
dtype: float64

### GridSearch Logistic Regression

In [102]:
# Gridsearch for Ridge and Lasso Logistic Regression, optimize C

parameters = {
    'penalty':['l1','l2'],
    'solver':['liblinear'],
    'C':np.logspace(-5,0,100)
}

print ("GRID SEARCH:")
lr_grid_search = GridSearchCV(LogisticRegression(), parameters, cv=5, verbose=1, n_jobs=-1)
lr_grid_search.fit(X_train, y_train)
print ("Best parameters set:")
lr_best_parameters = lr_grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print ("\t%s: %r" % (param_name, lr_best_parameters[param_name]))
print ("-----------------------------------------")
print ("-----------------------------------------")
print ("GRIDSEARCHCV LOGREG RESULT:")
clf = lr_grid_search.best_estimator_
lr_gs_predicted = clf.predict(X_test)
print(metrics.classification_report(y_test, lr_gs_predicted, labels=[1,0], target_names=['break up','still together']))

GRID SEARCH:
Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Best parameters set:
	C: 0.55908101825122225
	penalty: 'l1'
	solver: 'liblinear'
-----------------------------------------
-----------------------------------------
GRIDSEARCHCV LOGREG RESULT:
                precision    recall  f1-score   support

      break up       0.66      0.61      0.64       153
still together       0.83      0.86      0.85       342

   avg / total       0.78      0.78      0.78       495



[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    1.7s finished


In [104]:
# Get top 10 coefficients
lr_gs_coef = pd.DataFrame({
    'coef':clf.coef_.ravel(),
    'mag':np.abs(clf.coef_.ravel()),
    'pred':pred.columns
})
lr_gs_coef.sort_values(by=['mag'], ascending=False, inplace=True)
lr_gs_coef.head(10)

Unnamed: 0,coef,mag,pred
21,-0.953433,0.953433,how_long_relationship
9,-0.718376,0.718376,married[T.married]
10,-0.532831,0.532831,parental_approval[T.approve]
5,-0.466077,0.466077,parent_alive[T.neither father nor mother are alive]
15,0.23411,0.23411,couple_race_comb[T.other_other]
2,-0.220212,0.220212,higher_income_earner[T.male_earn_more]
0,0.197698,0.197698,hhinc[T.low_hhinc]
20,-0.189732,0.189732,relatives_seen_per_month
4,-0.18444,0.18444,parent_alive[T.mother only]
14,0.178843,0.178843,couple_relig_comb[T.Protestant or oth Christian_other]


In [105]:
# Lasso LogReg factors that contribute to couple breakup
lr_gs_coef[lr_gs_coef.coef > 0]

Unnamed: 0,coef,mag,pred
15,0.23411,0.23411,couple_race_comb[T.other_other]
0,0.197698,0.197698,hhinc[T.low_hhinc]
14,0.178843,0.178843,couple_relig_comb[T.Protestant or oth Christian_other]
3,0.16313,0.16313,grow_up_same_city_town[T.yes]
18,0.161289,0.161289,marriage_count_gender[T.married before_male]
16,0.145317,0.145317,gender_work_status[T.not-working_male]
8,0.123191,0.123191,met_through_friends[T.meet through friends]
11,0.120662,0.120662,couple_politic_view_comb[T.democrat_republican]
1,0.046419,0.046419,pprent[T.rented for cash]


In [106]:
# Race in Gender Race can either be white or non-white
# median hhincome is 67250

In [107]:
# Lasso LogReg factors that contribute to couple staying together
lr_gs_coef[lr_gs_coef.coef < 0]

Unnamed: 0,coef,mag,pred
21,-0.953433,0.953433,how_long_relationship
9,-0.718376,0.718376,married[T.married]
10,-0.532831,0.532831,parental_approval[T.approve]
5,-0.466077,0.466077,parent_alive[T.neither father nor mother are alive]
2,-0.220212,0.220212,higher_income_earner[T.male_earn_more]
20,-0.189732,0.189732,relatives_seen_per_month
4,-0.18444,0.18444,parent_alive[T.mother only]
17,-0.085548,0.085548,gender_work_status[T.self-employed_male]
12,-0.072363,0.072363,couple_politic_view_comb[T.republican_republican]
22,-0.067656,0.067656,couple_moms_yrsed_diff


### GridSearch SGDClassifier

In [108]:
# Gridsearch SGDclassifier with log loss and optimal learning rate
sgd_parameters = {
    'learning_rate': ['optimal'],
    'loss':['log','hinge'],
    'penalty': ['l1','l2','elasticnet'],
    'alpha': np.logspace(-10,5,100),
    'l1_ratio': np.logspace(-1,0,50)
}

print ("GRID SEARCH:")
sgd_grid_search = GridSearchCV(SGDClassifier(max_iter=10000, tol=0.0001), sgd_parameters, cv=5, verbose=1)
sgd_grid_search.fit(X_train, y_train)
print ("Best parameters set:")
sgd_best_parameters = sgd_grid_search.best_estimator_.get_params()
for param_name in sorted(sgd_parameters.keys()):
    print ("\t%s: %r" % (param_name, sgd_best_parameters[param_name]))
print ("-----------------------------------------")
print ("-----------------------------------------")
print ("GRIDSEARCHCV SGDCLASSIFIER RESULT:")
sgd_clf = sgd_grid_search.best_estimator_
sgd_predicted = sgd_clf.predict(X_test)
print(metrics.classification_report(y_test, sgd_predicted, labels=[1,0], target_names=['break up','still together']))

GRID SEARCH:
Fitting 5 folds for each of 30000 candidates, totalling 150000 fits


[Parallel(n_jobs=1)]: Done 150000 out of 150000 | elapsed:  9.3min finished


Best parameters set:
	alpha: 0.0053366992312063122
	l1_ratio: 0.11513953993264472
	learning_rate: 'optimal'
	loss: 'hinge'
	penalty: 'l1'
-----------------------------------------
-----------------------------------------
GRIDSEARCHCV SGDCLASSIFIER RESULT:
                precision    recall  f1-score   support

      break up       0.62      0.61      0.61       153
still together       0.83      0.83      0.83       342

   avg / total       0.76      0.76      0.76       495



In [110]:
sgd_coef = pd.DataFrame({
    'coef':sgd_clf.coef_.ravel(),
    'mag':np.abs(sgd_clf.coef_.ravel()),
    'pred':pred.columns
})
sgd_coef.sort_values(by=['mag'], ascending=False, inplace=True)

In [111]:
sgd_coef.head(10)

Unnamed: 0,coef,mag,pred
21,-0.758583,0.758583,how_long_relationship
9,-0.695012,0.695012,married[T.married]
10,-0.433622,0.433622,parental_approval[T.approve]
20,-0.385849,0.385849,relatives_seen_per_month
15,0.349247,0.349247,couple_race_comb[T.other_other]
2,-0.262723,0.262723,higher_income_earner[T.male_earn_more]
22,-0.203866,0.203866,couple_moms_yrsed_diff
4,-0.20115,0.20115,parent_alive[T.mother only]
17,-0.147348,0.147348,gender_work_status[T.self-employed_male]
5,-0.137031,0.137031,parent_alive[T.neither father nor mother are alive]


In [112]:
# SGD factors that contribute to couple breakup
sgd_coef[sgd_coef.coef > 0]

Unnamed: 0,coef,mag,pred
15,0.349247,0.349247,couple_race_comb[T.other_other]
0,0.104312,0.104312,hhinc[T.low_hhinc]
14,0.093092,0.093092,couple_relig_comb[T.Protestant or oth Christian_other]
8,0.051742,0.051742,met_through_friends[T.meet through friends]
18,0.04792,0.04792,marriage_count_gender[T.married before_male]
11,0.018096,0.018096,couple_politic_view_comb[T.democrat_republican]


In [113]:
# SGD factors that contribute to couple staying together
sgd_coef[sgd_coef.coef < 0]

Unnamed: 0,coef,mag,pred
21,-0.758583,0.758583,how_long_relationship
9,-0.695012,0.695012,married[T.married]
10,-0.433622,0.433622,parental_approval[T.approve]
20,-0.385849,0.385849,relatives_seen_per_month
2,-0.262723,0.262723,higher_income_earner[T.male_earn_more]
22,-0.203866,0.203866,couple_moms_yrsed_diff
4,-0.20115,0.20115,parent_alive[T.mother only]
17,-0.147348,0.147348,gender_work_status[T.self-employed_male]
5,-0.137031,0.137031,parent_alive[T.neither father nor mother are alive]
7,-0.121561,0.121561,same_sex_couple[T.same-sex couple]


In [114]:
from sklearn.svm import SVC

In [116]:
svc_params = {
    'C':np.logspace(-5, 2, 100),
    'gamma':np.logspace(-5, 2, 100),
    'kernel':['linear']
}

svc_gs = GridSearchCV(SVC(max_iter=10000, tol=0.0001), svc_params, cv=5, verbose=1, n_jobs=-1)
svc_gs.fit(X_train, y_train)

Fitting 5 folds for each of 10000 candidates, totalling 50000 fits


[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 744 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 1744 tasks      | elapsed:   20.7s
[Parallel(n_jobs=-1)]: Done 3144 tasks      | elapsed:   38.2s
[Parallel(n_jobs=-1)]: Done 4944 tasks      | elapsed:   59.6s
[Parallel(n_jobs=-1)]: Done 7144 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 9744 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 12744 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 16144 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 19944 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 24144 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 28744 tasks      | elapsed:  5.8min
































































































































































[Parallel(n_jobs=-1)]: Done 33744 tasks      | elapsed:  7.1min








































































































































































































































































































































[Parallel(n_jobs=-1)]: Done 39144 tasks      | elapsed:  9.2min


















































































































































































































































































[Parallel(n_jobs=-1)]: Done 43682 tasks      | elapsed: 13.1min




























































































































































































[Parallel(n_jobs=-1)]: Done 46782 tasks      | elapsed: 16.2min


































































































































































































[Parallel(n_jobs=-1)]: Done 50000 out of 50000 | elapsed: 19.5min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=10000, probability=False, random_state=None, shrinking=True,
  tol=0.0001, verbose=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'C': array([  1.00000e-05,   1.17681e-05, ...,   8.49753e+01,   1.00000e+02]), 'gamma': array([  1.00000e-05,   1.17681e-05, ...,   8.49753e+01,   1.00000e+02]), 'kernel': ['linear']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [117]:
best_parameters = svc_gs.best_estimator_.get_params()
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

C: 1.4508287784959402
cache_size: 200
class_weight: None
coef0: 0.0
decision_function_shape: 'ovr'
degree: 3
gamma: 1.0000000000000001e-05
kernel: 'linear'
max_iter: 10000
probability: False
random_state: None
shrinking: True
tol: 0.0001
verbose: False


In [118]:
yhat = svc_gs.best_estimator_.predict(X_test)
print(metrics.classification_report(y_test, yhat, labels=[1,0], target_names=['break up','still together']))

                precision    recall  f1-score   support

      break up       0.65      0.65      0.65       153
still together       0.84      0.84      0.84       342

   avg / total       0.78      0.78      0.78       495



In [120]:
svc_gs_coef = pd.DataFrame({
    'coef':svc_gs.best_estimator_.coef_.ravel(),
    'mag':np.abs(svc_gs.best_estimator_.coef_.ravel()),
    'pred':pred.columns
})
svc_gs_coef.sort_values(by=['mag'], ascending=False, inplace=True)

In [121]:
svc_gs_coef.head(10)

Unnamed: 0,coef,mag,pred
21,-0.780906,0.780906,how_long_relationship
9,-0.651563,0.651563,married[T.married]
10,-0.373642,0.373642,parental_approval[T.approve]
5,-0.319797,0.319797,parent_alive[T.neither father nor mother are alive]
15,0.180928,0.180928,couple_race_comb[T.other_other]
20,-0.161812,0.161812,relatives_seen_per_month
14,0.160191,0.160191,couple_relig_comb[T.Protestant or oth Christian_other]
2,-0.159802,0.159802,higher_income_earner[T.male_earn_more]
7,-0.133668,0.133668,same_sex_couple[T.same-sex couple]
0,0.131794,0.131794,hhinc[T.low_hhinc]


In [122]:
# SGD factors that contribute to couple breakup
svc_gs_coef[svc_gs_coef.coef > 0]

Unnamed: 0,coef,mag,pred
15,0.180928,0.180928,couple_race_comb[T.other_other]
14,0.160191,0.160191,couple_relig_comb[T.Protestant or oth Christian_other]
0,0.131794,0.131794,hhinc[T.low_hhinc]
11,0.113163,0.113163,couple_politic_view_comb[T.democrat_republican]
16,0.10373,0.10373,gender_work_status[T.not-working_male]
8,0.087099,0.087099,met_through_friends[T.meet through friends]
3,0.067282,0.067282,grow_up_same_city_town[T.yes]
18,0.061653,0.061653,marriage_count_gender[T.married before_male]
19,0.014334,0.014334,marriage_count_gender[T.never married_male]
1,0.008833,0.008833,pprent[T.rented for cash]


In [123]:
# SGD factors that contribute to couple staying together
svc_gs_coef[svc_gs_coef.coef < 0]

Unnamed: 0,coef,mag,pred
21,-0.780906,0.780906,how_long_relationship
9,-0.651563,0.651563,married[T.married]
10,-0.373642,0.373642,parental_approval[T.approve]
5,-0.319797,0.319797,parent_alive[T.neither father nor mother are alive]
20,-0.161812,0.161812,relatives_seen_per_month
2,-0.159802,0.159802,higher_income_earner[T.male_earn_more]
7,-0.133668,0.133668,same_sex_couple[T.same-sex couple]
4,-0.090077,0.090077,parent_alive[T.mother only]
22,-0.072293,0.072293,couple_moms_yrsed_diff
17,-0.072133,0.072133,gender_work_status[T.self-employed_male]


In [124]:
# Reference link: https://www.kaggle.com/phunter/xgboost-with-gridsearchcv
# Credit to Shize's R code and the python re-implementation

xgb_model = xgb.XGBClassifier()

#brute force scan for all parameters, here are the tricks
#usually max_depth is 6,7,8
#learning rate is around 0.05, but small changes may make big diff
#tuning min_child_weight subsample colsample_bytree can have 
#much fun of fighting against overfit 
#n_estimators is how many round of boosting
#finally, ensemble xgboost with multiple seeds may reduce variance
parameters = {'nthread': [4], #when use hyperthread, xgboost may become slower
              'objective': ['binary:logistic'],
              'learning_rate': [0.05], #so called `eta` value
              'max_depth': [6,7,8],
              'min_child_weight': [11],
              'gamma': [0,0.1],
              'subsample': [0.8],
              'colsample_bytree': [0.8],
              'silent': [1],
              'scale_pos_weight': [1],
              'n_estimators': [1000], #number of trees, change it to 1000 for better results
              'missing': [-999],
              'seed': [65]}

xgb_clf = GridSearchCV(xgb_model, parameters, n_jobs=-1, verbose=1, cv=5)

xgb_clf.fit(X_train, y_train)

best_parameters = xgb_clf.best_estimator_.get_params()
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))
    
print ("-----------------------------------------")
print ("-----------------------------------------")
print ("GRIDSEARCHCV XGBCLASSIFIER RESULT:")
xgb_predicted = xgb_clf.best_estimator_.predict(X_test)
print(metrics.classification_report(y_test, xgb_predicted, labels=[1,0], target_names=['break up','still together']))

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   26.4s finished


base_score: 0.5
booster: 'gbtree'
colsample_bylevel: 1
colsample_bytree: 0.8
gamma: 0
learning_rate: 0.05
max_delta_step: 0
max_depth: 6
min_child_weight: 11
missing: -999
n_estimators: 1000
n_jobs: 1
nthread: 4
objective: 'binary:logistic'
random_state: 0
reg_alpha: 0
reg_lambda: 1
scale_pos_weight: 1
seed: 65
silent: 1
subsample: 0.8
-----------------------------------------
-----------------------------------------
GRIDSEARCHCV XGBCLASSIFIER RESULT:
                precision    recall  f1-score   support

      break up       0.66      0.55      0.60       153
still together       0.81      0.87      0.84       342

   avg / total       0.77      0.77      0.77       495



In [126]:
xgb_featimpt = pd.DataFrame({
    'importance':xgb_clf.best_estimator_.feature_importances_,
    'pred':pred.columns
})
xgb_featimpt.sort_values(by=['importance'], ascending=False, inplace=True)

In [127]:
xgb_featimpt.head(10)

Unnamed: 0,importance,pred
21,0.255664,how_long_relationship
20,0.147577,relatives_seen_per_month
22,0.135305,couple_moms_yrsed_diff
0,0.05365,hhinc[T.low_hhinc]
2,0.045469,higher_income_earner[T.male_earn_more]
10,0.044997,parental_approval[T.approve]
9,0.041378,married[T.married]
8,0.036658,met_through_friends[T.meet through friends]
1,0.032725,pprent[T.rented for cash]
13,0.03241,couple_relig_comb[T.Protestant or oth Christian_Protestant or oth Christian]


In [None]:
# Wrap up with Dashboard and Explaination