In [1]:
# Model Check
# Oversample Minority
# Shuffle Predictors and Targets such that predictors point to new targets
# Use dataset without Smoteen & Tomek

In [1]:
import pandas as pd
import numpy as np
pd.set_option('max_colwidth',100)
import matplotlib.pyplot as plt
import scipy as sp
import seaborn as sns
from sklearn import preprocessing as pp
import pickle
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier
from sklearn.model_selection import cross_val_predict, GridSearchCV, train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

In [4]:
X = pd.read_pickle('./couple_data_without_resample_predictors')
y = pd.read_pickle('./couple_data_without_resample_target')

In [5]:
# Scale the resampled features
Xs = StandardScaler().fit_transform(X)
y = y.values.ravel()

In [6]:
# Training and Test set
X_train, X_test, y_train, y_test = train_test_split(Xs, y, test_size=0.33, random_state=42)

In [7]:
lr_lasso = LogisticRegressionCV(penalty='l1', solver='liblinear', Cs=100, cv=10)
lr_lasso.fit(X_train, y_train)

LogisticRegressionCV(Cs=100, class_weight=None, cv=10, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
           refit=True, scoring=None, solver='liblinear', tol=0.0001,
           verbose=0)

In [15]:
# Get top 10 coefficients
ft_sel_coef = pd.DataFrame({
    'coef':lr_lasso.coef_.ravel(),
    'mag':np.abs(lr_lasso.coef_.ravel()),
    'pred':X.columns
})

feat_sel = ft_sel_coef[ft_sel_coef.mag > 0].pred
feat_sel = pd.DataFrame(feat_sel.values, columns=[feat_sel.name])

Unnamed: 0,pred
0,higher_income_earner[T.male_earn_more]
1,same_high_school[T.same high school]
2,parent_alive[T.mother only]
3,parent_alive[T.neither father nor mother are alive]
4,met_partner_work[T.yes]
5,self_intro_partner[T.yes]
6,q24_college[T.Yes]
7,q24_church[T.Yes]
8,q24_vol_org[T.Yes]
9,q24_bar_restaurant[T.Yes]


In [21]:
Xs_df = pd.DataFrame(Xs, columns=X.columns)
pred_df = Xs_df[feat_sel.values.ravel()]
pred_df.to_pickle('./couple_data_lasso_feature_selection_predictors')

In [3]:
pred = pd.read_pickle('./couple_data_lasso_feature_selection_predictors')
target = pd.read_pickle('./couple_data_without_resample_target')

In [4]:
X_train, X_test, y_train, y_test = train_test_split(pred.values, target.values.ravel(), test_size=0.3, random_state=42)

### GridSearch Logistic Regression

In [41]:
# Gridsearch for Ridge and Lasso Logistic Regression, optimize C

parameters = {
    'penalty':['l1','l2'],
    'solver':['liblinear'],
    'C':np.logspace(-5,0,100)
}

print ("GRID SEARCH:")
lr_grid_search = GridSearchCV(LogisticRegression(), parameters, cv=5, verbose=0)
lr_grid_search.fit(X_train, y_train)
print ("Best parameters set:")
lr_best_parameters = lr_grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print ("\t%s: %r" % (param_name, lr_best_parameters[param_name]))
print ("-----------------------------------------")
print ("-----------------------------------------")
print ("GRIDSEARCHCV LOGREG RESULT:")
clf = lr_grid_search.best_estimator_
lr_gs_predicted = clf.predict(X_test)
print(metrics.classification_report(y_test, lr_gs_predicted, labels=[1,0], target_names=['break up','still together']))

GRID SEARCH:
Best parameters set:
	C: 0.0084975343590864387
	penalty: 'l2'
	solver: 'liblinear'
-----------------------------------------
-----------------------------------------
GRIDSEARCHCV LOGREG RESULT:
                precision    recall  f1-score   support

      break up       0.70      0.67      0.68       153
still together       0.85      0.87      0.86       342

   avg / total       0.81      0.81      0.81       495



In [42]:
# Get top 10 coefficients
lr_gs_coef = pd.DataFrame({
    'coef':clf.coef_.ravel(),
    'mag':np.abs(clf.coef_.ravel()),
    'pred':pred.columns
})
lr_gs_coef.sort_values(by=['mag'], ascending=False, inplace=True)
lr_gs_coef.head(10)

Unnamed: 0,coef,mag,pred
17,-0.544309,0.544309,coresident[T.Yes]
15,-0.328753,0.328753,married[T.married]
22,-0.282408,0.282408,how_long_relationship
16,-0.200223,0.200223,parental_approval[T.approve]
21,-0.179081,0.179081,how_long_ago_first_cohab
3,-0.162451,0.162451,parent_alive[T.neither father nor mother are alive]
24,-0.124427,0.124427,partner_yrsed
19,0.10504,0.10504,couple_relig_comb[T.Protestant or oth Christian_other]
20,0.09999,0.09999,couple_race_comb[T.other_other]
7,-0.086032,0.086032,q24_church[T.Yes]


### GridSearch SGDClassifier

In [47]:
# Gridsearch SGDclassifier with log loss and optimal learning rate
sgd_parameters = {
    'learning_rate': ['optimal'],
    'loss':['log','hinge'],
    'penalty': ['l1','l2','elasticnet'],
    'alpha': np.logspace(-10,5,100),
    'l1_ratio': np.logspace(-1,0,20)
}

print ("GRID SEARCH:")
sgd_grid_search = GridSearchCV(SGDClassifier(max_iter=10000, tol=0.0001), sgd_parameters, cv=5, verbose=0)
sgd_grid_search.fit(X_train, y_train)
print ("Best parameters set:")
sgd_best_parameters = sgd_grid_search.best_estimator_.get_params()
for param_name in sorted(sgd_parameters.keys()):
    print ("\t%s: %r" % (param_name, sgd_best_parameters[param_name]))
print ("-----------------------------------------")
print ("-----------------------------------------")
print ("GRIDSEARCHCV SGDCLASSIFIER RESULT:")
sgd_clf = sgd_grid_search.best_estimator_
sgd_predicted = sgd_clf.predict(X_test)
print(metrics.classification_report(y_test, sgd_predicted, labels=[1,0], target_names=['break up','still together']))

GRID SEARCH:
Best parameters set:
	alpha: 0.0037649358067924714
	l1_ratio: 0.48329302385717521
	learning_rate: 'optimal'
	loss: 'hinge'
	penalty: 'l2'
-----------------------------------------
-----------------------------------------
GRIDSEARCHCV SGDCLASSIFIER RESULT:
                precision    recall  f1-score   support

      break up       0.71      0.59      0.64       153
still together       0.83      0.89      0.86       342

   avg / total       0.79      0.80      0.79       495



In [48]:
# Get top 10 coefficients
sgd_coef = pd.DataFrame({
    'coef':sgd_clf.coef_.ravel(),
    'mag':np.abs(sgd_clf.coef_.ravel()),
    'pred':pred.columns
})
sgd_coef.sort_values(by=['mag'], ascending=False, inplace=True)
sgd_coef.head(10)

Unnamed: 0,coef,mag,pred
17,-0.931841,0.931841,coresident[T.Yes]
21,-0.355972,0.355972,how_long_ago_first_cohab
16,-0.324353,0.324353,parental_approval[T.approve]
7,-0.309408,0.309408,q24_church[T.Yes]
22,-0.238899,0.238899,how_long_relationship
3,-0.237552,0.237552,parent_alive[T.neither father nor mother are alive]
15,-0.202595,0.202595,married[T.married]
2,-0.181895,0.181895,parent_alive[T.mother only]
20,0.163397,0.163397,couple_race_comb[T.other_other]
14,-0.13762,0.13762,met_through_as_neighbors[T.met through or as neighbors]


### GridSearch XGBoost Classifier

In [7]:
# Reference link: https://www.kaggle.com/phunter/xgboost-with-gridsearchcv
# Credit to Shize's R code and the python re-implementation

xgb_model = xgb.XGBClassifier()

#brute force scan for all parameters, here are the tricks
#usually max_depth is 6,7,8
#learning rate is around 0.05, but small changes may make big diff
#tuning min_child_weight subsample colsample_bytree can have 
#much fun of fighting against overfit 
#n_estimators is how many round of boosting
#finally, ensemble xgboost with multiple seeds may reduce variance
parameters = {'nthread': [4], #when use hyperthread, xgboost may become slower
              'objective': ['binary:logistic'],
              'learning_rate': [0.05], #so called `eta` value
              'max_depth': [6,7,8],
              'min_child_weight': [11],
              'gamma': [0,0.1],
              'subsample': [0.8],
              'colsample_bytree': [0.8],
              'silent': [1],
              'scale_pos_weight': [1],
              'n_estimators': [1000], #number of trees, change it to 1000 for better results
              'missing': [-999],
              'seed': [65]}

xgb_clf = GridSearchCV(xgb_model, parameters, n_jobs=-1, verbose=1, cv=5)

xgb_clf.fit(X_train, y_train)

best_parameters = xgb_clf.best_estimator_.get_params()
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))
    
print ("-----------------------------------------")
print ("-----------------------------------------")
print ("GRIDSEARCHCV XGBCLASSIFIER RESULT:")
xgb_predicted = xgb_clf.best_estimator_.predict(X_test)
print(metrics.classification_report(y_test, xgb_predicted, labels=[1,0], target_names=['break up','still together']))

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   34.4s finished


base_score: 0.5
colsample_bylevel: 1
colsample_bytree: 0.8
gamma: 0.1
learning_rate: 0.05
max_delta_step: 0
max_depth: 7
min_child_weight: 11
missing: -999
n_estimators: 1000
nthread: 4
objective: 'binary:logistic'
reg_alpha: 0
reg_lambda: 1
scale_pos_weight: 1
seed: 65
silent: 1
subsample: 0.8
-----------------------------------------
-----------------------------------------
GRIDSEARCHCV XGBCLASSIFIER RESULT:
                precision    recall  f1-score   support

      break up       0.70      0.61      0.66       153
still together       0.84      0.88      0.86       342

   avg / total       0.79      0.80      0.80       495



In [57]:
# Get top 10 coefficients
xgb_featimpt = pd.DataFrame({
    'importance':xgb_clf.best_estimator_.feature_importances_,
    'pred':pred.columns
})
xgb_featimpt = xgb_featimpt[xgb_featimpt.importance > 0]
xgb_featimpt.sort_values(by=['importance'], ascending=False, inplace=True)
xgb_featimpt.head(10)

Unnamed: 0,importance,pred
22,0.215581,how_long_relationship
21,0.16191,how_long_ago_first_cohab
23,0.105547,respondent_yrsed
24,0.098008,partner_yrsed
0,0.063902,higher_income_earner[T.male_earn_more]
5,0.059774,self_intro_partner[T.yes]
16,0.049004,parental_approval[T.approve]
15,0.04326,married[T.married]
17,0.039311,coresident[T.Yes]
13,0.033028,met_through_friends[T.meet through friends]


In [5]:
# Try TPOT
from tpot import TPOTClassifier

In [6]:
pipeline_optimizer = TPOTClassifier(generations=10, population_size=20, cv=5,
                                    random_state=42, verbosity=2, n_jobs=-1)

In [7]:
pipeline_optimizer.fit(X_train, y_train)
print(pipeline_optimizer.score(X_test, y_test))

Optimization Progress:   0%|          | 0/220 [00:00<?, ?pipeline/s]Process ForkPoolWorker-2:
Process ForkPoolWorker-1:
Process ForkPoolWorker-5:
Process ForkPoolWorker-4:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/david.yan/anaconda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Users/david.yan/anaconda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Users/david.yan/anaconda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Users/david.yan/anaconda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Users/david.yan/anaconda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/david.yan/anaconda/lib/python3.6/multiprocessing/process.py", line 93,



TPOT closed prematurely. Will use the current best pipeline.


RuntimeError: A pipeline has not yet been optimized. Please call fit() first.

In [64]:
yhat = pipeline_optimizer.predict(X_test)
print(metrics.classification_report(y_test, yhat, labels=[1,0], target_names=['break up','still together']))

                precision    recall  f1-score   support

      break up       0.71      0.63      0.67       153
still together       0.84      0.89      0.86       342

   avg / total       0.80      0.81      0.80       495



In [65]:
from sklearn.svm import SVC

In [66]:
svc_params = {
    'C':np.logspace(-3, 2, 10),
    'gamma':np.logspace(-5, 2, 10),
    'kernel':['linear','rbf']
}

svc_gs = GridSearchCV(SVC(max_iter=10000, tol=0.0001), svc_params, cv=5, verbose=1, n_jobs=-1)
svc_gs.fit(X_train, y_train)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 21.4min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'C': array([  1.00000e-03,   3.59381e-03,   1.29155e-02,   4.64159e-02,
         1.66810e-01,   5.99484e-01,   2.15443e+00,   7.74264e+00,
         2.78256e+01,   1.00000e+02]), 'gamma': array([  1.00000e-05,   5.99484e-05,   3.59381e-04,   2.15443e-03,
         1.29155e-02,   7.74264e-02,   4.64159e-01,   2.78256e+00,
         1.66810e+01,   1.00000e+02]), 'kernel': ['linear', 'rbf']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [67]:
best_parameters = svc_gs.best_estimator_.get_params()
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

C: 7.7426368268112773
cache_size: 200
class_weight: None
coef0: 0.0
decision_function_shape: 'ovr'
degree: 3
gamma: 0.01291549665014884
kernel: 'rbf'
max_iter: -1
probability: False
random_state: None
shrinking: True
tol: 0.001
verbose: False


In [68]:
yhat = svc_gs.best_estimator_.predict(X_test)
print(metrics.classification_report(y_test, yhat, labels=[1,0], target_names=['break up','still together']))

                precision    recall  f1-score   support

      break up       0.68      0.58      0.62       153
still together       0.82      0.88      0.85       342

   avg / total       0.78      0.79      0.78       495



In [69]:
from sklearn.neighbors import KNeighborsClassifier

knn_params = {
    'n_neighbors':range(1,51),
    'weights':['distance','uniform']
}

knn_gs = GridSearchCV(KNeighborsClassifier(), knn_params, cv=5, verbose=1, n_jobs=-1)
knn_gs.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   26.0s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_neighbors': range(1, 51), 'weights': ['distance', 'uniform']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [31]:
best_parameters = knn_gs.best_estimator_.get_params()
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

algorithm: 'auto'
leaf_size: 30
metric: 'minkowski'
metric_params: None
n_jobs: 1
n_neighbors: 25
p: 2
weights: 'uniform'


In [32]:
yhat = knn_gs.best_estimator_.predict(X_test)
print(metrics.classification_report(y_test, yhat, labels=[1,0], target_names=['break up','still together']))

                precision    recall  f1-score   support

      break up       0.70      0.28      0.40       172
still together       0.74      0.94      0.83       372

   avg / total       0.73      0.73      0.69       544



In [None]:
# Check whether why TPOT did not churn out LOGREG as best model
# Wrap up with Dashboard and explaination