### Use Features selected by Xgboost to review factors that affect relationship outcome

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import seaborn as sns
from sklearn import preprocessing as pp
import pickle
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [25]:
# Logistic Regression
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier
from sklearn.model_selection import cross_val_predict, GridSearchCV, train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

In [6]:
X = pd.read_pickle('./couple_data_xgboost_predictors')
y = pd.read_pickle('./couple_data_target')

In [7]:
# Scale the resampled features
Xs = StandardScaler().fit_transform(X)
y = y.values.ravel()

In [8]:
# Training and Test set
X_train, X_test, y_train, y_test = train_test_split(Xs, y, test_size=0.33, random_state=42)

### Plain vanilla logistic regression

In [9]:
# Plain vanilla logistic regression
lr = LogisticRegression()
train_predicted = cross_val_predict(lr, X_train, y_train, cv=10)
score = metrics.f1_score(y_train, train_predicted)
print('f1 score:', score)
lr.fit(X_train, y_train)
predicted = lr.predict(X_test)
print('LOGREG RESULT:')
print(metrics.classification_report(y_test, predicted, labels=[1,0], target_names=['break up','still together']))

f1 score: 0.868913857678
LOGREG RESULT:
                precision    recall  f1-score   support

      break up       0.89      0.86      0.87       356
still together       0.85      0.88      0.87       328

   avg / total       0.87      0.87      0.87       684



In [11]:
# Get top 10 coefficients
lr_coef = pd.DataFrame({
    'coef':lr.coef_.ravel(),
    'mag':np.abs(lr.coef_.ravel()),
    'pred':X.columns
})
lr_coef.sort_values(by=['mag'], ascending=False, inplace=True)
lr_coef.head(10)

Unnamed: 0,coef,mag,pred
16,-1.194512,1.194512,coresident[T.Yes]
18,-0.786232,0.786232,married[T.married]
30,-0.746207,0.746207,ppreg4[T.west]
14,-0.730732,0.730732,partner_politic_view[T.democrat]
4,-0.717916,0.717916,how_long_ago_first_cohab
24,-0.644229,0.644229,ppeducat[T.bachelor's degree or higher]
40,-0.557111,0.557111,partner_politic_view[T.independent]
0,-0.514978,0.514978,ppage
15,-0.453418,0.453418,parental_approval[T.approve]
42,-0.438306,0.438306,ppreg4[T.midwest]


### Ridge Logistic Regression

In [12]:
# Ridge logistic regression
lr_ridge = LogisticRegressionCV(penalty='l2', Cs=200, cv=10, scoring='f1')
lr_ridge.fit(X_train, y_train)
train_predicted = lr_ridge.predict(X_train)
score = metrics.f1_score(y_train, train_predicted)
print('f1 score:', score)
ridge_predicted = lr_ridge.predict(X_test)
print('RIDGE LOGREG RESULT:')
print(metrics.classification_report(y_test, ridge_predicted, labels=[1,0], target_names=['break up','still together']))

f1 score: 0.89634601044
RIDGE LOGREG RESULT:
                precision    recall  f1-score   support

      break up       0.89      0.86      0.87       356
still together       0.85      0.88      0.87       328

   avg / total       0.87      0.87      0.87       684



In [14]:
# Get top 10 coefficients
lr_ridge_coef = pd.DataFrame({
    'coef':lr_ridge.coef_.ravel(),
    'mag':np.abs(lr_ridge.coef_.ravel()),
    'pred':X.columns
})
lr_ridge_coef.sort_values(by=['mag'], ascending=False, inplace=True)
lr_ridge_coef.head(10)

Unnamed: 0,coef,mag,pred
16,-0.885575,0.885575,coresident[T.Yes]
18,-0.627527,0.627527,married[T.married]
30,-0.415268,0.415268,ppreg4[T.west]
14,-0.382592,0.382592,partner_politic_view[T.democrat]
15,-0.361501,0.361501,parental_approval[T.approve]
24,-0.361399,0.361399,ppeducat[T.bachelor's degree or higher]
0,-0.316721,0.316721,ppage
4,-0.313269,0.313269,how_long_ago_first_cohab
40,-0.312443,0.312443,partner_politic_view[T.independent]
39,-0.303928,0.303928,met_partner_work[T.yes]


### Lasso Logistic Regression

In [15]:
# Lasso Logistic Regression
lr_lasso = LogisticRegressionCV(penalty='l1', solver='liblinear', Cs=100, cv=10)
lr_lasso.fit(X_train, y_train)
train_predicted = lr_lasso.predict(X_train)
score = metrics.f1_score(y_train, train_predicted)
print('f1 score:', score)
lasso_predicted = lr_lasso.predict(X_test)
print('LASSO LOGREG RESULT:')
print(metrics.classification_report(y_test, lasso_predicted, labels=[1,0], target_names=['break up','still together']))

f1 score: 0.892376681614
LASSO LOGREG RESULT:
                precision    recall  f1-score   support

      break up       0.88      0.86      0.87       356
still together       0.85      0.88      0.86       328

   avg / total       0.87      0.87      0.87       684



In [16]:
# Get top 10 coefficients
lr_lasso_coef = pd.DataFrame({
    'coef':lr_lasso.coef_.ravel(),
    'mag':np.abs(lr_lasso.coef_.ravel()),
    'pred':X.columns
})
lr_lasso_coef.sort_values(by=['mag'], ascending=False, inplace=True)
lr_lasso_coef.head(10)

Unnamed: 0,coef,mag,pred
16,-1.14726,1.14726,coresident[T.Yes]
18,-0.775793,0.775793,married[T.married]
4,-0.667353,0.667353,how_long_ago_first_cohab
30,-0.633222,0.633222,ppreg4[T.west]
14,-0.623571,0.623571,partner_politic_view[T.democrat]
0,-0.486528,0.486528,ppage
24,-0.483193,0.483193,ppeducat[T.bachelor's degree or higher]
40,-0.475459,0.475459,partner_politic_view[T.independent]
15,-0.415274,0.415274,parental_approval[T.approve]
42,-0.398952,0.398952,ppreg4[T.midwest]


### GridSearch Logistic Regression

In [18]:
# Gridsearch for Ridge and Lasso Logistic Regression, optimize C

parameters = {
    'penalty':['l1','l2'],
    'solver':['liblinear'],
    'C':np.logspace(-5,0,100)
}

print ("GRID SEARCH:")
lr_grid_search = GridSearchCV(LogisticRegression(), parameters, cv=10, verbose=0)
lr_grid_search.fit(X_train, y_train)
print ("Best parameters set:")
print('f1 score:', lr_grid_search.best_score_)
lr_best_parameters = lr_grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print ("\t%s: %r" % (param_name, lr_best_parameters[param_name]))
print ("-----------------------------------------")
print ("-----------------------------------------")
print ("GRIDSEARCHCV LOGREG RESULT:")
clf = lr_grid_search.best_estimator_
lr_gs_predicted = clf.predict(X_test)
print(metrics.classification_report(y_test, lr_gs_predicted, labels=[1,0], target_names=['break up','still together']))

GRID SEARCH:
Best parameters set:
f1 score: 0.883116883117
	C: 0.3944206059437656
	penalty: 'l1'
	solver: 'liblinear'
-----------------------------------------
-----------------------------------------
GRIDSEARCHCV LOGREG RESULT:
                precision    recall  f1-score   support

      break up       0.89      0.85      0.87       356
still together       0.85      0.89      0.87       328

   avg / total       0.87      0.87      0.87       684



In [19]:
# Get top 10 coefficients
lr_gs_coef = pd.DataFrame({
    'coef':clf.coef_.ravel(),
    'mag':np.abs(clf.coef_.ravel()),
    'pred':X.columns
})
lr_gs_coef.sort_values(by=['mag'], ascending=False, inplace=True)
lr_gs_coef.head(10)

Unnamed: 0,coef,mag,pred
16,-1.10733,1.10733,coresident[T.Yes]
18,-0.76299,0.76299,married[T.married]
4,-0.616563,0.616563,how_long_ago_first_cohab
30,-0.548412,0.548412,ppreg4[T.west]
14,-0.546904,0.546904,partner_politic_view[T.democrat]
0,-0.459742,0.459742,ppage
40,-0.419746,0.419746,partner_politic_view[T.independent]
24,-0.400797,0.400797,ppeducat[T.bachelor's degree or higher]
15,-0.393685,0.393685,parental_approval[T.approve]
42,-0.369161,0.369161,ppreg4[T.midwest]


### GridSearch SGDClassifier

In [24]:
# Gridsearch SGDclassifier with log loss and optimal learning rate
sgd_parameters = {
    'learning_rate': ['optimal'],
    'loss':['log','hinge'],
    'penalty': ['l1','l2','elasticnet'],
    'alpha': np.logspace(-10,5,100),
    'l1_ratio': np.logspace(-1,0,20)
}

print ("GRID SEARCH:")
sgd_grid_search = GridSearchCV(SGDClassifier(max_iter=10000, tol=0.0001), sgd_parameters, cv=10, verbose=0)
sgd_grid_search.fit(X_train, y_train)
print ("Best parameters set:")
print('f1 score:', sgd_grid_search.best_score_)
sgd_best_parameters = sgd_grid_search.best_estimator_.get_params()
for param_name in sorted(sgd_parameters.keys()):
    print ("\t%s: %r" % (param_name, sgd_best_parameters[param_name]))
print ("-----------------------------------------")
print ("-----------------------------------------")
print ("GRIDSEARCHCV SGDCLASSIFIER RESULT:")
sgd_clf = sgd_grid_search.best_estimator_
sgd_predicted = sgd_clf.predict(X_test)
print(metrics.classification_report(y_test, sgd_predicted, labels=[1,0], target_names=['break up','still together']))

GRID SEARCH:
Best parameters set:
f1 score: 0.888888888889
	alpha: 0.010722672220103254
	l1_ratio: 0.16237767391887217
	learning_rate: 'optimal'
	loss: 'hinge'
	penalty: 'elasticnet'
-----------------------------------------
-----------------------------------------
GRIDSEARCHCV SGDCLASSIFIER RESULT:
                precision    recall  f1-score   support

      break up       0.89      0.85      0.87       356
still together       0.84      0.88      0.86       328

   avg / total       0.86      0.86      0.86       684



In [23]:
# Get top 10 coefficients
sgd_coef = pd.DataFrame({
    'coef':sgd_clf.coef_.ravel(),
    'mag':np.abs(sgd_clf.coef_.ravel()),
    'pred':X.columns
})
sgd_coef.sort_values(by=['mag'], ascending=False, inplace=True)
sgd_coef.head(10)

Unnamed: 0,coef,mag,pred
16,-0.665778,0.665778,coresident[T.Yes]
18,-0.450589,0.450589,married[T.married]
24,-0.281795,0.281795,ppeducat[T.bachelor's degree or higher]
30,-0.271785,0.271785,ppreg4[T.west]
14,-0.258946,0.258946,partner_politic_view[T.democrat]
15,-0.24055,0.24055,parental_approval[T.approve]
40,-0.234693,0.234693,partner_politic_view[T.independent]
39,-0.220768,0.220768,met_partner_work[T.yes]
44,-0.212808,0.212808,ppmarit[T.living with partner]
48,-0.205841,0.205841,partner_politic_view[T.republican]


### GridSearch XGBoost Classifier

In [26]:
# Reference link: https://www.kaggle.com/phunter/xgboost-with-gridsearchcv
# Credit to Shize's R code and the python re-implementation

xgb_model = xgb.XGBClassifier()

#brute force scan for all parameters, here are the tricks
#usually max_depth is 6,7,8
#learning rate is around 0.05, but small changes may make big diff
#tuning min_child_weight subsample colsample_bytree can have 
#much fun of fighting against overfit 
#n_estimators is how many round of boosting
#finally, ensemble xgboost with multiple seeds may reduce variance
parameters = {'nthread': [4], #when use hyperthread, xgboost may become slower
              'objective': ['binary:logistic'],
              'learning_rate': [0.05], #so called `eta` value
              'max_depth': [6,7,8],
              'min_child_weight': [11],
              'silent': [1],
              'subsample': [0.8],
              'colsample_bytree': [0.7],
              'n_estimators': [1000], #number of trees, change it to 1000 for better results
              'missing': [-999],
              'seed': [65]}

xgb_clf = GridSearchCV(xgb_model, parameters, n_jobs=5, scoring='f1', verbose=0, refit=True, cv=10)

xgb_clf.fit(X_train, y_train)

best_parameters, score, _ = max(xgb_clf.grid_scores_, key=lambda x: x[1])
print('f1 score:', score)
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))
    
print ("-----------------------------------------")
print ("-----------------------------------------")
print ("GRIDSEARCHCV XGBCLASSIFIER RESULT:")
xgb_predicted = xgb_clf.best_estimator_.predict(X_test)
print(metrics.classification_report(y_test, xgb_predicted, labels=[1,0], target_names=['break up','still together']))

f1 score: 0.887875619507
colsample_bytree: 0.7
learning_rate: 0.05
max_depth: 7
min_child_weight: 11
missing: -999
n_estimators: 1000
nthread: 4
objective: 'binary:logistic'
seed: 65
silent: 1
subsample: 0.8
-----------------------------------------
-----------------------------------------
GRIDSEARCHCV XGBCLASSIFIER RESULT:
                precision    recall  f1-score   support

      break up       0.89      0.87      0.88       356
still together       0.86      0.88      0.87       328

   avg / total       0.87      0.87      0.87       684





In [28]:
# Get top 10 coefficients
xgb_featimpt = pd.DataFrame({
    'importance':xgb_clf.best_estimator_.feature_importances_,
    'pred':X.columns
})
xgb_featimpt = xgb_featimpt[xgb_featimpt.importance > 0]
xgb_featimpt.sort_values(by=['importance'], ascending=False, inplace=True)
xgb_featimpt.head(10)

Unnamed: 0,importance,pred
0,0.088382,ppage
1,0.062541,distancemoved_10mi
2,0.056026,age_difference
3,0.054506,how_long_ago_first_met
4,0.046254,how_long_ago_first_cohab
7,0.044734,partner_yrsed
6,0.0443,how_long_ago_first_romantic
5,0.043865,hhinc
8,0.041694,respondent_yrsed
9,0.036699,relatives_seen_per_month


In [32]:
print(metrics.accuracy_score(y_test, xgb_predicted))

0.874269005848


In [31]:
# Check for any feature that are removed in this round which might result in the drop in performance
xgb_featimpt = pd.DataFrame({
    'importance':xgb_clf.best_estimator_.feature_importances_,
    'pred':X.columns
})
xgb_featimpt[xgb_featimpt.importance <= 0]

Unnamed: 0,importance,pred
65,0.0,q24_private_party[T.Yes]


In [None]:
# private party question removed, might be the cause of the drop in score from previously