### Use Features Selection to review factors that affect relationship outcome

In [1]:
# Model Check
# Oversample Minority
# Shuffle Predictors and Targets such that predictors point to new targets
# Use dataset without Smoteen & Tomek

### Result from using dataset without Smoteen & Tomek
> * Only GridSearchCV LogReg & GridSearchCV SGDClassifier performed normally **(Training Result > Test Result)** under imbalanced dataset 
    

### Result from using dataset with Smoteen & Tomek
> * All models performed normally 

### Minority class over Majority class, Reverse Imbalance
> * LogReg, Ridge LogReg, Lasso LogReg & Xgboost Classifier perform normally under reversed imbalanced
* GridSearchCV LogReg & GridSearchCV SGDClassifier did not perform normally **(Training Result < Test Result)**

### Shuffle Predictors & Targets
> * Scores across the models all dropped

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import seaborn as sns
from sklearn import preprocessing as pp
import pickle
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [3]:
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier
from sklearn.model_selection import cross_val_predict, GridSearchCV, train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
import xgboost as xgb



In [4]:
# X = pd.read_pickle('./couple_data_xgboost_predictors')
# X = pd.read_pickle('./couple_data_predictors')
X = pd.read_pickle('./couple_data_without_resample_predictors')
# X = pd.read_pickle('./couple_data_rev_imbal_predictors')
# X = pd.read_pickle('./couple_data_lasso_predictors')
# y = pd.read_pickle('./couple_data_rev_imbal_target')
y = pd.read_pickle('./couple_data_without_resample_target')
# y = pd.read_pickle('./couple_data_target')

In [5]:
# Shuffling Predictors, swapping rows
# for verification only
# X = X.sample(frac=1, replace=False)

In [6]:
# Scale the resampled features
Xs = StandardScaler().fit_transform(X)
y = y.values.ravel()

In [7]:
# Training and Test set
X_train, X_test, y_train, y_test = train_test_split(Xs, y, test_size=0.33, random_state=42)

### GridSearch Logistic Regression

In [8]:
# Gridsearch for Ridge and Lasso Logistic Regression, optimize C

parameters = {
    'penalty':['l1','l2'],
    'solver':['liblinear'],
    'C':np.logspace(-5,0,100)
}

print ("GRID SEARCH:")
lr_grid_search = GridSearchCV(LogisticRegression(), parameters, cv=10, verbose=0)
lr_grid_search.fit(X_train, y_train)
print ("Best parameters set:")
print('f1 score:', lr_grid_search.best_score_)
lr_best_parameters = lr_grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print ("\t%s: %r" % (param_name, lr_best_parameters[param_name]))
print ("-----------------------------------------")
print ("-----------------------------------------")
print ("GRIDSEARCHCV LOGREG RESULT:")
clf = lr_grid_search.best_estimator_
lr_gs_predicted = clf.predict(X_test)
print(metrics.classification_report(y_test, lr_gs_predicted, labels=[1,0], target_names=['break up','still together']))

GRID SEARCH:
Best parameters set:
f1 score: 0.825022665458
	C: 0.077426368268112694
	penalty: 'l1'
	solver: 'liblinear'
-----------------------------------------
-----------------------------------------
GRIDSEARCHCV LOGREG RESULT:
                precision    recall  f1-score   support

      break up       0.72      0.60      0.66       172
still together       0.83      0.89      0.86       372

   avg / total       0.80      0.80      0.80       544



In [9]:
# Get top 10 coefficients
lr_gs_coef = pd.DataFrame({
    'coef':clf.coef_.ravel(),
    'mag':np.abs(clf.coef_.ravel()),
    'pred':X.columns
})
lr_gs_coef.sort_values(by=['mag'], ascending=False, inplace=True)
lr_gs_coef.head(10)

Unnamed: 0,coef,mag,pred
33,-0.831808,0.831808,coresident[T.Yes]
51,-0.420176,0.420176,how_long_ago_first_cohab
31,-0.337918,0.337918,married[T.married]
52,-0.299578,0.299578,how_long_relationship
32,-0.28121,0.28121,parental_approval[T.approve]
9,-0.188001,0.188001,parent_alive[T.neither father nor mother are a...
44,0.151057,0.151057,couple_relig_comb[T.Protestant or oth Christia...
56,-0.13275,0.13275,partner_yrsed
47,0.11412,0.11412,couple_race_comb[T.other_other]
18,-0.085298,0.085298,q24_church[T.Yes]


### GridSearch SGDClassifier

In [10]:
# Gridsearch SGDclassifier with log loss and optimal learning rate
sgd_parameters = {
    'learning_rate': ['optimal'],
    'loss':['log','hinge'],
    'penalty': ['l1','l2','elasticnet'],
    'alpha': np.logspace(-10,5,100),
    'l1_ratio': np.logspace(-1,0,20)
}

print ("GRID SEARCH:")
sgd_grid_search = GridSearchCV(SGDClassifier(max_iter=10000, tol=0.0001), sgd_parameters, cv=10, verbose=0)
sgd_grid_search.fit(X_train, y_train)
print ("Best parameters set:")
print('f1 score:', sgd_grid_search.best_score_)
sgd_best_parameters = sgd_grid_search.best_estimator_.get_params()
for param_name in sorted(sgd_parameters.keys()):
    print ("\t%s: %r" % (param_name, sgd_best_parameters[param_name]))
print ("-----------------------------------------")
print ("-----------------------------------------")
print ("GRIDSEARCHCV SGDCLASSIFIER RESULT:")
sgd_clf = sgd_grid_search.best_estimator_
sgd_predicted = sgd_clf.predict(X_test)
print(metrics.classification_report(y_test, sgd_predicted, labels=[1,0], target_names=['break up','still together']))

GRID SEARCH:
Best parameters set:
f1 score: 0.827742520399
	alpha: 0.0075646332755462909
	l1_ratio: 0.88586679041008254
	learning_rate: 'optimal'
	loss: 'log'
	penalty: 'l1'
-----------------------------------------
-----------------------------------------
GRIDSEARCHCV SGDCLASSIFIER RESULT:
                precision    recall  f1-score   support

      break up       0.72      0.59      0.65       172
still together       0.83      0.89      0.86       372

   avg / total       0.79      0.80      0.79       544



In [11]:
# Get top 10 coefficients
sgd_coef = pd.DataFrame({
    'coef':sgd_clf.coef_.ravel(),
    'mag':np.abs(sgd_clf.coef_.ravel()),
    'pred':X.columns
})
sgd_coef.sort_values(by=['mag'], ascending=False, inplace=True)
sgd_coef.head(10)

Unnamed: 0,coef,mag,pred
33,-0.938917,0.938917,coresident[T.Yes]
51,-0.813247,0.813247,how_long_ago_first_cohab
9,-0.349173,0.349173,parent_alive[T.neither father nor mother are a...
31,-0.288956,0.288956,married[T.married]
32,-0.28558,0.28558,parental_approval[T.approve]
44,0.227165,0.227165,couple_relig_comb[T.Protestant or oth Christia...
56,-0.152981,0.152981,partner_yrsed
47,0.145516,0.145516,couple_race_comb[T.other_other]
18,-0.141,0.141,q24_church[T.Yes]
23,-0.102237,0.102237,q24_public[T.Yes]


### GridSearch XGBoost Classifier

In [8]:
# Reference link: https://www.kaggle.com/phunter/xgboost-with-gridsearchcv
# Credit to Shize's R code and the python re-implementation

xgb_model = xgb.XGBClassifier()

#brute force scan for all parameters, here are the tricks
#usually max_depth is 6,7,8
#learning rate is around 0.05, but small changes may make big diff
#tuning min_child_weight subsample colsample_bytree can have 
#much fun of fighting against overfit 
#n_estimators is how many round of boosting
#finally, ensemble xgboost with multiple seeds may reduce variance
parameters = {'nthread': [4], #when use hyperthread, xgboost may become slower
              'objective': ['binary:logistic'],
              'learning_rate': [0.05, 0.1], #so called `eta` value
              'max_depth': [6,7,8],
              'min_child_weight': [1],
              'gamma': [0,0.1],
              'subsample': [0.8],
              'colsample_bytree': [0.8],
              'silent': [1],
              'scale_pos_weight': [1],
              'n_estimators': [1000], #number of trees, change it to 1000 for better results
              'missing': [-999],
              'seed': [65]}

xgb_clf = GridSearchCV(xgb_model, parameters, n_jobs=-1, scoring='f1', verbose=1, cv=5)

xgb_clf.fit(X_train, y_train)

best_parameters = xgb_clf.best_estimator_.get_params()
print('f1 score:', xgb_clf.best_score_)
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))
    
print ("-----------------------------------------")
print ("-----------------------------------------")
print ("GRIDSEARCHCV XGBCLASSIFIER RESULT:")
xgb_predicted = xgb_clf.best_estimator_.predict(X_test)
print(metrics.classification_report(y_test, xgb_predicted, labels=[1,0], target_names=['break up','still together']))

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  2.7min finished


f1 score: 0.702702159915
base_score: 0.5
colsample_bylevel: 1
colsample_bytree: 0.8
gamma: 0
learning_rate: 0.1
max_delta_step: 0
max_depth: 7
min_child_weight: 1
missing: -999
n_estimators: 1000
nthread: 4
objective: 'binary:logistic'
reg_alpha: 0
reg_lambda: 1
scale_pos_weight: 1
seed: 65
silent: 1
subsample: 0.8
-----------------------------------------
-----------------------------------------
GRIDSEARCHCV XGBCLASSIFIER RESULT:
                precision    recall  f1-score   support

      break up       0.69      0.64      0.66       172
still together       0.84      0.87      0.85       372

   avg / total       0.79      0.80      0.79       544



In [None]:
# Get top 10 coefficients
xgb_featimpt = pd.DataFrame({
    'importance':xgb_clf.best_estimator_.feature_importances_,
    'pred':X.columns
})
xgb_featimpt = xgb_featimpt[xgb_featimpt.importance > 0]
xgb_featimpt.sort_values(by=['importance'], ascending=False, inplace=True)
xgb_featimpt.head(10)

In [None]:
# Only GridSearch LogReg & Gridsearch SGDClassifier & xgboost performs normally in an imbalanced dataset

In [None]:
# Check for any feature that are removed in this round which might result in the drop in performance
xgb_featimpt = pd.DataFrame({
    'importance':xgb_clf.best_estimator_.feature_importances_,
    'pred':X.columns
})
xgb_featimpt[xgb_featimpt.importance <= 0]

In [None]:
# Drop in the above features might be the cause of the drop from previous score

In [13]:
# Try TPOT
from tpot import TPOTClassifier

  return f(*args, **kwds)


In [14]:
pipeline_optimizer = TPOTClassifier()

  return f(*args, **kwds)


In [15]:
pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,
                                    random_state=42, verbosity=2)

In [16]:
pipeline_optimizer.fit(X_train, y_train)
print(pipeline_optimizer.score(X_test, y_test))

Optimization Progress:  33%|███▎      | 40/120 [00:36<00:59,  1.34pipeline/s]

Generation 1 - Current best internal CV score: 0.8385921338862516


Optimization Progress:  50%|█████     | 60/120 [01:07<02:12,  2.20s/pipeline]

Generation 2 - Current best internal CV score: 0.8385921338862516


Optimization Progress:  67%|██████▋   | 80/120 [03:11<07:05, 10.64s/pipeline]

Generation 3 - Current best internal CV score: 0.8385921338862516


Optimization Progress:  83%|████████▎ | 100/120 [13:09<09:17, 27.88s/pipeline]

Generation 4 - Current best internal CV score: 0.8394847706612412


                                                                               

Generation 5 - Current best internal CV score: 0.8394847706612412

Best pipeline: ExtraTreesClassifier(input_matrix, bootstrap=False, criterion=entropy, max_features=0.5, min_samples_leaf=1, min_samples_split=8, n_estimators=100)
0.801470588235


In [17]:
yhat = pipeline_optimizer.predict(X_test)
print(metrics.classification_report(y_test, yhat, labels=[1,0], target_names=['break up','still together']))

                precision    recall  f1-score   support

      break up       0.73      0.60      0.66       172
still together       0.83      0.90      0.86       372

   avg / total       0.80      0.80      0.80       544



### Best Models from TPOT
> * Extra Trees Classifier - **precision: 0.80** | **recall: 0.80** | **f1-score: 0.80**