### Use Features Selection to review factors that affect relationship outcome

In [1]:
# Model Check
# Oversample Minority
# Shuffle Predictors and Targets such that predictors point to new targets
# Use dataset without Smoteen & Tomek

### Result from using dataset without Smoteen & Tomek
> * Only GridSearchCV LogReg & GridSearchCV SGDClassifier performed normally **(Training Result > Test Result)** under imbalanced dataset 
    

### Result from using dataset with Smoteen & Tomek
> * All models performed normally

### Minority class over Majority class, Reverse Imbalance
> * LogReg, Ridge LogReg, Lasso LogReg & Xgboost Classifier perform normally under reversed imbalanced
* GridSearchCV LogReg & GridSearchCV SGDClassifier did not perform normally **(Training Result < Test Result)**

### Shuffle Predictors & Targets
> * Scores across the models all dropped

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import seaborn as sns
from sklearn import preprocessing as pp
import pickle
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

  return f(*args, **kwds)


In [6]:
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier
from sklearn.model_selection import cross_val_predict, GridSearchCV, train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

In [7]:
# X = pd.read_pickle('./couple_data_xgboost_predictors')
# X = pd.read_pickle('./couple_data_predictors')
# X = pd.read_pickle('./couple_data_without_resample_predictors')
X = pd.read_pickle('./couple_data_rev_imbal_predictors')
# X = pd.read_pickle('./couple_data_lasso_predictors')
y = pd.read_pickle('./couple_data_rev_imbal_target')
# y = pd.read_pickle('./couple_data_without_resample_target')
# y = pd.read_pickle('./couple_data_target')

In [8]:
# Shuffling Predictors, swapping rows
# for verification only
# X = X.sample(frac=1, replace=False)

In [9]:
# Scale the resampled features
Xs = StandardScaler().fit_transform(X)
y = y.values.ravel()

In [10]:
# Training and Test set
X_train, X_test, y_train, y_test = train_test_split(Xs, y, test_size=0.33, random_state=42)

### GridSearch Logistic Regression

In [15]:
# Gridsearch for Ridge and Lasso Logistic Regression, optimize C

parameters = {
    'penalty':['l1','l2'],
    'solver':['liblinear'],
    'C':np.logspace(-5,0,100)
}

print ("GRID SEARCH:")
lr_grid_search = GridSearchCV(LogisticRegression(), parameters, cv=10, verbose=0)
lr_grid_search.fit(X_train, y_train)
print ("Best parameters set:")
print('f1 score:', lr_grid_search.best_score_)
lr_best_parameters = lr_grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print ("\t%s: %r" % (param_name, lr_best_parameters[param_name]))
print ("-----------------------------------------")
print ("-----------------------------------------")
print ("GRIDSEARCHCV LOGREG RESULT:")
clf = lr_grid_search.best_estimator_
lr_gs_predicted = clf.predict(X_test)
print(metrics.classification_report(y_test, lr_gs_predicted, labels=[1,0], target_names=['break up','still together']))

GRID SEARCH:
Best parameters set:
f1 score: 0.887417218543
	C: 0.097700995729922469
	penalty: 'l1'
	solver: 'liblinear'
-----------------------------------------
-----------------------------------------
GRIDSEARCHCV LOGREG RESULT:
                precision    recall  f1-score   support

      break up       0.90      0.89      0.90       356
still together       0.77      0.78      0.78       165

   avg / total       0.86      0.86      0.86       521



In [16]:
# Get top 10 coefficients
lr_gs_coef = pd.DataFrame({
    'coef':clf.coef_.ravel(),
    'mag':np.abs(clf.coef_.ravel()),
    'pred':X.columns
})
lr_gs_coef.sort_values(by=['mag'], ascending=False, inplace=True)
lr_gs_coef.head(10)

Unnamed: 0,coef,mag,pred
33,-0.918089,0.918089,coresident[T.Yes]
31,-0.541351,0.541351,married[T.married]
32,-0.42035,0.42035,parental_approval[T.approve]
9,-0.407524,0.407524,parent_alive[T.neither father nor mother are a...
51,-0.382412,0.382412,how_long_ago_first_cohab
52,-0.306339,0.306339,how_long_relationship
43,-0.199315,0.199315,couple_relig_comb[T.Protestant or oth Christia...
10,-0.179838,0.179838,met_partner_work[T.yes]
8,-0.168686,0.168686,parent_alive[T.mother only]
56,-0.166525,0.166525,partner_yrsed


### GridSearch SGDClassifier

In [17]:
# Gridsearch SGDclassifier with log loss and optimal learning rate
sgd_parameters = {
    'learning_rate': ['optimal'],
    'loss':['log','hinge'],
    'penalty': ['l1','l2','elasticnet'],
    'alpha': np.logspace(-10,5,100),
    'l1_ratio': np.logspace(-1,0,20)
}

print ("GRID SEARCH:")
sgd_grid_search = GridSearchCV(SGDClassifier(max_iter=10000, tol=0.0001), sgd_parameters, cv=10, verbose=0)
sgd_grid_search.fit(X_train, y_train)
print ("Best parameters set:")
print('f1 score:', sgd_grid_search.best_score_)
sgd_best_parameters = sgd_grid_search.best_estimator_.get_params()
for param_name in sorted(sgd_parameters.keys()):
    print ("\t%s: %r" % (param_name, sgd_best_parameters[param_name]))
print ("-----------------------------------------")
print ("-----------------------------------------")
print ("GRIDSEARCHCV SGDCLASSIFIER RESULT:")
sgd_clf = sgd_grid_search.best_estimator_
sgd_predicted = sgd_clf.predict(X_test)
print(metrics.classification_report(y_test, sgd_predicted, labels=[1,0], target_names=['break up','still together']))

GRID SEARCH:
Best parameters set:
f1 score: 0.891201513718
	alpha: 0.015199110829529331
	l1_ratio: 0.11288378916846889
	learning_rate: 'optimal'
	loss: 'hinge'
	penalty: 'l1'
-----------------------------------------
-----------------------------------------
GRIDSEARCHCV SGDCLASSIFIER RESULT:
                precision    recall  f1-score   support

      break up       0.89      0.87      0.88       356
still together       0.73      0.77      0.75       165

   avg / total       0.84      0.84      0.84       521



In [18]:
# Get top 10 coefficients
sgd_coef = pd.DataFrame({
    'coef':sgd_clf.coef_.ravel(),
    'mag':np.abs(sgd_clf.coef_.ravel()),
    'pred':X.columns
})
sgd_coef.sort_values(by=['mag'], ascending=False, inplace=True)
sgd_coef.head(10)

Unnamed: 0,coef,mag,pred
31,-0.599935,0.599935,married[T.married]
33,-0.490681,0.490681,coresident[T.Yes]
32,-0.337862,0.337862,parental_approval[T.approve]
9,-0.3083,0.3083,parent_alive[T.neither father nor mother are a...
52,-0.294287,0.294287,how_long_relationship
56,-0.157177,0.157177,partner_yrsed
8,-0.139256,0.139256,parent_alive[T.mother only]
10,-0.126572,0.126572,met_partner_work[T.yes]
27,-0.113465,0.113465,same_sex_couple[T.same-sex couple]
18,-0.104822,0.104822,q24_church[T.Yes]


### GridSearch XGBoost Classifier

In [19]:
# Reference link: https://www.kaggle.com/phunter/xgboost-with-gridsearchcv
# Credit to Shize's R code and the python re-implementation

xgb_model = xgb.XGBClassifier()

#brute force scan for all parameters, here are the tricks
#usually max_depth is 6,7,8
#learning rate is around 0.05, but small changes may make big diff
#tuning min_child_weight subsample colsample_bytree can have 
#much fun of fighting against overfit 
#n_estimators is how many round of boosting
#finally, ensemble xgboost with multiple seeds may reduce variance
parameters = {'nthread': [4], #when use hyperthread, xgboost may become slower
              'objective': ['binary:logistic'],
              'learning_rate': [0.05], #so called `eta` value
              'max_depth': [6,7,8],
              'min_child_weight': [11],
              'silent': [1],
              'subsample': [0.8],
              'colsample_bytree': [0.7],
              'n_estimators': [1000], #number of trees, change it to 1000 for better results
              'missing': [-999],
              'seed': [65]}

xgb_clf = GridSearchCV(xgb_model, parameters, n_jobs=5, scoring='f1', verbose=0, refit=True, cv=10)

xgb_clf.fit(X_train, y_train)

best_parameters, score, _ = max(xgb_clf.grid_scores_, key=lambda x: x[1])
print('f1 score:', score)
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))
    
print ("-----------------------------------------")
print ("-----------------------------------------")
print ("GRIDSEARCHCV XGBCLASSIFIER RESULT:")
xgb_predicted = xgb_clf.best_estimator_.predict(X_test)
print(metrics.classification_report(y_test, xgb_predicted, labels=[1,0], target_names=['break up','still together']))

f1 score: 0.917064609061
colsample_bytree: 0.7
learning_rate: 0.05
max_depth: 6
min_child_weight: 11
missing: -999
n_estimators: 1000
nthread: 4
objective: 'binary:logistic'
seed: 65
silent: 1
subsample: 0.8
-----------------------------------------
-----------------------------------------
GRIDSEARCHCV XGBCLASSIFIER RESULT:
                precision    recall  f1-score   support

      break up       0.89      0.89      0.89       356
still together       0.77      0.77      0.77       165

   avg / total       0.85      0.85      0.85       521





In [20]:
# Get top 10 coefficients
xgb_featimpt = pd.DataFrame({
    'importance':xgb_clf.best_estimator_.feature_importances_,
    'pred':X.columns
})
xgb_featimpt = xgb_featimpt[xgb_featimpt.importance > 0]
xgb_featimpt.sort_values(by=['importance'], ascending=False, inplace=True)
xgb_featimpt.head(10)

Unnamed: 0,importance,pred
48,0.117596,distancemoved_10mi
50,0.091917,how_long_ago_first_romantic
53,0.089291,age_difference
51,0.088707,how_long_ago_first_cohab
49,0.080245,how_long_ago_first_met
56,0.057485,partner_yrsed
55,0.053691,respondent_yrsed
52,0.042019,how_long_relationship
1,0.039685,higher_income_earner[T.female_earn_more]
32,0.037934,parental_approval[T.approve]


In [21]:
# Only GridSearch LogReg & Gridsearch SGDClassifier performs normally in an imbalanced dataset

In [22]:
# Check for any feature that are removed in this round which might result in the drop in performance
xgb_featimpt = pd.DataFrame({
    'importance':xgb_clf.best_estimator_.feature_importances_,
    'pred':X.columns
})
xgb_featimpt[xgb_featimpt.importance <= 0]

Unnamed: 0,importance,pred
0,0.0,Intercept
3,0.0,same_high_school[T.same high school]
4,0.0,same_college_uni[T.did not attend same college...
6,0.0,both_parents_knew_before_met[T.yes]
7,0.0,parent_alive[T.father only]
11,0.0,fam_intro_partner[T.yes]
13,0.0,colleague_intro_partner[T.yes]
15,0.0,q24_met_online[T.met online]
16,0.0,q24_school[T.Yes]
17,0.0,q24_college[T.Yes]


In [23]:
# Drop in the above features might be the cause of the drop from previous score

In [1]:
# Try TPOT
from tpot import TPOTClassifier

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
pipeline_optimizer = TPOTClassifier()

  return f(*args, **kwds)


In [3]:
pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,
                                    random_state=42, verbosity=2)

In [11]:
pipeline_optimizer.fit(X_train, y_train)
print(pipeline_optimizer.score(X_test, y_test))
# pipeline_optimizer.export('tpot_exported_pipeline.py')

Optimization Progress:  33%|███▎      | 40/120 [12:10<14:24, 10.80s/pipeline]  

Generation 1 - Current best internal CV score: 0.8818072073683269


Optimization Progress:  50%|█████     | 60/120 [20:49<26:31, 26.53s/pipeline]

Generation 2 - Current best internal CV score: 0.884632924975409


Optimization Progress:  67%|██████▋   | 80/120 [26:53<03:22,  5.06s/pipeline]

Generation 3 - Current best internal CV score: 0.884632924975409


Optimization Progress:  83%|████████▎ | 100/120 [35:40<09:54, 29.74s/pipeline]

Generation 4 - Current best internal CV score: 0.8865197174282393


                                                                              

Generation 5 - Current best internal CV score: 0.8865197174282393

Best pipeline: ExtraTreesClassifier(CombineDFs(input_matrix, input_matrix), bootstrap=False, criterion=entropy, max_features=0.2, min_samples_leaf=8, min_samples_split=6, n_estimators=100)
0.865642994242


In [12]:
yhat = pipeline_optimizer.predict(X_test)
print(metrics.classification_report(y_test, yhat, labels=[1,0], target_names=['break up','still together']))

                precision    recall  f1-score   support

      break up       0.90      0.91      0.90       356
still together       0.80      0.77      0.78       165

   avg / total       0.86      0.87      0.86       521

