## Build a string-matching classifier. Benchmark XGBoost and LightGBM

In [1]:
from __future__ import division

import pandas as pd
import numpy as np
import lightgbm as lgb # The first test of lightgbm

from sklearn.metrics import accuracy_score, classification_report, auc, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split 
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.metrics import (brier_score_loss, precision_score, recall_score,
                             f1_score)
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
%matplotlib inline

  'Matplotlib is building the font cache using fc-list. '


In [2]:
# Notebook-wide pandas settings: show very wide / very long frames in full and
# silence the chained-assignment (SettingWithCopy) warning.
pd.set_option("display.max_columns", 1000)
pd.set_option("mode.chained_assignment", None)
pd.set_option("display.max_rows", 100000)

In [3]:
# Load the string-pair data set; row 0 of the CSV supplies the column names.
df = pd.read_csv(filepath_or_buffer='full_matching_data_set.csv', header=0)

In [4]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,str_1,str_2,is_correct,fuzzy_similarity,DL_similarity,jaccard_similarity,ngram_compare,phonetic_distance
0,#THROWBACKTHURSDAY,THROWBACKTHURSDAY VIDEOS,1,0.85,0.73913,0.5,0.607143,0.909091
1,10 THINGS HATE ABOUT YO,10 THINGS HATE ABOUT YOU,1,0.98,0.956522,0.666667,0.821429,1.0
2,10 THINGS HATE ABOUT YOU,10 THINGS HATE ABOUT YOU,1,1.0,1.0,1.0,1.0,1.0
3,10 THINGS HATE ABOUT YOU,WILD THINGS,0,0.36,0.173913,0.166667,0.147059,0.576984
4,100 GREATEST KIDS STARS,THE GREATEST,0,0.55,0.428571,0.2,0.21875,0.619048


In [5]:
# Split the frame into model inputs (the five similarity scores) and the target label.
X_data_df = df.loc[:, ['fuzzy_similarity', 'DL_similarity', 'jaccard_similarity',
                       'ngram_compare', 'phonetic_distance']]
y_data_df = df.loc[:, 'is_correct']

In [6]:
# Convert to plain NumPy arrays for scikit-learn / xgboost.
# `.values` replaces `as_matrix()`, which was deprecated in pandas 0.23 and removed
# in pandas 1.0; `.values` returns the same ndarray on old and new pandas alike.
X_data_mat = X_data_df.values
y_data_mat = y_data_df.values

In [8]:
# Train/test split: hold out 10% of the data as a final test set for future evaluation.
# The remaining 90% (X_split / y_split) is used for cross-validation.
# random_state is fixed so the hold-out set is identical on every Restart & Run All.
X_split, X_test, y_split, y_test = train_test_split(
    X_data_mat, y_data_mat, test_size=0.1, random_state=42)

## Models without Bagging/Bootstrapping

In [9]:
# Grid of tree depths to search over, to see how tuning max_depth affects the model.
xgb_param_grid = dict(max_depth=[3, 5, 7, 9, 13])

In [10]:
# Carve a 10% validation set out of the training split (NOT the held-out test set) and
# use it as the early-stopping criterion, so the test set stays untouched.
# random_state is fixed so reruns reproduce the same split.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_split, y_split, test_size=0.1, random_state=42)
# xgboost-style eval set: a list of (features, labels) pairs monitored during training.
watchlist = [(X_valid, y_valid)]

In [13]:
# Keyword arguments for the estimator's fit(): monitor the validation set and stop
# once the metric has not improved for 10 rounds.
fit_params = dict(eval_set=watchlist, early_stopping_rounds=10)

In [14]:
# Base estimator with default settings; the tree count is only a very high ceiling
# (one million) since fit_params requests early stopping.
xgb_classifier = XGBClassifier(n_estimators=10 ** 6)

In [15]:
xgb_class = GridSearchCV(cv=5, estimator=xgb_classifier, param_grid=xgb_param_grid, fit_params=fit_params,
                        verbose=10)

In [16]:
xgb_class.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] max_depth=3 .....................................................
[0]	validation_0-error:0.138996
Will train until validation_0-error hasn't improved in 10 rounds.
[1]	validation_0-error:0.137066
[2]	validation_0-error:0.133205
[3]	validation_0-error:0.133205
[4]	validation_0-error:0.133205
[5]	validation_0-error:0.133205
[6]	validation_0-error:0.137066
[7]	validation_0-error:0.135135
[8]	validation_0-error:0.138996
[9]	validation_0-error:0.137066
[10]	validation_0-error:0.135135
[11]	validation_0-error:0.129344
[12]	validation_0-error:0.135135
[13]	validation_0-error:0.131274
[14]	validation_0-error:0.129344
[15]	validation_0-error:0.111969
[16]	validation_0-error:0.1139
[17]	validation_0-error:0.1139
[18]	validation_0-error:0.111969
[19]	validation_0-error:0.110039
[20]	validation_0-error:0.111969
[21]	validation_0-error:0.110039
[22]	validation_0-error:0.108108
[23]	validation_0-error:0.108108
[24]	validation_0-error:0



[27]	validation_0-error:0.106178
[28]	validation_0-error:0.108108
[29]	validation_0-error:0.108108
[30]	validation_0-error:0.108108
[31]	validation_0-error:0.106178
[32]	validation_0-error:0.106178
[33]	validation_0-error:0.108108
[34]	validation_0-error:0.108108
Stopping. Best iteration:
[24]	validation_0-error:0.104247

[CV] ................ max_depth=3, score=0.887580299786, total=   0.2s
[CV] max_depth=3 .....................................................
[0]	validation_0-merror:0.144788
Will train until validation_0-merror hasn't improved in 10 rounds.
[1]	validation_0-merror:0.142857
[2]	validation_0-merror:0.144788
[3]	validation_0-merror:0.140927
[4]	validation_0-merror:0.131274
[5]	validation_0-merror:0.127413
[6]	validation_0-merror:0.127413
[7]	validation_0-merror:0.121622
[8]	validation_0-merror:0.119691
[9]	validation_0-merror:0.121622
[10]	validation_0-merror:0.121622
[11]	validation_0-merror:0.121622
[12]	validation_0-merror:0.119691
[13]	validation_0-merror:0.117761
[

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[18]	validation_0-merror:0.102317
[19]	validation_0-merror:0.102317
[20]	validation_0-merror:0.102317
[21]	validation_0-merror:0.100386
[22]	validation_0-merror:0.098456
[23]	validation_0-merror:0.098456
[24]	validation_0-merror:0.096525
[25]	validation_0-merror:0.100386
[26]	validation_0-merror:0.100386
[27]	validation_0-merror:0.102317
[28]	validation_0-merror:0.102317
[29]	validation_0-merror:0.102317
[30]	validation_0-merror:0.102317
[31]	validation_0-merror:0.102317
[32]	validation_0-merror:0.102317
[33]	validation_0-merror:0.104247
[34]	validation_0-merror:0.102317
Stopping. Best iteration:
[24]	validation_0-merror:0.096525

[CV] ................ max_depth=3, score=0.890675241158, total=   0.3s
[CV] max_depth=3 .....................................................
[0]	validation_0-merror:0.138996
Will train until validation_0-merror hasn't improved in 10 rounds.
[1]	validation_0-merror:0.138996
[2]	validation_0-merror:0.138996
[3]	validation_0-merror:0.133205
[4]	validation_0-mer

[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.6s remaining:    0.0s


[20]	validation_0-merror:0.100386
[21]	validation_0-merror:0.104247
[22]	validation_0-merror:0.102317
[23]	validation_0-merror:0.104247
[24]	validation_0-merror:0.106178
[25]	validation_0-merror:0.104247
[26]	validation_0-merror:0.106178
[27]	validation_0-merror:0.106178
[28]	validation_0-merror:0.106178
[29]	validation_0-merror:0.106178
[30]	validation_0-merror:0.104247
Stopping. Best iteration:
[20]	validation_0-merror:0.100386

[CV] ................ max_depth=3, score=0.880901287554, total=   0.3s
[CV] max_depth=3 .....................................................
[0]	validation_0-merror:0.144788
Will train until validation_0-merror hasn't improved in 10 rounds.
[1]	validation_0-merror:0.138996
[2]	validation_0-merror:0.129344
[3]	validation_0-merror:0.121622
[4]	validation_0-merror:0.123552
[5]	validation_0-merror:0.121622
[6]	validation_0-merror:0.121622
[7]	validation_0-merror:0.121622
[8]	validation_0-merror:0.121622
[9]	validation_0-merror:0.117761
[10]	validation_0-merror:0

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.9s remaining:    0.0s


[20]	validation_0-merror:0.104247
[21]	validation_0-merror:0.102317
[22]	validation_0-merror:0.102317
[23]	validation_0-merror:0.102317
[24]	validation_0-merror:0.102317
[25]	validation_0-merror:0.100386
[26]	validation_0-merror:0.100386
[27]	validation_0-merror:0.102317
[28]	validation_0-merror:0.102317
[29]	validation_0-merror:0.102317
[30]	validation_0-merror:0.102317
[31]	validation_0-merror:0.102317
[32]	validation_0-merror:0.102317
[33]	validation_0-merror:0.102317
[34]	validation_0-merror:0.100386
[35]	validation_0-merror:0.100386
Stopping. Best iteration:
[25]	validation_0-merror:0.100386

[CV] ................ max_depth=3, score=0.892703862661, total=   0.3s
[CV] max_depth=3 .....................................................
[0]	validation_0-merror:0.137066
Will train until validation_0-merror hasn't improved in 10 rounds.
[1]	validation_0-merror:0.142857
[2]	validation_0-merror:0.140927
[3]	validation_0-merror:0.140927
[4]	validation_0-merror:0.135135
[5]	validation_0-merr

[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    1.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.5s remaining:    0.0s


[0]	validation_0-error:0.108108
Will train until validation_0-error hasn't improved in 10 rounds.
[1]	validation_0-error:0.106178
[2]	validation_0-error:0.1139
[3]	validation_0-error:0.111969
[4]	validation_0-error:0.108108
[5]	validation_0-error:0.108108
[6]	validation_0-error:0.110039
[7]	validation_0-error:0.108108
[8]	validation_0-error:0.108108
[9]	validation_0-error:0.106178
[10]	validation_0-error:0.106178
[11]	validation_0-error:0.108108
Stopping. Best iteration:
[1]	validation_0-error:0.106178

[CV] ................. max_depth=5, score=0.88329764454, total=   0.1s
[CV] max_depth=5 .....................................................
[0]	validation_0-merror:0.119691
Will train until validation_0-merror hasn't improved in 10 rounds.
[1]	validation_0-merror:0.117761
[2]	validation_0-merror:0.117761
[3]	validation_0-merror:0.119691
[4]	validation_0-merror:0.1139
[5]	validation_0-merror:0.111969
[6]	validation_0-merror:0.110039
[7]	validation_0-merror:0.104247
[8]	validation_0-mer

[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    1.7s remaining:    0.0s


[15]	validation_0-merror:0.102317
[16]	validation_0-merror:0.102317
[17]	validation_0-merror:0.100386
[18]	validation_0-merror:0.102317
[19]	validation_0-merror:0.102317
[20]	validation_0-merror:0.102317
[21]	validation_0-merror:0.102317
[22]	validation_0-merror:0.100386
[23]	validation_0-merror:0.098456
[24]	validation_0-merror:0.098456
[25]	validation_0-merror:0.098456
[26]	validation_0-merror:0.098456
[27]	validation_0-merror:0.098456
[28]	validation_0-merror:0.098456
[29]	validation_0-merror:0.098456
[30]	validation_0-merror:0.100386
[31]	validation_0-merror:0.100386
[32]	validation_0-merror:0.100386
[33]	validation_0-merror:0.100386
Stopping. Best iteration:
[23]	validation_0-merror:0.098456

[CV] ................ max_depth=5, score=0.893890675241, total=   0.4s
[CV] max_depth=5 .....................................................
[0]	validation_0-merror:0.121622
Will train until validation_0-merror hasn't improved in 10 rounds.
[1]	validation_0-merror:0.108108
[2]	validation_0-m

[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    2.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    2.3s remaining:    0.0s


[CV] max_depth=5 .....................................................
[0]	validation_0-merror:0.1139
Will train until validation_0-merror hasn't improved in 10 rounds.
[1]	validation_0-merror:0.106178
[2]	validation_0-merror:0.102317
[3]	validation_0-merror:0.098456
[4]	validation_0-merror:0.100386
[5]	validation_0-merror:0.096525
[6]	validation_0-merror:0.102317
[7]	validation_0-merror:0.100386
[8]	validation_0-merror:0.104247
[9]	validation_0-merror:0.102317
[10]	validation_0-merror:0.104247
[11]	validation_0-merror:0.106178
[12]	validation_0-merror:0.104247
[13]	validation_0-merror:0.100386
[14]	validation_0-merror:0.096525
[15]	validation_0-merror:0.094595
[16]	validation_0-merror:0.092664
[17]	validation_0-merror:0.094595
[18]	validation_0-merror:0.094595
[19]	validation_0-merror:0.096525
[20]	validation_0-merror:0.096525
[21]	validation_0-merror:0.096525
[22]	validation_0-merror:0.096525
[23]	validation_0-merror:0.096525
[24]	validation_0-merror:0.096525
[25]	validation_0-merror

[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    2.7s remaining:    0.0s


[16]	validation_0-merror:0.090734
[17]	validation_0-merror:0.090734
[18]	validation_0-merror:0.086873
[19]	validation_0-merror:0.088803
[20]	validation_0-merror:0.088803
[21]	validation_0-merror:0.090734
[22]	validation_0-merror:0.090734
[23]	validation_0-merror:0.090734
[24]	validation_0-merror:0.090734
[25]	validation_0-merror:0.090734
[26]	validation_0-merror:0.090734
[27]	validation_0-merror:0.090734
[28]	validation_0-merror:0.090734
Stopping. Best iteration:
[18]	validation_0-merror:0.086873

[CV] ................ max_depth=5, score=0.887218045113, total=   0.4s
[CV] max_depth=7 .....................................................
[0]	validation_0-error:0.127413
Will train until validation_0-error hasn't improved in 10 rounds.
[1]	validation_0-error:0.125483
[2]	validation_0-error:0.117761
[3]	validation_0-error:0.117761
[4]	validation_0-error:0.111969
[5]	validation_0-error:0.110039
[6]	validation_0-error:0.110039
[7]	validation_0-error:0.110039
[8]	validation_0-error:0.110039
[

[4]	validation_0-merror:0.117761
[5]	validation_0-merror:0.1139
[6]	validation_0-merror:0.1139
[7]	validation_0-merror:0.110039
[8]	validation_0-merror:0.104247
[9]	validation_0-merror:0.104247
[10]	validation_0-merror:0.104247
[11]	validation_0-merror:0.102317
[12]	validation_0-merror:0.110039
[13]	validation_0-merror:0.106178
[14]	validation_0-merror:0.106178
[15]	validation_0-merror:0.108108
[16]	validation_0-merror:0.111969
[17]	validation_0-merror:0.1139
[18]	validation_0-merror:0.110039
[19]	validation_0-merror:0.108108
[20]	validation_0-merror:0.108108
[21]	validation_0-merror:0.110039
Stopping. Best iteration:
[11]	validation_0-merror:0.102317

[CV] ................ max_depth=9, score=0.901287553648, total=   0.4s
[CV] max_depth=9 .....................................................
[0]	validation_0-merror:0.11583
Will train until validation_0-merror hasn't improved in 10 rounds.
[1]	validation_0-merror:0.117761
[2]	validation_0-merror:0.108108
[3]	validation_0-merror:0.108108

[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    9.2s finished


[8]	validation_0-merror:0.108108
[9]	validation_0-merror:0.104247
[10]	validation_0-merror:0.102317
[11]	validation_0-merror:0.102317
[12]	validation_0-merror:0.100386
[13]	validation_0-merror:0.100386
[14]	validation_0-merror:0.102317
[15]	validation_0-merror:0.104247
[16]	validation_0-merror:0.106178
[17]	validation_0-merror:0.106178
[18]	validation_0-merror:0.108108
[19]	validation_0-merror:0.110039
[20]	validation_0-merror:0.108108
[21]	validation_0-merror:0.108108
[22]	validation_0-merror:0.104247
Stopping. Best iteration:
[12]	validation_0-merror:0.100386



GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=1000000, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={'eval_set': [(array([[ 0.67   ,  0.44444, ...,  0.19355,  0.63593],
       [ 0.5    ,  0.35185, ...,  0.2027 ,  0.72295],
       ...,
       [ 0.7    ,  0.44   , ...,  0.38889,  0.70912],
       [ 0.6    ,  0.5    , ...,  0.29032,  0.7381 ]]), array([0, 0, ..., 1, 0]))], 'early_stopping_rounds': 10},
       iid=True, n_jobs=1, param_grid={'max_depth': [3, 5, 7, 9, 13]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=10)

In [17]:
# Show the parameter setting the grid search selected as best.
xgb_class.best_params_

{'max_depth': 9}

In [None]:
# Interesting... Next step: retrain on the entire split set and evaluate against the hold-out test set.
