Todos
* Recursive Feature Elimination
    * from sklearn.feature_selection import RFE

In [None]:
# Add table of contents

In [34]:
import pandas as pd
import numpy as np

# Tools
from collections import Counter
import pickle
# import joblib
# svc_model = joblib.load('../models/SVC_20k.pkl')

# Metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_curve, precision_score, recall_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score, fbeta_score
from sklearn.metrics import roc_auc_score, roc_curve

# Model Prep & Selection
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Visualizations
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
plt.style.use('ggplot')

# Train and Test Splitting

In [2]:
# Load dataframe
df = pd.read_pickle('../data/df_cyclical_features_20k.pkl')

# # TEMPORARY
# df = df.sample(5000)

df.head()

Unnamed: 0,case_id,is_duplicate,opened,closed,updated,status_notes,responsible_agency,category,request_type,request_details,...,source,opened_year,opened_month_sin,opened_month_cos,opened_week_sin,opened_week_cos,opened_day_sin,opened_day_cos,opened_hour_sin,opened_hour_cos
619,11879423,0,2019-12-30 20:54:00,2020-01-03 11:59:19,2020-01-03 11:59:19,Agencies responded to request and no encampmen...,Duplicate Case Hold Queue,Encampments,Encampment Reports,Encampment Cleanup,...,Mobile/Open311,2019,-2.449294e-16,1.0,0.120537,0.992709,0.0,1.0,-0.8660254,0.5
620,11877576,0,2019-12-30 13:31:54,2019-12-31 13:42:03,2019-12-31 13:42:03,Case Resolved - SES Graffiti Crew - Remove Si...,DPW Ops Queue,Illegal Postings,Illegal Postings - Affixed_Improperly,Affixed Improperly,...,Mobile/Open311,2019,-2.449294e-16,1.0,0.120537,0.992709,0.0,1.0,-0.258819,-0.965926
621,11877532,0,2019-12-30 13:26:00,2019-12-30 14:03:00,2019-12-30 14:03:00,Case Resolved,DPW Ops Queue,Street and Sidewalk Cleaning,General Cleaning,Other Loose Garbage,...,Phone,2019,-2.449294e-16,1.0,0.120537,0.992709,0.0,1.0,-0.258819,-0.965926
622,11877496,0,2019-12-30 13:22:00,2019-12-30 18:53:45,2019-12-30 18:53:45,Case Resolved - WASTE NOT FOUND ...,Recology_Abandoned,Street and Sidewalk Cleaning,Bulky Items,Refrigerator,...,Phone,2019,-2.449294e-16,1.0,0.120537,0.992709,0.0,1.0,-0.258819,-0.965926
623,11877234,0,2019-12-30 12:45:38,2020-01-02 07:17:00,2020-01-02 07:17:00,Case Resolved,DPW Ops Queue,Street and Sidewalk Cleaning,General Cleaning,Other Loose Garbage,...,Mobile/Open311,2019,-2.449294e-16,1.0,0.120537,0.992709,0.0,1.0,1.224647e-16,-1.0


In [3]:
# Columns to exclude
exclude_cols = [
    'is_duplicate', # Target variable
    'case_id',
    'opened', # Needs Feature Eng
    'closed', # Needs Feature Eng
    'updated',
    'responsible_agency', # Needs NLP
    'status_notes', # Needs NLP
    'request_type', # Needs NLP
    'request_details', # Needs NLP
    'address', # Needs NLP
#     'street', # Convert to 'category' type to get dummies
    'point'
]

# # Scale data using MinMax scaler
# # No need to standardize as all features are categorical (maybe scale lat/long....)
# scaler = MinMaxScaler()

# Predictor variables
x_variables_df = df.drop(columns=exclude_cols, axis=0, inplace=False)

# Get dummies for categorical variables
X = pd.get_dummies(x_variables_df, drop_first=True)

# Target variable
y = df['is_duplicate']

# Split train and test
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2, 
                                                    random_state=2020, 
                                                    stratify=y,  # Stratify to keep same class ratios
                                                    shuffle=True # Shuffle data since it's ordered chronologically
                                                   )
X_train.head()

Unnamed: 0,latitude,longitude,opened_month_sin,opened_month_cos,opened_week_sin,opened_week_cos,opened_day_sin,opened_day_cos,opened_hour_sin,opened_hour_cos,...,opened_year_2010,opened_year_2011,opened_year_2012,opened_year_2013,opened_year_2014,opened_year_2015,opened_year_2016,opened_year_2017,opened_year_2018,opened_year_2019
19476,37.784027,-122.409607,0.866025,-0.5,0.970942,-0.2393157,0.0,1.0,-0.258819,-0.965926,...,0,0,0,0,0,0,0,0,0,0
4602,37.770302,-122.450912,-0.866025,0.5,-0.992709,0.1205367,0.0,1.0,0.258819,-0.965926,...,0,0,0,0,0,0,0,0,1,0
10155,37.720915,-122.435768,-0.866025,-0.5,-0.822984,-0.5680647,0.433884,-0.900969,0.8660254,-0.5,...,0,0,0,0,0,0,1,0,0,0
7455,37.764227,-122.410453,-1.0,-1.83697e-16,-1.0,-1.83697e-16,0.974928,-0.222521,1.224647e-16,-1.0,...,0,0,0,0,0,0,0,1,0,0
14313,37.747768,-122.403488,0.5,-0.8660254,0.748511,-0.6631227,0.0,1.0,-0.258819,-0.965926,...,0,0,0,0,1,0,0,0,0,0


In [4]:
print('df\t', df.shape)
print('X_train\t', X_train.shape)
print('X_test\t', X_test.shape)
print('y_train\t', y_train.shape)
print('y_test\t', y_test.shape)

df	 (16786, 28)
X_train	 (13428, 1658)
X_test	 (3358, 1658)
y_train	 (13428,)
y_test	 (3358,)


## Class Balancing

In [5]:
# Target variable
target_count = df['is_duplicate'].value_counts()

# Print class balance
print(f'Class 0: {target_count[0]}')
print(f'Class 1: {target_count[1]}')
print(f'Proportion: {round(target_count[0] / target_count[1], 2)} : 1')
print(f'Percentage of Majority Class: {round(target_count[0] / sum(target_count), 3)*100}')

Class 0: 15824
Class 1: 962
Proportion: 16.45 : 1
Percentage of Majority Class: 94.3


In [6]:
# A ratio of .5 is saying that 50% of my data is simulated
# Trevor noted that .2 would be good but let's try different ratios
smote = SMOTE(random_state=2020)
X_train_smote, y_train_smote = smote.fit_sample(X_train, y_train)
Counter(y_train_smote)
# pd.Series(y_train_smote).value_counts().plot.bar()

Counter({0: 12658, 1: 12658})

# Modeling

Objective: Maximize F1 Score as both recall and precision are equally important and the classes are imbalanced

In [7]:
# Create list of model and performance
model_performance = []
models = []

User RandomizedSearchCV instead:
https://blog.usejournal.com/a-comparison-of-grid-search-and-randomized-search-using-scikit-learn-29823179bc85


# Baseline Models

In [8]:
#Dummy Classifier
from sklearn.dummy import DummyClassifier
clf = DummyClassifier(strategy= 'most_frequent').fit(X_train_smote, y_train_smote)
y_pred = clf.predict(X_test)

#Distribution of y test
print('y actual\t:', Counter(y_test))

#Distribution of y predicted
print('y predicted\t:', Counter(y_pred))

print(f'Precision : {round(precision_score(y_test, y_pred), 4)}')
print(f'Recall    : {round(recall_score(y_test, y_pred), 4)}')
print(f'F-score   : {round(f1_score(y_test, y_pred), 4)}')

# Confusion matrix
print('\nConfusion Matrix\n' + str(confusion_matrix(y_test, y_pred)))

y actual	: Counter({0: 3166, 1: 192})
y predicted	: Counter({0: 3358})
Precision : 0.0
Recall    : 0.0
F-score   : 0.0

Confusion Matrix
[[3166    0]
 [ 192    0]]


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [None]:
# base_models = []

# # Instantiate the models
# base_models.append(('LogisticRegression', LogisticRegression(solver='saga')))
# base_models.append(('SVC', SVC(gamma='auto')))
# base_models.append(('KNeighbors', KNeighborsClassifier()))
# base_models.append(('RandomForest', RandomForestClassifier(n_estimators=10)))
# base_models.append(('XGBoost', XGBClassifier()))

# cv_results = []
# names = []

# # Cross Validate - 5 fold
# for name, model in base_models:
#     names.append(name)
#     cv_results.append(np.round_(cross_val_score(model, X_train_smote, y_train_smote, cv=5, scoring='f1'), 3))
#     with open(f'../models/{name}_20k.pkl', 'wb') as f:
#         pickle.dump(model, f)

# for i in range(len(names)):
#     print(names[i], round(cv_results[i].mean(), 3))

## Logistic Regression

In [9]:
# Grid searching key hyperparameters for logistic regression

# Instantiate model and SMOTE
lg_model = LogisticRegression()
smote = SMOTE(random_state=2020)

# Construct pipeline
steps = [('smt', smote), ('lgr', lg_model)]
pipeline = Pipeline(steps)

# Define parameter grid values to be searched
param_grid = {
    'smt__random_state': [2020],
    'lgr__solver': ['saga', 'liblinear'],
    'lgr__penalty': ['l1', 'l2'],
    'lgr__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}

# Use stratify version of k-fold to keep class imbalance ratio
k_fold = StratifiedKFold(n_splits=3, shuffle = True, random_state=2020)

# Cross Validation
lg_grid = RandomizedSearchCV(pipeline, param_grid=param_grid, cv=k_fold, n_jobs=-1, return_train_score=True, scoring='roc_auc')

# Train with balanced classes
grid_result = lg_grid.fit(X_train_smote, y_train_smote) # Should I use X_train, y_train here?

# Summarize results
print(f'Best Score: {round(grid_result.best_score_, 3)}\nParams: {grid_result.best_params_}\n')

mean_train = grid_result.cv_results_['mean_train_score']
mean_test = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']
for mean_tr, mean_ts, param in zip(mean_train, mean_test, params):
    print(f'Train {round(mean_tr, 3)}\tTest {round(mean_ts, 3)}\tParams: {param}')
    
# # Examine the best model
# print(lg_grid.best_score_)
# print(lg_grid.best_params_)

Best Score: 0.896
Params: {'lgr__C': 1000, 'lgr__penalty': 'l1', 'lgr__solver': 'liblinear', 'smt__random_state': 2020}

Train 0.667	Test 0.666	Params: {'lgr__C': 0.001, 'lgr__penalty': 'l1', 'lgr__solver': 'saga', 'smt__random_state': 2020}
Train 0.667	Test 0.666	Params: {'lgr__C': 0.001, 'lgr__penalty': 'l1', 'lgr__solver': 'liblinear', 'smt__random_state': 2020}
Train 0.742	Test 0.74	Params: {'lgr__C': 0.001, 'lgr__penalty': 'l2', 'lgr__solver': 'saga', 'smt__random_state': 2020}
Train 0.742	Test 0.74	Params: {'lgr__C': 0.001, 'lgr__penalty': 'l2', 'lgr__solver': 'liblinear', 'smt__random_state': 2020}
Train 0.731	Test 0.73	Params: {'lgr__C': 0.01, 'lgr__penalty': 'l1', 'lgr__solver': 'saga', 'smt__random_state': 2020}
Train 0.732	Test 0.731	Params: {'lgr__C': 0.01, 'lgr__penalty': 'l1', 'lgr__solver': 'liblinear', 'smt__random_state': 2020}
Train 0.778	Test 0.774	Params: {'lgr__C': 0.01, 'lgr__penalty': 'l2', 'lgr__solver': 'saga', 'smt__random_state': 2020}
Train 0.785	Test 0.78	P



In [10]:
# Instantiate model with best paramaters
lg_best = LogisticRegression(C=1000, penalty='l1', solver='liblinear', random_state=2020)

# Train with balanced classes
lg_best.fit(X_train_smote, y_train_smote)

# Get predictions
lg_best_preds = lg_best.predict(X_test)
# lg_best_y_score = lg_best.predict_proba(X_test)



In [18]:
# Get ROC AUC Score, precision, recall, f1-score
lg_best_roc_auc_test  = round(roc_auc_score(y_test, lg_best_preds), 3)
print(f'ROC_AUC - Test\t{lg_best_roc_auc_test}')

precision, recall, fscore, support = precision_recall_fscore_support(y_test, lg_best_preds, average='macro')
print(f'Precision\t{round(precision, 3)}')
print(f'Recall\t\t{round(recall, 3)}')
print(f'F-score\t\t{round(fscore, 3)}')

# Confusion matrix
print('\nConfusion Matrix\n' + str(confusion_matrix(y_test, lg_best_preds)))

ROC_AUC - Test	0.62
Precision	0.534
Recall		0.62
F-score		0.513

Confusion Matrix
[[2409  757]
 [ 100   92]]


>Why are my scores high in the CV but much lower after using best params?
>* The FP Rate is really high...

In [20]:
# Add model and accuracy dict to list
model_performance.append(dict([
    ('Model', 'Logistic Regression'),
#     ('Train ROC AUC', lg_best_roc_auc_train),
    ('Test ROC AUC', lg_best_roc_auc_test),
    ('Precision', round(precision, 3)),
    ('Recall', round(recall, 3)),
    ('F1', round(fscore, 3))
     ]))

# Add model to list
models.append('Logistic Regression')

# KNN

In [19]:
# Grid searching key hyperparameters for KNN

# Instantiate model and SMOTE
knn_model = KNeighborsClassifier()
smote = SMOTE(random_state=2020)

# Construct pipeline
steps = [('smt', smote), ('knn', knn_model)]
pipeline = Pipeline(steps)

# Define parameter grid values to be searched
param_grid = {
    'smt__random_state': [2020],
    'knn__n_neighbors' : [3, 5, 7]
}

# Use stratify version of k-fold to keep class imbalance ratio
k_fold = StratifiedKFold(n_splits=3, shuffle = True, random_state=2020)

# Cross Validation
knn_grid = RandomizedSearchCV(pipeline, param_grid=param_grid, cv=k_fold, n_jobs=-1, return_train_score=True, scoring='roc_auc')

In [21]:
# Train with balanced classes
grid_result = knn_grid.fit(X_train_smote, y_train_smote) # Should I use X_train, y_train here?

# Summarize results
print(f'Best Score: {round(grid_result.best_score_, 3)}\nParams: {grid_result.best_params_}\n')

mean_train = grid_result.cv_results_['mean_train_score']
mean_test = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']
for mean_tr, mean_ts, param in zip(mean_train, mean_test, params):
    print(f'Train {round(mean_tr, 3)}\tTest {round(mean_ts, 3)}\tParams: {param}')
    
# # Examine the best model
# print(knn_grid.best_score_)
# print(knn_grid.best_params_)

Best Score: 0.95
Params: {'knn__n_neighbors': 7, 'smt__random_state': 2020}

Train 1.0	Test 0.925	Params: {'knn__n_neighbors': 3, 'smt__random_state': 2020}
Train 1.0	Test 0.94	Params: {'knn__n_neighbors': 5, 'smt__random_state': 2020}
Train 1.0	Test 0.95	Params: {'knn__n_neighbors': 7, 'smt__random_state': 2020}


In [22]:
# Instantiate model with best paramaters
knn_best = KNeighborsClassifier(n_neighbors=7)

# Train with balanced classes
knn_best.fit(X_train_smote, y_train_smote)

# Get predictions
knn_best_preds = knn_best.predict(X_test)
# knn_best_y_score = knn_best.predict_proba(X_test)

In [23]:
# Get ROC AUC Score, precision, recall, f1-score
knn_best_roc_auc_test  = round(roc_auc_score(y_test, knn_best_preds), 3)
print(f'ROC_AUC - Test\t{knn_best_roc_auc_test}')

precision, recall, fscore, support = precision_recall_fscore_support(y_test, knn_best_preds, average='macro')
print(f'Precision\t{round(precision, 3)}')
print(f'Recall\t\t{round(recall, 3)}')
print(f'F-score\t\t{round(fscore, 3)}')

# Confusion matrix
print('\nConfusion Matrix\n' + str(confusion_matrix(y_test, knn_best_preds)))

ROC_AUC - Test	0.596
Precision	0.522
Recall		0.596
F-score		0.448

Confusion Matrix
[[1946 1220]
 [  81  111]]


In [None]:
# Add model and accuracy dict to list
model_performaXGBClassifier.append(dict([
    ('Model', 'KNN'),
#     ('Train ROC AUC', knn_best_roc_auc_train),
    ('Test ROC AUC', knn_best_roc_auc_test),
    ('Precision', round(precision, 3)),
    ('Recall', round(recall, 3)),
    ('F1', round(fscore, 3))
     ]))

# Add model to list
models.append('KNN')

## Random Forest

In [24]:
# Grid searching key hyperparameters for Random Forest

# Instantiate model and SMOTE
rf_model = RandomForestClassifier()
smote = SMOTE(random_state=2020)

# Construct pipeline
steps = [('smt', smote), ('rfc', rf_model)]
pipeline = Pipeline(steps)

# Define parameter grid values to be searched
param_grid = {
    'smt__random_state': [2020],
    'rfc__n_estimators': [50, 100, 150, 200, 1000],
    'rfc__max_depth' : [2, 3, 4],
#     'rfc__max_features' : [5, 10, 15],
#     'rfc__criterion' : ['gini', 'entropy'],
    'rfc__random_state' :[2020]
}

# Use stratify version of k-fold to keep class imbalance ratio
k_fold = StratifiedKFold(n_splits=3, shuffle = True, random_state=2020)

# Cross Validation
rf_grid = RandomizedSearchCV(pipeline, param_grid=param_grid, cv=k_fold, n_jobs=-1, return_train_score=True, scoring='roc_auc') # Try accuracy or F1

# Train with balanced classes
grid_result = rf_grid.fit(X_train_smote, y_train_smote) # Should I use X_train, y_train here?

# Summarize results
print(f'Best Score: {round(grid_result.best_score_, 3)}\nParams: {grid_result.best_params_}\n')

mean_train = grid_result.cv_results_['mean_train_score']
mean_test = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']
for mean_tr, mean_ts, param in zip(mean_train, mean_test, params):
    print(f'Train {round(mean_tr, 3)}\tTest {round(mean_ts, 3)}\tParams: {param}')
    
# # Examine the best model
# print(rf_grid.best_score_)
# print(rf_grid.best_params_)

In [30]:
# Instantiate model with best paramaters
rf_best = RandomForestClassifier(max_depth=4, n_estimators=150, random_state=2020)

# Train with balanced classes
rf_best.fit(X_train_smote, y_train_smote)

# Get predictions
rf_best_preds = knn_best.predict(X_test)
# knn_best_y_score = rf_best.predict_proba(X_test)

In [31]:
# Get ROC AUC Score, precision, recall, f1-score
rf_best_roc_auc_test  = round(roc_auc_score(y_test, rf_best_preds), 3)
print(f'ROC_AUC - Test\t{rf_best_roc_auc_test}')

precision, recall, fscore, support = precision_recall_fscore_support(y_test, rf_best_preds, average='macro')
print(f'Precision\t{round(precision, 3)}')
print(f'Recall\t\t{round(recall, 3)}')
print(f'F-score\t\t{round(fscore, 3)}')

# Confusion matrix
print('\nConfusion Matrix\n' + str(confusion_matrix(y_test, rf_best_preds)))

ROC_AUC - Test	0.596
Precision	0.522
Recall		0.596
F-score		0.448

Confusion Matrix
[[1946 1220]
 [  81  111]]


## XGBoost

In [None]:
# Grid searching key hyperparameters for XGBoost

# Instantiate model and SMOTE
xgb_model = XGBClassifier()
smote = SMOTE(random_state=2020)

# Construct pipeline
steps = [('smt', smote), ('xgb', xgb_model)]
pipeline = Pipeline(steps)

# Define parameter grid values to be searched
param_grid = {
    'smt__random_state': [2020],
    'xgb__n_estimators': [100, 250, 500, 1000], 
    'xgb__max_depth': [3, 4, 5], 
    'xgb__learning_rate': [0.001, 0.01, 0.1]
}

# Use stratify version of k-fold to keep class imbalance ratio
k_fold = StratifiedKFold(n_splits=3, shuffle = True, random_state=2020)

# Cross Validation
xgb_grid = RandomizedSearchCV(pipeline, param_grid=param_grid, cv=k_fold, n_jobs=-1, return_train_score=True, scoring='roc_auc')

# Train with balanced classes
grid_result = xgb_grid.fit(X_train_smote, y_train_smote) # Should I use X_train, y_train here?

# Summarize results
print(f'Best Score: {round(grid_result.best_score_, 3)}\nParams: {grid_result.best_params_}\n')

mean_train = grid_result.cv_results_['mean_train_score']
mean_test = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']
for mean_tr, mean_ts, param in zip(mean_train, mean_test, params):
    print(f'Train {round(mean_tr, 3)}\tTest {round(mean_ts, 3)}\tParams: {param}')
    
# # Examine the best model
# print(lg_grid.best_score_)
# print(lg_grid.best_params_)

In [None]:
# Instantiate model with best paramaters
xgb_best = XGB(

# Train with balanced classes
xgb_best.fit(X_train_smote, y_train_smote)

# Get predictions
xgb_best_preds = xgb_best.predict(X_test)
# xgb_best_y_score = xgb_best.predict_proba(X_test)

In [None]:
# Get ROC AUC Score, precision, recall, f1-score
xgb_best_roc_auc_test  = round(roc_auc_score(y_test, xgb_best_preds), 3)
print(f'ROC_AUC - Test\t{xgb_best_roc_auc_test}')

precision, recall, fscore, support = precision_recall_fscore_support(y_test, xgb_best_preds, average='macro')
print(f'Precision\t{round(precision, 3)}')
print(f'Recall\t\t{round(recall, 3)}')
print(f'F-score\t\t{round(fscore, 3)}')

# Confusion matrix
print('\nConfusion Matrix\n' + str(confusion_matrix(y_test, xgb_best_preds)))

In [None]:
# Add model and accuracy dict to list
model_performance.append(dict([
    ('Model', 'XGBoost'),
#     ('Train ROC AUC', xgb_best_roc_auc_train),
    ('Test ROC AUC', xgb_best_roc_auc_test),
    ('Precision', round(precision, 3)),
    ('Recall', round(recall, 3)),
    ('F1', round(fscore, 3))
     ]))

# Add model to list
models.append('XGBoost')

## Confusion Matrix for Test Dataset

In [None]:
# # Print confusion matrix for XGBoost
# xgb_confusion = confusion_matrix(y_test, test_pred_smote)

# plt.figure(dpi=125)
# sns.heatmap(xgb_confusion, annot=True, fmt='g', square=True, cbar=False,
#             xticklabels=['no duplicate', 'is duplicate'],
#             yticklabels=['no duplicate', 'is duplicate'])

# plt.title('Confusion Matrix - Test Dataset\nXGBoost', pad=20)
# plt.xlabel('Predicted\n', labelpad=20)
# plt.ylabel('Actual\n', labelpad=20);