In [None]:
cat /proc/cpuinfo

Todos
* Recursive Feature Elimination
    * from sklearn.feature_selection import RFE

In [None]:
client.close()

In [None]:
# Import Dask libraries

from dask import delayed
import joblib
import dask.dataframe as dd
import dask.array as da

from dask.distributed import Client, progress
client = Client(processes=False)
client

In [1]:
import pandas as pd
import numpy as np


# Tools
from collections import Counter
import pickle
# import joblib
# svc_model = joblib.load('../models/SVC_20k.pkl')

# Features
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from xgboost import plot_importance

# Sampling & Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Model Selection
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier

# Metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_curve, precision_score, recall_score
from sklearn.metrics import f1_score, fbeta_score
from sklearn.metrics import roc_auc_score, roc_curve, auc

# Visualizations
from matplotlib import pyplot
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
plt.style.use('ggplot')

Using TensorFlow backend.


In [2]:
# Load dataframe
df = pd.read_pickle('../data/df_pre_model_2018.pkl')

df.head()

Unnamed: 0,case_id,target,opened,closed,updated,status,status_notes,responsible_agency,category,request_type,...,opened_year,opened_month_sin,opened_month_cos,opened_week_sin,opened_week_cos,opened_day_sin,opened_day_cos,opened_hour_sin,opened_hour_cos,resolution_time
811474,9993791,1,2018-12-30 22:40:00,2018-12-30 23:13:06,2018-12-30 23:13:06,Closed,Case Resolved - Officer responded to request u...,Parking Enforcement Dispatch Queue,Parking Enforcement,Parking_on_Sidewalk,...,2018,-2.449294e-16,1.0,-2.449294e-16,1.0,-0.781831,0.62349,-0.5,0.866025,33.1
811481,9993771,1,2018-12-30 22:18:00,2018-12-30 22:19:29,2018-12-30 22:19:29,Closed,Case is Invalid - Contact name and phone numbe...,Parking Enforcement Dispatch Queue,Parking Enforcement,Blocking_Driveway_Cite_Only,...,2018,-2.449294e-16,1.0,-2.449294e-16,1.0,-0.781831,0.62349,-0.5,0.866025,1.483333
811482,9993764,0,2018-12-30 22:14:00,2018-12-30 23:12:50,2018-12-30 23:12:50,Closed,Case Resolved - Officer responded to request u...,Parking Enforcement Dispatch Queue,Parking Enforcement,Blocking_Driveway_Cite_Tow,...,2018,-2.449294e-16,1.0,-2.449294e-16,1.0,-0.781831,0.62349,-0.5,0.866025,58.833333
811494,9993737,0,2018-12-30 21:53:14,2018-12-30 22:18:00,2018-12-30 22:18:00,Closed,Case Transferred - See encampment sr#9993757,311 Supervisor Queue,General Request - PUBLIC WORKS,request_for_service,...,2018,-2.449294e-16,1.0,-2.449294e-16,1.0,-0.781831,0.62349,-0.707107,0.707107,24.766667
811508,9993687,0,2018-12-30 21:16:00,2018-12-30 21:38:46,2018-12-30 21:38:46,Closed,Case Resolved - Police Officer responded to re...,Parking Enforcement Dispatch Queue,Parking Enforcement,Blocking_Driveway_Cite_Only,...,2018,-2.449294e-16,1.0,-2.449294e-16,1.0,-0.781831,0.62349,-0.707107,0.707107,22.766667


In [3]:
# Train and test splitting

# Columns to exclude
exclude_cols = [
    'target', # Target variable
    'case_id',
    'opened', # Feature Eng
    'closed', # Feature Eng
    'updated',
    'status',
    'status_notes', # Needs NLP
    'request_details', # Needs NLP
    'address', # Needs NLP
#     'street', # Convert to 'category' type to get dummies
    'point',
    
    # New items
    'responsible_agency',
    'category', # Need to choose 'category' or 'request_type' NOT BOTH
#     'request_type', # Needs NLP
    'opened_year',
    'opened_month_sin',
    'opened_month_cos',
#     'opened_week_sin',
#     'opened_week_cos',
#     'opened_day_sin',
#     'opened_day_cos',
#     'opened_hour_sin',
#     'opened_hour_cos',
    'police_district',
    'supervisor_district',
#     'latitude',
#     'longitude',
]

# # Scale data using MinMax scaler
# # No need to standardize as all features are categorical (maybe scale lat/long....)
# scaler = MinMaxScaler()

# Predictor variables
X = df.drop(columns=exclude_cols, axis=0, inplace=False)

# Get dummies for categorical variables
X = pd.get_dummies(X, drop_first=True)

# Target variable
y = df['target']

# Split train and test
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2, 
                                                    random_state=2020, 
                                                    stratify=y,  # Stratify to keep same class ratios
                                                    shuffle=True # Shuffle data since it's ordered chronologically
                                                   )
X_train.head()

Unnamed: 0,latitude,longitude,opened_week_sin,opened_week_cos,opened_day_sin,opened_day_cos,opened_hour_sin,opened_hour_cos,resolution_time,request_type_Abandoned Vehicle - Car2door,...,neighborhood_Westwood Park,neighborhood_Yerba Buena Island,source_Integrated Agency,source_Mail,source_Mobile/Open311,source_Other Department,source_Phone,source_Twitter,source_Web,has_media_1
838007,37.7586,-122.4324,-0.239316,0.970942,0.974928,-0.222521,0.866025,-0.5,40.0,0,...,0,0,0,0,1,0,0,0,0,0
1004322,37.7811,-122.4092,-0.935016,-0.354605,0.974928,-0.222521,-0.5,-0.866025,1502.0,0,...,0,0,0,0,0,0,1,0,0,0
877110,37.7815,-122.4383,-0.663123,0.748511,-0.974928,-0.222521,-0.5,-0.866025,82.116667,0,...,0,0,0,0,1,0,0,0,0,1
1052583,37.7806,-122.4058,-0.663123,-0.748511,0.974928,-0.222521,-0.258819,-0.965926,23.6,0,...,0,0,1,0,0,0,0,0,0,0
960828,37.7664,-122.4665,-0.992709,0.120537,0.0,1.0,0.866025,-0.5,15763.166667,0,...,0,0,0,0,0,0,0,0,1,0


# Feature Selection

In [4]:
def select_features(X_train, y_train, X_test):
    '''Returns X_train, X_test, and feature selection function'''
    fs = SelectKBest(score_func=f_classif, k='all')
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs

In [5]:
# Feature selection
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)

 1802 1816 1821 1885 1917 1952 2027 2041 2063 2238 2305 2312 2381 2445
 2478 2608 2629 2785] are constant.
  f = msb / msw


In [6]:
# Feature scores
features_df = pd.DataFrame(data=[X_train.columns, fs.scores_.astype(int)]).transpose()
features_df.rename(columns={0: 'Feature', 1: 'ANOVA F-Value'}, inplace=True)
features_df.sort_values(by='ANOVA F-Value', ascending=False, inplace=True)
features_df.reset_index(drop=True, inplace=True)
features_df

Unnamed: 0,Feature,ANOVA F-Value
0,request_type_Abandoned Vehicles,39869
1,request_type_Bulky Items,20843
2,request_type_Abandoned Vehicle - Car4door,8612
3,request_type_Encampment Reports,6729
4,request_type_Parking_on_Sidewalk,4366
...,...,...
2787,street_Mullen and Peralta Mini Park,-9223372036854775808
2788,street_OLORAN AVE,-9223372036854775808
2789,street_MILAN TER,-9223372036854775808
2790,street_WHITING ST,-9223372036854775808


In [7]:
# Select features above threshold
threshold = 5
best_features_df = features_df[(features_df['ANOVA F-Value'] > threshold)]

In [8]:
# Filter X_train & X_test with selected features
X_train = X_train.filter(items=best_features_df['Feature'])
X_test  = X_test.filter(items=best_features_df['Feature'])

# Clean column names
X_train.columns = X_train.columns.str.strip().str.lower().str.replace(
    ' ', '_').str.replace('(', '').str.replace(')', '')

X_test.columns = X_test.columns.str.strip().str.lower().str.replace(
    ' ', '_').str.replace('(', '').str.replace(')', '')

In [9]:
print('df\t', df.shape)
print('X_train\t', X_train.shape)
print('X_test\t', X_test.shape)
print('y_train\t', y_train.shape)
print('y_test\t', y_test.shape)

df	 (530759, 31)
X_train	 (424607, 772)
X_test	 (106152, 772)
y_train	 (424607,)
y_test	 (106152,)


# Class Balancing

In [10]:
# Target variable
target_count = df['target'].value_counts()

# Print class balance
print(f'Class 0: {target_count[0]}')
print(f'Class 1: {target_count[1]}')
print(f'Proportion: {round(target_count[0] / target_count[1], 2)} : 1')
print(f'Percentage of Majority Class: {round(target_count[0] / sum(target_count), 3)*100}')

Class 0: 418924
Class 1: 111835
Proportion: 3.75 : 1
Percentage of Majority Class: 78.9


In [11]:
# # Pickle dataframe
# X_train.to_pickle('../data/X_train.pkl')
# X_test.to_pickle('../data/X_test.pkl')

In [12]:
# USE DASK from here on out

## Oversampling

In [13]:
# # A ratio of .5 is saying that 50% of my data is simulated
# # Trevor noted that .2 would be good but let's try different ratios
# smote = SMOTE(random_state=2020)
# X_train_smote, y_train_smote = smote.fit_sample(X_train, y_train)

# # Summarize the new class distribution
# Counter(y_train_smote)

## Undersampling

In [14]:
# # Define the undersampling method – RandomUnderSampler
rndm_under = RandomUnderSampler(random_state=2020)

# Transform the dataset
X_train_under, y_train_under = rndm_under.fit_sample(X_train, y_train)

# Summarize the new class distribution
Counter(y_train_under)

Counter({0: 89468, 1: 89468})

In [15]:
# # Define the undersampling method – NearMiss
# # Selects the closest examples from the majority class for each minority class.
# undersample = NearMiss(version=3, n_neighbors_ver3=3)

# # Transform the dataset
# X_train_under, y_train_under = undersample.fit_resample(X_train, y_train)

# # Summarize the new class distribution
# Counter(y_train_under)

# Modeling

Objective: Maximize ROC AUC Score as both recall and precision are equally important and the classes are imbalanced

In [16]:
# Create list of model and performance
model_performance = []
models_optimized = []

User RandomizedSearchCV instead:
https://blog.usejournal.com/a-comparison-of-grid-search-and-randomized-search-using-scikit-learn-29823179bc85


## Baseline Models

In [17]:
#Dummy Classifier
from sklearn.dummy import DummyClassifier
clf = DummyClassifier(strategy= 'most_frequent').fit(X_train_under, y_train_under)
y_pred = clf.predict(X_test)

#Distribution of y test
print('y actual\t:', Counter(y_test))

#Distribution of y predicted
print('y predicted\t:', Counter(y_pred))

# Confusion matrix
print('\nConfusion Matrix\n' + str(confusion_matrix(y_test, y_pred)))

y actual	: Counter({0: 83785, 1: 22367})
y predicted	: Counter({0: 106152})

Confusion Matrix
[[83785     0]
 [22367     0]]


In [18]:
base_models = []

# Instantiate the models
base_models.append(('LogisticRegression', LogisticRegression(solver='liblinear')))
base_models.append(('KNeighbors', KNeighborsClassifier()))
base_models.append(('RandomForest', RandomForestClassifier(n_estimators=10)))
base_models.append(('XGBoost', XGBClassifier()))

cv_results = []
names = []

# Cross Validate - 5 fold
for name, model in base_models:
    names.append(name)
    cv_results.append(np.round_(cross_val_score(model, X_train_under, y_train_under, cv=5, scoring='roc_auc'), 3))

for i in range(len(names)):
    print(names[i], round(cv_results[i].mean(), 3))

LogisticRegression 0.686
KNeighbors 0.764
RandomForest 0.879
XGBoost 0.899


In [None]:
# Identify the models that work best without any GridSearchCV
# Plot the ROC Curves
# Undersampling (Don't use synthetic data)
# How to predict duplicates?
## More like unsupervised learning... 
## Create a function – given a point, identify a case that this is more similar to...


In [None]:
# # Save models
# with open(f'../models/{name}_20k.pkl', 'wb') as f: # how can I save the model for later use?ii
#     pickle.dump(model, f)

## Logistic Regression

In [None]:
# Grid searching key hyperparameters for logistic regression

# Instantiate model and sampler
lg_model = LogisticRegression()
under = under(random_state=2020)

# Construct pipeline
steps = [('und', under), ('lgr', lg_model)]
pipeline = Pipeline(steps)

# Define parameter grid values to be searched
param_grid = {
    'und__random_state': [2020],
    'lgr__solver': ['saga', 'liblinear'],
    'lgr__penalty': ['l1', 'l2'],
    'lgr__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}

# Use stratify version of k-fold to keep class imbalance ratio
k_fold = StratifiedKFold(n_splits=3, shuffle = True, random_state=2020)

# Cross Validation
# lg_grid = GridSearchCV(pipeline, param_grid=param_grid, cv=k_fold, n_jobs=-1, return_train_score=True, scoring='roc_auc')
lg_rndm = RandomizedSearchCV(pipeline, param_distributions=param_grid, cv=k_fold, n_jobs=-1, return_train_score=True, scoring='roc_auc')

# Train with balanced classes
grid_result = lg_rndm.fit(X_train_under, y_train_under) # Should I use X_train, y_train here?

# Summarize results
print(f'Best Score: {round(grid_result.best_score_, 3)}\nParams: {grid_result.best_params_}\n')

mean_train = grid_result.cv_results_['mean_train_score']
mean_test = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']
for mean_tr, mean_ts, param in zip(mean_train, mean_test, params):
    print(f'Train {round(mean_tr, 3)}\tTest {round(mean_ts, 3)}\tParams: {param}')
    
# # Examine the best model
# print(lg_grid.best_score_)
# print(lg_grid.best_params_)

In [None]:
# Instantiate model with best paramaters
lg_best = LogisticRegression(C=10, penalty='l2', solver='saga', random_state=2020) # Check the params

# Train with balanced classes
lg_best.fit(X_train_under, y_train_under)

# Get predictions
lg_best_preds = lg_best.predict(X_test)
# lg_best_y_score = lg_best.predict_proba(X_test)

# Get ROC AUC Score, precision, recall, f1-score
accuracy  = round(accuracy_score(y_test, lg_best_preds), 3)
roc_auc   = round(roc_auc_score(y_test, lg_best_preds), 3)
precision = round(precision_score(y_test, lg_best_preds), 3)
recall    = round(recall_score(y_test, lg_best_preds), 3)
f1        = round(f1_score(y_test, lg_best_preds), 3)

print(f'Accuracy  : {accuracy}')
print(f'ROC_AUC   : {roc_auc}')
print(f'Precision : {precision}')
print(f'Recall    : {recall}')
print(f'F-score   : {f1}')

# Confusion matrix
print('\nConfusion Matrix\n' + str(confusion_matrix(y_test, lg_best_preds)))

In [None]:
# # Add model and accuracy dict to list
# model_performance.append(dict([
#     ('Model', 'Logistic Regression'),
#     ('ROC AUC', round(roc_auc, 3)),
#     ('Precision', round(precision, 3)),
#     ('Recall', round(recall, 3)),
#     ('f1', round(f1_score, 3))
#      ]))

# # Add model to list
# models_optimized.append('Logistic Regression')

## KNN

In [None]:
# Grid searching key hyperparameters for KNN

# Instantiate model and RandomUnderSampler
knn_model = KNeighborsClassifier()
under = RandomUnderSampler(random_state=2020)

# Construct pipeline
steps = [('und', under), ('knn', knn_model)]
pipeline = Pipeline(steps)

# Define parameter grid values to be searched
param_grid = {
    'und__random_state': [2020],
    'knn__n_neighbors' : [3, 5, 7]
}

# Use stratify version of k-fold to keep class imbalance ratio
k_fold = StratifiedKFold(n_splits=3, shuffle = True, random_state=2020)

# Cross Validation
# knn_grid = GridSearchCV(pipeline, param_grid=param_grid, cv=k_fold, n_jobs=-1, return_train_score=True, scoring='roc_auc')
knn_rndm = RandomizedSearchCV(pipeline, param_distributions=param_grid, cv=k_fold, n_jobs=-1, return_train_score=True, scoring='roc_auc')

# Train with balanced classes
grid_result = knn_rndm.fit(X_train_under, y_train_under) # Should I use X_train, y_train here?

# Summarize results
print(f'Best Score: {round(grid_result.best_score_, 3)}\nParams: {grid_result.best_params_}\n')

mean_train = grid_result.cv_results_['mean_train_score']
mean_test = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']
for mean_tr, mean_ts, param in zip(mean_train, mean_test, params):
    print(f'Train {round(mean_tr, 3)}\tTest {round(mean_ts, 3)}\tParams: {param}')
    
# # Examine the best model
# print(knn_grid.best_score_)
# print(knn_grid.best_params_)

In [None]:
# Instantiate model with best paramaters
knn_best = KNeighborsClassifier(n_neighbors=7)

# Train with balanced classes
knn_best.fit(X_train_under, y_train_under)

# Get predictions
knn_best_preds = knn_best.predict(X_test)
# knn_best_y_score = knn_best.predict_proba(X_test)

# Get ROC AUC Score, precision, recall, f1-score
accuracy  = round(accuracy_score(y_test,  knn_best_preds), 3)
roc_auc   = round(roc_auc_score(y_test,   knn_best_preds), 3)
precision = round(precision_score(y_test, knn_best_preds), 3)
recall    = round(recall_score(y_test,    knn_best_preds), 3)
f1        = round(f1_score(y_test,        knn_best_preds), 3)

print(f'Accuracy  : {accuracy}')
print(f'ROC_AUC   : {roc_auc}')
print(f'Precision : {precision}')
print(f'Recall    : {recall}')
print(f'F-score   : {f1}')

# Confusion matrix
print('\nConfusion Matrix\n' + str(confusion_matrix(y_test, knn_best_preds)))

In [None]:
# # Add model and accuracy dict to list
# model_performance.append(dict([
#     ('Model', 'KNN'),
#     ('ROC AUC', round(roc_auc, 3)),
#     ('Precision', round(precision, 3)),
#     ('Recall', round(recall, 3)),
#     ('f1', round(f1_score, 3))
#      ]))

# # Add model to list
# models_optimized.append('KNN')

## Random Forest

In [None]:
# Grid searching key hyperparameters for Random Forest

# Instantiate model and RandomUnderSampler
rf_model = RandomForestClassifier()
under = RandomUnderSampler(random_state=2020)

# Construct pipeline
steps = [('und', under), ('rfc', rf_model)]
pipeline = Pipeline(steps)

# Define parameter grid values to be searched
param_grid = {
    'und__random_state': [2020],
    'rfc__n_estimators': [50, 100, 150, 200, 1000],
    'rfc__max_depth' : [2, 3, 4],
    'rfc__max_features' : [5, 10, 15],
    'rfc__criterion' : ['gini', 'entropy'],
    'rfc__random_state' :[2020]
}

# Use stratify version of k-fold to keep class imbalance ratio
k_fold = StratifiedKFold(n_splits=3, shuffle = True, random_state=2020)

# Cross Validation
# rf_grid = GridSearchCV(pipeline, param_grid=param_grid, cv=k_fold, n_jobs=-1, return_train_score=True, scoring='roc_auc')
rf_rndm = RandomizedSearchCV(pipeline, param_distributions=param_grid, cv=k_fold, n_jobs=-1, return_train_score=True, scoring='roc_auc')

# Train with balanced classes
grid_result = rf_rndm.fit(X_train_under, y_train_under) # Should I use X_train, y_train here?

# Summarize results
print(f'Best Score: {round(grid_result.best_score_, 3)}\nParams: {grid_result.best_params_}\n')

mean_train = grid_result.cv_results_['mean_train_score']
mean_test = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']
for mean_tr, mean_ts, param in zip(mean_train, mean_test, params):
    print(f'Train {round(mean_tr, 3)}\tTest {round(mean_ts, 3)}\tParams: {param}')
    
# # Examine the best model
# print(rf_grid.best_score_)
# print(rf_grid.best_params_)

In [None]:
# Instantiate model with best paramaters
rf_best = RandomForestClassifier(n_estimators=1000, criterion='entropy', max_depth=4, max_features=10, random_state=2020)

# Train with balanced classes
rf_best.fit(X_train_under, y_train_under)

# Get predictions
rf_best_preds = rf_best.predict(X_test)
# rf_best_y_score = rf_best.predict_proba(X_test)

# Get ROC AUC Score, precision, recall, f1-score
accuracy  = round(accuracy_score(y_test,  rf_best_preds), 3)
roc_auc   = round(roc_auc_score(y_test,   rf_best_preds), 3)
precision = round(precision_score(y_test, rf_best_preds), 3)
recall    = round(recall_score(y_test,    rf_best_preds), 3)
f1        = round(f1_score(y_test,        rf_best_preds), 3)

print(f'Accuracy  : {accuracy}')
print(f'ROC_AUC   : {roc_auc}')
print(f'Precision : {precision}')
print(f'Recall    : {recall}')
print(f'F-score   : {f1}')

# Confusion matrix
print('\nConfusion Matrix\n' + str(confusion_matrix(y_test, rf_best_preds)))

In [None]:
# # Add model and accuracy dict to list
# model_performance.append(dict([
#     ('Model', 'Random Forest'),
#     ('ROC AUC', round(roc_auc, 3)),
#     ('Precision', round(precision, 3)),
#     ('Recall', round(recall, 3)),
#     ('F1', round(f1_score, 3))
#      ]))

# # Add model to list
# models_optimized.append('Random Forest')

## XGBoost

In [None]:
# Grid searching key hyperparameters for XGBoost

# Instantiate model and RandomUnderSampler
xgb_model = XGBClassifier()
under = RandomUnderSampler(random_state=2020)

# Construct pipeline
steps = [('und', under), ('xgb', xgb_model)]
pipeline = Pipeline(steps)

# Define parameter grid values to be searched
param_grid = {
    'und__random_state': [2020],
    'xgb__n_estimators': [100, 250, 500, 1000], 
    'xgb__max_depth': [3, 4, 5], 
    'xgb__learning_rate': [0.001, 0.01, 0.1]
}

# Use stratify version of k-fold to keep class imbalance ratio
k_fold = StratifiedKFold(n_splits=3, shuffle = True, random_state=2020)

# Cross Validation
# xgb_grid = GridSearchCV(pipeline, param_grid=param_grid, cv=k_fold, n_jobs=-1, return_train_score=True, scoring='roc_auc')
xgb_rndm = RandomizedSearchCV(pipeline, param_distributions=param_grid, cv=k_fold, n_jobs=-1, return_train_score=True, scoring='roc_auc')

# Train with balanced classes
grid_result = xgb_rndm.fit(X_train_under, y_train_under) # Should I use X_train, y_train here?

# Summarize results
print(f'Best Score: {round(grid_result.best_score_, 3)}\nParams: {grid_result.best_params_}\n')

mean_train = grid_result.cv_results_['mean_train_score']
mean_test = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']
for mean_tr, mean_ts, param in zip(mean_train, mean_test, params):
    print(f'Train {round(mean_tr, 3)}\tTest {round(mean_ts, 3)}\tParams: {param}')
    
# # Examine the best model
# print(lg_grid.best_score_)
# print(lg_grid.best_params_)

In [19]:
# Instantiate model with best paramaters
xgb_best = XGBClassifier(n_estimators=500, max_depth=4, learning_rate=0.1)

# Train with balanced classes
xgb_best.fit(X_train_under, y_train_under)

# Get predictions
xgb_best_preds = xgb_best.predict(X_test)
# xgb_best_y_score = xgb_best.predict_proba(X_test)

# Get ROC AUC Score, precision, recall, f1-score
accuracy  = round(accuracy_score(y_test,  xgb_best_preds), 3)
roc_auc   = round(roc_auc_score(y_test,   xgb_best_preds), 3)
precision = round(precision_score(y_test, xgb_best_preds), 3)
recall    = round(recall_score(y_test,    xgb_best_preds), 3)
f1        = round(f1_score(y_test,        xgb_best_preds), 3)

print(f'Accuracy  : {accuracy}')
print(f'ROC_AUC   : {roc_auc}')
print(f'Precision : {precision}')
print(f'Recall    : {recall}')
print(f'F-score   : {f1}')

# Confusion matrix
print('\nConfusion Matrix\n' + str(confusion_matrix(y_test, xgb_best_preds)))

Accuracy  : 0.828
ROC_AUC   : 0.812
Precision : 0.566
Recall    : 0.784
F-score   : 0.657

Confusion Matrix
[[70349 13436]
 [ 4835 17532]]


In [None]:
# Add model and accuracy dict to list
model_performance.append(dict([
    ('Model', 'XGBoost'),
    ('ROC AUC', round(roc_auc, 3)),
    ('Precision', round(precision, 3)),
    ('Recall', round(recall, 3)),
    ('F1', round(f1, 3))
     ]))

# Add model to list
models_optimized.append('XGBoost')

# Feature Importance

In [None]:
# Feature Importance – XGBoost
fig, ax = plt.subplots(figsize=(10,10))
xgb_best.get_booster().feature_names = list(X_test.columns)
plot_importance(xgb_best.get_booster(), max_num_features=15, height=0.8, ax=ax)
plt.title('Feature Importance')

# plt.savefig('feature_importance.png', bbox_inches='tight');

# Model Comparison

In [None]:
def plot_roc_curves(classifiers, X_train, y_train, X_test, y_test, sampler = None):
    """
        Function for plotting roc curves of classifiers for comparison
        
        :param classifiers: dictionary of classifiers
        :param sampler: sampling method to use e.g. smote
        
        :returns fpr: an array of false positive rate values from roc_curve
        :returns tpr: an array of true positive rate values from roc_curve
        :returns thresholds: an array of threshold values from roc_curve
        :returns roc_auc: roc_auc scores for each classifier in classifiers
        
        https://github.com/kevinchiv/Predicting-Kickstarter-Success/blob/master/02%20-%20Classification%20Modeling.ipynb
    """

    fpr, tpr, thresholds = {}, {}, {}
    roc_auc = {}


    for classifier_name, classifier in classifiers.items():

        #set n_jobs = -1 for faster performance
        classifier = classifier.set_params(n_jobs = -1)

#         if sampler:
#             sampler.set_params(random_state = 42, n_jobs = -1)

        steps = [('sampling', sampler),
                 (classifier_name, classifier)]

        model = Pipeline(steps)

        model.fit(X_train, y_train)

        y_proba = model.predict_proba(X_test)[:,1]

        fpr[classifier_name], tpr[classifier_name], thresholds[classifier_name] = roc_curve(y_test, y_proba)

        roc_auc[classifier_name] = round(auc(fpr[classifier_name], tpr[classifier_name]), 3)

        print(classifier_name + ' roc_auc score: ' + str(roc_auc[classifier_name]))


    plt.figure(figsize=(8, 8))

    for classifier_name, classifier in classifiers.items():
        sns.lineplot(fpr[classifier_name], tpr[classifier_name], err_style = None);

    midline_points = np.arange(0, 1.05, 0.05) 

    plt.plot(midline_points, midline_points, linestyle ='--', color = 'black');
    plt.title("ROC Curves for Different Classification Models", y =1.05, fontsize = 16);
    plt.xlabel('FPR', fontsize = 12, x = 1.05);
    plt.ylabel('TPR', fontsize = 12, rotation =0, y = 1.05);
    plt.legend(list(zip(classifiers.keys(), roc_auc.values())));

    return fpr, tpr, thresholds, roc_auc

In [None]:
classifiers = {'KNN': knn_best, 
               'LGR': lg_best, 
               'RFC': rf_best, 
               'XGB': xgb_best,
               'Ensemble': VotingClassifier(estimators=[('lr', lg_best), ('rf', rf_best), ('xgb', xgb_best)],
                                            voting='soft', weights=[1, 1, 1])
              }

fpr, tpr, thresholds, roc_auc = plot_roc_curves(classifiers, X_train_under, y_train_under, X_test, y_test)

## Confusion Matrix for Test Dataset

In [None]:
# # Print confusion matrix for XGBoost
# xgb_confusion = confusion_matrix(y_test, test_pred_under)

# plt.figure(dpi=125)
# sns.heatmap(xgb_confusion, annot=True, fmt='g', square=True, cbar=False,
#             xticklabels=['no duplicate', 'is duplicate'],
#             yticklabels=['no duplicate', 'is duplicate'])

# plt.title('Confusion Matrix - Test Dataset\nXGBoost', pad=20)
# plt.xlabel('Predicted\n', labelpad=20)
# plt.ylabel('Actual\n', labelpad=20);