In [None]:
# Add table of contents

In [2]:
import pandas as pd
import numpy as np

# Tools
from collections import Counter

# Preprocessing
from sklearn.preprocessing import MinMaxScaler

# Modeling
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC, SVC # Remove
from sklearn.datasets import make_classification

# Metrics
from sklearn.metrics import accuracy_score, confusion_matrix

# # Visualizations
# from matplotlib.pyplot import figure
# from sklearn.metrics import roc_auc_score, roc_curve
# import matplotlib.pyplot as plt
# import seaborn as sns
# from mlxtend.plotting import plot_decision_regions
# %matplotlib inline


Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Train and Test Splitting

In [7]:
# Load dataframe
df = pd.read_pickle('../data/df_cyclical_features_20k.pkl')

df.head()

Unnamed: 0,case_id,is_duplicate,opened,closed,updated,status_notes,responsible_agency,category,request_type,request_details,...,source,opened_year,opened_month_sin,opened_month_cos,opened_week_sin,opened_week_cos,opened_day_sin,opened_day_cos,opened_hour_sin,opened_hour_cos
619,11879423,0,2019-12-30 20:54:00,2020-01-03 11:59:19,2020-01-03 11:59:19,Agencies responded to request and no encampmen...,Duplicate Case Hold Queue,Encampments,Encampment Reports,Encampment Cleanup,...,Mobile/Open311,2019,-2.449294e-16,1.0,0.120537,0.992709,0.0,1.0,-0.8660254,0.5
620,11877576,0,2019-12-30 13:31:54,2019-12-31 13:42:03,2019-12-31 13:42:03,Case Resolved - SES Graffiti Crew - Remove Si...,DPW Ops Queue,Illegal Postings,Illegal Postings - Affixed_Improperly,Affixed Improperly,...,Mobile/Open311,2019,-2.449294e-16,1.0,0.120537,0.992709,0.0,1.0,-0.258819,-0.965926
621,11877532,0,2019-12-30 13:26:00,2019-12-30 14:03:00,2019-12-30 14:03:00,Case Resolved,DPW Ops Queue,Street and Sidewalk Cleaning,General Cleaning,Other Loose Garbage,...,Phone,2019,-2.449294e-16,1.0,0.120537,0.992709,0.0,1.0,-0.258819,-0.965926
622,11877496,0,2019-12-30 13:22:00,2019-12-30 18:53:45,2019-12-30 18:53:45,Case Resolved - WASTE NOT FOUND ...,Recology_Abandoned,Street and Sidewalk Cleaning,Bulky Items,Refrigerator,...,Phone,2019,-2.449294e-16,1.0,0.120537,0.992709,0.0,1.0,-0.258819,-0.965926
623,11877234,0,2019-12-30 12:45:38,2020-01-02 07:17:00,2020-01-02 07:17:00,Case Resolved,DPW Ops Queue,Street and Sidewalk Cleaning,General Cleaning,Other Loose Garbage,...,Mobile/Open311,2019,-2.449294e-16,1.0,0.120537,0.992709,0.0,1.0,1.224647e-16,-1.0


In [8]:
# Columns to exclude
exclude_cols = [
    'is_duplicate', # Target variable
    'case_id',
    'opened', # Needs Feature Eng
    'closed', # Needs Feature Eng
    'updated',
    'responsible_agency', # Needs NLP
    'status_notes', # Needs NLP
    'request_type', # Needs NLP
    'request_details', # Needs NLP
    'address', # Needs NLP
#     'street', # Convert to 'category' type to get dummies
    'point'
]

# # Scale data using MinMax scaler
# # No need to standardize as all features are categorical (maybe scale lat/long....)
# scaler = MinMaxScaler()

# Predictor variables
x_variables_df = df.drop(columns=exclude_cols, axis=0, inplace=False)

# Get dummies for categorical variables
X = pd.get_dummies(x_variables_df, drop_first=True)

# Target variable
y = df['is_duplicate']

# Split train and test
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2, 
                                                    random_state=2020, 
                                                    stratify=y,  # Stratify to keep same class ratios
                                                    shuffle=True # Shuffle data since it's ordered chronologically
                                                   )
X_train.head()

Unnamed: 0,latitude,longitude,opened_month_sin,opened_month_cos,opened_week_sin,opened_week_cos,opened_day_sin,opened_day_cos,opened_hour_sin,opened_hour_cos,...,opened_year_2010,opened_year_2011,opened_year_2012,opened_year_2013,opened_year_2014,opened_year_2015,opened_year_2016,opened_year_2017,opened_year_2018,opened_year_2019
19476,37.784027,-122.409607,0.866025,-0.5,0.970942,-0.2393157,0.0,1.0,-0.258819,-0.965926,...,0,0,0,0,0,0,0,0,0,0
4602,37.770302,-122.450912,-0.866025,0.5,-0.992709,0.1205367,0.0,1.0,0.258819,-0.965926,...,0,0,0,0,0,0,0,0,1,0
10155,37.720915,-122.435768,-0.866025,-0.5,-0.822984,-0.5680647,0.433884,-0.900969,0.8660254,-0.5,...,0,0,0,0,0,0,1,0,0,0
7455,37.764227,-122.410453,-1.0,-1.83697e-16,-1.0,-1.83697e-16,0.974928,-0.222521,1.224647e-16,-1.0,...,0,0,0,0,0,0,0,1,0,0
14313,37.747768,-122.403488,0.5,-0.8660254,0.748511,-0.6631227,0.0,1.0,-0.258819,-0.965926,...,0,0,0,0,1,0,0,0,0,0


In [9]:
print('df\t', df.shape)
print('X_train\t', X_train.shape)
print('X_test\t', X_test.shape)
print('y_train\t', y_train.shape)
print('y_test\t', y_test.shape)

df	 (16786, 28)
X_train	 (13428, 1658)
X_test	 (3358, 1658)
y_train	 (13428,)
y_test	 (3358,)


## Class Balancing

In [10]:
# Target variable
target_count = df['is_duplicate'].value_counts()

# Print class balance
print(f'Class 0: {target_count[0]}')
print(f'Class 1: {target_count[1]}')
print(f'Proportion: {round(target_count[0] / target_count[1], 2)} : 1')
print(f'Percentage of Majority Class: {round(target_count[0] / sum(target_count), 3)*100}')

Class 0: 15824
Class 1: 962
Proportion: 16.45 : 1
Percentage of Majority Class: 94.3


In [11]:
# A ratio of .5 is saying that 50% of my data is simulated
# Trevor noted that .2 would be good but let's try different ratios
X_smoted, y_smoted = SMOTE(random_state=2020).fit_sample(X_train, y_train)
Counter(y_smoted)
# pd.Series(y_smoted).value_counts().plot.bar()

Counter({0: 12658, 1: 12658})

In [12]:
# Add cross validation

# Modeling

In [13]:
# Istantiate the model
xgb_clf = XGBClassifier()

# Construct X_smoted dataframe
X_smoted_df = pd.DataFrame(X_smoted, columns=X_test.columns)

# Train the model
xgb_clf.fit(X_smoted_df, y_smoted)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [14]:
# Make prediction on training data
train_pred_smote = xgb_clf.predict(X_smoted_df)

# # Make prediction on test data
# test_pred_smote = xgb_clf.predict(X_test)

In [15]:
print('Training Accuracy Score = ', round(accuracy_score(train_pred_smote, y_smoted), 4)*100)
# print('Tests = ', round(accuracy_score(test_pred_smote, y_test), 4)*100)

Training Accuracy Score =  95.28


In [16]:
# Print confusion matrix for XGBoost
xgb_confusion = confusion_matrix(y_smoted, train_pred_smote)

plt.figure(dpi=125)
sns.heatmap(xgb_confusion, annot=True, fmt='g', square=True, cbar=False,
            xticklabels=['no duplicate', 'is duplicate'],
            yticklabels=['no duplicate', 'is duplicate'])

plt.title('Confusion Matrix - Training Dataset\nXGBoost', pad=20)
plt.xlabel('Predicted\n', labelpad=20)
plt.ylabel('Actual\n', labelpad=20);

NameError: name 'plt' is not defined

## Confusion Matrix for Test Dataset

In [None]:
# # Print confusion matrix for XGBoost – TEST Dataset
# xgb_confusion = confusion_matrix(y_test, test_pred_smote)

# plt.figure(dpi=125)
# sns.heatmap(xgb_confusion, annot=True, fmt='g', square=True, cbar=False,
#             xticklabels=['no duplicate', 'is duplicate'],
#             yticklabels=['no duplicate', 'is duplicate'])

# plt.title('Confusion Matrix - Test Dataset\nXGBoost', pad=20)
# plt.xlabel('Predicted\n', labelpad=20)
# plt.ylabel('Actual\n', labelpad=20);