In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import svm, datasets
from sklearn.metrics import accuracy_score
import sklearn.tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
import calendar as cl
import scipy
import scipy.stats               # For reciprocal distribution
import warnings

In [2]:
#importing Montreal crime data
montreal_data = pd.read_csv('Montreal_2015toOct2020.csv',delimiter =',',na_values='',usecols=[0,1,2,4,5,6,7],encoding='latin')

In [3]:
#Cleaning Montreal dataset
montreal_data.rename(columns={"CATEGORIE":"CATEGORY"},inplace=True)

#Converting the CATEGORIE text to English
montreal_data["CATEGORY"]=montreal_data["CATEGORY"].replace("Vol de véhicule à moteur","Motor vehicle theft")\
                                                .replace("Méfait","Misdemeanor(Mischiefs)")\
                                                .replace("Vols qualifiés","Qualified flights(Robbery)")\
                                                .replace("Vol dans / sur véhicule à moteur","Flight into / on motor vehicle")\
                                                .replace("Introduction","Introduction(Breaking)")\
                                                .replace("Infractions entrainant la mort","Offenses causing death")

#Converting the QUART text to English
montreal_data["QUART"]=montreal_data["QUART"].replace("jour","Day")\
                                        .replace("nuit","Night")\
                                        .replace("soir","Evening")

In [4]:
montreal_data

Unnamed: 0,CATEGORY,DATE,QUART,X,Y,LONGITUDE,LATITUDE
0,Motor vehicle theft,2018-09-13,Day,294904.159001,5.047549e+06,-73.626778,45.567780
1,Motor vehicle theft,2018-04-30,Day,294904.159001,5.047549e+06,-73.626778,45.567780
2,Motor vehicle theft,2018-09-01,Night,290274.565000,5.042150e+06,-73.685928,45.519122
3,Misdemeanor(Mischiefs),2017-07-21,Day,0.000000,0.000000e+00,1.000000,1.000000
4,Misdemeanor(Mischiefs),2017-07-29,Day,0.000000,0.000000e+00,1.000000,1.000000
...,...,...,...,...,...,...,...
175937,Flight into / on motor vehicle,2020-06-18,Day,0.000000,0.000000e+00,1.000000,1.000000
175938,Qualified flights(Robbery),2020-07-07,Evening,0.000000,0.000000e+00,1.000000,1.000000
175939,Flight into / on motor vehicle,2020-07-09,Evening,0.000000,0.000000e+00,1.000000,1.000000
175940,Misdemeanor(Mischiefs),2020-07-13,Day,0.000000,0.000000e+00,1.000000,1.000000


In [9]:
montreal_data = montreal_data.dropna()

In [6]:
montreal_data['DATE'] = pd.to_datetime(montreal_data['DATE'])
montreal_data['WEEKDAYINT'] = montreal_data['DATE'].apply(lambda x: x.weekday())
montreal_data['YEAR']=montreal_data['DATE'].dt.to_period('Y').astype(str)
montreal_data['MONTH']=montreal_data['DATE'].dt.to_period('M').astype(str)
montreal_data['MONTH']  = montreal_data['MONTH'].str.split('-').str[1]
montreal_data['MONTH'] = montreal_data['MONTH'].apply(pd.to_numeric)
montreal_data['YEAR'] = montreal_data['YEAR'].apply(pd.to_numeric)
result = [] 
for value in montreal_data['MONTH']: 
    if value >= 4 and value <= 7: 
        result.append("Summer") 
    elif value >= 7 and value <= 10: 
        result.append("Fall") 
    else:
        result.append("Winter") 
     
    
montreal_data['SEASON'] = result

In [10]:
montreal_data

Unnamed: 0,CATEGORY,DATE,QUART,X,Y,LONGITUDE,LATITUDE,WEEKDAYINT,YEAR,MONTH,SEASON
0,Motor vehicle theft,2018-09-13,Day,294904.159001,5.047549e+06,-73.626778,45.567780,3,2018,9,Fall
1,Motor vehicle theft,2018-04-30,Day,294904.159001,5.047549e+06,-73.626778,45.567780,0,2018,4,Summer
2,Motor vehicle theft,2018-09-01,Night,290274.565000,5.042150e+06,-73.685928,45.519122,5,2018,9,Fall
3,Misdemeanor(Mischiefs),2017-07-21,Day,0.000000,0.000000e+00,1.000000,1.000000,4,2017,7,Summer
4,Misdemeanor(Mischiefs),2017-07-29,Day,0.000000,0.000000e+00,1.000000,1.000000,5,2017,7,Summer
...,...,...,...,...,...,...,...,...,...,...,...
175937,Flight into / on motor vehicle,2020-06-18,Day,0.000000,0.000000e+00,1.000000,1.000000,3,2020,6,Summer
175938,Qualified flights(Robbery),2020-07-07,Evening,0.000000,0.000000e+00,1.000000,1.000000,1,2020,7,Summer
175939,Flight into / on motor vehicle,2020-07-09,Evening,0.000000,0.000000e+00,1.000000,1.000000,3,2020,7,Summer
175940,Misdemeanor(Mischiefs),2020-07-13,Day,0.000000,0.000000e+00,1.000000,1.000000,0,2020,7,Summer


In [11]:
codesCAT, uniques = pd.factorize(montreal_data['CATEGORY'])
codesQRT, uniques = pd.factorize(montreal_data['QUART'])
codesSSN, uniques = pd.factorize(montreal_data['SEASON'])

In [12]:
montreal_data['codesCAT'] = codesCAT
montreal_data['codesQRT'] = codesQRT
montreal_data['codesSSN'] = codesSSN

In [13]:
montreal_data.drop(['CATEGORY','QUART','DATE','SEASON'],axis =1 ,inplace=True)

In [14]:
indexNames = montreal_data[ (montreal_data['X'] == 0) ].index
montreal_data.drop(indexNames , inplace=True)
montreal_data

Unnamed: 0,X,Y,LONGITUDE,LATITUDE,WEEKDAYINT,YEAR,MONTH,codesCAT,codesQRT,codesSSN
0,294904.159001,5.047549e+06,-73.626778,45.567780,3,2018,9,0,0,0
1,294904.159001,5.047549e+06,-73.626778,45.567780,0,2018,4,0,0,1
2,290274.565000,5.042150e+06,-73.685928,45.519122,5,2018,9,0,1,0
6,297654.715002,5.041877e+06,-73.591457,45.516776,6,2017,7,1,0,1
8,294259.780993,5.051450e+06,-73.635117,45.602873,1,2017,8,3,0,0
...,...,...,...,...,...,...,...,...,...,...
175855,298328.593992,5.051506e+06,-73.582959,45.603432,5,2020,7,3,0,1
175856,295460.915997,5.043059e+06,-73.619560,45.527386,3,2020,6,1,1,1
175857,295460.915997,5.043059e+06,-73.619560,45.527386,1,2020,6,1,2,1
175858,295499.657993,5.043120e+06,-73.619065,45.527931,3,2020,9,4,2,0


In [15]:
montreal_data_values = montreal_data[:].values

In [16]:
montreal_data_X = montreal_data_values[:,[0,1,2,3,4,5,6,8,9]]
montreal_data_Y = montreal_data_values[:,[7]]

In [None]:
montreal_data_X = StandardScaler().fit_transform(montreal_data_X)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(montreal_data_X, montreal_data_Y, train_size=0.8, random_state = 0)

In [None]:
X_train

In [15]:
#np.any(np.isnan(X_train))
np.argwhere(np.isnan(X_train))

array([], shape=(0, 2), dtype=int64)

In [17]:
clfDT = sklearn.tree.DecisionTreeClassifier(random_state=0)
clfDT.fit(X_train,y_train)
y_pred_train = clfDT.predict(X_train)
y_pred_test = clfDT.predict(X_test)
train_accuracy = sklearn.metrics.accuracy_score(y_train, y_pred_train)
test_accuracy = sklearn.metrics.accuracy_score(y_test, y_pred_test)
print("Decision Tree Training Accuracy -> ",train_accuracy*100)
print("Decision Tree Testing Accuracy -> ",test_accuracy*100)

Decision Tree Training Accuracy ->  98.60641273940176
Decision Tree Testing Accuracy ->  34.75072304090346


In [18]:
clfRF = RandomForestClassifier(random_state=0)
clfRF.fit(X_train,y_train.ravel())
y_pred_train = clfRF.predict(X_train)
y_pred_test = clfRF.predict(X_test)
train_accuracy = sklearn.metrics.accuracy_score(y_train, y_pred_train)
test_accuracy = sklearn.metrics.accuracy_score(y_test, y_pred_test)
print("Random Forest Training Accuracy -> ",train_accuracy*100)
print("Random Forest Testing Accuracy -> ",test_accuracy*100)

Random Forest Training Accuracy ->  98.60641273940176
Random Forest Testing Accuracy ->  41.26497727585732


In [15]:
adaBoostclf = AdaBoostClassifier(random_state=0,n_estimators =3,algorithm='SAMME.R')
adaBoostclf.fit(X_train,y_train.ravel())
y_pred_train = adaBoostclf.predict(X_train)
y_pred_test = adaBoostclf.predict(X_test)
train_accuracy = sklearn.metrics.accuracy_score(y_train, y_pred_train)
test_accuracy = sklearn.metrics.accuracy_score(y_test, y_pred_test)
print("AdaBoost Training Accuracy -> ",train_accuracy*100)
print("AdaBoost Testing Accuracy -> ",test_accuracy*100)

AdaBoost Training Accuracy ->  32.73509791263181
AdaBoost Testing Accuracy ->  32.81572786117615


In [16]:
gradBoostclf = GradientBoostingClassifier(random_state=0)
gradBoostclf.fit(X_train,y_train.ravel())
y_pred_train = gradBoostclf.predict(X_train)
y_pred_test = gradBoostclf.predict(X_test)
train_accuracy = sklearn.metrics.accuracy_score(y_train, y_pred_train)
test_accuracy = sklearn.metrics.accuracy_score(y_test, y_pred_test)
print("GradientBoost Training Accuracy -> ",train_accuracy*100)
print("GradientBoost Testing Accuracy -> ",test_accuracy*100)

GradientBoost Training Accuracy ->  38.81385840327093
GradientBoost Testing Accuracy ->  37.57746866822752


In [None]:
neigh = KNeighborsClassifier()
neigh.fit(X_train,y_train.ravel())
y_pred_train = neigh.predict(X_train)
y_pred_test =  neigh.predict(X_test)
train_accuracy = sklearn.metrics.accuracy_score(y_train, y_pred_train)
test_accuracy = sklearn.metrics.accuracy_score(y_test, y_pred_test)
print("KNN Training Accuracy -> ",train_accuracy*100)
print("KNN Testing Accuracy -> ",test_accuracy*100)

In [None]:
SVMrbf = svm.SVC(kernel='rbf', gamma=1, C=1).fit(X_train, y_train.ravel())
predictions_SVM = SVMrbf.predict(X_test)
print("SVM Accuracy Score rbf -> ",accuracy_score(y_test,predictions_SVM)*100)

In [None]:
NNclf = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train.ravel())
y_pred_train = NNclf.predict(X_train)
y_pred_test =  NNclf.predict(X_test)
train_accuracy = sklearn.metrics.accuracy_score(y_train, y_pred_train)
test_accuracy = sklearn.metrics.accuracy_score(y_test, y_pred_test)
print("NeuralNets Training Accuracy -> ",train_accuracy*100)
print("NeuralNets Testing Accuracy -> ",test_accuracy*100)

In [17]:
classifiers =[clfRF]
for clf in classifiers:
    kfold_scores = sklearn.model_selection.cross_val_score(clf, X_train,y_train.ravel(),cv =10)

    for i in range(10):
        if i> 1:
            print('held-out accuracy ({}-fold): {}%'.format(i,round(kfold_scores[i]*100,1)))
    

held-out accuracy (2-fold): 41.2%
held-out accuracy (3-fold): 41.3%
held-out accuracy (4-fold): 42.0%
held-out accuracy (5-fold): 42.3%
held-out accuracy (6-fold): 41.1%
held-out accuracy (7-fold): 40.9%
held-out accuracy (8-fold): 42.0%
held-out accuracy (9-fold): 41.4%


In [None]:
clfRF.get_params().keys()

In [21]:
param_distributions = {
    'max_depth' : [int(x) for x in np.linspace(1, 10, num=10)],
    'min_samples_leaf' : [int(x) for x in np.linspace(start =1, stop= 3,num =3)],
    'n_estimators' : [int(x) for x in np.linspace(start = 10, stop = 500,num =10)],
    'max_features' : [int(x) for x in np.linspace(start = 1, stop =8, num =8)],
    'bootstrap' : [True, False],
    'criterion': ['gini','entropy']
}
randomcv = sklearn.model_selection.RandomizedSearchCV(clfRF, param_distributions, n_iter=50, verbose=2, random_state=0,
                                                      n_jobs= -1, cv =5)
randomcv.fit(X_train, y_train)
print("best parameters:", randomcv.best_params_)
print("{}% accuracy on validation sets (average)".format(round(randomcv.best_score_*100,1)))

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 52.8min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 163.8min finished
  self.best_estimator_.fit(X, y, **fit_params)


best parameters: {'n_estimators': 118, 'min_samples_leaf': 2, 'max_features': 5, 'max_depth': 10, 'criterion': 'gini', 'bootstrap': True}
40.2% accuracy on validation sets (average)
