In [1]:
#Loading the libraries
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from random import randint
import scipy.stats

In [2]:
#Data Loading
montreal_data=pd.read_csv("Montreal_2015toOct2020.csv",encoding='latin_1')
pdq_data=pd.read_csv("pdq.csv",encoding='latin_1')

In [3]:
#Data Cleaning
#Montreal data
montreal_data.rename(columns={"CATEGORIE":"OFFENCE","QUART":"OCCURENCETIME"},inplace=True)
montreal_data = montreal_data.dropna(how='any',axis=0) 

offence_factorize=pd.factorize(montreal_data["OFFENCE"])
montreal_data["OFFENCE"]=offence_factorize[0]
offence_list_names=offence_factorize[1]
montreal_data["OCCURENCETIME"]=pd.factorize(montreal_data["OCCURENCETIME"])[0]

montreal_data["PDQ"]=montreal_data["PDQ"].astype(np.int32)
montreal_data[["OCCURENCEYEAR","OCCURENCEMONTH","OCCURENCEDATE"]]=montreal_data.DATE.str.split("-", expand=True)

#PDQ data
pdq_data.rename(columns={"PREFIX_TEM":"TYPEOFROAD","MUN_TEMP":"MUNICIPALITY"},inplace=True)
pdq_data[["temp","PDQ"]] = pdq_data.DESC_LIEU.str.split("QUARTIER", expand = True)
pdq_data["PDQ"] = pdq_data["PDQ"].astype(np.int32)
pdq_data.drop(["temp","DESC_LIEU","NO_CIV_LIE","DIR_TEMP","LONGITUDE","LATITUDE","NOM_TEMP","OBJECTID"],axis =1 ,\
              inplace=True)

pdq_data["TYPEOFROAD"]=pd.factorize(pdq_data["TYPEOFROAD"])[0]
pdq_data["MUNICIPALITY"]=pd.factorize(pdq_data["MUNICIPALITY"])[0]

In [4]:
#Compute Weekday and Season features
montreal_data['DATE'] = pd.to_datetime(montreal_data['DATE'])
montreal_data['WEEKDAYINT'] = montreal_data['DATE'].apply(lambda x: x.weekday())
montreal_data['YEAR']=montreal_data['DATE'].dt.to_period('Y').astype(str)
montreal_data['MONTH']=montreal_data['DATE'].dt.to_period('M').astype(str)
montreal_data['MONTH']  = montreal_data['MONTH'].str.split('-').str[1]
montreal_data['MONTH'] = montreal_data['MONTH'].apply(pd.to_numeric)
montreal_data['YEAR'] = montreal_data['YEAR'].apply(pd.to_numeric)
season = [] 
for value in montreal_data['MONTH']: 
    if value >= 4 and value <= 7: 
        season.append("Summer") 
    elif value >= 7 and value <= 10: 
        season.append("Fall") 
    else:
        season.append("Winter") 
     
    
montreal_data['SEASON'] = season
montreal_data['SEASONINT'] = pd.factorize(montreal_data['SEASON'])[0]

In [5]:
#Merging the tables
montreal_pdq_data=pd.merge(montreal_data ,pdq_data,on =["PDQ"])

In [6]:
#Data Processing
#Splitting data
X=montreal_pdq_data[["X","Y","OCCURENCEYEAR","OCCURENCEMONTH","OCCURENCEDATE","OCCURENCETIME","MUNICIPALITY","TYPEOFROAD","LATITUDE","LONGITUDE","WEEKDAYINT","SEASONINT"]]
y=montreal_pdq_data[["OFFENCE"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [None]:
def result(classifier_random):
    y_pred = classifier_random.best_estimator_.predict(X_test)
    print("Training score:"+str(accuracy_score(y_train,classifier_random.best_estimator_.predict(X_train))))
    print("Testing score:"+str(accuracy_score(y_test,y_pred)))
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred, target_names=offence_list_names))

In [66]:
#RandomForest
rf_classifier = RandomForestClassifier()
# rf_classifier.fit(X_train, y_train.values.ravel())
# y_pred = rf_classifier.predict(X_test)

#Hyper parameters
params_rf={
    'min_samples_leaf': [int(x) for x in np.linspace(start = 1, stop = 3, num = 3)],
    'n_estimators': [int(x) for x in np.linspace(start = 10, stop = 500, num = 10)],
    'max_features': [int(x) for x in np.linspace(start = 1, stop = 8, num = 8)],
    'bootstrap': [True, False],
    'criterion': ['gini','entropy'],
}

rf_random = RandomizedSearchCV(estimator = rf_classifier, param_distributions = params_rf, n_iter = 50, cv = 3, 
                               verbose=2, random_state=0, n_jobs = -1)
rf_random.fit(X_train,y_train.values.ravel())

{'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'min_samples_leaf': [1, 2, 3], 'n_estimators': [10, 64, 118, 173, 227, 282, 336, 391, 445, 500], 'max_features': [1, 2, 3, 4, 5, 6, 7, 8], 'bootstrap': [True, False], 'criterion': ['gini', 'entropy']}
Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 16.0min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 38.8min finished
  self.best_estimator_.fit(X, y, **fit_params)


In [69]:
#Results
result(rf_random)

Training score:0.4440678228433899
Testing score:0.41256906718670194
[[1130  433    7 2586 2172    0]
 [ 380 1189    6 3888 4183    0]
 [ 129  189   30  889 1217    0]
 [ 524  781   11 7556 4069    0]
 [ 492  711   10 2595 7866    0]
 [   5    2    0   11   13    0]]
                                  precision    recall  f1-score   support

        Vol de véhicule à moteur       0.42      0.18      0.25      6328
                          Méfait       0.36      0.12      0.18      9646
                  Vols qualifiés       0.47      0.01      0.02      2454
Vol dans / sur véhicule à moteur       0.43      0.58      0.50     12941
                    Introduction       0.40      0.67      0.50     11674
  Infractions entrainant la mort       0.00      0.00      0.00        31

                        accuracy                           0.41     43074
                       macro avg       0.35      0.26      0.24     43074
                    weighted avg       0.41      0.41      0.37  

  'precision', 'predicted', average, warn_for)


In [73]:
#knn
knn_classifier=KNeighborsClassifier()
# knn_classifier.fit(X_train,y_train.values.ravel())
# y_pred=knn_classifier.predict(X_test)

#Hyperparameters
params_knn={
    'n_neighbors': [int(x) for x in np.linspace(start = 1, stop = 10, num = 10)],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
}

knn_random = RandomizedSearchCV(estimator = knn_classifier, param_distributions = params_knn, n_iter = 50, cv = 3, 
                               verbose=2, random_state=0, n_jobs = -1)
knn_random.fit(X_train,y_train.values.ravel());

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 31.3min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=KNeighborsClassifier(algorithm='auto',
                                                  leaf_size=30,
                                                  metric='minkowski',
                                                  metric_params=None,
                                                  n_jobs=None, n_neighbors=5,
                                                  p=2, weights='uniform'),
                   iid='warn', n_iter=50, n_jobs=-1,
                   param_distributions={'algorithm': ['auto', 'ball_tree',
                                                      'kd_tree', 'brute'],
                                        'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8,
                                                        9, 10],
                                        'weights': ['uniform', 'distance']},
                   pre_dispatch='2*n_jobs', random_state=0, refit=True,
                   re

In [74]:
#Results
result(knn_random)

Training score:0.4440678228433899
Testing score:0.39004968194270323
[[1642 1148  113 1955 1470    0]
 [1011 2852  270 2972 2541    0]
 [ 228  628  358  726  514    0]
 [1439 2366  209 6365 2562    0]
 [1172 2284  254 2380 5584    0]
 [   3   10    1   12    5    0]]
                                  precision    recall  f1-score   support

        Vol de véhicule à moteur       0.30      0.26      0.28      6328
                          Méfait       0.31      0.30      0.30      9646
                  Vols qualifiés       0.30      0.15      0.20      2454
Vol dans / sur véhicule à moteur       0.44      0.49      0.47     12941
                    Introduction       0.44      0.48      0.46     11674
  Infractions entrainant la mort       0.00      0.00      0.00        31

                        accuracy                           0.39     43074
                       macro avg       0.30      0.28      0.28     43074
                    weighted avg       0.38      0.39      0.38  

  'precision', 'predicted', average, warn_for)


In [76]:
#AdaBoost
ab_classifier = AdaBoostClassifier()
# ab_classifier.fit(X_train, y_train.values.ravel())
# y_pred = ab_classifier.predict(X_test)

#Hyperparameters
params_ab={
    'n_estimators': [int(x) for x in np.linspace(start = 10, stop = 500, num = 10)],
    'learning_rate': scipy.stats.reciprocal(0.01, 1.0),
    'algorithm': ['SAMME.R', 'SAMME'],
    'random_state': [int(x) for x in np.linspace(start = 1, stop = 10, num = 10)]
}

ab_random = RandomizedSearchCV(estimator = ab_classifier, param_distributions = params_ab, n_iter = 50, cv = 3, 
                               verbose=2, random_state=0, n_jobs = -1)
ab_random.fit(X_train,y_train.values.ravel());

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 36.5min finished


In [77]:
#Results
result(ab_random)

Training score:0.37589673345251085
Testing score:0.37412360124437016
[[ 385  240    0 2862 2841    0]
 [ 254  556    0 4217 4619    0]
 [  48   53    0  985 1368    0]
 [ 280  446    0 7328 4887    0]
 [ 342  454    0 3032 7846    0]
 [   0    1    0   11   19    0]]
                                  precision    recall  f1-score   support

        Vol de véhicule à moteur       0.29      0.06      0.10      6328
                          Méfait       0.32      0.06      0.10      9646
                  Vols qualifiés       0.00      0.00      0.00      2454
Vol dans / sur véhicule à moteur       0.40      0.57      0.47     12941
                    Introduction       0.36      0.67      0.47     11674
  Infractions entrainant la mort       0.00      0.00      0.00        31

                        accuracy                           0.37     43074
                       macro avg       0.23      0.23      0.19     43074
                    weighted avg       0.33      0.37      0.30 

  'precision', 'predicted', average, warn_for)
