In [19]:
#Loading the libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [25]:
#Data Loading
# Read the incident export and the PDQ (station) lookup table; both files are decoded
# with the latin_1 codec.
montreal_data, pdq_data = (
    pd.read_csv(csv_name, encoding='latin_1')
    for csv_name in ("Montreal_2015toOct2020.csv", "pdq.csv")
)

In [26]:
#Data Cleaning
# NOTE: this cell rebinds montreal_data / pdq_data to their cleaned versions (OFFENCE becomes
# integer codes, DESC_LIEU is dropped), so re-run the loading cell before re-running it.

#Montreal data
# Rename the source columns, then drop every row that has any missing value.
# .copy() makes the filtered result an independent frame so the column assignments
# below cannot trigger pandas' SettingWithCopyWarning.
montreal_data = (
    montreal_data
    .rename(columns={"CATEGORIE":"OFFENCE","QUART":"OCCURENCETIME"})
    .dropna(how='any',axis=0)
    .copy()
)

# Replace the offence label with integer codes and keep the lookup of original labels:
# offence_list_names[i] is the label that was encoded as code i.
offence_factorize=pd.factorize(montreal_data["OFFENCE"])
montreal_data["OFFENCE"]=offence_factorize[0]
offence_list_names=offence_factorize[1]
montreal_data["OCCURENCETIME"]=pd.factorize(montreal_data["OCCURENCETIME"])[0]

montreal_data["PDQ"]=montreal_data["PDQ"].astype(np.int32)
# DATE is split on "-" into year / month / day. str.split yields strings (e.g. "09"),
# so cast the parts to integers explicitly instead of handing text columns to the models.
montreal_data[["OCCURENCEYEAR","OCCURENCEMONTH","OCCURENCEDATE"]]=(
    montreal_data["DATE"].str.split("-", expand=True).astype(np.int32)
)

#PDQ data
pdq_data = pdq_data.rename(columns={"PREFIX_TEM":"TYPEOFROAD","MUN_TEMP":"MUNICIPALITY"})
# The PDQ number is the text that follows "QUARTIER" in DESC_LIEU; the part before it
# goes into a scratch column that is dropped again below.
pdq_data[["temp","PDQ"]] = pdq_data["DESC_LIEU"].str.split("QUARTIER", expand = True)
pdq_data["PDQ"] = pdq_data["PDQ"].astype(np.int32)
pdq_data = pdq_data.drop(
    columns=["temp","DESC_LIEU","NO_CIV_LIE","DIR_TEMP","LONGITUDE","LATITUDE","NOM_TEMP","OBJECTID"]
)

# Encode the two remaining categorical station attributes as integer codes.
pdq_data["TYPEOFROAD"]=pd.factorize(pdq_data["TYPEOFROAD"])[0]
pdq_data["MUNICIPALITY"]=pd.factorize(pdq_data["MUNICIPALITY"])[0]

In [22]:
#Merging the tables
# Attach the station attributes to every incident via the shared PDQ key.
# pandas' default join is "inner", so incidents whose PDQ is absent from pdq_data are dropped.
montreal_pdq_data = montreal_data.merge(pdq_data, on="PDQ")
montreal_pdq_data

Unnamed: 0,OFFENCE,DATE,OCCURENCETIME,PDQ,X,Y,LONGITUDE,LATITUDE,OCCURENCEYEAR,OCCURENCEMONTH,OCCURENCEDATE,TYPEOFROAD,MUNICIPALITY
0,0,2018-09-13,0,30,294904.159001,5.047549e+06,-73.626778,45.567780,2018,09,13,0,1
1,0,2018-04-30,0,30,294904.159001,5.047549e+06,-73.626778,45.567780,2018,04,30,0,1
2,4,2018-01-10,0,30,294670.696005,5.047695e+06,-73.629772,45.569087,2018,01,10,0,1
3,1,2018-11-12,2,30,294670.696005,5.047695e+06,-73.629772,45.569087,2018,11,12,0,1
4,1,2018-08-15,0,30,294670.696005,5.047695e+06,-73.629772,45.569087,2018,08,15,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
172288,0,2020-09-27,0,13,0.000000,0.000000e+00,1.000000,1.000000,2020,09,27,0,10
172289,1,2020-08-26,0,13,0.000000,0.000000e+00,1.000000,1.000000,2020,08,26,0,10
172290,3,2020-08-24,0,13,0.000000,0.000000e+00,1.000000,1.000000,2020,08,24,0,10
172291,2,2020-01-13,2,13,0.000000,0.000000e+00,1.000000,1.000000,2020,01,13,0,10


In [4]:
#Data Processing
#Apriori
# NOTE(review): disabled experiment, kept for reference only. `apriori` is not imported by
# the notebook's import cell, so re-enabling this needs an import first (presumably from an
# association-rule package -- TODO confirm which one was used).
# montreal_data_subset=montreal_data[["OCCURENCETIME","X","Y"]]
# records=[]
# for i in range(0,len(montreal_data_subset)):
#     records.append([str(montreal_data_subset.values[i,j]) for j in range(0,3)])
# association_rules = apriori(records, min_support=0.0020, min_confidence=0.2, min_lift=3, min_length=2)
# association_results = list(association_rules)
# print(len(association_rules))

In [27]:
#Data Processing
#Splitting data
# Feature matrix: projected coordinates, date parts, time-of-day code and station attributes.
# The target is kept as a single-column DataFrame, which is why the fit calls flatten it
# with .values.ravel().
X=montreal_pdq_data.loc[:, ["X","Y","OCCURENCEYEAR","OCCURENCEMONTH","OCCURENCEDATE","OCCURENCETIME","MUNICIPALITY","TYPEOFROAD"]]
y=montreal_pdq_data.loc[:, ["OFFENCE"]]
# Hold out 20% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)
# Feature scaling is currently disabled:
# feature_scaler=StandardScaler()
# X_train=feature_scaler.fit_transform(X_train)
# X_test=feature_scaler.transform(X_test)

In [28]:
#RandomForest
# 100-tree forest with a fixed seed. fit() returns the estimator itself, so construction
# and training are chained; the label frame is flattened to 1-D first.
rf_classifier = RandomForestClassifier(random_state=10, n_estimators=100).fit(
    X_train, y_train.values.ravel()
)
y_pred = rf_classifier.predict(X_test)

In [29]:
#Results
# Accuracy on the training rows vs. the held-out rows, followed by the confusion matrix
# and the per-class report (rows labelled with the original offence names).
train_accuracy = rf_classifier.score(X_train, y_train)
test_accuracy = accuracy_score(y_test, y_pred)
print("Training score:", train_accuracy, sep="")
print("Testing score:", test_accuracy, sep="")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=offence_list_names))

Training score:0.957115080459103
Testing score:0.40880466641516006
[[1304  848  101 1447 1340    1]
 [ 656 2304  153 2314 2255    0]
 [ 175  394  187  565  613    1]
 [ 930 1782  209 5118 2400    2]
 [ 686 1503  116 1855 5173    0]
 [   5    7    0    5    9    1]]
                                  precision    recall  f1-score   support

        Vol de véhicule à moteur       0.35      0.26      0.30      5041
                          Méfait       0.34      0.30      0.32      7682
                  Vols qualifiés       0.24      0.10      0.14      1935
Vol dans / sur véhicule à moteur       0.45      0.49      0.47     10441
                    Introduction       0.44      0.55      0.49      9333
  Infractions entrainant la mort       0.20      0.04      0.06        27

                        accuracy                           0.41     34459
                       macro avg       0.34      0.29      0.30     34459
                    weighted avg       0.40      0.41      0.40   

In [34]:
#KNN
# 10-nearest-neighbours classifier, fitted exactly once on the flattened 1-D label array.
# (Fitting on the single-column y_train DataFrame as well would only repeat the training
# work and pass a column vector where scikit-learn expects a 1-D target.)
# NOTE(review): the features are unscaled, so the large-magnitude X / Y coordinates
# presumably dominate the neighbour distances -- confirm whether scaling is wanted.
knn_classifier=KNeighborsClassifier(n_neighbors = 10)
knn_classifier.fit(X_train,y_train.values.ravel())
y_pred=knn_classifier.predict(X_test)

  


In [35]:
#Results
# Accuracy on the training rows vs. the held-out rows, followed by the confusion matrix
# and the per-class report (rows labelled with the original offence names).
train_accuracy = knn_classifier.score(X_train, y_train)
test_accuracy = accuracy_score(y_test, y_pred)
print("Training score:", train_accuracy, sep="")
print("Testing score:", test_accuracy, sep="")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=offence_list_names))

Training score:0.5079443388423757
Testing score:0.3928726892829159
[[1328  915   95 1560 1143    0]
 [ 839 2269  201 2365 2008    0]
 [ 179  491  283  566  416    0]
 [1167 1870  165 5174 2065    0]
 [ 887 1828  203 1931 4484    0]
 [   3    7    2   10    5    0]]
                                  precision    recall  f1-score   support

        Vol de véhicule à moteur       0.30      0.26      0.28      5041
                          Méfait       0.31      0.30      0.30      7682
                  Vols qualifiés       0.30      0.15      0.20      1935
Vol dans / sur véhicule à moteur       0.45      0.50      0.47     10441
                    Introduction       0.44      0.48      0.46      9333
  Infractions entrainant la mort       0.00      0.00      0.00        27

                        accuracy                           0.39     34459
                       macro avg       0.30      0.28      0.28     34459
                    weighted avg       0.38      0.39      0.39   

  'precision', 'predicted', average, warn_for)


In [36]:
#AdaBoost
# Boosted ensemble of 100 estimators with a fixed seed, trained on the flattened labels.
ab_classifier = AdaBoostClassifier(random_state=10, n_estimators=100)
ab_classifier.fit(X=X_train, y=y_train.values.ravel())
y_pred = ab_classifier.predict(X_test)

In [37]:
#Results
# Accuracy on the training rows vs. the held-out rows, followed by the confusion matrix
# and the per-class report (rows labelled with the original offence names).
train_accuracy = ab_classifier.score(X_train, y_train)
test_accuracy = accuracy_score(y_test, y_pred)
print("Training score:", train_accuracy, sep="")
print("Testing score:", test_accuracy, sep="")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=offence_list_names))

Training score:0.3716209353280033
Testing score:0.37400969267825535
[[ 502  171    0 2098 2270    0]
 [ 329  396    0 3248 3709    0]
 [  69   57    0  768 1041    0]
 [ 386  353    0 5778 3924    0]
 [ 478  309    0 2334 6212    0]
 [   3    1    0    9   14    0]]
                                  precision    recall  f1-score   support

        Vol de véhicule à moteur       0.28      0.10      0.15      5041
                          Méfait       0.31      0.05      0.09      7682
                  Vols qualifiés       0.00      0.00      0.00      1935
Vol dans / sur véhicule à moteur       0.41      0.55      0.47     10441
                    Introduction       0.36      0.67      0.47      9333
  Infractions entrainant la mort       0.00      0.00      0.00        27

                        accuracy                           0.37     34459
                       macro avg       0.23      0.23      0.20     34459
                    weighted avg       0.33      0.37      0.31  

  'precision', 'predicted', average, warn_for)


In [38]:
#GradientBoosting
# Gradient-boosted ensemble with 100 boosting stages and a fixed seed, trained on the
# flattened labels.
gb_classifier = GradientBoostingClassifier(random_state=0, n_estimators=100)
gb_classifier.fit(X=X_train, y=y_train.values.ravel())
y_pred = gb_classifier.predict(X_test)

In [39]:
#Results
# Accuracy on the training rows vs. the held-out rows, followed by the confusion matrix
# and the per-class report (rows labelled with the original offence names).
train_accuracy = gb_classifier.score(X_train, y_train)
test_accuracy = accuracy_score(y_test, y_pred)
print("Training score:", train_accuracy, sep="")
print("Testing score:", test_accuracy, sep="")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=offence_list_names))

Training score:0.4050089237778777
Testing score:0.39977944803969934
[[ 753  371    1 1994 1916    6]
 [ 257  931    7 3035 3447    5]
 [  93   92    6  707 1033    4]
 [ 365  664    3 5797 3604    8]
 [ 421  643    1 1976 6289    3]
 [   4    2    0   10   11    0]]
                                  precision    recall  f1-score   support

        Vol de véhicule à moteur       0.40      0.15      0.22      5041
                          Méfait       0.34      0.12      0.18      7682
                  Vols qualifiés       0.33      0.00      0.01      1935
Vol dans / sur véhicule à moteur       0.43      0.56      0.48     10441
                    Introduction       0.39      0.67      0.49      9333
  Infractions entrainant la mort       0.00      0.00      0.00        27

                        accuracy                           0.40     34459
                       macro avg       0.32      0.25      0.23     34459
                    weighted avg       0.39      0.40      0.35  

In [None]:
#KFold
# 5-fold cross-validation of the random-forest configuration over the full feature and
# label set; the array of per-fold scores is the cell's displayed value.
cross_val_score(estimator=rf_classifier, X=X, y=y.values.ravel(), cv=5)