In [1]:
#Loading the libraries
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from random import randint
import scipy.stats

In [2]:
#Data Loading
# Both CSV paths are relative, so they are resolved against the kernel's
# current working directory.
toronto_data = pd.read_csv("Toronto_2014to2019.csv")
# The Montreal file is decoded as Latin-1 rather than pandas' default encoding.
montreal_data = pd.read_csv(
    "Montreal_2015toOct2020.csv",
    encoding='latin_1',
)

In [4]:
#Toronto Data Processing for comparison
# Columns retained from the raw Toronto frame; 'MCI' is the label that gets
# integer-encoded below, the rest stay in the frame alongside it.
col_list = [
    'X', 'Y', 'Long', 'Lat',
    'occurrenceyear', 'occurrencemonth', 'occurrenceday', 'occurrencehour',
    'MCI', 'Hood_ID',
]

# Keep only those columns and discard rows whose occurrence year is missing.
toronto_data = toronto_data[col_list].dropna(subset=['occurrenceyear'])

# pd.factorize returns (codes, uniques): the codes replace the MCI column and
# the uniques are kept so a code can be translated back to its label.
crime_var = pd.factorize(toronto_data["MCI"])
toronto_data['MCI'], definition_list_MCI = crime_var

# Months are encoded by order of first appearance in the data, which is not
# necessarily calendar order.
toronto_data['occurrencemonth'] = pd.factorize(toronto_data['occurrencemonth'])[0]

In [5]:
#Montreal Data Processing for comparison
# Rename the two French column names, then drop every row that has a missing
# value in any column. rename() is assigned back instead of using inplace=True
# so the frame is only ever rebound, never mutated behind another reference.
montreal_data = montreal_data.rename(columns={"CATEGORIE":"OFFENCE","QUART":"OCCURENCETIME"})
montreal_data = montreal_data.dropna(how='any',axis=0)

# pd.factorize returns (codes, uniques): the codes replace the OFFENCE column
# and the uniques are kept so a code can be translated back to its label.
offence_factorize=pd.factorize(montreal_data["OFFENCE"])
montreal_data["OFFENCE"]=offence_factorize[0]
offence_list_names=offence_factorize[1]
montreal_data["OCCURENCETIME"]=pd.factorize(montreal_data["OCCURENCETIME"])[0]

montreal_data["PDQ"]=montreal_data["PDQ"].astype(np.int32)

# str.split yields strings; cast the three date parts to integers so the
# feature matrix built from them is numeric instead of object-typed.
# NOTE(review): assumes DATE is formatted as YYYY-MM-DD — confirm against the CSV.
montreal_data[["OCCURENCEYEAR","OCCURENCEMONTH","OCCURENCEDATE"]]=(
    montreal_data["DATE"].str.split("-", expand=True).astype(np.int32)
)

In [6]:
#Splitting Toronto Data
# Target is the MCI column; the features are every other column of the frame.
# Both are handed to scikit-learn as plain NumPy arrays.
y_tor = toronto_data['MCI'].to_numpy()
X_tor = toronto_data.drop(columns=['MCI']).to_numpy()

# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train_tor, X_test_tor, y_train_tor, y_test_tor = train_test_split(
    X_tor,
    y_tor,
    test_size=0.20,
    random_state=0,
)

In [7]:
#Splitting Montreal data
# Feature columns used for the Montreal models (nine in total).
X_mon=montreal_data[["X","Y","OCCURENCEYEAR","OCCURENCEMONTH","OCCURENCEDATE","OCCURENCETIME","LATITUDE","LONGITUDE","PDQ"]]
# Select the target as a 1-D Series rather than a one-column DataFrame: this
# matches the 1-D Toronto target and avoids scikit-learn's column-vector
# warning when the classifier is fitted.
y_mon=montreal_data["OFFENCE"]
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train_mon, X_test_mon, y_train_mon, y_test_mon = train_test_split(X_mon, y_mon, test_size=0.20, random_state=0)

In [8]:
def runRandomForest(X_train,y_train,X_test,y_test):
    """Fit a baseline random forest and print its accuracy on both splits.

    The forest uses the entropy split criterion and ``random_state=0``; every
    other hyper-parameter is left at the scikit-learn default.

    Parameters
    ----------
    X_train, y_train : training features and labels used to fit the forest.
    X_test, y_test : held-out features and labels used only for scoring.

    Returns
    -------
    None. The two accuracies are printed as percentages.
    """
    forest = RandomForestClassifier(criterion='entropy', random_state=0)
    forest.fit(X_train, y_train)

    # (message, features, labels) for each split, in reporting order.
    reports = (
        (" Random Forest Training Accuracy -> ", X_train, y_train),
        (" Random Forest Testing Accuracy -> ", X_test, y_test),
    )

    # Predict on both splits first, then score, then print.
    predictions = [forest.predict(features) for _message, features, _labels in reports]
    accuracies = [
        accuracy_score(labels, predicted)
        for (_message, _features, labels), predicted in zip(reports, predictions)
    ]
    for (message, _features, _labels), accuracy in zip(reports, accuracies):
        print(message, accuracy*100)

In [None]:
#RandomForest on Toronto
# Baseline forest on the Toronto split; prints training and testing accuracy.
runRandomForest(
    X_train=X_train_tor,
    y_train=y_train_tor,
    X_test=X_test_tor,
    y_test=y_test_tor,
)

In [None]:
#RandomForest on Montreal
# Baseline forest on the Montreal split; prints training and testing accuracy.
runRandomForest(
    X_train=X_train_mon,
    y_train=y_train_mon,
    X_test=X_test_mon,
    y_test=y_test_mon,
)

In [None]:
def randomSearchCV(X_train,y_train,X_test,y_test,maxFeatures,target_names=None):
    """Tune a random forest with a randomized search and print an evaluation.

    Samples 50 hyper-parameter combinations, scores each with 3-fold
    cross-validation on the training split, and then evaluates the best
    estimator found: training accuracy, testing accuracy, confusion matrix and
    classification report are printed.

    Parameters
    ----------
    X_train, y_train : training features and labels used for the search.
    X_test, y_test : held-out features and labels used only for evaluation.
    maxFeatures : int
        Largest ``max_features`` value to try; the candidates are
        1..maxFeatures. It should not exceed the number of feature columns.
    target_names : sequence of str, optional
        Display names for the classes, indexed by integer class code (for
        example the uniques returned by ``pd.factorize`` for the label column).
        When omitted, the report shows the raw class codes.

    Returns
    -------
    None. All results are printed.
    """
    # Seed the forest as well as the search so repeated runs are reproducible.
    rf_classifier = RandomForestClassifier(random_state=0)
    #Hyper parameters
    params_rf={
        'min_samples_leaf': [1, 2, 3],
        'n_estimators': [int(x) for x in np.linspace(start = 10, stop = 500, num = 10)],
        # range() gives exact integers; truncating linspace floats could
        # silently drop or duplicate a value.
        'max_features': list(range(1, maxFeatures + 1)),
        'bootstrap': [True, False],
        'criterion': ['gini','entropy'],
    }

    rf_random = RandomizedSearchCV(estimator = rf_classifier, param_distributions = params_rf, n_iter = 50, cv = 3,
                                   verbose=2, random_state=0, n_jobs = -1)
    rf_random.fit(X_train,y_train)

    # best_estimator_ lives on the fitted search object, not on the template
    # estimator that was passed into it.
    best_model = rf_random.best_estimator_
    y_pred = best_model.predict(X_test)
    print("Training score:"+str(accuracy_score(y_train,best_model.predict(X_train))))
    print("Testing score:"+str(accuracy_score(y_test,y_pred)))
    print(confusion_matrix(y_test,y_pred))
    if target_names is None:
        print(classification_report(y_test,y_pred))
    else:
        # Pin labels to 0..k-1 so each name lines up with its class code even
        # if some class happens to be absent from the test split.
        names = [str(name) for name in target_names]
        print(classification_report(y_test,y_pred, labels=list(range(len(names))), target_names=names))

In [None]:
#RandomForest on Toronto
# Randomized hyper-parameter search on the Toronto split; max_features
# candidates go up to 9.
randomSearchCV(
    X_train_tor,
    y_train_tor,
    X_test_tor,
    y_test_tor,
    maxFeatures=9,
)

In [None]:
#RandomForest on Montreal
# Randomized hyper-parameter search on the Montreal split. The five-argument
# call matches randomSearchCV (runRandomForest accepts only four arguments);
# 9 is the number of feature columns selected for X_mon.
randomSearchCV(X_train_mon,y_train_mon,X_test_mon,y_test_mon,9)