In [7]:
#Loading the libraries
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from random import randint
import scipy.stats

In [3]:
#Data Loading
toronto_data=pd.read_csv("Toronto_2014to2019.csv")

In [3]:
#Data Processing
col_list = ['X','Y','Long','Lat','occurrenceyear','occurrencemonth','occurrenceday','occurrencedayofyear','occurrencedayofweek',\
            'occurrencehour','MCI','Division','Hood_ID','premisetype']
toronto_data=toronto_data[col_list]
toronto_data=toronto_data.dropna(subset=['occurrenceyear'])
crime_var=pd.factorize(toronto_data["MCI"])
toronto_data['MCI'] = crime_var[0]
definition_list_MCI = crime_var[1]
toronto_data['premisetype'] = pd.factorize(toronto_data['premisetype'])[0]
toronto_data['occurrencedayofweek'] = pd.factorize(toronto_data['occurrencedayofweek'])[0]
toronto_data['Division'] = pd.factorize(toronto_data['Division'])[0]
toronto_data['occurrencemonth'] = pd.factorize(toronto_data['occurrencemonth'])[0]

In [5]:
#Splitting Data
X=toronto_data.drop(['MCI'],axis=1).values
y=toronto_data['MCI'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [5]:
def result(classifier_random):
    y_pred = classifier_random.best_estimator_.predict(X_test)
    print("Training score:"+str(accuracy_score(y_train,classifier_random.best_estimator_.predict(X_train))))
    print("Testing score:"+str(accuracy_score(y_test,y_pred)))
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred, target_names=definition_list_MCI))

In [None]:
#RandomForest
rf_classifier = RandomForestClassifier()
# rf_classifier.fit(X_train, y_train.values.ravel())
# y_pred = rf_classifier.predict(X_test)

#Hyper parameters
params_rf={
    'min_samples_leaf': [int(x) for x in np.linspace(start = 1, stop = 3, num = 3)],
    'n_estimators': [int(x) for x in np.linspace(start = 10, stop = 500, num = 10)],
    'max_features': [int(x) for x in np.linspace(start = 1, stop = 8, num = 8)],
    'bootstrap': [True, False],
    'criterion': ['gini','entropy'],
}

rf_random = RandomizedSearchCV(estimator = rf_classifier, param_distributions = params_rf, n_iter = 50, cv = 3, 
                               verbose=2, random_state=0, n_jobs = -1)
rf_random.fit(X_train,y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 37.6min


In [10]:
#Results
result(rf_random)

Training score:0.9825317989097516
Testing score:0.6540846981296637
[[19390  1597   519    27   771]
 [ 4004  4253    38    47   215]
 [ 2376   226  1457     3   310]
 [  929   249    36    26   123]
 [ 2144   456   191    17  1872]]
                 precision    recall  f1-score   support

        Assault       0.67      0.87      0.76     22304
Break and Enter       0.63      0.50      0.55      8557
        Robbery       0.65      0.33      0.44      4372
     Theft Over       0.22      0.02      0.04      1363
     Auto Theft       0.57      0.40      0.47      4680

       accuracy                           0.65     41276
      macro avg       0.55      0.42      0.45     41276
   weighted avg       0.63      0.65      0.63     41276



In [12]:
#knn
knn_classifier=KNeighborsClassifier()
# knn_classifier.fit(X_train,y_train.values.ravel())
# y_pred=knn_classifier.predict(X_test)

#Hyperparameters
params_knn={
    'n_neighbors': [int(x) for x in np.linspace(start = 1, stop = 10, num = 10)],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
}

knn_random = RandomizedSearchCV(estimator = knn_classifier, param_distributions = params_knn, n_iter = 50, cv = 3, 
                               verbose=2, random_state=0, n_jobs = -1)
knn_random.fit(X_train,y_train);

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 11.5min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 49.4min finished


In [13]:
#Results
result(knn_random)

Training score:0.9858509993943064
Testing score:0.5804583777497819
[[19065  1873   688    36   642]
 [ 5695  2341   174    56   291]
 [ 2535   297  1355     3   182]
 [ 1048   185    39    19    72]
 [ 2855   447   187    12  1179]]
                 precision    recall  f1-score   support

        Assault       0.61      0.85      0.71     22304
Break and Enter       0.46      0.27      0.34      8557
        Robbery       0.55      0.31      0.40      4372
     Theft Over       0.15      0.01      0.03      1363
     Auto Theft       0.50      0.25      0.33      4680

       accuracy                           0.58     41276
      macro avg       0.45      0.34      0.36     41276
   weighted avg       0.54      0.58      0.54     41276



In [15]:
#AdaBoost
ab_classifier = AdaBoostClassifier()
# ab_classifier.fit(X_train, y_train.values.ravel())
# y_pred = ab_classifier.predict(X_test)

#Hyperparameters
params_ab={
    'n_estimators': [int(x) for x in np.linspace(start = 10, stop = 500, num = 10)],
    'learning_rate': scipy.stats.reciprocal(0.01, 1.0),
    'algorithm': ['SAMME.R', 'SAMME'],
    'random_state': [int(x) for x in np.linspace(start = 1, stop = 10, num = 10)]
}

ab_random = RandomizedSearchCV(estimator = ab_classifier, param_distributions = params_ab, n_iter = 50, cv = 3, 
                               verbose=2, random_state=0, n_jobs = -1)
ab_random.fit(X_train,y_train);

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 19.8min finished


In [16]:
#Results
result(ab_random)

Training score:0.5771653543307087
Testing score:0.5776964822172691
[[19508  2032     7     0   757]
 [ 5216  3282     0     0    59]
 [ 3661   300     7     0   404]
 [ 1061   224     0     0    78]
 [ 2772   858     2     0  1048]]
                 precision    recall  f1-score   support

        Assault       0.61      0.87      0.72     22304
Break and Enter       0.49      0.38      0.43      8557
        Robbery       0.44      0.00      0.00      4372
     Theft Over       0.00      0.00      0.00      1363
     Auto Theft       0.45      0.22      0.30      4680

       accuracy                           0.58     41276
      macro avg       0.40      0.30      0.29     41276
   weighted avg       0.53      0.58      0.51     41276



  'precision', 'predicted', average, warn_for)
