# **Initial Load**

Authenticate with Google Drive and read in our dataset.

In [1]:
# Install any required packages.
!pip install -U -q imbalanced-learn
!pip install -U -q PyDrive

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/226.0 KB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m225.3/226.0 KB[0m [31m12.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.0/226.0 KB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Import any required libraries.
from collections import Counter
from datetime import datetime
from google.colab import auth
from google.colab import drive
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from oauth2client.client import GoogleCredentials
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
import numpy as np
import pandas as pd

In [47]:
# Authenticate with Google Drive.
# The Colab session is authorised first; the application-default credentials
# are then attached to a PyDrive GoogleAuth object, which is used to build the
# Drive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE: this rebinds `drive`, shadowing the `drive` module imported from
# google.colab at the top of the notebook.
drive = GoogleDrive(gauth)

In [48]:
# Fetch the dataset from Google Drive by its file id, save it next to the
# notebook, and load the saved CSV into a data frame.
local_csv_name = 'ChicagoCrimeRecords.csv'

downloaded_file = drive.CreateFile({ 'id': '14RMV7CRXwwCt_9iLHenyQrB9GC5gYwul' })
downloaded_file.GetContentFile(local_csv_name)

chicago_crime_records = pd.read_csv(local_csv_name)

In [49]:
# Clear the downloaded file and any other related variables from memory, once we've converted it to a data frame.
# Only the Python names are removed here; the CSV that GetContentFile wrote to
# disk is not deleted.
del downloaded_file, drive, gauth

# **Data Transformation**
Drop any unnecessary or redundant columns, impute or drop any missing values, encode any categorical features, and synthesise additional examples to offset the imbalance in our dataset.

In [50]:
# Discard every row that contains at least one missing value.
chicago_crime_records = chicago_crime_records.dropna()

In [51]:
# Randomly sample our dataset, so that it is small enough to work with.
# The sample size is capped at the number of rows that are actually available:
# sampling without replacement raises a ValueError when more rows are requested
# than the data frame holds. With at least 1,000,000 rows available the result
# is the same as before.
sample_size = min(1000000, len(chicago_crime_records))
chicago_crime_records = chicago_crime_records.sample(sample_size, random_state = 785)

In [52]:
# Keep only the rows whose 'Primary Type' occurs more than once in the sample,
# as SMOTE will be unable to work with a class that has a single instance.
primary_type_counts = chicago_crime_records["Primary Type"].value_counts()
repeated_primary_types = primary_type_counts[primary_type_counts > 1].index

has_repeated_type = chicago_crime_records["Primary Type"].isin(repeated_primary_types)
chicago_crime_records = chicago_crime_records[has_repeated_type]

In [53]:
# Create new features for the weekday (Monday = 0), the month, and the hour
# (on a 0-23 clock) on which the crime occurred. The fixed slices below read
# the month, day, year and hour out of the 'Date' string; its last two
# characters carry the AM/PM marker.
def _to_24_hour(date_text):
    """Return the hour of a 'Date' string on a 0-23 clock.

    The string stores a 12-hour clock reading, so 12 AM has to map to 0 and
    12 PM to 12. Taking the reading modulo 12 before adding the PM offset
    handles both cases; simply adding 12 to every non-AM reading would turn
    12 PM into 24 and leave 12 AM as 12.
    """
    hour = int(date_text[11:13]) % 12
    return hour if date_text[-2:] == "AM" else hour + 12

chicago_crime_records["Weekday"] = chicago_crime_records.Date.map(lambda x: datetime(month = int(x[:2]),day = int(x[3:5]), year = int(x[6:10])).weekday())
chicago_crime_records["Month"] = chicago_crime_records.Date.map(lambda x: int(x[:2]))
chicago_crime_records["Hour"] = chicago_crime_records.Date.map(_to_24_hour)

In [54]:
# Discard the features that we are confident will not be of any use to our
# model. Columns that are already absent are skipped rather than raising.
unhelpful_columns = [
    'Arrest',        # Occurs after the crime has been classified.
    'Case Number',   # Assigned after the crime has occurred.
    'Date',          # Replaced by 'Weekday', 'Month' and 'Hour'.
    'Description',   # A more granular version of 'Primary Type'.
    'FBI Code',      # An encoded form of 'Primary Type'.
    'Location',      # Replaced by 'Latitude', 'Longitude', 'X Coordinate', 'Y Coordinate'.
    'ID',            # Provides zero insight into the crime as it is simply an index for each row.
    'IUCR',          # An encoded form of 'Primary Type'.
    'Updated On',    # Updated after the crime has already been recorded.
]

chicago_crime_records = chicago_crime_records.drop(columns = unhelpful_columns, errors = 'ignore')

In [55]:
# Replace the values of 'Block' and 'Location Description' with their integer
# category codes.
for text_column in ('Block', 'Location Description'):
    as_category = chicago_crime_records[text_column].astype('category')
    chicago_crime_records[text_column] = as_category.cat.codes

# Cast 'Domestic' to integers.
chicago_crime_records['Domestic'] = chicago_crime_records['Domestic'].astype(int)

In [56]:
# Add a 0/1 flag that is 1 for every row whose 'Primary Type' is 'THEFT'.
is_theft = chicago_crime_records['Primary Type'] == 'THEFT'
chicago_crime_records['Theft'] = is_theft.astype('int64')

In [57]:
# The target is the 'Theft' flag; the features are every remaining column
# except 'Primary Type', from which the flag was derived.
Y = chicago_crime_records['Theft']
X = chicago_crime_records.drop(columns = ['Primary Type', 'Theft'])

In [58]:
# Give the features and the target a fresh 0..n-1 index, discarding the old one.
X = X.reset_index(drop = True)
Y = Y.reset_index(drop = True)

In [59]:
# Clear our original data frame from memory.
# X and Y were derived from it in an earlier cell, so the name is no longer
# needed from this point on.
del chicago_crime_records

In [60]:
# Synthesise new examples using SMOTE and undersampling.
# Both samplers are seeded so that re-running the notebook reproduces the same
# resampled dataset; unseeded, every run draws different samples.
# NOTE(review): resampling happens before the train/test split later in the
# notebook, so synthetic rows in the test set may be interpolated from rows in
# the training set - consider resampling the training split only.
# NOTE(review): with the default sampling strategies SMOTE presumably already
# balances the two classes, which would leave the undersampler nothing to
# remove - confirm against the imbalanced-learn documentation.
oversampler = SMOTE(k_neighbors = 1, random_state = 785)
undersampler = RandomUnderSampler(random_state = 785)
pipeline = Pipeline(steps= [('o', oversampler), ('u', undersampler)])
X, Y = pipeline.fit_resample(X, Y)

In [None]:
# Compute the variance inflation factor of every feature, to see whether any
# of them have high VIFs.
feature_matrix = X.values
vif_scores = [
    variance_inflation_factor(feature_matrix, column_position)
    for column_position in range(len(X.columns))
]

vif_data = pd.DataFrame({"feature": X.columns, "VIF": vif_scores})
print(vif_data)

In [62]:
# Discard some of the features with high VIFs. Columns that are already absent
# are skipped rather than raising.
high_vif_columns = [
    'District',         # Was removed, as 'Beat' and 'Block' provide more granular information of the same nature.
    'Latitude',         # Had an extremely high VIF (in the hundred thousands).
    'Longitude',        # Had an extremely high VIF (in the ten thousands).
    'X Coordinate',     # Had an extremely high VIF (in the ten thousands).
    'Y Coordinate',     # Had an extremely high VIF (in the hundred thousands).
    'Year',             # Had a high VIF (35+)
]

X = X.drop(columns = high_vif_columns, errors = 'ignore')

In [63]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# identical between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size = 0.2,
    random_state = 123)

In [64]:
# Clear our unsplit data frames from memory.
# The training and test splits created in the previous cell are what the rest
# of the notebook uses.
del X, Y

# **Model Tuning**
Carry out some hyperparameter tuning before we train our Random Forest classifier.

In [21]:
# Carry out some hyperparameter tuning: every combination in the grid below is
# fitted and scored by cross-validated grid search on the training set.
parameters = {
    'n_estimators': [25, 50, 75],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [5, 7, 9],
    'criterion': ['gini', 'entropy']
}

# The forest is seeded so that the search selects the same parameters on every
# run; an unseeded forest is fitted differently each time, which can change
# which combination scores best.
tuned_model = GridSearchCV(RandomForestClassifier(random_state = 41), parameters).fit(X_train, Y_train)

In [None]:
# Take a look at the resultant parameters.
# `best_params_` is left as the last expression so the notebook displays it.
tuned_model.best_params_

# **Model Training**
Train a Random Forest classifier.

In [65]:
# Build the Random Forest from an explicit table of hyperparameters, fit it on
# the training split, and predict the held-out test split.
# NOTE(review): 'criterion', 'max_depth', 'max_features' and 'n_estimators' are
# presumably the values chosen by the grid search above - confirm.
forest_settings = {
    'bootstrap': True,
    'class_weight': None,
    'criterion': 'entropy',
    'max_depth': 9,
    'max_features': 'sqrt',
    'max_leaf_nodes': None,
    'min_impurity_decrease': 0.0,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'min_weight_fraction_leaf': 0.0,
    'n_estimators': 75,
    'n_jobs': 1,
    'oob_score': False,
    'random_state': 41,
    'verbose': 0,
    'warm_start': False,
}

classifier = RandomForestClassifier(**forest_settings)
classifier.fit(X_train, Y_train)

classifier_predictions = classifier.predict(X_test)

# **Model Evaluation**
Evaluate our Random Forest classifier.

In [None]:
# Rank the features of our model from most to least important. Importance is
# calculated as the decrease in node impurity weighted by the probability of
# reaching that node.
importance_by_feature = pd.Series(classifier.feature_importances_, index = X_train.columns)
feature_importance = importance_by_feature.sort_values(ascending = False)

feature_importance

In [None]:
# Score the test-set predictions: overall accuracy, plus recall and precision
# averaged equally over the classes (macro average).
test_accuracy = accuracy_score(Y_test, classifier_predictions)
macro_recall = recall_score(Y_test, classifier_predictions, average = 'macro')
macro_precision = precision_score(Y_test, classifier_predictions, average = 'macro')

print("Accuracy: ", test_accuracy)
print("Recall", macro_recall)
print("Precision", macro_precision)