# **Initial Load**

Authenticate with Google Drive and read in our dataset.

In [None]:
# Install any required packages.
!pip install -U -q PyDrive

In [None]:
# Import any required libraries.
from google.colab import auth
from google.colab import drive
from oauth2client.client import GoogleCredentials
from patsy import dmatrices
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, balanced_accuracy_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, scale
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Authenticate with Google Drive.
# Note: the final assignment rebinds `drive`, which was imported above via
# `from google.colab import drive`; from here on `drive` is the PyDrive
# `GoogleDrive` client built from `gauth`, not the Colab module.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
# Fetch the dataset from Google Drive into a local CSV file, then load that file as a data frame.
dataset_filename = 'ChicagoCrimeRecords.csv'

downloaded_file = drive.CreateFile({ 'id': '14RMV7CRXwwCt_9iLHenyQrB9GC5gYwul' })
downloaded_file.GetContentFile(dataset_filename)

chicago_crime_records = pd.read_csv(dataset_filename)

In [None]:
# The dataset now lives in a data frame, so drop the names of the Drive objects we used to fetch it.
del downloaded_file
del drive
del gauth

# **Data Preparation**
Trim our dataset and split it out into training and test sets.

In [None]:
# Count the missing values in each column.
chicago_crime_records.isna().sum()

In [None]:
# Determine the size of our dataset with the NANs removed.
# Count the rows that have no missing value in any column, i.e. the rows that
# dropna() keeps. Subtracting the total number of missing cells from the row
# count under-counts whenever a single row has more than one missing value.
int(chicago_crime_records.notna().all(axis = 1).sum())

In [None]:
# Keep only the rows that have no missing values.
chicago_crime_records = chicago_crime_records.dropna()

In [None]:
# Discard the columns that we are confident will not be of any use to our model.
# Columns that are already absent are skipped (errors = 'ignore'), so this cell can be re-run safely.
unused_columns = [
    'Block',
    'Case Number',
    'Date',
    'Description',
    'ID',
    'Latitude',
    'Location',
    'Longitude',
    'Updated On',
    'X Coordinate',
    'Y Coordinate',
    'Year',
]

chicago_crime_records = chicago_crime_records.drop(columns = unused_columns, errors = 'ignore')

In [None]:
# Encode the 'Arrest', 'Domestic', 'FBI Code', 'IUCR' and 'Location Description' columns.

# The two flag columns are cast to integers.
for flag_column in ('Arrest', 'Domestic'):
    chicago_crime_records[flag_column] = chicago_crime_records[flag_column].astype(int)

# The remaining columns are converted to categoricals and replaced by their integer category codes.
for label_column in ('FBI Code', 'IUCR', 'Location Description'):
    as_category = chicago_crime_records[label_column].astype('category')
    chicago_crime_records[label_column] = as_category.cat.codes

In [None]:
# Preview the first five rows of the dataset now that all of our features are encoded.
chicago_crime_records.head(n = 5)

In [None]:
# Separate out our independent variables (X) and our dependent variable (Y).
target_column = 'Primary Type'

Y = chicago_crime_records[target_column]
X = chicago_crime_records.drop(columns = [target_column])

In [None]:
# Draw a reproducible 250,000-row sample of the features, to test out our imputation by KNN approach.
X_sample = X.sample(n = 250000, random_state = 111)

In [None]:
# Count the missing values in each column of our sampled dataset.
X_sample.isna().sum()

In [None]:
# Impute any NANs with KNN: fit the imputer on the sample and fill the sample in a single step.
imputer = KNNImputer(n_neighbors = 5, weights = 'uniform', metric = 'nan_euclidean')
X_sample_imputed = imputer.fit_transform(X_sample)

In [None]:
# Confirm that all missing values were removed.
# Use NumPy's vectorised sum; the builtin sum() would loop over every cell of
# the flattened array in Python.
print('Missing: %d' % np.isnan(X_sample_imputed).sum())

In [None]:
# Split out our data into training and test sets.
# The labels are looked up through X_sample's index so that every feature row
# stays paired with its own 'Primary Type'. Calling Y.sample() here would draw
# an unrelated random set of rows and break the feature/label pairing.
x_train, x_test, y_train, y_test = train_test_split(X_sample, Y.loc[X_sample.index], test_size = 0.2, random_state = 123)

In [None]:
# Split out our imputed data into training and test sets.
# X_sample_imputed was produced from X_sample, so the matching labels are the
# rows of Y at X_sample's index. Calling Y.sample() here would draw an unrelated
# random set of rows and break the feature/label pairing.
x_imputed_train, x_imputed_test, y_train, y_test = train_test_split(X_sample_imputed, Y.loc[X_sample.index], test_size = 0.2, random_state = 123)

In [None]:
# Scale our training and test data.
# The scaler is fitted on the training split only and then applied to both
# splits, so the test data is standardised with the training mean and standard
# deviation rather than with its own statistics.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_imputed_train)
x_test_scaled = scaler.transform(x_imputed_test)

In [None]:
# Take a look at our scaled training data (shown as this cell's output).
x_train_scaled

In [None]:
# The full data frame and the unsampled feature/label sets are no longer needed, so drop their names.
del chicago_crime_records
del X
del Y

# **Model Tuning**
Carry out some hyperparameter tuning before we train our Random Forest classifier.

In [None]:
# Carry out some hyperparameter tuning.
# 'auto' is left out of max_features: for classifiers it was an alias of 'sqrt'
# and recent scikit-learn releases no longer accept it.
parameters = {
    'n_estimators': [25, 50, 75],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [5, 7, 9],
    'criterion': ['gini', 'entropy']
}

# Draw ONE 50,000-row subsample of the training split, taking features and labels
# together so each row keeps its own label. Sampling x_train and y_train with
# separate .sample() calls would pair features with unrelated labels.
x_tune, _, y_tune, _ = train_test_split(x_train, y_train, train_size = 50000, random_state = 123)

# Seed the forest so that the search result is reproducible.
tuned_model = GridSearchCV(RandomForestClassifier(random_state = 41), parameters).fit(x_tune, y_tune)

In [None]:
# Take a look at the parameter combination selected by the grid search.
tuned_model.best_params_

# **Model Training**
Train a Random Forest classifier.

In [None]:
# Configure, fit and apply a Random Forest classifier.
forest_settings = {
    'bootstrap': True,
    'class_weight': None,
    'criterion': 'entropy',
    'max_depth': 7,
    'max_leaf_nodes': None,
    'min_impurity_decrease': 0.0,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'min_weight_fraction_leaf': 0.0,
    'n_estimators': 50,
    'n_jobs': 1,
    'oob_score': False,
    'random_state': 41,
    'verbose': 0,
    'warm_start': False,
}

classifier = RandomForestClassifier(**forest_settings)

# Fit on the scaled training features, then predict the labels of the scaled test features.
classifier.fit(x_train_scaled, y_train)
classifier_predictions = classifier.predict(x_test_scaled)

# **Model Evaluation**
Evaluate our Random Forest classifier.

In [None]:
# Rank the features of our model by their importance, highest first.
importance_scores = pd.Series(classifier.feature_importances_, index = x_train.columns)
feature_importance = importance_scores.sort_values(ascending = False)

feature_importance

In [None]:
# Plot the importance of the features in our model.
# No legend is drawn: the bar plot has no labelled artists, so plt.legend()
# would only emit a warning and add an empty legend box.
sns.barplot(x = feature_importance, y = feature_importance.index)
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title('Visualizing Important Features')
plt.show()

In [None]:
# Display a confusion matrix for our model.
# One explicit, sorted label list is used for both the matrix and the tick
# labels, so they always have the same length and order. classifier.classes_
# only holds the classes seen during training, which can differ from the
# classes present in y_test and the predictions.
matrix_labels = sorted(set(y_test) | set(classifier_predictions))

# A larger figure and vertical x tick labels keep the class names legible.
figure, axes = plt.subplots(figsize = (14, 14))
ConfusionMatrixDisplay(
    confusion_matrix = confusion_matrix(y_test, classifier_predictions, labels = matrix_labels),
    display_labels = matrix_labels).plot(ax = axes, xticks_rotation = 'vertical')

plt.show()

In [None]:
# Evaluate the accuracy, precision, and recall of our model.
# Recall and precision are macro-averaged over the classes. zero_division = 0
# makes explicit that a class with no true samples (recall) or no predicted
# samples (precision) contributes 0 to the average, instead of relying on the
# default, which yields the same value but raises a warning.
print("Accuracy: ", accuracy_score(y_test, classifier_predictions))
print("Recall", recall_score(y_test, classifier_predictions, average = 'macro', zero_division = 0))
print("Precision", precision_score(y_test, classifier_predictions, average = 'macro', zero_division = 0))