# **Initial Load**

Authenticate with Google Drive and read in our dataset.

In [None]:
# Install any required packages.
# PyDrive supplies the `pydrive` modules imported in the next cell; `-U` upgrades an existing install and `-q` keeps pip's output quiet.
!pip install -U -q PyDrive

In [None]:
# Import any required libraries.
from google.colab import auth
from oauth2client.client import GoogleCredentials
from patsy import dmatrices
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from statsmodels.api import add_constant
import statsmodels.discrete.discrete_model as sml
from statsmodels.stats.outliers_influence import variance_inflation_factor
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm

In [None]:
# Authenticate with Google Drive.
# The statements are order-dependent: the Colab user is authenticated first, then the resulting
# application-default credentials are attached to the PyDrive auth object.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client handle that the download cell uses to fetch the dataset.
drive = GoogleDrive(gauth)

In [None]:
# Fetch our dataset from Google Drive into the working directory, then load it into a DataFrame.
dataset_file_id = '14RMV7CRXwwCt_9iLHenyQrB9GC5gYwul'
dataset_filename = 'ChicagoCrimeRecords.csv'

downloaded_file = drive.CreateFile({ 'id': dataset_file_id })
downloaded_file.GetContentFile(dataset_filename)
chicago_crime_records = pd.read_csv(dataset_filename)

# **Data Analysis & Preparation**
Analyse and prepare our data before we attempt to train a predictive model using logistic regression.

In [None]:
# Discard every row that has a missing value in any column before we begin.
# NOTE(review): this runs before the unused columns are removed, so gaps in those columns also cost
# us rows - confirm that this is intended.
chicago_crime_records.dropna(axis = 'index', how = 'any', inplace = True)

In [None]:
# Display the names of all columns in our dataset, sorted alphabetically.
column_names = chicago_crime_records.columns
column_names.sort_values()

In [None]:
# Columns that we are confident will not be of any use to our model.
unused_columns = [
    'Block',
    'Case Number',
    'Date',
    'Description',
    'FBI Code',
    'ID',
    'IUCR',
    'Latitude',
    'Location',
    'Longitude',
    'Updated On',
    'X Coordinate',
    'Y Coordinate',
    'Year',
]

# The `columns` keyword already selects the column axis. Labels that are no longer present are
# skipped (errors = 'ignore'), so this cell can be re-run without raising.
chicago_crime_records.drop(columns = unused_columns, inplace = True, errors = 'ignore')

In [None]:
# Encode the 'Arrest', 'Domestic' and 'Location Description' columns, and derive a binary 'Murder'
# target from 'Primary Type'.

# Flag columns become 0/1 integers.
for flag_column in ('Arrest', 'Domestic'):
    chicago_crime_records[flag_column] = chicago_crime_records[flag_column].astype(int)

# Each distinct location description is replaced by its integer category code.
chicago_crime_records['Location Description'] = (
    chicago_crime_records['Location Description'].astype('category').cat.codes)

# 'Murder' is 1 for records whose primary type is 'HOMICIDE' and 0 for everything else.
chicago_crime_records['Murder'] = (
    chicago_crime_records['Primary Type'] == 'HOMICIDE').astype('int64')

In [None]:
# Compute pairwise correlations for a selection of variables in our dataset and render them as a
# colour-graded table.
correlation_columns = ['Arrest', 'Beat', 'Community Area', 'District', 'Domestic', 'Ward']
correlation_matrix = chicago_crime_records[correlation_columns].corr()

correlation_matrix.style.background_gradient(cmap = 'coolwarm')

In [None]:
# Calculate the variance inflation factor (V.I.F.) for a collection of variables in our dataset.
features = chicago_crime_records[['Arrest', 'Beat', 'Community Area', 'District', 'Domestic', 'Ward']]

# variance_inflation_factor regresses one column on all of the others exactly as they are given and
# does not add an intercept itself. Add a constant column for the calculation so the auxiliary
# regressions are centred; without it the reported factors are inflated by the variables' means.
vif_design = add_constant(features)
vif_matrix = vif_design.values.astype(float)

# Look each feature up by name so the constant column is never reported as a feature, wherever
# add_constant places it.
vif_data = pd.DataFrame()
vif_data['Feature'] = features.columns
vif_data['VIF'] = [
    variance_inflation_factor(vif_matrix, vif_design.columns.get_loc(feature_name))
    for feature_name in features.columns]
print(vif_data)

In [None]:
# The V.I.F. results above show both 'Beat' and 'District' contributing to multicollinearity.
# Consider removing one or both of them to reduce inflation; here we remove the pair.
collinear_columns = ['Beat', 'District']

# Labels that are no longer present are skipped, so this cell can be re-run without raising.
chicago_crime_records.drop(columns = collinear_columns, inplace = True, errors = 'ignore')

# **Model Construction/Training**
Train a collection of logistic regression models with varying sets of features.

In [None]:
# Build the design matrix from every remaining column except the target ('Murder') and the text
# column it was derived from ('Primary Type').
# statsmodels' Logit does not add an intercept on its own, so add an explicit constant column
# (add_constant leaves the data untouched if one is already present). Without it the model is
# forced through the origin.
design_matrix = add_constant(chicago_crime_records.drop(columns = ['Murder', 'Primary Type']))
X = np.asarray(design_matrix)
y = np.asarray(chicago_crime_records['Murder'])

# Split up our training and testing sets. The constant is added before the split so that x_test
# has the same columns the model was trained on.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 78)

# Train a logistic regression model.
logit = sml.Logit(y_train, x_train).fit()

# The model was fitted on plain arrays, so pass the column names to label the coefficients.
print(logit.summary(xname = list(design_matrix.columns)))

In [None]:
# Generate a confusion matrix for our model.
# Predicted probabilities are rounded to the nearest class label (0 or 1). np.round is used
# because the np.round_ alias was removed in NumPy 2.0.
predicted_labels = np.round(logit.predict(x_test))
confusion_matrix = pd.crosstab(y_test, predicted_labels, rownames = ['Actual'], colnames = ['Predicted'])

# fmt = 'd' prints the cell counts as whole numbers rather than in seaborn's default compact format.
sns.heatmap(confusion_matrix, annot = True, fmt = 'd')

In [None]:
# Generate a classification report for our model.
# Predicted probabilities are rounded to the nearest class label (0 or 1). np.round is used
# because the np.round_ alias was removed in NumPy 2.0.
predicted_labels = np.round(logit.predict(x_test))
print(classification_report(y_test, predicted_labels))