# **Initial Load**

Authenticate with Google Drive and read in our dataset.

In [22]:
# Install any required packages.
#!pip install -U -q PyDrive

In [23]:
# Import any required libraries.
#from google.colab import auth
#from patsy import dmatrices
#from pydrive.auth import GoogleAuth
#from pydrive.drive import GoogleDrive

In [24]:
# Authenticate with Google Drive.
#auth.authenticate_user()
#gauth = GoogleAuth()
#gauth.credentials = GoogleCredentials.get_application_default()
#drive = GoogleDrive(gauth)

In [25]:
import pandas as pd
import numpy as np
import seaborn as sb
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [11]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets
from sklearn.inspection import DecisionBoundaryDisplay

ImportError: cannot import name 'DecisionBoundaryDisplay' from 'sklearn.inspection' (/Users/deborahdjon/opt/anaconda3/lib/python3.9/site-packages/sklearn/inspection/__init__.py)

In [26]:
# Download our dataset from Google Drive.
#downloaded_file = drive.CreateFile({ 'id': '14RMV7CRXwwCt_9iLHenyQrB9GC5gYwul' })
#downloaded_file.GetContentFile('ChicagoCrimeRecords.csv')
chicago_crime_records = pd.read_csv('../Dataset/ChicagoCrimeRecords.csv')

## Data Preprosessng

In [27]:
# Sample to work with a smaller dataset
chicago_crime_records2 = chicago_crime_records.sample(10000)

In [28]:
# Remove any columns that we are confident will not be of any use to our model.
chicago_crime_records2.drop(
    columns = ['Block', 'Case Number', 'Date', 'Description', 'FBI Code', 'ID', 'IUCR', 'Latitude', 'Location', 'Longitude', 'Updated On', 'X Coordinate', 'Y Coordinate', 'Year'],
    axis = 1,
    inplace = True,
    errors = 'ignore')

In [29]:
# Drop any NANs before we begin.
chicago_crime_records2.dropna(inplace = True)

In [30]:
# Take a look at all of the columns in our dataset.
chicago_crime_records.columns.sort_values()

Index(['Arrest', 'Beat', 'Block', 'Case Number', 'Community Area', 'Date',
       'Description', 'District', 'Domestic', 'FBI Code', 'ID', 'IUCR',
       'Latitude', 'Location', 'Location Description', 'Longitude',
       'Primary Type', 'Updated On', 'Ward', 'X Coordinate', 'Y Coordinate',
       'Year'],
      dtype='object')

In [31]:
# Encode the 'Arrest', 'Domestic', 'Location Description' and 'Primary Type' columns.
aa['Arrest'] = chicago_crime_records2['Arrest'].astype(int)

chicago_crime_records2['Domestic'] = chicago_crime_records2['Domestic'].astype(int)

chicago_crime_records2['Location Description'] = chicago_crime_records2['Location Description'].astype('category')
chicago_crime_records2['Location Description'] = chicago_crime_records2['Location Description'].cat.codes

chicago_crime_records2['Murder'] = 0
chicago_crime_records2.loc[chicago_crime_records2['Primary Type'] == 'HOMICIDE', 'Murder'] = 1

In [36]:
chicago_crime_records2 = chicago_crime_records2.sample(frac = 1)

In [47]:
chicago_crime_records2.head()

Unnamed: 0,Primary Type,Location Description,Arrest,Domestic,Ward,Community Area,Murder
3631779,THEFT,46,1,0,6.0,44.0,0
3424782,PROSTITUTION,100,1,0,42.0,8.0,0
3478151,THEFT,88,0,0,49.0,1.0,0
2300782,ROBBERY,100,0,0,24.0,26.0,0
7592260,BATTERY,100,1,0,21.0,73.0,0


In [37]:
# Generate a correlation matrix, based on certain variables in our dataset.
correlation_matrix = chicago_crime_records2[[
    'Arrest',
    'Beat',
    'Community Area',
    'District',
    'Domestic',
    'Ward']].corr()

correlation_matrix.style.background_gradient(cmap = 'coolwarm')

Unnamed: 0,Arrest,Beat,Community Area,District,Domestic,Ward
Arrest,1.0,-0.027986,-0.00245,-0.031349,-0.066212,-0.024045
Beat,-0.027986,1.0,-0.496048,0.953291,-0.052846,0.643168
Community Area,-0.00245,-0.496048,1.0,-0.490745,0.084323,-0.52565
District,-0.031349,0.953291,-0.490745,1.0,-0.049374,0.684649
Domestic,-0.066212,-0.052846,0.084323,-0.049374,1.0,-0.065775
Ward,-0.024045,0.643168,-0.52565,0.684649,-0.065775,1.0


In [38]:
# Calculate the V.I.F. for a collection of other variables in our dataset.
features = chicago_crime_records2[['Arrest', 'Beat', 'Community Area', 'District', 'Domestic', 'Ward']]
vif_data = pd.DataFrame()
vif_data['Feature'] = features.columns
vif_data["VIF"] = [variance_inflation_factor(features.values, i) for i in range(len(features.columns))]
print(vif_data)

          Feature        VIF
0          Arrest   1.322345
1            Beat  40.314328
2  Community Area   1.861431
3        District  43.670020
4        Domestic   1.166055
5            Ward   6.168269


In [39]:
# Based on the previous results, both 'Beat' and 'District' are contributing to multicollinearity.
# Consider dropping one or both of these variables to reduce inflation.
chicago_crime_records2.drop(
    columns = ['Beat', 'District'],
    axis = 1,
    inplace = True,
    errors = 'ignore')

## Modelling

In [40]:
from sklearn.model_selection import train_test_split 

In [44]:
# Split up our training and testing sets.
X = np.asarray(chicago_crime_records2)
y = np.asarray(chicago_crime_records2['Primary Type'])
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 78)

In [45]:
n_neighbors = 15

In [46]:

for weights in ["uniform", "distance"]:
    # we create an instance of Neighbours Classifier and fit the data.
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
    clf.fit(X, y)

    _, ax = plt.subplots()
    DecisionBoundaryDisplay.from_estimator(
        clf,
        X,
        cmap=cmap_light,
        ax=ax,
        response_method="predict",
        plot_method="pcolormesh",
        xlabel=iris.feature_names[0],
        ylabel=iris.feature_names[1],
        shading="auto",
    )

    # Plot also the training points
    sns.scatterplot(
        x=X[:, 0],
        y=X[:, 1],
        hue=iris.target_names[y],
        palette=cmap_bold,
        alpha=1.0,
        edgecolor="black",
    )
    plt.title(
        "3-Class classification (k = %i, weights = '%s')" % (n_neighbors, weights)
    )

plt.show()

ValueError: could not convert string to float: 'THEFT'

In [None]:


n_neighbors = 15

# import some data to play with
iris = datasets.load_iris()

# we only take the first two features. We could avoid this ugly
# slicing by using a two-dim dataset
X = iris.data[:, :2]
y = iris.target

# Create color maps
cmap_light = ListedColormap(["orange", "cyan", "cornflowerblue"])
cmap_bold = ["darkorange", "c", "darkblue"]

for weights in ["uniform", "distance"]:
    # we create an instance of Neighbours Classifier and fit the data.
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
    clf.fit(X, y)

    _, ax = plt.subplots()
    DecisionBoundaryDisplay.from_estimator(
        clf,
        X,
        cmap=cmap_light,
        ax=ax,
        response_method="predict",
        plot_method="pcolormesh",
        xlabel=iris.feature_names[0],
        ylabel=iris.feature_names[1],
        shading="auto",
    )

    # Plot also the training points
    sns.scatterplot(
        x=X[:, 0],
        y=X[:, 1],
        hue=iris.target_names[y],
        palette=cmap_bold,
        alpha=1.0,
        edgecolor="black",
    )
    plt.title(
        "3-Class classification (k = %i, weights = '%s')" % (n_neighbors, weights)
    )

plt.show()