In [106]:
## Import Dependencies ##
import json
import pandas as pd
import numpy as np
import os
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

In [107]:
## Read The CSV File ###
file = os.path.join('Resources','Crimes_-_2001_to_present.csv')
crime_df = pd.read_csv(file)

  interactivity=interactivity, compiler=compiler, result=result)


# Filter out the columns that are required alone

In [109]:
required_columns = ['IUCR', 'Primary Type', 'Arrest', 'Beat',
       'District', 'Ward', 'Year', 'Police Districts', 'Police Beats','Latitude', 'Longitude']
## Using Loc to filter the required columns 
chicago_selected_df = crime_df.loc[:, required_columns]

# Removing all the null values and changing the data types

In [111]:
## Drop Null Values ##
chicago_selected_df = chicago_selected_df.dropna(axis= 0, how = 'any')
## Reset Index ##
chicago_selected_df.reset_index(drop = True, inplace = True)
## Change Datatypes ##
chicago_selected_df[['District','Ward','Police Districts','Police Beats']] = chicago_selected_df[['District','Ward','Police Districts','Police Beats']].applymap(np.int64)
chicago_selected_df["Arrest"] = chicago_selected_df["Arrest"].astype(int)

Remove Outliers by grouping the Data based on Different Features

In [112]:
# Remove Outliers Based on Primary Type, as in if there are less number of crimes with that crime type remove it
chicago_selected_df = chicago_selected_df.groupby("Primary Type").filter(lambda x : len(x)>9500)
# Remove Outliers Based on Police Districts, as in if there are less number of crimes within that police district remove it
chicago_selected_df = chicago_selected_df.groupby("Police Districts").filter(lambda x : len(x)>1500)

In [114]:
chicago_selected_df.groupby("Primary Type").count()

Unnamed: 0_level_0,IUCR,Arrest,Beat,District,Ward,Year,Police Districts,Police Beats,Latitude,Longitude
Primary Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ARSON,10110,10110,10110,10110,10110,10110,10110,10110,10110,10110
ASSAULT,395707,395707,395707,395707,395707,395707,395707,395707,395707,395707
BATTERY,1154000,1154000,1154000,1154000,1154000,1154000,1154000,1154000,1154000,1154000
BURGLARY,362701,362701,362701,362701,362701,362701,362701,362701,362701,362701
CRIM SEXUAL ASSAULT,25161,25161,25161,25161,25161,25161,25161,25161,25161,25161
CRIMINAL DAMAGE,720391,720391,720391,720391,720391,720391,720391,720391,720391,720391
CRIMINAL TRESPASS,181177,181177,181177,181177,181177,181177,181177,181177,181177,181177
DECEPTIVE PRACTICE,250972,250972,250972,250972,250972,250972,250972,250972,250972,250972
GAMBLING,13335,13335,13335,13335,13335,13335,13335,13335,13335,13335
HOMICIDE,9554,9554,9554,9554,9554,9554,9554,9554,9554,9554


# Create three subsets - 2018 data for latest info, 2010 to 2018 for training and 2019 for validation

In [115]:
######   2010 to 2018 data ######
chicago_2010to2018_df = chicago_selected_df.loc[(chicago_selected_df['Year'] >= 2010) & (chicago_selected_df['Year'] < 2019), :]
chicago_2010to2018_df = chicago_2010to2018_df.sort_values("Year")
chicago_2010to2018_df.reset_index(drop = True, inplace = True)

In [117]:
###### 2018 data ######
chicago_2018_df = chicago_selected_df.loc[chicago_selected_df['Year'] == 2018, :]
chicago_2018_df.reset_index(drop = True, inplace = True)

In [118]:
###### 2019 ######
chicago_2019_df = chicago_selected_df.loc[chicago_selected_df['Year'] == 2019, :]
chicago_2019_df.reset_index(drop = True, inplace = True)

# Enter Machine Learning. Fit, Train, Test, validate and all that Jazz

In [119]:
#split dataset in features and target variable
feature_cols = ['Primary Type', 'Police Districts', 'Police Beats', 'Ward','Beat']
X = chicago_2018_df[feature_cols] # Features
y = chicago_2018_df.Arrest # Target variable
X = pd.get_dummies(X,columns=["Primary Type"], drop_first=True)
X.head()

Unnamed: 0,Police Districts,Police Beats,Ward,Beat,Primary Type_ASSAULT,Primary Type_BATTERY,Primary Type_BURGLARY,Primary Type_CRIM SEXUAL ASSAULT,Primary Type_CRIMINAL DAMAGE,Primary Type_CRIMINAL TRESPASS,...,Primary Type_MOTOR VEHICLE THEFT,Primary Type_NARCOTICS,Primary Type_OFFENSE INVOLVING CHILDREN,Primary Type_OTHER OFFENSE,Primary Type_PROSTITUTION,Primary Type_PUBLIC PEACE VIOLATION,Primary Type_ROBBERY,Primary Type_SEX OFFENSE,Primary Type_THEFT,Primary Type_WEAPONS VIOLATION
0,22,94,3,131,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,19,255,7,434,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,15,136,27,1231,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,22,144,4,123,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9,252,34,2213,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [124]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) # 70% training and 30% test

In [125]:
# Decision Tree Algorithm on the required features
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()
dtree.fit(X_train,y_train)
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=10,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=10,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [126]:
# Predict Using the test Data and get the confusion matrix 
predictions = dtree.predict(X_test)
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_test,predictions))
print(confusion_matrix(y_test,predictions))

              precision    recall  f1-score   support

           0       0.87      0.98      0.92     62917
           1       0.83      0.44      0.57     15626

   micro avg       0.87      0.87      0.87     78543
   macro avg       0.85      0.71      0.75     78543
weighted avg       0.87      0.87      0.85     78543

[[61496  1421]
 [ 8811  6815]]


In [127]:
# Get the accuracy of the decision tree model
from sklearn import metrics
# training metrics
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, predictions)))


Accuracy: 0.8697


In [128]:
# Fit the model Using the training and test Data and get the Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=600)
rfc.fit(X_train,y_train)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=600, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=600, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [129]:
# Get the cofusion metrics and classification report for this model
predictions = rfc.predict(X_test)
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_test,predictions))
print(confusion_matrix(y_test,predictions))

              precision    recall  f1-score   support

           0       0.88      0.98      0.92     62917
           1       0.82      0.44      0.58     15626

   micro avg       0.87      0.87      0.87     78543
   macro avg       0.85      0.71      0.75     78543
weighted avg       0.87      0.87      0.85     78543

[[61436  1481]
 [ 8710  6916]]


In [130]:
# Get the accuracy of the Random Forest model
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, predictions)))

Accuracy: 0.8702


In [131]:
#Import Gaussian Naive Bayes model to fit the training and testing data
from sklearn.naive_bayes import GaussianNB

#Create a Gaussian Classifier
model = GaussianNB()

# Train the model using the training sets
model.fit(X_train,y_train)

#Predict Output
y_pred= model.predict(X_test) # 0:Overcast, 2:Mild


In [132]:
# Get the model accuracy for this model
from sklearn import metrics

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8694982366347096


In [133]:
## Ready the 2019 data for prediction based on the feature columns of the training and testing data
predict_df = chicago_2019_df[:10000]
feature_cols = ['Primary Type', 'Police Districts', 'Police Beats', 'Ward','Beat']
X_predict = predict_df[feature_cols] # Features
X_predict = pd.get_dummies(X_predict,columns=["Primary Type"], drop_first=True)
X_predict.head()

Unnamed: 0,Police Districts,Police Beats,Ward,Beat,Primary Type_ASSAULT,Primary Type_BATTERY,Primary Type_BURGLARY,Primary Type_CRIM SEXUAL ASSAULT,Primary Type_CRIMINAL DAMAGE,Primary Type_CRIMINAL TRESPASS,...,Primary Type_MOTOR VEHICLE THEFT,Primary Type_NARCOTICS,Primary Type_OFFENSE INVOLVING CHILDREN,Primary Type_OTHER OFFENSE,Primary Type_PROSTITUTION,Primary Type_PUBLIC PEACE VIOLATION,Primary Type_ROBBERY,Primary Type_SEX OFFENSE,Primary Type_THEFT,Primary Type_WEAPONS VIOLATION
0,12,40,41,1654,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,22,79,42,122,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,24,101,4,212,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,11,32,49,2422,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,24,101,4,212,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [134]:
### Use the validation data across the three models to get the predictions
dtreepredictions = dtree.predict(X_predict)
randomforestpredictions = rfc.predict(X_predict)
naivepredictions = model.predict(X_predict)
predict_df["DtreePredict"] = dtreepredictions
predict_df["Randomforestpredictions"] = randomforestpredictions
predict_df["NaivePredict"] = naivepredictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [139]:
predict_df['resultmatch?'] = np.where(predict_df.DtreePredict == predict_df.Arrest, 'True', 'False')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [140]:
predict_df['resultmatch?'].value_counts()

True     8433
False    1567
Name: resultmatch?, dtype: int64

In [141]:
print("{0:0.2f}% Accuracy achieved by the model".format(8433/10000 * 100))

84.33% Accuracy achieved by the model


In [86]:
import folium
import matplotlib.pyplot as plt
import seaborn as sns

In [87]:
chicago_map = folium.Map(location=[41.864073,-87.706819],
                        zoom_start=11,
                        tiles="CartoDB dark_matter")

In [144]:
Primary_Type = 'THEFT'

In [164]:
filtered_ptype_data = chicago_2018_df.loc[chicago_2018_df["Primary Type"]== Primary_Type, ["Primary Type", "Arrest", 'Police Beats', "Ward", "Police Districts", "Latitude", "Longitude"]]

# groups = filtered_ptype_data.groupby(["Police Districts","Arrest"]).size()
# pivot_table = pd.pivot_table(groups, index =["Police Districts"], columns = ['Arrests'] , aggfunc = np.sum)
# pivot_table

filtered_ptype_data.groupby(["Police Districts"]).Arrest

<pandas.core.groupby.generic.SeriesGroupBy object at 0x00000285086A6898>