In [None]:
## Import Dependencies ##
import json
import pandas as pd
import numpy as np
import os
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

In [None]:
## Load the raw crime records ##
# The CSV is expected in the "Resources" folder relative to the working directory.
file = os.path.join("Resources", "Crimes_-_2001_to_present.csv")
crime_df = pd.read_csv(filepath_or_buffer=file)

# Keep only the columns required for the analysis

In [None]:
# Columns carried forward into cleaning and modelling
required_columns = [
    'IUCR',
    'Primary Type',
    'Arrest',
    'Beat',
    'District',
    'Ward',
    'Year',
    'Police Districts',
    'Police Beats',
    'Latitude',
    'Longitude',
]
# Label-based selection: every row, only the columns listed above
chicago_selected_df = crime_df.loc[:, required_columns]

# Removing all the null values and changing the data types

In [None]:
## Drop rows with a null in any selected column, then renumber the index from 0 ##
chicago_selected_df = chicago_selected_df.dropna(axis=0, how='any').reset_index(drop=True)
## Change Datatypes ##
# astype converts whole columns at once; it replaces the element-wise
# applymap(np.int64) call, which is deprecated since pandas 2.1.
chicago_selected_df = chicago_selected_df.astype({
    'District': np.int64,
    'Ward': np.int64,
    'Police Districts': np.int64,
    'Police Beats': np.int64,
    'Arrest': int,  # arrest flag becomes 0/1, used as the model target later
})

# Remove outliers by grouping the data on different features

In [None]:
# Crime types: attach to each row the number of records sharing its "Primary Type",
# then keep only rows whose crime type occurs more than 9500 times
type_counts = chicago_selected_df.groupby("Primary Type")["Primary Type"].transform("size")
chicago_selected_df = chicago_selected_df[type_counts > 9500]
# Police districts: same idea on the remaining rows, keeping districts with more than 1500 records
district_counts = chicago_selected_df.groupby("Police Districts")["Police Districts"].transform("size")
chicago_selected_df = chicago_selected_df[district_counts > 1500]

In [None]:
# Record counts per crime type after the filtering above, most frequent first.
# Uses chicago_selected_df: chicago_2010to2018_df is only created in a later cell,
# so referencing it here raises NameError on a fresh kernel.
chicago_selected_df.groupby("Primary Type").count().sort_values(by="IUCR", ascending = False)

# Create three subsets - 2018 data for latest info, 2010 to 2018 for training and 2019 for validation

In [None]:
######   2010 to 2018 data ######
# Rows with 2010 <= Year < 2019, ordered by year and re-indexed from 0
in_2010_to_2018 = chicago_selected_df['Year'].ge(2010) & chicago_selected_df['Year'].lt(2019)
chicago_2010to2018_df = (
    chicago_selected_df.loc[in_2010_to_2018, :]
    .sort_values("Year")
    .reset_index(drop=True)
)

In [None]:
###### 2018 data ######
# Rows from 2018 only, re-indexed from 0
is_2018 = chicago_selected_df['Year'].eq(2018)
chicago_2018_df = chicago_selected_df.loc[is_2018, :].reset_index(drop=True)
# Write the 2018 subset to CSV in the working directory
chicago_2018_df.to_csv("Chicago2018dataforplots.csv")

In [None]:
###### 2019 ######
# Rows from 2019 only, re-indexed from 0
is_2019 = chicago_selected_df['Year'].eq(2019)
chicago_2019_df = chicago_selected_df.loc[is_2019, :].reset_index(drop=True)

# Enter Machine Learning. Fit, Train, Test, validate and all that Jazz

In [None]:
# Build the model inputs: feature matrix X and arrest target y
feature_cols = ['Primary Type', 'Police Districts', 'Police Beats', 'Ward', 'Beat']
# NOTE(review): only the 2018 subset is used here, although the section heading
# above mentions 2010 to 2018 for training. Confirm which was intended.
y = chicago_2018_df["Arrest"]  # Target variable
# One-hot encode the crime type; drop_first omits the first category's indicator column
X = pd.get_dummies(chicago_2018_df[feature_cols], columns=["Primary Type"], drop_first=True)
X.head()

In [None]:
# Hold out 30% of the rows for testing (70% train); the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Decision Tree for Police Beats and districts

In [None]:
# Decision Tree Algorithm on the required features
# DecisionTreeClassifier is already imported in the first cell.
# random_state is fixed so that Restart & Run All reproduces the same tree.
# NOTE(review): a pasted estimator repr in an earlier version of this cell listed
# criterion='entropy', max_depth=10, but the model fitted here has always used
# the scikit-learn defaults. Confirm which configuration was intended.
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train, y_train)

In [None]:
# Score the decision tree on the held-out test split, then print the
# classification report followed by the confusion matrix
from sklearn.metrics import classification_report, confusion_matrix
predictions = dtree.predict(X_test)
for summarize in (classification_report, confusion_matrix):
    print(summarize(y_test, predictions))

In [None]:
# Accuracy of the decision tree on the test split
# (`metrics` is imported in the first cell; `predictions` holds the tree's test-set output)
dtree_accuracy = metrics.accuracy_score(y_test, predictions)
print("Accuracy: {0:.4f}".format(dtree_accuracy))


# Random Forests module

In [None]:
# Fit a Random Forest classifier (600 trees) on the training split
from sklearn.ensemble import RandomForestClassifier
# random_state is fixed so the bootstrap samples, and therefore the scores below,
# are repeatable across kernel restarts.
rfc = RandomForestClassifier(n_estimators=600, random_state=42)
rfc.fit(X_train, y_train)

In [None]:
# Score the random forest on the test split, then print the classification
# report followed by the confusion matrix
from sklearn.metrics import classification_report, confusion_matrix
predictions = rfc.predict(X_test)  # replaces the decision-tree predictions held in this name
for summarize in (classification_report, confusion_matrix):
    print(summarize(y_test, predictions))

In [None]:
# Accuracy of the Random Forest model on the test split
rfc_accuracy = metrics.accuracy_score(y_test, predictions)
print("Accuracy: {0:.4f}".format(rfc_accuracy))

# Naive Bayes Module

In [None]:
# Gaussian Naive Bayes classifier
from sklearn.naive_bayes import GaussianNB

# Create the classifier and train it on the training split
# (fit returns the estimator itself, so construction and training can be chained)
model = GaussianNB().fit(X_train, y_train)

# Predicted Arrest labels for the test split
y_pred = model.predict(X_test)


In [None]:
# Accuracy of the Naive Bayes model on the test split: the fraction of
# test rows whose predicted label equals the true label
# (`metrics` is imported in the first cell)
nb_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", nb_accuracy)

In [None]:
## Ready the 2019 data for prediction based on the feature columns of the training and testing data
# First 10000 rows of 2019; .copy() makes predict_df an independent frame so that
# columns added to it later are not written into a slice of chicago_2019_df.
predict_df = chicago_2019_df[:10000].copy()
feature_cols = ['Primary Type', 'Police Districts', 'Police Beats', 'Ward','Beat']
X_predict = predict_df[feature_cols] # Features
# Encode every crime type present in this sample (no drop_first), then align to the
# exact columns, in the exact order, that the models were fitted on (X.columns).
# The training baseline category is dropped by the reindex, indicator columns for
# types absent from this sample are added as all-zero, and types never seen in
# training are discarded. Without this alignment the column sets can differ and
# predict() either fails or reads the wrong indicators.
X_predict = pd.get_dummies(X_predict, columns=["Primary Type"])
X_predict = X_predict.reindex(columns=X.columns, fill_value=0)
X_predict.head()

In [None]:
### Use the validation data across the three models to get the predictions
dtreepredictions = dtree.predict(X_predict)
randomforestpredictions = rfc.predict(X_predict)
naivepredictions = model.predict(X_predict)
# assign() returns a new frame with the three prediction columns appended, so nothing
# is written into a slice of chicago_2019_df (avoids SettingWithCopyWarning) and
# re-running this cell gives the same result.
predict_df = predict_df.assign(
    DtreePredict=dtreepredictions,
    Randomforestpredictions=randomforestpredictions,
    NaivePredict=naivepredictions,
)

In [None]:
# Flag each validation row with the string 'True' when the decision tree prediction
# equals the recorded Arrest value, otherwise 'False'
dtree_matches_arrest = predict_df["DtreePredict"] == predict_df["Arrest"]
predict_df['resultmatch?'] = np.where(dtree_matches_arrest, 'True', 'False')

In [None]:
# Number of validation rows where the decision tree matched / missed the recorded arrest outcome
match_flags = predict_df["resultmatch?"]
match_flags.value_counts()

In [None]:
# Share of validation rows where the decision tree prediction equals the recorded
# arrest outcome. Computed from predict_df rather than a hard-coded count, so the
# figure stays correct when the data, the model or the sample size changes.
dtree_match_rate = (predict_df["DtreePredict"] == predict_df["Arrest"]).mean()
print("{0:0.2f}% Accuracy achieved by the model".format(dtree_match_rate * 100))