In [1]:
# Import Dependencies
import numpy as np
import pandas as pd
import os
import joblib

## Import and Check Chicago Crime Datasets

In [2]:
# Import the 2016 - 2018 crime data; these three years are stacked into the
# training frame (the 2019 file is loaded separately in a later cell).
crime_2016 = os.path.join("..", "Resources", "crime_clean_2016.csv")
crime_2017 = os.path.join("..", "Resources", "crime_clean_2017.csv")
crime_2018 = os.path.join("..", "Resources", "crime_clean_2018.csv")

crime_2016_df_final = pd.read_csv(crime_2016)
crime_2017_df_final = pd.read_csv(crime_2017)
crime_2018_df_final = pd.read_csv(crime_2018)

# DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat
# stacks all yearly frames in a single pass. Row labels are kept as read from
# each file (no ignore_index), which matches what the chained append produced.
training_df = pd.concat(
    [crime_2016_df_final, crime_2017_df_final, crime_2018_df_final]
)
training_df.head()

Unnamed: 0,id,date,day,month,year,time,hour,month_day,day_of_week,district,...,location_description,x_coordinate,y_coordinate,iucr,fbi_code,primary_type,domestic,latitude,longitude,arrest
0,10819224,2016-12-31T23:59:00.000,31,12,2016,23:59:00.000,23,1231,5,16,...,AIRPORT BUILDING NON-TERMINAL - SECURE AREA,1100658,1934241,810,06,THEFT,False,41.97629,-87.905227,False
1,10801137,2016-12-31T23:58:00.000,31,12,2016,23:58:00.000,23,1231,5,5,...,RESIDENCE,1178014,1829709,430,04B,BATTERY,False,41.688033,-87.623931,False
2,10801110,2016-12-31T23:55:00.000,31,12,2016,23:55:00.000,23,1231,5,19,...,RESIDENCE,1166154,1920300,2250,22,LIQUOR LAW VIOLATION,False,41.936885,-87.66477,True
3,10802006,2016-12-31T23:55:00.000,31,12,2016,23:55:00.000,23,1231,5,1,...,HOTEL/MOTEL,1176964,1902140,486,08B,BATTERY,True,41.886815,-87.625593,False
4,10801865,2016-12-31T23:54:00.000,31,12,2016,23:54:00.000,23,1231,5,6,...,RESIDENCE,1178949,1853139,1310,14,CRIMINAL DAMAGE,True,41.752307,-87.619798,False


In [3]:
# Class balance of the `arrest` target in the combined training frame.
arrest_counts = training_df['arrest'].value_counts()
arrest_counts

False    631208
True     156693
Name: arrest, dtype: int64

In [4]:
# Load the 2019 crime data, which is set aside for testing.
crime_2019 = os.path.join("..", "Resources", "crime_clean_2019.csv")
crime_2019_final_df = pd.read_csv(crime_2019)

In [5]:
# Keep only the candidate feature columns plus the `arrest` target.
selected_columns = [
    'month', 'hour', 'day_of_week', 'district', 'block', 'ward', 'beat',
    'community_area', 'location_description', 'fbi_code', 'primary_type',
    'domestic', 'arrest',
]

# .copy() makes the result an independent frame, so the column assignments
# performed in later cells do not raise pandas' SettingWithCopyWarning.
training_df = training_df[selected_columns].copy()
training_df.head()

Unnamed: 0,month,hour,day_of_week,district,block,ward,beat,community_area,location_description,fbi_code,primary_type,domestic,arrest
0,12,23,5,16,100XX W OHARE ST,41,1651,76,AIRPORT BUILDING NON-TERMINAL - SECURE AREA,06,THEFT,False,False
1,12,23,5,5,0000X W 113TH PL,34,522,49,RESIDENCE,04B,BATTERY,False,False
2,12,23,5,19,030XX N LINCOLN AVE,32,1932,6,RESIDENCE,22,LIQUOR LAW VIOLATION,False,True
3,12,23,5,1,0000X E WACKER PL,42,111,32,HOTEL/MOTEL,08B,BATTERY,True,False
4,12,23,5,6,078XX S INDIANA AVE,6,623,69,RESIDENCE,14,CRIMINAL DAMAGE,True,False


In [6]:
# scikit-learn utilities used by the modelling cells below:
# train_test_split for the hold-out split, tree for the decision-tree
# classifier, and preprocessing for label encoding.
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn import preprocessing
# LabelEncoder instance for converting categorical values into integer codes.
le = preprocessing.LabelEncoder()

In [7]:
# Columns whose values are replaced by integer codes (same order as before).
categorical_columns = [
    'domestic', 'block', 'location_description',
    'primary_type', 'fbi_code', 'arrest',
]

# Fit a dedicated encoder per column and keep every fitted encoder.
# Re-fitting a single shared encoder would discard each column's
# value -> code mapping as soon as the next column is encoded, leaving no way
# to encode new data consistently or to map codes back to labels.
label_encoders = {}
for column in categorical_columns:
    encoder = preprocessing.LabelEncoder()
    training_df[column] = encoder.fit_transform(training_df[column])
    label_encoders[column] = encoder

# Preserve the previous end state of `le`: an encoder fitted on `arrest`.
le = label_encoders['arrest']

In [8]:
# Separate the target column (y) from the remaining columns (X).
y = training_df["arrest"]
X = training_df.drop(columns=["arrest"])

In [9]:
# Restrict the feature matrix to the five columns the models are trained on.
feature_columns = [
    'month',
    'hour',
    'day_of_week',
    'location_description',
    'primary_type',
]
X = X[feature_columns]

In [10]:
# Preview the first five rows of the frame after label encoding.
training_df.head(n=5)

Unnamed: 0,month,hour,day_of_week,district,block,ward,beat,community_area,location_description,fbi_code,primary_type,domestic,arrest
0,12,23,5,16,30127,41,1651,76,3,7,32,0,0
1,12,23,5,5,408,34,522,49,124,5,2,0,0
2,12,23,5,19,13708,32,1932,6,124,23,16,0,1
3,12,23,5,1,187,42,111,32,93,10,2,1,0
4,12,23,5,6,27629,6,623,69,124,16,6,1,0


In [11]:
# Class balance of the `arrest` target after label encoding.
arrest_counts = training_df['arrest'].value_counts()
arrest_counts

0    631208
1    156693
Name: arrest, dtype: int64

In [12]:
# Show the dimensions of the feature matrix and the target vector.
for label, data in (('X Shape:', X), ('y Shape:', y)):
    print(label, data.shape)

X Shape: (787901, 5)
y Shape: (787901,)


In [13]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [14]:
# Report the dimensions of each partition produced by the split.
for label, part in (
    ('Training X Shape:', X_train),
    ('Training y Shape:', y_train),
    ('Testing X Shape:', X_test),
    ('Testing y Shape:', y_test),
):
    print(label, part.shape)


Training X Shape: (551530, 5)
Training y Shape: (551530,)
Testing X Shape: (236371, 5)
Testing y Shape: (236371,)


In [15]:
# Baseline model: a single decision tree with default hyper-parameters.
# random_state is fixed (same seed as the split and the random forest)
# because the tree permutes candidate features while choosing splits, so an
# unseeded run may report a different score each time the cell is executed.
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_train, y_train)
# Mean accuracy on the held-out split.
clf.score(X_test, y_test)

0.8513692458042653

In [16]:
from sklearn.ensemble import RandomForestClassifier

# Seeded random forest of 100 trees; fit() returns the estimator itself,
# so construction and training can be chained.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Mean accuracy on the held-out split.
rf.score(X_test, y_test)

0.8625677430818501

In [17]:
# Pair each importance score with its feature name (the column labels of X),
# then rank the pairs from most to least important.
importance_pairs = list(zip(rf.feature_importances_, X.columns))
feature_importance = sorted(importance_pairs, reverse=True)
feature_importance

[(0.48138099603987833, 'primary_type'),
 (0.18442074458271965, 'location_description'),
 (0.12881545221373775, 'hour'),
 (0.10955151292960993, 'month'),
 (0.09583129423405445, 'day_of_week')]

In [18]:
# Persist the fitted random forest to disk with joblib (compression level 3).
filename = 'RF_model.sav'
joblib.dump(value=rf, filename=filename, compress=3)

['RF_model.sav']

In [19]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 of the random forest on the held-out split.
predictions = rf.predict(X_test)
report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.88      0.95      0.92    189448
           1       0.72      0.50      0.59     46923

    accuracy                           0.86    236371
   macro avg       0.80      0.73      0.75    236371
weighted avg       0.85      0.86      0.85    236371

