In [1]:
# Import Dependencies
import numpy as np
import pandas as pd
import os

## Import and Check Chicago Crime Datasets

In [2]:
# Import 2016 - 2019 crime data
crime_2016 = os.path.join("..","Resources", "crime_clean_2016.csv") 
crime_2017 = os.path.join("..","Resources", "crime_clean_2017.csv") 
crime_2018 = os.path.join("..","Resources", "crime_clean_2018.csv")

crime_2016_df_final = pd.read_csv(crime_2016)
crime_2017_df_final = pd.read_csv(crime_2017)
crime_2018_df_final = pd.read_csv(crime_2018)

join1 = crime_2016_df_final.append(crime_2017_df_final)
training_df = join1.append(crime_2018_df_final)
training_df.head()

Unnamed: 0,id,date,day,month,year,time,hour,month_day,day_of_week,district,...,location_description,x_coordinate,y_coordinate,iucr,fbi_code,primary_type,domestic,latitude,longitude,arrest
0,10819224,2016-12-31T23:59:00.000,31,12,2016,23:59:00.000,23,1231,5,16,...,AIRPORT BUILDING NON-TERMINAL - SECURE AREA,1100658,1934241,810,06,THEFT,False,41.97629,-87.905227,False
1,10801137,2016-12-31T23:58:00.000,31,12,2016,23:58:00.000,23,1231,5,5,...,RESIDENCE,1178014,1829709,430,04B,BATTERY,False,41.688033,-87.623931,False
2,10801110,2016-12-31T23:55:00.000,31,12,2016,23:55:00.000,23,1231,5,19,...,RESIDENCE,1166154,1920300,2250,22,LIQUOR LAW VIOLATION,False,41.936885,-87.66477,True
3,10802006,2016-12-31T23:55:00.000,31,12,2016,23:55:00.000,23,1231,5,1,...,HOTEL/MOTEL,1176964,1902140,486,08B,BATTERY,True,41.886815,-87.625593,False
4,10801865,2016-12-31T23:54:00.000,31,12,2016,23:54:00.000,23,1231,5,6,...,RESIDENCE,1178949,1853139,1310,14,CRIMINAL DAMAGE,True,41.752307,-87.619798,False


In [3]:
# Import 2019 crime data and prepare for testing
crime_2019 = os.path.join("..","Resources", "crime_clean_2019.csv") 

test_df = pd.read_csv(crime_2019)

# crime_2019_final_df.head()

In [4]:
training_df=(training_df[['month','hour','day_of_week',
                                           'location_description','primary_type',
                                           'arrest']])
training_df.head()

Unnamed: 0,month,hour,day_of_week,location_description,primary_type,arrest
0,12,23,5,AIRPORT BUILDING NON-TERMINAL - SECURE AREA,THEFT,False
1,12,23,5,RESIDENCE,BATTERY,False
2,12,23,5,RESIDENCE,LIQUOR LAW VIOLATION,True
3,12,23,5,HOTEL/MOTEL,BATTERY,False
4,12,23,5,RESIDENCE,CRIMINAL DAMAGE,False


In [5]:
test_df=(test_df[['month','hour','day_of_week',
                                           'location_description','primary_type',
                                           'arrest']])
test_df.head()

Unnamed: 0,month,hour,day_of_week,location_description,primary_type,arrest
0,12,23,1,STREET,WEAPONS VIOLATION,True
1,12,23,1,SIDEWALK,BATTERY,False
2,12,23,1,VEHICLE NON-COMMERCIAL,WEAPONS VIOLATION,True
3,12,23,1,STREET,WEAPONS VIOLATION,False
4,12,23,1,STREET,OTHER OFFENSE,True


In [8]:
from sklearn import tree
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [9]:
training_df['domestic'] = le.fit_transform(training_df['domestic'])
training_df['block'] = le.fit_transform(training_df['block'])
training_df['location_description'] = le.fit_transform(training_df['location_description'])
training_df['primary_type'] = le.fit_transform(training_df['primary_type'])
training_df['fbi_code'] = le.fit_transform(training_df['fbi_code'])
training_df['arrest'] = le.fit_transform(training_df['arrest'])

KeyError: 'domestic'

In [None]:
test_df['domestic'] = le.fit_transform(test_df['domestic'])
test_df['block'] = le.fit_transform(test_df['block'])
test_df['location_description'] = le.fit_transform(test_df['location_description'])
test_df['primary_type'] = le.fit_transform(test_df['primary_type'])
test_df['fbi_code'] = le.fit_transform(test_df['fbi_code'])
test_df['arrest'] = le.fit_transform(test_df['arrest'])

In [None]:
# Assign X (data) and y (target) for train
X_train = training_df.drop("arrest", axis=1)
y_train = training_df["arrest"]

In [10]:
# Assign X (data) and y (target) for test
X_test = test_df.drop("arrest", axis=1)
y_test = test_df["arrest"]

In [11]:
X = X_test + X_train
y = y_train + y_test

In [12]:
print('Training X Shape:', X_train.shape)
print('Training y Shape:', y_train.shape)
print('Testing X Shape:', X_test.shape)
print('Testing y Shape:', y_test.shape)


Training X Shape: (787901, 12)
Training y Shape: (787901,)
Testing X Shape: (256908, 12)
Testing y Shape: (256908,)


In [13]:
# Select Algorithm
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.7433400283369922

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
feature_importance = sorted(zip(rf.feature_importances_, X), reverse=True)
feature_importance

In [None]:
# Calculate classification report
from sklearn.metrics import classification_report
predictions = rf.predict(X_test)
print(classification_report(y_test, predictions))

In [None]:
# Output to table
class_report = classification_report(y_test, predictions, output_dict=True)

class_report_df = pd.DataFrame(class_report).transpose()
class_report_df = class_report_df.round(2)
class_report_df["support"] = class_report_df["support"].astype(int)
class_report_df.to_csv("RF_report")