In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB

In [2]:
df = pd.read_csv('Cleaned_Crime.csv', low_memory=False)

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,FBI Code,Year,Location
0,0,11556037,JC103643,01/03/2019 07:20:00 PM,0000X W RWY 27R,2890,PUBLIC PEACE VIOLATION,OTHER VIOLATION,AIRCRAFT,False,False,1654,16,26,2019,"(42.002816387, -87.90609433)"
1,1,11626027,JC188126,03/16/2019 05:58:00 PM,001XX N WELLS ST,460,BATTERY,SIMPLE,STREET,False,False,122,1,08B,2019,"(41.88336939, -87.633860272)"
2,2,11622422,JC183696,03/12/2019 10:00:00 PM,008XX E 38TH PL,820,THEFT,$500 AND UNDER,RESIDENTIAL YARD (FRONT/BACK),False,False,212,2,06,2019,"(41.825346902, -87.606780575)"
3,3,11625922,JC185669,03/14/2019 06:42:00 PM,074XX N PAULINA ST,460,BATTERY,SIMPLE,RESIDENCE,False,False,2422,24,08B,2019,"(42.016541612, -87.672499325)"
4,4,11622907,JC185406,03/14/2019 04:03:00 PM,008XX E 38TH PL,5002,OTHER OFFENSE,OTHER VEHICLE OFFENSE,STREET,False,True,212,2,26,2019,"(41.825298645, -87.6069609)"


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7029103 entries, 0 to 7029102
Data columns (total 16 columns):
Unnamed: 0              int64
ID                      int64
Case Number             object
Date                    object
Block                   object
IUCR                    object
Primary Type            object
Description             object
Location Description    object
Arrest                  bool
Domestic                bool
Beat                    int64
District                int64
FBI Code                object
Year                    int64
Location                object
dtypes: bool(2), int64(5), object(9)
memory usage: 764.2+ MB


In [5]:
df = df.drop(['Unnamed: 0', 'ID', 'Case Number', 'Date', 'Location'], axis=1)
df['Beat'] = df['Beat'].astype(str)
df['District'] = df['District'].astype(str)
df['Year'] = df['Year'].astype(str)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7029103 entries, 0 to 7029102
Data columns (total 11 columns):
Block                   object
IUCR                    object
Primary Type            object
Description             object
Location Description    object
Arrest                  bool
Domestic                bool
Beat                    object
District                object
FBI Code                object
Year                    object
dtypes: bool(2), object(9)
memory usage: 496.1+ MB


In [6]:
total = df.Arrest.count()
print("Arrest not made:", round(df.Arrest[df['Arrest'] == False].count()/total, 2), "%")
print("Arrest made:", round(df.Arrest[df['Arrest'] == True].count()/total, 2), "%")

Arrest not made: 0.73 %
Arrest made: 0.27 %


In [7]:
df.nunique()

Block                   59154
IUCR                      402
Primary Type               36
Description               507
Location Description      212
Arrest                      2
Domestic                    2
Beat                      304
District                   24
FBI Code                   26
Year                       20
dtype: int64

In [8]:
Domestic = pd.get_dummies(df['Domestic'],drop_first=True)
District = pd.get_dummies(df['District'],drop_first=True)
FBI_Code = pd.get_dummies(df['FBI Code'],drop_first=True)
Year = pd.get_dummies(df['Year'],drop_first=True)

In [9]:
model_df = pd.concat([Domestic,District,FBI_Code,Year],axis=1)

In [12]:
model_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7029103 entries, 0 to 7029102
Data columns (total 68 columns):
True    uint8
10      uint8
11      uint8
12      uint8
14      uint8
15      uint8
16      uint8
17      uint8
18      uint8
19      uint8
2       uint8
20      uint8
21      uint8
22      uint8
24      uint8
25      uint8
3       uint8
31      uint8
4       uint8
5       uint8
6       uint8
7       uint8
8       uint8
9       uint8
01B     uint8
02      uint8
03      uint8
04A     uint8
04B     uint8
05      uint8
06      uint8
07      uint8
08A     uint8
08B     uint8
09      uint8
10      uint8
11      uint8
12      uint8
13      uint8
14      uint8
15      uint8
16      uint8
17      uint8
18      uint8
19      uint8
20      uint8
22      uint8
24      uint8
26      uint8
2002    uint8
2003    uint8
2004    uint8
2005    uint8
2006    uint8
2007    uint8
2008    uint8
2009    uint8
2010    uint8
2011    uint8
2012    uint8
2013    uint8
2014    uint8
2015    uint8
2016 

In [13]:
y = df['Arrest']
X = model_df
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(4920372, 68) (2108731, 68) (4920372,) (2108731,)


In [16]:
lr = sklearn.linear_model.LogisticRegression(solver='newton-cg', multi_class='auto')
arrestlr_model = lr.fit(X_train, y_train)
yhat = arrestlr_model.predict(X_test)
print(sklearn.metrics.confusion_matrix(y_test, yhat))
print(arrestlr_model.score(X_test, y_test))

[[1517066   11885]
 [ 309004  270776]]
0.8478283858870572


In [19]:
nb = GaussianNB()
nb_model = nb.fit(X_train, y_train)
yhatnb = nb.predict(X_test)
print(sklearn.metrics.confusion_matrix(y_test, yhatnb))
print(nb_model.score(X_test, y_test))

[[1519913    9038]
 [ 315417  264363]]
0.8461373214506734
