In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [2]:
df = pd.read_csv('Cleaned_Crime.csv', low_memory=False)

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,FBI Code,Year,Location
0,0,11556037,JC103643,01/03/2019 07:20:00 PM,0000X W RWY 27R,2890,PUBLIC PEACE VIOLATION,OTHER VIOLATION,AIRCRAFT,False,False,1654,16,26,2019,"(42.002816387, -87.90609433)"
1,1,11626027,JC188126,03/16/2019 05:58:00 PM,001XX N WELLS ST,460,BATTERY,SIMPLE,STREET,False,False,122,1,08B,2019,"(41.88336939, -87.633860272)"
2,2,11622422,JC183696,03/12/2019 10:00:00 PM,008XX E 38TH PL,820,THEFT,$500 AND UNDER,RESIDENTIAL YARD (FRONT/BACK),False,False,212,2,06,2019,"(41.825346902, -87.606780575)"
3,3,11625922,JC185669,03/14/2019 06:42:00 PM,074XX N PAULINA ST,460,BATTERY,SIMPLE,RESIDENCE,False,False,2422,24,08B,2019,"(42.016541612, -87.672499325)"
4,4,11622907,JC185406,03/14/2019 04:03:00 PM,008XX E 38TH PL,5002,OTHER OFFENSE,OTHER VEHICLE OFFENSE,STREET,False,True,212,2,26,2019,"(41.825298645, -87.6069609)"


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7029103 entries, 0 to 7029102
Data columns (total 16 columns):
Unnamed: 0              int64
ID                      int64
Case Number             object
Date                    object
Block                   object
IUCR                    object
Primary Type            object
Description             object
Location Description    object
Arrest                  bool
Domestic                bool
Beat                    int64
District                int64
FBI Code                object
Year                    int64
Location                object
dtypes: bool(2), int64(5), object(9)
memory usage: 764.2+ MB


In [5]:
df = df.drop(['Unnamed: 0', 'ID', 'Case Number', 'Date', 'Location'], axis=1)
df['Beat'] = df['Beat'].astype(str)
df['District'] = df['District'].astype(str)
df['Year'] = df['Year'].astype(str)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7029103 entries, 0 to 7029102
Data columns (total 11 columns):
Block                   object
IUCR                    object
Primary Type            object
Description             object
Location Description    object
Arrest                  bool
Domestic                bool
Beat                    object
District                object
FBI Code                object
Year                    object
dtypes: bool(2), object(9)
memory usage: 496.1+ MB


In [6]:
total = df.Arrest.count()
print("Arrest not made:", round(df.Arrest[df['Arrest'] == False].count()/total, 2), "%")
print("Arrest made:", round(df.Arrest[df['Arrest'] == True].count()/total, 2), "%")

Arrest not made: 0.73 %
Arrest made: 0.27 %


In [7]:
def crime_type(x):
    if x in ['01A','02','03','04A','04B']:
        return 'Violent crime'
    elif x in ['05','06','07','09']:
        return 'Property crime'
    elif x in ['01B','08A','08B','10','11','12','13','14','15','16','17','18','19','20','22','24','26']:
        return 'Less serious offense'

def arrest(x):
    if x:
        return 'Arrest'
    else:
        return 'No Arrest'

def domestic(x):
    if x:
        return 'Domestic'
    else:
        return 'Not Domestic'
    
def location_description(x):
    if 'VEHICLE' in x or x == 'AUTO' or 'TAXI' in x or x == 'TRUCK':
        return 'VEHICLE'
    elif 'CHA' in x and x != 'CURRENCY EXCHANGE':
        return 'CHICAGO HOUSING AUTHORITY'
    elif 'CTA' in x:
        return 'CHICAGO TRANSIT AUTHORITY'
    elif 'COLLEGE' in x:
        return 'COLLEGE'
    elif 'RESIDEN' in x or x == 'APARTMENT' or x == 'BASEMENT' or x == 'COACH HOUSE':
        return 'RESIDENTIAL'
    elif 'AIRPORT' in x:
        return 'AIRPORT'
    elif 'BARBERSHOP' in x or 'BARBER SHOP' in x:
        return 'BARBERSHOP'
    elif 'CHURCH' in x:
        return 'CHURCH'
    elif 'DRIVEWAY' in x:
        return 'DRIVEWAY'
    elif 'FACTORY' in x:
        return 'FACTORY'
    elif 'GARAGE' in x or 'PARKING LOT' in x:
        return 'GARAGE/PARKING LOT'
    elif 'GAS STATION' in x or 'CONVENIENCE STORE' in x:
        return 'GAS STATION/CONVENIENCE STORE'
    elif 'HIGHWAY' in x:
        return 'HIGHWAY'
    elif 'HOSPITAL' in x:
        return 'HOSPITAL'
    elif 'HOTEL' in x or 'MOTEL' in x:
        return 'HOTEL/MOTEL'
    elif 'LAKE' in x or 'RIVER' in x or x == 'LAGOON':
        return 'LAKEFRONT/WATERFRONT/RIVERBANK'
    elif 'MEDICAL' in x:
        return 'MEDICAL/DENTAL OFFICE'
    elif 'THEATER' in x:
        return 'THEATER'
    elif 'NURSING' in x:
        return 'NURSING HOME'
    elif 'OFFICE' in x:
        return 'OFFICE'
    elif 'JAIL' in x:
        return 'JAIL'
    elif 'POLICE' in x:
        return 'POLICE FACILITY/VEH PARKING LOT'
    elif 'POOL' in x:
        return 'POOLROOM'
    elif 'SCHOOL' in x:
        return 'SCHOOL'
    elif 'RAILROAD' in x:
        return 'RAILROAD PROPERTY'
    elif 'STADIUM' in x:
        return 'SPORTS ARENA/STADIUM'
    elif 'TAVERN' in x:
        return 'TAVERN/LIQUOR STORE'
    elif 'VACANT' in x:
        return 'VACANT LOT/LAND'
    elif 'WATERCRAFT' in x:
        return 'BOAT/WATERCRAFT'
    elif x == 'CREDIT UNION' or x == 'BANK':
        return 'BANK/CREDIT UNION'
    elif 'GOVERNMENT BUILDING' in x:
        return 'GOVERNMENT BUILDING'
    else:
        return x

In [8]:
df['FBI Code'] = df['FBI Code'].apply(crime_type)
df['Location Description'] = df['Location Description'].apply(location_description)

In [9]:
df.nunique()

Block                   59154
IUCR                      402
Primary Type               36
Description               507
Location Description       96
Arrest                      2
Domestic                    2
Beat                      304
District                   24
FBI Code                    3
Year                       20
dtype: int64

In [10]:
Domestic = pd.get_dummies(df['Domestic'],drop_first=True)
District = pd.get_dummies(df['District'],drop_first=True)
FBI_Code = pd.get_dummies(df['FBI Code'],drop_first=True)
Year = pd.get_dummies(df['Year'],drop_first=True)
Location = pd.get_dummies(df['Location Description'],drop_first=True)

In [11]:
model_df = pd.concat([Domestic,District,FBI_Code,Year,Location],axis=1)

In [12]:
model_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7029103 entries, 0 to 7029102
Columns: 140 entries, True to YMCA
dtypes: uint8(140)
memory usage: 938.5 MB


In [13]:
y = df['Arrest']
X = model_df
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(4920372, 140) (2108731, 140) (4920372,) (2108731,)


In [14]:
lr = sklearn.linear_model.LogisticRegression(solver='newton-cg', multi_class='auto')
sfs1 = SFS(lr,
           n_jobs=3,
           k_features=50, 
           forward=True, 
           floating=False, 
           verbose=2,
           scoring='accuracy',
           cv=0)

sfs1 = sfs1.fit(X_train, y_train)

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:  6.6min
[Parallel(n_jobs=3)]: Done 140 out of 140 | elapsed: 18.5min finished

[2020-04-28 14:34:24] Features: 1/50 -- score: 0.7280831611918773[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:  9.3min
[Parallel(n_jobs=3)]: Done 139 out of 139 | elapsed: 32.1min finished

[2020-04-28 15:06:31] Features: 2/50 -- score: 0.729936882821055[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed: 11.8min
[Parallel(n_jobs=3)]: Done 138 out of 138 | elapsed: 37.9min finished

[2020-04-28 15:44:23] Features: 3/50 -- score: 0.7452147927026656[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed: 13.9min
[Parallel(n_jobs=3)]: Done 137 out of 137 | elaps

In [15]:
sfs1.subsets_

{1: {'feature_idx': (60,),
  'cv_scores': array([0.72808316]),
  'avg_score': 0.7280831611918773,
  'feature_names': ('CHICAGO HOUSING AUTHORITY',)},
 2: {'feature_idx': (60, 124),
  'cv_scores': array([0.72993688]),
  'avg_score': 0.729936882821055,
  'feature_names': ('CHICAGO HOUSING AUTHORITY', 'SIDEWALK')},
 3: {'feature_idx': (25, 60, 124),
  'cv_scores': array([0.74521479]),
  'avg_score': 0.7452147927026656,
  'feature_names': ('Violent crime', 'CHICAGO HOUSING AUTHORITY', 'SIDEWALK')},
 4: {'feature_idx': (24, 25, 60, 124),
  'cv_scores': array([0.75206021]),
  'avg_score': 0.7520602100816768,
  'feature_names': ('Property crime',
   'Violent crime',
   'CHICAGO HOUSING AUTHORITY',
   'SIDEWALK')},
 5: {'feature_idx': (0, 24, 25, 60, 124),
  'cv_scores': array([0.75722669]),
  'avg_score': 0.7572266893641375,
  'feature_names': (True,
   'Property crime',
   'Violent crime',
   'CHICAGO HOUSING AUTHORITY',
   'SIDEWALK')},
 6: {'feature_idx': (0, 24, 25, 47, 60, 124),
  'cv_sc

In [16]:
arrestlr_model = lr.fit(X_train, y_train)
yhat = arrestlr_model.predict(X_test)
print(sklearn.metrics.confusion_matrix(y_test, yhat))
print(arrestlr_model.score(X_test, y_test))

[[1517066   11885]
 [ 309004  270776]]
0.8478283858870572


In [19]:
nb = GaussianNB()
nb_model = nb.fit(X_train, y_train)
yhatnb = nb.predict(X_test)
print(sklearn.metrics.confusion_matrix(y_test, yhatnb))
print(nb_model.score(X_test, y_test))

[[1519913    9038]
 [ 315417  264363]]
0.8461373214506734
