### Import Necessary Libraries

In [36]:
import numpy as np 
import pandas as pd
import geopandas as gpd
from shapely import wkt
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



/kaggle/input/boston-crime-data/crime.csv


### Reading Data

In [37]:
df = pd.read_csv('../input/boston-crime-data/crime.csv', encoding='latin-1')
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,INCIDENT_NUMBER,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,DISTRICT,REPORTING_AREA,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,Lat,Long,Location
0,I182080058,2403,Disorderly Conduct,DISTURBING THE PEACE,E18,495,,2018-10-03 20:13:00,2018,10,Wednesday,20,Part Two,ARLINGTON ST,42.262608,-71.121186,"(42.26260773, -71.12118637)"
1,I182080053,3201,Property Lost,PROPERTY - LOST,D14,795,,2018-08-30 20:00:00,2018,8,Thursday,20,Part Three,ALLSTON ST,42.352111,-71.135311,"(42.35211146, -71.13531147)"
2,I182080052,2647,Other,THREATS TO DO BODILY HARM,B2,329,,2018-10-03 19:20:00,2018,10,Wednesday,19,Part Two,DEVON ST,42.308126,-71.07693,"(42.30812619, -71.07692974)"
3,I182080051,413,Aggravated Assault,ASSAULT - AGGRAVATED - BATTERY,A1,92,,2018-10-03 20:00:00,2018,10,Wednesday,20,Part One,CAMBRIDGE ST,42.359454,-71.059648,"(42.35945371, -71.05964817)"
4,I182080050,3122,Aircraft,AIRCRAFT INCIDENTS,A7,36,,2018-10-03 20:49:00,2018,10,Wednesday,20,Part Three,PRESCOTT ST,42.375258,-71.024663,"(42.37525782, -71.02466343)"


### Feature Engineering

In [38]:
# Drop NA rows from Lat and Long features and getting rid of wrongs / outliers like (-1, -1)
df = df.dropna(subset = ['Lat', 'Long'])
df = df[df['Lat'] > 0]
df

Unnamed: 0,INCIDENT_NUMBER,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,DISTRICT,REPORTING_AREA,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,Lat,Long,Location
0,I182080058,2403,Disorderly Conduct,DISTURBING THE PEACE,E18,495,,2018-10-03 20:13:00,2018,10,Wednesday,20,Part Two,ARLINGTON ST,42.262608,-71.121186,"(42.26260773, -71.12118637)"
1,I182080053,3201,Property Lost,PROPERTY - LOST,D14,795,,2018-08-30 20:00:00,2018,8,Thursday,20,Part Three,ALLSTON ST,42.352111,-71.135311,"(42.35211146, -71.13531147)"
2,I182080052,2647,Other,THREATS TO DO BODILY HARM,B2,329,,2018-10-03 19:20:00,2018,10,Wednesday,19,Part Two,DEVON ST,42.308126,-71.076930,"(42.30812619, -71.07692974)"
3,I182080051,413,Aggravated Assault,ASSAULT - AGGRAVATED - BATTERY,A1,92,,2018-10-03 20:00:00,2018,10,Wednesday,20,Part One,CAMBRIDGE ST,42.359454,-71.059648,"(42.35945371, -71.05964817)"
4,I182080050,3122,Aircraft,AIRCRAFT INCIDENTS,A7,36,,2018-10-03 20:49:00,2018,10,Wednesday,20,Part Three,PRESCOTT ST,42.375258,-71.024663,"(42.37525782, -71.02466343)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327815,I050310906-00,3125,Warrant Arrests,WARRANT ARREST,D4,285,,2016-06-05 17:25:00,2016,6,Sunday,17,Part Three,COVENTRY ST,42.336951,-71.085748,"(42.33695098, -71.08574813)"
327816,I030217815-08,111,Homicide,"MURDER, NON-NEGLIGIENT MANSLAUGHTER",E18,520,,2015-07-09 13:38:00,2015,7,Thursday,13,Part One,RIVER ST,42.255926,-71.123172,"(42.25592648, -71.12317207)"
327817,I030217815-08,3125,Warrant Arrests,WARRANT ARREST,E18,520,,2015-07-09 13:38:00,2015,7,Thursday,13,Part Three,RIVER ST,42.255926,-71.123172,"(42.25592648, -71.12317207)"
327818,I010370257-00,3125,Warrant Arrests,WARRANT ARREST,E13,569,,2016-05-31 19:35:00,2016,5,Tuesday,19,Part Three,NEW WASHINGTON ST,42.302333,-71.111565,"(42.30233307, -71.11156487)"


In [39]:
# groupby Reporting Area and get the mean of Long and Lat
reporting_area_centers=df[~df["Long"].isna()].groupby("REPORTING_AREA")["Long","Lat"].mean().reset_index()
# create a WKT representation of Reporting Area Centers: Lat/Lon values
reporting_area_centers["geom"]=reporting_area_centers.loc[:,["Long","Lat"]].apply(lambda x: "POINT("+str(x["Long"])+" "+str(x["Lat"])+")",axis=1)
reporting_area_centers
#source:https://github.com/alptuzel/bostoncrimeprediction/blob/master/Boston%20Crime%20Prediction.ipynb

  


Unnamed: 0,REPORTING_AREA,Long,Lat,geom
0,,-71.079019,42.328928,POINT(-71.07901926097279 42.32892816517891)
1,1,-70.996655,42.390179,POINT(-70.99665524078951 42.39017944236842)
2,10,-71.008469,42.388751,POINT(-71.00846862046863 42.38875119562499)
3,100,-71.054884,42.357810,POINT(-71.05488435035824 42.35781029024798)
4,101,-71.054358,42.357237,POINT(-71.05435766527813 42.357236672872574)
...,...,...,...,...
874,961,-71.125507,42.256711,POINT(-71.12550696916665 42.25671135083332)
875,962,-71.072722,42.341868,POINT(-71.07272215000009 42.341867650000026)
876,97,-71.058791,42.358096,POINT(-71.05879099596038 42.35809580782488)
877,98,-71.057834,42.357567,POINT(-71.05783420354408 42.357566826483406)


In [40]:
# some of Reporting Areas are too close with each other. We can organize them into less number of centers so to have equal distance between reporting (crime) centers
# create 500 centers
# get predictions from our Kmeans model

clusterer = KMeans(n_clusters=500,random_state=0).fit(reporting_area_centers[["Long","Lat"]])
preds_1 = clusterer.predict(reporting_area_centers[["Long","Lat"]])

In [41]:
# for each cluster calculate average Lat and Long

reporting_area_centers["cluster_no"]=preds_1
reporting_area_centers_clusters=reporting_area_centers.groupby("cluster_no")["Lat"].mean().reset_index()
reporting_area_centers_clusters=pd.merge(reporting_area_centers_clusters,reporting_area_centers.groupby("cluster_no")["Long"].mean().reset_index(),left_on="cluster_no",right_on="cluster_no")
reporting_area_centers_clusters

Unnamed: 0,cluster_no,Lat,Long
0,0,42.332979,-71.055322
1,1,42.263788,-71.120683
2,2,42.351111,-71.128802
3,3,42.328831,-71.087448
4,4,42.288759,-71.060760
...,...,...,...
495,495,42.282260,-71.166248
496,496,42.328609,-71.054394
497,497,42.267100,-71.103610
498,498,42.347598,-71.068130


In [42]:
geodf_ra = gpd.GeoDataFrame(reporting_area_centers,geometry=reporting_area_centers["geom"].apply(wkt.loads))
geodf_ra
#source:https://github.com/alptuzel/bostoncrimeprediction/blob/master/Boston%20Crime%20Prediction.ipynb

Unnamed: 0,REPORTING_AREA,Long,Lat,geom,cluster_no,geometry
0,,-71.079019,42.328928,POINT(-71.07901926097279 42.32892816517891),54,POINT (-71.07902 42.32893)
1,1,-70.996655,42.390179,POINT(-70.99665524078951 42.39017944236842),202,POINT (-70.99666 42.39018)
2,10,-71.008469,42.388751,POINT(-71.00846862046863 42.38875119562499),393,POINT (-71.00847 42.38875)
3,100,-71.054884,42.357810,POINT(-71.05488435035824 42.35781029024798),311,POINT (-71.05488 42.35781)
4,101,-71.054358,42.357237,POINT(-71.05435766527813 42.357236672872574),311,POINT (-71.05436 42.35724)
...,...,...,...,...,...,...
874,961,-71.125507,42.256711,POINT(-71.12550696916665 42.25671135083332),258,POINT (-71.12551 42.25671)
875,962,-71.072722,42.341868,POINT(-71.07272215000009 42.341867650000026),396,POINT (-71.07272 42.34187)
876,97,-71.058791,42.358096,POINT(-71.05879099596038 42.35809580782488),120,POINT (-71.05879 42.35810)
877,98,-71.057834,42.357567,POINT(-71.05783420354408 42.357566826483406),390,POINT (-71.05783 42.35757)


In [43]:
geodf_ra['cluster_no'].nunique()

500

In [44]:
#merge the cluster_no with whole data set

df = df.merge(geodf_ra, on='REPORTING_AREA', how='inner')
df

Unnamed: 0,INCIDENT_NUMBER,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,DISTRICT,REPORTING_AREA,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,...,UCR_PART,STREET,Lat_x,Long_x,Location,Long_y,Lat_y,geom,cluster_no,geometry
0,I182080058,2403,Disorderly Conduct,DISTURBING THE PEACE,E18,495,,2018-10-03 20:13:00,2018,10,...,Part Two,ARLINGTON ST,42.262608,-71.121186,"(42.26260773, -71.12118637)",-71.120683,42.263788,POINT(-71.12068302034626 42.26378807967414),1,POINT (-71.12068 42.26379)
1,I182079552,2629,Harassment,HARASSMENT,E18,495,,2018-10-02 10:22:02,2018,10,...,Part Two,GREENWOOD AVE,42.265460,-71.120239,"(42.26546031, -71.12023924)",-71.120683,42.263788,POINT(-71.12068302034626 42.26378807967414),1,POINT (-71.12068 42.26379)
2,I182079055,3831,Motor Vehicle Accident Response,M/V - LEAVING SCENE - PROPERTY DAMAGE,E18,495,,2018-09-30 12:00:00,2018,9,...,Part Three,ARLINGTON ST,42.262743,-71.121607,"(42.26274326, -71.12160671)",-71.120683,42.263788,POINT(-71.12068302034626 42.26378807967414),1,POINT (-71.12068 42.26379)
3,I182078637,3115,Investigate Person,INVESTIGATE PERSON,E18,495,,2018-09-28 23:11:00,2018,9,...,Part Three,ARLINGTON ST,42.263040,-71.122303,"(42.26303952, -71.12230282)",-71.120683,42.263788,POINT(-71.12068302034626 42.26378807967414),1,POINT (-71.12068 42.26379)
4,I182077199,3831,Motor Vehicle Accident Response,M/V - LEAVING SCENE - PROPERTY DAMAGE,E18,495,,2018-09-24 10:30:00,2018,9,...,Part Three,HYDE PARK AVE,42.262169,-71.121856,"(42.26216909, -71.12185558)",-71.120683,42.263788,POINT(-71.12068302034626 42.26378807967414),1,POINT (-71.12068 42.26379)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306408,I152073866,3106,Property Related Damage,PROPERTY - ACCIDENTAL DAMAGE,E5,659,,2015-09-05 18:11:00,2015,9,...,Part Three,GRETTER RD,42.294650,-71.142209,"(42.29465032, -71.14220871)",-71.142274,42.294403,POINT(-71.14227353777778 42.294403277777775),280,POINT (-71.14227 42.29440)
306409,I152072178,3006,Medical Assistance,SICK/INJURED/MEDICAL - PERSON,E5,659,,2015-08-31 05:09:00,2015,8,...,Part Three,GRETTER RD,42.294650,-71.142209,"(42.29465032, -71.14220871)",-71.142274,42.294403,POINT(-71.14227353777778 42.294403277777775),280,POINT (-71.14227 42.29440)
306410,I152054904,2629,Harassment,HARASSMENT,E5,659,,2015-07-03 00:00:00,2015,7,...,Part Two,GRETTER RD,42.294650,-71.142209,"(42.29465032, -71.14220871)",-71.142274,42.294403,POINT(-71.14227353777778 42.294403277777775),280,POINT (-71.14227 42.29440)
306411,I152050553,3802,Motor Vehicle Accident Response,M/V ACCIDENT - PROPERTY DAMAGE,E5,659,,2015-06-18 15:13:00,2015,6,...,Part Three,WELD ST,42.293356,-71.141822,"(42.29335577, -71.14182237)",-71.142274,42.294403,POINT(-71.14227353777778 42.294403277777775),280,POINT (-71.14227 42.29440)


In [45]:
# label encoding for day_of_weeks
df['DAY_OF_WEEK'] = df['DAY_OF_WEEK'].map({
    'Tuesday':4, 
    'Saturday':2, 
    'Monday':3, 
    'Sunday':1, 
    'Thursday':5, 
    'Wednesday':6,
    'Friday':7
})
df

Unnamed: 0,INCIDENT_NUMBER,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,DISTRICT,REPORTING_AREA,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,...,UCR_PART,STREET,Lat_x,Long_x,Location,Long_y,Lat_y,geom,cluster_no,geometry
0,I182080058,2403,Disorderly Conduct,DISTURBING THE PEACE,E18,495,,2018-10-03 20:13:00,2018,10,...,Part Two,ARLINGTON ST,42.262608,-71.121186,"(42.26260773, -71.12118637)",-71.120683,42.263788,POINT(-71.12068302034626 42.26378807967414),1,POINT (-71.12068 42.26379)
1,I182079552,2629,Harassment,HARASSMENT,E18,495,,2018-10-02 10:22:02,2018,10,...,Part Two,GREENWOOD AVE,42.265460,-71.120239,"(42.26546031, -71.12023924)",-71.120683,42.263788,POINT(-71.12068302034626 42.26378807967414),1,POINT (-71.12068 42.26379)
2,I182079055,3831,Motor Vehicle Accident Response,M/V - LEAVING SCENE - PROPERTY DAMAGE,E18,495,,2018-09-30 12:00:00,2018,9,...,Part Three,ARLINGTON ST,42.262743,-71.121607,"(42.26274326, -71.12160671)",-71.120683,42.263788,POINT(-71.12068302034626 42.26378807967414),1,POINT (-71.12068 42.26379)
3,I182078637,3115,Investigate Person,INVESTIGATE PERSON,E18,495,,2018-09-28 23:11:00,2018,9,...,Part Three,ARLINGTON ST,42.263040,-71.122303,"(42.26303952, -71.12230282)",-71.120683,42.263788,POINT(-71.12068302034626 42.26378807967414),1,POINT (-71.12068 42.26379)
4,I182077199,3831,Motor Vehicle Accident Response,M/V - LEAVING SCENE - PROPERTY DAMAGE,E18,495,,2018-09-24 10:30:00,2018,9,...,Part Three,HYDE PARK AVE,42.262169,-71.121856,"(42.26216909, -71.12185558)",-71.120683,42.263788,POINT(-71.12068302034626 42.26378807967414),1,POINT (-71.12068 42.26379)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306408,I152073866,3106,Property Related Damage,PROPERTY - ACCIDENTAL DAMAGE,E5,659,,2015-09-05 18:11:00,2015,9,...,Part Three,GRETTER RD,42.294650,-71.142209,"(42.29465032, -71.14220871)",-71.142274,42.294403,POINT(-71.14227353777778 42.294403277777775),280,POINT (-71.14227 42.29440)
306409,I152072178,3006,Medical Assistance,SICK/INJURED/MEDICAL - PERSON,E5,659,,2015-08-31 05:09:00,2015,8,...,Part Three,GRETTER RD,42.294650,-71.142209,"(42.29465032, -71.14220871)",-71.142274,42.294403,POINT(-71.14227353777778 42.294403277777775),280,POINT (-71.14227 42.29440)
306410,I152054904,2629,Harassment,HARASSMENT,E5,659,,2015-07-03 00:00:00,2015,7,...,Part Two,GRETTER RD,42.294650,-71.142209,"(42.29465032, -71.14220871)",-71.142274,42.294403,POINT(-71.14227353777778 42.294403277777775),280,POINT (-71.14227 42.29440)
306411,I152050553,3802,Motor Vehicle Accident Response,M/V ACCIDENT - PROPERTY DAMAGE,E5,659,,2015-06-18 15:13:00,2015,6,...,Part Three,WELD ST,42.293356,-71.141822,"(42.29335577, -71.14182237)",-71.142274,42.294403,POINT(-71.14227353777778 42.294403277777775),280,POINT (-71.14227 42.29440)


In [46]:
#one-hot-encoding for DISTRICT column
def get_dummies(dataframe):

    
    DISTRICT = pd.get_dummies(dataframe.DISTRICT, prefix = 'DIS')
    dataframe = pd.concat([dataframe,DISTRICT],axis = 1)
    dataframe = dataframe.drop(['DISTRICT'],axis = 1)
    
    return dataframe
df =get_dummies(df)
df

Unnamed: 0,INCIDENT_NUMBER,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,REPORTING_AREA,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,...,DIS_A7,DIS_B2,DIS_B3,DIS_C11,DIS_C6,DIS_D14,DIS_D4,DIS_E13,DIS_E18,DIS_E5
0,I182080058,2403,Disorderly Conduct,DISTURBING THE PEACE,495,,2018-10-03 20:13:00,2018,10,6,...,0,0,0,0,0,0,0,0,1,0
1,I182079552,2629,Harassment,HARASSMENT,495,,2018-10-02 10:22:02,2018,10,4,...,0,0,0,0,0,0,0,0,1,0
2,I182079055,3831,Motor Vehicle Accident Response,M/V - LEAVING SCENE - PROPERTY DAMAGE,495,,2018-09-30 12:00:00,2018,9,1,...,0,0,0,0,0,0,0,0,1,0
3,I182078637,3115,Investigate Person,INVESTIGATE PERSON,495,,2018-09-28 23:11:00,2018,9,7,...,0,0,0,0,0,0,0,0,1,0
4,I182077199,3831,Motor Vehicle Accident Response,M/V - LEAVING SCENE - PROPERTY DAMAGE,495,,2018-09-24 10:30:00,2018,9,3,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306408,I152073866,3106,Property Related Damage,PROPERTY - ACCIDENTAL DAMAGE,659,,2015-09-05 18:11:00,2015,9,2,...,0,0,0,0,0,0,0,0,0,1
306409,I152072178,3006,Medical Assistance,SICK/INJURED/MEDICAL - PERSON,659,,2015-08-31 05:09:00,2015,8,3,...,0,0,0,0,0,0,0,0,0,1
306410,I152054904,2629,Harassment,HARASSMENT,659,,2015-07-03 00:00:00,2015,7,7,...,0,0,0,0,0,0,0,0,0,1
306411,I152050553,3802,Motor Vehicle Accident Response,M/V ACCIDENT - PROPERTY DAMAGE,659,,2015-06-18 15:13:00,2015,6,5,...,0,0,0,0,0,0,0,0,0,1


In [47]:
df.columns

Index(['INCIDENT_NUMBER', 'OFFENSE_CODE', 'OFFENSE_CODE_GROUP',
       'OFFENSE_DESCRIPTION', 'REPORTING_AREA', 'SHOOTING', 'OCCURRED_ON_DATE',
       'YEAR', 'MONTH', 'DAY_OF_WEEK', 'HOUR', 'UCR_PART', 'STREET', 'Lat_x',
       'Long_x', 'Location', 'Long_y', 'Lat_y', 'geom', 'cluster_no',
       'geometry', 'DIS_A1', 'DIS_A15', 'DIS_A7', 'DIS_B2', 'DIS_B3',
       'DIS_C11', 'DIS_C6', 'DIS_D14', 'DIS_D4', 'DIS_E13', 'DIS_E18',
       'DIS_E5'],
      dtype='object')

In [48]:
#important crimes include the UCR_PART 1 crimes and some other important and violent crimes
important_crimes= ['Aggravated Assault', 
      'Harassment', 
       'Arson', 
       'Homicide', 
       'Criminal Harassment', 
       'Biological Threat',
       'Manslaughter', 'HUMAN TRAFFICKING',
        'Auto Theft', 'Larceny','Robbery','Residential Burglary','Larcency From Motor Vehicle','Other Burglary','Commercial Burglary']

In [49]:
#creating important feature according the list above
df['important']=0
i=0
for x in df['OFFENSE_CODE_GROUP']:
      if x in important_crimes:
        df['important'].iloc[i]=1
    
      i+=1
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Unnamed: 0,INCIDENT_NUMBER,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,REPORTING_AREA,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,...,DIS_B2,DIS_B3,DIS_C11,DIS_C6,DIS_D14,DIS_D4,DIS_E13,DIS_E18,DIS_E5,important
0,I182080058,2403,Disorderly Conduct,DISTURBING THE PEACE,495,,2018-10-03 20:13:00,2018,10,6,...,0,0,0,0,0,0,0,1,0,0
1,I182079552,2629,Harassment,HARASSMENT,495,,2018-10-02 10:22:02,2018,10,4,...,0,0,0,0,0,0,0,1,0,1
2,I182079055,3831,Motor Vehicle Accident Response,M/V - LEAVING SCENE - PROPERTY DAMAGE,495,,2018-09-30 12:00:00,2018,9,1,...,0,0,0,0,0,0,0,1,0,0
3,I182078637,3115,Investigate Person,INVESTIGATE PERSON,495,,2018-09-28 23:11:00,2018,9,7,...,0,0,0,0,0,0,0,1,0,0
4,I182077199,3831,Motor Vehicle Accident Response,M/V - LEAVING SCENE - PROPERTY DAMAGE,495,,2018-09-24 10:30:00,2018,9,3,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306408,I152073866,3106,Property Related Damage,PROPERTY - ACCIDENTAL DAMAGE,659,,2015-09-05 18:11:00,2015,9,2,...,0,0,0,0,0,0,0,0,1,0
306409,I152072178,3006,Medical Assistance,SICK/INJURED/MEDICAL - PERSON,659,,2015-08-31 05:09:00,2015,8,3,...,0,0,0,0,0,0,0,0,1,0
306410,I152054904,2629,Harassment,HARASSMENT,659,,2015-07-03 00:00:00,2015,7,7,...,0,0,0,0,0,0,0,0,1,1
306411,I152050553,3802,Motor Vehicle Accident Response,M/V ACCIDENT - PROPERTY DAMAGE,659,,2015-06-18 15:13:00,2015,6,5,...,0,0,0,0,0,0,0,0,1,0


In [50]:
df["OCCURRED_ON_DATE"]=pd.to_datetime(df["OCCURRED_ON_DATE"],infer_datetime_format=True)
df["OCCURRED_DAY"]=df["OCCURRED_ON_DATE"].apply(lambda x: x.date())
df = df.drop(['OCCURRED_ON_DATE'],axis = 1)
df

Unnamed: 0,INCIDENT_NUMBER,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,REPORTING_AREA,SHOOTING,YEAR,MONTH,DAY_OF_WEEK,HOUR,...,DIS_B3,DIS_C11,DIS_C6,DIS_D14,DIS_D4,DIS_E13,DIS_E18,DIS_E5,important,OCCURRED_DAY
0,I182080058,2403,Disorderly Conduct,DISTURBING THE PEACE,495,,2018,10,6,20,...,0,0,0,0,0,0,1,0,0,2018-10-03
1,I182079552,2629,Harassment,HARASSMENT,495,,2018,10,4,10,...,0,0,0,0,0,0,1,0,1,2018-10-02
2,I182079055,3831,Motor Vehicle Accident Response,M/V - LEAVING SCENE - PROPERTY DAMAGE,495,,2018,9,1,12,...,0,0,0,0,0,0,1,0,0,2018-09-30
3,I182078637,3115,Investigate Person,INVESTIGATE PERSON,495,,2018,9,7,23,...,0,0,0,0,0,0,1,0,0,2018-09-28
4,I182077199,3831,Motor Vehicle Accident Response,M/V - LEAVING SCENE - PROPERTY DAMAGE,495,,2018,9,3,10,...,0,0,0,0,0,0,1,0,0,2018-09-24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306408,I152073866,3106,Property Related Damage,PROPERTY - ACCIDENTAL DAMAGE,659,,2015,9,2,18,...,0,0,0,0,0,0,0,1,0,2015-09-05
306409,I152072178,3006,Medical Assistance,SICK/INJURED/MEDICAL - PERSON,659,,2015,8,3,5,...,0,0,0,0,0,0,0,1,0,2015-08-31
306410,I152054904,2629,Harassment,HARASSMENT,659,,2015,7,7,0,...,0,0,0,0,0,0,0,1,1,2015-07-03
306411,I152050553,3802,Motor Vehicle Accident Response,M/V ACCIDENT - PROPERTY DAMAGE,659,,2015,6,5,15,...,0,0,0,0,0,0,0,1,0,2015-06-18


In [51]:
#sorting values
df = df.sort_values(by =['OCCURRED_DAY'], ascending=False)
df['OCCURRED_DAY']= pd.to_datetime(df['OCCURRED_DAY'])
df

Unnamed: 0,INCIDENT_NUMBER,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,REPORTING_AREA,SHOOTING,YEAR,MONTH,DAY_OF_WEEK,HOUR,...,DIS_B3,DIS_C11,DIS_C6,DIS_D14,DIS_D4,DIS_E13,DIS_E18,DIS_E5,important,OCCURRED_DAY
0,I182080058,2403,Disorderly Conduct,DISTURBING THE PEACE,495,,2018,10,6,20,...,0,0,0,0,0,0,1,0,0,2018-10-03
69364,I182079874,3301,Verbal Disputes,VERBAL DISPUTE,235,,2018,10,6,11,...,0,1,0,0,0,0,0,0,0,2018-10-03
53696,I182079846,1503,Firearm Violations,"WEAPON - OTHER - CARRYING / POSSESSING, ETC",916,,2018,10,6,10,...,0,0,0,0,0,1,0,0,0,2018-10-03
53697,I182079846,1841,Drug Violation,DRUGS - POSS CLASS A - INTENT TO MFR DIST DISP,916,,2018,10,6,10,...,0,0,0,0,0,1,0,0,0,2018-10-03
76221,I182079857,2405,Disorderly Conduct,DISORDERLY CONDUCT,958,,2018,10,6,10,...,0,0,1,0,0,0,0,0,0,2018-10-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
221347,I152049585,2906,Violations,VAL - OPERATING UNREG/UNINS CAR,69,,2015,6,3,1,...,0,0,0,0,0,0,0,0,0,2015-06-15
27852,I152049595,2670,Criminal Harassment,CRIMINAL HARASSMENT,915,,2015,6,3,1,...,0,0,1,0,0,0,0,0,1,2015-06-15
71621,I152049580,3006,Medical Assistance,SICK/INJURED/MEDICAL - PERSON,282,,2015,6,3,12,...,0,0,0,0,0,0,0,0,0,2015-06-15
27851,I152049998,615,Larceny From Motor Vehicle,LARCENY THEFT OF MV PARTS & ACCESSORIES,915,,2015,6,3,21,...,0,0,1,0,0,0,0,0,0,2015-06-15


In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 306413 entries, 0 to 85919
Data columns (total 34 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   INCIDENT_NUMBER      306413 non-null  object        
 1   OFFENSE_CODE         306413 non-null  int64         
 2   OFFENSE_CODE_GROUP   306413 non-null  object        
 3   OFFENSE_DESCRIPTION  306413 non-null  object        
 4   REPORTING_AREA       306413 non-null  object        
 5   SHOOTING             1018 non-null    object        
 6   YEAR                 306413 non-null  int64         
 7   MONTH                306413 non-null  int64         
 8   DAY_OF_WEEK          306413 non-null  int64         
 9   HOUR                 306413 non-null  int64         
 10  UCR_PART             306320 non-null  object        
 11  STREET               304631 non-null  object        
 12  Lat_x                306413 non-null  float64       
 13  Long_x         

In [53]:
#shuffling the dataset
df = shuffle(df)

### Split train,test and validation sets for Prediction Model

In [54]:
#0.8 of dataset from head is train_set
train_set = df[['Lat_y','Long_y','cluster_no','MONTH','HOUR','DIS_A1','DIS_A15','DIS_A7','DIS_B2','DIS_B3', 'DIS_C11', 'DIS_C6', 
                'DIS_D14', 'DIS_D4', 'DIS_E13', 'DIS_E18', 'DIS_E5','important']].head((int(len(df) *0.8)))
train_set

Unnamed: 0,Lat_y,Long_y,cluster_no,MONTH,HOUR,DIS_A1,DIS_A15,DIS_A7,DIS_B2,DIS_B3,DIS_C11,DIS_C6,DIS_D14,DIS_D4,DIS_E13,DIS_E18,DIS_E5,important
260520,42.323529,-71.091776,268,6,18,0,0,0,1,0,0,0,0,0,0,0,0,0
173514,42.296369,-71.048678,439,5,1,0,0,0,0,0,1,0,0,0,0,0,0,1
215075,42.290977,-71.063966,477,10,21,0,0,0,0,0,1,0,0,0,0,0,0,0
143295,42.330085,-71.070093,76,10,9,0,0,0,0,0,0,1,0,0,0,0,0,0
267087,42.338426,-71.083070,450,8,9,0,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
274941,42.341240,-71.070583,396,5,16,0,0,0,0,0,0,0,0,1,0,0,0,0
176938,42.348030,-71.081593,95,12,23,0,0,0,0,0,0,0,0,1,0,0,0,0
229947,42.365904,-71.057899,306,8,12,1,0,0,0,0,0,0,0,0,0,0,0,0
295438,42.359221,-71.153536,300,9,14,0,0,0,0,0,0,0,1,0,0,0,0,0


In [55]:
#0.2 of dataset from tail is validation_set
validation_set = df.tail(int(len(df) *0.2))[['Lat_y','Long_y','cluster_no','MONTH','HOUR','DIS_A1','DIS_A15','DIS_A7','DIS_B2','DIS_B3', 'DIS_C11', 'DIS_C6', 'DIS_D14', 'DIS_D4', 'DIS_E13', 'DIS_E18', 'DIS_E5','important']]
validation_set


Unnamed: 0,Lat_y,Long_y,cluster_no,MONTH,HOUR,DIS_A1,DIS_A15,DIS_A7,DIS_B2,DIS_B3,DIS_C11,DIS_C6,DIS_D14,DIS_D4,DIS_E13,DIS_E18,DIS_E5,important
161456,42.289652,-71.122627,357,9,10,0,0,0,0,0,0,0,0,0,0,0,1,0
296678,42.330760,-71.092475,344,10,8,0,0,0,1,0,0,0,0,0,0,0,0,0
11233,42.312553,-71.077614,126,6,9,0,0,0,1,0,0,0,0,0,0,0,0,0
3533,42.369905,-71.023843,123,9,11,0,0,1,0,0,0,0,0,0,0,0,0,0
47101,42.356794,-71.060119,183,9,11,1,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203000,42.322357,-71.073835,238,3,11,0,0,0,1,0,0,0,0,0,0,0,0,0
75426,42.295247,-71.089457,223,1,12,0,0,0,0,1,0,0,0,0,0,0,0,0
185453,42.304334,-71.075469,65,8,18,0,0,0,0,0,1,0,0,0,0,0,0,0
96503,42.346068,-71.143638,10,10,20,0,0,0,0,0,0,0,1,0,0,0,0,0


In [56]:
groundTruth= validation_set['important'].to_frame()

groundTruth

Unnamed: 0,important
161456,0
296678,0
11233,0
3533,0
47101,0
...,...
203000,0
75426,0
185453,0
96503,0


In [57]:
#feature scaling on train_set with StandardScaler
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_set)
train_scaled = pd.DataFrame(data=train_scaled, columns=['Lat_y','Long_y','cluster_no','MONTH','HOUR','DIS_A1','DIS_A15','DIS_A7','DIS_B2','DIS_B3', 'DIS_C11', 'DIS_C6', 'DIS_D14', 'DIS_D4', 'DIS_E13', 'DIS_E18', 'DIS_E5', 'important'])

train_scaled

Unnamed: 0,Lat_y,Long_y,cluster_no,MONTH,HOUR,DIS_A1,DIS_A15,DIS_A7,DIS_B2,DIS_B3,DIS_C11,DIS_C6,DIS_D14,DIS_D4,DIS_E13,DIS_E18,DIS_E5,important
0,0.039278,-0.303793,0.171717,-0.206883,0.776590,-0.346662,-0.145546,-0.211798,2.341771,-0.356506,-0.399348,-0.282255,-0.261699,-0.385398,-0.242693,-0.244253,-0.210554,-0.463805
1,-0.822032,1.160706,1.379105,-0.513697,-1.932671,-0.346662,-0.145546,-0.211798,-0.427027,-0.356506,2.504084,-0.282255,-0.261699,-0.385398,-0.242693,-0.244253,-0.210554,2.156079
2,-0.993049,0.641187,1.647414,1.020377,1.254695,-0.346662,-0.145546,-0.211798,-0.427027,-0.356506,2.504084,-0.282255,-0.261699,-0.385398,-0.242693,-0.244253,-0.210554,-0.463805
3,0.247188,0.432998,-1.183947,1.020377,-0.657725,-0.346662,-0.145546,-0.211798,-0.427027,-0.356506,-0.399348,3.542891,-0.261699,-0.385398,-0.242693,-0.244253,-0.210554,-0.463805
4,0.511712,-0.007970,1.456773,0.406747,-0.657725,-0.346662,-0.145546,-0.211798,-0.427027,-0.356506,-0.399348,-0.282255,-0.261699,2.594718,-0.242693,-0.244253,-0.210554,-0.463805
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245125,0.600953,0.416330,1.075493,-0.513697,0.457853,-0.346662,-0.145546,-0.211798,-0.427027,-0.356506,-0.399348,-0.282255,-0.261699,2.594718,-0.242693,-0.244253,-0.210554,-0.463805
245126,0.816264,0.042226,-1.049793,1.634007,1.573431,-0.346662,-0.145546,-0.211798,-0.427027,-0.356506,-0.399348,-0.282255,-0.261699,2.594718,-0.242693,-0.244253,-0.210554,-0.463805
245127,1.383108,0.847339,0.440026,0.406747,-0.179620,2.884656,-0.145546,-0.211798,-0.427027,-0.356506,-0.399348,-0.282255,-0.261699,-0.385398,-0.242693,-0.244253,-0.210554,-0.463805
245128,1.171170,-2.402466,0.397661,0.713562,0.139117,-0.346662,-0.145546,-0.211798,-0.427027,-0.356506,-0.399348,-0.282255,3.821185,-0.385398,-0.242693,-0.244253,-0.210554,-0.463805


In [58]:
#feature scaling on validation_set with StandardScaler
scaled_validation_set = scaler.fit_transform(validation_set)
scaled_validation_set = pd.DataFrame(data=scaled_validation_set, columns=['Lat_y','Long_y','cluster_no','MONTH','HOUR','DIS_A1','DIS_A15','DIS_A7','DIS_B2','DIS_B3', 'DIS_C11', 'DIS_C6', 'DIS_D14', 'DIS_D4', 'DIS_E13', 'DIS_E18', 'DIS_E5', 'important'])
scaled_validation_set = scaled_validation_set.drop('important', axis=1)
scaled_validation_set

Unnamed: 0,Lat_y,Long_y,cluster_no,MONTH,HOUR,DIS_A1,DIS_A15,DIS_A7,DIS_B2,DIS_B3,DIS_C11,DIS_C6,DIS_D14,DIS_D4,DIS_E13,DIS_E18,DIS_E5
0,-1.041913,-1.360552,0.790016,0.714023,-0.493910,-0.348300,-0.141745,-0.209007,-0.432983,-0.358206,-0.398855,-0.279212,-0.260559,-0.387541,-0.241457,-0.238299,4.684385
1,0.270027,-0.328817,0.698408,1.021149,-0.812014,-0.348300,-0.141745,-0.209007,2.309558,-0.358206,-0.398855,-0.279212,-0.260559,-0.387541,-0.241457,-0.238299,-0.213475
2,-0.311042,0.179649,-0.837784,-0.207353,-0.652962,-0.348300,-0.141745,-0.209007,2.309558,-0.358206,-0.398855,-0.279212,-0.260559,-0.387541,-0.241457,-0.238299,-0.213475
3,1.519333,2.019552,-0.858924,0.714023,-0.334858,-0.348300,-0.141745,4.784519,-0.432983,-0.358206,-0.398855,-0.279212,-0.260559,-0.387541,-0.241457,-0.238299,-0.213475
4,1.100900,0.778290,-0.436119,0.714023,-0.334858,2.871086,-0.141745,-0.209007,-0.432983,-0.358206,-0.398855,-0.279212,-0.260559,-0.387541,-0.241457,-0.238299,-0.213475
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61277,0.001867,0.308986,-0.048547,-1.128729,-0.334858,-0.348300,-0.141745,-0.209007,2.309558,-0.358206,-0.398855,-0.279212,-0.260559,-0.387541,-0.241457,-0.238299,-0.213475
61278,-0.863363,-0.225558,-0.154249,-1.742980,-0.175806,-0.348300,-0.141745,-0.209007,-0.432983,2.791687,-0.398855,-0.279212,-0.260559,-0.387541,-0.241457,-0.238299,-0.213475
61279,-0.573346,0.253047,-1.267635,0.406898,0.778508,-0.348300,-0.141745,-0.209007,-0.432983,-0.358206,2.507177,-0.279212,-0.260559,-0.387541,-0.241457,-0.238299,-0.213475
61280,0.758570,-2.079482,-1.655207,1.021149,1.096612,-0.348300,-0.141745,-0.209007,-0.432983,-0.358206,-0.398855,-0.279212,3.837899,-0.387541,-0.241457,-0.238299,-0.213475


In [62]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

def classification_metrics(y_test, predict):
    print("Confusion Matrix:\n")
    print(confusion_matrix(y_test, predict))
    print("\nAccuracy: ", accuracy_score(y_test, predict))
    print("\nClassification report:\n")
    print(classification_report(y_test, predict))

In [63]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(train_scaled.drop(columns=["important"]), train_set["important"], random_state = 0, test_size = 0.33, shuffle=True)  

In [65]:
# Scale Pos Weight Parameter Search
from lightgbm import LGBMClassifier
scale_pos_weight = [1,2,3,4,5,6,7,8,9,10]
for i in scale_pos_weight:
    print('scale_pos_weight = {}: '.format(i))
    clf = LGBMClassifier(scale_pos_weight=i)
    clf.fit(x_train, y_train)
    predict = clf.predict(x_test)
    proba = clf.predict_proba(x_test)
    cm = confusion_matrix(y_test, predict)
    print(accuracy_score(y_test, predict))
    print('Confusion Matrix: \n', cm)
    print('--------------------')
    print('No : %', round(100 * (cm[0][0]/(cm[0][0]+cm[0][1]))))
    print('Yes : %', round(100* (cm[1][1]/(cm[1][0]+cm[1][1]))))
    print('---------------------------------------------------------')

scale_pos_weight = 1: 
0.8241875069536301
Confusion Matrix: 
 [[66160   349]
 [13873   511]]
--------------------
No : % 99.0
Yes : % 4.0
---------------------------------------------------------
scale_pos_weight = 2: 
0.8192921513604391
Confusion Matrix: 
 [[64659  1850]
 [12768  1616]]
--------------------
No : % 97.0
Yes : % 11.0
---------------------------------------------------------
scale_pos_weight = 3: 
0.7975350153907013
Confusion Matrix: 
 [[61694  4815]
 [11563  2821]]
--------------------
No : % 93.0
Yes : % 20.0
---------------------------------------------------------
scale_pos_weight = 4: 
0.7338459446429234
Confusion Matrix: 
 [[54379 12130]
 [ 9400  4984]]
--------------------
No : % 82.0
Yes : % 35.0
---------------------------------------------------------
scale_pos_weight = 5: 
0.6121419653122026
Confusion Matrix: 
 [[41621 24888]
 [ 6487  7897]]
--------------------
No : % 63.0
Yes : % 55.0
---------------------------------------------------------
scale_pos_weight

### LGBM Classifier

### Test Set Results

In [66]:
lgbm = LGBMClassifier(scale_pos_weight = 5, random_state=0)
lgbm.fit(x_train, y_train)
lgbm_pred = lgbm.predict(x_test)

classification_metrics(y_test, lgbm_pred)

Confusion Matrix:

[[41621 24888]
 [ 6487  7897]]

Accuracy:  0.6121419653122026

Classification report:

              precision    recall  f1-score   support

           0       0.87      0.63      0.73     66509
           1       0.24      0.55      0.33     14384

    accuracy                           0.61     80893
   macro avg       0.55      0.59      0.53     80893
weighted avg       0.75      0.61      0.66     80893



### Validation Set Results

In [67]:
predict = pd.DataFrame(data=lgbm.predict(scaled_validation_set))
lgbm_accuracy = classification_metrics(groundTruth, predict)
print('lgbm_accuracy:', lgbm_accuracy)

Confusion Matrix:

[[31880 18600]
 [ 5111  5691]]

Accuracy:  0.6130837766391436

Classification report:

              precision    recall  f1-score   support

           0       0.86      0.63      0.73     50480
           1       0.23      0.53      0.32     10802

    accuracy                           0.61     61282
   macro avg       0.55      0.58      0.53     61282
weighted avg       0.75      0.61      0.66     61282

lgbm_accuracy: None
