In [1]:
%matplotlib inline
import requests
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.signal import savgol_filter
import csv

# Data Cleanning

## NYPD Complaints

In [30]:
nypd_complaint = pd.read_csv('data/NYPD_Complaint_Data.csv', dtype=str)

# Historic data is oversize that cannot be uploaded with Git Large file System, please download the data from:
# https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Historic/qgea-i56i
nypd_complaint_h = pd.read_csv('data/NYPD_Complaint_Data_Historic.csv', dtype=str)
nypd_complaint_h.head()

Unnamed: 0,CMPLNT_NUM,CMPLNT_FR_DT,CMPLNT_FR_TM,CMPLNT_TO_DT,CMPLNT_TO_TM,ADDR_PCT_CD,RPT_DT,KY_CD,OFNS_DESC,PD_CD,...,SUSP_SEX,TRANSIT_DISTRICT,Latitude,Longitude,Lat_Lon,PATROL_BORO,STATION_NAME,VIC_AGE_GROUP,VIC_RACE,VIC_SEX
0,700381962,05/28/2015,15:00:00,,,46,06/01/2015,578,HARRASSMENT 2,638,...,M,,40.84586773,-73.915888033,"(40.84586773, -73.915888033)",PATROL BORO BRONX,,25-44,WHITE HISPANIC,F
1,642234217,10/28/2013,13:50:00,10/28/2013,13:50:00,120,10/28/2013,351,CRIMINAL MISCHIEF & RELATED OF,259,...,,,40.627060894,-74.077149232,"(40.627060894, -74.077149232)",PATROL BORO STATEN ISLAND,,45-64,WHITE,M
2,242465164,05/09/2012,20:50:00,05/09/2012,21:00:00,24,05/09/2012,236,DANGEROUS WEAPONS,782,...,,,40.800965968,-73.969047272,"(40.800965968, -73.969047272)",PATROL BORO MAN NORTH,,,UNKNOWN,E
3,927207428,01/03/2014,13:30:00,01/03/2014,13:35:00,108,01/03/2014,109,GRAND LARCENY,409,...,M,,40.745241809,-73.894253382,"(40.745241809, -73.894253382)",PATROL BORO QUEENS NORTH,,45-64,ASIAN / PACIFIC ISLANDER,M
4,492142357,04/13/2016,00:00:00,,,40,04/13/2016,351,CRIMINAL MISCHIEF & RELATED OF,258,...,,,40.810351863,-73.924942326,"(40.810351863, -73.924942326)",PATROL BORO BRONX,,UNKNOWN,UNKNOWN,E


In [33]:
nypd_complaint['CMPLNT_FR_DT'] = nypd_complaint['CMPLNT_FR_DT'].fillna('NaN')
nypd_complaint_h['CMPLNT_FR_DT'] = nypd_complaint_h['CMPLNT_FR_DT'].fillna('NaN')

In [35]:
nypd_complaint = nypd_complaint[['CMPLNT_FR_DT', 'LAW_CAT_CD','SUSP_AGE_GROUP', 'SUSP_RACE', 'SUSP_SEX', 'VIC_AGE_GROUP', 'VIC_RACE', 'VIC_SEX', 'Latitude', 'Longitude']]
nypd_complaint = nypd_complaint.rename({'CMPLNT_FR_DT': 'date', 'LAW_CAT_CD': 'level'}, axis='columns').rename(str.lower, axis='columns')

nypd_complaint_h = nypd_complaint_h[['CMPLNT_FR_DT', 'LAW_CAT_CD','SUSP_AGE_GROUP', 'SUSP_RACE', 'SUSP_SEX', 'VIC_AGE_GROUP', 'VIC_RACE', 'VIC_SEX', 'Latitude', 'Longitude']]
nypd_complaint_h = nypd_complaint_h.rename({'CMPLNT_FR_DT': 'date', 'LAW_CAT_CD': 'level'}, axis='columns').rename(str.lower, axis='columns')
nypd_complaint_h

Unnamed: 0,date,level,susp_age_group,susp_race,susp_sex,vic_age_group,vic_race,vic_sex,latitude,longitude
0,05/28/2015,VIOLATION,25-44,BLACK,M,25-44,WHITE HISPANIC,F,40.84586773,-73.915888033
1,10/28/2013,MISDEMEANOR,,,,45-64,WHITE,M,40.627060894,-74.077149232
2,05/09/2012,MISDEMEANOR,,,,,UNKNOWN,E,40.800965968,-73.969047272
3,01/03/2014,FELONY,,UNKNOWN,M,45-64,ASIAN / PACIFIC ISLANDER,M,40.745241809,-73.894253382
4,04/13/2016,MISDEMEANOR,,,,UNKNOWN,UNKNOWN,E,40.810351863,-73.924942326
...,...,...,...,...,...,...,...,...,...,...
6983202,12/02/2018,MISDEMEANOR,25-44,WHITE HISPANIC,M,25-44,ASIAN / PACIFIC ISLANDER,F,40.78504961600004,-73.85685176799996
6983203,01/20/2018,MISDEMEANOR,25-44,BLACK,F,25-44,BLACK,M,,
6983204,08/03/2018,FELONY,,,,25-44,BLACK HISPANIC,F,40.814612305000026,-73.90363724699995
6983205,12/10/2018,MISDEMEANOR,UNKNOWN,UNKNOWN,U,18-24,ASIAN / PACIFIC ISLANDER,M,,


### we are going to compare the data between 2019 and 2020, therefore, we need to remove other data

In [36]:
nypd_complaint_2019 = nypd_complaint_h[nypd_complaint_h['date'].str.contains('2019')].reset_index(drop=True)
nypd_complaint_2019

Unnamed: 0,date,level,susp_age_group,susp_race,susp_sex,vic_age_group,vic_race,vic_sex,latitude,longitude
0,01/01/2019,FELONY,UNKNOWN,UNKNOWN,M,18-24,BLACK HISPANIC,F,40.887451313000042,-73.847607786999959
1,01/01/2019,FELONY,45-64,BLACK,M,UNKNOWN,UNKNOWN,E,40.701527648000081,-73.943227361999959
2,01/01/2019,FELONY,,,,18-24,WHITE,M,40.72127357100004,-73.99359794999998
3,01/01/2019,VIOLATION,45-64,BLACK,M,25-44,BLACK,F,40.809845604000031,-73.936607355999968
4,01/02/2019,FELONY,25-44,BLACK,M,25-44,BLACK,M,40.790850244000069,-73.97456654299998
...,...,...,...,...,...,...,...,...,...,...
450971,12/22/2019,MISDEMEANOR,UNKNOWN,UNKNOWN,U,25-44,WHITE,M,40.718183078000038,-73.995975546999944
450972,12/30/2019,MISDEMEANOR,25-44,BLACK HISPANIC,M,25-44,WHITE HISPANIC,F,40.825063990000046,-73.877397260999942
450973,12/27/2019,MISDEMEANOR,25-44,WHITE,M,UNKNOWN,UNKNOWN,D,40.723809613000071,-73.991804658999968
450974,12/29/2019,MISDEMEANOR,25-44,WHITE,F,UNKNOWN,UNKNOWN,D,40.752387917000078,-73.973274663999973


In [40]:
nypd_complaint_2020 = nypd_complaint[nypd_complaint['date'].str.contains('2020')].reset_index(drop=True)
nypd_complaint_2020

Unnamed: 0,date,level,susp_age_group,susp_race,susp_sex,vic_age_group,vic_race,vic_sex,latitude,longitude
0,12/23/2020,FELONY,,,,18-24,BLACK,M,40.62576896100006,-73.99141682199996
1,12/21/2020,FELONY,,,,25-44,BLACK,M,40.67458330800008,-73.93022154099998
2,11/22/2020,FELONY,UNKNOWN,UNKNOWN,U,25-44,BLACK,F,40.82310129900002,-73.86969046099993
3,11/22/2020,FELONY,25-44,BLACK,M,25-44,BLACK,F,40.88745131300004,-73.84760778699997
4,11/21/2020,FELONY,,,,18-24,BLACK HISPANIC,M,40.80022202900005,-73.93084834199995
...,...,...,...,...,...,...,...,...,...,...
404887,01/04/2020,MISDEMEANOR,UNKNOWN,WHITE HISPANIC,M,UNKNOWN,UNKNOWN,D,40.74134137300007,-73.97839260899997
404888,01/02/2020,FELONY,UNKNOWN,UNKNOWN,U,45-64,ASIAN / PACIFIC ISLANDER,M,40.68871610400004,-73.82636559499997
404889,01/02/2020,FELONY,25-44,BLACK,M,UNKNOWN,UNKNOWN,D,40.78994739900003,-73.97535415699997
404890,01/05/2020,MISDEMEANOR,45-64,WHITE HISPANIC,M,UNKNOWN,UNKNOWN,E,40.88233829700005,-73.89165215599996


### We can see that there are lots of NaN values, we can fill them with 'UNKNOWN' instead

In [41]:
nypd_complaint_2019 = nypd_complaint_2019.fillna('UNKNOWN')
nypd_complaint_2020 = nypd_complaint_2020.fillna('UNKNOWN')
nypd_complaint_2020

Unnamed: 0,date,level,susp_age_group,susp_race,susp_sex,vic_age_group,vic_race,vic_sex,latitude,longitude
0,12/23/2020,FELONY,UNKNOWN,UNKNOWN,UNKNOWN,18-24,BLACK,M,40.62576896100006,-73.99141682199996
1,12/21/2020,FELONY,UNKNOWN,UNKNOWN,UNKNOWN,25-44,BLACK,M,40.67458330800008,-73.93022154099998
2,11/22/2020,FELONY,UNKNOWN,UNKNOWN,U,25-44,BLACK,F,40.82310129900002,-73.86969046099993
3,11/22/2020,FELONY,25-44,BLACK,M,25-44,BLACK,F,40.88745131300004,-73.84760778699997
4,11/21/2020,FELONY,UNKNOWN,UNKNOWN,UNKNOWN,18-24,BLACK HISPANIC,M,40.80022202900005,-73.93084834199995
...,...,...,...,...,...,...,...,...,...,...
404887,01/04/2020,MISDEMEANOR,UNKNOWN,WHITE HISPANIC,M,UNKNOWN,UNKNOWN,D,40.74134137300007,-73.97839260899997
404888,01/02/2020,FELONY,UNKNOWN,UNKNOWN,U,45-64,ASIAN / PACIFIC ISLANDER,M,40.68871610400004,-73.82636559499997
404889,01/02/2020,FELONY,25-44,BLACK,M,UNKNOWN,UNKNOWN,D,40.78994739900003,-73.97535415699997
404890,01/05/2020,MISDEMEANOR,45-64,WHITE HISPANIC,M,UNKNOWN,UNKNOWN,E,40.88233829700005,-73.89165215599996


In [42]:
nypd_complaint_2020['level'].value_counts().rename_axis('level').reset_index(name='counts')

Unnamed: 0,level,counts
0,MISDEMEANOR,207627
1,FELONY,130794
2,VIOLATION,66471


### check age groups

In [43]:
nypd_complaint_2020['susp_age_group'].value_counts().rename_axis('susp_age_group').reset_index(name='counts')

Unnamed: 0,susp_age_group,counts
0,UNKNOWN,234081
1,25-44,98792
2,45-64,33155
3,18-24,29185
4,<18,6463
5,65+,3196
6,2020,10
7,2019,2
8,-977,1
9,-965,1


### we can see that there are some unreasonable age groups, since it is difficult to understand its original meanings, we decide to remove all these rows.

In [44]:
nypd_complaint_2020 = nypd_complaint_2020[~nypd_complaint_2020['susp_age_group'].isin(['2020', '2019', '-962', '-12', '-71', '1925', '-965', '-942', '-977', '1020'])]
nypd_complaint_2020['susp_age_group'].value_counts().rename_axis('susp_age_group').reset_index(name='counts')


Unnamed: 0,susp_age_group,counts
0,UNKNOWN,234081
1,25-44,98792
2,45-64,33155
3,18-24,29185
4,<18,6463
5,65+,3196


In [45]:
nypd_complaint_2020['susp_race'].value_counts().rename_axis('susp_race').reset_index(name='counts')

Unnamed: 0,susp_race,counts
0,UNKNOWN,185806
1,BLACK,113846
2,WHITE HISPANIC,49784
3,WHITE,28476
4,BLACK HISPANIC,15720
5,ASIAN / PACIFIC ISLANDER,10633
6,AMERICAN INDIAN/ALASKAN NATIVE,607


In [46]:
nypd_complaint_2020['susp_sex'].value_counts().rename_axis('susp_sex').reset_index(name='counts')

Unnamed: 0,susp_sex,counts
0,M,183152
1,UNKNOWN,92098
2,U,81033
3,F,48589


### We can see that 'U' should represent 'UNKNOWN', so we can merge these two types to 'Others'.

In [47]:
nypd_complaint_2020 = nypd_complaint_2020.replace({'susp_sex': {'U': 'Others', 'UNKNOWN': 'Others'}})
nypd_complaint_2020['susp_sex'].value_counts().rename_axis('susp_sex').reset_index(name='counts')

Unnamed: 0,susp_sex,counts
0,M,183152
1,Others,173131
2,F,48589


### We are going to do the same thing for vic's data

In [48]:
nypd_complaint_2020['vic_age_group'].value_counts().rename_axis('vic_age_group').reset_index(name='counts')

Unnamed: 0,vic_age_group,counts
0,25-44,156856
1,UNKNOWN,98647
2,45-64,82828
3,18-24,36948
4,65+,18288
5,<18,11284
6,-948,2
7,-968,1
8,1014,1
9,950,1


In [49]:
nypd_complaint_2020 = nypd_complaint_2020[nypd_complaint_2020['vic_age_group'].isin(['25-44', 'UNKNOWN', '45-64', '18-24', '65+', '<18'])]
nypd_complaint_2020['vic_age_group'].value_counts().rename_axis('vic_age_group').reset_index(name='counts')


Unnamed: 0,vic_age_group,counts
0,25-44,156856
1,UNKNOWN,98647
2,45-64,82828
3,18-24,36948
4,65+,18288
5,<18,11284


In [50]:
nypd_complaint_2020['vic_race'].value_counts().rename_axis('vic_race').reset_index(name='counts')

Unnamed: 0,vic_race,counts
0,UNKNOWN,107825
1,BLACK,107637
2,WHITE HISPANIC,74680
3,WHITE,64170
4,ASIAN / PACIFIC ISLANDER,31621
5,BLACK HISPANIC,17619
6,AMERICAN INDIAN/ALASKAN NATIVE,1299


In [51]:
nypd_complaint_2020['vic_sex'].value_counts().rename_axis('vic_sex').reset_index(name='counts')

Unnamed: 0,vic_sex,counts
0,F,160060
1,M,149569
2,D,62317
3,E,32904
4,UNKNOWN,1


### Since we have no idea what is sex D or E, we decided to mark them as 'Others'.

In [52]:
nypd_complaint_2020 = nypd_complaint_2020.replace({'vic_sex': {'D': 'Others', 'E': 'Others', 'UNKNOWN': 'Others'}})
nypd_complaint_2020['vic_sex'].value_counts().rename_axis('vic_sex').reset_index(name='counts')

Unnamed: 0,vic_sex,counts
0,F,160060
1,M,149569
2,Others,95222


In [53]:
nypd_complaint_2020

Unnamed: 0,date,level,susp_age_group,susp_race,susp_sex,vic_age_group,vic_race,vic_sex,latitude,longitude
0,12/23/2020,FELONY,UNKNOWN,UNKNOWN,Others,18-24,BLACK,M,40.62576896100006,-73.99141682199996
1,12/21/2020,FELONY,UNKNOWN,UNKNOWN,Others,25-44,BLACK,M,40.67458330800008,-73.93022154099998
2,11/22/2020,FELONY,UNKNOWN,UNKNOWN,Others,25-44,BLACK,F,40.82310129900002,-73.86969046099993
3,11/22/2020,FELONY,25-44,BLACK,M,25-44,BLACK,F,40.88745131300004,-73.84760778699997
4,11/21/2020,FELONY,UNKNOWN,UNKNOWN,Others,18-24,BLACK HISPANIC,M,40.80022202900005,-73.93084834199995
...,...,...,...,...,...,...,...,...,...,...
404887,01/04/2020,MISDEMEANOR,UNKNOWN,WHITE HISPANIC,M,UNKNOWN,UNKNOWN,Others,40.74134137300007,-73.97839260899997
404888,01/02/2020,FELONY,UNKNOWN,UNKNOWN,Others,45-64,ASIAN / PACIFIC ISLANDER,M,40.68871610400004,-73.82636559499997
404889,01/02/2020,FELONY,25-44,BLACK,M,UNKNOWN,UNKNOWN,Others,40.78994739900003,-73.97535415699997
404890,01/05/2020,MISDEMEANOR,45-64,WHITE HISPANIC,M,UNKNOWN,UNKNOWN,Others,40.88233829700005,-73.89165215599996


### And we are going to do the same steps for 2019 complaints

In [54]:
nypd_complaint_2019['level'].value_counts().rename_axis('level').reset_index(name='counts')

Unnamed: 0,level,counts
0,MISDEMEANOR,241775
1,FELONY,137569
2,VIOLATION,71632


In [55]:
nypd_complaint_2019['susp_age_group'].value_counts().rename_axis('susp_age_group').reset_index(name='counts')

Unnamed: 0,susp_age_group,counts
0,UNKNOWN,238857
1,25-44,115618
2,45-64,42259
3,18-24,38397
4,<18,11847
5,65+,3971
6,2019,8
7,929,2
8,-966,2
9,-1,2


In [56]:
nypd_complaint_2019 = nypd_complaint_2019[nypd_complaint_2019['susp_age_group'] != '-928']
nypd_complaint_2019['susp_age_group'].value_counts().rename_axis('vic_age_group').reset_index(name='counts')


Unnamed: 0,vic_age_group,counts
0,UNKNOWN,238857
1,25-44,115618
2,45-64,42259
3,18-24,38397
4,<18,11847
5,65+,3971
6,2019,8
7,929,2
8,-966,2
9,-1,2


In [57]:
nypd_complaint_2019['susp_race'].value_counts().rename_axis('susp_race').reset_index(name='counts')

Unnamed: 0,susp_race,counts
0,UNKNOWN,192454
1,BLACK,131216
2,WHITE HISPANIC,59810
3,WHITE,34102
4,BLACK HISPANIC,19173
5,ASIAN / PACIFIC ISLANDER,13140
6,AMERICAN INDIAN/ALASKAN NATIVE,1081


In [58]:
nypd_complaint_2019['susp_sex'].value_counts().rename_axis('susp_sex').reset_index(name='counts')

Unnamed: 0,susp_sex,counts
0,M,209109
1,UNKNOWN,105688
2,U,72378
3,F,63801


In [59]:
nypd_complaint_2019 = nypd_complaint_2019.replace({'susp_sex': {'U': 'Others','UNKNOWN': 'Others'}})
nypd_complaint_2019['susp_sex'].value_counts().rename_axis('vic_sex').reset_index(name='counts')

Unnamed: 0,vic_sex,counts
0,M,209109
1,Others,178066
2,F,63801


In [60]:
nypd_complaint_2019['vic_age_group'].value_counts().rename_axis('vic_age_group').reset_index(name='counts')

Unnamed: 0,vic_age_group,counts
0,25-44,160183
1,UNKNOWN,124987
2,45-64,84064
3,18-24,43453
4,<18,19163
5,65+,19086
6,-56,3
7,936,3
8,-2,2
9,-967,2


In [61]:
nypd_complaint_2019['vic_race'].value_counts().rename_axis('vic_race').reset_index(name='counts')

Unnamed: 0,vic_race,counts
0,UNKNOWN,133585
1,BLACK,113960
2,WHITE HISPANIC,77877
3,WHITE,70428
4,ASIAN / PACIFIC ISLANDER,33968
5,BLACK HISPANIC,18526
6,AMERICAN INDIAN/ALASKAN NATIVE,2632


In [62]:
nypd_complaint_2019['vic_sex'].value_counts().rename_axis('vic_sex').reset_index(name='counts')

Unnamed: 0,vic_sex,counts
0,F,178553
1,M,158496
2,D,64830
3,E,49094
4,UNKNOWN,3


In [63]:
nypd_complaint_2019 = nypd_complaint_2019.replace({'susp_sex': {'D': 'Others','E': 'Others'}})
nypd_complaint_2019['susp_sex'].value_counts().rename_axis('vic_sex').reset_index(name='counts')

Unnamed: 0,vic_sex,counts
0,M,209109
1,Others,178066
2,F,63801


In [64]:
nypd_complaint_2019

Unnamed: 0,date,level,susp_age_group,susp_race,susp_sex,vic_age_group,vic_race,vic_sex,latitude,longitude
0,01/01/2019,FELONY,UNKNOWN,UNKNOWN,M,18-24,BLACK HISPANIC,F,40.887451313000042,-73.847607786999959
1,01/01/2019,FELONY,45-64,BLACK,M,UNKNOWN,UNKNOWN,E,40.701527648000081,-73.943227361999959
2,01/01/2019,FELONY,UNKNOWN,UNKNOWN,Others,18-24,WHITE,M,40.72127357100004,-73.99359794999998
3,01/01/2019,VIOLATION,45-64,BLACK,M,25-44,BLACK,F,40.809845604000031,-73.936607355999968
4,01/02/2019,FELONY,25-44,BLACK,M,25-44,BLACK,M,40.790850244000069,-73.97456654299998
...,...,...,...,...,...,...,...,...,...,...
450971,12/22/2019,MISDEMEANOR,UNKNOWN,UNKNOWN,Others,25-44,WHITE,M,40.718183078000038,-73.995975546999944
450972,12/30/2019,MISDEMEANOR,25-44,BLACK HISPANIC,M,25-44,WHITE HISPANIC,F,40.825063990000046,-73.877397260999942
450973,12/27/2019,MISDEMEANOR,25-44,WHITE,M,UNKNOWN,UNKNOWN,D,40.723809613000071,-73.991804658999968
450974,12/29/2019,MISDEMEANOR,25-44,WHITE,F,UNKNOWN,UNKNOWN,D,40.752387917000078,-73.973274663999973


In [65]:
nypd_complaint_2020.to_csv('data/nypd_complaint_2020_cleaned.csv')
nypd_complaint_2019.to_csv('data/nypd_complaint_2019_cleaned.csv')