# This notebook 

- preprocess

- clean

- save data into pickle files

Package:

- `pandas`

- `scipy`

- `seaborn`

# Data resources : 


- [Crash dataset](https://data.cityofchicago.org/Transportation/Traffic-Crashes-Crashes/85ca-t3if/data)



- [Crime dataset](https://data.cityofchicago.org/Public-Safety/Crimes-One-year-prior-to-present/x2n5-8w5q/data)



- [Ridesharing dataset](https://data.cityofchicago.org/Transportation/Transportation-Network-Providers-Trips/m6dm-c72p)


- [Population dataset (updated as of July 11, 2019)](https://simplemaps.com/data/us-zips)



- [Pothole dataset](https://data.cityofchicago.org/Transportation/Potholes-Patched-Previous-Seven-Days/caad-5j9e?referrer=embed)



- [Pothole map](https://www.chicago.gov/city/en/depts/cdot/dataset/potholetracker.html)





In [52]:
# Necessary imports
import pandas as pd
import numpy as np
import datetime
import math
from scipy import spatial
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [53]:
!ls

collect_data.py                p2_gis_preprocess.ipynb
[34mdataset[m[m                        p2_map.ipynb
[34mgis_dataset[m[m                    p2_model.ipynb
[34mimages[m[m                         [31mp2_presentation.key[m[m
[34mmap[m[m                            p2_presentation.pdf
p2_chicago_car_eda.ipynb       p2_presentation.pptx
p2_chicago_car_model.ipynb     project2.ipynb
p2_crash_data_preprocess.ipynb visualization.ipynb


In [54]:
!ls ./dataset

Array_of_Things_Locations.csv          chicago_crash.pkl
Crimes_-_One_year_prior_to_present.csv chicago_population.pkl
Potholes_Patched.csv                   chicago_potholes.pkl
Potholes_Patched_2018.csv              chicago_traffic_crime.pkl
Traffic_Crashes_-_Crashes.csv          chicago_zipcode.pkl
Traffic_Crashes_-_Crashes_2018.csv     input.pkl
addr_0000_1000.plk                     input_test.pkl
addr_10000_50000.plk                   intersections_per_zipcode.pkl
addr_1000_5000.plk                     intersections_with_zipcode.pkl
addr_50000_80000.plk                   [34msimplemaps_uszips_basicv1[m[m
addr_5000_10000.plk                    [31muszips.csv[m[m
chicago_boundaries_zipcodes.geojson    zipcodes.geojson


# I. Helper function

In [55]:
def to_Cartesian(lat, lon):
    '''
    function to convert latitude and longitude to 3D cartesian coordinates
    '''
    R = 6371 # radius of the Earth in kilometers

    x = R * math.cos(lat) * math.cos(lon)
    y = R * math.cos(lat) * math.sin(lon)
    z = R * math.sin(lat)
    return x, y, z

def deg2rad(degree):
    '''
    function to convert degree to radian
    '''
    rad = degree * 2*np.pi / 360
    return rad

def rad2deg(rad):
    '''
    function to convert radian to degree
    '''
    degree = rad/2/np.pi * 360
    return degree

def distToKM(x):
    '''
    function to convert cartesian distance to real distance in km
    '''
    R = 6371 # earth radius
    gamma = 2*np.arcsin(deg2rad(x/(2*R))) # compute the angle of the isosceles triangle
    dist = 2*R*math.sin(gamma/2) # compute the side of the triangle
    return dist

def kmToDIST(x):
    '''
    function to convert real distance in km to cartesian distance 
    '''
    R = 6371 # earth radius
    gamma = 2*np.arcsin(x/2./R) 
    
    dist = 2*R*rad2deg(sin(gamma / 2.))
    return dist


In [56]:
def interfer_zipcode_from_distance(df, lat_ref, lng_ref , n_closest=5):
    '''
    function to estimate the zipcode of each lat/lgn in df by calculating 
    it's Euclide distance to every reference lat/lgn in lat_ref/lgn_ref,
    return top 5 closest ref_lat/ref_lgn 
    '''
    lats_1d = df['lat'].values
    lons_1d = df['lng'].values

    # convert the grid points and reference points to cartesian coordinates
    x, y, z = zip(*map(to_Cartesian, lats_1d, lons_1d))
    x_ref, y_ref, z_ref = to_Cartesian(lat_ref, lng_ref)
    
    # create the KD-tree using the 3D cartesian coordinates
    coordinates = list(zip(x, y, z))
    tree = spatial.cKDTree(coordinates)

    # get the cartesian distances from the 10 closest points
    dist, ix = tree.query((x_ref, y_ref, z_ref), n_closest)

    # print out the real distances in km
    #print(list(map(distToKM, dist)))
    
    #return dist,ix
    return df['zipcode'].iloc[ix[0]]

In [57]:
# start and end date to select data
start_date = datetime.datetime(2018, 7, 9)
end_date = datetime.datetime(2019, 7, 7)
start_date, end_date

(datetime.datetime(2018, 7, 9, 0, 0), datetime.datetime(2019, 7, 7, 0, 0))

# II. Preprocessing data

## 1. Collect  population and zipcode from Chicago

In [58]:
#  Data updated as of July 11, 2019.
# data source : https://simplemaps.com/data/us-zips

uszipcode = pd.read_csv('./dataset/uszips.csv')
print(uszipcode.columns)
uszipcode = uszipcode.rename(columns={'zip' : 'zipcode'})
uszipcode.head(3)

Index(['zip', 'lat', 'lng', 'city', 'state_id', 'state_name', 'zcta',
       'parent_zcta', 'population', 'density', 'county_fips', 'county_name',
       'all_county_weights', 'imprecise', 'military', 'timezone'],
      dtype='object')


Unnamed: 0,zipcode,lat,lng,city,state_id,state_name,zcta,parent_zcta,population,density,county_fips,county_name,all_county_weights,imprecise,military,timezone
0,601,18.18004,-66.75218,Adjuntas,PR,Puerto Rico,True,,18570,111.4,72001,Adjuntas,"{'72001':99.43,'72141':0.57}",False,False,America/Puerto_Rico
1,602,18.36073,-67.17517,Aguada,PR,Puerto Rico,True,,41520,523.5,72003,Aguada,{'72003':100},False,False,America/Puerto_Rico
2,603,18.45439,-67.12202,Aguadilla,PR,Puerto Rico,True,,54689,667.9,72005,Aguadilla,{'72005':100},False,False,America/Puerto_Rico


#### Extract population from Chicago ,Illinois

In [59]:
cook_zipcode = uszipcode[(uszipcode['state_name'] == 'Illinois') &  (uszipcode['county_name'] == 'Cook')]

print('Cook county has {} rows'.format(len(cook_zipcode['zipcode'])))
print('Cook county has {} zipcodes'.format(len(cook_zipcode['zipcode'].unique())))

cook_zipcode['zipcode_location'] = list(zip(cook_zipcode['lat'], cook_zipcode['lng']))


cook_zipcode[cook_zipcode['zipcode'] == 60640]

Cook county has 163 rows
Cook county has 163 zipcodes


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,zipcode,lat,lng,city,state_id,state_name,zcta,parent_zcta,population,density,county_fips,county_name,all_county_weights,imprecise,military,timezone,zipcode_location
20719,60640,41.97236,-87.66347,Chicago,IL,Illinois,True,,65790,10530.6,17031,Cook,{'17031':100},False,False,America/Chicago,"(41.972359999999995, -87.66346999999999)"


In [60]:
chicago_population = cook_zipcode[['zipcode','lat', 'lng', 'zipcode_location', 'city', 'state_name', 'population', 'density']]
chicago_population.to_pickle('./dataset/chicago_population.pkl')

chicago_population[chicago_population['zipcode'] == 60657]

Unnamed: 0,zipcode,lat,lng,zipcode_location,city,state_name,population,density
20734,60657,41.93998,-87.65374,"(41.93998, -87.65374)",Chicago,Illinois,65996,11207.9


#### Extract zipcodes from Chicago

In [61]:
chicago_zipcode =  cook_zipcode[['zipcode','lat', 'lng']]
chicago_zipcode.to_pickle('./dataset/chicago_zipcode.pkl')

chicago_zipcode.head()

Unnamed: 0,zipcode,lat,lng
20407,60004,42.11201,-87.97917
20408,60005,42.06394,-87.98568
20409,60007,42.0076,-87.99298
20410,60008,42.07441,-88.02265
20416,60016,42.04972,-87.89166


## 2. Collect potholes data in Chicago 

In [62]:
potholes = pd.read_csv('./dataset/Potholes_Patched.csv')
print(len(potholes))
print(potholes.columns)
potholes = potholes.rename(columns={'NUMBER OF POTHOLES FILLED ON BLOCK' : 'num_potholes'})
potholes.head(3)
#potholes.info()

39139
Index(['ADDRESS', 'REQUEST DATE', 'COMPLETION DATE',
       'NUMBER OF POTHOLES FILLED ON BLOCK', 'LATITUDE', 'LONGITUDE',
       'LOCATION', 'Boundaries - ZIP Codes', 'Community Areas', 'Zip Codes',
       'Census Tracts', 'Wards'],
      dtype='object')


Unnamed: 0,ADDRESS,REQUEST DATE,COMPLETION DATE,num_potholes,LATITUDE,LONGITUDE,LOCATION,Boundaries - ZIP Codes,Community Areas,Zip Codes,Census Tracts,Wards
0,5228 W SCHOOL ST,05/18/2019 09:13:52 AM,07/12/2019 03:30:53 PM,5,41.940765,-87.757937,POINT (-87.757936560398 41.940765068641),21.0,15.0,22618.0,361.0,17.0
1,4814 W DIVISION ST,07/12/2019 03:29:33 PM,07/12/2019 03:30:14 PM,9,41.902617,-87.746663,POINT (-87.746663149052 41.902616939108),5.0,26.0,4299.0,62.0,45.0
2,5211 W HENDERSON ST,06/14/2019 03:24:54 PM,07/12/2019 03:29:07 PM,1,41.941214,-87.75729,POINT (-87.757289794412 41.941213642538),21.0,15.0,22618.0,361.0,17.0


In [63]:
# sort df by date
potholes['REQUEST DATE'] = pd.to_datetime(potholes['REQUEST DATE'])
potholes['COMPLETION DATE'] = pd.to_datetime(potholes['COMPLETION DATE'])
potholes = potholes.sort_values(['REQUEST DATE'])

potholes.head(3)

Unnamed: 0,ADDRESS,REQUEST DATE,COMPLETION DATE,num_potholes,LATITUDE,LONGITUDE,LOCATION,Boundaries - ZIP Codes,Community Areas,Zip Codes,Census Tracts,Wards
32431,5214 N LOVEJOY AVE,2018-01-19 20:58:00,2018-11-08 10:48:35,1,,,,,,,,
30161,330 W GOETHE Street,2018-01-31 13:11:12,2018-12-26 12:40:18,2,41.905862,-87.637595,POINT (-87.637595490904 41.905862218497),54.0,37.0,14926.0,290.0,46.0
36305,2413 N SB OUTER LAKE SHORE DR,2018-02-22 19:37:49,2018-08-16 07:27:14,1,,,,,,,,


In [64]:
# create new columns for duration of potholed and month
potholes['duration'] = potholes['COMPLETION DATE'] - potholes['REQUEST DATE']
potholes['month'] = potholes['REQUEST DATE'].dt.month

### 2.a Select data with start/end date period

In [65]:
potholes = potholes[(potholes['REQUEST DATE'] >= start_date) &(potholes['REQUEST DATE'] <= end_date)]
potholes.head(3)

Unnamed: 0,ADDRESS,REQUEST DATE,COMPLETION DATE,num_potholes,LATITUDE,LONGITUDE,LOCATION,Boundaries - ZIP Codes,Community Areas,Zip Codes,Census Tracts,Wards,duration,month
36866,6089 W DIVERSEY AVE,2018-07-09 06:44:15,2018-08-03 14:24:17,8,41.931035,-87.778138,POINT (-87.778137730383 41.931035160324),2.0,19.0,22615.0,78.0,21.0,25 days 07:40:02,7
30985,3921 W WILCOX ST,2018-07-09 06:44:21,2018-12-06 11:18:49,4,,,,,,,,,150 days 04:34:28,7
38655,10904 S AVENUE E,2018-07-09 07:00:02,2018-07-13 13:36:26,1,,,,,,,,,4 days 06:36:24,7


### 2.b Estimate the zipcode of each pothole by its lat/lgn

In [66]:
# select non-nan and non-zero values in LAT/LGN
potholes = potholes[(potholes['LATITUDE'].notnull()) & (potholes['LONGITUDE'].notnull())]
potholes = potholes.loc[(potholes['LATITUDE'] != 0.0) & (potholes['LONGITUDE'] != 0.0)]


# estimate zipcode of each location from its distance to center location of each zipcode
potholes['location'] = list(zip(potholes['LATITUDE'], potholes['LONGITUDE']))
                                                    
potholes['zipcode'] = potholes['location'].apply(lambda x : \
                                     interfer_zipcode_from_distance( chicago_zipcode , x[0], x[1])) 

print(len(potholes))
potholes.head(3)

30210


Unnamed: 0,ADDRESS,REQUEST DATE,COMPLETION DATE,num_potholes,LATITUDE,LONGITUDE,LOCATION,Boundaries - ZIP Codes,Community Areas,Zip Codes,Census Tracts,Wards,duration,month,location,zipcode
36866,6089 W DIVERSEY AVE,2018-07-09 06:44:15,2018-08-03 14:24:17,8,41.931035,-87.778138,POINT (-87.778137730383 41.931035160324),2.0,19.0,22615.0,78.0,21.0,25 days 07:40:02,7,"(41.93103516, -87.77813773)",60639
27999,4516 W JACKSON Boulevard,2018-07-09 08:17:28,2019-01-18 07:56:33,28,41.876964,-87.738515,POINT (-87.738514926893 41.876963571504),30.0,27.0,21572.0,716.0,23.0,192 days 23:39:05,7,"(41.876963572, -87.73851492700001)",60624
25425,6147 S EVANS Avenue,2018-07-09 10:32:20,2019-02-11 14:00:55,7,41.782683,-87.606964,POINT (-87.606963979375 41.782682981034),60.0,9.0,22260.0,471.0,4.0,217 days 03:28:35,7,"(41.782682981, -87.606963979)",60637


### 2.c Save dataframe into pickle 

In [67]:
potholes.to_pickle('./dataset/chicago_potholes.pkl')

## 3. Collect crime data from Chicago 

In [68]:
chicago_crimes = pd.read_csv('./dataset/Crimes_-_One_year_prior_to_present.csv')

print(len(chicago_crimes))
print(chicago_crimes.columns)

#chicago_crimes.columns = chicago_crimes.columns.str.replace(' ','')
chicago_crimes.columns = [column.strip() for column in chicago_crimes.columns]
print(chicago_crimes.columns)
chicago_crimes.head(3)
#chicago_crimes.info()'

261708
Index(['CASE#', 'DATE  OF OCCURRENCE', 'BLOCK', ' IUCR',
       ' PRIMARY DESCRIPTION', ' SECONDARY DESCRIPTION',
       ' LOCATION DESCRIPTION', 'ARREST', 'DOMESTIC', 'BEAT', 'WARD', 'FBI CD',
       'X COORDINATE', 'Y COORDINATE', 'LATITUDE', 'LONGITUDE', 'LOCATION'],
      dtype='object')
Index(['CASE#', 'DATE  OF OCCURRENCE', 'BLOCK', 'IUCR', 'PRIMARY DESCRIPTION',
       'SECONDARY DESCRIPTION', 'LOCATION DESCRIPTION', 'ARREST', 'DOMESTIC',
       'BEAT', 'WARD', 'FBI CD', 'X COORDINATE', 'Y COORDINATE', 'LATITUDE',
       'LONGITUDE', 'LOCATION'],
      dtype='object')


Unnamed: 0,CASE#,DATE OF OCCURRENCE,BLOCK,IUCR,PRIMARY DESCRIPTION,SECONDARY DESCRIPTION,LOCATION DESCRIPTION,ARREST,DOMESTIC,BEAT,WARD,FBI CD,X COORDINATE,Y COORDINATE,LATITUDE,LONGITUDE,LOCATION
0,JB341693,07/09/2018 05:09:00 AM,006XX N STATE ST,496,BATTERY,AGGRAVATED DOMESTIC BATTERY: KNIFE/CUTTING INST,HOTEL/MOTEL,Y,N,1832,42.0,04B,1176247.0,1904872.0,41.894328,-87.628143,"(41.894327846, -87.62814321)"
1,JB342588,07/09/2018 05:10:00 AM,034XX W 66TH ST,620,BURGLARY,UNLAWFUL ENTRY,RESIDENCE-GARAGE,N,N,831,15.0,05,1154592.0,1860643.0,41.773419,-87.708856,"(41.773418836, -87.708856396)"
2,JB341677,07/09/2018 05:10:00 AM,078XX S SANGAMON ST,560,ASSAULT,SIMPLE,APARTMENT,N,N,621,17.0,08A,1171335.0,1852819.0,41.751599,-87.647709,"(41.75159879, -87.647708896)"


### 3.a Convert date into datetime and check if the data is in start/end date period

In [69]:
# convert date into datetime 
chicago_crimes['DATE  OF OCCURRENCE'] = pd.to_datetime(chicago_crimes['DATE  OF OCCURRENCE'])
chicago_crimes = chicago_crimes.sort_values(['DATE  OF OCCURRENCE'])

In [70]:
chicago_crimes['DATE  OF OCCURRENCE'].min(),chicago_crimes['DATE  OF OCCURRENCE'].max()

(Timestamp('2018-07-09 05:09:00'), Timestamp('2019-07-08 23:58:00'))

In [71]:
# count and keep track crime rate by month 
chicago_crimes['month'] = chicago_crimes['DATE  OF OCCURRENCE'].dt.month

chicago_crimes['num_crimes'] = [1] * len(chicago_crimes) 

### 3.b Estimate the zipcode of each crime incident by its lat/lgn

In [72]:
### extract zipcode from coordinate of crime incident ###

# select non-nan and non-zero values in LAT/LGN
chicago_crimes = chicago_crimes[(chicago_crimes['LATITUDE'].notnull())&(chicago_crimes['LONGITUDE'].notnull())]
chicago_crimes = chicago_crimes.loc[(chicago_crimes['LATITUDE'] != 0.0) & (chicago_crimes['LONGITUDE'] != 0.0)]



# estimate zipcode of each location from its distance to center location of each zipcode
chicago_crimes['location'] = list(zip(chicago_crimes['LATITUDE'], chicago_crimes['LONGITUDE'])) 
                                                    
chicago_crimes['zipcode'] = chicago_crimes['location'].apply(lambda x : \
                                     interfer_zipcode_from_distance( chicago_zipcode, x[0], x[1])) 
chicago_crimes.head(3)

Unnamed: 0,CASE#,DATE OF OCCURRENCE,BLOCK,IUCR,PRIMARY DESCRIPTION,SECONDARY DESCRIPTION,LOCATION DESCRIPTION,ARREST,DOMESTIC,BEAT,...,FBI CD,X COORDINATE,Y COORDINATE,LATITUDE,LONGITUDE,LOCATION,month,num_crimes,location,zipcode
0,JB341693,2018-07-09 05:09:00,006XX N STATE ST,496,BATTERY,AGGRAVATED DOMESTIC BATTERY: KNIFE/CUTTING INST,HOTEL/MOTEL,Y,N,1832,...,04B,1176247.0,1904872.0,41.894328,-87.628143,"(41.894327846, -87.62814321)",7,1,"(41.894327845999996, -87.62814321)",60611
1,JB342588,2018-07-09 05:10:00,034XX W 66TH ST,620,BURGLARY,UNLAWFUL ENTRY,RESIDENCE-GARAGE,N,N,831,...,05,1154592.0,1860643.0,41.773419,-87.708856,"(41.773418836, -87.708856396)",7,1,"(41.773418836, -87.708856396)",60629
2,JB341677,2018-07-09 05:10:00,078XX S SANGAMON ST,560,ASSAULT,SIMPLE,APARTMENT,N,N,621,...,08A,1171335.0,1852819.0,41.751599,-87.647709,"(41.75159879, -87.647708896)",7,1,"(41.75159879, -87.647708896)",60620


In [73]:
print( set(chicago_crimes['LOCATION DESCRIPTION']) )

print(len(set(chicago_crimes['zipcode'])))

# select crimes relating to vehicles
subset = ['HIGHWAY/EXPRESSWAY', 'SIDEWALK', 'STREET', 'CTA BUS', 'CTA BUS STOP',
          'DRIVEWAY - RESIDENTIAL','TAXICAB','VEHICLE - DELIVERY TRUCK',
          'VEHICLE - OTHER RIDE SHARE SERVICE (E.G., UBER, LYFT)','VEHICLE NON-COMMERCIAL',
          'VEHICLE-COMMERCIAL','VEHICLE-COMMERCIAL - ENTERTAINMENT/PARTY BUS',]


chicago_crimes[chicago_crimes['LOCATION DESCRIPTION'] == \
                                'VEHICLE - OTHER RIDE SHARE SERVICE (E.G., UBER, LYFT)']['zipcode'].unique()

{nan, 'RESIDENCE', 'OTHER RAILROAD PROP / TRAIN DEPOT', 'GAS STATION DRIVE/PROP.', 'HOTEL/MOTEL', 'AIRPORT/AIRCRAFT', 'AIRPORT EXTERIOR - SECURE AREA', 'AIRPORT TERMINAL LOWER LEVEL - SECURE AREA', 'BASEMENT', 'FARM', 'CTA STATION', 'CONVENIENCE STORE', 'CTA GARAGE / OTHER PROPERTY', 'CHA PARKING LOT', 'SMALL RETAIL STORE', 'SCHOOL, PUBLIC, BUILDING', 'VEHICLE-COMMERCIAL - ENTERTAINMENT/PARTY BUS', 'WAREHOUSE', 'CTA PLATFORM', 'GAS STATION', 'CTA BUS STOP', 'WOODED AREA', 'GARAGE', 'BANK', 'HOUSE', 'COLLEGE/UNIVERSITY GROUNDS', 'VEHICLE-COMMERCIAL - TROLLEY BUS', 'SIDEWALK', 'CEMETARY', 'AIRPORT TERMINAL MEZZANINE - NON-SECURE AREA', 'DEPARTMENT STORE', 'CAR WASH', 'OTHER', 'NURSING HOME/RETIREMENT HOME', 'AIRPORT BUILDING NON-TERMINAL - NON-SECURE AREA', 'VEHICLE - OTHER RIDE SHARE SERVICE (E.G., UBER, LYFT)', 'DAY CARE CENTER', 'COLLEGE/UNIVERSITY RESIDENCE HALL', 'SCHOOL, PRIVATE, BUILDING', 'AIRPORT TERMINAL LOWER LEVEL - NON-SECURE AREA', 'AIRPORT PARKING LOT', 'BAR OR TAVERN', 'C

array([60610, 60659, 60624, 60608, 60653, 60661, 60620, 60637, 60603,
       60639, 60651, 60632, 60657, 60618, 60644, 60628, 60645, 60707,
       60647, 60642, 60604, 60602, 60601, 60640, 60654, 60605, 60617,
       60613, 60611, 60606, 60626, 60616, 60302, 60623, 60614, 60612,
       60607, 60622, 60636, 60643, 60621, 60638, 60629, 60660, 60018,
       60619, 60615, 60634, 60649, 60641, 60176, 60609])

### 3.c Filter out the vehicle-related crimes and count number of crimes for each zipcode

In [74]:
## aggregate number of crime by month for each zipcode
df_crimes  = chicago_crimes


# select vehicle-related crimes 
subset = ['HIGHWAY/EXPRESSWAY', 'SIDEWALK', 'STREET', 'CTA BUS', 'CTA BUS STOP',
          'DRIVEWAY - RESIDENTIAL','TAXICAB','VEHICLE - DELIVERY TRUCK',
          'VEHICLE - OTHER RIDE SHARE SERVICE (E.G., UBER, LYFT)','VEHICLE NON-COMMERCIAL',
          'VEHICLE-COMMERCIAL','VEHICLE-COMMERCIAL - ENTERTAINMENT/PARTY BUS',]


df_crimes['traffic_crime'] = df_crimes['LOCATION DESCRIPTION'].apply(lambda x : 'traffic' if x \
                                                                     in subset  else 'non traffic')


df_crimes = df_crimes.groupby(['zipcode', 'month'])['traffic_crime'].value_counts().unstack()
df_crimes = df_crimes.fillna(0)
df_crimes  = df_crimes.reset_index()


df_crimes['percent_traffic_related_crime'] = df_crimes[['traffic', \
                                                'non traffic']].apply(lambda x: 1*x[0]/x.sum(), axis=1)

df_crimes['num_crime'] = df_crimes[['traffic', 'non traffic']].apply(lambda x: x.sum(), axis=1)
print(len(df_crimes))
df_crimes.head(3)

897


traffic_crime,zipcode,month,non traffic,traffic,percent_traffic_related_crime,num_crime
0,60018,1,75.0,1.0,0.013158,76.0
1,60018,2,61.0,2.0,0.031746,63.0
2,60018,3,68.0,2.0,0.028571,70.0


### 3.e Save dataframe into pickle 

In [75]:
df_crimes.to_pickle('./dataset/chicago_traffic_crime.pkl')

## 4. Collect car accident data from Chicago 

In [76]:
car_accident = pd.read_csv('./dataset/Traffic_Crashes_-_Crashes.csv')
print(len(car_accident))
print(car_accident.columns)
car_accident.head(3)

117324
Index(['RD_NO', 'CRASH_DATE_EST_I', 'CRASH_DATE', 'POSTED_SPEED_LIMIT',
       'TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION', 'WEATHER_CONDITION',
       'LIGHTING_CONDITION', 'FIRST_CRASH_TYPE', 'TRAFFICWAY_TYPE', 'LANE_CNT',
       'ALIGNMENT', 'ROADWAY_SURFACE_COND', 'ROAD_DEFECT', 'REPORT_TYPE',
       'CRASH_TYPE', 'INTERSECTION_RELATED_I', 'NOT_RIGHT_OF_WAY_I',
       'HIT_AND_RUN_I', 'DAMAGE', 'DATE_POLICE_NOTIFIED',
       'PRIM_CONTRIBUTORY_CAUSE', 'SEC_CONTRIBUTORY_CAUSE', 'STREET_NO',
       'STREET_DIRECTION', 'STREET_NAME', 'BEAT_OF_OCCURRENCE',
       'PHOTOS_TAKEN_I', 'STATEMENTS_TAKEN_I', 'DOORING_I', 'WORK_ZONE_I',
       'WORK_ZONE_TYPE', 'WORKERS_PRESENT_I', 'NUM_UNITS',
       'MOST_SEVERE_INJURY', 'INJURIES_TOTAL', 'INJURIES_FATAL',
       'INJURIES_INCAPACITATING', 'INJURIES_NON_INCAPACITATING',
       'INJURIES_REPORTED_NOT_EVIDENT', 'INJURIES_NO_INDICATION',
       'INJURIES_UNKNOWN', 'CRASH_HOUR', 'CRASH_DAY_OF_WEEK', 'CRASH_MONTH',
       'LATITUDE', 'LO

Unnamed: 0,RD_NO,CRASH_DATE_EST_I,CRASH_DATE,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,TRAFFICWAY_TYPE,...,INJURIES_NON_INCAPACITATING,INJURIES_REPORTED_NOT_EVIDENT,INJURIES_NO_INDICATION,INJURIES_UNKNOWN,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE,LOCATION
0,JC337982,,07/06/2019 11:58:00 PM,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,"DARKNESS, LIGHTED ROAD",SIDESWIPE SAME DIRECTION,DIVIDED - W/MEDIAN (NOT RAISED),...,0.0,0.0,2.0,0.0,23,7,7,41.811447,-87.723868,POINT (-87.723868030607 41.811446743513)
1,JC338027,,07/06/2019 11:50:00 PM,25,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",SIDESWIPE SAME DIRECTION,DIVIDED - W/MEDIAN (NOT RAISED),...,0.0,0.0,2.0,0.0,23,7,7,41.779028,-87.74246,POINT (-87.742460299258 41.779027950311)
2,JC338000,,07/06/2019 11:45:00 PM,25,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",PARKED MOTOR VEHICLE,DIVIDED - W/MEDIAN BARRIER,...,0.0,0.0,1.0,0.0,23,7,7,41.753885,-87.664887,POINT (-87.664886700675 41.753884677038)


In [77]:
set(car_accident['WEATHER_CONDITION'])
car_accident['WEATHER_CONDITION'].value_counts()

car_accident['ROADWAY_SURFACE_COND'].value_counts()

set(car_accident['POSTED_SPEED_LIMIT'])
#car_accident['POSTED_SPEED_LIMIT'].value_counts()

{0,
 1,
 2,
 3,
 5,
 6,
 7,
 9,
 10,
 11,
 14,
 15,
 20,
 22,
 24,
 25,
 30,
 32,
 33,
 35,
 36,
 39,
 40,
 45,
 50,
 55,
 60,
 65,
 70}

### 4.a Convert date into datetime and check if the data is in start/end date period

In [78]:
# convert CRASH_DATE to datetime and sort by date

#car_accident['CRASH_DATE'] = car_accident['CRASH_DATE'].apply(lambda x: datetime.datetime.strptime(x, '%m/%d/%Y %H:%M:%S %p'))
car_accident['CRASH_DATE'] = pd.to_datetime(car_accident['CRASH_DATE'])
car_accident = car_accident.sort_values(['CRASH_DATE'])

In [79]:
# create new columns for number of accident and month
car_accident['num_accidents'] = len(car_accident)*[1]
car_accident['month'] = car_accident['CRASH_DATE'].dt.month

In [80]:
car_accident['CRASH_DATE'].min(), car_accident['CRASH_DATE'].max()

(Timestamp('2018-07-09 00:01:00'), Timestamp('2019-07-06 23:58:00'))

### 4.b Estimate the zipcode of each crash location by its lat/lgn

In [81]:
# select non-nan and non-zero values in LAT/LGN
car_accident = car_accident[(car_accident['LATITUDE'].notnull()) & (car_accident['LONGITUDE'].notnull())]
car_accident = car_accident.loc[(car_accident['LATITUDE'] != 0.0) & (car_accident['LONGITUDE'] != 0.0)]



# estimate zipcode of each location from its distance to center location of each zipcode
car_accident['location'] = list(zip(car_accident['LATITUDE'], car_accident['LONGITUDE'])) 
                                                    
car_accident['zipcode'] = car_accident['location'].apply(lambda x : \
                                     interfer_zipcode_from_distance( chicago_zipcode, x[0], x[1])) 

print(len(car_accident))
car_accident.head(3)

116643


Unnamed: 0,RD_NO,CRASH_DATE_EST_I,CRASH_DATE,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,TRAFFICWAY_TYPE,...,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE,LOCATION,num_accidents,month,location,zipcode
117323,JB342213,,2018-07-09 00:01:00,20,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",PARKED MOTOR VEHICLE,NOT DIVIDED,...,0,2,7,41.715306,-87.672339,POINT (-87.672339289922 41.71530620272),1,7,"(41.715306203000004, -87.67233929)",60643
117322,JB342263,Y,2018-07-09 00:01:00,30,NO CONTROLS,NO CONTROLS,CLEAR,UNKNOWN,PARKED MOTOR VEHICLE,ONE-WAY,...,0,2,7,41.988448,-87.658018,POINT (-87.658017536403 41.988448292397),1,7,"(41.988448292, -87.658017536)",60660
117321,JB343234,N,2018-07-09 00:01:00,30,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,PARKED MOTOR VEHICLE,ONE-WAY,...,0,2,7,41.860296,-87.6919,POINT (-87.691899960264 41.860296377464),1,7,"(41.860296377, -87.69189996)",60608


### 4.c Save dataframe into pickle 

In [82]:
car_accident.to_pickle('./dataset/chicago_crash.pkl')

## 5. Consolidate dataset

### 5.1 Load df for potholes

In [83]:
df_potholes = pd.read_pickle('./dataset/chicago_potholes.pkl')
print(len(df_potholes))
print(df_potholes.columns)
df_potholes.head()

30210
Index(['ADDRESS', 'REQUEST DATE', 'COMPLETION DATE', 'num_potholes',
       'LATITUDE', 'LONGITUDE', 'LOCATION', 'Boundaries - ZIP Codes',
       'Community Areas', 'Zip Codes', 'Census Tracts', 'Wards', 'duration',
       'month', 'location', 'zipcode'],
      dtype='object')


Unnamed: 0,ADDRESS,REQUEST DATE,COMPLETION DATE,num_potholes,LATITUDE,LONGITUDE,LOCATION,Boundaries - ZIP Codes,Community Areas,Zip Codes,Census Tracts,Wards,duration,month,location,zipcode
36866,6089 W DIVERSEY AVE,2018-07-09 06:44:15,2018-08-03 14:24:17,8,41.931035,-87.778138,POINT (-87.778137730383 41.931035160324),2.0,19.0,22615.0,78.0,21.0,25 days 07:40:02,7,"(41.93103516, -87.77813773)",60639
27999,4516 W JACKSON Boulevard,2018-07-09 08:17:28,2019-01-18 07:56:33,28,41.876964,-87.738515,POINT (-87.738514926893 41.876963571504),30.0,27.0,21572.0,716.0,23.0,192 days 23:39:05,7,"(41.876963572, -87.73851492700001)",60624
25425,6147 S EVANS Avenue,2018-07-09 10:32:20,2019-02-11 14:00:55,7,41.782683,-87.606964,POINT (-87.606963979375 41.782682981034),60.0,9.0,22260.0,471.0,4.0,217 days 03:28:35,7,"(41.782682981, -87.606963979)",60637
36119,9300 S DAUPHIN AVE,2018-07-09 11:31:02,2018-08-17 14:12:03,10,41.724166,-87.603931,POINT (-87.603931046816 41.724165599312),61.0,40.0,21546.0,429.0,35.0,39 days 02:41:01,7,"(41.724165599, -87.603931047)",60619
33439,9400 S COTTAGE GROVE Avenue,2018-07-09 11:31:37,2018-10-17 10:17:19,19,41.72401,-87.60495,POINT (-87.604950370155 41.724010297496),61.0,40.0,21546.0,430.0,43.0,99 days 22:45:42,7,"(41.724010297, -87.60495037)",60619


In [84]:
df_potholes = df_potholes.groupby(['zipcode', 'month'])['num_potholes'].sum()
#df_potholes = df_potholes.fillna(0)
df_potholes  = df_potholes.reset_index()
df_potholes.head()

Unnamed: 0,zipcode,month,num_potholes
0,60068,1,59
1,60068,2,242
2,60068,3,320
3,60068,4,48
4,60068,5,28


### 5.2 Load df for polpulation

In [85]:
df_population = pd.read_pickle('./dataset/chicago_population.pkl')
print(len(df_population))
df_population.head()

163


Unnamed: 0,zipcode,lat,lng,zipcode_location,city,state_name,population,density
20407,60004,42.11201,-87.97917,"(42.11201, -87.97917)",Arlington Heights,Illinois,50582,1762.0
20408,60005,42.06394,-87.98568,"(42.06394, -87.98568)",Arlington Heights,Illinois,29308,1725.4
20409,60007,42.0076,-87.99298,"(42.0076, -87.99298)",Elk Grove Village,Illinois,33820,927.1
20410,60008,42.07441,-88.02265,"(42.07441, -88.02265)",Rolling Meadows,Illinois,22717,1697.4
20416,60016,42.04972,-87.89166,"(42.04972, -87.89166)",Des Plaines,Illinois,59690,2169.2


### 5.3 Load df for crime rate

In [86]:
df_crimes = pd.read_pickle('./dataset/chicago_traffic_crime.pkl')
print(len(df_crimes))
df_crimes.head()

897


traffic_crime,zipcode,month,non traffic,traffic,percent_traffic_related_crime,num_crime
0,60018,1,75.0,1.0,0.013158,76.0
1,60018,2,61.0,2.0,0.031746,63.0
2,60018,3,68.0,2.0,0.028571,70.0
3,60018,4,88.0,3.0,0.032967,91.0
4,60018,5,80.0,1.0,0.012346,81.0


### 5.4 Load df for car accident

In [87]:
df_car_accident = pd.read_pickle('./dataset/chicago_crash.pkl')
print(len(df_car_accident))
df_car_accident.head()

116643


Unnamed: 0,RD_NO,CRASH_DATE_EST_I,CRASH_DATE,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,TRAFFICWAY_TYPE,...,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE,LOCATION,num_accidents,month,location,zipcode
117323,JB342213,,2018-07-09 00:01:00,20,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",PARKED MOTOR VEHICLE,NOT DIVIDED,...,0,2,7,41.715306,-87.672339,POINT (-87.672339289922 41.71530620272),1,7,"(41.715306203000004, -87.67233929)",60643
117322,JB342263,Y,2018-07-09 00:01:00,30,NO CONTROLS,NO CONTROLS,CLEAR,UNKNOWN,PARKED MOTOR VEHICLE,ONE-WAY,...,0,2,7,41.988448,-87.658018,POINT (-87.658017536403 41.988448292397),1,7,"(41.988448292, -87.658017536)",60660
117321,JB343234,N,2018-07-09 00:01:00,30,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,PARKED MOTOR VEHICLE,ONE-WAY,...,0,2,7,41.860296,-87.6919,POINT (-87.691899960264 41.860296377464),1,7,"(41.860296377, -87.69189996)",60608
117320,JB341590,Y,2018-07-09 00:05:00,30,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",REAR END,OTHER,...,0,2,7,41.687032,-87.661431,POINT (-87.661431239707 41.687032403613),1,7,"(41.687032404, -87.66143124)",60643
117319,JB341556,,2018-07-09 00:15:00,10,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",REAR END,ALLEY,...,0,2,7,41.995914,-87.697204,POINT (-87.697204263835 41.995914147938),1,7,"(41.995914148000004, -87.69720426399999)",60659


### 5.5 Load df for intersection

In [88]:
df_intersections_per_zipcode = pd.read_pickle('./dataset/intersections_per_zipcode.pkl')

print(len(df_intersections_per_zipcode))
df_intersections_per_zipcode.head()

61


Unnamed: 0,zipcode,intersection_count
0,60647,699
1,60639,612
2,60707,172
3,60622,419
4,60651,504


### Calculte the percentage of accident at night

In [89]:
df_car_accident['dark'] = df_car_accident['LIGHTING_CONDITION'].apply(lambda x : 'DARKNESS' if x \
                                                in ['DARKNESS', 'DARKNESS, LIGHTED ROAD', 'DUSK']  \
                                                else 'NOT DARKNESS')


df_darkness = df_car_accident.groupby(['zipcode', 'month'])['dark'].value_counts().unstack()

df_darkness = df_darkness.fillna(0)
df_darkness  = df_darkness.reset_index()


df_darkness['percent_accident_at_night'] = df_darkness[['DARKNESS',\
                                                        'NOT DARKNESS']].apply(lambda x: 1*x[0]/x.sum(), axis=1)

df_darkness['num_accident'] = df_darkness[['DARKNESS', 'NOT DARKNESS']].apply(lambda x: x.sum(), axis=1)

df_darkness.head()

dark,zipcode,month,DARKNESS,NOT DARKNESS,percent_accident_at_night,num_accident
0,60018,1,6.0,10.0,0.375,16.0
1,60018,2,5.0,8.0,0.384615,13.0
2,60018,3,1.0,12.0,0.076923,13.0
3,60018,4,3.0,16.0,0.157895,19.0
4,60018,5,4.0,30.0,0.117647,34.0


### Calculte the percentage of accident in snowing weather

In [90]:
df_car_accident['snow'] = df_car_accident['WEATHER_CONDITION'].apply(lambda x : 'SNOW' if x in ['SNOW']  else 'NOT SNOW')
df_snow = df_car_accident.groupby(['zipcode', 'month'])['snow'].value_counts().unstack()
df_snow = df_snow.fillna(0)
df_snow = df_snow.reset_index()


df_snow['percent_accident_in_snow'] = df_snow[['SNOW', 'NOT SNOW']].apply(lambda x: 1*x[0]/x.sum(), axis=1)
df_snow['num_accident'] = df_snow[['SNOW', 'NOT SNOW']].apply(lambda x: x.sum(), axis=1)
df_snow.head()

snow,zipcode,month,NOT SNOW,SNOW,percent_accident_in_snow,num_accident
0,60018,1,12.0,4.0,0.25,16.0
1,60018,2,12.0,1.0,0.076923,13.0
2,60018,3,13.0,0.0,0.0,13.0
3,60018,4,19.0,0.0,0.0,19.0
4,60018,5,34.0,0.0,0.0,34.0


### Merge dataframe

In [91]:
df = pd.merge(df_darkness, df_snow, on=['zipcode', 'month', 'num_accident'])
df = pd.merge(df,  df_population, on='zipcode')
print(len(df ))
df.head()

898


Unnamed: 0,zipcode,month,DARKNESS,NOT DARKNESS,percent_accident_at_night,num_accident,NOT SNOW,SNOW,percent_accident_in_snow,lat,lng,zipcode_location,city,state_name,population,density
0,60018,1,6.0,10.0,0.375,16.0,12.0,4.0,0.25,41.99744,-87.89693,"(41.997440000000005, -87.89693)",Des Plaines,Illinois,30099,707.9
1,60018,2,5.0,8.0,0.384615,13.0,12.0,1.0,0.076923,41.99744,-87.89693,"(41.997440000000005, -87.89693)",Des Plaines,Illinois,30099,707.9
2,60018,3,1.0,12.0,0.076923,13.0,13.0,0.0,0.0,41.99744,-87.89693,"(41.997440000000005, -87.89693)",Des Plaines,Illinois,30099,707.9
3,60018,4,3.0,16.0,0.157895,19.0,19.0,0.0,0.0,41.99744,-87.89693,"(41.997440000000005, -87.89693)",Des Plaines,Illinois,30099,707.9
4,60018,5,4.0,30.0,0.117647,34.0,34.0,0.0,0.0,41.99744,-87.89693,"(41.997440000000005, -87.89693)",Des Plaines,Illinois,30099,707.9


In [92]:
df = pd.merge(df, df_potholes, how='outer', on=['zipcode', 'month'])
df = df.fillna(0)
print(len(df))
df.head()

908


Unnamed: 0,zipcode,month,DARKNESS,NOT DARKNESS,percent_accident_at_night,num_accident,NOT SNOW,SNOW,percent_accident_in_snow,lat,lng,zipcode_location,city,state_name,population,density,num_potholes
0,60018,1,6.0,10.0,0.375,16.0,12.0,4.0,0.25,41.99744,-87.89693,"(41.997440000000005, -87.89693)",Des Plaines,Illinois,30099.0,707.9,0.0
1,60018,2,5.0,8.0,0.384615,13.0,12.0,1.0,0.076923,41.99744,-87.89693,"(41.997440000000005, -87.89693)",Des Plaines,Illinois,30099.0,707.9,0.0
2,60018,3,1.0,12.0,0.076923,13.0,13.0,0.0,0.0,41.99744,-87.89693,"(41.997440000000005, -87.89693)",Des Plaines,Illinois,30099.0,707.9,0.0
3,60018,4,3.0,16.0,0.157895,19.0,19.0,0.0,0.0,41.99744,-87.89693,"(41.997440000000005, -87.89693)",Des Plaines,Illinois,30099.0,707.9,0.0
4,60018,5,4.0,30.0,0.117647,34.0,34.0,0.0,0.0,41.99744,-87.89693,"(41.997440000000005, -87.89693)",Des Plaines,Illinois,30099.0,707.9,0.0


In [93]:
df = pd.merge(df,  df_crimes, how='outer', on=['zipcode', 'month'])
df = df.fillna(0)
print(len(df))
df.head()

914


Unnamed: 0,zipcode,month,DARKNESS,NOT DARKNESS,percent_accident_at_night,num_accident,NOT SNOW,SNOW,percent_accident_in_snow,lat,...,zipcode_location,city,state_name,population,density,num_potholes,non traffic,traffic,percent_traffic_related_crime,num_crime
0,60018,1,6.0,10.0,0.375,16.0,12.0,4.0,0.25,41.99744,...,"(41.997440000000005, -87.89693)",Des Plaines,Illinois,30099.0,707.9,0.0,75.0,1.0,0.013158,76.0
1,60018,2,5.0,8.0,0.384615,13.0,12.0,1.0,0.076923,41.99744,...,"(41.997440000000005, -87.89693)",Des Plaines,Illinois,30099.0,707.9,0.0,61.0,2.0,0.031746,63.0
2,60018,3,1.0,12.0,0.076923,13.0,13.0,0.0,0.0,41.99744,...,"(41.997440000000005, -87.89693)",Des Plaines,Illinois,30099.0,707.9,0.0,68.0,2.0,0.028571,70.0
3,60018,4,3.0,16.0,0.157895,19.0,19.0,0.0,0.0,41.99744,...,"(41.997440000000005, -87.89693)",Des Plaines,Illinois,30099.0,707.9,0.0,88.0,3.0,0.032967,91.0
4,60018,5,4.0,30.0,0.117647,34.0,34.0,0.0,0.0,41.99744,...,"(41.997440000000005, -87.89693)",Des Plaines,Illinois,30099.0,707.9,0.0,80.0,1.0,0.012346,81.0


In [94]:
df = pd.merge(df, df_intersections_per_zipcode, how='outer', on=['zipcode'])
print(len(df))
df = df.dropna()

939


In [95]:
df.columns

Index(['zipcode', 'month', 'DARKNESS', 'NOT DARKNESS',
       'percent_accident_at_night', 'num_accident', 'NOT SNOW', 'SNOW',
       'percent_accident_in_snow', 'lat', 'lng', 'zipcode_location', 'city',
       'state_name', 'population', 'density', 'num_potholes', 'non traffic',
       'traffic', 'percent_traffic_related_crime', 'num_crime',
       'intersection_count'],
      dtype='object')

In [96]:
df = df[['num_accident', 'DARKNESS', 'NOT DARKNESS',
       'percent_accident_at_night', 'NOT SNOW', 'SNOW',
       'percent_accident_in_snow', 'lat', 'lng', 'zipcode_location', 'city',
       'state_name', 'population', 'density', 'num_potholes', 'non traffic',
       'traffic', 'percent_traffic_related_crime',
       'intersection_count', 'num_crime', 'zipcode', 'month']]

In [97]:
#df = df[['zipcode', 'month','num_accident', 'percent_accident_at_night' ,'percent_accident_in_snow', 'lat', 'lng', 'population', 'density', 'num_potholes']]

### Save dataframe into pickle 

In [98]:
df.to_pickle('./dataset/input.pkl')

In [99]:
df.head()

Unnamed: 0,num_accident,DARKNESS,NOT DARKNESS,percent_accident_at_night,NOT SNOW,SNOW,percent_accident_in_snow,lat,lng,zipcode_location,...,population,density,num_potholes,non traffic,traffic,percent_traffic_related_crime,intersection_count,num_crime,zipcode,month
144,81.0,35.0,46.0,0.432099,64.0,17.0,0.209877,41.88526,-87.62194,"(41.885259999999995, -87.62194000000001)",...,11110.0,11892.3,48.0,104.0,29.0,0.218045,119.0,133.0,60601,1.0
145,60.0,28.0,32.0,0.466667,55.0,5.0,0.083333,41.88526,-87.62194,"(41.885259999999995, -87.62194000000001)",...,11110.0,11892.3,406.0,75.0,27.0,0.264706,119.0,102.0,60601,2.0
146,81.0,33.0,48.0,0.407407,80.0,1.0,0.012346,41.88526,-87.62194,"(41.885259999999995, -87.62194000000001)",...,11110.0,11892.3,424.0,104.0,28.0,0.212121,119.0,132.0,60601,3.0
147,93.0,26.0,67.0,0.27957,92.0,1.0,0.010753,41.88526,-87.62194,"(41.885259999999995, -87.62194000000001)",...,11110.0,11892.3,257.0,87.0,38.0,0.304,119.0,125.0,60601,4.0
148,87.0,22.0,65.0,0.252874,87.0,0.0,0.0,41.88526,-87.62194,"(41.885259999999995, -87.62194000000001)",...,11110.0,11892.3,446.0,123.0,40.0,0.245399,119.0,163.0,60601,5.0


In [100]:
len(df['zipcode'].unique()) , df['zipcode'].unique()

(58, array([60601, 60602, 60603, 60604, 60605, 60606, 60607, 60608, 60609,
        60610, 60611, 60612, 60613, 60614, 60615, 60616, 60617, 60618,
        60619, 60620, 60621, 60622, 60623, 60624, 60625, 60626, 60628,
        60629, 60630, 60631, 60632, 60633, 60634, 60636, 60637, 60638,
        60639, 60640, 60641, 60642, 60643, 60644, 60645, 60646, 60647,
        60649, 60651, 60652, 60653, 60654, 60655, 60656, 60657, 60659,
        60660, 60661, 60707, 60827]))

In [101]:
# 60068 --> 60131 (95 points)
# 60419
#df[df['zipcode']==60657][['LATITUDE', 'LONGITUDE', 'zipcode']]

In [102]:
df.columns

Index(['num_accident', 'DARKNESS', 'NOT DARKNESS', 'percent_accident_at_night',
       'NOT SNOW', 'SNOW', 'percent_accident_in_snow', 'lat', 'lng',
       'zipcode_location', 'city', 'state_name', 'population', 'density',
       'num_potholes', 'non traffic', 'traffic',
       'percent_traffic_related_crime', 'intersection_count', 'num_crime',
       'zipcode', 'month'],
      dtype='object')

### Reference :
    

https://jakevdp.github.io/PythonDataScienceHandbook/03.07-merge-and-join.html


https://towardsdatascience.com/pandas-tips-and-tricks-33bcc8a40bb9


https://github.com/ywnch/toolbox/blob/master/demo_connect_poi.ipynb


https://crowtherlab.pageflow.io/cities-of-the-future-visualizing-climate-change-to-inspire-action#210425


https://towardsdatascience.com/connecting-pois-to-a-road-network-358a81447944


https://catalog.data.gov/dataset/cook-county-highway-department-jursidictions-kml


https://www.researchgate.net/post/How_do_we_interpret_curvature_plan_and_profile_values_calculated_in_ArcGis


https://medium.com/geoai/using-machine-learning-to-predict-car-accident-risk-4d92c91a7d57

https://gis.utah.gov/data/

https://www.dupageforest.org/news/news-releases/deer-vehicle-collisions-2018

https://www.wired.com/story/waze-data-help-predict-car-crashes-cut-response-time/

https://github.com/rileypredum/East-Bay-Housing-Web-Scrape

https://towardsdatascience.com/web-scraping-craigslist-a-complete-tutorial-c41cea4f4981


