In [1]:
import pandas as pd

# Crash Data

Crash data was obtained from [here](https://data.cityofnewyork.us/Public-Safety/Motor-Vehicle-Collisions-Crashes/h9gi-nx95).

In [2]:
crash_df = pd.read_csv('../../data/Motor_Vehicle_Collisions_-_Crashes.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
crash_df

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
0,04/14/2021,5:32,,,,,,BRONX WHITESTONE BRIDGE,,,...,Unspecified,,,,4407480,Sedan,Sedan,,,
1,04/13/2021,21:35,BROOKLYN,11217.0,40.683580,-73.976170,"(40.68358, -73.97617)",,,620 ATLANTIC AVENUE,...,,,,,4407147,Sedan,,,,
2,04/15/2021,16:15,,,,,,HUTCHINSON RIVER PARKWAY,,,...,,,,,4407665,Station Wagon/Sport Utility Vehicle,,,,
3,04/13/2021,16:00,BROOKLYN,11222.0,,,,VANDERVORT AVENUE,ANTHONY STREET,,...,Unspecified,,,,4407811,Sedan,,,,
4,04/12/2021,8:25,,,0.000000,0.000000,"(0.0, 0.0)",EDSON AVENUE,,,...,Unspecified,,,,4406885,Station Wagon/Sport Utility Vehicle,Sedan,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1837045,07/06/2012,15:09,MANHATTAN,10035.0,40.801235,-73.941815,"(40.8012354, -73.9418153)",EAST 119 STREET,PARK AVENUE,,...,Unspecified,,,,59654,SPORT UTILITY / STATION WAGON,PASSENGER VEHICLE,,,
1837046,07/03/2012,17:30,QUEENS,11102.0,40.774711,-73.933386,"(40.7747112, -73.9333863)",27 AVENUE,4 STREET,,...,Unspecified,,,,272592,PASSENGER VEHICLE,SPORT UTILITY / STATION WAGON,,,
1837047,07/01/2012,15:30,BROOKLYN,11236.0,40.645032,-73.919978,"(40.6450318, -73.9199775)",RALPH AVENUE,CLARENDON ROAD,,...,Unspecified,,,,135041,SMALL COM VEH(4 TIRES),PASSENGER VEHICLE,,,
1837048,07/08/2012,18:30,,,40.786122,-73.804078,"(40.7861217, -73.8040782)",,,,...,Unspecified,,,,3055617,PASSENGER VEHICLE,PASSENGER VEHICLE,,,


We can see that the earliest date in this dataset is 2013. We should strive to get weather data corresponding through these dates.

In [4]:
crash_df.columns

Index(['CRASH DATE', 'CRASH TIME', 'BOROUGH', 'ZIP CODE', 'LATITUDE',
       'LONGITUDE', 'LOCATION', 'ON STREET NAME', 'CROSS STREET NAME',
       'OFF STREET NAME', 'NUMBER OF PERSONS INJURED',
       'NUMBER OF PERSONS KILLED', 'NUMBER OF PEDESTRIANS INJURED',
       'NUMBER OF PEDESTRIANS KILLED', 'NUMBER OF CYCLIST INJURED',
       'NUMBER OF CYCLIST KILLED', 'NUMBER OF MOTORIST INJURED',
       'NUMBER OF MOTORIST KILLED', 'CONTRIBUTING FACTOR VEHICLE 1',
       'CONTRIBUTING FACTOR VEHICLE 2', 'CONTRIBUTING FACTOR VEHICLE 3',
       'CONTRIBUTING FACTOR VEHICLE 4', 'CONTRIBUTING FACTOR VEHICLE 5',
       'COLLISION_ID', 'VEHICLE TYPE CODE 1', 'VEHICLE TYPE CODE 2',
       'VEHICLE TYPE CODE 3', 'VEHICLE TYPE CODE 4', 'VEHICLE TYPE CODE 5'],
      dtype='object')

In [5]:
crash_df['CRASH DATE'].min()

'01/01/2013'

Let's first filter so that we have only crashes where a pedestrian or a cyclist was injured or killed. We infer the risk to pedestrians and cyclists in any given crash to be roughly comparable.

In [16]:
crash_df = crash_df.query('(`NUMBER OF PEDESTRIANS INJURED` > 0) '
               '| (`NUMBER OF PEDESTRIANS KILLED` > 0) '
               '| (`NUMBER OF CYCLIST INJURED` > 0) '
               '| (`NUMBER OF CYCLIST KILLED` > 0)')

## Location Analysis

~8% of latitude/longitude values for crashes are empty. Also, LOCATION seems to be empty exactly when LATITUDE/LONGITUDE are empty:

In [17]:
(crash_df['LATITUDE'].isna()).mean(), crash_df['LONGITUDE'].isna().mean(), crash_df['LOCATION'].isna().mean()

(0.07847199265381083, 0.07847199265381083, 0.07847199265381083)

An additional 0.1% of lat/long values are 0, which we can assume to be an empty/erroneous value:

In [18]:
(crash_df.LONGITUDE == 0).mean()

0.0013370064279155187

We'll save these separately:

In [19]:
crash_df_loc = crash_df.query('~LATITUDE.isna() & ~LONGITUDE.isna() & LATITUDE != 0 & LONGITUDE != 0')

In [20]:
crash_df_no_loc = crash_df.drop(index=crash_df_loc.index)

Of the remaining entries, about 88% of them have no address.

In [21]:
crash_df_no_loc['OFF STREET NAME'].isna().mean()

0.845821060382916

For the ones that do have an address, though, we can try to geocode those addresses into a lat/long:

In [22]:
crash_df_address = crash_df_no_loc.query('~`OFF STREET_NAME`.isna()')

In [23]:
crash_df_address

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
3232,04/24/2021,19:56,MANHATTAN,10013.0,,,,,,197 WORTH STREET,...,,,,,4411201,Moped,,,,
3812,04/13/2021,11:40,BROOKLYN,11217.0,,,,,,62 Hanson,...,,,,,4411394,Sedan,,,,
4261,03/12/2021,17:40,,,,,,,,81 WEST DRIVE,...,,,,,4399921,Bike,,,,
4992,04/30/2021,12:59,,,,,,,,75 WEST DRIVE,...,Unspecified,,,,4412623,Bike,E-Bike,,,
5714,05/02/2021,17:40,,,,,,,,80 WEST DRIVE,...,Unspecified,,,,4412626,Bike,Bike,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1833651,07/09/2012,8:53,,,,,,,,254 EAST 2 STREET (GAS STATION),...,,,,,13927,TAXI,,,,
1834017,07/05/2012,0:39,,,,,,,,METRO AVE UNDER HILL AT GRAND AVE,...,Unspecified,,,,213301,UNKNOWN,BICYCLE,,,
1835422,07/05/2012,17:57,,,,,,,,PARKING LOT 961 E 174 ST,...,,,,,78670,PASSENGER VEHICLE,,,,
1836151,07/08/2012,10:00,,,,,,,,PARKING LOT PARSONS BLVD & NORTRHERN BLV,...,,,,,245687,PASSENGER VEHICLE,,,,


In [24]:
import osmnx as ox

In [25]:
import re
from tqdm import tqdm
locations = pd.Series(dtype='O')
for i, x in tqdm(crash_df_address.iterrows()):
    # Hack to allow this process to be resumed
    if i in locations.index: # and locations.loc[i] is not None:
        print('Skipping index {} as it was already processed.'.format(i))
        continue
    address = '{}, NY {}'.format(
        re.sub(r'\s+', ' ', x['OFF STREET NAME']),
        # x.BOROUGH if isinstance(x.BOROUGH, str) else 'New York',
        int(x['ZIP CODE']) if not np.isnan(x['ZIP CODE']) else ''
    )
    try:
        locations.loc[i] = ox.geocoder.geocode(address)
    except ValueError:
        print('Error for index {} with address {}'.format(i, address))
        locations.loc[i] = None

0it [00:00, ?it/s]


NameError: name 'np' is not defined

In [26]:
crash_df_loc

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
1,04/13/2021,21:35,BROOKLYN,11217.0,40.683580,-73.976170,"(40.68358, -73.97617)",,,620 ATLANTIC AVENUE,...,,,,,4407147,Sedan,,,,
43,04/13/2021,20:34,BROOKLYN,11213.0,40.668495,-73.925606,"(40.668495, -73.925606)",EASTERN PARKWAY,BUFFALO AVENUE,,...,,,,,4408259,Sedan,,,,
51,04/15/2021,12:05,,,40.761436,-73.769950,"(40.761436, -73.76995)",BELL BOULEVARD,,,...,,,,,4407636,Station Wagon/Sport Utility Vehicle,,,,
52,04/16/2021,11:00,QUEENS,11368.0,40.749580,-73.865410,"(40.74958, -73.86541)",,,100-10 ROOSEVELT AVENUE,...,Unspecified,,,,4407792,Station Wagon/Sport Utility Vehicle,Bike,,,
62,04/15/2021,20:13,BRONX,10457.0,40.847440,-73.899680,"(40.84744, -73.89968)",EAST TREMONT AVENUE,PARK AVENUE,,...,,,,,4407797,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1837009,07/07/2012,18:40,,,40.867335,-73.822707,"(40.8673349, -73.8227066)",,,,...,Unspecified,,,,2912116,PASSENGER VEHICLE,BICYCLE,,,
1837010,07/06/2012,13:33,BROOKLYN,11209.0,40.625780,-74.024154,"(40.6257805, -74.0241544)",5 AVENUE,80 STREET,,...,Unspecified,,,,140835,PASSENGER VEHICLE,BICYCLE,,,
1837035,07/03/2012,15:22,BROOKLYN,11212.0,40.661997,-73.919593,"(40.661997, -73.9195931)",EAST 98 STREET,KINGS HIGHWAY,,...,,,,,135064,SPORT UTILITY / STATION WAGON,,,,
1837036,07/08/2012,15:00,QUEENS,11102.0,40.767228,-73.918907,"(40.767228, -73.9189065)",33 STREET,28 ROAD,,...,,,,,272639,PASSENGER VEHICLE,,,,


In [27]:
crash_df_loc.to_csv('../../data/crash_data_normalized.csv')