In [1]:
import pandas as pd

In [2]:
# Bounds of the crash window studied in this notebook (ISO dates; both ends
# are compared inclusively by the filter that uses them).
TIME_PERIOD_START = '2019-01-01'
TIME_PERIOD_END = '2021-11-01'

# Crash Data

Crash data was obtained from [here](https://data.cityofnewyork.us/Public-Safety/Motor-Vehicle-Collisions-Crashes/h9gi-nx95).

In [3]:
# Load the raw collisions export; 'CRASH DATE' is parsed into datetimes at read time.
crash_df = pd.read_csv(
    '../../data/raw_data/Motor_Vehicle_Collisions_-_Crashes.csv',
    parse_dates=['CRASH DATE'],
    infer_datetime_format=True,
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
# Preview the raw crash table (pandas elides the middle rows/columns of a large frame).
crash_df

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
0,2021-04-14,5:32,,,,,,BRONX WHITESTONE BRIDGE,,,...,Unspecified,,,,4407480,Sedan,Sedan,,,
1,2021-04-13,21:35,BROOKLYN,11217.0,40.683580,-73.976170,"(40.68358, -73.97617)",,,620 ATLANTIC AVENUE,...,,,,,4407147,Sedan,,,,
2,2021-04-15,16:15,,,,,,HUTCHINSON RIVER PARKWAY,,,...,,,,,4407665,Station Wagon/Sport Utility Vehicle,,,,
3,2021-04-13,16:00,BROOKLYN,11222.0,,,,VANDERVORT AVENUE,ANTHONY STREET,,...,Unspecified,,,,4407811,Sedan,,,,
4,2021-04-12,8:25,,,0.000000,0.000000,"(0.0, 0.0)",EDSON AVENUE,,,...,Unspecified,,,,4406885,Station Wagon/Sport Utility Vehicle,Sedan,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1837045,2012-07-06,15:09,MANHATTAN,10035.0,40.801235,-73.941815,"(40.8012354, -73.9418153)",EAST 119 STREET,PARK AVENUE,,...,Unspecified,,,,59654,SPORT UTILITY / STATION WAGON,PASSENGER VEHICLE,,,
1837046,2012-07-03,17:30,QUEENS,11102.0,40.774711,-73.933386,"(40.7747112, -73.9333863)",27 AVENUE,4 STREET,,...,Unspecified,,,,272592,PASSENGER VEHICLE,SPORT UTILITY / STATION WAGON,,,
1837047,2012-07-01,15:30,BROOKLYN,11236.0,40.645032,-73.919978,"(40.6450318, -73.9199775)",RALPH AVENUE,CLARENDON ROAD,,...,Unspecified,,,,135041,SMALL COM VEH(4 TIRES),PASSENGER VEHICLE,,,
1837048,2012-07-08,18:30,,,40.786122,-73.804078,"(40.7861217, -73.8040782)",,,,...,Unspecified,,,,3055617,PASSENGER VEHICLE,PASSENGER VEHICLE,,,


In [5]:
# List every column available in the raw crash table.
crash_df.columns

Index(['CRASH DATE', 'CRASH TIME', 'BOROUGH', 'ZIP CODE', 'LATITUDE',
       'LONGITUDE', 'LOCATION', 'ON STREET NAME', 'CROSS STREET NAME',
       'OFF STREET NAME', 'NUMBER OF PERSONS INJURED',
       'NUMBER OF PERSONS KILLED', 'NUMBER OF PEDESTRIANS INJURED',
       'NUMBER OF PEDESTRIANS KILLED', 'NUMBER OF CYCLIST INJURED',
       'NUMBER OF CYCLIST KILLED', 'NUMBER OF MOTORIST INJURED',
       'NUMBER OF MOTORIST KILLED', 'CONTRIBUTING FACTOR VEHICLE 1',
       'CONTRIBUTING FACTOR VEHICLE 2', 'CONTRIBUTING FACTOR VEHICLE 3',
       'CONTRIBUTING FACTOR VEHICLE 4', 'CONTRIBUTING FACTOR VEHICLE 5',
       'COLLISION_ID', 'VEHICLE TYPE CODE 1', 'VEHICLE TYPE CODE 2',
       'VEHICLE TYPE CODE 3', 'VEHICLE TYPE CODE 4', 'VEHICLE TYPE CODE 5'],
      dtype='object')

We filter the dataset to crashes between 2019-01-01 and 2021-11-01 (`TIME_PERIOD_START` to `TIME_PERIOD_END`), which is the time period that we're focusing on here.

Let's first filter so that we have only crashes for our target time period and where a pedestrian or a cyclist was injured or killed, or one of the involved vehicle types included "bike" in it. We presume the risk to pedestrians and cyclists in any given crash to be roughly comparable.

In [6]:
# Step 1: restrict to the study window (both bounds inclusive).
date_window_expr = f'`CRASH DATE` >= "{TIME_PERIOD_START}" & `CRASH DATE` <= "{TIME_PERIOD_END}"'

# Step 2a: at least one pedestrian or cyclist was killed or injured.
casualty_expr = (
    '(`NUMBER OF PEDESTRIANS KILLED` > 0)'
    '| (`NUMBER OF PEDESTRIANS INJURED` > 0)'
    '| (`NUMBER OF CYCLIST KILLED` > 0)'
    '| (`NUMBER OF CYCLIST INJURED` > 0)'
)

# Step 2b: any of the five vehicle-type columns mentions "bike" (case-insensitive;
# missing values count as no match).
bike_expr = ' | '.join(
    f'`VEHICLE TYPE CODE {slot}`.str.contains("Bike", case=False, na=False)'
    for slot in range(1, 6)
)

crash_df_filtered = crash_df.query(date_window_expr).query(casualty_expr + '|' + bike_expr)

In [7]:
# Sanity check: latest crash date remaining after filtering.
crash_df_filtered['CRASH DATE'].max()

Timestamp('2021-11-01 00:00:00')

In [8]:
# Preview the filtered crashes.
crash_df_filtered

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
1,2021-04-13,21:35,BROOKLYN,11217.0,40.683580,-73.976170,"(40.68358, -73.97617)",,,620 ATLANTIC AVENUE,...,,,,,4407147,Sedan,,,,
10,2021-04-11,21:06,BROOKLYN,11226.0,,,,BEVERLEY ROAD,EAST 21 STREET,,...,,,,,4406488,Taxi,,,,
43,2021-04-13,20:34,BROOKLYN,11213.0,40.668495,-73.925606,"(40.668495, -73.925606)",EASTERN PARKWAY,BUFFALO AVENUE,,...,,,,,4408259,Sedan,,,,
51,2021-04-15,12:05,,,40.761436,-73.769950,"(40.761436, -73.76995)",BELL BOULEVARD,,,...,,,,,4407636,Station Wagon/Sport Utility Vehicle,,,,
52,2021-04-16,11:00,QUEENS,11368.0,40.749580,-73.865410,"(40.74958, -73.86541)",,,100-10 ROOSEVELT AVENUE,...,Unspecified,,,,4407792,Station Wagon/Sport Utility Vehicle,Bike,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
430467,2019-01-01,18:30,,,,,,HORACE HARDING EXPRESSWAY,MARATHON PARKWAY,,...,,,,,4060560,Station Wagon/Sport Utility Vehicle,,,,
430516,2019-01-01,21:14,MANHATTAN,10029.0,40.798622,-73.941630,"(40.798622, -73.94163)",EAST 116 STREET,LEXINGTON AVENUE,,...,,,,,4060921,Station Wagon/Sport Utility Vehicle,,,,
430714,2019-01-02,9:15,QUEENS,11354.0,40.761547,-73.831130,"(40.761547, -73.83113)",MAIN STREET,37 AVENUE,,...,,,,,4060805,Bike,,,,
430834,2019-01-01,16:36,BROOKLYN,11217.0,40.683060,-73.973785,"(40.68306, -73.973785)",SOUTH PORTLAND AVENUE,ATLANTIC AVENUE,,...,Unspecified,,,,4060553,Station Wagon/Sport Utility Vehicle,Bike,,,


## Location Analysis

~6% of latitude/longitude values for crashes are empty. Also, LOCATION seems to be empty exactly when LATITUDE/LONGITUDE are empty:

In [9]:
# Fraction of missing values in each location-related column, in the order
# (LATITUDE, LONGITUDE, LOCATION).
tuple(
    crash_df_filtered[column].isna().mean()
    for column in ('LATITUDE', 'LONGITUDE', 'LOCATION')
)

(0.05985155675701311, 0.05985155675701311, 0.05985155675701311)

An additional ~0.4% of lat/long values are 0, which we can assume to be an empty/erroneous value:

In [10]:
# Fraction of filtered crashes whose longitude is exactly 0.
crash_df_filtered['LONGITUDE'].eq(0).mean()

0.0036043726731640226

We'll save these separately:

In [11]:
# Keep only crashes whose latitude and longitude are both present and non-zero.
crash_df_loc = crash_df_filtered[
    crash_df_filtered['LATITUDE'].notna()
    & crash_df_filtered['LONGITUDE'].notna()
    & crash_df_filtered['LATITUDE'].ne(0)
    & crash_df_filtered['LONGITUDE'].ne(0)
]

In [12]:
# Complement of crash_df_loc: the filtered crashes whose index labels are not in it.
crash_df_no_loc = crash_df_filtered.loc[~crash_df_filtered.index.isin(crash_df_loc.index)]

Of the entries without a usable location, about 78% have no address.

In [13]:
# Share of the crashes without usable coordinates whose 'OFF STREET NAME' is missing.
crash_df_no_loc['OFF STREET NAME'].isna().mean()

0.7795216741405082

For the ones that do have an address, though, we can try to geocode those addresses into a lat/long:

In [14]:
# Crashes without usable coordinates that still carry a street address we can geocode.
# The column is named 'OFF STREET NAME' (spaces only); the backticks let query()
# reference a column name that contains spaces.
crash_df_address = crash_df_no_loc.query('~`OFF STREET NAME`.isna()')

In [15]:
# Preview the crashes that have usable coordinates.
crash_df_loc

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
1,2021-04-13,21:35,BROOKLYN,11217.0,40.683580,-73.976170,"(40.68358, -73.97617)",,,620 ATLANTIC AVENUE,...,,,,,4407147,Sedan,,,,
43,2021-04-13,20:34,BROOKLYN,11213.0,40.668495,-73.925606,"(40.668495, -73.925606)",EASTERN PARKWAY,BUFFALO AVENUE,,...,,,,,4408259,Sedan,,,,
51,2021-04-15,12:05,,,40.761436,-73.769950,"(40.761436, -73.76995)",BELL BOULEVARD,,,...,,,,,4407636,Station Wagon/Sport Utility Vehicle,,,,
52,2021-04-16,11:00,QUEENS,11368.0,40.749580,-73.865410,"(40.74958, -73.86541)",,,100-10 ROOSEVELT AVENUE,...,Unspecified,,,,4407792,Station Wagon/Sport Utility Vehicle,Bike,,,
62,2021-04-15,20:13,BRONX,10457.0,40.847440,-73.899680,"(40.84744, -73.89968)",EAST TREMONT AVENUE,PARK AVENUE,,...,,,,,4407797,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
430301,2019-01-01,4:15,BROOKLYN,11212.0,40.670240,-73.907005,"(40.67024, -73.907005)",STONE AVENUE,PITKIN AVENUE,,...,,,,,4061446,Sedan,,,,
430516,2019-01-01,21:14,MANHATTAN,10029.0,40.798622,-73.941630,"(40.798622, -73.94163)",EAST 116 STREET,LEXINGTON AVENUE,,...,,,,,4060921,Station Wagon/Sport Utility Vehicle,,,,
430714,2019-01-02,9:15,QUEENS,11354.0,40.761547,-73.831130,"(40.761547, -73.83113)",MAIN STREET,37 AVENUE,,...,,,,,4060805,Bike,,,,
430834,2019-01-01,16:36,BROOKLYN,11217.0,40.683060,-73.973785,"(40.68306, -73.973785)",SOUTH PORTLAND AVENUE,ATLANTIC AVENUE,,...,Unspecified,,,,4060553,Station Wagon/Sport Utility Vehicle,Bike,,,
