In [192]:
# imports
import pandas as pd
import numpy as np
import osmnx as ox
import matplotlib.pyplot as plt

# display settings: lift pandas' row and column truncation limits
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [193]:
# read the crash data CSV into a DataFrame
crash_data_path = '../../data/crash_data_normalized.csv'
crash_data = pd.read_csv(crash_data_path)

# preview the first five rows
crash_data.head(5)

Unnamed: 0.1,Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,CONTRIBUTING FACTOR VEHICLE 1,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
0,1,2021-04-13,21:35,BROOKLYN,11217.0,40.68358,-73.97617,"(40.68358, -73.97617)",,,620 ATLANTIC AVENUE,1.0,0.0,1,0,0,0,0,0,Unspecified,,,,,4407147,Sedan,,,,
1,43,2021-04-13,20:34,BROOKLYN,11213.0,40.668495,-73.925606,"(40.668495, -73.925606)",EASTERN PARKWAY,BUFFALO AVENUE,,1.0,0.0,1,0,0,0,0,0,Failure to Yield Right-of-Way,,,,,4408259,Sedan,,,,
2,51,2021-04-15,12:05,,,40.761436,-73.76995,"(40.761436, -73.76995)",BELL BOULEVARD,,,1.0,0.0,1,0,0,0,0,0,Driver Inattention/Distraction,,,,,4407636,Station Wagon/Sport Utility Vehicle,,,,
3,52,2021-04-16,11:00,QUEENS,11368.0,40.74958,-73.86541,"(40.74958, -73.86541)",,,100-10 ROOSEVELT AVENUE,1.0,0.0,0,0,1,0,0,0,Turning Improperly,Unspecified,,,,4407792,Station Wagon/Sport Utility Vehicle,Bike,,,
4,62,2021-04-15,20:13,BRONX,10457.0,40.84744,-73.89968,"(40.84744, -73.89968)",EAST TREMONT AVENUE,PARK AVENUE,,1.0,0.0,1,0,0,0,0,0,Driver Inattention/Distraction,,,,,4407797,,,,,


In [194]:
# list the column labels of the loaded crash data
crash_data.columns

Index(['Unnamed: 0', 'CRASH DATE', 'CRASH TIME', 'BOROUGH', 'ZIP CODE',
       'LATITUDE', 'LONGITUDE', 'LOCATION', 'ON STREET NAME',
       'CROSS STREET NAME', 'OFF STREET NAME', 'NUMBER OF PERSONS INJURED',
       'NUMBER OF PERSONS KILLED', 'NUMBER OF PEDESTRIANS INJURED',
       'NUMBER OF PEDESTRIANS KILLED', 'NUMBER OF CYCLIST INJURED',
       'NUMBER OF CYCLIST KILLED', 'NUMBER OF MOTORIST INJURED',
       'NUMBER OF MOTORIST KILLED', 'CONTRIBUTING FACTOR VEHICLE 1',
       'CONTRIBUTING FACTOR VEHICLE 2', 'CONTRIBUTING FACTOR VEHICLE 3',
       'CONTRIBUTING FACTOR VEHICLE 4', 'CONTRIBUTING FACTOR VEHICLE 5',
       'COLLISION_ID', 'VEHICLE TYPE CODE 1', 'VEHICLE TYPE CODE 2',
       'VEHICLE TYPE CODE 3', 'VEHICLE TYPE CODE 4', 'VEHICLE TYPE CODE 5'],
      dtype='object')

In [195]:
# the five vehicle-type columns of the crash data
vehicle_columns = ['VEHICLE TYPE CODE 1', 'VEHICLE TYPE CODE 2', 'VEHICLE TYPE CODE 3', 'VEHICLE TYPE CODE 4', 'VEHICLE TYPE CODE 5']

# every distinct raw value found in any of those columns, gathered in one pass;
# the displayed set is used to spot the different spellings of bike / e-bike
all_vehicle_types = set().union(*(crash_data[col] for col in vehicle_columns))
all_vehicle_types

# bike related value is 'Pedestrian/Bicyclist/Other Pedestrian Error/Confusion'
# NOTE(review): that string looks like a contributing-factor value rather than a
# vehicle type -- confirm which column it belongs to

{"''lime mope",
 '3-Door',
 '4 dr sedan',
 'AMB',
 'AMBU',
 'AMBUL',
 'AMBULANCE',
 'Ambulance',
 'Armored Truck',
 'BICYCLE',
 'BOX',
 'BOX T',
 'BOX TRUCK',
 'BTM',
 'BUs',
 'Backhoe Lo',
 'Beverage Truck',
 'Bike',
 'Box Truck',
 'Bulk Agriculture',
 'Bus',
 'CEMENT TRU',
 'COM',
 'COMME',
 'COMMERCIAL',
 'COURIER VA',
 'Carry All',
 'Cement tru',
 'Chassis Cab',
 'Commercial',
 'Concrete Mixer',
 'Const',
 'Convertible',
 'DELIV',
 'DELIVERY',
 'DELV',
 'DIRT',
 'DIRT BIKE',
 'DIRTBIKE',
 'DODGE RAM',
 'DOLLAR VAN',
 'Dirt Bike',
 'Dirt bike',
 'Dump',
 'E - B',
 'E BIK',
 'E BIKE',
 'E SCO',
 'E bik',
 'E bike',
 'E-BIKE',
 'E-Bik',
 'E-Bike',
 'E-Sco',
 'E-Scooter',
 'E-bik',
 'E-bike',
 'EBIKE',
 'ELEC. UNIC',
 'ELECT',
 'ELECTRIC S',
 'Elect',
 'FDNY',
 'FDNY AMBUL',
 'FDNY Ambul',
 'FDNY FIRE',
 'FIRE ENGIN',
 'FIRE TRUCK',
 'FORK',
 'FORKL',
 'Fire Truck',
 'Flat Bed',
 'Flat Rack',
 'Ford Van',
 'Ford sprin',
 'Fork lift',
 'Forklift',
 'Front-Load',
 'GARBAGE TR',
 'GAS POW

In [196]:
# leaving out 'dirt bike', 'dirtbike', 'Minibike','Minicycle','Dirt Bike','Dirt bike','pedicab','Pedicab'
# look at 'DELIV','DELIVERY', 'DELV'
# raw spellings that are treated as a bicycle or an e-bike
bike_values = ['Bike','BICYCLE','uni e-bike','e-bike','ebike','E - B','E BIK','E BIKE','E bik','E bike','E-BIKE','E-Bik','E-Bike','E-bik','E-bike','EBIKE']

# see how many bike_values involved in crashes
# (vectorized .sum() on the boolean mask instead of iterating the Series in Python)
for col in vehicle_columns:
    for value in bike_values:
        print(col, " ", value, " count: ", (crash_data[col] == value).sum())
       

VEHICLE TYPE CODE 1   Bike  count:  3842
VEHICLE TYPE CODE 1   BICYCLE  count:  1
VEHICLE TYPE CODE 1   uni e-bike  count:  0
VEHICLE TYPE CODE 1   e-bike  count:  0
VEHICLE TYPE CODE 1   ebike  count:  1
VEHICLE TYPE CODE 1   E - B  count:  1
VEHICLE TYPE CODE 1   E BIK  count:  1
VEHICLE TYPE CODE 1   E BIKE  count:  0
VEHICLE TYPE CODE 1   E bik  count:  1
VEHICLE TYPE CODE 1   E bike  count:  1
VEHICLE TYPE CODE 1   E-BIKE  count:  2
VEHICLE TYPE CODE 1   E-Bik  count:  73
VEHICLE TYPE CODE 1   E-Bike  count:  1016
VEHICLE TYPE CODE 1   E-bik  count:  0
VEHICLE TYPE CODE 1   E-bike  count:  1
VEHICLE TYPE CODE 1   EBIKE  count:  0
VEHICLE TYPE CODE 2   Bike  count:  10498
VEHICLE TYPE CODE 2   BICYCLE  count:  0
VEHICLE TYPE CODE 2   uni e-bike  count:  1
VEHICLE TYPE CODE 2   e-bike  count:  1
VEHICLE TYPE CODE 2   ebike  count:  0
VEHICLE TYPE CODE 2   E - B  count:  0
VEHICLE TYPE CODE 2   E BIK  count:  0
VEHICLE TYPE CODE 2   E BIKE  count:  1
VEHICLE TYPE CODE 2   E bik  coun

In [197]:
# collapse every raw spelling onto one of two canonical labels: BICYCLE or EBIKE
bicycle_spellings = ['BICYCLE', 'Bike']
ebike_spellings = ['EBIKE', 'uni e-bike', 'e-bike', 'ebike', 'E - B', 'E BIK', 'E BIKE', 'E bik',
                   'E bike', 'E-BIKE', 'E-Bik', 'E-Bike', 'E-bik', 'E-bike']
bike_value_mapper = {
    **dict.fromkeys(bicycle_spellings, 'BICYCLE'),
    **dict.fromkeys(ebike_spellings, 'EBIKE'),
}

# Series.map with a dict turns every value that is not a key into NaN, so after
# this loop the vehicle-type columns hold only 'BICYCLE', 'EBIKE' or NaN
for col in vehicle_columns:
    remapped = crash_data[col].map(bike_value_mapper)
    crash_data[col] = remapped

crash_data.head(5)

Unnamed: 0.1,Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,CONTRIBUTING FACTOR VEHICLE 1,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
0,1,2021-04-13,21:35,BROOKLYN,11217.0,40.68358,-73.97617,"(40.68358, -73.97617)",,,620 ATLANTIC AVENUE,1.0,0.0,1,0,0,0,0,0,Unspecified,,,,,4407147,,,,,
1,43,2021-04-13,20:34,BROOKLYN,11213.0,40.668495,-73.925606,"(40.668495, -73.925606)",EASTERN PARKWAY,BUFFALO AVENUE,,1.0,0.0,1,0,0,0,0,0,Failure to Yield Right-of-Way,,,,,4408259,,,,,
2,51,2021-04-15,12:05,,,40.761436,-73.76995,"(40.761436, -73.76995)",BELL BOULEVARD,,,1.0,0.0,1,0,0,0,0,0,Driver Inattention/Distraction,,,,,4407636,,,,,
3,52,2021-04-16,11:00,QUEENS,11368.0,40.74958,-73.86541,"(40.74958, -73.86541)",,,100-10 ROOSEVELT AVENUE,1.0,0.0,0,0,1,0,0,0,Turning Improperly,Unspecified,,,,4407792,,BICYCLE,,,
4,62,2021-04-15,20:13,BRONX,10457.0,40.84744,-73.89968,"(40.84744, -73.89968)",EAST TREMONT AVENUE,PARK AVENUE,,1.0,0.0,1,0,0,0,0,0,Driver Inattention/Distraction,,,,,4407797,,,,,


In [198]:
# count the two canonical labels in each vehicle-type column
# (vectorized .sum() on the boolean mask instead of iterating the Series in Python)
for col in vehicle_columns:
    for value in ('BICYCLE', 'EBIKE'):
        print(col, " ", value, " count: ", (crash_data[col] == value).sum())

VEHICLE TYPE CODE 1   BICYCLE  count:  3843
VEHICLE TYPE CODE 1   EBIKE  count:  1097
VEHICLE TYPE CODE 2   BICYCLE  count:  10498
VEHICLE TYPE CODE 2   EBIKE  count:  2060
VEHICLE TYPE CODE 3   BICYCLE  count:  126
VEHICLE TYPE CODE 3   EBIKE  count:  25
VEHICLE TYPE CODE 4   BICYCLE  count:  17
VEHICLE TYPE CODE 4   EBIKE  count:  3
VEHICLE TYPE CODE 5   BICYCLE  count:  3
VEHICLE TYPE CODE 5   EBIKE  count:  0


In [199]:
# list the column labels again before adding derived columns
crash_data.columns

Index(['Unnamed: 0', 'CRASH DATE', 'CRASH TIME', 'BOROUGH', 'ZIP CODE',
       'LATITUDE', 'LONGITUDE', 'LOCATION', 'ON STREET NAME',
       'CROSS STREET NAME', 'OFF STREET NAME', 'NUMBER OF PERSONS INJURED',
       'NUMBER OF PERSONS KILLED', 'NUMBER OF PEDESTRIANS INJURED',
       'NUMBER OF PEDESTRIANS KILLED', 'NUMBER OF CYCLIST INJURED',
       'NUMBER OF CYCLIST KILLED', 'NUMBER OF MOTORIST INJURED',
       'NUMBER OF MOTORIST KILLED', 'CONTRIBUTING FACTOR VEHICLE 1',
       'CONTRIBUTING FACTOR VEHICLE 2', 'CONTRIBUTING FACTOR VEHICLE 3',
       'CONTRIBUTING FACTOR VEHICLE 4', 'CONTRIBUTING FACTOR VEHICLE 5',
       'COLLISION_ID', 'VEHICLE TYPE CODE 1', 'VEHICLE TYPE CODE 2',
       'VEHICLE TYPE CODE 3', 'VEHICLE TYPE CODE 4', 'VEHICLE TYPE CODE 5'],
      dtype='object')

In [200]:
# CRASH DATE is a 'YYYY-MM-DD' string, so its first four characters are the year
crash_data['YEAR'] = crash_data['CRASH DATE'].str[:4]

# yearly totals of cyclist / pedestrian casualties; GroupBy.sum() is used because
# passing the builtin `sum` to agg() is deprecated in recent pandas
casualty_columns = [
    'NUMBER OF CYCLIST INJURED',
    'NUMBER OF CYCLIST KILLED',
    'NUMBER OF PEDESTRIANS INJURED',
    'NUMBER OF PEDESTRIANS KILLED',
]
yearly_num_injured_or_killed = crash_data.groupby('YEAR')[casualty_columns].sum()
yearly_num_injured_or_killed

# num cyclists injured went up from 2019 to 2020 while num cyclist killed, ped injured and ped killed went down
# NOTE(review): 2021 may be a partial year -- confirm the date coverage before comparing it to full years

Unnamed: 0_level_0,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED
YEAR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019,4710,29,10183,123
2020,5265,24,6396,93
2021,3730,12,5266,92


In [201]:
# For each canonical label add one boolean column per vehicle slot (1-5), named
# '<LABEL>_INVOLVED <slot>', plus '<LABEL>_INVOLVED TOTAL'.
for label in ('BICYCLE', 'EBIKE'):
    slot_flags = []
    for slot in range(1, 6):
        flag_column = f'{label}_INVOLVED {slot}'
        crash_data[flag_column] = crash_data[f'VEHICLE TYPE CODE {slot}'] == label
        slot_flags.append(flag_column)
    # TOTAL is an any-slot flag, not a per-crash vehicle count: it is True when at
    # least one slot matches (the same result as chaining '+' on boolean Series,
    # which acts as an element-wise OR)
    crash_data[f'{label}_INVOLVED TOTAL'] = crash_data[slot_flags].any(axis=1)

# summing a boolean column per year counts the crashes where the flag is True
yearly_bike_ebike = crash_data.groupby('YEAR')[['BICYCLE_INVOLVED TOTAL', 'EBIKE_INVOLVED TOTAL']].sum()
yearly_bike_ebike

# num bike involved decreased, num ebike involved increases

Unnamed: 0_level_0,BICYCLE_INVOLVED TOTAL,EBIKE_INVOLVED TOTAL
YEAR,Unnamed: 1_level_1,Unnamed: 2_level_1
2019,5638,241
2020,4889,880
2021,3788,2041
