In [2]:
import pandas as pd
import osmnx as ox

In [80]:
trip_df = pd.read_csv('../../data/trip_data_normalized_2019-01-01_2019-12-31.csv')
crash_df = pd.read_csv('../../data/crash_data_normalized_with_node_graph.csv')
weather_df = pd.read_csv('../../data/weather_data_normalized_2031-01-01_2021-10-31.csv')

So our goal here:

1. Get the trip and crash datasets to have comparable sets of columns so that they can be directly concatenated.
1. Append the weather dataset's columns by joining on the date for each event.

In [63]:
time_df = trip_df['starttime'].str.split(' ', 1, expand=True)

trip_df_norm = trip_df.drop(columns=[
    'TRIP_ID',
    'Unnamed: 0',
    'starttime',
    'stoptime',
    'start station id',
    'start station name',
    'start station latitude',
    'tripduration',
    'start station longitude',
    'end station id',
    'end station name',
    'end station latitude',
    'end station longitude',
    'bikeid',
    'usertype',
    'birth year',
    'gender'
])

trip_df_norm['NUMBER OF PERSONS INJURED'] = trip_df_norm['NUMBER OF PERSONS KILLED'] = 0
trip_df_norm['NUMBER OF PEDESTRIANS INJURED'] = trip_df_norm['NUMBER OF PEDESTRIANS KILLED'] = 0
trip_df_norm['NUMBER OF CYCLIST INJURED'] = trip_df_norm['NUMBER OF CYCLIST KILLED'] = 0
trip_df_norm['NUMBER OF MOTORIST INJURED'] = trip_df_norm['NUMBER OF MOTORIST KILLED'] = 0
trip_df_norm['EVENT_DIST_FROM_NODE'] = 0

trip_df_norm['EVENT_DATE'] = time_df[0]
trip_df_norm['EVENT_TIME'] = time_df[1]

trip_df_norm['IS_CRASH'] = False

trip_df_norm

Unnamed: 0,NODE_ID,NODE_LATITUDE,NODE_LONGITUDE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,EVENT_DIST_FROM_NODE,EVENT_DATE,EVENT_TIME,IS_CRASH
0,42450025,40.775593,-73.956338,0,0,0,0,0,0,0,0,0,2019-10-01,00:07:07.3460,False
1,42439403,40.776235,-73.955870,0,0,0,0,0,0,0,0,0,2019-10-01,00:07:07.3460,False
2,42439406,40.775293,-73.953637,0,0,0,0,0,0,0,0,0,2019-10-01,00:07:07.3460,False
3,42443029,40.774666,-73.954099,0,0,0,0,0,0,0,0,0,2019-10-01,00:07:07.3460,False
4,42438506,40.774021,-73.954570,0,0,0,0,0,0,0,0,0,2019-10-01,00:07:07.3460,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2549395,42429693,40.767495,-73.959317,0,0,0,0,0,0,0,0,0,2019-11-30,23:31:06.3030,False
2549396,7064022528,40.767469,-73.959252,0,0,0,0,0,0,0,0,0,2019-11-30,23:31:06.3030,False
2549397,7064022522,40.766840,-73.959718,0,0,0,0,0,0,0,0,0,2019-11-30,23:31:06.3030,False
2549398,7064022520,40.766209,-73.960149,0,0,0,0,0,0,0,0,0,2019-11-30,23:31:06.3030,False


In [107]:
crash_df_norm = crash_df.drop(columns=[
    'Unnamed: 0',
    'Unnamed: 0.1',
    'LATITUDE',
    'LONGITUDE',
    'LOCATION',
    'ZIP CODE',
    'BOROUGH',
    *[f'VEHICLE TYPE CODE {i+1}' for i in range(5)],
    *[f'CONTRIBUTING FACTOR VEHICLE {i+1}' for i in range(5)],
    'ON STREET NAME',
    'CROSS STREET NAME',
    'OFF STREET NAME',
    'COLLISION_ID'
]).rename(columns={
    'CRASH DATE': 'EVENT_DATE',
    'CRASH TIME': 'EVENT_TIME',
    'NODE_DIST_FROM_CRASH_M': 'EVENT_DIST_FROM_NODE'
}).assign(
    EVENT_DATE=lambda df: pd.to_datetime(df['EVENT_DATE']).dt.date.astype(str),
    EVENT_TIME=lambda df: df['EVENT_TIME'] + ':00.0000',
    IS_CRASH=lambda df: True
)

crash_df_norm

Unnamed: 0,EVENT_DATE,EVENT_TIME,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,NODE_ID,EVENT_DIST_FROM_NODE,NODE_LATITUDE,NODE_LONGITUDE,IS_CRASH
0,2021-04-20,9:22:00.0000,1.0,0.0,1,0,0,0,0,0,42447076,1.914519,40.768476,-73.963601,True
1,2021-04-22,19:16:00.0000,1.0,0.0,0,0,1,0,0,0,42428010,1.160962,40.767541,-73.953249,True
2,2021-04-21,1:05:00.0000,1.0,0.0,1,0,0,0,0,0,42439563,0.972515,40.770119,-73.957398,True
3,2021-04-23,14:12:00.0000,1.0,0.0,0,0,1,0,0,0,42439561,1.315988,40.771053,-73.959651,True
4,2021-07-19,0:20:00.0000,1.0,0.0,1,0,0,0,0,0,42456049,1.798715,40.761488,-73.960589,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1948,2012-07-06,21:20:00.0000,2.0,0.0,2,0,0,0,0,0,42456060,0.492336,40.772341,-73.952682,True
1949,2012-07-06,9:36:00.0000,1.0,0.0,0,0,1,0,0,0,42450015,0.753201,40.774289,-73.957290,True
1950,2012-07-04,14:00:00.0000,1.0,0.0,0,0,1,0,0,0,42438805,1.786375,40.768477,-73.955483,True
1951,2012-07-06,15:45:00.0000,1.0,0.0,1,0,0,0,0,0,42456041,85.591821,40.758981,-73.962432,True


In [108]:
all_event_df = pd.concat((trip_df_norm, crash_df_norm)).reset_index().drop(columns=['index'])

In [109]:
all_event_df

Unnamed: 0,NODE_ID,NODE_LATITUDE,NODE_LONGITUDE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,EVENT_DIST_FROM_NODE,EVENT_DATE,EVENT_TIME,IS_CRASH
0,42450025,40.775593,-73.956338,0.0,0.0,0,0,0,0,0,0,0.000000,2019-10-01,00:07:07.3460,False
1,42439403,40.776235,-73.955870,0.0,0.0,0,0,0,0,0,0,0.000000,2019-10-01,00:07:07.3460,False
2,42439406,40.775293,-73.953637,0.0,0.0,0,0,0,0,0,0,0.000000,2019-10-01,00:07:07.3460,False
3,42443029,40.774666,-73.954099,0.0,0.0,0,0,0,0,0,0,0.000000,2019-10-01,00:07:07.3460,False
4,42438506,40.774021,-73.954570,0.0,0.0,0,0,0,0,0,0,0.000000,2019-10-01,00:07:07.3460,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2551348,42456060,40.772341,-73.952682,2.0,0.0,2,0,0,0,0,0,0.492336,2012-07-06,21:20:00.0000,True
2551349,42450015,40.774289,-73.957290,1.0,0.0,0,0,1,0,0,0,0.753201,2012-07-06,9:36:00.0000,True
2551350,42438805,40.768477,-73.955483,1.0,0.0,0,0,1,0,0,0,1.786375,2012-07-04,14:00:00.0000,True
2551351,42456041,40.758981,-73.962432,1.0,0.0,1,0,0,0,0,0,85.591821,2012-07-06,15:45:00.0000,True


In [116]:
weather_df

Unnamed: 0,DATE,AWND,PGTM,PRCP,SNOW,SNWD,TMAX,TMIN,TOBS,WT01,...,WT09,WT10,WT11,WT13,WT14,WT15,WT16,WT18,WT19,WT22
0,2013-01-01,11.23875,1843.500,0.000741,0.0,0.722222,39.785714,27.571429,33.333333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,2013-01-02,9.70375,1201.750,0.000000,0.0,0.350000,35.500000,22.285714,24.666667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2013-01-03,7.40750,941.250,0.000000,0.0,0.287500,32.714286,22.642857,25.333333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2013-01-04,11.32250,1220.750,0.000000,0.0,0.273333,37.000000,27.071429,33.333333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2013-01-05,8.02625,1126.500,0.000000,0.0,0.178571,41.714286,29.714286,33.333333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3221,2021-10-27,13.67375,848.500,1.602989,0.0,0.000000,62.923077,52.769231,55.800000,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3222,2021-10-28,6.34750,1180.500,0.008101,0.0,0.000000,59.615385,46.769231,47.200000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3223,2021-10-29,13.67250,1912.750,0.043947,0.0,0.000000,57.076923,45.769231,47.200000,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3224,2021-10-30,8.47125,119.750,0.648353,0.0,0.000000,62.461538,51.615385,54.000000,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [110]:
all_event_df_with_weather = pd.merge(all_event_df, weather_df, left_on='EVENT_DATE', right_on='DATE', how='left')

Looks like some of these crashes happened in 2012. We'll drop them:

In [118]:
all_event_df_with_weather.query('DATE.isna()')['EVENT_DATE']

2551224    2012-12-14
2551226    2012-12-17
2551228    2012-12-25
2551229    2012-12-12
2551230    2012-12-24
              ...    
2551348    2012-07-06
2551349    2012-07-06
2551350    2012-07-04
2551351    2012-07-06
2551352    2012-07-09
Name: EVENT_DATE, Length: 123, dtype: object

In [119]:
all_event_df_with_weather.dropna(inplace=True)

In [None]:
all_event_df_with_weather.to_csv('../../data/unified_dataset.csv')