In [1]:
# group by node id (since rows are currently node events) and take the .mean() of IS_CRASH to get a dummy danger metric
# associate the node ids with the user translation table to map node ID to tract ID, then associate with demographic data
# linear regression, e.g. percentage Black vs. crash ratio

In [2]:
import pandas as pd

In [3]:
# read the node events, keeping only the two columns used below
node_events_df = pd.read_csv(
    "../../data/unified_dataset.csv",
    usecols=['NODE_ID', 'IS_CRASH'],
)

In [22]:
# check shape: (number of node-event rows, number of columns)
node_events_df.shape

(22594153, 2)

In [23]:
# examine head: preview the first rows of the loaded node events
node_events_df.head()

Unnamed: 0,NODE_ID,IS_CRASH
0,42430108,False
1,42424864,False
2,42423456,False
3,4602414021,False
4,42452015,False


In [24]:
# check columns: confirm only the columns requested via usecols were loaded
node_events_df.columns

Index(['NODE_ID', 'IS_CRASH'], dtype='object')

In [25]:
# create df with node ID and IS_CRASH
# NOTE(review): this assignment only binds a second name to the same DataFrame object
# (no copy, no aggregation), so the preview below still shows the raw per-event rows.
node_ids_crash_avg = node_events_df
node_ids_crash_avg.head()

Unnamed: 0,NODE_ID,IS_CRASH
0,42430108,False
1,42424864,False
2,42423456,False
3,4602414021,False
4,42452015,False


In [27]:
# per-node crash average (dummy danger metric): group the event rows by NODE_ID and take the
# mean of IS_CRASH, i.e. (# crash events / # events) for each node.
# Defined here so the notebook runs top to bottom on a fresh kernel; nothing earlier in the
# notebook creates node_id_crash_df.
node_id_crash_df = node_events_df.groupby('NODE_ID')[['IS_CRASH']].mean()

# examine head
node_id_crash_df.head()

Unnamed: 0_level_0,IS_CRASH
NODE_ID,Unnamed: 1_level_1
42421828,0.0
42421837,0.0
42421877,0.000272
42421889,3.2e-05
42421927,0.0


In [28]:
# count the distinct NODE_IDs once, vectorized; the events table has ~22.6M rows, so building
# a Python set from the column (twice) is needlessly slow
n_unique_node_ids = node_events_df['NODE_ID'].nunique()

# check unique NODE_ID count in the events df matches the number of rows in node_id_crash_df
assert n_unique_node_ids == node_id_crash_df.shape[0], (
    "node_id_crash_df should have exactly one row per distinct NODE_ID"
)

# how many unique NODE_ID in original df
n_unique_node_ids

2101

In [29]:
# load node id to census tract key table (path is relative to the notebook's working directory)
node_id_census_tract = pd.read_csv("../../data/node_id_census_tract_key.csv")

# check shape: (rows, columns) of the key table
node_id_census_tract.shape

(2587, 2)

In [30]:
# examine head: preview the first rows of the node id -> census tract key table
node_id_census_tract.head()

Unnamed: 0,osmid,ct2010
0,42421828,2800
1,42421837,2800
2,42430924,2800
3,42430938,2800
4,42436957,2800


In [31]:
# look at why there are more rows in census
# ASSUMING THESE ARE NODES FOR WHICH WE HAVE NO NODE EVENTS (neither crashes nor citiBike)
# Use a real set difference instead of subtracting the two unique counts: the subtraction only
# equals "census ids not in crash df" when every crash-df node id also appears in the census table.
# node_id_crash_df is indexed by NODE_ID, so its index is the set of node ids that have events.
census_node_ids = pd.Index(node_id_census_tract['osmid']).unique()
census_ids_without_events = census_node_ids.difference(node_id_crash_df.index)
print("number of node ids in census tract df not in crash df: ", len(census_ids_without_events))

node_id_crash_df.columns

number of node ids in census tract df not in crash df:  486


Index(['IS_CRASH'], dtype='object')

In [32]:
# merge on node ids: census tracts and avg crash
# - how='left' keeps every node that has events, whether or not the key table has a tract for it
# - validate='one_to_one' makes pandas raise a MergeError if a node id is duplicated on either
#   side, instead of silently duplicating rows in the result
# - the rename is chained rather than done in place, so the result is built in one expression

# clearer column names for the merged frame
col_names = {'osmid':'NODE_ID','IS_CRASH':'CRASH_AVG','ct2010':'CENSUS_TRACT_ID'}
node_ids__crash_avg_census_tracts = (
    node_id_crash_df
    .merge(
        node_id_census_tract,
        how='left',
        left_index=True,
        right_on='osmid',
        validate='one_to_one',
    )
    .rename(columns=col_names)
)

# check shape
node_ids__crash_avg_census_tracts.shape

(2101, 3)

In [33]:
# examine head: preview the merged crash-average / census-tract rows
node_ids__crash_avg_census_tracts.head()

Unnamed: 0,CRASH_AVG,NODE_ID,CENSUS_TRACT_ID
0,0.0,42421828,2800
1,0.0,42421837,2800
27,0.000272,42421877,6300
52,3.2e-05,42421889,7100
72,0.0,42421927,5502


In [34]:
# rank nodes from highest to lowest crash average to inspect the extremes
node_ids__crash_avg_census_tracts.sort_values('CRASH_AVG', ascending=False)

Unnamed: 0,CRASH_AVG,NODE_ID,CENSUS_TRACT_ID
171,1.0,42431902,900
123,1.0,42429507,700
2578,1.0,370893397,2000
970,1.0,4500012797,5400
557,1.0,7137281959,6200
...,...,...,...
2404,0.0,42456568,7200
1260,0.0,42456303,5900
1259,0.0,42456211,5900
1258,0.0,42456206,5900


In [35]:
# Summarize how the per-node crash average (# CRASH / # EVENTS) is distributed.
crash_avg = node_ids__crash_avg_census_tracts['CRASH_AVG']

print("Nodes with no Crashes: number of nodes (# CRASH / # EVENTS) == 0 : ", (crash_avg == 0).sum())

# Some crash averages equal 1, so count those and the nodes above each candidate cutoff.
# TODO: decide whether nodes above some crash-average cutoff should be removed
# (for example, greater than .001, i.e. more than one crash per 1000 events at the node).
print("number of nodes (# CRASH / # EVENTS) == 1 : ", (crash_avg == 1).sum())
for message, cutoff in [
    ("number of nodes (# CRASH / # EVENTS) > 1 / 10 : ", .1),
    ("number of nodes (# CRASH / # EVENTS) > 1 / 100 : ", .01),
    ("number of nodes (# CRASH / # EVENTS) > 1 / 1000 : ", .001),
]:
    print(message, (crash_avg > cutoff).sum())

Nodes with no Crashes: number of nodes (# CRASH / # EVENTS) == 0 :  1005
number of nodes (# CRASH / # EVENTS) == 1 :  29
number of nodes (# CRASH / # EVENTS) > 1 / 10 :  29
number of nodes (# CRASH / # EVENTS) > 1 / 100 :  36
number of nodes (# CRASH / # EVENTS) > 1 / 1000 :  125


In [36]:
# pull the raw event rows for one example node whose crash average is 1
sample_node_id = 1728266963
node_events_df[node_events_df['NODE_ID'] == sample_node_id]

Unnamed: 0,NODE_ID,IS_CRASH
22591956,1728266963,True
22592078,1728266963,True


In [37]:
# set aside the nodes whose crash average is exactly 1 (to be ignored in the correlation)
is_all_crashes = node_ids__crash_avg_census_tracts['CRASH_AVG'].eq(1)
crash_avg_equals_one = node_ids__crash_avg_census_tracts[is_all_crashes]

In [38]:
# keep only the nodes whose crash average is below 1
is_below_one = node_ids__crash_avg_census_tracts['CRASH_AVG'].lt(1)
node_ids__crash_avg_census_tracts_clean = node_ids__crash_avg_census_tracts[is_below_one]

# (rows, columns) remaining after the filter
node_ids__crash_avg_census_tracts_clean.shape

(2072, 3)

In [39]:
# save csv with node ids, census tracts, and DUMMY DANGER metric
# NOTE(review): `index` is not passed, so pandas also writes the DataFrame index as the first
# CSV column; after the merge that index is presumably just the key table's row labels —
# confirm downstream readers expect (or skip) that column.
node_ids__crash_avg_census_tracts_clean.to_csv(path_or_buf='../../data/DUMMY_CRASH_AVG_FOR_CORRELATION_TESTS_node_id_census_tract_key.csv')