Join the crash dataset to the street centerline dataset. 

In [1]:
import warnings
# Suppress every warning for the rest of the notebook.
# NOTE(review): this also hides pandas/geopandas deprecation and
# chained-assignment warnings raised by later cells — consider narrowing it.
warnings.filterwarnings('ignore')

from tqdm import tqdm
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from shapely.geometry import Point

# Display settings: show up to 100 rows and up to 200 characters per column
# when a frame is rendered as a cell output.
pd.set_option('display.max_rows', 100)
pd.set_option('max_colwidth', 200)

In [2]:
# EPSG code both layers are reprojected to before the distance-based matching;
# the 35-unit search offset used later is commented as feet, so this CRS is
# presumably a feet-based Colorado state-plane system — TODO confirm.
denver_crs = 3502
# EPSG code for plain longitude/latitude coordinates (the crash points are
# built from geo_lon / geo_lat with this same code).
web_crs = 4326

In [3]:
# Project-local helper that supplies the crash table.
# Later cells read geo_lon, geo_lat, reported_date, incident_address and
# incident_id from this frame.
# NOTE(review): the pivot table further down aggregates a column named 'one'
# that no cell in this notebook creates — presumably crash_dataframe() adds it;
# verify.
from crash_data_analysis import CrashDataAnalysis
cda = CrashDataAnalysis()
df = cda.crash_dataframe()

In [4]:
# Keep only crashes that can be placed on the map: both coordinates must be
# present (checking longitude alone lets rows with a missing latitude through
# and they become invalid points in the next cell).
# drop=True renumbers the rows 0..n-1 without adding the old labels as an
# 'index' column, which also makes this cell safe to re-run.
df = (
    df
    .dropna(subset=['geo_lon', 'geo_lat'])
    .reset_index(drop=True)
)

In [5]:
# Turn each crash into a point (x = longitude, y = latitude) and attach the
# points to the crash table as its geometry, declared in lon/lat coordinates.
crash_points = gpd.points_from_xy(x=df['geo_lon'], y=df['geo_lat'])
crashes = gpd.GeoDataFrame(df, geometry=crash_points, crs=web_crs)

In [6]:
# Restrict the analysis to crashes reported in August 2022.
# (Quick-test alternative: crashes = crashes.tail(100).copy())
#
# .copy() makes the subset an independent frame, so the column assignments in
# the following cells write to it directly instead of to a slice of the full
# crash table.
reported_in_period = crashes['reported_date'].dt.strftime('%Y-%m') == '2022-08'
crashes = crashes[reported_in_period].copy()

In [7]:
# Split "STREET A / STREET B" style addresses into their first two street names.
# Splitting into lists and picking positions 0 and 1 works for any number of
# '/' separators: an address without '/' gets a missing street_b, and any third
# or later part is ignored. (Assigning an expand=True split to exactly two
# columns raises as soon as the data yields one column or more than two.)
address_parts = crashes['incident_address'].str.split('/')
crashes['street_a'] = address_parts.str[0].str.strip()
crashes['street_b'] = address_parts.str[1].str.strip()

In [8]:
# Google Maps search URL for each crash location ("lat,lon" query), used to
# eyeball individual crashes when reviewing mismatches.
crashes['link'] = [
    f'https://www.google.com/maps/search/?api=1&query={lat},{lon}'
    for lat, lon in zip(crashes['geo_lat'], crashes['geo_lon'])
]

In [9]:
# Load the street centerline layer from the local shapefile.
centerline_shapefile = 'shapefiles/street_centerline/street_centerline.shp'
centerlines = gpd.read_file(centerline_shapefile)

In [10]:
# Drop segments named 'PRIVATE RD' (segments with a missing name are kept).
# The rows are renumbered 0..n-1 because the spatial-index lookup further down
# returns row positions for this layer. drop=True avoids adding the old labels
# as an 'index' column and keeps the cell safe to re-run.
is_private_road = centerlines['FULLNAME'] == 'PRIVATE RD'
centerlines = centerlines[~is_private_road].reset_index(drop=True)

In [11]:
# Put both layers in the same projected CRS so that offsets and bounding
# boxes are expressed in the same linear units.
centerlines, crashes = (
    layer.to_crs(denver_crs) for layer in (centerlines, crashes)
)

In [12]:
# Search window around every crash point: the point's bounds
# (minx, miny, maxx, maxy) grown by `offset` on each side.
offset = 35  # feet
bbox_padding = [-offset, -offset, offset, offset]
bbox = crashes.bounds + bbox_padding

In [13]:
# For every crash window, list the row positions of the centerline segments
# that the spatial index returns for that window.
# NOTE(review): this looks like a bounding-box test rather than a true distance
# test, so long diagonal segments may be returned for crashes that are not
# within `offset` of the street itself — confirm this is acceptable.
centerline_sindex = centerlines.sindex
hits = bbox.apply(
    lambda window: list(centerline_sindex.intersection(window)),
    axis=1,
)

In [14]:
# Build one (incident_id, MASTERID) pair for every crash / nearby-segment hit.
#
# * `hits` is indexed by crash label and holds row *positions* in `centerlines`
#   (that is what the spatial index returns), so segment IDs are looked up
#   positionally rather than by label.
# * Only the two ID columns are touched; materialising a full row for every
#   pair is unnecessary work.
# * Series.items() replaces iteritems(), which was removed in pandas 2.0.
crash_ids = crashes['incident_id']
street_ids = centerlines['MASTERID'].to_numpy()

crashes_streets_list = [
    [crash_ids.at[crash_index], street_ids[street_position]]
    for crash_index, street_positions in tqdm(hits.items(), total=len(hits))
    for street_position in street_positions
]

crashes_streets = pd.DataFrame(crashes_streets_list, columns=['incident_id', 'MASTERID'])

100%|█████████████████████████████████████████████████████████████████████████████| 1557/1557 [00:04<00:00, 350.57it/s]


In [15]:
# for street in hits[212222]:
#     print('{}: {} from {} to {}'.format(
#         centerlines.loc[street].MASTERID
#         , centerlines.loc[street].FULLNAME
#         , centerlines.loc[street].FROMNAME
#         , centerlines.loc[street].TONAME
#     ))

In [16]:
# Attach the full crash record and the full centerline record to every
# (incident_id, MASTERID) pair. Inner joins keep only pairs found on both sides.
crashes_streets = (
    crashes_streets
    .merge(crashes, how='inner', on='incident_id')
    .merge(centerlines, how='inner', on='MASTERID')
)

In [17]:
# Rewrites applied to the reported crash address so that its street names use
# the spelling found in the centerline layer's FULLNAME column.
# Keys are applied as regular-expression patterns (regex=True below), so each
# key is replaced wherever it occurs inside the address, not only when the
# whole address equals it.
# NOTE(review): because of that, a short key can also match inside an address
# that already uses the long form (e.g. 'S MONACO ST' inside
# 'S MONACO STREET PKWY', 'W 6TH AVE' inside 'W 6TH AVENUE FWY') — confirm such
# addresses do not occur, or anchor the patterns.
# NOTE(review): 'E MLK BLVD' maps to the value that is itself the next key;
# whether the two rewrites chain may depend on the pandas version — verify.
incident_mapping = {
    # misspelling
    'N BIGHTON BLVD': 'N BRIGHTON BLVD',
    # freeways
    'I25 HWYNB': 'INTERSTATE 25',
    'I25 HWYSB': 'INTERSTATE 25',
    'I25HWY': 'INTERSTATE 25',
    'I70 HWYWB': 'INTERSTATE 70',
    'I70 HWYEB': 'INTERSTATE 70',
    'I225 HWYNB': 'INTERSTATE 225',
    'I225 HWYSB': 'INTERSTATE 225',
    'W 6TH AVE': 'W 6TH AVENUE FWY',
    # surface streets whose reported name differs from the centerline name
    'PARK AVEW': 'PARK AVE W',
    'S MONACO ST': 'S MONACO STREET PKWY',
    'N MONACO ST': 'N MONACO STREET PKWY',
    'E MLK BLVD': 'E MARTIN LUTHER KING BLVD',
    'E MARTIN LUTHER KING BLVD': 'E MARTIN LUTHER KING JR BLVD',
    'E GVR BLVD': 'GREEN VALLEY RANCH BLVD',
}

reported_address = crashes_streets['incident_address']
crashes_streets['incident_address_corrected'] = reported_address.replace(
    to_replace=incident_mapping, regex=True
)

In [18]:
def _address_names_street(full_name, address):
    """Return True when `full_name` occurs verbatim inside `address`.

    Either value being missing or not a string yields False instead of the
    TypeError that a bare `in` test raises for NaN/None.
    """
    return (
        isinstance(full_name, str)
        and isinstance(address, str)
        and full_name in address
    )


# Flag the pairs whose centerline street name is mentioned in the corrected
# crash address. Iterating over the two columns directly avoids building a
# Series for every row, and the explicit bool dtype keeps the column usable
# with `~` even when the frame is empty.
crashes_streets['street_named_in_crash'] = pd.Series(
    [
        _address_names_street(full_name, address)
        for full_name, address in zip(
            crashes_streets['FULLNAME'],
            crashes_streets['incident_address_corrected'],
        )
    ],
    index=crashes_streets.index,
    dtype=bool,
)

In [19]:
# Centerline names treated as freeways; a crash whose corrected address
# mentions any of them is flagged so it can be excluded from street rankings.
freeways = [
    'INTERSTATE 25',
    'INTERSTATE 70',
    'INTERSTATE 225',
    'W 6TH AVENUE FWY',
]

corrected_address = crashes_streets['incident_address_corrected']
at_freeway = pd.Series(False, index=crashes_streets.index, dtype=bool)

for freeway in freeways:
    # regex=False: the names are literal text, not patterns.
    # na=False: a missing address counts as "not at a freeway" instead of
    # yielding NaN, which cannot be used as a boolean mask.
    at_freeway |= corrected_address.str.contains(freeway, regex=False, na=False)

crashes_streets['at_freeway'] = at_freeway

In [20]:
# Pairs where the nearby street is NOT mentioned in the crash address, kept
# with just the columns needed to review them by hand.
review_columns = ['incident_address_corrected', 'FULLNAME', 'link']
street_not_named = ~crashes_streets['street_named_in_crash']
bad_mapping = crashes_streets.loc[street_not_named, review_columns]

In [21]:
# Number of unmatched pairs per centerline street, largest first; streets at
# the top are candidates for a new entry in the address mapping.
mismatches_per_street = bad_mapping.groupby('FULLNAME').size()
most_common = mismatches_per_street.sort_values(ascending=False)
# most_common.head(20)

In [22]:
# bad_mapping[bad_mapping.FULLNAME == 'E MARTIN LUTHER KING JR BLVD']

In [23]:
# Cross-tabulate the crash/street pairs: was the street named in the address
# (rows) versus is the crash at a freeway (columns), with totals.
# aggfunc is given by name: passing the builtin `sum` is deprecated in recent
# pandas releases, while the string form works across versions.
# NOTE(review): relies on a column 'one' that is not created in this notebook —
# presumably a constant 1 supplied with the crash data; verify.
pd.pivot_table(
    crashes_streets,
    values='one',
    index='street_named_in_crash',
    columns='at_freeway',
    aggfunc='sum',
    margins=True,
)

at_freeway,False,True,All
street_named_in_crash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,507,287,794
True,3379,1194,4573
All,3886,1481,5367


In [24]:
# crashes_streets.to_csv('crashes_streets.csv', index=False)

## Analysis

In [25]:
# Distinct crashes per street, counting only pairs where the street is named
# in the crash address and the crash is not at a freeway.
DAYS_IN_PERIOD = 31  # the crash data was filtered to August 2022 above
DAYS_PER_WEEK = 7

named_non_freeway = (
    crashes_streets['street_named_in_crash'] & ~crashes_streets['at_freeway']
)

street_counts = (
    crashes_streets[named_non_freeway]
    .groupby('FULLNAME')
    .agg(num_crashes=('incident_id', 'nunique'))
)

# Convert the count for the period into an average weekly rate.
street_counts['crashes_per_week'] = (
    (street_counts['num_crashes'] / DAYS_IN_PERIOD) * DAYS_PER_WEEK
)

In [26]:
# Total LEN_MI per street name, summed over all of its centerline segments.
street_miles = centerlines.groupby('FULLNAME')['LEN_MI'].sum().reset_index()

In [27]:
# Combine crash counts with street length; only streets present in both
# tables survive the inner join.
street_counts_miles = street_counts.merge(street_miles, how='inner', on='FULLNAME')

In [28]:
# Normalise the weekly crash rate by street length so long and short streets
# can be compared.
weekly_rate = street_counts_miles['crashes_per_week']
street_length = street_counts_miles['LEN_MI']
street_counts_miles['crashes_per_week_per_mile'] = weekly_rate.div(street_length)

In [29]:
# Keep only streets averaging more than two crashes per week, as an
# independent copy that later cells can add columns to.
over_two_per_week = street_counts_miles['crashes_per_week'].gt(2)
most_crashes = street_counts_miles.loc[over_two_per_week].copy()

In [30]:
# Rank 1 = highest crashes per week per mile.
per_mile_rate = most_crashes['crashes_per_week_per_mile']
most_crashes['street_rank'] = per_mile_rate.rank(ascending=False)

In [31]:
# Display the shortlisted streets, worst per-mile rate first.
most_crashes.sort_values('crashes_per_week_per_mile', ascending=False)

Unnamed: 0,FULLNAME,num_crashes,crashes_per_week,LEN_MI,crashes_per_week_per_mile,street_rank
146,MARKET ST,10,2.258065,0.920408,2.453331,1.0
7,20TH ST,11,2.483871,1.598246,1.554124,2.0
416,W ALAMEDA AVE,27,6.096774,4.027947,1.513618,3.0
304,PARK AVE W,16,3.612903,2.461982,1.467477,4.0
95,E COLFAX AVE,35,7.903226,5.544884,1.425319,5.0
2,15TH ST,11,2.483871,1.748134,1.42087,6.0
21,CALIFORNIA ST,9,2.032258,1.595103,1.274061,7.0
144,LEETSDALE DR,13,2.935484,2.341576,1.253636,8.0
158,N BROADWAY,24,5.419355,4.509889,1.20166,9.0
23,CHAMPA ST,10,2.258065,1.998296,1.129995,10.0


In [32]:
# centerlines[centerlines['FULLNAME'] == 'N SPEER BLVD'].to_clipboard()

In [33]:
# Alphabetical list of the shortlisted street names.
sorted(most_crashes['FULLNAME'])

['15TH ST',
 '20TH ST',
 'CALIFORNIA ST',
 'CHAMPA ST',
 'E 12TH AVE',
 'E 13TH AVE',
 'E 14TH AVE',
 'E 16TH AVE',
 'E 17TH AVE',
 'E 18TH AVE',
 'E 40TH AVE',
 'E 47TH AVE',
 'E 56TH AVE',
 'E 8TH AVE',
 'E ALAMEDA AVE',
 'E COLFAX AVE',
 'E EVANS AVE',
 'E HAMPDEN AVE',
 'E LOWRY BLVD',
 'E MARTIN LUTHER KING BLVD',
 'GREEN VALLEY RANCH BLVD',
 'LEETSDALE DR',
 'MARKET ST',
 'N BROADWAY',
 'N CENTRAL PARK BLVD',
 'N COLORADO BLVD',
 'N DOWNING ST',
 'N FEDERAL BLVD',
 'N HAVANA ST',
 'N LINCOLN ST',
 'N MONACO STREET PKWY',
 'N PEORIA ST',
 'N QUEBEC ST',
 'N SHERIDAN BLVD',
 'N SPEER BLVD',
 'N TOWER RD',
 'N YORK ST',
 'PARK AVE W',
 'S COLORADO BLVD',
 'S FEDERAL BLVD',
 'S MONACO STREET PKWY',
 'S QUEBEC ST',
 'S SANTA FE DR',
 'S UNIVERSITY BLVD',
 'W 8TH AVE',
 'W ALAMEDA AVE',
 'W COLFAX AVE',
 'W FLORIDA AVE']

In [34]:
# crashes.groupby('street_a').size().sort_values(ascending=False)