In [46]:
import os
import psycopg2
import pandas as pd
import geopandas as gpd
from shapely.geometry import shape

from config import DB_VISION_ZERO, DB_MOPED

In [47]:
def get_data(query, cursor):
    """
    Get data from database
    """
    cursor.execute(query)
    data = cursor.fetchall()
    field_names = [i[0] for i in cursor.description]
    df = pd.DataFrame(data, columns=field_names)

    return df

conn_vz = psycopg2.connect(
    dbname = DB_VISION_ZERO['dbname'],
    user = DB_VISION_ZERO["user"],
    host = DB_VISION_ZERO["host"],
    password = DB_VISION_ZERO["password"],
    port=5432
)

conn_moped = psycopg2.connect(
    dbname = DB_MOPED["dbname"],
    user = DB_MOPED["user"],
    host = DB_MOPED["host"],
    password = DB_MOPED["password"],
    port = 5432
)

cursor_vz = conn_vz.cursor()
cursor_moped = conn_moped.cursor()

# Moped processing

In [48]:
# Creating moped dataframe
QUERY_MOPED = """SELECT project_id, project_component_id, geometry, 
line_geometry, substantial_completion_date, project_name,
component_name, component_name_full, component_subtype, 
component_work_types, type_name FROM component_arcgis_online_view"""

# Creating moped dataframe
df_moped = get_data(QUERY_MOPED, cursor_moped)

In [49]:
# Data frame info
df_moped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12266 entries, 0 to 12265
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype              
---  ------                       --------------  -----              
 0   project_id                   12266 non-null  int64              
 1   project_component_id         11828 non-null  float64            
 2   geometry                     11828 non-null  object             
 3   line_geometry                11828 non-null  object             
 4   substantial_completion_date  2963 non-null   datetime64[ns, UTC]
 5   project_name                 12266 non-null  object             
 6   component_name               12266 non-null  object             
 7   component_name_full          12266 non-null  object             
 8   component_subtype            8893 non-null   object             
 9   component_work_types         2310 non-null   object             
 10  type_name                    421 non-null    o

In [50]:
# Dropping observations where substantial completion date or line geometry is absent
df_moped_filter = df_moped.dropna(subset=['substantial_completion_date', 'line_geometry'])
df_moped_filter.head()

Unnamed: 0,project_id,project_component_id,geometry,line_geometry,substantial_completion_date,project_name,component_name,component_name_full,component_subtype,component_work_types,type_name
10,12,181.0,"{'type': 'MultiPoint', 'coordinates': [[-97.73...","{'type': 'LineString', 'coordinates': [[-97.73...",2022-10-10 05:00:00+00:00,East 7th Street & East 8th Street / I-35,Signal,Signal - Traffic,Traffic,Modification,Signal - Mod
11,12,183.0,"{'type': 'MultiPoint', 'coordinates': [[-97.73...","{'type': 'LineString', 'coordinates': [[-97.73...",2022-10-10 05:00:00+00:00,East 7th Street & East 8th Street / I-35,Signal,Signal - Traffic,Traffic,Modification,Signal - Mod
12,12,182.0,"{'type': 'MultiPoint', 'coordinates': [[-97.73...","{'type': 'LineString', 'coordinates': [[-97.73...",2022-10-10 05:00:00+00:00,East 7th Street & East 8th Street / I-35,Signal,Signal - Traffic,Traffic,Modification,Signal - Mod
13,12,469.0,"{'type': 'MultiPoint', 'coordinates': [[-97.73...","{'type': 'MultiLineString', 'coordinates': [[[...",2022-10-10 05:00:00+00:00,East 7th Street & East 8th Street / I-35,Intersection,Intersection - Improvement,Improvement,,Signal - Mod
14,12,16.0,"{'type': 'MultiPoint', 'coordinates': [[-97.73...","{'type': 'LineString', 'coordinates': [[-97.73...",2022-10-10 05:00:00+00:00,East 7th Street & East 8th Street / I-35,Signal,Signal - Traffic,Traffic,Modification,Signal - Mod


In [51]:
df_moped_filter.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2779 entries, 10 to 12230
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype              
---  ------                       --------------  -----              
 0   project_id                   2779 non-null   int64              
 1   project_component_id         2779 non-null   float64            
 2   geometry                     2779 non-null   object             
 3   line_geometry                2779 non-null   object             
 4   substantial_completion_date  2779 non-null   datetime64[ns, UTC]
 5   project_name                 2779 non-null   object             
 6   component_name               2779 non-null   object             
 7   component_name_full          2779 non-null   object             
 8   component_subtype            2229 non-null   object             
 9   component_work_types         855 non-null    object             
 10  type_name                    86 non-null     object

In [52]:
# Convert timestamp columns to string
timestamp_columns = ["substantial_completion_date"]

for col in timestamp_columns:
    df_moped_filter.loc[:, col] = df_moped_filter[col].astype(str)

# Apply the geometry transformation
df_moped_filter.loc[:, "geometry"] = df_moped_filter["geometry"].apply(lambda x: shape(x) if x is not None else None)
df_moped_filter.loc[:, "line_geometry"] = df_moped_filter["line_geometry"].apply(lambda x: shape(x) if x is not None else None)

# Create GeoDataFrame
gdf_moped = gpd.GeoDataFrame(df_moped_filter, geometry="geometry")

In [53]:
# Adding a unique ID column
gdf_moped.insert(0, 'moped_component_id', range(1, 1 + len(gdf_moped)))

In [54]:
gdf_moped.head()

Unnamed: 0,moped_component_id,project_id,project_component_id,geometry,line_geometry,substantial_completion_date,project_name,component_name,component_name_full,component_subtype,component_work_types,type_name
10,1,12,181.0,MULTIPOINT (-97.73351 30.26751),"LINESTRING (-97.733436244 30.267508296, -97.73...",2022-10-10 05:00:00+00:00,East 7th Street & East 8th Street / I-35,Signal,Signal - Traffic,Traffic,Modification,Signal - Mod
11,2,12,183.0,MULTIPOINT (-97.73430 30.26772),"LINESTRING (-97.73422624299999 30.267724297, -...",2022-10-10 05:00:00+00:00,East 7th Street & East 8th Street / I-35,Signal,Signal - Traffic,Traffic,Modification,Signal - Mod
12,3,12,182.0,MULTIPOINT (-97.73386 30.26657),"LINESTRING (-97.73378324399999 30.266570296, -...",2022-10-10 05:00:00+00:00,East 7th Street & East 8th Street / I-35,Signal,Signal - Traffic,Traffic,Modification,Signal - Mod
13,4,12,469.0,"MULTIPOINT (-97.73467 30.26683, -97.73431 30.2...",MULTILINESTRING ((-97.73423324300001 30.267730...,2022-10-10 05:00:00+00:00,East 7th Street & East 8th Street / I-35,Intersection,Intersection - Improvement,Improvement,,Signal - Mod
14,5,12,16.0,MULTIPOINT (-97.73467 30.26682),"LINESTRING (-97.734592244 30.266820297, -97.73...",2022-10-10 05:00:00+00:00,East 7th Street & East 8th Street / I-35,Signal,Signal - Traffic,Traffic,Modification,Signal - Mod


# VisionZero processing

In [55]:
# Creaing vision zero dataframe
QUERY_CRASH_DATA = """SELECT crash_id, crash_fatal_fl, crash_date,
road_constr_zone_fl, latitude, longitude, tot_injry_cnt, 
death_cnt, est_comp_cost FROM atd_txdot_crashes"""

df_vz = get_data(QUERY_CRASH_DATA, cursor_vz)

In [56]:
df_vz.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421662 entries, 0 to 421661
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   crash_id             421662 non-null  int64  
 1   crash_fatal_fl       421662 non-null  object 
 2   crash_date           421662 non-null  object 
 3   road_constr_zone_fl  421656 non-null  object 
 4   latitude             389942 non-null  float64
 5   longitude            389942 non-null  float64
 6   tot_injry_cnt        421656 non-null  float64
 7   death_cnt            421657 non-null  float64
 8   est_comp_cost        421656 non-null  object 
dtypes: float64(4), int64(1), object(4)
memory usage: 29.0+ MB


In [57]:
# Keepiing only those observations where x-y coordinates are present
df_vz_filter = df_vz[df_vz['latitude'].notnull() & df_vz['longitude'].notnull()]

In [58]:
df_vz_filter.info()

<class 'pandas.core.frame.DataFrame'>
Index: 389942 entries, 0 to 421661
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   crash_id             389942 non-null  int64  
 1   crash_fatal_fl       389942 non-null  object 
 2   crash_date           389942 non-null  object 
 3   road_constr_zone_fl  389942 non-null  object 
 4   latitude             389942 non-null  float64
 5   longitude            389942 non-null  float64
 6   tot_injry_cnt        389942 non-null  float64
 7   death_cnt            389942 non-null  float64
 8   est_comp_cost        389942 non-null  object 
dtypes: float64(4), int64(1), object(4)
memory usage: 29.8+ MB


In [59]:
# Convert timestamp columns to string
timestamp_columns = ["crash_date"]

for col in timestamp_columns:
    df_vz_filter.loc[:, col] = df_vz_filter[col].astype(str)

In [60]:
# Creating geodataframe
gdf_vz = gpd.GeoDataFrame(df_vz_filter,
                          geometry=gpd.points_from_xy(df_vz_filter.longitude,
                                                      df_vz_filter.latitude),
                                                      crs='EPSG:4326')

gdf_vz.head()

Unnamed: 0,crash_id,crash_fatal_fl,crash_date,road_constr_zone_fl,latitude,longitude,tot_injry_cnt,death_cnt,est_comp_cost,geometry
0,18803974,N,2022-03-09,N,29.957308,-97.877322,1.0,0.0,2366000.0,POINT (-97.87732 29.95731)
1,17777972,N,2020-07-08,N,30.534709,-97.601158,0.0,0.0,153000.0,POINT (-97.60116 30.53471)
2,18802102,N,2022-03-09,N,29.870615,-97.896162,0.0,0.0,153000.0,POINT (-97.89616 29.87061)
3,17038781,N,2019-04-25,N,30.248468,-97.735439,0.0,0.0,204000.0,POINT (-97.73544 30.24847)
4,18827744,N,2022-03-09,N,30.058752,-97.803758,0.0,0.0,153000.0,POINT (-97.80376 30.05875)


# Spatial join

In [61]:
# Creating buffer for joining
gdf_moped = gdf_moped.set_geometry('line_geometry')
gdf_moped.set_crs(epsg=4326, inplace=True)
gdf_moped_proj = gdf_moped.to_crs(epsg=32614)
buffer_distance = 20

gdf_moped_proj = gdf_moped.to_crs(epsg=32614)

In [62]:
gdf_moped_proj['buffered_geometry'] = gdf_moped_proj.geometry.buffer(buffer_distance)
buffered_moped_gdf = gdf_moped_proj.set_geometry('buffered_geometry').to_crs('EPSG:4326')

Buffered geometry results in line strings and multi line strings being turned into polygons

In [63]:
buffered_moped_gdf.head()

Unnamed: 0,moped_component_id,project_id,project_component_id,geometry,line_geometry,substantial_completion_date,project_name,component_name,component_name_full,component_subtype,component_work_types,type_name,buffered_geometry
10,1,12,181.0,MULTIPOINT (-97.73351 30.26751),"LINESTRING (621833.101 3349106.852, 621832.966...",2022-10-10 05:00:00+00:00,East 7th Street & East 8th Street / I-35,Signal,Signal - Traffic,Traffic,Modification,Signal - Mod,"POLYGON ((-97.73323 30.26752, -97.73323 30.267..."
11,2,12,183.0,MULTIPOINT (-97.73430 30.26772),"LINESTRING (621756.837 3349129.944, 621756.702...",2022-10-10 05:00:00+00:00,East 7th Street & East 8th Street / I-35,Signal,Signal - Traffic,Traffic,Modification,Signal - Mod,"POLYGON ((-97.73402 30.26774, -97.73402 30.267..."
12,3,12,182.0,MULTIPOINT (-97.73386 30.26657),"LINESTRING (621800.878 3349002.525, 621800.743...",2022-10-10 05:00:00+00:00,East 7th Street & East 8th Street / I-35,Signal,Signal - Traffic,Traffic,Modification,Signal - Mod,"POLYGON ((-97.73358 30.26659, -97.73358 30.266..."
13,4,12,469.0,"MULTIPOINT (-97.73467 30.26683, -97.73431 30.2...","MULTILINESTRING ((621756.156 3349130.601, 6217...",2022-10-10 05:00:00+00:00,East 7th Street & East 8th Street / I-35,Intersection,Intersection - Improvement,Improvement,,Signal - Mod,"MULTIPOLYGON (((-97.73323 30.26752, -97.73323 ..."
14,5,12,16.0,MULTIPOINT (-97.73467 30.26682),"LINESTRING (621722.743 3349029.365, 621722.609...",2022-10-10 05:00:00+00:00,East 7th Street & East 8th Street / I-35,Signal,Signal - Traffic,Traffic,Modification,Signal - Mod,"POLYGON ((-97.73439 30.26684, -97.73438 30.266..."


In [64]:
# Spatial join
crashes_near_projects = gpd.sjoin(gdf_vz, buffered_moped_gdf, how='inner')

# Creating a unique ID column
crashes_near_projects['crash_project_component_id'] = crashes_near_projects['crash_id'].astype(str) + "-" + crashes_near_projects['project_id'].astype(str) + "-" + crashes_near_projects['project_component_id'].astype(str)

In [65]:
print('Number of unique crashes in merged dataset:', crashes_near_projects['crash_id'].nunique())
print('Number of unique moped component IDs in merged dataset:', crashes_near_projects['moped_component_id'].nunique())

Number of unique crashes in merged dataset: 100426
Number of unique moped component IDs in merged dataset: 2096


In [66]:
crashes_near_projects.info()

<class 'pandas.core.frame.DataFrame'>
Index: 194791 entries, 8 to 420756
Data columns (total 24 columns):
 #   Column                       Non-Null Count   Dtype              
---  ------                       --------------   -----              
 0   crash_id                     194791 non-null  int64              
 1   crash_fatal_fl               194791 non-null  object             
 2   crash_date                   194791 non-null  object             
 3   road_constr_zone_fl          194791 non-null  object             
 4   latitude                     194791 non-null  float64            
 5   longitude                    194791 non-null  float64            
 6   tot_injry_cnt                194791 non-null  float64            
 7   death_cnt                    194791 non-null  float64            
 8   est_comp_cost                194791 non-null  object             
 9   geometry_left                194791 non-null  geometry           
 10  index_right                  194791 n

# Analysis

In [67]:
# Formatting crash date
crashes_near_projects['crash_date'] = pd.to_datetime(crashes_near_projects['crash_date'], errors='coerce').dt.tz_localize('UTC', nonexistent='NaT', ambiguous='NaT').dt.tz_convert('UTC')

In [68]:
crashes_near_projects.info()

<class 'pandas.core.frame.DataFrame'>
Index: 194791 entries, 8 to 420756
Data columns (total 24 columns):
 #   Column                       Non-Null Count   Dtype              
---  ------                       --------------   -----              
 0   crash_id                     194791 non-null  int64              
 1   crash_fatal_fl               194791 non-null  object             
 2   crash_date                   194791 non-null  datetime64[ns, UTC]
 3   road_constr_zone_fl          194791 non-null  object             
 4   latitude                     194791 non-null  float64            
 5   longitude                    194791 non-null  float64            
 6   tot_injry_cnt                194791 non-null  float64            
 7   death_cnt                    194791 non-null  float64            
 8   est_comp_cost                194791 non-null  object             
 9   geometry_left                194791 non-null  geometry           
 10  index_right                  194791 n

In [69]:
# Re-arranging columns
# unique identifier for each observation
crashes_near_projects.insert(0, 'crash_project_component_id', crashes_near_projects.pop('crash_project_component_id'))

# moped_component_id
crashes_near_projects.insert(2, 'moped_component_id', crashes_near_projects.pop('moped_component_id'))

# crash_date
crashes_near_projects.insert(4, 'crash_date', crashes_near_projects.pop('crash_date'))

# project compoenent ID
crashes_near_projects.insert(3, 'project_component_id', crashes_near_projects.pop('project_component_id'))

# Substantial completion date
crashes_near_projects.insert(5, 'substantial_completion_date', crashes_near_projects.pop('substantial_completion_date'))

In [70]:
# Creating a binary version of the fatality column
crashes_near_projects['crash_fatal_binary'] = crashes_near_projects['crash_fatal_fl'].apply(lambda x: 1 if x == "Y" else 0)
crashes_near_projects.pop('crash_fatal_fl')

# Rearranging the crash fatal binary column 
crashes_near_projects.insert(4, 'crash_fatal_binary', crashes_near_projects.pop('crash_fatal_binary'))

In [71]:
crashes_near_projects.head()

Unnamed: 0,crash_project_component_id,crash_id,moped_component_id,project_component_id,crash_fatal_binary,substantial_completion_date,crash_date,road_constr_zone_fl,latitude,longitude,...,index_right,project_id,geometry_right,line_geometry,project_name,component_name,component_name_full,component_subtype,component_work_types,type_name
8,15208414-1723-2381.0,15208414,1101,2381.0,0,2018-05-02 05:00:00+00:00,2016-07-13 00:00:00+00:00,N,30.244775,-97.730522,...,2444,1723,MULTIPOINT (-97.73037 30.24484),"LINESTRING (622162.931 3346597.886, 622162.797...",Lakeshore at Riverside_Intersection Reconstruc...,Intersection,Intersection - Improvement,Improvement,,
12633,18787731-1723-2381.0,18787731,1101,2381.0,0,2018-05-02 05:00:00+00:00,2022-02-18 00:00:00+00:00,N,30.244798,-97.730539,...,2444,1723,MULTIPOINT (-97.73037 30.24484),"LINESTRING (622162.931 3346597.886, 622162.797...",Lakeshore at Riverside_Intersection Reconstruc...,Intersection,Intersection - Improvement,Improvement,,
18334,18745503-1723-2381.0,18745503,1101,2381.0,0,2018-05-02 05:00:00+00:00,2022-02-13 00:00:00+00:00,N,30.244767,-97.730516,...,2444,1723,MULTIPOINT (-97.73037 30.24484),"LINESTRING (622162.931 3346597.886, 622162.797...",Lakeshore at Riverside_Intersection Reconstruc...,Intersection,Intersection - Improvement,Improvement,,
18853,18939703-1723-2381.0,18939703,1101,2381.0,1,2018-05-02 05:00:00+00:00,2022-05-15 00:00:00+00:00,N,30.244775,-97.730522,...,2444,1723,MULTIPOINT (-97.73037 30.24484),"LINESTRING (622162.931 3346597.886, 622162.797...",Lakeshore at Riverside_Intersection Reconstruc...,Intersection,Intersection - Improvement,Improvement,,
26344,13655953-1723-2381.0,13655953,1101,2381.0,0,2018-05-02 05:00:00+00:00,2013-10-26 00:00:00+00:00,N,30.244775,-97.730522,...,2444,1723,MULTIPOINT (-97.73037 30.24484),"LINESTRING (622162.931 3346597.886, 622162.797...",Lakeshore at Riverside_Intersection Reconstruc...,Intersection,Intersection - Improvement,Improvement,,


In [72]:
crashes_near_projects['crash_fatal_binary'].value_counts()

crash_fatal_binary
0    193979
1       812
Name: count, dtype: int64

In [73]:
crashes_near_projects.info()

<class 'pandas.core.frame.DataFrame'>
Index: 194791 entries, 8 to 420756
Data columns (total 24 columns):
 #   Column                       Non-Null Count   Dtype              
---  ------                       --------------   -----              
 0   crash_project_component_id   194791 non-null  object             
 1   crash_id                     194791 non-null  int64              
 2   moped_component_id           194791 non-null  int64              
 3   project_component_id         194791 non-null  float64            
 4   crash_fatal_binary           194791 non-null  int64              
 5   substantial_completion_date  194791 non-null  datetime64[ns, UTC]
 6   crash_date                   194791 non-null  datetime64[ns, UTC]
 7   road_constr_zone_fl          194791 non-null  object             
 8   latitude                     194791 non-null  float64            
 9   longitude                    194791 non-null  float64            
 10  tot_injry_cnt                194791 n

In [74]:
# Creating indicator variables for crash occuring pre and post completion of mobility project
crashes_near_projects.insert(7, 'crash_pre_completion', crashes_near_projects['crash_date'] < crashes_near_projects['substantial_completion_date'])
crashes_near_projects.insert(8, 'crash_post_completion', crashes_near_projects['crash_date'] > crashes_near_projects['substantial_completion_date'])

In [75]:
# Creating time difference variables
crashes_near_projects.insert(9, 'crash_project_date_diff', crashes_near_projects['substantial_completion_date'] - crashes_near_projects['crash_date'])

In [76]:
# Converting estimated comp cost to float format
crashes_near_projects['est_comp_cost'] = crashes_near_projects['est_comp_cost'].map(lambda x: float(x))

crashes_near_projects.info()

<class 'pandas.core.frame.DataFrame'>
Index: 194791 entries, 8 to 420756
Data columns (total 27 columns):
 #   Column                       Non-Null Count   Dtype              
---  ------                       --------------   -----              
 0   crash_project_component_id   194791 non-null  object             
 1   crash_id                     194791 non-null  int64              
 2   moped_component_id           194791 non-null  int64              
 3   project_component_id         194791 non-null  float64            
 4   crash_fatal_binary           194791 non-null  int64              
 5   substantial_completion_date  194791 non-null  datetime64[ns, UTC]
 6   crash_date                   194791 non-null  datetime64[ns, UTC]
 7   crash_pre_completion         194791 non-null  bool               
 8   crash_post_completion        194791 non-null  bool               
 9   crash_project_date_diff      194791 non-null  timedelta64[ns]    
 10  road_constr_zone_fl          194791 n

In [77]:
# Function to calculate duration in years
def calculate_duration(df, date_col1, date_col2):
    duration = (df[date_col2] - df[date_col1]).dt.total_seconds() / (365.25 * 24 * 3600)
    return duration

crashes_near_projects['pre_completion_duration'] = crashes_near_projects['crash_pre_completion'] * calculate_duration(crashes_near_projects, 'crash_date', 'substantial_completion_date')
crashes_near_projects['post_completion_duration'] = crashes_near_projects['crash_post_completion'] * calculate_duration(crashes_near_projects, 'substantial_completion_date', 'crash_date')

pre_completion_stats = crashes_near_projects[crashes_near_projects['crash_pre_completion'] == True].groupby('moped_component_id').agg({
    'crash_id': 'count',
    'pre_completion_duration': 'sum',
    'crash_fatal_binary': 'sum',
    'tot_injry_cnt': 'sum',
    'death_cnt': 'sum',
    'est_comp_cost': 'sum'
}).rename(columns={'crash_id': 'pre_crash_count',
                   'crash_fatal_binary': 'pre_fatal_crash_count',
                   'tot_injry_cnt': 'pre_total_injury_count',
                   'death_cnt': 'pre_total_death_count',
                   'est_comp_cost': 'pre_est_comp_cost'}).reset_index()

post_completion_stats = crashes_near_projects[crashes_near_projects['crash_post_completion'] == True].groupby('moped_component_id').agg({
    'crash_id': 'count',
    'post_completion_duration': 'sum',
    'crash_fatal_binary': 'sum',
    'tot_injry_cnt': 'sum',
    'death_cnt': 'sum',
    'est_comp_cost': 'sum'
}).rename(columns={'crash_id': 'post_crash_count',
                   'crash_fatal_binary': 'post_fatal_crash_count',
                   'tot_injry_cnt': 'post_total_injury_count',
                   'death_cnt': 'post_total_death_count',
                   'est_comp_cost': 'post_est_comp_cost'}).reset_index()


# Merging
annualized_statistics = pre_completion_stats.merge(post_completion_stats, on='moped_component_id', how='outer').fillna(0)

# Calculating annualized statistics
# Crash rate
annualized_statistics['pre_annualized_crash_rate'] = annualized_statistics['pre_crash_count'] / annualized_statistics['pre_completion_duration']
annualized_statistics['post_annualized_crash_rate'] = annualized_statistics['post_crash_count'] / annualized_statistics['post_completion_duration']

# Fatality
annualized_statistics['pre_annualized_fatal_crash_rate'] = annualized_statistics['pre_fatal_crash_count'] / annualized_statistics['pre_completion_duration']
annualized_statistics['post_annualized_fatal_crash_rate'] = annualized_statistics['post_fatal_crash_count'] / annualized_statistics['post_completion_duration']

# Injury count
annualized_statistics['pre_annualized_injury_rate'] = annualized_statistics['pre_total_injury_count'] / annualized_statistics['pre_completion_duration']
annualized_statistics['post_annualized_injury_rate'] = annualized_statistics['post_total_injury_count'] / annualized_statistics['post_completion_duration']

# Death count
annualized_statistics['pre_annualized_death_rate'] = annualized_statistics['pre_total_death_count'] / annualized_statistics['pre_completion_duration']
annualized_statistics['post_annualized_death_rate'] = annualized_statistics['post_total_death_count'] / annualized_statistics['post_completion_duration']

# Estimated cost
annualized_statistics['pre_annualized_cost'] = annualized_statistics['pre_est_comp_cost'] / annualized_statistics['pre_completion_duration']
annualized_statistics['post_annualized_cost'] = annualized_statistics['post_est_comp_cost'] / annualized_statistics['post_completion_duration']

In [78]:
# Getting completion date for each moped component id
completion_dates = crashes_near_projects.groupby('moped_component_id')['substantial_completion_date'].first().reset_index()

# Merging into the annualized crash rate DataFrame
annualized_statistics = annualized_statistics.merge(completion_dates, on='moped_component_id', how='left')

In [79]:
annualized_statistics = annualized_statistics[['moped_component_id',
                                               'substantial_completion_date', 
                                               'pre_annualized_crash_rate', 
                                               'post_annualized_crash_rate',
                                               'pre_annualized_fatal_crash_rate',
                                               'post_annualized_fatal_crash_rate',
                                               'pre_annualized_injury_rate',
                                               'post_annualized_injury_rate',
                                               'pre_annualized_death_rate',
                                               'post_annualized_death_rate',
                                               'pre_annualized_cost',
                                               'post_annualized_cost'
                                               ]]

In [80]:
annualized_statistics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2096 entries, 0 to 2095
Data columns (total 12 columns):
 #   Column                            Non-Null Count  Dtype              
---  ------                            --------------  -----              
 0   moped_component_id                2096 non-null   int64              
 1   substantial_completion_date       2096 non-null   datetime64[ns, UTC]
 2   pre_annualized_crash_rate         1892 non-null   float64            
 3   post_annualized_crash_rate        1472 non-null   float64            
 4   pre_annualized_fatal_crash_rate   1892 non-null   float64            
 5   post_annualized_fatal_crash_rate  1472 non-null   float64            
 6   pre_annualized_injury_rate        1892 non-null   float64            
 7   post_annualized_injury_rate       1472 non-null   float64            
 8   pre_annualized_death_rate         1892 non-null   float64            
 9   post_annualized_death_rate        1472 non-null   float64      

In [81]:
# Creating difference columns between pre and post
annualized_statistics.insert(4, 'delta_crash_rate', annualized_statistics['post_annualized_crash_rate']  - annualized_statistics['pre_annualized_crash_rate'])
annualized_statistics.insert(7, 'delta_fatal_crash_rate', annualized_statistics['post_annualized_fatal_crash_rate']  - annualized_statistics['pre_annualized_fatal_crash_rate'])
annualized_statistics.insert(10, 'delta_injury_rate', annualized_statistics['post_annualized_injury_rate']  - annualized_statistics['pre_annualized_injury_rate'])
annualized_statistics.insert(13, 'delta_death_rate', annualized_statistics['post_annualized_death_rate']  - annualized_statistics['pre_annualized_death_rate'])
annualized_statistics.insert(16, 'delta_comp_cost', annualized_statistics['post_annualized_cost']  - annualized_statistics['pre_annualized_cost'])


In [82]:
crashes_near_projects.info()

<class 'pandas.core.frame.DataFrame'>
Index: 194791 entries, 8 to 420756
Data columns (total 29 columns):
 #   Column                       Non-Null Count   Dtype              
---  ------                       --------------   -----              
 0   crash_project_component_id   194791 non-null  object             
 1   crash_id                     194791 non-null  int64              
 2   moped_component_id           194791 non-null  int64              
 3   project_component_id         194791 non-null  float64            
 4   crash_fatal_binary           194791 non-null  int64              
 5   substantial_completion_date  194791 non-null  datetime64[ns, UTC]
 6   crash_date                   194791 non-null  datetime64[ns, UTC]
 7   crash_pre_completion         194791 non-null  bool               
 8   crash_post_completion        194791 non-null  bool               
 9   crash_project_date_diff      194791 non-null  timedelta64[ns]    
 10  road_constr_zone_fl          194791 n

In [83]:
# Merging additional information such as component name, type, etc.
additional_info = crashes_near_projects[['moped_component_id', 
                                         'component_name', 
                                         'component_name_full', 
                                         'component_subtype',
                                         'component_work_types', 
                                         'type_name',
                                         'line_geometry',
                                         'project_name',
                                         'project_id',
                                         'project_component_id']].drop_duplicates()

annualized_statistics = annualized_statistics.merge(additional_info, on='moped_component_id', how='left')

In [84]:
annualized_statistics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2096 entries, 0 to 2095
Data columns (total 26 columns):
 #   Column                            Non-Null Count  Dtype              
---  ------                            --------------  -----              
 0   moped_component_id                2096 non-null   int64              
 1   substantial_completion_date       2096 non-null   datetime64[ns, UTC]
 2   pre_annualized_crash_rate         1892 non-null   float64            
 3   post_annualized_crash_rate        1472 non-null   float64            
 4   delta_crash_rate                  1268 non-null   float64            
 5   pre_annualized_fatal_crash_rate   1892 non-null   float64            
 6   post_annualized_fatal_crash_rate  1472 non-null   float64            
 7   delta_fatal_crash_rate            1268 non-null   float64            
 8   pre_annualized_injury_rate        1892 non-null   float64            
 9   post_annualized_injury_rate       1472 non-null   float64      

In [85]:
# Reordering
all_columns = annualized_statistics.columns.tolist()

first_column = all_columns[0]
last_six = all_columns[-6:]
new_order = [first_column] + last_six + all_columns[1:-6]
annualized_statistics = annualized_statistics[new_order]

In [86]:
annualized_statistics.to_csv('../Output/annualized_statistics.csv', na_rep="NA", index=False)

In [87]:
annualized_statistics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2096 entries, 0 to 2095
Data columns (total 26 columns):
 #   Column                            Non-Null Count  Dtype              
---  ------                            --------------  -----              
 0   moped_component_id                2096 non-null   int64              
 1   component_work_types              724 non-null    object             
 2   type_name                         69 non-null     object             
 3   line_geometry                     2096 non-null   geometry           
 4   project_name                      2096 non-null   object             
 5   project_id                        2096 non-null   int64              
 6   project_component_id              2096 non-null   float64            
 7   substantial_completion_date       2096 non-null   datetime64[ns, UTC]
 8   pre_annualized_crash_rate         1892 non-null   float64            
 9   post_annualized_crash_rate        1472 non-null   float64      