# Project type level analysis

In [35]:
import psycopg2
import json
import pandas as pd
import geopandas as gpd
from shapely.geometry import shape
from shapely.geometry import LineString, MultiLineString
import matplotlib.pyplot as plt
import re

from config import DB_VISION_ZERO, DB_MOPED
from helper import dict_factory, to_json_list


def get_data(query, cursor):
    """
    Get data from database
    """
    cursor.execute(query)
    data = cursor.fetchall()
    field_names = [i[0] for i in cursor.description]
    df = pd.DataFrame(data, columns=field_names)

    return df

conn_vz = psycopg2.connect(
    dbname = DB_VISION_ZERO['dbname'],
    user = DB_VISION_ZERO["user"],
    host = DB_VISION_ZERO["host"],
    password = DB_VISION_ZERO["password"],
    port=5432
)

conn_moped = psycopg2.connect(
    dbname = DB_MOPED["dbname"],
    user = DB_MOPED["user"],
    host = DB_MOPED["host"],
    password = DB_MOPED["password"],
    port = 5432
)

cursor_vz = conn_vz.cursor()
cursor_moped = conn_moped.cursor()

## Moped data

In [36]:
# Creating moped dataframe
QUERY_MOPED = """SELECT project_id, project_component_id, geometry, 
line_geometry, substantial_completion_date, 
substantial_completion_date_estimated, completion_date, completion_end_date,
component_name FROM component_arcgis_online_view"""

# Creating moped dataframe
df_moped = get_data(QUERY_MOPED, cursor_moped)

# Dropping observations where substantial completion date and component type are missing
df_moped_filter = df_moped.dropna(subset=['substantial_completion_date', 
                                          'component_name',
                                          'line_geometry'])

In [37]:
# Getting project types
project_types_counts = df_moped_filter['component_name'].value_counts()
project_types = project_types_counts.index.tolist()

In [38]:
# Convert timestamp columns to string using .loc
timestamp_columns = ["substantial_completion_date",
                     "substantial_completion_date_estimated", 
                     "completion_date", 
                     "completion_end_date"]

for col in timestamp_columns:
    df_moped_filter.loc[:, col] = df_moped_filter[col].astype(str)

# Geometry transformation using shape
df_moped_filter.loc[:, "geometry"] = df_moped_filter["geometry"].apply(lambda x: shape(x) if x is not None else None)
df_moped_filter.loc[:, "line_geometry"] = df_moped_filter["line_geometry"].apply(lambda x: shape(x) if x is not None else None)

# Create GeoDataFrame
gdf_moped = gpd.GeoDataFrame(df_moped_filter, geometry="geometry")

In [39]:
# Adding a unique ID column to the moped data frame
gdf_moped.insert(0, 'moped_component_id', range(1, 1 + len(gdf_moped)))

## VisionZero

In [40]:
# Creaing vision zero dataframe
QUERY_CRASH_DATA = """SELECT crash_id, crash_fatal_fl, crash_date, road_constr_zone_fl, latitude, longitude, tot_injry_cnt, death_cnt, est_comp_cost 
FROM atd_txdot_crashes"""

df_vz = get_data(QUERY_CRASH_DATA, cursor_vz)

# Keepiing only those observations where x-y coordinates are present
df_vz_filter = df_vz[df_vz['latitude'].notnull() & df_vz['longitude'].notnull()]

# Convert timestamp columns to string using .loc
timestamp_columns = ["crash_date"]

for col in timestamp_columns:
    df_vz_filter.loc[:, col] = df_vz_filter[col].astype(str)


# Creating geodataframe
gdf_vz = gpd.GeoDataFrame(df_vz_filter,
                          geometry=gpd.points_from_xy(df_vz_filter.longitude,
                                                      df_vz_filter.latitude),
                                                      crs='EPSG:4326')

## Spatial join

In [41]:
# Creating buffer for joining
gdf_moped = gdf_moped.set_geometry('line_geometry')
gdf_moped.set_crs(epsg=4326, inplace=True)
gdf_moped_proj = gdf_moped.to_crs(epsg=32614)
buffer_distance = 20

gdf_moped_proj = gdf_moped.to_crs(epsg=32614)

gdf_moped_proj['buffered_geometry'] = gdf_moped_proj.geometry.buffer(buffer_distance)
buffered_moped_gdf = gdf_moped_proj.set_geometry('buffered_geometry').to_crs('EPSG:4326')

In [42]:
# Creating dataframes for each project type
project_dataframes = {}

# Loop over each category and save the subset DataFrame to a CSV file
for project in project_types:
    # Subset the DataFrame based on the current category
    subset_df = df_moped[df_moped['component_name'] == project]
    
    # Sanitize the category name for use in a variable name
    sanitized_project = re.sub(r'[^a-zA-Z0-9]', '_', project)
    
    # Store the subset DataFrame in the dictionary
    project_dataframes[sanitized_project] = subset_df

In [43]:
project_dataframes

{'Bike_Lane':        project_id  project_component_id  \
 26             22               10465.0   
 35             22               10463.0   
 294           246               11526.0   
 370           278               11605.0   
 375           279               11525.0   
 ...           ...                   ...   
 12026        3485               12304.0   
 12029        3485               12302.0   
 12034        3487               12312.0   
 12104        3525               12414.0   
 12142        3536               12525.0   
 
                                                 geometry  \
 26     {'type': 'MultiLineString', 'coordinates': [[[...   
 35     {'type': 'MultiLineString', 'coordinates': [[[...   
 294    {'type': 'MultiLineString', 'coordinates': [[[...   
 370    {'type': 'MultiLineString', 'coordinates': [[[...   
 375    {'type': 'MultiLineString', 'coordinates': [[[...   
 ...                                                  ...   
 12026  {'type': 'MultiLineStr

In [None]:
# Perform the spatial join
crashes_near_projects = gpd.sjoin(gdf_vz, buffered_moped_gdf, how='inner')

# Creating a unique ID column
crashes_near_projects['crash_project_component_id'] = crashes_near_projects['crash_id'].astype(str) + "-" + crashes_near_projects['project_id'].astype(str) + "-" + crashes_near_projects['project_component_id'].astype(str)