In [None]:
from pathlib import Path

import contextily as ctx
import geopandas as gpd
import matplotlib.animation as animation
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from linearmodels import PanelOLS
from shapely.geometry import LineString
from shapely.ops import unary_union

# Input locations. Every constant below is a placeholder that is later handed
# to pd.read_csv, so each one has to be pointed at the matching CSV file.

# Cleaned Bay Wheels trip data.
BAYWHEELS = Path("./DATA")
# SFMTA Muni ridership data.
RIDERSHIP = Path("./DATA")

# SFMTA GTFS tables: routes, trips and stops.
GTFS_ROUTES = Path("./DATA")
GTFS_TRIPS = Path("./DATA")
GTFS_STOPS = Path("./DATA")

# SFMTA GTFS tables: stop times and shapes.
# NOTE(review): these two placeholders are relative to the parent directory
# ("../DATA") while the ones above use "./DATA" -- confirm which is intended.
GTFS_STOP_TIMES = Path("../DATA")
GTFS_SHAPES = Path("../DATA")

In [None]:
def _load_csv(path):
    """Read one input CSV using pandas' pyarrow parsing engine."""
    return pd.read_csv(path, engine = 'pyarrow')


# Bike-share trips and Muni ridership.
baywheels_sf = _load_csv(BAYWHEELS)
muni = _load_csv(RIDERSHIP)

# GTFS feed tables.
routes = _load_csv(GTFS_ROUTES)
trips = _load_csv(GTFS_TRIPS)
stops = _load_csv(GTFS_STOPS)
stop_times = _load_csv(GTFS_STOP_TIMES)
shapes = _load_csv(GTFS_SHAPES)

In [None]:
# Keep trips that do not end at station 'SF-Y7' and whose start timestamp
# compares greater than '2019-06-01'.
_keep_trip = (baywheels_sf['end_station_id'] != 'SF-Y7') & (baywheels_sf['started_at'] > '2019-06-01')
baywheels_sf = baywheels_sf[_keep_trip].copy()

# One row per end station: the earliest-ending trip supplies the station's
# coordinates and its first-appearance date (time of day stripped).
bikeshare_stations = (
    baywheels_sf
    .sort_values('ended_at')
    .drop_duplicates(subset = ['end_station_id'], keep = 'first')
    .loc[:, ['end_station_id', 'end_lat', 'end_lng', 'ended_at']]
    .rename(columns = {'ended_at': 'first_appeared_at'})
    .assign(first_appeared_at = lambda stations: pd.to_datetime(stations['first_appeared_at']).dt.normalize())
)

In [None]:
# Service categories retained as bus service.
# (The constant's spelling is kept as-is because it is a notebook-global name.)
BUS_SERIVCE_CATEGORIES = ['Frequent Local', 'Grid', 'Rapid Bus', 'Connector']

# Parse "<month name> <year>" labels into timestamps and drop rows that have
# no boardings figure.
muni = (
    muni
    .assign(Month = lambda m: pd.to_datetime(m['Month'], format = '%B %Y').dt.normalize())
    .dropna(subset = ['Average Daily Boardings'])
)

# Boardings arrive as text with thousands separators; strip them and cast.
muni = muni.assign(**{
    'Average Daily Boardings': lambda m: m['Average Daily Boardings'].str.replace(',', '').astype('int64')
})

# Bus categories only, with upper-cased route labels.
_is_bus = muni['Service Category'].isin(BUS_SERIVCE_CATEGORIES)
muni = muni[_is_bus].assign(Route = lambda m: m['Route'].str.upper())

# Weekday rows only, reduced to the three columns used downstream.
_is_weekday = muni['Service Day of the Week'] == 'Weekday'
muni = muni.loc[_is_weekday, ['Month', 'Route', 'Average Daily Boardings']].copy()

In [None]:
# Narrow the GTFS tables to routes with route_type 3, then to the trips on
# those routes and the stop times on those trips.
bus_routes = routes.loc[routes['route_type'] == 3].copy()
bus_trips = trips.loc[trips['route_id'].isin(bus_routes['route_id'])].copy()
bus_stop_times = stop_times.loc[stop_times['trip_id'].isin(bus_trips['trip_id'])].copy()

# Attach stop names/coordinates to stop times, and route names to trips.
_stop_attributes = stops[['stop_id', 'stop_name', 'stop_lat', 'stop_lon']]
bus_stop_times = bus_stop_times.merge(_stop_attributes, on = 'stop_id', how = 'left')

_route_names = bus_routes[['route_id', 'route_short_name', 'route_long_name']]
bus_trips = bus_trips.merge(_route_names, on = 'route_id', how = 'left')

# No `on=` here: pandas joins on whichever columns the two frames share.
_trip_attributes = bus_trips[['trip_id', 'route_id', 'route_short_name', 'route_long_name', 'direction_id', 'trip_headsign']]
bus_route_stops = bus_stop_times.merge(_trip_attributes)

# One row per (route, direction, stop), keeping the lowest stop_sequence.
keep_route_stop_columns = ['stop_id', 'direction_id', 'stop_sequence', 'stop_name', 'route_short_name', 'route_long_name', 'stop_lat', 'stop_lon']
route_stops = (
    bus_route_stops
    .sort_values(['route_id', 'direction_id', 'stop_sequence'])
    .drop_duplicates(['route_id', 'direction_id', 'stop_id'])
    .loc[:, keep_route_stop_columns]
)

# Only direction_id 1 is carried forward.
route_stops = route_stops.loc[route_stops['direction_id'] == 1].copy()

In [None]:
def _shape_points_to_line(points):
    """Build a LineString from one shape's (lon, lat) rows, in their current row order."""
    return LineString(zip(points['shape_pt_lon'], points['shape_pt_lat']))


# Points must be in sequence order before they are strung into lines.
shapes = shapes.sort_values(['shape_id', 'shape_pt_sequence'])

bus_shapes = (
    shapes
    .groupby('shape_id')[['shape_pt_lon', 'shape_pt_lat']]
    .apply(_shape_points_to_line)
    .reset_index(name = 'geometry')
)
bus_shapes_gdf = gpd.GeoDataFrame(bus_shapes, geometry = 'geometry', crs = 'EPSG:4326')

# Attach trip/route attributes using one (arbitrary first) trip per shape.
_one_trip_per_shape = bus_trips.drop_duplicates('shape_id')
bus_shape_routes = bus_shapes_gdf.merge(_one_trip_per_shape, on = 'shape_id', how = 'left')

# Keep direction 1 shapes that matched a bus trip.
_keep_shape = (bus_shape_routes['direction_id'] == 1.0) & bus_shape_routes['route_id'].notna()
bus_shape_routes = bus_shape_routes[_keep_shape].copy()

In [None]:
# HOW IS THE TREATMENT DEFINED? (GIF)
# Static part of the animation: the target bus route, its 400 m treatment zone,
# a basemap, and two empty scatter layers that the per-frame callback fills in.
bus_df = bus_shape_routes
bike_df = bikeshare_stations

# One frame per month-end between the first and the last station appearance.
monthly_dates = pd.date_range(start = bike_df['first_appeared_at'].min(), end = bike_df['first_appeared_at'].max(), freq = 'ME')

# CRS string written without the stray space ('EPSG: 4326') so it matches the
# form used everywhere else in the notebook.
bus_gdf = gpd.GeoDataFrame(bus_df, geometry = 'geometry', crs = 'EPSG:4326')
bus_gdf_3857 = bus_gdf.to_crs(epsg = 3857)

bike_gdf = gpd.GeoDataFrame(bike_df, geometry = gpd.points_from_xy(bike_df.end_lng, bike_df.end_lat), crs = 'EPSG:4326')
bike_gdf_3857 = bike_gdf.to_crs(epsg = 3857)
bike_gdf_3857['x_3857'] = bike_gdf_3857.geometry.x
bike_gdf_3857['y_3857'] = bike_gdf_3857.geometry.y

target_route_name = '18'
target_route = bus_gdf_3857[bus_gdf_3857['route_short_name'] == target_route_name]
if target_route.empty:
    raise ValueError(f"No shape found for route {target_route_name!r}; cannot draw the treatment zone")

# Buffer in a metric CRS (WGS 84 / UTM zone 10N) and only then reproject to
# Web Mercator for plotting. Web Mercator units are stretched by 1/cos(latitude),
# so buffering 400 units directly in EPSG:3857 would give a noticeably smaller
# radius on the ground at San Francisco's latitude.
buffer_crs = 'EPSG:32610'
buffer_radius_m = 400
treatment_zone = target_route.to_crs(buffer_crs).buffer(buffer_radius_m).to_crs(epsg = 3857)

# A route can have several shapes; classify stations against the union of all
# buffered shapes so the inside/outside colouring agrees with the plotted zone.
treatment_polygon = unary_union(list(treatment_zone.geometry))

minx, miny, maxx, maxy = treatment_zone.total_bounds
buffer_margin = 1000

fig, ax = plt.subplots(figsize = (12,12))
ax.set_xlim(minx - buffer_margin, maxx + buffer_margin)
ax.set_ylim(miny - buffer_margin, maxy + buffer_margin)
ax.axis('off')
fig.subplots_adjust(left = 0, bottom = 0, right = 1, top = 1)

treatment_zone.plot(ax = ax, color = 'orange', alpha = 0.1, edgecolor = 'orange', linestyle = '--', linewidth = 1)
target_route.plot(ax = ax, color = 'blue', linewidth = 3, alpha = 0.8)
ctx.add_basemap(ax, source = ctx.providers.CartoDB.Positron)

# Empty scatter layers: red for stations outside the zone, green (drawn on top) for stations inside.
scat_outside = ax.scatter([], [], c = 'red', s=50, alpha=0.7, edgecolors='white', linewidth = 0.5, zorder = 5)
scat_inside = ax.scatter([], [], c = 'green', s=50, alpha=1.0, edgecolors='white', linewidth = 0.5, zorder = 6)

# Label in the top-left corner showing the month of the current frame.
date_text = ax.text(0.02, 0.95, '', transform = ax.transAxes, fontsize = 12, bbox = dict(facecolor = 'white', alpha = 0.9, boxstyle = 'round'))

def update(frame_date):
    """Draw one animation frame.

    Shows every station whose first appearance is on or before ``frame_date``,
    split into the inside-zone and outside-zone scatter layers, and updates
    the date label.

    Returns the three artists that were modified.
    """
    visible = bike_gdf_3857[bike_gdf_3857['first_appeared_at'] <= frame_date]
    inside_zone = visible.geometry.within(treatment_polygon)

    layers = (
        (scat_inside, visible[inside_zone]),
        (scat_outside, visible[~inside_zone]),
    )
    for artist, stations in layers:
        if stations.empty:
            # Explicit (0, 2) array clears the layer.
            offsets = np.empty((0, 2))
        else:
            offsets = np.c_[stations['x_3857'], stations['y_3857']]
        artist.set_offsets(offsets)

    date_text.set_text(frame_date.strftime('%B %Y'))

    return scat_inside, scat_outside, date_text


# Render one frame per entry of monthly_dates and write the result as a GIF.
output_file = 'method_treatment_definition.gif'
ani = animation.FuncAnimation(
    fig,
    update,
    frames = monthly_dates,
    interval = 150,
    blit = True,
)
ani.save(
    output_file,
    writer = 'pillow',
    fps = 3,
    dpi = 100,
)
# Close the current figure.
plt.close()

In [None]:
# --- Ridership-side harmonisation -----------------------------------------

# Route numbers in the ridership data that are re-labelled.
muni_route_number_dictionary = {'21': '6'}

# Ridership long names that are folded into a single label.
muni_route_long_name_dictionary = dict.fromkeys(
    ['HAYES', 'HAIGHT/PARNASSUS'],
    'HAYES/PARNASSUS',
)

# Ridership long names removed from the analysis.
muni_drop_suspended_routes_list = [
    'JACKSON',
    'TOWNSEND',
    'VAN NESS',
]

# --- GTFS-side harmonisation ----------------------------------------------

# GTFS long names rewritten to the spelling used on the ridership side.
standardize_route_stops_long_name_dictionary = {
    'HAYES-PARNASSUS':      'HAYES/PARNASSUS',
    'HAIGHT-NORIEGA':       'HAIGHT/NORIEGA',
    'FOLSOM-PACIFIC':       'FOLSOM/PACIFIC',
    'ASHBURY-18TH ST':      'ASHBURY/18TH',
    'UNION-STOCKTON':       'UNION/STOCKTON',
    'VAN NESS-MISSION':     'VAN NESS/MISSION',
    'QUINTARA-24TH STREET': 'QUINTARA/24TH STREET',
}

# GTFS long names removed from the stop table.
route_stops_drop_routes_list = [
    'BAYVIEW HUNTERS POINT EXPRESS',
    'CALIFORNIA EXPRESS',
    'MARINA EXPRESS',
    'BART EARLY BIRD',
    'BAYSHORE A EXPRESS',
    'BAYSHORE B EXPRESS',
    'SAN BRUNO OWL',
    '3RD-19TH AVE OWL',
    'INGLESIDE BUS',
    'OWL TARAVAL',
    'JUDAH BUS',
    'OWL JUDAH',
    'THIRD BUS',
]


# Split each route label at its first space: the leading token becomes the
# route number, the remainder the long name.
_route_label_parts = muni['Route'].str.split(' ', n = 1, expand = True)
muni[['route_number', 'route_long_name']] = _route_label_parts

In [None]:
# GTFS side: rewrite long names, drop the excluded routes, keep the columns
# needed for the spatial join.
route_stops = (
    route_stops
    .assign(route_long_name = lambda s: s['route_long_name'].replace(standardize_route_stops_long_name_dictionary))
    .loc[
        lambda s: ~s['route_long_name'].isin(route_stops_drop_routes_list),
        ['stop_id', 'stop_name', 'stop_lat', 'stop_lon', 'route_long_name'],
    ]
)

# Ridership side: rewrite long names and route numbers, drop the suspended
# routes, keep the columns needed for the panel.
# NOTE(review): the mappings fold route 21 into route 6 under one long name.
# If both appear in the same month this leaves two rows per
# (route_number, Month) with separate boardings figures -- confirm whether
# they should be aggregated before the panel regressions.
muni = (
    muni
    .assign(
        route_long_name = lambda m: m['route_long_name'].replace(muni_route_long_name_dictionary),
        route_number = lambda m: m['route_number'].replace(muni_route_number_dictionary),
    )
    .loc[
        lambda m: ~m['route_long_name'].isin(muni_drop_suspended_routes_list),
        ['Month', 'Average Daily Boardings', 'route_number', 'route_long_name'],
    ]
)

In [None]:
# Pair every bike-share station with each bus route that has a stop within
# 400 m of it.
#
# Distances are measured in WGS 84 / UTM zone 10N, whose units are metres.
# Web Mercator (EPSG:3857) stretches distances by 1/cos(latitude), so a
# 400-unit buffer there is only about 316 m on the ground at San Francisco's
# latitude.
METRIC_CRS = 'EPSG:32610'
STOP_BUFFER_RADIUS_M = 400

stops_gdf = gpd.GeoDataFrame(
    route_stops,
    geometry = gpd.points_from_xy(route_stops['stop_lon'], route_stops['stop_lat']),
    crs = 'EPSG:4326',
).to_crs(METRIC_CRS)
stations_gdf = gpd.GeoDataFrame(
    bikeshare_stations,
    geometry = gpd.points_from_xy(bikeshare_stations['end_lng'], bikeshare_stations['end_lat']),
    crs = 'EPSG:4326',
).to_crs(METRIC_CRS)

# Replace each stop point by a disc of the buffer radius.
stops_buffered = stops_gdf.copy()
stops_buffered['geometry'] = stops_buffered.geometry.buffer(STOP_BUFFER_RADIUS_M)

# A station matches a stop when the station point lies within the stop's disc.
route_stop_station = gpd.sjoin(stations_gdf, stops_buffered, predicate = 'within', how = 'inner')

# Collapse stop-level matches to unique (route, station) pairs.
route_station_pairs = route_stop_station[['route_long_name', 'end_station_id', 'first_appeared_at']].drop_duplicates()

In [None]:
# Every (route, month) combination present in the ridership data.
route_month = muni[['route_long_name', 'Month']].drop_duplicates()

# Expand to (route, month, station) and keep only stations that had already
# appeared by that month. Routes with no paired station carry a missing date
# after the left merge and fail the comparison, so they drop out here.
route_month_station = route_month.merge(route_station_pairs, on = 'route_long_name', how = 'left')
mask_active = route_month_station['first_appeared_at'] <= route_month_station['Month']
route_month_station = route_month_station[mask_active]

# Number of distinct active stations per route and month.
route_month_station_counts = (
    route_month_station
    .groupby(['route_long_name', 'Month'], as_index = False)
    .agg(unique_stations_within_400m = ('end_station_id', 'nunique'))
)

# Attach the counts to the ridership rows; route-months without any active
# station get a count of 0.
muni_with_station_counts = muni.merge(route_month_station_counts, on = ['route_long_name', 'Month'], how = 'left')
muni_with_station_counts = muni_with_station_counts.assign(
    unique_stations_within_400m = lambda m: m['unique_stations_within_400m'].fillna(0)
)

# Binary treatment flag: 1 when at least one station is active, else 0.
muni_with_station_counts['treated'] = muni_with_station_counts['unique_stations_within_400m'].gt(0).astype(int)

In [None]:
df = muni_with_station_counts.copy()

# Per-route min / max / mean of the binary treatment flag.
route_treatment_summary = df.groupby('route_number')['treated'].agg(['min', 'max', 'mean'])

# Label each route by how its treatment flag varies over the sample.
_never = route_treatment_summary['max'] == 0
_switches = (route_treatment_summary['min'] == 0) & (route_treatment_summary['max'] == 1)
_always = route_treatment_summary['min'] == 1
route_treatment_summary['group'] = np.select(
    [_never, _switches, _always],
    ['never_treated', 'switcher', 'always_treated'],
    default = 'other',
)

# How many routes fall in each group.
route_treatment_counts = route_treatment_summary['group'].value_counts()

In [None]:
# Two-way fixed-effects regression of log average daily boardings on the
# binary treatment flag, with route and month effects and standard errors
# clustered by route.
panel = df.copy()

# log(0) is -inf, which would contaminate the regression; drop route-months
# with non-positive boardings before taking logs and report how many went.
has_boardings = panel['Average Daily Boardings'] > 0
if not has_boardings.all():
    print(f"Dropping {int((~has_boardings).sum())} route-month rows with non-positive boardings before taking logs")
panel = panel[has_boardings].set_index(['route_number', 'Month'])

panel['treated'] = (panel['unique_stations_within_400m'] > 0).astype(int)
panel['log_boardings'] = np.log(panel['Average Daily Boardings'])

model = PanelOLS(dependent = panel['log_boardings'], exog = panel[['treated']], entity_effects = True, time_effects = True)
print(model.fit(cov_type = 'clustered', cluster_entity = True))

In [None]:
# Two-way fixed-effects regression of log average daily boardings on the
# number of active stations near the route, with route and month effects and
# standard errors clustered by route.
panel = df.copy()

# log(0) is -inf, which would contaminate the regression; drop route-months
# with non-positive boardings before taking logs and report how many went.
has_boardings = panel['Average Daily Boardings'] > 0
if not has_boardings.all():
    print(f"Dropping {int((~has_boardings).sum())} route-month rows with non-positive boardings before taking logs")
panel = panel[has_boardings].set_index(['route_number', 'Month'])

# The binary `treated` column already carried by `df` is not a regressor here,
# so it is not recomputed.
panel['log_boardings'] = np.log(panel['Average Daily Boardings'])

model = PanelOLS(dependent = panel['log_boardings'], exog = panel[['unique_stations_within_400m']], entity_effects = True, time_effects = True)
print(model.fit(cov_type = 'clustered', cluster_entity = True))