In [24]:
import os

os.getcwd()

from pathlib import Path

# Project root = two levels up from notebooks/EDA
PROJECT_ROOT = Path.cwd().parents[1]

DATA_RAW = PROJECT_ROOT / "data" / "raw"

DATA_RAW

list(DATA_RAW.iterdir())


[PosixPath('/Users/danielbrown/Desktop/mta-ace-buses/data/raw/speeds_monthly_2015.csv'),
 PosixPath('/Users/danielbrown/Desktop/mta-ace-buses/data/raw/journeyperformance_monthly_2017.csv'),
 PosixPath('/Users/danielbrown/Desktop/mta-ace-buses/data/raw/bus_lane_geometry.csv'),
 PosixPath('/Users/danielbrown/Desktop/mta-ace-buses/data/raw/able_ace_start.csv'),
 PosixPath('/Users/danielbrown/Desktop/mta-ace-buses/data/raw/waitassessment_monthly_2015.csv')]

In [25]:
import pandas as pd

df_speeds = pd.read_csv(DATA_RAW / "journeyperformance_monthly_2017.csv")
df_speeds.head()


Unnamed: 0,month,borough,trip_type,route_id,period,number_of_customers,additional_bus_stop_time,additional_travel_time,customer_journey_time_performance
0,2017-08-01,Bronx,LCL/LTD,BX1,Off-Peak,219531.64,2.051653,0.69601,70.1999545%
1,2017-08-01,Bronx,LCL/LTD,BX1,Peak,144822.4,1.963801,0.8554,68.3975101%
2,2017-08-01,Bronx,LCL/LTD,BX10,Off-Peak,142222.2,1.262757,0.150511,78.2773277%
3,2017-08-01,Bronx,LCL/LTD,BX10,Peak,98257.17,0.869035,-0.195989,85.6885107%
4,2017-08-01,Bronx,LCL/LTD,BX11,Off-Peak,177312.86,1.555327,0.22831,77.2575616%


In [26]:
df_speeds = df_speeds[df_speeds["period"] == "Peak"]

df_speeds["customer_journey_time_performance"] = (
    df_speeds["customer_journey_time_performance"]
        .str.replace("%", "", regex=False)
        .astype(float)
)

avg_perf = (
    df_speeds
        .groupby(["month", "route_id"], as_index=False)
        ["customer_journey_time_performance"]
        .mean()
)

df_grouped = avg_perf

In [27]:
#Dealing with ACE

In [28]:
import pandas as pd

df_ace = pd.read_csv(DATA_RAW / "able_ace_start.csv")
df_ace.head()

import pandas as pd
from datetime import datetime

df_ace_filtered = df_ace

# Convert Implementation Date to datetime
df_ace_filtered["Implementation Date"] = pd.to_datetime(
    df_ace_filtered["Implementation Date"], format="%m/%d/%Y"
)

# Function to snap to closest first of month
def snap_to_nearest_first_of_month(dt: pd.Timestamp) -> pd.Timestamp:
    first_prev = pd.Timestamp(year=dt.year, month=dt.month, day=1)
    # first of next month
    if dt.month == 12:
        first_next = pd.Timestamp(year=dt.year + 1, month=1, day=1)
    else:
        first_next = pd.Timestamp(year=dt.year, month=dt.month + 1, day=1)
    
    # return whichever is closer
    if (dt - first_prev) <= (first_next - dt):
        return first_prev
    else:
        return first_next

# Apply snapping
df_ace_filtered["Implementation FirstOfMonth"] = df_ace_filtered["Implementation Date"].apply(snap_to_nearest_first_of_month)

# Show result
df_ace_filtered.head()

Unnamed: 0,Route,Program,Implementation Date,Implementation FirstOfMonth
0,M15+,ABLE,2019-10-07,2019-10-01
1,B44+,ABLE,2019-10-30,2019-11-01
2,M14+,ABLE,2019-11-21,2019-12-01
3,B46+,ABLE,2020-02-19,2020-03-01
4,M23+,ABLE,2020-08-10,2020-08-01


In [29]:
df_grouped["month"] = pd.to_datetime(df_grouped["month"])
df_ace_filtered["Implementation FirstOfMonth"] = pd.to_datetime(
    df_ace_filtered["Implementation FirstOfMonth"]
)

df_ace_filtered = df_ace_filtered.rename(columns={"Route": "route_id"})

program_timeline = (
    df_ace_filtered
    .pivot_table(
        index="route_id",
        columns="Program",
        values="Implementation FirstOfMonth",
        aggfunc="min"
    )
    .reset_index()
)

program_timeline.head()

Program,route_id,ABLE,ACE
0,B11,NaT,2025-11-01
1,B25,2022-12-01,2024-10-01
2,B26,2023-10-01,2024-10-01
3,B35,NaT,2024-09-01
4,B41,NaT,2024-09-01


In [30]:
df = df_grouped.merge(
    program_timeline,
    on="route_id",
    how="inner"
)
df.head()

Unnamed: 0,month,route_id,customer_journey_time_performance,ABLE,ACE
0,2017-08-01,B11,78.438363,NaT,2025-11-01
1,2017-08-01,B25,69.017145,2022-12-01,2024-10-01
2,2017-08-01,B26,75.941068,2023-10-01,2024-10-01
3,2017-08-01,B35,65.965438,NaT,2024-09-01
4,2017-08-01,B41,70.10149,NaT,2024-09-01


In [31]:
#Prophet Starts

In [32]:
# Rename columns to Prophet's required schema
df = df.rename(columns={
    "month": "ds",
    "customer_journey_time_performance": "y"
})

# Ensure ds is datetime
df["ds"] = pd.to_datetime(df["ds"])
df

Unnamed: 0,ds,route_id,y,ABLE,ACE
0,2017-08-01,B11,78.438363,NaT,2025-11-01
1,2017-08-01,B25,69.017145,2022-12-01,2024-10-01
2,2017-08-01,B26,75.941068,2023-10-01,2024-10-01
3,2017-08-01,B35,65.965438,NaT,2024-09-01
4,2017-08-01,B41,70.101490,NaT,2024-09-01
...,...,...,...,...,...
5042,2025-12-01,Q58,63.085435,2023-07-01,2024-07-01
5043,2025-12-01,Q6,60.633556,NaT,2025-09-01
5044,2025-12-01,Q69,80.533000,NaT,2024-10-01
5045,2025-12-01,S46,72.305196,NaT,2024-09-01


In [33]:
from prophet import Prophet
import pandas as pd
import numpy as np

# ------------------------------
# CONFIG
# ------------------------------
MIN_OBS = 12

COVID_START = "2020-03-01"
COVID_END   = "2021-03-01"   # conservative: include recovery

# ------------------------------
# Step 0: Prepare results list
# ------------------------------
results = []

# ------------------------------
# Step 1: Loop over routes
# ------------------------------
for route in df["route_id"].unique():

    # Select route data
    df_r = df[df["route_id"] == route].sort_values("ds").copy()

    # ------------------------------
    # Step 1a: REMOVE COVID PERIOD
    # ------------------------------
    df_r = df_r[
        ~((df_r["ds"] >= COVID_START) & (df_r["ds"] <= COVID_END))
    ]

    # Skip if too little data remains
    if len(df_r) < MIN_OBS:
        continue

    # ------------------------------
    # Step 2: Policy indicators
    # ------------------------------
    df_r["is_able"] = (
        df_r["ABLE"].notna() & (df_r["ds"] >= df_r["ABLE"])
    ).astype(int)

    df_r["is_ace"] = (
        df_r["ACE"].notna() & (df_r["ds"] >= df_r["ACE"])
    ).astype(int)

    # ------------------------------
    # Step 3: Fit Prophet
    # ------------------------------
    try:
        m0 = Prophet(
            yearly_seasonality=True,
            weekly_seasonality=False,
            daily_seasonality=False,
            changepoint_prior_scale=0.05  # conservative trend
        )

        m0.add_regressor("is_able")
        m0.add_regressor("is_ace")

        m0.fit(df_r[["ds", "y", "is_able", "is_ace"]])

    except Exception as e:
        print(f"Skipping route {route}: {e}")
        continue

    # ------------------------------
    # Step 4: Counterfactual (no ABLE / ACE)
    # ------------------------------
    future = df_r[["ds"]].copy()
    future["is_able"] = 0
    future["is_ace"] = 0

    forecast = m0.predict(future)

    df_r = df_r.merge(
        forecast[["ds", "yhat"]],
        on="ds",
        how="left"
    )

    df_r.rename(columns={"yhat": "yhat_no_policy"}, inplace=True)

    # ------------------------------
    # Step 5: Effect masks
    # ------------------------------
    able_only_mask = (df_r["is_able"] == 1) & (df_r["is_ace"] == 0)
    ace_mask = (df_r["is_ace"] == 1)

    # ------------------------------
    # Step 6: Compute effects
    # ------------------------------
    able_effect = (
        (df_r.loc[able_only_mask, "y"].mean()
         - df_r.loc[able_only_mask, "yhat_no_policy"].mean())
        / df_r.loc[able_only_mask, "yhat_no_policy"].mean()
    ) * 100 if able_only_mask.any() else np.nan

    ace_effect = (
        (df_r.loc[ace_mask, "y"].mean()
         - df_r.loc[ace_mask, "yhat_no_policy"].mean())
        / df_r.loc[ace_mask, "yhat_no_policy"].mean()
    ) * 100 if ace_mask.any() else np.nan

    # ------------------------------
    # Step 7: Store results
    # ------------------------------
    results.append({
        "route_id": route,
        "able_effect_pct": able_effect,
        "ace_incremental_effect_pct": ace_effect
    })

# ------------------------------
# Step 8: Final dataframe
# ------------------------------
df_effects = pd.DataFrame(results)

# ------------------------------
# Step 9: Total compounded effect
# ------------------------------
df_effects["total_effect_pct"] = (
    (1 + df_effects["able_effect_pct"].fillna(0) / 100)
    * (1 + df_effects["ace_incremental_effect_pct"].fillna(0) / 100)
    - 1
) * 100

DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"

DATA_PROCESSED.mkdir(parents=True, exist_ok=True)

df_effects.to_csv(
    DATA_PROCESSED / "journeyperformance_peak.csv",
    index=False
)

df_effects



  from .autonotebook import tqdm as notebook_tqdm
Importing plotly failed. Interactive plots will not work.
17:43:44 - cmdstanpy - INFO - Chain [1] start processing
17:43:44 - cmdstanpy - INFO - Chain [1] done processing
17:43:44 - cmdstanpy - INFO - Chain [1] start processing
17:43:44 - cmdstanpy - INFO - Chain [1] done processing
17:43:44 - cmdstanpy - INFO - Chain [1] start processing
17:43:45 - cmdstanpy - INFO - Chain [1] done processing
17:43:45 - cmdstanpy - INFO - Chain [1] start processing
17:43:45 - cmdstanpy - INFO - Chain [1] done processing
17:43:45 - cmdstanpy - INFO - Chain [1] start processing
17:43:45 - cmdstanpy - INFO - Chain [1] done processing
17:43:45 - cmdstanpy - INFO - Chain [1] start processing
17:43:46 - cmdstanpy - INFO - Chain [1] done processing
17:43:46 - cmdstanpy - INFO - Chain [1] start processing
17:43:46 - cmdstanpy - INFO - Chain [1] done processing
17:43:46 - cmdstanpy - INFO - Chain [1] start processing
17:43:46 - cmdstanpy - INFO - Chain [1] done

Unnamed: 0,route_id,able_effect_pct,ace_incremental_effect_pct,total_effect_pct
0,B11,,3.947306,3.947306
1,B25,-1.416152,1.542811,0.10481
2,B26,-1.140652,6.192134,4.98085
3,B35,,-0.801084,-0.801084
4,B41,,3.098419,3.098419
5,B44+,-2.687866,2.406262,-0.346281
6,B46+,-1.110435,-2.649703,-3.730714
7,B60,,9.739904,9.739904
8,B62,-7.159236,-5.827161,-12.569216
9,B63,,-1.853115,-1.853115


In [34]:
import pandas as pd
import numpy as np

def robust_summary(series):
    s = series.dropna()
    return pd.Series({
        "n_routes": s.shape[0],
        "mean": s.mean(),
        "median": s.median(),
        "trimmed_mean_10pct": s.sort_values().iloc[int(0.1*len(s)) : int(0.9*len(s))].mean()
            if len(s) >= 10 else np.nan,
        "std_dev": s.std(),
        "mad": (s - s.median()).abs().median(),
        "min": s.min(),
        "q25": s.quantile(0.25),
        "q75": s.quantile(0.75),
        "max": s.max(),
        "pct_positive": (s > 0).mean() * 100,
        "pct_negative": (s < 0).mean() * 100,
        "pct_near_zero": (s.abs() < 0.5).mean() * 100
    })


summary_stats = pd.concat(
    {
        "ABLE effect (%)": robust_summary(df_effects["able_effect_pct"]),
        "ACE incremental effect (%)": robust_summary(df_effects["ace_incremental_effect_pct"]),
        "TOTAL compounded effect (%)": robust_summary(df_effects["total_effect_pct"])
    },
    axis=1
)

summary_stats


Unnamed: 0,ABLE effect (%),ACE incremental effect (%),TOTAL compounded effect (%)
n_routes,20.0,53.0,53.0
mean,0.479143,2.301479,2.702991
median,-1.450662,1.542811,1.23574
trimmed_mean_10pct,-0.368076,1.54813,1.387098
std_dev,6.54475,7.479404,11.293205
mad,2.17872,3.53497,3.748962
min,-7.650294,-12.264235,-16.274548
q25,-2.468953,-1.908216,-1.908216
q75,2.981191,5.214614,4.984701
max,17.602716,31.250525,54.354182
