In [72]:
import os

os.getcwd()

from pathlib import Path

# Project root = two levels up from notebooks/EDA
PROJECT_ROOT = Path.cwd().parents[1]

DATA_RAW = PROJECT_ROOT / "data" / "raw"

DATA_RAW

list(DATA_RAW.iterdir())


[PosixPath('/Users/danielbrown/Desktop/mta-ace-buses/data/raw/speeds_monthly_2015.csv'),
 PosixPath('/Users/danielbrown/Desktop/mta-ace-buses/data/raw/journeyperformance_monthly_2017.csv'),
 PosixPath('/Users/danielbrown/Desktop/mta-ace-buses/data/raw/bus_lane_geometry.csv'),
 PosixPath('/Users/danielbrown/Desktop/mta-ace-buses/data/raw/able_ace_start.csv'),
 PosixPath('/Users/danielbrown/Desktop/mta-ace-buses/data/raw/waitassessment_monthly_2015.csv')]

In [73]:
import pandas as pd

df_speeds = pd.read_csv(DATA_RAW / "speeds_monthly_2015.csv")
df_speeds.head()


Unnamed: 0,month,borough,day_type,trip_type,route_id,period,total_operating_time,total_mileage,average_speed
0,2015-01-01,Bronx,1,LCL/LTD,BX1,Off-Peak,8710307,62940902.4,7.23
1,2015-01-01,Bronx,1,LCL/LTD,BX1,Peak,4334312,30316503.6,6.99
2,2015-01-01,Bronx,2,LCL/LTD,BX1,Off-Peak,2498651,18742158.0,7.5
3,2015-01-01,Bronx,2,LCL/LTD,BX1,Peak,1008139,7417580.4,7.36
4,2015-01-01,Bronx,1,LCL/LTD,BX10,Off-Peak,5778595,52543814.4,9.09


In [74]:
# Assuming your dataframe is called df_speeds
# Remove commas and convert to numeric
df_speeds["total_operating_time"] = pd.to_numeric(
    df_speeds["total_operating_time"].str.replace(",", ""), errors="coerce"
)
df_speeds["total_mileage"] = pd.to_numeric(
    df_speeds["total_mileage"].str.replace(",", ""), errors="coerce"
)

df_speeds = df_speeds[df_speeds["period"] == "Off-Peak"]


# Group by month and route_id
df_grouped = (
    df_speeds.groupby(["month", "route_id"], as_index=False)
        .agg({
            "total_mileage": "sum",
            "total_operating_time": "sum"
        })
)

# Calculate average speed per group
df_grouped["average_speed"] = df_grouped["total_mileage"] / df_grouped["total_operating_time"]

# Reset index (optional, just to be safe)
df_grouped.reset_index(drop=True, inplace=True)

# Show result
df_grouped.head()


Unnamed: 0,month,route_id,total_mileage,total_operating_time,average_speed
0,2015-01-01,B1,94019947.2,11821585.0,7.953244
1,2015-01-01,B100,39095798.4,3866750.0,10.110764
2,2015-01-01,B103,108678236.4,11729714.0,9.265208
3,2015-01-01,B11,61016360.4,9437158.0,6.465544
4,2015-01-01,B12,65305461.6,10260576.0,6.364697


In [75]:
#Dealing with ACE

In [76]:
import pandas as pd

df_ace = pd.read_csv(DATA_RAW / "able_ace_start.csv")
df_ace.head()


Unnamed: 0,Route,Program,Implementation Date
0,M15+,ABLE,10/07/2019
1,B44+,ABLE,10/30/2019
2,M14+,ABLE,11/21/2019
3,B46+,ABLE,02/19/2020
4,M23+,ABLE,08/10/2020


In [77]:
import pandas as pd
from datetime import datetime

df_ace_filtered = df_ace

# Convert Implementation Date to datetime
df_ace_filtered["Implementation Date"] = pd.to_datetime(
    df_ace_filtered["Implementation Date"], format="%m/%d/%Y"
)

# Function to snap to closest first of month
def snap_to_nearest_first_of_month(dt: pd.Timestamp) -> pd.Timestamp:
    first_prev = pd.Timestamp(year=dt.year, month=dt.month, day=1)
    # first of next month
    if dt.month == 12:
        first_next = pd.Timestamp(year=dt.year + 1, month=1, day=1)
    else:
        first_next = pd.Timestamp(year=dt.year, month=dt.month + 1, day=1)
    
    # return whichever is closer
    if (dt - first_prev) <= (first_next - dt):
        return first_prev
    else:
        return first_next

# Apply snapping
df_ace_filtered["Implementation FirstOfMonth"] = df_ace_filtered["Implementation Date"].apply(snap_to_nearest_first_of_month)

# Show result
df_ace_filtered.head()

Unnamed: 0,Route,Program,Implementation Date,Implementation FirstOfMonth
0,M15+,ABLE,2019-10-07,2019-10-01
1,B44+,ABLE,2019-10-30,2019-11-01
2,M14+,ABLE,2019-11-21,2019-12-01
3,B46+,ABLE,2020-02-19,2020-03-01
4,M23+,ABLE,2020-08-10,2020-08-01


In [78]:
df_grouped["month"] = pd.to_datetime(df_grouped["month"])
df_ace_filtered["Implementation FirstOfMonth"] = pd.to_datetime(
    df_ace_filtered["Implementation FirstOfMonth"]
)


In [79]:
df_ace_filtered = df_ace_filtered.rename(columns={"Route": "route_id"})


In [80]:
program_timeline = (
    df_ace_filtered
    .pivot_table(
        index="route_id",
        columns="Program",
        values="Implementation FirstOfMonth",
        aggfunc="min"
    )
    .reset_index()
)

program_timeline.head()


Program,route_id,ABLE,ACE
0,B11,NaT,2025-11-01
1,B25,2022-12-01,2024-10-01
2,B26,2023-10-01,2024-10-01
3,B35,NaT,2024-09-01
4,B41,NaT,2024-09-01


In [81]:
df = df_grouped.merge(
    program_timeline,
    on="route_id",
    how="inner"
)
df.head()

Unnamed: 0,month,route_id,total_mileage,total_operating_time,average_speed,ABLE,ACE
0,2015-01-01,B11,61016360.4,9437158.0,6.465544,NaT,2025-11-01
1,2015-01-01,B25,68279126.4,10115231.0,6.75013,2022-12-01,2024-10-01
2,2015-01-01,B26,74710918.8,10563628.0,7.072468,2023-10-01,2024-10-01
3,2015-01-01,B35,159427440.0,24485715.0,6.511039,NaT,2024-09-01
4,2015-01-01,B41,192735432.0,25892711.0,7.443617,NaT,2024-09-01


In [82]:
# Rename columns to Prophet's required schema
df = df.rename(columns={
    "month": "ds",
    "average_speed": "y"
})

# Ensure ds is datetime
df["ds"] = pd.to_datetime(df["ds"])
df

Unnamed: 0,ds,route_id,total_mileage,total_operating_time,y,ABLE,ACE
0,2015-01-01,B11,6.101636e+07,9.437158e+06,6.465544,NaT,2025-11-01
1,2015-01-01,B25,6.827913e+07,1.011523e+07,6.750130,2022-12-01,2024-10-01
2,2015-01-01,B26,7.471092e+07,1.056363e+07,7.072468,2023-10-01,2024-10-01
3,2015-01-01,B35,1.594274e+08,2.448572e+07,6.511039,NaT,2024-09-01
4,2015-01-01,B41,1.927354e+08,2.589271e+07,7.443617,NaT,2024-09-01
...,...,...,...,...,...,...,...
6698,2025-12-01,Q58,4.634525e+04,6.078059e+03,7.625008,2023-07-01,2024-07-01
6699,2025-12-01,Q6,2.899829e+04,3.607242e+03,8.038909,NaT,2025-09-01
6700,2025-12-01,Q69,1.911977e+04,2.560906e+03,7.466021,NaT,2024-10-01
6701,2025-12-01,S46,3.155039e+04,2.818230e+03,11.195106,NaT,2024-09-01


In [83]:
'''
from prophet import Prophet
import pandas as pd
import numpy as np

# ------------------------------
# Step 1: Select a single route
# ------------------------------
route = "BX19"
df_r = df[df["route_id"] == route].sort_values("ds").copy()

# ------------------------------
# Step 2: Prepare columns for Prophet
# ------------------------------
# Prophet expects 'ds' for datetime and 'y' for target
# We'll add regressors:
# - is_able: 1 if ABLE is active that month
# - is_ace: 1 if ACE is active that month
# - is_covid: 1 if month is during COVID unusual period (e.g., Mar 2020 - Jun 2021)
df_r["is_able"] = (df_r["ABLE"].notna() & (df_r["ds"] >= df_r["ABLE"])).astype(int)
df_r["is_ace"] = (df_r["ACE"].notna() & (df_r["ds"] >= df_r["ACE"])).astype(int)
# Example COVID period â€” adjust if needed
df_r["is_covid"] = ((df_r["ds"] >= "2020-03-01") & (df_r["ds"] <= "2020-09-01")).astype(int)

# ------------------------------
# Step 3: Fit Prophet model
# ------------------------------
# We'll model pre-policy trends, COVID effects, seasonality, and regressors
m0 = Prophet(
    yearly_seasonality=True,  # bus speeds vary by month
    weekly_seasonality=False, # monthly data, weekly not needed
    daily_seasonality=False
)

# Add regressors
m0.add_regressor("is_able")
m0.add_regressor("is_ace")
m0.add_regressor("is_covid")

# Fit model on historical data
m0.fit(df_r[["ds", "y", "is_able", "is_ace", "is_covid"]])

# ------------------------------
# Step 4: Make predictions WITHOUT policies
# ------------------------------
# This gives us the counterfactual (what speeds would have been without ABLE or ACE)
# Set ABLE and ACE to 0, keep COVID regressor as-is
future = df_r[["ds", "is_covid"]].copy()
future["is_able"] = 0
future["is_ace"] = 0

# Predict
forecast = m0.predict(future)

# Merge predictions back into df_r
df_r = df_r.merge(
    forecast[["ds", "yhat"]],
    on="ds",
    how="left"
)
df_r.rename(columns={"yhat": "yhat_no_policy"}, inplace=True)

# ------------------------------
# Step 5: Compute effects
# ------------------------------
# ABLE effect: average speed increase during ABLE months
able_mask = df_r["is_able"] == 1
ace_mask = df_r["is_ace"] == 1

# ABLE effect: % change vs counterfactual
able_effect = (
    (df_r.loc[able_mask & ~ace_mask, "y"].mean() - 
     df_r.loc[able_mask & ~ace_mask, "yhat_no_policy"].mean())
    / df_r.loc[able_mask & ~ace_mask, "yhat_no_policy"].mean()
) * 100

# ACE incremental effect: % change vs counterfactual (already with ABLE)
ace_effect = (
    (df_r.loc[ace_mask, "y"].mean() - 
     df_r.loc[ace_mask, "yhat_no_policy"].mean())
    / df_r.loc[ace_mask, "yhat_no_policy"].mean()
) * 100

# ------------------------------
# Step 6: Check results
# ------------------------------
print(f"ABLE effect: {able_effect:.2f}%")
print(f"ACE incremental effect: {ace_effect:.2f}%")

# Optional: see the dataframe
df_r[["ds", "y", "yhat_no_policy", "is_able", "is_ace", "is_covid"]].head(20)
df_r
'''

'\nfrom prophet import Prophet\nimport pandas as pd\nimport numpy as np\n\n# ------------------------------\n# Step 1: Select a single route\n# ------------------------------\nroute = "BX19"\ndf_r = df[df["route_id"] == route].sort_values("ds").copy()\n\n# ------------------------------\n# Step 2: Prepare columns for Prophet\n# ------------------------------\n# Prophet expects \'ds\' for datetime and \'y\' for target\n# We\'ll add regressors:\n# - is_able: 1 if ABLE is active that month\n# - is_ace: 1 if ACE is active that month\n# - is_covid: 1 if month is during COVID unusual period (e.g., Mar 2020 - Jun 2021)\ndf_r["is_able"] = (df_r["ABLE"].notna() & (df_r["ds"] >= df_r["ABLE"])).astype(int)\ndf_r["is_ace"] = (df_r["ACE"].notna() & (df_r["ds"] >= df_r["ACE"])).astype(int)\n# Example COVID period â€” adjust if needed\ndf_r["is_covid"] = ((df_r["ds"] >= "2020-03-01") & (df_r["ds"] <= "2020-09-01")).astype(int)\n\n# ------------------------------\n# Step 3: Fit Prophet model\n# ------

In [84]:
from prophet import Prophet
import pandas as pd
import numpy as np

# ------------------------------
# CONFIG
# ------------------------------
MIN_OBS = 12

COVID_START = "2020-03-01"
COVID_END   = "2021-03-01"   # conservative: include recovery

# ------------------------------
# Step 0: Prepare results list
# ------------------------------
results = []

# ------------------------------
# Step 1: Loop over routes
# ------------------------------
for route in df["route_id"].unique():

    # Select route data
    df_r = df[df["route_id"] == route].sort_values("ds").copy()

    # ------------------------------
    # Step 1a: REMOVE COVID PERIOD
    # ------------------------------
    df_r = df_r[
        ~((df_r["ds"] >= COVID_START) & (df_r["ds"] <= COVID_END))
    ]

    # Skip if too little data remains
    if len(df_r) < MIN_OBS:
        continue

    # ------------------------------
    # Step 2: Policy indicators
    # ------------------------------
    df_r["is_able"] = (
        df_r["ABLE"].notna() & (df_r["ds"] >= df_r["ABLE"])
    ).astype(int)

    df_r["is_ace"] = (
        df_r["ACE"].notna() & (df_r["ds"] >= df_r["ACE"])
    ).astype(int)

    # ------------------------------
    # Step 3: Fit Prophet
    # ------------------------------
    try:
        m0 = Prophet(
            yearly_seasonality=True,
            weekly_seasonality=False,
            daily_seasonality=False,
            changepoint_prior_scale=0.05  # conservative trend
        )

        m0.add_regressor("is_able")
        m0.add_regressor("is_ace")

        m0.fit(df_r[["ds", "y", "is_able", "is_ace"]])

    except Exception as e:
        print(f"Skipping route {route}: {e}")
        continue

    # ------------------------------
    # Step 4: Counterfactual (no ABLE / ACE)
    # ------------------------------
    future = df_r[["ds"]].copy()
    future["is_able"] = 0
    future["is_ace"] = 0

    forecast = m0.predict(future)

    df_r = df_r.merge(
        forecast[["ds", "yhat"]],
        on="ds",
        how="left"
    )

    df_r.rename(columns={"yhat": "yhat_no_policy"}, inplace=True)

    # ------------------------------
    # Step 5: Effect masks
    # ------------------------------
    able_only_mask = (df_r["is_able"] == 1) & (df_r["is_ace"] == 0)
    ace_mask = (df_r["is_ace"] == 1)

    # ------------------------------
    # Step 6: Compute effects
    # ------------------------------
    able_effect = (
        (df_r.loc[able_only_mask, "y"].mean()
         - df_r.loc[able_only_mask, "yhat_no_policy"].mean())
        / df_r.loc[able_only_mask, "yhat_no_policy"].mean()
    ) * 100 if able_only_mask.any() else np.nan

    ace_effect = (
        (df_r.loc[ace_mask, "y"].mean()
         - df_r.loc[ace_mask, "yhat_no_policy"].mean())
        / df_r.loc[ace_mask, "yhat_no_policy"].mean()
    ) * 100 if ace_mask.any() else np.nan

    # ------------------------------
    # Step 7: Store results
    # ------------------------------
    results.append({
        "route_id": route,
        "able_effect_pct": able_effect,
        "ace_incremental_effect_pct": ace_effect
    })

# ------------------------------
# Step 8: Final dataframe
# ------------------------------
df_effects = pd.DataFrame(results)

# ------------------------------
# Step 9: Total compounded effect
# ------------------------------
df_effects["total_effect_pct"] = (
    (1 + df_effects["able_effect_pct"].fillna(0) / 100)
    * (1 + df_effects["ace_incremental_effect_pct"].fillna(0) / 100)
    - 1
) * 100

DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"

DATA_PROCESSED.mkdir(parents=True, exist_ok=True)

df_effects.to_csv(
    DATA_PROCESSED / "speed_offpeak.csv",
    index=False
)

df_effects



14:22:04 - cmdstanpy - INFO - Chain [1] start processing
14:22:04 - cmdstanpy - INFO - Chain [1] done processing
14:22:04 - cmdstanpy - INFO - Chain [1] start processing
14:22:05 - cmdstanpy - INFO - Chain [1] done processing
14:22:05 - cmdstanpy - INFO - Chain [1] start processing
14:22:05 - cmdstanpy - INFO - Chain [1] done processing
14:22:05 - cmdstanpy - INFO - Chain [1] start processing
14:22:05 - cmdstanpy - INFO - Chain [1] done processing
14:22:05 - cmdstanpy - INFO - Chain [1] start processing
14:22:05 - cmdstanpy - INFO - Chain [1] done processing
14:22:05 - cmdstanpy - INFO - Chain [1] start processing
14:22:05 - cmdstanpy - INFO - Chain [1] done processing
14:22:05 - cmdstanpy - INFO - Chain [1] start processing
14:22:05 - cmdstanpy - INFO - Chain [1] done processing
14:22:05 - cmdstanpy - INFO - Chain [1] start processing
14:22:05 - cmdstanpy - INFO - Chain [1] done processing
14:22:05 - cmdstanpy - INFO - Chain [1] start processing
14:22:05 - cmdstanpy - INFO - Chain [1]

Unnamed: 0,route_id,able_effect_pct,ace_incremental_effect_pct,total_effect_pct
0,B11,,2.028047,2.028047
1,B25,-1.305507,0.879068,-0.437916
2,B26,1.609815,5.388966,7.085533
3,B35,,0.251281,0.251281
4,B41,,2.988319,2.988319
5,B44+,3.585335,4.700871,8.454748
6,B60,,0.328331,0.328331
7,B62,0.8476,-1.908239,-1.076814
8,B63,,0.033968,0.033968
9,B68,,-0.698818,-0.698818


In [85]:
import pandas as pd
import numpy as np

def robust_summary(series):
    s = series.dropna()
    return pd.Series({
        "n_routes": s.shape[0],
        "mean": s.mean(),
        "median": s.median(),
        "trimmed_mean_10pct": s.sort_values().iloc[int(0.1*len(s)) : int(0.9*len(s))].mean()
            if len(s) >= 10 else np.nan,
        "std_dev": s.std(),
        "mad": (s - s.median()).abs().median(),
        "min": s.min(),
        "q25": s.quantile(0.25),
        "q75": s.quantile(0.75),
        "max": s.max(),
        "pct_positive": (s > 0).mean() * 100,
        "pct_negative": (s < 0).mean() * 100,
        "pct_near_zero": (s.abs() < 0.5).mean() * 100
    })


summary_stats = pd.concat(
    {
        "ABLE effect (%)": robust_summary(df_effects["able_effect_pct"]),
        "ACE incremental effect (%)": robust_summary(df_effects["ace_incremental_effect_pct"]),
        "TOTAL compounded effect (%)": robust_summary(df_effects["total_effect_pct"])
    },
    axis=1
)

summary_stats


Unnamed: 0,ABLE effect (%),ACE incremental effect (%),TOTAL compounded effect (%)
n_routes,20.0,53.0,53.0
mean,1.255675,1.056479,1.62523
median,-0.243874,0.42208,0.404451
trimmed_mean_10pct,0.309012,0.44014,0.402957
std_dev,4.692658,3.551624,6.63577
mad,1.134909,1.14697,1.154867
min,-3.92774,-3.893869,-7.668668
q25,-1.078213,-0.698818,-0.72489
q75,1.369901,1.653353,1.653353
max,15.20336,17.667504,35.556918


In [86]:
# Converting to TypeScript
df = df_effects.copy()

df = df.rename(columns={
    "route_id": "routeId",
    "able_effect_pct": "ableEffectPct",
    "ace_incremental_effect_pct": "aceIncrementalEffectPct",
    "total_effect_pct": "totalEffectPct"
})

# Convert NaN â†’ None
df = df.where(pd.notnull(df), None)


import math

def to_ts_value(v):
    if v is None:
        return "null"
    if isinstance(v, float) and math.isnan(v):
        return "null"
    return round(float(v), 4)


rows = []

for _, r in df.iterrows():
    rows.append(
        f"""  {{
    routeId: "{r.routeId}",
    ableEffectPct: {to_ts_value(r.ableEffectPct)},
    aceIncrementalEffectPct: {to_ts_value(r.aceIncrementalEffectPct)},
    totalEffectPct: {to_ts_value(r.totalEffectPct)}
  }}"""
    )

# ðŸ”§ FIX: join rows *outside* the f-string
rows_joined = ",\n".join(rows)

ts = f"""
export type SpeedOffPeakRow = {{
  routeId: string;
  ableEffectPct: number | null;
  aceIncrementalEffectPct: number | null;
  totalEffectPct: number;
}};

export const speedOffPeak: SpeedOffPeakRow[] = [
{rows_joined}
];
"""

output_path = (
    "/Users/danielbrown/Desktop/mta-ace-buses/src/data/processed/speedOffPeak.ts"
)

with open(output_path, "w") as f:
    f.write(ts)

print("âœ… speedOffPeak.ts written")


âœ… speedOffPeak.ts written
