In [4]:
# CONFIG
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# input CSV of crash records and the folder that receives every table/figure
DATA_PATH = "/Users/eamon/Desktop/University/UofT 2025-26/Fall/Applied Machine Learning/Research Project/Original Paper/Data/Motor_Vehicle_Collisions_-_Crashes_20250917.csv"
OUT_DIR   = "/Users/eamon/Desktop/University/UofT 2025-26/Fall/Applied Machine Learning/Research Project/Original Paper/Tables and Figures"

# create the output folder (and any missing parents); a no-op if it already exists
Path(OUT_DIR).mkdir(exist_ok=True, parents=True)

# policy start date and the window reaching six calendar months to either side
_HALF_WINDOW = pd.DateOffset(months=6)
POLICY_DATE = pd.Timestamp("2022-08-01")
START_DATE, END_DATE = POLICY_DATE - _HALF_WINDOW, POLICY_DATE + _HALF_WINDOW

# LOAD DATA
# read only the two columns this analysis uses
use_cols = ["CRASH DATE", "NUMBER OF PERSONS INJURED"]
df = pd.read_csv(DATA_PATH, usecols=use_cols, low_memory=False)

# PARSE AND FILTER
# convert crash dates; values that cannot be parsed become NaT and their rows are discarded
df["CRASH DATE"] = pd.to_datetime(df["CRASH DATE"], errors="coerce")
df = df[df["CRASH DATE"].notna()].copy()

# coerce injury counts to numbers, counting missing or unparseable entries as 0
df["NUMBER OF PERSONS INJURED"] = (
    pd.to_numeric(df["NUMBER OF PERSONS INJURED"], errors="coerce")
      .fillna(0)
)

# flag crashes inside the policy window (both endpoints are included)
mask = df["CRASH DATE"].between(START_DATE, END_DATE)

# use the window when it holds data, otherwise fall back to every parsed row
if mask.any():
    win = df.loc[mask].copy()
    window_label = "±6 months around policy"
else:
    win = df.copy()
    window_label = "full sample (policy year not fully in range)"

# DAILY SERIES
# Fail with a clear message when there is nothing to aggregate; otherwise the
# date_range call below would be handed NaT bounds and raise a cryptic error.
if win.empty:
    raise ValueError(
        "No crash records with a valid CRASH DATE; cannot build the daily series."
    )

# aggregate injuries per calendar day
# normalize() resets each timestamp to midnight, so rows that carry a
# time-of-day are still counted under their date; without it they would form
# their own groups and be dropped by the reindex onto the daily range below.
daily = (
    win.groupby(win["CRASH DATE"].dt.normalize())["NUMBER OF PERSONS INJURED"]
       .sum()
       .rename("injuries")
       .sort_index()
)

# reindex to fill missing days (days without any recorded crash count as 0 injuries)
full_idx = pd.date_range(daily.index.min(), daily.index.max(), freq="D")
daily = daily.reindex(full_idx).fillna(0)

# 7-day centered moving average; near the edges it averages whatever days are
# available, requiring at least 3 of them
smooth = daily.rolling(window=7, center=True, min_periods=3).mean()

# PLOT: SMOOTHED DAILY LINE
# faint raw daily totals underneath, heavier smoothed trend on top
fig, ax = plt.subplots(figsize=(11, 5.5))
ax.plot(daily.index, daily.values, label="Daily total", alpha=0.35, linewidth=0.8)
ax.plot(smooth.index, smooth.values, label="7-day moving average", linewidth=2.0)

# mark the policy start only when it lies within the plotted dates
if daily.index.min() <= POLICY_DATE <= daily.index.max():
    ax.axvline(POLICY_DATE, label="Policy start", color="red", linestyle="--", linewidth=1.2)

# titles, grid and legend
ax.set(
    title=f"Injuries per Day ({window_label})",
    xlabel="Date",
    ylabel="Number of injuries",
)
ax.grid(True, linestyle="--", alpha=0.5)
ax.legend(loc="upper right")
fig.tight_layout()

# write the figure to the output folder, then release it
daily_pdf = Path(OUT_DIR).joinpath("daily_injuries_smoothed.pdf")
fig.savefig(daily_pdf, bbox_inches="tight")
plt.close(fig)

# PLOT: WEEKLY BAR (AVERAGE DAILY INJURIES PER WEEK)
# Weekly frequencies are right-closed and right-labelled by default, which
# would stamp each mean with the LAST day of its week. Left-closed,
# left-labelled bins cover Monday through Sunday and are stamped with the
# Monday that starts them, so each bar can be drawn over the days it summarises.
weekly = daily.resample("W-MON", closed="left", label="left").mean()

fig, ax = plt.subplots(figsize=(11, 5.0))
# align="edge" starts every bar on its week's Monday; width=6 (days) leaves a
# one-day gap between neighbouring bars
ax.bar(weekly.index, weekly.values, width=6, align="edge")
ax.set_title("Average Daily Injuries per Week")
ax.set_xlabel("Week")
ax.set_ylabel("Avg daily injuries")
ax.grid(True, axis="y", linestyle="--", alpha=0.5)

# policy line if inside the plotted date range (same test as the daily figure;
# the week labels alone do not span the full range of plotted days)
if daily.index.min() <= POLICY_DATE <= daily.index.max():
    ax.axvline(POLICY_DATE, linestyle="--", color="red", linewidth=1.2, label="Policy start")
    ax.legend()

fig.tight_layout()
weekly_pdf = Path(OUT_DIR) / "weekly_injuries_bar.pdf"
fig.savefig(weekly_pdf, bbox_inches="tight")
plt.close(fig)

# SUMMARY STATS (SPLIT BEFORE/AFTER POLICY)
def desc_row(series: pd.Series) -> pd.Series:
    """Return the mean, sample standard deviation, minimum and maximum of *series*.

    The result is a Series labelled "Mean", "St. Dev.", "Minimum" and
    "Maximum", in that order, ready to be used as one row of a summary table.
    """
    stats = {
        "Mean": series.mean(),
        "St. Dev.": series.std(),
        "Minimum": series.min(),
        "Maximum": series.max(),
    }
    return pd.Series(stats)

# split the daily series at the policy date; the policy day itself counts as "after"
before = daily[daily.index < POLICY_DATE]
after  = daily[daily.index >= POLICY_DATE]

# one summary row per period that actually contains data
rows = [
    (period, desc_row(segment))
    for period, segment in (("Before policy", before), ("After policy", after))
    if not segment.empty
]

# periods become table rows, statistics become columns, rounded to 2 decimals
summ = pd.DataFrame(dict(rows)).T
summ = summ.loc[:, ["Mean", "St. Dev.", "Minimum", "Maximum"]].round(2)

# export latex and csv
summ_csv = Path(OUT_DIR) / "daily_injuries_summary_split.csv"
summ_tex = Path(OUT_DIR) / "daily_injuries_summary_split.tex"
summ.to_csv(summ_csv, index=True)
summ_tex.write_text(
    summ.to_latex(
        index=True,
        float_format="%.2f",
        caption="Summary Statistics: Daily Injuries (Before vs After Policy)",
        label="tab:daily_injuries_summary_split",
    )
)

# LOG
# report every file written so far
for _saved in (daily_pdf, weekly_pdf, summ_csv, summ_tex):
    print(f"[saved] {_saved}")

# Clean export version: the same table written without caption or label,
# with LaTeX special characters escaped
core_tex = Path(OUT_DIR) / "daily_injuries_summary_split_core.tex"
summ.to_latex(buf=core_tex, index=True, float_format="%.2f", escape=True)
print(f"[saved clean LaTeX table] {core_tex}")



[saved] /Users/eamon/Desktop/University/UofT 2025-26/Fall/Applied Machine Learning/Research Project/Original Paper/Tables and Figures/daily_injuries_smoothed.pdf
[saved] /Users/eamon/Desktop/University/UofT 2025-26/Fall/Applied Machine Learning/Research Project/Original Paper/Tables and Figures/weekly_injuries_bar.pdf
[saved] /Users/eamon/Desktop/University/UofT 2025-26/Fall/Applied Machine Learning/Research Project/Original Paper/Tables and Figures/daily_injuries_summary_split.csv
[saved] /Users/eamon/Desktop/University/UofT 2025-26/Fall/Applied Machine Learning/Research Project/Original Paper/Tables and Figures/daily_injuries_summary_split.tex
[saved clean LaTeX table] /Users/eamon/Desktop/University/UofT 2025-26/Fall/Applied Machine Learning/Research Project/Original Paper/Tables and Figures/daily_injuries_summary_split_core.tex
