In [4]:
# ============================================
# CONFIG
# ============================================
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# --- paths (edit if needed) ---
# DATA_PATH: the crash CSV that is loaded below.
# OUT_DIR:   folder that receives every exported figure and table.
# Both are absolute, machine-specific locations.
DATA_PATH = "/Users/eamon/Desktop/University/UofT 2025-26/Fall/Applied Machine Learning/Research Project/Original Paper/Data/Motor_Vehicle_Collisions_-_Crashes_20250917.csv"
OUT_DIR   = "/Users/eamon/Desktop/University/UofT 2025-26/Fall/Applied Machine Learning/Research Project/Original Paper/Tables and Figures"
# Create the output folder (including missing parents); an existing folder is fine.
os.makedirs(OUT_DIR, exist_ok=True)

# --- policy / window ---
# The analysis window reaches six calendar months to either side of the
# policy date.
_HALF_WINDOW = pd.DateOffset(months=6)
POLICY_DATE = pd.Timestamp("2022-08-01")
START_DATE, END_DATE = POLICY_DATE - _HALF_WINDOW, POLICY_DATE + _HALF_WINDOW

# --- night definition (edit if desired) ---
# Hours are on a 24-hour clock and the window is half-open:
# NIGHT_START is included, NIGHT_END is excluded.
NIGHT_START = 21  # 9pm
NIGHT_END   = 6   # 6am

def is_night_hour(h: float, *, start=None, end=None) -> bool:
    """Return True if hour ``h`` falls inside the night window.

    The window is half-open, ``[start, end)``. Two shapes are supported:

    * ``start > end``: the window wraps past midnight, i.e.
      ``[start, 24) ∪ [0, end)`` (the shipped 21 -> 6 setting).
    * ``start < end``: the window lies within one day, e.g. 0 -> 6.
      A plain ``h >= start or h < end`` test would flag every hour here,
      so this case is handled separately.

    ``start == end`` is treated like the wrapping case and therefore
    matches every hour.

    Args:
        h: Hour of day; may be missing (NaN / None / NA).
        start: First night hour; defaults to ``NIGHT_START``.
        end: First non-night hour; defaults to ``NIGHT_END``.

    Returns:
        A plain Python ``bool``; ``False`` when ``h`` is missing.
    """
    if pd.isna(h):
        return False
    lo = NIGHT_START if start is None else start
    hi = NIGHT_END if end is None else end
    if lo < hi:
        # Window does not cross midnight.
        return bool(lo <= h < hi)
    # Window wraps midnight: late evening OR early morning.
    return bool(h >= lo or h < hi)


# ============================================
# LOAD DATA
# ============================================
# Only the columns used by the analysis are read from the CSV.
use_cols = [
    "CRASH DATE",
    "CRASH TIME",
    "NUMBER OF PERSONS INJURED",
    "VEHICLE TYPE CODE 1",
    "VEHICLE TYPE CODE 2",
    "VEHICLE TYPE CODE 3",
    "VEHICLE TYPE CODE 4",
    "VEHICLE TYPE CODE 5",
]
df = pd.read_csv(DATA_PATH, usecols=use_cols, low_memory=False)

# Crash date: values that cannot be parsed become NaT and those rows are dropped.
df["CRASH DATE"] = pd.to_datetime(df["CRASH DATE"], errors="coerce")
df = df.loc[df["CRASH DATE"].notna()].copy()

# Hour of day taken from the "HH:MM" time string; unparseable times yield NaN.
crash_clock = pd.to_datetime(df["CRASH TIME"], format="%H:%M", errors="coerce")
df["hour"] = crash_clock.dt.hour

# Injury count: non-numeric or missing entries are counted as zero.
injured = pd.to_numeric(df["NUMBER OF PERSONS INJURED"], errors="coerce")
df["NUMBER OF PERSONS INJURED"] = injured.fillna(0)

# Night flag, evaluated hour by hour with is_night_hour.
df["is_night"] = df["hour"].apply(is_night_hour)

# ============================================
# VEHICLE TYPE TAGS
# ============================================
# Names of all loaded columns that carry a vehicle type code.
veh_cols = [name for name in df.columns if name.startswith("VEHICLE TYPE CODE")]

def norm(s):
    """Return ``s`` as a trimmed, lower-cased string; missing values give ""."""
    return "" if pd.isna(s) else str(s).strip().lower()

# Keyword sets (adjust as needed)
# One lower-case keyword set per vehicle category. `involves` tests each
# keyword as a substring of the normalized vehicle-type text.

# Emergency-service vehicles
EMERGENCY_KW = {
    "ambulance",
    "emergency",
    "ems",
    "fire",
    "fire truck",
    "nypd",
    "police",
}

# Buses
BUS_KW = {
    "bus",
    "charter bus",
    "school bus",
    "transit bus",
}

# Trucks and other heavy vehicles
TRUCK_KW = {
    "box truck",
    "cement",
    "dump",
    "flatbed",
    "garbage",
    "semi",
    "tow",
    "tractor",
    "tractor truck",
    "truck",
}

# Passenger cars, taxis and vans
CAR_KW = {
    "cab",
    "coupe",
    "hatchback",
    "limousine",
    "minivan",
    "passenger",
    "sedan",
    "sport utility",
    "station wagon",
    "taxi",
    "van",
}

def involves(row, kw_set):
    """Return True if any vehicle-type field of ``row`` contains a keyword.

    Each column listed in ``veh_cols`` is normalized with ``norm`` and then
    checked by substring, so variants such as "tractor truck (semi)" still
    match.
    """
    for column in veh_cols:
        vehicle_text = norm(row[column])
        for keyword in kw_set:
            if keyword in vehicle_text:
                return True
    return False

# Add one boolean column per category: involves_emergency, involves_bus,
# involves_truck and involves_car.
# NOTE(review): each category is a separate row-wise pass over df, which may
# be slow on a large file.
category_keywords = [
    ("emergency", EMERGENCY_KW),
    ("bus", BUS_KW),
    ("truck", TRUCK_KW),
    ("car", CAR_KW),
]
for cat, kws in category_keywords:
    df[f"involves_{cat}"] = df.apply(involves, axis=1, args=(kws,))


# ============================================
# WINDOW FILTER
# ============================================
# Keep crashes dated from START_DATE through END_DATE, both ends included.
mask = df["CRASH DATE"].between(START_DATE, END_DATE)
win = df.loc[mask].copy()

if not win.empty:
    window_label = "±6 months around policy"
else:
    # Nothing falls inside the 12-month window: analyse the full sample instead.
    win = df.copy()
    window_label = "full sample (policy year not fully in range)"

# ============================================
# DAILY SERIES (OVERALL) + PLOTS
# ============================================
# Overall daily injuries: total persons injured per crash date in the window
per_day = win.groupby("CRASH DATE")["NUMBER OF PERSONS INJURED"].sum()
daily_all = per_day.rename("injuries").sort_index()

# Put the series on a gap-free daily calendar; dates without any row count as 0
full_idx = pd.date_range(
    start=daily_all.index.min(),
    end=daily_all.index.max(),
    freq="D",
)
daily_all = daily_all.reindex(full_idx).fillna(0)

# 7-day centered moving average (needs at least 3 observations in the window)
smooth = daily_all.rolling(7, min_periods=3, center=True).mean()

# --- Plot 1: daily + 7-day MA ---
fig, ax = plt.subplots(figsize=(11, 5.5))

# Faint raw daily totals first, then the bold smoothed line on top.
ax.plot(daily_all.index, daily_all.to_numpy(), linewidth=0.8, alpha=0.35, label="Daily total")
ax.plot(smooth.index, smooth.to_numpy(), linewidth=2.0, label="7-day moving average")

# Mark the policy date only when it lies inside the plotted range.
first_day, last_day = daily_all.index.min(), daily_all.index.max()
if first_day <= POLICY_DATE <= last_day:
    ax.axvline(POLICY_DATE, linestyle="--", linewidth=1.2, label="Policy start")

ax.set_title(f"Injuries per Day ({window_label})")
ax.set_xlabel("Date")
ax.set_ylabel("Number of injuries")
ax.grid(True, linestyle="--", alpha=0.5)
ax.legend(loc="upper right")
fig.tight_layout()

# Save as PDF and release the figure.
daily_pdf = Path(OUT_DIR) / "daily_injuries_smoothed.pdf"
fig.savefig(daily_pdf, bbox_inches="tight")
plt.close(fig)

# --- Plot 2: weekly bar of avg daily injuries ---
# Mean of the daily totals within each "W-MON" weekly bin.
# NOTE(review): confirm which days each "W-MON" bin covers relative to the
# label it is plotted at; the bars are centered on that label.
weekly = daily_all.resample("W-MON").mean()

fig, ax = plt.subplots(figsize=(11, 5.0))
ax.bar(weekly.index, weekly.to_numpy(), width=5, align="center")
ax.set_title("Average Daily Injuries per Week")
ax.set_xlabel("Week")
ax.set_ylabel("Avg daily injuries")
ax.grid(True, axis="y", linestyle="--", alpha=0.5)

# The policy marker (and its legend) is drawn only when the date is on the axis.
policy_on_axis = weekly.index.min() <= POLICY_DATE <= weekly.index.max()
if policy_on_axis:
    ax.axvline(POLICY_DATE, linestyle="--", linewidth=1.2, label="Policy start")
    ax.legend()

fig.tight_layout()
weekly_pdf = Path(OUT_DIR) / "weekly_injuries_bar.pdf"
fig.savefig(weekly_pdf, bbox_inches="tight")
plt.close(fig)

# ============================================
# SUMMARY TABLE (requested rows only)
# ============================================
def daily_sum(filtered_df: pd.DataFrame, start=None, end=None) -> pd.Series:
    """Sum injuries per calendar day on a gap-free daily index.

    Crash timestamps are truncated to midnight before grouping, so rows
    carrying a time-of-day component are still counted on their day instead
    of being dropped by the daily reindex.

    Args:
        filtered_df: Crashes to aggregate; needs a datetime "CRASH DATE"
            column and a numeric "NUMBER OF PERSONS INJURED" column.
        start: Optional first day of the output index. Defaults to the
            earliest crash day in ``filtered_df``.
        end: Optional last day of the output index. Defaults to the latest
            crash day in ``filtered_df``.

    Returns:
        Daily injury totals indexed by day; days without crashes are 0.
        Days outside ``[start, end]`` are dropped. For an empty
        ``filtered_df`` the result is an empty float Series, unless both
        ``start`` and ``end`` are given, in which case it is all zeros over
        that span.
    """
    first = None if start is None else pd.Timestamp(start).normalize()
    last = None if end is None else pd.Timestamp(end).normalize()

    if filtered_df.empty:
        if first is not None and last is not None:
            # No crashes at all still means zero injuries on every requested day.
            return pd.Series(0.0, index=pd.date_range(first, last, freq="D"))
        return pd.Series(dtype=float)

    # Group on the calendar day, not the raw timestamp.
    day = filtered_df["CRASH DATE"].dt.normalize()
    s = (
        filtered_df["NUMBER OF PERSONS INJURED"]
        .groupby(day)
        .sum()
        .sort_index()
    )

    # Without an explicit bound, fall back to the data's own first/last day.
    if first is None:
        first = s.index.min()
    if last is None:
        last = s.index.max()
    full = pd.date_range(first, last, freq="D")
    return s.reindex(full).fillna(0)

def desc_row(series: pd.Series) -> pd.Series:
    """Return Mean, St. Dev., Minimum and Maximum of ``series`` as one table row.

    The values come from ``series.describe()``; an empty input gives a row
    of NaN under the same four labels.
    """
    # Table label -> key in the describe() result, in output order.
    stat_keys = {
        "Mean": "mean",
        "St. Dev.": "std",
        "Minimum": "min",
        "Maximum": "max",
    }
    if series.empty:
        return pd.Series({label: np.nan for label in stat_keys})
    described = series.describe()
    return pd.Series({label: described[key] for label, key in stat_keys.items()})

# Boolean selectors over the windowed crashes
m_before = win["CRASH DATE"].lt(POLICY_DATE)   # strictly before the policy date
m_after  = ~m_before                           # policy date onward
m_night  = win["is_night"].eq(True)

# (row label, crashes feeding that row), listed in table order.
#   Intercept     - every crash in the window
#   Before×Night  - night-hour crashes before the policy date
#   After         - all hours, policy date onward
#   Before        - all hours, before the policy date
#   Vehicle Type  - all dates/hours; a crash counts for a type when its
#                   involves_<type> flag is set
row_sources = [
    ("Intercept", win),
    ("Before×Night", win.loc[m_before & m_night]),
    ("After", win.loc[m_after]),
    ("Before", win.loc[m_before]),
    ("Vehicle Type — Emergency", win.loc[win["involves_emergency"]]),
    ("Vehicle Type — Bus", win.loc[win["involves_bus"]]),
    ("Vehicle Type — Car", win.loc[win["involves_car"]]),
    ("Vehicle Type — Truck", win.loc[win["involves_truck"]]),
]

# Each subset becomes a daily injury series, then one row of statistics.
rows = [(label, desc_row(daily_sum(subset))) for label, subset in row_sources]

# Assemble table: one row per label, fixed column order, two decimals
summ = pd.DataFrame(dict(rows)).T
summ = summ.loc[:, ["Mean", "St. Dev.", "Minimum", "Maximum"]].round(2)

# ============================================
# EXPORTS
# ============================================
# The row labels and the caption contain non-ASCII characters ("×", "—"),
# so every text export states its encoding explicitly instead of relying on
# the platform's locale default.

# CSV
summ_csv = Path(OUT_DIR) / "daily_injuries_summary_requested_rows.csv"
summ.to_csv(summ_csv, index=True, encoding="utf-8")

# LaTeX with caption/label
summ_tex = Path(OUT_DIR) / "daily_injuries_summary_requested_rows.tex"
with open(summ_tex, "w", encoding="utf-8") as f:
    f.write(summ.to_latex(
        index=True,
        float_format="%.2f",
        caption="Summary Statistics: Intercept, Before×Night, After, Before, and Vehicle Types",
        label="tab:injuries_requested_rows"
    ))

# Clean LaTeX table (no caption/label; LaTeX-safe)
core_tex = Path(OUT_DIR) / "daily_injuries_summary_requested_rows_core.tex"
summ.to_latex(core_tex, index=True, float_format="%.2f", escape=True, encoding="utf-8")

# Logs
print(f"[saved] {daily_pdf}")
print(f"[saved] {weekly_pdf}")
print(f"[saved] {summ_csv}")
print(f"[saved] {summ_tex}")
print(f"[saved clean LaTeX table] {core_tex}")




[saved] /Users/eamon/Desktop/University/UofT 2025-26/Fall/Applied Machine Learning/Research Project/Original Paper/Tables and Figures/daily_injuries_smoothed.pdf
[saved] /Users/eamon/Desktop/University/UofT 2025-26/Fall/Applied Machine Learning/Research Project/Original Paper/Tables and Figures/weekly_injuries_bar.pdf
[saved] /Users/eamon/Desktop/University/UofT 2025-26/Fall/Applied Machine Learning/Research Project/Original Paper/Tables and Figures/daily_injuries_summary_requested_rows.csv
[saved] /Users/eamon/Desktop/University/UofT 2025-26/Fall/Applied Machine Learning/Research Project/Original Paper/Tables and Figures/daily_injuries_summary_requested_rows.tex
[saved clean LaTeX table] /Users/eamon/Desktop/University/UofT 2025-26/Fall/Applied Machine Learning/Research Project/Original Paper/Tables and Figures/daily_injuries_summary_requested_rows_core.tex
