# In [1]:
# ============================================
# NYC 24/7 Cameras — Regression Tree (scikit-learn)
# Data window: ±6 months around Aug 1, 2022
# Outputs: LaTeX tables + PDF figures for LaTeX
# ============================================

import os
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor, plot_tree, export_text
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.inspection import permutation_importance

# ------------------ Config ------------------
# Input CSV and output directory. The defaults are the author's absolute
# paths; set NYC_CRASH_DATA_PATH and/or NYC_CRASH_REPORT_DIR in the
# environment to run the script on another machine without editing the source.
data_path = os.environ.get(
    "NYC_CRASH_DATA_PATH",
    "/Users/eamon/Desktop/University/UofT 2025-26/Fall/Applied Machine Learning/Research Project/Original Data/Motor_Vehicle_Collisions_-_Crashes_20250917.csv",
)
report_dir = os.environ.get(
    "NYC_CRASH_REPORT_DIR",
    "/Users/eamon/Desktop/University/UofT 2025-26/Fall/Applied Machine Learning/Week 3/Report",
)

# Create the output directory up front so the later savefig()/open() calls
# cannot fail on a missing folder.
os.makedirs(report_dir, exist_ok=True)

# Fail fast, with a hint on how to fix it, when the input file is missing.
# Same exception type that pd.read_csv would raise for a missing path.
if not os.path.isfile(data_path):
    raise FileNotFoundError(
        f"Crash data CSV not found: {data_path!r}. "
        "Set the NYC_CRASH_DATA_PATH environment variable to its location."
    )

# Policy start date and the bounds of the symmetric six-month analysis window
# on either side of it.
policy_date = pd.Timestamp(year=2022, month=8, day=1)
_half_window = pd.DateOffset(months=6)
start_date, end_date = policy_date - _half_window, policy_date + _half_window

# ------------------ Load & basic clean ------------------
# Read only the columns that the rest of the script uses.
use_cols = [
    "CRASH DATE", "CRASH TIME", "BOROUGH", "LATITUDE", "LONGITUDE",
    "NUMBER OF PERSONS INJURED",
]
df = pd.read_csv(data_path, usecols=use_cols, low_memory=False)

# Combine the separate date and time columns into a single timestamp.
# Missing or unparseable values become NaT (errors="coerce") and those rows
# are dropped.
df["CRASH_DATETIME"] = pd.to_datetime(
    df["CRASH DATE"].astype(str) + " " + df["CRASH TIME"].astype(str),
    errors="coerce",
)
df = df.dropna(subset=["CRASH_DATETIME"])

# Keep crashes inside the half-open window [start_date, end_date).
# The explicit .copy() makes df an independent frame, so the feature columns
# assigned to it later are not writes to a slice of the full table. Without
# it pandas emits SettingWithCopyWarning, which the global
# warnings.simplefilter('ignore') at the top of the file would silently hide.
in_window = (df["CRASH_DATETIME"] >= start_date) & (df["CRASH_DATETIME"] < end_date)
df = df[in_window].copy()




# ------------------ Features ------------------
# Calendar features and the post-policy indicator, all derived from the
# parsed crash timestamp.
crash_ts = df["CRASH_DATETIME"]
df["hour"] = crash_ts.dt.hour
df["weekday"] = crash_ts.dt.weekday  # 0=Mon
df["month"] = crash_ts.dt.month
df["post"] = (crash_ts >= policy_date).astype(int)  # 1 on/after the policy date

# Target: log(1 + injured); a missing injury count is treated as zero.
injured = df["NUMBER OF PERSONS INJURED"].fillna(0)
y = np.log1p(injured)

# Column roles used by the preprocessing step.
num_features = ["hour", "weekday", "month", "LATITUDE", "LONGITUDE", "post"]
cat_features = ["BOROUGH"]

# Feature matrix (raw columns; imputation/encoding happens in the pipeline).
feature_columns = [
    "hour", "weekday", "month", "LATITUDE", "LONGITUDE", "BOROUGH", "post",
]
X = df[feature_columns]

# Numeric columns: fill gaps with the column median.
numeric_steps = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
])

# Categorical columns: fill gaps with the most frequent label, then one-hot
# encode; categories unseen during fit are ignored at predict time.
categorical_steps = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each group of columns through its own sub-pipeline.
preprocess = ColumnTransformer(transformers=[
    ("num", numeric_steps, num_features),
    ("cat", categorical_steps, cat_features),
])


# ------------------ Tidy working frame ------------------
# hour / weekday / month / post were already derived from CRASH_DATETIME in
# the feature section earlier in this script, so they are not recomputed here.
#
# Drop the raw timestamp, coordinate and identifier columns from df.
# NOTE: X and y were extracted from df before this point, so this only tidies
# df for inspection -- the model still receives LATITUDE and LONGITUDE via X.
# errors="ignore" skips any listed name that is not present in df.
df = df.drop(
    columns=[
        "LATITUDE", "LONGITUDE",
        "CRASH DATE", "CRASH TIME", "CRASH_DATETIME",  # <-- safe to drop *after* feature engineering
        "COLLISION_ID", "CRASH_ID", "VEHICLE_ID",
        "LOCATION", "ON STREET NAME", "CROSS STREET NAME", "OFF STREET NAME",
    ],
    errors="ignore",
)

print("Remaining columns:", df.columns.tolist())

# ------------------ Train/test split ------------------
# Hold out a quarter of the rows for the final evaluation; the fixed seed
# keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.25,
)

# ------------------ Model & tuning ------------------
# A single regression tree placed behind the preprocessing step, so that the
# imputers and encoder are fitted as part of the estimator being tuned.
tree_reg = DecisionTreeRegressor(random_state=42)
pipe = Pipeline(steps=[("pre", preprocess), ("model", tree_reg)])

# Search grid over tree depth and minimum leaf size.
param_grid = dict(
    model__max_depth=[3, 4, 5, 6, 8, 10],
    model__min_samples_leaf=[50, 100, 200],
)

# 5-fold cross-validated grid search, scored by (negated) mean squared error,
# using every available core.
gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
    verbose=0,
)
gs.fit(X_train, y_train)

# Best pipeline found by the search, and its predictions on the held-out rows.
best_pipe = gs.best_estimator_
y_pred = best_pipe.predict(X_test)

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# --- Metrics (LaTeX) ---
def rmse_score(y_true, y_pred):
    """Return the root-mean-squared error between *y_true* and *y_pred*.

    The square root is taken explicitly rather than passing
    ``squared=False`` to ``mean_squared_error``. That keyword is not accepted
    by every scikit-learn release, and probing for it with
    ``try/except TypeError`` cannot tell "keyword not supported" apart from
    a TypeError caused by the inputs themselves.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Held-out performance. The target is log(1 + injured), so RMSE and MAE are
# expressed on that log scale.
rmse = rmse_score(y_test, y_pred)
mae  = mean_absolute_error(y_test, y_pred)
r2   = r2_score(y_test, y_pred)

# Report the metrics instead of discarding them: print to the console and
# save a small LaTeX table next to the other report artefacts.
print(f"RMSE: {rmse:.4f} | MAE: {mae:.4f} | R^2: {r2:.4f}")

metrics_df = pd.DataFrame({
    "Metric": ["RMSE", "MAE", "$R^2$"],
    "Value": [rmse, mae, r2],
})
metrics_tex = os.path.join(report_dir, "tree_metrics.tex")
with open(metrics_tex, "w") as f:
    # escape=False keeps the math-mode "$R^2$" label intact in the output.
    f.write(metrics_df.to_latex(index=False, float_format="%.4f", escape=False,
                                caption="Regression Tree Test-Set Performance",
                                label="tab:tree_metrics"))
print(f"[saved] {metrics_tex}")


# ------------------ Tree plot (PDF) ------------------
# Pull the fitted regressor out of the tuned pipeline, together with the
# column names produced by the preprocessing step (used to label the splits).
fitted_tree = best_pipe.named_steps["model"]
feature_names = best_pipe.named_steps["pre"].get_feature_names_out()

tree_pdf = os.path.join(report_dir, "decision_tree.pdf")

# Draw the tree on a wide canvas, down to the model's own max_depth setting.
fig, ax = plt.subplots(figsize=(18, 10))
plot_tree(
    fitted_tree,
    feature_names=feature_names,
    filled=True,
    rounded=True,
    impurity=False,
    max_depth=fitted_tree.max_depth,
    ax=ax,
)
ax.set_title("Decision Tree (best model)")

# Write the figure to disk and release it.
fig.savefig(tree_pdf, bbox_inches="tight")
plt.close(fig)
print(f"[saved] {tree_pdf}")

# ------------------ Text rules (.txt for appendix) ------------------
# Plain-text rendering of the fitted tree's decision rules, written to the
# report directory.
rules_path = os.path.join(report_dir, "decision_tree_rules.txt")
rules_txt = export_text(fitted_tree, feature_names=list(feature_names))

with open(rules_path, mode="w") as rules_file:
    rules_file.write(rules_txt)

print(f"[saved] {rules_path}")

# ------------------ Permutation importance (LaTeX + PDF bar chart) ------------------
# permutation_importance is run on the *whole pipeline*: it shuffles one
# column of X_test at a time and re-scores, so it returns one importance per
# raw input column, in X_test column order. It does NOT return one value per
# transformed feature, so the preprocessor's get_feature_names_out() must not
# be used as labels here: one-hot encoding BOROUGH changes the column count
# (the DataFrame constructor then raises on unequal lengths), and the column
# order differs from X_test as well.
perm = permutation_importance(
    best_pipe, X_test, y_test,
    n_repeats=10, random_state=42, scoring="neg_mean_squared_error",
)

# One row per raw input column, most important first.
# With neg-MSE scoring the importance is the increase in MSE after shuffling.
imp_df = pd.DataFrame({
    "feature": list(X_test.columns),
    "importance": perm.importances_mean,
}).sort_values("importance", ascending=False).head(20)

# Save LaTeX table
imp_tex = os.path.join(report_dir, "tree_permutation_importance.tex")
with open(imp_tex, "w") as f:
    f.write(imp_df.to_latex(index=False, float_format="%.4f",
                            caption="Top Features by Permutation Importance",
                            label="tab:tree_perm_importance"))
print(f"[saved] {imp_tex}")

# Save PDF bar chart (largest importance at the top)
plt.figure(figsize=(8, 6))
plt.barh(imp_df["feature"], imp_df["importance"])
plt.gca().invert_yaxis()
plt.xlabel("Permutation importance (ΔMSE)")
plt.title("Decision Tree — Top Features")
imp_pdf = os.path.join(report_dir, "tree_permutation_importance.pdf")
plt.tight_layout()
plt.savefig(imp_pdf)
plt.close()
print(f"[saved] {imp_pdf}")



# Output recorded from the last run of this cell:
# FileNotFoundError: [Errno 2] No such file or directory: '/Users/eamon/Desktop/University/UofT 2025-26/Fall/Applied Machine Learning/Research Project/Original Data/Motor_Vehicle_Collisions_-_Crashes_20250917.csv'