In [6]:
# ============================================
# NYC 24/7 Cameras — Regression Tree (scikit-learn)
# Data window: ±6 months around Aug 1, 2022
# Outputs: LaTeX tables + PDF figures for LaTeX
# ============================================

import os
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor, plot_tree, export_text
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.inspection import permutation_importance

# ------------------ CONFIG ------------------
# Study window: six calendar months on each side of the policy date.
policy_date = pd.Timestamp('2022-08-01')
half_window = pd.DateOffset(months=6)
start_date, end_date = policy_date - half_window, policy_date + half_window

# Input CSV and the folder that receives every generated table/figure.
data_path = "/Users/eamon/Desktop/University/UofT 2025-26/Fall/Applied Machine Learning/Research Project/Original Paper/Data/Motor_Vehicle_Collisions_-_Crashes_20250917.csv"
report_dir = "/Users/eamon/Desktop/University/UofT 2025-26/Fall/Applied Machine Learning/Week 3/Report"
os.makedirs(report_dir, exist_ok=True)  # no-op when the folder already exists

# ------------------ LOAD & BASIC CLEAN ------------------
# Read only the columns used further down.
use_cols = [
    "CRASH DATE","CRASH TIME","BOROUGH","LATITUDE","LONGITUDE",
    "NUMBER OF PERSONS INJURED"
]
df = pd.read_csv(data_path, usecols=use_cols, low_memory=False)

# Build one timestamp per crash from the separate date and time columns.
# errors="coerce" turns unparseable rows into NaT so they can be dropped.
df["CRASH_DATETIME"] = pd.to_datetime(
    df["CRASH DATE"].astype(str) + " " + df["CRASH TIME"].astype(str),
    errors="coerce"
)
df = df.dropna(subset=["CRASH_DATETIME"])

# Keep crashes in the half-open window [start_date, end_date).
# .copy() makes the filtered frame independent of the full data set, so the
# feature columns assigned onto `df` afterwards do not write to a slice
# (which raises SettingWithCopyWarning -- hidden here by the global
# warnings filter).
in_window = (df["CRASH_DATETIME"] >= start_date) & (df["CRASH_DATETIME"] < end_date)
df = df[in_window].copy()

# ------------------ FEATURES ------------------
# Calendar parts of the crash timestamp, plus a 0/1 flag marking crashes on
# or after the policy date.
crash_ts = df["CRASH_DATETIME"]
df["hour"] = crash_ts.dt.hour
df["weekday"] = crash_ts.dt.weekday
df["month"] = crash_ts.dt.month
df["post"] = (crash_ts >= policy_date).astype(int)

# Column roles consumed by the preprocessing step.
num_features = ["hour","weekday","month","LATITUDE","LONGITUDE","post"]
cat_features = ["BOROUGH"]

# Model inputs; the column order is spelled out explicitly.
X = df[["hour","weekday","month","LATITUDE","LONGITUDE","BOROUGH","post"]]

# Target: log(1 + injured), with a missing injury count treated as zero.
injured = df["NUMBER OF PERSONS INJURED"].fillna(0)
y = np.log1p(injured)

# Numeric columns: fill missing values with the column median.
numeric_steps = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
])

# Categorical columns: fill missing values with the most frequent level, then
# one-hot encode (handle_unknown="ignore" tolerates levels not seen in fit).
categorical_steps = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each group of columns through its own sub-pipeline.
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_steps, num_features),
        ("cat", categorical_steps, cat_features),
    ]
)

# ------------------ TRAIN/TEST SPLIT ------------------
# Hold out 25% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.25
)

# ------------------ MODEL & TUNING ------------------
# Preprocessing and the tree live in one pipeline so they are fitted together.
tree_reg = DecisionTreeRegressor(random_state=42)
pipe = Pipeline(steps=[("pre", preprocess), ("model", tree_reg)])

# Single-candidate grid: depth 3 with at least 200 samples per leaf keeps the
# tree small. With one candidate the search only cross-validates this setting
# and then exposes the fitted pipeline as best_estimator_.
param_grid = {
    "model__min_samples_leaf": [200],
    "model__max_depth": [3],
}

gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
    verbose=0,
)
gs.fit(X_train, y_train)

# Predictions of the selected pipeline on the held-out rows.
best_pipe = gs.best_estimator_
y_pred = best_pipe.predict(X_test)

# ------------------ METRICS ------------------
def rmse_score(y_true, y_pred):
    """Return the root mean squared error between *y_true* and *y_pred*.

    The square root is taken explicitly rather than passing
    ``squared=False`` to ``mean_squared_error``: that keyword is not accepted
    by every scikit-learn release (which is why a ``TypeError`` fallback was
    needed before), so this form is a single code path that behaves the same
    on all versions.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Hold-out metrics; all are on the log1p(injured) scale of the target.
rmse = rmse_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# One call, one line per metric.
print(
    "Decision Tree Performance:",
    f"  RMSE: {rmse:.4f}",
    f"  MAE : {mae:.4f}",
    f"  R^2 : {r2:.4f}",
    sep="\n",
)

# ------------------ PRETTY FEATURE NAMES ------------------
def prettify_feature_names(names):
    """Turn ColumnTransformer output names into human-readable labels.

    Each name is split at its first ``"__"`` into a transformer prefix and
    the remaining column name (a name without ``"__"`` gets an empty prefix):

    * ``num`` prefix: looked up in a fixed label table, falling back to
      ``str.title()`` of the column name.
    * ``cat`` prefix: ``BOROUGH_<value>`` becomes ``"Borough = <Value>"``;
      any other column has underscores replaced by spaces and is title-cased.
    * any other prefix: underscores replaced by spaces, then title-cased.

    Returns a NumPy object array with one label per input name, in order.
    """
    numeric_labels = {
        "hour": "Hour",
        "weekday": "Day of week",
        "month": "Month",
        "LATITUDE": "Latitude",
        "LONGITUDE": "Longitude",
        "post": "After policy"
    }
    borough_prefix = "BOROUGH_"

    def spaced_title(text):
        # 'STATEN_ISLAND' -> 'Staten Island'
        return text.replace("_", " ").title()

    def label_for(raw_name):
        # e.g. 'num__hour' -> ('num', 'hour'); 'cat__BOROUGH_BROOKLYN' -> ('cat', 'BOROUGH_BROOKLYN')
        prefix, sep, column = raw_name.partition("__")
        if not sep:
            prefix, column = "", raw_name

        if prefix == "num":
            return numeric_labels.get(column, column.title())
        if prefix == "cat" and column.startswith(borough_prefix):
            borough = spaced_title(column[len(borough_prefix):])
            return f"Borough = {borough}"
        return spaced_title(column)

    return np.array([label_for(name) for name in names], dtype=object)

# get transformed feature names and prettify
# These names describe the columns produced by the preprocessing step, i.e.
# the features the fitted tree actually splits on.
pre = best_pipe.named_steps["pre"]
try:
    raw_feature_names = pre.get_feature_names_out()
except AttributeError:
    # fallback if older sklearn
    # Rebuild the names by hand: numeric columns keep their own names and the
    # categorical names are taken from the fitted one-hot encoder.
    ohe = pre.named_transformers_["cat"].named_steps["onehot"]
    try:
        cat_raw = ohe.get_feature_names(cat_features)
    except AttributeError:
        cat_raw = ohe.get_feature_names_out(cat_features)
    # Add the 'num__' / 'cat__' prefixes so both paths yield the name format
    # that prettify_feature_names parses.
    raw_feature_names = np.r_[["num__"+n for n in num_features],
                              ["cat__"+n for n in cat_raw]]
pretty_feature_names = prettify_feature_names(raw_feature_names)

# ------------------ TREE PLOT (PDF) ------------------
fitted_tree = best_pipe.named_steps["model"]
tree_pdf = os.path.join(report_dir, "decision_tree.pdf")

# Draw the tree down to its configured depth, labelling splits with the
# prettified feature names; node impurity is omitted from the boxes.
fig, ax = plt.subplots(figsize=(18, 10))
plot_tree(
    fitted_tree,
    feature_names=pretty_feature_names,
    max_depth=fitted_tree.get_params()["max_depth"],
    impurity=False,
    filled=True,
    rounded=True,
    ax=ax,
)
ax.set_title("Regression Tree")
fig.savefig(tree_pdf, bbox_inches="tight")
plt.close(fig)
print(f"[saved] {tree_pdf}")

# ------------------ TEXT RULES ------------------
# Same tree as plain-text if/else rules.
rules_path = os.path.join(report_dir, "decision_tree_rules.txt")
rules_txt = export_text(fitted_tree, feature_names=list(pretty_feature_names))
with open(rules_path, "w") as f:
    f.write(rules_txt)
print(f"[saved] {rules_path}")

# ------------------ PERMUTATION IMPORTANCE ------------------
# permutation_importance shuffles the columns of the data it is handed.
# Because the whole pipeline is passed in, those are the raw input columns of
# X_test (BOROUGH as ONE column), not the one-hot-expanded features the tree
# sees. The scores are therefore labelled from X_test.columns. Pairing them
# with the transformed feature names would shift labels: BOROUGH's score
# would be reported as "After policy" and post's as a borough dummy.
perm = permutation_importance(
    best_pipe, X_test, y_test,
    n_repeats=10, random_state=42, scoring="neg_mean_squared_error"
)

# Display labels for the raw input columns; unknown columns fall back to a
# title-cased version of their own name.
input_labels = {
    "hour": "Hour",
    "weekday": "Day of week",
    "month": "Month",
    "LATITUDE": "Latitude",
    "LONGITUDE": "Longitude",
    "BOROUGH": "Borough",
    "post": "After policy",
}
perm_feature_names = [
    input_labels.get(col, str(col).replace("_", " ").title())
    for col in X_test.columns
]

# One row per input column. The DataFrame constructor raises if the two
# lengths ever disagree, instead of silently truncating to the shorter one.
imp_df = pd.DataFrame({
    "Feature": perm_feature_names,
    "Importance": perm.importances_mean
}).sort_values("Importance", ascending=False).head(20)

# LaTeX table of the ranked importances.
imp_tex = os.path.join(report_dir, "tree_permutation_importance.tex")
with open(imp_tex, "w") as f:
    f.write(imp_df.to_latex(index=False, float_format="%.4f",
                            caption="Top Features by Permutation Importance",
                            label="tab:tree_perm_importance"))
print(f"[saved] {imp_tex}")

# Horizontal bar chart, most important feature at the top.
imp_pdf = os.path.join(report_dir, "tree_permutation_importance.pdf")
plt.figure(figsize=(8, 6))
plt.barh(imp_df["Feature"], imp_df["Importance"])
plt.gca().invert_yaxis()
plt.xlabel("Permutation importance (ΔMSE)")
plt.title("Decision Tree — Top Features")
plt.tight_layout()
plt.savefig(imp_pdf)
plt.close()
print(f"[saved] {imp_pdf}")




Decision Tree Performance:
  RMSE: 0.4169
  MAE : 0.3772
  R^2 : 0.0077
[saved] /Users/eamon/Desktop/University/UofT 2025-26/Fall/Applied Machine Learning/Week 3/Report/decision_tree.pdf
[saved] /Users/eamon/Desktop/University/UofT 2025-26/Fall/Applied Machine Learning/Week 3/Report/decision_tree_rules.txt
[saved] /Users/eamon/Desktop/University/UofT 2025-26/Fall/Applied Machine Learning/Week 3/Report/tree_permutation_importance.tex
[saved] /Users/eamon/Desktop/University/UofT 2025-26/Fall/Applied Machine Learning/Week 3/Report/tree_permutation_importance.pdf
