# TTC Modeling — End‑to‑End (with Comparison & Per‑Class Charts)

This notebook loads the engineered TTC dataset, builds compact features, trains **three classifiers** (Logistic Regression, Random Forest, Gradient Boosting), evaluates them on the final 10% of a **time‑based 80/10/10 split** (the train and validation slices are combined when fitting), and saves the required figures:

- `confusion_matrix_[model].png`
- `model_comparison.png`
- `feature_importance_[model].png`
- `per_class_performance_[model].png`

**Tip:** Set `TARGET` to `'delay_bin'` (severity) or to `'incident_type'/'incident_slim'` for incident classification.

## Parameters

In [1]:
from pathlib import Path
import os
# Candidate dataset locations, checked in order; the first one that exists wins.
DATA_PATHS = [
    r"C:\\Users\\Papi\\DSI\\ML_12\\data\\TTC_Feature_Engineered_2014_2025.csv",
    "/mnt/data/TTC_Feature_Engineered_2014_2025.csv",
]
# Let other machines point at the CSV without editing the notebook: when the
# TTC_DATA_PATH environment variable is set it is tried before the defaults.
_env_path = os.environ.get("TTC_DATA_PATH")
if _env_path:
    DATA_PATHS.insert(0, _env_path)
for _p in DATA_PATHS:
    if Path(_p).exists():
        DATA = _p
        break
else:
    # for/else: reached only when no candidate path exists.
    raise FileNotFoundError("TTC_Feature_Engineered_2014_2025.csv not found in known paths.")
print("Using DATA =", DATA)

SAMPLE_MAX = None        # optional row cap applied after loading; None keeps every row
TOP_ROUTES = 60          # most frequent routes kept; the rest collapse to "Other"
TOP_LOCATIONS = 60       # most frequent locations kept; the rest collapse to "Other"
TOP_INCIDENTS = 30       # most frequent incident types kept when building 'incident_slim'
OUT_DIR = "reports"
FIG_DIR = f"{OUT_DIR}/figures/model_results"
TARGET = "delay_bin"     # column to predict
RANDOM_STATE = 42        # seed shared by the shuffle fallback and all models
DEFAULT_LABELS = ["Low","Medium","High","Severe"]  # preferred class order for reports

Using DATA = C:\\Users\\Papi\\DSI\\ML_12\\data\\TTC_Feature_Engineered_2014_2025.csv


## Imports

In [2]:
import time
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (accuracy_score, precision_recall_fscore_support,
    confusion_matrix, multilabel_confusion_matrix, log_loss, classification_report)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
%matplotlib inline

## Helpers

In [3]:
from pathlib import Path
def ensure_dirs(out_dir: str, fig_dir: str):
    """Create the report and figure directories if they do not exist yet.

    Missing parent directories are created as well; directories that already
    exist are left untouched.
    """
    for target in (out_dir, fig_dir):
        Path(target).mkdir(parents=True, exist_ok=True)
def find_datetime_column(df: pd.DataFrame):
    """Pick the column that holds the event timestamp.

    Well-known column names are preferred (in the order listed below); if
    none is present, the first column with a datetime64 dtype is used.

    Returns:
        The column label, or None when no candidate is found.
    """
    known_names = ("timestamp", "datetime", "date", "incident_date", "created_at")
    by_name = next((name for name in known_names if name in df.columns), None)
    if by_name is not None:
        return by_name
    return next(
        (col for col in df.columns if pd.api.types.is_datetime64_any_dtype(df[col])),
        None,
    )
def add_time_parts(df: pd.DataFrame, dt_col: str | None) -> pd.DataFrame:
    if dt_col and dt_col in df.columns:
        ts = df[dt_col]
        if not pd.api.types.is_datetime64_any_dtype(ts):
            ts = pd.to_datetime(ts, errors="coerce")
        if "hour" not in df.columns: df["hour"] = ts.dt.hour
        if "weekday" not in df.columns: df["weekday"] = ts.dt.weekday
    return df
def collapse_rare(series: pd.Series, top_k: int, other_label="Other") -> pd.Series:
    """Keep the ``top_k`` most frequent values and map everything else to ``other_label``.

    Values are compared as whitespace-stripped strings. Missing values are
    replaced with ``other_label`` *before* the string conversion; converting
    first would turn them into the literal text "nan"/"None" and they would
    be treated as a regular category.

    Args:
        series: Raw categorical values.
        top_k: Number of most frequent (cleaned) values to keep as-is.
        other_label: Replacement for missing and infrequent values.

    Returns:
        A string Series with the same index as ``series``.
    """
    # Cast to object first so the fill also works for categorical/numeric input.
    as_object = series.astype(object)
    cleaned = as_object.where(as_object.notna(), other_label).astype(str).str.strip()
    keep = set(cleaned.value_counts().head(top_k).index)
    return cleaned.where(cleaned.isin(keep), other_label)
def time_split_indices(df: pd.DataFrame, train: float, valid: float, test: float, dt_col: str | None, use_time: bool=True):
    n = len(df)
    if use_time and dt_col and dt_col in df.columns:
        ts = df[dt_col]
        if not pd.api.types.is_datetime64_any_dtype(ts):
            ts = pd.to_datetime(ts, errors="coerce")
        order = np.argsort(ts.values)
    else:
        order = np.arange(n); rng = np.random.RandomState(RANDOM_STATE); rng.shuffle(order)
    n_tr = int(n * train); n_va = int(n * (train + valid))
    return order[:n_tr], order[n_tr:n_va], order[n_va:]
def per_class_table(y_true, y_pred, labels):
    """Build a per-class metrics table.

    Returns a DataFrame with one row per entry of ``labels`` and the columns
    class, precision, recall, f1, specificity and support. Specificity is
    TN / (TN + FP) from the one-vs-rest confusion matrix, or 0.0 when the
    class has no negatives.
    """
    precision, recall, f1, support = precision_recall_fscore_support(
        y_true, y_pred, labels=labels, zero_division=0
    )
    one_vs_rest = multilabel_confusion_matrix(y_true, y_pred, labels=labels)
    specificity = []
    for position in range(len(labels)):
        tn, fp, fn, tp = one_vs_rest[position].ravel()
        negatives = tn + fp
        specificity.append((tn / negatives) if negatives else 0.0)
    return pd.DataFrame(
        {
            "class": labels,
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "specificity": specificity,
            "support": support,
        }
    )
def aggregated_table(y_true, y_pred, labels, proba=None):
    """Compute overall metrics for one model.

    Returns a dict with accuracy; precision/recall/f1 under macro, weighted
    and micro averaging; and ``log_loss``. ``log_loss`` is None when no
    probabilities are given or when scikit-learn cannot compute it.

    ``proba`` columns must follow the order of ``labels``.
    """
    metrics = {"accuracy": float(accuracy_score(y_true, y_pred))}
    for avg in ("macro", "weighted", "micro"):
        scores = precision_recall_fscore_support(
            y_true, y_pred, labels=labels, average=avg, zero_division=0
        )
        # scores is (precision, recall, f1, support); zip drops the support slot.
        for metric_name, value in zip(("precision", "recall", "f1"), scores):
            metrics[f"{metric_name}_{avg}"] = float(value)

    metrics["log_loss"] = None
    if proba is not None:
        # Encode the true labels as column positions so they match ``proba``.
        position_of = {label: pos for pos, label in enumerate(labels)}
        y_idx = pd.Series(y_true).map(position_of).to_numpy()
        try:
            metrics["log_loss"] = float(
                log_loss(y_idx, proba, labels=list(range(len(labels))))
            )
        except Exception:
            # Best effort: the remaining metrics are still useful without it.
            metrics["log_loss"] = None
    return metrics
def plot_confusions(y_true, y_pred, labels, out_png):
    """Save a confusion-matrix heat map (raw counts) to ``out_png``.

    Rows are true classes and columns are predicted classes, both in the
    order given by ``labels``. The figure is closed after saving, so nothing
    is displayed inline.
    """
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    fig, ax = plt.subplots(figsize=(7,5))
    # Use a light-to-dark sequential colormap so the annotation rule below
    # (white text on high counts, black on low counts) gives readable contrast.
    # The default colormap is bright for high values, which made white text
    # hard to read on exactly the cells that matter most.
    ax.imshow(cm, cmap="Blues")
    ax.set_xticks(range(len(labels))); ax.set_yticks(range(len(labels)))
    ax.set_xticklabels(labels, rotation=45, ha="right"); ax.set_yticklabels(labels)
    ax.set_xlabel("Predicted"); ax.set_ylabel("True"); ax.set_title("Confusion Matrix (counts)")
    threshold = cm.max() / 2  # loop-invariant, so computed once
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, int(cm[i, j]), ha="center", va="center",
                    color="white" if cm[i, j] > threshold else "black")
    fig.tight_layout(); fig.savefig(out_png, dpi=160); plt.close(fig)
def feature_names_from_onehot(ohe: "OneHotEncoder", cat_cols):
    """Build readable names for the columns produced by a fitted one-hot encoder.

    Names have the form ``<column>__<category>`` and follow the order of
    ``ohe.categories_``.

    If ``cat_cols`` cannot be matched one-to-one with the encoder's category
    lists (wrong length, or not iterable), generic names ``f_0, f_1, ...`` are
    returned instead. This keeps the name list as long as the encoded matrix
    is wide, rather than silently truncating it and misaligning every name.

    Raises:
        AttributeError: If ``ohe`` has no ``categories_`` attribute (not fitted).
    """
    cats = ohe.categories_
    try:
        cols = list(cat_cols)
    except TypeError:
        cols = None
    if cols is None or len(cols) != len(cats):
        return [f"f_{i}" for i in range(sum(len(c) for c in cats))]
    return [f"{col}__{c}" for col, cats_i in zip(cols, cats) for c in cats_i]
def make_feature_importance_bar(names, importances, out_png, title, top_n=25):
    """Save a horizontal bar chart of the most important features.

    Args:
        names: Feature names, aligned with ``importances``.
        importances: One importance score per feature.
        out_png: Destination image path.
        title: Chart title.
        top_n: How many of the highest-scoring features to show (default 25);
            None shows all of them.

    The most important feature is drawn at the top of the chart. The figure
    is closed after saving, so nothing is displayed inline.
    """
    importances = np.asarray(importances)
    # Rank first and truncate the index, rather than building full sorted lists.
    idx = np.argsort(importances)[::-1][:top_n]
    names_sorted = [names[i] for i in idx]
    imps_sorted = importances[idx]
    positions = np.arange(len(idx))
    fig, ax = plt.subplots(figsize=(9,6))
    ax.barh(positions, imps_sorted)
    ax.set_yticks(positions); ax.set_yticklabels(names_sorted)
    # Position 0 holds the top-ranked feature; flip the axis so it appears first.
    ax.invert_yaxis()
    ax.set_xlabel("Importance"); ax.set_title(title)
    fig.tight_layout(); fig.savefig(out_png, dpi=160); plt.close(fig)

## Load data & build features

In [4]:
ensure_dirs(OUT_DIR, FIG_DIR)
df = pd.read_csv(DATA, low_memory=False)
dt_col = find_datetime_column(df)
df = add_time_parts(df, dt_col)

# Optional down-sampling: take evenly spaced rows so the sample spans the whole file.
if SAMPLE_MAX is not None and len(df) > SAMPLE_MAX:
    idx = np.linspace(0, len(df)-1, SAMPLE_MAX, dtype=int)
    df = df.iloc[idx].reset_index(drop=True)

# Compact categorical features: keep the most frequent values, pool the rest.
if 'route' in df.columns: df['route_slim'] = collapse_rare(df['route'], top_k=TOP_ROUTES)
else: df['route_slim'] = 'Unknown'
loc_col = None
for cand in ['station','stop','stop_name','location','intersection','stop_id']:
    if cand in df.columns: loc_col = cand; break
if loc_col: df['location_slim'] = collapse_rare(df[loc_col], top_k=TOP_LOCATIONS)
else: df['location_slim'] = 'Unknown'

# First matching delay/duration column, coerced to numeric (bad values -> NaN).
delay_col = None
for cand in ['delay_minutes','delay_min','delay','duration_min','duration']:
    if cand in df.columns: delay_col = cand; break
if delay_col: df[delay_col] = pd.to_numeric(df[delay_col], errors='coerce')

# Resolve the target column; incident targets use the collapsed 'incident_slim'.
if TARGET in ('incident_type','incident_slim') and 'incident_slim' not in df.columns and 'incident_type' in df.columns:
    df['incident_slim'] = collapse_rare(df['incident_type'], top_k=TOP_INCIDENTS)
tgt_col = TARGET
if TARGET == 'incident_type' and 'incident_slim' in df.columns: tgt_col = 'incident_slim'
if tgt_col not in df.columns: raise ValueError(f"Target '{tgt_col}' not found.")
df = df[df[tgt_col].notna()].copy()
y = df[tgt_col].astype(str)

# Prefer the canonical severity order when every observed label belongs to it.
labels_present = sorted(y.unique().tolist())
LABELS = DEFAULT_LABELS if set(labels_present).issubset(set(DEFAULT_LABELS)) else labels_present

num_cols = [c for c in ['hour','weekday'] if c in df.columns]
# Do not feed the raw delay to a model that predicts the delay severity bin:
# 'delay_bin' is presumably derived from that same delay value, which would
# leak the target into the features. NOTE(review): confirm how 'delay_bin'
# was engineered. For other targets the delay remains a legitimate feature.
if delay_col and tgt_col != 'delay_bin': num_cols.append(delay_col)
cat_cols = [c for c in ['route_slim','location_slim'] if c in df.columns]
X = df[num_cols + cat_cols].copy()
len(df), len(LABELS), LABELS[:10]

(67151, 4, ['Low', 'Medium', 'High', 'Severe'])

## Preprocess & split (time-aware 80/10/10)

In [5]:
# Positional indices for the 80/10/10 split (ordered by time when a timestamp column exists).
i_tr, i_va, i_te = time_split_indices(df, 0.8, 0.1, 0.1, dt_col, use_time=True)
X_tr, X_va, X_te = (X.iloc[rows] for rows in (i_tr, i_va, i_te))
y_tr, y_va, y_te = (y.iloc[rows] for rows in (i_tr, i_va, i_te))

# Numeric columns: fill gaps with the median.
num_pipe = SimpleImputer(strategy='median')

# Categorical columns: fill gaps with a 'missing' token, then one-hot encode.
# The dense-output keyword was renamed between scikit-learn versions, so try
# the newer spelling first and fall back to the older one.
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
cat_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ohe', ohe),
])

pre = ColumnTransformer(
    [('num', num_pipe, num_cols), ('cat', cat_pipe, cat_cols)],
    remainder='drop',
)
# Fit on the training slice only; the other slices are just transformed.
pre.fit(X_tr)
Xtr, Xva, Xte = (pre.transform(part) for part in (X_tr, X_va, X_te))
feature_names = num_cols + feature_names_from_onehot(
    pre.named_transformers_['cat'].named_steps['ohe'], cat_cols
)
Xtr.shape, Xva.shape, Xte.shape

((53720, 100), (6715, 100), (6716, 100))

## Train models & save metrics + figures

In [6]:
def train_logreg(Xtr, ytr, Xva, yva, Xte, yte, C=1.0, max_iter=300, solver='saga', class_weight='balanced'):
    """Fit a logistic-regression baseline and score it on the test slice.

    The model is fit on the training and validation slices stacked together.

    Returns:
        (model, aggregated metrics dict, per-class DataFrame, probability
        matrix with columns ordered like ``LABELS``, fit time in seconds).
    """
    model = LogisticRegression(
        C=C,
        solver=solver,
        max_iter=max_iter,
        class_weight=class_weight,
        random_state=RANDOM_STATE,
    )
    started = time.time()
    model.fit(np.vstack([Xtr, Xva]), np.hstack([ytr, yva]))
    elapsed = time.time() - started

    ypred = model.predict(Xte)
    proba = None
    if hasattr(model, 'predict_proba'):
        raw_proba = model.predict_proba(Xte)
        column_of = {cls: pos for pos, cls in enumerate(model.classes_.tolist())}
        # Re-order columns to LABELS; a label the model never saw gets zeros.
        aligned = []
        for label in LABELS:
            if label in column_of:
                aligned.append(raw_proba[:, column_of[label]])
            else:
                aligned.append(np.zeros(len(ypred)))
        proba = np.column_stack(aligned)

    agg = aggregated_table(yte, ypred, LABELS, proba)
    percls = per_class_table(yte, ypred, LABELS)
    return model, agg, percls, proba, elapsed
ensure_dirs(OUT_DIR, FIG_DIR)

# --- Logistic regression baseline ---
logreg, agg_lr, per_lr, proba_lr, time_lr = train_logreg(Xtr, y_tr, Xva, y_va, Xte, y_te)
plot_confusions(y_te, logreg.predict(Xte), LABELS, f"{FIG_DIR}/baseline_confusion_matrix.png")
per_lr.to_csv(f"{OUT_DIR}/baseline_per_class.csv", index=False)

# --- Random forest (fit on train + validation, scored on test) ---
rf = RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE, class_weight='balanced', max_features='sqrt')
t0 = time.time(); rf.fit(np.vstack([Xtr, Xva]), np.hstack([y_tr, y_va])); t1 = time.time(); time_rf = t1 - t0
pred_rf = rf.predict(Xte)
agg_rf = aggregated_table(y_te, pred_rf, LABELS, None)
per_rf = per_class_table(y_te, pred_rf, LABELS)
plot_confusions(y_te, pred_rf, LABELS, f"{FIG_DIR}/rf_confusion_matrix.png")
per_rf.to_csv(f"{OUT_DIR}/rf_per_class.csv", index=False)
imp_rf = rf.feature_importances_
make_feature_importance_bar(feature_names, imp_rf, f"{FIG_DIR}/rf_feature_importance.png", 'Random Forest Feature Importance')

# --- Gradient boosting (scikit-learn; output files keep the historical 'xgb_' prefix) ---
gbc = GradientBoostingClassifier(random_state=RANDOM_STATE, n_estimators=150, learning_rate=0.1, max_depth=3)
t0 = time.time(); gbc.fit(np.vstack([Xtr, Xva]), np.hstack([y_tr, y_va])); t1 = time.time(); time_gb = t1 - t0
pred_gb = gbc.predict(Xte)
agg_gb = aggregated_table(y_te, pred_gb, LABELS, None)
per_gb = per_class_table(y_te, pred_gb, LABELS)
plot_confusions(y_te, pred_gb, LABELS, f"{FIG_DIR}/xgb_confusion_matrix.png")
per_gb.to_csv(f"{OUT_DIR}/xgb_per_class.csv", index=False)
try:
    imp_gb = gbc.feature_importances_
    make_feature_importance_bar(feature_names, imp_gb, f"{FIG_DIR}/xgb_feature_importance.png", 'Gradient Boosting Feature Importance')
except Exception as exc:
    # Still best effort (the metrics above are unaffected), but say so instead
    # of silently leaving the figure missing.
    print(f"Skipped Gradient Boosting feature-importance plot: {exc!r}")

print('Weighted F1:', {'LogReg': round(agg_lr['f1_weighted'],3), 'RF': round(agg_rf['f1_weighted'],3), 'GB': round(agg_gb['f1_weighted'],3)})



Weighted F1: {'LogReg': 0.291, 'RF': 0.333, 'GB': 0.379}


## Rush-hour & non-rush evaluation (optional)

In [7]:
# Compare Gradient Boosting test performance inside and outside rush hours
# (07-09 and 16-18). Rows with a missing hour fall in the non-rush slice.
if 'hour' in X_te.columns:
    rush_mask = X_te['hour'].isin([7,8,9,16,17,18])
    # pred_gb is a NumPy array, so index both sides with a plain boolean array
    # (positional) instead of relying on pandas index alignment.
    _rush = rush_mask.to_numpy()
    for _title, _mask in (('== Rush-hour (GB) ==', _rush), ('== Non-rush (GB) ==', ~_rush)):
        print(_title)
        if not _mask.any():
            print('No test rows in this slice.')
            continue
        # labels=LABELS keeps the class order consistent with the other reports
        # (the default is alphabetical).
        print(classification_report(y_te[_mask], pred_gb[_mask], labels=LABELS, zero_division=0))
else:
    print('Hour feature not available for rush-hour slice.')

== Rush-hour (GB) ==
              precision    recall  f1-score   support

        High       0.45      0.32      0.37       621
         Low       0.42      0.14      0.21       398
      Medium       0.21      0.74      0.33       321
      Severe       0.21      0.01      0.02       370

    accuracy                           0.29      1710
   macro avg       0.32      0.30      0.23      1710
weighted avg       0.35      0.29      0.25      1710

== Non-rush (GB) ==
              precision    recall  f1-score   support

        High       0.44      0.57      0.49      1731
         Low       0.64      0.34      0.44      1224
      Medium       0.18      0.40      0.24       641
      Severe       0.57      0.26      0.36      1410

    accuracy                           0.40      5006
   macro avg       0.46      0.39      0.38      5006
weighted avg       0.49      0.40      0.41      5006



## Create model comparison & per-class performance charts

In [8]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt
# One row per model: accuracy, weighted F1 and macro F1 (mean of per-class F1).
summary = pd.DataFrame([
    {
        'Model': _name,
        'accuracy': _agg['accuracy'],
        'f1_weighted': _agg['f1_weighted'],
        'f1_macro': _per['f1'].mean(),
    }
    for _name, _agg, _per in (
        ('LogReg', agg_lr, per_lr),
        ('RandomForest', agg_rf, per_rf),
        ('GradientBoosting', agg_gb, per_gb),
    )
])

# Grouped bars: three metrics side by side for each model.
fig, ax = plt.subplots(figsize=(7,5))
x = np.arange(len(summary))
for _shift, _column, _legend in (
    (-0.2, 'accuracy', 'Accuracy'),
    (0.0, 'f1_weighted', 'F1_weighted'),
    (0.2, 'f1_macro', 'F1_macro'),
):
    ax.bar(x + _shift, summary[_column], width=0.2, label=_legend)
ax.set_xticks(x)
ax.set_xticklabels(summary['Model'])
ax.set_ylabel('Score')
ax.set_title('Model comparison')
ax.legend()
fig.tight_layout()
fig.savefig(f"{FIG_DIR}/model_comparison.png", dpi=160)
plt.close(fig)
print('Saved:', f"{FIG_DIR}/model_comparison.png")
def plot_per_class(df_per, title, fname):
    """Save a grouped bar chart of per-class precision and recall.

    Classes are ordered by support (largest first). The image is written to
    ``FIG_DIR/fname`` and the figure is closed afterwards.
    """
    ranked = df_per.sort_values('support', ascending=False)
    slots = np.arange(len(ranked))
    bar_width = 0.4
    fig, ax = plt.subplots(figsize=(10,6))
    # Precision sits left of the tick, recall right of it.
    for shift, metric in ((-bar_width/2, 'precision'), (bar_width/2, 'recall')):
        ax.bar(slots + shift, ranked[metric].to_numpy(dtype=float), bar_width, label=metric)
    ax.set_xticks(slots)
    ax.set_xticklabels(ranked['class'], rotation=45, ha='right')
    ax.set_ylim(0,1)
    ax.set_title(title)
    ax.legend()
    target = f"{FIG_DIR}/{fname}"
    fig.tight_layout()
    fig.savefig(target, dpi=160)
    plt.close(fig)
    print('Saved:', target)
# One per-class chart per model, in the same order as the comparison chart.
for _table, _title, _file in (
    (per_lr, 'Per-class (LogReg)', 'per_class_performance_logreg.png'),
    (per_rf, 'Per-class (RandomForest)', 'per_class_performance_rf.png'),
    (per_gb, 'Per-class (GradientBoosting)', 'per_class_performance_gb.png'),
):
    plot_per_class(_table, _title, _file)

Saved: reports/figures/model_results/model_comparison.png
Saved: reports/figures/model_results/per_class_performance_logreg.png
Saved: reports/figures/model_results/per_class_performance_rf.png
Saved: reports/figures/model_results/per_class_performance_gb.png


## Copy figures to the exact requested file-name pattern

In [9]:
import shutil, os
# Existing figure file name -> required file name (confusion matrices).
mapping = {
    'baseline_confusion_matrix.png': 'confusion_matrix_logreg.png',
    'rf_confusion_matrix.png':       'confusion_matrix_rf.png',
    'xgb_confusion_matrix.png':      'confusion_matrix_gb.png',
}
# Same idea for the feature-importance charts.
mapping_imp = {
    'rf_feature_importance.png': 'feature_importance_rf.png',
    'xgb_feature_importance.png':'feature_importance_gb.png',
}
# Copy (not move) so the original files stay available; report anything missing.
for src, dst in list(mapping.items()) + list(mapping_imp.items()):
    s = os.path.join(FIG_DIR, src)
    d = os.path.join(FIG_DIR, dst)
    if not os.path.exists(s):
        print('Missing:', s)
        continue
    shutil.copyfile(s, d)
    print('Saved:', d)

Saved: reports/figures/model_results\confusion_matrix_logreg.png
Saved: reports/figures/model_results\confusion_matrix_rf.png
Saved: reports/figures/model_results\confusion_matrix_gb.png
Saved: reports/figures/model_results\feature_importance_rf.png
Saved: reports/figures/model_results\feature_importance_gb.png
