📓 Notebook 6 — Interactive Dashboard

Purpose:
Transition from static explainability (Notebook 5) to interactive, real-time exploration of model reasoning and training trends.
The dashboard connects your PostgreSQL database, trained Random Forest models, and SHAP explainers into a transparent interface.

🧭 Overview

Core Objectives
- Load models and processed run data from PostgreSQL
- Explore single-run predictions with SHAP explanations
- Compare two runs side-by-side (what changed?)
- View global feature importances and training trends

Inputs
- models/random_forest_classifier.pkl
- models/shap_explainer_clf.pkl
- PostgreSQL table runs_summary

Outputs
- Interactive Streamlit dashboard (06_dashboard_app.py)
- Visual insights into model reasoning and performance dynamics

In [None]:
# ===================================================
# 🧭 Notebook 6 — Interactive Dashboard
# ===================================================
# Purpose:
# Move from static explainability (Notebook 5)
# → to *interactive exploration* and *real-time insights*.
#
# This notebook connects to PostgreSQL (Notebook 7),
# loads trained models (Notebook 5),
# and builds an interactive Streamlit dashboard
# for local/global SHAP interpretation.
# ===================================================

# ---------------------------------------------------
# 📦 1. Setup & Imports
# ---------------------------------------------------
import sys
from pathlib import Path

# Provisional project root: one level above this file's directory, so the
# "running-agent" folder is found even when Streamlit runs from a parent dir.
# `__file__` is not defined when this cell executes inside a Jupyter kernel, so
# fall back to the parent of the working directory there (assumes the kernel
# was started in the notebook's own folder — TODO confirm).
try:
    project_root = Path(__file__).resolve().parents[1]
except NameError:
    project_root = Path.cwd().resolve().parent

# Streamlit re-executes the whole script on every interaction; only append the
# root once so sys.path does not grow with each rerun.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

print("✅ Project root added to path:", project_root)

# ---------------------------------------------------
# 🔧 Robust project root detection
# ---------------------------------------------------
import sys
from pathlib import Path

def _find_project_root() -> Path:
    """
    Locate the project root directory.

    Search order:
      1. Walk up from this file (when ``__file__`` is defined) and then from
         the current working directory; return the first directory that
         contains both ``models`` and ``src``.
      2. Otherwise return the first ancestor (same walk order) whose name is
         ``running-agent`` (case-insensitive).
      3. Last resort: one level above this file's directory, or the working
         directory itself when ``__file__`` is unavailable.

    ``__file__`` is not defined when the code runs as a notebook cell, so it
    is treated as optional instead of letting a NameError escape.

    Returns:
        The resolved project root path.
    """
    cwd = Path.cwd().resolve()
    try:
        here = Path(__file__).resolve()
    except NameError:
        # Interactive / notebook execution: only the CWD can be used.
        here = None

    candidates = [cwd] if here is None else [here, cwd]

    # Pass 1: structural match (a directory holding both 'models' and 'src').
    seen = set()
    for base in candidates:
        for p in (base, *base.parents):
            if p in seen:
                continue
            seen.add(p)
            if (p / "models").exists() and (p / "src").exists():
                return p

    # Pass 2: fall back to a directory literally named 'running-agent'.
    for base in candidates:
        for p in (base, *base.parents):
            if p.name.lower() == "running-agent":
                return p

    # Last resort: one level up from this file, else the working directory.
    if here is not None:
        return here.parents[1]
    return cwd

# Replace the provisional root with the one found by the directory search.
project_root = _find_project_root()

# Streamlit re-executes the whole script on every interaction; only append the
# root once so sys.path does not accumulate duplicate entries across reruns.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))
print("✅ Project root resolved to:", project_root)

# Paths that depend on the root
model_dir = project_root / "models"
rf_model_path = model_dir / "random_forest_classifier.pkl"
explainer_path = model_dir / "shap_explainer_clf.pkl"

# Report up front whether the pickled artefacts are where the loaders expect.
print("📂 Model directory:", model_dir)
print(" - Classifier exists:", rf_model_path.exists(), "→", rf_model_path)
print(" - SHAP explainer exists:", explainer_path.exists(), "→", explainer_path)

# ---------------------------------------------------
# 🔍 Standard Libraries
# ---------------------------------------------------
import pandas as pd
import numpy as np
import joblib
import shap
import streamlit as st
import altair as alt
import matplotlib.pyplot as plt

# ---------------------------------------------------
# 🧩 Custom Project Imports
# ---------------------------------------------------
from src.db_utils import get_engine

# ---------------------------------------------------
# ⚙️ Streamlit Setup (safe to ignore warnings in Jupyter)
# ---------------------------------------------------
# Configure the page; any failure here is reported and otherwise ignored so the
# same code can also be executed as a notebook cell.
try:
    st.set_page_config(
        layout="wide",
        page_title="🏃‍♂️ Running Insights Dashboard",
    )
except Exception:
    print("⚠️ Streamlit context not active (OK in Jupyter)")

# ---------------------------------------------------
# 📁 Define Model Paths
# ---------------------------------------------------
# Both pickles live in the "models" folder directly under the project root.
model_dir = project_root.joinpath("models")
rf_model_path = model_dir.joinpath("random_forest_classifier.pkl")
explainer_path = model_dir.joinpath("shap_explainer_clf.pkl")

print("📂 Model directory:", model_dir)
print(" - Classifier exists:", rf_model_path.exists())
print(" - SHAP explainer exists:", explainer_path.exists())



In [None]:
# ---------------------------------------------------
# 🧠 2. Load Models and Data (PostgreSQL + Pickles)
# ---------------------------------------------------
@st.cache_resource
def load_models():
    """
    Load the pickled classifier and, when present, the pickled SHAP explainer.

    Returns:
        A ``(classifier, explainer)`` tuple. ``explainer`` is ``None`` when the
        explainer file does not exist; a missing classifier file is not
        handled here and propagates to the caller.
    """
    # NOTE: joblib.load unpickles arbitrary objects — only point these paths at
    # trusted, locally produced artefacts.
    classifier = joblib.load(rf_model_path)

    explainer = None
    try:
        explainer = joblib.load(explainer_path)
    except FileNotFoundError:
        # The explainer is optional: callers check for None and skip SHAP plots.
        print("⚠️ No SHAP explainer found – plots will be skipped.")
    else:
        print("✅ Models and explainer loaded successfully.")

    return classifier, explainer


@st.cache_data
def load_data():
    """
    Fetch every row of ``runs_summary`` (newest date first) as a DataFrame.

    The ``date`` column is converted with ``pd.to_datetime`` before returning.
    """
    query = "SELECT * FROM runs_summary ORDER BY date DESC"
    runs = pd.read_sql(query, get_engine())
    # Convert once here so callers can rely on the `.dt` accessor.
    runs["date"] = pd.to_datetime(runs["date"])
    return runs


# Load the model artefacts and the run data once at module level; the tab
# sections further down read `rf_clf`, `explainer_clf` and `summary_df`.
rf_clf, explainer_clf = load_models()
summary_df = load_data()

print("✅ Models and data loaded — ready for dashboard.")
print("Data shape:", summary_df.shape)
# Bare expression: shows the first three rows as cell output in a notebook.
# NOTE(review): under `streamlit run`, a bare expression is presumably rendered
# into the page by Streamlit's "magic" feature — confirm this preview is wanted.
summary_df.head(3)


| Feature             | Description                   |
| ------------------- | ----------------------------- |
| `total_distance_km` | Distance of run in km         |
| `duration_min`      | Duration in minutes           |
| `avg_pace_min_km`   | Average pace (min/km)         |
| `avg_cadence`       | Steps per minute              |
| `total_elev_gain`   | Elevation gain (m)            |
| `avg_stride_len_m`  | Average stride length (m)     |
| `avg_gct_est_ms`    | Ground contact time (ms)      |
| `pace_variability`  | Pace consistency index        |
| `cadence_drift`     | Cadence stability index       |
| `load_7d`           | 7-day rolling training load   |
| `load_28d`          | 28-day rolling training load  |
| `fastest_1km_pace`  | Fastest 1 km segment (min/km) |
| `fastest_5min_pace` | Fastest 5 min pace (min/km)   |


In [None]:
# Columns of `runs_summary` used as model / SHAP inputs, in the order the
# dashboard passes them along.
available_features = [
    "total_distance_km",
    "duration_min",
    "avg_pace_min_km",
    "avg_cadence",
    "total_elev_gain",
    "avg_stride_len_m",
    "avg_gct_est_ms",
    "pace_variability",
    "cadence_drift",
    "load_7d",
    "load_28d",
    "fastest_1km_pace",
    "fastest_5min_pace",
]

In [None]:
import numpy as np
import pandas as pd

def shap_vector_for_sample(explainer, model, X_one_row: pd.DataFrame):
    """
    Return the 1-D SHAP vector for the predicted class of a single sample.

    The class index is the argmax of ``model.predict_proba``; if that call
    fails for any reason, class 0 is used as a best-effort fallback.

    Supported layouts of ``explainer.shap_values(X_one_row)``:
      * list with one array per class (each flattening to ``n_features``)
      * ``(1, n_features, n_classes)``
      * ``(1, n_classes, n_features)``
      * ``(1, n_features)``
      * ``(n_features,)``
    When the two trailing axes of a 3-D result have the same length, the
    ``(1, n_features, n_classes)`` interpretation wins.

    Args:
        explainer: Object exposing ``shap_values(X)``.
        model: Classifier; ``predict_proba(X)`` is used when available.
        X_one_row: DataFrame holding exactly one row of features.

    Returns:
        ``(vector, raw)`` — the per-feature SHAP values for the chosen class
        and the untouched explainer output (kept for debugging).

    Raises:
        ValueError: If ``X_one_row`` does not contain exactly one row, or the
            explainer output has a shape that is not recognised.
    """
    # Explicit raise rather than `assert`, which is stripped under `python -O`.
    if X_one_row.shape[0] != 1:
        raise ValueError(
            f"Pass exactly one sample (got {X_one_row.shape[0]} rows)"
        )

    nfeat = X_one_row.shape[1]

    try:
        c = int(np.argmax(model.predict_proba(X_one_row), axis=1)[0])
    except Exception:
        # Deliberate best-effort: models without a usable predict_proba are
        # explained for the first output.
        c = 0

    raw = explainer.shap_values(X_one_row)

    if isinstance(raw, list):
        # One array per class; flatten so (1, n_features) and (n_features,)
        # entries are treated alike.
        v = np.asarray(raw[c]).reshape(-1)
        if v.shape[0] != nfeat:
            raise ValueError(f"List SHAP shape {v.shape}")
        return v, raw

    arr = np.asarray(raw)
    if arr.ndim == 3 and arr.shape[1] == nfeat:
        v = arr[0, :, c]
    elif arr.ndim == 3 and arr.shape[2] == nfeat:
        v = arr[0, c, :]
    elif arr.ndim == 2 and arr.shape == (1, nfeat):
        v = arr[0, :]
    elif arr.ndim == 1 and arr.shape[0] == nfeat:
        v = arr
    else:
        raise ValueError(f"Unexpected SHAP shape {arr.shape}")
    return v, raw

print("✅ SHAP vector helper ready")


In [None]:
# Page header: the title followed by a short markdown guide to the three tabs.
# (The trailing double spaces inside the markdown string are intentional
# markdown line breaks.)
st.title("🏃‍♂️ Running Insights Dashboard")
st.markdown("""
Explore your running data and model reasoning interactively.  
Tabs:
1️⃣ Inspect a single run prediction  
2️⃣ Compare two runs side-by-side  
3️⃣ Explore global feature importance and performance trends
""")

# One container per view; the `with tabN:` sections below fill them in.
tab1, tab2, tab3 = st.tabs(["🔍 Single Run","📈 Compare Runs","🌍 Global Insights"])


In [None]:
# Tab 1 — pick one run by date, show its predicted cluster and the ten largest
# SHAP contributions behind that prediction.
with tab1:
    st.subheader("Single Run SHAP Explanation")

    # Format the run dates once; the labels feed both the picker and the filter.
    date_labels = summary_df["date"].dt.strftime("%Y-%m-%d")
    options = date_labels.tolist()
    selected_date = st.selectbox("Select a run date", options)
    case = summary_df[date_labels == selected_date]

    if case.empty:
        st.warning("No data for selected date.")
    else:
        # Fill gaps with the column means over all runs, then keep the first
        # matching row as a one-row frame.
        X_means = summary_df[available_features].mean()
        case_X = case[available_features].fillna(X_means).iloc[[0]]

        pred_label = rf_clf.predict(case_X)[0]
        st.markdown(f"### 🏷️ Predicted Cluster: **{pred_label}**")

        if explainer_clf is not None:
            shap_vec, _ = shap_vector_for_sample(explainer_clf, rf_clf, case_X)
            # Rank features by absolute contribution, largest first.
            contrib = pd.Series(shap_vec, index=case_X.columns)
            contrib = contrib.sort_values(key=pd.Series.abs, ascending=False)
            st.bar_chart(contrib.head(10))
            st.caption("Top 10 SHAP feature contributions for selected run.")
        else:
            st.info("SHAP explainer missing — skip plot.")


In [None]:
# Tab 2 — pick two runs by date and chart how their SHAP contributions differ.
with tab2:
    st.subheader("Compare Two Runs (SHAP Difference)")

    date_labels = summary_df["date"].dt.strftime("%Y-%m-%d")
    dates = date_labels.tolist()
    col1, col2 = st.columns(2)
    d1 = col1.selectbox("Run 1 date", dates, index=0)
    # Default the second picker to the next run, but stay in range when fewer
    # than two runs exist (a fixed index=1 is rejected by Streamlit then).
    second_default = min(1, max(len(dates) - 1, 0))
    d2 = col2.selectbox("Run 2 date", dates, index=second_default)

    df1 = summary_df[date_labels == d1]
    df2 = summary_df[date_labels == d2]

    if explainer_clf is not None and not df1.empty and not df2.empty:
        # Same preparation as the single-run tab: fill gaps with the column
        # means and keep exactly one row per run, because the SHAP helper
        # accepts a single sample only (several runs can share one date).
        feature_means = summary_df[available_features].mean()
        run1_X = df1[available_features].fillna(feature_means).iloc[[0]]
        run2_X = df2[available_features].fillna(feature_means).iloc[[0]]

        # NOTE: each vector refers to the predicted class of its own run, so
        # the difference mixes classes when the two predictions disagree.
        shap1, _ = shap_vector_for_sample(explainer_clf, rf_clf, run1_X)
        shap2, _ = shap_vector_for_sample(explainer_clf, rf_clf, run2_X)
        diff = pd.Series(shap2 - shap1, index=available_features).sort_values(key=lambda x: x.abs(), ascending=False)
        st.bar_chart(diff.head(10))
        st.caption("Difference in SHAP contributions between two runs.")
    else:
        st.warning("Ensure both runs and explainer are available.")


In [None]:
def _mean_abs_shap_per_feature(shap_values, n_features):
    """
    Collapse raw SHAP output into one mean-|SHAP| value per feature.

    Accepted layouts:
      * list with one ``(n_samples, n_features)`` array per class
      * ``(n_samples, n_features)``
      * ``(n_samples, n_features, n_classes)``
      * ``(n_samples, n_classes, n_features)``
    When both trailing axes of a 3-D array have length ``n_features``, axis 1
    is taken as the feature axis.

    Raises:
        ValueError: If no axis of the expected position has length
            ``n_features``.
    """
    if isinstance(shap_values, list):
        magnitudes = np.abs(np.stack([np.asarray(a) for a in shap_values], axis=0))
        feature_axis = magnitudes.ndim - 1
    else:
        magnitudes = np.abs(np.asarray(shap_values))
        if magnitudes.ndim in (2, 3) and magnitudes.shape[1] == n_features:
            feature_axis = 1
        elif magnitudes.ndim == 3 and magnitudes.shape[2] == n_features:
            feature_axis = 2
        else:
            raise ValueError(f"Unexpected SHAP shape {magnitudes.shape}")

    if magnitudes.shape[feature_axis] != n_features:
        raise ValueError(f"Unexpected SHAP shape {magnitudes.shape}")

    # Average over every axis except the feature axis (samples and, if
    # present, classes).
    other_axes = tuple(ax for ax in range(magnitudes.ndim) if ax != feature_axis)
    return magnitudes.mean(axis=other_axes)


# Tab 3 — global mean-|SHAP| ranking plus the average-pace trend over time.
with tab3:
    st.subheader("Global Feature Importance and Trends")

    if explainer_clf is not None and not summary_df.empty:
        # Fill gaps with column means, matching the single-run tab.
        feature_frame = summary_df[available_features]
        feature_frame = feature_frame.fillna(feature_frame.mean())

        shap_values = explainer_clf.shap_values(feature_frame)
        # The explainer output layout varies, so locate the feature axis
        # explicitly instead of assuming (classes, samples, features).
        mean_abs = _mean_abs_shap_per_feature(shap_values, len(available_features))
        global_df = pd.DataFrame({"feature": available_features, "importance": mean_abs})
        global_df = global_df.sort_values("importance", ascending=False)

        chart = alt.Chart(global_df.head(10)).mark_bar().encode(
            x=alt.X("importance:Q", title="Mean |SHAP|"),
            y=alt.Y("feature:N", sort="-x", title="Feature")
        )
        st.altair_chart(chart, use_container_width=True)
        st.caption("Mean absolute SHAP importance across all runs.")

    st.markdown("#### Performance trends over time")
    trend = alt.Chart(summary_df).mark_line(point=True).encode(
        x="date:T", y="avg_pace_min_km:Q", color=alt.value("#007AFF")
    )
    st.altair_chart(trend, use_container_width=True)
