# In [None]:
# -----------------------------------------------------------------------------
# 📘 Notebook: 07_visualization_and_dashboard.ipynb
#
# Purpose:
#   Build an interactive dashboard for exploring run clusters, model predictions,
#   and feature explainability (Stage 5 + 6 outputs).
#
# Inputs:
#   - data/strava/processed/run_clusters.parquet
#   - data/strava/processed/run_predictions.parquet
#   - data/strava/processed/model_metrics.csv
#
# Outputs:
#   - Streamlit dashboard (run via: streamlit run app_dashboard.py)
#
# Steps:
#   1) Load artifacts (clusters, predictions, metrics)
#   2) Visual summary of clusters (PCA, feature histograms)
#   3) Model performance charts (actual vs predicted, residuals)
#   4) Explainability viewer (SHAP global + local)
#   5) Streamlit dashboard scaffolding (Neo4j integration for relational insight is optional and not included here)
#   6) Clean launch instructions
# -----------------------------------------------------------------------------

# --- 1) Load artifacts -------------------------------------------------------
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# All three artifacts live in the same processed-data folder; the leading
# "../" makes the location relative to the notebook's working directory.
processed_dir = Path("../data/strava/processed")
cluster_path = processed_dir / "run_clusters.parquet"
pred_path = processed_dir / "run_predictions.parquet"
metrics_path = processed_dir / "model_metrics.csv"

# Cluster assignments and predictions are parquet tables; metrics is a CSV.
df_clusters, df_preds = (
    pd.read_parquet(parquet_file) for parquet_file in (cluster_path, pred_path)
)
metrics = pd.read_csv(metrics_path)

# Quick sanity check on what was loaded, then render the metrics table.
print(f"✅ Loaded {len(df_clusters):,} runs with clusters and {len(df_preds):,} predictions")
display(metrics)

# --- 2) Cluster visualization ------------------------------------------------
# Interpretation tip:
# - Use these two charts to compare clusters by size and by average pace.
#
# NOTE(review): seaborn 0.13+ reportedly emits a FutureWarning when `palette`
# is passed without `hue`. The calls keep their original arguments so the
# output is unchanged on the installed version; confirm before upgrading.

sns.set(style="whitegrid")

# How many runs fall into each cluster.
count_fig, count_ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df_clusters, x="cluster", palette="tab10", ax=count_ax)
count_ax.set_title("Cluster distribution")
plt.show()

# Spread of the avg_pace column within each cluster.
pace_fig, pace_ax = plt.subplots(figsize=(6, 4))
sns.boxplot(data=df_clusters, x="cluster", y="avg_pace", palette="tab10", ax=pace_ax)
pace_ax.set_title("Average pace per cluster")
pace_ax.set_ylabel("Pace (min/km)")
plt.show()

# --- 3) Model evaluation -----------------------------------------------------
# Plot predicted pace against actual pace, then the distribution of residuals.

actual = df_preds["actual_pace"]
predicted = df_preds["predicted_pace"]

# End points of the dashed identity line; it spans the range of *actual* pace.
pace_lo, pace_hi = actual.min(), actual.max()

scatter_fig, scatter_ax = plt.subplots(figsize=(6, 6))
scatter_ax.scatter(actual, predicted, alpha=0.7)
scatter_ax.plot([pace_lo, pace_hi], [pace_lo, pace_hi], "r--")
scatter_ax.set_xlabel("Actual pace (min/km)")
scatter_ax.set_ylabel("Predicted pace (min/km)")
scatter_ax.set_title("Actual vs Predicted pace")
scatter_ax.grid(True)
plt.show()

# Residuals: actual minus predicted, so a positive value means the prediction
# was lower than the actual pace value.
resid = actual - predicted
resid_fig, resid_ax = plt.subplots(figsize=(6, 4))
sns.histplot(resid, bins=30, kde=True, color="steelblue", ax=resid_ax)
resid_ax.set_title("Residual distribution")
resid_ax.set_xlabel("Residual (min/km)")
plt.show()

# --- 4) (Optional) SHAP global view -----------------------------------------
# Load saved SHAP values or recompute quickly if available
# Example: shap.summary_plot(global_shap_values, feature_names=feature_names)
# To integrate interactively, Streamlit code below will embed plots dynamically.

# --- 5) Streamlit dashboard scaffolding -------------------------------------
# The string literal below holds the complete source of the dashboard app.
# This notebook does not write it to disk: copy its contents into
# 'app_dashboard.py' in the project root, then run:
#     streamlit run app_dashboard.py
#
# Paths inside the app are relative to the project root (no leading "../"),
# because Streamlit is launched from there rather than from the notebook folder.
#
# The per-cluster box plot is drawn with seaborn and handed to st.pyplot:
# Streamlit's built-in chart elements do not include a box plot.

"""
# app_dashboard.py
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

st.set_page_config(page_title="Running Analytics Dashboard", layout="wide")

@st.cache_data
def load_data():
    clusters = pd.read_parquet("data/strava/processed/run_clusters.parquet")
    preds = pd.read_parquet("data/strava/processed/run_predictions.parquet")
    metrics = pd.read_csv("data/strava/processed/model_metrics.csv")
    return clusters, preds, metrics

clusters, preds, metrics = load_data()

st.title("🏃 Running Analytics Dashboard")

st.subheader("Cluster Overview")
col1, col2 = st.columns(2)
with col1:
    st.bar_chart(clusters["cluster"].value_counts())
with col2:
    box_fig, box_ax = plt.subplots()
    sns.boxplot(data=clusters, x="cluster", y="avg_pace", ax=box_ax)
    box_ax.set_ylabel("Pace (min/km)")
    st.pyplot(box_fig)

st.subheader("Model Performance")
st.write(metrics)

fig, ax = plt.subplots()
sns.scatterplot(x="actual_pace", y="predicted_pace", data=preds, ax=ax)
ax.plot([preds["actual_pace"].min(), preds["actual_pace"].max()],
        [preds["actual_pace"].min(), preds["actual_pace"].max()], "r--")
st.pyplot(fig)
"""

# --- 6) Launch instructions --------------------------------------------------
# Final output of the notebook: tells the reader how to start the dashboard.
# The empty first and last entries reproduce the blank line printed before and
# after the instructions.
launch_lines = [
    "",
    "🚀 To start the interactive dashboard:",
    "1. Activate your virtual environment.",
    "2. In the repo root, run:",
    "      streamlit run app_dashboard.py",
    "3. The dashboard will open in your browser (default: http://localhost:8501)",
    "",
]
print("\n".join(launch_lines))
