In [None]:
# --- Stage 2: Cleaning & Per-Run Summary ---------------------------------

import pandas as pd
import numpy as np
from pathlib import Path

# Load the processed Strava records table that the rest of this cell cleans.
raw_path = Path("../data/strava/processed/strava_runs.parquet")
df = pd.read_parquet(path=raw_path)
n_loaded = len(df)
print(f"Loaded {n_loaded:,} rows")

# --- 2.1 Drop unknown or low-quality columns ------------------------------
# The manifest is indexed by column name; every entry whose category is not
# 'drop' is retained. Selecting with a list raises KeyError if the manifest
# names a column that is absent from df.
manifest = pd.read_csv(
    "../data/strava/processed/data_quality_manifest.csv",
    index_col="column",
)
is_kept = manifest["category"] != "drop"
keep_cols = manifest.index[is_kept].tolist()
df = df.loc[:, keep_cols]
print(f"Keeping {len(keep_cols)} columns based on manifest")

# --- 2.2 Apply activity-level filters ------------------------------------
# Row count before filtering; used only for the removal report at the end.
before = len(df)

# Rows lacking a distance or pace value cannot be range-checked, so they go first.
df = df.dropna(subset=["distance_km", "pace_min_per_km"])

# One boolean mask per rule, applied together in a single selection.
long_enough = df["distance_km"] >= 0.5
plausible_pace = df["pace_min_per_km"].between(2, 15)  # inclusive on both ends
stationary = df["cadence"].eq(0) & df["speed"].eq(0)  # both zero at once
df = df[long_enough & plausible_pace & ~stationary]

print(f"Removed {before - len(df):,} noisy rows")

# --- 2.3 Aggregate per-run summary ---------------------------------------
def _altitude_span(altitudes):
    """Return the highest minus the lowest altitude sample (NaNs are skipped)."""
    return altitudes.max() - altitudes.min()


# Output column -> (source column, reducer). Dict order fixes the column order.
# NOTE(review): total_distance_km takes the max of distance_km, which presumes
# that column is cumulative within a run -- confirm against the upstream stage.
# NOTE(review): elevation_gain is the max-minus-min altitude range, not a
# cumulative ascent -- confirm this is the intended metric.
per_run_aggregations = {
    "records": pd.NamedAgg(column="timestamp", aggfunc="count"),
    "start_time": pd.NamedAgg(column="timestamp", aggfunc="min"),
    "end_time": pd.NamedAgg(column="timestamp", aggfunc="max"),
    "total_distance_km": pd.NamedAgg(column="distance_km", aggfunc="max"),
    "avg_pace": pd.NamedAgg(column="pace_min_per_km", aggfunc="mean"),
    "avg_speed": pd.NamedAgg(column="speed", aggfunc="mean"),
    "avg_cadence": pd.NamedAgg(column="cadence", aggfunc="mean"),
    "avg_hr": pd.NamedAgg(column="heart_rate", aggfunc="mean"),
    "elevation_gain": pd.NamedAgg(column="altitude", aggfunc=_altitude_span),
}

# One row per run_id; reset_index turns run_id back into an ordinary column.
summary = df.groupby("run_id").agg(**per_run_aggregations).reset_index()

# --- 2.4 Add date context -------------------------------------------------
# Parse the run start once and derive all calendar fields from it.
run_start = pd.to_datetime(summary["start_time"])
summary["date"] = run_start.dt.date              # Python date objects
summary["weekday"] = run_start.dt.day_name()     # e.g. "Monday"
summary["month"] = run_start.dt.to_period("M")   # monthly Period

# --- 2.5 Run-level quality metrics ---------------------------------------
# Percentage of missing cells among each run's retained records:
#   100 * (missing cells in the run) / (rows in the run * number of columns).
# The mean of the per-row missing-cell counts within a run equals
# (missing cells in the run) / (rows in the run), so dividing by the column
# count gives the share. Computing this per run_id (instead of one dataset-wide
# figure broadcast to every row) makes the metric actually describe each run,
# and it stays well-defined when df is empty.
missing_cells_per_row = df.isna().sum(axis=1)
missing_pct_by_run = (
    100 * missing_cells_per_row.groupby(df["run_id"]).mean() / len(df.columns)
)
summary["missing_pct"] = summary["run_id"].map(missing_pct_by_run)

# Elapsed minutes between the first and last retained record of each run.
summary["duration_min"] = (
    (pd.to_datetime(summary["end_time"]) - pd.to_datetime(summary["start_time"]))
    .dt.total_seconds() / 60
)

# --- 2.6 Save results -----------------------------------------------------
# Write the per-run summary twice: Parquet, plus a CSV with the same stem.
out_path = Path("../data/strava/processed/run_summary_cleaned.parquet")
csv_path = out_path.with_suffix(".csv")

summary.to_parquet(out_path, index=False)
summary.to_csv(csv_path, index=False)

n_runs = len(summary)
print(f"✅ Saved {n_runs:,} clean runs → {out_path}")

# Last expression of the cell: the notebook renders the first rows.
summary.head()
