In [None]:
# --- Stage 3: Exploratory Run Data Analysis & Quality Visualization ---
# This stage explores the cleaned Strava running dataset produced in Stage 2. Goals:
#   - Inspect overall structure and descriptive statistics
#   - Visualize distributions of distance, pace, and cadence
#   - Check residual missingness or anomalies
#   - Validate per-run summaries before deeper modeling or clustering

In [None]:
# 3.0. Import libraries and load data

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Read the processed Strava runs (Parquet) into the working DataFrame `df`
data_path = Path("../data/strava/processed/strava_runs.parquet")
df = pd.read_parquet(data_path)

# Report the loaded shape as a quick sanity check
n_rows, n_cols = df.shape
print(f"✅ Loaded {n_rows:,} rows × {n_cols} columns")


In [None]:
# --- 3.1. Basic summary ---
# First rows, for a quick look at the columns and typical values.
display(df.head())

# DataFrame.info() prints its report directly and returns None, so it is
# called on its own; wrapping it in display() would render a stray "None".
df.info()

# Descriptive statistics for every column (numeric and non-numeric),
# transposed so each dataset column becomes one row of the table.
display(df.describe(include='all').T)


In [None]:
# --- 3.2. Missingness overview ---
# Fraction of missing values per column (0.0-1.0), most incomplete first.
missing = df.isna().mean().sort_values(ascending=False)
display(missing)

# The chart is labelled in percent, so the plotted values are scaled by 100;
# `missing` itself is left as a fraction.
fig, ax = plt.subplots(figsize=(10, 6))
(missing.head(20) * 100).plot(kind='barh', ax=ax, title='Top 20 Columns by Missing %')
ax.set_xlabel("Missing values (%)")
ax.invert_yaxis()  # barh draws the first entry at the bottom; flip so the most-missing column is on top
plt.show()

# --- 3.3. Quick visual checks ---
# One histogram (50 bins, with a KDE overlay) per column of interest.
for column in ("distance_km", "pace_min_per_km"):
    sns.histplot(df[column], bins=50, kde=True)
    plt.title(f"Distribution of {column}")
    plt.show()

In [None]:
# 3.4. Group by run_id to see completeness per run
# Builds one row per run_id with the aggregates below.
run_summary = (
    df.groupby("run_id")
      .agg(
          # number of non-null timestamps recorded for the run
          records=('timestamp', 'count'),
          # NOTE(review): taking the max assumes distance_km is cumulative
          # within a run — confirm against the Stage 2 output.
          total_distance_km=('distance_km', 'max'),
          # unweighted mean over the run's records
          avg_pace=('pace_min_per_km', 'mean'),
          cadence_mean=('cadence', 'mean'),
          hr_mean=('heart_rate', 'mean'),
          elevation_max=('altitude', 'max'),
      )
      .reset_index()
)

# Rich display instead of print(), consistent with the other summary cells.
display(run_summary.describe())

sns.histplot(run_summary["total_distance_km"], bins=60, kde=True)
plt.title("Run distance distribution")
plt.show()

sns.scatterplot(data=run_summary, x="total_distance_km", y="avg_pace")
plt.title("Distance vs average pace")
plt.show()


In [None]:
# 3.5. Load existing Data Quality Manifest for reference
# `Path` and `pd` are already imported in the 3.0 imports cell, so they are
# not re-imported here.
manifest_path = Path("../data/strava/processed/data_quality_manifest.csv")
manifest = pd.read_csv(manifest_path)

# NOTE(review): len(manifest) counts manifest rows; the message calls them
# "columns" — presumably the manifest holds one row per dataset column. Confirm.
print(f"✅ Loaded manifest with {len(manifest)} columns from: {manifest_path}")
display(manifest.head(15))


In [None]:
# Persist the per-run summary (without the DataFrame index) as CSV.
# NOTE(review): the file name says "stage1" although this notebook is Stage 3 —
# confirm the name downstream consumers expect before renaming.
summary_csv = "../data/strava/processed/run_summary_stage1.csv"
run_summary.to_csv(summary_csv, index=False)
print("✅ Saved per-run summary for next stages.")


In [None]:
# 3.6. Step 1D – Visual Missingness Heatmap
# ---------------------------------------------------------------------
# Purpose:
# The goal here is to *visually inspect patterns of missing data*.
# While numeric completeness metrics (like the manifest) tell us how
# many values are missing per column, a heatmap helps reveal *where*
# those gaps occur and whether they form patterns — for instance:
#   • Do certain sensors (e.g., heart_rate or altitude) drop out together?
#   • Are there entire runs or time segments with missing data?
#
# We use the `missingno` library because it’s fast and intuitive:
# each column is represented as a vertical stripe of completeness
# across a random sample of rows. White bands = missing data.
#
# Note: We sample at most 5,000 rows for readability — plotting all ~600k
# rows would be both slow and visually cluttered. The sample size is capped
# at len(df) because DataFrame.sample raises ValueError when asked for more
# rows than exist (without replacement).
#
# The sample is re-sorted by index so the rows keep their original relative
# order; a shuffled sample would scatter run/time-segment gaps across the
# plot. This assumes the index reflects the original row order — TODO confirm.
# ---------------------------------------------------------------------

# Only the import is guarded: an ImportError raised later while plotting
# should surface as a real error, not be reported as "not installed".
try:
    import missingno as msno
except ImportError:
    print("⚠️ The `missingno` package is not installed — skipping missingness visualization.")
else:
    sample_size = min(5000, len(df))
    sample_df = df.sample(sample_size, random_state=42).sort_index()
    print(f"📊 Visualizing missingness for a {len(sample_df):,}-row sample...")
    msno.matrix(sample_df)
    plt.show()
