In [None]:
1. Load Exploration Dataset

import pandas as pd
# Read the exploration-stage Strava table that every later cell works from.
exploration_path = "data/strava/exploration_stage.parquet"
df = pd.read_parquet(exploration_path)

In [None]:
2. Derived Features
Examples (all safe to compute now):

# Parse the run dates once so the weekday column and the weekly grouping share
# the same datetime values; calling `.dt` directly on df["date"] raises when
# the column is stored as strings.
run_dates = pd.to_datetime(df["date"])

# A zero denominator would produce +/-inf, which StandardScaler refuses to fit.
# Mask zeros to NaN so the affected ratios are treated as missing instead.
cadence = df["avg_cadence"].where(df["avg_cadence"] != 0)
distance_km = df["total_distance_km"].where(df["total_distance_km"] != 0)

# Rolling standard deviation of pace over the current row and up to four
# preceding rows. The first row is NaN because a sample standard deviation
# needs at least two observations.
# NOTE(review): this assumes rows are already in chronological order - confirm.
df["pace_variability"] = df["avg_pace"].rolling(5, min_periods=1).std()
df["fatigue_index"] = (df["avg_pace"] * df["elevation_gain"]) / cadence
df["elev_ratio"] = df["elevation_gain"] / distance_km
df["weekday"] = run_dates.dt.day_name()

# Aggregate per ISO (year, week). Grouping on the week number alone would merge
# the same calendar week from different years into a single row.
iso_calendar = run_dates.dt.isocalendar()
weekly = df.groupby(
    [
        iso_calendar["year"].rename("iso_year"),
        iso_calendar["week"].rename("iso_week"),
    ]
).agg(
    weekly_distance=("total_distance_km", "sum"),
    avg_pace=("avg_pace", "mean"),
    fatigue_index=("fatigue_index", "mean"),
)


In [None]:
3. Normalization / Scaling

from sklearn.preprocessing import StandardScaler
# Columns that get standardized; every other column is carried over untouched.
scaled_cols = ["avg_pace", "avg_cadence", "elevation_gain", "fatigue_index"]

# Fit the scaler on the selected columns and keep the transformed array.
scaler = StandardScaler()
standardized_values = scaler.fit_transform(df[scaled_cols])

# Work on a copy so the unscaled frame `df` stays available to other cells.
df_scaled = df.copy()
df_scaled[scaled_cols] = standardized_values

In [None]:
4. Correlation & Sanity Plots
Use seaborn or matplotlib:

import seaborn as sns, matplotlib.pyplot as plt
# Pairwise correlation of the standardized feature columns, drawn as an
# annotated heatmap for a quick sanity check.
feature_corr = df_scaled[scaled_cols].corr()
sns.heatmap(feature_corr, annot=True)
plt.show()

In [None]:
5. Dimensionality Preview (Optional)

from sklearn.decomposition import PCA
# Keep only rows with no missing value in the scaled features: PCA cannot be
# fitted on NaN, and the colour values must line up with the projected rows.
complete_rows = df_scaled[scaled_cols].notna().all(axis=1)

# Project the scaled features onto their first two principal components.
pca = PCA(n_components=2)
proj = pca.fit_transform(df_scaled.loc[complete_rows, scaled_cols])

# Scatter of the 2-D projection, coloured by the scaled avg_pace of each row.
fig, ax = plt.subplots()
points = ax.scatter(
    proj[:, 0],
    proj[:, 1],
    c=df_scaled.loc[complete_rows, "avg_pace"],
    cmap="viridis",
)
ax.set(title="PCA preview of scaled features", xlabel="PC1", ylabel="PC2")
fig.colorbar(points, ax=ax, label="avg_pace (scaled)")
plt.show()

In [None]:
6. Export

# Persist the scaled feature table; the DataFrame index is not written out.
features_path = "data/strava/runs_features.parquet"
df_scaled.to_parquet(features_path, index=False)
print("✅ ML-ready feature dataset saved.")