In [None]:
# -----------------------------------------------------------------------------
# 📘 Notebook: 04_feature_engineering_mlprep.ipynb
#
# Purpose:
#   Derive secondary metrics from cleaned run summaries and prepare
#   a machine-learning–ready dataset for clustering and regression.
#
# Input : data/strava/processed/run_summary_cleaned.parquet
# Output: data/strava/processed/runs_features.parquet
# Next  : Stage 5 – Clustering & Explainability
# -----------------------------------------------------------------------------

# --- 4.1. Load cleaned dataset -------------------------------------------------
import os
from pathlib import Path
import pandas as pd
import numpy as np

# Input file named in the notebook header. The path is relative, so it is
# resolved against the kernel's current working directory.
data_path = Path("../data/strava/processed/run_summary_cleaned.parquet")

# `df` is the working frame that the feature-engineering cells below extend.
df = pd.read_parquet(path=data_path)
print(f"✅ Loaded {len(df):,} cleaned runs")


In [None]:
# --- 4.2 Derived feature engineering -----------------------------------------
# Pull the base columns out once so each formula below reads like its definition.
pace = df["avg_pace"]
climb = df["elevation_gain"]
cadence = df["avg_cadence"]
distance_km = df["total_distance_km"]

# Pace variability: standard deviation of avg_pace over a trailing window of
# up to 5 rows (row order as loaded). A window holding a single observation
# has no sample std, so the first row comes out as NaN.
df["pace_variability"] = pace.rolling(window=5, min_periods=1).std()

# Fatigue index: pace weighted by climb and divided by cadence. The +1 offsets
# keep a flat run from zeroing the numerator and a zero cadence from dividing by 0.
df["fatigue_index"] = pace * (climb + 1) / (cadence + 1)

# Elevation gained per kilometre; the 1e-3 term guards against zero distance.
df["elev_ratio"] = climb / (distance_km + 1e-3)

# Name of the weekday each run took place on (categorical helper column).
df["weekday"] = pd.to_datetime(df["date"]).dt.day_name()

# Preview the first five rows, including the new columns.
display(df.head())


In [None]:
# --- 4.3. Normalization / scaling ---------------------------------------------
from sklearn.preprocessing import StandardScaler

# Columns that are standardized (zero mean, unit variance) for modeling.
scaled_cols = ["avg_pace", "avg_cadence", "elevation_gain", "fatigue_index"]

# Work on a copy so the unscaled values stay available in `df`.
df_scaled = df.copy()

# Learn the per-column mean/std first, then apply them; for StandardScaler this
# two-step form gives the same result as a single fit_transform call.
scaler = StandardScaler()
scaler.fit(df_scaled[scaled_cols])
df_scaled[scaled_cols] = scaler.transform(df_scaled[scaled_cols])

print("✅ Normalized key metrics for modeling")


In [None]:
# --- 4.4. Correlation and sanity plots ----------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations between the standardized modeling columns.
feature_corr = df_scaled[scaled_cols].corr()

# Annotated heatmap drawn on an explicit 6x5 inch axes.
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(feature_corr, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Feature Correlations")
plt.show()


In [None]:
# --- 4.5. Optional dimensionality preview (PCA) -------------------------------
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np

# Standalone safety net. `scaled_cols` is normally defined by the scaling cell,
# i.e. the same cell that creates `df_scaled`. If that cell was skipped, the
# fallback below needs the column list too, otherwise it would fail with a
# NameError before it could rebuild anything.
if "scaled_cols" not in globals():
    scaled_cols = ["avg_pace", "avg_cadence", "elevation_gain", "fatigue_index"]

# Ensure df_scaled exists; if not, rebuild it from df so this cell is runnable
# on its own (df must already contain every column listed in scaled_cols).
if "df_scaled" not in globals():
    if "df" not in globals():
        raise NameError("Neither df_scaled nor df are defined. Run the earlier cells to load the dataset.")
    print("df_scaled not defined — creating df_scaled from df (scaling selected cols).")
    scaler = StandardScaler()
    df_scaled = df.copy()
    # Fill NaNs with the column mean before scaling.
    df_scaled[scaled_cols] = df_scaled[scaled_cols].fillna(df_scaled[scaled_cols].mean())
    df_scaled[scaled_cols] = scaler.fit_transform(df_scaled[scaled_cols])

# Feature matrix for PCA. PCA cannot handle NaN, so any remaining missing
# values are replaced by the column mean first.
X = df_scaled[scaled_cols].values
imputer = SimpleImputer(strategy="mean")
X_imputed = imputer.fit_transform(X)

# Project the features onto the first two principal components.
pca = PCA(n_components=2)
proj = pca.fit_transform(X_imputed)

# Report explained variance
explained = pca.explained_variance_ratio_
print(f"Explained variance: PC1={explained[0]:.2%}, PC2={explained[1]:.2%}, total={(explained[:2].sum()):.2%}")

# Plot the projection. Color points by avg_pace to indicate a continuous label.
fig, ax = plt.subplots(figsize=(8, 6))
sc = ax.scatter(proj[:, 0], proj[:, 1], c=df_scaled["avg_pace"], cmap="viridis", s=40, alpha=0.8)
fig.colorbar(sc, ax=ax, label="avg_pace")

# Overlay feature loadings as arrows to show which features push points in each
# direction. Each row of `loadings` is one feature: (weight on PC1, weight on PC2).
loadings = pca.components_.T
feature_names = scaled_cols

# Scale loadings for visibility (arbitrary factor: 60% of the largest |coordinate|).
scale = np.max(np.abs(proj)) * 0.6 if proj.size else 1.0
for feature, (x_l, y_l) in zip(feature_names, loadings):
    ax.arrow(0, 0, x_l * scale, y_l * scale, color="red", alpha=0.7, head_width=0.03)
    # Place the label slightly beyond the arrow tip so it does not overlap the head.
    ax.text(x_l * scale * 1.15, y_l * scale * 1.15, feature, color="red", fontsize=9)

ax.set_title("PCA projection (color = avg_pace)")
ax.set_xlabel(f"PC1 ({explained[0]*100:.1f}% var)")
ax.set_ylabel(f"PC2 ({explained[1]*100:.1f}% var)")
ax.axhline(0, color="grey", lw=0.5, linestyle="--")
ax.axvline(0, color="grey", lw=0.5, linestyle="--")
ax.grid(alpha=0.2)
plt.show()

# Interpretation tips:
# - Points close together in this plot have similar values across the input features.
# - The color indicates avg_pace; clustering by color shows pace-driven structure.
# - Feature loadings (red arrows) indicate the direction each standardized feature
#   increases in the PCA space. For example, an arrow pointing to the right means
#   that higher values of that feature push samples toward larger PC1 values.
# - Check the explained variance above to judge whether 2 components are sufficient.


In [None]:
import pandas as pd

# Transpose so rows are features and columns are components:
# shape (n_features, n_components).
loadings = pca.components_.T

# For each of the two components, rank features by the absolute size of their
# loading (largest first); the sign is dropped because only magnitude is compared.
pc1_loadings, pc2_loadings = (
    pd.Series(loadings[:, component], index=scaled_cols).abs().sort_values(ascending=False)
    for component in range(2)
)

print('Top contributors to PC1:\n', pc1_loadings)
print('Top contributors to PC2:\n', pc2_loadings)

In [None]:
# --- 4.6. Export ML-ready features --------------------------------------------
# Write next to the input file. The load cell reads from "../data/...", so the
# output must use the same "../" prefix; a bare "data/..." path would resolve
# inside the notebook's own directory and create a nested data folder there.
out_path = Path("../data/strava/processed/runs_features.parquet")

# to_parquet does not create missing directories, so make sure the target exists.
out_path.parent.mkdir(parents=True, exist_ok=True)

# Persist the scaled + engineered frame without the pandas index.
df_scaled.to_parquet(out_path, index=False)
print(f"✅ ML-ready feature dataset saved → {out_path.resolve()}")

In [None]:
# --- 4.7. Inspect columns -------------------------------------------------------
# Show the column names currently present on the working frame as a plain list.
df.columns.to_list()



In [None]:
# 4.8. Derived features and weekly aggregates
#
# The per-run formulas match section 4.2, including the +1 / +1e-3 guards.
# Without them a run with zero cadence or zero distance yields inf, which the
# StandardScaler in the next cell rejects.

# Parse the date column once. It may not already be datetime64, and the .dt
# accessor raises on any other dtype.
run_dates = pd.to_datetime(df["date"])

# Rolling std-dev of pace over up to 5 consecutive rows.
df["pace_variability"] = df["avg_pace"].rolling(5, min_periods=1).std()
# Pace weighted by climb, normalised by cadence.
df["fatigue_index"] = (df["avg_pace"] * (df["elevation_gain"] + 1)) / (df["avg_cadence"] + 1)
# Elevation gained per kilometre.
df["elev_ratio"] = df["elevation_gain"] / (df["total_distance_km"] + 1e-3)
df["weekday"] = run_dates.dt.day_name()

# Weekly roll-up keyed by (ISO year, ISO week). Grouping on the week number
# alone would merge e.g. week 10 of two different years into one row.
iso_calendar = run_dates.dt.isocalendar()
weekly = df.groupby([iso_calendar["year"], iso_calendar["week"]]).agg(
    weekly_distance=("total_distance_km", "sum"),
    avg_pace=("avg_pace", "mean"),
    fatigue_index=("fatigue_index", "mean"),
)


In [None]:
# 4.9. Normalization / Scaling
# (The section label must be a comment: a bare `4.9.` is a float literal
# followed by a dangling attribute dot, which is a SyntaxError.)

from sklearn.preprocessing import StandardScaler

# Standardize the modeling columns (zero mean, unit variance) on a copy, so
# the unscaled values remain available in `df`.
scaler = StandardScaler()
scaled_cols = ["avg_pace", "avg_cadence", "elevation_gain", "fatigue_index"]
df_scaled = df.copy()
df_scaled[scaled_cols] = scaler.fit_transform(df_scaled[scaled_cols])

In [None]:
# 5.0 Correlation & sanity plots (seaborn on top of matplotlib)

import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations of the standardized columns, drawn as an annotated
# heatmap on the current (default-sized) figure.
scaled_corr = df_scaled[scaled_cols].corr()
sns.heatmap(scaled_corr, annot=True)
plt.show()

In [None]:
# --- 5. Optional dimensionality preview (PCA) -------------------------------
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt

# PCA cannot take NaN, so replace missing entries with the column mean first.
imputer = SimpleImputer(strategy="mean")
X_imputed = imputer.fit_transform(df_scaled[scaled_cols])

# Reduce the standardized features to their first two principal components.
pca = PCA(n_components=2)
proj = pca.fit_transform(X_imputed)
pc1, pc2 = proj[:, 0], proj[:, 1]

# Scatter of the projection on an explicit 7x5 inch axes, colored by the
# (scaled) average pace of each run.
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(pc1, pc2, c=df_scaled["avg_pace"], cmap="viridis", s=40, alpha=0.7)
ax.set_title("PCA projection (color = avg_pace)")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
plt.show()


In [None]:
# --- 6. Export ML-ready feature dataset --------------------------------------
from pathlib import Path
import os

# Locate the project root: when the kernel runs inside a folder literally named
# "notebooks", step one level up; otherwise treat the working directory as the
# root. This keeps the output out of a nested notebooks/data/ folder.
cwd = Path(os.getcwd())
if cwd.name == "notebooks":
    project_root = cwd.parent
else:
    project_root = cwd

# Canonical processed output location.
# NOTE(review): the notebook header names the output "runs_features.parquet",
# while this cell writes "run_summary_features.parquet" — confirm which file
# the next stage expects.
processed_path = project_root.joinpath("data", "strava", "processed", "run_summary_features.parquet")

# Create the target folder (and any missing parents) before writing.
processed_path.parent.mkdir(parents=True, exist_ok=True)

# Persist the scaled + engineered frame without the pandas index.
df_scaled.to_parquet(processed_path, index=False)

# Summary of what was written.
print(f"✅ ML-ready feature dataset saved → {processed_path.resolve()}")
print(f"Columns exported: {len(df_scaled.columns)} → {df_scaled.columns.tolist()}")
print(f"Rows exported: {len(df_scaled):,}")

