<a href="https://colab.research.google.com/github/cs671/workout-progression-predictor/blob/main/ResistanceTraining_Progression_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
"""
Workout Progression Predictor - Model Training Pipeline
Author: Daniel Romeo
Date: 02/08/2025

Training a Random Forest model to predict my next workout weights based on
my Strong app data. Hopefully this works better than just guessing!
"""

import pandas as pd
import numpy as np
import logging
from pathlib import Path
from typing import Tuple, Dict, Any
import sys

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import joblib
import json
import warnings

# Configure logging once at import time: INFO level, with each record
# prefixed by a timestamp.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
# Module-level logger used by the classes and functions below.
logger = logging.getLogger(__name__)

# Suppress the "X does not have valid feature names" warning (matched by its
# message text).
warnings.filterwarnings('ignore', message='X does not have valid feature names')
# NOTE(review): this filter has no module or message restriction, so it hides
# *every* RuntimeWarning raised in the process, not only the noisy ones.
warnings.filterwarnings('ignore', category=RuntimeWarning)


class WorkoutDataProcessor:
    """
    Loads a Strong app CSV export and reshapes it for modeling.

    Intended call order:
    ``load_and_clean_data`` -> ``create_workout_sessions`` ->
    ``filter_exercises_for_modeling``.
    """

    def __init__(self):
        # Maps the export's column headers to the snake_case names used by
        # the rest of the pipeline.
        self.column_mapping = {
            'Exercise Name': 'exercise',
            'Date': 'date',
            'Weight': 'weight_kg',
            'Reps': 'reps',
            'RPE': 'rpe',
            'Workout Name': 'workout_name',
            'Duration': 'duration'
        }

    def load_and_clean_data(self, filepath: str) -> pd.DataFrame:
        """
        Read the raw CSV export and return one cleaned row per logged set.

        Args:
            filepath: Path of the CSV file to read.

        Returns:
            A DataFrame with renamed columns, parsed dates, numeric
            weight/reps/RPE and an added ``volume`` column
            (``weight_kg * reps``). Rows with a missing date, exercise,
            weight or reps, or with non-positive weight or reps, are dropped.

        Note:
            If the file does not exist the error is logged and the process
            exits with status 1 (``SystemExit``).
        """
        logger.info(f"Loading workout data from {filepath}")

        try:
            df = pd.read_csv(filepath)
        except FileNotFoundError:
            logger.error(f"Can't find file at {filepath}")
            sys.exit(1)

        logger.info(f"Loaded {len(df):,} raw workout records")

        # Rename columns to something more pythonic
        df = df.rename(columns=self.column_mapping)

        # Unparseable dates become NaT and are dropped below
        df['date'] = pd.to_datetime(df['date'], errors='coerce')

        # Non-numeric values (e.g. strings with units) become NaN; RPE is
        # optional, so only convert the columns that are actually present
        numeric_cols = ['weight_kg', 'reps', 'rpe']
        for col in numeric_cols:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')

        # Drop obviously bad data
        initial_count = len(df)
        df = df.dropna(subset=['date', 'exercise', 'weight_kg', 'reps'])
        df = df[(df['weight_kg'] > 0) & (df['reps'] > 0)].copy()

        # Calculate total volume (weight x reps) - key metric for progression
        df['volume'] = df['weight_kg'] * df['reps']

        removed_count = initial_count - len(df)
        logger.info(f"Cleaned data: {len(df):,} records ({removed_count} bad records removed)")

        return df

    def create_workout_sessions(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Aggregate individual sets into one row per exercise per calendar day.

        Args:
            df: Cleaned per-set data containing ``date``, ``exercise``,
                ``weight_kg``, ``volume``, ``reps`` and ``rpe`` columns.
                The frame is not modified.

        Returns:
            A new DataFrame sorted by exercise and date with the day's
            heaviest weight, summed volume, mean reps and mean RPE.
        """
        # assign() builds a new frame, so the helper column never leaks back
        # into the caller's DataFrame.
        per_set = df.assign(session_date=df['date'].dt.date)

        # Group by date and exercise, take max weight (heaviest set)
        sessions = per_set.groupby(['session_date', 'exercise']).agg({
            'weight_kg': 'max',        # heaviest set that day
            'volume': 'sum',           # total volume
            'reps': 'mean',            # average reps
            'rpe': 'mean'              # average RPE if available
        }).reset_index()

        # Restore a datetime column so later steps can compute day gaps
        sessions['date'] = pd.to_datetime(sessions['session_date'])
        sessions = sessions.sort_values(['exercise', 'date'])

        logger.info(f"Created {len(sessions):,} exercise sessions from individual sets")
        return sessions

    def filter_exercises_for_modeling(self, sessions: pd.DataFrame,
                                    min_sessions: int = 10, max_exercises: int = 8) -> pd.DataFrame:
        """
        Keep only the most frequently trained exercises that have enough data.

        Args:
            sessions: Per-exercise, per-day session rows.
            min_sessions: Minimum number of sessions an exercise needs to be
                kept.
            max_exercises: Maximum number of exercises to keep, chosen by
                descending session count.

        Returns:
            The rows of ``sessions`` that belong to the selected exercises.
        """
        # value_counts() is sorted by descending frequency, so head() below
        # picks the most frequently trained exercises.
        exercise_counts = sessions['exercise'].value_counts()

        # Only exercises with enough sessions
        good_exercises = exercise_counts[exercise_counts >= min_sessions]

        # Take top N exercises by frequency
        top_exercises = good_exercises.head(max_exercises)

        filtered_data = sessions[sessions['exercise'].isin(top_exercises.index)]

        logger.info(f"Selected {len(top_exercises)} exercises with sufficient data:")
        for exercise, count in top_exercises.items():
            logger.info(f"  {exercise}: {count} sessions")

        return filtered_data


class ProgressionFeatureBuilder:
    """
    Creates features for predicting progression.

    Each training example pairs what happened in the previous session
    (weight, RPE, volume) and the rest taken since then with the weight
    lifted in the session that followed that rest.
    """

    def build_progression_sequences(self, sessions: pd.DataFrame) -> pd.DataFrame:
        """
        Build lag features per exercise: previous session -> this session.

        Args:
            sessions: Per-exercise session rows with ``exercise``, ``date``,
                ``weight_kg``, ``rpe`` and ``volume`` columns. The frame is
                not modified.

        Returns:
            One row per session that has a predecessor, with ``prev_weight``,
            ``prev_rpe``, ``prev_volume``, ``days_rest``, ``session_number``
            and the target column ``next_weight``.

        Raises:
            ValueError: If no exercise has at least 3 sessions.
        """
        per_exercise_frames = []

        # sort=False keeps exercises in order of first appearance
        for exercise, history in sessions.groupby('exercise', sort=False):
            history = history.sort_values('date').reset_index(drop=True)

            if len(history) < 3:  # Need at least 3 sessions to make sequences
                continue

            # Lag features - what happened in the previous session
            history['prev_weight'] = history['weight_kg'].shift(1)
            history['prev_rpe'] = history['rpe'].shift(1)
            history['prev_volume'] = history['volume'].shift(1)

            # Days between the previous session and this one
            history['days_rest'] = history['date'].diff().dt.days

            # Target: the weight lifted in the session that follows the
            # previous one, i.e. this row's own session. Using shift(-1) here
            # would pair session t-1 features (and the t-1 -> t rest gap) with
            # session t+1, skipping a whole session between inputs and target.
            history['next_weight'] = history['weight_kg']

            # Session count - experience with the exercise
            history['session_number'] = range(1, len(history) + 1)

            per_exercise_frames.append(history)

            # Log progression for this exercise
            start_weight = history['weight_kg'].iloc[0]
            end_weight = history['weight_kg'].iloc[-1]
            total_progress = end_weight - start_weight
            logger.info(f"  {exercise}: {start_weight:.1f}kg → {end_weight:.1f}kg ({total_progress:+.1f}kg total)")

        if not per_exercise_frames:
            raise ValueError("No exercises have enough data for modeling!")

        # Combine all exercises
        combined_data = pd.concat(per_exercise_frames, ignore_index=True)

        # The first session of each exercise has no predecessor; drop it
        # (and any row without a target weight)
        modeling_data = combined_data[
            combined_data['prev_weight'].notna() &
            combined_data['next_weight'].notna()
        ].copy()

        logger.info(f"Built {len(modeling_data)} training examples from progression sequences")
        return modeling_data

    def prepare_model_inputs(self, data: pd.DataFrame) -> "Tuple[pd.DataFrame, pd.Series, LabelEncoder]":
        """
        Turn progression rows into the feature matrix and target vector.

        Args:
            data: Output of ``build_progression_sequences``. The frame is
                not modified.

        Returns:
            ``(X, y, exercise_encoder)``: the feature DataFrame, the
            ``next_weight`` target Series and the fitted ``LabelEncoder``
            that maps exercise names to the ``exercise_encoded`` column.
        """
        # Work on a private copy so the caller's frame is left untouched
        data = data.copy()

        # Encode exercise names as numbers
        exercise_encoder = LabelEncoder()
        data['exercise_encoded'] = exercise_encoder.fit_transform(data['exercise'])

        # Handle missing RPE values - fill with exercise-specific median
        data['prev_rpe_clean'] = data.groupby('exercise')['prev_rpe'].transform(
            lambda x: x.fillna(x.median())
        )

        # Exercises with no RPE at all still have gaps; fall back to the
        # median of the `rpe` column across all rows
        overall_rpe = data['rpe'].median()
        data['prev_rpe_clean'] = data['prev_rpe_clean'].fillna(overall_rpe)

        # Rest days - assume 7 days if missing (weekly schedule)
        data['days_rest'] = data['days_rest'].fillna(7.0)

        # Volume - fill with median
        data['prev_volume'] = data['prev_volume'].fillna(data['prev_volume'].median())

        # Column order here defines the feature order the model is trained on
        feature_cols = [
            'prev_weight',        # Most important - what did I lift last time?
            'days_rest',          # Recovery time
            'prev_rpe_clean',     # How hard was last session?
            'prev_volume',        # Total work done last time
            'session_number',     # Experience level with this exercise
            'exercise_encoded'    # Different exercises behave differently
        ]

        X = data[feature_cols].copy()
        y = data['next_weight'].copy()

        logger.info(f"Final feature matrix: {X.shape}")
        logger.info(f"Target range: {y.min():.1f} - {y.max():.1f} kg")

        return X, y, exercise_encoder


class WorkoutProgressionModel:
    """Wraps a RandomForestRegressor that predicts workout progression."""

    def __init__(self, **params):
        """
        Build the underlying regressor.

        Keyword arguments override the baseline hyperparameters below and
        are passed straight to ``RandomForestRegressor``.
        """
        baseline = dict(
            n_estimators=200,       # number of trees
            max_depth=15,           # cap tree depth to limit overfitting
            min_samples_split=5,    # don't split tiny groups
            min_samples_leaf=2,     # every leaf keeps at least 2 samples
            random_state=42,        # reproducible forests
            n_jobs=-1,              # use all CPU cores
        )
        settings = {**baseline, **params}

        self.model = RandomForestRegressor(**settings)
        self.is_trained = False

    def train_and_evaluate(self, X: pd.DataFrame, y: pd.Series, test_size: float = 0.2):
        """
        Fit the forest on a train split and score it on the held-out split.

        Args:
            X: Feature matrix; must contain an ``exercise_encoded`` column,
                which is used to stratify the split.
            y: Target weights aligned with ``X``.
            test_size: Fraction of rows held out for testing.

        Returns:
            A dict with the fitted ``model``, the ``test_mae``, the held-out
            ``X_test`` / ``y_test`` and the predictions ``y_pred``.
        """
        # Stratifying on the exercise code keeps every exercise represented
        # in both partitions.
        partitions = train_test_split(
            X,
            y,
            test_size=test_size,
            random_state=42,
            stratify=X['exercise_encoded'],
        )
        X_train, X_test, y_train, y_test = partitions

        logger.info(f"Training on {len(X_train)} examples")
        logger.info(f"Testing on {len(X_test)} examples")

        self.model.fit(X_train, y_train)
        self.is_trained = True

        # Score on rows the forest has not seen
        predictions = self.model.predict(X_test)
        test_mae = mean_absolute_error(y_test, predictions)

        logger.info(f"Model trained! Test MAE: {test_mae:.2f} kg")

        return dict(
            model=self.model,
            test_mae=test_mae,
            X_test=X_test,
            y_test=y_test,
            y_pred=predictions,
        )


def analyze_exercise_performance(results: Dict, encoder: LabelEncoder) -> pd.DataFrame:
    """
    Summarise held-out prediction quality separately for each exercise.

    Args:
        results: Dict holding ``X_test``, ``y_test`` and ``y_pred`` (as
            returned by ``WorkoutProgressionModel.train_and_evaluate``).
        encoder: Fitted encoder used to turn ``exercise_encoded`` back into
            exercise names.

    Returns:
        A DataFrame indexed by exercise name with ``test_samples``,
        ``mae_kg``, ``avg_actual`` and ``avg_predicted``, rounded to 2
        decimals.
    """
    held_out = results['X_test']

    def summarise(group: pd.DataFrame) -> pd.Series:
        # Per-exercise sample count, error and mean actual/predicted weight
        actual = group['actual_weight']
        predicted = group['predicted_weight']
        return pd.Series({
            'test_samples': len(group),
            'mae_kg': mean_absolute_error(actual, predicted),
            'avg_actual': actual.mean(),
            'avg_predicted': predicted.mean()
        })

    # Attach truth, predictions and readable exercise names to the features
    scored = held_out.copy()
    scored['actual_weight'] = results['y_test'].values
    scored['predicted_weight'] = results['y_pred']
    scored['exercise'] = encoder.inverse_transform(held_out['exercise_encoded'])

    per_exercise = scored.groupby('exercise').apply(summarise, include_groups=False)
    return per_exercise.round(2)


def save_trained_model(model_obj: RandomForestRegressor, encoder: LabelEncoder,
                      test_mae: float, feature_names: list, exercise_stats: pd.DataFrame,
                      save_dir: str = '.'):
    """
    Persist the trained model, its label encoder and a JSON metadata file.

    Writes ``best_progression_model.pkl``, ``exercise_label_encoder.pkl``
    and ``model_info.json`` into ``save_dir``.

    Args:
        model_obj: Fitted regressor; its ``feature_importances_`` are read.
        encoder: Fitted exercise label encoder.
        test_mae: Held-out mean absolute error, stored in the metadata.
        feature_names: Feature names in the same order as the model's
            feature importances.
        exercise_stats: Per-exercise performance table, stored keyed by its
            index.
        save_dir: Target directory; created (including missing parents) if
            it does not exist.

    Returns:
        A DataFrame of features and importances, sorted by descending
        importance.
    """
    save_path = Path(save_dir)
    # parents=True so a nested output directory does not fail with
    # FileNotFoundError when intermediate folders are missing
    save_path.mkdir(parents=True, exist_ok=True)

    # Save the actual model files
    joblib.dump(model_obj, save_path / 'best_progression_model.pkl')
    joblib.dump(encoder, save_path / 'exercise_label_encoder.pkl')

    # Calculate feature importance
    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': model_obj.feature_importances_
    }).sort_values('importance', ascending=False)

    # Metadata stored alongside the model files; package versions are
    # recorded so the pickles can be matched to the environment that wrote them
    model_metadata = {
        'model_type': 'Random Forest Regressor',
        'mae': float(test_mae),
        'features': feature_names,
        'feature_importance': importance_df.set_index('feature')['importance'].to_dict(),
        'exercises': list(encoder.classes_),
        'exercise_performance': exercise_stats.to_dict(orient='index'),
        'package_versions': {
            'scikit_learn': __import__('sklearn').__version__,
            'numpy': np.__version__,
            'pandas': pd.__version__,
        },
        'training_date': pd.Timestamp.now().strftime('%Y-%m-%d'),
        'notes': 'Trained on personal Strong app data'
    }

    # Explicit encoding so the file does not depend on the platform default
    with open(save_path / 'model_info.json', 'w', encoding='utf-8') as f:
        json.dump(model_metadata, f, indent=2)

    logger.info(f"Model files saved to {save_path}")
    return importance_df


def main():
    """
    Execute the end-to-end pipeline: load, aggregate, filter, build features,
    train, report per-exercise results, save artefacts and run a smoke
    prediction. Any failure is logged and re-raised.
    """

    # Location of the Strong app CSV export and where artefacts are written;
    # edit DATA_FILE to match your environment.
    DATA_FILE = "/content/drive/MyDrive/ML Datasets/strong.csv"
    OUTPUT_DIR = "."

    logger.info("Starting workout progression model training")

    try:
        # Load the raw export, collapse sets into sessions, keep the
        # exercises with enough history
        processor = WorkoutDataProcessor()
        sessions = processor.create_workout_sessions(
            processor.load_and_clean_data(DATA_FILE)
        )
        modeling_sessions = processor.filter_exercises_for_modeling(sessions)

        # Lag features, then the final model inputs
        builder = ProgressionFeatureBuilder()
        X, y, exercise_encoder = builder.prepare_model_inputs(
            builder.build_progression_sequences(modeling_sessions)
        )

        # Fit and score
        outcome = WorkoutProgressionModel().train_and_evaluate(X, y)
        fitted_model = outcome['model']
        overall_mae = outcome['test_mae']

        # Per-exercise report
        per_exercise = analyze_exercise_performance(outcome, exercise_encoder)

        logger.info("Performance breakdown by exercise:")
        for name, row in per_exercise.iterrows():
            logger.info(f"  {name}: {row['mae_kg']:.1f}kg MAE ({int(row['test_samples'])} test samples)")

        # Persist model, encoder and metadata
        importance = save_trained_model(
            fitted_model,
            exercise_encoder,
            overall_mae,
            X.columns.tolist(),
            per_exercise,
            OUTPUT_DIR,
        )

        # Final summary
        banner = "=" * 50
        top_feature = importance['feature'].iloc[0]
        logger.info(banner)
        logger.info("Training completed successfully!")
        logger.info(f"Overall model accuracy: ±{overall_mae:.2f} kg")
        logger.info(f"Most important feature: {top_feature}")
        logger.info(f"Trained on {len(X)} workout progression examples")
        logger.info(banner)

        # Smoke test: one hand-written row, columns in training order
        probe = pd.DataFrame([{
            'prev_weight': 70.0,
            'days_rest': 3.0,
            'prev_rpe_clean': 7.5,
            'prev_volume': 2000.0,
            'session_number': 25,
            'exercise_encoded': 0,
        }])

        test_pred = fitted_model.predict(probe)[0]
        logger.info(f"Test prediction: {test_pred:.1f}kg (looks reasonable!)")

    except Exception as e:
        logger.error(f"Training failed: {e}")
        raise


if __name__ == "__main__":
    main()