<a href="https://colab.research.google.com/github/cs671/workout-progression-predictor/blob/main/ResistanceTraining_Progression_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
"""
Workout Progression Predictor - Model Training Pipeline
Author: Daniel Romeo
Date: 02/08/2025

This script trains a Random Forest model to predict optimal workout progressions
based on personal training data from the Strong fitness app.
"""

import pandas as pd
import numpy as np
import logging
from pathlib import Path
from typing import Tuple, Dict, Any

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import joblib
import json
import warnings

# Configure the root logger at import time: INFO level with timestamped records.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by every class and function in this file.
logger = logging.getLogger(__name__)

# Silence two noisy warning sources:
#   1. any warning whose message starts with "X does not have valid feature names"
#      (presumably emitted by scikit-learn at predict time - TODO confirm);
#   2. every RuntimeWarning raised from inside the pandas package.
warnings.filterwarnings('ignore', message='X does not have valid feature names')
warnings.filterwarnings('ignore', category=RuntimeWarning, module='pandas')


class WorkoutDataProcessor:
    """Load, clean, and aggregate raw per-set workout records.

    The processor renames the raw export columns to the standardized names in
    ``column_mapping``, drops unusable rows, and collapses individual sets
    into one row per (day, exercise) "session".
    """

    # Standardized columns that must exist after renaming for cleaning to work.
    REQUIRED_COLUMNS = ('date', 'exercise', 'weight_kg', 'reps')

    def __init__(self):
        # Raw export header -> standardized column name used by the pipeline.
        self.column_mapping = {
            'Exercise Name': 'exercise',
            'Date': 'date',
            'Weight': 'weight_kg',
            'Reps': 'reps',
            'RPE': 'rpe',
            'Workout Name': 'workout_name',
            'Duration': 'duration'
        }

    def load_and_clean_data(self, filepath: str) -> pd.DataFrame:
        """Read the CSV at ``filepath`` and return the cleaned set-level records.

        Cleaning steps: rename columns, coerce ``date`` to datetime and
        ``weight_kg``/``reps``/``rpe`` to numeric (unparseable values become
        NaN/NaT), drop rows missing a date, exercise, weight or reps, keep only
        rows with strictly positive weight and reps, and add
        ``volume = weight_kg * reps``.

        Args:
            filepath: Path of the CSV file to load.

        Returns:
            A new DataFrame of cleaned records. It always has an ``rpe``
            column (all NaN when the input had none).

        Raises:
            KeyError: If a required column is missing after renaming.
        """
        logger.info(f"Loading data from {filepath}")

        df = pd.read_csv(filepath)
        logger.info(f"Loaded {len(df)} raw records")

        # Standardize columns
        df = df.rename(columns=self.column_mapping)

        # Fail early with a readable message instead of an opaque KeyError
        # raised from the first column access further down.
        missing = [col for col in self.REQUIRED_COLUMNS if col not in df.columns]
        if missing:
            raise KeyError(f"Input data is missing required column(s) after renaming: {missing}")

        # RPE may be absent from the input. Session aggregation and the later
        # RPE imputation both index this column, so guarantee it exists.
        if 'rpe' not in df.columns:
            df['rpe'] = float('nan')

        # Clean data types
        df['date'] = pd.to_datetime(df['date'], errors='coerce')
        for col in ('weight_kg', 'reps', 'rpe'):
            df[col] = pd.to_numeric(df[col], errors='coerce')

        # Remove invalid records
        initial_rows = len(df)
        df = df.dropna(subset=list(self.REQUIRED_COLUMNS))
        df = df[(df['weight_kg'] > 0) & (df['reps'] > 0)].copy()
        df['volume'] = df['weight_kg'] * df['reps']

        logger.info(f"Cleaned data: {len(df)} records ({initial_rows - len(df)} removed)")
        return df

    def create_sessions(self, df: pd.DataFrame) -> pd.DataFrame:
        """Aggregate individual sets into one row per (calendar day, exercise).

        Per session the result holds the maximum ``weight_kg``, the summed
        ``volume``, and the mean ``reps`` and ``rpe``. The input frame is not
        modified.

        Args:
            df: Cleaned set-level records with ``date``, ``exercise``,
                ``weight_kg``, ``volume`` and ``reps`` columns; ``rpe`` is
                optional.

        Returns:
            Sessions sorted by exercise then date, with both ``session_date``
            (plain ``date`` objects) and ``date`` (datetime) columns.
        """
        # assign() builds a new frame, so the caller's DataFrame is untouched.
        sets = df.assign(session_date=df['date'].dt.date)
        if 'rpe' not in sets.columns:
            sets = sets.assign(rpe=float('nan'))

        sessions = sets.groupby(['session_date', 'exercise']).agg({
            'weight_kg': 'max',
            'volume': 'sum',
            'reps': 'mean',
            'rpe': 'mean'
        }).reset_index()

        sessions['date'] = pd.to_datetime(sessions['session_date'])
        sessions = sessions.sort_values(['exercise', 'date'])

        logger.info(f"Created {len(sessions)} exercise sessions")
        return sessions

    def filter_exercises(self, sessions: pd.DataFrame, min_sessions: int = 10, max_exercises: int = 8) -> pd.DataFrame:
        """Keep only the most frequently trained exercises.

        Args:
            sessions: Session-level data with an ``exercise`` column.
            min_sessions: Minimum number of sessions an exercise needs to be kept.
            max_exercises: Maximum number of exercises to keep, most frequent first.

        Returns:
            An independent copy of the rows belonging to the selected exercises.
        """
        # value_counts() is sorted by descending frequency, so head() keeps
        # the exercises with the most sessions.
        exercise_counts = sessions['exercise'].value_counts()
        top_exercises = exercise_counts[exercise_counts >= min_sessions].head(max_exercises)

        # Copy so that callers can add columns without touching `sessions`.
        filtered_sessions = sessions[sessions['exercise'].isin(top_exercises.index)].copy()

        logger.info(f"Selected {len(top_exercises)} exercises for modeling:")
        for exercise, count in top_exercises.items():
            logger.info(f"  {exercise}: {count} sessions")

        return filtered_sessions


class ProgressionFeatureEngineer:
    """Build lag features and the next-session target for progression modeling."""

    def create_progression_features(self, sessions: pd.DataFrame) -> pd.DataFrame:
        """Create per-exercise lag features and the ``next_weight`` target.

        For every exercise with at least 3 sessions, the sessions are ordered
        by date and annotated with the previous session's weight, RPE and
        volume, the days since the previous session, the following session's
        weight, and a 1-based session counter.

        Args:
            sessions: Session-level data with ``exercise``, ``date``,
                ``weight_kg``, ``rpe`` and ``volume`` columns.

        Returns:
            The rows that have both a previous and a next session, i.e. every
            session except the first and last of each exercise.

        Raises:
            ValueError: If no exercise has at least 3 sessions.
        """
        progression_data = []

        # One pass over the groups instead of re-filtering the whole frame per
        # exercise; sort=False keeps exercises in first-appearance order.
        for exercise, group in sessions.groupby('exercise', sort=False):
            if len(group) < 3:
                continue

            # sort_values/reset_index return new frames, so `sessions` is not modified.
            ex_data = group.sort_values('date').reset_index(drop=True)

            # Create lag features
            ex_data['prev_weight'] = ex_data['weight_kg'].shift(1)
            ex_data['prev_rpe'] = ex_data['rpe'].shift(1)
            ex_data['prev_volume'] = ex_data['volume'].shift(1)
            ex_data['days_rest'] = ex_data['date'].diff().dt.days
            # Prediction target: the top weight of the following session.
            ex_data['next_weight'] = ex_data['weight_kg'].shift(-1)
            ex_data['session_number'] = range(1, len(ex_data) + 1)

            progression_data.append(ex_data)

            # Log progression summary
            start_weight = ex_data['weight_kg'].iloc[0]
            end_weight = ex_data['weight_kg'].iloc[-1]
            total_change = end_weight - start_weight
            logger.info(f"  {exercise}: {start_weight:.1f}kg → {end_weight:.1f}kg ({total_change:+.1f}kg)")

        if not progression_data:
            raise ValueError("No exercises found with sufficient data")

        ml_data = pd.concat(progression_data, ignore_index=True)

        # Remove sessions without complete features/targets
        valid_data = ml_data[
            ml_data['prev_weight'].notna() &
            ml_data['next_weight'].notna()
        ].copy()

        logger.info(f"Created {len(valid_data)} training examples")
        return valid_data

    def prepare_features(self, data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series, LabelEncoder]:
        """Build the feature matrix ``X`` and target vector ``y``.

        Exercises are label-encoded. Missing previous-session RPE is filled
        with the per-exercise median, then with the overall ``rpe`` median.
        Missing ``days_rest`` becomes 7.0 and missing ``prev_volume`` becomes
        the overall ``prev_volume`` median. The input frame is not modified.

        NOTE(review): the imputation medians are computed over every row passed
        in, so if this runs before the train/test split (as in ``main``) the
        test rows contribute to them.

        Args:
            data: Output of ``create_progression_features``.

        Returns:
            ``(X, y, label_encoder)``: the six-column feature frame, the
            ``next_weight`` target, and the fitted exercise encoder.
        """
        # Work on a private copy so the caller's frame does not gain or lose values.
        data = data.copy()

        # Encode exercises
        label_encoder = LabelEncoder()
        data['exercise_encoded'] = label_encoder.fit_transform(data['exercise'])

        # Handle missing RPE values
        data['prev_rpe_filled'] = data.groupby('exercise')['prev_rpe'].transform(
            lambda x: x.fillna(x.median())
        )

        # Fill remaining missing values
        overall_rpe_median = data['rpe'].median()
        data['prev_rpe_filled'] = data['prev_rpe_filled'].fillna(overall_rpe_median)
        data['days_rest'] = data['days_rest'].fillna(7.0)
        data['prev_volume'] = data['prev_volume'].fillna(data['prev_volume'].median())

        # When no RPE was recorded at all, both medians are NaN and the
        # imputation cannot fill anything; make that visible.
        if data['prev_rpe_filled'].isna().any():
            logger.warning("RPE imputation left missing values in 'prev_rpe_filled'")

        # Create feature matrix
        feature_columns = [
            'prev_weight',
            'days_rest',
            'prev_rpe_filled',
            'prev_volume',
            'session_number',
            'exercise_encoded'
        ]

        X = data[feature_columns].copy()
        y = data['next_weight'].copy()

        logger.info(f"Feature matrix: {X.shape}, Target range: {y.min():.1f} - {y.max():.1f} kg")
        return X, y, label_encoder


class WorkoutProgressionModel:
    """Random Forest regressor wrapper for workout progression prediction."""

    def __init__(self, **model_params):
        """Create the underlying ``RandomForestRegressor``.

        Args:
            **model_params: Keyword arguments forwarded to
                ``RandomForestRegressor``; they override the defaults below.
        """
        default_params = {
            'n_estimators': 200,
            'max_depth': 15,
            'min_samples_split': 5,
            'min_samples_leaf': 2,
            'random_state': 42,
            'n_jobs': -1
        }
        self.model = RandomForestRegressor(**{**default_params, **model_params})

    def train(self, X: pd.DataFrame, y: pd.Series, test_size: float = 0.2,
              random_state: int = 42) -> Dict[str, Any]:
        """Fit the model on a train split and score it on the held-out split.

        The split is stratified on ``X['exercise_encoded']``. If stratification
        is impossible (for example an exercise with a single row, or a test set
        smaller than the number of exercises), a plain random split is used.

        Args:
            X: Feature matrix; must contain an ``exercise_encoded`` column.
            y: Target values aligned with ``X``.
            test_size: Share (or count) of rows held out for testing.
            random_state: Seed for the train/test split.

        Returns:
            Dict with the fitted ``model``, the test ``mae``, and the
            ``X_test``, ``y_test`` and ``y_pred`` used to compute it.
        """
        split_kwargs = {'test_size': test_size, 'random_state': random_state}
        strata = X['exercise_encoded']

        try:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, stratify=strata, **split_kwargs
            )
        except ValueError as exc:
            # Retry without stratification. A ValueError unrelated to
            # stratification is raised again by this second call.
            logger.warning(f"Stratified split failed ({exc}); falling back to a random split")
            X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

        logger.info(f"Training on {len(X_train)} examples, testing on {len(X_test)}")

        # Train model
        self.model.fit(X_train, y_train)

        # Evaluate
        y_pred = self.model.predict(X_test)
        mae = mean_absolute_error(y_test, y_pred)

        logger.info(f"Model trained. Test MAE: {mae:.2f} kg")

        return {
            'model': self.model,
            'mae': mae,
            'X_test': X_test,
            'y_test': y_test,
            'y_pred': y_pred
        }


def evaluate_exercise_performance(results: Dict, label_encoder: LabelEncoder) -> pd.DataFrame:
    """Summarize test-set prediction quality separately for each exercise.

    Args:
        results: Mapping with ``X_test`` (features including
            ``exercise_encoded``), ``y_test`` and ``y_pred``.
        label_encoder: Fitted encoder used to turn the encoded exercise ids
            back into exercise names.

    Returns:
        DataFrame indexed by exercise name with ``samples``, ``mae``,
        ``mean_actual`` and ``mean_predicted``, rounded to 2 decimals.
    """
    features = results['X_test']
    exercise_names = label_encoder.inverse_transform(features['exercise_encoded'])

    # assign() returns a new frame, leaving results['X_test'] unchanged.
    scored = features.assign(
        actual=results['y_test'].values,
        predicted=results['y_pred'],
        exercise=exercise_names,
    )

    def _summarize(group: pd.DataFrame) -> pd.Series:
        # Metrics for the test rows of a single exercise.
        return pd.Series({
            'samples': len(group),
            'mae': mean_absolute_error(group['actual'], group['predicted']),
            'mean_actual': group['actual'].mean(),
            'mean_predicted': group['predicted'].mean()
        })

    per_exercise = scored.groupby('exercise').apply(_summarize, include_groups=False)
    return per_exercise.round(2)


def save_model_artifacts(model: RandomForestRegressor, label_encoder: LabelEncoder,
                        mae: float, feature_columns: list, exercise_performance: pd.DataFrame,
                        output_dir: str = '.'):
    """Persist the trained model, the label encoder and a JSON metadata file.

    Writes ``best_progression_model.pkl``, ``exercise_label_encoder.pkl`` and
    ``model_info.json`` into ``output_dir``, creating the directory (and any
    missing parents) first.

    Args:
        model: Fitted regressor; its ``feature_importances_`` are recorded.
        label_encoder: Fitted exercise encoder; its classes are recorded.
        mae: Overall test mean absolute error.
        feature_columns: Feature names, in the order the model was trained on.
        exercise_performance: Per-exercise metrics indexed by exercise name.
        output_dir: Directory to write the artifacts into.

    Returns:
        DataFrame of ``feature``/``importance`` pairs, most important first.
    """
    # Local import: only needed here to record the installed version.
    import sklearn

    output_path = Path(output_dir)
    # parents=True so a nested output directory does not raise FileNotFoundError.
    output_path.mkdir(parents=True, exist_ok=True)

    # Save model and encoder
    joblib.dump(model, output_path / 'best_progression_model.pkl')
    joblib.dump(label_encoder, output_path / 'exercise_label_encoder.pkl')

    # Create feature importance analysis
    feature_importance = pd.DataFrame({
        'feature': feature_columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

    # Save comprehensive metadata
    model_info = {
        'model_type': 'Random Forest Regressor',
        'mae': float(mae),
        'features': feature_columns,
        'feature_importance': feature_importance.set_index('feature')['importance'].to_dict(),
        # tolist() converts numpy scalars to plain Python values, so that
        # non-string class labels are JSON-serializable too.
        'exercises': np.asarray(label_encoder.classes_).tolist(),
        'exercise_performance': exercise_performance.to_dict('index'),
        'package_versions': {
            'scikit_learn': sklearn.__version__,
            'numpy': np.__version__,
            'pandas': pd.__version__,
        }
    }

    with open(output_path / 'model_info.json', 'w', encoding='utf-8') as f:
        json.dump(model_info, f, indent=2)

    logger.info(f"Model artifacts saved to {output_path}")
    return feature_importance


def main(data_path: str = "/content/drive/MyDrive/ML Datasets/strong.csv",
         output_dir: str = "."):
    """Run the full training pipeline: load, featurize, train, evaluate, save.

    Args:
        data_path: CSV file with the raw workout records. The default points
            at a Google Drive location; pass your own path as needed.
        output_dir: Directory the model artifacts are written to.

    Raises:
        Exception: Any failure in the pipeline is logged and re-raised unchanged.
    """
    try:
        # Initialize processors
        processor = WorkoutDataProcessor()
        feature_engineer = ProgressionFeatureEngineer()

        # Load and process data
        raw_data = processor.load_and_clean_data(data_path)
        sessions = processor.create_sessions(raw_data)
        filtered_sessions = processor.filter_exercises(sessions)

        # Engineer features
        progression_data = feature_engineer.create_progression_features(filtered_sessions)
        X, y, label_encoder = feature_engineer.prepare_features(progression_data)

        # Train model
        model_trainer = WorkoutProgressionModel()
        results = model_trainer.train(X, y)

        # Evaluate performance
        exercise_performance = evaluate_exercise_performance(results, label_encoder)
        logger.info("Exercise-specific performance:")
        for exercise in exercise_performance.index:
            perf = exercise_performance.loc[exercise]
            logger.info(f"  {exercise}: {perf['mae']:.1f}kg MAE ({int(perf['samples'])} samples)")

        # Save artifacts (the returned feature-importance table is not needed here)
        save_model_artifacts(
            results['model'], label_encoder, results['mae'],
            X.columns.tolist(), exercise_performance, output_dir
        )

        logger.info("Training pipeline completed successfully!")
        logger.info(f"Final model MAE: {results['mae']:.2f} kg")

    except Exception as e:
        # Top-level boundary: record the failure, then let it propagate.
        logger.error(f"Training pipeline failed: {e}")
        raise


# Run the training pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

🏋️ GOOGLE COLAB WORKOUT MODEL TRAINER 🏋️
📦 STEP 1: Installing required packages...
Found existing installation: scikit-learn 1.7.1
Uninstalling scikit-learn-1.7.1:
  Successfully uninstalled scikit-learn-1.7.1
Found existing installation: numpy 2.3.2
Uninstalling numpy-2.3.2:
  Successfully uninstalled numpy-2.3.2
Found existing installation: pandas 2.3.1
Uninstalling pandas-2.3.1:
  Successfully uninstalled pandas-2.3.1
Found existing installation: xgboost 3.0.3
Uninstalling xgboost-3.0.3:
  Successfully uninstalled xgboost-3.0.3
Found existing installation: plotly 6.2.0
Uninstalling plotly-6.2.0:
  Successfully uninstalled plotly-6.2.0
Collecting scikit-learn
  Using cached scikit_learn-1.7.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting pandas
  Using cached pandas-2.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
Collecting xgboost
  Using cached xgboost-3.0.3-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 k

✅ Packages installed successfully!
Package versions:
  scikit-learn: 1.7.1
  numpy: 2.3.2
  pandas: 2.3.1

📁 STEP 1.5: Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Google Drive mounted successfully!

📁 STEP 2: Loading data from Google Drive...
✅ Loaded: /content/drive/MyDrive/ML Datasets/strong.csv
Raw data: 4086 records

🧹 STEP 3: Cleaning workout data...
Columns: ['Date', 'Workout Name', 'Duration', 'Exercise Name', 'Set Order', 'Weight', 'Reps', 'Distance', 'Seconds', 'Notes', 'Workout Notes', 'RPE']
Standardized columns: ['date', 'workout_name', 'duration', 'exercise', 'Set Order', 'weight_kg', 'reps', 'Distance', 'Seconds', 'Notes', 'Workout Notes', 'rpe']
Removed 76 invalid records
Clean data: 4010 records

📈 STEP 4: Creating workout sessions...
Created 785 exercise sessions

Top exercises for modeling:
   Lat Pulldown (Cable): 43 sessions
   Seated Row (Cable): 39 sessi

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


🎉 SUCCESS! Your workout AI model is ready!

📊 Model Summary:
   Model: Random Forest Regressor
   Accuracy: ±7.5kg average error
   Training samples: 255
   Exercises: 8
   Best feature: prev_weight

🎯 Exercise Coverage:
   Chest Press (Machine): 36 training sessions
   Lat Pulldown (Cable): 41 training sessions
   Leg Extension (Machine): 35 training sessions
   Lying Leg Curl (Machine): 27 training sessions
   Pec Deck (Machine): 22 training sessions
   Preacher Curl (Machine): 33 training sessions
   Seated Leg Curl (Machine): 24 training sessions
   Seated Row (Cable): 37 training sessions

📋 Next Steps for Streamlit Deployment:
1. 📁 Create GitHub repository
2. 📤 Upload the 3 downloaded model files
3. 📝 Add app.py (Streamlit code)
4. 📝 Add requirements.txt:
     streamlit>=1.28.0
     pandas>=2.3.1
     numpy>=2.3.2
     scikit-learn>=1.7.1
     xgboost>=2.0.0
     joblib>=1.3.0
     plotly>=5.15.0
5. 🚀 Deploy on share.streamlit.io

💡 Your AI can now predict optimal weights for:
 