## Imports

In [None]:
import pandas as pd
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from multiprocessing import cpu_count
import time
from xgboost import XGBRegressor

try:
    from tqdm import tqdm
    HAS_TQDM = True
except ImportError:
    HAS_TQDM = False

## Data Loading Function

In [2]:
def load_single_file(args):
    """
    Load a single CSV file. Designed for multiprocessing.
    
    Parameters:
    -----------
    args : tuple
        (file_path, meet_id, file_type) where file_type is 'entries' or 'meet'
    
    Returns:
    --------
    DataFrame or None
        Loaded dataframe with MeetID column, or None if file doesn't exist or fails
    """
    file_path, meet_id, file_type = args
    try:
        if os.path.exists(file_path):
            df = pd.read_csv(file_path)
            df['MeetID'] = meet_id
            return df
        return None
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return None


def load_federation_data(federation_path, federation_name):
    """
    Load entries.csv and meet.csv files from all subfolders in a federation directory.
    Uses multiprocessing to load files in parallel.
    
    Parameters:
    -----------
    federation_path : str
        Path to the federation folder (e.g., "../opl-data/meet-data/ipf")
    federation_name : str
        Name of the federation (for display purposes)
    
    Returns:
    --------
    entries_list : list
        List of entries dataframes
    meets_list : list
        List of meets dataframes
    """
    # Get all folders in federation directory
    folders = [f for f in os.listdir(federation_path) 
               if os.path.isdir(os.path.join(federation_path, f)) 
               and not f.startswith('.')]
    
    # Prepare list of all files to load
    file_tasks = []
    for folder in folders:
        entries_path = os.path.join(federation_path, folder, "entries.csv")
        meet_path = os.path.join(federation_path, folder, "meet.csv")
        file_tasks.append((entries_path, folder, 'entries'))
        file_tasks.append((meet_path, folder, 'meet'))
    
    # Use threading to load files in parallel
    # ThreadPoolExecutor is ideal for I/O-bound operations like reading CSV files
    # Use 12 workers to match logical processor count (or cpu_count() for portability)
    max_workers = min(12, cpu_count())
    
    start_time = time.time()
    entries_list = []
    meets_list = []
    
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks
        if HAS_TQDM:
            futures = {executor.submit(load_single_file, task): task for task in file_tasks}
            # Collect results with progress bar
            for future in tqdm(as_completed(futures), total=len(file_tasks), desc=f"Loading {federation_name}"):
                result = future.result()
                task = futures[future]
                if result is not None:
                    if task[2] == 'entries':
                        entries_list.append(result)
                    else:
                        meets_list.append(result)
        else:
            # Without tqdm, submit all tasks and collect results
            futures = {executor.submit(load_single_file, task): task for task in file_tasks}
            for future in as_completed(futures):
                result = future.result()
                task = futures[future]
                if result is not None:
                    if task[2] == 'entries':
                        entries_list.append(result)
                    else:
                        meets_list.append(result)
    
    elapsed_time = time.time() - start_time
    print(f"{federation_name}: Loaded {len(entries_list)} entries files, {len(meets_list)} meets files in {elapsed_time:.2f} seconds")
    
    return entries_list, meets_list

# Load data from all three federations
ipf_path = "../opl-data/meet-data/ipf"
usapl_path = "../opl-data/meet-data/usapl"
pa_path = "../opl-data/meet-data/pa"

# Load IPF data
ipf_entries, ipf_meets = load_federation_data(ipf_path, "IPF")
print(f"IPF: Loaded {len(ipf_entries)} entries files, {len(ipf_meets)} meets files")

# Load USAPL data
usapl_entries, usapl_meets = load_federation_data(usapl_path, "USAPL")
print(f"USAPL: Loaded {len(usapl_entries)} entries files, {len(usapl_meets)} meets files")

# Load PA data
pa_entries, pa_meets = load_federation_data(pa_path, "PA")
print(f"PA: Loaded {len(pa_entries)} entries files, {len(pa_meets)} meets files")

# Combine all entries from all federations
all_entries_list = ipf_entries + usapl_entries + pa_entries
all_meets_list = ipf_meets + usapl_meets + pa_meets

# Combine all entries
df_entries_all = pd.concat(all_entries_list, ignore_index=True)

# Combine all meets
df_meets_all = pd.concat(all_meets_list, ignore_index=True)

# Display results
print(f"\n{'='*60}")
print("COMBINED RESULTS:")
print(f"{'='*60}")
print(f"Total entries: {len(df_entries_all)}")
print(f"Total meets: {len(df_meets_all)}")
print(f"\nEntries breakdown:")
print(f"  - IPF: {len(pd.concat(ipf_entries, ignore_index=True)) if ipf_entries else 0} entries")
print(f"  - USAPL: {len(pd.concat(usapl_entries, ignore_index=True)) if usapl_entries else 0} entries")
print(f"  - PA: {len(pd.concat(pa_entries, ignore_index=True)) if pa_entries else 0} entries")
print(f"\nMeets breakdown:")
print(f"  - IPF: {len(pd.concat(ipf_meets, ignore_index=True)) if ipf_meets else 0} meets")
print(f"  - USAPL: {len(pd.concat(usapl_meets, ignore_index=True)) if usapl_meets else 0} meets")
print(f"  - PA: {len(pd.concat(pa_meets, ignore_index=True)) if pa_meets else 0} meets")
print(f"\nEntries dataframe shape: {df_entries_all.shape}")
print(f"Meets dataframe shape: {df_meets_all.shape}")

# Merge entries with meet data using MeetID
# This adds Date and other meet information to each entry
df_entries_all = df_entries_all.merge(
    df_meets_all,
    on='MeetID',
    how='left',  # Keep all entries even if meet data is missing
    suffixes=('', '_meet')  # In case of duplicate column names
)

# Convert Date to datetime 
df_entries_all['Date'] = pd.to_datetime(df_entries_all['Date'])

# Convert BirthDate to datetime 
df_entries_all['BirthDate'] = pd.to_datetime(df_entries_all['BirthDate'], errors='coerce')

# Calculate age in years: (Date - BirthDate) / 365.25
# Using 365.25 to account for leap years
df_entries_all['Age'] = (df_entries_all['Date'] - df_entries_all['BirthDate']).dt.days / 365.25

# For entries where BirthDate is missing but BirthYear exists, estimate age
# Use mid-year as approximation: Date.year - BirthYear
missing_age_mask = df_entries_all['Age'].isna() & df_entries_all['BirthYear'].notna()
df_entries_all.loc[missing_age_mask, 'Age'] = (
    df_entries_all.loc[missing_age_mask, 'Date'].dt.year - 
    df_entries_all.loc[missing_age_mask, 'BirthYear']
)


print(f"Age still missing: {df_entries_all['Age'].isna().sum()}")
print(f"\nAge statistics:")
print(df_entries_all['Age'].describe())

print(f"\n{'='*60}")
print("AFTER MERGE:")
print(f"{'='*60}")
print(f"Entries dataframe shape: {df_entries_all.shape}")
print(f"Columns: {list(df_entries_all.columns)}")
print(f"\nDate column info:")
print(f"  - Non-null dates: {df_entries_all['Date'].notna().sum()}")
print(f"  - Date range: {df_entries_all['Date'].min()} to {df_entries_all['Date'].max()}")
print("\nFirst few rows with merged data:")
print(df_entries_all[['Name', 'MeetID', 'Date', 'Division','Sex','WeightClassKg','Best3SquatKg', 'Best3BenchKg', 'Best3DeadliftKg']].head())

Loading IPF: 100%|██████████| 532/532 [00:00<00:00, 825.09it/s] 


IPF: Loaded 265 entries files, 265 meets files in 0.71 seconds
IPF: Loaded 265 entries files, 265 meets files


Loading USAPL: 100%|██████████| 8420/8420 [00:10<00:00, 791.70it/s]


USAPL: Loaded 4206 entries files, 4206 meets files in 10.71 seconds
USAPL: Loaded 4206 entries files, 4206 meets files


Loading PA: 100%|██████████| 1904/1904 [00:02<00:00, 890.11it/s]


PA: Loaded 951 entries files, 951 meets files in 2.17 seconds
PA: Loaded 951 entries files, 951 meets files

COMBINED RESULTS:
Total entries: 362415
Total meets: 5422

Entries breakdown:
  - IPF: 52706 entries
  - USAPL: 284587 entries
  - PA: 25122 entries

Meets breakdown:
  - IPF: 265 meets
  - USAPL: 4206 meets
  - PA: 951 meets

Entries dataframe shape: (362415, 52)
Meets dataframe shape: (5422, 7)
Age still missing: 24583

Age statistics:
count    381957.000000
mean         29.745970
std          13.333483
min           7.991786
25%          20.000000
50%          25.000000
75%          36.000000
max          94.000000
Name: Age, dtype: float64

AFTER MERGE:
Entries dataframe shape: (406540, 58)
Columns: ['Place', 'WeightClassKg', 'Sex', 'Division', 'Name', 'BirthYear', 'BodyweightKg', 'Country', 'Best3SquatKg', 'Best3BenchKg', 'Best3DeadliftKg', 'TotalKg', 'Bench4Kg', 'Event', 'Equipment', 'MeetID', 'Bench1Kg', 'Bench2Kg', 'Bench3Kg', 'BirthDate', 'Deadlift4Kg', 'Team', 'Squat4K

## Convert Pounds to Kilograms

In [3]:
# Convert any Lbs columns to Kg and fill missing Kg values
# Conversion factor: 1 lb = 0.453592 kg

# Define the mapping of Lbs columns to their corresponding Kg columns
lbs_to_kg_mapping = {
    'Best3SquatLbs': 'Best3SquatKg',
    'Best3BenchLbs': 'Best3BenchKg',
    'Best3DeadliftLbs': 'Best3DeadliftKg',
    'Squat1Lbs': 'Squat1Kg',
    'Squat2Lbs': 'Squat2Kg',
    'Squat3Lbs': 'Squat3Kg',
    'Bench1Lbs': 'Bench1Kg',
    'Bench2Lbs': 'Bench2Kg',
    'Bench3Lbs': 'Bench3Kg',
    'Deadlift1Lbs': 'Deadlift1Kg',
    'Deadlift2Lbs': 'Deadlift2Kg',
    'Deadlift3Lbs': 'Deadlift3Kg',
    'TotalLbs': 'TotalKg'
}

# Convert and fill missing values
for lbs_col, kg_col in lbs_to_kg_mapping.items():
    if lbs_col in df_entries_all.columns:
        # Convert lbs to kg (multiply by 0.453592)
        converted_values = df_entries_all[lbs_col] * 0.453592
        
        # Fill missing Kg values with converted Lbs values (only where Kg is missing and Lbs exists)
        mask = df_entries_all[kg_col].isna() & df_entries_all[lbs_col].notna()
        df_entries_all.loc[mask, kg_col] = converted_values[mask]
        
        print(f"Filled {mask.sum()} missing values in {kg_col} using {lbs_col}")
# Drop all Lbs columns after conversion
lbs_columns_to_drop = [col for col in df_entries_all.columns if col.endswith('Lbs')]
df_entries_all = df_entries_all.drop(columns=lbs_columns_to_drop)


Filled 251 missing values in Best3SquatKg using Best3SquatLbs
Filled 257 missing values in Best3BenchKg using Best3BenchLbs
Filled 261 missing values in Best3DeadliftKg using Best3DeadliftLbs
Filled 263 missing values in Squat1Kg using Squat1Lbs
Filled 263 missing values in Squat2Kg using Squat2Lbs
Filled 260 missing values in Squat3Kg using Squat3Lbs
Filled 264 missing values in Bench1Kg using Bench1Lbs
Filled 264 missing values in Bench2Kg using Bench2Lbs
Filled 262 missing values in Bench3Kg using Bench3Lbs
Filled 262 missing values in Deadlift1Kg using Deadlift1Lbs
Filled 261 missing values in Deadlift2Kg using Deadlift2Lbs
Filled 261 missing values in Deadlift3Kg using Deadlift3Lbs
Filled 245 missing values in TotalKg using TotalLbs


In [4]:
# Create a surrogate LifterID so we can build per-lifter competition histories
# We use a combination of fairly stable identity columns.

id_cols = [col for col in ['Name', 'Sex', 'BirthYear'] if col in df_entries_all.columns]

if id_cols:
    # Build a string key from available identity columns
    lifter_key = df_entries_all[id_cols].astype(str).agg('|'.join, axis=1)

    # Turn the key into an integer LifterID (stable within this run)
    df_entries_all['LifterID'] = lifter_key.astype('category').cat.codes

    print("Created LifterID using columns:", id_cols)
    print(df_entries_all[['LifterID'] + id_cols].head(10))
else:
    raise ValueError("No suitable columns found to construct LifterID.")

Created LifterID using columns: ['Name', 'Sex', 'BirthYear']
   LifterID               Name Sex  BirthYear
0    133140      Wei-Ling Chen   F     1982.0
1    133140      Wei-Ling Chen   F     1982.0
2     99408  Natalia Krikunova   F     1981.0
3     99408  Natalia Krikunova   F     1981.0
4    104691       Oxana Sirant   F     1979.0
5    104691       Oxana Sirant   F     1979.0
6      9948        Aneta Rutka   F     1982.0
7      9948        Aneta Rutka   F     1982.0
8     21493  Bénédicte LePanse   F     1978.0
9     21493  Bénédicte LePanse   F     1978.0


In [5]:
# 2.2 Group by LifterID and sort competitions chronologically
# This creates a competition history for each lifter.

# Sort by lifter and date so each lifter's rows are in chronological order
df_entries_all = df_entries_all.sort_values(['LifterID', 'Date'])

# Create an explicit competition index per lifter (1 = first recorded meet, 2 = second, ...)
df_entries_all['CompIndex'] = df_entries_all.groupby('LifterID').cumcount() + 1

print("Per-lifter competition history created using 'LifterID' and 'Date'.")
print(df_entries_all[['LifterID', 'Date', 'CompIndex']].head(10))

Per-lifter competition history created using 'LifterID' and 'Date'.
        LifterID       Date  CompIndex
378313         0 2002-01-01          1
378312         0 2002-05-29          2
378351         1 2002-01-01          1
378350         1 2002-05-29          2
378679         2 2002-01-01          1
378678         2 2002-05-29          2
5177           3 2002-02-16          1
5176           3 2002-09-25          2
405711         4 2023-11-13          1
405712         4 2023-12-17          2


In [6]:
# 2.3 Filter valid lifters
# - Keep only lifters with at least 2 competitions
# - Optionally filter by other criteria (e.g., only SBD events, only Raw equipment)

# Count number of competitions per lifter
comp_counts = df_entries_all.groupby('LifterID')['CompIndex'].max().rename('NumComps')

df_entries_all = df_entries_all.merge(comp_counts, on='LifterID', how='left')

# Keep only lifters with at least 2 competitions
min_comps = 2
valid_mask = df_entries_all['NumComps'] >= min_comps

print(f"Total rows before filtering by competition count: {len(df_entries_all)}")
df_entries_all = df_entries_all[valid_mask].copy()
print(f"Total rows after keeping lifters with >= {min_comps} competitions: {len(df_entries_all)}")
print(f"Number of unique lifters remaining: {df_entries_all['LifterID'].nunique()}")

# Optional: filter to SBD + Raw if those columns exist
if 'Event' in df_entries_all.columns:
    before = len(df_entries_all)
    df_entries_all = df_entries_all[df_entries_all['Event'] == 'SBD']
    print(f"After filtering to SBD events: {len(df_entries_all)} rows (dropped {before - len(df_entries_all)})")

if 'Equipment' in df_entries_all.columns:
    before = len(df_entries_all)
    df_entries_all = df_entries_all[df_entries_all['Equipment'] == 'Raw']
    print(f"After filtering to Raw equipment: {len(df_entries_all)} rows (dropped {before - len(df_entries_all)})")



# Filter out rows where TotalKg is null or NaN
before = len(df_entries_all)
df_entries_all = df_entries_all[df_entries_all['TotalKg'].notna()]
print(f"After filtering out rows with missing TotalKg: {len(df_entries_all)} rows (dropped {before - len(df_entries_all)})")

print(f"Number of unique lifters remaining: {df_entries_all['LifterID'].nunique()}")

Total rows before filtering by competition count: 406540
Total rows after keeping lifters with >= 2 competitions: 349887
Number of unique lifters remaining: 80612
After filtering to SBD events: 294766 rows (dropped 55121)
After filtering to Raw equipment: 233466 rows (dropped 61300)
After filtering out rows with missing TotalKg: 225149 rows (dropped 8317)
Number of unique lifters remaining: 62404


In [7]:
# Drop the specified columns
# NOTE: Date is preserved for time-based features in Step 3
columns_to_drop = ['Name', 'Bench1Kg', 'KoreanName', 'ChineseName', 'Bench2Kg', 'Bench3Kg', 'BirthDate', 'Squat1Kg', 'Squat2Kg', 'Squat3Kg', 
                    'Deadlift1Kg', 'Deadlift2Kg', 'Deadlift3Kg', 'Country', 'BirthYear', 'BodyweightKg', 'Place', 'Event', 'Equipment',
                    'Deadlift4Kg', 'Bench4Kg', 'Squat4Kg', 'Team', 'MeetID', 'State', 'AgeRange', 'DeadliftEquipment','Tested', 'CyrillicName', 'MeetCountry', 'MeetState', 'MeetTown', 'MeetName', 'Federation']
df_cleaned = df_entries_all.drop(columns=columns_to_drop)

In [8]:
# Display the cleaned and filtered dataframe
print("\n" + "="*60)
print("FINAL CLEANED AND FILTERED DATAFRAME:")
print("="*60)
print("\nCleaned DataFrame Info:")
print(df_cleaned.info())

print("\nFirst few rows of cleaned data:")
print(df_cleaned.head(10))


FINAL CLEANED AND FILTERED DATAFRAME:

Cleaned DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Index: 225149 entries, 20 to 406526
Data columns (total 12 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   WeightClassKg    221382 non-null  object        
 1   Sex              225149 non-null  object        
 2   Division         225149 non-null  object        
 3   Best3SquatKg     225138 non-null  float64       
 4   Best3BenchKg     225131 non-null  float64       
 5   Best3DeadliftKg  225141 non-null  float64       
 6   TotalKg          225149 non-null  float64       
 7   Age              221083 non-null  float64       
 8   Date             225149 non-null  datetime64[ns]
 9   LifterID         225149 non-null  int32         
 10  CompIndex        225149 non-null  int64         
 11  NumComps         225149 non-null  int64         
dtypes: datetime64[ns](1), float64(5), int32(1), int64(2), object(3)
memo

In [9]:
# Step 3: Create Training Examples (Sliding Window Approach)
# For each lifter with N competitions, create N-1 training examples
# Each example uses previous competitions to predict the next competition's lifts

import numpy as np

def extract_historical_features(previous_comps):
    """
    Extract historical performance features from previous competitions.
    
    Parameters:
    -----------
    previous_comps : DataFrame
        DataFrame containing all previous competitions for a lifter
    
    Returns:
    --------
    dict : Dictionary of historical features
    """
    features = {}
    
    # Previous lifts (last competition)
    if len(previous_comps) > 0:
        last_comp = previous_comps.iloc[-1]
        features['PrevBest3SquatKg'] = last_comp.get('Best3SquatKg', np.nan)
        features['PrevBest3BenchKg'] = last_comp.get('Best3BenchKg', np.nan)
        features['PrevBest3DeadliftKg'] = last_comp.get('Best3DeadliftKg', np.nan)
        features['PrevTotalKg'] = last_comp.get('TotalKg', np.nan)
    else:
        features['PrevBest3SquatKg'] = np.nan
        features['PrevBest3BenchKg'] = np.nan
        features['PrevBest3DeadliftKg'] = np.nan
        features['PrevTotalKg'] = np.nan
    
    # All-time PRs from previous competitions
    valid_squat = previous_comps['Best3SquatKg'].dropna()
    valid_bench = previous_comps['Best3BenchKg'].dropna()
    valid_deadlift = previous_comps['Best3DeadliftKg'].dropna()
    valid_total = previous_comps['TotalKg'].dropna()
    
    features['PRBest3SquatKg'] = valid_squat.max() if len(valid_squat) > 0 else np.nan
    features['PRBest3BenchKg'] = valid_bench.max() if len(valid_bench) > 0 else np.nan
    features['PRBest3DeadliftKg'] = valid_deadlift.max() if len(valid_deadlift) > 0 else np.nan
    features['PRTotalKg'] = valid_total.max() if len(valid_total) > 0 else np.nan
    
    # Averages of last 3 competitions (or all if fewer than 3)
    n_recent = min(3, len(previous_comps))
    if n_recent > 0:
        recent_comps = previous_comps.tail(n_recent)
        features['AvgBest3SquatKg_Last3'] = recent_comps['Best3SquatKg'].mean()
        features['AvgBest3BenchKg_Last3'] = recent_comps['Best3BenchKg'].mean()
        features['AvgBest3DeadliftKg_Last3'] = recent_comps['Best3DeadliftKg'].mean()
        features['AvgTotalKg_Last3'] = recent_comps['TotalKg'].mean()
    else:
        features['AvgBest3SquatKg_Last3'] = np.nan
        features['AvgBest3BenchKg_Last3'] = np.nan
        features['AvgBest3DeadliftKg_Last3'] = np.nan
        features['AvgTotalKg_Last3'] = np.nan
    
    # Consistency (standard deviation) across all previous competitions
    features['StdBest3SquatKg'] = valid_squat.std() if len(valid_squat) > 1 else np.nan
    features['StdBest3BenchKg'] = valid_bench.std() if len(valid_bench) > 1 else np.nan
    features['StdBest3DeadliftKg'] = valid_deadlift.std() if len(valid_deadlift) > 1 else np.nan
    features['StdTotalKg'] = valid_total.std() if len(valid_total) > 1 else np.nan
    
    return features

def extract_time_features(current_comp, previous_comps, dates):
    """
    Extract time-based features.
    
    Parameters:
    -----------
    current_comp : Series
        Current competition row
    previous_comps : DataFrame
        Previous competitions
    dates : Series
        Dates for all competitions (current + previous)
    
    Returns:
    --------
    dict : Dictionary of time-based features
    """
    features = {}
    
    # Competition count (which competition number this is)
    features['CompIndex'] = current_comp.get('CompIndex', np.nan)
    
    # Age at current competition
    features['Age'] = current_comp.get('Age', np.nan)
    
    # Days since last competition
    if len(previous_comps) > 0 and 'Date' in current_comp and pd.notna(current_comp['Date']):
        last_date = previous_comps.iloc[-1].get('Date')
        if pd.notna(last_date):
            days_diff = (current_comp['Date'] - last_date).days
            features['DaysSinceLastComp'] = days_diff
        else:
            features['DaysSinceLastComp'] = np.nan
    else:
        features['DaysSinceLastComp'] = np.nan
    
    # Average days between competitions (for this lifter)
    if len(previous_comps) > 0 and 'Date' in previous_comps.columns:
        valid_dates = previous_comps['Date'].dropna()
        if len(valid_dates) > 1:
            date_diffs = valid_dates.diff().dropna()
            features['AvgDaysBetweenComps'] = date_diffs.dt.days.mean() if len(date_diffs) > 0 else np.nan
        else:
            features['AvgDaysBetweenComps'] = np.nan
    else:
        features['AvgDaysBetweenComps'] = np.nan
    
    return features

def extract_context_features(current_comp):
    """
    Extract context features from current competition.
    
    Parameters:
    -----------
    current_comp : Series
        Current competition row
    
    Returns:
    --------
    dict : Dictionary of context features
    """
    features = {}
    
    # Weight class (keep as string for now, can encode later)
    features['WeightClassKg'] = current_comp.get('WeightClassKg', np.nan)
    
    # Sex (M/F)
    features['Sex'] = current_comp.get('Sex', np.nan)
    
    # Division (Open, Junior, etc.)
    features['Division'] = current_comp.get('Division', np.nan)
    
    return features

def extract_trend_features(all_previous_comps):
    """
    Extract trend/improvement features from all previous competitions.
    
    Parameters:
    -----------
    all_previous_comps : DataFrame
        All previous competitions sorted by CompIndex
    
    Returns:
    --------
    dict : Dictionary of trend features
    """
    features = {}
    
    if len(all_previous_comps) == 0:
        # No previous competitions
        features['SquatImprovementRate'] = np.nan
        features['BenchImprovementRate'] = np.nan
        features['DeadliftImprovementRate'] = np.nan
        features['SquatImprovementDirection'] = 0
        features['BenchImprovementDirection'] = 0
        features['DeadliftImprovementDirection'] = 0
        features['CompsSincePRSquat'] = np.nan
        features['CompsSincePRBench'] = np.nan
        features['CompsSincePRDeadlift'] = np.nan
        return features
    
    # Improvement rate (slope) - change per competition
    valid_comps = all_previous_comps.dropna(subset=['Best3SquatKg', 'Best3BenchKg', 'Best3DeadliftKg'], how='all')
    
    if len(valid_comps) >= 2:
        # Calculate slope using linear regression (simple: (last - first) / (n_comps - 1))
        comp_indices = valid_comps['CompIndex'].values
        squat_values = valid_comps['Best3SquatKg'].values
        bench_values = valid_comps['Best3BenchKg'].values
        deadlift_values = valid_comps['Best3DeadliftKg'].values
        
        # Squat improvement rate
        if len(squat_values[~np.isnan(squat_values)]) >= 2:
            valid_squat = ~np.isnan(squat_values)
            if valid_squat.sum() >= 2:
                valid_indices = comp_indices[valid_squat]
                valid_squat_vals = squat_values[valid_squat]
                if len(valid_indices) > 1:
                    features['SquatImprovementRate'] = (valid_squat_vals[-1] - valid_squat_vals[0]) / (valid_indices[-1] - valid_indices[0])
                else:
                    features['SquatImprovementRate'] = np.nan
            else:
                features['SquatImprovementRate'] = np.nan
        else:
            features['SquatImprovementRate'] = np.nan
        
        # Bench improvement rate
        if len(bench_values[~np.isnan(bench_values)]) >= 2:
            valid_bench = ~np.isnan(bench_values)
            if valid_bench.sum() >= 2:
                valid_indices = comp_indices[valid_bench]
                valid_bench_vals = bench_values[valid_bench]
                if len(valid_indices) > 1:
                    features['BenchImprovementRate'] = (valid_bench_vals[-1] - valid_bench_vals[0]) / (valid_indices[-1] - valid_indices[0])
                else:
                    features['BenchImprovementRate'] = np.nan
            else:
                features['BenchImprovementRate'] = np.nan
        else:
            features['BenchImprovementRate'] = np.nan
        
        # Deadlift improvement rate
        if len(deadlift_values[~np.isnan(deadlift_values)]) >= 2:
            valid_deadlift = ~np.isnan(deadlift_values)
            if valid_deadlift.sum() >= 2:
                valid_indices = comp_indices[valid_deadlift]
                valid_deadlift_vals = deadlift_values[valid_deadlift]
                if len(valid_indices) > 1:
                    features['DeadliftImprovementRate'] = (valid_deadlift_vals[-1] - valid_deadlift_vals[0]) / (valid_indices[-1] - valid_indices[0])
                else:
                    features['DeadliftImprovementRate'] = np.nan
            else:
                features['DeadliftImprovementRate'] = np.nan
        else:
            features['DeadliftImprovementRate'] = np.nan
        
        # Improvement direction (comparing last 2 competitions)
        if len(valid_comps) >= 2:
            last_two = valid_comps.tail(2)
            last_squat = last_two.iloc[-1]['Best3SquatKg']
            prev_squat = last_two.iloc[-2]['Best3SquatKg']
            if pd.notna(last_squat) and pd.notna(prev_squat):
                features['SquatImprovementDirection'] = 1 if last_squat > prev_squat else (-1 if last_squat < prev_squat else 0)
            else:
                features['SquatImprovementDirection'] = 0
            
            last_bench = last_two.iloc[-1]['Best3BenchKg']
            prev_bench = last_two.iloc[-2]['Best3BenchKg']
            if pd.notna(last_bench) and pd.notna(prev_bench):
                features['BenchImprovementDirection'] = 1 if last_bench > prev_bench else (-1 if last_bench < prev_bench else 0)
            else:
                features['BenchImprovementDirection'] = 0
            
            last_deadlift = last_two.iloc[-1]['Best3DeadliftKg']
            prev_deadlift = last_two.iloc[-2]['Best3DeadliftKg']
            if pd.notna(last_deadlift) and pd.notna(prev_deadlift):
                features['DeadliftImprovementDirection'] = 1 if last_deadlift > prev_deadlift else (-1 if last_deadlift < prev_deadlift else 0)
            else:
                features['DeadliftImprovementDirection'] = 0
        else:
            features['SquatImprovementDirection'] = 0
            features['BenchImprovementDirection'] = 0
            features['DeadliftImprovementDirection'] = 0
        
        # Comps since PR (number of competitions since achieving personal record)
        pr_squat = all_previous_comps['Best3SquatKg'].max()
        pr_bench = all_previous_comps['Best3BenchKg'].max()
        pr_deadlift = all_previous_comps['Best3DeadliftKg'].max()
        
        if pd.notna(pr_squat):
            pr_squat_comps = all_previous_comps[all_previous_comps['Best3SquatKg'] == pr_squat]
            if len(pr_squat_comps) > 0:
                last_pr_comp = pr_squat_comps['CompIndex'].max()
                last_comp = all_previous_comps['CompIndex'].max()
                features['CompsSincePRSquat'] = last_comp - last_pr_comp
            else:
                features['CompsSincePRSquat'] = np.nan
        else:
            features['CompsSincePRSquat'] = np.nan
        
        if pd.notna(pr_bench):
            pr_bench_comps = all_previous_comps[all_previous_comps['Best3BenchKg'] == pr_bench]
            if len(pr_bench_comps) > 0:
                last_pr_comp = pr_bench_comps['CompIndex'].max()
                last_comp = all_previous_comps['CompIndex'].max()
                features['CompsSincePRBench'] = last_comp - last_pr_comp
            else:
                features['CompsSincePRBench'] = np.nan
        else:
            features['CompsSincePRBench'] = np.nan
        
        if pd.notna(pr_deadlift):
            pr_deadlift_comps = all_previous_comps[all_previous_comps['Best3DeadliftKg'] == pr_deadlift]
            if len(pr_deadlift_comps) > 0:
                last_pr_comp = pr_deadlift_comps['CompIndex'].max()
                last_comp = all_previous_comps['CompIndex'].max()
                features['CompsSincePRDeadlift'] = last_comp - last_pr_comp
            else:
                features['CompsSincePRDeadlift'] = np.nan
        else:
            features['CompsSincePRDeadlift'] = np.nan
    else:
        # Not enough competitions for trend analysis
        features['SquatImprovementRate'] = np.nan
        features['BenchImprovementRate'] = np.nan
        features['DeadliftImprovementRate'] = np.nan
        features['SquatImprovementDirection'] = 0
        features['BenchImprovementDirection'] = 0
        features['DeadliftImprovementDirection'] = 0
        features['CompsSincePRSquat'] = np.nan
        features['CompsSincePRBench'] = np.nan
        features['CompsSincePRDeadlift'] = np.nan
    
    return features

def create_training_examples(df):
    """
    Create training examples using sliding window approach.
    
    For each lifter with N competitions, creates N-1 training examples:
    - Example 1: Use comp 1 → predict comp 2
    - Example 2: Use comps 1-2 → predict comp 3
    - Example 3: Use comps 1-3 → predict comp 4
    - etc.
    
    Parameters:
    -----------
    df : DataFrame
        DataFrame with per-lifter competition histories (sorted by LifterID and Date)
        Must have columns: LifterID, CompIndex, Date, and lift columns
    
    Returns:
    --------
    df_training : DataFrame
        DataFrame with features (X) and targets (y) for training
    """
    training_examples = []
    
    # Group by LifterID
    for lifter_id, lifter_group in df.groupby('LifterID'):
        # Sort by CompIndex to ensure chronological order
        lifter_group = lifter_group.sort_values('CompIndex').reset_index(drop=True)
        
        # For each competition from 2nd onwards (we need at least 1 previous comp)
        for idx in range(1, len(lifter_group)):
            current_comp = lifter_group.iloc[idx]
            previous_comps = lifter_group.iloc[:idx]  # All competitions before current
            
            # Extract all features
            example = {}
            
            # Metadata
            example['LifterID'] = lifter_id
            example['CompIndex'] = current_comp['CompIndex']
            
            # Historical features
            hist_features = extract_historical_features(previous_comps)
            example.update(hist_features)
            
            # Time-based features
            time_features = extract_time_features(current_comp, previous_comps, lifter_group['Date'])
            example.update(time_features)
            
            # Context features
            context_features = extract_context_features(current_comp)
            example.update(context_features)
            
            # Trend features
            trend_features = extract_trend_features(previous_comps)
            example.update(trend_features)
            
            # Target variables (what we're trying to predict)
            example['NextBest3SquatKg'] = current_comp.get('Best3SquatKg', np.nan)
            example['NextBest3BenchKg'] = current_comp.get('Best3BenchKg', np.nan)
            example['NextBest3DeadliftKg'] = current_comp.get('Best3DeadliftKg', np.nan)
            
            training_examples.append(example)
    
    # Convert to DataFrame
    df_training = pd.DataFrame(training_examples)
    
    return df_training

print("Feature extraction functions defined.")
print("Ready to create training examples...")

Feature extraction functions defined.
Ready to create training examples...


In [10]:
# Step 3 (FAST + FULL HISTORY REGRESSION)
import numpy as np

# Sort once
df = df_cleaned.sort_values(['LifterID', 'CompIndex']).copy()
g = df.groupby('LifterID', sort=False)

# ---- Previous competition features
df['PrevBest3SquatKg'] = g['Best3SquatKg'].shift(1)
df['PrevBest3BenchKg'] = g['Best3BenchKg'].shift(1)
df['PrevBest3DeadliftKg'] = g['Best3DeadliftKg'].shift(1)
df['PrevTotalKg'] = g['TotalKg'].shift(1)

# ---- PRs (best up to previous comp)
df['PRBest3SquatKg'] = g['Best3SquatKg'].cummax().shift(1)
df['PRBest3BenchKg'] = g['Best3BenchKg'].cummax().shift(1)
df['PRBest3DeadliftKg'] = g['Best3DeadliftKg'].cummax().shift(1)
df['PRTotalKg'] = g['TotalKg'].cummax().shift(1)

# ---- Rolling averages (last 3 previous comps)
for col in ['Best3SquatKg', 'Best3BenchKg', 'Best3DeadliftKg', 'TotalKg']:
    df[f'Avg{col}_Last3'] = (
        g[col].rolling(3, min_periods=1).mean().shift(1)
        .reset_index(level=0, drop=True)
    )

# ---- Std across all previous comps
for col in ['Best3SquatKg', 'Best3BenchKg', 'Best3DeadliftKg', 'TotalKg']:
    df[f'Std{col}'] = (
        g[col].expanding().std().shift(1)
        .reset_index(level=0, drop=True)
    )

# ---- Time-based features
df['DaysSinceLastComp'] = g['Date'].diff().dt.days
df['AvgDaysBetweenComps'] = (
    g['Date'].diff().dt.days
    .groupby(df['LifterID']).expanding().mean().shift(1)
    .reset_index(level=0, drop=True)
)

# ---- FULL HISTORY REGRESSION SLOPE (vectorized)
def full_history_slope(df, ycol):
    x = df['CompIndex'].astype(float)
    y = df[ycol].astype(float)
    mask = y.notna()

    # cumulative sums over valid points only
    x_valid = x.where(mask)
    y_valid = y.where(mask)

    n = mask.groupby(df['LifterID']).cumsum().shift(1)
    sum_x = x_valid.groupby(df['LifterID']).cumsum().shift(1)
    sum_y = y_valid.groupby(df['LifterID']).cumsum().shift(1)
    sum_x2 = (x_valid * x_valid).groupby(df['LifterID']).cumsum().shift(1)
    sum_xy = (x_valid * y_valid).groupby(df['LifterID']).cumsum().shift(1)

    denom = (n * sum_x2) - (sum_x * sum_x)
    slope = (n * sum_xy - sum_x * sum_y) / denom

    # need at least 2 valid points and nonzero denom
    slope = slope.where((n >= 2) & (denom != 0))
    return slope

df['SquatImprovementRate'] = full_history_slope(df, 'Best3SquatKg')
df['BenchImprovementRate'] = full_history_slope(df, 'Best3BenchKg')
df['DeadliftImprovementRate'] = full_history_slope(df, 'Best3DeadliftKg')

# ---- Improvement direction (same logic: compare last two comps)
prev_squat_1 = g['Best3SquatKg'].shift(1)
prev_squat_2 = g['Best3SquatKg'].shift(2)
prev_bench_1 = g['Best3BenchKg'].shift(1)
prev_bench_2 = g['Best3BenchKg'].shift(2)
prev_dead_1 = g['Best3DeadliftKg'].shift(1)
prev_dead_2 = g['Best3DeadliftKg'].shift(2)

df['SquatImprovementDirection'] = np.sign(prev_squat_1 - prev_squat_2).fillna(0).astype(int)
df['BenchImprovementDirection'] = np.sign(prev_bench_1 - prev_bench_2).fillna(0).astype(int)
df['DeadliftImprovementDirection'] = np.sign(prev_dead_1 - prev_dead_2).fillna(0).astype(int)

# ---- Comps since PR (exact)
def comps_since_pr(series):
    pr = series.groupby(df['LifterID']).cummax()
    is_pr = series == pr
    last_pr_idx = df['CompIndex'].where(is_pr).groupby(df['LifterID']).ffill()
    last_pr_idx_prev = last_pr_idx.groupby(df['LifterID']).shift(1)
    return (df['CompIndex'] - 1) - last_pr_idx_prev

df['CompsSincePRSquat'] = comps_since_pr(df['Best3SquatKg'])
df['CompsSincePRBench'] = comps_since_pr(df['Best3BenchKg'])
df['CompsSincePRDeadlift'] = comps_since_pr(df['Best3DeadliftKg'])

# ---- Targets (current comp)
df['NextBest3SquatKg'] = df['Best3SquatKg']
df['NextBest3BenchKg'] = df['Best3BenchKg']
df['NextBest3DeadliftKg'] = df['Best3DeadliftKg']

# ---- Build training set (drop first comp per lifter)
feature_cols = [
    'LifterID', 'CompIndex',
    'PrevBest3SquatKg', 'PrevBest3BenchKg', 'PrevBest3DeadliftKg', 'PrevTotalKg',
    'PRBest3SquatKg', 'PRBest3BenchKg', 'PRBest3DeadliftKg', 'PRTotalKg',
    'AvgBest3SquatKg_Last3', 'AvgBest3BenchKg_Last3', 'AvgBest3DeadliftKg_Last3', 'AvgTotalKg_Last3',
    'StdBest3SquatKg', 'StdBest3BenchKg', 'StdBest3DeadliftKg', 'StdTotalKg',
    'Age', 'DaysSinceLastComp', 'AvgDaysBetweenComps',
    'WeightClassKg', 'Sex', 'Division',
    'SquatImprovementRate', 'BenchImprovementRate', 'DeadliftImprovementRate',
    'SquatImprovementDirection', 'BenchImprovementDirection', 'DeadliftImprovementDirection',
    'CompsSincePRSquat', 'CompsSincePRBench', 'CompsSincePRDeadlift',
    'NextBest3SquatKg', 'NextBest3BenchKg', 'NextBest3DeadliftKg'
]

df_training = df.loc[df['CompIndex'] >= 2, feature_cols].copy()

print(f"Training examples created: {len(df_training)}")
print(f"Training data shape: {df_training.shape}")
print(df_training.head())

Training examples created: 169681
Training data shape: (169681, 36)
    LifterID  CompIndex  PrevBest3SquatKg  PrevBest3BenchKg  \
21         9          2             182.5             105.0   
22         9          3             190.0             112.5   
32        13          2             245.0             165.0   
34        13          4             272.5             155.0   
36        14          2               NaN               NaN   

    PrevBest3DeadliftKg  PrevTotalKg  PRBest3SquatKg  PRBest3BenchKg  \
21                185.0        472.5           182.5           105.0   
22                197.5        500.0           190.0           112.5   
32                257.5        667.5           245.0           165.0   
34                280.0        707.5           272.5           165.0   
36                  NaN          NaN           287.5           195.0   

    PRBest3DeadliftKg  PRTotalKg  ...  DeadliftImprovementRate  \
21              185.0      472.5  ...                 

## Create Training Examples

In [11]:
# Step 4: Feature Engineering - Encode Categorical Features
# Convert categorical features (WeightClassKg, Sex, Division) to numeric

try:
    # Check if df_training exists
    if 'df_training' not in locals() and 'df_training' not in globals():
        raise NameError("df_training not found. Please run Step 3 first.")
    
    if df_training.empty:
        raise ValueError("df_training is empty. Cannot encode features.")
    
    print("="*60)
    print("STEP 4: FEATURE ENGINEERING - CATEGORICAL ENCODING")
    print("="*60)
    
    # Create a copy to avoid modifying the original
    df_training_encoded = df_training.copy()
    
    # 4.1 Encode WeightClassKg - Extract numeric value, handle "+" suffix
    if 'WeightClassKg' in df_training_encoded.columns:
        def parse_weight_class(wc):
            """Parse weight class string to numeric value."""
            if pd.isna(wc):
                return np.nan
            try:
                # Handle "+" suffix (e.g., "120+" -> 120)
                wc_str = str(wc).strip()
                if wc_str.endswith('+'):
                    return float(wc_str[:-1])
                # Try to convert to float
                return float(wc_str)
            except (ValueError, TypeError):
                # If parsing fails, return NaN
                return np.nan
        
        df_training_encoded['WeightClassKg_Numeric'] = df_training_encoded['WeightClassKg'].apply(parse_weight_class)
        parsed_count = df_training_encoded['WeightClassKg_Numeric'].notna().sum()
        print(f"\nWeightClassKg encoded: {parsed_count}/{len(df_training_encoded)} successfully parsed")
        
        # Drop original WeightClassKg (keep numeric version)
        df_training_encoded = df_training_encoded.drop(columns=['WeightClassKg'])
        df_training_encoded = df_training_encoded.rename(columns={'WeightClassKg_Numeric': 'WeightClassKg'})
    
    # 4.2 Encode Sex - Binary encoding (M=1, F=0)
    if 'Sex' in df_training_encoded.columns:
        sex_mapping = {'M': 1, 'F': 0, 'm': 1, 'f': 0}
        df_training_encoded['Sex'] = df_training_encoded['Sex'].map(sex_mapping)
        encoded_count = df_training_encoded['Sex'].notna().sum()
        print(f"Sex encoded (binary): {encoded_count}/{len(df_training_encoded)} successfully encoded")
    
    # 4.3 Encode Division - Label encoding (since it may have many categories)
    if 'Division' in df_training_encoded.columns:
        # Get unique divisions
        unique_divisions = df_training_encoded['Division'].dropna().unique()
        division_mapping = {div: idx for idx, div in enumerate(unique_divisions)}
        
        # Apply label encoding
        df_training_encoded['Division'] = df_training_encoded['Division'].map(division_mapping)
        encoded_count = df_training_encoded['Division'].notna().sum()
        print(f"Division encoded (label): {encoded_count}/{len(df_training_encoded)} successfully encoded")
        print(f"  Unique divisions: {len(unique_divisions)}")
    
    # 4.4 Validate all features are numeric (except metadata and targets)
    metadata_cols = ['LifterID', 'CompIndex']
    target_cols = ['NextBest3SquatKg', 'NextBest3BenchKg', 'NextBest3DeadliftKg']
    feature_cols = [col for col in df_training_encoded.columns 
                    if col not in metadata_cols + target_cols]
    
    # Check for string values in feature columns
    string_features = []
    for col in feature_cols:
        if df_training_encoded[col].dtype == 'object':
            string_features.append(col)
    
    if string_features:
        print(f"\nWarning: String features found: {string_features}")
        print("These features may need additional encoding.")
    else:
        print(f"\nAll feature columns are numeric: {len(feature_cols)} features")
    
    # Update df_training
    df_training = df_training_encoded.copy()
    
    print(f"\nStep 4 Complete: Feature encoding done")
    print(f"Final feature count: {len(feature_cols)}")
    print(f"Dataset shape: {df_training.shape}")

except NameError as e:
    print(f"Error: {e}")
except Exception as e:
    print(f"Error in feature encoding: {e}")
    raise

STEP 4: FEATURE ENGINEERING - CATEGORICAL ENCODING

WeightClassKg encoded: 166586/169681 successfully parsed
Sex encoded (binary): 169651/169681 successfully encoded
Division encoded (label): 169681/169681 successfully encoded
  Unique divisions: 176

All feature columns are numeric: 31 features

Step 4 Complete: Feature encoding done
Final feature count: 31
Dataset shape: (169681, 36)


## Feature Engineering - Categorical Encoding

In [12]:
# Step 5: Handle Missing Data and Edge Cases

try:
    if 'df_training' not in locals() and 'df_training' not in globals():
        raise NameError("df_training not found. Please run Step 4 first.")
    
    if df_training.empty:
        raise ValueError("df_training is empty. Cannot process missing data.")
    
    print("="*60)
    print("STEP 5: HANDLE MISSING DATA AND EDGE CASES")
    print("="*60)
    
    df_training_cleaned = df_training.copy()
    initial_count = len(df_training_cleaned)
    
    # 5.1 Handle Missing Target Values
    target_cols = ['NextBest3SquatKg', 'NextBest3BenchKg', 'NextBest3DeadliftKg']
    
    # Count missing targets
    missing_targets = df_training_cleaned[target_cols].isna().sum()
    print(f"\nMissing target values:")
    for col in target_cols:
        print(f"  {col}: {missing_targets[col]} ({missing_targets[col]/initial_count*100:.2f}%)")
    
    # Drop rows where ALL targets are missing (cannot train)
    all_targets_missing = df_training_cleaned[target_cols].isna().all(axis=1)
    rows_dropped_targets = all_targets_missing.sum()
    df_training_cleaned = df_training_cleaned[~all_targets_missing].copy()
    print(f"\nDropped {rows_dropped_targets} rows where all targets are missing")
    
    # 5.2 Data Quality Checks
    print(f"\nData Quality Checks:")
    
    # Check for negative lift values (targets)
    negative_targets = 0
    for col in target_cols:
        negative_mask = df_training_cleaned[col] < 0
        negative_count = negative_mask.sum()
        if negative_count > 0:
            negative_targets += negative_count
            print(f"  Warning: {negative_count} negative values in {col} - setting to NaN")
            df_training_cleaned.loc[negative_mask, col] = np.nan
    
    if negative_targets == 0:
        print(f"  ✓ No negative target values found")
    
    # Check for unrealistic lift values (flag outliers)
    # Use world records as reference (rough estimates: Squat ~500kg, Bench ~400kg, Deadlift ~500kg)
    world_records = {'NextBest3SquatKg': 500, 'NextBest3BenchKg': 400, 'NextBest3DeadliftKg': 500}
    outlier_count = 0
    
    for col, max_reasonable in world_records.items():
        if col in df_training_cleaned.columns:
            # Flag values > 3x world record as outliers
            outlier_threshold = max_reasonable * 3
            outlier_mask = df_training_cleaned[col] > outlier_threshold
            outlier_num = outlier_mask.sum()
            if outlier_num > 0:
                outlier_count += outlier_num
                print(f"  Warning: {outlier_num} outliers in {col} (> {outlier_threshold:.0f}kg) - setting to NaN")
                df_training_cleaned.loc[outlier_mask, col] = np.nan
    
    if outlier_count == 0:
        print(f"  ✓ No extreme outliers found")
    
    # Age sanity checks (10-100 years)
    if 'Age' in df_training_cleaned.columns:
        age_mask = (df_training_cleaned['Age'] < 10) | (df_training_cleaned['Age'] > 100)
        invalid_age_count = age_mask.sum()
        if invalid_age_count > 0:
            print(f"  Warning: {invalid_age_count} invalid ages (not in 10-100 range) - setting to NaN")
            df_training_cleaned.loc[age_mask, 'Age'] = np.nan
        else:
            print(f"  ✓ All ages in reasonable range (10-100 years)")
    
    # Days between competitions sanity check (0-3650 days = 10 years max)
    if 'DaysSinceLastComp' in df_training_cleaned.columns:
        days_mask = (df_training_cleaned['DaysSinceLastComp'] < 0) | (df_training_cleaned['DaysSinceLastComp'] > 3650)
        invalid_days_count = days_mask.sum()
        if invalid_days_count > 0:
            print(f"  Warning: {invalid_days_count} invalid days between competitions (not in 0-3650 range) - setting to NaN")
            df_training_cleaned.loc[days_mask, 'DaysSinceLastComp'] = np.nan
    
    # 5.3 Impute Missing Feature Values
    print(f"\nImputing missing feature values:")
    
    metadata_cols = ['LifterID', 'CompIndex']
    feature_cols = [col for col in df_training_cleaned.columns 
                    if col not in metadata_cols + target_cols]
    
    imputation_counts = {}
    
    for col in feature_cols:
        missing_count = df_training_cleaned[col].isna().sum()
        if missing_count > 0:
            if df_training_cleaned[col].dtype in ['int64', 'float64']:
                # Numeric: use median (more robust than mean)
                imputation_value = df_training_cleaned[col].median()
                df_training_cleaned[col].fillna(imputation_value, inplace=True)
                imputation_counts[col] = missing_count
                print(f"  {col}: {missing_count} missing values imputed with median ({imputation_value:.2f})")
            else:
                # Categorical: use mode
                mode_value = df_training_cleaned[col].mode()
                if len(mode_value) > 0:
                    imputation_value = mode_value[0]
                    df_training_cleaned[col].fillna(imputation_value, inplace=True)
                    imputation_counts[col] = missing_count
                    print(f"  {col}: {missing_count} missing values imputed with mode ({imputation_value})")
    
    if len(imputation_counts) == 0:
        print(f"  ✓ No missing feature values to impute")
    
    # Update df_training
    df_training = df_training_cleaned.copy()
    
    print(f"\nStep 5 Complete:")
    print(f"  Initial examples: {initial_count}")
    print(f"  Final examples: {len(df_training)}")
    print(f"  Rows dropped: {initial_count - len(df_training)}")
    print(f"  Features imputed: {len(imputation_counts)}")

except NameError as e:
    print(f"Error: {e}")
except Exception as e:
    print(f"Error in missing data handling: {e}")
    raise

STEP 5: HANDLE MISSING DATA AND EDGE CASES

Missing target values:
  NextBest3SquatKg: 6 (0.00%)
  NextBest3BenchKg: 7 (0.00%)
  NextBest3DeadliftKg: 5 (0.00%)

Dropped 0 rows where all targets are missing

Data Quality Checks:
  ✓ No negative target values found
  ✓ No extreme outliers found

Imputing missing feature values:
  PrevBest3SquatKg: 6942 missing values imputed with median (167.50)
  PrevBest3BenchKg: 6949 missing values imputed with median (105.00)
  PrevBest3DeadliftKg: 6941 missing values imputed with median (195.00)
  PrevTotalKg: 6936 missing values imputed with median (470.00)
  PRBest3SquatKg: 7 missing values imputed with median (167.50)
  PRBest3BenchKg: 13 missing values imputed with median (107.50)
  PRBest3DeadliftKg: 5 missing values imputed with median (197.50)
  AvgBest3SquatKg_Last3: 4 missing values imputed with median (164.17)
  AvgBest3BenchKg_Last3: 13 missing values imputed with median (105.00)
  AvgBest3DeadliftKg_Last3: 3 missing values imputed with m

## Time series split

In [13]:
# Step 6: Split Your Data - Time-Based Split
# Split by date: older competitions → train, middle → validation, newest → test

try:
    if 'df_training' not in locals() and 'df_training' not in globals():
        raise NameError("df_training not found. Please run Step 5 first.")
    
    if df_training.empty:
        raise ValueError("df_training is empty. Cannot split data.")
    
    print("="*60)
    print("STEP 6: TIME-BASED DATA SPLIT")
    print("="*60)
    
    # Add Date column to df_training for splitting
    # We need to merge back with df_cleaned to get dates
    try:
        if 'df_cleaned' in locals() or 'df_cleaned' in globals():
            # Merge Date from df_cleaned using LifterID and CompIndex
            # Create a mapping from (LifterID, CompIndex) to Date
            date_mapping = df_cleaned.set_index(['LifterID', 'CompIndex'])['Date'].to_dict()
            
            # Add Date to df_training
            df_training['Date'] = df_training.apply(
                lambda row: date_mapping.get((row['LifterID'], row['CompIndex']), pd.NaT),
                axis=1
            )
            
            # Check if we have dates
            dates_available = df_training['Date'].notna().sum()
            print(f"\nDates added to training data: {dates_available}/{len(df_training)} examples have dates")
            
            if dates_available == 0:
                raise ValueError("No dates available. Cannot perform time-based split.")
        else:
            raise NameError("df_cleaned not found. Cannot add dates for splitting.")
    except Exception as e:
        print(f"Warning: Could not add Date column: {e}")
        print("Falling back to alternative split method (by index)")
        # Fallback: split by index (not ideal, but works if dates unavailable)
        df_training['Date'] = pd.NaT
    
    # Remove rows with missing dates for time-based split
    df_training_dated = df_training[df_training['Date'].notna()].copy()
    
    if len(df_training_dated) == 0:
        raise ValueError("No examples with dates. Cannot perform time-based split.")
    
    # Sort by date to ensure chronological order
    df_training_dated = df_training_dated.sort_values('Date').reset_index(drop=True)
    
    # Calculate split indices
    total_examples = len(df_training_dated)
    
    if total_examples < 10:
        raise ValueError(f"Dataset too small for train/validation/test split: {total_examples} examples")
    
    # Split percentages: 70% train, 15% validation, 15% test
    train_size = int(total_examples * 0.70)
    val_size = int(total_examples * 0.15)
    
    # Split indices
    train_end = train_size
    val_end = train_end + val_size
    
    # Create splits
    df_train = df_training_dated.iloc[:train_end].copy()
    df_val = df_training_dated.iloc[train_end:val_end].copy()
    df_test = df_training_dated.iloc[val_end:].copy()
    
    # Validate splits
    print(f"\nSplit Statistics:")
    print(f"  Total examples: {total_examples}")
    print(f"  Training set: {len(df_train)} ({len(df_train)/total_examples*100:.1f}%)")
    print(f"  Validation set: {len(df_val)} ({len(df_val)/total_examples*100:.1f}%)")
    print(f"  Test set: {len(df_test)} ({len(df_test)/total_examples*100:.1f}%)")
    
    # Validate date ordering (train < validation < test)
    train_max_date = df_train['Date'].max()
    val_min_date = df_val['Date'].min()
    val_max_date = df_val['Date'].max()
    test_min_date = df_test['Date'].min()
    
    print(f"\nDate Range Validation:")
    print(f"  Training: {df_train['Date'].min()} to {train_max_date}")
    print(f"  Validation: {val_min_date} to {val_max_date}")
    print(f"  Test: {test_min_date} to {df_test['Date'].max()}")
    
    # Check for data leakage
    if train_max_date >= val_min_date:
        print(f"  ⚠ Warning: Potential data leakage - train max date >= val min date")
    else:
        print(f"  ✓ No leakage: train max < val min")
    
    if val_max_date >= test_min_date:
        print(f"  ⚠ Warning: Potential data leakage - val max date >= test min date")
    else:
        print(f"  ✓ No leakage: val max < test min")
    
    # Store splits (drop Date column as it's not needed for training)
    # Actually, keep Date for reference but we can drop it later if needed
    df_train_final = df_train.copy()
    df_val_final = df_val.copy()
    df_test_final = df_test.copy()
    
    print(f"\nStep 6 Complete: Data splits created successfully")
    print(f"  ✓ Training examples: {len(df_train_final)}")
    print(f"  ✓ Validation examples: {len(df_val_final)}")
    print(f"  ✓ Test examples: {len(df_test_final)}")
    
    # Display sample from each split
    print(f"\nSample from training set:")
    print(df_train_final.head(3))

except NameError as e:
    print(f"Error: {e}")
except ValueError as e:
    print(f"Error: {e}")
except Exception as e:
    print(f"Error in data splitting: {e}")
    raise

STEP 6: TIME-BASED DATA SPLIT

Dates added to training data: 169681/169681 examples have dates

Split Statistics:
  Total examples: 169681
  Training set: 118776 (70.0%)
  Validation set: 25452 (15.0%)
  Test set: 25453 (15.0%)

Date Range Validation:
  Training: 1974-11-09 00:00:00 to 2023-08-27 00:00:00
  Validation: 2023-08-27 00:00:00 to 2024-09-21 00:00:00
  Test: 2024-09-21 00:00:00 to 2025-12-24 00:00:00

Step 6 Complete: Data splits created successfully
  ✓ Training examples: 118776
  ✓ Validation examples: 25452
  ✓ Test examples: 25453

Sample from training set:
   LifterID  CompIndex  PrevBest3SquatKg  PrevBest3BenchKg  \
0     38181          2            408.23            263.08   
1     11399          2            167.50            105.00   
2    107800          3            188.24            117.93   

   PrevBest3DeadliftKg  PrevTotalKg  PRBest3SquatKg  PRBest3BenchKg  \
0               362.87      1034.18          408.23          263.08   
1               195.00       4

In [14]:
# Step 7: Choose Your Model Architecture
# Three separate models (one for squat, one for bench, one for deadlift)
# Model Type: Random Forest (good balance of performance and interpretability)

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt

try:
    # Check if splits exist
    if 'df_train_final' not in locals() and 'df_train_final' not in globals():
        raise NameError("df_train_final not found. Please run Step 6 first.")
    
    if 'df_val_final' not in locals() and 'df_val_final' not in globals():
        raise NameError("df_val_final not found. Please run Step 6 first.")
    
    if 'df_test_final' not in locals() and 'df_test_final' not in globals():
        raise NameError("df_test_final not found. Please run Step 6 first.")
    
    print("="*60)
    print("STEP 7: MODEL ARCHITECTURE SELECTION")
    print("="*60)
    
    # Prepare feature and target columns
    metadata_cols = ['LifterID', 'CompIndex']
    target_cols = ['NextBest3SquatKg', 'NextBest3BenchKg', 'NextBest3DeadliftKg']
    
    # Get feature columns (everything except metadata and targets)
    feature_cols = [col for col in df_train_final.columns 
                    if col not in metadata_cols + target_cols]
    
    # Remove Date if present (not a feature)
    if 'Date' in feature_cols:
        feature_cols.remove('Date')
    
    print(f"\nModel Architecture:")
    print(f"  Approach: Three separate Random Forest models")
    print(f"    - Model 1: Predict NextBest3SquatKg")
    print(f"    - Model 2: Predict NextBest3BenchKg")
    print(f"    - Model 3: Predict NextBest3DeadliftKg")
    print(f"\nFeature columns: {len(feature_cols)}")
    print(f"  Sample features: {feature_cols[:5]}...")
    
    # Prepare training data
    X_train = df_train_final[feature_cols].copy()
    y_train_squat = df_train_final['NextBest3SquatKg'].copy()
    y_train_bench = df_train_final['NextBest3BenchKg'].copy()
    y_train_deadlift = df_train_final['NextBest3DeadliftKg'].copy()
    
    # Prepare validation data
    X_val = df_val_final[feature_cols].copy()
    y_val_squat = df_val_final['NextBest3SquatKg'].copy()
    y_val_bench = df_val_final['NextBest3BenchKg'].copy()
    y_val_deadlift = df_val_final['NextBest3DeadliftKg'].copy()
    
    # Prepare test data
    X_test = df_test_final[feature_cols].copy()
    y_test_squat = df_test_final['NextBest3SquatKg'].copy()
    y_test_bench = df_test_final['NextBest3BenchKg'].copy()
    y_test_deadlift = df_test_final['NextBest3DeadliftKg'].copy()
    
    # Handle missing target values - drop rows where target is missing
    # For each model, only use examples where that target exists
    print(f"\nTraining data preparation:")
    print(f"  Total training examples: {len(X_train)}")
    
    train_mask_squat = y_train_squat.notna()
    train_mask_bench = y_train_bench.notna()
    train_mask_deadlift = y_train_deadlift.notna()
    
    print(f"  Squat examples: {train_mask_squat.sum()} (dropped {len(X_train) - train_mask_squat.sum()})")
    print(f"  Bench examples: {train_mask_bench.sum()} (dropped {len(X_train) - train_mask_bench.sum()})")
    print(f"  Deadlift examples: {train_mask_deadlift.sum()} (dropped {len(X_train) - train_mask_deadlift.sum()})")
    
    print(f"\nStep 7 Complete: Model architecture selected and data prepared")

except NameError as e:
    print(f"Error: {e}")
except Exception as e:
    print(f"Error in model architecture selection: {e}")
    raise

STEP 7: MODEL ARCHITECTURE SELECTION

Model Architecture:
  Approach: Three separate Random Forest models
    - Model 1: Predict NextBest3SquatKg
    - Model 2: Predict NextBest3BenchKg
    - Model 3: Predict NextBest3DeadliftKg

Feature columns: 31
  Sample features: ['PrevBest3SquatKg', 'PrevBest3BenchKg', 'PrevBest3DeadliftKg', 'PrevTotalKg', 'PRBest3SquatKg']...

Training data preparation:
  Total training examples: 118776
  Squat examples: 118770 (dropped 6)
  Bench examples: 118769 (dropped 7)
  Deadlift examples: 118771 (dropped 5)

Step 7 Complete: Model architecture selected and data prepared


## Model Architecture Selection

In [None]:
# Step 8: Train the Models
# Train three XGBoost models (one for each lift)
# Use default hyperparameters for initial training

try:
    # Check if data is prepared
    if 'X_train' not in locals() and 'X_train' not in globals():
        raise NameError("X_train not found. Please run Step 7 first.")
    
    from xgboost import XGBRegressor
    
    print("="*60)
    print("STEP 8: TRAIN MODELS")
    print("="*60)
    
    # Initialize models with default hyperparameters
    xgb_base_params = {
        'n_estimators': 300,
        'learning_rate': 0.1,
        'max_depth': 6,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'objective': 'reg:squarederror',
        'eval_metric': 'mae',
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
        'device': 'cuda',
        'random_state': 42
    }
    
    print(f"\nXGBoost hyperparameters:")
    for k, v in xgb_base_params.items():
        print(f"  {k}: {v}")
    
    # Train Squat Model
    print(f"\nTraining Squat model...")
    X_train_squat = X_train[train_mask_squat].copy()
    y_train_squat_clean = y_train_squat[train_mask_squat].copy()
    
    model_squat = XGBRegressor(**xgb_base_params)
    model_squat.fit(X_train_squat, y_train_squat_clean, verbose=False)
    print(f"  ✓ Trained on {len(X_train_squat)} examples")
    
    # Train Bench Model
    print(f"\nTraining Bench model...")
    X_train_bench = X_train[train_mask_bench].copy()
    y_train_bench_clean = y_train_bench[train_mask_bench].copy()
    
    model_bench = XGBRegressor(**xgb_base_params)
    model_bench.fit(X_train_bench, y_train_bench_clean, verbose=False)
    print(f"  ✓ Trained on {len(X_train_bench)} examples")
    
    # Train Deadlift Model
    print(f"\nTraining Deadlift model...")
    X_train_deadlift = X_train[train_mask_deadlift].copy()
    y_train_deadlift_clean = y_train_deadlift[train_mask_deadlift].copy()
    
    model_deadlift = XGBRegressor(**xgb_base_params)
    model_deadlift.fit(X_train_deadlift, y_train_deadlift_clean, verbose=False)
    print(f"  ✓ Trained on {len(X_train_deadlift)} examples")
    
    # Make predictions on validation set for evaluation
    print(f"\nMaking predictions on validation set...")
    
    # Filter validation set to only examples where target exists
    val_mask_squat = y_val_squat.notna()
    val_mask_bench = y_val_bench.notna()
    val_mask_deadlift = y_val_deadlift.notna()
    
    y_pred_squat_val = model_squat.predict(X_val[val_mask_squat])
    y_pred_bench_val = model_bench.predict(X_val[val_mask_bench])
    y_pred_deadlift_val = model_deadlift.predict(X_val[val_mask_deadlift])
    
    y_actual_squat_val = y_val_squat[val_mask_squat].values
    y_actual_bench_val = y_val_bench[val_mask_bench].values
    y_actual_deadlift_val = y_val_deadlift[val_mask_deadlift].values
    
    print(f"  ✓ Validation predictions made")
    
    print(f"\nStep 8 Complete: All three models trained successfully")
    print(f"  Models ready for evaluation")

except NameError as e:
    print(f"Error: {e}")
except Exception as e:
    print(f"Error in model training: {e}")
    raise

STEP 8: TRAIN MODELS

Random Forest hyperparameters:
  n_estimators: 100
  max_depth: 10
  random_state: 42

Training Squat model...
  ✓ Trained on 118770 examples

Training Bench model...
  ✓ Trained on 118769 examples

Training Deadlift model...
  ✓ Trained on 118771 examples

Making predictions on validation set...
  ✓ Validation predictions made

Step 8 Complete: All three models trained successfully
  Models ready for evaluation


In [None]:
# Hyperparameter Optimization: XGBoost Early Stopping (GPU)
# Optimize each model using early stopping on validation data

import time
from xgboost import XGBRegressor

try:
    # Check if training data exists
    if 'X_train' not in locals() and 'X_train' not in globals():
        raise NameError("X_train not found. Please run Step 7 first.")
    
    if 'X_val' not in locals() and 'X_val' not in globals():
        raise NameError("X_val not found. Please run Step 7 first.")
    
    print("="*60)
    print("HYPERPARAMETER OPTIMIZATION")
    print("="*60)
    
    # XGBoost search settings (early stopping will pick best iteration)
    xgb_params = {
        'n_estimators': 2000,
        'learning_rate': 0.05,
        'max_depth': 6,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'min_child_weight': 1,
        'reg_alpha': 0.0,
        'reg_lambda': 1.0,
        'objective': 'reg:squarederror',
        'eval_metric': 'mae',
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
        'device': 'cuda',
        'random_state': 42
    }
    early_stopping_rounds = 50
    
    print(f"\nXGBoost settings:")
    for param, value in xgb_params.items():
        print(f"  {param}: {value}")
    print(f"  early_stopping_rounds: {early_stopping_rounds}")
    
    # Store baseline performance for comparison
    print(f"\nRecording baseline performance from Step 8...")
    baseline_metrics = {}
    
    # Get baseline validation predictions if available
    if 'y_pred_squat_val' in locals() or 'y_pred_squat_val' in globals():
        from sklearn.metrics import mean_absolute_error
        baseline_metrics['squat'] = mean_absolute_error(y_actual_squat_val, y_pred_squat_val)
        baseline_metrics['bench'] = mean_absolute_error(y_actual_bench_val, y_pred_bench_val)
        baseline_metrics['deadlift'] = mean_absolute_error(y_actual_deadlift_val, y_pred_deadlift_val)
        print(f"  Baseline MAE - Squat: {baseline_metrics['squat']:.2f} kg")
        print(f"  Baseline MAE - Bench: {baseline_metrics['bench']:.2f} kg")
        print(f"  Baseline MAE - Deadlift: {baseline_metrics['deadlift']:.2f} kg")
    else:
        print(f"  Warning: Baseline metrics not found, will calculate after optimization")
    
    print(f"\nStarting XGBoost optimization with early stopping...")
    print(f"  This should be faster than CV-based search")

except NameError as e:
    print(f"Error: {e}")
except Exception as e:
    print(f"Error in hyperparameter optimization setup: {e}")
    raise

HYPERPARAMETER OPTIMIZATION

Parameter search space:
  max_depth: [5, 10, 15, 20, None]
  min_samples_split: [2, 5, 10]
  min_samples_leaf: [1, 2, 4]
  max_features: ['sqrt', 'log2', None]

Search settings:
  search_type: HalvingRandomSearchCV
  min_resources (n_estimators): 50
  max_resources (n_estimators): 150
  factor: 3
  cv: 2 (cross-validation folds)
  scoring: neg_mean_absolute_error
  random_state: 42
  final_n_estimators: 300

Recording baseline performance from Step 8...
  Baseline MAE - Squat: 8.14 kg
  Baseline MAE - Bench: 4.68 kg
  Baseline MAE - Deadlift: 8.46 kg

Starting hyperparameter optimization...
  This should be faster than full RandomizedSearchCV


In [None]:
# Step 2-3: Optimize each model separately using XGBoost early stopping

try:
    # Check if search settings are defined
    if 'xgb_params' not in locals() and 'xgb_params' not in globals():
        raise NameError("xgb_params not found. Please run previous cell first.")
    
    print("\n" + "="*60)
    print("OPTIMIZING MODELS")
    print("="*60)
    
    best_params = {}
    search_results = {}
    search_times = {}
    best_iterations = {}
    
    # Prepare train/validation splits once
    X_squat_train = X_train[train_mask_squat]
    y_squat_train = y_train_squat[train_mask_squat]
    X_squat_val = X_val[val_mask_squat]
    y_squat_val_clean = y_val_squat[val_mask_squat]
    
    X_bench_train = X_train[train_mask_bench]
    y_bench_train = y_train_bench[train_mask_bench]
    X_bench_val = X_val[val_mask_bench]
    y_bench_val_clean = y_val_bench[val_mask_bench]
    
    X_deadlift_train = X_train[train_mask_deadlift]
    y_deadlift_train = y_train_deadlift[train_mask_deadlift]
    X_deadlift_val = X_val[val_mask_deadlift]
    y_deadlift_val_clean = y_val_deadlift[val_mask_deadlift]
    
    def _check_cv_size(name, X_train_part, X_val_part):
        if len(X_train_part) < 100 or len(X_val_part) < 100:
            raise ValueError(
                f"Insufficient data for early stopping ({name}): "
                f"train={len(X_train_part)}, val={len(X_val_part)}"
            )
    
    _check_cv_size("squat", X_squat_train, X_squat_val)
    _check_cv_size("bench", X_bench_train, X_bench_val)
    _check_cv_size("deadlift", X_deadlift_train, X_deadlift_val)
    
    # Optimize Squat Model
    print(f"\n[1/3] Optimizing Squat model...")
    start_time = time.time()
    
    try:
        model_squat_es = XGBRegressor(**xgb_params)
        model_squat_es.fit(
            X_squat_train,
            y_squat_train,
            eval_set=[(X_squat_val, y_squat_val_clean)],
            early_stopping_rounds=early_stopping_rounds,
            verbose=False
        )
        
        best_iter = model_squat_es.best_iteration
        best_iterations['squat'] = best_iter
        best_params['squat'] = dict(xgb_params)
        best_params['squat']['n_estimators'] = best_iter + 1
        search_results['squat'] = model_squat_es
        search_times['squat'] = time.time() - start_time
        
        print(f"  ✓ Completed in {search_times['squat']:.1f} seconds")
        print(f"  Best iteration: {best_iter}")
        print(f"  Best score (val MAE): {model_squat_es.best_score:.2f} kg")
        
    except Exception as e:
        print(f"  ✗ Error optimizing squat model: {e}")
        raise
    
    # Optimize Bench Model
    print(f"\n[2/3] Optimizing Bench model...")
    start_time = time.time()
    
    try:
        model_bench_es = XGBRegressor(**xgb_params)
        model_bench_es.fit(
            X_bench_train,
            y_bench_train,
            eval_set=[(X_bench_val, y_bench_val_clean)],
            early_stopping_rounds=early_stopping_rounds,
            verbose=False
        )
        
        best_iter = model_bench_es.best_iteration
        best_iterations['bench'] = best_iter
        best_params['bench'] = dict(xgb_params)
        best_params['bench']['n_estimators'] = best_iter + 1
        search_results['bench'] = model_bench_es
        search_times['bench'] = time.time() - start_time
        
        print(f"  ✓ Completed in {search_times['bench']:.1f} seconds")
        print(f"  Best iteration: {best_iter}")
        print(f"  Best score (val MAE): {model_bench_es.best_score:.2f} kg")
        
    except Exception as e:
        print(f"  ✗ Error optimizing bench model: {e}")
        raise
    
    # Optimize Deadlift Model
    print(f"\n[3/3] Optimizing Deadlift model...")
    start_time = time.time()
    
    try:
        model_deadlift_es = XGBRegressor(**xgb_params)
        model_deadlift_es.fit(
            X_deadlift_train,
            y_deadlift_train,
            eval_set=[(X_deadlift_val, y_deadlift_val_clean)],
            early_stopping_rounds=early_stopping_rounds,
            verbose=False
        )
        
        best_iter = model_deadlift_es.best_iteration
        best_iterations['deadlift'] = best_iter
        best_params['deadlift'] = dict(xgb_params)
        best_params['deadlift']['n_estimators'] = best_iter + 1
        search_results['deadlift'] = model_deadlift_es
        search_times['deadlift'] = time.time() - start_time
        
        print(f"  ✓ Completed in {search_times['deadlift']:.1f} seconds")
        print(f"  Best iteration: {best_iter}")
        print(f"  Best score (val MAE): {model_deadlift_es.best_score:.2f} kg")
        
    except Exception as e:
        print(f"  ✗ Error optimizing deadlift model: {e}")
        raise
    
    total_time = sum(search_times.values())
    print(f"\n✓ All models optimized in {total_time:.1f} seconds ({total_time/60:.1f} minutes)")
    print(f"\nBest iteration summary:")
    for model_name, best_iter in best_iterations.items():
        print(f"  {model_name.capitalize()}: {best_iter}")

except NameError as e:
    print(f"Error: {e}")
except ValueError as e:
    print(f"Error: {e}")
except Exception as e:
    print(f"Error in model optimization: {e}")
    raise


OPTIMIZING MODELS

[1/3] Optimizing Squat model...
n_iterations: 2
n_required_iterations: 2
n_possible_iterations: 2
min_resources_: 50
max_resources_: 150
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 3
n_resources: 50
Fitting 2 folds for each of 3 candidates, totalling 6 fits
----------
iter: 1
n_candidates: 1
n_resources: 150
Fitting 2 folds for each of 1 candidates, totalling 2 fits
  ✓ Completed in 316.1 seconds
  Best score (CV): 8.39 kg MAE
  Best parameters: {'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': None, 'max_depth': 20, 'n_estimators': 150}

[2/3] Optimizing Bench model...
n_iterations: 2
n_required_iterations: 2
n_possible_iterations: 2
min_resources_: 50
max_resources_: 150
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 3
n_resources: 50
Fitting 2 folds for each of 3 candidates, totalling 6 fits
----------
iter: 1
n_candidates: 1
n_resources: 150
Fitting 2 folds for each of 1 candidates, totalling 2

KeyboardInterrupt: 

In [None]:
# Step 4: Retrain models with optimized hyperparameters on full training set

try:
    # Check if best parameters exist
    if 'best_params' not in locals() and 'best_params' not in globals():
        raise NameError("best_params not found. Please run optimization cell first.")
    
    from xgboost import XGBRegressor
    
    print("="*60)
    print("RETRAINING MODELS WITH OPTIMIZED HYPERPARAMETERS")
    print("="*60)
    
    # Retrain Squat Model
    print(f"\nRetraining Squat model with optimized parameters...")
    squat_params = dict(best_params['squat'])
    model_squat_optimized = XGBRegressor(**squat_params)
    model_squat_optimized.fit(X_train[train_mask_squat], y_train_squat[train_mask_squat], verbose=False)
    print(f"  ✓ Trained on {train_mask_squat.sum()} examples")
    
    # Retrain Bench Model
    print(f"\nRetraining Bench model with optimized parameters...")
    bench_params = dict(best_params['bench'])
    model_bench_optimized = XGBRegressor(**bench_params)
    model_bench_optimized.fit(X_train[train_mask_bench], y_train_bench[train_mask_bench], verbose=False)
    print(f"  ✓ Trained on {train_mask_bench.sum()} examples")
    
    # Retrain Deadlift Model
    print(f"\nRetraining Deadlift model with optimized parameters...")
    deadlift_params = dict(best_params['deadlift'])
    model_deadlift_optimized = XGBRegressor(**deadlift_params)
    model_deadlift_optimized.fit(X_train[train_mask_deadlift], y_train_deadlift[train_mask_deadlift], verbose=False)
    print(f"  ✓ Trained on {train_mask_deadlift.sum()} examples")
    
    # Make predictions on validation set
    print(f"\nMaking predictions on validation set...")
    y_pred_squat_val_opt = model_squat_optimized.predict(X_val[val_mask_squat])
    y_pred_bench_val_opt = model_bench_optimized.predict(X_val[val_mask_bench])
    y_pred_deadlift_val_opt = model_deadlift_optimized.predict(X_val[val_mask_deadlift])
    
    print(f"  ✓ Validation predictions made")
    
    # Update model variables to use optimized versions
    model_squat = model_squat_optimized
    model_bench = model_bench_optimized
    model_deadlift = model_deadlift_optimized
    
    print(f"\n✓ All models retrained with optimized hyperparameters")

except NameError as e:
    print(f"Error: {e}")
except Exception as e:
    print(f"Error retraining models: {e}")
    raise

In [None]:
# Step 5: Compare optimized vs baseline performance

try:
    # Check if optimized predictions exist
    if 'y_pred_squat_val_opt' not in locals() and 'y_pred_squat_val_opt' not in globals():
        raise NameError("Optimized predictions not found. Please run retraining cell first.")
    
    print("="*60)
    print("PERFORMANCE COMPARISON: BASELINE vs OPTIMIZED")
    print("="*60)
    
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
    import numpy as np
    
    # Calculate optimized metrics
    optimized_metrics = {}
    
    optimized_metrics['squat'] = {
        'MAE': mean_absolute_error(y_actual_squat_val, y_pred_squat_val_opt),
        'RMSE': np.sqrt(mean_squared_error(y_actual_squat_val, y_pred_squat_val_opt)),
        'R2': r2_score(y_actual_squat_val, y_pred_squat_val_opt)
    }
    
    optimized_metrics['bench'] = {
        'MAE': mean_absolute_error(y_actual_bench_val, y_pred_bench_val_opt),
        'RMSE': np.sqrt(mean_squared_error(y_actual_bench_val, y_pred_bench_val_opt)),
        'R2': r2_score(y_actual_bench_val, y_pred_bench_val_opt)
    }
    
    optimized_metrics['deadlift'] = {
        'MAE': mean_absolute_error(y_actual_deadlift_val, y_pred_deadlift_val_opt),
        'RMSE': np.sqrt(mean_squared_error(y_actual_deadlift_val, y_pred_deadlift_val_opt)),
        'R2': r2_score(y_actual_deadlift_val, y_pred_deadlift_val_opt)
    }
    
    # Calculate baseline metrics if not already stored
    if 'baseline_metrics' not in locals() or len(baseline_metrics) == 0:
        baseline_metrics = {}
        baseline_metrics['squat'] = mean_absolute_error(y_actual_squat_val, y_pred_squat_val)
        baseline_metrics['bench'] = mean_absolute_error(y_actual_bench_val, y_pred_bench_val)
        baseline_metrics['deadlift'] = mean_absolute_error(y_actual_deadlift_val, y_pred_deadlift_val)
    
    # Compare and calculate improvements
    print(f"\nValidation Set Performance Comparison:")
    print("-" * 60)
    
    improvements = {}
    
    for model_name in ['squat', 'bench', 'deadlift']:
        baseline_mae = baseline_metrics[model_name]
        optimized_mae = optimized_metrics[model_name]['MAE']
        improvement = ((baseline_mae - optimized_mae) / baseline_mae) * 100
        
        improvements[model_name] = improvement
        
        print(f"\n{model_name.capitalize()} Model:")
        print(f"  Baseline MAE:  {baseline_mae:.2f} kg")
        print(f"  Optimized MAE: {optimized_mae:.2f} kg")
        print(f"  Improvement:   {improvement:+.2f}% ({'✓ Better' if improvement > 0 else '✗ Worse'})")
        print(f"  Optimized RMSE: {optimized_metrics[model_name]['RMSE']:.2f} kg")
        print(f"  Optimized R²:   {optimized_metrics[model_name]['R2']:.4f}")
    
    # Overall improvement
    avg_baseline = np.mean(list(baseline_metrics.values()))
    avg_optimized = np.mean([optimized_metrics[m]['MAE'] for m in ['squat', 'bench', 'deadlift']])
    avg_improvement = ((avg_baseline - avg_optimized) / avg_baseline) * 100
    
    print(f"\n" + "-" * 60)
    print(f"Overall Average:")
    print(f"  Baseline MAE:  {avg_baseline:.2f} kg")
    print(f"  Optimized MAE: {avg_optimized:.2f} kg")
    print(f"  Average Improvement: {avg_improvement:+.2f}%")
    
    # Store for documentation
    comparison_results = {
        'baseline_metrics': baseline_metrics,
        'optimized_metrics': optimized_metrics,
        'improvements': improvements,
        'avg_improvement': avg_improvement
    }
    
    print(f"\n✓ Performance comparison complete")

except NameError as e:
    print(f"Error: {e}")
except Exception as e:
    print(f"Error in performance comparison: {e}")
    raise

In [None]:
# Final Summary: Hyperparameter Optimization Results

try:
    # Check if all required variables exist
    if 'best_params' not in locals() and 'best_params' not in globals():
        raise NameError("best_params not found. Please run optimization cells first.")
    
    if 'search_times' not in locals() and 'search_times' not in globals():
        raise NameError("search_times not found. Please run optimization cells first.")
    
    if 'comparison_results' not in locals() and 'comparison_results' not in globals():
        raise NameError("comparison_results not found. Please run comparison cell first.")
    
    print("="*60)
    print("HYPERPARAMETER OPTIMIZATION SUMMARY")
    print("="*60)
    
    # Best hyperparameters
    print(f"\nBest Hyperparameters:")
    print("-" * 60)
    for model_name in ['squat', 'bench', 'deadlift']:
        print(f"\n{model_name.capitalize()} Model:")
        for param, value in best_params[model_name].items():
            print(f"  {param}: {value}")
    
    # Search time summary
    print(f"\nSearch Time Summary:")
    print("-" * 60)
    total_time = sum(search_times.values())
    for model_name, time_taken in search_times.items():
        print(f"  {model_name.capitalize()}: {time_taken:.1f} seconds ({time_taken/60:.1f} minutes)")
    print(f"  Total: {total_time:.1f} seconds ({total_time/60:.1f} minutes)")
    
    # Early stopping results
    print(f"\nEarly Stopping Results:")
    print("-" * 60)
    if 'search_results' in locals() or 'search_results' in globals():
        for model_name in ['squat', 'bench', 'deadlift']:
            model_obj = search_results[model_name]
            best_iter = model_obj.best_iteration
            best_score = model_obj.best_score
            print(f"  {model_name.capitalize()}: best_iteration={best_iter}, best_val_mae={best_score:.2f} kg")
    
    # Performance summary
    print(f"\nPerformance Summary:")
    print("-" * 60)
    print(f"Average Improvement: {comparison_results['avg_improvement']:+.2f}%")
    print(f"\nPer-Model Improvements:")
    for model_name, improvement in comparison_results['improvements'].items():
        status = "✓ Improved" if improvement > 0 else "✗ Degraded"
        print(f"  {model_name.capitalize()}: {improvement:+.2f}% ({status})")
    
    # Final recommendations
    print(f"\n" + "="*60)
    print("OPTIMIZATION COMPLETE")
    print("="*60)
    
    if comparison_results['avg_improvement'] > 0:
        print(f"✓ Hyperparameter optimization improved model performance")
        print(f"  Average MAE reduced by {abs(comparison_results['avg_improvement']):.2f}%")
    else:
        print(f"⚠ Hyperparameter optimization did not improve performance")
        print(f"  Consider: expanding search space, trying different algorithms, or checking for overfitting")
    
    print(f"\nOptimized models are now ready for final evaluation on test set.")
    print(f"Run Step 9 to evaluate optimized models on test data.")

except NameError as e:
    print(f"Error: {e}")
except Exception as e:
    print(f"Error in summary generation: {e}")
    raise