In [33]:
import pandas as pd
import os

In [34]:
def load_federation_data(federation_path, federation_name):
    """
    Load entries.csv and meet.csv files from all subfolders in a federation directory.
    
    Parameters:
    -----------
    federation_path : str
        Path to the federation folder (e.g., "../opl-data/meet-data/ipf")
    federation_name : str
        Name of the federation (for display purposes)
    
    Returns:
    --------
    entries_list : list
        List of entries dataframes
    meets_list : list
        List of meets dataframes
    """
    # Get all folders in federation directory
    folders = [f for f in os.listdir(federation_path) 
               if os.path.isdir(os.path.join(federation_path, f)) 
               and not f.startswith('.')]
    
    entries_list = []
    meets_list = []
    
    for folder in folders:
        entries_path = os.path.join(federation_path, folder, "entries.csv")
        meet_path = os.path.join(federation_path, folder, "meet.csv")
        
        if os.path.exists(entries_path):
            df_entries = pd.read_csv(entries_path)
            df_entries['MeetID'] = folder
            entries_list.append(df_entries)
        
        if os.path.exists(meet_path):
            df_meet = pd.read_csv(meet_path)
            df_meet['MeetID'] = folder
            meets_list.append(df_meet)
    
    return entries_list, meets_list

# Load data from all three federations
ipf_path = "../opl-data/meet-data/ipf"
usapl_path = "../opl-data/meet-data/usapl"
pa_path = "../opl-data/meet-data/pa"

# Load IPF data
ipf_entries, ipf_meets = load_federation_data(ipf_path, "IPF")
print(f"IPF: Loaded {len(ipf_entries)} entries files, {len(ipf_meets)} meets files")

# Load USAPL data
usapl_entries, usapl_meets = load_federation_data(usapl_path, "USAPL")
print(f"USAPL: Loaded {len(usapl_entries)} entries files, {len(usapl_meets)} meets files")

# Load PA data
pa_entries, pa_meets = load_federation_data(pa_path, "PA")
print(f"PA: Loaded {len(pa_entries)} entries files, {len(pa_meets)} meets files")

# Combine all entries from all federations
all_entries_list = ipf_entries + usapl_entries + pa_entries
all_meets_list = ipf_meets + usapl_meets + pa_meets

# Combine all entries
df_entries_all = pd.concat(all_entries_list, ignore_index=True)

# Combine all meets
df_meets_all = pd.concat(all_meets_list, ignore_index=True)

# Display results
print(f"\n{'='*60}")
print("COMBINED RESULTS:")
print(f"{'='*60}")
print(f"Total entries: {len(df_entries_all)}")
print(f"Total meets: {len(df_meets_all)}")
print(f"\nEntries breakdown:")
print(f"  - IPF: {len(pd.concat(ipf_entries, ignore_index=True)) if ipf_entries else 0} entries")
print(f"  - USAPL: {len(pd.concat(usapl_entries, ignore_index=True)) if usapl_entries else 0} entries")
print(f"  - PA: {len(pd.concat(pa_entries, ignore_index=True)) if pa_entries else 0} entries")
print(f"\nMeets breakdown:")
print(f"  - IPF: {len(pd.concat(ipf_meets, ignore_index=True)) if ipf_meets else 0} meets")
print(f"  - USAPL: {len(pd.concat(usapl_meets, ignore_index=True)) if usapl_meets else 0} meets")
print(f"  - PA: {len(pd.concat(pa_meets, ignore_index=True)) if pa_meets else 0} meets")
print(f"\nEntries dataframe shape: {df_entries_all.shape}")
print(f"Meets dataframe shape: {df_meets_all.shape}")

# Merge entries with meet data using MeetID
# This adds Date and other meet information to each entry
df_entries_all = df_entries_all.merge(
    df_meets_all,
    on='MeetID',
    how='left',  # Keep all entries even if meet data is missing
    suffixes=('', '_meet')  # In case of duplicate column names
)

# Convert Date to datetime 
df_entries_all['Date'] = pd.to_datetime(df_entries_all['Date'])

# Convert BirthDate to datetime 
df_entries_all['BirthDate'] = pd.to_datetime(df_entries_all['BirthDate'], errors='coerce')

# Calculate age in years: (Date - BirthDate) / 365.25
# Using 365.25 to account for leap years
df_entries_all['Age'] = (df_entries_all['Date'] - df_entries_all['BirthDate']).dt.days / 365.25

# For entries where BirthDate is missing but BirthYear exists, estimate age
# Use mid-year as approximation: Date.year - BirthYear
missing_age_mask = df_entries_all['Age'].isna() & df_entries_all['BirthYear'].notna()
df_entries_all.loc[missing_age_mask, 'Age'] = (
    df_entries_all.loc[missing_age_mask, 'Date'].dt.year - 
    df_entries_all.loc[missing_age_mask, 'BirthYear']
)


print(f"Age still missing: {df_entries_all['Age'].isna().sum()}")
print(f"\nAge statistics:")
print(df_entries_all['Age'].describe())

print(f"\n{'='*60}")
print("AFTER MERGE:")
print(f"{'='*60}")
print(f"Entries dataframe shape: {df_entries_all.shape}")
print(f"Columns: {list(df_entries_all.columns)}")
print(f"\nDate column info:")
print(f"  - Non-null dates: {df_entries_all['Date'].notna().sum()}")
print(f"  - Date range: {df_entries_all['Date'].min()} to {df_entries_all['Date'].max()}")
print("\nFirst few rows with merged data:")
print(df_entries_all[['Name', 'MeetID', 'Date', 'Division','Sex','WeightClassKg','Best3SquatKg', 'Best3BenchKg', 'Best3DeadliftKg']].head())

IPF: Loaded 265 entries files, 265 meets files
USAPL: Loaded 4206 entries files, 4206 meets files
PA: Loaded 951 entries files, 951 meets files

COMBINED RESULTS:
Total entries: 362415
Total meets: 5422

Entries breakdown:
  - IPF: 52706 entries
  - USAPL: 284587 entries
  - PA: 25122 entries

Meets breakdown:
  - IPF: 265 meets
  - USAPL: 4206 meets
  - PA: 951 meets

Entries dataframe shape: (362415, 52)
Meets dataframe shape: (5422, 7)
Age still missing: 24583

Age statistics:
count    381957.000000
mean         29.745970
std          13.333483
min           7.991786
25%          20.000000
50%          25.000000
75%          36.000000
max          94.000000
Name: Age, dtype: float64

AFTER MERGE:
Entries dataframe shape: (406540, 58)
Columns: ['Name', 'Division', 'BirthYear', 'BirthDate', 'WeightClassKg', 'BodyweightKg', 'Country', 'Best3SquatKg', 'Best3BenchKg', 'Best3DeadliftKg', 'TotalKg', 'Event', 'Equipment', 'Sex', 'Place', 'MeetID', 'Deadlift4Kg', 'Bench4Kg', 'Bench1Kg', 'Ben

In [35]:
# Convert any Lbs columns to Kg and fill missing Kg values
# Conversion factor: 1 lb = 0.453592 kg

# Define the mapping of Lbs columns to their corresponding Kg columns
lbs_to_kg_mapping = {
    'Best3SquatLbs': 'Best3SquatKg',
    'Best3BenchLbs': 'Best3BenchKg',
    'Best3DeadliftLbs': 'Best3DeadliftKg',
    'Squat1Lbs': 'Squat1Kg',
    'Squat2Lbs': 'Squat2Kg',
    'Squat3Lbs': 'Squat3Kg',
    'Bench1Lbs': 'Bench1Kg',
    'Bench2Lbs': 'Bench2Kg',
    'Bench3Lbs': 'Bench3Kg',
    'Deadlift1Lbs': 'Deadlift1Kg',
    'Deadlift2Lbs': 'Deadlift2Kg',
    'Deadlift3Lbs': 'Deadlift3Kg',
    'TotalLbs': 'TotalKg'
}

# Convert and fill missing values
for lbs_col, kg_col in lbs_to_kg_mapping.items():
    if lbs_col in df_entries_all.columns:
        # Convert lbs to kg (multiply by 0.453592)
        converted_values = df_entries_all[lbs_col] * 0.453592
        
        # Fill missing Kg values with converted Lbs values (only where Kg is missing and Lbs exists)
        mask = df_entries_all[kg_col].isna() & df_entries_all[lbs_col].notna()
        df_entries_all.loc[mask, kg_col] = converted_values[mask]
        
        print(f"Filled {mask.sum()} missing values in {kg_col} using {lbs_col}")
# Drop all Lbs columns after conversion
lbs_columns_to_drop = [col for col in df_entries_all.columns if col.endswith('Lbs')]
df_entries_all = df_entries_all.drop(columns=lbs_columns_to_drop)


Filled 251 missing values in Best3SquatKg using Best3SquatLbs
Filled 257 missing values in Best3BenchKg using Best3BenchLbs
Filled 261 missing values in Best3DeadliftKg using Best3DeadliftLbs
Filled 263 missing values in Squat1Kg using Squat1Lbs
Filled 263 missing values in Squat2Kg using Squat2Lbs
Filled 260 missing values in Squat3Kg using Squat3Lbs
Filled 264 missing values in Bench1Kg using Bench1Lbs
Filled 264 missing values in Bench2Kg using Bench2Lbs
Filled 262 missing values in Bench3Kg using Bench3Lbs
Filled 262 missing values in Deadlift1Kg using Deadlift1Lbs
Filled 261 missing values in Deadlift2Kg using Deadlift2Lbs
Filled 261 missing values in Deadlift3Kg using Deadlift3Lbs
Filled 245 missing values in TotalKg using TotalLbs


In [41]:
# Create a surrogate LifterID so we can build per-lifter competition histories
# We use a combination of fairly stable identity columns.

id_cols = [col for col in ['Name', 'Sex', 'BirthYear'] if col in df_entries_all.columns]

if id_cols:
    # Build a string key from available identity columns
    lifter_key = df_entries_all[id_cols].astype(str).agg('|'.join, axis=1)

    # Turn the key into an integer LifterID (stable within this run)
    df_entries_all['LifterID'] = lifter_key.astype('category').cat.codes

    print("Created LifterID using columns:", id_cols)
    print(df_entries_all[['LifterID'] + id_cols].head(10))
else:
    raise ValueError("No suitable columns found to construct LifterID.")

Created LifterID using columns: ['Name', 'Sex', 'BirthYear']
   LifterID                 Name Sex  BirthYear
0      9746    Andrzej Stanaszek   M     1971.0
1      9746    Andrzej Stanaszek   M     1971.0
2    118950  Sergey Zhuravlev #1   M     1960.0
3    118950  Sergey Zhuravlev #1   M     1960.0
4     52278        Hideaki Inaba   M     1944.0
5     52278        Hideaki Inaba   M     1944.0
6     43877      Ervin Gainer Sr   M     1966.0
7     43877      Ervin Gainer Sr   M     1966.0
8     25867      Chih-Chiang Hsu   M     1979.0
9     25867      Chih-Chiang Hsu   M     1979.0


In [42]:
# 2.2 Group by LifterID and sort competitions chronologically
# This creates a competition history for each lifter.

# Sort by lifter and date so each lifter's rows are in chronological order
df_entries_all = df_entries_all.sort_values(['LifterID', 'Date'])

# Create an explicit competition index per lifter (1 = first recorded meet, 2 = second, ...)
df_entries_all['CompIndex'] = df_entries_all.groupby('LifterID').cumcount() + 1

print("Per-lifter competition history created using 'LifterID' and 'Date'.")
print(df_entries_all[['LifterID', 'Date', 'CompIndex']].head(10))

Per-lifter competition history created using 'LifterID' and 'Date'.
        LifterID       Date  CompIndex
378269         0 2002-01-01          1
378268         0 2002-05-29          2
378307         1 2002-01-01          1
378306         1 2002-05-29          2
378635         2 2002-01-01          1
378634         2 2002-05-29          2
5593           3 2002-02-16          1
5592           3 2002-09-25          2
405617         4 2023-11-13          1
405618         4 2023-12-17          2


In [45]:
# 2.3 Filter valid lifters
# - Keep only lifters with at least 2 competitions
# - Optionally filter by other criteria (e.g., only SBD events, only Raw equipment)

# Count number of competitions per lifter
comp_counts = df_entries_all.groupby('LifterID')['CompIndex'].max().rename('NumComps')

df_entries_all = df_entries_all.merge(comp_counts, on='LifterID', how='left')

# Keep only lifters with at least 2 competitions
min_comps = 2
valid_mask = df_entries_all['NumComps'] >= min_comps

print(f"Total rows before filtering by competition count: {len(df_entries_all)}")
df_entries_all = df_entries_all[valid_mask].copy()
print(f"Total rows after keeping lifters with >= {min_comps} competitions: {len(df_entries_all)}")
print(f"Number of unique lifters remaining: {df_entries_all['LifterID'].nunique()}")

# Optional: filter to SBD + Raw if those columns exist
if 'Event' in df_entries_all.columns:
    before = len(df_entries_all)
    df_entries_all = df_entries_all[df_entries_all['Event'] == 'SBD']
    print(f"After filtering to SBD events: {len(df_entries_all)} rows (dropped {before - len(df_entries_all)})")

if 'Equipment' in df_entries_all.columns:
    before = len(df_entries_all)
    df_entries_all = df_entries_all[df_entries_all['Equipment'] == 'Raw']
    print(f"After filtering to Raw equipment: {len(df_entries_all)} rows (dropped {before - len(df_entries_all)})")

print(f"Number of unique lifters remaining: {df_entries_all['LifterID'].nunique()}")

Total rows before filtering by competition count: 233466
Total rows after keeping lifters with >= 2 competitions: 232354
Number of unique lifters remaining: 61962
After filtering to SBD events: 232354 rows (dropped 0)
After filtering to Raw equipment: 232354 rows (dropped 0)
Number of unique lifters remaining: 61962


In [53]:
# Drop the specified columns
# NOTE: Date is preserved for time-based features in Step 3
columns_to_drop = ['Name', 'Bench1Kg', 'KoreanName', 'ChineseName', 'Bench2Kg', 'Bench3Kg', 'BirthDate', 'Squat1Kg', 'Squat2Kg', 'Squat3Kg', 
                    'Deadlift1Kg', 'Deadlift2Kg', 'Deadlift3Kg', 'Country', 'BirthYear', 'BodyweightKg', 'Place', 'Event', 'Equipment',
                    'Deadlift4Kg', 'Bench4Kg', 'Squat4Kg', 'Team', 'MeetID', 'State', 'AgeRange', 'Tested', 'DeadliftEquipment', 'CyrillicName', 'MeetCountry', 'MeetState', 'MeetTown', 'MeetName', 'Federation']
df_cleaned = df_entries_all.drop(columns=columns_to_drop)

In [54]:
# Display the cleaned and filtered dataframe
print("\n" + "="*60)
print("FINAL CLEANED AND FILTERED DATAFRAME:")
print("="*60)
print("\nCleaned DataFrame Info:")
print(df_cleaned.info())

print("\nFirst few rows of cleaned data:")
print(df_cleaned.head(10))


FINAL CLEANED AND FILTERED DATAFRAME:

Cleaned DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Index: 232354 entries, 0 to 233465
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   Division         232352 non-null  object        
 1   WeightClassKg    228461 non-null  object        
 2   Best3SquatKg     228835 non-null  float64       
 3   Best3BenchKg     228771 non-null  float64       
 4   Best3DeadliftKg  229390 non-null  float64       
 5   TotalKg          224096 non-null  float64       
 6   Sex              232354 non-null  object        
 7   Age              228140 non-null  float64       
 8   Date             232354 non-null  datetime64[ns]
 9   LifterID         232354 non-null  int32         
 10  CompIndex        232354 non-null  int64         
 11  NumComps_x       232354 non-null  int64         
 12  NumComps_y       232354 non-null  int64         
 13  NumComps       

In [59]:
# Step 3: Create Training Examples (Sliding Window Approach)
# For each lifter with N competitions, create N-1 training examples
# Each example uses previous competitions to predict the next competition's lifts

import numpy as np

def extract_historical_features(previous_comps):
    """
    Extract historical performance features from previous competitions.
    
    Parameters:
    -----------
    previous_comps : DataFrame
        DataFrame containing all previous competitions for a lifter
    
    Returns:
    --------
    dict : Dictionary of historical features
    """
    features = {}
    
    # Previous lifts (last competition)
    if len(previous_comps) > 0:
        last_comp = previous_comps.iloc[-1]
        features['PrevBest3SquatKg'] = last_comp.get('Best3SquatKg', np.nan)
        features['PrevBest3BenchKg'] = last_comp.get('Best3BenchKg', np.nan)
        features['PrevBest3DeadliftKg'] = last_comp.get('Best3DeadliftKg', np.nan)
        features['PrevTotalKg'] = last_comp.get('TotalKg', np.nan)
    else:
        features['PrevBest3SquatKg'] = np.nan
        features['PrevBest3BenchKg'] = np.nan
        features['PrevBest3DeadliftKg'] = np.nan
        features['PrevTotalKg'] = np.nan
    
    # All-time PRs from previous competitions
    valid_squat = previous_comps['Best3SquatKg'].dropna()
    valid_bench = previous_comps['Best3BenchKg'].dropna()
    valid_deadlift = previous_comps['Best3DeadliftKg'].dropna()
    valid_total = previous_comps['TotalKg'].dropna()
    
    features['PRBest3SquatKg'] = valid_squat.max() if len(valid_squat) > 0 else np.nan
    features['PRBest3BenchKg'] = valid_bench.max() if len(valid_bench) > 0 else np.nan
    features['PRBest3DeadliftKg'] = valid_deadlift.max() if len(valid_deadlift) > 0 else np.nan
    features['PRTotalKg'] = valid_total.max() if len(valid_total) > 0 else np.nan
    
    # Averages of last 3 competitions (or all if fewer than 3)
    n_recent = min(3, len(previous_comps))
    if n_recent > 0:
        recent_comps = previous_comps.tail(n_recent)
        features['AvgBest3SquatKg_Last3'] = recent_comps['Best3SquatKg'].mean()
        features['AvgBest3BenchKg_Last3'] = recent_comps['Best3BenchKg'].mean()
        features['AvgBest3DeadliftKg_Last3'] = recent_comps['Best3DeadliftKg'].mean()
        features['AvgTotalKg_Last3'] = recent_comps['TotalKg'].mean()
    else:
        features['AvgBest3SquatKg_Last3'] = np.nan
        features['AvgBest3BenchKg_Last3'] = np.nan
        features['AvgBest3DeadliftKg_Last3'] = np.nan
        features['AvgTotalKg_Last3'] = np.nan
    
    # Consistency (standard deviation) across all previous competitions
    features['StdBest3SquatKg'] = valid_squat.std() if len(valid_squat) > 1 else np.nan
    features['StdBest3BenchKg'] = valid_bench.std() if len(valid_bench) > 1 else np.nan
    features['StdBest3DeadliftKg'] = valid_deadlift.std() if len(valid_deadlift) > 1 else np.nan
    features['StdTotalKg'] = valid_total.std() if len(valid_total) > 1 else np.nan
    
    return features

def extract_time_features(current_comp, previous_comps, dates):
    """
    Extract time-based features.
    
    Parameters:
    -----------
    current_comp : Series
        Current competition row
    previous_comps : DataFrame
        Previous competitions
    dates : Series
        Dates for all competitions (current + previous)
    
    Returns:
    --------
    dict : Dictionary of time-based features
    """
    features = {}
    
    # Competition count (which competition number this is)
    features['CompIndex'] = current_comp.get('CompIndex', np.nan)
    
    # Age at current competition
    features['Age'] = current_comp.get('Age', np.nan)
    
    # Days since last competition
    if len(previous_comps) > 0 and 'Date' in current_comp and pd.notna(current_comp['Date']):
        last_date = previous_comps.iloc[-1].get('Date')
        if pd.notna(last_date):
            days_diff = (current_comp['Date'] - last_date).days
            features['DaysSinceLastComp'] = days_diff
        else:
            features['DaysSinceLastComp'] = np.nan
    else:
        features['DaysSinceLastComp'] = np.nan
    
    # Average days between competitions (for this lifter)
    if len(previous_comps) > 0 and 'Date' in previous_comps.columns:
        valid_dates = previous_comps['Date'].dropna()
        if len(valid_dates) > 1:
            date_diffs = valid_dates.diff().dropna()
            features['AvgDaysBetweenComps'] = date_diffs.dt.days.mean() if len(date_diffs) > 0 else np.nan
        else:
            features['AvgDaysBetweenComps'] = np.nan
    else:
        features['AvgDaysBetweenComps'] = np.nan
    
    return features

def extract_context_features(current_comp):
    """
    Extract context features from current competition.
    
    Parameters:
    -----------
    current_comp : Series
        Current competition row
    
    Returns:
    --------
    dict : Dictionary of context features
    """
    features = {}
    
    # Weight class (keep as string for now, can encode later)
    features['WeightClassKg'] = current_comp.get('WeightClassKg', np.nan)
    
    # Sex (M/F)
    features['Sex'] = current_comp.get('Sex', np.nan)
    
    # Division (Open, Junior, etc.)
    features['Division'] = current_comp.get('Division', np.nan)
    
    return features

def extract_trend_features(all_previous_comps):
    """
    Extract trend/improvement features from all previous competitions.
    
    Parameters:
    -----------
    all_previous_comps : DataFrame
        All previous competitions sorted by CompIndex
    
    Returns:
    --------
    dict : Dictionary of trend features
    """
    features = {}
    
    if len(all_previous_comps) == 0:
        # No previous competitions
        features['SquatImprovementRate'] = np.nan
        features['BenchImprovementRate'] = np.nan
        features['DeadliftImprovementRate'] = np.nan
        features['SquatImprovementDirection'] = 0
        features['BenchImprovementDirection'] = 0
        features['DeadliftImprovementDirection'] = 0
        features['CompsSincePRSquat'] = np.nan
        features['CompsSincePRBench'] = np.nan
        features['CompsSincePRDeadlift'] = np.nan
        return features
    
    # Improvement rate (slope) - change per competition
    valid_comps = all_previous_comps.dropna(subset=['Best3SquatKg', 'Best3BenchKg', 'Best3DeadliftKg'], how='all')
    
    if len(valid_comps) >= 2:
        # Calculate slope using linear regression (simple: (last - first) / (n_comps - 1))
        comp_indices = valid_comps['CompIndex'].values
        squat_values = valid_comps['Best3SquatKg'].values
        bench_values = valid_comps['Best3BenchKg'].values
        deadlift_values = valid_comps['Best3DeadliftKg'].values
        
        # Squat improvement rate
        if len(squat_values[~np.isnan(squat_values)]) >= 2:
            valid_squat = ~np.isnan(squat_values)
            if valid_squat.sum() >= 2:
                valid_indices = comp_indices[valid_squat]
                valid_squat_vals = squat_values[valid_squat]
                if len(valid_indices) > 1:
                    features['SquatImprovementRate'] = (valid_squat_vals[-1] - valid_squat_vals[0]) / (valid_indices[-1] - valid_indices[0])
                else:
                    features['SquatImprovementRate'] = np.nan
            else:
                features['SquatImprovementRate'] = np.nan
        else:
            features['SquatImprovementRate'] = np.nan
        
        # Bench improvement rate
        if len(bench_values[~np.isnan(bench_values)]) >= 2:
            valid_bench = ~np.isnan(bench_values)
            if valid_bench.sum() >= 2:
                valid_indices = comp_indices[valid_bench]
                valid_bench_vals = bench_values[valid_bench]
                if len(valid_indices) > 1:
                    features['BenchImprovementRate'] = (valid_bench_vals[-1] - valid_bench_vals[0]) / (valid_indices[-1] - valid_indices[0])
                else:
                    features['BenchImprovementRate'] = np.nan
            else:
                features['BenchImprovementRate'] = np.nan
        else:
            features['BenchImprovementRate'] = np.nan
        
        # Deadlift improvement rate
        if len(deadlift_values[~np.isnan(deadlift_values)]) >= 2:
            valid_deadlift = ~np.isnan(deadlift_values)
            if valid_deadlift.sum() >= 2:
                valid_indices = comp_indices[valid_deadlift]
                valid_deadlift_vals = deadlift_values[valid_deadlift]
                if len(valid_indices) > 1:
                    features['DeadliftImprovementRate'] = (valid_deadlift_vals[-1] - valid_deadlift_vals[0]) / (valid_indices[-1] - valid_indices[0])
                else:
                    features['DeadliftImprovementRate'] = np.nan
            else:
                features['DeadliftImprovementRate'] = np.nan
        else:
            features['DeadliftImprovementRate'] = np.nan
        
        # Improvement direction (comparing last 2 competitions)
        if len(valid_comps) >= 2:
            last_two = valid_comps.tail(2)
            last_squat = last_two.iloc[-1]['Best3SquatKg']
            prev_squat = last_two.iloc[-2]['Best3SquatKg']
            if pd.notna(last_squat) and pd.notna(prev_squat):
                features['SquatImprovementDirection'] = 1 if last_squat > prev_squat else (-1 if last_squat < prev_squat else 0)
            else:
                features['SquatImprovementDirection'] = 0
            
            last_bench = last_two.iloc[-1]['Best3BenchKg']
            prev_bench = last_two.iloc[-2]['Best3BenchKg']
            if pd.notna(last_bench) and pd.notna(prev_bench):
                features['BenchImprovementDirection'] = 1 if last_bench > prev_bench else (-1 if last_bench < prev_bench else 0)
            else:
                features['BenchImprovementDirection'] = 0
            
            last_deadlift = last_two.iloc[-1]['Best3DeadliftKg']
            prev_deadlift = last_two.iloc[-2]['Best3DeadliftKg']
            if pd.notna(last_deadlift) and pd.notna(prev_deadlift):
                features['DeadliftImprovementDirection'] = 1 if last_deadlift > prev_deadlift else (-1 if last_deadlift < prev_deadlift else 0)
            else:
                features['DeadliftImprovementDirection'] = 0
        else:
            features['SquatImprovementDirection'] = 0
            features['BenchImprovementDirection'] = 0
            features['DeadliftImprovementDirection'] = 0
        
        # Comps since PR (number of competitions since achieving personal record)
        pr_squat = all_previous_comps['Best3SquatKg'].max()
        pr_bench = all_previous_comps['Best3BenchKg'].max()
        pr_deadlift = all_previous_comps['Best3DeadliftKg'].max()
        
        if pd.notna(pr_squat):
            pr_squat_comps = all_previous_comps[all_previous_comps['Best3SquatKg'] == pr_squat]
            if len(pr_squat_comps) > 0:
                last_pr_comp = pr_squat_comps['CompIndex'].max()
                last_comp = all_previous_comps['CompIndex'].max()
                features['CompsSincePRSquat'] = last_comp - last_pr_comp
            else:
                features['CompsSincePRSquat'] = np.nan
        else:
            features['CompsSincePRSquat'] = np.nan
        
        if pd.notna(pr_bench):
            pr_bench_comps = all_previous_comps[all_previous_comps['Best3BenchKg'] == pr_bench]
            if len(pr_bench_comps) > 0:
                last_pr_comp = pr_bench_comps['CompIndex'].max()
                last_comp = all_previous_comps['CompIndex'].max()
                features['CompsSincePRBench'] = last_comp - last_pr_comp
            else:
                features['CompsSincePRBench'] = np.nan
        else:
            features['CompsSincePRBench'] = np.nan
        
        if pd.notna(pr_deadlift):
            pr_deadlift_comps = all_previous_comps[all_previous_comps['Best3DeadliftKg'] == pr_deadlift]
            if len(pr_deadlift_comps) > 0:
                last_pr_comp = pr_deadlift_comps['CompIndex'].max()
                last_comp = all_previous_comps['CompIndex'].max()
                features['CompsSincePRDeadlift'] = last_comp - last_pr_comp
            else:
                features['CompsSincePRDeadlift'] = np.nan
        else:
            features['CompsSincePRDeadlift'] = np.nan
    else:
        # Not enough competitions for trend analysis
        features['SquatImprovementRate'] = np.nan
        features['BenchImprovementRate'] = np.nan
        features['DeadliftImprovementRate'] = np.nan
        features['SquatImprovementDirection'] = 0
        features['BenchImprovementDirection'] = 0
        features['DeadliftImprovementDirection'] = 0
        features['CompsSincePRSquat'] = np.nan
        features['CompsSincePRBench'] = np.nan
        features['CompsSincePRDeadlift'] = np.nan
    
    return features

def create_training_examples(df):
    """
    Create training examples using sliding window approach.
    
    For each lifter with N competitions, creates N-1 training examples:
    - Example 1: Use comp 1 → predict comp 2
    - Example 2: Use comps 1-2 → predict comp 3
    - Example 3: Use comps 1-3 → predict comp 4
    - etc.
    
    Parameters:
    -----------
    df : DataFrame
        DataFrame with per-lifter competition histories (sorted by LifterID and Date)
        Must have columns: LifterID, CompIndex, Date, and lift columns
    
    Returns:
    --------
    df_training : DataFrame
        DataFrame with features (X) and targets (y) for training
    """
    training_examples = []
    
    # Group by LifterID
    for lifter_id, lifter_group in df.groupby('LifterID'):
        # Sort by CompIndex to ensure chronological order
        lifter_group = lifter_group.sort_values('CompIndex').reset_index(drop=True)
        
        # For each competition from 2nd onwards (we need at least 1 previous comp)
        for idx in range(1, len(lifter_group)):
            current_comp = lifter_group.iloc[idx]
            previous_comps = lifter_group.iloc[:idx]  # All competitions before current
            
            # Extract all features
            example = {}
            
            # Metadata
            example['LifterID'] = lifter_id
            example['CompIndex'] = current_comp['CompIndex']
            
            # Historical features
            hist_features = extract_historical_features(previous_comps)
            example.update(hist_features)
            
            # Time-based features
            time_features = extract_time_features(current_comp, previous_comps, lifter_group['Date'])
            example.update(time_features)
            
            # Context features
            context_features = extract_context_features(current_comp)
            example.update(context_features)
            
            # Trend features
            trend_features = extract_trend_features(previous_comps)
            example.update(trend_features)
            
            # Target variables (what we're trying to predict)
            example['NextBest3SquatKg'] = current_comp.get('Best3SquatKg', np.nan)
            example['NextBest3BenchKg'] = current_comp.get('Best3BenchKg', np.nan)
            example['NextBest3DeadliftKg'] = current_comp.get('Best3DeadliftKg', np.nan)
            
            training_examples.append(example)
    
    # Convert to DataFrame
    df_training = pd.DataFrame(training_examples)
    
    return df_training

print("Feature extraction functions defined.")
print("Ready to create training examples...")

Feature extraction functions defined.
Ready to create training examples...
