# Experience Analysis

This notebook analyzes lifter experience metrics (years competing, meet count) and their relationship with performance.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from utils import categorize_ipf_weightclass, map_division_to_age_group, categorize_age_group, create_quality_filter, categorize_lifters, categorize_federation_testing_status

def _print_banner(title):
    """Print a section title framed above and below by an 80-character rule."""
    print("\n" + "="*80)
    print(title)
    print("="*80)


# Read the processed dataset that the rest of this cell filters and enriches
df = pd.read_parquet("../data/processed/full_dataset.parquet")
print(f"Dataset loaded: {df.shape[0]:,} rows, {df.shape[1]} columns")

# Derive the IPF weight class row by row (the original notes the quality
# filter needs it); skipped when either input column is absent
if {'BodyweightKg', 'Sex'}.issubset(df.columns):
    df['IPF_WeightClass'] = df.apply(
        lambda row: categorize_ipf_weightclass(row['BodyweightKg'], row['Sex']),
        axis=1
    )

# Build the quality mask and report how many entries it removes / keeps
_print_banner("APPLYING QUALITY FILTER")
quality_mask = create_quality_filter(df)
filtered_out = (~quality_mask).sum()
print(f"Quality filter will exclude {filtered_out:,} entries ({filtered_out/len(df)*100:.2f}%)")
print(f"  Remaining entries for analysis: {quality_mask.sum():,} ({quality_mask.sum()/len(df)*100:.2f}%)")

# From here on `df` is the filtered dataset (copied so later column
# assignments do not write into a view of the raw frame)
df = df[quality_mask].copy()

# Map each Division to an age group; without a Division column the
# AgeGroup column is still created, filled with None
if 'Division' not in df.columns:
    print("Warning: Cannot add Age Group - missing Division column")
    df['AgeGroup'] = None
else:
    _print_banner("ADDING AGE GROUP CATEGORIZATION")
    df['AgeGroup'] = df['Division'].apply(map_division_to_age_group)
    print("Age Group categorization complete (using Division - 100% coverage)")
    print(f"Age Group distribution:\n{df['AgeGroup'].value_counts()}")
print(f"\n✓ Using filtered dataset with {len(df):,} entries")

# Lifter categorization (New/Intermediate/Advanced) is delegated to utils
_print_banner("CATEGORIZING LIFTERS")
df = categorize_lifters(df)
print("✓ Lifters categorized into New, Intermediate, and Advanced")

# Federation testing status (Drug Tested vs Untested) is delegated to utils;
# the resulting FederationTestingStatus column is summarized below
_print_banner("CATEGORIZING FEDERATION TESTING STATUS")
df = categorize_federation_testing_status(df)
testing_status_counts = df['FederationTestingStatus'].value_counts()
print("Federation Testing Status distribution:")
print(testing_status_counts)
print(f"✓ Federation testing status categorized")

# Split into one independent frame per lifter category (only when the
# LifterCategory column is present)
if 'LifterCategory' in df.columns:
    new_lifters_df, intermediate_lifters_df, advanced_lifters_df = (
        df[df['LifterCategory'] == category].copy()
        for category in ('New', 'Intermediate', 'Advanced')
    )

    print(f"\nCategory breakdown:")
    print(f"  New lifters: {len(new_lifters_df):,} entries")
    print(f"  Intermediate lifters: {len(intermediate_lifters_df):,} entries")
    print(f"  Advanced lifters: {len(advanced_lifters_df):,} entries")


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Reuse the dataset prepared by the setup cell rather than reloading it.
# Reading the raw parquet again here would replace the quality-filtered `df`
# and drop the columns the setup cell added (IPF_WeightClass, AgeGroup,
# FederationTestingStatus), which later cells in this notebook filter on.
if 'df' not in globals() or 'FederationTestingStatus' not in df.columns:
    # Fallback for running this cell without the setup cell: load the raw
    # file so the notebook still has a frame, but make the limitation visible.
    df = pd.read_parquet("../data/processed/full_dataset.parquet")
    print("Warning: setup cell has not been run - loaded the unfiltered dataset; "
          "FederationTestingStatus/IPF_WeightClass/AgeGroup may be missing")
print(f"Dataset loaded: {df.shape[0]:,} rows")


## 1. Calculate Experience Metrics


In [None]:
# Calculate per-lifter experience metrics and attach them to every row of `df`:
#   FirstMeetDate  - earliest MeetDate recorded for the lifter's Name
#   YearsCompeting - years between FirstMeetDate and the row's MeetDate
#   TotalMeets     - ordinal of the row's meet date among the lifter's
#                    distinct meet dates (1 = first date)
#   MeetFrequency  - TotalMeets / (YearsCompeting + 0.1)
# Lifters are identified by Name only, so distinct people sharing a name are
# treated as one lifter.

if 'Name' in df.columns and 'MeetDate' in df.columns:
    # Make the cell idempotent: without this, re-running it merges the derived
    # columns a second time, pandas suffixes them with _x/_y, and the
    # df['FirstMeetDate'] lookup below raises KeyError.
    df = df.drop(
        columns=['FirstMeetDate', 'YearsCompeting', 'TotalMeets', 'MeetFrequency'],
        errors='ignore'
    )

    # The date arithmetic below (.dt.days) needs a datetime64 column; this is
    # a no-op when MeetDate is already datetime64 and converts strings or
    # date objects otherwise.
    df['MeetDate'] = pd.to_datetime(df['MeetDate'])

    # Get first meet date for each lifter
    lifter_first_meet = df.groupby('Name')['MeetDate'].min().reset_index()
    lifter_first_meet.columns = ['Name', 'FirstMeetDate']

    # Merge back to main dataframe (left join keeps every original row)
    df = df.merge(lifter_first_meet, on='Name', how='left')

    # Calculate years competing (at time of each meet); 365.25 averages in leap years
    df['YearsCompeting'] = (df['MeetDate'] - df['FirstMeetDate']).dt.days / 365.25

    # Number each lifter's distinct meet dates in chronological order, so
    # several rows on the same date share one meet number
    lifter_meet_counts = df.groupby(['Name', 'MeetDate']).size().reset_index(name='MeetNumber')
    lifter_meet_counts = lifter_meet_counts.sort_values(['Name', 'MeetDate'])
    lifter_meet_counts['TotalMeets'] = lifter_meet_counts.groupby('Name').cumcount() + 1

    # Merge back
    df = df.merge(lifter_meet_counts[['Name', 'MeetDate', 'TotalMeets']],
                  on=['Name', 'MeetDate'], how='left')

    # Calculate meet frequency (meets per year). The +0.1 year offset avoids
    # division by zero at a lifter's first meet, where YearsCompeting is 0.
    df['MeetFrequency'] = df['TotalMeets'] / (df['YearsCompeting'] + 0.1)

    print("Experience metrics calculated!")
    print(f"\nYears Competing stats:")
    print(df['YearsCompeting'].describe())
    print(f"\nTotal Meets stats:")
    print(df['TotalMeets'].describe())
else:
    print("Required columns (Name, MeetDate) not found")


## 2. Experience vs Performance Analysis


In [None]:
# Build the analysis subset: one named mask per validity rule, then combine.
# Rows must have 0-50 years of competing, a positive non-null total, and a
# known federation testing status.
years_in_range = (df['YearsCompeting'] >= 0) & (df['YearsCompeting'] <= 50)
has_positive_total = df['TotalKg'].notna() & (df['TotalKg'] > 0)
has_testing_status = df['FederationTestingStatus'].isin(['Drug Tested', 'Untested'])

analysis_df = df[years_in_range & has_positive_total & has_testing_status].copy()

print(f"Analysis dataset: {len(analysis_df):,} rows")
print(f"\nSplit by Federation Testing Status:")
print(analysis_df['FederationTestingStatus'].value_counts())

# Performance metric: first of Dots, Wilks that exists as a column,
# otherwise fall back to the raw total
base_score_col = next(
    (candidate for candidate in ('Dots', 'Wilks') if candidate in analysis_df.columns),
    'TotalKg'
)

print(f"Using {base_score_col} as performance metric")


In [None]:
# Plot: Years competing vs Performance (Split by Testing Status)
# One row of panels per testing status: a sampled scatter with a linear trend
# line on the left, a box plot per experience bin on the right.

# Bin experience once on analysis_df itself rather than on each filtered
# slice: this avoids pandas' SettingWithCopyWarning and makes the
# ExperienceBin column available to the later cell that groups analysis_df
# by it. include_lowest=True keeps YearsCompeting == 0 (each lifter's first
# meet) in the '<1yr' bin; the default left-open first interval drops it.
analysis_df['ExperienceBin'] = pd.cut(analysis_df['YearsCompeting'],
                                      bins=[0, 1, 3, 5, 10, 50],
                                      labels=['<1yr', '1-3yr', '3-5yr', '5-10yr', '10+yr'],
                                      include_lowest=True)

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

for idx, testing_status in enumerate(['Drug Tested', 'Untested']):
    status_df = analysis_df[analysis_df['FederationTestingStatus'] == testing_status]

    if len(status_df) > 0:
        # Scatter plot with trend line
        ax1 = axes[idx, 0]
        # Cap the scatter at 10,000 points; a fixed seed makes the figure
        # reproducible across runs
        sample_df = status_df.sample(min(10000, len(status_df)), random_state=42)
        ax1.scatter(sample_df['YearsCompeting'], sample_df[base_score_col],
                   alpha=0.1, s=1)
        # Add trend line. Drop incomplete (x, y) pairs together: dropping NaNs
        # from each column separately leaves the two arrays misaligned or of
        # different lengths whenever the score column has gaps.
        trend_df = sample_df[['YearsCompeting', base_score_col]].dropna()
        if len(trend_df) > 10 and trend_df['YearsCompeting'].nunique() > 1:
            z = np.polyfit(trend_df['YearsCompeting'], trend_df[base_score_col], 1)
            p = np.poly1d(z)
            x_trend = np.linspace(trend_df['YearsCompeting'].min(),
                                 trend_df['YearsCompeting'].max(), 100)
            ax1.plot(x_trend, p(x_trend), "r--", linewidth=2, label='Trend line')
            # Only draw a legend when there is a labelled artist to show
            ax1.legend()
        ax1.set_xlabel('Years Competing')
        ax1.set_ylabel(base_score_col)
        ax1.set_title(f'Years Competing vs Performance - {testing_status}')
        ax1.grid(True, alpha=0.3)

        # Box plot by experience bins
        ax2 = axes[idx, 1]
        box_data = status_df[status_df['ExperienceBin'].notna()]
        if len(box_data) > 0:
            sns.boxplot(data=box_data, x='ExperienceBin', y=base_score_col, ax=ax2)
        ax2.set_title(f'Performance by Experience Level - {testing_status}')
        ax2.set_xlabel('Years Competing')
        ax2.tick_params(axis='x', rotation=45)
        ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../data/processed/experience_vs_performance.png', dpi=150, bbox_inches='tight')
plt.show()


In [None]:
# Plot: Meet count vs Performance
# Left: sampled scatter of meet number against the performance metric with a
# linear trend line. Right: mean and median performance per meet number.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Scatter plot
ax1 = axes[0]
# Size the sample from the subset actually being sampled: using
# len(analysis_df) here makes .sample() raise ValueError whenever fewer rows
# than that survive the TotalMeets <= 50 cut. The seed keeps the figure
# reproducible across runs.
capped_df = analysis_df[analysis_df['TotalMeets'] <= 50]
sample_df = capped_df.sample(min(10000, len(capped_df)), random_state=42)
ax1.scatter(sample_df['TotalMeets'], sample_df[base_score_col],
           alpha=0.1, s=1)
# Add trend line. Incomplete (x, y) pairs are dropped together so the two
# arrays passed to polyfit stay aligned; a line needs at least two distinct
# x values.
trend_df = sample_df[['TotalMeets', base_score_col]].dropna()
if len(trend_df) >= 2 and trend_df['TotalMeets'].nunique() > 1:
    z = np.polyfit(trend_df['TotalMeets'], trend_df[base_score_col], 1)
    p = np.poly1d(z)
    x_trend = np.linspace(trend_df['TotalMeets'].min(),
                         trend_df['TotalMeets'].max(), 100)
    ax1.plot(x_trend, p(x_trend), "r--", linewidth=2, label='Trend line')
    ax1.legend()
ax1.set_xlabel('Total Meets')
ax1.set_ylabel(base_score_col)
ax1.set_title('Total Meets vs Performance')
ax1.grid(True, alpha=0.3)

# Average performance by meet number
ax2 = axes[1]
meet_perf = analysis_df.groupby('TotalMeets')[base_score_col].agg(['mean', 'median', 'count']).reset_index()
meet_perf = meet_perf[meet_perf['count'] >= 10]  # Only show bins with at least 10 observations
ax2.plot(meet_perf['TotalMeets'], meet_perf['mean'], label='Mean', linewidth=2)
ax2.plot(meet_perf['TotalMeets'], meet_perf['median'], label='Median', linewidth=2)
ax2.set_xlabel('Total Meets')
ax2.set_ylabel(f'Average {base_score_col}')
ax2.set_title('Average Performance by Meet Number')
ax2.legend()
ax2.grid(True, alpha=0.3)
ax2.set_xlim(0, 30)  # Focus on first 30 meets

plt.tight_layout()
plt.savefig('../data/processed/meet_count_vs_performance.png', dpi=150, bbox_inches='tight')
plt.show()


## 3. Improvement Curve Analysis


In [None]:
# Identify typical improvement patterns
# 1) Summarize the performance metric per experience bin.
# 2) For a subset of lifters, estimate an improvement rate as
#    (last score - first score) / years between their first and last meet.

# The summary below groups analysis_df by ExperienceBin, so make sure that
# column exists on analysis_df itself; otherwise the groupby raises KeyError.
# include_lowest=True keeps YearsCompeting == 0 (first meets) in '<1yr'.
if 'ExperienceBin' not in analysis_df.columns:
    analysis_df['ExperienceBin'] = pd.cut(analysis_df['YearsCompeting'],
                                          bins=[0, 1, 3, 5, 10, 50],
                                          labels=['<1yr', '1-3yr', '3-5yr', '5-10yr', '10+yr'],
                                          include_lowest=True)

# observed=False lists every bin (even empty ones) and states explicitly the
# behaviour the bare groupby relied on by default
experience_summary = analysis_df.groupby('ExperienceBin', observed=False).agg({
    base_score_col: ['mean', 'median', 'std', 'count'],
    'YearsCompeting': 'mean'
}).round(2)

print("=== Performance by Experience Level ===")
print(experience_summary)

# Calculate improvement rate (change per year)
# For lifters with multiple meets, calculate average improvement.
# The subset is the first 1000 names in order of appearance (not a random
# sample), kept small for speed.
sampled_names = analysis_df['Name'].unique()[:1000]
sampled_rows = analysis_df[analysis_df['Name'].isin(sampled_names)]

lifter_improvements = []
# A single groupby pass replaces re-filtering the whole frame once per name;
# sort=False yields lifters in order of first appearance, matching the order
# of sampled_names.
for name, lifter_rows in sampled_rows.groupby('Name', sort=False):
    if len(lifter_rows) < 2:
        continue
    lifter_data = lifter_rows.sort_values('MeetDate')
    first_meet = lifter_data.iloc[0]
    last_meet = lifter_data.iloc[-1]
    first_score = first_meet[base_score_col]
    last_score = last_meet[base_score_col]
    years = last_meet['YearsCompeting'] - first_meet['YearsCompeting']
    # Skip lifters whose meets all fall on one date (zero elapsed time) or
    # whose first/last score is missing
    if years > 0 and pd.notna(first_score) and pd.notna(last_score):
        lifter_improvements.append({
            'Name': name,
            'ImprovementRate': (last_score - first_score) / years,
            'YearsCompeting': last_meet['YearsCompeting']
        })

if lifter_improvements:
    improvement_df = pd.DataFrame(lifter_improvements)
    print(f"\n=== Improvement Rate Analysis (sample of {len(improvement_df)} lifters) ===")
    print(improvement_df['ImprovementRate'].describe())

    # Plot improvement rate by experience (binned on each lifter's
    # YearsCompeting at their last meet)
    plt.figure(figsize=(10, 6))
    improvement_df['ExpBin'] = pd.cut(improvement_df['YearsCompeting'],
                                     bins=[0, 1, 3, 5, 10, 50],
                                     labels=['<1yr', '1-3yr', '3-5yr', '5-10yr', '10+yr'])
    sns.boxplot(data=improvement_df[improvement_df['ExpBin'].notna()],
               x='ExpBin', y='ImprovementRate')
    plt.title('Improvement Rate by Experience Level')
    plt.xlabel('Years Competing')
    plt.ylabel(f'Improvement Rate ({base_score_col}/year)')
    plt.xticks(rotation=45)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig('../data/processed/improvement_rate.png', dpi=150, bbox_inches='tight')
    plt.show()


In [None]:
# Improvement Analysis by Gender × Weight Class × Age Group × Testing Status
# For each testing status: estimate a per-lifter improvement rate
# ((last score - first score) / years between first and last meet), tabulate
# it by Sex × IPF_WeightClass × AgeGroup, and draw one box-plot panel per sex.
if 'IPF_WeightClass' in analysis_df.columns and 'Sex' in analysis_df.columns and 'AgeGroup' in analysis_df.columns:
    for testing_status in ['Drug Tested', 'Untested']:
        status_df = analysis_df[analysis_df['FederationTestingStatus'] == testing_status]

        if len(status_df) > 0:
            print(f"\n{'='*80}")
            print(f"IMPROVEMENT RATE ANALYSIS - {testing_status.upper()}")
            print(f"{'='*80}")

            # Calculate improvement rates by group.
            # The subset is the first `sample_size` names in order of
            # appearance (not a random sample).
            lifter_improvements_by_group = []
            sample_size = 5000
            unique_names = status_df['Name'].unique()
            sampled_rows = status_df[status_df['Name'].isin(unique_names[:sample_size])]

            # A single groupby pass replaces re-filtering the whole frame once
            # per name; sort=False keeps lifters in order of first appearance,
            # so the resulting rows come out in the same order as before.
            for name, lifter_rows in sampled_rows.groupby('Name', sort=False):
                if len(lifter_rows) < 2:
                    continue
                lifter_data = lifter_rows.sort_values('MeetDate')
                first_meet = lifter_data.iloc[0]
                last_meet = lifter_data.iloc[-1]
                first_score = first_meet[base_score_col]
                last_score = last_meet[base_score_col]
                years = last_meet['YearsCompeting'] - first_meet['YearsCompeting']
                # Skip lifters with no elapsed time or a missing first/last score
                if years > 0 and pd.notna(first_score) and pd.notna(last_score):
                    lifter_improvements_by_group.append({
                        'Name': name,
                        'ImprovementRate': (last_score - first_score) / years,
                        'YearsCompeting': last_meet['YearsCompeting'],
                        # Group attributes are taken from the lifter's first meet
                        'Sex': first_meet.get('Sex'),
                        'IPF_WeightClass': first_meet.get('IPF_WeightClass'),
                        'AgeGroup': first_meet.get('AgeGroup')
                    })

            if lifter_improvements_by_group:
                improvement_by_group_df = pd.DataFrame(lifter_improvements_by_group)
                # .copy() so the ExpBin assignment below writes to an
                # independent frame instead of a filtered slice
                # (avoids pandas' SettingWithCopyWarning)
                improvement_by_group_df = improvement_by_group_df[
                    (improvement_by_group_df['Sex'].isin(['M', 'F'])) &
                    (improvement_by_group_df['IPF_WeightClass'].notna()) &
                    (improvement_by_group_df['AgeGroup'].notna())
                ].copy()

                print(f"\nImprovement Rate by Gender × Weight Class × Age Group (sample of {len(improvement_by_group_df)} lifters):")
                improvement_stats = improvement_by_group_df.groupby(['Sex', 'IPF_WeightClass', 'AgeGroup'], observed=True)['ImprovementRate'].agg([
                    'count', 'mean', 'median', 'std'
                ]).round(2)
                # Only report groups with at least 10 lifters
                improvement_stats = improvement_stats[improvement_stats['count'] >= 10]
                print(improvement_stats.head(30))  # Show top 30

                # Visualize improvement rates by group
                fig, axes = plt.subplots(2, 1, figsize=(16, 12))

                # Bin on each lifter's YearsCompeting at their last meet
                improvement_by_group_df['ExpBin'] = pd.cut(improvement_by_group_df['YearsCompeting'],
                                                           bins=[0, 1, 3, 5, 10, 50],
                                                           labels=['<1yr', '1-3yr', '3-5yr', '5-10yr', '10+yr'])

                for idx, (sex, label) in enumerate([('F', 'FEMALE'), ('M', 'MALE')]):
                    ax = axes[idx]
                    sex_data = improvement_by_group_df[improvement_by_group_df['Sex'] == sex]

                    if len(sex_data) > 0:
                        sns.boxplot(data=sex_data[sex_data['ExpBin'].notna()],
                                   x='ExpBin', y='ImprovementRate', hue='IPF_WeightClass', ax=ax)
                        ax.set_title(f'Improvement Rate by Experience Level - {label} ({testing_status})', fontsize=14, fontweight='bold')
                        ax.set_xlabel('Years Competing', fontsize=12)
                        ax.set_ylabel(f'Improvement Rate ({base_score_col}/year)', fontsize=12)
                        ax.tick_params(axis='x', rotation=45)
                        ax.legend(title='Weight Class', ncol=4, fontsize=8, bbox_to_anchor=(1.05, 1), loc='upper left')
                        ax.grid(True, alpha=0.3, axis='y')

                plt.tight_layout()
                plt.savefig(f'../data/processed/improvement_rate_by_gender_weightclass_{testing_status.replace(" ", "_").lower()}.png', dpi=150, bbox_inches='tight')
                plt.show()
                # Release the figure: one is created per testing status
                plt.close(fig)


## 4. Key Findings

**Questions to answer:**
- Do newer lifters show different performance patterns?
- What is the typical improvement curve?
- How does experience relate to performance variance?


In [None]:
# Headline numbers for the analysis subset: mean experience, then the
# correlation of each experience measure with the chosen performance metric
print("=== Experience Analysis Summary ===")

years_competing = analysis_df['YearsCompeting']
print(f"\n1. Average years competing: {years_competing.mean():.2f} years")

total_meets = analysis_df['TotalMeets']
print(f"2. Average total meets: {total_meets.mean():.2f} meets")

performance = analysis_df[base_score_col]
print(f"3. Performance correlation with years competing: {years_competing.corr(performance):.3f}")
print(f"4. Performance correlation with meet count: {total_meets.corr(performance):.3f}")

# Persist the per-row experience metrics (without the index) for later use
metric_columns = ['Name', 'MeetDate', 'YearsCompeting', 'TotalMeets', 'MeetFrequency']
experience_metrics = analysis_df[metric_columns].copy()
experience_metrics.to_parquet('../data/processed/experience_metrics.parquet', index=False)
print("\nExperience metrics saved to ../data/processed/experience_metrics.parquet")
