# Consistency Analysis

This notebook computes each lifter's performance variability, measured as the coefficient of variation (CV = standard deviation / mean of the lifter's scores), and identifies consistency patterns.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from utils import categorize_ipf_weightclass, map_division_to_age_group, categorize_age_group, create_quality_filter, categorize_lifters, categorize_federation_testing_status

def _print_banner(title):
    """Print a section title between two 80-character '=' rules, preceded by a blank line."""
    print("\n" + "="*80)
    print(title)
    print("="*80)


def _ipf_class_for_row(row):
    """Return the IPF weight class for one row, from its BodyweightKg and Sex values."""
    return categorize_ipf_weightclass(row['BodyweightKg'], row['Sex'])


# --- Load the processed dataset ----------------------------------------------
df = pd.read_parquet("../data/processed/full_dataset.parquet")
print(f"Dataset loaded: {df.shape[0]:,} rows, {df.shape[1]} columns")

# --- IPF weight class (derived before the quality filter is built) -----------
if {'BodyweightKg', 'Sex'}.issubset(df.columns):
    df['IPF_WeightClass'] = df.apply(_ipf_class_for_row, axis=1)

# --- Quality filter: report how much is excluded, then keep the rest ---------
_print_banner("APPLYING QUALITY FILTER")
quality_mask = create_quality_filter(df)
filtered_out = (~quality_mask).sum()
print(f"Quality filter will exclude {filtered_out:,} entries ({filtered_out/len(df)*100:.2f}%)")
print(f"  Remaining entries for analysis: {quality_mask.sum():,} ({quality_mask.sum()/len(df)*100:.2f}%)")
# From here on `df` is the filtered copy
df = df[quality_mask].copy()

# --- Age group, mapped from the Division column ------------------------------
if 'Division' not in df.columns:
    print("Warning: Cannot add Age Group - missing Division column")
    df['AgeGroup'] = None
else:
    _print_banner("ADDING AGE GROUP CATEGORIZATION")
    df['AgeGroup'] = df['Division'].apply(map_division_to_age_group)
    print("Age Group categorization complete (using Division - 100% coverage)")
    print(f"Age Group distribution:\n{df['AgeGroup'].value_counts()}")
print(f"\n✓ Using filtered dataset with {len(df):,} entries")

# --- Lifter experience category (New / Intermediate / Advanced) --------------
_print_banner("CATEGORIZING LIFTERS")
df = categorize_lifters(df)
print("✓ Lifters categorized into New, Intermediate, and Advanced")

# --- Federation testing status (Drug Tested vs Untested) ---------------------
_print_banner("CATEGORIZING FEDERATION TESTING STATUS")
df = categorize_federation_testing_status(df)
testing_status_counts = df['FederationTestingStatus'].value_counts()
print("Federation Testing Status distribution:")
print(testing_status_counts)
print(f"✓ Federation testing status categorized")

# --- One independent copy of the data per lifter category --------------------
if 'LifterCategory' in df.columns:
    new_lifters_df, intermediate_lifters_df, advanced_lifters_df = (
        df[df['LifterCategory'] == category_label].copy()
        for category_label in ('New', 'Intermediate', 'Advanced')
    )

    print(f"\nCategory breakdown:")
    print(f"  New lifters: {len(new_lifters_df):,} entries")
    print(f"  Intermediate lifters: {len(intermediate_lifters_df):,} entries")
    print(f"  Advanced lifters: {len(advanced_lifters_df):,} entries")


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Reuse the filtered, categorized `df` prepared by the setup cell at the top of
# this notebook. Re-reading the parquet file here would replace it with the raw
# data, silently dropping the quality filter and the AgeGroup / LifterCategory /
# FederationTestingStatus columns added there.
if 'df' not in globals():
    # Only reached when this cell is run without the setup cell (out-of-order
    # execution); fall back to the raw file so the cell still works on its own.
    df = pd.read_parquet("../data/processed/full_dataset.parquet")
    print("Warning: setup cell has not been run - using the raw, unfiltered dataset")
print(f"Dataset loaded: {df.shape[0]:,} rows")


## 1. Calculate Consistency Metrics


In [None]:
# Choose the performance metric: prefer Dots, then Wilks, otherwise fall back to TotalKg
base_score_col = next(
    (candidate for candidate in ('Dots', 'Wilks') if candidate in df.columns),
    'TotalKg',
)

# Keep only rows whose score is present and strictly positive
score_values = df[base_score_col]
has_valid_score = score_values.notna() & (score_values > 0)
analysis_df = df[has_valid_score].copy()

print(f"Using {base_score_col} as performance metric")
print(f"Analysis dataset: {len(analysis_df):,} rows")


In [None]:
# Calculate per-lifter consistency metrics.
#
# Lifters are identified by the Name column, and only lifters with at least
# `min_meets` scored meets are kept. The resulting `lifter_stats` frame has one
# row per lifter with:
#   MeetCount, MeanScore, StdScore, MinScore, MaxScore  - summary of base_score_col
#   FirstMeet, LastMeet                                 - min / max of MeetDate
#   CV                                                  - StdScore / MeanScore
#   PeakToAvgRatio                                      - MaxScore / MeanScore
#   FirstScore, LastScore, Improvement, ImprovementPct  - earliest vs latest meet
#   FederationTestingStatus                             - lifter's most frequent status
if 'Name' in analysis_df.columns:
    min_meets = 3

    # Named aggregation yields flat, already-labelled columns.
    lifter_stats = analysis_df.groupby('Name').agg(
        MeetCount=(base_score_col, 'count'),
        MeanScore=(base_score_col, 'mean'),
        StdScore=(base_score_col, 'std'),
        MinScore=(base_score_col, 'min'),
        MaxScore=(base_score_col, 'max'),
        FirstMeet=('MeetDate', 'min'),
        LastMeet=('MeetDate', 'max'),
    ).reset_index()

    # Filter to lifters with at least `min_meets` meets
    lifter_stats = lifter_stats[lifter_stats['MeetCount'] >= min_meets].copy()

    # Coefficient of variation (CV) = std / mean
    lifter_stats['CV'] = lifter_stats['StdScore'] / lifter_stats['MeanScore']

    # Peak vs average ratio
    lifter_stats['PeakToAvgRatio'] = lifter_stats['MaxScore'] / lifter_stats['MeanScore']

    # Restrict the per-meet rows to the retained lifters so the steps below do
    # not spend time on lifters that were just filtered out.
    eligible_meets = analysis_df[analysis_df['Name'].isin(lifter_stats['Name'])]

    # Improvement trend (simple: last - first). Sort once for the whole frame;
    # the stable sort keeps meets that share a date in their original order, so
    # the result is deterministic. groupby preserves row order within a group.
    lifter_first_last = (
        eligible_meets.sort_values('MeetDate', kind='stable')
        .groupby('Name')[base_score_col]
        .agg(FirstScore='first', LastScore='last')
        .reset_index()
    )

    lifter_stats = lifter_stats.merge(lifter_first_last, on='Name', how='left')
    lifter_stats['Improvement'] = lifter_stats['LastScore'] - lifter_stats['FirstScore']
    lifter_stats['ImprovementPct'] = (lifter_stats['Improvement'] / lifter_stats['FirstScore']) * 100

    # Later sections split lifter_stats by FederationTestingStatus, so carry it
    # over from the per-meet rows. If the column is missing (e.g. the dataset
    # was reloaded after the setup cell), derive it with the same helper the
    # setup cell uses; a copy is passed so analysis_df is not modified.
    if 'FederationTestingStatus' in eligible_meets.columns:
        status_source = eligible_meets
    else:
        status_source = categorize_federation_testing_status(eligible_meets.copy())

    # A lifter may have meets under both statuses: keep the most frequent one.
    # Ties resolve to the alphabetically first status because groupby sorts its
    # keys and the sort below is stable.
    lifter_status = (
        status_source.groupby(['Name', 'FederationTestingStatus'], observed=True)
        .size()
        .reset_index(name='StatusMeets')
        .sort_values(['Name', 'StatusMeets'], ascending=[True, False], kind='stable')
        .drop_duplicates('Name')[['Name', 'FederationTestingStatus']]
    )
    lifter_stats = lifter_stats.merge(lifter_status, on='Name', how='left')

    print(f"Analyzed {len(lifter_stats):,} lifters with 3+ meets")
    print(f"\nConsistency statistics:")
    print(lifter_stats[['MeetCount', 'CV', 'PeakToAvgRatio', 'ImprovementPct']].describe())
else:
    # NOTE(review): lifter_stats stays undefined on this path, so the sections
    # below cannot run.
    print("Name column not found")


## 2. Consistency Distribution Analysis


In [None]:
# Visualize consistency metrics in a 2x2 figure and save it as a PNG.

def _plot_metric_histogram(ax, values, xlabel, title, median_label, median_color='r'):
    """Draw a 50-bin histogram of `values` on `ax` with a dashed median line.

    Parameters:
        ax: matplotlib Axes to draw on.
        values: pandas Series holding one value per lifter.
        xlabel: x-axis label.
        title: panel title.
        median_label: format string for the legend entry; receives the median
            as its only positional argument (e.g. 'Median: {:.3f}').
        median_color: colour of the median line.
    """
    values.hist(bins=50, edgecolor='black', alpha=0.7, ax=ax)
    median_value = values.median()
    ax.axvline(median_value, color=median_color, linestyle='--',
               label=median_label.format(median_value))
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Number of Lifters')
    ax.set_title(title)
    # The legend picks up every labelled line already on the axes, including
    # any reference line drawn before this helper was called.
    ax.legend()
    ax.grid(True, alpha=0.3)


scatter_sample_size = 5000
scatter_seed = 42  # fixed so the saved figure is identical on every run

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# CV distribution
_plot_metric_histogram(
    axes[0, 0], lifter_stats['CV'],
    xlabel='Coefficient of Variation (CV)',
    title='Distribution of Performance Consistency (CV)',
    median_label='Median: {:.3f}',
)

# Peak to average ratio
_plot_metric_histogram(
    axes[0, 1], lifter_stats['PeakToAvgRatio'],
    xlabel='Peak to Average Ratio',
    title='Distribution of Peak vs Average Performance',
    median_label='Median: {:.3f}',
)

# CV vs Meet Count (random subset of lifters to keep the scatter readable)
ax3 = axes[1, 0]
sample_lifters = lifter_stats.sample(
    min(scatter_sample_size, len(lifter_stats)), random_state=scatter_seed
)
ax3.scatter(sample_lifters['MeetCount'], sample_lifters['CV'], alpha=0.3, s=10)
ax3.set_xlabel('Number of Meets')
ax3.set_ylabel('Coefficient of Variation (CV)')
ax3.set_title('Consistency vs Experience')
ax3.grid(True, alpha=0.3)

# Improvement percentage distribution; the zero reference line is drawn first
# so it appears in the legend ahead of the median.
ax4 = axes[1, 1]
ax4.axvline(0, color='r', linestyle='--', label='No change')
_plot_metric_histogram(
    ax4, lifter_stats['ImprovementPct'],
    xlabel='Improvement Percentage (%)',
    title='Distribution of Career Improvement',
    median_label='Median: {:.1f}%',
    median_color='g',
)

plt.tight_layout()
plt.savefig('../data/processed/consistency_analysis.png', dpi=150, bbox_inches='tight')
plt.show()


## 3. Identify Highly Consistent vs Variable Lifters


In [None]:
# Categorize lifters by consistency (CV bands) and report per-category statistics.
# Bands: [0, 0.05] Very Consistent, (0.05, 0.10] Consistent,
#        (0.10, 0.20] Variable, (0.20, inf) Highly Variable.
lifter_stats['ConsistencyCategory'] = pd.cut(
    lifter_stats['CV'],
    bins=[0, 0.05, 0.10, 0.20, float('inf')],
    labels=['Very Consistent', 'Consistent', 'Variable', 'Highly Variable'],
    # Intervals are right-closed, so without this a CV of exactly 0 (identical
    # score at every meet) would fall outside the first bin and become NaN.
    include_lowest=True,
)

print("=== Consistency Categories ===")
print(lifter_stats['ConsistencyCategory'].value_counts())
print(f"\nCategory statistics:")

category_metrics = ['MeanScore', 'MeetCount', 'ImprovementPct']
if 'FederationTestingStatus' in lifter_stats.columns:
    # Show consistency by category, split by testing status
    for testing_status in ['Drug Tested', 'Untested']:
        status_lifter_stats = lifter_stats[lifter_stats['FederationTestingStatus'] == testing_status]
        if len(status_lifter_stats) > 0:
            print(f"\n{'='*80}")
            print(f"CONSISTENCY BY CATEGORY - {testing_status.upper()}")
            print(f"{'='*80}")
            print(status_lifter_stats.groupby('ConsistencyCategory', observed=True)[category_metrics].agg(['mean', 'median']))
else:
    # lifter_stats carries no testing status; report all lifters together
    # instead of failing with a KeyError.
    print("FederationTestingStatus not available in lifter_stats - showing all lifters together")
    print(lifter_stats.groupby('ConsistencyCategory', observed=True)[category_metrics].agg(['mean', 'median']))

# Identify most and least consistent lifters
ranking_columns = ['Name', 'MeetCount', 'MeanScore', 'CV', 'PeakToAvgRatio']

print(f"\n=== Most Consistent Lifters (Top 10) ===")
most_consistent = lifter_stats.nsmallest(10, 'CV')[ranking_columns]
print(most_consistent.to_string(index=False))

print(f"\n=== Least Consistent Lifters (Top 10) ===")
least_consistent = lifter_stats.nlargest(10, 'CV')[ranking_columns]
print(least_consistent.to_string(index=False))


## 4. Key Findings Summary


In [None]:
# Print the headline consistency figures and persist the per-lifter metrics.

def _print_consistency_summary(stats):
    """Print mean/median CV, peak-to-average ratio, improvement and the
    CV-vs-MeetCount correlation for the lifters in `stats`."""
    print(f"  2. Average CV: {stats['CV'].mean():.3f}")
    print(f"  3. Median CV: {stats['CV'].median():.3f}")
    print(f"  4. Average peak-to-average ratio: {stats['PeakToAvgRatio'].mean():.3f}")
    print(f"  5. Average improvement: {stats['ImprovementPct'].mean():.2f}%")
    print(f"  6. Correlation: CV vs Meet Count: {stats['CV'].corr(stats['MeetCount']):.3f}")


print("=== Consistency Analysis Summary ===")
print(f"\n1. Lifters analyzed (3+ meets): {len(lifter_stats):,}")

if 'FederationTestingStatus' in lifter_stats.columns:
    # Summary split by testing status
    for testing_status in ['Drug Tested', 'Untested']:
        status_lifter_stats = lifter_stats[lifter_stats['FederationTestingStatus'] == testing_status]
        if len(status_lifter_stats) > 0:
            print(f"\n{testing_status.upper()}:")
            _print_consistency_summary(status_lifter_stats)
else:
    # lifter_stats carries no testing status; summarize all lifters together
    # so the metrics below are still saved instead of the cell raising KeyError.
    print("\nALL LIFTERS (FederationTestingStatus not available):")
    _print_consistency_summary(lifter_stats)

# Save consistency metrics
lifter_stats.to_parquet('../data/processed/consistency_metrics.parquet', index=False)
print("\nConsistency metrics saved to ../data/processed/consistency_metrics.parquet")
