# Age Division Analysis

This notebook analyzes age division (Division column) distributions and division-performance relationships by sex and equipment.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from utils import categorize_ipf_weightclass, map_division_to_age_group, categorize_age_group, create_quality_filter, categorize_lifters, categorize_federation_testing_status

def print_banner(title):
    """Print a section title framed by two 80-character '=' rules."""
    rule = "=" * 80
    print("\n" + rule)
    print(title)
    print(rule)


def ipf_weightclass_for_row(row):
    """Return the IPF weight class for one dataset row (bodyweight + sex)."""
    return categorize_ipf_weightclass(row['BodyweightKg'], row['Sex'])


# Read the processed dataset from disk
df = pd.read_parquet("../data/processed/full_dataset.parquet")
n_rows, n_cols = df.shape
print(f"Dataset loaded: {n_rows:,} rows, {n_cols} columns")

# The quality filter needs the IPF weight class, so derive it first
if {'BodyweightKg', 'Sex'}.issubset(df.columns):
    df['IPF_WeightClass'] = df.apply(ipf_weightclass_for_row, axis=1)

# Build the quality mask (excludes outliers and invalid data) and report its effect
print_banner("APPLYING QUALITY FILTER")
quality_mask = create_quality_filter(df)
total_entries = len(df)
filtered_out = (~quality_mask).sum()
remaining = quality_mask.sum()
print(f"Quality filter will exclude {filtered_out:,} entries ({filtered_out/total_entries*100:.2f}%)")
print(f"  Remaining entries for analysis: {remaining:,} ({remaining/total_entries*100:.2f}%)")

# From here on `df` is the filtered dataset
df = df.loc[quality_mask].copy()
print(f"\n✓ Using filtered dataset with {len(df):,} entries")

# Tag each entry with a lifter category (New/Intermediate/Advanced)
print_banner("CATEGORIZING LIFTERS")
df = categorize_lifters(df)
print("✓ Lifters categorized into New, Intermediate, and Advanced")

# Tag each entry with its federation's testing status (Drug Tested vs Untested)
print_banner("CATEGORIZING FEDERATION TESTING STATUS")
df = categorize_federation_testing_status(df)
testing_status_counts = df['FederationTestingStatus'].value_counts()
print("Federation Testing Status distribution:")
print(testing_status_counts)
print("✓ Federation testing status categorized")

# One independent copy of the data per lifter category
if 'LifterCategory' in df.columns:
    new_lifters_df, intermediate_lifters_df, advanced_lifters_df = (
        df[df['LifterCategory'] == level].copy()
        for level in ('New', 'Intermediate', 'Advanced')
    )

    print("\nCategory breakdown:")
    for level_name, level_df in (
        ('New', new_lifters_df),
        ('Intermediate', intermediate_lifters_df),
        ('Advanced', advanced_lifters_df),
    ):
        print(f"  {level_name} lifters: {len(level_df):,} entries")


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# The dataset is deliberately NOT re-read from disk here. Reloading the parquet
# would replace the quality-filtered `df` from the setup cell and drop the columns
# that cell adds (IPF_WeightClass, LifterCategory, FederationTestingStatus), which
# the analysis cells below depend on.
if 'df' not in globals():
    raise RuntimeError("`df` is not defined. Run the setup cell at the top of the notebook first.")

# Fail fast with a clear message instead of a KeyError deep inside a later cell.
missing_columns = [col for col in ('AgeGroup', 'FederationTestingStatus') if col not in df.columns]
if missing_columns:
    raise RuntimeError(
        f"Required column(s) missing from df: {missing_columns}. "
        "Run the setup cell first; 'AgeGroup' must already exist in the processed dataset "
        "or be derived from 'Division' before continuing."
    )

print(f"Dataset loaded: {df.shape[0]:,} rows")


## 1. Age Division (Division) Distribution Analysis


In [None]:
# Filter to valid age data (including testing status): keep rows that have an
# age group and whose federation testing status is one of the two known values.
age_group_df = df[
    (df['AgeGroup'].notna()) &
    (df['FederationTestingStatus'].isin(['Drug Tested', 'Untested']))
].copy()

print(f"Valid age data: {len(age_group_df):,} rows")
print(f"\nSplit by Federation Testing Status:")
print(age_group_df['FederationTestingStatus'].value_counts())
print(f"\nAge statistics:")
print(age_group_df['AgeGroup'].describe())

# Age distribution by Sex
if 'Sex' in age_group_df.columns:
    print(f"\nAge distribution by Sex:")
    # The column is named 'AgeGroup' (no space), matching the filter above;
    # selecting 'Age Group' raised a KeyError.
    print(age_group_df.groupby('Sex')['AgeGroup'].describe())


In [None]:
# Two side-by-side bar charts: overall age-group counts, and counts split by sex
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
ax1, ax2 = axes

# Display order for the age groups, youngest to oldest
age_group_group_order = ['Youth', 'Teen', 'Sub-Junior', 'Junior', 'Open', 'Masters I', 'Masters II', 'Masters III', 'Masters IV']

# Left panel: overall counts, restricted to the groups that actually occur
observed_age_groups = set(age_group_df['AgeGroup'].unique())
ordered_observed = [ag for ag in age_group_group_order if ag in observed_age_groups]
age_group_group_counts = age_group_df['AgeGroup'].value_counts().reindex(ordered_observed)
age_group_group_counts.plot.bar(ax=ax1, edgecolor='black', alpha=0.7)
ax1.set(
    xlabel='Age Group',
    ylabel='Frequency',
    title='Overall Age Group Distribution (from Division)',
)
ax1.tick_params(axis='x', rotation=45)
ax1.grid(True, alpha=0.3, axis='y')

# Right panel: one cluster of bars per sex, one bar per age group
if 'Sex' in age_group_df.columns:
    crosstab = pd.crosstab(age_group_df['Sex'], age_group_df['AgeGroup'])
    ordered_columns = [ag for ag in age_group_group_order if ag in crosstab.columns]
    crosstab = crosstab.reindex(columns=ordered_columns, fill_value=0)
    crosstab.plot.bar(ax=ax2, alpha=0.7, edgecolor='black')
    ax2.set(
        xlabel='Sex',
        ylabel='Frequency',
        title='Age Group Distribution by Sex',
    )
    ax2.legend(title='Age Group', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax2.tick_params(axis='x', rotation=0)
    ax2.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('../data/processed/age_group_group_distribution_division.png', dpi=150, bbox_inches='tight')
plt.show()


## 2. Age Group (Division) vs Performance Analysis


In [None]:
# Keep only rows usable for performance analysis: a positive total, a known
# age group, and a recognised federation testing status.
has_total = age_group_df['TotalKg'].notna()
positive_total = age_group_df['TotalKg'] > 0
has_age_group = age_group_df['AgeGroup'].notna()
known_testing_status = age_group_df['FederationTestingStatus'].isin(['Drug Tested', 'Untested'])

analysis_df = age_group_df.loc[
    has_total & positive_total & has_age_group & known_testing_status
].copy()

# Performance metric: prefer Dots, then Wilks, otherwise fall back to TotalKg
base_score_col = next(
    (candidate for candidate in ('Dots', 'Wilks') if candidate in analysis_df.columns),
    'TotalKg',
)

print(f"Using {base_score_col} as performance metric")
print(f"Analysis dataset: {len(analysis_df):,} rows")


In [None]:
# Plot: Age Group vs Performance by Sex
#
# 'AgeGroup' is treated here as a division label (one of age_group_group_order),
# not a numeric age -- NOTE(review): confirm against the column's actual dtype.
# Labels are mapped to their ordinal position (0 = youngest group) so they can be
# scattered and fitted; the tick labels show the original names.
age_group_rank = {label: pos for pos, label in enumerate(age_group_group_order)}


def scatter_with_trend(ax, data, max_points, scatter_label=None, trend_fmt=None, trend_label=None, degree=2):
    """Scatter base_score_col against ordinal age-group position and overlay a polynomial trend.

    Parameters:
        ax: matplotlib Axes to draw on.
        data: DataFrame with 'AgeGroup' and base_score_col columns.
        max_points: upper bound on the number of plotted/fitted points.
        scatter_label: legend label for the scatter points (optional).
        trend_fmt: matplotlib format string for the trend line (optional).
        trend_label: legend label for the trend line (optional).
        degree: polynomial degree of the trend line.
    """
    # Build x and y together and drop incomplete pairs jointly so the two arrays
    # passed to polyfit always have the same length and stay row-aligned.
    points = pd.DataFrame({
        'x': data['AgeGroup'].astype(object).map(age_group_rank),
        'y': data[base_score_col],
    }).dropna().astype(float)
    if points.empty:
        return

    # Fixed seed so the figure is reproducible on re-run.
    points = points.sample(min(max_points, len(points)), random_state=42)
    ax.scatter(points['x'], points['y'], alpha=0.1, s=1, label=scatter_label)

    # A degree-d fit needs more than d distinct x positions to be well defined.
    if points['x'].nunique() > degree:
        trend = np.poly1d(np.polyfit(points['x'], points['y'], degree))
        x_trend = np.linspace(points['x'].min(), points['x'].max(), 100)
        fmt_args = [trend_fmt] if trend_fmt else []
        ax.plot(x_trend, trend(x_trend), *fmt_args, linewidth=2, label=trend_label)


fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Overall age group vs performance
ax1 = axes[0]
scatter_with_trend(ax1, analysis_df, 20000, trend_fmt="r--", trend_label='Trend line')
ax1.set_xlabel('Age Group')
ax1.set_ylabel(base_score_col)
ax1.set_title('Age Group vs Performance (Overall)')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Age Group vs Performance by Sex
ax2 = axes[1]
if 'Sex' in analysis_df.columns:
    for sex in ['M', 'F']:
        sex_rows = analysis_df[analysis_df['Sex'] == sex]
        if len(sex_rows) > 0:
            scatter_with_trend(ax2, sex_rows, 5000, scatter_label=sex, trend_label=f'{sex} trend')
    ax2.set_xlabel('Age Group')
    ax2.set_ylabel(base_score_col)
    ax2.set_title('Age Group vs Performance by Sex')
    ax2.legend()
    ax2.grid(True, alpha=0.3)

# Show the age-group names instead of their ordinal positions
for ax in axes:
    ax.set_xticks(range(len(age_group_group_order)))
    ax.set_xticklabels(age_group_group_order, rotation=45, ha='right')

plt.tight_layout()
plt.savefig('../data/processed/age_group_vs_performance.png', dpi=150, bbox_inches='tight')
plt.show()


In [None]:
# Age Group vs Performance by Equipment (one panel per equipment type, top 4 by count)
#
# 'AgeGroup' is treated as a division label from age_group_group_order, not a
# numeric age -- NOTE(review): confirm against the column's actual dtype. Labels
# are mapped to ordinal positions for plotting and fitting.
if 'Equipment' in analysis_df.columns:
    equipment_age_rank = {label: pos for pos, label in enumerate(age_group_group_order)}

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    axes = axes.flatten()

    equipment_types = analysis_df['Equipment'].value_counts().head(4).index.tolist()

    for ax, eq in zip(axes, equipment_types):
        eq_rows = analysis_df[analysis_df['Equipment'] == eq]

        # Drop incomplete (age group, score) pairs jointly so polyfit receives
        # equal-length, row-aligned arrays.
        eq_points = pd.DataFrame({
            'x': eq_rows['AgeGroup'].astype(object).map(equipment_age_rank),
            'y': eq_rows[base_score_col],
        }).dropna().astype(float)

        if not eq_points.empty:
            # Fixed seed so the figure is reproducible on re-run.
            eq_points = eq_points.sample(min(5000, len(eq_points)), random_state=42)
            ax.scatter(eq_points['x'], eq_points['y'], alpha=0.1, s=1)

            # A quadratic needs at least three distinct x positions.
            if eq_points['x'].nunique() > 2:
                trend = np.poly1d(np.polyfit(eq_points['x'], eq_points['y'], 2))
                x_trend = np.linspace(eq_points['x'].min(), eq_points['x'].max(), 100)
                ax.plot(x_trend, trend(x_trend), "r--", linewidth=2)

        ax.set_xticks(range(len(age_group_group_order)))
        ax.set_xticklabels(age_group_group_order, rotation=45, ha='right')
        ax.set_xlabel('Age Group')
        ax.set_ylabel(base_score_col)
        ax.set_title(f'Age Group vs Performance: {eq}')
        ax.grid(True, alpha=0.3)

    # Hide panels that have no equipment type to show
    for unused_ax in axes[len(equipment_types):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.savefig('../data/processed/age_group_vs_performance_by_equipment.png', dpi=150, bbox_inches='tight')
    plt.show()


## 3. Peak Performance Age Group Identification


In [None]:
# Calculate average performance by age group
#
# 'AgeGroup' is treated as a division label from age_group_group_order, not a
# numeric age -- NOTE(review): confirm against the column's actual dtype. The
# "peak" is therefore the age group with the highest average score.
peak_age_rank = {label: pos for pos, label in enumerate(age_group_group_order)}

age_group_performance = (
    analysis_df.groupby('AgeGroup', observed=True)[base_score_col]
    .agg(['mean', 'median', 'count'])
    .reset_index()
)
age_group_performance = age_group_performance[age_group_performance['count'] >= 10]  # Only age groups with at least 10 observations

# Order rows youngest -> oldest; labels missing from age_group_group_order go last.
# The index is reset so that a row's index equals its x position in the plot.
age_group_performance = (
    age_group_performance
    .assign(_order=age_group_performance['AgeGroup'].astype(object).map(peak_age_rank))
    .sort_values('_order', na_position='last')
    .drop(columns='_order')
    .reset_index(drop=True)
)

print(f"=== Peak Performance Age ===")
if age_group_performance.empty:
    # idxmax would raise on an empty frame; report the situation instead.
    peak_age_group_mean = None
    peak_age_group_median = None
    print("No age group has at least 10 observations; peak age group cannot be determined.")
else:
    peak_mean_pos = age_group_performance['mean'].idxmax()
    peak_median_pos = age_group_performance['median'].idxmax()
    peak_age_group_mean = age_group_performance.loc[peak_mean_pos, 'AgeGroup']
    peak_age_group_median = age_group_performance.loc[peak_median_pos, 'AgeGroup']

    print(f"Peak age group (mean): {peak_age_group_mean}")
    print(f"Peak age group (median): {peak_age_group_median}")

    # Plot average performance by age group
    positions = np.arange(len(age_group_performance))
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(positions, age_group_performance['mean'], marker='o', label='Mean', linewidth=2)
    ax.plot(positions, age_group_performance['median'], marker='o', label='Median', linewidth=2)
    ax.axvline(peak_mean_pos, color='r', linestyle='--', label=f'Peak (mean): {peak_age_group_mean}')
    ax.axvline(peak_median_pos, color='g', linestyle='--', label=f'Peak (median): {peak_age_group_median}')
    ax.set_xticks(positions)
    ax.set_xticklabels(age_group_performance['AgeGroup'], rotation=45, ha='right')
    ax.set_xlabel('Age Group')
    ax.set_ylabel(f'Average {base_score_col}')
    ax.set_title('Average Performance by Age Group')
    ax.legend()
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig('../data/processed/peak_performance_age.png', dpi=150, bbox_inches='tight')
    plt.show()


In [None]:
# Peak age group by Sex and Equipment
#
# The peak is the 'AgeGroup' value with the highest mean base_score_col within each
# subset. idxmax on the grouped means returns that group's label directly, so no
# lookup by column name is needed.
if 'Sex' in analysis_df.columns:
    print("\n=== Peak Age by Sex ===")
    for sex in ['M', 'F']:
        sex_data = analysis_df[analysis_df['Sex'] == sex]
        sex_age_group_perf = (
            sex_data.groupby('AgeGroup', observed=True)[base_score_col].mean().dropna()
        )
        # Skip sexes with no rows / no scores; idxmax raises on an empty Series.
        if len(sex_age_group_perf) > 0:
            peak_age_group_sex = sex_age_group_perf.idxmax()
            print(f"{sex}: Peak age group = {peak_age_group_sex}")

if 'Equipment' in analysis_df.columns:
    print("\n=== Peak Age by Equipment ===")
    for eq in analysis_df['Equipment'].value_counts().head(3).index:
        eq_data = analysis_df[analysis_df['Equipment'] == eq]
        eq_age_group_perf = (
            eq_data.groupby('AgeGroup', observed=True)[base_score_col].mean().dropna()
        )
        if len(eq_age_group_perf) > 0:
            peak_age_group_eq = eq_age_group_perf.idxmax()
            print(f"{eq}: Peak age group = {peak_age_group_eq}")


## 4. Extreme Age Group Analysis


In [None]:
# Flag extremes
#
# 'AgeGroup' is treated as a division label, not a numeric age, so the former
# "< 18" / "> 60" comparisons cannot be evaluated on it. The youngest and oldest
# groups are selected by label instead.
# NOTE(review): these label sets only approximate "under 18" and "over 60";
# confirm the exact age boundaries against the division -> age-group mapping.
YOUNGEST_AGE_GROUPS = ['Youth', 'Teen', 'Sub-Junior']
OLDEST_AGE_GROUPS = ['Masters III', 'Masters IV']

extreme_young = analysis_df[analysis_df['AgeGroup'].isin(YOUNGEST_AGE_GROUPS)]
extreme_old = analysis_df[analysis_df['AgeGroup'].isin(OLDEST_AGE_GROUPS)]

# Guard the percentages against an empty analysis dataset (division by zero).
total_analysed = len(analysis_df)
young_share = len(extreme_young) / total_analysed * 100 if total_analysed else 0.0
old_share = len(extreme_old) / total_analysed * 100 if total_analysed else 0.0

print("=== Extreme Age Analysis ===")
print(f"Youngest groups ({', '.join(YOUNGEST_AGE_GROUPS)}): {len(extreme_young):,} ({young_share:.1f}%)")
print(f"Oldest groups ({', '.join(OLDEST_AGE_GROUPS)}): {len(extreme_old):,} ({old_share:.1f}%)")

if len(extreme_young) > 0:
    print(f"\nYoung lifters performance stats:")
    print(extreme_young[base_score_col].describe())

if len(extreme_old) > 0:
    print(f"\nOlder lifters performance stats:")
    print(extreme_old[base_score_col].describe())


## 4.5. Age Analysis by Lifter Category (New/Intermediate/Advanced)


In [None]:
# Analyze age-group patterns by lifter category, split by federation testing status
#
# 'AgeGroup' is treated as a division label from age_group_group_order, not a
# numeric age -- NOTE(review): confirm against the column's actual dtype. Summaries
# are therefore categorical (most common group, youngest-oldest group present) and
# the scatter/trend uses each label's ordinal position.
if 'LifterCategory' in df.columns and 'AgeGroup' in df.columns:
    print("=== Age Analysis by Lifter Category ===")

    category_age_rank = {label: pos for pos, label in enumerate(age_group_group_order)}

    # Filter to valid age data
    valid_age_group_df = df[df['AgeGroup'].notna()].copy()

    categories = ['New', 'Intermediate', 'Advanced']
    category_dfs = {}

    for cat in categories:
        cat_df = valid_age_group_df[valid_age_group_df['LifterCategory'] == cat].copy()
        category_dfs[cat] = cat_df
        print(f"\n{cat} Lifters: {len(cat_df):,} entries")
        if len(cat_df) > 0:
            present_labels = set(cat_df['AgeGroup'].astype(object))
            ordered_present = [ag for ag in age_group_group_order if ag in present_labels]
            print(f"  Most common age group: {cat_df['AgeGroup'].mode().iat[0]}")
            if ordered_present:
                print(f"  Age group range: {ordered_present[0]} - {ordered_present[-1]}")
            print(f"  Average TotalKg: {cat_df['TotalKg'].mean():.1f} kg")

    # Compare age-performance relationship by category, gender × weight class × testing status
    print("\n" + "="*80)
    print("AGE-PERFORMANCE BY CATEGORY (Gender × Weight Class × Testing Status)")
    print("="*80)

    for testing_status in ['Drug Tested', 'Untested']:
        print(f"\n{'='*80}")
        print(f"{testing_status.upper()} FEDERATIONS")
        print(f"{'='*80}")

        for cat in categories:
            cat_df = category_dfs[cat]
            cat_df = cat_df[cat_df['FederationTestingStatus'] == testing_status]

            if len(cat_df) > 0 and 'IPF_WeightClass' in cat_df.columns and 'Sex' in cat_df.columns:
                print(f"\n{cat} Lifters - TotalKg by (Gender × Weight Class × Age Group):")
                age_group_perf_by_group = cat_df.groupby(['Sex', 'IPF_WeightClass', 'AgeGroup'], observed=True).agg({
                    'TotalKg': ['count', 'mean']
                }).round(1)
                # Keep only cells with enough observations to be meaningful
                age_group_perf_by_group = age_group_perf_by_group[age_group_perf_by_group[('TotalKg', 'count')] >= 10]
                print(age_group_perf_by_group.head(20))  # Show first 20 rows

    # Visualization: Age Group vs Performance by Category
    fig, axes = plt.subplots(1, 3, figsize=(20, 6))

    for idx, cat in enumerate(categories):
        ax = axes[idx]
        cat_df = category_dfs[cat]

        if len(cat_df) > 0:
            # Drop incomplete (age group, total) pairs jointly so polyfit receives
            # equal-length, row-aligned arrays.
            cat_points = pd.DataFrame({
                'x': cat_df['AgeGroup'].astype(object).map(category_age_rank),
                'y': cat_df['TotalKg'],
            }).dropna().astype(float)

            # Sample for visualization if too large
            if len(cat_points) > 10000:
                cat_points = cat_points.sample(n=10000, random_state=42)

            ax.scatter(cat_points['x'], cat_points['y'], alpha=0.3, s=10)
            ax.set_title(f'{cat} Lifters\n(n={len(cat_df):,})', fontsize=12, fontweight='bold')
            ax.set_xticks(range(len(age_group_group_order)))
            ax.set_xticklabels(age_group_group_order, rotation=45, ha='right')
            ax.set_xlabel('Age Group', fontsize=10)
            ax.set_ylabel('TotalKg', fontsize=10)
            ax.grid(True, alpha=0.3)

            # Add a linear trend line; it needs at least two distinct x positions
            if len(cat_points) > 10 and cat_points['x'].nunique() > 1:
                trend = np.poly1d(np.polyfit(cat_points['x'], cat_points['y'], 1))
                x_trend = np.linspace(cat_points['x'].min(), cat_points['x'].max(), 100)
                ax.plot(x_trend, trend(x_trend), "r--", alpha=0.8, linewidth=2, label='Trend')
                ax.legend()

    plt.tight_layout()
    plt.savefig('../data/processed/age_group_by_category.png', dpi=150, bbox_inches='tight')
    plt.show()
else:
    print("⚠ LifterCategory or AgeGroup column not found. Run categorization cell first.")


In [None]:
# Analyze age-group patterns by lifter category (all federations combined)
#
# 'AgeGroup' is treated as a division label from age_group_group_order, not a
# numeric age -- NOTE(review): confirm against the column's actual dtype. Summaries
# are therefore categorical (most common group, youngest-oldest group present) and
# the scatter/trend uses each label's ordinal position.
if 'LifterCategory' in df.columns and 'AgeGroup' in df.columns:
    print("=== Age Analysis by Lifter Category ===")

    overall_age_rank = {label: pos for pos, label in enumerate(age_group_group_order)}

    # Filter to valid age data
    valid_age_group_df = df[df['AgeGroup'].notna()].copy()

    categories = ['New', 'Intermediate', 'Advanced']
    category_dfs = {}

    for cat in categories:
        cat_df = valid_age_group_df[valid_age_group_df['LifterCategory'] == cat].copy()
        category_dfs[cat] = cat_df
        print(f"\n{cat} Lifters: {len(cat_df):,} entries")
        if len(cat_df) > 0:
            present_labels = set(cat_df['AgeGroup'].astype(object))
            ordered_present = [ag for ag in age_group_group_order if ag in present_labels]
            print(f"  Most common age group: {cat_df['AgeGroup'].mode().iat[0]}")
            if ordered_present:
                print(f"  Age group range: {ordered_present[0]} - {ordered_present[-1]}")
            print(f"  Average TotalKg: {cat_df['TotalKg'].mean():.1f} kg")

    # Compare age-performance relationship by category and gender × weight class
    print("\n" + "="*80)
    print("AGE-PERFORMANCE BY CATEGORY (Gender × Weight Class)")
    print("="*80)

    for cat in categories:
        cat_df = category_dfs[cat]
        if len(cat_df) > 0 and 'IPF_WeightClass' in cat_df.columns and 'Sex' in cat_df.columns:
            print(f"\n{cat} Lifters - Most Common Age Group and TotalKg by (Gender × Weight Class):")
            # A mean of age-group labels is undefined, so report the most common
            # label per cell instead. observed=True skips empty category
            # combinations, for which the mode would not exist.
            age_group_perf_by_group = (
                cat_df.assign(AgeGroup=cat_df['AgeGroup'].astype(object))
                .groupby(['Sex', 'IPF_WeightClass'], observed=True)
                .agg(
                    count=('AgeGroup', 'count'),
                    most_common_age_group=('AgeGroup', lambda labels: labels.mode().iat[0]),
                    mean_total_kg=('TotalKg', 'mean'),
                )
                .round(1)
            )
            # Keep only cells with enough observations to be meaningful
            age_group_perf_by_group = age_group_perf_by_group[age_group_perf_by_group['count'] >= 10]
            print(age_group_perf_by_group)

    # Visualization: Age Group vs Performance by Category
    fig, axes = plt.subplots(1, 3, figsize=(20, 6))

    for idx, cat in enumerate(categories):
        ax = axes[idx]
        cat_df = category_dfs[cat]

        if len(cat_df) > 0:
            # Drop incomplete (age group, total) pairs jointly so polyfit receives
            # equal-length, row-aligned arrays.
            cat_points = pd.DataFrame({
                'x': cat_df['AgeGroup'].astype(object).map(overall_age_rank),
                'y': cat_df['TotalKg'],
            }).dropna().astype(float)

            # Sample for visualization if too large
            if len(cat_points) > 10000:
                cat_points = cat_points.sample(n=10000, random_state=42)

            ax.scatter(cat_points['x'], cat_points['y'], alpha=0.3, s=10)
            ax.set_title(f'{cat} Lifters\n(n={len(cat_df):,})', fontsize=12, fontweight='bold')
            ax.set_xticks(range(len(age_group_group_order)))
            ax.set_xticklabels(age_group_group_order, rotation=45, ha='right')
            ax.set_xlabel('Age Group', fontsize=10)
            ax.set_ylabel('TotalKg', fontsize=10)
            ax.grid(True, alpha=0.3)

            # Add a linear trend line; it needs at least two distinct x positions
            if len(cat_points) > 10 and cat_points['x'].nunique() > 1:
                trend = np.poly1d(np.polyfit(cat_points['x'], cat_points['y'], 1))
                x_trend = np.linspace(cat_points['x'].min(), cat_points['x'].max(), 100)
                ax.plot(x_trend, trend(x_trend), "r--", alpha=0.8, linewidth=2, label='Trend')
                ax.legend()

    plt.tight_layout()
    plt.savefig('../data/processed/age_group_by_category.png', dpi=150, bbox_inches='tight')
    plt.show()
else:
    print("⚠ LifterCategory or AgeGroup column not found. Run categorization cell first.")


## 5. Key Findings Summary


In [None]:
# Key findings summary and export.
#
# 'AgeGroup' is treated as a division label from age_group_group_order, not a
# numeric age -- NOTE(review): confirm against the column's actual dtype. The
# summary is therefore categorical, and the age/performance association is a
# Spearman rank correlation on each label's ordinal position.
print("=== Age Analysis Summary ===")

summary_age_rank = {label: pos for pos, label in enumerate(age_group_group_order)}

# Most common group and the youngest/oldest groups present
summary_mode = analysis_df['AgeGroup'].mode()
summary_most_common = summary_mode.iat[0] if len(summary_mode) > 0 else 'n/a'
summary_present = set(analysis_df['AgeGroup'].dropna().astype(object))
summary_ordered = [ag for ag in age_group_group_order if ag in summary_present]
summary_range = f"{summary_ordered[0]} - {summary_ordered[-1]}" if summary_ordered else 'n/a'

# Spearman correlation = Pearson correlation of the ranks of jointly complete pairs
summary_pairs = pd.DataFrame({
    'age_rank': analysis_df['AgeGroup'].astype(object).map(summary_age_rank),
    'score': analysis_df[base_score_col],
}).dropna().astype(float)
summary_rank_corr = summary_pairs['age_rank'].rank().corr(summary_pairs['score'].rank())

# Peak groups are recomputed here (same >= 10 observations rule as the peak
# analysis) so this cell does not depend on variables from earlier cells.
summary_group_stats = (
    analysis_df.groupby('AgeGroup', observed=True)[base_score_col]
    .agg(['mean', 'median', 'count'])
)
summary_group_stats = summary_group_stats[summary_group_stats['count'] >= 10]
summary_peak_mean = summary_group_stats['mean'].idxmax() if len(summary_group_stats) > 0 else 'n/a'
summary_peak_median = summary_group_stats['median'].idxmax() if len(summary_group_stats) > 0 else 'n/a'

print(f"\n1. Most common age group: {summary_most_common}")
print(f"2. Age group range: {summary_range}")
print(f"3. Performance rank correlation with age group (Spearman): {summary_rank_corr:.3f}")
print(f"4. Peak performance age group (mean): {summary_peak_mean}")
print(f"5. Peak performance age group (median): {summary_peak_median}")

# Save age metrics (the column is 'AgeGroup'; 'Age Group' raised a KeyError)
age_group_metrics = analysis_df[['Name', 'MeetDate', 'AgeGroup', base_score_col]].copy()
age_group_metrics.to_parquet('../data/processed/age_group_metrics.parquet', index=False)
print("\nAge metrics saved to ../data/processed/age_group_metrics.parquet")
