# Meet Quality Analysis

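This notebook computes per-meet competitiveness metrics (competitor count and average competitor strength), identifies elite meets (the top 10% of meets by average score), and ranks federations by the strength of their meets.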


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from utils import categorize_ipf_weightclass, map_division_to_age_group, categorize_age_group, create_quality_filter, categorize_lifters, categorize_federation_testing_status

def _print_banner(title):
    """Print a blank line, then `title` framed above and below by an 80-character '=' rule."""
    print("\n" + "="*80)
    print(title)
    print("="*80)


# Read the processed dataset and report its size.
df = pd.read_parquet("../data/processed/full_dataset.parquet")
print(f"Dataset loaded: {df.shape[0]:,} rows, {df.shape[1]} columns")

# Derive the IPF weight class row by row from bodyweight and sex.
# This happens before the quality filter because the filter needs the column.
if {'BodyweightKg', 'Sex'}.issubset(df.columns):
    df['IPF_WeightClass'] = df.apply(
        lambda entry: categorize_ipf_weightclass(entry['BodyweightKg'], entry['Sex']),
        axis=1,
    )

# Build the quality mask (True = keep), report how much it removes, then keep
# only the rows that pass.
_print_banner("APPLYING QUALITY FILTER")
quality_mask = create_quality_filter(df)
filtered_out = (~quality_mask).sum()
print(f"Quality filter will exclude {filtered_out:,} entries ({filtered_out/len(df)*100:.2f}%)")
print(f"  Remaining entries for analysis: {quality_mask.sum():,} ({quality_mask.sum()/len(df)*100:.2f}%)")
df = df.loc[quality_mask].copy()

# Age group comes from the Division column; without that column the AgeGroup
# column is still created, but left empty.
if 'Division' not in df.columns:
    print("Warning: Cannot add Age Group - missing Division column")
    df['AgeGroup'] = None
else:
    _print_banner("ADDING AGE GROUP CATEGORIZATION")
    df['AgeGroup'] = df['Division'].apply(map_division_to_age_group)
    print("Age Group categorization complete (using Division - 100% coverage)")
    print(f"Age Group distribution:\n{df['AgeGroup'].value_counts()}")
print(f"\n✓ Using filtered dataset with {len(df):,} entries")

# Experience level of each lifter (New / Intermediate / Advanced).
_print_banner("CATEGORIZING LIFTERS")
df = categorize_lifters(df)
print("✓ Lifters categorized into New, Intermediate, and Advanced")

# Testing status of each federation (Drug Tested vs Untested).
_print_banner("CATEGORIZING FEDERATION TESTING STATUS")
df = categorize_federation_testing_status(df)
testing_status_counts = df['FederationTestingStatus'].value_counts()
print("Federation Testing Status distribution:")
print(testing_status_counts)
print(f"✓ Federation testing status categorized")

# One independent frame per lifter category, when the category column exists.
if 'LifterCategory' in df.columns:
    _category = df['LifterCategory']
    new_lifters_df, intermediate_lifters_df, advanced_lifters_df = (
        df[_category == label].copy()
        for label in ('New', 'Intermediate', 'Advanced')
    )

    print(f"\nCategory breakdown:")
    print(f"  New lifters: {len(new_lifters_df):,} entries")
    print(f"  Intermediate lifters: {len(intermediate_lifters_df):,} entries")
    print(f"  Advanced lifters: {len(advanced_lifters_df):,} entries")


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Reuse the filtered, categorized `df` prepared by the setup cell above.
# Re-reading the parquet file here would replace `df` with the raw data, silently
# discarding the quality filter and the derived columns (notably
# FederationTestingStatus) that the per-testing-status sections below depend on.
if 'FederationTestingStatus' not in df.columns:
    raise RuntimeError(
        "`df` has no FederationTestingStatus column; "
        "run the setup cell at the top of the notebook before this one."
    )
print(f"Using preprocessed dataset: {len(df):,} rows")


## 1. Calculate Meet-Level Metrics


In [None]:
# Pick the performance metric: Dots when present, otherwise Wilks, otherwise
# fall back to the raw TotalKg column.
base_score_col = next(
    (candidate for candidate in ('Dots', 'Wilks') if candidate in df.columns),
    'TotalKg',
)

# Keep only rows whose score is present and strictly positive.
_has_score = df[base_score_col].notna()
_is_positive = df[base_score_col] > 0
analysis_df = df.loc[_has_score & _is_positive].copy()

print(f"Using {base_score_col} as performance metric")
print(f"Analysis dataset: {len(analysis_df):,} rows")


In [None]:
# Calculate per-meet statistics.
#
# Produces `meet_stats`, one row per MeetPath, with:
#   CompetitorCount / AvgScore / MedianScore / StdScore / MinScore / MaxScore
#       - count and distribution of the base score over the meet's entries
#   UniqueLifters           - number of distinct lifter names at the meet
#   Federation / MeetDate   - first value seen for the meet
#   FederationTestingStatus - first value seen for the meet; the visualisation,
#                             federation and summary cells below filter on it
#   Top10PctAvg             - mean score of the top 10% of entries (at least one)
if 'MeetPath' in analysis_df.columns:
    # The later cells split meets by testing status. If the incoming frame does
    # not carry that column yet, derive it here so meet_stats can include it.
    if 'FederationTestingStatus' not in analysis_df.columns:
        analysis_df = categorize_federation_testing_status(analysis_df)

    # Named aggregation yields flat, explicitly named columns, so no positional
    # renaming of a MultiIndex is needed afterwards.
    meet_stats = (
        analysis_df.groupby('MeetPath')
        .agg(
            CompetitorCount=(base_score_col, 'count'),
            AvgScore=(base_score_col, 'mean'),
            MedianScore=(base_score_col, 'median'),
            StdScore=(base_score_col, 'std'),
            MinScore=(base_score_col, 'min'),
            MaxScore=(base_score_col, 'max'),
            UniqueLifters=('Name', 'nunique'),  # Number of unique competitors
            Federation=('Federation', 'first'),
            MeetDate=('MeetDate', 'first'),
            # NOTE(review): assumes every entry of a meet shares one testing status - confirm.
            FederationTestingStatus=('FederationTestingStatus', 'first'),
        )
        .reset_index()
    )

    def _top_fraction_mean(scores, fraction=0.1):
        """Return the mean of the highest `fraction` of `scores`, using at least one value."""
        n_top = max(1, int(len(scores) * fraction))
        return scores.nlargest(n_top).mean()

    # Top competitor strength (top 10% average), computed in a single grouped pass
    # instead of re-filtering the whole analysis frame once per meet.
    top_10_pct_scores = analysis_df.groupby('MeetPath')[base_score_col].apply(_top_fraction_mean)
    meet_stats['Top10PctAvg'] = meet_stats['MeetPath'].map(top_10_pct_scores)

    print(f"Calculated statistics for {len(meet_stats):,} meets")
    print(f"\nMeet statistics summary:")
    print(meet_stats[['CompetitorCount', 'AvgScore', 'MedianScore', 'Top10PctAvg']].describe())
else:
    print("MeetPath column not found")


## 2. Identify Elite Meets


In [None]:
# A meet is "elite" when its average competitor strength is at or above the
# 90th percentile of all meets.
elite_threshold = meet_stats['AvgScore'].quantile(0.90)
_is_elite = meet_stats['AvgScore'] >= elite_threshold
elite_meets = meet_stats.loc[_is_elite].copy()

_summary_cols = ['CompetitorCount', 'AvgScore', 'MedianScore', 'Top10PctAvg']
print(f"=== Elite Meets Analysis ===")
print(f"Elite threshold (90th percentile): {elite_threshold:.2f}")
print(f"Number of elite meets: {len(elite_meets):,} ({len(elite_meets)/len(meet_stats)*100:.1f}%)")
print(f"\nElite meets statistics:")
print(elite_meets[_summary_cols].describe())

# The 20 strongest elite meets by average score, shown without the index.
_listing_cols = ['MeetPath', 'Federation', 'MeetDate',
                 'CompetitorCount', 'AvgScore', 'Top10PctAvg']
print(f"\n=== Top 20 Elite Meets ===")
top_elite = elite_meets.nlargest(20, 'AvgScore')[_listing_cols]
print(top_elite.to_string(index=False))


In [None]:
# Visualize meet quality distribution (split by testing status).
# The 4x2 grid gives each testing status two consecutive rows (four panels):
#   [score histogram, size vs strength]
#   [top-10% vs average, elite meets per year]
fig, axes = plt.subplots(4, 2, figsize=(16, 20))

for row_idx, testing_status in enumerate(['Drug Tested', 'Untested']):
    top_row = row_idx * 2

    # Work on an independent copy: a Year column is added below, and writing to a
    # filtered slice of meet_stats triggers SettingWithCopyWarning.
    status_meet_stats = meet_stats[meet_stats['FederationTestingStatus'] == testing_status].copy()

    if len(status_meet_stats) == 0:
        # Nothing to plot for this status: hide its panels instead of leaving
        # four empty frames in the saved figure.
        for unused_ax in axes[top_row:top_row + 2].ravel():
            unused_ax.set_visible(False)
        continue

    # Add Year before the elite subset is taken so the subset inherits it and
    # never has to be written to.
    has_meet_date = 'MeetDate' in status_meet_stats.columns
    if has_meet_date:
        status_meet_stats['Year'] = pd.to_datetime(status_meet_stats['MeetDate']).dt.year

    elite_threshold_status = status_meet_stats['AvgScore'].quantile(0.90)
    elite_meets_status = status_meet_stats[status_meet_stats['AvgScore'] >= elite_threshold_status]

    # Distribution of average scores
    ax1 = axes[top_row, 0]
    status_meet_stats['AvgScore'].hist(bins=50, edgecolor='black', alpha=0.7, ax=ax1)
    ax1.axvline(elite_threshold_status, color='r', linestyle='--', label=f'Elite threshold: {elite_threshold_status:.1f}')
    ax1.set_xlabel(f'Average {base_score_col}')
    ax1.set_ylabel('Number of Meets')
    ax1.set_title(f'Distribution of Average Competitor Strength - {testing_status}')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Competitor count vs average score.
    # At most 5000 meets are drawn; the fixed seed keeps the figure reproducible.
    ax2 = axes[top_row, 1]
    sample_meets = status_meet_stats.sample(min(5000, len(status_meet_stats)), random_state=42)
    ax2.scatter(sample_meets['CompetitorCount'], sample_meets['AvgScore'], alpha=0.3, s=10)
    ax2.set_xlabel('Number of Competitors')
    ax2.set_ylabel(f'Average {base_score_col}')
    ax2.set_title(f'Meet Size vs Average Strength - {testing_status}')
    ax2.grid(True, alpha=0.3)

    # Top 10% average vs overall average, with a y=x reference line
    ax3 = axes[top_row + 1, 0]
    avg_range = [status_meet_stats['AvgScore'].min(), status_meet_stats['AvgScore'].max()]
    ax3.scatter(status_meet_stats['AvgScore'], status_meet_stats['Top10PctAvg'], alpha=0.3, s=10)
    ax3.plot(avg_range, avg_range, 'r--', label='y=x')
    ax3.set_xlabel(f'Average {base_score_col}')
    ax3.set_ylabel(f'Top 10% Average {base_score_col}')
    ax3.set_title(f'Top Competitors vs Overall Average - {testing_status}')
    ax3.legend()
    ax3.grid(True, alpha=0.3)

    # Elite meets over time
    ax4 = axes[top_row + 1, 1]
    if has_meet_date:
        yearly_elite = elite_meets_status.groupby('Year').size()
        yearly_total = status_meet_stats.groupby('Year').size()
        ax4.plot(yearly_elite.index, yearly_elite.values, label='Elite meets', linewidth=2)
        ax4.plot(yearly_total.index, yearly_total.values, label='Total meets', linewidth=2, alpha=0.5)
        ax4.set_xlabel('Year')
        ax4.set_ylabel('Number of Meets')
        ax4.set_title(f'Elite Meets Over Time - {testing_status}')
        ax4.legend()
        ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../data/processed/meet_quality_analysis.png', dpi=150, bbox_inches='tight')
plt.show()


## 3. Federation Prestige Analysis


In [None]:
# Federation-level statistics, reported separately for each testing status.
if 'Federation' in meet_stats.columns:
    for testing_status in ['Drug Tested', 'Untested']:
        status_meet_stats = meet_stats.loc[meet_stats['FederationTestingStatus'] == testing_status]
        if len(status_meet_stats) == 0:
            continue

        # Aggregate the per-meet metrics up to one row per federation.
        fed_stats = (
            status_meet_stats.groupby('Federation')
            .agg(
                MeanAvgScore=('AvgScore', 'mean'),
                MedianAvgScore=('AvgScore', 'median'),
                MeetCount=('AvgScore', 'count'),
                AvgCompetitorCount=('CompetitorCount', 'mean'),
                AvgTop10Pct=('Top10PctAvg', 'mean'),
            )
            .reset_index()
        )

        # Only federations with at least 10 meets are ranked, strongest first.
        fed_stats = (
            fed_stats
            .loc[lambda stats: stats['MeetCount'] >= 10]
            .sort_values('MeanAvgScore', ascending=False)
        )

        print(f"\n{'='*80}")
        print(f"FEDERATION PRESTIGE RANKINGS (TOP 20) - {testing_status.upper()}")
        print(f"{'='*80}")
        print(fed_stats.head(20).to_string(index=False))

        # Horizontal bar chart of the top 20, with the strongest federation on top.
        top_feds = fed_stats.head(20)
        _bar_positions = range(len(top_feds))
        plt.figure(figsize=(12, 8))
        plt.barh(_bar_positions, top_feds['MeanAvgScore'])
        plt.yticks(_bar_positions, top_feds['Federation'])
        plt.xlabel(f'Average {base_score_col}')
        plt.title(f'Top 20 Federations by Average Competitor Strength - {testing_status}')
        plt.gca().invert_yaxis()
        plt.tight_layout()
        plt.savefig(f'../data/processed/federation_prestige_{testing_status.replace(" ", "_").lower()}.png', dpi=150, bbox_inches='tight')
        plt.show()


## 4. Key Findings Summary


In [None]:
print("=== Meet Quality Analysis Summary ===")
print(f"\n1. Total meets analyzed: {len(meet_stats):,}")

# Headline numbers for each testing status; a status with no meets is skipped.
for testing_status in ['Drug Tested', 'Untested']:
    status_meet_stats = meet_stats.loc[meet_stats['FederationTestingStatus'] == testing_status]
    if len(status_meet_stats) == 0:
        continue

    # Elite = at or above the 90th percentile of average score within this status.
    elite_threshold_status = status_meet_stats['AvgScore'].quantile(0.90)
    elite_meets_status = status_meet_stats.loc[status_meet_stats['AvgScore'] >= elite_threshold_status]

    print(
        f"\n{testing_status.upper()}:",
        f"  2. Average competitors per meet: {status_meet_stats['CompetitorCount'].mean():.1f}",
        f"  3. Average competitor strength: {status_meet_stats['AvgScore'].mean():.2f}",
        f"  4. Elite meets threshold: {elite_threshold_status:.2f}",
        f"  5. Number of elite meets: {len(elite_meets_status):,}",
        sep="\n",
    )

# Persist the per-meet metrics table (without its index) for later use.
meet_stats.to_parquet('../data/processed/meet_quality_metrics.parquet', index=False)
print("\nMeet quality metrics saved to ../data/processed/meet_quality_metrics.parquet")
