# Data Loading and Aggregation

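This notebook loads OpenPowerlifting meet and entry data for 2015-2025, merges each entry with its meet record, cleans the result, and saves it to a parquet file for faster loading later.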


In [1]:
# Dependencies for the whole notebook, kept together in the first cell.
# NOTE(review): numpy (np) and sys are not referenced in any cell visible here - confirm before removing.
import pandas as pd
import numpy as np
from pathlib import Path
import sys

# Import utils directly (since we're in the eda directory)
# These helpers are called below to load, merge, and clean the data; their exact contracts live in utils.
from utils import load_all_meets, merge_entries_meets, clean_data

# Progress marker so the reader can see that execution has started
print("Starting data loading...")

Starting data loading...


In [2]:
# Load entries and meets from last 10 years (2015-2025)
# Filtering happens during loading to avoid loading all 3.5M entries into memory
# Use max_federations parameter to limit for testing (e.g., max_federations=50)
# Set to None to load all federations (may take 10-20 minutes and use significant memory)

print("Loading data from 2015-2025 (this may take several minutes)...")
print("Tip: If this crashes, restart and set max_federations=50 in the function call below")

# Pre-bind both names so later cells can always reference them, even after a failed load
entries_df = None
meets_df = None

try:
    # Only the load itself is guarded: a failure while *reporting* on the loaded
    # frames must not fall into the handlers below and throw the loaded data away.
    entries_df, meets_df = load_all_meets(
        base_path="../opl-data/meet-data",
        max_federations=None,  # Set to 50 for testing, None for all federations
        chunk_size=100,  # Process in chunks to save memory
        date_range=('2015-01-01', '2025-12-31')  # Only load meets from 2015-2025
    )
except MemoryError:
    print("❌ Out of memory! Try loading fewer federations:")
    print("   Change max_federations=None to max_federations=50")
    entries_df = pd.DataFrame()
    meets_df = pd.DataFrame()
except Exception as e:
    # Best-effort: keep the notebook alive with empty frames, but surface the
    # exception type and full traceback so the failure can actually be diagnosed
    # (same reporting style as the parquet save cell).
    import traceback
    print(f"❌ Error loading data: {e}")
    print(f"   Error type: {type(e).__name__}")
    print(f"   Details: {traceback.format_exc()}")
    print("   Dataframes initialized as empty - check error above")
    entries_df = pd.DataFrame()
    meets_df = pd.DataFrame()
else:
    # Load succeeded: report size and memory, normalising None/empty results to
    # an empty DataFrame so downstream cells always receive a DataFrame.
    if entries_df is not None and len(entries_df) > 0:
        print(f"\n✓ Successfully loaded {len(entries_df):,} entries")
        print(f"  Memory: {entries_df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
    else:
        print("⚠ Warning: entries_df is empty or None")
        entries_df = pd.DataFrame()  # Ensure it's defined

    if meets_df is not None and len(meets_df) > 0:
        print(f"✓ Successfully loaded {len(meets_df):,} meets")
        print(f"  Memory: {meets_df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
    else:
        print("⚠ Warning: meets_df is empty or None")
        meets_df = pd.DataFrame()  # Ensure it's defined

Loading data from 2015-2025 (this may take several minutes)...
Tip: If this crashes, restart and set max_federations=50 in the function call below
Date filter enabled: 2015-01-01 to 2025-12-31
Found 426 federation directories

PASS 1: Loading meets and filtering by date range...


Loading meets (pass 1): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 426/426 [01:18<00:00,  5.41it/s]



✓ Pass 1 complete:
  Total meets found: 53,985
  Meets in date range (2015-01-01 to 2025-12-31): 38,880
  Meets filtered out: 15,105

PASS 2: Loading entries for filtered meets...


Loading entries (pass 2): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 426/426 [03:06<00:00,  2.28it/s]



Combining 72 entry chunks...
✓ Loaded 2,545,338 entries from 2,545,338 total rows
  (Only entries for meets in date range 2015-01-01 to 2025-12-31)

Combining 38880 meet records...
✓ Loaded 38,880 meets
  (Only meets in date range 2015-01-01 to 2025-12-31)

✓ Successfully loaded 2,545,338 entries
  Memory: 3358.1 MB
✓ Successfully loaded 38,880 meets
  Memory: 17.3 MB


In [3]:
# Combine the entry table with the meet table using the utils helper, then report the result's shape
merged_df = merge_entries_meets(entries_df, meets_df)
merged_shape = merged_df.shape
print(f"\nMerged dataset shape: {merged_shape}")


✓ Merged dataset: 2,545,338 rows

Merged dataset shape: (2545338, 71)


In [4]:
# No filtering happens in this cell: the 2015-2025 date window is applied
# while loading (see Cell 2), so all that is left to do is report the shape.
print("\n✓ Data already filtered to 2015-2025 during loading")
current_shape = merged_df.shape
print(f"Merged dataset shape: {current_shape}")



✓ Data already filtered to 2015-2025 during loading
Merged dataset shape: (2545338, 71)


In [5]:
# Run the utils cleaning helper on the merged frame, then report its shape and column names
df = clean_data(merged_df)
cleaned_columns = df.columns.tolist()
print(f"Cleaned dataset shape: {df.shape}")
print(f"\nColumns: {cleaned_columns}")


Cleaned dataset shape: (2545338, 73)

Columns: ['Name', 'WeightClassKg', 'Sex', 'Age', 'Division', 'Event', 'BodyweightKg', 'Equipment', 'Squat1Kg', 'Squat2Kg', 'Squat3Kg', 'Best3SquatKg', 'Squat4Kg', 'Bench1Kg', 'Bench2Kg', 'Bench3Kg', 'Best3BenchKg', 'Bench4Kg', 'Deadlift1Kg', 'Deadlift2Kg', 'Deadlift3Kg', 'Best3DeadliftKg', 'Deadlift4Kg', 'TotalKg', 'Place', 'Federation', 'MeetPath', 'Team', 'Tested', 'Country', 'BirthDate', 'State', 'EntryDate', 'BodyweightLbs', 'WeightClassLbs', 'Deadlift1Lbs', 'Deadlift2Lbs', 'Deadlift3Lbs', 'Deadlift4Lbs', 'Best3DeadliftLbs', 'TotalLbs', 'BirthYear', 'Squat1Lbs', 'Squat2Lbs', 'Squat3Lbs', 'Best3SquatLbs', 'Bench1Lbs', 'Bench2Lbs', 'Bench3Lbs', 'Best3BenchLbs', 'Bench4Lbs', 'Squat4Lbs', 'CyrillicName', 'AgeRange', 'ChineseName', 'College/University', 'GreekName', 'SquatEquipment', 'DeadliftEquipment', 'BenchEquipment', 'JapaneseName', 'KoreanName', 'School', 'Federation_meet', 'Date', 'MeetCountry', 'MeetState', 'MeetTown', 'MeetName', 'RuleSet',

In [6]:
# Save to parquet for faster loading later
if df is not None and len(df) > 0:
    try:
        output_path = Path("../data/processed/full_dataset.parquet")
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Object columns can hold mixed Python types, which makes pyarrow raise
        # ArrowTypeError; write them as plain strings instead.
        print("\nPreparing data for parquet save...")
        df_to_save = df.copy()  # work on a copy so the in-memory df keeps its original values

        object_cols = df_to_save.select_dtypes(include=['object']).columns
        print(f"  Converting {len(object_cols)} object columns to string...")
        for col in object_cols:
            # Record which cells are missing BEFORE stringifying. astype(str) turns
            # NaN/None/pd.NA/NaT into the literal text 'nan'/'None'/'<NA>'/'NaT';
            # blanking by mask catches all of them and never touches a genuine
            # value whose text happens to be 'nan'.
            missing = df_to_save[col].isna()
            df_to_save[col] = df_to_save[col].astype(str).mask(missing, '')

        print(f"\nSaving to {output_path}...")
        print("  This may take a few minutes for large datasets...")
        df_to_save.to_parquet(output_path, compression='snappy', index=False, engine='pyarrow')
        print("✓ Saved dataset successfully!")
        print(f"  Size: {df.shape[0]:,} rows, {df.shape[1]} columns")
        print(f"  File size: {output_path.stat().st_size / 1024**2:.1f} MB")
    except Exception as e:
        # Best-effort save: report the failure in full rather than aborting the notebook
        print(f"❌ Error saving data: {e}")
        print(f"  Error type: {type(e).__name__}")
        import traceback
        print(f"  Details: {traceback.format_exc()}")
else:
    print("⚠ Cannot save - df is empty. Check previous cells.")



Preparing data for parquet save...
  Converting 34 object columns to string...

Saving to ..\data\processed\full_dataset.parquet...
  This may take a few minutes for large datasets...
✓ Saved dataset successfully!
  Size: 2,545,338 rows, 73 columns
  File size: 66.3 MB


In [7]:
# Peek at the top of the cleaned frame; the bare last expression is rendered as a table
print("\nFirst few rows:")
df.head(5)



First few rows:


Unnamed: 0,Name,WeightClassKg,Sex,Age,Division,Event,BodyweightKg,Equipment,Squat1Kg,Squat2Kg,...,Federation_meet,Date,MeetCountry,MeetState,MeetTown,MeetName,RuleSet,Sanctioned,MeetDate,WeightClassKg_numeric
0,Angie Belk Terry,60.0,F,47.0,M2,SBD,59.6,Wraps,38.56,47.63,...,365Strong,2016-10-29,USA,NC,Charlotte,Junior & Senior National Powerlifting Champion...,,,2016-10-29,60.0
1,Dawn Bogart,60.0,F,42.0,M1,SBD,58.51,Single-ply,120.2,136.08,...,365Strong,2016-10-29,USA,NC,Charlotte,Junior & Senior National Powerlifting Champion...,,,2016-10-29,60.0
2,Dawn Bogart,60.0,F,42.0,Open,SBD,58.51,Single-ply,120.2,136.08,...,365Strong,2016-10-29,USA,NC,Charlotte,Junior & Senior National Powerlifting Champion...,,,2016-10-29,60.0
3,Dawn Bogart,60.0,F,42.0,Open,B,58.51,Raw,,,...,365Strong,2016-10-29,USA,NC,Charlotte,Junior & Senior National Powerlifting Champion...,,,2016-10-29,60.0
4,Destiny Dula,67.5,F,18.0,T3,BD,63.68,Raw,,,...,365Strong,2016-10-29,USA,NC,Charlotte,Junior & Senior National Powerlifting Champion...,,,2016-10-29,67.5


In [None]:
# Summarise the column dtypes and the deep in-memory footprint of the cleaned frame
print("\nData types:")
print(df.dtypes)
total_mb = df.memory_usage(deep=True).sum() / 1024**2
print("\nMemory usage:")
print(f"{total_mb:.2f} MB")



Data types:
Name                             object
WeightClassKg                    object
Sex                              object
Age                             float64
Division                         object
                              ...      
MeetName                         object
RuleSet                          object
Sanctioned                       object
MeetDate                 datetime64[ns]
WeightClassKg_numeric           float64
Length: 73, dtype: object

Memory usage:
4283.48 MB
