In [1]:
import pandas as pd
import os
import glob
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

def get_team_acronym_from_id(team_id):
    """Translate a numeric team ID into its three-letter team acronym.

    Args:
        team_id: Team identifier as an int, float or numeric string.  Missing
            values (``None``/NaN, as judged by ``pd.notna``) are accepted.

    Returns:
        The acronym from the lookup table, or ``'UNKNOWN_<id>'`` when the ID
        is not in the table (``'UNKNOWN_None'`` for a missing ID).

    Raises:
        ValueError: If a non-missing ``team_id`` cannot be converted by ``int``.
    """
    # Normalise first: a float such as 1610612760.0 must become the string
    # '1610612760' to match the table keys.
    if pd.notna(team_id):
        lookup_key = str(int(team_id))
    else:
        lookup_key = None

    acronym_by_id = {
        '1610612737': 'ATL',
        '1610612738': 'BOS',
        '1610612739': 'CLE',
        '1610612740': 'NOP',
        '1610612741': 'CHI',
        '1610612742': 'DAL',
        '1610612743': 'DEN',
        '1610612744': 'GSW',
        '1610612745': 'HOU',
        '1610612746': 'LAC',
        '1610612747': 'LAL',
        '1610612748': 'MIA',
        '1610612749': 'MIL',
        '1610612750': 'MIN',
        '1610612751': 'BKN',
        '1610612752': 'NYK',
        '1610612753': 'ORL',
        '1610612754': 'IND',
        '1610612755': 'PHI',
        '1610612756': 'PHX',
        '1610612757': 'POR',
        '1610612758': 'SAC',
        '1610612759': 'SAS',
        '1610612760': 'OKC',
        '1610612761': 'TOR',
        '1610612762': 'UTA',
        '1610612763': 'MEM',
        '1610612764': 'WAS',
        '1610612765': 'DET',
        '1610612766': 'CHA',
    }

    try:
        return acronym_by_id[lookup_key]
    except KeyError:
        # Unmapped (or missing) IDs still get a usable, traceable label.
        return f'UNKNOWN_{lookup_key}'

def determine_game_type(game_id):
    """Classify a game as regular season or playoffs from its game ID.

    The decision is made on the first significant digit of the ID: ``4`` means
    playoffs and ``2`` means regular season.  Surrounding whitespace and
    leading zeros are ignored, so an ID that arrives as a zero-padded string
    (e.g. ``'0042300101'`` when the column is not parsed as an integer) is
    classified the same way as its integer form (``42300101``).

    Args:
        game_id: Game identifier as an int, float or string.

    Returns:
        ``'ps'`` for playoffs, ``'rs'`` for regular season, or ``'unknown'``
        for anything else (including empty or missing IDs).
    """
    # Drop zero padding so string and integer representations agree.
    significant_digits = str(game_id).strip().lstrip('0')

    if significant_digits.startswith('4'):
        return 'ps'  # Playoffs
    if significant_digits.startswith('2'):
        return 'rs'  # Regular season
    return 'unknown'

def collect_and_organize_data():
    """Collect scraped PBP CSVs and regroup them into per-team season files.

    Reads every ``pbp_data/*.csv`` (file names are expected to look like
    ``YYYY_GAMEID.csv``), splits each file's rows by ``teamId`` and writes one
    CSV per team / year / game type to ``organized_data/regular_season`` or
    ``organized_data/playoffs``.  Rows without a ``teamId`` are not written to
    any team file.

    A file is skipped (and counted as skipped) when its name has no ``_``,
    when it is empty, lacks the ``teamId`` or ``game_id`` column, has no valid
    team or game IDs, has an unrecognised game type, or raises while being
    processed.  Progress, the created files and a per-team summary are
    printed.

    Returns:
        None.
    """
    # Create output directories
    for out_dir in (
        'organized_data',
        'organized_data/regular_season',
        'organized_data/playoffs',
    ):
        os.makedirs(out_dir, exist_ok=True)

    # Get all CSV files from pbp_data directory
    pbp_files = glob.glob('pbp_data/*.csv')

    if not pbp_files:
        print("No PBP data files found in pbp_data directory!")
        return

    print(f"Found {len(pbp_files)} PBP data files")

    # team acronym -> year -> game type -> list of per-game DataFrames
    team_data = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

    required_columns = ['teamId', 'game_id']
    processed_files = 0
    skipped_files = 0

    for file_path in pbp_files:
        # One unreadable or malformed file must not abort the whole run, so
        # any failure is reported and the file is counted as skipped.
        try:
            # Extract year from filename (format: YYYY_GAMEID.csv)
            filename = os.path.basename(file_path)
            if '_' not in filename:
                print(f"Skipping file with unexpected format: {filename}")
                skipped_files += 1
                continue

            year = filename.split('_')[0]

            df = pd.read_csv(file_path)

            if df.empty:
                print(f"Skipping empty file: {filename}")
                skipped_files += 1
                continue

            missing_columns = [col for col in required_columns if col not in df.columns]
            if missing_columns:
                print(f"Skipping file {filename} - missing columns: {missing_columns}")
                skipped_files += 1
                continue

            unique_teams = df['teamId'].dropna().unique()
            if len(unique_teams) == 0:
                print(f"No valid team IDs found in {filename}")
                skipped_files += 1
                continue

            game_ids = df['game_id'].dropna().unique()
            if len(game_ids) == 0:
                print(f"No valid game IDs found in {filename}")
                skipped_files += 1
                continue

            # Use first game_id to determine game type (should be consistent within file)
            game_type = determine_game_type(game_ids[0])
            if game_type == 'unknown':
                print(f"Unknown game type for {filename} (game_id: {game_ids[0]})")
                skipped_files += 1
                continue

            # Group data by team
            for team_id in unique_teams:
                team_acronym = get_team_acronym_from_id(team_id)
                team_df = df[df['teamId'] == team_id].copy()

                if not team_df.empty:
                    team_data[team_acronym][year][game_type].append(team_df)

            processed_files += 1

            if processed_files % 100 == 0:
                print(f"Processed {processed_files} files...")

        except Exception as e:
            print(f"Error processing file {file_path}: {e}")
            skipped_files += 1

    print(f"\nProcessed {processed_files} files, skipped {skipped_files} files")

    # Combine and save data for each team/year/game_type combination
    total_files_created = 0

    for team_acronym, seasons in team_data.items():
        for year, frames_by_type in seasons.items():
            for game_type, frames in frames_by_type.items():
                combined_df = pd.concat(frames, ignore_index=True)

                # Sort by game_id and actionNumber for consistent ordering.
                # 'actionNumber' is not a required column, so only sort on the
                # keys that are present instead of raising KeyError after all
                # files have already been read.  A stable sort keeps the
                # source row order when only game_id is available.
                sort_columns = [
                    col for col in ('game_id', 'actionNumber')
                    if col in combined_df.columns
                ]
                combined_df = combined_df.sort_values(sort_columns, kind='mergesort')

                game_type_folder = 'regular_season' if game_type == 'rs' else 'playoffs'
                filename = f"organized_data/{game_type_folder}/{team_acronym}_{year}_{game_type}.csv"

                combined_df.to_csv(filename, index=False)

                total_files_created += 1
                print(f"Created: {filename} ({len(combined_df)} rows)")

    print("\nData organization complete!")
    print(f"Total files created: {total_files_created}")

    # Print summary statistics
    print("\nSummary by team:")
    for team_acronym in sorted(team_data):
        seasons = team_data[team_acronym]
        years = sorted(seasons)
        rs_years = [year for year in years if 'rs' in seasons[year]]
        ps_years = [year for year in years if 'ps' in seasons[year]]

        print(f"  {team_acronym}: RS years: {rs_years}, PS years: {ps_years}")

def verify_organized_data(sample_size=5):
    """Spot-check the CSV files written under ``organized_data``.

    Prints how many regular-season and playoff files exist, then reads a small
    sample of them and prints each file's row and column count.  A warning is
    printed when one file contains game IDs of more than one game type.

    The sample alternates between the two folders so that playoff files are
    checked even when there are many regular-season files, and the file lists
    are sorted so repeated runs sample the same files.

    Args:
        sample_size: Maximum number of files to read (default 5).

    Returns:
        None.
    """
    from itertools import zip_longest

    print("\nVerifying organized data files...")

    # glob order depends on the filesystem; sort for a reproducible sample.
    rs_files = sorted(glob.glob('organized_data/regular_season/*.csv'))
    ps_files = sorted(glob.glob('organized_data/playoffs/*.csv'))

    print(f"Regular season files: {len(rs_files)}")
    print(f"Playoff files: {len(ps_files)}")

    # Interleave the two folders; taking the head of rs_files + ps_files would
    # never reach a playoff file once there are sample_size regular-season ones.
    interleaved = [
        path
        for pair in zip_longest(rs_files, ps_files)
        for path in pair
        if path is not None
    ]
    sample_files = interleaved[:sample_size]

    for file_path in sample_files:
        try:
            df = pd.read_csv(file_path)
            filename = os.path.basename(file_path)
            print(f"  {filename}: {len(df)} rows, {len(df.columns)} columns")

            # Check game type consistency.  Missing game IDs are ignored so
            # they are not reported as a second ('unknown') game type.
            game_ids = df['game_id'].dropna().unique()
            unique_game_types = {determine_game_type(gid) for gid in game_ids}

            if len(unique_game_types) > 1:
                print(f"    WARNING: Mixed game types found: {unique_game_types}")

        except Exception as e:
            # Best-effort check: report the problem and move on to the next file.
            print(f"    ERROR reading {file_path}: {e}")

if __name__ == "__main__":
    # Script entry point: organize the scraped files first, then spot-check
    # what was written.
    print("Starting NBA PBP data collection and organization...")
    for pipeline_step in (collect_and_organize_data, verify_organized_data):
        pipeline_step()
    print("\nProcess complete!")

Starting NBA PBP data collection and organization...
Found 6422 PBP data files
Processed 100 files...
Processed 200 files...
Processed 300 files...
Processed 400 files...
Processed 500 files...
Processed 600 files...
Processed 700 files...
Processed 800 files...
Processed 900 files...
Processed 1000 files...
Processed 1100 files...
Processed 1200 files...
Processed 1300 files...
Processed 1400 files...
Processed 1500 files...
Processed 1600 files...
Processed 1700 files...
Processed 1800 files...
Processed 1900 files...
Processed 2000 files...
Processed 2100 files...
Processed 2200 files...
Processed 2300 files...
Processed 2400 files...
Processed 2500 files...
Processed 2600 files...
Processed 2700 files...
Processed 2800 files...
Processed 2900 files...
Processed 3000 files...
Processed 3100 files...
Processed 3200 files...
Processed 3300 files...
Processed 3400 files...
Processed 3500 files...
Processed 3600 files...
Processed 3700 files...
Processed 3800 files...
Processed 3900 fil