In [None]:
import requests
import re
import pandas as pd
import numpy as np
import json
import os
from itertools import chain
from collections import defaultdict
import time
import glob

# Load the master record, keeping one row per distinct (GAME_ID, year) pair
record = pd.read_csv('master_record.csv')[['GAME_ID', 'year']].drop_duplicates()

def get_existing_games():
    """Collect the game ids that already have a CSV under ``pbp_data/``.

    Files are expected to be named ``<year>_<game_id>.csv``; everything after
    the first underscore is taken as the game id, so ids that themselves
    contain underscores are preserved. Files without an underscore are ignored.

    Returns:
        set[str]: game ids found on disk (empty if the directory is missing).
    """
    existing_games = set()

    if os.path.exists('pbp_data'):
        for csv_path in glob.glob('pbp_data/*.csv'):
            stem = os.path.basename(csv_path).replace('.csv', '')
            # partition splits on the first underscore only: "<year>_<rest>"
            _year, separator, game_id = stem.partition('_')
            if separator:
                existing_games.add(game_id)

    print(f"Found {len(existing_games)} already scraped games")
    return existing_games

def get_player_name(play):
    """Return the player's name for a play, or '' when none is available.

    ``playerName`` is preferred; ``playerNameI`` is used as a fallback when the
    former is absent or falsy.
    """
    for name_key in ('playerName', 'playerNameI'):
        if name_key in play and play[name_key]:
            return play[name_key]
    return ''

def is_field_goal(play):
    """Return True when the play's ``actionType`` is a 2pt or 3pt attempt."""
    action_type = play.get('actionType')
    return action_type in ('2pt', '3pt')

def get_play_by_play_data(game_id, timeout=30):
    """Fetch a game's play-by-play actions from the NBA CDN.

    Args:
        game_id: Game id without the leading ``00`` (it is added to the URL).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the whole scrape.

    Returns:
        pandas.DataFrame: one row per action, or an empty DataFrame when the
        request, JSON decoding or payload lookup fails (the error is printed).
    """
    pbp_url = f"https://cdn.nba.com/static/json/liveData/playbyplay/playbyplay_00{game_id}.json"

    try:
        response = requests.get(pbp_url, timeout=timeout)
        response.raise_for_status()
        actions = response.json()['game']['actions']
        return pd.DataFrame(actions)
    except Exception as e:
        # Deliberately best-effort: callers treat an empty frame as "failed".
        print(f"Error fetching PBP data for game {game_id}: {e}")
        return pd.DataFrame()

def parse_iso_clock(clock_str):
    """Split an ISO 8601 duration clock (``PT<m>M<s>S``) into (minutes, seconds).

    Both components are optional in the input. Fractional seconds are
    truncated. Missing, NaN or unparseable input yields ``(0, 0)``.
    """
    if not clock_str or pd.isna(clock_str):
        return 0, 0

    parsed = re.match(r'PT(?:(\d+)M)?(?:(\d+(?:\.\d+)?)S)?', str(clock_str))
    if not parsed:
        return 0, 0

    raw_minutes, raw_seconds = parsed.groups()
    whole_minutes = int(raw_minutes) if raw_minutes else 0
    whole_seconds = int(float(raw_seconds)) if raw_seconds else 0
    return whole_minutes, whole_seconds

def get_starters_data(game_id, timeout=30):
    """Fetch the starting lineups of both teams from the NBA CDN boxscore.

    Args:
        game_id: Game id without the leading ``00`` (it is added to the URL).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the whole scrape.

    Returns:
        str: person ids of all starters joined with ``|`` (home team first,
        then away team), or ``""`` when the request or parsing fails (the
        error is printed).
    """
    boxscore_url = f"https://cdn.nba.com/static/json/liveData/boxscore/boxscore_00{game_id}.json"

    try:
        response = requests.get(boxscore_url, timeout=timeout)
        response.raise_for_status()
        game = response.json()['game']

        # Players flagged with starter == '1' are the starters; home first.
        all_starters = [
            str(player['personId'])
            for side in ('homeTeam', 'awayTeam')
            for player in game[side]['players']
            if player.get('starter') == '1'
        ]
        return '|'.join(all_starters)

    except Exception as e:
        # Deliberately best-effort: an empty string means "starters unknown".
        print(f"Error fetching starters for game {game_id}: {e}")
        return ""

# Regulation layout used to compute the time left in the game.
_REGULATION_PERIODS = 4
_REGULATION_PERIOD_MINUTES = 12

# Action types kept from the raw play-by-play feed; everything else is dropped.
_TRACKED_ACTION_TYPES = [
    '2pt', 'rebound', '3pt', 'turnover', 'steal', 'foul',
    'freethrow', 'timeout', 'substitution', 'block',
]


def _describe_action(action_type, shot_result):
    """Label an action, e.g. '2pt made' / '3pt missed'; None when there is no action."""
    if not action_type or pd.isna(action_type):
        return None
    if action_type in ('2pt', '3pt'):
        if shot_result == 'Made':
            return f"{action_type} made"
        if shot_result == 'Missed':
            return f"{action_type} missed"
    return action_type


def _normalize_person_id(value):
    """Return a person id as a plain string ('' when missing), dropping a float '.0'."""
    if value is None or pd.isna(value):
        return ''
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


def process_pbp_data(pbp_df, starters_on, game_id):
    """Turn raw play-by-play actions into flat records with on-court lineups.

    Only the action types in ``_TRACKED_ACTION_TYPES`` are kept. For each kept
    action the clock is parsed, the previous/next kept action is labelled, and
    the set of players on court is updated from substitution events.

    Args:
        pbp_df: DataFrame of raw actions (as returned by
            ``get_play_by_play_data``); needs at least ``actionType``,
            ``clock`` and ``period`` columns.
        starters_on: ``|``-separated person ids on court at tip-off (as
            returned by ``get_starters_data``); may be empty.
        game_id: Identifier copied into every output record.

    Returns:
        tuple: ``(records, team_lineups)`` where ``records`` is a list of
        dicts (one per kept action) and ``team_lineups`` maps each acting
        ``teamId`` to the set of ``players_on`` strings seen on its actions.
        ``([], {})`` is returned when there is nothing to process.

        ``minutes_left_in_game`` is the regulation time remaining
        (48.0 at the start of Q1, 0.0 at the end of Q4); in overtime periods
        it is the time left on the clock of the current period.
    """
    if pbp_df.empty:
        return [], {}

    pbp_df = pbp_df[pbp_df.actionType.isin(_TRACKED_ACTION_TYPES)]
    pbp_df = pbp_df.replace({np.nan: None})

    if pbp_df.empty:
        return [], {}

    # Parse the ISO clock once per row and derive the display columns from it.
    clock_parts = [parse_iso_clock(raw_clock) for raw_clock in pbp_df['clock']]
    pbp_df['clock_minutes'] = [minutes for minutes, _ in clock_parts]
    pbp_df['clock_seconds'] = [seconds for _, seconds in clock_parts]
    pbp_df['clock_display'] = [f"{minutes:02}:{seconds:02}" for minutes, seconds in clock_parts]
    pbp_df['game_clock'] = [
        f"Q{period} {display}"
        for period, display in zip(pbp_df['period'], pbp_df['clock_display'])
    ]

    # Full periods still to be played (never negative, so overtime falls back
    # to the current period's clock) plus what is left on the current clock.
    period_numbers = pd.to_numeric(pbp_df['period'], errors='coerce')
    periods_remaining = (_REGULATION_PERIODS - period_numbers).clip(lower=0)
    pbp_df['minutes_left_in_game'] = (
        periods_remaining * _REGULATION_PERIOD_MINUTES
        + pbp_df['clock_minutes']
        + pbp_df['clock_seconds'] / 60
    )

    plays = pbp_df.to_dict('records')
    processed_data = []
    team_lineups = defaultdict(set)  # Track unique lineups per team
    players_on_list = starters_on.split('|') if starters_on else []
    previous_action = None

    for position, play in enumerate(plays):
        # Look ahead to the next kept action (None after the final one).
        upcoming = plays[position + 1] if position + 1 < len(plays) else {}
        next_action = _describe_action(upcoming.get('actionType'), upcoming.get('shotResult'))

        # Apply substitutions before recording, so the substitution row
        # already reflects the lineup after the change.
        if play.get('actionType') == 'substitution':
            sub_person = _normalize_person_id(play.get('personId'))
            sub_type = play.get('subType')
            if sub_person:
                if sub_type == 'out' and sub_person in players_on_list:
                    players_on_list.remove(sub_person)
                elif sub_type == 'in' and sub_person not in players_on_list:
                    players_on_list.append(sub_person)

        person_id = play.get('personId', None)
        assist_id = play.get('assistPersonId', None)
        formatted_players_on = '|'.join(sorted(players_on_list))

        # Lineups are keyed by the team that performed the action.
        team_id = play.get('teamId')
        if team_id and formatted_players_on:
            team_lineups[team_id].add(formatted_players_on)

        processed_data.append({
            'period': play.get('period', 0),
            'clock': play.get('clock', ''),
            'clock_display': play.get('clock_display', ''),
            'game_clock': play.get('game_clock', ''),
            'minutes_left_in_game': play.get('minutes_left_in_game', 0),
            'actionNumber': play.get('actionNumber', ''),
            'actionType': play.get('actionType', ''),
            'description': play.get('description', ''),
            'qualifier': play.get('qualifiers', []),
            'playerName': get_player_name(play),
            'scoreHome': play.get('scoreHome', 0),
            'scoreAway': play.get('scoreAway', 0),
            'shotResult': play.get('shotResult', ''),
            'isFieldGoal': is_field_goal(play),
            'assisted': assist_id is not None,
            'person_id': person_id,
            'assister_id': assist_id,
            'previous_action': previous_action,
            'next_action': next_action,
            'foulDrawnPersonId': play.get('foulDrawnPersonId', ''),
            'stealPersonId': play.get('stealPersonId', ''),
            'blockPersonId': play.get('blockPersonId', ''),
            'players_on': formatted_players_on,
            'teamId': team_id,
            'game_id': game_id,
        })

        previous_action = _describe_action(play.get('actionType'), play.get('shotResult'))

    return processed_data, team_lineups

def verify_existing_file(game_id, year):
    """Check that ``pbp_data/<year>_<game_id>.csv`` exists and looks usable.

    A file is accepted when it can be parsed, has at least one row, contains
    the ``actionType``, ``game_id`` and ``period`` columns, and its first
    ``game_id`` value equals ``game_id`` when both are compared as strings.

    Args:
        game_id: Game id the file is expected to hold.
        year: Year prefix used in the file name.

    Returns:
        bool: True when the file is valid; False when it is missing, unreadable
        or fails a check (a warning is printed for every case except a missing
        file), signalling that the game should be scraped again.
    """
    filename = f"pbp_data/{year}_{game_id}.csv"

    try:
        if not os.path.exists(filename):
            return False

        # Read game_id as text: type inference would turn "0022000001" into
        # 22000001 and make the id comparison below fail for such ids.
        df = pd.read_csv(filename, dtype={'game_id': str})

        if df.empty:
            print(f"Warning: Empty file found for game {game_id}, will re-scrape")
            return False

        required_columns = ['actionType', 'game_id', 'period']
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            print(f"Warning: Missing columns {missing_columns} in file for game {game_id}, will re-scrape")
            return False

        if str(df['game_id'].iloc[0]) != str(game_id):
            print(f"Warning: Game ID mismatch in file for game {game_id}, will re-scrape")
            return False

        return True

    except Exception as e:
        # Any read/parse problem (including a zero-byte file) means re-scrape.
        print(f"Warning: Error reading existing file for game {game_id}: {e}, will re-scrape")
        return False

# Pause after every game (successful or not) to avoid hammering the CDN.
_REQUEST_DELAY_SECONDS = 0.5


def _select_games_to_process(games_to_scrape, existing_games):
    """Return ([(game_id, year), ...] still to scrape, count of valid existing games)."""
    games_to_process = []
    skipped_games = 0

    # Iterate the columns directly: iterrows() would upcast GAME_ID to float
    # whenever the two columns have different numeric dtypes.
    for raw_game_id, year in zip(games_to_scrape['GAME_ID'], games_to_scrape['year']):
        game_id = str(raw_game_id)
        if game_id in existing_games and verify_existing_file(game_id, year):
            skipped_games += 1
        else:
            # Either never scraped, or the file on disk failed validation.
            games_to_process.append((game_id, year))

    return games_to_process, skipped_games


def _scrape_game(game_id, year):
    """Scrape one game and write its CSV; return its team lineups, or None if nothing was saved."""
    starters = get_starters_data(game_id)
    pbp_df = get_play_by_play_data(game_id)

    if pbp_df.empty:
        print(f"No PBP data found for game {game_id}")
        return None

    processed_data, team_lineups = process_pbp_data(pbp_df, starters, game_id)

    if not processed_data:
        print(f"No processed data for game {game_id}")
        return None

    # Save to CSV by year and game_id
    filename = f"pbp_data/{year}_{game_id}.csv"
    pd.DataFrame(processed_data).to_csv(filename, index=False)
    return team_lineups


def _write_lineup_indexes(all_team_lineups):
    """Write one lineup index CSV per (team, year), merging with any existing index."""
    for team_id, lineups_by_year in all_team_lineups.items():
        for year, lineup_game_pairs in lineups_by_year.items():
            lineup_games = defaultdict(set)
            for lineup, game_id in lineup_game_pairs:
                lineup_games[lineup].add(game_id)

            lineup_filename = f"team_lineups/team_{team_id}_year_{year}_lineups.csv"

            # Only newly scraped games are in memory; fold in the index from
            # earlier runs so an incremental run does not erase their entries.
            if os.path.exists(lineup_filename):
                try:
                    previous = pd.read_csv(lineup_filename, dtype=str)
                    for lineup, games in zip(previous['players_on'], previous['games']):
                        if isinstance(lineup, str) and isinstance(games, str):
                            lineup_games[lineup].update(games.split('|'))
                except (OSError, ValueError, KeyError) as e:
                    print(f"Warning: could not merge existing lineup index {lineup_filename}: {e}")

            lineup_data = [
                {
                    'team_id': team_id,
                    'year': year,
                    'players_on': lineup,
                    'games': '|'.join(sorted(games)),
                    'game_count': len(games),
                }
                for lineup, games in lineup_games.items()
            ]

            if lineup_data:
                pd.DataFrame(lineup_data).to_csv(lineup_filename, index=False)


def main():
    """Scrape play-by-play data for every not-yet-scraped game since 2020-21.

    Reads the module-level ``record`` frame, skips games whose CSV already
    exists and validates, writes one CSV per game under ``pbp_data/``, updates
    the per-team lineup indexes under ``team_lineups/``, prints a summary and
    saves the ids of failed games to ``failed_games.csv``.
    """
    # Filter games since 2020-21 season (assuming year column represents end year)
    games_to_scrape = record[record['year'] >= 2021].copy()
    print(f"Total games in master record: {len(games_to_scrape)}")

    # Create directories for output
    os.makedirs('pbp_data', exist_ok=True)
    os.makedirs('team_lineups', exist_ok=True)

    existing_games = get_existing_games()
    games_to_process, skipped_games = _select_games_to_process(games_to_scrape, existing_games)

    print(f"Skipping {skipped_games} already processed games")
    print(f"Games to process: {len(games_to_process)}")

    if not games_to_process:
        print("No new games to process!")
        return

    # team_id -> year -> set of (lineup, game_id) pairs
    all_team_lineups = defaultdict(lambda: defaultdict(set))

    failed_games = []
    successful_games = 0

    for idx, (game_id, year) in enumerate(games_to_process):
        print(f"Processing game {idx + 1}/{len(games_to_process)}: {game_id} (Year: {year})")

        try:
            team_lineups = _scrape_game(game_id, year)
        except Exception as e:
            # Top-level boundary: one bad game must not abort the whole run.
            print(f"Error processing game {game_id}: {e}")
            team_lineups = None
        finally:
            # Rate limiting applies to failures too, not only to successes.
            time.sleep(_REQUEST_DELAY_SECONDS)

        if team_lineups is None:
            failed_games.append(game_id)
            continue

        for team_id, lineups in team_lineups.items():
            for lineup in lineups:
                all_team_lineups[team_id][year].add((lineup, game_id))

        successful_games += 1
        print(f"Successfully processed game {game_id}")

    # Create team lineup index files
    print("\nCreating team lineup indexes...")
    _write_lineup_indexes(all_team_lineups)

    print(f"\nScraping complete!")
    print(f"Successfully processed: {successful_games} new games")
    print(f"Skipped existing games: {skipped_games}")
    print(f"Failed games: {len(failed_games)}")

    if failed_games:
        print("Failed game IDs:")
        for game_id in failed_games[:10]:  # Show first 10 failed games
            print(f"  {game_id}")

        # Save failed games list
        failed_df = pd.DataFrame({'failed_game_ids': failed_games})
        failed_df.to_csv('failed_games.csv', index=False)

# Run the scraper only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()