In [2]:
import requests
import pandas as pd
import time
import os
from datetime import datetime
import sys
def fetch_details(game_id, player_id, team_id, context_measure="DEF_FGA"):
    """
    Fetch video details for a specific player in a specific game
    """
    base_url = "https://stats.nba.com/stats/videodetailsasset"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Referer": "https://www.nba.com",
        "Accept": "application/json",
        "Accept-Language": "en-US,en;q=0.9",
        "Origin": "https://www.nba.com"
    }
    
    params = {
        "GameID": '00'+str(game_id),
        "GameEventID": "",
        "PlayerID": str(player_id),
        "TeamID": str(team_id),
        "Season": "",
        "SeasonType": "",
        "AheadBehind": "",
        "CFID": "",
        "CFPARAMS": "",
        "ClutchTime": "",
        "Conference": "",
        "ContextFilter": "",
        "ContextMeasure": context_measure,
        "DateFrom": "",
        "DateTo": "",
        "Division": "",
        "EndPeriod": 0,
        "EndRange": 40800,
        "GROUP_ID": "",
        "GameSegment": "",
        "GroupID": "",
        "GroupMode": "",
        "GroupQuantity": 5,
        "LastNGames": 0,
        "Location": "",
        "Month": 0,
        "OnOff": "",
        "OppPlayerID": "",
        "OpponentTeamID": 0,
        "Outcome": "",
        "PORound": 0,
        "Period": 0,
        "PlayerID1": "",
        "PlayerID2": "",
        "PlayerID3": "",
        "PlayerID4": "",
        "PlayerID5": "",
        "PlayerPosition": "",
        "PointDiff": "",
        "Position": "",
        "RangeType": 0,
        "RookieYear": "",
        "SeasonSegment": "",
        "ShotClockRange": "",
        "StartPeriod": 0,
        "StartRange": 0,
        "StarterBench": "",
        "VsConference": "",
        "VsDivision": "",
        "VsPlayerID1": "",
        "VsPlayerID2": "",
        "VsPlayerID3": "",
        "VsPlayerID4": "",
        "VsPlayerID5": "",
        "VsTeamID": ""
    }
    
    try:
        response = requests.get(base_url, headers=headers, params=params, timeout=30)
        if response.status_code == 200:
            return response.json()
        else:
            print(f"Request failed with status code {response.status_code} for Player {player_id}, Game {game_id}")
            return None
    except requests.RequestException as e:
        print(f"Request error for Player {player_id}, Game {game_id}: {e}")
        return None

def process_video_data(video_json, player_id, team_id, player_name, year):
    """
    Process the video JSON response into a DataFrame
    """
    try:
        if video_json and 'resultSets' in video_json and 'playlist' in video_json['resultSets']:
            playlist = video_json['resultSets']['playlist']
            if playlist:  # Check if playlist is not empty
                df = pd.DataFrame(playlist)
                if not df.empty and all(col in df.columns for col in ['gi', 'ei', 'dsc']):
                    df = df[['gi', 'ei', 'dsc']]
                    df['def_id'] = player_id
                    df['team_id'] = team_id
                    df['player_name'] = player_name
                    df['year'] = year
                    return df
        return None
    except Exception as e:
        print(f"Error processing data for Player {player_id}: {e}")
        return None

def save_batch_data_by_year(year_data_dict, batch_num, output_dir="scraped_data"):
    """
    Save batch data organized by year
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    total_records_saved = 0
    
    for year, data_list in year_data_dict.items():
        if data_list:
            # Create year-specific directory
            year_dir = os.path.join(output_dir, f"year_{year}")
            if not os.path.exists(year_dir):
                os.makedirs(year_dir)
            
            # Combine all data for this year
            combined_df = pd.concat(data_list, ignore_index=True)
            
            # Save with timestamp and batch number
            filename = f"{year_dir}/batch_{batch_num}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
            combined_df.to_csv(filename, index=False)
            
            records_count = len(combined_df)
            total_records_saved += records_count
            print(f"Saved {year} batch {batch_num} with {records_count} records to {filename}")
    
    return total_records_saved

def scrape_nba_video_details(master_record_path, context_measure="DEF_FGA", 
                           delay_between_requests=2, batch_size=50):
    """
    Main scraper function that processes all unique player_id and game_id combinations,
    organizing data by year
    """
    # Load master record
    try:
        master_record = pd.read_csv(master_record_path)
        master_record['TEAM_ID']=master_record['TEAM_ID'].astype(int)
        master_record['PLAYER_ID']=master_record['PLAYER_ID'].astype(int)

        master_record=master_record[master_record.year>2024]
        print(f"Loaded master record with {len(master_record)} rows")
        print(f"Columns: {list(master_record.columns)}")
    except Exception as e:
        print(f"Error loading master record: {e}")
        return
    
    # Get unique combinations of player_id and game_id
    unique_combinations = master_record[['PLAYER_ID', 'GAME_ID', 'TEAM_ID', 'PLAYER_NAME', 'year']].drop_duplicates()
    total_combinations = len(unique_combinations)
    print(f"Found {total_combinations} unique player-game combinations to process")
    
    # Show year distribution
    year_counts = unique_combinations['year'].value_counts().sort_index()
    print(f"Data distribution by year:")
    for year, count in year_counts.items():
        print(f"  {year}: {count} combinations")
    
    # Initialize tracking variables
    year_data = {}  # Dictionary to hold data by year
    successful_requests = 0
    failed_requests = 0
    batch_num = 1
    total_records_saved = 0
    
    # Process each combination
    for idx, row in unique_combinations.iterrows():
        player_id = str(row['PLAYER_ID'])
        game_id = str(row['GAME_ID'])
        team_id = str(row['TEAM_ID'])
        player_name = row['PLAYER_NAME']
        year = row['year']
        print(player_id)
        print(game_id)
        print(team_id)
        sys.exit()
        
        print(f"Processing {idx + 1}/{total_combinations}: Player {player_name} ({player_id}) in Game {game_id} - {year}")
        
        # Fetch video details
        video_json = fetch_details(game_id, player_id, team_id, context_measure)
        
        if video_json:
            # Process the data
            processed_df = process_video_data(video_json, player_id, team_id, player_name, year)
            
            if processed_df is not None and not processed_df.empty:
                # Initialize year key if it doesn't exist
                if year not in year_data:
                    year_data[year] = []
                
                # Add data to the appropriate year
                year_data[year].append(processed_df)
                successful_requests += 1
                print(f"  ✓ Found {len(processed_df)} video records for {year}")
            else:
                print(f"  - No video data available for {year}")
        else:
            failed_requests += 1
        
        # Check if we should save a batch (based on total successful requests)
        if successful_requests > 0 and successful_requests % batch_size == 0:
            records_saved = save_batch_data_by_year(year_data, batch_num)
            total_records_saved += records_saved
            year_data = {}  # Clear all year data after saving
            batch_num += 1
        
        # Add delay between requests to be respectful to the API
        if idx < len(unique_combinations) - 1:  # Don't delay after the last request
            time.sleep(delay_between_requests)
    
    # Save any remaining data
    if any(year_data.values()):  # Check if there's any data left to save
        records_saved = save_batch_data_by_year(year_data, batch_num)
        total_records_saved += records_saved
    
    # Print summary
    print(f"\n=== SCRAPING COMPLETE ===")
    print(f"Total combinations processed: {total_combinations}")
    print(f"Successful requests: {successful_requests}")
    print(f"Failed requests: {failed_requests}")
    print(f"Total video records saved: {total_records_saved}")
    print(f"Success rate: {(successful_requests/total_combinations)*100:.1f}%")

def combine_batches_by_year(output_dir="scraped_data"):
    """
    Combine all batch files within each year into a single CSV per year
    """
    if not os.path.exists(output_dir):
        print(f"Output directory {output_dir} does not exist")
        return
    
    # Find all year directories
    year_dirs = [d for d in os.listdir(output_dir) if d.startswith('year_') and os.path.isdir(os.path.join(output_dir, d))]
    
    if not year_dirs:
        print("No year directories found")
        return
    
    print(f"Found {len(year_dirs)} year directories: {sorted(year_dirs)}")
    
    for year_dir in sorted(year_dirs):
        year_path = os.path.join(output_dir, year_dir)
        year = year_dir.replace('year_', '')
        
        # Find all batch files for this year
        batch_files = [f for f in os.listdir(year_path) if f.startswith('batch_') and f.endswith('.csv')]
        
        if not batch_files:
            print(f"No batch files found for {year}")
            continue
        
        print(f"\nProcessing {year}: Found {len(batch_files)} batch files")
        
        # Load and combine all batches for this year
        year_batches = []
        for file in batch_files:
            file_path = os.path.join(year_path, file)
            df = pd.read_csv(file_path)
            year_batches.append(df)
            print(f"  Loaded {file}: {len(df)} records")
        
        # Combine all batches for this year
        if year_batches:
            final_df = pd.concat(year_batches, ignore_index=True)
            
            # Remove duplicates if any
            initial_count = len(final_df)
            final_df = final_df.drop_duplicates()
            final_count = len(final_df)
            
            if initial_count != final_count:
                print(f"  Removed {initial_count - final_count} duplicate records for {year}")
            
            # Save final combined file for this year
            final_filename = f"combined_video_details_{year}.csv"
            final_path = os.path.join(year_path, final_filename)
            final_df.to_csv(final_path, index=False)
            print(f"  ✓ Combined file saved: {final_path} with {final_count} total records")

def get_year_summary(output_dir="scraped_data"):
    """
    Print a summary of data collected by year
    """
    if not os.path.exists(output_dir):
        print(f"Output directory {output_dir} does not exist")
        return
    
    year_dirs = [d for d in os.listdir(output_dir) if d.startswith('year_') and os.path.isdir(os.path.join(output_dir, d))]
    
    if not year_dirs:
        print("No year directories found")
        return
    
    print(f"\n=== DATA SUMMARY BY YEAR ===")
    total_records = 0
    
    for year_dir in sorted(year_dirs):
        year = year_dir.replace('year_', '')
        year_path = os.path.join(output_dir, year_dir)
        
        # Look for combined file first, otherwise count batch files
        combined_file = os.path.join(year_path, f"combined_video_details_{year}.csv")
        
        if os.path.exists(combined_file):
            df = pd.read_csv(combined_file)
            record_count = len(df)
            print(f"{year}: {record_count:,} records (combined)")
        else:
            # Count records in batch files
            batch_files = [f for f in os.listdir(year_path) if f.startswith('batch_') and f.endswith('.csv')]
            record_count = 0
            for file in batch_files:
                file_path = os.path.join(year_path, file)
                df = pd.read_csv(file_path)
                record_count += len(df)
            print(f"{year}: {record_count:,} records ({len(batch_files)} batch files)")
        
        total_records += record_count
    
    print(f"\nTotal records across all years: {total_records:,}")

# Example usage
if __name__ == "__main__":
    # Run the scraper
    scrape_nba_video_details(
        master_record_path='master_record.csv',
        context_measure="DEF_FGA",
        delay_between_requests=2,  # 2 seconds between requests
        batch_size=50  # Save every 50 successful requests
    )
    
    # Combine all batch files by year
    combine_batches_by_year()
    
    # Print summary
    get_year_summary()

Loaded master record with 28019 rows
Columns: ['PLAYER_ID', 'PLAYER_NAME', 'TEAM_ABBREVIATION', 'TEAM_ID', 'GAME_ID', 'year']
Found 28019 unique player-game combinations to process
Data distribution by year:
  2025: 28019 combinations
1631260
22400066
1610612749.0


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
