v4 works, so v5 is a cleaned-up version of it.


To get around the timeout errors we faced in prior scraping attempts, this approach continually writes results to a pkl file as the scraping progresses. This way, an error does not invalidate the progress already made, and the saved file provides a starting point for the next attempt.

# Scraping games to a pkl

## Extracting unique game_ids 

The unique game IDs are taken from the shots dataset (`all-shots.pkl`).

In [None]:
import pandas as pd

# `.to_pandas(use_pyarrow_extension_array=True)` is not a pandas method, so the
# pickled object is presumably a polars DataFrame -- TODO confirm.
df = pd.read_pickle('all-shots.pkl').to_pandas(use_pyarrow_extension_array=True)

# Distinct game identifiers found in the shots data; these drive the scrape.
game_ids = df['GAME_ID'].unique()

# Sanity check: how many distinct games are there to fetch?
# (nunique() ignores missing values by default, whereas unique() keeps them.)
print(df['GAME_ID'].nunique())

## Continual scraping 
featuring
- pkl cache
- tqdm + logging
- retries on timeout, with increasing (jittered) waits between attempts
- manual timeout waits
- ability to resume where the last attempt stopped (it hasn't crashed yet, so this is untested)


Note that the 'cache' is a dictionary that maps each game_id (key) to the JSON response returned by the endpoint for that game (value).

In [None]:
import requests
import pickle
import time
from pathlib import Path
from typing import Dict, Optional, List
import logging
from tqdm import tqdm
from requests.exceptions import Timeout, RequestException
import random

### helper methods

#### load/save the pkl cache

In [None]:
def load_or_create_cache(cache_path: str) -> Dict[str, dict]:
    """
    Return the cache stored at ``cache_path``, or an empty one if the file is absent.

    Args:
        cache_path: Location of the pickle file holding the cache

    Returns:
        The unpickled object when the file exists, otherwise a new empty dict
    """
    cache_file = Path(cache_path)

    # Nothing has been saved yet: start from scratch.
    if not cache_file.exists():
        return {}

    # NOTE: unpickling executes arbitrary code, so only load files this
    # notebook produced itself.
    with cache_file.open('rb') as handle:
        return pickle.load(handle)

In [None]:
def save_cache(cache: Dict[str, dict], cache_path: str) -> None:
    """
    Save the cache to disk without ever leaving a half-written cache file.

    The cache is pickled to a temporary sibling file first and only then
    renamed over ``cache_path``. If pickling fails or is interrupted, the
    previously saved cache is left untouched instead of being truncated.

    Args:
        cache: Mapping to persist
        cache_path: Destination pickle file
    """
    target = Path(cache_path)
    # Keep the temp file in the same directory so the final rename stays on
    # one filesystem.
    tmp_path = target.with_name(target.name + '.tmp')
    try:
        with open(tmp_path, 'wb') as f:
            pickle.dump(cache, f)
        # Swap the finished file into place, replacing any previous cache.
        tmp_path.replace(target)
    finally:
        # After a successful replace the temp file is already gone; after a
        # failure this removes the partial file.
        tmp_path.unlink(missing_ok=True)

#### scrape for a single game row

In [None]:
def create_game_level_row(game_id: str, timeout: int = 30, max_retries: int = 3) -> Optional[dict]:
    """
    Fetch game stats from NBA API with timeout handling and automatic retries.

    Only timeouts are retried, with an exponentially growing wait (plus
    random jitter) between attempts. Any other request failure, including
    HTTP error statuses raised by ``raise_for_status()``, is logged and ends
    the attempt immediately.

    Args:
        game_id: NBA game identifier; it is prefixed with '00' to build the
            GameID query parameter
        timeout: Request timeout in seconds
        max_retries: Maximum number of attempts before giving up on timeouts

    Returns:
        The decoded JSON response, or None if the request failed or every
        attempt timed out
    """
    headers = {
        "Host": "stats.nba.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0",
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate, br",
        "x-nba-stats-origin": "stats",
        "x-nba-stats-token": "true",
        "Connection": "keep-alive",
        "Referer": "https://stats.nba.com/",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
    }

    # Build the id outside the f-string: reusing the same quote character
    # inside an f-string expression is a syntax error before Python 3.12.
    padded_game_id = '00' + str(game_id)
    url = (
        'https://stats.nba.com/stats/boxscoretraditionalv3'
        f'?EndPeriod=0&EndRange=0&GameID={padded_game_id}'
        '&RangeType=0&StartPeriod=0&StartRange=0'
    )

    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
            return response.json()

        except Timeout:
            # Timeout must be handled before RequestException, its base class.
            if attempt + 1 >= max_retries:
                # No retry follows the last attempt, so waiting would only
                # delay the caller.
                logging.warning(f"Timeout for game {game_id} (attempt {attempt + 1}/{max_retries}).")
                break

            # Exponential backoff with jitter: 5s, 10s, 20s, ... plus 1-3s.
            wait_time = 5 * 2 ** attempt + random.uniform(1, 3)
            logging.warning(f"Timeout for game {game_id} (attempt {attempt + 1}/{max_retries}). "
                          f"Waiting {wait_time:.1f} seconds before retry...")
            time.sleep(wait_time)

        except RequestException as e:
            # Non-timeout failures are not retried.
            logging.error(f"Error fetching game {game_id}: {str(e)}")
            return None

    logging.error(f"Max retries ({max_retries}) reached for game {game_id}")
    return None

#### scrape for all games

In [None]:
def scrape_game_stats(game_ids: List[str], cache_path: str = 'nba_games_cache.pkl', 
                     delay: float = 1.0, save_frequency: int = 10,
                     timeout: int = 30, max_retries: int = 3) -> Dict[str, dict]:
    """
    Scrape game stats with progress bar, timeout handling, and automatic retries.

    Games already present in the cache file are skipped, so rerunning after a
    failure resumes where the previous run stopped. Games whose fetch returns
    None are not cached and are therefore retried on the next run. The cache
    is written to disk periodically, on any error, and also when the run is
    interrupted (e.g. Ctrl-C / kernel interrupt).

    Args:
        game_ids: List of NBA game IDs to scrape
        cache_path: Path to save/load the pickle cache file
        delay: Time to wait between requests in seconds
        save_frequency: How often to save the cache (every N successful requests)
        timeout: Request timeout in seconds
        max_retries: Maximum number of retry attempts for timeout errors

    Returns:
        Dictionary mapping game IDs to their stats data

    Raises:
        Whatever exception interrupted the scrape, re-raised after the cache
        has been saved
    """
    # Setup logging (has no effect if the root logger is already configured)
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    # Load existing cache
    cache = load_or_create_cache(cache_path)
    logging.info(f"Loaded cache with {len(cache)} existing games")

    # Filter out already cached games
    games_to_scrape = [gid for gid in game_ids if gid not in cache]
    logging.info(f"Found {len(games_to_scrape)} new games to scrape")

    successful_requests = 0

    # Create progress bar
    pbar = tqdm(games_to_scrape, desc="Scraping games", unit="game")

    # Pre-bind so the error handler can always name the game being processed
    game_id = None

    try:
        for game_id in pbar:
            # Update progress bar description with current game
            pbar.set_description(f"Scraping game {game_id}")

            # Fetch game data with timeout handling and retries
            game_data = create_game_level_row(game_id, timeout=timeout, max_retries=max_retries)

            if game_data is not None:
                cache[game_id] = game_data
                successful_requests += 1

                # Update progress bar postfix with success count
                pbar.set_postfix(
                    successful=successful_requests,
                    cached_total=len(cache)
                )

                # Save periodically
                if successful_requests % save_frequency == 0:
                    save_cache(cache, cache_path)
                    logging.info(f"Saved cache with {len(cache)} games")

            # Wait between requests
            time.sleep(delay)

    except BaseException as e:
        # BaseException (not just Exception) so that a KeyboardInterrupt also
        # preserves the games fetched since the last periodic save.
        if isinstance(e, Exception):
            logging.error(f"Unexpected error processing game {game_id}: {str(e)}")
        else:
            logging.warning(f"Scraping interrupted while processing game {game_id}")
        # Save cache on error to preserve progress
        save_cache(cache, cache_path)
        logging.info("Saved cache due to error")
        raise

    finally:
        # Close progress bar on every exit path, not only on success
        pbar.close()

    # Final save
    save_cache(cache, cache_path)
    logging.info(f"Scraping completed. Final cache contains {len(cache)} games")

    return cache

### main()

Note: the 1 s delay may be overkill, but since my attempts did not crash, I did not try to tune it.

In [None]:
# Run the scrape for every game id collected from the shots dataset.
# Settings used: a 1 second pause between requests, a cache save after every
# 10 successful requests, a 30 second request timeout, and up to 3 attempts
# per game when a request times out.
try:
    game_stats_2024 = scrape_game_stats(
        game_ids=game_ids,
        cache_path='nba_games_cache.pkl',
        delay=1.0,
        save_frequency=10,
        timeout=30,
        max_retries=3,
    )
except Exception as scrape_error:
    # Best effort: report the failure and move on. When this branch runs,
    # game_stats_2024 is not assigned, but the cache file on disk still holds
    # whatever scrape_game_stats managed to save.
    logging.error(f"Scraping stopped due to error: {scrape_error}")

# Processing pkl cache into dataset df

In [None]:
import pandas as pd
import pickle

Load the pkl file into the cache dictionary.

In [None]:
# Read the scraped cache back from disk (a dict of game id -> response JSON,
# as written by the scraping section).
with open('nba_games_cache.pkl', 'rb') as cache_file:
    cache = pickle.loads(cache_file.read())
# Optional sanity check; needs `game_ids` from the first section to be defined:
#print(f'number of game ids: {len(game_ids)}\nnumber of scraped games: {len(cache)}')

Iterate through the cache, using `pd.json_normalize()` to create a row from each game id's value.

Note: I had a more complicated and verbose version of this wrapped in a function, but while it worked for the 2024 sample cache, it didn't work for the entire game cache.

In [None]:
# Flatten every cached response into a DataFrame and stack them into one
# game-level dataset. Entries that cannot be flattened are skipped and counted.
all_games = []
skipped_game_ids = []
for game_id, game_data in cache.items():
    # An entry without a payload cannot produce a row. This is an explicit
    # check rather than an assert, because asserts are stripped under -O.
    if game_data is None:
        skipped_game_ids.append(game_id)
        continue

    try:
        # For a dict payload, json_normalize flattens nested keys into
        # dotted column names and yields a single-row frame. The loop uses
        # its own name so the shots DataFrame `df` is not overwritten.
        game_frame = pd.json_normalize(game_data)
    except Exception as err:
        # Malformed payloads can fail in several ways; report which game was
        # dropped and why instead of swallowing the error silently. Unlike a
        # bare except, this lets KeyboardInterrupt stop the loop.
        print(f'Skipping game {game_id}: {err!r}')
        skipped_game_ids.append(game_id)
        continue

    all_games.append(game_frame)

num_skipped = len(skipped_game_ids)
print(f'Num skipped: {num_skipped}')

# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame when no game could be processed.
game_level_dataset = pd.concat(all_games, ignore_index=True) if all_games else pd.DataFrame()
print(f'shape of dataset: {game_level_dataset.shape}')
game_level_dataset.head()

Export the dataset to CSV.

In [None]:
# Write the assembled dataset to disk; index=False leaves the row index out
# of the file.
game_level_dataset.to_csv(path_or_buf='game-level-dataset.csv', index=False)