In [None]:
import requests
import pandas as pd
import numpy as np
import json
import time
import datetime
import os
import concurrent.futures
from datetime import timedelta

def get_team_dict():
    """Build the lookup table from team abbreviation to team ID.

    Returns:
        dict: Maps each three-letter team abbreviation (e.g. ``'BOS'``) to
        its team ID, stored as a string. Entries are in alphabetical order
        of abbreviation.
    """
    abbreviation_id_pairs = (
        ('ATL', '1610612737'),
        ('BKN', '1610612751'),
        ('BOS', '1610612738'),
        ('CHA', '1610612766'),
        ('CHI', '1610612741'),
        ('CLE', '1610612739'),
        ('DAL', '1610612742'),
        ('DEN', '1610612743'),
        ('DET', '1610612765'),
        ('GSW', '1610612744'),
        ('HOU', '1610612745'),
        ('IND', '1610612754'),
        ('LAC', '1610612746'),
        ('LAL', '1610612747'),
        ('MEM', '1610612763'),
        ('MIA', '1610612748'),
        ('MIL', '1610612749'),
        ('MIN', '1610612750'),
        ('NOP', '1610612740'),
        ('NYK', '1610612752'),
        ('OKC', '1610612760'),
        ('ORL', '1610612753'),
        ('PHI', '1610612755'),
        ('PHX', '1610612756'),
        ('POR', '1610612757'),
        ('SAC', '1610612758'),
        ('SAS', '1610612759'),
        ('TOR', '1610612761'),
        ('UTA', '1610612762'),
        ('WAS', '1610612764'),
    )
    return dict(abbreviation_id_pairs)

def get_season_dates():
    """Build the table of season date windows from 2015-16 through 2024-25.

    Returns:
        dict: Maps a season label such as ``'2015-16'`` to a
        ``(start_date, end_date)`` tuple of ``YYYY-MM-DD`` strings.
    """
    windows = {}
    windows['2015-16'] = ('2015-10-27', '2016-04-13')
    windows['2016-17'] = ('2016-10-25', '2017-04-12')
    windows['2017-18'] = ('2017-10-17', '2018-04-11')
    windows['2018-19'] = ('2018-10-16', '2019-04-10')
    # Bubble season: the window runs into August.
    windows['2019-20'] = ('2019-10-22', '2020-08-14')
    # Shortened season: starts in late December.
    windows['2020-21'] = ('2020-12-22', '2021-05-16')
    windows['2021-22'] = ('2021-10-19', '2022-04-10')
    windows['2022-23'] = ('2022-10-18', '2023-04-09')
    windows['2023-24'] = ('2023-10-24', '2024-04-14')
    # The end date for this season is a projection.
    windows['2024-25'] = ('2024-10-22', '2025-04-13')
    return windows

def determine_season(date_str):
    """Derive the season label (e.g. ``'2015-16'``) for a date string.

    Args:
        date_str: Date whose first four characters are the year and whose
            characters at positions 5-6 are the month (``YYYY-MM-DD``).

    Returns:
        str: ``'<opening year>-<last two digits of the following year>'``.
        September or later counts toward the season opening that year;
        earlier months count toward the season that opened the year before.
    """
    calendar_year = int(date_str[:4])
    calendar_month = int(date_str[5:7])

    # Months before September belong to the season that began the prior year.
    opening_year = calendar_year if calendar_month >= 9 else calendar_year - 1
    return f"{opening_year}-{str(opening_year + 1)[-2:]}"

def get_date_ranges(start_date, end_date, chunks=7):
    """Split an inclusive date span into consecutive, non-overlapping windows.

    Args:
        start_date: First day of the span, as ``YYYY-MM-DD``.
        end_date: Last day of the span (inclusive), as ``YYYY-MM-DD``.
        chunks: Maximum number of days per window; must be at least 1.

    Returns:
        list[tuple[str, str]]: ``(window_start, window_end)`` pairs of
        ``YYYY-MM-DD`` strings, in order. The last window is clipped to
        ``end_date``. The list is empty when ``start_date`` is after
        ``end_date``.

    Raises:
        ValueError: If ``chunks`` is less than 1, or if either date does not
            match ``YYYY-MM-DD``.
    """
    if chunks < 1:
        # A non-positive window never advances and would loop forever.
        raise ValueError(f"chunks must be at least 1, got {chunks}")

    start = datetime.datetime.strptime(start_date, '%Y-%m-%d')
    end = datetime.datetime.strptime(end_date, '%Y-%m-%d')

    date_ranges = []
    current = start

    # `<=` so the final day is still emitted when it falls exactly on a
    # window boundary (or when the whole span is a single day).
    while current <= end:
        range_end = min(current + timedelta(days=chunks - 1), end)
        date_ranges.append((
            current.strftime('%Y-%m-%d'),
            range_end.strftime('%Y-%m-%d'),
        ))
        current = range_end + timedelta(days=1)

    return date_ranges

def fetch_possessions(team, start_date, end_date, season_str=None, timeout=30):
    """Fetch offensive and defensive possessions for a team in a date range.

    Two requests are made to the pbpstats ``get-possessions`` endpoint: one
    with ``OffDef=Offense`` and one with ``OffDef=Defense``, one second apart.
    Every returned possession dict is tagged in place with ``Team``,
    ``IsOffense`` and ``Season``.

    A failed request (HTTP error, network error, timeout, undecodable body)
    is reported with ``print`` and that side is skipped, so the result may
    hold only one side or be empty.

    NOTE(review): a recorded run of this script received HTTP 403 for these
    requests; the endpoint may need extra headers or access — confirm.

    Args:
        team: Team abbreviation; must be a key of ``get_team_dict()``.
        start_date: Sent as ``FromDate`` (``YYYY-MM-DD``).
        end_date: Sent as ``ToDate`` (``YYYY-MM-DD``).
        season_str: Season label such as ``'2015-16'``. When ``None`` it is
            derived from ``start_date`` via ``determine_season``.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled connection cannot block the caller indefinitely.

    Returns:
        list[dict]: Offensive possessions followed by defensive possessions.

    Raises:
        KeyError: If ``team`` is not a known abbreviation.
    """
    team_dict = get_team_dict()
    if season_str is None:
        season_str = determine_season(start_date)
    url = "https://api.pbpstats.com/get-possessions/nba"

    params = {
        "league": 'nba',
        "TeamId": team_dict[team],
        "Season": season_str,
        "SeasonType": "All",
        "OffDef": "Offense",
        "StartType": "All",
        "FromDate": start_date,
        "ToDate": end_date,
    }

    # (OffDef query value, IsOffense flag, wording used in the error message)
    sides = (
        ("Offense", True, "offensive"),
        ("Defense", False, "defensive"),
    )

    all_possessions = []

    for position, (off_def, is_offense, label) in enumerate(sides):
        if position:
            time.sleep(1)  # Pause between the two requests to the API
        params["OffDef"] = off_def

        try:
            response = requests.get(url, params=params, timeout=timeout)
            response.raise_for_status()  # Raise exception for HTTP errors
            # `or []` also covers a payload whose "possessions" is null.
            possessions = response.json().get("possessions") or []
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON on requests
            # versions where that error is not a RequestException.
            print(f"Error fetching {label} possessions for {team} ({season_str}): {e}")
            continue

        # Add team info to each possession
        for possession in possessions:
            possession['Team'] = team
            possession['IsOffense'] = is_offense
            possession['Season'] = season_str

        all_possessions.extend(possessions)

    print(f"Fetched {len(all_possessions)} possessions for {team} from {start_date} to {end_date} ({season_str})")
    return all_possessions

def process_possessions(possessions):
    """Flatten raw possession dicts into uniform rows for CSV export.

    Args:
        possessions: Iterable of possession dicts.

    Returns:
        list[dict]: One row per possession. Each row holds the scalar fields
        listed below (with a default when the key is absent), then
        ``VideoUrlsCount`` / ``VideoUrls`` / ``VideoDescriptions`` derived
        from the ``VideoUrls`` entry, then ``EventsDescription`` /
        ``EventsCount`` derived from the ``Events`` entry.
    """
    # (column name, value used when the possession lacks that key), in the
    # order the columns appear in the output row.
    scalar_columns = (
        ('Team', ''),
        ('Season', ''),
        ('IsOffense', True),
        ('GameId', ''),
        ('GameDate', ''),
        ('Opponent', ''),
        ('Period', ''),
        ('StartTime', ''),
        ('EndTime', ''),
        ('StartType', ''),
        ('StartScoreDifferential', 0),
        ('FG2M', 0),
        ('FG2A', 0),
        ('FG3M', 0),
        ('FG3A', 0),
        ('OffensiveRebounds', 0),
        ('Turnovers', 0),
        ('ShootingFoulsDrawn', 0),
        ('NonShootingFoulsThatResultedInFts', 0),
    )

    rows = []
    for entry in possessions:
        record = {column: entry.get(column, fallback) for column, fallback in scalar_columns}

        # Video clips: store the count plus '; '-joined urls and descriptions.
        clips = entry.get('VideoUrls')
        if clips:
            record['VideoUrlsCount'] = len(clips)
            record['VideoUrls'] = '; '.join(clip.get('url', '') for clip in clips)
            record['VideoDescriptions'] = '; '.join(clip.get('description', '') for clip in clips)
        else:
            record.update(VideoUrlsCount=0, VideoUrls='', VideoDescriptions='')

        # Events: kept verbatim; the count is the number of '\n' occurrences.
        events = entry.get('Events')
        if events:
            record['EventsDescription'] = events
            record['EventsCount'] = events.count('\n')
        else:
            record.update(EventsDescription='', EventsCount=0)

        rows.append(record)

    return rows

def process_team_for_season(team, season_str, start_date, end_date, output_dir):
    """Download, flatten and save one team's possessions for one season.

    The season span is walked in 7-day windows. Each window is fetched with
    ``fetch_possessions`` and flattened with ``process_possessions``, with a
    half-second pause after every window. If any rows were collected they are
    written to ``<output_dir>/<season without dash>_<team>_possessions.csv``.

    Args:
        team: Team abbreviation.
        season_str: Season label such as ``'2015-16'``.
        start_date: First day to fetch (``YYYY-MM-DD``).
        end_date: Last day to fetch (``YYYY-MM-DD``).
        output_dir: Directory the CSV is written into.

    Returns:
        list[dict]: All flattened rows for the team (empty if nothing was
        fetched, in which case no file is written).
    """
    collected_rows = []

    for window_start, window_end in get_date_ranges(start_date, end_date, chunks=7):
        print(f"  Fetching {team} - {season_str} - {window_start} to {window_end}...")
        raw_possessions = fetch_possessions(team, window_start, window_end, season_str)
        if raw_possessions:
            collected_rows.extend(process_possessions(raw_possessions))

        # Brief pause before requesting the next window.
        time.sleep(0.5)

    # Nothing fetched: skip writing a file.
    if not collected_rows:
        return collected_rows

    season_tag = season_str.replace('-', '')
    csv_path = os.path.join(output_dir, f"{season_tag}_{team}_possessions.csv")
    pd.DataFrame(collected_rows).to_csv(csv_path, index=False)
    print(f"Saved {len(collected_rows)} possessions for {team} ({season_str}) to {csv_path}")

    return collected_rows

def scrape_nba_possessions(seasons=None, output_dir='nba_possessions_data', max_workers=3):
    """
    Scrape possession data for every team across one or more seasons.

    Seasons are handled one after another; within a season, teams are
    processed in parallel on a thread pool. Each team's work writes its own
    CSV (see ``process_team_for_season``). This function also writes one
    combined CSV per season and, when more than one season was requested, a
    single CSV covering all seasons.

    Args:
        seasons: List of seasons to scrape (e.g., ['2015-16', '2016-17']).
                 If None, every season in ``get_season_dates()`` is scraped.
        output_dir: Directory to save the data; created if missing.
        max_workers: Maximum number of concurrent worker threads per season.

    Returns:
        list[dict]: All flattened possession rows across the requested seasons.

    Raises:
        KeyError: If a requested season is not in ``get_season_dates()``.
    """
    season_dates = get_season_dates()
    if seasons is None:
        seasons = list(season_dates.keys())

    teams = list(get_team_dict().keys())

    # Make sure the destination exists before any worker tries to write.
    os.makedirs(output_dir, exist_ok=True)

    # Rows from every season, for the final combined file.
    all_data = []

    # One season at a time to avoid overwhelming the API.
    for season_str in seasons:
        print(f"\nProcessing {season_str} season...")
        start_date, end_date = season_dates[season_str]
        season_data = []

        # Fan the teams out over a bounded thread pool.
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_team = {}
            for team in teams:
                pending = executor.submit(
                    process_team_for_season, team, season_str, start_date, end_date, output_dir
                )
                future_to_team[pending] = team

            # Collect results as they finish; one failing team does not
            # abort the rest of the season.
            for finished in concurrent.futures.as_completed(future_to_team):
                team = future_to_team[finished]
                try:
                    team_data = finished.result()
                    if team_data:
                        season_data.extend(team_data)
                except Exception as e:
                    print(f"Error processing {team} for {season_str}: {e}")

        # No rows for this season: nothing to write.
        if not season_data:
            continue

        season_year = season_str.replace('-', '')
        season_file = os.path.join(output_dir, f"{season_year}_all_teams_possessions.csv")
        pd.DataFrame(season_data).to_csv(season_file, index=False)
        print(f"Saved {len(season_data)} total possessions for {season_str} to {season_file}")

        all_data.extend(season_data)

    # The cross-season file (can be very large) is only written for
    # multi-season runs that produced data.
    if all_data and len(seasons) > 1:
        all_file = os.path.join(output_dir, "all_seasons_possessions.csv")
        pd.DataFrame(all_data).to_csv(all_file, index=False)
        print(f"\nSaved {len(all_data)} total possessions across all seasons to {all_file}")

    return all_data

if __name__ == "__main__":
    # Script entry point: scrape the seasons listed below with three worker
    # threads, writing CSVs to scrape_nba_possessions' default output directory.
    #
    # Seasons to scrape (comment out the ones you don't want). Each label must
    # be a key of get_season_dates(); '2024-25' is defined there but is not
    # included in this list.
    seasons_to_scrape = [
        '2015-16',
        '2016-17',
        '2017-18',
        '2018-19',
        '2019-20',
        '2020-21',
        '2021-22',
        '2022-23',
        '2023-24',
    ]
    
    # To scrape only a subset, uncomment the next line and edit the seasons.
    # seasons_to_scrape = ['2020-21', '2021-22', '2022-23']
    
    print(f"Starting NBA possession data scraper for {len(seasons_to_scrape)} seasons...")
    scrape_nba_possessions(seasons=seasons_to_scrape, max_workers=3)

Starting NBA possession data scraper for 10 seasons...

Processing 2015-16 season...
  Fetching ATL - 2015-16 - 2015-10-27 to 2015-11-02...
  Fetching BKN - 2015-16 - 2015-10-27 to 2015-11-02...
  Fetching BOS - 2015-16 - 2015-10-27 to 2015-11-02...
Error fetching offensive possessions for BOS (2015-16): 403 Client Error: Forbidden for url: https://api.pbpstats.com/get-possessions/nba?league=nba&TeamId=1610612738&Season=2015-16&SeasonType=All&OffDef=Offense&StartType=All&FromDate=2015-10-27&ToDate=2015-11-02
Error fetching offensive possessions for ATL (2015-16): 403 Client Error: Forbidden for url: https://api.pbpstats.com/get-possessions/nba?league=nba&TeamId=1610612737&Season=2015-16&SeasonType=All&OffDef=Offense&StartType=All&FromDate=2015-10-27&ToDate=2015-11-02
Error fetching offensive possessions for BKN (2015-16): 403 Client Error: Forbidden for url: https://api.pbpstats.com/get-possessions/nba?league=nba&TeamId=1610612751&Season=2015-16&SeasonType=All&OffDef=Offense&StartType=