In [None]:
import requests
import pandas as pd
import numpy as np
import json
import time
import datetime
import os
import concurrent.futures
from datetime import timedelta
import glob
import sys
def get_team_dict():
    """Returns a dictionary mapping team abbreviations to team IDs."""
    return {
        'ATL': '1610612737', 'BKN': '1610612751', 'BOS': '1610612738', 'CHA': '1610612766',
        'CHI': '1610612741', 'CLE': '1610612739', 'DAL': '1610612742', 'DEN': '1610612743',
        'DET': '1610612765', 'GSW': '1610612744', 'HOU': '1610612745', 'IND': '1610612754',
        'LAC': '1610612746', 'LAL': '1610612747', 'MEM': '1610612763', 'MIA': '1610612748',
        'MIL': '1610612749', 'MIN': '1610612750', 'NOP': '1610612740', 'NYK': '1610612752',
        'OKC': '1610612760', 'ORL': '1610612753', 'PHI': '1610612755', 'PHX': '1610612756',
        'POR': '1610612757', 'SAC': '1610612758', 'SAS': '1610612759', 'TOR': '1610612761',
        'UTA': '1610612762', 'WAS': '1610612764'
    }

def get_season_dates():
    """Returns a dictionary of season start and end dates from 2015-16 to 2024-25."""
    return {
        '2015-16': ('2015-10-27', '2016-04-13'),
        '2016-17': ('2016-10-25', '2017-04-12'),
        '2017-18': ('2017-10-17', '2018-04-11'),
        '2018-19': ('2018-10-16', '2019-04-10'),
        '2019-20': ('2019-10-22', '2020-08-14'),  # Bubble season
        '2020-21': ('2020-12-22', '2021-05-16'),  # Shortened season
        '2021-22': ('2021-10-19', '2022-04-10'),
        '2022-23': ('2022-10-18', '2023-04-09'),
        '2023-24': ('2023-10-24', '2024-04-14'),
        '2024-25': ('2024-10-22', '2025-04-13'),  # Projected end date
    }

def determine_season(date_str):
    """Determine the season based on a date string."""
    year = int(date_str[:4])
    month = int(date_str[5:7])
    
    if month >= 9:  # New season starts around October
        return f"{year}-{str(year+1)[-2:]}"
    else:
        return f"{year-1}-{str(year)[-2:]}"

def get_date_ranges(start_date, end_date, chunks=7):
    """Generate date ranges between start and end dates with specified chunk size."""
    start = datetime.datetime.strptime(start_date, '%Y-%m-%d')
    end = datetime.datetime.strptime(end_date, '%Y-%m-%d')
    
    date_ranges = []
    current = start
    
    while current < end:
        range_end = min(current + timedelta(days=chunks-1), end)
        date_ranges.append((
            current.strftime('%Y-%m-%d'),
            range_end.strftime('%Y-%m-%d')
        ))
        current = range_end + timedelta(days=1)
    
    return date_ranges

def get_checkpoint_path(output_dir, season_str, team, start_date, end_date):
    """Generate a checkpoint filename for a specific team, date range."""
    season_year = season_str.replace('-', '')
    date_range_str = f"{start_date.replace('-', '')}_{end_date.replace('-', '')}"
    return os.path.join(output_dir, "checkpoints", f"{season_year}_{team}_{date_range_str}.json")

def save_checkpoint(possessions, output_dir, season_str, team, start_date, end_date):
    """Save checkpoint data for a specific scraping segment."""
    checkpoint_dir = os.path.join(output_dir, "checkpoints")
    os.makedirs(checkpoint_dir, exist_ok=True)
    
    checkpoint_path = get_checkpoint_path(output_dir, season_str, team, start_date, end_date)
    
    # Save minimal data to confirm completion
    checkpoint_data = {
        "season": season_str,
        "team": team,
        "start_date": start_date,
        "end_date": end_date,
        "count": len(possessions),
        "timestamp": datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        "completed": True
    }
    
    with open(checkpoint_path, 'w') as f:
        json.dump(checkpoint_data, f)
    
    return checkpoint_path

def check_checkpoint(output_dir, season_str, team, start_date, end_date):
    """Check if a checkpoint exists for this specific data segment."""
    checkpoint_path = get_checkpoint_path(output_dir, season_str, team, start_date, end_date)
    if os.path.exists(checkpoint_path):
        try:
            with open(checkpoint_path, 'r') as f:
                checkpoint_data = json.load(f)
            if checkpoint_data.get("completed", False):
                return True, checkpoint_data.get("count", 0)
        except (json.JSONDecodeError, IOError):
            # Invalid checkpoint file
            pass
    return False, 0

def check_team_season_complete(output_dir, season_str, team):
    """Check if a full team season file exists and has data."""
    season_year = season_str.replace('-', '')
    team_file = os.path.join(output_dir, f"{season_year}_{team}_possessions.csv")
    
    if os.path.exists(team_file):
        try:
            # Check if file has content beyond header
            df = pd.read_csv(team_file, nrows=5)
            if len(df) > 0:
                return True
        except:
            pass
    return False

def fetch_possessions(team, start_date, end_date, season_str=None):
    """Fetch both offensive and defensive possessions for a team in the given date range."""
    team_dict = get_team_dict()
    if season_str is None:
        season_str = determine_season(start_date)
    url = "https://api.pbpstats.com/get-possessions/nba"
    
    all_possessions = []
    
    # Fetch offensive possessions
    params = {
        "league": 'nba',
        "TeamId": team_dict[team],
        "Season": season_str,
        "SeasonType": "All",
        "OffDef": "Offense",
        "StartType": "All",
        "FromDate": start_date,
        "ToDate": end_date,
    }
    
    try:
        response = requests.get(url, params=params)
        response.raise_for_status()  # Raise exception for HTTP errors
        response_json = response.json()
        offensive_possessions = response_json.get("possessions", [])
        
        # Add team info to each possession
        for possession in offensive_possessions:
            possession['Team'] = team
            possession['IsOffense'] = True
            possession['Season'] = season_str
        
        all_possessions.extend(offensive_possessions)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching offensive possessions for {team} ({season_str}): {e}")
    
    # Fetch defensive possessions
    params['OffDef'] = "Defense"
    time.sleep(1)  # Reduced delay
    
    try:
        response = requests.get(url, params=params)
        response.raise_for_status()
        response_json = response.json()
        defensive_possessions = response_json.get("possessions", [])
        
        # Add team info to each possession
        for possession in defensive_possessions:
            possession['Team'] = team
            possession['IsOffense'] = False
            possession['Season'] = season_str
        
        all_possessions.extend(defensive_possessions)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching defensive possessions for {team} ({season_str}): {e}")
    
    print(f"Fetched {len(all_possessions)} possessions for {team} from {start_date} to {end_date} ({season_str})")
    return all_possessions

def process_possessions(possessions):
    """Process and normalize possession data for CSV export."""
    processed_data = []
    
    for possession in possessions:
        # Extract common fields
        print(possession)
     
        row = {
            'Team': possession.get('Team', ''),
            'Season': possession.get('Season', ''),
            'IsOffense': possession.get('IsOffense', True),
            'GameId': possession.get('GameId', ''),
            'GameDate': possession.get('GameDate', ''),
            'Opponent': possession.get('Opponent', ''),
            'Period': possession.get('Period', ''),
            'StartTime': possession.get('StartTime', ''),
            'EndTime': possession.get('EndTime', ''),
            'StartType': possession.get('StartType', ''),
            'StartScoreDifferential': possession.get('StartScoreDifferential', 0),
            'FG2M': possession.get('FG2M', 0),
            'FG2A': possession.get('FG2A', 0),
            'FG3M': possession.get('FG3M', 0),
            'FG3A': possession.get('FG3A', 0),
            'OffensiveRebounds': possession.get('OffensiveRebounds', 0),
            'Turnovers': possession.get('Turnovers', 0),
            'ShootingFoulsDrawn': possession.get('ShootingFoulsDrawn', 0),
            'NonShootingFoulsThatResultedInFts': possession.get('NonShootingFoulsThatResultedInFts', 0),
        }
        
        # Process VideoUrls array
        if 'VideoUrls' in possession and possession['VideoUrls']:
            row['VideoUrlsCount'] = len(possession['VideoUrls'])
            row['VideoUrls'] = '; '.join([v.get('url', '') for v in possession['VideoUrls']])
            row['VideoDescriptions'] = '; '.join([v.get('description', '') for v in possession['VideoUrls']])
        else:
            row['VideoUrlsCount'] = 0
            row['VideoUrls'] = ''
            row['VideoDescriptions'] = ''
        
        # Process Events array
        if 'Events' in possession and possession['Events']:
            row['EventsDescription'] = possession['Events']
            row['EventsCount'] = possession['Events'].count('\n')
        else:
            row['EventsDescription'] = ''
            row['EventsCount'] = 0
            
        processed_data.append(row)
        
    return processed_data

def load_existing_team_data(output_dir, season_str, team):
    """Load existing team data if available."""
    season_year = season_str.replace('-', '')
    team_file = os.path.join(output_dir, f"{season_year}_{team}_possessions.csv")
    
    if os.path.exists(team_file):
        try:
            return pd.read_csv(team_file).to_dict('records')
        except Exception as e:
            print(f"Warning: Could not load existing team data: {e}")
    
    return []

def process_team_for_season(team, season_str, start_date, end_date, output_dir):
    """Process all data for a single team and season with checkpoint support."""
    # First check if complete team season file exists
    if check_team_season_complete(output_dir, season_str, team):
        print(f"  Team {team} - {season_str} already has complete data, skipping...")
        return load_existing_team_data(output_dir, season_str, team)
    
    date_ranges = get_date_ranges(start_date, end_date, chunks=7)
    team_data = []
    
    for start, end in date_ranges:
        # Check if this date range was already processed
        checkpoint_exists, count = check_checkpoint(output_dir, season_str, team, start, end)
        if checkpoint_exists:
            print(f"  Skipping {team} - {season_str} - {start} to {end} (already processed {count} possessions)")
            continue
            
        print(f"  Fetching {team} - {season_str} - {start} to {end}...")
        try:
            possessions = fetch_possessions(team, start, end, season_str)
            if possessions:
                processed = process_possessions(possessions)
                team_data.extend(processed)
                # Save checkpoint after successful processing
                save_checkpoint(possessions, output_dir, season_str, team, start, end)
            else:
                # Save empty checkpoint to avoid retrying
                save_checkpoint([], output_dir, season_str, team, start, end)
        except Exception as e:
            print(f"  Error processing {team} - {season_str} - {start} to {end}: {e}")
            # Don't save checkpoint for errors - will retry next time
            
        # Reduced delay
        time.sleep(2)
    
    # Merge with any existing data
    existing_data = load_existing_team_data(output_dir, season_str, team)
    if existing_data:
        # Combine and remove duplicates based on GameId, Period, StartTime
        combined_data = existing_data + team_data
        if combined_data:
            df = pd.DataFrame(combined_data)
            df = df.drop_duplicates(subset=['GameId', 'Period', 'StartTime', 'IsOffense'])
            team_data = df.to_dict('records')
    
    if team_data:
        # Save team-specific data
        team_df = pd.DataFrame(team_data)
        season_year = season_str.replace('-', '')
        team_file = os.path.join(output_dir, f"{season_year}_{team}_possessions.csv")
        team_df.to_csv(team_file, index=False)
        print(f"Saved {len(team_data)} possessions for {team} ({season_str}) to {team_file}")
    
    return team_data

def check_season_complete(output_dir, season_str):
    """Check if a season file exists and appears complete."""
    season_year = season_str.replace('-', '')
    season_file = os.path.join(output_dir, f"{season_year}_all_teams_possessions.csv")
    
    if os.path.exists(season_file):
        # Load first few rows to check schema
        try:
            df = pd.read_csv(season_file, nrows=5)
            if len(df) > 0 and 'Team' in df.columns and 'Season' in df.columns:
                # Count unique teams to check completeness
                full_df = pd.read_csv(season_file)
                unique_teams = full_df['Team'].nunique()
                if unique_teams >= 28:  # Most seasons have 30 teams
                    print(f"Season {season_str} appears complete with {unique_teams} teams")
                    return True
        except Exception as e:
            print(f"Error checking season completeness: {e}")
    
    return False

def combine_team_files_for_season(output_dir, season_str):
    """Combine all team files for a season into one season file."""
    season_year = season_str.replace('-', '')
    pattern = os.path.join(output_dir, f"{season_year}_*_possessions.csv")
    team_files = glob.glob(pattern)
    
    # Filter out the combined file if it exists
    team_files = [f for f in team_files if not f.endswith(f"{season_year}_all_teams_possessions.csv")]
    
    if not team_files:
        print(f"No team files found for season {season_str}")
        return []
    
    all_data = []
    for file in team_files:
        try:
            df = pd.read_csv(file)
            team_name = os.path.basename(file).split('_')[1]
            print(f"  Adding {len(df)} rows from {team_name}")
            all_data.append(df)
        except Exception as e:
            print(f"Error reading {file}: {e}")
    
    if not all_data:
        return []
    
    # Combine all dataframes
    combined_df = pd.concat(all_data, ignore_index=True)
    # Remove duplicates
    combined_df = combined_df.drop_duplicates(subset=['GameId', 'Period', 'StartTime', 'IsOffense', 'Team'])
    
    return combined_df.to_dict('records')

def scrape_nba_possessions(seasons=None, output_dir='nba_possessions_data', max_workers=3):
    """
    Main function to scrape NBA possession data for multiple seasons with checkpoint support.
    
    Args:
        seasons: List of seasons to scrape (e.g., ['2015-16', '2016-17'])
                If None, scrapes all seasons from 2015-16 to 2024-25
        output_dir: Directory to save the data
        max_workers: Maximum number of concurrent workers for parallel processing
    """
    if seasons is None:
        seasons = list(get_season_dates().keys())
    
    teams = list(get_team_dict().keys())
    season_dates = get_season_dates()
    
    # Create output directory and checkpoints directory
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(os.path.join(output_dir, "checkpoints"), exist_ok=True)
    
    # Process seasons one at a time
    for season_str in seasons:
        print(f"\nChecking status of {season_str} season...")
        
        # Check if season is already complete
        if check_season_complete(output_dir, season_str):
            print(f"Season {season_str} is already complete, skipping...")
            continue
            
        print(f"Processing {season_str} season...")
        start_date, end_date = season_dates[season_str]
        
        # Create a progress tracking file 
        progress_file = os.path.join(output_dir, "checkpoints", f"{season_str.replace('-', '')}_progress.json")
        progress_data = {
            "season": season_str,
            "start_time": datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            "total_teams": len(teams),
            "completed_teams": 0,
            "in_progress": True
        }
        with open(progress_file, 'w') as f:
            json.dump(progress_data, f)
        
        # Use ThreadPoolExecutor to parallelize team processing
        completed_teams = 0
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit tasks
            future_to_team = {
                executor.submit(
                    process_team_for_season, team, season_str, start_date, end_date, output_dir
                ): team for team in teams
            }
            
            # Process completed tasks
            for future in concurrent.futures.as_completed(future_to_team):
                team = future_to_team[future]
                try:
                    future.result()
                    completed_teams += 1
                    # Update progress file
                    progress_data["completed_teams"] = completed_teams
                    with open(progress_file, 'w') as f:
                        json.dump(progress_data, f)
                except Exception as e:
                    print(f"Error processing {team} for {season_str}: {e}")
        
        # After all teams are processed, combine into season file
        print(f"Combining team files for {season_str}...")
        season_data = combine_team_files_for_season(output_dir, season_str)
        
        if season_data:
            season_year = season_str.replace('-', '')
            season_file = os.path.join(output_dir, f"{season_year}_all_teams_possessions.csv")
            season_df = pd.DataFrame(season_data)
            season_df.to_csv(season_file, index=False)
            print(f"Saved {len(season_data)} total possessions for {season_str} to {season_file}")
            
            # Update progress file as complete
            progress_data["in_progress"] = False
            progress_data["end_time"] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            progress_data["total_possessions"] = len(season_data)
            with open(progress_file, 'w') as f:
                json.dump(progress_data, f)
    
    # After all seasons are complete, create master file if needed
    if len(seasons) > 1:
        print("\nCreating master file with all seasons...")
        all_seasons_pattern = os.path.join(output_dir, "*_all_teams_possessions.csv")
        season_files = glob.glob(all_seasons_pattern)
        
        if season_files:
            all_data = []
            for file in season_files:
                season_id = os.path.basename(file).split('_')[0]
                print(f"  Reading {season_id} season data...")
                try:
                    df = pd.read_csv(file)
                    all_data.append(df)
                except Exception as e:
                    print(f"  Error reading {file}: {e}")
            
            if all_data:
                master_df = pd.concat(all_data, ignore_index=True)
                master_file = os.path.join(output_dir, "all_seasons_possessions.csv")
                print(f"Saving {len(master_df)} rows to master file...")
                master_df.to_csv(master_file, index=False)
                print(f"Master file created at {master_file}")
    
    print("\nScraping process complete!")
    return True

def resume_scraping():
    """Function to check scraping progress and resume from where it left off."""
    output_dir = 'nba_possessions_data'
    checkpoint_dir = os.path.join(output_dir, "checkpoints")
    
    if not os.path.exists(checkpoint_dir):
        print("No previous scraping detected. Starting fresh...")
        return scrape_nba_possessions()
    
    # Find progress files
    progress_files = glob.glob(os.path.join(checkpoint_dir, "*_progress.json"))
    
    if not progress_files:
        print("No progress tracking files found. Starting fresh...")
        return scrape_nba_possessions()
    
    # Check which seasons are in progress or not started
    seasons_to_scrape = []
    all_seasons = list(get_season_dates().keys())
    
    # Check each season's progressl_possesi
    for season_str in all_seasons:
        season_year = season_str.replace('-', '')
        progress_file = os.path.join(checkpoint_dir, f"{season_year}_progress.json")
        
        if os.path.exists(progress_file):
            try:
                with open(progress_file, 'r') as f:
                    progress = json.load(f)
                if progress.get("in_progress", False):
                    print(f"Season {season_str} was in progress. Will resume.")
                    seasons_to_scrape.append(season_str)
                else:
                    print(f"Season {season_str} was already completed.")
            except:
                print(f"Invalid progress file for {season_str}. Will include in scraping.")
                seasons_to_scrape.append(season_str)
        else:
            # Check if season file exists anyway
            if not check_season_complete(output_dir, season_str):
                print(f"No progress found for {season_str}. Will include in scraping.")
                seasons_to_scrape.append(season_str)
    
    if not seasons_to_scrape:
        print("All seasons appear to be complete! Creating master file if needed...")
        # Just run with all seasons - it will skip completed ones
        return scrape_nba_possessions()
    else:
        print(f"Resuming scraping for {len(seasons_to_scrape)} seasons: {', '.join(seasons_to_scrape)}")
        return scrape_nba_possessions(seasons=seasons_to_scrape)

if __name__ == "__main__":
    # Resume from previous scraping state
    print("Starting NBA possession data scraper with checkpoint support...")
    resume_scraping()

Starting NBA possession data scraper with checkpoint support...
Season 2015-16 was already completed.
Season 2016-17 was in progress. Will resume.
No progress found for 2017-18. Will include in scraping.
No progress found for 2018-19. Will include in scraping.
No progress found for 2019-20. Will include in scraping.
No progress found for 2020-21. Will include in scraping.
No progress found for 2021-22. Will include in scraping.
No progress found for 2022-23. Will include in scraping.
No progress found for 2023-24. Will include in scraping.
No progress found for 2024-25. Will include in scraping.
Resuming scraping for 9 seasons: 2016-17, 2017-18, 2018-19, 2019-20, 2020-21, 2021-22, 2022-23, 2023-24, 2024-25

Checking status of 2016-17 season...
Processing 2016-17 season...
  Skipping ATL - 2016-17 - 2016-10-25 to 2016-10-31 (already processed 588 possessions)
  Skipping ATL - 2016-17 - 2016-11-01 to 2016-11-07 (already processed 616 possessions)
  Skipping ATL - 2016-17 - 2016-11-08 to 

In [9]:
def convert_time_to_seconds(period, time_str):
    """Convert period and MM:SS format to total game seconds"""
    minutes, seconds = map(int, time_str.split(':'))
    
    # Calculate total seconds for all previous periods
    if period <= 4:
        period_seconds = (period - 1) * 720  # 12-minute periods
    else:
        period_seconds = 4 * 720 + (period - 5) * 300  # 5-minute OT periods
    
    # Calculate time passed in current period (counting down)
    print(period_seconds)
    current_period_length = 720 if period <= 4 else 300
    time_passed = current_period_length - (minutes * 60 + seconds)
    
    return period_seconds + time_passed
convert_time_to_seconds(5,'2:40')

2880


3020