In [6]:
#!/usr/bin/env python
# coding: utf-8

import pandas as pd
import requests
import os
import time
from datetime import datetime
import glob

def format_date_to_url(date):
    """Return a YYYYMMDD date as the URL-encoded string ``MM%2FDD%2FYYYY``.

    ``date`` may be an int or a string; it is converted with ``str()`` and
    parsed with the ``%Y%m%d`` format, so a malformed value raises
    ``ValueError``.
    """
    parsed = datetime.strptime(str(date), '%Y%m%d')

    # "%2F" is the percent-encoding of "/", so join the parts with it directly.
    return '%2F'.join(parsed.strftime(part) for part in ('%m', '%d', '%Y'))

def pull_data(url, timeout=30):
    """Fetch a stats.nba.com endpoint and return its result set as a DataFrame.

    Parameters:
        url: Fully built request URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole run forever.

    Returns:
        A DataFrame built from the response's ``rowSet`` and header names.

    Raises:
        requests.RequestException: On connection errors, timeouts, or an HTTP
            error status.
        KeyError / ValueError: When the response body is not the expected JSON.
    """
    headers = {
        "Host": "stats.nba.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Referer": "https://stats.nba.com/",
        "Origin": "https://stats.nba.com",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
    }

    response = requests.get(url, headers=headers, timeout=timeout)
    # Surface HTTP failures as such instead of as a confusing JSON decode error.
    response.raise_for_status()
    payload = response.json()

    result_sets = payload["resultSets"]
    if len(result_sets) == 1:
        # Single result set: a one-element list with flat header names.
        result_set = result_sets[0]
        data = result_set["rowSet"]
        columns = result_set["headers"]
    else:
        # Otherwise "resultSets" is indexed as a mapping whose second header
        # entry carries the column names.
        data = result_sets["rowSet"]
        columns = result_sets["headers"][1]['columnNames']
    df = pd.DataFrame.from_records(data, columns=columns)

    time.sleep(.1)  # Respect rate limits
    return df

# URL templates for the three endpoints queried per date. "{variant}" is the
# per-request DefenseCategory / PtMeasureType value (unused by the hustle URL).
_DEFENSE_URL = (
    'https://stats.nba.com/stats/leaguedashptdefend?College=&Conference=&Country='
    '&DateFrom={date}&DateTo={date}&DefenseCategory={variant}&Division=&DraftPick='
    '&DraftYear=&GameSegment=&Height=&ISTRound=&LastNGames=0&LeagueID=00&Location='
    '&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PerMode=PerGame&Period=0'
    '&PlayerExperience=&PlayerPosition=&Season={season}&SeasonSegment='
    '&SeasonType={stype}&StarterBench=&TeamID=0&VsConference=&VsDivision=&Weight='
)
_HUSTLE_URL = (
    'https://stats.nba.com/stats/leaguehustlestatsplayer?College=&Conference=&Country='
    '&DateFrom={date}&DateTo={date}&Division=&DraftPick=&DraftYear=&GameScope=&Height='
    '&ISTRound=&LastNGames=0&LeagueID=00&Location=&Month=0&OpponentTeamID=0&Outcome='
    '&PORound=0&PaceAdjust=N&PerMode=PerGame&PlayerExperience=&PlayerPosition='
    '&PlusMinus=N&Rank=N&Season={season}&SeasonSegment=&SeasonType={stype}&TeamID=0'
    '&VsConference=&VsDivision=&Weight='
)
_PT_STATS_URL = (
    'https://stats.nba.com/stats/leaguedashptstats?College=&Conference=&Country='
    '&DateFrom={date}&DateTo={date}&Division=&DraftPick=&DraftYear=&GameScope=&Height='
    '&ISTRound=&LastNGames=0&LeagueID=00&Location=&Month=0&OpponentTeamID=0&Outcome='
    '&PORound=0&PerMode=PerGame&PlayerExperience=&PlayerOrTeam=Player&PlayerPosition='
    '&PtMeasureType={variant}&Season={season}&SeasonSegment=&SeasonType={stype}'
    '&StarterBench=&TeamID=0&VsConference=&VsDivision=&Weight='
)

# One row per request, in request order:
# (url template, variant, column prefix, log label,
#  column to rename to PLAYER_ID or None, first season-end year queried or None)
_ADDITIONAL_SOURCES = (
    (_DEFENSE_URL, 'Overall', 'overall_def_', 'Overall defense data', 'CLOSE_DEF_PERSON_ID', None),
    (_DEFENSE_URL, '3%20Pointers', 'three_pt_def_', '3PT defense data', 'CLOSE_DEF_PERSON_ID', None),
    (_DEFENSE_URL, '2%20Pointers', 'two_pt_def_', '2PT defense data', 'CLOSE_DEF_PERSON_ID', None),
    (_DEFENSE_URL, 'Less%20Than%206Ft', 'less_6ft_def_', '<6ft defense data', 'CLOSE_DEF_PERSON_ID', None),
    (_DEFENSE_URL, 'Less%20Than%2010Ft', 'less_10ft_def_', '<10ft defense data', 'CLOSE_DEF_PERSON_ID', None),
    (_DEFENSE_URL, 'Greater%20Than%2015Ft', 'more_15ft_def_', '>15ft defense data', 'CLOSE_DEF_PERSON_ID', None),
    (_HUSTLE_URL, '', 'hustle_', 'hustle stats', None, 2016),
    (_PT_STATS_URL, 'PostTouch', 'post_touch_', 'post touch stats', None, None),
    (_PT_STATS_URL, 'SpeedDistance', 'speed_distance_', 'speed distance stats', None, None),
)


def pull_additional_data(date_num, season, stype='Regular%20Season'):
    """
    Pull additional data from new endpoints for a specific date

    Parameters:
        date_num: Game date as YYYYMMDD (int or str).
        season: Season string such as ``'2024-25'``.
        stype: URL-encoded SeasonType query value.

    Returns:
        ``(result_df, passed)``. ``result_df`` is the outer merge on
        ``PLAYER_ID`` of every frame that was pulled successfully, with
        ``date`` and ``season`` columns added. Every other column carries a
        per-source prefix so the sources cannot collide. ``passed`` is False
        if any request failed. When every request failed, ``result_df`` is an
        empty frame with just the ``PLAYER_ID``, ``date`` and ``season``
        columns.
    """
    date = format_date_to_url(date_num)
    year = int(season.split('-')[0]) + 1

    additional_data_frames = []
    passed = True

    for template, variant, prefix, label, id_column, min_year in _ADDITIONAL_SOURCES:
        # Hustle stats are only requested for seasons ending in 2016 or later.
        if min_year is not None and year < min_year:
            continue

        url = template.format(date=date, season=season, stype=stype, variant=variant)
        try:
            df = pull_data(url)
            if id_column is not None:
                # The defense endpoint names the player key differently.
                df = df.rename(columns={id_column: 'PLAYER_ID'})
            df = df.rename(
                columns={col: f'{prefix}{col}' for col in df.columns if col != 'PLAYER_ID'}
            )
        except Exception as e:
            # Best effort: record the failure and keep pulling the other sources.
            passed = False
            print(f"Error pulling {label} for {date_num}: {str(e)}")
        else:
            additional_data_frames.append(df)
            print(f"Successfully pulled {label} for {date_num}")

    if not additional_data_frames:
        # Nothing to merge; report failure rather than raising IndexError.
        return pd.DataFrame(columns=['PLAYER_ID', 'date', 'season']), False

    # Merge all the additional data frames on PLAYER_ID
    result_df = additional_data_frames[0]
    for df in additional_data_frames[1:]:
        result_df = result_df.merge(df, on='PLAYER_ID', how='outer')

    # Add date and season information
    result_df['date'] = date_num
    result_df['season'] = season.replace('-', '_')

    return result_df, passed

# First date (YYYYMMDD) treated as playoffs, keyed by the season's ending year.
_PLAYOFF_START_DATES = {
    2014: 20140419,
    2015: 20150418,
    2016: 20160416,
    2017: 20170415,
    2018: 20180414,
    2019: 20190413,
    2020: 20200817,  # Delayed due to COVID-19
    2021: 20210522,
    2022: 20220416,
    2023: 20230415,
    2024: 20240420,
    2025: 20250420,
}


def _index_games_by_date(year):
    """Map each date found in ``{year}/*.csv`` to the game ids stored for it."""
    date_to_games = {}
    for game_file in glob.glob(f'{year}/*.csv'):
        # "*_updated.csv" files are outputs of an earlier, unfinished run.
        # Treating them as source games would create "*_updated_updated.csv".
        if game_file.endswith('_updated.csv'):
            continue
        try:
            game_id = os.path.basename(game_file).replace('.csv', '')
            game_df = pd.read_csv(game_file)
            if 'date' in game_df.columns:
                date = game_df['date'].iloc[0]
                date_to_games.setdefault(date, []).append(game_id)
        except Exception as e:
            print(f"Error reading file {game_file}: {str(e)}")
    return date_to_games


def _load_or_pull_additional(date, season, cutoff, additional_data_file):
    """Return the additional data for ``date``, or None if it is unavailable.

    A cached CSV is used when present. Otherwise the data is pulled and cached,
    but only when every request succeeded.
    """
    if os.path.exists(additional_data_file):
        print(f"Additional data file already exists for date {date}. Loading existing file.")
        try:
            additional_df = pd.read_csv(additional_data_file)
        except Exception as e:
            print(f"Error loading existing additional data for {date}: {str(e)}")
            return None
        print(f"Loaded existing additional data for date {date}")
        return additional_df

    print(f"Processing date: {date} - pulling new additional data")
    try:
        # Dates on or after the cutoff are queried as playoff games.
        season_type = 'Playoffs' if date >= cutoff else 'Regular%20Season'
        additional_df, passed = pull_additional_data(date, season, season_type)

        if not passed:
            print(f"Failed to pull additional data for date {date}")
            return None

        additional_df.to_csv(additional_data_file, index=False)
        print(f"Saved additional data for date {date}")
        return additional_df
    except Exception as e:
        print(f"Error processing additional data for date {date}: {str(e)}")
        return None


def _write_updated_game(year, game_id, additional_df):
    """Left-merge ``additional_df`` into one game file and save ``*_updated.csv``."""
    game_file = f'{year}/{game_id}.csv'
    updated_game_file = f'{year}/{game_id}_updated.csv'

    # Skip if updated file already exists
    if os.path.exists(updated_game_file):
        print(f"Updated game file already exists for {game_id}. Skipping.")
        return

    try:
        game_df = pd.read_csv(game_file)

        # Drop columns the additional data will supply, keeping the merge keys,
        # so the merge does not produce suffixed duplicates.
        merge_keys = ['PLAYER_ID', 'date']
        overlapping = [
            col for col in game_df.columns
            if col in additional_df.columns and col not in merge_keys
        ]
        if overlapping:
            game_df = game_df.drop(columns=overlapping)

        updated_df = game_df.merge(additional_df, on=merge_keys, how='left')
        updated_df.to_csv(updated_game_file, index=False)
        print(f"Updated game file: {game_id}")
    except Exception as e:
        print(f"Error updating game file {game_id}: {str(e)}")


def update_game_files(start_year, end_year, stype='Regular%20Season'):
    """
    Update existing game files with additional data for a range of seasons

    For each ``year`` in ``range(start_year, end_year)``, the game CSVs under
    ``{year}/`` are grouped by their ``date`` column. Per-date additional data
    is loaded from, or pulled and cached to,
    ``additional_data/{year}/{date}_additional.csv``. Each game is then written
    to ``{year}/{game_id}_updated.csv`` with the additional columns merged in.

    Parameters:
        start_year: First season-ending year to process (inclusive).
        end_year: Season-ending year to stop at (exclusive).
        stype: Unused. The season type is chosen per date from the playoff
            start date. The parameter is kept so existing callers still work.

    Seasons with no known playoff start date are skipped with a message.
    """
    for year in range(start_year, end_year):
        season = f"{year-1}-{str(year)[-2:]}"

        cutoff = _PLAYOFF_START_DATES.get(year)
        if cutoff is None:
            # Without a cutoff the season type cannot be chosen per date.
            print(f"No playoff start date known for season {season}. Skipping year {year}.")
            continue

        # Create a directory for additional data if it doesn't exist
        additional_data_dir = f'additional_data/{year}'
        os.makedirs(additional_data_dir, exist_ok=True)

        date_to_games = _index_games_by_date(year)

        for date in sorted(date_to_games):
            additional_data_file = f'{additional_data_dir}/{date}_additional.csv'
            additional_df = _load_or_pull_additional(date, season, cutoff, additional_data_file)
            if additional_df is None:
                continue

            # Update every game of this date, whether the data was loaded or pulled.
            for game_id in date_to_games[date]:
                _write_updated_game(year, game_id, additional_df)
 

def update_all_games_file(year):
    """
    Update the consolidated 'all_games' file with additional data

    Concatenates every ``{year}/*_updated.csv`` file, in sorted filename order,
    and writes the result to ``all_games/all_{year}_updated.csv`` and
    ``all_games/all_{year}_updated.parquet``. The ``all_games`` directory is
    created if needed. Files that cannot be read are reported and skipped.
    Nothing is written when no file could be read.
    """
    # Sort so the row order of the consolidated file is reproducible; glob
    # returns matches in arbitrary filesystem order.
    updated_game_files = sorted(glob.glob(f'{year}/*_updated.csv'))

    if not updated_game_files:
        print(f"No updated game files found for year {year}")
        return

    all_updated_games = []
    for game_file in updated_game_files:
        try:
            all_updated_games.append(pd.read_csv(game_file))
        except Exception as e:
            print(f"Error reading updated game file {game_file}: {str(e)}")

    if not all_updated_games:
        print(f"No valid updated game files to concatenate for year {year}")
        return

    all_games_df = pd.concat(all_updated_games, ignore_index=True)

    # The output directory may not exist yet on a fresh checkout.
    os.makedirs('all_games', exist_ok=True)

    # Save as CSV and parquet
    all_games_df.to_csv(f'all_games/all_{year}_updated.csv', index=False)
    all_games_df.to_parquet(f'all_games/all_{year}_updated.parquet', index=False)
    print(f"Updated consolidated files for year {year}")

def cleanup_and_rename(year):
    """
    After successful update, rename updated files to replace originals

    Every ``{year}/*_updated.csv`` file is moved over its original
    ``{year}/*.csv``. The consolidated ``all_games/all_{year}_updated`` CSV and
    parquet files are moved the same way when they exist.
    """
    suffix = '_updated.csv'

    # Rename individual game files
    for game_file in glob.glob(f'{year}/*{suffix}'):
        # Strip only the trailing suffix; str.replace would also rewrite a
        # match occurring earlier in the path.
        original_file = game_file[:-len(suffix)] + '.csv'
        # os.replace overwrites an existing destination on every platform,
        # whereas os.rename fails on Windows when the target exists.
        os.replace(game_file, original_file)
        print(f"Replaced {original_file} with updated version")

    # Rename consolidated files
    for extension in ('csv', 'parquet'):
        updated_path = f'all_games/all_{year}_updated.{extension}'
        if os.path.exists(updated_path):
            os.replace(updated_path, f'all_games/all_{year}.{extension}')
            print(f"Replaced all_games/all_{year}.{extension} with updated version")

def main():
    """Run the full update pipeline for the hard-coded season range.

    The three stages run in order over the same years: merge additional data
    into the per-game files, rebuild the consolidated files, then move the
    updated files over the originals.
    """
    # Season-ending years to process; the upper bound is exclusive.
    first_year, stop_year = 2025, 2026
    seasons = range(first_year, stop_year)

    # Stage 1: write the per-game "*_updated.csv" files.
    update_game_files(first_year, stop_year)

    # Stage 2: rebuild the consolidated files for every year.
    for season_year in seasons:
        update_all_games_file(season_year)

    # Stage 3: only after all consolidation, replace the originals.
    for season_year in seasons:
        cleanup_and_rename(season_year)

# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

Additional data file already exists for date 20241022. Loading existing file.
Loaded existing additional data for date 20241022
Updated game file: 22400062
Updated game file: 22400061
Additional data file already exists for date 20241023. Loading existing file.
Loaded existing additional data for date 20241023
Updated game file: 22400070
Updated game file: 22400063
Updated game file: 22400065
Updated game file: 22400072
Updated game file: 22400071
Updated game file: 22400069
Updated game file: 22400064
Updated game file: 22400068
Updated game file: 22400066
Updated game file: 22400067
Additional data file already exists for date 20241024. Loading existing file.
Loaded existing additional data for date 20241024
Updated game file: 22400075
Updated game file: 22400073
Updated game file: 22400074
Updated game file: 22400076
Additional data file already exists for date 20241025. Loading existing file.
Loaded existing additional data for date 20241025
Updated game file: 22400085
Updated game