In [14]:
import pandas as pd
import requests
import os
import time
from datetime import datetime

def format_date_to_url(date):
    """Turn a ``YYYYMMDD`` date into a URL-encoded ``MM/DD/YYYY`` string.

    ``date`` is passed through ``str()`` first, so an int such as
    ``20140419`` and the string ``'20140419'`` are both accepted.  The
    slashes are emitted already percent-encoded (``%2F``) so the result can
    be dropped straight into a query string.

    Raises:
        ValueError: propagated from ``datetime.strptime`` when the value
            does not parse with the ``%Y%m%d`` format.
    """
    parsed = datetime.strptime(str(date), '%Y%m%d')
    month, day, year = f"{parsed:%m}", f"{parsed:%d}", f"{parsed:%Y}"
    # "%2F" is the percent-encoding of "/".
    return f"{month}%2F{day}%2F{year}"

def pull_data(url, timeout=30):
    """Fetch a stats.nba.com endpoint and return its result set as a DataFrame.

    Args:
        url: Fully-formed request URL.
        timeout: Seconds to wait for the server before giving up.  Without
            a timeout ``requests`` waits indefinitely, which would stall a
            long per-date scraping loop on a single unresponsive request.

    Returns:
        A ``pandas.DataFrame`` built from the payload's ``rowSet`` using the
        column names found in its ``headers``.

    Raises:
        requests.exceptions.RequestException: on timeout, connection
            failure, or a non-2xx HTTP status.
        KeyError / TypeError: if the payload does not have one of the two
            ``resultSets`` layouts handled below.
    """
    headers = {
        "Host": "stats.nba.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Referer": "https://stats.nba.com/",
        "Origin": "https://stats.nba.com",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
    }

    response = requests.get(url, headers=headers, timeout=timeout)
    # Surface HTTP errors directly instead of failing later while parsing
    # an error page as JSON.
    response.raise_for_status()
    payload = response.json()
    result_sets = payload["resultSets"]

    if len(result_sets) == 1:
        # Layout 1: a one-element list holding flat "headers" and "rowSet".
        data = result_sets[0]["rowSet"]
        columns = result_sets[0]["headers"]
    else:
        # Layout 2: a single mapping whose "headers" is a list of header
        # groups; the second group carries the column names.
        data = result_sets["rowSet"]
        columns = result_sets["headers"][1]['columnNames']

    df = pd.DataFrame.from_records(data, columns=columns)

    # Short pause after every request (presumably to stay polite to the
    # API; confirm before removing).
    time.sleep(0.1)
    return df

def _merge_daily_stats(existing_df, date_dfs):
    """Left-join the per-date stat frames onto ``existing_df`` by (date, TEAM_ID)."""
    frames = [
        frame.assign(date=date_num)
        for date_num, frame in date_dfs.items()
        if not frame.empty and 'TEAM_ID' in frame.columns
    ]
    if not frames:
        # Nothing was fetched (or nothing usable): keep the data unchanged.
        return existing_df.copy()

    daily = pd.concat(frames, ignore_index=True)
    # If a team appears more than once for a date, the first row wins.
    daily = daily.drop_duplicates(subset=['date', 'TEAM_ID'], keep='first')
    # A left merge keeps every existing row, in order, and preserves the
    # dtypes of the existing columns.
    return existing_df.merge(daily, how='left', on=['date', 'TEAM_ID'])


def update_team_stats(start_year, end_year, ps=False):
    """
    Updates existing team stats files with additional metrics

    For each ``year`` in ``range(start_year, end_year)`` (``end_year`` is
    excluded), reads ``year_files/{year}[ps]_teamgames.csv``, requests the
    "Misc" team stats for every distinct value of its ``date`` column, and
    adds every returned column that the file does not already have, matched
    on ``date`` and ``TEAM_ID``.  The result is written to
    ``year_files/{year}[ps]_teamgames_updated.csv``; the original file is
    not modified.

    Args:
        start_year: First season-ending year to process.
        end_year: Stop value (exclusive).
        ps: When True, process the playoff files (``ps`` suffix) and query
            ``SeasonType=Playoffs``; otherwise the regular season.

    Years whose input file is missing are skipped with a message.  A date
    whose request fails is reported and left out, so its rows get NaN in
    any newly added columns.
    """
    trail = 'ps' if ps else ''
    stype = 'Playoffs' if ps else 'Regular%20Season'

    for year in range(start_year, end_year):
        # Check if the file exists
        file_path = f'year_files/{year}{trail}_teamgames.csv'
        if not os.path.exists(file_path):
            print(f"File {file_path} does not exist. Skipping...")
            continue

        # Load existing data
        existing_df = pd.read_csv(file_path)
        print(f"Loaded file for year {year}: {len(existing_df)} records")

        unique_dates = sorted(existing_df['date'].unique().tolist())
        # Loop-invariant, so computed once; a set gives O(1) membership tests.
        existing_columns = set(existing_df.columns)

        # e.g. year 2014 -> "2013-14"
        season = str(year - 1) + '-' + str(year)[-2:]

        date_dfs = {}
        failed_dates = []

        for date_num in unique_dates:
            try:
                date = format_date_to_url(date_num)

                # Misc stats URL
                url_misc = f'https://stats.nba.com/stats/leaguedashteamstats?College=&Conference=&Country=&DateFrom={date}&DateTo={date}&Division=&DraftPick=&DraftYear=&GameScope=&GameSegment=&Height=&ISTRound=&LastNGames=0&LeagueID=00&Location=&MeasureType=Misc&Month=0&OpponentTeamID=0&Outcome=&PORound=&PaceAdjust=N&PerMode=Totals&Period=0&PlayerExperience=&PlayerPosition=&PlusMinus=N&Rank=N&Season={season}&SeasonSegment=&SeasonType={stype}&ShotClockRange=&StarterBench=&TeamID=0&VsConference=&VsDivision=&Weight='

                misc_df = pull_data(url_misc)
            except Exception as e:
                # Best effort: one bad date must not abort the whole season.
                print(f"Error processing date {date_num}: {str(e)}")
                failed_dates.append(date_num)
                time.sleep(1)
                continue

            # Keep only the TEAM_ID join key and the columns that are new.
            misc_columns = [
                col for col in misc_df.columns
                if col not in existing_columns or col == 'TEAM_ID'
            ]
            date_dfs[date_num] = misc_df[misc_columns]
            print(f"Processed date: {date_num}")

        if failed_dates:
            print(f"{len(failed_dates)} date(s) could not be fetched for year {year}: {failed_dates}")

        # Attach the new columns to the existing rows in a single merge.
        updated_df = _merge_daily_stats(existing_df, date_dfs)

        # Save to a new file to be safe; replace the original only after
        # the updated file has been verified.
        updated_file = f'year_files/{year}{trail}_teamgames_updated.csv'
        updated_df.to_csv(updated_file, index=False)
        print(f"Saved updated data to {updated_file}")

def main():
    """Run the playoff update for the 2014 file.

    ``update_team_stats`` iterates ``range(start_year, end_year)``, so the
    pair (2014, 2015) processes the single year 2014.  Pass ``ps=False``
    instead to process the regular-season file.
    """
    first_year, stop_year = 2014, 2015
    update_team_stats(first_year, stop_year, ps=True)


# Only run when executed as a script / notebook cell, not on import.
if __name__ == "__main__":
    main()

Loaded file for year 2014: 178 records
Processed date: 20140419
Processed date: 20140420
Processed date: 20140421
Processed date: 20140422
Processed date: 20140423
Processed date: 20140424
Processed date: 20140425
Processed date: 20140426
Processed date: 20140427
Processed date: 20140428
Processed date: 20140429
Processed date: 20140430
Processed date: 20140501
Processed date: 20140502
Processed date: 20140503
Processed date: 20140504
Processed date: 20140505
Processed date: 20140506
Processed date: 20140507
Processed date: 20140508
Processed date: 20140509
Processed date: 20140510
Processed date: 20140511
Processed date: 20140512
Processed date: 20140513
Processed date: 20140514
Processed date: 20140515
Processed date: 20140518
Processed date: 20140519
Processed date: 20140520
Processed date: 20140521
Processed date: 20140524
Processed date: 20140525
Processed date: 20140526
Processed date: 20140527
Processed date: 20140528
Processed date: 20140529
Processed date: 20140530
Processed d

In [16]:
import pandas as pd
trail = 'ps'
# For each season, compare the original team-games file with its "_updated"
# counterpart.  When the columns the two files share hold identical data,
# the updated file (which may carry extra columns) overwrites the original.
for year in range(2014,2025):

    file = f'year_files/{year}{trail}_teamgames.csv'
    file2 = f'year_files/{year}{trail}_teamgames_updated.csv'

    # Load the data; a season with a missing file is skipped rather than
    # aborting the remaining seasons.
    try:
        df = pd.read_csv(file)
        df2 = pd.read_csv(file2)
    except FileNotFoundError as err:
        print(f"Skipping {year}: {err}")
        continue

    print(len(df.columns))
    print(len(df2.columns))
    # Columns that exist only in the updated file, in file order so the
    # log output is deterministic.
    non_shared_columns = [col for col in df2.columns if col not in df.columns]
    print(non_shared_columns)

    # If the updated file lost an original column the files cannot match;
    # report that instead of raising KeyError on the selection below.
    missing_columns = [col for col in df.columns if col not in df2.columns]
    if missing_columns:
        print(f"Updated file is missing original columns: {missing_columns}")
        print("The DataFrames are NOT equal.")
        continue

    # Restrict the updated data to the original columns, in original order.
    newdf = df2[df.columns]

    # Print basic information
    print(f"Original df columns: {len(df.columns)}")
    print(f"Updated df2 columns: {len(df2.columns)}")
    print(f"Non-shared columns dropped: {non_shared_columns}")
    print(f"Number of rows in df: {len(df)}")
    print(f"Number of rows in newdf: {len(newdf)}")

    # Check if the DataFrames are equal
    if df.equals(newdf):
        # Original data is intact in the updated file: promote it.
        print(len(df2.columns))
        df2.to_csv(file,index=False)
        print("The DataFrames are equal.")

    else:
        print("The DataFrames are NOT equal.")



305
305
[]
Original df columns: 305
Updated df2 columns: 305
Non-shared columns dropped: []
Number of rows in df: 178
Number of rows in newdf: 178
305
The DataFrames are equal.
305
305
[]
Original df columns: 305
Updated df2 columns: 305
Non-shared columns dropped: []
Number of rows in df: 162
Number of rows in newdf: 162
305
The DataFrames are equal.
305
305
[]
Original df columns: 305
Updated df2 columns: 305
Non-shared columns dropped: []
Number of rows in df: 172
Number of rows in newdf: 172
305
The DataFrames are equal.
305
305
[]
Original df columns: 305
Updated df2 columns: 305
Non-shared columns dropped: []
Number of rows in df: 158
Number of rows in newdf: 158
305
The DataFrames are equal.
305
305
[]
Original df columns: 305
Updated df2 columns: 305
Non-shared columns dropped: []
Number of rows in df: 164
Number of rows in newdf: 164
305
The DataFrames are equal.
305
305
[]
Original df columns: 305
Updated df2 columns: 305
Non-shared columns dropped: []
Number of rows in df: 1