In [1]:
# %%writefile ../src/fetch_utils.py
import time
from nba_api.stats.endpoints import commonallplayers, commonplayerinfo, playercareerstats, leaguestandings
from requests.exceptions import RequestException
from json.decoder import JSONDecodeError

# Presumably request-pacing limits for the stats API (DELAY_BETWEEN_REQUESTS is
# likely in seconds — TODO confirm).
# NOTE(review): neither constant is referenced by the functions visible in this
# notebook; fetch_with_retry takes its own delay parameters. Confirm whether
# these are still needed.
MAX_REQUESTS_PER_MINUTE = 30
DELAY_BETWEEN_REQUESTS = 2

def fetch_with_retry(endpoint, max_retries=10, initial_delay=5, max_delay=60, timeout=60, debug=False, **kwargs):
    """Call an endpoint and return its first data frame, retrying on failure.

    Args:
        endpoint: Callable invoked as ``endpoint(**kwargs, timeout=timeout)``;
            the object it returns must expose ``get_data_frames()``.
        max_retries: Maximum number of attempts.
        initial_delay: Seconds to wait after the first failed attempt; the wait
            doubles after every further failure.
        max_delay: Upper bound, in seconds, for any single wait.
        timeout: Forwarded to the endpoint as its ``timeout`` keyword.
        debug: When true, progress and error messages are printed.
        **kwargs: Extra keyword arguments forwarded to the endpoint.

    Returns:
        The first element of ``get_data_frames()`` when that result is a list,
        otherwise the result itself. ``None`` once every attempt has failed
        with RequestException, JSONDecodeError or KeyError (or when
        ``max_retries`` allows no attempt at all).
    """
    for attempt_number in range(1, max_retries + 1):
        try:
            if debug:
                print(f"Fetching data using {endpoint.__name__} (Attempt {attempt_number}) with parameters: {kwargs}")
            frames = endpoint(**kwargs, timeout=timeout).get_data_frames()
            if isinstance(frames, list):
                return frames[0]
            return frames
        except (RequestException, JSONDecodeError, KeyError) as error:
            if debug:
                print(f"Error occurred during fetching {endpoint.__name__}: {error}")

            if attempt_number == max_retries:
                # Out of attempts: report (when asked) and give up.
                if debug:
                    print(f"Failed to fetch data from {endpoint.__name__} after {max_retries} attempts")
                return None

            # Exponential back-off, capped at max_delay.
            wait_seconds = min(initial_delay * (2 ** (attempt_number - 1)), max_delay)
            if debug:
                print(f"Retrying in {wait_seconds} seconds...")
            time.sleep(wait_seconds)

def fetch_all_players(season, debug=False):
    """Build a name -> ids lookup from the CommonAllPlayers endpoint.

    Args:
        season: Season string forwarded to the endpoint (e.g. "2022-23").
        debug: Forwarded to fetch_with_retry; also prints the player count.

    Returns:
        Dict keyed by the stripped, lower-cased DISPLAY_FIRST_LAST value. Each
        value is ``{'player_id': PERSON_ID, 'team_id': TEAM_ID}``. When two
        rows share a name the later row wins. Empty dict if the fetch failed.
    """
    roster_frame = fetch_with_retry(commonallplayers.CommonAllPlayers, season=season, debug=debug)

    if roster_frame is None:
        all_players = {}
    else:
        all_players = {
            row['DISPLAY_FIRST_LAST'].strip().lower(): {
                'player_id': row['PERSON_ID'],
                'team_id': row['TEAM_ID'],
            }
            for _, row in roster_frame.iterrows()
        }

    if debug:
        print(f"Fetched {len(all_players)} players for season {season}")
    return all_players

def fetch_player_info(player_id, debug=False):
    """Fetch the CommonPlayerInfo frame for one player.

    Returns whatever fetch_with_retry returns: the endpoint's first data
    frame, or None once all retries have failed.
    """
    endpoint = commonplayerinfo.CommonPlayerInfo
    return fetch_with_retry(endpoint, player_id=player_id, debug=debug)

def fetch_career_stats(player_id, debug=False):
    """Fetch the PlayerCareerStats frame for one player.

    Returns whatever fetch_with_retry returns: the endpoint's first data
    frame, or None once all retries have failed.
    """
    endpoint = playercareerstats.PlayerCareerStats
    return fetch_with_retry(endpoint, player_id=player_id, debug=debug)

def fetch_league_standings(season, debug=False):
    """Fetch the LeagueStandings frame for one season.

    Returns whatever fetch_with_retry returns: the endpoint's first data
    frame, or None once all retries have failed.
    """
    endpoint = leaguestandings.LeagueStandings
    return fetch_with_retry(endpoint, season=season, debug=debug)

if __name__ == "__main__":
    # Example usage: exercises each fetch helper once for a single season.
    debug = True
    season = "2022-23"

    # Fetch all players
    all_players = fetch_all_players(season, debug=debug)
    print(f"Total players fetched: {len(all_players)}")

    if all_players:
        # fetch_all_players returns {} when the request fails, so only pick a
        # sample once we know there is at least one entry.
        sample_player_id = next(iter(all_players.values()))['player_id']

        # Fetch player info for a sample player
        player_info = fetch_player_info(sample_player_id, debug=debug)
        print("Sample player info:")
        print(player_info)

        # Fetch career stats for the sample player
        career_stats = fetch_career_stats(sample_player_id, debug=debug)
        print("Sample player career stats:")
        print(career_stats)
    else:
        print("No players fetched; skipping the per-player examples.")

    # Fetch league standings
    standings = fetch_league_standings(season, debug=debug)
    print("League standings:")
    print(standings)

Fetching data using CommonAllPlayers (Attempt 1) with parameters: {'season': '2022-23'}
Fetched 4971 players for season 2022-23
Total players fetched: 4971
Fetching data using CommonPlayerInfo (Attempt 1) with parameters: {'player_id': 76001}
Sample player info:
   PERSON_ID FIRST_NAME  LAST_NAME DISPLAY_FIRST_LAST  \
0      76001       Alaa  Abdelnaby     Alaa Abdelnaby   

  DISPLAY_LAST_COMMA_FIRST DISPLAY_FI_LAST     PLAYER_SLUG  \
0          Abdelnaby, Alaa    A. Abdelnaby  alaa-abdelnaby   

             BIRTHDATE SCHOOL COUNTRY  ...              PLAYERCODE FROM_YEAR  \
0  1968-06-24T00:00:00   Duke     USA  ...  HISTADD_alaa_abdelnaby      1990   

  TO_YEAR  DLEAGUE_FLAG NBA_FLAG GAMES_PLAYED_FLAG DRAFT_YEAR DRAFT_ROUND  \
0    1994             N        Y                 Y       1990           1   

   DRAFT_NUMBER GREATEST_75_FLAG  
0            25                N  

[1 rows x 33 columns]
Fetching data using PlayerCareerStats (Attempt 1) with parameters: {'player_id': 76001}


In [2]:
# scrape_utils.py
import pandas as pd
import requests
from bs4 import BeautifulSoup

def scrape_salary_cap_history():
    """Scrape the salary-cap history table from RealGM into a DataFrame.

    The 'Season' column is reduced to its ``YYYY-YYYY`` part, 'Salary Cap' is
    converted to float, and every other column is converted to numeric with
    non-numeric cells becoming NaN.

    Returns:
        The cleaned DataFrame, or None when the table cannot be found or any
        error occurs (the error is printed, not raised).
    """
    url = "https://basketball.realgm.com/nba/info/salary_cap"

    def _strip_currency(series):
        # regex=False makes '$' a literal; as a regex it would match the
        # end-of-string anchor and leave the dollar sign in place.
        return series.str.replace('$', '', regex=False).str.replace(',', '', regex=False)

    try:
        # A timeout keeps a stalled connection from hanging the notebook; the
        # resulting exception is handled by the except block below.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table', class_='basketball compact')

        if not table:
            print("Could not find the salary cap table on the page.")
            return None

        headers = [th.text.strip() for th in table.find('thead').find_all('th')]
        data = []
        for row in table.find('tbody').find_all('tr'):
            cols = row.find_all('td')
            if cols:
                data.append([col.text.strip() for col in cols])

        df = pd.DataFrame(data, columns=headers)

        # Clean up the data. expand=False returns a Series for the single
        # capture group instead of a one-column DataFrame.
        df['Season'] = df['Season'].str.extract(r'(\d{4}-\d{4})', expand=False)
        df['Salary Cap'] = _strip_currency(df['Salary Cap']).astype(float)

        # Convert other columns to float, handling non-numeric values
        for col in df.columns:
            if col not in ['Season', 'Salary Cap']:
                df[col] = pd.to_numeric(_strip_currency(df[col]), errors='coerce')

        return df
    except Exception as e:
        print(f"Error scraping salary cap history: {str(e)}")
        return None

def scrape_player_salary_data(start_season, end_season, player_filter=None):
    """Scrape per-player salaries from HoopsHype for a range of seasons.

    Args:
        start_season: First season start year (int, inclusive).
        end_season: Last season start year (int, inclusive).
        player_filter: None or 'all' (case-insensitive) keeps every player;
            any other string keeps only the exactly matching name
            (case-insensitive).

    Returns:
        DataFrame with one row per kept player and season, holding 'Player',
        'Salary' (int) and 'Season' ("YYYY-YY"). Rows whose salary cell cannot
        be parsed as an integer are skipped with a message.
    """
    # Imported locally because this cell's import block does not import time;
    # the function would otherwise depend on another cell having done so.
    import time

    keep_everyone = player_filter is None or player_filter.lower() == 'all'
    wanted_name = None if keep_everyone else player_filter.lower()
    all_data = []

    for season in range(start_season, end_season + 1):
        season_str = f"{season}-{str(season+1)[-2:]}"
        url = f"https://hoopshype.com/salaries/players/{season}-{season+1}/"
        print(f"Scraping data for {season_str} from URL: {url}")
        # Timeout so a stalled connection cannot hang the whole scrape.
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', class_='hh-salaries-ranking-table')

        if table:
            # The first <tr> is skipped, as in the original (header row).
            for row in table.find_all('tr')[1:]:
                cols = row.find_all('td')
                if len(cols) < 3:
                    continue
                player = cols[1].get_text(strip=True)
                if not keep_everyone and player.lower() != wanted_name:
                    continue
                salary_text = cols[2].get_text(strip=True)
                try:
                    salary = int(salary_text.replace('$', '').replace(',', ''))
                except ValueError:
                    # One malformed cell should not abort every season.
                    print(f"Skipping {player} in {season_str}: unparseable salary {salary_text!r}")
                    continue
                all_data.append({'Player': player, 'Salary': salary, 'Season': season_str})
        else:
            print(f"No salary data found for season {season_str}")

        time.sleep(2)  # Delay between requests to avoid hitting rate limits

    df = pd.DataFrame(all_data)
    print(f"Scraped salary data for {'all players' if keep_everyone else player_filter} from seasons {start_season}-{end_season}:")
    print(df.head())
    return df


def scrape_team_salary_data(season):
    """Scrape team payroll totals from HoopsHype for one season.

    Args:
        season: Season string interpolated into the URL and stored in the
            'Season' column of every returned row.

    Returns:
        DataFrame with columns 'Team', 'Team_Salary' (int) and 'Season'. The
        frame is empty, but still has those columns, when the salary table is
        not found, so a later merge on ['Team', 'Season'] keeps working.
    """
    # NOTE(review): scrape_player_salary_data builds its URL as "YYYY-YYYY",
    # while update_data passes "YYYY-YY" here — confirm the site accepts this.
    url = f"https://hoopshype.com/salaries/{season}/"
    columns = ['Team', 'Team_Salary', 'Season']

    # Timeout so a stalled connection cannot hang the caller.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', class_='hh-salaries-ranking-table')
    if table is None:
        print(f"No team salary data found for season {season}")
        return pd.DataFrame(columns=columns)

    data = []
    # The first <tr> is skipped, as in the original (header row).
    for row in table.find_all('tr')[1:]:
        cols = row.find_all('td')
        if len(cols) < 3:
            # Rows without a team and salary cell cannot be parsed.
            continue
        team = cols[1].get_text(strip=True)
        salary_text = cols[2].get_text(strip=True)
        try:
            salary = int(salary_text.replace('$', '').replace(',', ''))
        except ValueError:
            print(f"Skipping {team} in {season}: unparseable salary {salary_text!r}")
            continue
        data.append({'Team': team, 'Team_Salary': salary, 'Season': season})
    return pd.DataFrame(data, columns=columns)

In [3]:
# player_utils.py
import pandas as pd
from io import StringIO
import requests
from bs4 import BeautifulSoup
# from fetch_utils import fetch_player_info, fetch_career_stats, fetch_league_standings

# Function to get the player URL from the search results
def get_player_url(player_name):
    """Resolve a player's Basketball Reference page URL via the site search.

    Args:
        player_name: Name to search for.

    Returns:
        Absolute URL of the first search result whose link contains 'players'.

    Raises:
        ValueError: If no such link is present in the search results.
    """
    # Passing the name through `params` lets requests URL-encode it, so names
    # containing characters such as '&' or '#' do not corrupt the query.
    response = requests.get(
        "https://www.basketball-reference.com/search/search.fcgi",
        params={'search': player_name},
        timeout=30,
    )
    soup = BeautifulSoup(response.content, 'html.parser')

    search_results = soup.find('div', {'class': 'search-results'})
    if search_results:
        for item in search_results.find_all('div', {'class': 'search-item'}):
            link = item.find('a')
            # .get() tolerates anchors that have no href attribute.
            href = link.get('href') if link else None
            if href and 'players' in href:
                return f"https://www.basketball-reference.com{href}"

    raise ValueError(f"Player URL not found for {player_name}")

# General function to scrape data from a specific section of the player's page
def scrape_player_data(player_name, section_id, season_end_year):
    """Scrape one season's row from a table on a player's page.

    Args:
        player_name: Name passed to get_player_url to locate the page.
        section_id: HTML ``id`` of the table to read.
        season_end_year: Text that must occur in the row's 'Season' cell; it
            is converted with ``str()`` so an int is accepted too.

    Returns:
        Dict mapping each column (except Season, Age, Tm, Lg, Pos, G, GS) of
        the first matching row to a float where conversion works, otherwise
        to the raw cell value.

    Raises:
        ValueError: If the table is missing, no row matches the season, or
            get_player_url cannot find the player.
    """
    skipped_columns = ['Season', 'Age', 'Tm', 'Lg', 'Pos', 'G', 'GS']

    player_url = get_player_url(player_name)
    response = requests.get(player_url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    table = soup.find('table', {'id': section_id})
    if table is None:
        raise ValueError(f"Section {section_id} not found for {player_name}")

    df = pd.read_html(StringIO(str(table)))[0]
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = df.columns.droplevel()  # Drop the multi-index level

    # astype(str) plus na=False keeps the mask purely boolean even when the
    # Season column holds NaN or non-string cells; regex=False matches the
    # season text literally.
    season_mask = df['Season'].astype(str).str.contains(str(season_end_year), na=False, regex=False)
    df = df[season_mask]
    if df.empty:
        raise ValueError(f"No data found for season {season_end_year}")

    row = df.iloc[0]
    data = {}
    for col in df.columns:
        if col in skipped_columns:
            continue
        try:
            data[col] = float(row[col])
        except (ValueError, TypeError):
            # Keep the raw value when it is not numeric (or is None).
            data[col] = row[col]

    return data

def process_player_data(player, season, all_players):
    """Assemble the stats dict for one player in one season.

    Args:
        player: Player name; looked up lower-cased in ``all_players``.
        season: Season string such as "2022-23"; the part before '-' is
            matched against SEASON_ID and used as the current season year.
        all_players: Mapping of lower-cased name to a dict holding
            'player_id' and 'team_id'.

    Returns:
        The dict built by calculate_player_stats, extended with 'Player' and
        'Season'. None when the player is unknown, the player info or career
        stats could not be fetched, or no row exists for the season.
    """
    lookup_key = player.lower()
    if lookup_key not in all_players:
        print(f"No player ID found for {player}")
        return None

    player_id = all_players[lookup_key]['player_id']
    team_id = all_players[lookup_key]['team_id']

    player_info = fetch_player_info(player_id)
    career_stats = fetch_career_stats(player_id)

    if player_info is None or player_info.empty or career_stats is None or career_stats.empty:
        print(f"Unable to fetch complete data for {player}")
        return None

    season_start = season.split('-')[0]
    season_stats = career_stats[career_stats['SEASON_ID'].str.contains(season_start, na=False)]
    if season_stats.empty:
        print(f"No stats found for {player} in season {season}")
        return None

    # NOTE(review): only the first matching row is used; if the endpoint
    # returns several rows for one season, the others are ignored — confirm
    # that is intended.
    latest_season_stats = season_stats.iloc[0]

    # The standings request and the Basketball Reference scrape are issued
    # only now, so players rejected above cost no extra network calls.
    league_standings = fetch_league_standings(season)
    advanced_metrics = scrape_advanced_metrics(player, season)

    try:
        draft_year = int(player_info['DRAFT_YEAR'].iloc[0])
    except (ValueError, TypeError):
        # Non-numeric or missing draft year: fall back to the first season.
        draft_year = int(player_info['FROM_YEAR'].iloc[0])

    current_season_year = int(season_start)
    years_of_service = max(0, current_season_year - draft_year)

    # Missing standings are passed as None. An empty pd.DataFrame() has no
    # 'TeamID' column, so the standings lookup would raise KeyError.
    if league_standings is None or league_standings.empty:
        league_standings = None

    player_stats = calculate_player_stats(
        latest_season_stats, player_info, years_of_service, team_id, league_standings, advanced_metrics
    )
    player_stats.update({'Player': player, 'Season': season})

    print(f"Processed data for {player} in season {season}: {player_stats}")
    return player_stats


def calculate_player_stats(stats, player_info, years_of_service, team_id, league_standings, advanced_metrics):
    """Build the flat per-player stats dict for one season.

    Args:
        stats: Mapping-like row (supports ``.get``) with the season's box-score
            fields such as FGM, FGA, FG3M and FG3A.
        player_info: DataFrame whose first row provides 'POSITION'.
        years_of_service: Value stored under 'Years of Service'.
        team_id: Team identifier stored under 'TeamID' and used for the
            standings lookup.
        league_standings: DataFrame with 'TeamID', 'WINS' and 'LOSSES', or
            None / an empty frame when standings are unavailable.
        advanced_metrics: Mapping of advanced metric name to value; None is
            treated as empty.

    Returns:
        Dict of basic stats, derived 2P / eFG% values and advanced metrics
        (missing ones are None). 'Wins' and 'Losses' are added only when the
        team is found in the standings.
    """
    # `or 0` turns a None value into 0 so the arithmetic below works.
    fg = stats.get('FGM', 0) or 0
    fga = stats.get('FGA', 0) or 0
    fg3 = stats.get('FG3M', 0) or 0
    fg3a = stats.get('FG3A', 0) or 0
    efg = (fg + 0.5 * fg3) / fga if fga != 0 else 0
    # Two-point figures are derived: all field goals minus three-pointers.
    fg2 = fg - fg3
    fg2a = fga - fg3a
    fg2_pct = (fg2 / fg2a) if fg2a != 0 else 0

    player_stats = {
        'Position': player_info.iloc[0]['POSITION'],
        'Age': stats.get('PLAYER_AGE', None),
        'Team': stats.get('TEAM_ABBREVIATION', None),
        'TeamID': team_id,
        'Years of Service': years_of_service,
        'GP': stats.get('GP', None),
        'GS': stats.get('GS', None),
        'MP': stats.get('MIN', None),
        'FG': fg,
        'FGA': fga,
        'FG%': stats.get('FG_PCT', None),
        '3P': fg3,
        '3PA': fg3a,
        '3P%': stats.get('FG3_PCT', None),
        '2P': fg2,
        '2PA': fg2a,
        '2P%': fg2_pct,
        'eFG%': efg,
        'FT': stats.get('FTM', None),
        'FTA': stats.get('FTA', None),
        'FT%': stats.get('FT_PCT', None),
        'ORB': stats.get('OREB', None),
        'DRB': stats.get('DREB', None),
        'TRB': stats.get('REB', None),
        'AST': stats.get('AST', None),
        'STL': stats.get('STL', None),
        'BLK': stats.get('BLK', None),
        'TOV': stats.get('TOV', None),
        'PF': stats.get('PF', None),
        'PTS': stats.get('PTS', None),
    }

    # Add advanced metrics
    advanced_metrics = advanced_metrics or {}
    advanced_metrics_to_add = [
        'PER', 'TS%', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%',
        'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP'
    ]
    for metric in advanced_metrics_to_add:
        player_stats[metric] = advanced_metrics.get(metric, None)

    # An empty or column-less standings frame is treated like "no standings"
    # rather than failing with KeyError on the 'TeamID' lookup.
    standings_usable = (
        league_standings is not None
        and team_id is not None
        and not league_standings.empty
        and 'TeamID' in league_standings.columns
    )
    if standings_usable:
        team_standings = league_standings[league_standings['TeamID'] == team_id]
        if not team_standings.empty:
            player_stats.update({
                'Wins': team_standings['WINS'].values[0],
                'Losses': team_standings['LOSSES'].values[0]
            })

    return player_stats


def scrape_advanced_metrics(player_name, season):
    """Scrape a player's advanced metrics for one season, best effort.

    Args:
        player_name: Name used to locate the player's page.
        season: Season string such as "2022-23"; the part before '-' must
            occur in the row's 'Season' cell.

    Returns:
        Dict of the advanced metrics present in the first matching row of the
        table with id 'advanced'. Empty dict when the player, table or season
        row is not found, or when any error occurs (the error is printed).
    """
    metrics = ['PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP']
    try:
        # Reuse the shared search helper instead of duplicating its logic;
        # "player not found" stays a silent empty result.
        try:
            player_url = get_player_url(player_name)
        except ValueError:
            return {}

        # Timeout so a stalled connection cannot hang the caller.
        response = requests.get(player_url, timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', {'id': 'advanced'})
        if table is None:
            return {}

        df = pd.read_html(StringIO(str(table)))[0]
        if isinstance(df.columns, pd.MultiIndex):
            df.columns = df.columns.droplevel()

        # astype(str) and na=False keep the mask boolean for non-string cells.
        season_mask = df['Season'].astype(str).str.contains(season.split('-')[0], na=False)
        df = df[season_mask]
        if df.empty:
            return {}

        row = df.iloc[0]
        return {col: row[col] for col in metrics if col in row.index}
    except Exception as e:
        print(f"Error scraping advanced metrics for {player_name}: {e}")
    return {}

In [4]:
import pandas as pd
import numpy as np
import cpi
from datetime import datetime
# from fetch_utils import fetch_all_players
# from scrape_utils import scrape_player_salary_data, scrape_team_salary_data, scrape_salary_cap_history
# from player_utils import process_player_data

def inflate_value(value, year_str, target_year=2022):
    """Convert a dollar amount from a season's start year to target-year dollars.

    Args:
        value: Amount to adjust.
        year_str: String whose first four characters are the source year
            (for example "2015-2016").
        target_year: Year whose dollars the result is expressed in. Defaults
            to 2022, the value that was previously hard-coded.

    Returns:
        The adjusted amount from ``cpi.inflate``. The original ``value`` is
        returned unchanged when the year is the current calendar year or
        later, when the year cannot be parsed, or when the adjustment fails.
    """
    try:
        year = int(year_str[:4])

        if year >= datetime.now().year:
            return value  # Return the original value for future years
        return cpi.inflate(value, year, to=target_year)
    except ValueError:
        print(f"Invalid year format: {year_str}")
        return value
    except cpi.errors.CPIObjectDoesNotExist:
        # If data for the specific year is not available, use the earliest
        # available year. This runs in its own try block because an exception
        # raised inside an except clause is not caught by sibling clauses.
        try:
            earliest_year = min(cpi.SURVEYS['CPI-U'].indexes['annual'].keys()).year
            return cpi.inflate(value, earliest_year, to=target_year)
        except Exception as e:
            print(f"Error inflating value for year {year_str}: {e}")
            return value
    except Exception as e:
        print(f"Error inflating value for year {year_str}: {e}")
        return value

def load_existing_data(file_path):
    """Read the processed player CSV and normalise it to the expected schema.

    Args:
        file_path: Path (or file-like object) handed to ``pd.read_csv``.

    Returns:
        DataFrame containing exactly the expected columns, in the order listed
        below. Columns missing from the file are added empty and unexpected
        ones are dropped. If the file does not exist, a message is printed and
        an empty DataFrame with the expected columns is returned.
    """
    expected_columns = [
        # identity, salary and headline stats
        'Season', 'Player', 'Salary', 'GP', 'PTS', 'TRB', 'AST', 'Injured', 'Injury_Periods',
        # player / team context and box-score stats
        'Position', 'Age', 'Team', 'TeamID', 'Years of Service', 'GS', 'MP', 'FG', 'FGA', 'FG%',
        '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'STL',
        'BLK', 'TOV', 'PF', 'Wins', 'Losses', 'Team_Salary',
        # salary-cap figures
        'Mid-Level Exception', 'Salary Cap', 'Luxury Tax', '1st Apron', '2nd Apron', 'BAE',
        'Standard /Non-Taxpayer', 'Taxpayer', 'Team Room /Under Cap', 'Salary_Cap_Inflated',
        # advanced metrics
        'PER', 'TS%', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%',
        'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP',
    ]

    try:
        loaded = pd.read_csv(file_path)

        # Add every expected column the file lacks as an empty object column.
        for name in expected_columns:
            if name in loaded.columns:
                continue
            loaded[name] = pd.Series(dtype='object')

        # Selecting by the expected list drops extras and fixes the order.
        return loaded[expected_columns]
    except FileNotFoundError:
        print(f"No existing data found at {file_path}. Starting with an empty DataFrame.")
        return pd.DataFrame(columns=expected_columns)


def merge_salary_cap_data(player_data, salary_cap_data):
    """Attach salary-cap figures to player rows by season start year.

    Args:
        player_data: DataFrame with a string 'Season' column whose first four
            characters are the season start year.
        salary_cap_data: DataFrame with 'Season' and 'Salary Cap' columns plus
            any other cap columns. Rows whose 'Season' is missing are ignored.

    Returns:
        A new DataFrame with one row per player row (left join). Cap columns
        already present in ``player_data`` keep their values and only have
        their missing cells filled from the cap data. Neither input is
        modified, and no helper columns ('Season_Year', '*_cap') remain.
    """
    cap_columns = ['Mid-Level Exception', 'Salary Cap', 'Luxury Tax', '1st Apron', '2nd Apron', 'BAE',
                   'Standard /Non-Taxpayer', 'Taxpayer', 'Team Room /Under Cap', 'Salary_Cap_Inflated']

    # Work on copies so the caller's frames do not silently gain columns.
    players = player_data.copy()
    # A missing Season cannot be turned into a join key, so drop those rows.
    caps = salary_cap_data.dropna(subset=['Season']).copy()

    players['Season_Year'] = players['Season'].str[:4].astype(int)
    caps['Season_Year'] = caps['Season'].str[:4].astype(int)

    # Add inflation-adjusted salary cap
    caps['Salary_Cap_Inflated'] = [
        inflate_value(cap, season_label)
        for cap, season_label in zip(caps['Salary Cap'], caps['Season'])
    ]

    # Columns present on both sides come back from the merge with a '_cap'
    # suffix on the cap side; remember them so all of them can be cleaned up.
    shared_columns = [col for col in caps.columns if col in players.columns and col != 'Season_Year']

    merged_data = pd.merge(players, caps, on='Season_Year', how='left', suffixes=('', '_cap'))

    for col in shared_columns:
        suffixed = f'{col}_cap'
        if col in cap_columns:
            # Keep existing player-side values; fill only the gaps.
            merged_data[col] = merged_data[col].fillna(merged_data[suffixed])
        # Dropped for every shared column, including 'Season', which would
        # otherwise remain in the output as 'Season_cap'.
        merged_data = merged_data.drop(columns=[suffixed])

    # Clean up temporary columns
    return merged_data.drop(columns=['Season_Year'])

def load_injury_data(file_path='../data/processed/NBA Player Injury Stats(1951 - 2023).csv'):
    """Load the injury CSV and label every record with its season.

    Args:
        file_path: Path (or file-like object) handed to ``pd.read_csv``.

    Returns:
        DataFrame with 'Date' parsed as datetimes and an added 'Season' column
        formatted "YYYY-YY", or None when the file does not exist.
    """
    def _season_label(date):
        # Dates from October onward count towards the season starting that
        # year; earlier months belong to the season that started the year
        # before.
        if date.month >= 10:
            return f"{date.year}-{str(date.year+1)[-2:]}"
        return f"{date.year-1}-{str(date.year)[-2:]}"

    try:
        injury_data = pd.read_csv(file_path)
        injury_data['Date'] = pd.to_datetime(injury_data['Date'])
        injury_data['Season'] = injury_data['Date'].apply(_season_label)
        print("Injury data loaded successfully")
        return injury_data
    except FileNotFoundError:
        print("Injury data file not found. Proceeding without injury data.")
        return None

def merge_injury_data(merged_data, injury_data, season):
    """Add 'Injured' and 'Injury_Periods' columns for one season.

    Args:
        merged_data: DataFrame with 'Player' and 'Season' columns.
        injury_data: DataFrame with 'Season', 'Date' (datetimes) and
            'Relinquished' columns, or None to skip the merge.
        season: Season label used to select injury rows and to tag results.

    Returns:
        ``merged_data`` unchanged when ``injury_data`` is None. Otherwise
        ``merged_data`` left-joined on ['Player', 'Season'] with a per-player
        summary: 'Injured' is True when any injury row mentions the player,
        and 'Injury_Periods' joins "start - end" pairs with '; '.
    """
    if injury_data is None:
        return merged_data

    season_injury_data = injury_data[injury_data['Season'] == season]
    relinquished = season_injury_data['Relinquished']

    summaries = []
    for player in merged_data['Player'].unique():
        # regex=False: the name is matched as literal text, so characters such
        # as '(' or '.' in a name can neither raise nor act as wildcards.
        mentions = relinquished.str.contains(player, case=False, na=False, regex=False)
        dates = [stamp.strftime('%Y-%m-%d') for stamp in season_injury_data[mentions]['Date']]

        # Consecutive rows are paired as (start, end); an unpaired last row is
        # reported as ongoing.
        # NOTE(review): every row here comes from the 'Relinquished' column, so
        # the "end" may be another injury start rather than a return — confirm
        # against the dataset.
        periods = []
        for start in range(0, len(dates), 2):
            end_label = dates[start + 1] if start + 1 < len(dates) else 'ongoing'
            periods.append(f"{dates[start]} - {end_label}")

        summaries.append({
            'Player': player,
            'Season': season,
            'Injured': bool(dates),
            'Injury_Periods': '; '.join(periods),
        })

    # Build the summary frame once instead of updating it row by row.
    all_players_df = pd.DataFrame(summaries, columns=['Player', 'Season', 'Injured', 'Injury_Periods'])
    return pd.merge(merged_data, all_players_df, on=['Player', 'Season'], how='left')

def calculate_percentages(df):
    """Recompute the shooting percentage columns from makes and attempts.

    Writes 'FG%', '3P%', '2P%', 'FT%' and 'eFG%' into ``df`` in place and
    returns the same object. A zero attempt count yields NaN rather than a
    division by zero.
    """
    ratio_columns = (
        ('FG%', 'FG', 'FGA'),
        ('3P%', '3P', '3PA'),
        ('2P%', '2P', '2PA'),
        ('FT%', 'FT', 'FTA'),
    )
    for target, made, attempted in ratio_columns:
        df[target] = df[made] / df[attempted].replace(0, np.nan)

    # Effective field-goal percentage gives made threes an extra half weight.
    weighted_makes = df['FG'] + 0.5 * df['3P']
    df['eFG%'] = weighted_makes / df['FGA'].replace(0, np.nan)
    return df


def update_data(existing_data, start_year, end_year, player_filter=None, min_avg_minutes=None):
    """Refresh the player dataset for every season in a range of start years.

    For each season, existing rows for that season (or only the filtered
    player's rows) are removed and replaced with freshly fetched stats, merged
    with salary, team payroll and injury data.

    Args:
        existing_data: DataFrame of previously saved rows; it is not modified.
        start_year: First season start year (inclusive).
        end_year: Last season start year (inclusive).
        player_filter: None or 'all' processes every player with salary data;
            any other string restricts processing to that player name
            (case-insensitive).
        min_avg_minutes: When given, players averaging fewer minutes per game
            (MP / GP) in the season are dropped.

    Returns:
        DataFrame sorted by season and player, restricted to the columns of
        load_existing_data, with all-empty columns removed.
    """
    all_data = existing_data.copy()
    injury_data = load_injury_data()

    salary_data = scrape_player_salary_data(start_year, end_year)
    print(f"Fetched salary data with shape: {salary_data.shape}")

    # Lower-cased name when a single player was requested, otherwise None.
    single_player = None
    if player_filter and player_filter.lower() != 'all':
        single_player = player_filter.lower()

    for year in range(start_year, end_year + 1):
        season = f"{year}-{str(year+1)[-2:]}"

        # Remove existing data for the specified season and player filter
        if single_player is not None:
            is_replaced = (all_data['Season'] == season) & (all_data['Player'].str.lower() == single_player)
            all_data = all_data[~is_replaced]
        else:
            all_data = all_data[all_data['Season'] != season]

        print(f"Processing data for season {season}")

        team_salary_data = scrape_team_salary_data(season)
        all_players = fetch_all_players(season=season)

        season_salary_data = salary_data[salary_data['Season'] == season]
        if single_player is not None:
            season_salary_data = season_salary_data[season_salary_data['Player'].str.lower() == single_player]

        additional_stats = []
        for _, salary_row in season_salary_data.iterrows():
            player_name = salary_row['Player']
            # Only players that also appear in the stats API lookup are used.
            if player_name.lower() not in all_players:
                continue
            player_stats = process_player_data(player_name, season, all_players)
            if player_stats:
                player_stats['Salary'] = salary_row['Salary']
                additional_stats.append(player_stats)

        additional_stats_df = pd.DataFrame(additional_stats)

        if additional_stats_df.empty or 'Team' not in additional_stats_df.columns:
            print(f"Warning: No valid player stats data for season {season}")
            continue

        # Merge team salary data
        merged_data = pd.merge(additional_stats_df, team_salary_data, on=['Team', 'Season'], how='left')

        if injury_data is not None:
            merged_data = merge_injury_data(merged_data, injury_data, season)

        # Apply minimum average minutes filter if specified. 'MP' holds the
        # season's total minutes (the sample run shows MP 1941.0 over 56 GP),
        # so it is divided by games played before being compared with the
        # per-game threshold. Players with no games are dropped.
        if min_avg_minutes is not None:
            games_played = pd.to_numeric(merged_data['GP'], errors='coerce').replace(0, np.nan)
            total_minutes = pd.to_numeric(merged_data['MP'], errors='coerce')
            avg_minutes = total_minutes / games_played
            # .copy() so the percentage columns are written to an independent
            # frame, not to a slice of merged_data.
            merged_data = merged_data[avg_minutes >= min_avg_minutes].copy()

        # Calculate percentages
        merged_data = calculate_percentages(merged_data)

        all_data = pd.concat([all_data, merged_data], ignore_index=True)

    # Sort the final data by season and player
    all_data = all_data.sort_values(by=['Season', 'Player'])

    # Ensure only expected columns are present and remove any empty columns.
    # An empty path makes load_existing_data return its empty template frame,
    # which is used here only for its column list. reindex tolerates expected
    # columns missing from all_data; they are all-NaN and dropped just below.
    expected_columns = load_existing_data(file_path='').columns.tolist()
    all_data = all_data.reindex(columns=expected_columns)
    all_data = all_data.dropna(axis=1, how='all')

    print(f"Updated data with shape: {all_data.shape}")
    return all_data

def main():
    """Interactive entry point: refresh the player dataset and save it.

    Prompts for a player name (or 'all'), updates the data for the hard-coded
    season range, scrapes and saves the salary-cap history, merges it into the
    player data and writes the result to the processed CSV. Errors raised
    during the update are printed with a traceback instead of propagating.
    """
    start_year = 2022
    end_year = 2023
    processed_file_path = '../data/processed/nba_player_data_final_inflated.csv'
    salary_cap_file_path = '../data/processed/salary_cap_history_inflated.csv'

    player_filter = input("Enter player name or 'all' for all players: ").strip()
    min_avg_minutes = None
    if player_filter.lower() == 'all':
        # Empty input falls back to 25. Non-numeric input raises ValueError
        # here, outside the try block below.
        min_avg_minutes = float(input("Enter the minimum average minutes per game (default 25 mins): ") or 25)

    existing_data = load_existing_data(processed_file_path)
    print(f"Loaded existing data with shape: {existing_data.shape}")

    try:
        print(f"Updating data for years {start_year} to {end_year}")
        updated_data = update_data(existing_data, start_year, end_year, player_filter, min_avg_minutes)

        if not updated_data.empty:
            print("New data retrieved. Processing and saving...")

            print("Fetching salary cap data...")
            salary_cap_data = scrape_salary_cap_history()

            if salary_cap_data is not None:
                print("Salary cap data successfully retrieved.")
                
                # Add inflation-adjusted salary cap to salary cap data
                salary_cap_data['Salary_Cap_Inflated'] = salary_cap_data.apply(
                    lambda row: inflate_value(row['Salary Cap'], row['Season']),
                    axis=1
                )
                
                salary_cap_data.to_csv(salary_cap_file_path, index=False)
                print(f"Salary cap data saved to {salary_cap_file_path}")

                print("Merging salary cap data with player data...")
                updated_data = merge_salary_cap_data(updated_data, salary_cap_data)

                print("Final data shape:", updated_data.shape)
                print("Final data columns:", updated_data.columns)
            else:
                print("Warning: Failed to retrieve salary cap data. Skipping merge.")

            # Ensure all expected columns are present before saving
            # (the column list is taken from a fresh load of the processed file)
            expected_columns = load_existing_data(processed_file_path).columns.tolist()
            for column in expected_columns:
                if column not in updated_data.columns:
                    updated_data[column] = pd.NA

            # Remove any remaining empty columns
            # (this also removes columns that were just added above as all-NA)
            updated_data = updated_data.dropna(axis=1, how='all')

            # Save the updated data
            updated_data.to_csv(processed_file_path, index=False)
            print(f"Updated and cleaned data saved to {processed_file_path}")

            # Print summary of the data
            summary_columns = ['Season', 'Player', 'Salary', 'GP', 'PTS', 'TRB', 'AST', 'PER', 'WS', 'VORP', 'Injured', 'Injury_Periods', 'FG%', '3P%', 'FT%', 'Team_Salary', 'Salary Cap', 'Salary_Cap_Inflated']
            # Only columns that survived the dropna above can be displayed.
            available_columns = [col for col in summary_columns if col in updated_data.columns]
            print("\nData summary")
            print(updated_data[available_columns].head().to_string(index=False))
        else:
            print("No new data to save. The dataset is empty.")


    except Exception as e:
        # Report the failure without re-raising, so the cell finishes normally.
        print(f"An error occurred: {str(e)}")
        print("Traceback:")
        import traceback
        traceback.print_exc()

# Run the interactive update when this cell is executed as a script.
if __name__ == "__main__":
    main()


Loaded existing data with shape: (480, 66)
Updating data for years 2022 to 2023
Injury data loaded successfully
Scraping data for 2022-23 from URL: https://hoopshype.com/salaries/players/2022-2023/
Scraping data for 2023-24 from URL: https://hoopshype.com/salaries/players/2023-2024/
Scraped salary data for all players from seasons 2022-2023:
              Player    Salary   Season
0      Stephen Curry  48070014  2022-23
1          John Wall  47345760  2022-23
2  Russell Westbrook  47080179  2022-23
3       LeBron James  44474988  2022-23
4       Kevin Durant  44119845  2022-23
Fetched salary data with shape: (1186, 3)
Processing data for season 2022-23
Processed data for Stephen Curry in season 2022-23: {'Position': 'Guard', 'Age': 35.0, 'Team': 'GSW', 'TeamID': 1610612744, 'Years of Service': 13, 'GP': 56, 'GS': 56, 'MP': 1941.0, 'FG': 559, 'FGA': 1133, 'FG%': 0.493, '3P': 273, '3PA': 639, '3P%': 0.427, '2P': 286, '2PA': 494, '2P%': 0.5789473684210527, 'eFG%': 0.6138570167696381, 'FT'