In [71]:
# %%writefile ../src/utils/fetch_utils.py

import time
import pandas as pd
from nba_api.stats.endpoints import commonallplayers, commonplayerinfo, leaguedashplayerstats, leaguestandings, playercareerstats
from requests.exceptions import RequestException
from json.decoder import JSONDecodeError

# Define constants for rate limiting
MAX_REQUESTS_PER_MINUTE = 30
DELAY_BETWEEN_REQUESTS = 2  # in seconds

def fetch_with_retry(endpoint, max_retries=3, delay=5, timeout=60, **kwargs):
    """
    Call an NBA API endpoint class and return its data frames, retrying on failure.

    ``endpoint`` is instantiated with ``kwargs`` plus ``timeout`` and the
    result of its ``get_data_frames()`` is returned. A ``RequestException``,
    ``JSONDecodeError`` or ``KeyError`` triggers another attempt after
    ``delay`` seconds; once ``max_retries`` attempts have failed the
    function returns ``None``.
    """
    final_attempt = max_retries - 1
    for attempt in range(max_retries):
        try:
            print(f"Attempting to fetch data using {endpoint.__name__} (Attempt {attempt + 1}) with parameters {kwargs}")
            frames = endpoint(**kwargs, timeout=timeout).get_data_frames()
        except (RequestException, JSONDecodeError, KeyError) as e:
            print(f"Error occurred: {e}")
            if attempt != final_attempt:
                print(f"Attempt {attempt + 1} failed. Retrying in {delay} seconds...")
                time.sleep(delay)
                continue
            print(f"Failed to fetch data after {max_retries} attempts: {str(e)}")
            return None
        else:
            return frames
    # Only reached when max_retries allows no attempt at all.
    return None

def fetch_all_players(season, player_filter=None):
    """
    Fetch player IDs, team IDs, and years of service for a given season.

    Args:
        season: Season string passed to the CommonAllPlayers endpoint,
            e.g. "2023-24".
        player_filter: Optional case-insensitive name fragment. When given,
            only players whose display name contains it are looked up.
            ``None``, an empty string, or ``'all'`` (the "no filter" keyword
            used by the rest of this pipeline) selects every player.

    Returns:
        dict mapping the lower-cased display name to a dict with
        'player_id', 'team_id' and 'years_of_service'. Empty when the
        roster request fails.
    """
    print(f"Fetching all players for season: {season} with filter: {player_filter}")
    all_players_data = fetch_with_retry(commonallplayers.CommonAllPlayers, season=season, is_only_current_season=1)
    all_players = {}
    if not all_players_data:
        return all_players

    roster = all_players_data[0]
    print(f"Fetched {len(roster)} players")

    # Normalise the filter once. 'all' must not be treated as a substring,
    # otherwise only names containing "all" (e.g. "... Ball") would be kept.
    needle = player_filter.lower() if player_filter else None
    if needle == 'all':
        needle = None

    for _, row in roster.iterrows():
        player_name = row['DISPLAY_FIRST_LAST'].strip().lower()
        if needle is not None and needle not in player_name:
            continue

        player_id = row['PERSON_ID']
        team_id = row['TEAM_ID']
        print(f"Processing player: {player_name}, ID: {player_id}, Team ID: {team_id}")

        # Fetch player info to get years of service
        player_info = fetch_with_retry(commonplayerinfo.CommonPlayerInfo, player_id=player_id)
        # One request is issued per player, so pace them with the module's
        # rate-limiting constant instead of firing them back to back.
        time.sleep(DELAY_BETWEEN_REQUESTS)

        years_of_service = 0
        # Guard against an empty info frame, where .iloc[0] would raise IndexError.
        if player_info and not player_info[0].empty:
            years_of_service = player_info[0]['SEASON_EXP'].iloc[0]
            print(f"Years of service for player {player_name}: {years_of_service}")

        all_players[player_name] = {
            'player_id': player_id,
            'team_id': team_id,
            'years_of_service': years_of_service
        }

    return all_players

def fetch_league_standings(season):
    """
    Return the first LeagueStandings data frame for ``season``.

    Falls back to an empty DataFrame when the fetch returns ``None`` or an
    empty list.
    """
    frames = fetch_with_retry(leaguestandings.LeagueStandings, season=season)
    return frames[0] if frames else pd.DataFrame()

def fetch_player_stats(season):
    """
    Return the first LeagueDashPlayerStats data frame for ``season``
    (regular season, per-game mode), or ``None`` when nothing was fetched.
    """
    frames = fetch_with_retry(
        leaguedashplayerstats.LeagueDashPlayerStats,
        season=season,
        season_type_all_star='Regular Season',
        per_mode_detailed='PerGame',
    )
    if not frames:
        return None
    return frames[0]



In [72]:
# Sample script to fetch and display data from NBA API endpoints

from nba_api.stats.endpoints import commonallplayers, commonplayerinfo, leaguedashplayerstats, leaguestandings, playercareerstats
import pandas as pd

def fetch_with_retry(endpoint, max_retries=3, delay=5, timeout=60, **kwargs):
    """
    Instantiate ``endpoint`` with ``kwargs`` and ``timeout`` and return its data frames.

    Up to ``max_retries`` attempts are made. ``RequestException``,
    ``JSONDecodeError`` and ``KeyError`` are reported and retried after
    ``delay`` seconds; ``None`` is returned when the last attempt fails.
    """
    attempt = 0
    while attempt < max_retries:
        try:
            print(f"Attempting to fetch data using {endpoint.__name__} (Attempt {attempt + 1}) with parameters {kwargs}")
            return endpoint(**kwargs, timeout=timeout).get_data_frames()
        except (RequestException, JSONDecodeError, KeyError) as e:
            print(f"Error occurred: {e}")
            out_of_retries = attempt == max_retries - 1
            if out_of_retries:
                print(f"Failed to fetch data after {max_retries} attempts: {str(e)}")
                return None
            print(f"Attempt {attempt + 1} failed. Retrying in {delay} seconds...")
            time.sleep(delay)
        attempt += 1
    return None

# Demo: hit each endpoint once and print the head of its first data frame.
def _show_sample(title, frames):
    """Print ``title`` and the first frame's head when ``frames`` is truthy."""
    if frames:
        print(title)
        print(frames[0].head())


season = '2022-23'
player_id = 1628977  # Example player ID (Hamidou Diallo)

# All players for the season
all_players_data = fetch_with_retry(commonallplayers.CommonAllPlayers, season=season, is_only_current_season=1)
_show_sample("Sample data from CommonAllPlayers:", all_players_data)

# Info for the example player
player_info = fetch_with_retry(commonplayerinfo.CommonPlayerInfo, player_id=player_id)
_show_sample("Sample data from CommonPlayerInfo:", player_info)

# League-wide per-game player statistics for the season
player_stats = fetch_with_retry(leaguedashplayerstats.LeagueDashPlayerStats, season=season, season_type_all_star='Regular Season', per_mode_detailed='PerGame')
_show_sample("Sample data from LeagueDashPlayerStats:", player_stats)

# League standings for the season
league_standings = fetch_with_retry(leaguestandings.LeagueStandings, season=season)
_show_sample("Sample data from LeagueStandings:", league_standings)

# Career stats for the example player
career_stats = fetch_with_retry(playercareerstats.PlayerCareerStats, player_id=player_id)
_show_sample("Sample data from PlayerCareerStats:", career_stats)


Attempting to fetch data using CommonAllPlayers (Attempt 1) with parameters {'season': '2022-23', 'is_only_current_season': 1}
Sample data from CommonAllPlayers:
   PERSON_ID DISPLAY_LAST_COMMA_FIRST DISPLAY_FIRST_LAST  ROSTERSTATUS  \
0     203500            Adams, Steven       Steven Adams             1   
1     201571           Augustin, D.J.      D.J. Augustin             1   
2     203115             Barton, Will        Will Barton             1   
3     203920              Birch, Khem         Khem Birch             1   
4    1630195         Bolmaro, Leandro    Leandro Bolmaro             0   

  FROM_YEAR TO_YEAR       PLAYERCODE      PLAYER_SLUG     TEAM_ID  \
0      2013    2024     steven_adams     steven_adams  1610612763   
1      2008    2022    d.j._augustin      dj_augustin  1610612745   
2      2012    2022      will_barton      will_barton  1610612761   
3      2017    2022       khem_birch       khem_birch  1610612759   
4      2021    2022  leandro_bolmaro  leandro_bo

In [73]:
# %%writefile ../src/utils/scrape_utils.py

import requests
from bs4 import BeautifulSoup

def filter_players_by_avg_minutes(player_stats, min_avg_minutes):
    """
    Return the rows of ``player_stats`` whose 'MIN' value is at least
    ``min_avg_minutes``.
    """
    plays_enough = player_stats['MIN'] >= min_avg_minutes
    return player_stats[plays_enough]

def get_player_url(player_name):
    """
    Resolve a player's Basketball-Reference page URL through the site search.

    Args:
        player_name: Player name to search for.

    Returns:
        Absolute URL of the player page: either the page the search
        redirected to, or the first search hit whose link contains 'players'.

    Raises:
        ValueError: If no player link can be found.
        requests.RequestException: If the HTTP request fails or times out.
    """
    base_url = "https://www.basketball-reference.com"
    response = requests.get(
        f"{base_url}/search/search.fcgi",
        # Let requests build the query string so apostrophes, ampersands and
        # accented characters in names are encoded instead of breaking the URL.
        params={'search': player_name},
        # Without a timeout a stalled connection would block the whole run.
        timeout=30,
    )

    # NOTE(review): the site appears to redirect straight to the player page
    # when the search has a single exact match, in which case there is no
    # results list to parse -- confirm against live behaviour.
    if '/players/' in response.url:
        return response.url

    soup = BeautifulSoup(response.content, 'html.parser')
    search_results = soup.find('div', {'class': 'search-results'})
    if search_results:
        for item in search_results.find_all('div', {'class': 'search-item'}):
            link = item.find('a')
            # .get avoids a KeyError on anchors that carry no href attribute.
            href = link.get('href') if link else None
            if href and 'players' in href:
                return f"{base_url}{href}"

    raise ValueError(f"Player URL not found for {player_name}")

def scrape_advanced_metrics(player_name, season):
    """
    Scrape one season of advanced metrics for a player from Basketball-Reference.

    Args:
        player_name: Name used to look up the player's page.
        season: Season string such as "2023-24"; the part before the dash is
            matched against the table's 'Season' column.

    Returns:
        dict mapping metric name (e.g. 'PER', 'TS%') to a float, or to the raw
        cell value when it cannot be converted. Empty when the page, the
        table or the season row is missing, or when any error occurs.
    """
    # Imported locally because this module's import block provides neither
    # name. The resulting NameError used to be swallowed by the broad handler
    # below, so every call silently returned {}.
    from io import StringIO
    import pandas as pd

    important_columns = ['PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP']
    data = {}
    try:
        player_url = get_player_url(player_name)
        # A timeout keeps one stalled request from hanging the whole run.
        response = requests.get(player_url, timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', {'id': 'advanced'})
        if table:
            df = pd.read_html(StringIO(str(table)))[0]
            if isinstance(df.columns, pd.MultiIndex):
                df.columns = df.columns.droplevel()
            # na=False: rows without a Season value would otherwise put NaN
            # into the mask and make the boolean indexing raise.
            df = df[df['Season'].str.contains(season.split('-')[0], na=False)]

            if not df.empty:
                row = df.iloc[0]
                for col in important_columns:
                    if col in row.index:
                        try:
                            data[col] = float(row[col])
                        except (TypeError, ValueError):
                            # Keep the raw cell rather than losing every
                            # metric because one cell is not numeric.
                            data[col] = row[col]
            else:
                print(f"No advanced metrics found for {player_name} in season {season}")
        else:
            print(f"Advanced metrics table not found for {player_name}")
    except Exception as e:
        # Best-effort scrape: report the problem and return no metrics.
        print(f"Error scraping advanced metrics for {player_name}: {e}")
        data = {}

    return data


In [74]:
# %%writefile ../src/processing/load_data.py


import pandas as pd

def load_salary_data(file_path):
    """
    Read the salary CSV at ``file_path`` and shorten its 'Season' values.

    Each season becomes its first four characters, a dash, and its last two
    characters (so "2023-2024" turns into "2023-24"). Any error while loading
    is printed and an empty DataFrame is returned.
    """
    def _short_season(value):
        text = str(value)
        return f"{text[:4]}-{text[-2:]}"

    try:
        salaries = pd.read_csv(file_path)
        # Convert season format from "2023-2024" to "2023-24"
        salaries['Season'] = salaries['Season'].map(_short_season)
        print("Loaded salary data:")
        print(salaries.head())
    except Exception as e:
        print(f"Error loading salary data: {e}")
        return pd.DataFrame()
    return salaries



In [75]:
# %%writefile ../src/processing/add_nba_stats.py

import time
import pandas as pd
from nba_api.stats.endpoints import commonplayerinfo, playercareerstats, leaguestandings
# from src.utils.fetch_utils import fetch_with_retry, fetch_player_stats, fetch_league_standings
# from src.utils.scrape_utils import filter_players_by_avg_minutes, scrape_advanced_metrics

MAX_REQUESTS_PER_MINUTE = 30
DELAY_BETWEEN_REQUESTS = 2  # in seconds

def _pause_after_request(request_count):
    """Count one API request, sleep the per-request delay, and enforce the per-minute cap."""
    request_count += 1
    time.sleep(DELAY_BETWEEN_REQUESTS)  # Delay between requests
    if request_count >= MAX_REQUESTS_PER_MINUTE:
        print("Rate limit reached. Waiting for a minute before continuing...")
        time.sleep(60)  # Wait for a minute
        request_count = 0
    return request_count


def _years_since_draft(draft_year, season_year):
    """Return the non-negative gap between season and draft year; 0 when the draft year is not numeric."""
    try:
        return max(0, season_year - int(draft_year))
    except (TypeError, ValueError):
        # e.g. 'Undrafted' or a missing value
        return 0


def _build_player_row(player, season, team_id, info_row, season_row, advanced_metrics):
    """Assemble one player's stats dict from the player-info row and the season's career-stats row."""
    fg = season_row.get('FGM', 0) or 0
    fga = season_row.get('FGA', 0) or 0
    fg3 = season_row.get('FG3M', 0) or 0
    fg3a = season_row.get('FG3A', 0) or 0
    # Two-point numbers are derived from the totals minus the three-point numbers.
    fg2 = fg - fg3
    fg2a = fga - fg3a

    row = {
        'Player': player,
        'Position': info_row['POSITION'],
        'Age': season_row.get('PLAYER_AGE', None),
        'Team': season_row.get('TEAM_ABBREVIATION', None),
        'TeamID': team_id,
        'Years of Service': _years_since_draft(info_row['DRAFT_YEAR'], int(season.split('-')[0])),
        'GP': season_row.get('GP', None),
        'GS': season_row.get('GS', None),
        'MP': season_row.get('MIN', None),
        'FG': fg,
        'FGA': fga,
        'FG%': season_row.get('FG_PCT', None),
        '3P': fg3,
        '3PA': fg3a,
        '3P%': season_row.get('FG3_PCT', None),
        '2P': fg2,
        '2PA': fg2a,
        '2P%': (fg2 / fg2a) if fg2a != 0 else 0,
        'eFG%': (fg + 0.5 * fg3) / fga if fga != 0 else 0,
        'FT': season_row.get('FTM', None),
        'FTA': season_row.get('FTA', None),
        'FT%': season_row.get('FT_PCT', None),
        'ORB': season_row.get('OREB', None),
        'DRB': season_row.get('DREB', None),
        'TRB': season_row.get('REB', None),
        'AST': season_row.get('AST', None),
        'STL': season_row.get('STL', None),
        'BLK': season_row.get('BLK', None),
        'TOV': season_row.get('TOV', None),
        'PF': season_row.get('PF', None),
        'PTS': season_row.get('PTS', None),
        'Season': season  # Add the season to the player stats
    }
    # Add advanced metrics
    row.update(advanced_metrics)
    return row


def add_nba_stats(salary_data, all_players, player_filter=None, min_avg_minutes=None):
    """
    Add NBA statistics to the salary data for the filtered players.

    Args:
        salary_data: DataFrame with at least 'Player' and 'Season' columns.
        all_players: dict keyed by lower-cased player name whose values hold
            'player_id' and 'team_id'.
        player_filter: 'all', a case-insensitive name fragment, or None for
            no name filtering. Fragments are matched as substrings, the same
            way main() and fetch_all_players() select players.
        min_avg_minutes: Only used with player_filter == 'all'. When set,
            players are kept only if their 'MIN' in the league-wide per-game
            stats is at least this value.

    Returns:
        salary_data left-merged with the collected stats on
        ['Player', 'Season'], or salary_data unchanged when no stats were
        collected.

    Raises:
        KeyError: If 'Player' or 'Season' is missing from salary_data.
    """
    if 'Player' not in salary_data.columns or 'Season' not in salary_data.columns:
        raise KeyError("'Player' or 'Season' column missing from salary data")

    additional_stats = []
    request_count = 0  # Track number of requests made

    filter_all = player_filter == 'all'
    # Name fragment to match, or None when every name is acceptable.
    needle = None if (filter_all or player_filter is None) else player_filter.lower()

    # Lower-cased names passing the minutes threshold; None means no restriction.
    allowed_names = None
    if filter_all and min_avg_minutes is not None and not salary_data.empty:
        season = salary_data['Season'].iloc[0]
        player_stats = fetch_player_stats(season)
        if player_stats is not None and not player_stats.empty:
            eligible = filter_players_by_avg_minutes(player_stats, min_avg_minutes)
            # Built once as a set so the per-player check is O(1).
            allowed_names = {name.lower() for name in eligible['PLAYER_NAME']}
        else:
            allowed_names = set()

    # Standings depend only on the season, so fetch each season once rather
    # than once per player. Failed fetches are not cached.
    standings_by_season = {}

    # Iterate rows so each player is paired with the season on its own row.
    for player, season in zip(salary_data['Player'], salary_data['Season']):
        key = player.lower()
        if needle is not None and needle not in key:
            continue
        if allowed_names is not None and key not in allowed_names:
            continue
        if key not in all_players:
            print(f"No player ID found for {player}")
            continue

        print(f"Fetching data for {player}...")
        player_id = all_players[key]['player_id']
        team_id = all_players[key]['team_id']

        # Fetch player info
        player_info = fetch_with_retry(commonplayerinfo.CommonPlayerInfo, player_id=player_id)
        request_count = _pause_after_request(request_count)

        # Fetch career stats
        career_stats = fetch_with_retry(playercareerstats.PlayerCareerStats, player_id=player_id)
        request_count = _pause_after_request(request_count)

        league_standings = standings_by_season.get(season)
        if league_standings is None:
            league_standings = fetch_league_standings(season)
            request_count = _pause_after_request(request_count)
            if league_standings is not None and not league_standings.empty:
                standings_by_season[season] = league_standings

        # Fetch advanced metrics
        advanced_metrics = scrape_advanced_metrics(player, season)

        if not (player_info and career_stats) or player_info[0].empty or career_stats[0].empty:
            print(f"No career stats available for {player}")
            continue

        career_df = career_stats[0]
        season_stats = career_df[career_df['SEASON_ID'].str.contains(season.split('-')[0], na=False)]
        if season_stats.empty:
            print(f"No season stats available for {player} in {season}")
            continue

        # Years of service are computed once, inside _build_player_row. The
        # previous second computation subtracted the raw draft-year string
        # from an int and raised TypeError.
        player_row = _build_player_row(
            player, season, team_id,
            player_info[0].iloc[0], season_stats.iloc[0], advanced_metrics,
        )

        # An empty standings frame (failed fetch) has no 'TeamID' column.
        if league_standings is not None and not league_standings.empty and team_id is not None:
            team_standings = league_standings[league_standings['TeamID'] == team_id]
            if not team_standings.empty:
                player_row.update({
                    'Wins': team_standings['WINS'].values[0],
                    'Losses': team_standings['LOSSES'].values[0]
                })
            else:
                print(f"No standings data found for team ID {team_id}")
        else:
            print(f"No league standings data available for {season} or team ID not found")

        additional_stats.append(player_row)

    additional_stats_df = pd.DataFrame(additional_stats)

    # Debug output to check DataFrames before merging
    print("Salary Data:")
    print(salary_data.head())
    print("Salary Data columns:", salary_data.columns)
    print("Additional Stats Data:")
    print(additional_stats_df.head())
    print("Additional Stats Data columns:", additional_stats_df.columns)

    if additional_stats_df.empty:
        print("No additional stats found. Returning salary data as is.")
        return salary_data

    # Every collected row carries 'Player' and 'Season', so the merge keys exist.
    merged_data = pd.merge(salary_data, additional_stats_df, on=['Player', 'Season'], how='left')
    print("Merged Data:")
    print(merged_data.head())
    return merged_data


In [76]:
# %%writefile ../src/processing/add_yos.py

import pandas as pd

def add_yos_to_dataset(merged_data_path, salary_limits_path):
    """
    Join the salary-limits table onto the player dataset.

    Both inputs are read as CSV. The limits' 'YOS' column has its ' YOS'
    suffix removed and is cast to int, then the two tables are merged on
    Season plus 'Years of Service' (players) == 'YOS' (limits). The helper
    'YOS' column is dropped from the result, whose head is printed before
    it is returned.
    """
    limits = pd.read_csv(salary_limits_path)
    players = pd.read_csv(merged_data_path)

    # Strip the textual suffix so 'YOS' can be compared with the numeric
    # 'Years of Service' column.
    yos_text = limits['YOS'].str.replace(' YOS', '')
    limits['YOS'] = yos_text.astype(int)

    combined = players.merge(
        limits,
        left_on=['Season', 'Years of Service'],
        right_on=['Season', 'YOS'],
    )
    # 'YOS' duplicates 'Years of Service' after the join.
    combined = combined.drop(columns=['YOS'])

    # Show the result so its structure can be checked
    print(combined.head())
    return combined



In [77]:
# %%writefile ../src/processing/add_salary_cap.py

import pandas as pd

def add_salary_cap_to_dataset(player_data_path, salary_cap_path):
    """
    Attach salary-cap columns to the player dataset by season.

    Both inputs are read as CSV and left-merged on players' 'Season' ==
    cap table's 'Year', so every player row is kept. The 'Year' column is
    dropped from the result, whose head is printed before it is returned.
    """
    players = pd.read_csv(player_data_path)
    caps = pd.read_csv(salary_cap_path)

    with_cap = players.merge(caps, how='left', left_on='Season', right_on='Year')
    # 'Year' repeats 'Season' once the tables are joined.
    with_cap = with_cap.drop(columns=['Year'])

    # Show the result so its structure can be checked
    print(with_cap.head())
    return with_cap


In [78]:
# %%writefile ../src/main.py

import os
import pandas as pd
# from src.processing.load_data import load_salary_data
# from src.processing.add_nba_stats import add_nba_stats
# from src.processing.add_yos import add_yos_to_dataset
# from src.processing.add_salary_cap import add_salary_cap_to_dataset
# from src.utils.fetch_utils import fetch_all_players

def main():
    """
    Main function to process and save NBA player salary and stats data.

    Prompts for a player name (or 'all' plus a minimum minutes-per-game
    threshold), enriches the salary data season by season, and writes three
    CSV files: the processed data, that data with salary-limit (YOS)
    columns, and finally with salary-cap columns. Returns early, writing
    nothing, when the salary data cannot be loaded or no stats were
    collected.
    """
    start_year = 2023
    end_year = 2021
    player_filter = input("Enter player name or 'all' for all players: ").strip().lower()
    min_avg_minutes = None
    if player_filter == 'all':
        try:
            min_avg_minutes = float(input("Enter the minimum average minutes per game: "))
        except ValueError:
            print("Minimum average minutes must be a number. Exiting.")
            return

    # Load existing salary data
    salary_file_path = '../data/processed/salary_data.csv'
    salary_data = load_salary_data(salary_file_path)
    if salary_data.empty:
        print("Failed to load salary data. Exiting.")
        return

    processed_file_path = '../data/processed/salary_data_with_team_info.csv'

    # Collected per season and concatenated once after the loop.
    season_frames = []

    for year in range(start_year, end_year-1, -1):
        season = f"{year}-{str(year+1)[-2:]}"
        print(f"Processing data for {season}...")
        season_salary_data = salary_data[salary_data['Season'] == season]

        if player_filter != 'all':
            # regex=False: the name is literal text, so characters such as
            # '.' or '(' must not be interpreted as a pattern.
            name_matches = season_salary_data['Player'].str.contains(player_filter, case=False, regex=False, na=False)
            season_salary_data = season_salary_data[name_matches]

        print(f"Salary data for {season}:")
        print(season_salary_data.head())

        if season_salary_data.empty:
            print(f"No salary data found for {season}. Skipping this season.")
            continue

        # Fetch player data with filter for this specific season. 'all' is
        # this script's keyword for "no filter"; passing it through would
        # make it act as a name fragment.
        roster_filter = None if player_filter == 'all' else player_filter
        all_players = fetch_all_players(season=season, player_filter=roster_filter)

        try:
            season_data = add_nba_stats(season_salary_data, all_players, player_filter, min_avg_minutes)
        except Exception as e:
            print(f"Error processing data for {season}: {e}")
            continue

        # Debug output to check data before concatenating
        print(f"Processed data for {season}:")
        print(season_data.head())

        season_frames.append(season_data)

    if season_frames:
        all_data = pd.concat(season_frames, ignore_index=True)
    else:
        all_data = pd.DataFrame()

    # Filter rows with no data in 'PTS'. The column is absent when no stats
    # were merged for any season, in which case nothing is worth saving.
    if 'PTS' in all_data.columns:
        all_data = all_data[all_data['PTS'].notna()]
    else:
        all_data = all_data.iloc[0:0]

    # Debug output to check final data
    print("Final processed data:")
    print(all_data.head())

    if all_data.empty:
        print("No data to save. Please check the input and processing steps.")
        # Stop here: the steps below read the processed CSV, which would be
        # missing or left over from an earlier run.
        return

    os.makedirs('../data/processed', exist_ok=True)
    all_data.to_csv(processed_file_path, index=False)
    print(f"Data saved to {processed_file_path}")

    # Add YOS to the final dataset
    final_data_path = '../data/processed/final_salary_data_with_yos.csv'
    salary_limits_path = '../data/raw/nba_salary_limits.csv'
    final_data = add_yos_to_dataset(processed_file_path, salary_limits_path)
    final_data.to_csv(final_data_path, index=False)
    print(f"Final data with YOS saved to {final_data_path}")

    # Add salary cap to the final dataset
    final_data_with_cap_path = '../data/processed/final_salary_data_with_yos_and_cap.csv'
    salary_cap_path = '../data/raw/salary_cap_history.csv'
    final_data_with_cap = add_salary_cap_to_dataset(final_data_path, salary_cap_path)
    final_data_with_cap.to_csv(final_data_with_cap_path, index=False)
    print(f"Final data with salary cap saved to {final_data_with_cap_path}")

# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()


Loaded salary data:
          Player    Salary   Season
0  Stephen Curry  51915615  2023-24
1   Kevin Durant  47649433  2023-24
2   Nikola Jokic  47607350  2023-24
3   LeBron James  47607350  2023-24
4    Joel Embiid  47607350  2023-24
Processing data for 2023-24...
Salary data for 2023-24:
          Player    Salary   Season
0  Stephen Curry  51915615  2023-24
1   Kevin Durant  47649433  2023-24
2   Nikola Jokic  47607350  2023-24
3   LeBron James  47607350  2023-24
4    Joel Embiid  47607350  2023-24
Fetching all players for season: 2023-24 with filter: 
Attempting to fetch data using CommonAllPlayers (Attempt 1) with parameters {'season': '2023-24', 'is_only_current_season': 1}
Fetched 572 players
Processing player: precious achiuwa, ID: 1630173, Team ID: 1610612752
Attempting to fetch data using CommonPlayerInfo (Attempt 1) with parameters {'player_id': 1630173}
Years of service for player precious achiuwa: 4
Processing player: bam adebayo, ID: 1628389, Team ID: 1610612748
Attempti

KeyboardInterrupt: 

In [2]:
import os
import pandas as pd
import time
import requests
from bs4 import BeautifulSoup
from io import StringIO
from nba_api.stats.endpoints import commonallplayers, commonplayerinfo, playercareerstats, leaguestandings, leaguedashplayerstats
from requests.exceptions import RequestException
from json.decoder import JSONDecodeError

# Define constants for rate limiting
MAX_REQUESTS_PER_MINUTE = 30
DELAY_BETWEEN_REQUESTS = 2  # in seconds

def fetch_with_retry(endpoint, max_retries=3, delay=5, **kwargs):
    """
    Instantiate ``endpoint`` with ``kwargs`` and return its data frames.

    Makes up to ``max_retries`` attempts. ``RequestException``,
    ``JSONDecodeError`` and ``KeyError`` are printed and retried after
    ``delay`` seconds; ``None`` is returned once the final attempt fails.
    """
    for attempt in range(max_retries):
        is_last = attempt + 1 == max_retries
        try:
            print(f"Attempting to fetch data using {endpoint.__name__} (Attempt {attempt + 1}) with parameters {kwargs}")
            frames = endpoint(**kwargs).get_data_frames()
            return frames
        except (RequestException, JSONDecodeError, KeyError) as e:
            print(f"Error occurred: {e}")
            if not is_last:
                print(f"Attempt {attempt + 1} failed. Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                print(f"Failed to fetch data after {max_retries} attempts: {str(e)}")
                return None
    return None

def load_salary_data(file_path):
    """
    Load the salary CSV and rewrite 'Season' as "<first 4 chars>-<last 2 chars>".

    For example "2023-2024" becomes "2023-24". On any error the message is
    printed and an empty DataFrame is returned instead.
    """
    def shorten(season_value):
        as_text = str(season_value)
        start, end = as_text[:4], as_text[-2:]
        return f"{start}-{end}"

    try:
        frame = pd.read_csv(file_path)
        # Convert season format from "2023-2024" to "2023-24"
        frame['Season'] = frame['Season'].apply(shorten)
        print("Loaded salary data:")
        print(frame.head())
    except Exception as e:
        print(f"Error loading salary data: {e}")
        frame = pd.DataFrame()
    return frame

def fetch_all_players(season):
    """
    Build a lookup of players for ``season`` from CommonAllPlayers.

    Returns a dict keyed by the stripped, lower-cased display name whose
    values hold 'player_id' and 'team_id'. The dict is empty when the fetch
    yields nothing.
    """
    frames = fetch_with_retry(commonallplayers.CommonAllPlayers, season=season)
    if not frames:
        return {}

    roster = frames[0]
    return {
        row['DISPLAY_FIRST_LAST'].strip().lower(): {
            'player_id': row['PERSON_ID'],
            'team_id': row['TEAM_ID']
        }
        for _, row in roster.iterrows()
    }

def fetch_player_info(player_id):
    """
    Request CommonPlayerInfo for ``player_id`` through ``fetch_with_retry``
    and return its result unchanged.
    """
    endpoint = commonplayerinfo.CommonPlayerInfo
    return fetch_with_retry(endpoint, player_id=player_id)

def fetch_career_stats(player_id):
    """
    Request PlayerCareerStats for ``player_id`` through ``fetch_with_retry``
    and return its result unchanged.
    """
    endpoint = playercareerstats.PlayerCareerStats
    return fetch_with_retry(endpoint, player_id=player_id)

def fetch_league_standings(season):
    """
    Fetch LeagueStandings for ``season`` and return its first data frame.

    An empty DataFrame is returned when the fetch gives back ``None`` or an
    empty list.
    """
    standings_data = fetch_with_retry(leaguestandings.LeagueStandings, season=season)
    if standings_data:
        return standings_data[0]
    # Nothing fetched: hand back an empty frame
    return pd.DataFrame()

def fetch_player_stats(season):
    """
    Fetch regular-season, per-game LeagueDashPlayerStats for ``season``.

    Returns the first data frame, or ``None`` when the fetch yields nothing.
    """
    query = {
        'season': season,
        'season_type_all_star': 'Regular Season',
        'per_mode_detailed': 'PerGame',
    }
    frames = fetch_with_retry(leaguedashplayerstats.LeagueDashPlayerStats, **query)
    if frames:
        return frames[0]
    return None

def filter_players_by_avg_minutes(player_stats, min_avg_minutes):
    """
    Select the rows of ``player_stats`` with 'MIN' greater than or equal to
    ``min_avg_minutes``.
    """
    minutes = player_stats['MIN']
    return player_stats.loc[minutes.ge(min_avg_minutes)]

def get_player_url(player_name):
    """
    Resolve a player's Basketball-Reference page URL through the site search.

    Args:
        player_name: Player name to search for.

    Returns:
        Absolute URL of the player page: either the page the search
        redirected to, or the first search hit whose link contains 'players'.

    Raises:
        ValueError: If no player link can be found.
        requests.RequestException: If the HTTP request fails or times out.
    """
    site = "https://www.basketball-reference.com"
    # params= lets requests encode the name (apostrophes, ampersands, accents)
    # rather than only swapping spaces for '+'. The timeout stops a stalled
    # connection from hanging the run.
    response = requests.get(f"{site}/search/search.fcgi", params={'search': player_name}, timeout=30)

    # NOTE(review): a single exact match appears to redirect straight to the
    # player page, leaving no results list to parse -- confirm against the
    # live site.
    if '/players/' in response.url:
        return response.url

    soup = BeautifulSoup(response.content, 'html.parser')
    search_results = soup.find('div', {'class': 'search-results'})
    if search_results:
        for item in search_results.find_all('div', {'class': 'search-item'}):
            link = item.find('a')
            # .get avoids a KeyError on anchors without an href attribute.
            href = link.get('href') if link else None
            if href and 'players' in href:
                return f"{site}{href}"

    raise ValueError(f"Player URL not found for {player_name}")

def scrape_advanced_metrics(player_name, season):
    """
    Scrape a player's advanced metrics for one season from Basketball-Reference.

    The player's page is located with ``get_player_url``, its 'advanced'
    table is parsed, and the first row whose 'Season' contains the part of
    ``season`` before the dash is read. Returns a dict of metric name to
    float (``None`` for missing or non-numeric cells). The dict is empty
    when the table or season row is absent or when any error occurs.
    """
    metric_names = ['PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP']
    try:
        page = requests.get(get_player_url(player_name))
        soup = BeautifulSoup(page.content, 'html.parser')
        data = {}

        table = soup.find('table', {'id': 'advanced'})
        if not table:
            print(f"Advanced metrics table not found for {player_name}")
            return data

        df = pd.read_html(StringIO(str(table)))[0]
        if isinstance(df.columns, pd.MultiIndex):
            df.columns = df.columns.droplevel()
        season_start = season.split('-')[0]
        df = df[df['Season'].str.contains(season_start, na=False)]

        if df.empty:
            print(f"No advanced metrics found for {player_name} in season {season}")
            return data

        row = df.iloc[0]
        for col in metric_names:
            if col not in row.index:
                continue
            try:
                data[col] = float(row[col]) if pd.notna(row[col]) else None
            except ValueError:
                data[col] = None
    except Exception as e:
        # Best-effort scrape: report and return no metrics
        print(f"Error scraping advanced metrics for {player_name}: {e}")
        data = {}

    return data


def _pause_after_request(request_count):
    """Count one finished API request, pause, and enforce the per-minute cap.

    Returns the updated request counter (reset to 0 after a cooldown).
    """
    request_count += 1
    time.sleep(DELAY_BETWEEN_REQUESTS)
    if request_count >= MAX_REQUESTS_PER_MINUTE:
        print("Rate limit reached. Waiting for a minute before continuing...")
        time.sleep(60)
        request_count = 0
    return request_count


def _years_of_service(draft_year, season):
    """Return seasons elapsed between the draft year and the season's start year."""
    current_season_year = int(season.split('-')[0])
    if draft_year == 'Undrafted':
        return 0  # or some other default value
    return max(0, current_season_year - int(draft_year))


def _season_stat_line(player, season, team_id, info_df, season_row):
    """Build the per-player stats dict from player info and one season's stat row."""
    years_of_service = _years_of_service(info_df['DRAFT_YEAR'].iloc[0], season)

    fg = season_row.get('FGM', 0) or 0
    fga = season_row.get('FGA', 0) or 0
    fg3 = season_row.get('FG3M', 0) or 0
    fg3a = season_row.get('FG3A', 0) or 0
    # Two-point figures and eFG% are derived from the totals above.
    efg = (fg + 0.5 * fg3) / fga if fga != 0 else 0
    fg2 = fg - fg3
    fg2a = fga - fg3a
    fg2_pct = (fg2 / fg2a) if fg2a != 0 else 0

    return {
        'Player': player,
        'Position': info_df.iloc[0]['POSITION'],
        'Age': season_row.get('PLAYER_AGE', None),
        'Team': season_row.get('TEAM_ABBREVIATION', None),
        'TeamID': team_id,
        'Years of Service': years_of_service,
        'GP': season_row.get('GP', None),
        'GS': season_row.get('GS', None),
        'MP': season_row.get('MIN', None),
        'FG': fg,
        'FGA': fga,
        'FG%': season_row.get('FG_PCT', None),
        '3P': fg3,
        '3PA': fg3a,
        '3P%': season_row.get('FG3_PCT', None),
        '2P': fg2,
        '2PA': fg2a,
        '2P%': fg2_pct,
        'eFG%': efg,
        'FT': season_row.get('FTM', None),
        'FTA': season_row.get('FTA', None),
        'FT%': season_row.get('FT_PCT', None),
        'ORB': season_row.get('OREB', None),
        'DRB': season_row.get('DREB', None),
        'TRB': season_row.get('REB', None),
        'AST': season_row.get('AST', None),
        'STL': season_row.get('STL', None),
        'BLK': season_row.get('BLK', None),
        'TOV': season_row.get('TOV', None),
        'PF': season_row.get('PF', None),
        'PTS': season_row.get('PTS', None),
        'Season': season,  # merge key together with 'Player'
    }


def _team_record(league_standings, team_id, season):
    """Return {'Wins': ..., 'Losses': ...} for the team, or None values if unavailable."""
    if league_standings is None or team_id is None or team_id == 0:
        print(f"No league standings data available for {season} or player not on a team")
        return {'Wins': None, 'Losses': None}

    team_standings = league_standings[league_standings['TeamID'] == team_id]
    if team_standings.empty:
        print(f"No standings data found for team ID {team_id}")
        return {'Wins': None, 'Losses': None}

    return {
        'Wins': team_standings['WINS'].values[0],
        'Losses': team_standings['LOSSES'].values[0],
    }


def add_nba_stats(salary_data, all_players):
    """
    Add NBA statistics to the salary data for all players with a salary.

    For every row of salary_data whose lower-cased player name is a key of
    all_players, this fetches player info and career stats from the NBA API,
    league standings for the row's season, and scraped advanced metrics, then
    left-merges the collected stats back onto salary_data.

    Args:
        salary_data: DataFrame with at least 'Player' and 'Season' columns.
        all_players: dict keyed by lower-cased player name; each value is a
            dict providing 'player_id' and 'team_id'.

    Returns:
        salary_data left-merged with the collected stats on ['Player', 'Season'].
        Players that could not be resolved keep their salary row with empty
        stat columns.
    """
    additional_stats = []
    request_count = 0
    # Standings depend only on the season, so fetch each season once instead
    # of once per player. Failed fetches (None) are not cached and are retried.
    standings_by_season = {}

    for _, row in salary_data.iterrows():
        player = row['Player']
        season = row['Season']
        player_key = player.lower()

        if player_key not in all_players:
            print(f"No player ID found for {player}")
            continue

        print(f"Fetching data for {player}...")
        player_id = all_players[player_key]['player_id']
        team_id = all_players[player_key]['team_id']

        # Fetch player info
        player_info = fetch_with_retry(commonplayerinfo.CommonPlayerInfo, player_id=player_id)
        request_count = _pause_after_request(request_count)

        # Fetch career stats
        career_stats = fetch_with_retry(playercareerstats.PlayerCareerStats, player_id=player_id)
        request_count = _pause_after_request(request_count)

        # Fetch league standings (at most one successful request per season)
        league_standings = standings_by_season.get(season)
        if league_standings is None:
            league_standings = fetch_league_standings(season)
            request_count = _pause_after_request(request_count)
            if league_standings is not None:
                standings_by_season[season] = league_standings

        # Fetch advanced metrics
        advanced_metrics = scrape_advanced_metrics(player, season)

        if player_info is None or career_stats is None or career_stats[0].empty:
            print(f"No career stats available for {player}")
            continue

        career_df = career_stats[0]
        # Match the season by its starting year, e.g. "2023" for "2023-24".
        season_stats = career_df[career_df['SEASON_ID'].str.contains(season.split('-')[0])]
        if season_stats.empty:
            print(f"No season stats available for {player} in {season}")
            continue

        player_stats = _season_stat_line(player, season, team_id, player_info[0], season_stats.iloc[0])
        player_stats.update(advanced_metrics)
        player_stats.update(_team_record(league_standings, team_id, season))
        additional_stats.append(player_stats)

    additional_stats_df = pd.DataFrame(additional_stats)
    print(f"Additional stats DataFrame shape: {additional_stats_df.shape}")
    print(f"Additional stats columns: {additional_stats_df.columns}")
    # NOTE(review): if no stats were collected the frame above has no
    # 'Player'/'Season' columns and this merge is expected to raise; main()
    # wraps the call in try/except and skips the season in that case.
    merged_data = pd.merge(salary_data, additional_stats_df, on=['Player', 'Season'], how='left')
    print(f"Merged data shape: {merged_data.shape}")
    print(f"Merged data columns: {merged_data.columns}")
    return merged_data


def add_yos_to_dataset(merged_data_path, salary_limits_path):
    """
    Attach salary-limit columns to the player dataset by season and years of service.

    Both inputs are CSV paths. The limits file carries a 'YOS' column written
    as text like "3 YOS"; it is converted to an integer and matched against the
    player file's 'Years of Service' column, together with 'Season'. Only rows
    with a match in both files are kept. The helper 'YOS' column is removed
    from the result.
    """
    limits_df = pd.read_csv(salary_limits_path)
    players_df = pd.read_csv(merged_data_path)

    # "3 YOS" -> 3
    yos_as_int = limits_df['YOS'].str.replace(' YOS', '').astype(int)
    limits_df = limits_df.assign(YOS=yos_as_int)

    combined = players_df.merge(
        limits_df,
        left_on=['Season', 'Years of Service'],
        right_on=['Season', 'YOS'],
    ).drop(columns='YOS')

    # Quick visual check of the result
    print(combined.head())
    return combined

def add_salary_cap_to_dataset(player_data_path, salary_cap_path):
    """
    Attach salary-cap columns to the player dataset, matched by season.

    Both inputs are CSV paths. Each player row is matched to the cap row whose
    'Year' equals the player's 'Season'; player rows without a match are kept
    with empty cap values. The helper 'Year' column is removed from the result.
    """
    players_df = pd.read_csv(player_data_path)
    cap_df = pd.read_csv(salary_cap_path)

    with_cap = players_df.merge(
        cap_df,
        how='left',
        left_on='Season',
        right_on='Year',
    ).drop(columns='Year')

    # Quick visual check of the result
    print(with_cap.head())
    return with_cap

def main(start_year=2023, end_year=2022):
    """
    Build the salary-plus-stats datasets and write them to ../data/processed.

    Steps:
      1. Load the salary data.
      2. For each season from start_year down to end_year (inclusive), fetch
         that season's players and add NBA stats to the season's salary rows.
      3. Keep only rows that have a 'PTS' value and save them.
      4. Add salary-limit columns (by years of service) and save.
      5. Add salary-cap columns and save.

    Args:
        start_year: Starting calendar year of the most recent season to
            process (2023 means season "2023-24").
        end_year: Starting calendar year of the oldest season to process.
            Seasons are processed in descending order, so this should not be
            greater than start_year.
    """
    # Load existing salary data
    salary_file_path = '../data/processed/salary_data.csv'
    processed_file_path = '../data/processed/salary_data_with_stats.csv'
    salary_data = load_salary_data(salary_file_path)
    if salary_data.empty:
        print("Failed to load salary data. Exiting.")
        return

    # Collect per-season frames and concatenate once after the loop, rather
    # than re-copying a growing DataFrame on every iteration.
    season_frames = []

    for year in range(start_year, end_year - 1, -1):
        season = f"{year}-{str(year+1)[-2:]}"
        print(f"Processing data for {season}...")
        season_salary_data = salary_data[salary_data['Season'] == season]

        print(f"Salary data for {season}:")
        print(season_salary_data.head())

        if season_salary_data.empty:
            print(f"No salary data found for {season}. Skipping this season.")
            continue

        # Fetch player data for this specific season
        all_players = fetch_all_players(season=season)

        try:
            season_data = add_nba_stats(season_salary_data, all_players)
        except Exception as e:
            # One bad season should not abort the remaining seasons.
            print(f"Error processing data for {season}: {e}")
            continue

        # Debug output to check data before concatenating
        print(f"Processed data for {season}:")
        print(season_data.head())

        season_frames.append(season_data)

    if season_frames:
        all_data = pd.concat(season_frames, ignore_index=True)
    else:
        all_data = pd.DataFrame()

    # Rows without points are players for whom no stats were found.
    if 'PTS' in all_data.columns:
        all_data = all_data[all_data['PTS'].notna()]
    else:
        print("Warning: 'PTS' column not found in the data")

    print("Columns in all_data:", all_data.columns)
    print("Sample of all_data:")
    print(all_data.head())

    if not all_data.empty:
        os.makedirs('../data/processed', exist_ok=True)
        all_data.to_csv(processed_file_path, index=False)
        print(f"Data saved to {processed_file_path}")
    else:
        print("No data to save. Please check the input and processing steps.")
        return  # Exit the function if there's no data

    # Check if the file was actually created
    if not os.path.exists(processed_file_path):
        print(f"Error: {processed_file_path} was not created. Exiting.")
        return

    final_data_path = '../data/processed/final_salary_data_with_yos.csv'
    salary_limits_path = '../data/raw/nba_salary_limits.csv'
    final_data = add_yos_to_dataset(processed_file_path, salary_limits_path)
    final_data.to_csv(final_data_path, index=False)
    print(f"Final data with YOS saved to {final_data_path}")

    # Add salary cap to the final dataset
    final_data_with_cap_path = '../data/processed/final_salary_data_with_yos_and_cap.csv'
    salary_cap_path = '../data/raw/salary_cap_history.csv'
    final_data_with_cap = add_salary_cap_to_dataset(final_data_path, salary_cap_path)
    final_data_with_cap.to_csv(final_data_with_cap_path, index=False)
    print(f"Final data with salary cap saved to {final_data_with_cap_path}")

# Run the full pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Loaded salary data:
          Player    Salary   Season
0  Stephen Curry  51915615  2023-24
1   Kevin Durant  47649433  2023-24
2   Nikola Jokic  47607350  2023-24
3   LeBron James  47607350  2023-24
4    Joel Embiid  47607350  2023-24
Processing data for 2023-24...
Salary data for 2023-24:
          Player    Salary   Season
0  Stephen Curry  51915615  2023-24
1   Kevin Durant  47649433  2023-24
2   Nikola Jokic  47607350  2023-24
3   LeBron James  47607350  2023-24
4    Joel Embiid  47607350  2023-24
Attempting to fetch data using CommonAllPlayers (Attempt 1) with parameters {'season': '2023-24'}
Fetching data for Stephen Curry...
Attempting to fetch data using CommonPlayerInfo (Attempt 1) with parameters {'player_id': 201939}
Attempting to fetch data using PlayerCareerStats (Attempt 1) with parameters {'player_id': 201939}
Attempting to fetch data using LeagueStandings (Attempt 1) with parameters {'season': '2023-24'}
Fetching data for Kevin Durant...
Attempting to fetch data using 