In [6]:
# fetch_utils.py
import time
from nba_api.stats.endpoints import commonallplayers, commonplayerinfo, playercareerstats, leaguestandings
from requests.exceptions import RequestException
from json.decoder import JSONDecodeError

MAX_REQUESTS_PER_MINUTE = 30
DELAY_BETWEEN_REQUESTS = 2

def fetch_with_retry(endpoint, max_retries=5, delay=5, timeout=60, **kwargs):
    for attempt in range(max_retries):
        try:
            print(f"Fetching data using {endpoint.__name__} (Attempt {attempt + 1}) with parameters: {kwargs}")
            data = endpoint(**kwargs, timeout=timeout).get_data_frames()
            return data[0] if isinstance(data, list) else data
        except (RequestException, JSONDecodeError, KeyError) as e:
            print(f"Error occurred: {e}")
            if attempt == max_retries - 1:
                print(f"Failed to fetch data after {max_retries} attempts")
                return None
            print(f"Retrying in {delay} seconds...")
            time.sleep(delay)

def fetch_all_players(season):
    all_players_data = fetch_with_retry(commonallplayers.CommonAllPlayers, season=season)
    all_players = {}
    if all_players_data is not None:
        for _, row in all_players_data.iterrows():
            player_name = row['DISPLAY_FIRST_LAST'].strip().lower()
            player_id = row['PERSON_ID']
            team_id = row['TEAM_ID']
            all_players[player_name] = {
                'player_id': player_id,
                'team_id': team_id
            }
    return all_players

def fetch_player_info(player_id):
    return fetch_with_retry(commonplayerinfo.CommonPlayerInfo, player_id=player_id)

def fetch_career_stats(player_id):
    return fetch_with_retry(playercareerstats.PlayerCareerStats, player_id=player_id)

def fetch_league_standings(season):
    return fetch_with_retry(leaguestandings.LeagueStandings, season=season)



In [7]:
# scrape_utils.py
import pandas as pd
import requests
from bs4 import BeautifulSoup

def scrape_salary_cap_history():
    url = "https://basketball.realgm.com/nba/info/salary_cap"
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table', class_='basketball compact')
        
        if not table:
            print("Could not find the salary cap table on the page.")
            return None

        data = []
        headers = [th.text.strip() for th in table.find('thead').find_all('th')]
        for row in table.find('tbody').find_all('tr'):
            cols = row.find_all('td')
            if cols:
                row_data = [col.text.strip() for col in cols]
                data.append(row_data)

        df = pd.DataFrame(data, columns=headers)
        
        # Clean up the data
        df['Season'] = df['Season'].str.extract(r'(\d{4}-\d{4})')
        df['Salary Cap'] = df['Salary Cap'].str.replace('$', '').str.replace(',', '').astype(float)
        
        # Convert other columns to float, handling non-numeric values
        for col in df.columns:
            if col not in ['Season', 'Salary Cap']:
                df[col] = pd.to_numeric(df[col].str.replace('$', '').str.replace(',', ''), errors='coerce')
        
        return df
    except Exception as e:
        print(f"Error scraping salary cap history: {str(e)}")
        return None

def scrape_player_salary_data(start_season, end_season, player_name="Stephen Curry"):
    all_data = []
    
    for season in range(start_season, end_season + 1):
        season_str = f"{season}-{str(season+1)[-2:]}"
        url = f"https://hoopshype.com/salaries/players/{season}-{season+1}/"
        print(f"Scraping data for {season_str} from URL: {url}")
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', class_='hh-salaries-ranking-table')
        
        if table:
            rows = table.find_all('tr')[1:]
            for row in rows:
                cols = row.find_all('td')
                if len(cols) >= 3:
                    player = cols[1].get_text(strip=True)
                    if player.lower() == player_name.lower():  # Filter for the specific player
                        salary_text = cols[2].get_text(strip=True)
                        salary = int(salary_text.replace('$', '').replace(',', ''))
                        all_data.append({'Player': player, 'Salary': salary, 'Season': season_str})
                        break  # Break after finding the player in the season
        else:
            print(f"No salary data found for season {season_str}")
        
        time.sleep(2)  # Delay between requests to avoid hitting rate limits
    
    df = pd.DataFrame(all_data)
    print(f"Scraped salary data for {player_name} from seasons {start_season}-{end_season}:")
    print(df)
    return df

def scrape_team_salary_data(season):
    url = f"https://hoopshype.com/salaries/{season}/"
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', class_='hh-salaries-ranking-table')
    rows = table.find_all('tr')[1:]
    data = []
    for row in rows:
        cols = row.find_all('td')
        team = cols[1].get_text(strip=True)
        salary = int(cols[2].get_text(strip=True).replace('$', '').replace(',', ''))
        data.append({'Team': team, 'Team_Salary': salary, 'Season': season})
    return pd.DataFrame(data)

In [8]:
# player_utils.py
import pandas as pd
from io import StringIO
import requests
from bs4 import BeautifulSoup
# from fetch_utils import fetch_player_info, fetch_career_stats, fetch_league_standings

def process_player_data(player, season, all_players):
    if player.lower() not in all_players:
        print(f"No player ID found for {player}")
        return None

    player_id = all_players[player.lower()]['player_id']
    team_id = all_players[player.lower()]['team_id']
    
    player_info = fetch_player_info(player_id)
    career_stats = fetch_career_stats(player_id)
    league_standings = fetch_league_standings(season)
    advanced_metrics = scrape_advanced_metrics(player, season)

    if player_info is None or career_stats is None or career_stats.empty:
        print(f"Unable to fetch complete data for {player}")
        return None

    season_stats = career_stats[career_stats['SEASON_ID'].str.contains(season.split('-')[0], na=False)]
    if season_stats.empty:
        print(f"No stats found for {player} in season {season}")
        return None

    latest_season_stats = season_stats.iloc[0]
    
    try:
        draft_year = int(player_info['DRAFT_YEAR'].iloc[0])
    except ValueError:
        draft_year = int(player_info['FROM_YEAR'].iloc[0])

    current_season_year = int(season.split('-')[0])
    years_of_service = max(0, current_season_year - draft_year)

    player_stats = calculate_player_stats(latest_season_stats, player_info, years_of_service, team_id, league_standings, advanced_metrics)
    player_stats.update({'Player': player, 'Season': season})

    return player_stats

def calculate_player_stats(stats, player_info, years_of_service, team_id, league_standings, advanced_metrics):
    fg = stats.get('FGM', 0) or 0
    fga = stats.get('FGA', 0) or 0
    fg3 = stats.get('FG3M', 0) or 0
    fg3a = stats.get('FG3A', 0) or 0
    efg = (fg + 0.5 * fg3) / fga if fga != 0 else 0
    fg2 = fg - fg3
    fg2a = fga - fg3a
    fg2_pct = (fg2 / fg2a) if fg2a != 0 else 0

    player_stats = {
        'Position': player_info.iloc[0]['POSITION'],
        'Age': stats.get('PLAYER_AGE', None),
        'Team': stats.get('TEAM_ABBREVIATION', None),
        'TeamID': team_id,
        'Years of Service': years_of_service,
        'GP': stats.get('GP', None),
        'GS': stats.get('GS', None),
        'MP': stats.get('MIN', None),
        'FG': fg,
        'FGA': fga,
        'FG%': stats.get('FG_PCT', None),
        '3P': fg3,
        '3PA': fg3a,
        '3P%': stats.get('FG3_PCT', None),
        '2P': fg2,
        '2PA': fg2a,
        '2P%': fg2_pct,
        'eFG%': efg,
        'FT': stats.get('FTM', None),
        'FTA': stats.get('FTA', None),
        'FT%': stats.get('FT_PCT', None),
        'ORB': stats.get('OREB', None),
        'DRB': stats.get('DREB', None),
        'TRB': stats.get('REB', None),
        'AST': stats.get('AST', None),
        'STL': stats.get('STL', None),
        'BLK': stats.get('BLK', None),
        'TOV': stats.get('TOV', None),
        'PF': stats.get('PF', None),
        'PTS': stats.get('PTS', None),
    }
    player_stats.update(advanced_metrics)

    if league_standings is not None and team_id is not None:
        team_standings = league_standings[league_standings['TeamID'] == team_id]
        if not team_standings.empty:
            player_stats.update({
                'Wins': team_standings['WINS'].values[0],
                'Losses': team_standings['LOSSES'].values[0]
            })

    return player_stats

def scrape_advanced_metrics(player_name, season):
    try:
        search_url = f"https://www.basketball-reference.com/search/search.fcgi?search={player_name.replace(' ', '+')}"
        response = requests.get(search_url)
        soup = BeautifulSoup(response.content, 'html.parser')
        search_results = soup.find('div', {'class': 'search-results'})
        if search_results:
            for item in search_results.find_all('div', {'class': 'search-item'}):
                link = item.find('a')
                if link and 'players' in link['href']:
                    player_url = f"https://www.basketball-reference.com{link['href']}"
                    break
            else:
                return {}
        else:
            return {}

        response = requests.get(player_url)
        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', {'id': 'advanced'})
        if table:
            df = pd.read_html(StringIO(str(table)))[0]
            if isinstance(df.columns, pd.MultiIndex):
                df.columns = df.columns.droplevel()
            df['Season'] = df['Season'].astype(str)
            df = df[df['Season'].str.contains(season.split('-')[0], na=False)]
            if not df.empty:
                row = df.iloc[0]
                metrics = ['PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP']
                return {col: row[col] for col in metrics if col in row.index}
    except Exception as e:
        print(f"Error scraping advanced metrics for {player_name}: {e}")
    return {}

In [9]:
%%writefile ../src/salary_predict/training.py

import pandas as pd
import numpy as np
from nba_api.stats.endpoints import commonallplayers, commonplayerinfo, playercareerstats, leaguestandings
import time
from requests.exceptions import RequestException
from json.decoder import JSONDecodeError
from concurrent.futures import ThreadPoolExecutor, as_completed
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import RFE
from sklearn.impute import SimpleImputer
import joblib
from sklearn.inspection import permutation_importance
from fetch_utils import fetch_with_retry
from scrape_utils import scrape_player_salary_data, scrape_team_salary_data, scrape_salary_cap_history
from player_utils import fetch_all_players, process_player_data

# Import other necessary functions (fetch_with_retry, scrape_functions, etc.)

def load_and_preprocess_data(file_path, use_inflated_cap=True):
    data = pd.read_csv(file_path)
    
    if use_inflated_cap:
        data.drop(columns=['2022 Dollars', 'Salary Cap'], inplace=True)
        salary_cap_column = 'Salary_Cap_Inflated'
    else:
        data.drop(columns=['2022 Dollars', 'Salary_Cap_Inflated'], inplace=True)
        salary_cap_column = 'Salary Cap'

    data['Season'] = data['Season'].str[:4].astype(int)

    numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns
    imputer = SimpleImputer(strategy='mean')
    data[numerical_cols] = imputer.fit_transform(data[numerical_cols])

    # Feature engineering
    data['PPG'] = data['PTS'] / data['GP']
    data['APG'] = data['AST'] / data['GP']
    data['RPG'] = data['TRB'] / data['GP']
    data['SPG'] = data['STL'] / data['GP']
    data['BPG'] = data['BLK'] / data['GP']
    data['TOPG'] = data['TOV'] / data['GP']
    data['WinPct'] = data['Wins'] / (data['Wins'] + data['Losses'])
    data['SalaryGrowth'] = data['Salary'].pct_change().fillna(0)
    data['Availability'] = data['GP'] / 82
    data['SalaryPct'] = data['Salary'] / data[salary_cap_column]

    return data, salary_cap_column

def prepare_data_for_training(data, salary_cap_column):
    categorical_cols = ['Player', 'Season', 'Position', 'Team']
    numerical_cols = data.columns.difference(categorical_cols + ['Salary', 'SalaryPct'])

    encoder = OneHotEncoder(drop='first', sparse_output=False)
    encoded_cats = pd.DataFrame(encoder.fit_transform(data[categorical_cols]), columns=encoder.get_feature_names_out(categorical_cols))

    data = pd.concat([data[numerical_cols], encoded_cats, data[['Player', 'Season', 'Salary', 'SalaryPct']]], axis=1)

    initial_features = ['Age', 'Years of Service', 'GP', 'PPG', 'APG', 'RPG', 'SPG', 'BPG', 'TOPG', 'FG%', '3P%', 'FT%', 'PER', 'WS', 'VORP', 'Availability'] + list(encoded_cats.columns)

    data_subset = data[initial_features + ['SalaryPct', 'Salary']].copy()
    data_cleaned = data_subset.dropna()

    return data_cleaned, initial_features

def train_models(data_cleaned, initial_features, target_variable='SalaryPct', n_features_to_select=10):
    X = data_cleaned[initial_features]
    y = data_cleaned[target_variable]

    rfe = RFE(estimator=RandomForestRegressor(n_estimators=100, random_state=42), n_features_to_select=n_features_to_select)
    rfe = rfe.fit(X, y)
    selected_features = [feature for feature, selected in zip(initial_features, rfe.support_) if selected]

    print("Selected features by RFE:", selected_features)

    X = data_cleaned[selected_features]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    models = {
        'Random_Forest': RandomForestRegressor(random_state=42),
        'Gradient_Boosting': GradientBoostingRegressor(random_state=42),
        'Ridge_Regression': Ridge(),
        'ElasticNet': ElasticNet(max_iter=10000),
        'SVR': SVR(),
        'Decision_Tree': DecisionTreeRegressor(random_state=42)
    }

    param_grids = {
        'Random Forest': {
            'n_estimators': [50, 100, 200],
            'max_features': ['sqrt', 'log2'],
            'max_depth': [8, 10, 12],
            'min_samples_split': [5, 10, 15],
            'min_samples_leaf': [1, 2, 4]
        },
        'Gradient Boosting': {
            'n_estimators': [100, 200, 300],
            'learning_rate': [0.01, 0.05, 0.1],
            'max_depth': [3, 4, 5],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'subsample': [0.8, 0.9, 1.0]
        },
        'Ridge Regression': {'alpha': [0.1, 1.0, 10.0, 100.0]},
        'ElasticNet': {'alpha': [0.1, 1.0, 10.0], 'l1_ratio': [0.1, 0.5, 0.9]},
        'SVR': {'C': [0.1, 1, 10], 'epsilon': [0.1, 0.2, 0.5]},
        'Decision Tree': {'max_depth': [6, 8, 10], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}
    }

    best_models = {}
    for name, model in models.items():
        print(f"Training {name}...")
        grid_search = GridSearchCV(estimator=model, param_grid=param_grids[name], cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
        grid_search.fit(X_train_scaled, y_train)
        best_models[name] = grid_search.best_estimator_
        
        cv_scores = cross_val_score(best_models[name], X_train_scaled, y_train, cv=5, scoring='neg_mean_squared_error')
        print(f"{name} - Best params: {grid_search.best_params_}")
        print(f"{name} - Cross-validation MSE: {-cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
        
        y_pred = best_models[name].predict(X_test_scaled)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        print(f"{name} - Test MSE: {mse:.4f}, R²: {r2:.4f}")
        
        if name in ['Random Forest', 'Gradient Boosting', 'Decision Tree']:
            importances = best_models[name].feature_importances_
            feature_importance = pd.DataFrame({'feature': selected_features, 'importance': importances})
            feature_importance = feature_importance.sort_values('importance', ascending=False)
            print(f"\n{name} - Top 5 important features:")
            print(feature_importance.head())
        else:
            perm_importance = permutation_importance(best_models[name], X_test_scaled, y_test, n_repeats=10, random_state=42)
            feature_importance = pd.DataFrame({'feature': selected_features, 'importance': perm_importance.importances_mean})
            feature_importance = feature_importance.sort_values('importance', ascending=False)
            print(f"\n{name} - Top 5 important features (Permutation Importance):")
            print(feature_importance.head())
        
        model_filename = f"../data/models/{name}_salary_prediction_model_{'inflated' if target_variable == 'SalaryPct' else 'regular'}.joblib"
        joblib.dump(best_models[name], model_filename)
        print(f"{name} model saved to '{model_filename}'")

    return best_models, scaler, selected_features

def main():
    start_year = 2024  # Current year
    end_year = 2022    # Adjust as needed
    processed_file_path = '../data/processed/nba_player_data.csv'
    salary_cap_file_path = '../data/processed/salary_cap_history.csv'

    player_filter = input("Enter player name or 'all' for all players: ").strip().lower()
    min_avg_minutes = None
    if player_filter == 'all':
        min_avg_minutes = float(input("Enter the minimum average minutes per game (default 25 mins): ") or 25)

    existing_data = load_existing_data(processed_file_path)

    try:
        print(f"Updating data for years {start_year} to {end_year}")
        updated_data = update_data(existing_data, start_year, end_year, player_filter, min_avg_minutes)
        
        if not updated_data.equals(existing_data):
            print("New data retrieved. Merging with existing data...")
            
            print("Fetching salary cap data...")
            salary_cap_data = scrape_salary_cap_history()
            
            if salary_cap_data is not None:
                print("Salary cap data successfully retrieved.")
                
                salary_cap_data.to_csv(salary_cap_file_path, index=False)
                print(f"Salary cap data saved to {salary_cap_file_path}")
                
                print("Merging salary cap data with player data...")
                
                salary_cap_columns = [col for col in updated_data.columns if 'Salary Cap' in col]
                if salary_cap_columns:
                    print(f"Removing existing Salary Cap columns: {salary_cap_columns}")
                    updated_data = updated_data.drop(columns=salary_cap_columns)
                    
                updated_data = pd.merge(updated_data, salary_cap_data[['Season', 'Salary Cap']], on='Season', how='left')
                
                print("Final data shape:", updated_data.shape)
                print("Final data columns:", updated_data.columns)
            else:
                print("Warning: Failed to retrieve salary cap data. Skipping merge.")

            updated_data.to_csv(processed_file_path, index=False)
            print(f"Updated data saved to {processed_file_path}")
        else:
            print("No new data to save. The dataset is already up-to-date.")

        # Ask if user wants to train models
        train_models_option = input("Do you want to train the models? (yes/no): ").strip().lower()
        if train_models_option == 'yes':
            use_inflated_cap = input("Use inflated salary cap? (yes/no): ").strip().lower() == 'yes'
            target_variable = input("Choose target variable (SalaryPct/Salary): ").strip()
            
            data, salary_cap_column = load_and_preprocess_data(processed_file_path, use_inflated_cap)
            data_cleaned, initial_features = prepare_data_for_training(data, salary_cap_column)
            best_models, scaler, selected_features = train_models(data_cleaned, initial_features, target_variable)
            
            print("Model training completed.")

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        print("Traceback:")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()

Overwriting ../src/salary_predict/training.py


In [10]:
# main.py
import os
import pandas as pd
import numpy as np
# from fetch_utils import fetch_all_players
# from scrape_utils import scrape_player_salary_data, scrape_team_salary_data, scrape_salary_cap_history
# from player_utils import process_player_data

def load_existing_data(file_path):
    try:
        return pd.read_csv(file_path)
    except FileNotFoundError:
        return pd.DataFrame()

def get_latest_season(data):
    if not data.empty:
        return data['Season'].max()
    return None

def merge_salary_cap_data(player_data, salary_cap_data):
    player_data['Season_Year'] = player_data['Season'].str[:4]
    salary_cap_data['Season_Year'] = salary_cap_data['Season'].str[:4]
    merged_data = pd.merge(player_data, salary_cap_data, on='Season_Year', how='left')
    merged_data.drop(columns=['Season_Year', 'Season_y'], inplace=True)
    merged_data.rename(columns={'Season_x': 'Season'}, inplace=True)
    return merged_data


def update_data(existing_data, start_year, end_year, min_avg_minutes=None):
    all_data = existing_data.copy()
    latest_season = get_latest_season(all_data)

    # Load injury data
    injury_data = pd.read_csv('../data/processed/NBA Player Injury Stats(1951 - 2023).csv')
    injury_data['Date'] = pd.to_datetime(injury_data['Date'])
    injury_data['Season'] = injury_data['Date'].apply(lambda x: f"{x.year}-{str(x.year+1)[-2:]}" if x.month >= 10 else f"{x.year-1}-{str(x.year)[-2:]}")

    # Fetch salary data for all seasons at once
    salary_data = scrape_player_salary_data(start_year, end_year)

    for year in range(start_year, end_year):
        season = f"{year}-{str(year+1)[-2:]}"
        if latest_season and season <= latest_season:
            print(f"Skipping {season} as it's already in the dataset.")
            continue

        print(f"\nProcessing data for {season}...")

        team_salary_data = scrape_team_salary_data(season)

        all_players = fetch_all_players(season=season)
        additional_stats = []

        for player_name, player_info in all_players.items():
            player_stats = process_player_data(player_name.title(), season, all_players)
            if player_stats:
                # Merge salary data
                player_salary_row = salary_data[(salary_data['Season'] == season) & (salary_data['Player'].str.lower() == player_name)]
                if not player_salary_row.empty:
                    player_stats['Salary'] = player_salary_row.iloc[0]['Salary']
                else:
                    print(f"No salary data found for {player_name} in season {season}")
                    player_stats['Salary'] = None

                # Filter by average minutes if required
                if min_avg_minutes is None or player_stats['MP'] / player_stats['GP'] >= min_avg_minutes:
                    additional_stats.append(player_stats)

        additional_stats_df = pd.DataFrame(additional_stats)

        if not additional_stats_df.empty:
            merged_data = additional_stats_df
            merged_data = pd.merge(merged_data, team_salary_data, on=['Team', 'Season'], how='left')

            # Process and merge injury data
            season_injury_data = injury_data[injury_data['Season'] == season]

            # Create a DataFrame with all players and initialize injury columns
            all_players_df = pd.DataFrame({'Player': merged_data['Player'].unique()})
            all_players_df['Season'] = season
            all_players_df['Injured'] = False
            all_players_df['Injury_Periods'] = ''

            # Update injury information for players with injuries
            for player in all_players_df['Player']:
                player_injuries = season_injury_data[season_injury_data['Relinquished'].str.contains(player, case=False, na=False)]
                if not player_injuries.empty:
                    periods = []
                    for i in range(0, len(player_injuries), 2):
                        try:
                            start_date = player_injuries.iloc[i]['Date'].strftime('%Y-%m-%d')
                            end_date = player_injuries.iloc[i+1]['Date'].strftime('%Y-%m-%d')
                            periods.append(f"{start_date} - {end_date}")
                        except IndexError:
                            periods.append(f"{start_date} - ongoing")
                    all_players_df.loc[all_players_df['Player'] == player, 'Injured'] = True
                    all_players_df.loc[all_players_df['Player'] == player, 'Injury_Periods'] = '; '.join(periods)

            # Merge injury data with player stats
            merged_data = pd.merge(merged_data, all_players_df, on=['Player', 'Season'], how='outer')

            # Fill missing values for players without stats
            merged_data = merged_data.fillna({col: 0 for col in merged_data.columns if merged_data[col].dtype in [np.int64, np.float64]})
            merged_data = merged_data.fillna('')

            all_data = pd.concat([all_data, merged_data], ignore_index=True)

    return all_data

def main():
    start_year = 2023  # Adjusted to match the example
    end_year = 2024    # Adjusted to match the example
    processed_file_path = '../data/processed/nba_player_data.csv'
    salary_cap_file_path = '../data/processed/salary_cap_history.csv'

    player_filter = input("Enter player name or 'all' for all players: ").strip().lower()
    min_avg_minutes = None
    if player_filter == 'all':
        min_avg_minutes = float(input("Enter the minimum average minutes per game (default 25 mins): ") or 25)

    # Delete existing processed file if it exists
    if os.path.exists(processed_file_path):
        os.remove(processed_file_path)
        print(f"Deleted existing file: {processed_file_path}")

    existing_data = pd.DataFrame()  # Start with an empty DataFrame

    try:
        print(f"Updating data for years {start_year} to {end_year}")
        updated_data = update_data(existing_data, start_year, end_year, min_avg_minutes)

        if not updated_data.empty:
            print("New data retrieved. Processing and saving...")

            print("Fetching salary cap data...")
            salary_cap_data = scrape_salary_cap_history()

            if salary_cap_data is not None:
                print("Salary cap data successfully retrieved.")

                salary_cap_data.to_csv(salary_cap_file_path, index=False)
                print(f"Salary cap data saved to {salary_cap_file_path}")

                print("Merging salary cap data with player data...")

                updated_data = merge_salary_cap_data(updated_data, salary_cap_data)

                print("Final data shape:", updated_data.shape)
                print("Final data columns:", updated_data.columns)
            else:
                print("Warning: Failed to retrieve salary cap data. Skipping merge.")

            # Save the cleaned data
            updated_data.to_csv(processed_file_path, index=False)
            print(f"Updated and cleaned data saved to {processed_file_path}")

            # Print summary of the data
            print("\nData summary")
            print(updated_data[['Season', 'Player', 'Salary', 'GP', 'PTS', 'TRB', 'AST', 'Injured', 'Injury_Periods']].to_string(index=False))
        else:
            print("No new data to save. The dataset is empty.")

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        print("Traceback:")
        import traceback
        traceback.print_exc()
 
if __name__ == "__main__":
    main()

Updating data for years 2023 to 2024
Scraping data for 2023-24 from URL: https://hoopshype.com/salaries/players/2023-2024/
Scraping data for 2024-25 from URL: https://hoopshype.com/salaries/players/2024-2025/
Scraped salary data for Stephen Curry from seasons 2023-2024:
          Player    Salary   Season
0  Stephen Curry  51915615  2023-24

Processing data for 2023-24...
Fetching data using CommonAllPlayers (Attempt 1) with parameters: {'season': '2023-24'}
Fetching data using CommonPlayerInfo (Attempt 1) with parameters: {'player_id': 76001}
Fetching data using PlayerCareerStats (Attempt 1) with parameters: {'player_id': 76001}
Fetching data using LeagueStandings (Attempt 1) with parameters: {'season': '2023-24'}
No stats found for Alaa Abdelnaby in season 2023-24
Fetching data using CommonPlayerInfo (Attempt 1) with parameters: {'player_id': 76002}
Fetching data using PlayerCareerStats (Attempt 1) with parameters: {'player_id': 76002}
Fetching data using LeagueStandings (Attempt 1) 

KeyboardInterrupt: 