In [57]:
# %%writefile ../src/fetch_utils.py
import time
from nba_api.stats.endpoints import commonallplayers, commonplayerinfo, playercareerstats, leaguestandings
from requests.exceptions import RequestException
from json.decoder import JSONDecodeError

# Define the maximum requests allowed per minute and delay between requests
MAX_REQUESTS_PER_MINUTE = 20
DELAY_BETWEEN_REQUESTS = 3  # seconds

def fetch_with_retry(endpoint, max_retries=10, initial_delay=5, max_delay=60, timeout=60, debug=False, **kwargs):
    """Call an endpoint class and return its first data frame, retrying on failure.

    Args:
        endpoint: Callable/class that is invoked as ``endpoint(**kwargs, timeout=timeout)``
            and whose result exposes ``get_data_frames()``.
        max_retries: Maximum number of attempts.
        initial_delay: Back-off (seconds) used after the first failed attempt;
            it doubles after every further failure.
        max_delay: Upper bound (seconds) for the back-off.
        timeout: Forwarded to the endpoint as its ``timeout`` keyword.
        debug: When True, progress and error messages are printed.
        **kwargs: Forwarded unchanged to the endpoint.

    Returns:
        The first element of ``get_data_frames()`` when that is a list,
        otherwise the returned object itself; ``None`` once every attempt
        has failed (or when ``max_retries`` allows no attempt at all).
    """
    final_attempt = max_retries - 1

    for attempt_index in range(max_retries):
        try:
            if debug:
                print(f"Fetching data using {endpoint.__name__} (Attempt {attempt_index + 1}) with parameters: {kwargs}")
            frames = endpoint(**kwargs, timeout=timeout).get_data_frames()
            # Pause after every successful call to stay under the rate limit.
            time.sleep(DELAY_BETWEEN_REQUESTS)
            if isinstance(frames, list):
                return frames[0]
            return frames
        except (RequestException, JSONDecodeError, KeyError) as err:
            if debug:
                print(f"Error occurred during fetching {endpoint.__name__}: {err}")

            if attempt_index == final_attempt:
                # Out of attempts: report (if asked) and give up.
                if debug:
                    print(f"Failed to fetch data from {endpoint.__name__} after {max_retries} attempts")
                return None

            # Exponential back-off, capped at max_delay.
            backoff = min(initial_delay * (2 ** attempt_index), max_delay)
            if debug:
                print(f"Retrying in {backoff} seconds...")
            time.sleep(backoff)

    return None

def fetch_all_players(season, debug=False):
    """Build a lookup of every player returned by CommonAllPlayers for a season.

    Args:
        season: Season identifier forwarded to the endpoint (e.g. "2022-23").
        debug: When True, the number of players fetched is printed.

    Returns:
        dict mapping the stripped, lower-cased ``DISPLAY_FIRST_LAST`` name to
        ``{'player_id': PERSON_ID, 'team_id': TEAM_ID}``. Empty when the fetch
        failed. Later rows overwrite earlier rows that share the same name.
    """
    roster = fetch_with_retry(commonallplayers.CommonAllPlayers, season=season, debug=debug)

    if roster is None:
        players_by_name = {}
    else:
        players_by_name = {
            record['DISPLAY_FIRST_LAST'].strip().lower(): {
                'player_id': record['PERSON_ID'],
                'team_id': record['TEAM_ID'],
            }
            for _, record in roster.iterrows()
        }

    if debug:
        print(f"Fetched {len(players_by_name)} players for season {season}")
    return players_by_name

def fetch_player_info(player_id, debug=False):
    """Fetch the CommonPlayerInfo frame for one player via fetch_with_retry.

    Returns whatever fetch_with_retry returns (``None`` after repeated failure).
    """
    request_params = {'player_id': player_id, 'debug': debug}
    return fetch_with_retry(commonplayerinfo.CommonPlayerInfo, **request_params)

def fetch_career_stats(player_id, debug=False):
    """Fetch the PlayerCareerStats frame for one player via fetch_with_retry.

    Returns whatever fetch_with_retry returns (``None`` after repeated failure).
    """
    request_params = {'player_id': player_id, 'debug': debug}
    return fetch_with_retry(playercareerstats.PlayerCareerStats, **request_params)

def fetch_league_standings(season, debug=False):
    """Fetch the LeagueStandings frame for a season via fetch_with_retry.

    Returns whatever fetch_with_retry returns (``None`` after repeated failure).
    """
    request_params = {'season': season, 'debug': debug}
    return fetch_with_retry(leaguestandings.LeagueStandings, **request_params)

if __name__ == "__main__":
    # Smoke test of the fetch helpers. Every call below goes through
    # fetch_with_retry, so it performs live requests and sleeps between them.
    debug = True
    season = "2022-23"
    sample_player_name = "LeBron James"
    
    # Fetch all players; the result is keyed by lower-cased display name.
    all_players = fetch_all_players(season, debug=debug)
    print(f"Total players fetched: {len(all_players)}")
    
    # Fetch player info for a sample player (lookup key must be lower-cased
    # to match the keys built by fetch_all_players).
    if sample_player_name.lower() in all_players:
        sample_player_id = all_players[sample_player_name.lower()]['player_id']
        player_info = fetch_player_info(sample_player_id, debug=debug)
        print(f"Sample player info for {sample_player_name}:")
        print(player_info)
        
        # Fetch career stats for the sample player
        career_stats = fetch_career_stats(sample_player_id, debug=debug)
        print(f"Sample player career stats for {sample_player_name}:")
        print(career_stats)
    else:
        print(f"Player {sample_player_name} not found in the {season} season data.")
    
    # Fetch league standings (printed as-is; may be None if every attempt failed)
    standings = fetch_league_standings(season, debug=debug)
    print("League standings:")
    print(standings)

Fetching data using CommonAllPlayers (Attempt 1) with parameters: {'season': '2022-23'}
Fetched 4972 players for season 2022-23
Total players fetched: 4972
Fetching data using CommonPlayerInfo (Attempt 1) with parameters: {'player_id': 2544}
Sample player info for LeBron James:
   PERSON_ID FIRST_NAME LAST_NAME DISPLAY_FIRST_LAST DISPLAY_LAST_COMMA_FIRST  \
0       2544     LeBron     James       LeBron James            James, LeBron   

  DISPLAY_FI_LAST   PLAYER_SLUG            BIRTHDATE  \
0        L. James  lebron-james  1984-12-30T00:00:00   

                         SCHOOL COUNTRY  ...    PLAYERCODE FROM_YEAR TO_YEAR  \
0  St. Vincent-St. Mary HS (OH)     USA  ...  lebron_james      2003    2024   

   DLEAGUE_FLAG NBA_FLAG GAMES_PLAYED_FLAG DRAFT_YEAR DRAFT_ROUND  \
0             N        Y                 Y       2003           1   

   DRAFT_NUMBER GREATEST_75_FLAG  
0             1                Y  

[1 rows x 33 columns]
Fetching data using PlayerCareerStats (Attempt 1) wi

In [58]:
# %%writefile ../src/scrape_utils.py
import pandas as pd
import requests
from bs4 import BeautifulSoup
from io import StringIO
import time

def scrape_salary_cap_history(debug=False):
    """Scrape the salary cap history table from RealGM into a DataFrame.

    The 'Season' column is reduced to its ``YYYY-YYYY`` part, 'Salary Cap' is
    converted to float, and every other column is converted to numeric with
    unparsable cells becoming NaN.

    Args:
        debug: When True, progress and error messages are printed.

    Returns:
        pandas.DataFrame on success, or ``None`` when the table cannot be
        found or any error (network, parsing, conversion) occurs.
    """
    def _strip_currency(series):
        # regex=False makes '$' a literal character on every pandas version;
        # interpreted as a regex it is an end-of-string anchor and removes nothing.
        return series.str.replace('$', '', regex=False).str.replace(',', '', regex=False)

    url = "https://basketball.realgm.com/nba/info/salary_cap"
    try:
        # A timeout keeps a stalled connection from hanging the caller; the
        # resulting exception is handled by the except clause below.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table', class_='basketball compact')

        if not table:
            if debug:
                print("Could not find the salary cap table on the page.")
            return None

        headers = [th.text.strip() for th in table.find('thead').find_all('th')]
        data = []
        for row in table.find('tbody').find_all('tr'):
            cols = row.find_all('td')
            if cols:  # skip rows without data cells
                data.append([col.text.strip() for col in cols])

        df = pd.DataFrame(data, columns=headers)

        # Clean up the data
        df['Season'] = df['Season'].str.extract(r'(\d{4}-\d{4})', expand=False)
        df['Salary Cap'] = _strip_currency(df['Salary Cap']).astype(float)

        # Convert other columns to float, handling non-numeric values
        for col in df.columns:
            if col not in ['Season', 'Salary Cap']:
                df[col] = pd.to_numeric(_strip_currency(df[col]), errors='coerce')

        if debug:
            print("Salary cap data scraped successfully")
            print(df.head())
        return df
    except Exception as e:
        # Deliberate best-effort: callers receive None instead of an exception.
        if debug:
            print(f"Error scraping salary cap history: {str(e)}")
        return None

DELAY_BETWEEN_REQUESTS = 3  # seconds

def scrape_player_salary_data(start_season, end_season, player_filter=None, debug=False):
    """Scrape per-player salaries from HoopsHype for a range of seasons.

    Args:
        start_season: First season start year (int), inclusive.
        end_season: Last season start year (int), inclusive.
        player_filter: ``None`` or ``'all'`` (case-insensitive) keeps every
            player; any other value keeps only rows whose player name matches
            it case-insensitively.
        debug: When True, progress messages are printed.

    Returns:
        pandas.DataFrame with columns 'Player', 'Salary' (int) and 'Season'
        (formatted like "2022-23"). The columns are present even when nothing
        was scraped, so callers can filter on them safely.

    Raises:
        requests.exceptions.RequestException: If a request fails or times out.
        ValueError: If a salary cell cannot be parsed as an integer.
    """
    # Normalise the filter once instead of on every table row.
    wanted = None
    if player_filter is not None and player_filter.lower() != 'all':
        wanted = player_filter.lower()

    all_data = []

    for season in range(start_season, end_season + 1):
        season_str = f"{season}-{str(season+1)[-2:]}"
        url = f"https://hoopshype.com/salaries/players/{season}-{season+1}/"
        if debug:
            print(f"Scraping data for {season_str} from URL: {url}")
        # A timeout keeps a stalled connection from hanging the whole run.
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', class_='hh-salaries-ranking-table')

        if table:
            for row in table.find_all('tr')[1:]:  # first row is the header
                cols = row.find_all('td')
                if len(cols) < 3:
                    continue
                player = cols[1].get_text(strip=True)
                if wanted is not None and player.lower() != wanted:
                    continue
                salary_text = cols[2].get_text(strip=True)
                salary = int(salary_text.replace('$', '').replace(',', ''))
                all_data.append({'Player': player, 'Salary': salary, 'Season': season_str})
        elif debug:
            print(f"No salary data found for season {season_str}")

        time.sleep(DELAY_BETWEEN_REQUESTS)  # Delay between requests to avoid hitting rate limits

    # Explicit columns keep the schema stable when no rows were collected.
    df = pd.DataFrame(all_data, columns=['Player', 'Salary', 'Season'])
    if debug:
        print(f"Scraped salary data for {'all players' if wanted is None else player_filter} from seasons {start_season}-{end_season}:")
        print(df.head())
    return df

def scrape_team_salary_data(season, debug=False):
    """Scrape team payroll totals from HoopsHype for one season.

    Args:
        season: Season string inserted verbatim into the URL and stored in the
            'Season' column of the result.
            NOTE(review): scrape_player_salary_data requests URLs of the form
            "2022-2023" while callers pass "2022-23" here — confirm the site
            accepts the short form.
        debug: When True, progress messages are printed.

    Returns:
        pandas.DataFrame with columns 'Team', 'Team_Salary' (int) and
        'Season'. The columns are present even when the salary table is
        missing, so callers can test ``.empty`` or merge on 'Team'/'Season'.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
        ValueError: If a salary cell cannot be parsed as an integer.
    """
    url = f"https://hoopshype.com/salaries/{season}/"
    # A timeout keeps a stalled connection from hanging the caller.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', class_='hh-salaries-ranking-table')

    data = []
    if table is None:
        # Same best-effort handling as the player scraper: no table, no rows.
        if debug:
            print(f"No team salary data found for season {season}")
    else:
        for row in table.find_all('tr')[1:]:  # first row is the header
            cols = row.find_all('td')
            if len(cols) < 3:
                # Spacer/malformed rows carry no team or salary cell.
                continue
            team = cols[1].get_text(strip=True)
            salary = int(cols[2].get_text(strip=True).replace('$', '').replace(',', ''))
            data.append({'Team': team, 'Team_Salary': salary, 'Season': season})

    # Explicit columns keep the schema stable when no rows were collected.
    df = pd.DataFrame(data, columns=['Team', 'Team_Salary', 'Season'])
    if debug:
        print(f"Scraped team salary data for season {season}:")
        print(df.head())
    return df

def scrape_advanced_metrics(player_name, season, debug=False, max_retries=3, retry_delay=60):
    """Scrape a player's advanced metrics for one season from Basketball Reference.

    The player is located through the site's search page, then the table with
    id 'advanced' on the player page is read and filtered to rows whose
    'Season' contains the first part of ``season`` (the text before '-').

    Args:
        player_name: Player name used as the search term.
        season: Season string such as "2022-23".
        debug: When True, progress and error messages are printed.
        max_retries: Number of attempts before giving up.
        retry_delay: Seconds to sleep after a 429 response and between attempts.

    Returns:
        dict mapping metric name (e.g. 'PER', 'WS/48') to the value in the
        first matching row; only metrics present in the table are included.
        An empty dict is returned when the search yields no player link or
        when every attempt fails.
    """
    def make_request(url):
        # Returns the response, or None after sleeping when the site answers
        # 429 (rate limited) so the caller can move on to the next attempt.
        response = requests.get(url)
        if response.status_code == 429:
            if debug:
                print(f"Rate limit hit. Waiting for {retry_delay} seconds before retrying.")
            time.sleep(retry_delay)
            return None
        return response

    for attempt in range(max_retries):
        try:
            search_url = f"https://www.basketball-reference.com/search/search.fcgi?search={player_name.replace(' ', '+')}"
            response = make_request(search_url)
            if response is None:
                # make_request already slept; skip the end-of-loop sleep.
                continue

            soup = BeautifulSoup(response.content, 'html.parser')
            search_results = soup.find('div', {'class': 'search-results'})

            if search_results:
                # Take the first search item whose link contains 'players'.
                for item in search_results.find_all('div', {'class': 'search-item'}):
                    link = item.find('a')
                    if link and 'players' in link['href']:
                        player_url = f"https://www.basketball-reference.com{link['href']}"
                        break
                else:
                    # Loop finished without a break: no player link found.
                    # This is treated as final (no retry).
                    if debug:
                        print(f"No player URL found for {player_name}")
                    return {}
            else:
                # No results container at all is also treated as final.
                if debug:
                    print(f"No search results found for {player_name}")
                return {}

            time.sleep(2)  # Wait 2 seconds between requests

            response = make_request(player_url)
            if response is None:
                continue

            soup = BeautifulSoup(response.content, 'html.parser')
            table = soup.find('table', {'id': 'advanced'})
            if table:
                df = pd.read_html(StringIO(str(table)))[0]
                # Flatten a two-level header by dropping its first level.
                if isinstance(df.columns, pd.MultiIndex):
                    df.columns = df.columns.droplevel()
                df['Season'] = df['Season'].astype(str)
                # Substring match on the season's first part (e.g. "2022").
                df = df[df['Season'].str.contains(season.split('-')[0], na=False)]
                if not df.empty:
                    row = df.iloc[0]
                    metrics = ['PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP']
                    result = {col: row[col] for col in metrics if col in row.index}
                    if debug:
                        print(f"Scraped advanced metrics for {player_name} in season {season}: {result}")
                    return result
                else:
                    # Falls through to the retry sleep below.
                    if debug:
                        print(f"No advanced metrics found for {player_name} in season {season}")
            else:
                # Falls through to the retry sleep below.
                if debug:
                    print(f"No advanced stats table found for {player_name}")

        except Exception as e:
            # Any error (network, parsing, missing attribute) counts as a
            # failed attempt; it is reported only in debug mode.
            if debug:
                print(f"Error scraping advanced metrics for {player_name}: {e}")
        
        # Sleep between attempts, but not after the last one.
        if attempt < max_retries - 1:
            if debug:
                print(f"Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)

    if debug:
        print(f"Failed to scrape advanced metrics for {player_name} after {max_retries} attempts")
    return {}

def load_injury_data(file_path='../data/processed/NBA Player Injury Stats(1951 - 2023).csv'):
    """Load the injury CSV and tag every record with its season label.

    'Date' is parsed to datetimes and a 'Season' column is added: dates in
    October or later belong to the season starting that year, earlier dates
    to the season that started the previous year (e.g. "2022-23").

    Args:
        file_path: Location of the injury CSV.

    Returns:
        pandas.DataFrame with the parsed 'Date' and the added 'Season'
        column, or ``None`` when the file does not exist.
    """
    def _season_label(when):
        if when.month >= 10:
            return f"{when.year}-{str(when.year+1)[-2:]}"
        return f"{when.year-1}-{str(when.year)[-2:]}"

    try:
        records = pd.read_csv(file_path)
        records['Date'] = pd.to_datetime(records['Date'])
        records['Season'] = records['Date'].apply(_season_label)
        print("Injury data loaded successfully")
        return records
    except FileNotFoundError:
        print("Injury data file not found. Proceeding without injury data.")
        return None

def merge_injury_data(player_data, injury_data):
    """Annotate player-season rows with injury periods, days missed and a risk label.

    For every row of ``player_data`` the injury log is searched for records of
    the same season whose 'Relinquished' text contains the player's name. Each
    such record opens an injury period that ends at the earliest later record
    whose 'Acquired' text contains the name, or at June 30 of the season's
    closing year when no such record exists.

    Args:
        player_data: DataFrame with at least 'Player' and 'Season' columns
            ('Season' starting with the four-digit start year, e.g. "2022-23").
        injury_data: DataFrame with 'Date' (datetime), 'Season',
            'Relinquished' and 'Acquired' columns, or ``None``.

    Returns:
        ``player_data`` itself when ``injury_data`` is None; otherwise a copy
        with added columns 'Injured' (bool), 'Injury_Periods' ('; '-joined
        "start - end" strings), 'Total_Days_Injured' (int) and 'Injury_Risk'
        ('Low Risk' < 10 days, 'Moderate Risk' 10-20 days, 'High Risk' above).
    """
    if injury_data is None:
        return player_data

    all_players_df = player_data.copy()
    all_players_df['Injured'] = False
    all_players_df['Injury_Periods'] = ''
    all_players_df['Total_Days_Injured'] = 0
    all_players_df['Injury_Risk'] = 'Low Risk'

    for index, row in all_players_df.iterrows():
        player = row['Player']
        season = row['Season']

        # regex=False: names such as "P.J. Tucker" contain regex
        # metacharacters and must be matched literally.
        was_relinquished = injury_data['Relinquished'].str.contains(
            player, case=False, na=False, regex=False)
        player_injuries = injury_data[(injury_data['Season'] == season) & was_relinquished]
        if player_injuries.empty:
            continue

        # Dates on which the player was reported as returning; computed once
        # per player instead of once per injury record.
        was_acquired = injury_data['Acquired'].str.contains(
            player, case=False, na=False, regex=False)
        return_dates = injury_data.loc[was_acquired, 'Date']

        periods = []
        total_days = 0
        for start_date in player_injuries['Date']:
            later_returns = return_dates[return_dates > start_date]
            if not later_returns.empty:
                # Earliest return after the injury, independent of row order.
                end_date = later_returns.min()
            else:
                # Assume the injury lasts until the end of the season. The
                # closing year is derived from the four-digit start year
                # because the suffix of "2022-23" ("23") is not a usable year.
                season_end = pd.Timestamp(year=int(str(season)[:4]) + 1, month=6, day=30)
                # Off-season injuries start after June 30; never go negative.
                end_date = max(season_end, start_date)

            total_days += (end_date - start_date).days
            periods.append(f"{start_date.strftime('%Y-%m-%d')} - {end_date.strftime('%Y-%m-%d')}")

        # Categorize injury risk based on total days
        if total_days < 10:
            risk = 'Low Risk'
        elif total_days <= 20:
            risk = 'Moderate Risk'
        else:
            risk = 'High Risk'

        all_players_df.at[index, 'Injured'] = True
        all_players_df.at[index, 'Injury_Periods'] = '; '.join(periods)
        all_players_df.at[index, 'Total_Days_Injured'] = total_days
        all_players_df.at[index, 'Injury_Risk'] = risk

    return all_players_df

if __name__ == "__main__":
    # Example usage and testing of all functions. The scrapers perform live
    # HTTP requests, so this block is slow and depends on the remote sites.
    debug = True
    start_season = 2022
    end_season = 2023
    sample_player = "Ja Morant"  # Example player

    print("1. Testing scrape_salary_cap_history:")
    salary_cap_history = scrape_salary_cap_history(debug=debug)

    print("\n2. Testing scrape_player_salary_data:")
    player_salary_data = scrape_player_salary_data(start_season, end_season, player_filter=sample_player, debug=debug)

    print("\n3. Testing scrape_team_salary_data:")
    team_salary_data = scrape_team_salary_data(f"{start_season}-{str(start_season+1)[-2:]}", debug=debug)

    print("\n4. Testing scrape_advanced_metrics:")
    advanced_metrics = scrape_advanced_metrics(sample_player, f"{start_season}-{str(start_season+1)[-2:]}", debug=debug)
    print(f"Advanced Metrics for {sample_player}:")
    print(advanced_metrics)

    print("\n5. Testing load_injury_data and merge_injury_data:")
    injury_data = load_injury_data()  # None when the CSV file is missing
    if not player_salary_data.empty and injury_data is not None:
        merged_data = merge_injury_data(player_salary_data, injury_data)
        print("Merged data with injury info:")
        # Show the injury columns only if the merge actually produced them.
        columns_to_display = ['Player', 'Season', 'Salary']
        columns_to_display += [
            column
            for column in ('Injured', 'Injury_Periods', 'Total_Days_Injured', 'Injury_Risk')
            if column in merged_data.columns
        ]
        print(merged_data[columns_to_display].head())

    if not player_salary_data.empty:
        avg_salary = player_salary_data['Salary'].mean()
        print(f"Average salary for {sample_player} from {start_season} to {end_season}: ${avg_salary:,.2f}")

    if not team_salary_data.empty:
        highest_team_salary = team_salary_data.loc[team_salary_data['Team_Salary'].idxmax()]
        print(f"Team with highest salary in {start_season}-{end_season}: {highest_team_salary['Team']} (${highest_team_salary['Team_Salary']:,.2f})")

    # load_injury_data returns None when the file is missing, so guard
    # against that before touching DataFrame attributes.
    if injury_data is not None and not injury_data.empty:
        # Literal, NaN-safe match on the player name.
        injury_count = injury_data['Relinquished'].str.contains(
            sample_player, case=False, na=False, regex=False).sum()
        print(f"Number of injuries/illnesses for {sample_player} from {start_season} to {end_season}: {injury_count}")

    print("\nAll tests completed.")


1. Testing scrape_salary_cap_history:
Salary cap data scraped successfully
       Mid-Level Exception     Season   Salary Cap   Luxury Tax    1st Apron  \
0 NaN                  NaN  2034-2035  364650000.0  443050000.0  461955000.0   
1 NaN                  NaN  2033-2034  331500000.0  402773000.0  419960000.0   
2 NaN                  NaN  2032-2033  301364000.0  366157000.0  381781000.0   
3 NaN                  NaN  2031-2032  273967000.0  332870000.0  347074000.0   
4 NaN                  NaN  2030-2031  249061000.0  302609000.0  315522000.0   

     2nd Apron         BAE  Standard /Non-Taxpayer    Taxpayer  \
0  489965000.0  12106000.0              33256000.0  13404000.0   
1  445423000.0  11006000.0              30233000.0  12186000.0   
2  404929000.0  10005000.0              27484000.0  11078000.0   
3  368118000.0   9096000.0              24986000.0  10071000.0   
4  334652000.0   8269000.0              22714000.0   9155000.0   

   Team Room /Under Cap  
0            20705000

In [59]:
# %%writefile ../src/process_utils.py
import pandas as pd
import numpy as np
from datetime import datetime
import cpi
# from fetch_utils import fetch_player_info, fetch_career_stats, fetch_league_standings
# from scrape_utils import scrape_advanced_metrics

def inflate_value(value, year_str, debug=False):
    """Convert a dollar amount from a given year into 2022 dollars.

    Args:
        value: Amount to adjust.
        year_str: String whose first four characters are the year
            (e.g. "2019" or "2019-20").
        debug: When True, diagnostic messages are printed.

    Returns:
        The inflation-adjusted amount. ``value`` is returned unchanged when
        the year cannot be parsed, when the year is the current year or
        later, when no CPI data exists yet for a recent year, or when any
        other error occurs during the adjustment.
    """
    try:
        year = int(year_str[:4])
        current_year = datetime.now().year

        if year >= current_year:
            return value  # Return the original value for future years
        # Adjust to 2022 dollars to match the original data
        inflated_value = cpi.inflate(value, year, to=2022)
        if debug:
            print(f"Inflated value {value} from {year} to {inflated_value} (2022 dollars)")
        return inflated_value
    except ValueError:
        if debug:
            print(f"Invalid year format: {year_str}")
        return value
    except cpi.errors.CPIObjectDoesNotExist:
        # NOTE(review): relies on the cpi package exposing its annual CPI-U
        # index through SURVEYS — confirm against the installed version.
        earliest_year = min(cpi.SURVEYS['CPI-U'].indexes['annual'].keys()).year
        if year >= earliest_year:
            # The data is missing because the year is too recent, not too
            # old. Substituting the earliest year here would inflate the
            # amount as if it were decades old, so leave it unchanged.
            if debug:
                print(f"No CPI data available for {year}; value left unchanged")
            return value
        # The year predates the series: use the earliest available year.
        inflated_value = cpi.inflate(value, earliest_year, to=2022)
        if debug:
            print(f"Used earliest available year {earliest_year} for inflation calculation")
        return inflated_value
    except Exception as e:
        # Best-effort: never let an inflation problem abort the pipeline.
        if debug:
            print(f"Error inflating value for year {year_str}: {e}")
        return value

def calculate_percentages(df, debug=False):
    """Add shooting percentage columns to ``df`` in place and return it.

    Adds 'FG%', '3P%', '2P%', 'FT%' (makes divided by attempts) and 'eFG%'
    ((FG + 0.5 * 3P) / FGA). Zero attempts are treated as missing, so the
    corresponding percentage is NaN instead of a division by zero.

    Args:
        df: DataFrame with 'FG', 'FGA', '3P', '3PA', '2P', '2PA', 'FT', 'FTA'.
        debug: When True, the first rows of the new columns are printed.

    Returns:
        The same DataFrame object, with the percentage columns added.
    """
    made_and_attempted = (('FG', 'FGA'), ('3P', '3PA'), ('2P', '2PA'), ('FT', 'FTA'))
    for made_col, attempt_col in made_and_attempted:
        attempts = df[attempt_col].replace(0, np.nan)
        df[f'{made_col}%'] = df[made_col] / attempts

    # Effective field goal percentage gives three-pointers 1.5x weight.
    weighted_makes = df['FG'] + 0.5 * df['3P']
    df['eFG%'] = weighted_makes / df['FGA'].replace(0, np.nan)

    if debug:
        print("Calculated percentages:")
        print(df[['FG%', '3P%', '2P%', 'FT%', 'eFG%']].head())
    return df

def process_player_data(player, season, all_players, debug=False):
    """Collect one player's statistics for a season into a flat dict.

    Args:
        player: Player display name; looked up case-insensitively.
        season: Season string such as "2022-23".
        all_players: Mapping of lower-cased player name to a dict with
            'player_id' and 'team_id' (as built by fetch_all_players).
        debug: When True, progress messages are printed.

    Returns:
        dict produced by calculate_player_stats plus 'Player' and 'Season',
        or ``None`` when the player is unknown, the player info / career
        stats could not be fetched, or no stats exist for the season.
    """
    player_key = player.lower()
    if player_key not in all_players:
        if debug:
            print(f"No player ID found for {player}")
        return None

    player_id = all_players[player_key]['player_id']
    team_id = all_players[player_key]['team_id']

    player_info = fetch_player_info(player_id, debug=debug)
    career_stats = fetch_career_stats(player_id, debug=debug)

    # Validate the core data before issuing the remaining (slow) requests.
    if player_info is None or player_info.empty or career_stats is None or career_stats.empty:
        if debug:
            print(f"Unable to fetch complete data for {player}")
        return None

    season_start = season.split('-')[0]
    season_stats = career_stats[career_stats['SEASON_ID'].str.contains(season_start, na=False)]
    if season_stats.empty:
        if debug:
            print(f"No stats found for {player} in season {season}")
        return None

    latest_season_stats = season_stats.iloc[0]

    # Only now fetch standings and scrape advanced metrics from Basketball
    # Reference: the scrape can sleep for minutes while retrying, which is
    # wasted time for a player that is going to be skipped anyway.
    league_standings = fetch_league_standings(season, debug=debug)
    advanced_metrics = scrape_advanced_metrics(player, season, debug=debug)

    try:
        draft_year = int(player_info['DRAFT_YEAR'].iloc[0])
    except (ValueError, TypeError):
        # Non-numeric or missing draft year: fall back to the first season.
        draft_year = int(player_info['FROM_YEAR'].iloc[0])

    current_season_year = int(season_start)
    years_of_service = max(0, current_season_year - draft_year)

    # Handle missing league standings gracefully
    if league_standings is None or league_standings.empty:
        league_standings = pd.DataFrame()

    player_stats = calculate_player_stats(
        latest_season_stats, player_info, years_of_service, team_id, league_standings, advanced_metrics)
    player_stats.update({'Player': player, 'Season': season})

    if debug:
        print(f"Processed data for {player} in season {season}: {player_stats}")
    return player_stats

def calculate_player_stats(stats, player_info, years_of_service, team_id, league_standings, advanced_metrics):
    """Assemble a flat dict of box-score, shooting and team-record values.

    Args:
        stats: Mapping-like object offering ``.get`` with the season totals
            (keys such as 'FGM', 'FGA', 'FG3M', 'PTS').
        player_info: DataFrame whose first row supplies 'POSITION'.
        years_of_service: Value stored under 'Years of Service'.
        team_id: Team identifier stored under 'TeamID' and used to look up
            the team's record in ``league_standings``.
        league_standings: DataFrame with 'TeamID', 'WINS', 'LOSSES', or
            ``None``/empty when standings are unavailable.
        advanced_metrics: dict merged into the result after the box-score keys.

    Returns:
        dict of player statistics; 'Wins' and 'Losses' are present only when
        the team is found in the standings.
    """
    # Missing or falsy shooting totals count as zero.
    made = stats.get('FGM', 0) or 0
    attempted = stats.get('FGA', 0) or 0
    made_3 = stats.get('FG3M', 0) or 0
    attempted_3 = stats.get('FG3A', 0) or 0

    if attempted != 0:
        effective_fg = (made + 0.5 * made_3) / attempted
    else:
        effective_fg = 0

    # Two-point figures are derived from totals minus three-pointers.
    made_2 = made - made_3
    attempted_2 = attempted - attempted_3
    if attempted_2 != 0:
        two_point_pct = made_2 / attempted_2
    else:
        two_point_pct = 0

    summary = {
        'Position': player_info.iloc[0]['POSITION'],
        'Age': stats.get('PLAYER_AGE', None),
        'Team': stats.get('TEAM_ABBREVIATION', None),
        'TeamID': team_id,
        'Years of Service': years_of_service,
    }
    for label, source_key in (('GP', 'GP'), ('GS', 'GS'), ('MP', 'MIN')):
        summary[label] = stats.get(source_key, None)

    summary.update({
        'FG': made,
        'FGA': attempted,
        'FG%': stats.get('FG_PCT', None),
        '3P': made_3,
        '3PA': attempted_3,
        '3P%': stats.get('FG3_PCT', None),
        '2P': made_2,
        '2PA': attempted_2,
        '2P%': two_point_pct,
        'eFG%': effective_fg,
    })

    # Remaining box-score values are copied straight from the source keys.
    passthrough = (
        ('FT', 'FTM'), ('FTA', 'FTA'), ('FT%', 'FT_PCT'),
        ('ORB', 'OREB'), ('DRB', 'DREB'), ('TRB', 'REB'),
        ('AST', 'AST'), ('STL', 'STL'), ('BLK', 'BLK'),
        ('TOV', 'TOV'), ('PF', 'PF'), ('PTS', 'PTS'),
    )
    for label, source_key in passthrough:
        summary[label] = stats.get(source_key, None)

    # Add advanced metrics
    summary.update(advanced_metrics)

    if league_standings is None or league_standings.empty:
        return summary

    team_rows = league_standings[league_standings['TeamID'] == team_id]
    if not team_rows.empty:
        wins, losses = team_rows['WINS'].values[0], team_rows['LOSSES'].values[0]
        summary['Wins'] = wins
        summary['Losses'] = losses

    return summary

if __name__ == "__main__":
    # Example usage of the processing helpers. The first two checks are
    # purely local; the last one performs live requests through the fetch
    # and scrape helpers.
    debug = True
    season = "2022-23"
    sample_value = 1000000
    sample_year = "2022"
    sample_player = "LeBron James"
    
    # Test inflate_value (target is 2022 dollars, so a 2022 input should
    # come back unchanged)
    inflated_value = inflate_value(sample_value, sample_year, debug=debug)
    print(f"Inflated value: {inflated_value}")
    
    # Test calculate_percentages on a single hand-made row
    sample_df = pd.DataFrame({
        'FG': [100], 'FGA': [200],
        '3P': [50], '3PA': [100],
        '2P': [50], '2PA': [100],
        'FT': [75], 'FTA': [100]
    })
    calculated_df = calculate_percentages(sample_df, debug=debug)
    print("Calculated percentages:")
    print(calculated_df)
    
    # Test process_player_data
    # Note: this needs fetch_all_players (fetch_utils) and the scrape helpers
    # (scrape_utils) to be defined in the current session; the module imports
    # at the top of this cell are commented out.
    # from fetch_utils import fetch_all_players
    all_players = fetch_all_players(season, debug=debug)
    if sample_player.lower() in all_players:
        player_data = process_player_data(sample_player, season, all_players, debug=debug)
        print(f"Processed data for {sample_player}:")
        print(player_data)
    else:
        print(f"Player {sample_player} not found in the {season} season data.")

Inflated value 1000000 from 2022 to 1000000 (2022 dollars)
Inflated value: 1000000
Calculated percentages:
   FG%  3P%  2P%   FT%   eFG%
0  0.5  0.5  0.5  0.75  0.625
Calculated percentages:
    FG  FGA  3P  3PA  2P  2PA  FT  FTA  FG%  3P%  2P%   FT%   eFG%
0  100  200  50  100  50  100  75  100  0.5  0.5  0.5  0.75  0.625
Fetching data using CommonAllPlayers (Attempt 1) with parameters: {'season': '2022-23'}
Fetched 4972 players for season 2022-23
Fetching data using CommonPlayerInfo (Attempt 1) with parameters: {'player_id': 2544}
Fetching data using PlayerCareerStats (Attempt 1) with parameters: {'player_id': 2544}
Fetching data using LeagueStandings (Attempt 1) with parameters: {'season': '2022-23'}
Scraped advanced metrics for LeBron James in season 2022-23: {'PER': 23.9, 'TS%': 0.583, '3PAr': 0.309, 'FTr': 0.268, 'ORB%': 3.7, 'DRB%': 20.8, 'TRB%': 12.5, 'AST%': 33.5, 'STL%': 1.2, 'BLK%': 1.4, 'TOV%': 11.6, 'USG%': 33.3, 'OWS': 3.2, 'DWS': 2.4, 'WS': 5.6, 'WS/48': 0.138, 'OBPM': 5

In [60]:
# %%writefile ../src/data_utils.py

import pandas as pd
import numpy as np

def clean_dataframe(df):
    """Return a tidied copy of ``df``.

    Steps, in order: drop columns whose name starts with 'Unnamed', drop
    repeated column names (first occurrence kept), drop all-NaN columns and
    rows, collapse multiple columns containing 'Season' into a single
    'Season' column (the first one is kept), drop '3PAr' and 'FTr', and
    round numeric columns to two decimals.

    Args:
        df: DataFrame with string column labels.

    Returns:
        The cleaned DataFrame.
    """
    # Remove unnamed columns
    is_unnamed = df.columns.str.contains('^Unnamed')
    df = df.loc[:, ~is_unnamed]

    # Remove duplicate columns
    is_repeat = df.columns.duplicated()
    df = df.loc[:, ~is_repeat]

    # Remove columns, then rows, that hold nothing but NaN
    df = df.dropna(axis=1, how='all').dropna(axis=0, how='all')

    # Ensure only one 'Season' column exists
    season_like = [name for name in df.columns if 'Season' in name]
    if len(season_like) > 1:
        keeper, *extras = season_like
        df = df.rename(columns={keeper: 'Season'})
        for extra in extras:
            df = df.drop(columns=[extra])

    # Remove '3PAr' and 'FTr' columns
    df = df.drop(columns=['3PAr', 'FTr'], errors='ignore')

    # Round numeric columns to 2 decimal places
    numeric_names = df.select_dtypes(include=[np.number]).columns
    df[numeric_names] = df[numeric_names].round(2)

    return df

def merge_salary_cap_data(player_data, salary_cap_data):
    """Attach salary cap figures (and an inflation-adjusted cap) to player rows.

    Rows are matched on the four-digit start year of their 'Season' value.
    When ``player_data`` already has a cap column, its missing values are
    filled from the cap table. Neither input frame is modified.

    Args:
        player_data: DataFrame with a string 'Season' column.
        salary_cap_data: DataFrame with 'Season' and 'Salary Cap' columns;
            rows without a season are ignored.

    Returns:
        Merged DataFrame, passed through clean_dataframe.
    """
    # Work on copies so the helper columns added below do not leak into the
    # caller's frames. Cap rows without a season cannot be matched to a year.
    player_data = player_data.copy()
    salary_cap_data = salary_cap_data.dropna(subset=['Season']).copy()

    player_data['Season_Year'] = player_data['Season'].str[:4].astype(int)
    salary_cap_data['Season_Year'] = salary_cap_data['Season'].str[:4].astype(int)

    # Add inflation-adjusted salary cap
    salary_cap_data['Salary_Cap_Inflated'] = [
        inflate_value(cap, season_label)
        for cap, season_label in zip(salary_cap_data['Salary Cap'], salary_cap_data['Season'])
    ]

    # Merge salary cap data; clashing cap-table columns get a '_cap' suffix.
    merged_data = pd.merge(player_data, salary_cap_data, on='Season_Year', how='left', suffixes=('', '_cap'))

    # Update salary cap columns: fill gaps in existing columns from the cap
    # table, then drop the suffixed duplicates.
    cap_columns = ['Mid-Level Exception', 'Salary Cap', 'Luxury Tax', '1st Apron', '2nd Apron', 'BAE',
                   'Standard /Non-Taxpayer', 'Taxpayer', 'Team Room /Under Cap', 'Salary_Cap_Inflated']
    for col in cap_columns:
        suffixed = f'{col}_cap'
        if suffixed in merged_data.columns:
            merged_data[col] = merged_data[col].fillna(merged_data[suffixed])
            merged_data = merged_data.drop(columns=[suffixed])

    # Clean up temporary columns
    merged_data = merged_data.drop(columns=['Season_Year'])

    # Clean the dataframe
    return clean_dataframe(merged_data)

def validate_data(df):
    """Print data-quality warnings for ``df`` and return it unchanged.

    Checks performed: missing values per column, duplicate rows, dtypes of a
    fixed set of known columns, and negative values in numeric columns.

    Args:
        df: DataFrame to inspect.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Check for missing values
    null_counts = df.isnull().sum()
    if null_counts.sum() > 0:
        print("Warning: Missing values found in the following columns:")
        print(null_counts[null_counts > 0])

    # Check for duplicate rows
    duplicate_total = df.duplicated().sum()
    if duplicate_total > 0:
        print(f"Warning: {duplicate_total} duplicate rows found")

    # Check data types
    expected_types = dict.fromkeys(['Season', 'Player'], 'object')
    expected_types.update(dict.fromkeys(
        ['Age', 'GP', 'MP', 'Salary', 'Team_Salary', 'Salary Cap', 'Salary_Cap_Inflated'],
        'float64',
    ))
    for col, expected_type in expected_types.items():
        if col not in df.columns:
            continue
        actual_type = df[col].dtype
        if str(actual_type) != expected_type:
            print(f"Warning: Column '{col}' has type {actual_type}, expected {expected_type}")

    # Check for negative values in numeric columns
    for col in df.select_dtypes(include=[np.number]).columns:
        has_negative = (df[col] < 0).any()
        if has_negative:
            print(f"Warning: Negative values found in column '{col}'")

    return df

# def merge_injury_data(player_data, injury_data):
#     merged_data = player_data.copy()
#     merged_data['Injured'] = False
#     merged_data['Injury_Periods'] = ''

#     for index, row in merged_data.iterrows():
#         player_injuries = injury_data[(injury_data['Season'] == row['Season']) & 
#                                       (injury_data['Relinquished'].str.contains(row['Player'], case=False, na=False))]
#         if not player_injuries.empty:
#             merged_data.at[index, 'Injured'] = True
#             injury_periods = player_injuries.apply(lambda x: f"{x['Date'].strftime('%Y-%m-%d')} - {x['Notes']}", axis=1).tolist()
#             merged_data.at[index, 'Injury_Periods'] = '; '.join(injury_periods)

#     return merged_data

In [68]:
# %%writefile ../src/main.py
import argparse
import pandas as pd
# from fetch_utils import fetch_all_players
# from process_utils import process_player_data, inflate_value, calculate_percentages
# from scrape_utils import scrape_salary_cap_history, scrape_player_salary_data, scrape_team_salary_data, scrape_injury_data
# from data_utils import load_existing_data, clean_dataframe, merge_salary_cap_data, validate_data, merge_injury_data


def update_data(existing_data, start_year, end_year, player_filter=None, min_avg_minutes=None, debug=False):
    """Refresh player data for a range of seasons and merge it into the existing set.

    For every season from ``start_year`` through ``end_year`` (inclusive) the
    scraped salary rows are matched by lower-cased name against the players
    returned by ``fetch_all_players``; matched players get their stats from
    ``process_player_data``, then team salary and injury data are merged in.
    Rows of ``existing_data`` whose (Season, Player) pair was re-fetched are
    replaced by the new rows.

    Args:
        existing_data: Previously saved DataFrame, or None when there is none.
        start_year: First season start year to process.
        end_year: Last season start year to process (inclusive).
        player_filter: Player name to restrict to; None, empty or 'all'
            (case-insensitive) disables the name filter.
        min_avg_minutes: When not None, newly fetched rows with 'MP' below
            this value are dropped.
        debug: Print progress information when True.

    Returns:
        The combined DataFrame, sorted by Season and Player and passed through
        ``calculate_percentages`` and ``clean_dataframe``. When no rows are
        available at all, the empty frame is returned without post-processing.
    """
    all_data = existing_data.copy() if existing_data is not None else pd.DataFrame()

    # Load injury data
    injury_data = load_injury_data()

    salary_data = scrape_player_salary_data(start_year, end_year, player_filter=player_filter, debug=debug)

    # Normalise the name filter once; None means "no per-player restriction".
    wanted_player = None
    if player_filter and player_filter.lower() != 'all':
        wanted_player = player_filter.lower()

    # Collect one frame per season and concatenate once after the loop rather
    # than re-copying an ever-growing frame on every iteration.
    season_frames = []

    for year in range(start_year, end_year + 1):
        season = f"{year}-{str(year+1)[-2:]}"

        if debug:
            print(f"Processing season: {season}")

        team_salary_data = scrape_team_salary_data(season, debug=debug)
        all_players = fetch_all_players(season=season, debug=debug)

        season_salary_data = salary_data[salary_data['Season'] == season]

        if wanted_player is not None:
            season_salary_data = season_salary_data[season_salary_data['Player'].str.lower() == wanted_player]

        additional_stats = []

        for _, salary_row in season_salary_data.iterrows():
            player_name = salary_row['Player']

            if player_name.lower() not in all_players:
                if debug:
                    print(f"Player not found in all_players: {player_name}")
                continue

            player_stats = process_player_data(player_name, season, all_players, debug=debug)
            if player_stats:
                player_stats['Salary'] = salary_row['Salary']
                additional_stats.append(player_stats)

        additional_stats_df = pd.DataFrame(additional_stats)

        # The team-salary merge below needs a 'Team' column to join on.
        if additional_stats_df.empty or 'Team' not in additional_stats_df.columns:
            if debug:
                print(f"Warning: No valid player stats data for season {season}")
            continue

        # Merge team salary data
        merged_data = pd.merge(additional_stats_df, team_salary_data, on=['Team', 'Season'], how='left', suffixes=('', '_team'))

        # Apply minimum average minutes filter if specified
        if min_avg_minutes is not None:
            before_filter = len(merged_data)
            merged_data = merged_data[merged_data['MP'] >= min_avg_minutes]
            if debug:
                print(f"Filtered {before_filter - len(merged_data)} players based on minimum average minutes")

        # Merge injury data
        merged_data = merge_injury_data(merged_data, injury_data)

        season_frames.append(merged_data)

    if season_frames:
        new_data = pd.concat(season_frames, ignore_index=True, sort=False)
    else:
        new_data = pd.DataFrame()

    # Remove existing rows only for the exact (Season, Player) pairs we just
    # re-fetched. Testing the two columns independently would also drop rows
    # for pairs that were never updated (e.g. a player refreshed in one season
    # but filtered out in another), silently losing data.
    if not all_data.empty and not new_data.empty:
        refreshed_pairs = set(zip(new_data['Season'], new_data['Player']))
        keep_mask = pd.Series(
            [pair not in refreshed_pairs for pair in zip(all_data['Season'], all_data['Player'])],
            index=all_data.index,
            dtype=bool,
        )
        all_data = all_data[keep_mask]

    # Combine existing data with new data
    all_data = pd.concat([all_data, new_data], ignore_index=True, sort=False)

    # Nothing to post-process: sorting a column-less frame by Season/Player
    # would raise KeyError, so hand the empty frame back to the caller.
    if all_data.empty:
        if debug:
            print("No data available after update; skipping post-processing")
        return all_data

    # Sort the final data by season and player
    all_data.sort_values(by=['Season', 'Player'], inplace=True)

    # Calculate percentages
    all_data = calculate_percentages(all_data)

    # Clean the dataframe
    all_data = clean_dataframe(all_data)

    if debug:
        print(f"Final data shape: {all_data.shape}")
        print(f"Columns: {all_data.columns.tolist()}")

    return all_data

def main(start_year, end_year, player_filter=None, min_avg_minutes=None, debug=False):
    """Run a full update and persist the results to the processed CSV files.

    Loads the existing processed data, calls ``update_data`` for the requested
    years, and, when the result is non-empty, scrapes the salary cap history
    (saving and merging it when available), cleans the frame, writes it to the
    processed CSV and prints a short summary. Any exception raised during the
    update is reported with its traceback instead of propagating.

    Args:
        start_year: First season start year to update.
        end_year: Last season start year to update.
        player_filter: Player name filter forwarded to ``update_data``.
        min_avg_minutes: Minimum-minutes filter forwarded to ``update_data``.
        debug: Print progress information when True.
    """
    processed_file_path = '../data/processed/nba_player_data_final_inflated.csv'
    cap_history_csv = '../data/processed/salary_cap_history_inflated.csv'

    previous = load_existing_data(processed_file_path)

    try:
        if debug:
            print(f"Updating data for years {start_year} to {end_year}")
        updated_data = update_data(previous, start_year, end_year, player_filter, min_avg_minutes, debug=debug)

        # Guard clause: nothing came back, so there is nothing to write.
        if updated_data.empty:
            print("No new data to save. The dataset is empty.")
            return

        if debug:
            print("New data retrieved. Processing and saving...")

        cap_history = scrape_salary_cap_history(debug=debug)
        if cap_history is not None:
            # Persist the cap history and attach it to the player rows.
            cap_history.to_csv(cap_history_csv, index=False)
            updated_data = merge_salary_cap_data(updated_data, cap_history)

        # Final cleaning pass before the frame is written out.
        updated_data = clean_dataframe(updated_data)

        # Write the refreshed dataset, floats formatted to two decimals.
        updated_data.to_csv(processed_file_path, index=False, float_format='%.2f')
        if debug:
            print(f"Updated data saved to {processed_file_path}")

        # Show the first rows, limited to the summary columns that exist.
        preferred = ['Season', 'Player', 'Salary', 'GP', 'PTS', 'TRB', 'AST', 'PER', 'WS', 'VORP', 'Injured', 'FG%', '3P%', 'FT%', 'Team_Salary', 'Salary Cap', 'Salary_Cap_Inflated']
        present = set(updated_data.columns)
        shown = list(filter(present.__contains__, preferred))
        print("\nData summary:")
        print(updated_data[shown].head().to_string(index=False))

    except Exception as e:
        # Top-level boundary: report the failure with its traceback.
        print(f"An error occurred: {str(e)}")
        print("Traceback:")
        import traceback
        traceback.print_exc()

# if __name__ == "__main__":
#     parser = argparse.ArgumentParser(description="Update NBA player data")
#     parser.add_argument("--start_year", type=int, required=True, help="Start year for data update")
#     parser.add_argument("--end_year", type=int, required=True, help="End year for data update")
#     parser.add_argument("--player_filter", type=str, default="all", help="Filter for specific player or 'all'")
#     parser.add_argument("--min_avg_minutes", type=float, default=25, help="Minimum average minutes per game")
#     parser.add_argument("--debug", action="store_true", help="Enable debug mode")

#     args = parser.parse_args()

#     main(args.start_year, args.end_year, args.player_filter, args.min_avg_minutes, args.debug)
    

if __name__ == "__main__":
    # Interactive entry point: fixed year range, player filter read from stdin.
    start_year = 2019
    end_year = 2023
    player_filter = input("Enter player name or 'all' for all players: ").strip()

    # The minimum-minutes filter is only requested when processing all players.
    min_avg_minutes = None
    if player_filter.lower() == 'all':
        raw_minutes = input("Enter the minimum average minutes per game (default 25 mins): ").strip()
        try:
            # Blank input takes the advertised default; stripping first means
            # whitespace-only input counts as blank instead of crashing float().
            min_avg_minutes = float(raw_minutes) if raw_minutes else 25.0
        except ValueError:
            print(f"Invalid minutes value {raw_minutes!r}; using default of 25.")
            min_avg_minutes = 25.0

    debug = True  # Set to False to disable debug output

    main(start_year, end_year, player_filter, min_avg_minutes, debug)






    

Loaded existing data with shape: (23, 66)
Updating data for years 2019 to 2023
Injury data loaded successfully
Scraping data for 2019-20 from URL: https://hoopshype.com/salaries/players/2019-2020/
Scraping data for 2020-21 from URL: https://hoopshype.com/salaries/players/2020-2021/
Scraping data for 2021-22 from URL: https://hoopshype.com/salaries/players/2021-2022/
Scraping data for 2022-23 from URL: https://hoopshype.com/salaries/players/2022-2023/
Scraping data for 2023-24 from URL: https://hoopshype.com/salaries/players/2023-2024/
Scraped salary data for Jayson Tatum from seasons 2019-2023:
         Player    Salary   Season
0  Jayson Tatum   7830000  2019-20
1  Jayson Tatum   9897120  2020-21
2  Jayson Tatum  28103500  2021-22
3  Jayson Tatum  30351780  2022-23
4  Jayson Tatum  32600060  2023-24
Processing season: 2019-20
Scraped team salary data for season 2019-20:
        Team  Team_Salary   Season
0    Phoenix    230718931  2019-20
1  Minnesota    209038266  2019-20
2     Bosto

KeyboardInterrupt: 