In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from modular.player_game_logs import load_nba_player_game_logs, prepare_upcoming_games_data
from modular.metrics_functions import prepare_mean_std_data, prepare_league_std_data, prepare_performance_against_all_teams
from modular.betting_functions import calculate_probability, calculate_bet_outcome, generate_betting_options

#------------Loading data with caching---------------
#load if data in winr dataset is not = to yesterday at max
#data = pd.read_csv('data/player_game_logs_winr.csv')
#print(data[['GAME_DATE']].max())
#print(data.columns)
load_nba_player_game_logs(['2023-24'], min_avg_minutes=25.0, save_path='data/player_game_logs_winr.csv')


# Loading data with caching
def load_data():
    data = pd.read_csv('data/player_game_logs_winr.csv')
    data['GAME_DATE'] = pd.to_datetime(data['GAME_DATE'])
    data.sort_values(by='GAME_DATE', inplace=True)
    return data

#pull in upcoming games to concatenate to data and input averages onto it
upcoming_games = prepare_upcoming_games_data('data/23_24_season_games.csv', 'data/player_game_logs_winr.csv', expand_with_players=True)

# Load the existing games data
previous_games = load_data()

# Ensure GAME_DATE is in datetime format for comparison
upcoming_games['GAME_DATE'] = pd.to_datetime(upcoming_games['GAME_DATE'])

#print max date for previous games and min date for upcoming games
#print("max date for previous games =", previous_games['GAME_DATE'].max())
#print("min date for upcoming games =", upcoming_games['GAME_DATE'].min())

# Filter out upcoming games that have dates already in previous games
unique_upcoming_games = upcoming_games[~upcoming_games['GAME_DATE'].isin(previous_games['GAME_DATE'])]

# only include these columns: ['GAME_DATE', 'MATCHUP', 'home_away', 'TEAM_NAME','OPPOSING_TEAM', 'PLAYER_ID', 'PLAYER_NAME']
# print("upcoming games columns =", unique_upcoming_games.columns)
# print("previous games columns =", previous_games.columns)

# Concatenate the unique upcoming games to the previous games dataset
data = pd.concat([previous_games, unique_upcoming_games], ignore_index=True)

# Sort the concatenated data by GAME_DATE to maintain chronological order
data.sort_values(by='GAME_DATE', inplace=True)

# Reset the index of the concatenated DataFrame
data.reset_index(drop=True, inplace=True)
#------------Loading data with caching---------------


# Select a date and player for testing
data['GAME_DATE'] = pd.to_datetime(data['GAME_DATE']).normalize()
selected_date = pd.to_datetime('2024-02-27')
selected_player = 'Luka Doncic'
game_location = 'Home'
game_opposing_team = 'Test Opponent'

# Extract player data for the selected date
current_data = data[data['GAME_DATE'] == selected_date]
player_data = current_data[current_data['PLAYER_NAME'] == selected_player]

# Debugging Data Types
print("DataFrame 'GAME_DATE':", data['GAME_DATE'].head())
print("DataFrame 'GAME_DATE' dtype:", data['GAME_DATE'].dtype)
print("'selected_date':", selected_date)
print("'selected_date' type:", type(selected_date))

# Displaying Data for Selected Player and Date
if not player_data.empty:
    game_location = 'Home' if player_data['home_away'].iloc[0] == 'Home' else 'Away'
    game_opposing_team = player_data['OPPONENT_NAME'].iloc[0]
    print(f"Data for {selected_player} ({game_location} game) against {game_opposing_team} on {selected_date}:")
    print(player_data[['GAME_DATE', 'TEAM_NAME', 'home_away', 'PLAYER_NAME', 'TEAM_WIN_RATE', 'OPPONENT_WIN_RATE', 'PTS', 'FG3M', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'MIN']])
else:
    print(f"No data available for {selected_player} on {selected_date}.")

# Filter data for the selected date to compute stats, league standards, and performance against all teams
current_stats_data = data[data['GAME_DATE'] <= selected_date]
total_games_played = current_stats_data[current_stats_data['PLAYER_NAME'] == selected_player].shape[0]
total_averages_data = prepare_mean_std_data(current_stats_data, total_games_played, game_location)
league_std_data = prepare_league_std_data(current_stats_data, total_games_played, game_location)
performance_against_all_teams_data = prepare_performance_against_all_teams(current_stats_data)

# Display relevant information for verification
print(f"Data for {selected_player} on {selected_date}:")
print(player_data)

# Total games played by the player in the dataset
print(f"Total games played by {selected_player} in the dataset: {total_games_played}")

# Betting analysis
# Assuming betting analysis is only relevant for past games, we filter out future games
bet_player_data = current_stats_data[current_stats_data['PLAYER_NAME'] == selected_player]
probability, _, _, player_std, std_dev_comparison, _, _ = calculate_probability(
    bet_player_data, 'PTS', 20, league_std_data, 10, 0.9, game_opposing_team
)
expected_profit, expected_loss, probability_weighted_to_profit = calculate_bet_outcome(100, 150, probability)

# Display betting analysis results
print(f"Probability of achieving projection: {probability * 100:.2f}%")
print(f"Expected Profit if Win: ${expected_profit:.2f}, Expected Loss if Lose: -${expected_loss:.2f}")

# Generating and displaying betting options
betting_options_df = generate_betting_options(bet_player_data, league_std_data, selected_player, game_opposing_team)
print("Betting Options:")
print(betting_options_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  upcoming_games.sort_values(by='DATE', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  upcoming_games['GAME_DATE'] = upcoming_games['DATE'].dt.strftime('%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  upcoming_games.rename(columns={'Home_Away': 'home_away'}, inplace=True)
  upcoming_games.at[index, 'OPPOSING_TEAM'] = away_team_full_name


Index(['SEASON_ID', 'Player_ID', 'Game_ID', 'GAME_DATE', 'MATCHUP', 'WL',
       'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA',
       'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
       'PTS', 'PLUS_MINUS', 'VIDEO_AVAILABLE', 'PLAYER_NAME',
       'TEAM_ABBREVIATION', 'OPPONENT_ABBREVIATION', 'TEAM_NAME',
       'OPPONENT_NAME', 'TEAM_WIN_RATE', 'OPPONENT_WIN_RATE', 'home_away'],
      dtype='object')
Index(['GAME_DATE', 'MATCHUP', 'home_away', 'TEAM_ID', 'TEAM_NAME',
       'OPPOSING_TEAM'],
      dtype='object')


AttributeError: 'Series' object has no attribute 'normalize'

In [3]:
        
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from nba_api.stats.static import teams

def prepare_upcoming_games_data(season_games_csv, player_game_logs_csv, expand_with_players=False):
    # Load season games data
    data = pd.read_csv(season_games_csv)
    
    # Process home and away data
    home_data = data[['DATE', 'Start (ET)', 'Home/Neutral']].copy()
    home_data['Home_Away'] = 'Home'
    home_data['MATCHUP'] = home_data['Home/Neutral'] + ' vs. ' + data['Visitor/Neutral']
    home_data.rename(columns={'Home/Neutral': 'Team'}, inplace=True)
    home_data['WL_encoded'] = np.nan
    
    away_data = data[['DATE', 'Start (ET)', 'Visitor/Neutral']].copy()
    away_data['Home_Away'] = 'Away'
    away_data['MATCHUP'] = away_data['Visitor/Neutral'] + ' @ ' + home_data['Team']  # Adjusted to use '@' for away games
    away_data.rename(columns={'Visitor/Neutral': 'Team'}, inplace=True)
    away_data['WL_encoded'] = np.nan
    
    final_data = pd.concat([home_data, away_data], ignore_index=True)
    final_data.sort_values(by=['DATE', 'Start (ET)', 'Home_Away'], inplace=True)
    final_data.reset_index(drop=True, inplace=True)
    
    # Convert 'DATE' column to datetime format
    final_data['DATE'] = pd.to_datetime(final_data['DATE'], format='%a, %b %d, %Y')
    
    # Get unique team information from the NBA API
    teams_info = teams.get_teams()
    teams_df = pd.DataFrame(teams_info)
    teams_df.rename(columns={'id': 'TEAM_ID', 'full_name': 'TEAM_NAME', 'abbreviation': 'TEAM_ABBREVIATION'}, inplace=True)
    
    # Merge final_data with teams_df to include TEAM_ID and abbreviations
    final_data = pd.merge(final_data, teams_df[['TEAM_NAME', 'TEAM_ID', 'TEAM_ABBREVIATION']], left_on='Team', right_on='TEAM_NAME', how='left')
    
    # Ensure all team names in MATCHUP are abbreviations
    for index, row in final_data.iterrows():
        for _, team_row in teams_df.iterrows():
            final_data.at[index, 'MATCHUP'] = final_data.at[index, 'MATCHUP'].replace(team_row['TEAM_NAME'], team_row['TEAM_ABBREVIATION'])
    
    # Extract and filter for upcoming games
    today = pd.Timestamp.now().floor('D')  # Normalize to avoid time part
    week_out = today + timedelta(days=7)
    upcoming_games = final_data[(final_data['DATE'] >= today) & (final_data['DATE'] <= week_out)]
    upcoming_games.sort_values(by='DATE', inplace=True)
    upcoming_games.reset_index(drop=True, inplace=True)
    
    # Format the 'DATE' column to match the example output's 'GAME_DATE' format
    upcoming_games['GAME_DATE'] = upcoming_games['DATE'].dt.strftime('%Y-%m-%d')
    #print("upcoming_games max =", upcoming_games['GAME_DATE'].max())
    #print("upcoming_games min =", upcoming_games['GAME_DATE'].min())
    
    # Correct the column name for consistency
    upcoming_games.rename(columns={'Home_Away': 'home_away'}, inplace=True)
    
    # Drop unnecessary columns and adjust to match the target dataset structure
    upcoming_games = upcoming_games[['GAME_DATE', 'MATCHUP', 'home_away', 'TEAM_ID', 'TEAM_NAME']]
    
    # Create OPPONENT_NAME column
    upcoming_games['OPPONENT_NAME'] = np.nan  # Placeholder for opposing team names
    
    # Populate TEAM_NAME and OPPONENT_NAME with correct names
    for index, row in upcoming_games.iterrows():
        if row['home_away'] == 'Home':
            # If it's a home game, the home team is TEAM_NAME and the visitor team is OPPONENT_NAME
            home_team_abbr = row['MATCHUP'].split(' vs. ')[0]
            away_team_abbr = row['MATCHUP'].split(' vs. ')[1]
        else:
            # If it's an away game, the visitor team is TEAM_NAME and the home team is OPPONENT_NAME
            away_team_abbr = row['MATCHUP'].split(' @ ')[0]
            home_team_abbr = row['MATCHUP'].split(' @ ')[1]

        home_team_full_name = teams_df[teams_df['TEAM_ABBREVIATION'] == home_team_abbr]['TEAM_NAME'].values[0]
        away_team_full_name = teams_df[teams_df['TEAM_ABBREVIATION'] == away_team_abbr]['TEAM_NAME'].values[0]

        upcoming_games.at[index, 'TEAM_NAME'] = home_team_full_name if row['home_away'] == 'Home' else away_team_full_name
        upcoming_games.at[index, 'OPPONENT_NAME'] = away_team_full_name if row['home_away'] == 'Home' else home_team_full_name

    #print("upcoming_games post team change max =", upcoming_games['GAME_DATE'].max())
    #print("upcoming_games post team change min =", upcoming_games['GAME_DATE'].min())

    # Load player game logs to use for fetching rosters
    player_game_logs = pd.read_csv(player_game_logs_csv)
    #print("player_game_logs post team change max =", player_game_logs['GAME_DATE'].max())
    #print("player_game_logs post team change min =", player_game_logs['GAME_DATE'].min())

    #print("player_game_logs =", player_game_logs.columns)
    #print("final upcoming_games =", upcoming_games.columns)
    
    if expand_with_players:
        expanded_games_with_players = pd.DataFrame()

        # Assuming player_game_logs_csv is correctly loaded into player_game_logs DataFrame
        player_game_logs = pd.read_csv('data\player_game_logs_winr.csv')

        expanded_rows = []

        for _, game in upcoming_games.iterrows():
            team_name = game['TEAM_NAME']
            team_players = player_game_logs[player_game_logs['TEAM_NAME'] == team_name]

            for _, player in team_players.iterrows():
                expanded_row = game.copy().to_dict()
                expanded_row['Player_ID'] = player['Player_ID']
                expanded_row['PLAYER_NAME'] = player['PLAYER_NAME']
                expanded_rows.append(expanded_row)

        expanded_games_with_players = pd.DataFrame(expanded_rows)

        #drop duplicate players and game_dates
        expanded_games_with_players = expanded_games_with_players.drop_duplicates(subset=['GAME_DATE', 'PLAYER_NAME'], keep='first')

        # only include these columns: ['GAME_DATE', 'MATCHUP', 'home_away', 'TEAM_NAME','OPPOSING_TEAM', 'Player_ID', 'PLAYER_NAME']
        expanded_games_with_players = expanded_games_with_players[['GAME_DATE', 'MATCHUP', 'home_away', 'TEAM_NAME', 'OPPONENT_NAME', 'Player_ID', 'PLAYER_NAME']]
        
        # Return the expanded DataFrame
        return expanded_games_with_players


    return upcoming_games

# Example usage with file paths
season_games_csv = 'data/23_24_season_games.csv'
player_game_logs_csv = 'data/player_game_logs_winr.csv'
upcoming_games_df = prepare_upcoming_games_data(season_games_csv, player_game_logs_csv, expand_with_players=True)
#print(upcoming_games_df.head())
print(len(upcoming_games_df))

#filter for kawhi leonard
kawhi_leonard = upcoming_games_df[upcoming_games_df['PLAYER_NAME'] == 'James Harden']
print(kawhi_leonard.head())
print(len(kawhi_leonard))

#print max and min game_date
print(upcoming_games_df['GAME_DATE'].max())
print(upcoming_games_df['GAME_DATE'].min())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  upcoming_games.sort_values(by='DATE', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  upcoming_games['GAME_DATE'] = upcoming_games['DATE'].dt.strftime('%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  upcoming_games.rename(columns={'Home_Away': 'home_away'}, inplace=True)
  upcoming_games.at[index, 'OPPONENT_NAME'] = away_team_full_name if row['home_away'] == 'Home' else home_team_full_na

971
        GAME_DATE      MATCHUP home_away             TEAM_NAME  \
8551   2024-03-12  LAC vs. MIN      Home  Los Angeles Clippers   
19959  2024-03-14    LAC @ CHI      Away  Los Angeles Clippers   
25383  2024-03-15    LAC @ NOP      Away  Los Angeles Clippers   
39543  2024-03-17  LAC vs. ATL      Home  Los Angeles Clippers   

                OPPONENT_NAME  Player_ID   PLAYER_NAME  
8551   Minnesota Timberwolves     201935  James Harden  
19959           Chicago Bulls     201935  James Harden  
25383    New Orleans Pelicans     201935  James Harden  
39543           Atlanta Hawks     201935  James Harden  
4
2024-03-18
2024-03-11


In [6]:
import pandas as pd
from datetime import datetime, timedelta
from nba_api.stats.endpoints import commonallplayers, playergamelog, leaguedashplayerstats, leaguegamefinder
from nba_api.stats.static import teams
import time
import numpy as np


def get_current_nba_season_year():
    current_date = datetime.now()
    if current_date.month > 9:  # NBA season starts in October
        return str(current_date.year) + "-" + str(current_date.year + 1)[2:]
    else:
        return str(current_date.year - 1) + "-" + str(current_date.year)[2:]

def calculate_cumulative_win_rates(season):
    try:
        # Adjust the season start date based on the typical NBA season start dates
        season_start_date = season.split('-')[0] + "-10-01"  # Assuming October 1st as a generic start date
        all_games = leaguegamefinder.LeagueGameFinder(season_nullable=season).get_data_frames()[0]
        all_games['GAME_DATE'] = pd.to_datetime(all_games['GAME_DATE'])
        all_games = all_games[all_games['GAME_DATE'] > pd.to_datetime(season_start_date)]
        all_games = all_games.sort_values('GAME_DATE')
        all_games['WIN'] = all_games['WL'].apply(lambda x: 1 if x == 'W' else 0)
        all_games['CUMULATIVE_WINS'] = all_games.groupby('TEAM_NAME')['WIN'].cumsum()
        all_games['CUMULATIVE_GAMES'] = all_games.groupby('TEAM_NAME').cumcount() + 1
        all_games['CUMULATIVE_WIN_RATE'] = all_games['CUMULATIVE_WINS'] / all_games['CUMULATIVE_GAMES']
        return all_games
    except Exception as e:
        print(f"Error calculating cumulative win rates: {e}")
        return pd.DataFrame()


def load_nba_player_game_logs(seasons, min_avg_minutes=30.0, save_path='data/player_game_logs.csv'):
    if not isinstance(seasons, list):
        seasons = [seasons]

    new_players_data = pd.DataFrame()

    for season in seasons:
        print(f"Processing season {season}...")
        try:
            all_players = commonallplayers.CommonAllPlayers(is_only_current_season=0).get_data_frames()[0]
            player_stats = leaguedashplayerstats.LeagueDashPlayerStats(season=season).get_data_frames()[0]
        except Exception as e:
            print(f"Error fetching player stats for season {season}: {e}")
            continue

        # Filter for players who meet the minimum average minutes threshold
        player_stats['AVG_MIN'] = player_stats['MIN'] / player_stats['GP']
        eligible_players = player_stats[player_stats['AVG_MIN'] >= min_avg_minutes]
        teams_list = teams.get_teams()
        team_abbrev_to_full_name = {team['abbreviation']: team['full_name'] for team in teams_list}

        all_games = calculate_cumulative_win_rates(season)
        if all_games.empty:
            print("Skipping win rate calculation due to an error.")
            continue

        for index, player in eligible_players.iterrows():
            try:
                player_id = player['PLAYER_ID']
                player_name = player['PLAYER_NAME']
                player_log = playergamelog.PlayerGameLog(player_id=player_id, season=season)
                player_data = player_log.get_data_frames()[0]
                if player_data.empty:
                    continue
                player_data['PLAYER_NAME'] = player_name
                player_data['TEAM_ABBREVIATION'] = player_data['MATCHUP'].str[:3]
                player_data['OPPONENT_ABBREVIATION'] = player_data['MATCHUP'].apply(lambda x: x.split(' ')[2] if 'vs.' in x else x.split(' ')[-1])
                player_data['TEAM_NAME'] = player_data['TEAM_ABBREVIATION'].map(team_abbrev_to_full_name)
                player_data['OPPONENT_NAME'] = player_data['OPPONENT_ABBREVIATION'].map(team_abbrev_to_full_name)
                player_data = player_data[player_data['TEAM_ABBREVIATION'].isin(team_abbrev_to_full_name.keys())]
                new_players_data = pd.concat([new_players_data, player_data], ignore_index=True)
            except Exception as e:
                print(f"Error processing player {player_name}: {e}")
                continue
            time.sleep(0.6)  # To respect rate limits

        # Calculate team and opponent win rates
        new_players_data['GAME_DATE'] = pd.to_datetime(new_players_data['GAME_DATE'])
        new_players_data['TEAM_WIN_RATE'] = new_players_data.apply(lambda row: get_win_rate(row, 'TEAM_NAME', all_games), axis=1)
        new_players_data['OPPONENT_WIN_RATE'] = new_players_data.apply(lambda row: get_win_rate(row, 'OPPONENT_NAME', all_games), axis=1)

        new_players_data['home_away'] = new_players_data['MATCHUP'].str.split(' ').str[1]
        new_players_data['home_away'] = new_players_data['home_away'].apply(lambda x: 'Away' if '@' in x else 'Home')
        new_players_data.reset_index(drop=True)
    if not new_players_data.empty:
        new_players_data.to_csv(save_path, index=False)
        print(f"Player game logs saved to {save_path}")
    else:
        print("No player game logs to save.")



#Example usage
seasons = ['2023-24']  # You can adjust seasons as needed
new_players_data = load_nba_player_game_logs(seasons, min_avg_minutes=30.0, save_path='data/player_game_logs_winr.csv')

# filter for james harden
james_harden = new_players_data[new_players_data['PLAYER_NAME'] == 'James Harden']

#filter for 3-10-2024
james_harden_3_10_2024 = james_harden[james_harden['GAME_DATE'] == '2024-03-10']
print(james_harden_3_10_2024.head())


Processing season 2023-24...


  new_players_data['GAME_DATE'] = pd.to_datetime(new_players_data['GAME_DATE'])


Player game logs saved to data/player_game_logs_winr.csv


TypeError: 'NoneType' object is not subscriptable