<a href="https://colab.research.google.com/github/csmotherman/NBAPlayerStat/blob/main/NBA_Algorithm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Load in API

In [None]:
!pip install nba_api --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.7/261.7 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.5/162.5 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h

#Scrape and Clean _Defense vs. Position_ Stats from URL

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the webpage
url = "https://hashtagbasketball.com/nba-defense-vs-position"

# Send a GET request to the URL
response = requests.get(url)

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, "html.parser")

# Find all tables on the webpage
tables = soup.find_all("table")

# Read the table data into the DataFrame
DefenseVsPosition = pd.read_html(str(tables[2]))[0]

# Remove "Sort: " from column names
DefenseVsPosition.columns = DefenseVsPosition.columns.str.replace('Sort: ', '')

# Columns to convert to float (excluding 'Position' and 'Team')
columns_to_convert = DefenseVsPosition.columns.difference(['Position', 'Team'])

# Loop through specified columns, remove ranks, and convert to floats
for col in columns_to_convert:
    DefenseVsPosition[col] = DefenseVsPosition[col].apply(lambda x: float(x.split()[0]))

# Clean 'Team' column to remove ranks
DefenseVsPosition['Team'] = DefenseVsPosition['Team'].apply(lambda x: x.split()[0])

replacement_map = {'SA': 'SAS', 'PHO': 'PHX', 'NY': 'NYK', 'NO': 'NOP', 'GS': 'GSW'}

# Replace values in 'Team' column using the replacement_map
DefenseVsPosition['Team'].replace(replacement_map, inplace=True)


#Use API to get Team's Opponent Stat Averages

In [None]:
from nba_api.stats.endpoints import leaguedashteamstats

# Fetch team opponent stats
from nba_api.stats.endpoints import leaguedashteamstats

# Fetch per game opponent stats
team_opponent_stats = leaguedashteamstats.LeagueDashTeamStats(per_mode_detailed='PerGame', measure_type_detailed_defense='Opponent',last_n_games=15)

# Convert the obtained data to a pandas DataFrame
team_opponent_stats_df = team_opponent_stats.get_data_frames()[0]


# Display the DataFrame containing team opponent stats
team_opponent_stats_df['TEAM_NAME'].unique()

team_abbreviations = {
    'Atlanta Hawks': 'ATL', 'Boston Celtics': 'BOS', 'Brooklyn Nets': 'BKN', 'Charlotte Hornets': 'CHA',
    'Chicago Bulls': 'CHI', 'Cleveland Cavaliers': 'CLE', 'Dallas Mavericks': 'DAL', 'Denver Nuggets': 'DEN',
    'Detroit Pistons': 'DET', 'Golden State Warriors': 'GSW', 'Houston Rockets': 'HOU', 'Indiana Pacers': 'IND',
    'LA Clippers': 'LAC', 'Los Angeles Lakers': 'LAL', 'Memphis Grizzlies': 'MEM', 'Miami Heat': 'MIA',
    'Milwaukee Bucks': 'MIL', 'Minnesota Timberwolves': 'MIN', 'New Orleans Pelicans': 'NOP',
    'New York Knicks': 'NYK', 'Oklahoma City Thunder': 'OKC', 'Orlando Magic': 'ORL', 'Philadelphia 76ers': 'PHI',
    'Phoenix Suns': 'PHX', 'Portland Trail Blazers': 'POR', 'Sacramento Kings': 'SAC',
    'San Antonio Spurs': 'SAS', 'Toronto Raptors': 'TOR', 'Utah Jazz': 'UTA', 'Washington Wizards': 'WAS'
}

# Replace the team names with abbreviations using the mapping
team_opponent_stats_df['TEAM'] = team_opponent_stats_df['TEAM_NAME'].map(team_abbreviations)

# Your DataFrame containing the columns
columns = [
    'TEAM', 'OPP_FGM', 'OPP_FGA', 'OPP_FG_PCT', 'OPP_FG3M', 'OPP_FG3A', 'OPP_FG3_PCT',
    'OPP_FTM', 'OPP_FTA', 'OPP_FT_PCT', 'OPP_OREB', 'OPP_DREB', 'OPP_REB', 'OPP_AST',
    'OPP_TOV', 'OPP_STL', 'OPP_BLK', 'OPP_BLKA', 'OPP_PF', 'OPP_PFD', 'OPP_PTS'
]

# Sample DataFrame (replace this with your actual DataFrame)
TeamOpponent = team_opponent_stats_df[columns]

# Define the desired column order
desired_order = [
    'TEAM', 'OPP_FGM', 'OPP_FGA', 'OPP_FG_PCT', 'OPP_FG3M', 'OPP_FG3A', 'OPP_FG3_PCT',
    'OPP_FTM', 'OPP_FTA', 'OPP_FT_PCT', 'OPP_OREB', 'OPP_DREB', 'OPP_REB', 'OPP_AST',
    'OPP_TOV', 'OPP_STL', 'OPP_BLK', 'OPP_BLKA', 'OPP_PF', 'OPP_PFD', 'OPP_PTS'
]

# Reorder columns
TeamOpponent = TeamOpponent.reindex(columns=desired_order)

# Display the updated DataFra


#Sort for only recent data (last 15 games)

In [None]:
adv_team_1 = leaguedashteamstats.LeagueDashTeamStats(per_mode_detailed='PerGame', measure_type_detailed_defense='Advanced',last_n_games=15)
adv_team = adv_team_1.get_data_frames()[0]
adv_team.columns

adv_team = adv_team[['TEAM_NAME','OFF_RATING','DEF_RATING','PACE']]
adv_team['TEAM'] = adv_team['TEAM_NAME'].map(team_abbreviations)
adv_team = adv_team.reindex(columns=['TEAM','OFF_RATING','DEF_RATING','PACE'])
adv_team['PT_PWR'] = adv_team['OFF_RATING'] + adv_team['DEF_RATING'] + adv_team['PACE']
pt_pwr = adv_team[['TEAM','PT_PWR']]
pt_pwr = pt_pwr.rename(columns = {'TEAM' : 'OPP'})

#Import player minute projections from external source (CSV)

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the page to scrape
url = "https://www.sportsline.com/nba/expert-projections/simulation/"

# Get the page content
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Find the table
table = soup.find('table')  # This assumes there's only one table. Adjust selector if needed.

# Extract headers
headers = [th.text.strip() for th in table.find_all('th')]

# Find the index of PLAYER and MIN columns
player_index = headers.index('PLAYER')
min_index = headers.index('MIN')

# Extract rows
rows = table.find_all('tr')[1:]  # Skip header row

# Extract data
data = []
for row in rows:
    cols = row.find_all('td')
    player = cols[player_index].text.strip()
    min_col = cols[min_index].text.strip()
    data.append([player, min_col])

# Create DataFrame
PlayersMinutes = pd.DataFrame(data, columns=['Player', 'MIN'])

# Convert MIN column to float
PlayersMinutes['MIN'] = PlayersMinutes['MIN'].astype(float)

print(PlayersMinutes)


                Player   MIN
0          Luka Doncic  41.0
1         Jayson Tatum  41.0
2         Jaylen Brown  38.0
3         Kyrie Irving  40.0
4         Jrue Holiday  39.0
5        Derrick White  37.0
6           Al Horford  32.0
7        PJ Washington  34.0
8     Dereck Lively II  31.0
9       Daniel Gafford  17.0
10   Derrick Jones Jr.  23.0
11    Payton Pritchard  13.0
12  Kristaps Porzingis  10.0
13        Tim Hardaway  13.0
14          Sam Hauser  15.0
15          Josh Green  16.0
16   Maximilian Kleber  16.0
17      Xavier Tillman  11.0
18         Jaden Hardy   9.0


#Use API to retrieve the game logs of players who are scheduled to play today

In [None]:
import pandas as pd
from nba_api.stats.endpoints import playergamelog
from nba_api.stats.static import players

# List of player names from the CSV file
player_names = PlayersMinutes['Player'].tolist()  # Assuming 'Player' is the column header in the CSV
# Get player IDs for the given player names
nba_players = players.get_players()
player_logs = []  # List to store player game logs

for player_name in player_names:
    player_info = [player for player in nba_players if player['full_name'] == player_name]

    # If player not found, try adding ' Jr.' to the name and then ' II'
    if not player_info:
        player_name_with_jr = f"{player_name} Jr."
        player_info = [player for player in nba_players if player['full_name'] == player_name_with_jr]

        # If still not found, try adding ' II' to the name
        if not player_info:
            player_name_with_ii = f"{player_name} II"
            player_info = [player for player in nba_players if player['full_name'] == player_name_with_ii]
        if not player_info:
            player_name_with_ii = f"{player_name} III"
            player_info = [player for player in nba_players if player['full_name'] == player_name_with_ii]

    if player_info:
        player_id = player_info[0]['id']
        # Get player's game log for a specific season
        season = '2023-24'  # Replace with the season you want to retrieve
        player_log = playergamelog.PlayerGameLog(player_id=player_id, season=season)
        player_data = player_log.get_data_frames()[0]  # Get player's game log DataFrame
        # Add a 'Player' column with the player's name
        player_data['Player'] = player_name

        # Modify 'MATCHUP' column to create 'OPP' column with the last 3 characters
        player_data['OPP'] = player_data['MATCHUP'].str[-3:]

        player_logs.append(player_data)  # Append DataFrame to the list
        print(f"Done for {player_name}")
    else:
        print(f"Player '{player_name}' not found.")

# Concatenate all player game logs into a single DataFrame
master_df = pd.concat(player_logs, ignore_index=True)

master_df.info()


Done for Luka Doncic
Done for Jayson Tatum
Done for Jaylen Brown
Done for Kyrie Irving
Done for Jrue Holiday
Done for Derrick White
Done for Al Horford
Player 'PJ Washington' not found.
Done for Dereck Lively II
Done for Daniel Gafford
Done for Derrick Jones Jr.
Done for Payton Pritchard
Done for Kristaps Porzingis
Done for Tim Hardaway
Done for Sam Hauser
Done for Josh Green
Player 'Maximilian Kleber' not found.
Done for Xavier Tillman
Done for Jaden Hardy
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1086 entries, 0 to 1085
Data columns (total 29 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   SEASON_ID        1086 non-null   object 
 1   Player_ID        1086 non-null   object 
 2   Game_ID          1086 non-null   object 
 3   GAME_DATE        1086 non-null   object 
 4   MATCHUP          1086 non-null   object 
 5   WL               1086 non-null   object 
 6   MIN              1086 non-null   object 
 7   FGM         

### CREATE A COPY

In [None]:
copy_reset = pd.DataFrame(master_df.copy())

In [None]:
master_df = copy_reset.copy()
master_df = pd.DataFrame(master_df)
master_df

Unnamed: 0,SEASON_ID,Player_ID,Game_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,FGA,FG_PCT,...,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,VIDEO_AVAILABLE,Player,OPP
0,22023,1629029,22301161,"APR 10, 2024",DAL @ MIA,W,36,9,23,0.391,...,9,0,0,3,4,29,7,1,Luka Doncic,MIA
1,22023,1629029,22301144,"APR 09, 2024",DAL @ CHA,W,35,13,25,0.52,...,10,0,1,4,2,39,22,1,Luka Doncic,CHA
2,22023,1629029,22301131,"APR 07, 2024",DAL vs. HOU,W,45,12,21,0.571,...,12,0,2,7,5,37,20,1,Luka Doncic,HOU
3,22023,1629029,22301124,"APR 04, 2024",DAL vs. ATL,W,37,8,25,0.32,...,8,2,1,5,3,25,21,1,Luka Doncic,ATL
4,22023,1629029,22300589,"APR 02, 2024",DAL @ GSW,L,39,11,22,0.5,...,11,0,0,5,1,30,9,1,Luka Doncic,GSW
5,22023,1629029,22301083,"MAR 31, 2024",DAL @ HOU,W,35,18,30,0.6,...,7,2,0,4,1,47,12,1,Luka Doncic,HOU
6,22023,1629029,22301073,"MAR 29, 2024",DAL @ SAC,W,41,6,14,0.429,...,12,2,0,5,2,26,6,1,Luka Doncic,SAC
7,22023,1629029,22301047,"MAR 26, 2024",DAL @ SAC,W,32,10,18,0.556,...,6,2,0,4,4,28,18,1,Luka Doncic,SAC
8,22023,1629029,22301041,"MAR 25, 2024",DAL @ UTA,W,41,10,24,0.417,...,13,1,1,4,3,29,10,1,Luka Doncic,UTA
9,22023,1629029,22301008,"MAR 21, 2024",DAL vs. UTA,W,35,11,23,0.478,...,8,4,0,4,2,34,-3,1,Luka Doncic,UTA


#Convert numeric columns to numeric values

In [None]:
master_df.dropna(inplace=True)
columns_to_convert = ['PTS', 'REB', 'AST', 'BLK', 'STL', 'TOV', 'PF','MIN']
master_df[columns_to_convert] = master_df[columns_to_convert].apply(pd.to_numeric, errors='coerce')

master_df = master_df[master_df['MIN'] > 5]

In [None]:
selected_columns = ['Player', 'GAME_DATE','OPP', 'MIN', 'PTS', 'REB', 'AST', 'STL', 'BLK', 'TOV']
master_df = master_df[selected_columns]
playerlogs = master_df[master_df['MIN'] >= 15]
playerlogs['Player'].unique()

array(['Luka Doncic', 'Jayson Tatum', 'Jaylen Brown', 'Kyrie Irving',
       'Jrue Holiday', 'Derrick White', 'Al Horford', 'Dereck Lively II',
       'Daniel Gafford', 'Derrick Jones Jr.', 'Payton Pritchard',
       'Kristaps Porzingis', 'Sam Hauser', 'Josh Green', 'Xavier Tillman',
       'Jaden Hardy'], dtype=object)

#Convert stats to a per minute basis

In [None]:
trying = playerlogs.copy()

trying['PTS'] = trying['PTS'] / trying['MIN']
trying['REB'] = trying['REB'] / trying['MIN']
trying['AST'] = trying['AST'] / trying['MIN']

trying.drop(columns=['MIN'], inplace=True)

trying = trying[['Player','GAME_DATE','OPP','PTS','REB','AST']]


playerlogs = trying.copy()
print(playerlogs)

                  Player     GAME_DATE  OPP       PTS       REB       AST
0            Luka Doncic  APR 10, 2024  MIA  0.805556  0.250000  0.250000
1            Luka Doncic  APR 09, 2024  CHA  1.114286  0.342857  0.285714
2            Luka Doncic  APR 07, 2024  HOU  0.822222  0.200000  0.266667
3            Luka Doncic  APR 04, 2024  ATL  0.675676  0.324324  0.216216
4            Luka Doncic  APR 02, 2024  GSW  0.769231  0.307692  0.282051
5            Luka Doncic  MAR 31, 2024  HOU  1.342857  0.342857  0.200000
6            Luka Doncic  MAR 29, 2024  SAC  0.634146  0.219512  0.292683
7            Luka Doncic  MAR 26, 2024  SAC  0.875000  0.343750  0.187500
8            Luka Doncic  MAR 25, 2024  UTA  0.707317  0.292683  0.317073
9            Luka Doncic  MAR 21, 2024  UTA  0.971429  0.257143  0.228571
10           Luka Doncic  MAR 19, 2024  SAS  0.450000  0.250000  0.400000
11           Luka Doncic  MAR 17, 2024  DEN  0.925000  0.250000  0.075000
12           Luka Doncic  MAR 13, 2024

#Get all players last 10 games

In [None]:
import pandas as pd

# Assuming 'GAME_DATE' is in datetime format
playerlogs['GAME_DATE'] = pd.to_datetime(playerlogs['GAME_DATE'])

# Sort the dataframe by player and date in descending order
playerlogs_sorted = playerlogs.sort_values(by=['Player', 'GAME_DATE'], ascending=[True, False])

# Filter to include only the last 10 games for each player
playerlogs_last_10 = playerlogs_sorted.groupby('Player').head(10)

# Calculate differences from season averages
playerlogs_last_10['PTS_diff_season_avg'] = playerlogs_last_10['PTS'] - playerlogs_last_10.groupby('Player')['PTS'].transform('mean')
playerlogs_last_10['REB_diff_season_avg'] = playerlogs_last_10['REB'] - playerlogs_last_10.groupby('Player')['REB'].transform('mean')
playerlogs_last_10['AST_diff_season_avg'] = playerlogs_last_10['AST'] - playerlogs_last_10.groupby('Player')['AST'].transform('mean')

# Identify top and bottom players in each statistic
top_bottom_stats = {}
stats = ['PTS_diff_season_avg', 'REB_diff_season_avg', 'AST_diff_season_avg']
for stat in stats:
    top_bottom_stats[stat] = {}
    top_players = playerlogs_last_10.groupby('Player')[stat].last().nlargest(10)
    bottom_players = playerlogs_last_10.groupby('Player')[stat].last().nsmallest(10)
    top_bottom_stats[stat]['Top'] = top_players
    top_bottom_stats[stat]['Bottom'] = bottom_players

  playerlogs['GAME_DATE'] = pd.to_datetime(playerlogs['GAME_DATE'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  playerlogs_last_10['PTS_diff_season_avg'] = playerlogs_last_10['PTS'] - playerlogs_last_10.groupby('Player')['PTS'].transform('mean')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  playerlogs_last_10['REB_diff_season_avg'] = playerlogs_last_10['REB'] - playerlogs_last_10.groupby('Player')['REB'].transform('mean')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the ca

#Get player positions from CSV file

In [None]:
# Read the CSV file containing player names
file_path = '/content/drive/MyDrive/PlayerPositions.csv'
player_positions = pd.read_csv(file_path)

merged_df = pd.merge(playerlogs, player_positions, on='Player', how='left')

# Display the updated merged DataFrame
selected_columns = ['Player','Pos','GAME_DATE', 'OPP', 'PTS', 'REB', 'AST']
playerlogs = merged_df[selected_columns]

playerlogs

Unnamed: 0,Player,Pos,GAME_DATE,OPP,PTS,REB,AST
0,Luka Doncic,SF,2024-04-10,MIA,0.805556,0.25,0.25
1,Luka Doncic,SF,2024-04-09,CHA,1.114286,0.342857,0.285714
2,Luka Doncic,SF,2024-04-07,HOU,0.822222,0.2,0.266667
3,Luka Doncic,SF,2024-04-04,ATL,0.675676,0.324324,0.216216
4,Luka Doncic,SF,2024-04-02,GSW,0.769231,0.307692,0.282051
5,Luka Doncic,SF,2024-03-31,HOU,1.342857,0.342857,0.2
6,Luka Doncic,SF,2024-03-29,SAC,0.634146,0.219512,0.292683
7,Luka Doncic,SF,2024-03-26,SAC,0.875,0.34375,0.1875
8,Luka Doncic,SF,2024-03-25,UTA,0.707317,0.292683,0.317073
9,Luka Doncic,SF,2024-03-21,UTA,0.971429,0.257143,0.228571


#Clean up the positions for each player and merge

In [None]:
# Assuming 'playerlogs' is the DataFrame containing player logs

# Map positions accordingly
position_mapping = {'F': 'SF', 'G': 'PG', 'GF': 'SF', 'FC': 'C'}

# Replace positions with the updated ones
playerlogs['Pos'] = playerlogs['Pos'].replace(position_mapping)

# Filter positions to keep only 'PG', 'SG', 'SF', 'PF', 'C'
playerlogs = playerlogs[playerlogs['Pos'].isin(['PG', 'SG', 'SF', 'PF', 'C'])]

# Display unique positions after transformation
playerlogs['Player'].unique()

array(['Luka Doncic', 'Jayson Tatum', 'Jaylen Brown', 'Kyrie Irving',
       'Jrue Holiday', 'Derrick White', 'Al Horford', 'Dereck Lively II',
       'Daniel Gafford', 'Derrick Jones Jr.', 'Payton Pritchard',
       'Kristaps Porzingis', 'Sam Hauser', 'Josh Green', 'Jaden Hardy'],
      dtype=object)

In [None]:
merged_data = playerlogs.merge(DefenseVsPosition, left_on=['OPP', 'Pos'], right_on=['Team', 'Position'], how='left')

In [None]:
playerlogs_columns = {
    'PTS_x': 'PTS',
    'REB_x': 'REB',
    'AST_x': 'AST',
}

# Rename the columns from DefenseVsPosition with 'OPP_' prefix
opp_columns = {
    'PTS_y': 'OPP_PTSvsPos',
    'REB_y': 'OPP_REBvsPos',
    'AST_y': 'OPP_ASTvsPos',
    'STL': 'OPP_STLvsPos',
    'BLK': 'OPP_BLKvsPos',
    'TO': 'OPP_TOVvsPos'
}

# Rename columns in the DataFrame
final_data = merged_data.rename(columns={**playerlogs_columns, **opp_columns})

# Select only the desired columns
selected_columns = ['Player', 'Pos','GAME_DATE', 'OPP','PTS', 'REB', 'AST', 'OPP_PTSvsPos', 'OPP_REBvsPos', 'OPP_ASTvsPos', 'OPP_STLvsPos', 'OPP_BLKvsPos', 'OPP_TOVvsPos']
final_data = final_data[selected_columns]



In [None]:
final_data = pd.merge(final_data, TeamOpponent, left_on='OPP', right_on='TEAM')
final_data

Unnamed: 0,Player,Pos,GAME_DATE,OPP,PTS,REB,AST,OPP_PTSvsPos,OPP_REBvsPos,OPP_ASTvsPos,...,OPP_DREB,OPP_REB,OPP_AST,OPP_TOV,OPP_STL,OPP_BLK,OPP_BLKA,OPP_PF,OPP_PFD,OPP_PTS
0,Luka Doncic,SF,2024-04-10,MIA,0.805556,0.25,0.25,21.7,7.8,4.7,...,33.4,42.0,25.7,13.7,6.2,3.9,3.3,18.1,15.4,103.7
1,Luka Doncic,SF,2024-03-07,MIA,0.875,0.275,0.275,21.7,7.8,4.7,...,33.4,42.0,25.7,13.7,6.2,3.9,3.3,18.1,15.4,103.7
2,Jayson Tatum,SF,2024-02-11,MIA,0.666667,0.25641,0.230769,21.7,7.8,4.7,...,33.4,42.0,25.7,13.7,6.2,3.9,3.3,18.1,15.4,103.7
3,Jayson Tatum,SF,2024-01-25,MIA,0.8125,0.25,0.125,21.7,7.8,4.7,...,33.4,42.0,25.7,13.7,6.2,3.9,3.3,18.1,15.4,103.7
4,Jayson Tatum,SF,2023-10-27,MIA,0.536585,0.195122,0.121951,21.7,7.8,4.7,...,33.4,42.0,25.7,13.7,6.2,3.9,3.3,18.1,15.4,103.7
5,Jaylen Brown,SF,2024-02-11,MIA,0.540541,0.243243,0.054054,21.7,7.8,4.7,...,33.4,42.0,25.7,13.7,6.2,3.9,3.3,18.1,15.4,103.7
6,Jaylen Brown,SF,2024-01-25,MIA,0.666667,0.148148,0.185185,21.7,7.8,4.7,...,33.4,42.0,25.7,13.7,6.2,3.9,3.3,18.1,15.4,103.7
7,Jaylen Brown,SF,2023-10-27,MIA,0.794118,0.176471,0.029412,21.7,7.8,4.7,...,33.4,42.0,25.7,13.7,6.2,3.9,3.3,18.1,15.4,103.7
8,Kyrie Irving,PG,2024-04-10,MIA,0.675676,0.081081,0.108108,22.3,5.8,8.3,...,33.4,42.0,25.7,13.7,6.2,3.9,3.3,18.1,15.4,103.7
9,Kyrie Irving,PG,2024-03-07,MIA,0.638889,0.111111,0.111111,22.3,5.8,8.3,...,33.4,42.0,25.7,13.7,6.2,3.9,3.3,18.1,15.4,103.7


In [None]:
final_data['GAME_DATE'] = pd.to_datetime(final_data['GAME_DATE'], format='%b %d, %Y')


# Sort the dataframe by 'Player' and 'Game' columns
final_data = final_data.sort_values(by=['Player', 'GAME_DATE']).reset_index(drop=True)

# Calculate rolling averages for points, rebounds, and assists for the previous 5 games
final_data['Avg_Points_Prev_5'] = final_data.groupby('Player')['PTS'].rolling(window=5, min_periods=0).mean().reset_index(drop=True)
final_data['Avg_Rebounds_Prev_5'] = final_data.groupby('Player')['REB'].rolling(window=5, min_periods=1).mean().reset_index(drop=True)
final_data['Avg_Assists_Prev_5'] = final_data.groupby('Player')['AST'].rolling(window=5, min_periods=1).mean().reset_index(drop=True)



# Shift the rolling averages by one row to represent the previous 5 games excluding the current game
final_data['Avg_Points_Prev_5'] = final_data.groupby('Player')['Avg_Points_Prev_5'].shift(fill_value=0)
final_data['Avg_Rebounds_Prev_5'] = final_data.groupby('Player')['Avg_Rebounds_Prev_5'].shift(fill_value=0)
final_data['Avg_Assists_Prev_5'] = final_data.groupby('Player')['Avg_Assists_Prev_5'].shift(fill_value=0)

copy = final_data.to_dict()
final_data = pd.DataFrame(copy)
final_data.dropna(inplace=True)
# Display the updated DataFrame

final_data.sort_values(by='GAME_DATE',ascending = False)

Unnamed: 0,Player,Pos,GAME_DATE,OPP,PTS,REB,AST,OPP_PTSvsPos,OPP_REBvsPos,OPP_ASTvsPos,...,OPP_TOV,OPP_STL,OPP_BLK,OPP_BLKA,OPP_PF,OPP_PFD,OPP_PTS,Avg_Points_Prev_5,Avg_Rebounds_Prev_5,Avg_Assists_Prev_5
943,Sam Hauser,SF,2024-04-14,WAS,0.457143,0.142857,0.057143,24.1,8.4,5.1,...,12.7,9.0,7.3,4.5,17.8,20.9,119.1,0.390914,0.152448,0.093781
550,Josh Green,SG,2024-04-14,OKC,0.129032,0.096774,0.096774,22.5,6.5,5.5,...,15.9,8.1,4.5,5.8,18.5,17.0,110.8,0.250239,0.122301,0.061954
874,Payton Pritchard,PG,2024-04-14,WAS,0.863636,0.204545,0.272727,25.1,7.0,9.4,...,12.7,9.0,7.3,4.5,17.8,20.9,119.1,0.6835,0.103254,0.232831
352,Jaden Hardy,SG,2024-04-14,OKC,0.310345,0.137931,0.103448,22.5,6.5,5.5,...,15.9,8.1,4.5,5.8,18.5,17.0,110.8,0.578884,0.140335,0.15394
351,Jaden Hardy,SG,2024-04-12,DET,0.862069,0.241379,0.068966,22.8,6.1,5.1,...,13.7,8.6,6.1,3.7,15.3,16.4,114.1,0.633137,0.105392,0.140147
873,Payton Pritchard,PG,2024-04-12,CHA,1.0,0.096774,0.354839,23.4,6.5,9.2,...,13.5,7.3,3.5,4.2,17.0,15.4,116.8,0.562071,0.112471,0.219006
549,Josh Green,SG,2024-04-12,DET,0.173913,0.130435,0.0,22.8,6.1,5.1,...,13.7,8.6,6.1,3.7,15.3,16.4,114.1,0.261611,0.111598,0.061954
942,Sam Hauser,SF,2024-04-12,CHA,0.533333,0.133333,0.066667,23.0,7.2,5.0,...,13.5,7.3,3.5,4.2,17.0,15.4,116.8,0.375156,0.143963,0.080447
872,Payton Pritchard,PG,2024-04-11,NYK,0.761905,0.190476,0.285714,23.2,5.7,8.6,...,13.9,6.3,5.1,3.9,18.1,17.4,109.9,0.45969,0.095804,0.197577
63,Al Horford,C,2024-04-11,NYK,0.375,0.125,0.0625,21.8,13.0,3.4,...,13.9,6.3,5.1,3.9,18.1,17.4,109.9,0.454667,0.229143,0.091905


In [None]:
data = pd.merge(final_data,pt_pwr,on=['OPP'])

In [None]:
data = data.sort_values(by=['Player', 'GAME_DATE']).reset_index(drop=True)
data['Player'].unique()

array(['Al Horford', 'Daniel Gafford', 'Dereck Lively II',
       'Derrick Jones Jr.', 'Derrick White', 'Jaden Hardy',
       'Jaylen Brown', 'Jayson Tatum', 'Josh Green', 'Jrue Holiday',
       'Kristaps Porzingis', 'Kyrie Irving', 'Luka Doncic',
       'Payton Pritchard', 'Sam Hauser'], dtype=object)

In [None]:
data['GAME_DATE'] = pd.to_datetime(data['GAME_DATE'], format='%b %d, %Y')

# Filter rows for dates on or after November 5, 2023
data = data[data['GAME_DATE'] >= '2024-2-1']

#Test models for each player, and choose best. Store in dictionary

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge, HuberRegressor, BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

# Assuming your data is in a DataFrame named 'data'
# data = pd.read_csv('your_data.csv')  # Uncomment and adjust to load your data

# Unique players in the dataset
unique_players = data['Player'].unique()
print(unique_players)

# Dictionary to store the best model for each player's target variable ('PTS', 'REB', 'AST')
player_best_models = {player: {} for player in unique_players}

# Iterate over unique players
for player in unique_players:
    # Filter data for the current player
    player_data = data[data['Player'] == player]
    print(player)
    # Check if the player has at least 24 rows
    if len(player_data) < 24:
        continue

    # Target variables
    target_variables = ['PTS', 'REB', 'AST']

    for target_variable in target_variables:
        # Features: All numeric columns except the target variable and excluded columns
        excluded_columns = ['PTS', 'REB', 'AST']
        features = player_data.select_dtypes(include='number').drop(columns=excluded_columns).columns

        # Conditionally drop columns based on the target variable
        if target_variable == 'PTS':
            features = [feature for feature in features if feature not in ['Avg_Assists_Prev_5', 'Avg_Rebounds_Prev_5']]
        elif target_variable == 'AST':
            features = [feature for feature in features if feature not in ['Avg_Rebounds_Prev_5', 'Avg_Points_Prev_5']]
        elif target_variable == 'REB':
            features = [feature for feature in features if feature not in ['Avg_Assists_Prev_5', 'Avg_Points_Prev_5']]

        # Calculate the range threshold
        average_value = player_data[target_variable].mean()
        relative_threshold_percentage = 10  # Adjust this percentage based on your preference
        range_threshold = average_value * (relative_threshold_percentage / 100)

        # Features and target variable
        X = player_data[features]
        y = player_data[target_variable]

        # Normalize features
        scaler = StandardScaler()
        X_normalized = scaler.fit_transform(X)

        # Split data into train and test sets (80% train, 20% test)
        X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.2, random_state=42)

        # Creating different models
        models = [
            LinearRegression(),
            Lasso(),
            Ridge(),
            DecisionTreeRegressor(),
            RandomForestRegressor(),
            xgb.XGBRegressor(),
            HuberRegressor(max_iter=1000),
            BayesianRidge()
        ]

        # Train and evaluate each model
        best_model = None
        best_accuracy = 0  # Initialize with 0, as higher accuracy is better
        for model in models:
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)

            # Calculate accuracy based on the specified range (-2 to 2)
            correct_predictions = ((predictions >= y_test - range_threshold) & (predictions <= y_test + range_threshold)).sum()
            total_predictions = len(predictions)
            accuracy = correct_predictions / total_predictions

            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_model = model

        # Check if the best model is not None before proceeding
        if best_model is not None:
            # Get the top 10 coefficients from the best model
            if hasattr(best_model, 'coef_'):
                coef_abs = abs(best_model.coef_)
            elif hasattr(best_model, 'feature_importances_'):
                coef_abs = abs(best_model.feature_importances_)
            else:
                coef_abs = best_model.feature_importances_

            top_10_indices = coef_abs.argsort()[-15:][::-1]  # Indexes of top 10 coefficients
            top_10_features = [features[i] for i in top_10_indices]

            # Refit the best model with the top 10 features
            X_top_10 = X[top_10_features]
            scaler.fit(X_top_10)  # Refit scaler with the top 10 features
            X_top_10_normalized = scaler.transform(X_top_10)

            best_model.fit(X_top_10_normalized, y)  # Refit the best model with the top 10 features

            # Calculate accuracy for the best model using the top 10 features
            predictions_top_10 = best_model.predict(X_test[:, top_10_indices])
            correct_predictions_top_10 = (
                (predictions_top_10 >= y_test - range_threshold) & (predictions_top_10 <= y_test + range_threshold)).sum()
            total_predictions_top_10 = len(predictions_top_10)
            accuracy_top_10 = correct_predictions_top_10 / total_predictions_top_10

            # Store the best model, top 10 features, accuracy, and range threshold for the player and target variable
            player_best_models[player][target_variable] = {'model': best_model,
                                                            'top_10_features': top_10_features,
                                                            'score': accuracy_top_10,
                                                            'range_threshold': range_threshold}

# Iterate through each player in 'today_data'
points_predictions = {'Player': [], 'Floor': [], 'Points': [], 'Ceiling': [], 'R2_Score': [], 'Accuracy': []}
rebounds_predictions = {'Player': [], 'Floor': [], 'Rebounds': [], 'Ceiling': [], 'R2_Score': [], 'Accuracy': []}
assists_predictions = {'Player': [], 'Floor': [], 'Assists': [], 'Ceiling': [], 'R2_Score': [], 'Accuracy': []}

for player, player_data in today_data.groupby('Player'):
    if player in player_best_models:
        selected_models = player_best_models[player]
        if selected_models:
            for target_variable, model_info in selected_models.items():
                model = model_info['model']
                r_squared = model_info.get('score', 0)

                if model and r_squared > 0.0:
                    top_features = model_info.get('top_10_features', [])
                    X_player_today = player_data[top_features]

                    if not X_player_today.empty:
                        scaler = StandardScaler()
                        X_player_today_normalized = scaler.fit_transform(X_player_today)

                        prediction = model.predict(X_player_today_normalized)
                        range_threshold = model_info.get('range_threshold', 0)
                        floor = round(prediction[0] - range_threshold, 2)
                        projection = round(prediction[0], 2)
                        ceiling = round(prediction[0] + range_threshold, 2)

                        # Append predictions
                        if target_variable == 'PTS':
                            points_predictions['Player'].append(player)
                            points_predictions['R2_Score'].append(r_squared)
                            points_predictions['Floor'].append(float(floor))
                            points_predictions['Points'].append(float(projection))
                            points_predictions['Ceiling'].append(float(ceiling))
                            points_predictions['Accuracy'].append(r_squared)
                        elif target_variable == 'REB':
                            rebounds_predictions['Player'].append(player)
                            rebounds_predictions['R2_Score'].append(r_squared)
                            rebounds_predictions['Floor'].append(float(floor))
                            rebounds_predictions['Rebounds'].append(float(projection))
                            rebounds_predictions['Ceiling'].append(float(ceiling))
                            rebounds_predictions['Accuracy'].append(r_squared)
                        elif target_variable == 'AST':
                            assists_predictions['Player'].append(player)
                            assists_predictions['R2_Score'].append(r_squared)
                            assists_predictions['Floor'].append(float(floor))
                            assists_predictions['Assists'].append(float(projection))
                            assists_predictions['Ceiling'].append(float(ceiling))
                            assists_predictions['Accuracy'].append(r_squared)

# Convert the lists of predictions to DataFrames
df_points_predictions = pd.DataFrame(points_predictions)
df_rebounds_predictions = pd.DataFrame(rebounds_predictions)
df_assists_predictions = pd.DataFrame(assists_predictions)

# Sort DataFrames by R2 Score in descending order
df_points_pred = df_points_predictions.sort_values(by='R2_Score', ascending=False)
df_rebounds_pred = df_rebounds_predictions.sort_values(by='R2_Score', ascending=False)
df_assists_pred = df_assists_predictions.sort_values(by='R2_Score', ascending=False)

# Display the sorted DataFrames
print(df_points_pred)
print(df_rebounds_pred)
print(df_assists_pred)


['Al Horford' 'Daniel Gafford' 'Dereck Lively II' 'Derrick Jones Jr.'
 'Derrick White' 'Jaden Hardy' 'Jaylen Brown' 'Jayson Tatum' 'Josh Green'
 'Jrue Holiday' 'Kristaps Porzingis' 'Kyrie Irving' 'Luka Doncic'
 'Payton Pritchard' 'Sam Hauser']
Al Horford
Daniel Gafford
Dereck Lively II
Derrick Jones Jr.
Derrick White
Jaden Hardy
Jaylen Brown
Jayson Tatum
Josh Green
Jrue Holiday
Kristaps Porzingis
Kyrie Irving
Luka Doncic
Payton Pritchard
Sam Hauser
              Player  Floor  Points  Ceiling  R2_Score  Accuracy
2      Derrick White   0.49    0.54     0.58  1.000000  1.000000
5       Jrue Holiday   0.31    0.34     0.38  0.800000  0.800000
6       Kyrie Irving   0.64    0.71     0.78  0.714286  0.714286
7        Luka Doncic   0.79    0.88     0.96  0.666667  0.666667
3       Jaylen Brown   0.64    0.71     0.78  0.600000  0.600000
4       Jayson Tatum   0.59    0.67     0.74  0.500000  0.500000
0         Al Horford   0.34    0.37     0.41  0.333333  0.333333
1  Derrick Jones Jr.   0.30

In [None]:
t = {k: v for k, v in player_best_models.items() if v}
my_dict = {k: t[k] for k in set(t)}


# Remove duplicates using dictionary comprehension
player_best_models = {k: v for k, v in my_dict.items() if list(my_dict.keys()).count(k) == 1}

# Filter out entries with None values for 'PTS', 'REB', or 'AST'
player_best_models = {
    player: stats
    for player, stats in player_best_models.items()
    if all(stats.get(key) is not None for key in ['PTS', 'REB', 'AST'])
}

In [None]:
import pandas as pd

# Create empty lists to store player scores for each category
pts_scores = []
reb_scores = []
ast_scores = []

# Iterate through each player in the player_best_models dictionary
for player, stats in player_best_models.items():
    if 'PTS' in stats and stats['PTS'] is not None and 'score' in stats['PTS']:
        pts_score = stats['PTS']['score']  # Get the PTS score for each player
        pts_scores.append({'Player': player, 'PTS_Score': pts_score})

    if 'REB' in stats and stats['REB'] is not None and 'score' in stats['REB']:
        reb_score = stats['REB']['score']  # Get the REB score for each player
        reb_scores.append({'Player': player, 'REB_Score': reb_score})

    if 'AST' in stats and stats['AST'] is not None and 'score' in stats['AST']:
        ast_score = stats['AST']['score']  # Get the AST score for each player
        ast_scores.append({'Player': player, 'AST_Score': ast_score})

# Create DataFrames from the player scores for each category
pts_scores_df = pd.DataFrame(pts_scores)
reb_scores_df = pd.DataFrame(reb_scores)
ast_scores_df = pd.DataFrame(ast_scores)

# Find top 5 players based on PTS score
if not pts_scores_df.empty:
    top_5_pts_players = pts_scores_df.nlargest(5, 'PTS_Score')
else:
    print("No available PTS scores for any player.")

# Find top 5 players based on REB score
if not reb_scores_df.empty:
    top_5_reb_players = reb_scores_df.nlargest(5, 'REB_Score')
else:
    print("\nNo available REB scores for any player.")

# Find top 5 players based on AST score
if not ast_scores_df.empty:
    top_5_ast_players = ast_scores_df.nlargest(5, 'AST_Score')
else:
    print("\nNo available AST scores for any player.")


#Import NBA schedule today to get current games from ESPN.com

In [None]:
import re

team_mapping = {
    'Atlanta': 'ATL', 'Boston': 'BOS', 'Brooklyn': 'BKN', 'Charlotte': 'CHA',
    'Chicago': 'CHI', 'Cleveland': 'CLE', 'Dallas': 'DAL', 'Denver': 'DEN',
    'Detroit': 'DET', 'Golden State': 'GSW', 'Houston': 'HOU', 'Indiana': 'IND',
    'LA': 'LAC', 'Los Angeles': 'LAL', 'Memphis': 'MEM', 'Miami': 'MIA',
    'Milwaukee': 'MIL', 'Minnesota': 'MIN', 'New Orleans': 'NOP', 'New York': 'NYK',
    'Oklahoma City': 'OKC', 'Orlando': 'ORL', 'Philadelphia': 'PHI', 'Phoenix': 'PHX',
    'Portland': 'POR', 'Sacramento': 'SAC', 'San Antonio': 'SAS', 'Toronto': 'TOR',
    'Utah': 'UTA', 'Washington': 'WAS'
}

text = """
Dallas
NBA Finals - Game 1, Series starts 6/6
  @

Boston
8:30 PM
ABC
Tickets as low as $711
Line: BOS -6.5
O/U: 216.5
"""

lines = text.split('\n')

# Remove empty lines and header rows
relevant_lines = [line.strip() for line in lines if line.strip() and line.strip() != 'MATCHUP' and line.strip() != 'TIME' and line.strip() != 'TV' and line.strip() != 'TICKETS']

team_abbreviations = [team_mapping[line] for line in relevant_lines if line in team_mapping]
team_opponent_pairs = [(team_abbreviations[i], team_abbreviations[i + 1]) for i in range(0, len(team_abbreviations), 2)]
team_opponent_pairs


[('DAL', 'BOS')]

#Get players team

In [None]:
player_teams = pd.read_csv('/content/drive/MyDrive/PlayerTeam.csv')

predict_data_temp = []
for team, opponent in team_opponent_pairs:
    # Filter players by the team
    team_players = player_teams[player_teams['Team'] == team]['Player']
    # Create rows for each player with their respective opponent
    for player in team_players:
        predict_data_temp.append({'Player': player, 'OPP': opponent})

    # Similarly, now get the players from the opponent team
    opponent_players = player_teams[player_teams['Team'] == opponent]['Player']
    # Create rows for each player with their respective opponent (opponent team)
    for player in opponent_players:
        predict_data_temp.append({'Player': player, 'OPP': team})

# Create DataFrame
predict_data = pd.DataFrame(predict_data_temp)

In [None]:
predict_data = pd.merge(predict_data, player_positions, on='Player', how='left')
predict_data['Pos'] = predict_data['Pos'].replace(position_mapping)

# Filter positions to keep only 'PG', 'SG', 'SF', 'PF', 'C'
predict_data = predict_data[predict_data['Pos'].isin(['PG', 'SG', 'SF', 'PF', 'C'])]

In [None]:

predict_data = pd.merge(predict_data, PlayersMinutes, on='Player', how='left')

In [None]:
temp_df = predict_data.merge(DefenseVsPosition, left_on=['OPP', 'Pos'], right_on=['Team', 'Position'], how='left')

print(temp_df.shape)
opp_columns = {
    'PTS': 'OPP_PTSvsPos',
    'REB': 'OPP_REBvsPos',
    'AST': 'OPP_ASTvsPos',
    'STL': 'OPP_STLvsPos',
    'BLK': 'OPP_BLKvsPos',
    'TO': 'OPP_TOVvsPos'
}
# Rename columns in the DataFrame
today_data = temp_df.rename(columns={**opp_columns})

selected_columns = ['Player', 'Pos', 'OPP','OPP_PTSvsPos', 'OPP_REBvsPos', 'OPP_ASTvsPos', 'OPP_STLvsPos', 'OPP_BLKvsPos', 'OPP_TOVvsPos']
today_data = today_data[selected_columns]
today_data = pd.merge(today_data, TeamOpponent, left_on='OPP', right_on='TEAM')
today_data = pd.merge(today_data,pt_pwr,on=['OPP']).dropna()


(32, 15)


In [None]:
today_data.dropna(inplace=True)

In [None]:
import pandas as pd



# Assuming you have loaded your dataframes 'today_data' and 'PlayerMinutes'

# First, filter 'PlayerMinutes' to get players with MIN greater than 20
players_with_min_gt_20 = PlayersMinutes[PlayersMinutes['MIN'] > 20]['Player'].tolist()

# Now, filter 'today_data' based on the players in 'PlayerMinutes' with MIN > 20
today_data = today_data[today_data['Player'].isin(players_with_min_gt_20)]


today_data['Pos'].unique()


array(['SF', 'PG', 'C'], dtype=object)

#Predict today's players previous 5 games

In [None]:
import pandas as pd

# Assuming 'data' contains the game logs for each player
# Here, 'data' is your existing DataFrame containing game logs

# Sort the data by player and game date
data_sorted = data.sort_values(by=['Player', 'GAME_DATE'], ascending=[True, False])

# Group the sorted data by player and get the last 5 games for each player
last_5_games_per_player = data_sorted.groupby('Player').head(10)

# Select only the numeric columns for calculating the mean
numeric_columns = ['PTS', 'REB', 'AST']

# Calculate average stats per minute for the last 5 games for each player
average_stats_per_player = last_5_games_per_player.groupby('Player')[numeric_columns].mean()

# Rename columns for clarity
average_stats_per_player.columns = ['Avg_Points_Prev_5', 'Avg_Rebounds_Prev_5', 'Avg_Assists_Prev_5']

# Reset index to make 'Player' a regular column
average_stats_per_player.reset_index(inplace=True)

# Print the final DataFrame
print(average_stats_per_player)

average_stats_per_minute = average_stats_per_player.copy()


                Player  Avg_Points_Prev_5  Avg_Rebounds_Prev_5  \
0           Al Horford           0.440394             0.205796   
1       Daniel Gafford           0.428911             0.250812   
2     Dereck Lively II           0.464776             0.308630   
3    Derrick Jones Jr.           0.308162             0.118322   
4        Derrick White           0.423110             0.121330   
5          Jaden Hardy           0.619729             0.129467   
6         Jaylen Brown           0.666657             0.159671   
7         Jayson Tatum           0.700920             0.198065   
8           Josh Green           0.219372             0.138589   
9         Jrue Holiday           0.326574             0.141645   
10  Kristaps Porzingis           0.631403             0.267049   
11        Kyrie Irving           0.721411             0.114806   
12         Luka Doncic           0.871772             0.288082   
13    Payton Pritchard           0.525217             0.119992   
14        

In [None]:
average_stats = average_stats_per_minute.copy()
today_data = pd.merge(today_data,average_stats,on = 'Player')

In [None]:
t = {k: v for k, v in player_best_models.items() if v}
my_dict = {k: t[k] for k in set(t)}


# Remove duplicates using dictionary comprehension
player_best_models = {k: v for k, v in my_dict.items() if list(my_dict.keys()).count(k) == 1}

# Filter out entries with None values for 'PTS', 'REB', or 'AST'
player_best_models = {
    player: stats
    for player, stats in player_best_models.items()
    if all(stats.get(key) is not None for key in ['PTS', 'REB', 'AST'])
}


In [None]:
t = {k: v for k, v in player_best_models.items() if v}
my_dict = {k: t[k] for k in set(t)}


# Remove duplicates using dictionary comprehension
player_best_models = {k: v for k, v in my_dict.items() if list(my_dict.keys()).count(k) == 1}

# Filter out entries with None values for 'PTS', 'REB', or 'AST'
player_best_models = {
    player: stats
    for player, stats in player_best_models.items()
    if all(stats.get(key) is not None for key in ['PTS', 'REB', 'AST'])
}


#Predict today's player props (per minute)

In [None]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Assuming your dataframes and models are already defined
# today_data, player_best_models, y_train, y_pred


# Create separate dictionaries for points, rebounds, assists predictions
points_predictions = {'Player': [], 'Floor':[], 'Points':[], 'Ceiling': [], 'R2_Score': [], 'Accuracy': []}
rebounds_predictions = {'Player': [], 'Floor':[], 'Rebounds':[], 'Ceiling': [], 'R2_Score': [], 'Accuracy': []}
assists_predictions = {'Player': [], 'Floor':[], 'Assists':[], 'Ceiling': [], 'R2_Score': [], 'Accuracy': []}

# Example function to get the actual value (you need to implement this based on your data)
def get_actual_value(player, target_variable):
    # This function should return the actual value for the player and target variable
    # For now, it's just a placeholder
    return 0

# Iterate through each player in 'today_data'
for player, player_data in today_data.groupby('Player'):
    if player in player_best_models:
        selected_models = player_best_models[player]
        if selected_models:
            for target_variable, model_info in selected_models.items():
                model = model_info['model']
                r_squared = model_info.get('score', 0)

                if model and r_squared > 0.0:
                    top_features = model_info.get('top_10_features', [])
                    X_player_today = player_data[top_features]

                    if not X_player_today.empty:
                        scaler = StandardScaler()
                        X_player_today_normalized = scaler.fit_transform(X_player_today)

                        prediction = model.predict(X_player_today_normalized)
                        range_threshold = model_info.get('range_threshold', 0)
                        floor = round(prediction[0] - range_threshold, 2)
                        projection = round(prediction[0], 2)
                        ceiling = round(prediction[0] + range_threshold, 2)

                        # Calculate prediction intervals
                        lower_bound = prediction[0] - confidence_interval
                        upper_bound = prediction[0] + confidence_interval

                        # Example: Check if actual value falls within the interval
                        actual_value = get_actual_value(player, target_variable)
                        is_accurate = lower_bound <= actual_value <= upper_bound

                        # Append predictions and accuracy metric
                        if target_variable == 'PTS':
                            points_predictions['Player'].append(player)
                            points_predictions['R2_Score'].append(r_squared)
                            points_predictions['Floor'].append(float(floor))
                            points_predictions['Points'].append(float(projection))
                            points_predictions['Ceiling'].append(float(ceiling))
                            points_predictions['Accuracy'].append(is_accurate)
                        elif target_variable == 'REB':
                            rebounds_predictions['Player'].append(player)
                            rebounds_predictions['R2_Score'].append(r_squared)
                            rebounds_predictions['Floor'].append(float(floor))
                            rebounds_predictions['Rebounds'].append(float(projection))
                            rebounds_predictions['Ceiling'].append(float(ceiling))
                            rebounds_predictions['Accuracy'].append(is_accurate)
                        elif target_variable == 'AST':
                            assists_predictions['Player'].append(player)
                            assists_predictions['R2_Score'].append(r_squared)
                            assists_predictions['Floor'].append(float(floor))
                            assists_predictions['Assists'].append(float(projection))
                            assists_predictions['Ceiling'].append(float(ceiling))
                            assists_predictions['Accuracy'].append(is_accurate)

# Convert the lists of predictions to DataFrames
df_points_predictions = pd.DataFrame(points_predictions)
df_rebounds_predictions = pd.DataFrame(rebounds_predictions)
df_assists_predictions = pd.DataFrame(assists_predictions)

# Sort DataFrames by R2 Score in descending order
df_points_pred = df_points_predictions.sort_values(by='R2_Score', ascending=False)
df_rebounds_pred = df_rebounds_predictions.sort_values(by='R2_Score', ascending=False)
df_assists_pred = df_assists_predictions.sort_values(by='R2_Score', ascending=False)


NameError: name 'confidence_interval' is not defined

In [None]:
import pandas as pd

# Set display options to show all rows
pd.set_option('display.max_rows', None)

# Sort Points Predictions by 'Points' and round the values to 1 decimal place
df_points_pred = df_points_pred.sort_values(by='Points', ascending=False).round(3)

# Sort Rebounds Predictions by 'Rebounds' and round the values to 1 decimal place
df_rebounds_pred = df_rebounds_pred.sort_values(by='Rebounds', ascending=False).round(3)

# Sort Assists Predictions by 'Assists' and round the values to 1 decimal place
df_assists_pred = df_assists_pred.sort_values(by='Assists', ascending=False).round(3)

print("Points Predictions:")
print(df_points_pred)

print("\nRebounds Predictions:")
print(df_rebounds_pred)

print("\nAssists Predictions:")
print(df_assists_pred)


Points Predictions:
              Player  Floor  Points  Ceiling  R2_Score  Accuracy
7        Luka Doncic   0.79    0.88     0.96     0.667     0.667
6       Kyrie Irving   0.64    0.71     0.78     0.714     0.714
3       Jaylen Brown   0.64    0.71     0.78     0.600     0.600
4       Jayson Tatum   0.59    0.67     0.74     0.500     0.500
2      Derrick White   0.49    0.54     0.58     1.000     1.000
0         Al Horford   0.34    0.37     0.41     0.333     0.333
5       Jrue Holiday   0.31    0.34     0.38     0.800     0.800
1  Derrick Jones Jr.   0.30    0.33     0.37     0.333     0.333

Rebounds Predictions:
              Player  Floor  Rebounds  Ceiling  R2_Score  Accuracy
4       Jayson Tatum   0.32      0.34     0.36     1.000     1.000
7        Luka Doncic   0.24      0.27     0.30     0.167     0.167
0         Al Horford   0.20      0.22     0.24     0.333     0.333
3       Jaylen Brown   0.16      0.18     0.20     1.000     1.000
2      Derrick White   0.13      0.15

In [None]:
averages = playerlogs.groupby('Player').mean(numeric_only=True).reset_index()

In [None]:
# Merge averages DataFrame with PlayerMinutes DataFrame
averages_merged = averages.merge(PlayersMinutes, left_on='Player', right_on='Player', how='left')



#Multiply predictions by player's projected minutes to get final projections

In [None]:
# Merge predictions with PlayerMinutes on 'Player'
df_points_predictions_merged = df_points_pred.merge(PlayersMinutes, left_on='Player', right_on='Player', how='left')
df_rebounds_predictions_merged = df_rebounds_pred.merge(PlayersMinutes, left_on='Player', right_on='Player', how='left')
df_assists_predictions_merged = df_assists_pred.merge(PlayersMinutes, left_on='Player', right_on='Player', how='left')

# Calculate predicted statistics by multiplying predictions by minutes
df_points_predictions_merged['PTS_prediction_minutes'] = df_points_predictions_merged['Points'] * df_points_predictions_merged['MIN']
df_rebounds_predictions_merged['REB_prediction_minutes'] = df_rebounds_predictions_merged['Rebounds'] * df_rebounds_predictions_merged['MIN']
df_assists_predictions_merged['AST_prediction_minutes'] = df_assists_predictions_merged['Assists'] * df_assists_predictions_merged['MIN']

# Print the results including R2_Score
print("PTS predictions with R2 score above 0.59:")
print(df_points_predictions_merged[['Player', 'PTS_prediction_minutes', 'R2_Score']].sort_values(by='PTS_prediction_minutes', ascending=False))

print("\nREB predictions with R2 score above 0.59:")
print(df_rebounds_predictions_merged[['Player', 'REB_prediction_minutes', 'R2_Score']].sort_values(by='REB_prediction_minutes', ascending=False))

print("\nAST predictions with R2 score above 0.59:")
print(df_assists_predictions_merged[['Player', 'AST_prediction_minutes', 'R2_Score']].sort_values(by='AST_prediction_minutes', ascending=False))


PTS predictions with R2 score above 0.59:
              Player  PTS_prediction_minutes  R2_Score
0        Luka Doncic                   36.08     0.667
1       Kyrie Irving                   28.40     0.714
3       Jayson Tatum                   27.47     0.500
2       Jaylen Brown                   26.98     0.600
4      Derrick White                   19.98     1.000
6       Jrue Holiday                   13.26     0.800
5         Al Horford                   11.84     0.333
7  Derrick Jones Jr.                    7.59     0.333

REB predictions with R2 score above 0.59:
              Player  REB_prediction_minutes  R2_Score
0       Jayson Tatum                   13.94     1.000
1        Luka Doncic                   11.07     0.167
2         Al Horford                    7.04     0.333
3       Jaylen Brown                    6.84     1.000
5       Kyrie Irving                    5.60     0.286
4      Derrick White                    5.55     0.833
6       Jrue Holiday               