In [24]:
import numpy as np
import pandas as pd
from dill import dump, load
from nba_api.stats.static import players, teams
from nba_api.stats.endpoints import leaguegamelog, leaguedashplayerstats
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer

# Get player gamelogs

Select the seasons to consider for our dataset. To start with, I will just pick the past five seasons.

In [25]:
#to start with, here are the last 5 seasons
season_list = [
    '2022-21',
    '2021-20',
    '2020-19',
    '2019-18',
    '2018-17'
]

Load all of the gamelogs, concatenate over each year, drop a couple of columns, and pickle them

In [26]:
def getplayoffandregular(year):
    #get playoff stats
    df_reg = leaguegamelog.LeagueGameLog(
        season=year,
        season_type_all_star='Regular Season',
        player_or_team_abbreviation='P'
    ).get_data_frames()[0]
    #add a feature indicating that these rows are regular season
    df_reg['PLAYOFFS'] = 0
    
    #get regular season stats
    df_playoffs = leaguegamelog.LeagueGameLog(
        season=year,
        season_type_all_star='Playoffs',
        player_or_team_abbreviation='P'
    ).get_data_frames()[0]
    #add a feature indicating that these rows are playoffs
    df_playoffs['PLAYOFFS'] = 1
    
    #join them together
    df_all = pd.concat([df_reg,df_playoffs])
    return df_all

In [27]:
gamelogs = pd.DataFrame()
for year in season_list:
    df = getplayoffandregular(year)
    
    if gamelogs.empty:
        gamelogs = df
    else:
        gamelogs = pd.concat([gamelogs,df])

gamelogs.sort_values(['PLAYER_NAME','GAME_DATE'], ascending=[True,False], inplace=True)
gamelogs.drop(['FANTASY_PTS','VIDEO_AVAILABLE'],axis=1, inplace=True)

In [28]:
#get opponent name from the matchup string
gamelogs['OPPONENT'] = gamelogs['MATCHUP'].str.split(' vs. | @ ',expand=True)[1]

#get the opponent ID from the opponent abbreviation
def GetTeamID(s):
    return teams.find_team_by_abbreviation(s)['id']
gamelogs['OPPONENT_ID'] = gamelogs['OPPONENT'].apply(GetTeamID)

In [29]:
#pickle the gamelogs
title = ".\logs\All_%s-%s_Gamelogs.pkl" % (season_list[-1][0:4],season_list[0][-2:])
gamelogs.to_pickle(title)

# Get eligible players by position

Iterate over three position types: `G, F, C` and sort by players who have played 500+ minutes and 30+ games.

**consider changing this to something more robust to account for players who have been hurt, or the beginning of the season**

In [30]:
#filter by games and minutes played
def gp_min_filter(df,min_limit = 500, gp_limit = 30):
    return df[(df['MIN'] >= min_limit) & (df['GP'] >= gp_limit)]

In [31]:
eligible_players = pd.DataFrame()

for position in ['G', 'F', 'C']:
    
    df = leaguedashplayerstats.LeagueDashPlayerStats(
    player_position_abbreviation_nullable=position).get_data_frames()[0]
    
    #drop all of the ranking columns
    df.drop([i for i in df.columns if 'RANK' in i], axis=1, inplace=True)
    #drop all of the PCT columns, because they are linear combinations
    df.drop([i for i in df.columns if 'PCT' in i], axis=1, inplace=True)
    
    df = gp_min_filter(df)
    df['POSITION'] = position
    
    df = df[['PLAYER_NAME','PLAYER_ID','AGE','POSITION','TEAM_ABBREVIATION']]
    
    if eligible_players.empty:
        eligible_players = df
    else:
        eligible_players = pd.concat([eligible_players,df])

In [32]:
#pickle the players list for later if necessary

eligible_players.to_pickle(".\logs\Eligible_Players.pkl")

# Join the player position and gamelogs

Get 1 dataframe with every game played by an eligible player in the last five seasons, and their position (to split on later)

n.b. `df` must be sorted by `GAME_DATE` ascending or else the rolling average transformations won't work!

In [134]:
eligible_players = pd.read_pickle("./logs/Eligible_Players.pkl")
gamelogs = pd.read_pickle("./logs/All_2018-21_Gamelogs.pkl")

df = pd.merge(eligible_players,gamelogs, on=('PLAYER_ID','PLAYER_NAME','TEAM_ABBREVIATION'))
df.sort_values(['PLAYER_NAME','GAME_DATE'], inplace=True) 

# Make stat transformers

### Current stats
`ShiftStats` transformer selects the appropriate personal stats and shifts them by -1 so that for each upcoming game you are using each player's last performance

In [34]:
class ShiftStats(BaseEstimator, TransformerMixin):
    def __init__(self,stats, shiftwidth=1):
        self.stats = stats
        self.shiftwidth = shiftwidth
    
    def fit(self, X, y=None):
        return self
    
    def transform(self,X):
        stats_shifted = X[['PLAYER_ID']+self.stats].groupby(['PLAYER_ID']).shift(self.shiftwidth)
        return stats_shifted

### Cumulative stats
`MakeCumulativeStats` is a transformer that does two things:
1. Makes a cumulative average from all previous game per player. These stats are appended with `_AVG` (e.g.  `PTS_AVG`,  `MIN_AVG`, `PF_AVG`, etc.)

2. Makes a rolling average of the last n (default 4) games, with a Gaussian smoothing to weigh the more recent games more. These stats are appended with `_RECENT` (e.g. `PTS_RECENT`,  `MIN_RECENT`, `PF_RECENT`, etc.)

All of these stats are then shifted by -1, so that for each upcoming game you are using the rolling and accumulated stats from the last game

In [303]:
class MakeCumulativeStats(BaseEstimator, TransformerMixin):

    def __init__(self, stats, window = 4):
        self.stats = stats
        self.window = window
        
    def fit(self, X, y=None):
        self.game_number = X[['PLAYER_ID']].groupby(['PLAYER_ID']).cumcount() + 1
        return self
    
    def transform(self, X):
        df = pd.DataFrame()
        for stat in self.stats:
           
            #make cumulative sum of counting stats and divide by game number
            df[stat+'_AVG'] = X[['PLAYER_ID',stat]].groupby(['PLAYER_ID']).cumsum()
            df[stat+'_AVG'] = df[stat+'_AVG'] / self.game_number
            
            #make rolling average of counting stats. Uses a window of 4 games, and a Gaussian window with std 3 weight
            temp = (X[['PLAYER_ID',stat]]
                    .groupby(['PLAYER_ID'])
                    .rolling(self.window, min_periods=1, win_type='gaussian', closed='left')
                    .mean(std=3)
                   )
            
            df[stat+'_RECENT'] = temp.reset_index(level=0)[stat]
        return df.sort_index(ascending=False, inplace=True)

### Opponent stats
`MakeOpponentStats` is a transformer that does two things:
1. Get the stats versus the last matchup with the current opponent. These stats are not shifted, but use the `closed = 'left'` parameter to only account for previous matchups. Consequently, these numbers do not need to be shifted. These stats are appended with `_PREV_VS_OPP` (e.g.  `PTS_PREV_VS_OPP`,  `MIN_PREV_VS_OPP`, `PF_PREV_VS_OPP`, etc.).

2. Makes a rolling average of the last n (default 3) games versus a the current opponent. If the player has not played n games against the opponent, the window shrinks to accommodate. These stats are not shifted, but use the `closed = 'left'` parameter to only account for previous matchups. Consequently, these numbers do not need to be shifted. These stats are appended with `_PREV3_VS_OPP` (e.g.  `PTS_PREV3_VS_OPP`,  `MIN_PREV3_VS_OPP`, `PF_PREV3_VS_OPP`, etc.).

In [304]:
class MakeOpponentStats(BaseEstimator, TransformerMixin):
    def __init__(self, stats, window = 3):
        self.stats = stats
        self.window = window
    
    def fit(self, X, y=None):
        self.game_number = X[['PLAYER_ID']].groupby(['PLAYER_ID']).cumcount() + 1
        return self
    
    def transform(self,X):
        df = pd.DataFrame()
        for stat in self.stats:
            #last matchup against opponent
            temp = (X[['PLAYER_ID','OPPONENT', stat]]
                    .groupby(['PLAYER_ID','OPPONENT'])
                    .rolling(1,min_periods=1,closed='left')
                    .mean()
                   ) 
                    
            df[stat+'_PREV_VS_OPP'] = temp.reset_index(level=[0,1])[stat] #reset multilevel indices

            #avg 3 past matchups against opponent
            temp = (X[['PLAYER_ID','OPPONENT', stat]]
                    .groupby(['PLAYER_ID','OPPONENT'])
                    .rolling(self.window,min_periods=1,closed='left')
                    .mean()
                   )
            
            df[stat+'_PREV3_VS_OPP'] = temp.reset_index(level=[0,1])[stat]
            
        return df.sort_index(ascending=False)

### Per-position estimator
A "group-by" estimator applies the pipeline defined below to each player position (`G`, `F`, `C`). Based on the `ts` miniproject.

In [286]:

class GroupbyEstimator(BaseEstimator, RegressorMixin):
    
    def __init__(self, column, estimator_factory):
        # column is the value to group by; estimator_factory can be
        # called to produce estimators
        self.column = column
        self.estimator_factory = estimator_factory
        self.predictors = {}
        self.predictions = {}
        
    def helper_fit(self, X, y):
        #get city name
        position_name = X[self.column].iloc[0]
        
        #index y by city index to get "grouped" y
        y_position = y.loc[X.index]
        
        #apply estimator to grouped X and y
        self.predictors[position_name] = self.estimator_factory().fit(X,y_position)
        
    def fit(self, X, y):
        X_group = X.groupby(self.column)
        X_group.apply(self.helper_fit, y)
        
        return self

    def helper_predict(self, X):
        #get position name
        position_name = X[self.column].iloc[0]
        #predict make a predictor for a given city
        self.predictions[position_name] = pd.Series(self.predictors[position_name].predict(X))
        return self.predictors[position_name]
        
    def predict(self, X):
        X_group = X.groupby(self.column)
        X_group.apply(self.helper_predict)
        return pd.concat(self.predictions)

# Make pipeline 

In [305]:
def stat_pipeline():
    #List of stats that should be accumulated. Might change later
    stats_to_accumulate = ['MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM',
       'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV',
       'PF', 'PTS', 'PLUS_MINUS']

    #one-hot encode the player's team, the opponent team, and whether it's a playoff or regular game
    ct_features = ColumnTransformer([
        ('OHE', OneHotEncoder(), ['TEAM_ABBREVIATION', 'OPPONENT', 'PLAYOFFS'])
    ])

    #join together the OHE features, and the 3 transformers for current, cumulative, and opponent stats
    all_features = FeatureUnion([
        ('ct_features', ct_features),
        ('one_game_ago', ShiftStats(stats_to_accumulate,shiftwidth=1)),
        ('two_games_ago', ShiftStats(stats_to_accumulate,shiftwidth=2)),
        ('three_games_ago', ShiftStats(stats_to_accumulate,shiftwidth=3)),
        ('cum_stats', MakeCumulativeStats(stats_to_accumulate)),
        ('opp_stats', MakeOpponentStats(stats_to_accumulate))
    ])
    
    pipe = Pipeline([
        ('features', all_features),
        ('imputer', SimpleImputer()),
        ('regressor', RidgeCV(alphas=np.logspace(-4,4,10)))
    ])
    return pipe

In [306]:
model = GroupbyEstimator('POSITION', stat_pipeline).fit(df, df['PTS'])

In [289]:
model.score(df,df['PTS'])

-0.5049544068937737

In [290]:
#how to get pipeline transformer
df_G_transform = model.predictors['G'].named_steps['features'].transform(df)

## Make the predictions

In [291]:
#predict the points based on pipe model
pts_predict = model.predict(df)

# Interactive chart of past performance compared with prediction

## Put data in long form and save as JSON

In [292]:
import altair as alt
import json
from datetime import date
import urllib

prediction_df = df.copy()

y_fit = pts_predict[df['POSITION'].unique()] #re-order the prediction by position to match the original dataframe
y_fit.reset_index(drop=True)
prediction_df['PTS_PREDICT'] = y_fit.reset_index(drop=True)

prediction_df = prediction_df.groupby(['PLAYER_NAME','GAME_DATE','OPPONENT']).agg({"PTS":'mean','PTS_PREDICT':'mean'}).reset_index()

#melt the dataframe into a long format for Altair convenience
to_melt = ["PTS","PTS_PREDICT"] #variables to melt
to_keep = prediction_df.columns.difference(to_melt) #variables to keep

df_long = pd.melt(prediction_df, id_vars=to_keep, value_vars=to_melt, var_name="MODEL", value_name="VALUE")

#save dataframe as JSON because otherwise too long for Altair
url = f"./logs/Predictions_Latest.json"
df_long.to_json(url, orient='records')

## Get regular season and playoff start/end dates

In [293]:
cutoffs = df.groupby(["SEASON_ID", "PLAYOFFS"]).agg({"GAME_DATE":["min","max"]})
cutoffs.columns = ['start', 'stop']
cutoffs.reset_index(inplace=True)
cutoffs.replace([0,1], ['Regular Season','Playoffs'], inplace=True)

## Put it all together to plot

In [294]:
#player dropdown menu
players = df_long.PLAYER_NAME.sort_values().unique()
player_dropdown = alt.binding_select(options=players, name='Player: ')
selection = alt.selection_single(fields=['PLAYER_NAME'], bind=player_dropdown, init={'PLAYER_NAME':'RJ Barrett'})

nearest = alt.selection(type='single', nearest=True, on='mouseover',
                        fields=['GAME_DATE'], empty='none')

domain = ['2021-10-19',date.today().strftime("%Y-%m-%d")]

#make chart of points (actual vs. predicted)
chart = alt.Chart(url, title='Actual vs. Predicted Points').mark_line(point=True).encode(
    x = alt.X('GAME_DATE:T', title='Game date', timeUnit='yearmonthdate', scale=alt.Scale(domain=domain)),
    y = alt.Y('VALUE:Q', title='Points'),
    color = 'MODEL:N',
    tooltip=['PLAYER_NAME:N','VALUE:Q', 'GAME_DATE:T', 'OPPONENT:N']
).add_selection(selection).transform_filter(selection).interactive()

#make chart of playoff vs. regular season regions to overlay
playoffs = (
    alt.Chart(
        cutoffs
    )
    .mark_rect(opacity=0.15)
    .encode(
        x='start:T',
        x2 = 'stop:T',
        y=alt.value(0),
        y2=alt.value(300),
        color=alt.Color('PLAYOFFS:N', legend=alt.Legend(title="Legend"))
    )
)

#chart.properties(width=800, height=300).interactive() + playoffs.properties(width=800, height=300)

final_chart = alt.layer(
    chart,
    playoffs
).properties(width=800, height=300).interactive()

final_chart.save(f"./logs/PTS_chart.html")

In [295]:
final_chart

# Scrape the DraftKings lines

This will go to DraftKings and scrape the lines for every countable stat category, pickle them, and concatenate them into one dataframe of all stats. If for some reason one of these stat pages fails, it will pull the most recent cached stats. This will ensure that the project will still function even when there are no upcoming games (e.g. for demonstrations over the summer).

The contest lines are as follows:
 - points
 - rebounds
 - assists
 - threes made
 - blocks
 - steals
 - points+rebounds
 - points+assists
 - assists+rebounds
 - steals+blocks
 - points+rebounds+assists

In [71]:
from lxml import html
from bs4 import BeautifulSoup
from time import sleep
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

def urlBuilder(base, params):
    #selenium doesn't have a built-in URL builder for parameters, so stitch together base and parameters for full URL
    url = base
    url+="?"    
    if params:
        for k, v in params.items():
            url += (k+"="+v+"&")
    return url[:-1]

def parse(url):
    #scrape source code from rendered webpage
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    response = webdriver.Chrome(options=options, service=Service(ChromeDriverManager().install()))
    response.get(url)
    sourceCode = response.page_source
    return sourceCode


In [72]:
stat_dict = {'PTS':'points', 
             'REB':'rebounds',
             'AST':'assists',
             'FG3M':'threes',
             'BLK':'blocks',
             'STL':'steals',
             'PR':'pts-%2b-reb',
             'PA':'pts-%2b-ast',
             'AR':'ast-%2b-reb',
             'SB':'steals-%2b-blocks',
             'PRA':'pts,-reb-%26-ast'
            }

def getLines():
    base = "https://sportsbook.draftkings.com/leagues/basketball/88670846" #NBA league ID = 88670846
    df_all = pd.DataFrame(columns=["PLAYER"]) #make an empty dataframe with a PLAYERS column for joining to later
    for category in stat_dict.keys():
        params = {'category':'player-props',
                   'subcategory':stat_dict[category] #this can be changed for whatever contest you like
                  }
        #make URL and parse HTML with beautiful soup
        url = urlBuilder(base,params)
        soup = BeautifulSoup(parse(url),'html')
        #scrape the first table (should be the only table)
        x = soup.findAll("table")
        df = pd.DataFrame()
        
        for t in x: #iterate over all of the tables (i.e. concatenate all upcoming games)
            temp_df = pd.read_html(t.prettify())[0]
            df = pd.concat([df,temp_df])
        try: #try to load the line, over and under for the given stat
            #parse the line and odds from the dataframe
            df[[category+"_LINE",category+"_OVER"]] = df["OVER"].str.extract(r"[OU]([\W\w]*)([+|-]\d*)",expand=True).apply(pd.to_numeric)
            df[category+"_UNDER"] = df["UNDER"].str.extract(r"[OU][\W\w]*([+|-]\d*)",expand=True).apply(pd.to_numeric)
            df = df[["PLAYER", category+"_LINE", category+"_OVER", category+"_UNDER"]]
            df.to_pickle(f".\logs\LatestLine_{category}.pkl") #cache these stats in case there is failure later
        except KeyError: #if  stats cannot be loaded (because they haven't been posted), load cached ones
            print(f"Lines for {category} have not been posted yet for this contest. Please try again later.")
            print('In the mean time, we have loaded the last cached stats.')
            df = pd.read_pickle(f".\logs\LatestLine_{category}.pkl")
        df_all = df_all.merge(df, how="outer", on="PLAYER")
        
    df_all.to_pickle(f".\logs\LatestLine_All.pkl") #cache these stats in case there is failure later
    return df_all

### Get all of the team info about the matchup

In [81]:
import dateutil
import datetime

def GetMatchupDetails():
    base = "https://sportsbook.draftkings.com/leagues/basketball/88670846" #NBA league ID = 88670846
    params = {'category':'player-props',
               'subcategory':'points' #this can be changed for whatever contest you like
              }
    #make URL and parse HTML with beautiful soup
    url = urlBuilder(base,params)
    soup = BeautifulSoup(parse(url),'html')
    
    
    opponent_dict = dict()
    df_matchup = pd.DataFrame(columns=["GAME_DATE","HOME","AWAY"])
    
    matchups = soup.find_all("div", {"class":"sportsbook-event-accordion__title-wrapper"})
    game_dates = soup.find_all("span", {'class':'sportsbook-event-accordion__date'})

    for datestr, match in zip(game_dates,matchups):
        logos = match.find_all("img")

        away_str = logos[0]['src']
        away_team = away_str.split('/')[-1].split('.png')[0]

        home_str = logos[1]['src']
        home_team = home_str.split('/')[-1].split('.png')[0]
                
        if datestr.text.split()[0] == 'Today':
            d = datetime.date.today()
        elif datestr.text.split()[0] == 'Tomorrow':
            d = datetime.date.today() + datetime.timedelta(days=1)
        else:
            d = dateutil.parser.parse(datestr.text).date()

        opponent_dict[home_team] = (away_team, d)
        opponent_dict[away_team] = (home_team, d)
        
    return opponent_dict

In [83]:
matchup_dict = GetMatchupDetails()
lines = getLines()




[WDM] - Current google-chrome version is 102.0.5005
[WDM] - Get LATEST chromedriver version for 102.0.5005 google-chrome
[WDM] - Driver [C:\Users\Fred\.wdm\drivers\chromedriver\win32\102.0.5005.61\chromedriver.exe] found in cache





[WDM] - Current google-chrome version is 102.0.5005
[WDM] - Get LATEST chromedriver version for 102.0.5005 google-chrome
[WDM] - Driver [C:\Users\Fred\.wdm\drivers\chromedriver\win32\102.0.5005.61\chromedriver.exe] found in cache





[WDM] - Current google-chrome version is 102.0.5005
[WDM] - Get LATEST chromedriver version for 102.0.5005 google-chrome
[WDM] - Driver [C:\Users\Fred\.wdm\drivers\chromedriver\win32\102.0.5005.61\chromedriver.exe] found in cache





[WDM] - Current google-chrome version is 102.0.5005
[WDM] - Get LATEST chromedriver version for 102.0.5005 google-chrome
[WDM] - Driver [C:\Users\Fred\.wdm\drivers\chromedriver\win32\102.0.5005.61\chromedriver.exe] found in cache





[WDM] - Current google-chrome version is 102.0.5005
[WDM] - Get LATEST chromedriver version for 102.0.5005 google-chrome
[WDM] - Driver [C:\Users\Fred\.wdm\drivers\chromedriver\win32\102.0.5005.61\chromedriver.exe] found in cache





[WDM] - Current google-chrome version is 102.0.5005
[WDM] - Get LATEST chromedriver version for 102.0.5005 google-chrome
[WDM] - Driver [C:\Users\Fred\.wdm\drivers\chromedriver\win32\102.0.5005.61\chromedriver.exe] found in cache





[WDM] - Current google-chrome version is 102.0.5005
[WDM] - Get LATEST chromedriver version for 102.0.5005 google-chrome
[WDM] - Driver [C:\Users\Fred\.wdm\drivers\chromedriver\win32\102.0.5005.61\chromedriver.exe] found in cache





[WDM] - Current google-chrome version is 102.0.5005
[WDM] - Get LATEST chromedriver version for 102.0.5005 google-chrome
[WDM] - Driver [C:\Users\Fred\.wdm\drivers\chromedriver\win32\102.0.5005.61\chromedriver.exe] found in cache





[WDM] - Current google-chrome version is 102.0.5005
[WDM] - Get LATEST chromedriver version for 102.0.5005 google-chrome
[WDM] - Driver [C:\Users\Fred\.wdm\drivers\chromedriver\win32\102.0.5005.61\chromedriver.exe] found in cache





[WDM] - Current google-chrome version is 102.0.5005
[WDM] - Get LATEST chromedriver version for 102.0.5005 google-chrome
[WDM] - Driver [C:\Users\Fred\.wdm\drivers\chromedriver\win32\102.0.5005.61\chromedriver.exe] found in cache





[WDM] - Current google-chrome version is 102.0.5005
[WDM] - Get LATEST chromedriver version for 102.0.5005 google-chrome
[WDM] - Driver [C:\Users\Fred\.wdm\drivers\chromedriver\win32\102.0.5005.61\chromedriver.exe] found in cache





[WDM] - Current google-chrome version is 102.0.5005
[WDM] - Get LATEST chromedriver version for 102.0.5005 google-chrome
[WDM] - Driver [C:\Users\Fred\.wdm\drivers\chromedriver\win32\102.0.5005.61\chromedriver.exe] found in cache


In [272]:
def FutureStats(lines, eligible_players, matchup_dict):
    cols = ["PLAYER_NAME","PLAYER_ID","POSITION","GAME_DATE","TEAM_ABBREVIATION","OPPONENT","PLAYOFFS"]
    X = pd.merge(eligible_players, lines, left_on="PLAYER_NAME", right_on="PLAYER", sort=True)
    X['OPPONENT'] = X.apply(lambda x: matchup_dict[x.TEAM_ABBREVIATION][0], axis=1)
    X['GAME_DATE'] = X.apply(lambda x: matchup_dict[x.TEAM_ABBREVIATION][1], axis=1).astype(str)
    X['PLAYOFFS'] = 1
    
    return X[cols]

In [297]:
futures = FutureStats(lines, eligible_players, matchup_dict)
futures["PREDICTION"] = 1

In [298]:
players = futures['PLAYER_NAME'].unique()
df_upcoming = df[df['PLAYER_NAME'].isin(players)].copy()
#df_upcoming = df.copy()
df_upcoming["PREDICTION"] = 0

df_upcoming = pd.concat([df_upcoming,futures], ignore_index=True)
df_upcoming.sort_values(['PLAYER_NAME','GAME_DATE','PREDICTION'], inplace=True) 
#df_upcoming.sort_values('GAME_DATE')

In [308]:
df_upcoming

Unnamed: 0,PLAYER_NAME,PLAYER_ID,AGE,POSITION,TEAM_ABBREVIATION,SEASON_ID,TEAM_ID,TEAM_NAME,GAME_ID,GAME_DATE,...,STL,BLK,TOV,PF,PTS,PLUS_MINUS,PLAYOFFS,OPPONENT,OPPONENT_ID,PREDICTION
0,Al Horford,201143,36.0,F,BOS,22018,1.610613e+09,Boston Celtics,0021800001,2018-10-16,...,0.0,4.0,3.0,1.0,9.0,9.0,0,PHI,1.610613e+09,0
1,Al Horford,201143,36.0,C,BOS,22018,1.610613e+09,Boston Celtics,0021800001,2018-10-16,...,0.0,4.0,3.0,1.0,9.0,9.0,0,PHI,1.610613e+09,0
2,Al Horford,201143,36.0,F,BOS,22018,1.610613e+09,Boston Celtics,0021800019,2018-10-19,...,1.0,1.0,3.0,3.0,14.0,-10.0,0,TOR,1.610613e+09,0
3,Al Horford,201143,36.0,C,BOS,22018,1.610613e+09,Boston Celtics,0021800019,2018-10-19,...,1.0,1.0,3.0,3.0,14.0,-10.0,0,TOR,1.610613e+09,0
4,Al Horford,201143,36.0,F,BOS,22018,1.610613e+09,Boston Celtics,0021800028,2018-10-20,...,1.0,1.0,2.0,4.0,8.0,1.0,0,NYK,1.610613e+09,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3251,Stephen Curry,201939,34.0,G,GSW,42021,1.610613e+09,Golden State Warriors,0042100314,2022-05-24,...,1.0,0.0,2.0,2.0,20.0,-23.0,1,DAL,1.610613e+09,0
3252,Stephen Curry,201939,34.0,G,GSW,42021,1.610613e+09,Golden State Warriors,0042100315,2022-05-26,...,3.0,1.0,2.0,2.0,15.0,11.0,1,DAL,1.610613e+09,0
3253,Stephen Curry,201939,34.0,G,GSW,42021,1.610613e+09,Golden State Warriors,0042100401,2022-06-02,...,3.0,0.0,2.0,3.0,34.0,-9.0,1,BOS,1.610613e+09,0
3254,Stephen Curry,201939,34.0,G,GSW,42021,1.610613e+09,Golden State Warriors,0042100402,2022-06-05,...,3.0,0.0,2.0,2.0,29.0,24.0,1,BOS,1.610613e+09,0


In [309]:
upcoming_predict = model.predict(df_upcoming)

In [276]:
df_upcoming.shape

(58534, 36)

In [277]:
upcoming_predict.loc['C'].max()

35.131985434256904

In [311]:
upcoming_predict.max()

34.46100506911169

In [318]:
upcoming_predict.loc['G']

0       12.986319
1       13.574203
2       13.740558
3       14.452837
4       13.241258
          ...    
1576    24.190815
1577    23.008301
1578    21.604555
1579    22.278245
1580    23.830051
Length: 1581, dtype: float64

In [330]:
guard_upcoming = df_upcoming[df_upcoming['POSITION']=='G'][['PLAYER_NAME','GAME_DATE','OPPONENT','PTS']].reset_index(drop=True)

In [332]:
predict_upcoming = upcoming_predict.loc['G']

In [336]:
guard_upcoming.merge(predict_upcoming.rename('PTS_PREDICT'), left_index=True, right_index=True)

Unnamed: 0,PLAYER_NAME,GAME_DATE,OPPONENT,PTS,PTS_PREDICT
0,Derrick White,2022-02-11,DEN,15.0,12.986319
1,Derrick White,2022-02-13,ATL,14.0,13.574203
2,Derrick White,2022-02-15,PHI,11.0,13.740558
3,Derrick White,2022-02-16,DET,9.0,14.452837
4,Derrick White,2022-02-24,BKN,13.0,13.241258
...,...,...,...,...,...
1576,Stephen Curry,2022-05-24,DAL,20.0,24.190815
1577,Stephen Curry,2022-05-26,DAL,15.0,23.008301
1578,Stephen Curry,2022-06-02,BOS,34.0,21.604555
1579,Stephen Curry,2022-06-05,BOS,29.0,22.278245


In [280]:
#smart = df_upcoming.copy()
smart = df[df['PLAYER_NAME']=='Marcus Smart'].copy()
smart.replace(np.NaN,0,inplace=True)

In [281]:
stats_to_accumulate = ['MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM',
   'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV',
   'PF', 'PTS', 'PLUS_MINUS']

M = MakeOpponentStats(stats_to_accumulate)
#M = MakeCumulativeStats(stats_to_accumulate)

In [282]:
X = M.fit_transform(smart)
X

Unnamed: 0,MIN_PREV_VS_OPP,MIN_PREV3_VS_OPP,FGM_PREV_VS_OPP,FGM_PREV3_VS_OPP,FGA_PREV_VS_OPP,FGA_PREV3_VS_OPP,FG_PCT_PREV_VS_OPP,FG_PCT_PREV3_VS_OPP,FG3M_PREV_VS_OPP,FG3M_PREV3_VS_OPP,...,BLK_PREV_VS_OPP,BLK_PREV3_VS_OPP,TOV_PREV_VS_OPP,TOV_PREV3_VS_OPP,PF_PREV_VS_OPP,PF_PREV3_VS_OPP,PTS_PREV_VS_OPP,PTS_PREV3_VS_OPP,PLUS_MINUS_PREV_VS_OPP,PLUS_MINUS_PREV3_VS_OPP
20149,,,,,,,,,,,...,,,,,,,,,,
20148,,,,,,,,,,,...,,,,,,,,,,
20147,,,,,,,,,,,...,,,,,,,,,,
20146,,,,,,,,,,,...,,,,,,,,,,
20145,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19854,37.0,37.333333,6.0,5.666667,11.0,16.000000,0.545,0.369667,1.0,3.000000,...,0.0,0.333333,4.0,3.666667,6.0,4.000000,16.0,16.666667,-16.0,1.000000
19853,24.0,33.666667,1.0,5.000000,5.0,12.666667,0.200,0.369667,1.0,2.333333,...,0.0,0.333333,1.0,2.000000,4.0,3.666667,5.0,15.000000,13.0,9.333333
19852,40.0,33.666667,4.0,3.666667,15.0,10.333333,0.267,0.337333,1.0,1.000000,...,0.0,0.000000,2.0,2.333333,4.0,4.666667,14.0,11.666667,-12.0,-5.000000
19851,34.0,35.333333,8.0,6.333333,12.0,13.000000,0.667,0.500000,4.0,3.000000,...,0.0,0.000000,0.0,1.333333,1.0,3.000000,20.0,18.333333,11.0,7.666667


In [283]:
smart[['PLAYER_NAME','GAME_DATE','OPPONENT','MIN']].sort_index(ascending=False)

Unnamed: 0,PLAYER_NAME,GAME_DATE,OPPONENT,MIN
20149,Marcus Smart,2018-10-16,PHI,25
20148,Marcus Smart,2018-10-19,TOR,22
20147,Marcus Smart,2018-10-20,NYK,28
20146,Marcus Smart,2018-10-22,ORL,19
20145,Marcus Smart,2018-10-25,OKC,21
...,...,...,...,...
19854,Marcus Smart,2022-05-25,MIA,24
19853,Marcus Smart,2022-05-27,MIA,40
19852,Marcus Smart,2022-05-29,MIA,41
19851,Marcus Smart,2022-06-02,GSW,30
