In [1]:
# title: predict.ipynb
# author: @ericramsey
# description: file used to impliment machine learning and predictive analysis from DataFrames obtained from web scrapping process (NBA, MLB, [other sports to impliment])

import pandas as pd

In [2]:
# NBA Data - Import Process
nba_df = pd.read_csv("nba_games.csv", index_col=0) # read file to import NBA statistical data from web scrapping process

In [3]:
nba_df = nba_df.sort_values("date") # sort obtained data by date of the NBA season

In [4]:
nba_df = nba_df.reset_index(drop=True) # ensure the index values match the nba_df sorted values by date

In [5]:
# remove any duplicate or uneccesary columns for goal of project for each DataFrame
del nba_df["mp.1"]
del nba_df["mp_opp.1"]
del nba_df["index_opp"]

In [6]:
def add_target(group):
    
    group["target"] = group["won"].shift(-1)
    
    return group

# Initialize Data "Cleaning" Process
nba_df = nba_df.groupby("team", group_keys=False).apply(add_target) # seperate the data in nba_df by team to initiate process of predictive analysis by team

In [7]:
nba_df["target"][pd.isnull(nba_df["target"])] = 2 # return null elements for identifying data to remove from set (future games not yet played)
nba_df["target"] = nba_df["target"].astype(int, errors="ignore") # bypass SettingWithCopyWarning/error - convert from boolean to integer data type

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nba_df["target"][pd.isnull(nba_df["target"])] = 2 # return null elements for identifying data to remove from set (future games not yet played)


In [8]:
nba_df["won"].value_counts() # used to determine if the values are balanced (present_values: 8867 == 8867 [balenced])

True     8867
False    8867
Name: won, dtype: int64

In [9]:
nba_df["target"].value_counts() # due to NBA schedule - teams may not have a "next game" (value 2 identifies Null)

1    8853
0    8851
2      30
Name: target, dtype: int64

In [10]:
null_values = pd.isnull(nba_df) # identify null values in the DataFrame that need to be altered
null_values = null_values.sum()

In [11]:
null_values = null_values[null_values > 0] # assign null_values any data element > 0

In [12]:
null_values

+/-             17734
mp_max          17734
mp_max.1        17734
+/-_opp         17734
mp_max_opp      17734
mp_max_opp.1    17734
dtype: int64

In [13]:
valid_columns = nba_df.columns[~nba_df.columns.isin(null_values.index)] # remove any DataFrame columns with value == Null with negation operator ("~")

In [14]:
valid_columns # output valid_columns to verify columns with null values have been removed

Index(['mp', 'fg', 'fga', 'fg%', '3p', '3pa', '3p%', 'ft', 'fta', 'ft%',
       ...
       'usg%_max_opp', 'ortg_max_opp', 'drtg_max_opp', 'team_opp', 'total_opp',
       'home_opp', 'season', 'date', 'won', 'target'],
      dtype='object', length=142)

In [15]:
nba_df = nba_df[valid_columns].copy() # add the established valid_columns to a copy of nba_df (done to avoid using the original retreieved data - slice/copy warning)

In [16]:
nba_df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,240.0,37.0,96.0,0.385,12.0,29.0,0.414,20.0,26.0,0.769,...,33.8,258.0,121.0,ATL,94,1,2016,2015-10-27,True,1
1,240.0,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,0.696,...,29.0,138.0,105.0,CLE,95,0,2016,2015-10-27,True,1
2,240.0,41.0,96.0,0.427,9.0,30.0,0.300,20.0,22.0,0.909,...,38.9,201.0,120.0,NOP,95,0,2016,2015-10-27,True,1
3,240.0,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,0.704,...,43.7,206.0,104.0,GSW,111,1,2016,2015-10-27,False,0
4,240.0,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,0.588,...,34.6,162.0,104.0,CHI,97,1,2016,2015-10-27,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17729,240.0,40.0,91.0,0.440,15.0,43.0,0.349,12.0,15.0,0.800,...,32.4,205.0,120.0,BOS,97,1,2022,2022-06-10,True,1
17730,240.0,41.0,88.0,0.466,9.0,40.0,0.225,13.0,15.0,0.867,...,94.4,300.0,112.0,BOS,94,0,2022,2022-06-13,True,1
17731,240.0,31.0,75.0,0.413,11.0,32.0,0.344,21.0,31.0,0.677,...,36.2,222.0,107.0,GSW,104,1,2022,2022-06-13,False,0
17732,240.0,38.0,92.0,0.413,19.0,46.0,0.413,8.0,8.0,1.000,...,42.6,141.0,126.0,BOS,90,1,2022,2022-06-16,True,2


In [17]:
from sklearn.feature_selection import SequentialFeatureSelector # used to train ML model with a smaller set of columns
from sklearn.model_selection import TimeSeriesSplit # used to ensure data is split properly when using SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier 

#Initialize Machine Learning Model
ridge_regressor = RidgeClassifier(alpha=1)
time_split = TimeSeriesSplit(n_splits=3)
sf_selector = SequentialFeatureSelector(ridge_regressor, n_features_to_select=30, direction="forward", cv=time_split)

In [18]:
deselected_columns = ["season", "date", "won", "target", "team", "team_opp"] # columns removed to not scale of the ML model
selected_columns = nba_df.columns[~nba_df.columns.isin(deselected_columns)] # list of the columns selected to be scaled - except deselected_columns

In [19]:
from sklearn.preprocessing import MinMaxScaler 

scaler = MinMaxScaler() # initialize scaler for re-formatting data values in DataFrame
nba_df[selected_columns] = scaler.fit_transform(nba_df[selected_columns]) # scale the selected_columns in nba_df to range between 0 and 1 for efficent ridge-regressor performance

In [20]:
nba_df # DataFrame values: (0 = loss, 1 = win, 2 = Null)

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,0.0,0.409091,0.529412,0.277512,0.413793,0.378788,0.491686,0.441860,0.396825,0.730455,...,0.150193,0.800948,0.517647,ATL,0.288462,1.0,2016,2015-10-27,True,1
1,0.0,0.409091,0.397059,0.373206,0.241379,0.227273,0.437055,0.348837,0.349206,0.645274,...,0.088575,0.232227,0.329412,CLE,0.298077,0.0,2016,2015-10-27,True,1
2,0.0,0.500000,0.529412,0.377990,0.310345,0.393939,0.356295,0.441860,0.333333,0.893816,...,0.215661,0.530806,0.505882,NOP,0.298077,0.0,2016,2015-10-27,True,1
3,0.0,0.363636,0.338235,0.366029,0.206897,0.212121,0.395487,0.418605,0.412698,0.654609,...,0.277279,0.554502,0.317647,GSW,0.451923,1.0,2016,2015-10-27,False,0
4,0.0,0.431818,0.500000,0.322967,0.310345,0.378788,0.368171,0.209302,0.253968,0.519253,...,0.160462,0.345972,0.317647,CHI,0.317308,1.0,2016,2015-10-27,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17729,0.0,0.477273,0.455882,0.409091,0.517241,0.590909,0.414489,0.255814,0.222222,0.766628,...,0.132221,0.549763,0.505882,BOS,0.317308,1.0,2022,2022-06-10,True,1
17730,0.0,0.500000,0.411765,0.471292,0.310345,0.545455,0.267221,0.279070,0.222222,0.844807,...,0.928113,1.000000,0.411765,BOS,0.288462,0.0,2022,2022-06-13,True,1
17731,0.0,0.272727,0.220588,0.344498,0.379310,0.424242,0.408551,0.465116,0.476190,0.623104,...,0.181001,0.630332,0.352941,GSW,0.384615,1.0,2022,2022-06-13,False,0
17732,0.0,0.431818,0.470588,0.344498,0.655172,0.636364,0.490499,0.162791,0.111111,1.000000,...,0.263158,0.246445,0.576471,BOS,0.250000,1.0,2022,2022-06-16,True,2


In [21]:
sf_selector.fit(nba_df[selected_columns], nba_df["target"]) # call sf_selector to pass selected_columns to analyze required data features for prediction of the target

In [22]:
predictors = list(selected_columns[sf_selector.get_support()]) # index selected_columns to verify which data elements the ML model should be using for predictions
predictors

['mp',
 'trb%',
 'usg%',
 'drtg',
 '3p%_max',
 'ft_max',
 'fta_max',
 '+/-_max',
 'drb%_max',
 'trb%_max',
 'blk%_max',
 'tov%_max',
 'usg%_max',
 'drtg_max',
 'mp_opp',
 '3p_opp',
 'blk_opp',
 'trb%_opp',
 'usg%_opp',
 'fg_max_opp',
 'fga_max_opp',
 '3p_max_opp',
 'ft%_max_opp',
 'drb_max_opp',
 'blk_max_opp',
 'ftr_max_opp',
 'drb%_max_opp',
 'stl%_max_opp',
 'blk%_max_opp',
 'usg%_max_opp']

In [23]:
# function used for making predictions using historical data to predict future/potential outcomes starting with season index=2 (2018) then move forward
def backtest(data, model, predictors, start=2, step=1):
    
    season_predictions = [] # list usd to store predictions for a given season (ex: use data from 2021-2022 to predict outcomes for 2023)
    
    seasons = sorted(data["season"].unique()) # list of all seasons that exist in scrapped data
    
    # iterate through retrieved data (all collected NBA seasons data from 2016-2022)
    for i in range(start, len(seasons), step):
        
        season = seasons[i]
        # split the traing data and the testing data for ML
        training_data = data[data["season"] < season] # training_data will use statistics for 2016-2017
        test = data[data["season"] == season] # test will determine accuaracy for 2018 seasons based on the training_data
        
        model.fit(training_data[predictors], training_data["target"]) # take training_data and use the predictors to train ML model for target
        
        predictions = model.predict(test[predictors]) # ML model will use the test data to make predictions based on retrieved historical data
        predictions = pd.Series(predictions, index=test.index) # convert numpy to panda.series (pd.series)
        
        combined = pd.concat([test["target"], predictions], axis=1) # combine the actual values and predictions
        combined.columns = ["actual_outcome", "prediction"] # initialize columns to display the actual outcome and the ML model prection
        
        season_predictions.append(combined)
        
    return pd.concat(season_predictions) # combine the data columns as seperate rows for review

sorted(nba_df["season"].unique()) # verify required season datasets for backtest() have been populated

[2016, 2017, 2018, 2019, 2020, 2021, 2022]

In [24]:
predictions = backtest(nba_df, ridge_regressor, predictors) # call backtest() function to determine efficency of ML model and initialize training
predictions

Unnamed: 0,actual_outcome,prediction
5238,1,1
5239,1,0
5240,1,0
5241,0,0
5242,0,1
...,...,...
17729,1,1
17730,1,1
17731,0,1
17732,2,1


In [25]:
from sklearn.metrics import accuracy_score

predictions = predictions[predictions["actual_outcome"] != 2] # remove the null values (2) to improve accuracy - recall null values identify if a team did not have a scheduled game
accuracy_score(predictions["actual_outcome"], predictions["prediction"]) # call accuracy_score to compare the actual outcome with the ML model predictions

0.5494946253810364

In [26]:
nba_df.groupby("home").apply(lambda x: x[x["won"] == 1].shape[0] / x.shape[0]) # determine what percentage a team wins home vs away games (this is an important factor regarding outcome)

home
0.0    0.427879
1.0    0.572121
dtype: float64

In [27]:
nba_df_rolling_avgs = nba_df[list(selected_columns) + ["won", "team", "season"]] # intialize new dataframe to consider outcomes where a team performs better then expected
nba_df_rolling_avgs

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,total_opp,home_opp,won,team,season
0,0.0,0.409091,0.529412,0.277512,0.413793,0.378788,0.491686,0.441860,0.396825,0.730455,...,0.071,0.550314,0.150193,0.800948,0.517647,0.288462,1.0,True,DET,2016
1,0.0,0.409091,0.397059,0.373206,0.241379,0.227273,0.437055,0.348837,0.349206,0.645274,...,0.185,0.270440,0.088575,0.232227,0.329412,0.298077,0.0,True,CHI,2016
2,0.0,0.500000,0.529412,0.377990,0.310345,0.393939,0.356295,0.441860,0.333333,0.893816,...,0.063,0.344864,0.215661,0.530806,0.505882,0.298077,0.0,True,GSW,2016
3,0.0,0.363636,0.338235,0.366029,0.206897,0.212121,0.395487,0.418605,0.412698,0.654609,...,0.079,0.679245,0.277279,0.554502,0.317647,0.451923,1.0,False,NOP,2016
4,0.0,0.431818,0.500000,0.322967,0.310345,0.378788,0.368171,0.209302,0.253968,0.519253,...,0.140,0.509434,0.160462,0.345972,0.317647,0.317308,1.0,False,CLE,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17729,0.0,0.477273,0.455882,0.409091,0.517241,0.590909,0.414489,0.255814,0.222222,0.766628,...,0.070,0.222222,0.132221,0.549763,0.505882,0.317308,1.0,True,GSW,2022
17730,0.0,0.500000,0.411765,0.471292,0.310345,0.545455,0.267221,0.279070,0.222222,0.844807,...,0.124,0.423480,0.928113,1.000000,0.411765,0.288462,0.0,True,GSW,2022
17731,0.0,0.272727,0.220588,0.344498,0.379310,0.424242,0.408551,0.465116,0.476190,0.623104,...,0.076,0.300839,0.181001,0.630332,0.352941,0.384615,1.0,False,BOS,2022
17732,0.0,0.431818,0.470588,0.344498,0.655172,0.636364,0.490499,0.162791,0.111111,1.000000,...,0.160,1.000000,0.263158,0.246445,0.576471,0.250000,1.0,True,GSW,2022


In [28]:
# function used to determine team averages for potential outcomes that were unexpected (such as upsets or teams winning despit having lower odds)
def get_team_averages(team):
    # *** can change to # < 10 for early season games ***
    rolling_average = team.rolling(10).mean() # split the data by teams and find the teams average for last 10 games
    
    return rolling_average

# group by team and season for algo to effectivly predict for each team seperatly - group_keys=false to avoid pandas adding another level to index (avg of team, only use from current season)
nba_df_rolling_avgs = nba_df_rolling_avgs.groupby(["team", "season"], group_keys=False).apply(get_team_averages)
nba_df_rolling_avgs # review the newly established dataframe containing team averages for unexpected outcomes

  rolling_average = team.rolling(10).mean() # split the data by teams and find the teams average for last 10 games


Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,stl%_max_opp,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,total_opp,home_opp,won,season
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17729,0.0,0.502273,0.388235,0.500478,0.493103,0.501515,0.458789,0.309302,0.276190,0.741657,...,0.0737,0.1147,0.431761,0.242875,0.567773,0.575294,0.394231,0.4,0.7,2022.0
17730,0.0,0.502273,0.364706,0.517703,0.455172,0.481818,0.440736,0.320930,0.282540,0.757993,...,0.0716,0.1171,0.374109,0.321566,0.642654,0.564706,0.392308,0.4,0.7,2022.0
17731,0.0,0.354545,0.279412,0.404545,0.437931,0.465152,0.429572,0.434884,0.385714,0.736639,...,0.0591,0.1113,0.483229,0.174711,0.438863,0.483529,0.350000,0.5,0.5,2022.0
17732,0.0,0.484091,0.379412,0.482297,0.486207,0.507576,0.448812,0.316279,0.269841,0.801750,...,0.0656,0.1152,0.444025,0.308601,0.628910,0.568235,0.395192,0.5,0.7,2022.0


In [29]:
rolling_avgs_columns = [f"{column}_10" for column in nba_df_rolling_avgs.columns] # rename col names for rollingaverages to avoid overlapping data in DataFrame
nba_df_rolling_avgs.columns = rolling_avgs_columns # assign rollingaverages_columns to DataFrame

nba_df = pd.concat([nba_df, nba_df_rolling_avgs], axis=1) # concat new data with orignial DataFrame (axis=1 - combines DataFrames using columns)
nba_df = nba_df.dropna() # drop rows with missing values
nba_df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,stl%_max_opp_10,blk%_max_opp_10,tov%_max_opp_10,usg%_max_opp_10,ortg_max_opp_10,drtg_max_opp_10,total_opp_10,home_opp_10,won_10,season_10
230,0.0,0.522727,0.382353,0.523923,0.344828,0.333333,0.457245,0.255814,0.238095,0.708285,...,0.0628,0.0679,0.413522,0.124134,0.361611,0.449412,0.347115,0.4,0.8,2016.0
246,0.0,0.659091,0.426471,0.645933,0.620690,0.515152,0.562945,0.325581,0.238095,0.927655,...,0.0613,0.0772,0.469497,0.219641,0.394787,0.531765,0.324038,0.5,1.0,2016.0
252,0.0,0.386364,0.382353,0.358852,0.206897,0.181818,0.445368,0.511628,0.412698,0.827305,...,0.0625,0.1145,0.437841,0.138126,0.507109,0.360000,0.351923,0.6,0.4,2016.0
253,0.0,0.340909,0.250000,0.413876,0.310345,0.257576,0.509501,0.511628,0.412698,0.827305,...,0.0699,0.1072,0.380294,0.273427,0.270616,0.478824,0.308654,0.6,0.7,2016.0
256,0.0,0.500000,0.382353,0.497608,0.344828,0.318182,0.475059,0.325581,0.349206,0.593932,...,0.0646,0.0759,0.512159,0.133633,0.277251,0.388235,0.308654,0.4,0.6,2016.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17729,0.0,0.477273,0.455882,0.409091,0.517241,0.590909,0.414489,0.255814,0.222222,0.766628,...,0.0737,0.1147,0.431761,0.242875,0.567773,0.575294,0.394231,0.4,0.7,2022.0
17730,0.0,0.500000,0.411765,0.471292,0.310345,0.545455,0.267221,0.279070,0.222222,0.844807,...,0.0716,0.1171,0.374109,0.321566,0.642654,0.564706,0.392308,0.4,0.7,2022.0
17731,0.0,0.272727,0.220588,0.344498,0.379310,0.424242,0.408551,0.465116,0.476190,0.623104,...,0.0591,0.1113,0.483229,0.174711,0.438863,0.483529,0.350000,0.5,0.5,2022.0
17732,0.0,0.431818,0.470588,0.344498,0.655172,0.636364,0.490499,0.162791,0.111111,1.000000,...,0.0656,0.1152,0.444025,0.308601,0.628910,0.568235,0.395192,0.5,0.7,2022.0


In [30]:
# function used to move columns back a row
def shift_col(team, col_name):
    
    next_col = team[col_name].shift(-1)
    
    return next_col

# function used to add columns to DataFrame
def add_col(nba_df, col_name):
    
    return nba_df.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))

# create required columns and add them to DataFrame for initiated data regarding previous 10 games
nba_df["home_next"] = add_col(nba_df, "home")
nba_df["team_opp_next"] = add_col(nba_df, "team_opp")
nba_df["date_next"] = add_col(nba_df, "date")

In [31]:
nba_df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp_10,ortg_max_opp_10,drtg_max_opp_10,total_opp_10,home_opp_10,won_10,season_10,home_next,team_opp_next,date_next
230,0.0,0.522727,0.382353,0.523923,0.344828,0.333333,0.457245,0.255814,0.238095,0.708285,...,0.124134,0.361611,0.449412,0.347115,0.4,0.8,2016.0,0.0,BOS,2015-11-13
246,0.0,0.659091,0.426471,0.645933,0.620690,0.515152,0.562945,0.325581,0.238095,0.927655,...,0.219641,0.394787,0.531765,0.324038,0.5,1.0,2016.0,1.0,BRK,2015-11-14
252,0.0,0.386364,0.382353,0.358852,0.206897,0.181818,0.445368,0.511628,0.412698,0.827305,...,0.138126,0.507109,0.360000,0.351923,0.6,0.4,2016.0,0.0,MIN,2015-11-15
253,0.0,0.340909,0.250000,0.413876,0.310345,0.257576,0.509501,0.511628,0.412698,0.827305,...,0.273427,0.270616,0.478824,0.308654,0.6,0.7,2016.0,0.0,SAC,2015-11-15
256,0.0,0.500000,0.382353,0.497608,0.344828,0.318182,0.475059,0.325581,0.349206,0.593932,...,0.133633,0.277251,0.388235,0.308654,0.4,0.6,2016.0,0.0,CHI,2015-11-16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17729,0.0,0.477273,0.455882,0.409091,0.517241,0.590909,0.414489,0.255814,0.222222,0.766628,...,0.242875,0.567773,0.575294,0.394231,0.4,0.7,2022.0,1.0,BOS,2022-06-13
17730,0.0,0.500000,0.411765,0.471292,0.310345,0.545455,0.267221,0.279070,0.222222,0.844807,...,0.321566,0.642654,0.564706,0.392308,0.4,0.7,2022.0,0.0,BOS,2022-06-16
17731,0.0,0.272727,0.220588,0.344498,0.379310,0.424242,0.408551,0.465116,0.476190,0.623104,...,0.174711,0.438863,0.483529,0.350000,0.5,0.5,2022.0,1.0,GSW,2022-06-16
17732,0.0,0.431818,0.470588,0.344498,0.655172,0.636364,0.490499,0.162791,0.111111,1.000000,...,0.308601,0.628910,0.568235,0.395192,0.5,0.7,2022.0,,,


In [32]:
nba_df = nba_df.copy() # Due to sliced data error above - ensure that the DataFrame is no longer pointing at a sliced copy of itself

In [33]:
# merge DataFrame with established rolling_cols containing team stats with opp stats of previous 10 games - improve ML model accuracy (innerjoin)
nba_df_merged = nba_df.merge(nba_df[rolling_avgs_columns + ["team_opp_next", "date_next", "team"]],
                             left_on=["team", "date_next"], 
                             right_on=["team_opp_next", "date_next"]
                            )

In [34]:
nba_df_merged # _y col name is data from the team_opp DataFrame

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,tov%_max_opp_10_y,usg%_max_opp_10_y,ortg_max_opp_10_y,drtg_max_opp_10_y,total_opp_10_y,home_opp_10_y,won_10_y,season_10_y,team_opp_next_y,team_y
0,0.0,0.340909,0.250000,0.413876,0.310345,0.257576,0.509501,0.511628,0.412698,0.827305,...,0.437212,0.124904,0.404739,0.408235,0.428846,0.2,0.3,2016.0,TOR,SAC
1,0.0,0.477273,0.500000,0.375598,0.379310,0.348485,0.483373,0.441860,0.396825,0.730455,...,0.380294,0.273427,0.270616,0.478824,0.308654,0.6,0.7,2016.0,SAC,TOR
2,0.0,0.545455,0.426471,0.511962,0.275862,0.363636,0.339667,0.348837,0.317460,0.722287,...,0.427568,0.258280,0.514218,0.335294,0.390385,0.5,0.0,2016.0,DAL,PHI
3,0.0,0.318182,0.323529,0.318182,0.275862,0.272727,0.432304,0.186047,0.158730,0.787631,...,0.310377,0.127086,0.312796,0.410588,0.353846,0.5,0.6,2016.0,PHI,DAL
4,0.0,0.409091,0.279412,0.476077,0.241379,0.227273,0.437055,0.441860,0.396825,0.730455,...,0.484906,0.201284,0.375355,0.530588,0.345192,0.4,1.0,2016.0,LAC,GSW
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15731,0.0,0.386364,0.264706,0.461722,0.517241,0.545455,0.445368,0.279070,0.222222,0.844807,...,0.452516,0.166624,0.444076,0.531765,0.358654,0.6,0.6,2022.0,GSW,BOS
15732,0.0,0.340909,0.367647,0.313397,0.517241,0.515152,0.469121,0.302326,0.285714,0.693116,...,0.431761,0.242875,0.567773,0.575294,0.394231,0.4,0.7,2022.0,BOS,GSW
15733,0.0,0.477273,0.455882,0.409091,0.517241,0.590909,0.414489,0.255814,0.222222,0.766628,...,0.471908,0.170603,0.431754,0.522353,0.348077,0.5,0.6,2022.0,GSW,BOS
15734,0.0,0.500000,0.411765,0.471292,0.310345,0.545455,0.267221,0.279070,0.222222,0.844807,...,0.483229,0.174711,0.438863,0.483529,0.350000,0.5,0.5,2022.0,GSW,BOS


In [35]:
nba_df_merged[["team_x", "team_opp_next_x", "team_y", "team_opp_next_y", "date_next"]] # display to verify merge was successful

Unnamed: 0,team_x,team_opp_next_x,team_y,team_opp_next_y,date_next
0,TOR,SAC,SAC,TOR,2015-11-15
1,SAC,TOR,TOR,SAC,2015-11-15
2,DAL,PHI,PHI,DAL,2015-11-16
3,PHI,DAL,DAL,PHI,2015-11-16
4,LAC,GSW,GSW,LAC,2015-11-19
...,...,...,...,...,...
15731,GSW,BOS,BOS,GSW,2022-06-10
15732,BOS,GSW,GSW,BOS,2022-06-13
15733,GSW,BOS,BOS,GSW,2022-06-13
15734,GSW,BOS,BOS,GSW,2022-06-16


In [36]:
deselected_columns = list(nba_df_merged.columns[nba_df_merged.dtypes == "object"]) + deselected_columns # list of columns to remove from the merged DataFrame
deselected_columns # columns deselected to avoid passing them through ML model - done to make model more accurate on real world decisions

['team_x',
 'team_opp',
 'date',
 'team_opp_next_x',
 'date_next',
 'team_opp_next_y',
 'team_y',
 'season',
 'date',
 'won',
 'target',
 'team',
 'team_opp']

In [37]:
selected_columns = nba_df_merged.columns[~nba_df_merged.columns.isin(deselected_columns)] # columns except deselected_columns

In [38]:
sf_selector.fit(nba_df_merged[selected_columns], nba_df_merged["target"]) # initalize sf_selector with combined data and the target for predictions

In [39]:
predictors = list(selected_columns[sf_selector.get_support()]) #indicate if the selected features were initiated or selected - returns boolean - index cols
predictors

['pts',
 'ftr',
 'usg%',
 'ortg',
 'trb%_max',
 'total',
 'fta_opp',
 'usg%_opp',
 'drtg_opp',
 '3par_max_opp',
 'pf_10_x',
 'trb%_10_x',
 'usg%_10_x',
 'ortg_10_x',
 'drtg_10_x',
 'fg_opp_10_x',
 '3p%_opp_10_x',
 'trb%_opp_10_x',
 'usg%_opp_10_x',
 'fta_max_opp_10_x',
 'ft%_max_opp_10_x',
 'won_10_x',
 'home_next',
 'ts%_10_y',
 'usg%_10_y',
 'ast%_max_10_y',
 'usg%_opp_10_y',
 'tov_max_opp_10_y',
 '3par_max_opp_10_y',
 'won_10_y']

In [40]:
predictions = backtest(nba_df_merged, ridge_regressor, predictors) # run backtest with added data
predictions

Unnamed: 0,actual_outcome,prediction
4665,1,1
4666,1,0
4667,1,1
4668,1,0
4669,1,1
...,...,...
15731,1,0
15732,0,0
15733,1,1
15734,1,1


In [41]:
accuracy_score(predictions["actual_outcome"], predictions["prediction"]) # call accuracy_score to compare the actual outcome with the ML model predictions

0.6344503658206124