## Predicting NBA Games using Machine Learning and Pythom
### Data Cleaning

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('data/nba_games.csv', index_col=0)
df.head()

Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,240.0,240.0,39.0,81.0,0.481,6.0,20.0,0.3,14.0,18.0,...,22.8,29.0,178.0,111.0,DAL,95,1,2016,2015-12-09,True
1,240.0,240.0,36.0,100.0,0.36,7.0,31.0,0.226,16.0,19.0,...,50.0,32.6,152.0,111.0,ATL,98,0,2016,2015-12-09,False
2,240.0,240.0,37.0,85.0,0.435,8.0,19.0,0.421,17.0,23.0,...,20.0,30.9,148.0,116.0,SAS,107,1,2018,2017-10-18,False
3,240.0,240.0,41.0,89.0,0.461,8.0,21.0,0.381,17.0,19.0,...,28.6,30.9,138.0,118.0,MIN,99,0,2018,2017-10-18,True
4,240.0,240.0,27.0,86.0,0.314,6.0,26.0,0.231,15.0,20.0,...,16.8,30.9,157.0,90.0,MEM,92,1,2021,2021-04-30,False


In [3]:
df.shape

(17772, 150)

In [4]:
df.sort_values('date')
# delete the old indexes and new ones that follow the
# order of the dates
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,240.0,240.0,39.0,81.0,0.481,6.0,20.0,0.3,14.0,18.0,...,22.8,29.0,178.0,111.0,DAL,95,1,2016,2015-12-09,True
1,240.0,240.0,36.0,100.0,0.36,7.0,31.0,0.226,16.0,19.0,...,50.0,32.6,152.0,111.0,ATL,98,0,2016,2015-12-09,False
2,240.0,240.0,37.0,85.0,0.435,8.0,19.0,0.421,17.0,23.0,...,20.0,30.9,148.0,116.0,SAS,107,1,2018,2017-10-18,False
3,240.0,240.0,41.0,89.0,0.461,8.0,21.0,0.381,17.0,19.0,...,28.6,30.9,138.0,118.0,MIN,99,0,2018,2017-10-18,True
4,240.0,240.0,27.0,86.0,0.314,6.0,26.0,0.231,15.0,20.0,...,16.8,30.9,157.0,90.0,MEM,92,1,2021,2021-04-30,False


In [5]:
del df['mp.1']
del df['mp_opp.1']
del df['index_opp']

In [6]:
df.head()

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,240.0,39.0,81.0,0.481,6.0,20.0,0.3,14.0,18.0,0.778,...,22.8,29.0,178.0,111.0,DAL,95,1,2016,2015-12-09,True
1,240.0,36.0,100.0,0.36,7.0,31.0,0.226,16.0,19.0,0.842,...,50.0,32.6,152.0,111.0,ATL,98,0,2016,2015-12-09,False
2,240.0,37.0,85.0,0.435,8.0,19.0,0.421,17.0,23.0,0.739,...,20.0,30.9,148.0,116.0,SAS,107,1,2018,2017-10-18,False
3,240.0,41.0,89.0,0.461,8.0,21.0,0.381,17.0,19.0,0.895,...,28.6,30.9,138.0,118.0,MIN,99,0,2018,2017-10-18,True
4,240.0,27.0,86.0,0.314,6.0,26.0,0.231,15.0,20.0,0.75,...,16.8,30.9,157.0,90.0,MEM,92,1,2021,2021-04-30,False


In [7]:
# The data frame is organized by date.
# This function requires the data frame to be grouped by team
# The idea is to assign to 'target' the value of 'won' from the next row.
# This would allow to know whether the team won the next game
def add_target(team):
  team['target'] = team['won'].shift(-1)
  return team

df = df.groupby('team', group_keys=False).apply(add_target)

In [8]:
df.head()

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,240.0,39.0,81.0,0.481,6.0,20.0,0.3,14.0,18.0,0.778,...,29.0,178.0,111.0,DAL,95,1,2016,2015-12-09,True,True
1,240.0,36.0,100.0,0.36,7.0,31.0,0.226,16.0,19.0,0.842,...,32.6,152.0,111.0,ATL,98,0,2016,2015-12-09,False,False
2,240.0,37.0,85.0,0.435,8.0,19.0,0.421,17.0,23.0,0.739,...,30.9,148.0,116.0,SAS,107,1,2018,2017-10-18,False,True
3,240.0,41.0,89.0,0.461,8.0,21.0,0.381,17.0,19.0,0.895,...,30.9,138.0,118.0,MIN,99,0,2018,2017-10-18,True,False
4,240.0,27.0,86.0,0.314,6.0,26.0,0.231,15.0,20.0,0.75,...,30.9,157.0,90.0,MEM,92,1,2021,2021-04-30,False,True


In [9]:
# The last row has no value for 'target' since there is no next game
df[df['team'] == 'WAS'].tail()

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
17564,240.0,47.0,90.0,0.522,9.0,19.0,0.474,22.0,28.0,0.786,...,44.5,180.0,129.0,UTA,121,1,2021,2021-04-12,True,True
17641,240.0,46.0,101.0,0.455,10.0,26.0,0.385,10.0,12.0,0.833,...,32.8,200.0,119.0,CHI,107,0,2017,2017-03-17,True,False
17678,240.0,40.0,100.0,0.4,9.0,46.0,0.196,18.0,19.0,0.947,...,33.4,182.0,116.0,TOR,125,1,2019,2018-11-23,False,True
17715,240.0,47.0,86.0,0.547,16.0,30.0,0.533,20.0,23.0,0.87,...,30.8,183.0,139.0,ORL,103,0,2018,2017-12-23,True,False
17716,240.0,43.0,89.0,0.483,9.0,26.0,0.346,16.0,20.0,0.8,...,36.0,229.0,116.0,SAC,120,1,2016,2016-03-30,False,


In [10]:
# set the value 2 for all the rows whose current value is NaN
df['target'][pd.isnull(df['target'])] = 2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['target'][pd.isnull(df['target'])] = 2


In [11]:
df['target'] = df['target'].astype(int, errors='ignore')
df.tail()

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
17767,240.0,35.0,81.0,0.432,11.0,26.0,0.423,27.0,36.0,0.75,...,33.7,160.0,118.0,OKC,92,0,2019,2018-10-19,True,1
17768,240.0,37.0,74.0,0.5,13.0,25.0,0.52,26.0,37.0,0.703,...,30.0,139.0,129.0,ORL,108,1,2017,2016-12-14,True,2
17769,240.0,42.0,89.0,0.472,14.0,33.0,0.424,10.0,20.0,0.5,...,29.9,175.0,126.0,LAC,113,0,2017,2016-12-14,False,2
17770,240.0,41.0,85.0,0.482,9.0,26.0,0.346,26.0,30.0,0.867,...,27.1,150.0,126.0,MIA,106,1,2020,2020-09-19,True,2
17771,240.0,33.0,85.0,0.388,12.0,44.0,0.273,28.0,34.0,0.824,...,36.2,141.0,114.0,BOS,117,0,2020,2020-09-19,False,2


In [12]:
df['won'].value_counts()

True     8886
False    8886
Name: won, dtype: int64

In [13]:
df['target'].value_counts()

1    8873
0    8869
2      30
Name: target, dtype: int64

In [14]:
nulls = pd.isnull(df)
nulls = nulls.sum()
nulls = nulls[nulls > 0]
nulls

+/-             17772
mp_max          17772
mp_max.1        17772
+/-_opp         17772
mp_max_opp      17772
mp_max_opp.1    17772
dtype: int64

In [15]:
valid_columns = df.columns[~df.columns.isin(nulls.index)]
valid_columns

Index(['mp', 'fg', 'fga', 'fg%', '3p', '3pa', '3p%', 'ft', 'fta', 'ft%',
       ...
       'usg%_max_opp', 'ortg_max_opp', 'drtg_max_opp', 'team_opp', 'total_opp',
       'home_opp', 'season', 'date', 'won', 'target'],
      dtype='object', length=142)

In [16]:
df = df[valid_columns].copy()

In [17]:
df.head()

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,240.0,39.0,81.0,0.481,6.0,20.0,0.3,14.0,18.0,0.778,...,29.0,178.0,111.0,DAL,95,1,2016,2015-12-09,True,1
1,240.0,36.0,100.0,0.36,7.0,31.0,0.226,16.0,19.0,0.842,...,32.6,152.0,111.0,ATL,98,0,2016,2015-12-09,False,0
2,240.0,37.0,85.0,0.435,8.0,19.0,0.421,17.0,23.0,0.739,...,30.9,148.0,116.0,SAS,107,1,2018,2017-10-18,False,1
3,240.0,41.0,89.0,0.461,8.0,21.0,0.381,17.0,19.0,0.895,...,30.9,138.0,118.0,MIN,99,0,2018,2017-10-18,True,0
4,240.0,27.0,86.0,0.314,6.0,26.0,0.231,15.0,20.0,0.75,...,30.9,157.0,90.0,MEM,92,1,2021,2021-04-30,False,1


In [18]:
df.shape

(17772, 142)

In [19]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier

rr = RidgeClassifier(alpha=1)
split = TimeSeriesSplit(n_splits=3)
sfs = SequentialFeatureSelector(rr, n_features_to_select=30, direction='forward', cv=split)

In [20]:
removed_columns = ['season', 'date', 'won', 'target', 'team', 'team_opp']
# ~ -> negation. in this case, keep columns not in removed_columns
selected_columns = df.columns[~df.columns.isin(removed_columns)]

In [21]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df[selected_columns] = scaler.fit_transform(df[selected_columns])
df.head()

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,0.0,0.454545,0.308824,0.507177,0.206897,0.242424,0.356295,0.302326,0.269841,0.740957,...,0.088575,0.421801,0.4,DAL,0.298077,1.0,2016,2015-12-09,True,1
1,0.0,0.386364,0.588235,0.217703,0.241379,0.409091,0.268409,0.348837,0.285714,0.815636,...,0.134788,0.298578,0.4,ATL,0.326923,0.0,2016,2015-12-09,False,0
2,0.0,0.409091,0.367647,0.397129,0.275862,0.227273,0.5,0.372093,0.349206,0.695449,...,0.112965,0.279621,0.458824,SAS,0.413462,1.0,2018,2017-10-18,False,1
3,0.0,0.5,0.426471,0.45933,0.275862,0.257576,0.452494,0.372093,0.285714,0.87748,...,0.112965,0.232227,0.482353,MIN,0.336538,0.0,2018,2017-10-18,True,0
4,0.0,0.181818,0.382353,0.107656,0.206897,0.333333,0.274347,0.325581,0.301587,0.708285,...,0.112965,0.322275,0.152941,MEM,0.269231,1.0,2021,2021-04-30,False,1


In [22]:
sfs.fit(df[selected_columns], df['target'])

In [23]:
predictors = list(selected_columns[sfs.get_support()])
predictors

['drb',
 'pf',
 'blk%',
 'usg%',
 'fg_max',
 'orb_max',
 'ts%_max',
 'efg%_max',
 '3par_max',
 'trb%_max',
 'ast%_max',
 'blk%_max',
 'drtg_max',
 '3p_opp',
 '3pa_opp',
 '3p%_opp',
 'fta_opp',
 'ftr_opp',
 'ast%_opp',
 'usg%_opp',
 '3pa_max_opp',
 '3p%_max_opp',
 'fta_max_opp',
 'ast_max_opp',
 'tov_max_opp',
 'pts_max_opp',
 'ts%_max_opp',
 'drb%_max_opp',
 'ast%_max_opp',
 'stl%_max_opp']

In [24]:
def backtest(data, model, predictors, start=2, step=1):
  all_predictions = []
  seasons = sorted(data['season'].unique())

  for i in range(start, len(seasons), step):
    season = seasons[i]

    train = data[data['season'] < season]
    test = data[data['season'] == season]

    model.fit(train[predictors], train['target'])
    
    preds = model.predict(test[predictors])
    preds = pd.Series(preds, index=test.index)
    
    combined = pd.concat([test['target'], preds], axis=1)
    combined.columns = ['actual', 'prediction']

    all_predictions.append(combined)
  return pd.concat(all_predictions)


In [25]:
predictions = backtest(df, rr, predictors)
predictions.head()

Unnamed: 0,actual,prediction
2,1,0
3,0,1
24,0,1
25,1,0
38,0,1


In [26]:
predictions

Unnamed: 0,actual,prediction
2,1,0
3,0,1
24,0,1
25,1,0
38,0,1
...,...,...
17721,1,0
17748,0,1
17749,2,1
17758,2,1


In [27]:
predictions[predictions['actual'] == predictions['prediction']]

Unnamed: 0,actual,prediction
39,0,0
69,0,0
96,1,1
114,0,0
115,0,0
...,...,...
17638,1,1
17647,1,1
17656,1,1
17657,1,1


In [28]:
from sklearn.metrics import accuracy_score
preds = predictions[predictions['actual'] != 2]
accuracy_score(preds['actual'], preds['prediction'])

0.5063189889617661

In [29]:
df.groupby(by='home').apply(lambda x: x[x['won'] == 1].shape[0] / x.shape[0])

home
0.0    0.428314
1.0    0.571686
dtype: float64

In [30]:
df_rolling = df[list(selected_columns) + ['won', 'team', 'season']]
df_rolling

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,total_opp,home_opp,won,team,season
0,0.0,0.454545,0.308824,0.507177,0.206897,0.242424,0.356295,0.302326,0.269841,0.740957,...,0.044,0.190776,0.088575,0.421801,0.400000,0.298077,1.0,True,ATL,2016
1,0.0,0.386364,0.588235,0.217703,0.241379,0.409091,0.268409,0.348837,0.285714,0.815636,...,0.062,0.475891,0.134788,0.298578,0.400000,0.326923,0.0,False,DAL,2016
2,0.0,0.409091,0.367647,0.397129,0.275862,0.227273,0.500000,0.372093,0.349206,0.695449,...,0.076,0.161426,0.112965,0.279621,0.458824,0.413462,1.0,False,MIN,2018
3,0.0,0.500000,0.426471,0.459330,0.275862,0.257576,0.452494,0.372093,0.285714,0.877480,...,0.050,0.251572,0.112965,0.232227,0.482353,0.336538,0.0,True,SAS,2018
4,0.0,0.181818,0.382353,0.107656,0.206897,0.333333,0.274347,0.325581,0.301587,0.708285,...,0.113,0.127883,0.112965,0.322275,0.152941,0.269231,1.0,False,ORL,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,0.0,0.363636,0.308824,0.389952,0.379310,0.333333,0.502375,0.604651,0.555556,0.708285,...,0.063,0.310273,0.148909,0.336493,0.482353,0.269231,0.0,True,LAC,2019
17768,0.0,0.409091,0.205882,0.552632,0.448276,0.318182,0.617577,0.581395,0.571429,0.653442,...,0.038,0.213836,0.101412,0.236967,0.611765,0.423077,1.0,True,LAC,2017
17769,0.0,0.522727,0.426471,0.485646,0.482759,0.439394,0.503563,0.209302,0.301587,0.416569,...,0.074,0.220126,0.100128,0.407583,0.576471,0.471154,0.0,False,ORL,2017
17770,0.0,0.500000,0.367647,0.509569,0.310345,0.333333,0.410926,0.581395,0.460317,0.844807,...,0.064,0.242138,0.064185,0.289100,0.576471,0.403846,1.0,True,BOS,2020


In [31]:
def find_team_averages(team):
  rolling = team.rolling(10).mean()
  return rolling

df_rolling = df_rolling.groupby(['team', 'season'], group_keys=False).apply(find_team_averages)
df_rolling

  rolling = team.rolling(10).mean()


Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,stl%_max_opp,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,total_opp,home_opp,won,season
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,0.025,0.434091,0.413235,0.392344,0.389655,0.378788,0.465083,0.553488,0.461905,0.803734,...,0.0502,0.0699,0.282914,0.215019,0.408057,0.480000,0.525000,0.4,0.4,2019.0
17768,0.000,0.465909,0.295588,0.533014,0.382759,0.346970,0.496200,0.460465,0.412698,0.753092,...,0.0510,0.0520,0.334696,0.122593,0.329858,0.544706,0.365385,0.4,0.8,2017.0
17769,0.050,0.495455,0.419118,0.455742,0.351724,0.343939,0.446793,0.316279,0.304762,0.698250,...,0.0500,0.0821,0.307547,0.127599,0.409005,0.465882,0.398077,0.5,0.5,2017.0
17770,0.025,0.502273,0.427941,0.460526,0.427586,0.463636,0.436698,0.393023,0.322222,0.800000,...,0.0559,0.1086,0.322117,0.150578,0.440284,0.528235,0.420192,0.3,0.7,2020.0


In [32]:
rolling_cols = [f'{col}_10' for  col in df_rolling.columns]
df_rolling.columns = rolling_cols
df = pd.concat([df, df_rolling], axis=1)
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,stl%_max_opp_10,blk%_max_opp_10,tov%_max_opp_10,usg%_max_opp_10,ortg_max_opp_10,drtg_max_opp_10,total_opp_10,home_opp_10,won_10,season_10
0,0.0,0.454545,0.308824,0.507177,0.206897,0.242424,0.356295,0.302326,0.269841,0.740957,...,,,,,,,,,,
1,0.0,0.386364,0.588235,0.217703,0.241379,0.409091,0.268409,0.348837,0.285714,0.815636,...,,,,,,,,,,
2,0.0,0.409091,0.367647,0.397129,0.275862,0.227273,0.500000,0.372093,0.349206,0.695449,...,,,,,,,,,,
3,0.0,0.500000,0.426471,0.459330,0.275862,0.257576,0.452494,0.372093,0.285714,0.877480,...,,,,,,,,,,
4,0.0,0.181818,0.382353,0.107656,0.206897,0.333333,0.274347,0.325581,0.301587,0.708285,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,0.0,0.363636,0.308824,0.389952,0.379310,0.333333,0.502375,0.604651,0.555556,0.708285,...,0.0502,0.0699,0.282914,0.215019,0.408057,0.480000,0.525000,0.4,0.4,2019.0
17768,0.0,0.409091,0.205882,0.552632,0.448276,0.318182,0.617577,0.581395,0.571429,0.653442,...,0.0510,0.0520,0.334696,0.122593,0.329858,0.544706,0.365385,0.4,0.8,2017.0
17769,0.0,0.522727,0.426471,0.485646,0.482759,0.439394,0.503563,0.209302,0.301587,0.416569,...,0.0500,0.0821,0.307547,0.127599,0.409005,0.465882,0.398077,0.5,0.5,2017.0
17770,0.0,0.500000,0.367647,0.509569,0.310345,0.333333,0.410926,0.581395,0.460317,0.844807,...,0.0559,0.1086,0.322117,0.150578,0.440284,0.528235,0.420192,0.3,0.7,2020.0


In [33]:
df = df.dropna()
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,stl%_max_opp_10,blk%_max_opp_10,tov%_max_opp_10,usg%_max_opp_10,ortg_max_opp_10,drtg_max_opp_10,total_opp_10,home_opp_10,won_10,season_10
891,0.0,0.431818,0.264706,0.521531,0.275862,0.272727,0.432304,0.325581,0.301587,0.708285,...,0.0409,0.0735,0.389937,0.243261,0.434123,0.485882,0.375962,0.6,0.6,2018.0
918,0.0,0.272727,0.235294,0.332536,0.275862,0.363636,0.339667,0.372093,0.349206,0.695449,...,0.0562,0.0535,0.294444,0.165854,0.337441,0.468235,0.358654,0.5,0.4,2018.0
935,0.0,0.681818,0.529412,0.576555,0.206897,0.333333,0.274347,0.279070,0.206349,0.917153,...,0.0559,0.0527,0.299790,0.160847,0.342180,0.490588,0.360577,0.5,0.4,2018.0
960,0.0,0.568182,0.544118,0.442584,0.724138,0.606061,0.566508,0.209302,0.174603,0.805134,...,0.0561,0.0639,0.400105,0.175225,0.462085,0.560000,0.337500,0.6,0.8,2022.0
972,0.0,0.477273,0.455882,0.409091,0.517241,0.590909,0.414489,0.255814,0.222222,0.766628,...,0.0630,0.1438,0.320335,0.151091,0.432701,0.512941,0.372115,0.7,0.8,2022.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,0.0,0.363636,0.308824,0.389952,0.379310,0.333333,0.502375,0.604651,0.555556,0.708285,...,0.0502,0.0699,0.282914,0.215019,0.408057,0.480000,0.525000,0.4,0.4,2019.0
17768,0.0,0.409091,0.205882,0.552632,0.448276,0.318182,0.617577,0.581395,0.571429,0.653442,...,0.0510,0.0520,0.334696,0.122593,0.329858,0.544706,0.365385,0.4,0.8,2017.0
17769,0.0,0.522727,0.426471,0.485646,0.482759,0.439394,0.503563,0.209302,0.301587,0.416569,...,0.0500,0.0821,0.307547,0.127599,0.409005,0.465882,0.398077,0.5,0.5,2017.0
17770,0.0,0.500000,0.367647,0.509569,0.310345,0.333333,0.410926,0.581395,0.460317,0.844807,...,0.0559,0.1086,0.322117,0.150578,0.440284,0.528235,0.420192,0.3,0.7,2020.0


In [34]:
def shift_col(team, col_name):
    next_col = team[col_name].shift(-1)
    return next_col

def add_col(df, col_name):
    return df.groupby('team', group_keys=False).apply(lambda x:  shift_col(x, col_name))

df['home_next'] = add_col(df, 'home')
df['team_opp_next'] = add_col(df, 'team_opp')
df['date_next'] = add_col(df, 'date')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['home_next'] = add_col(df, 'home')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['team_opp_next'] = add_col(df, 'team_opp')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_next'] = add_col(df, 'date')


In [35]:
df.head()

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp_10,ortg_max_opp_10,drtg_max_opp_10,total_opp_10,home_opp_10,won_10,season_10,home_next,team_opp_next,date_next
891,0.0,0.431818,0.264706,0.521531,0.275862,0.272727,0.432304,0.325581,0.301587,0.708285,...,0.243261,0.434123,0.485882,0.375962,0.6,0.6,2018.0,1.0,IND,2018-01-03
918,0.0,0.272727,0.235294,0.332536,0.275862,0.363636,0.339667,0.372093,0.349206,0.695449,...,0.165854,0.337441,0.468235,0.358654,0.5,0.4,2018.0,1.0,MIN,2018-03-17
935,0.0,0.681818,0.529412,0.576555,0.206897,0.333333,0.274347,0.27907,0.206349,0.917153,...,0.160847,0.34218,0.490588,0.360577,0.5,0.4,2018.0,0.0,NYK,2021-05-13
960,0.0,0.568182,0.544118,0.442584,0.724138,0.606061,0.566508,0.209302,0.174603,0.805134,...,0.175225,0.462085,0.56,0.3375,0.6,0.8,2022.0,1.0,ATL,2022-04-19
972,0.0,0.477273,0.455882,0.409091,0.517241,0.590909,0.414489,0.255814,0.222222,0.766628,...,0.151091,0.432701,0.512941,0.372115,0.7,0.8,2022.0,1.0,HOU,2016-04-27


In [36]:
df = df.copy()

In [37]:
df.head()

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp_10,ortg_max_opp_10,drtg_max_opp_10,total_opp_10,home_opp_10,won_10,season_10,home_next,team_opp_next,date_next
891,0.0,0.431818,0.264706,0.521531,0.275862,0.272727,0.432304,0.325581,0.301587,0.708285,...,0.243261,0.434123,0.485882,0.375962,0.6,0.6,2018.0,1.0,IND,2018-01-03
918,0.0,0.272727,0.235294,0.332536,0.275862,0.363636,0.339667,0.372093,0.349206,0.695449,...,0.165854,0.337441,0.468235,0.358654,0.5,0.4,2018.0,1.0,MIN,2018-03-17
935,0.0,0.681818,0.529412,0.576555,0.206897,0.333333,0.274347,0.27907,0.206349,0.917153,...,0.160847,0.34218,0.490588,0.360577,0.5,0.4,2018.0,0.0,NYK,2021-05-13
960,0.0,0.568182,0.544118,0.442584,0.724138,0.606061,0.566508,0.209302,0.174603,0.805134,...,0.175225,0.462085,0.56,0.3375,0.6,0.8,2022.0,1.0,ATL,2022-04-19
972,0.0,0.477273,0.455882,0.409091,0.517241,0.590909,0.414489,0.255814,0.222222,0.766628,...,0.151091,0.432701,0.512941,0.372115,0.7,0.8,2022.0,1.0,HOU,2016-04-27


In [38]:
full = df.merge(df[rolling_cols + ['team_opp_next', 'date_next', 'team']],
                left_on=['team', 'date_next'],
                right_on=['team_opp_next', 'date_next'])

In [39]:
full[['team_x', 'team_opp_next_x', 'team_y', 'team_opp_next_y', 'date_next']]

Unnamed: 0,team_x,team_opp_next_x,team_y,team_opp_next_y,date_next
0,IND,DET,DET,IND,2017-11-08
1,DET,IND,IND,DET,2017-11-08
2,DET,MIL,MIL,DET,2017-12-06
3,DAL,GSW,GSW,DAL,2022-01-05
4,PHI,BRK,BRK,PHI,2016-03-15
...,...,...,...,...,...
15533,MIL,NYK,NYK,MIL,2018-12-25
15534,PHO,DET,DET,PHO,2022-01-16
15535,MEM,LAC,LAC,MEM,2021-02-26
15536,LAC,OKC,OKC,LAC,2018-10-19


In [40]:
removed_columns = list(full.columns[full.dtypes == 'object']) + removed_columns

In [41]:
removed_columns

['team_x',
 'team_opp',
 'date',
 'team_opp_next_x',
 'date_next',
 'team_opp_next_y',
 'team_y',
 'season',
 'date',
 'won',
 'target',
 'team',
 'team_opp']

In [42]:
selected_columns = full.columns[~full.columns.isin(removed_columns)]
sfs.fit(full[selected_columns], full['target'])

In [None]:
predictors = list(selected_columns[sfs.get_support()])

In [None]:
predictions = backtest(full, rr, predictors)
accuracy_score(predictions['actual'], predictions['prediction'])