In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("nba_games_2.csv", index_col=0)

In [3]:
df = df.sort_values("date")

In [4]:
df = df.reset_index(drop=True)

In [5]:
del df["mp.1"]
del df["mp_opp.1"]
del df["index_opp"]

In [6]:
def add_target(team):
    team["target"] = team["won"].shift(-1)
    return team

df = df.groupby("team").apply(add_target)

In [7]:
df["target"][df["target"].isnull()] = 2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["target"][df["target"].isnull()] = 2


In [8]:
df["target"] = df["target"].astype(int, errors='ignore')

In [9]:
nulls = df.isnull()

In [10]:
nulls = nulls.sum()

In [11]:
nulls = nulls[nulls > 0]

In [12]:
nulls.index

Index(['+/-', 'mp_max', 'mp_max.1', '+/-_opp', 'mp_max_opp', 'mp_max_opp.1'], dtype='object')

In [13]:
valid_columns = df.columns[~df.columns.isin(nulls.index)]

In [14]:
valid_columns

Index(['mp', 'fg', 'fga', 'fg%', '3p', '3pa', '3p%', 'ft', 'fta', 'ft%',
       ...
       'usg%_max_opp', 'ortg_max_opp', 'drtg_max_opp', 'team_opp', 'total_opp',
       'home_opp', 'season', 'date', 'won', 'target'],
      dtype='object', length=142)

In [15]:
df = df[valid_columns].copy()

In [16]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier

rr = RidgeClassifier(alpha=1)
split = TimeSeriesSplit(n_splits=3)

sfs = SequentialFeatureSelector(rr, n_features_to_select=30, direction="forward", cv=split)

In [17]:
removed_columns = ["season", "date", "won", "target", "team", "team_opp"]

In [18]:
selected_columns = df.columns[~df.columns.isin(removed_columns)]

In [19]:
selected_columns

Index(['mp', 'fg', 'fga', 'fg%', '3p', '3pa', '3p%', 'ft', 'fta', 'ft%',
       ...
       'trb%_max_opp', 'ast%_max_opp', 'stl%_max_opp', 'blk%_max_opp',
       'tov%_max_opp', 'usg%_max_opp', 'ortg_max_opp', 'drtg_max_opp',
       'total_opp', 'home_opp'],
      dtype='object', length=136)

In [20]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[selected_columns] = scaler.fit_transform(df[selected_columns])

In [21]:
sfs.fit(df[selected_columns], df["target"])

In [22]:
predictors = list(selected_columns[sfs.get_support()])

In [23]:
def backtest(data, model, predictors, start=2, step=1):
    all_predictions = []
    
    seasons = sorted(data["season"].unique())
    
    for i in range(start, len(seasons), step):
        season = seasons[i]
        
        train = data[data["season"] < season]
        test = data[data["season"] == season]
        
        model.fit(train[predictors], train["target"])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        
        combined = pd.concat([test["target"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        
        all_predictions.append(combined)
        
    return pd.concat(all_predictions)                    

In [24]:
predictions = backtest(df, rr, predictors)

In [25]:
from sklearn.metrics import accuracy_score

predictions = predictions[predictions["actual"] != 2]
accuracy_score(predictions["actual"], predictions["prediction"])

0.5485110470701249

In [26]:
df.groupby("home").apply(lambda x: x[x["won"] == 1].shape[0] / x.shape[0])

home
0.0    0.428314
1.0    0.571686
dtype: float64

In [27]:
df_rolling = df[list(selected_columns) + ["won", "team", "season"]]

In [28]:
def find_team_averages(team):
    rolling = team.rolling(10).mean()
    return rolling

df_rolling = df_rolling.groupby(["team", "season"], group_keys=False).apply(find_team_averages)

In [29]:
rolling_cols = [f"{col}_10" for col in df_rolling.columns]
df_rolling.columns = rolling_cols

df = pd.concat([df, df_rolling], axis=1)

In [31]:
df = df.dropna()

In [35]:
def shift_col(team, col_name):
    next_col = team[col_name].shift(-1)
    return next_col

def add_col(df, col_name):
    return df.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))

df["home_next"] = add_col(df, "home")
df["team_opp_next"] = add_col(df, "team_opp")
df["date_next"] = add_col(df, "date")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["home_next"] = add_col(df, "home")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["team_opp_next"] = add_col(df, "team_opp")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["date_next"] = add_col(df, "date")


In [37]:
df = df.copy()

In [40]:
full = df.merge(
    df[rolling_cols + ["team_opp_next", "date_next", "team"]],
    left_on=["team", "date_next"],
    right_on=["team_opp_next", "date_next"]
)

In [42]:
full[["team_x", "team_opp_next_x", "team_y", "team_opp_next_y", "date_next"]]

Unnamed: 0,team_x,team_opp_next_x,team_y,team_opp_next_y,date_next
0,SAC,TOR,TOR,SAC,2015-11-15
1,TOR,SAC,SAC,TOR,2015-11-15
2,CLE,DET,DET,CLE,2015-11-17
3,GSW,TOR,TOR,GSW,2015-11-17
4,DEN,NOP,NOP,DEN,2015-11-17
...,...,...,...,...,...
15769,BOS,GSW,GSW,BOS,2022-06-10
15770,GSW,BOS,BOS,GSW,2022-06-13
15771,BOS,GSW,GSW,BOS,2022-06-13
15772,GSW,BOS,BOS,GSW,2022-06-16


In [43]:
removed_columns

['season', 'date', 'won', 'target', 'team', 'team_opp']

In [44]:
removed_columns = list(full.columns[full.dtypes == "object"]) + removed_columns

In [45]:
removed_columns

['team_x',
 'team_opp',
 'date',
 'team_opp_next_x',
 'date_next',
 'team_opp_next_y',
 'team_y',
 'season',
 'date',
 'won',
 'target',
 'team',
 'team_opp']

In [46]:
selected_columns = full.columns[~full.columns.isin(removed_columns)]

In [47]:
sfs.fit(full[selected_columns], full["target"])

In [48]:
predictors = list(selected_columns[sfs.get_support()])

In [49]:
predictors

['mp',
 'orb',
 'ast',
 'tov',
 'usg%',
 'pf_max',
 'trb%_max',
 'stl%_max',
 'mp_opp',
 'usg%_opp',
 'usg%_10_x',
 'ft%_max_10_x',
 '3par_max_10_x',
 'usg%_opp_10_x',
 'stl_max_opp_10_x',
 'won_10_x',
 'home_next',
 'drb_10_y',
 'trb%_10_y',
 'usg%_10_y',
 'ft_max_10_y',
 'efg%_max_10_y',
 'tov%_max_10_y',
 'trb%_opp_10_y',
 'usg%_opp_10_y',
 'fga_max_opp_10_y',
 'fta_max_opp_10_y',
 'ft%_max_opp_10_y',
 'orb%_max_opp_10_y',
 'won_10_y']

In [50]:
predictions = backtest(full, rr, predictors)

In [51]:
accuracy_score(predictions["actual"], predictions["prediction"])

0.6296296296296297