In [1]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
df = pd.read_csv("data/nba_games.csv", index_col=0)

In [3]:
df = df.sort_values("date")

In [4]:
df = df.reset_index(drop=True)

In [5]:
del df["mp.1"]
del df["mp_opp.1"]
del df["index_opp"]

In [6]:
def add_target(team):
    team["target"] = team["won"].shift(-1)
    return team

In [7]:
df = df.groupby("team", group_keys=False).apply(add_target)

  team["target"] = team["won"].shift(-1)
  team["target"] = team["won"].shift(-1)
  team["target"] = team["won"].shift(-1)
  team["target"] = team["won"].shift(-1)
  team["target"] = team["won"].shift(-1)
  team["target"] = team["won"].shift(-1)
  team["target"] = team["won"].shift(-1)
  team["target"] = team["won"].shift(-1)
  team["target"] = team["won"].shift(-1)
  team["target"] = team["won"].shift(-1)
  team["target"] = team["won"].shift(-1)
  team["target"] = team["won"].shift(-1)
  team["target"] = team["won"].shift(-1)
  team["target"] = team["won"].shift(-1)
  team["target"] = team["won"].shift(-1)
  team["target"] = team["won"].shift(-1)
  team["target"] = team["won"].shift(-1)
  team["target"] = team["won"].shift(-1)
  team["target"] = team["won"].shift(-1)
  team["target"] = team["won"].shift(-1)
  team["target"] = team["won"].shift(-1)
  team["target"] = team["won"].shift(-1)
  team["target"] = team["won"].shift(-1)
  team["target"] = team["won"].shift(-1)
  team["target"]

In [8]:
# df[df["team"]=="WAS"]

In [9]:
df["target"][pd.isnull(df["target"])] = 2

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df["target"][pd.isnull(df["target"])] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["target"][pd.isnul

In [10]:
df["target"] = df["target"].astype(int, errors="ignore")

In [11]:
df["won"].value_counts()

won
False    4223
True     4223
Name: count, dtype: int64

In [12]:
df["target"].value_counts()

target
1    4208
0    4208
2      30
Name: count, dtype: int64

In [13]:
nulls = pd.isnull(df)

In [14]:
nulls = nulls.sum()

In [15]:
nulls = nulls[nulls>0]

In [16]:
valid_columns = df.columns[~df.columns.isin(nulls.index)]

In [17]:
valid_columns

Index(['mp', 'fg', 'fga', 'fg%', '3p', '3pa', '3p%', 'ft', 'fta', 'ft%',
       ...
       'usg%_max_opp', 'ortg_max_opp', 'drtg_max_opp', 'team_opp', 'total_opp',
       'home_opp', 'season', 'date', 'won', 'target'],
      dtype='object', length=140)

In [18]:
df = df[valid_columns].copy()

In [19]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier

In [20]:
rr = RidgeClassifier(alpha=1)
split = TimeSeriesSplit(n_splits = 3)

sfs = SequentialFeatureSelector(rr, n_features_to_select = 25, direction="forward", cv=split)

In [21]:
# Scale the columns so they fall between 0 and 1
removed_columns = ["season", "date", "won", "target", "team", "team_opp"]

In [22]:
selected_columns = df.columns[~df.columns.isin(removed_columns)]

In [23]:
from sklearn.preprocessing import MinMaxScaler

In [24]:
scaler = MinMaxScaler()
df[selected_columns] = scaler.fit_transform(df[selected_columns])

In [25]:
sfs.fit(df[selected_columns], df["target"])

In [26]:
predictors = list(selected_columns[sfs.get_support()])

In [27]:
predictors

['fg%',
 '3p',
 'drb',
 'blk',
 'pf',
 'efg%',
 'ftr',
 'drb%',
 'tov%',
 'usg%',
 'fg_max',
 'fta_max',
 'tov_max',
 'orb_opp',
 'stl_opp',
 'pts_opp',
 '3par_opp',
 'orb%_opp',
 'usg%_opp',
 '3pa_max_opp',
 'fta_max_opp',
 'ft%_max_opp',
 'orb%_max_opp',
 'trb%_max_opp',
 'total_opp']

In [28]:
def backtest(data, model, predictors, start=2, step=1):
    all_predictions = []
    seasons = sorted(data["season"].unique())
    
    for i in range(start, len(seasons), step):
        season = seasons[i]
        
        train = data[data["season"] < season]
        test = data[data["season"] == season]
        
        model.fit(train[predictors], train["target"])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        
        combined = pd.concat([test["target"], preds], axis=1)
        combined.columns = ["actual", "predictions"]
        
        all_predictions.append(combined)
    
    return pd.concat(all_predictions)

In [29]:
predictions = backtest(df, rr, predictors)

In [30]:
from sklearn.metrics import accuracy_score

In [31]:
predictions = predictions[predictions["actual"] != 2]
accuracy_score(predictions["actual"], predictions["predictions"])

0.5484913793103449

In [32]:
df.groupby("home").apply(lambda x: x[x["won"] == 1].shape[0] / x.shape[0])

  df.groupby("home").apply(lambda x: x[x["won"] == 1].shape[0] / x.shape[0])


home
0.0    0.438077
1.0    0.561923
dtype: float64

In [33]:
df_rolling = df[list(selected_columns) + ["won", "team", "season"]]

In [34]:
df_rolling

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,total_opp,home_opp,won,team,season
0,0.000000,0.357143,0.298246,0.468293,0.259259,0.358491,0.410211,0.523810,0.58,0.677143,...,0.045,0.170860,0.233333,0.262626,0.376623,0.417476,0.0,False,LAL,2021
1,0.000000,0.500000,0.508772,0.478049,0.444444,0.566038,0.480634,0.285714,0.34,0.624286,...,0.039,1.000000,0.178205,0.323232,0.428571,0.349515,1.0,True,LAC,2021
2,0.000000,0.452381,0.491228,0.439024,0.481481,0.471698,0.619718,0.571429,0.60,0.732857,...,0.127,0.475891,0.129487,0.833333,0.454545,0.252427,0.0,True,BRK,2021
3,0.000000,0.333333,0.614035,0.236585,0.296296,0.433962,0.397887,0.309524,0.42,0.502857,...,0.074,0.408805,0.235897,1.000000,0.168831,0.504854,1.0,False,GSW,2021
4,0.000000,0.452381,0.333333,0.558537,0.185185,0.188679,0.480634,0.333333,0.34,0.774286,...,0.043,0.344864,0.084615,0.227273,0.311688,0.388350,1.0,False,MIA,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8441,0.000000,0.357143,0.491228,0.331707,0.333333,0.452830,0.434859,0.261905,0.42,0.378571,...,0.307,0.475891,0.150000,0.641414,0.311688,0.514563,1.0,False,TOR,2024
8442,0.000000,0.571429,0.368421,0.673171,0.370370,0.415094,0.524648,0.785714,0.94,0.591429,...,0.028,0.213836,0.134615,0.782828,0.688312,0.601942,1.0,True,MIL,2024
8443,0.000000,0.476190,0.368421,0.558537,0.518519,0.622642,0.519366,0.285714,0.28,0.821429,...,0.103,0.213836,0.069231,0.348485,0.493506,0.213592,1.0,True,CLE,2024
8444,0.333333,0.595238,0.719298,0.439024,0.407407,0.622642,0.396127,0.357143,0.40,0.675714,...,0.093,0.101677,0.100000,0.590909,0.480519,0.524272,1.0,False,UTA,2024


In [39]:
def find_team_averages(team):
    team['won'] = team['won'].astype(int)
    rolling = team.rolling(10).mean()
    return rolling

df_rolling = df_rolling.groupby(["team", "season"], group_keys=False).apply(find_team_averages)

DataError: Cannot aggregate non-numeric type: object

In [None]:
df_rolling