In [59]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import RidgeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

CSV_PATH = "../data_collection/scraped_data/combined_historic_data/data.csv"
ROLL_WINDOW = 5
ELO_K = 20.0
ELO_H = 65.0
ELO_BASE = 1500.0
SEASONAL_RESET = True

In [60]:
df = pd.read_csv(CSV_PATH)
print("shape:", df.shape)
df.head()

shape: (14684, 43)


Unnamed: 0.1,Unnamed: 0,SEASON,DATE,HOME/AWAY,TEAM,PTS,TEAM_OPP,PTS_OPP,WIN/LOSS,FG,...,FTA_OPP,FT_PCT_OPP,ORB_OPP,DRB_OPP,TRB_OPP,AST_OPP,STL_OPP,BLK_OPP,TOV_OPP,PF_OPP
0,0,2019,2018-10-16,AWAY,PHI,87,BOS,105,LOSS,34,...,14,0.714,12,43,55,21,7,5,15,20
1,1,2019,2018-10-16,HOME,BOS,105,PHI,87,WIN,42,...,23,0.609,6,41,47,18,8,5,16,20
2,2,2019,2018-10-16,AWAY,OKC,100,GSW,108,LOSS,33,...,18,0.944,17,41,58,28,7,7,21,29
3,3,2019,2018-10-16,HOME,GSW,108,OKC,100,WIN,42,...,37,0.649,16,29,45,21,12,6,15,21
4,4,2019,2018-10-17,AWAY,MIL,113,CHO,112,WIN,42,...,22,0.636,9,32,41,21,8,9,11,19


In [61]:
df["DATE"] = pd.to_datetime(df["DATE"])
df = df.sort_values(by=["SEASON", "TEAM", "DATE"]).reset_index(drop=True)

for cat_col in ["HOME/AWAY", "WIN/LOSS"]:
    if cat_col in df.columns:
        df[cat_col] = df[cat_col].astype("category")

if "HOME/AWAY" in df:
    df["IS_HOME"] = df["HOME/AWAY"].astype(str).str.upper().map({"HOME":1, "AWAY":0}).fillna(0).astype(int)
else:
    df["IS_HOME"] = 0

if "WIN/LOSS" in df:
    df["WIN"] = df["WIN/LOSS"].astype(str).str.upper().map({"WIN":1, "LOSS":0}).fillna(0).astype(int)

In [62]:
# implement elo ratings
def expected_score(r_a, r_b):
    return 1.0 / (1.0 + 10.0 ** ((r_b - r_a) / 400.0))

def add_elo_features(frame: pd.DataFrame,
                     base_elo=1500.0, k=20.0, h=65.0,
                     seasonal_reset=True) -> pd.DataFrame:
    frame = frame.copy()
    frame["team_elo_pre"] = np.nan
    frame["opp_elo_pre"]  = np.nan

    if seasonal_reset:
        season_iter = frame.groupby("SEASON", sort=False)
    else:
        season_iter = [("ALL", frame)]

    for _, sdf in season_iter:
        work = sdf.copy()
        # Pair the two rows for the same game using a robust key
        tmin = np.minimum(work["TEAM"].values, work["TEAM_OPP"].values)
        tmax = np.maximum(work["TEAM"].values, work["TEAM_OPP"].values)
        work["_game_key"] = list(zip(work["DATE"].values, tmin, tmax))

        elo = {}

        # Iterate games in chronological order
        for _, g in work.sort_values("DATE").groupby("_game_key", sort=False):
            if len(g) != 2:
                continue 

            i, j = g.index[0], g.index[1]
            team_i, team_j = frame.loc[i, "TEAM"], frame.loc[j, "TEAM"]

            R_i = elo.get(team_i, base_elo)
            R_j = elo.get(team_j, base_elo)

            # Expected scores with home advantage applied only to expectation
            R_i_eff = R_i + (h if frame.loc[i, "IS_HOME"] == 1 else 0.0)
            R_j_eff = R_j + (h if frame.loc[j, "IS_HOME"] == 1 else 0.0)

            E_i = expected_score(R_i_eff, R_j_eff)

            # Store PRE-game Elos (no leakage)
            frame.at[i, "team_elo_pre"] = R_i
            frame.at[i, "opp_elo_pre"]  = R_j
            frame.at[j, "team_elo_pre"] = R_j
            frame.at[j, "opp_elo_pre"]  = R_i

            # Update ratings using actual results
            S_i = float(frame.loc[i, "WIN"])
            S_j = float(frame.loc[j, "WIN"])

            elo[team_i] = R_i + k * (S_i - E_i)
            elo[team_j] = R_j + k * (S_j - (1.0 - E_i))

    frame["elo_diff"] = frame["team_elo_pre"] - frame["opp_elo_pre"]
    return frame

df = add_elo_features(df, base_elo=ELO_BASE, k=ELO_K, h=ELO_H, seasonal_reset=SEASONAL_RESET)

In [63]:
home_stats = [
    "PTS","FG","FGA","FG_PCT","FG3","FG3A","FG3_PCT","FT","FTA","FT_PCT",
    "ORB","DRB","TRB","AST","STL","BLK","TOV","PF"
]
away_stats = [
    "PTS_OPP","FG_OPP","FGA_OPP","FG_PCT_OPP","FG3_OPP","FG3A_OPP","FG3_PCT_OPP",
    "FT_OPP","FTA_OPP","FT_PCT_OPP","ORB_OPP","DRB_OPP","TRB_OPP",
    "AST_OPP","STL_OPP","BLK_OPP","TOV_OPP","PF_OPP"
]
roll_cols = home_stats + away_stats

def add_rolling_features(frame: pd.DataFrame, cols, window=5):
    out = frame.copy()
    g = out.groupby(["SEASON", "TEAM"], group_keys=False)
    for c in cols:
        out[f"{c}_roll{window}"] = (
            g[c].apply(lambda s: s.shift(1).rolling(window, min_periods=window).mean()).values
        )
    return out

df = add_rolling_features(df, roll_cols, window=ROLL_WINDOW)

needed = [f"{c}_roll{ROLL_WINDOW}" for c in roll_cols] + ["team_elo_pre", "opp_elo_pre", "elo_diff"]
df_model = df.dropna(subset=needed).copy()
df_model.head()

Unnamed: 0.1,Unnamed: 0,SEASON,DATE,HOME/AWAY,TEAM,PTS,TEAM_OPP,PTS_OPP,WIN/LOSS,FG,...,FTA_OPP_roll5,FT_PCT_OPP_roll5,ORB_OPP_roll5,DRB_OPP_roll5,TRB_OPP_roll5,AST_OPP_roll5,STL_OPP_roll5,BLK_OPP_roll5,TOV_OPP_roll5,PF_OPP_roll5
5,174,2019,2018-10-29,AWAY,ATL,92,PHI,113,LOSS,35,...,26.0,0.7086,9.8,38.2,48.0,25.2,9.4,5.4,18.2,25.0
6,192,2019,2018-10-30,AWAY,ATL,114,CLE,136,LOSS,44,...,23.4,0.7426,11.0,40.6,51.6,27.6,8.4,6.0,18.8,24.8
7,227,2019,2018-11-01,HOME,ATL,115,SAC,146,LOSS,42,...,23.4,0.7564,12.2,40.8,53.0,25.8,9.2,5.4,18.0,23.0
8,255,2019,2018-11-03,HOME,ATL,123,MIA,118,WIN,46,...,27.0,0.7904,11.2,41.2,52.4,28.8,11.0,5.6,17.2,23.8
9,296,2019,2018-11-06,AWAY,ATL,102,CHO,113,LOSS,41,...,27.0,0.8212,12.0,38.2,50.2,29.2,10.8,5.0,17.4,22.4


In [64]:
def season_time_split(frame: pd.DataFrame, frac_train=0.8):
    f = frame.sort_values("DATE")
    n = len(f)
    n_tr = int(np.floor(frac_train * n))
    return f.index[:n_tr], f.index[n_tr:]

# Build train and test indices across seasons
train_idx_list, test_idx_list = [], []
for season, sdf in df_model.groupby("SEASON"):
    tr_idx, te_idx = season_time_split(sdf, 0.8)
    train_idx_list.append(tr_idx)
    test_idx_list.append(te_idx)

train_idx = np.concatenate([idx.values if hasattr(idx, "values") else idx for idx in train_idx_list])
test_idx  = np.concatenate([idx.values if hasattr(idx, "values") else idx for idx in test_idx_list])

target = "WIN"
feature_cols = [f"{c}_roll{ROLL_WINDOW}" for c in roll_cols] + ["IS_HOME", "elo_diff"]
feature_cols = [c for c in feature_cols if c in df_model.columns]

X_train = df_model.loc[train_idx, feature_cols]
y_train = df_model.loc[train_idx, target].values
X_test  = df_model.loc[test_idx,  feature_cols]
y_test  = df_model.loc[test_idx,  target].values

display(df_model.loc[train_idx, feature_cols].head())

Unnamed: 0,PTS_roll5,FG_roll5,FGA_roll5,FG_PCT_roll5,FG3_roll5,FG3A_roll5,FG3_PCT_roll5,FT_roll5,FTA_roll5,FT_PCT_roll5,...,ORB_OPP_roll5,DRB_OPP_roll5,TRB_OPP_roll5,AST_OPP_roll5,STL_OPP_roll5,BLK_OPP_roll5,TOV_OPP_roll5,PF_OPP_roll5,IS_HOME,elo_diff
1648,106.4,39.6,94.6,0.4178,11.0,31.6,0.3474,16.2,21.4,0.7628,...,10.6,40.0,50.6,22.6,5.8,5.0,17.8,22.4,1,-54.50085
347,112.2,40.6,91.8,0.4424,13.6,36.2,0.3712,17.4,22.4,0.7736,...,9.8,38.4,48.2,24.4,6.2,7.0,16.8,22.6,1,14.004333
1484,116.2,40.6,88.6,0.46,10.4,26.8,0.3894,24.6,28.4,0.8448,...,14.0,31.2,45.2,24.2,6.2,5.6,14.8,23.2,1,-41.553285
2349,117.0,44.6,89.6,0.4982,12.6,33.6,0.3738,15.2,20.2,0.75,...,13.8,30.2,44.0,22.4,6.8,5.0,14.2,22.0,1,42.96052
2178,117.2,45.8,90.2,0.5062,9.2,23.0,0.395,16.4,25.0,0.6468,...,12.6,36.4,49.0,25.6,8.6,5.2,15.6,24.0,1,11.828828


In [65]:
# GBN 
clf = GaussianNB()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Overall Test Accuracy:", acc)
print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=4))

Overall Test Accuracy: 0.6082698585418934

Classification Report:
               precision    recall  f1-score   support

           0     0.6289    0.5304    0.5755      1380
           1     0.5932    0.6863    0.6364      1377

    accuracy                         0.6083      2757
   macro avg     0.6110    0.6084    0.6059      2757
weighted avg     0.6111    0.6083    0.6059      2757



In [66]:
# Ridge Classifier
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

ridge = RidgeClassifier(alpha=(3.0), class_weight="balanced")
ridge.fit(X_train_scaled, y_train)

y_pred = ridge.predict(X_test_scaled)
acc = accuracy_score(y_test, y_pred)
print("Overall Test Accuracy:", acc)
print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=4))

Overall Test Accuracy: 0.6525208560029017

Classification Report:
               precision    recall  f1-score   support

           0     0.6575    0.6384    0.6478      1380
           1     0.6478    0.6667    0.6571      1377

    accuracy                         0.6525      2757
   macro avg     0.6527    0.6525    0.6525      2757
weighted avg     0.6527    0.6525    0.6525      2757



In [67]:
# XGBoost Classifier
pos = (y_train == 1).sum()
neg = (y_train == 0).sum()
scale_pos_weight = (neg / pos) if pos > 0 else 1.0

xgb = XGBClassifier(
    n_estimators=600,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    min_child_weight=1.0,
    gamma=0.0,
    objective="binary:logistic",
    eval_metric="auc",
    tree_method="hist",
    scale_pos_weight=scale_pos_weight,
    n_jobs=-1,
    random_state=42
)

val_frac = 0.1
val_n = max(1, int(len(X_train) * val_frac))
X_tr, X_val = X_train[:-val_n], X_train[-val_n:]
y_tr, y_val = y_train[:-val_n], y_train[-val_n:]

xgb.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    verbose=False,
)

y_pred = xgb.predict(X_test)
y_prob = xgb.predict_proba(X_test)[:, 1]

print("Overall Test Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Overall Test Accuracy: 0.6307580703663402

Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.63      0.63      1380
           1       0.63      0.63      0.63      1377

    accuracy                           0.63      2757
   macro avg       0.63      0.63      0.63      2757
weighted avg       0.63      0.63      0.63      2757



GNB: 60.827%

Ridge: 65.252%

XGBoost: 62.133%

In [68]:
# Export Ridge Classifier

joblib.dump(ridge, 'ridge.pkl')

['ridge.pkl']