In [3]:
import pandas as pd
import numpy as np
from datetime import timedelta
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [4]:
# --------------------------------------------------------------------
# 1. LOAD DATA
# --------------------------------------------------------------------
df = pd.read_csv('data/games.csv', parse_dates=['GAME_DATE_EST'])

# We will use 'x' as the number of recent games for rolling averages.
x = 35

In [5]:
# --------------------------------------------------------------------
# 2. CONVERT DATA TO LONG FORMAT
#    One row = (TEAM_ID, date, PTS, FG_PCT, FT_PCT, FG3_PCT, AST, REB, home/away)
# --------------------------------------------------------------------
home_stats = df[['GAME_ID', 'GAME_DATE_EST', 'HOME_TEAM_ID', 
                 'PTS_home', 'FG_PCT_home', 'FT_PCT_home', 'FG3_PCT_home',
                 'AST_home', 'REB_home']].copy()
home_stats.rename(columns={
    'HOME_TEAM_ID': 'TEAM_ID',
    'PTS_home': 'PTS',
    'FG_PCT_home': 'FG_PCT',
    'FT_PCT_home': 'FT_PCT',
    'FG3_PCT_home': 'FG3_PCT',
    'AST_home': 'AST',
    'REB_home': 'REB'
}, inplace=True)
home_stats['home_away'] = 'home'

away_stats = df[['GAME_ID', 'GAME_DATE_EST', 'TEAM_ID_away',
                 'PTS_away', 'FG_PCT_away', 'FT_PCT_away', 'FG3_PCT_away',
                 'AST_away', 'REB_away']].copy()
away_stats.rename(columns={
    'TEAM_ID_away': 'TEAM_ID',
    'PTS_away': 'PTS',
    'FG_PCT_away': 'FG_PCT',
    'FT_PCT_away': 'FT_PCT',
    'FG3_PCT_away': 'FG3_PCT',
    'AST_away': 'AST',
    'REB_away': 'REB'
}, inplace=True)
away_stats['home_away'] = 'away'

team_stats_long = pd.concat([home_stats, away_stats], ignore_index=True)
team_stats_long.sort_values(['TEAM_ID', 'GAME_DATE_EST'], inplace=True)

In [6]:
# --------------------------------------------------------------------
# 3. COMPUTE ROLLING AVERAGES & REST DAYS
#    - rolling averages for last x games (e.g., PTS, FG%, etc.)
#    - rest days: difference (in days) between consecutive games for the same team
# --------------------------------------------------------------------
def compute_rolling_and_rest(group):
    group = group.sort_values('GAME_DATE_EST')
    
    # Calculate REST_DAYS as difference from previous game date
    group['REST_DAYS'] = group['GAME_DATE_EST'].diff().dt.days
    
    # shift(1) so the current game doesn't include itself in rolling stats
    group['rolling_PTS']     = group['PTS'].shift(1).rolling(window=x, min_periods=x).mean()
    group['rolling_FG_PCT']  = group['FG_PCT'].shift(1).rolling(window=x, min_periods=x).mean()
    group['rolling_FT_PCT']  = group['FT_PCT'].shift(1).rolling(window=x, min_periods=x).mean()
    group['rolling_FG3_PCT'] = group['FG3_PCT'].shift(1).rolling(window=x, min_periods=x).mean()
    group['rolling_AST']     = group['AST'].shift(1).rolling(window=x, min_periods=x).mean()
    group['rolling_REB']     = group['REB'].shift(1).rolling(window=x, min_periods=x).mean()
    
    # shift(1) for REST_DAYS so that the rest for the current game is from the previous game
    group['REST_DAYS'] = group['REST_DAYS'].shift(1)
    
    return group

team_stats_long = team_stats_long.groupby('TEAM_ID').apply(compute_rolling_and_rest)
team_stats_long = team_stats_long.dropna(subset=['rolling_PTS'])  # ensure at least x prior games


  team_stats_long = team_stats_long.groupby('TEAM_ID').apply(compute_rolling_and_rest)


In [7]:
# --------------------------------------------------------------------
# 4. MERGE ROLLING & REST FEATURES BACK INTO WIDE FORMAT
# --------------------------------------------------------------------
def get_team_rolling_stats(game_date, team_id, team_stats):
    # Filter to games for this team before 'game_date'
    subset = team_stats[(team_stats['TEAM_ID'] == team_id) & (team_stats['GAME_DATE_EST'] < game_date)]
    if subset.empty:
        return pd.Series({
            'rolling_PTS': np.nan,
            'rolling_FG_PCT': np.nan,
            'rolling_FT_PCT': np.nan,
            'rolling_FG3_PCT': np.nan,
            'rolling_AST': np.nan,
            'rolling_REB': np.nan,
            'REST_DAYS': np.nan
        })
    else:
        # Return the last row (most recent game) for that team before 'game_date'
        last_row = subset.iloc[-1]
        return pd.Series({
            'rolling_PTS': last_row['rolling_PTS'],
            'rolling_FG_PCT': last_row['rolling_FG_PCT'],
            'rolling_FT_PCT': last_row['rolling_FT_PCT'],
            'rolling_FG3_PCT': last_row['rolling_FG3_PCT'],
            'rolling_AST': last_row['rolling_AST'],
            'rolling_REB': last_row['rolling_REB'],
            'REST_DAYS': last_row['REST_DAYS']
        })

# Create columns for home & away rolling stats + rest days
for stat in ['rolling_PTS', 'rolling_FG_PCT', 'rolling_FT_PCT', 'rolling_FG3_PCT', 'rolling_AST', 'rolling_REB', 'REST_DAYS']:
    df[f'home_{stat}'] = np.nan
    df[f'away_{stat}'] = np.nan

for idx, row in df.iterrows():
    home_info = get_team_rolling_stats(row['GAME_DATE_EST'], row['HOME_TEAM_ID'], team_stats_long)
    away_info = get_team_rolling_stats(row['GAME_DATE_EST'], row['TEAM_ID_away'], team_stats_long)
    
    df.at[idx, 'home_rolling_PTS']     = home_info['rolling_PTS']
    df.at[idx, 'home_rolling_FG_PCT']  = home_info['rolling_FG_PCT']
    df.at[idx, 'home_rolling_FT_PCT']  = home_info['rolling_FT_PCT']
    df.at[idx, 'home_rolling_FG3_PCT'] = home_info['rolling_FG3_PCT']
    df.at[idx, 'home_rolling_AST']     = home_info['rolling_AST']
    df.at[idx, 'home_rolling_REB']     = home_info['rolling_REB']
    df.at[idx, 'home_REST_DAYS']       = home_info['REST_DAYS']
    
    df.at[idx, 'away_rolling_PTS']     = away_info['rolling_PTS']
    df.at[idx, 'away_rolling_FG_PCT']  = away_info['rolling_FG_PCT']
    df.at[idx, 'away_rolling_FT_PCT']  = away_info['rolling_FT_PCT']
    df.at[idx, 'away_rolling_FG3_PCT'] = away_info['rolling_FG3_PCT']
    df.at[idx, 'away_rolling_AST']     = away_info['rolling_AST']
    df.at[idx, 'away_rolling_REB']     = away_info['rolling_REB']
    df.at[idx, 'away_REST_DAYS']       = away_info['REST_DAYS']

# Drop rows where we lack rolling stats or rest days for either team
df = df.dropna(subset=[
    'home_rolling_PTS', 'away_rolling_PTS',
    'home_REST_DAYS', 'away_REST_DAYS'
])

In [8]:
# --------------------------------------------------------------------
# 5. CREATE DIFFERENCE FEATURES
# --------------------------------------------------------------------
# We'll create difference features for rolling stats and for REST_DAYS as well.
for stat in ['rolling_PTS', 'rolling_FG_PCT', 'rolling_FT_PCT', 'rolling_FG3_PCT', 'rolling_AST', 'rolling_REB', 'REST_DAYS']:
    df[f'diff_{stat}'] = df[f'home_{stat}'] - df[f'away_{stat}']

In [9]:
# --------------------------------------------------------------------
# 6. SET UP FEATURE MATRIX (X) AND TARGET (y)
# --------------------------------------------------------------------
feature_cols = (
    [f'home_{stat}' for stat in ['rolling_PTS', 'rolling_FG_PCT', 'rolling_FT_PCT', 'rolling_FG3_PCT', 'rolling_AST', 'rolling_REB', 'REST_DAYS']] +
    [f'away_{stat}' for stat in ['rolling_PTS', 'rolling_FG_PCT', 'rolling_FT_PCT', 'rolling_FG3_PCT', 'rolling_AST', 'rolling_REB', 'REST_DAYS']] +
    [f'diff_{stat}' for stat in ['rolling_PTS', 'rolling_FG_PCT', 'rolling_FT_PCT', 'rolling_FG3_PCT', 'rolling_AST', 'rolling_REB', 'REST_DAYS']]
)

X = df[feature_cols].copy()
y = df['HOME_TEAM_WINS'].copy()  # 1 if home team wins, 0 otherwise

In [10]:
# --------------------------------------------------------------------
# 7. TIME SERIES SPLIT
# --------------------------------------------------------------------
# We sort by date so that earlier games come before later games.
df_sorted = df.sort_values(by='GAME_DATE_EST')
X_sorted = X.loc[df_sorted.index]
y_sorted = y.loc[df_sorted.index]

# We'll use a 5-fold time series split for demonstration.
tscv = TimeSeriesSplit(n_splits=5)

In [11]:
# --------------------------------------------------------------------
# 8. XGBOOST + GRID SEARCH
# --------------------------------------------------------------------
model = XGBClassifier(eval_metric='logloss', random_state=42)

param_grid = {
    'xgb__n_estimators': [100, 300],
    'xgb__max_depth': [3, 6, 9],
    'xgb__learning_rate': [0.01, 0.1],
    'xgb__subsample': [0.8, 1.0],
    'xgb__colsample_bytree': [0.8, 1.0]
}


# We'll do a pipeline with scaling + XGB. (XGBoost often handles unscaled data well,
# but scaling won't hurt, especially if we consider alternative models later.)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('xgb', model)
])

grid_search = GridSearchCV(
    pipeline, 
    param_grid, 
    scoring='accuracy', 
    cv=tscv, 
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_sorted, y_sorted)
print("Best parameters:", grid_search.best_params_)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best parameters: {'xgb__colsample_bytree': 0.8, 'xgb__learning_rate': 0.01, 'xgb__max_depth': 6, 'xgb__n_estimators': 300, 'xgb__subsample': 0.8}


In [12]:
# --------------------------------------------------------------------
# 9. FINAL EVALUATION
# --------------------------------------------------------------------
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_sorted)
acc = accuracy_score(y_sorted, y_pred)
print("Final Accuracy (on entire dataset):", acc)

Final Accuracy (on entire dataset): 0.6710698605869213


In [13]:
import pickle

# Save the best estimator (the full pipeline)
with open('xgb_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)