In [27]:
import pandas as pd
import numpy as np
import math
import random

In [28]:
year = 2024
games_df = pd.read_csv(f'game-data/{year}games.csv', index_col='Unnamed: 0')

In [29]:
advanced_metrics = [
    "AdjO", "AdjD", "EffO", "eFG%", "TO%", "Reb%", "FTR", "EffD", 
    "Opp eFG%", "Opp TO%", "Opp Reb%", "Opp FTR"
]


# compute a 4-game rolling average for each metric, excluding the current game (shifted values)
for metric in advanced_metrics:
    rolling_col_name = f"4G Avg {metric}"
    games_df[rolling_col_name] = games_df.groupby("Team")[metric].shift(1).rolling(window=4, min_periods=1).mean()

In [30]:
games_df.tail(5)

Unnamed: 0,Date,Team,Conference,Opponent,Venue,Result,AdjO,AdjD,EffO,eFG%,...,4G Avg EffO,4G Avg eFG%,4G Avg TO%,4G Avg Reb%,4G Avg FTR,4G Avg EffD,4G Avg Opp eFG%,4G Avg Opp TO%,4G Avg Opp Reb%,4G Avg Opp FTR
6863,2024-04-06,Purdue,B10,N.C. State,N,W,104.0,71.9,97.9,49.1,...,116.725,55.35,13.025,30.95,36.225,96.35,46.3,11.925,19.425,21.175
204,2024-04-06,Alabama,SEC,Connecticut,N,L,136.3,109.8,114.5,54.3,...,119.8,57.05,13.9,32.975,39.225,102.7,49.125,11.375,21.925,20.425
6862,2024-04-06,N.C. State,ACC,Purdue,N,L,87.0,81.8,77.7,41.2,...,113.95,52.8,13.375,32.725,41.1,95.825,41.875,11.65,25.1,27.6
2092,2024-04-08,Connecticut,BE,Purdue,N,W,135.0,83.5,122.6,53.2,...,120.275,53.775,11.725,35.15,41.275,105.625,47.975,11.925,24.275,26.0
2093,2024-04-08,Purdue,B10,Connecticut,N,L,115.3,99.6,98.1,45.4,...,117.875,54.025,14.225,33.625,30.25,100.4,46.375,13.575,24.7,23.325


In [31]:
games_df.columns

Index(['Date', 'Team', 'Conference', 'Opponent', 'Venue', 'Result', 'AdjO',
       'AdjD', 'EffO', 'eFG%', 'TO%', 'Reb%', 'FTR', 'EffD', 'Opp eFG%',
       'Opp TO%', 'Opp Reb%', 'Opp FTR', 'G-SC', 'Opponent Conference',
       'Game Tempo', 'Game Unique ID', 'Coach', 'Opponent Coach', 'Unknown',
       'Game Importance', 'Team Points', 'Opponent Points',
       'Point Differential', 'Pre-Game Team Elo', 'Pre-Game Opponent Elo',
       'Post-Game Team Elo', 'Post-Game Opponent Elo', '4G Avg AdjO',
       '4G Avg AdjD', '4G Avg EffO', '4G Avg eFG%', '4G Avg TO%',
       '4G Avg Reb%', '4G Avg FTR', '4G Avg EffD', '4G Avg Opp eFG%',
       '4G Avg Opp TO%', '4G Avg Opp Reb%', '4G Avg Opp FTR'],
      dtype='object')

In [32]:
!pip install xgboost


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

features = [
    'Point Differential', 'Pre-Game Team Elo', 'Pre-Game Opponent Elo',
    '4G Avg AdjO', '4G Avg AdjD', '4G Avg EffO', '4G Avg eFG%', '4G Avg TO%',
    '4G Avg Reb%', '4G Avg FTR', '4G Avg EffD', '4G Avg Opp eFG%',
    '4G Avg Opp TO%', '4G Avg Opp Reb%', '4G Avg Opp FTR'
]

target = 'Result'  # This column should have 'W' or 'L'

games_df[target] = games_df[target].apply(lambda x: 1 if x == 'W' else 0)

df_clean = games_df.dropna(subset=features + [target])

X = df_clean[features]
y = df_clean[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.05, max_depth=4, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

accuracy


1.0