In [1]:
import numpy as np
import pandas as pd
import math
from datetime import datetime

# XGBoost
import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import accuracy_score, confusion_matrix, make_scorer, f1_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import seaborn as sns # confusion matrix

In [2]:
# Dead kernel?
# NOTE(review): the question above is the original author's; presumably this
# import has crashed the kernel before — unconfirmed, verify before relying on it.
# NFLUtils is a project-local helper. This notebook uses it for
# get_current_date_games(), get_track_dict(), track_cols and kelly_criterion().
from NFLUtils import NFLUtils
nfl_utils = NFLUtils()

## Global variables go here

In [3]:
# Season to predict: used to look up the current date's games and to filter
# combined.csv down to a single season's rows.
current_season = 2024

In [5]:
# Games to predict. They are appended to df further down and used to label
# the predictions at the end of the notebook.
# array of { 'Season':<>, 'Home_Team':<>, 'Visitor_Team':<>, 'Date':<> }
# NOTE(review): the meaning of the second argument (0) is not visible in this
# notebook — presumably a day offset from today; confirm in NFLUtils.
current_date_games = nfl_utils.get_current_date_games(current_season, 0)
print(current_date_games)

[{'Season': 2024, 'Home_Team': 'PHI', 'Visitor_Team': 'WAS', 'Date': '2025-01-26'}, {'Season': 2024, 'Home_Team': 'KAN', 'Visitor_Team': 'BUF', 'Date': '2025-01-26'}]


In [6]:
# Hand-entered slate from an earlier run, including decimal odds per side.
# Nothing else in this notebook reads it; kept for reference only.
old_current_date_games = [
    {
        'Season': 2024,
        'Home_Team': 'BAL',
        'Visitor_Team': 'CLE',
        'Home_Odds': 1.04,
        'Visitor_Odds': 12,
        'Date': '2025-01-04',
    },
    {
        'Season': 2024,
        'Home_Team': 'PIT',
        'Visitor_Team': 'CIN',
        'Home_Odds': 2.1,
        'Visitor_Odds': 1.74,
        'Date': '2025-01-04',
    },
]

## Define cont_cols, y_col

In [7]:
# Feature columns fed to the model, in the exact order the model receives them.
# Every entry is a 'D_' column, computed later as home value minus visitor value.
# The list is assembled by flattening the labelled groups below, so the final
# order is simply top-to-bottom.
cont_cols = [
    column
    for group in (
        # Days since last game (Home - visitor)
        ['D_datediff'],

        # first downs
        ['D_First_Downs'],

        # Basic Stats
        [
            'D_Rush', 'D_Yds', 'D_TDs',
            'D_Cmp', 'D_Att', 'D_Yd', 'D_TD', 'D_INT',
            'D_Sacked', 'D_Yards', 'D_Net_Pass_Yards', 'D_Total_Yards',
            'D_Fumbles', 'D_Lost', 'D_Turnovers', 'D_Penalties',
        ],

        # Passing Detailed
        [
            'D_passing_att', 'D_passing_cmp', 'D_passing_int', 'D_passing_lng',
            'D_passing_sk', 'D_passing_td', 'D_passing_yds',
        ],

        # Receiving
        ['D_receiving_lng', 'D_receiving_td', 'D_receiving_yds'],

        # Rushing Detailed
        ['D_rushing_att', 'D_rushing_lng', 'D_rushing_td', 'D_rushing_yds'],

        # Defense interceptions ('D_def_interceptions_pd' is deliberately left out)
        [
            'D_def_interceptions_int', 'D_def_interceptions_lng',
            'D_def_interceptions_td', 'D_def_interceptions_yds',
        ],

        # Defense fumbles
        ['D_fumbles_ff', 'D_fumbles_fr', 'D_fumbles_td', 'D_fumbles_yds'],

        # Defense tackles ('D_tackles_qbhits' and 'D_tackles_tfl' are left out)
        ['D_sk', 'D_tackles_ast', 'D_tackles_comb', 'D_tackles_solo'],

        # Kick Returns
        [
            'D_kick_returns_lng', 'D_kick_returns_rt',
            'D_kick_returns_td', 'D_kick_returns_yds',
        ],

        # Punt Returns
        [
            'D_punt_returns_lng', 'D_punt_returns_ret',
            'D_punt_returns_td', 'D_punt_returns_yds',
        ],

        # Punting / Scoring
        [
            'D_punting_lng', 'D_punting_pnt', 'D_punting_yds',
            'D_scoring_fga', 'D_scoring_fgm', 'D_scoring_xpa', 'D_scoring_xpm',
        ],
    )
    for column in group
]


# Label column(s): home-team win indicator.
y_col = ['H_Won']

## Create df from combined.csv
Then keep only the rows from the current season and append the current date's games.

In [8]:
# Load every recorded game and order the rows chronologically.
df = pd.read_csv('footballData/combined.csv', index_col=False, low_memory=False)
df = df.sort_values(by='Date')

# Create the H_Won column: 1.0 when the home score is strictly greater than
# the visitor score, otherwise 0.0 (so a tie is labelled 0.0).
df['H_Won'] = np.where(df['H_Final'] > df['V_Final'], 1.0, 0.0)
print(df.shape)

# Filter out all but the current season
df = df[df['Season'] == current_season]
print(df.shape)
# print(df.tail())

# Add games from current date as extra rows at the end of the frame.
# DataFrame.append was removed in pandas 2.0, so the rows are added with a
# single pd.concat instead of one append per game. Columns missing from the
# game dicts (stats, H_Won, ...) are filled with NaN, and the index is reset
# to 0..n-1, exactly as append(..., ignore_index=True) did.
# The guard keeps the frame (and its index) untouched when there are no games,
# matching the original loop, which did nothing for an empty list.
if current_date_games:
    df = pd.concat([df, pd.DataFrame(current_date_games)], ignore_index=True)


(7989, 145)
(281, 145)


In [9]:
# Per-team history lookup built from df by the project helper.
# The feature cell below reads it with keys of the form
# '<Season>_<Team>_<col>' (including '<Season>_<Team>_Date') and treats every
# value as a list (len(), .index(), slicing).
# NOTE(review): presumably each list is in chronological order because df was
# sorted by Date before this call — confirm in NFLUtils.get_track_dict.
track_dict = nfl_utils.get_track_dict(df)
# print(track_dict)

In [10]:
# Replace each game's raw H_/V_ stats with an exponential moving average of the
# team's earlier games in the same season, add days-since-last-game columns,
# drop games that fall too early in a team's schedule, and finally build the
# D_ (home minus visitor) difference columns that the model consumes.
minimum_window = 4
print(df.shape)

indices_to_drop = []
current_count = 0
# Shared list owned by nfl_utils: it is only READ here. The original cell
# appended 'datediff' to it and popped it again afterwards, which left the
# helper's state corrupted for every later run if anything raised in between.
track_cols = nfl_utils.track_cols

for row in df.itertuples():
    # Progress indicator every 10 rows.
    if current_count % 10 == 0:
        print(f'{current_count}/{df.shape[0]}')
    current_count = current_count + 1
    index = row.Index
    # year = row.Date.split('-')[0]
    year = row.Season
    # (column prefix, team) for both sides of the game; home is always handled first.
    sides = (('H', row.Home_Team), ('V', row.Visitor_Team))

    # Drop the row when, for either team, the game is not later than the team's
    # entry at position `minimum_window` in its date list (i.e. one of the
    # team's first minimum_window + 1 dates). Teams whose date list has
    # minimum_window entries or fewer are never dropped by this check.
    if any(
        len(track_dict[f'{year}_{team}_Date']) > minimum_window
        and row.Date <= track_dict[f'{year}_{team}_Date'][minimum_window]
        for _, team in sides
    ):
        indices_to_drop.append(index)
        continue

    # Position of this game inside each team's date list.
    date_index = {
        prefix: track_dict[f'{year}_{team}_Date'].index(row.Date)
        for prefix, team in sides
    }

    # Update df to have the EMA for every tracked column except track_cols[0]
    # (the original note says the skipped leading entry is 'Date').
    # NOTE(review): the history slice stops at date_index - 1, so the team's
    # most recent previous game is excluded as well as the current one. For a
    # team that is not dropped above, date_index 0 would slice [:-1] (pulling
    # in later games) and date_index 1 would give an empty history, for which
    # ewm(span=0) raises ValueError. Left unchanged because the saved model
    # presumably expects features built this way — confirm against training.
    for col in track_cols[1:]:
        for prefix, team in sides:
            history = track_dict[f'{year}_{team}_{col}'][:date_index[prefix] - 1]
            span = min(minimum_window, len(history))
            ema = pd.Series(history).ewm(span=span, adjust=False).mean().iloc[-1]
            df.at[index, f'{prefix}_{col}'] = ema

    # Add num days since last game for home, visitor (0 when there is no earlier game).
    for prefix, team in sides:
        team_dates = track_dict[f'{year}_{team}_Date']
        position = date_index[prefix]
        df.at[index, f'{prefix}_datediff'] = 0
        if position > 0:
            current_game_date = datetime.strptime(team_dates[position], "%Y-%m-%d")
            previous_game_date = datetime.strptime(team_dates[position - 1], "%Y-%m-%d")
            game_diff = int((current_game_date - previous_game_date).days)
            # print(f'{current_game_date} minus {previous_game_date} is {game_diff}')
            df.at[index, f'{prefix}_datediff'] = game_diff

# Remove the too-early games collected above.
df = df.drop(index=indices_to_drop)
print(df.shape)

# D_<col> = H_<col> - V_<col> for every tracked column plus 'datediff'.
# A new list is built so nfl_utils.track_cols itself is never modified.
for col in track_cols[1:] + ['datediff']:
    df['D_' + col] = df['H_' + col] - df['V_' + col]


print(df.shape)
# print(df.tail())

(283, 145)
0/283
10/283
20/283
30/283
40/283
50/283
60/283
70/283
80/283
90/283
100/283
110/283
120/283
130/283
140/283
150/283
160/283
170/283
180/283
190/283
200/283
210/283
220/283
230/283
240/283
250/283
260/283
270/283
280/283
(201, 147)
(201, 208)


In [11]:
# df.head()

### 3. Create an array of continuous values
NumPy array `conts`: the continuous feature columns stacked side by side, one array column per feature.

In [12]:
# Split the engineered frame into the label column(s) and the model's feature
# columns; df itself is left as it is.
# (Duplicate-column removal was left disabled in the original:
#  df = df.loc[:, ~df.columns.duplicated()].copy())
y_df = df[y_col]
cont_df = df[cont_cols]

print(df.shape)

(201, 208)


In [13]:
# List the selected feature columns one per line, then show the frame's shape
# as the cell's output.
print(*cont_df.columns, sep='\n')
cont_df.shape

D_datediff
D_First_Downs
D_Rush
D_Yds
D_TDs
D_Cmp
D_Att
D_Yd
D_TD
D_INT
D_Sacked
D_Yards
D_Net_Pass_Yards
D_Total_Yards
D_Fumbles
D_Lost
D_Turnovers
D_Penalties
D_passing_att
D_passing_cmp
D_passing_int
D_passing_lng
D_passing_sk
D_passing_td
D_passing_yds
D_receiving_lng
D_receiving_td
D_receiving_yds
D_rushing_att
D_rushing_lng
D_rushing_td
D_rushing_yds
D_def_interceptions_int
D_def_interceptions_lng
D_def_interceptions_td
D_def_interceptions_yds
D_fumbles_ff
D_fumbles_fr
D_fumbles_td
D_fumbles_yds
D_sk
D_tackles_ast
D_tackles_comb
D_tackles_solo
D_kick_returns_lng
D_kick_returns_rt
D_kick_returns_td
D_kick_returns_yds
D_punt_returns_lng
D_punt_returns_ret
D_punt_returns_td
D_punt_returns_yds
D_punting_lng
D_punting_pnt
D_punting_yds
D_scoring_fga
D_scoring_fgm
D_scoring_xpa
D_scoring_xpm


(201, 59)

In [14]:
# Feature matrix: one array column per feature, in cont_df's column order.
conts = np.stack([cont_df[col].values for col in cont_df.columns], axis=1)

# Label matrix, one array column per name in y_col.
# The original cell stored this array back into `y_col`, replacing the list of
# column names; re-running this cell, or the earlier `df[y_col]` cell, then
# failed. The array now goes straight into y_train and y_col keeps its names.
y_train = np.stack([y_df[col].values for col in y_col], axis=1)

conts_train = conts
print(conts.shape)
print(y_train.shape)

(201, 59)
(201, 1)


In [15]:
# The current-date games were appended to the end of df, so their feature rows
# are the trailing rows of conts. The start offset is computed explicitly
# rather than with a negative index, so zero games yields an empty slice.
n_current_games = len(current_date_games)
first_current_row = conts.shape[0] - n_current_games
conts_current_date = conts[first_current_row:]
print(conts_current_date.shape)

(2, 59)


# Load XGBoost model
from xgboost_model.bin

In [16]:
# Load the previously trained model from disk.
# A regressor (not the classifier alternative commented out below) is used, so
# predict() returns a continuous score; the prediction cell thresholds that
# score around 0.5.
# model = xgb.XGBClassifier()
model = xgb.XGBRegressor()
model.load_model('xgboost_model.bin')

In [19]:
# Score the current-date games and print a pick plus stake for each one.
y_pred = model.predict(conts_current_date)
# Params
current_account_value = 738

# From tuning kelly hyperparameters
confidence_threshold = 0.0
# model_win_probability = 0.665
position_size = 0.1
ignore_games = 0
# LAC, LVR?

# Stake per game: position_size of the account, capped at an equal share of
# the account across the games being bet on. With nothing to bet on (no
# predictions, or ignore_games >= number of predictions) the stake is 0; the
# original expression raised ZeroDivisionError or went negative in that case.
games_to_bet = len(y_pred) - ignore_games
if games_to_bet > 0:
    adjusted_position_size = min(position_size, (100.0 / games_to_bet) / 100)
else:
    adjusted_position_size = 0.0
bet_size = current_account_value * adjusted_position_size

# Predictions are in the same order as current_date_games.
# Odds are not used here: the fetched games carry no odds fields.
for game, score in zip(current_date_games, y_pred):
    if score > 0.5 + confidence_threshold:
        # Score above the upper threshold: pick the home team.
        print(f"Prediction: {game['Home_Team']} win against {game['Visitor_Team']}")
    elif score < 0.5 - confidence_threshold:
        # Score below the lower threshold: pick the visiting team.
        print(f"Prediction: {game['Visitor_Team']} win against {game['Home_Team']}")
    else:
        # Score inside the no-bet band (exactly 0.5 when the threshold is 0).
        print(f"Skipping {game['Visitor_Team']} vs {game['Home_Team']} game.")
        print(score)
        continue

    print(f"Bet size: {bet_size}")

Prediction: WAS win against PHI
Bet size: 73.8
Prediction: BUF win against KAN
Bet size: 73.8


In [18]:
# Kelly-scaled stakes for hand-entered bets.
# NOTE: this cell overwrites position_size and bet_size from the prediction
# cell, and reads current_account_value from it.
position_size = 0.25
bet_size = current_account_value * position_size

# kelly_criterion arguments, per the original note:
# win_prob, prediction (1 or 0 with xgb), fractional_odds
for fractional_odds in (
    1.285,  # MIN win against CHI
    1.363,  # ATL win against LVR
):
    bet_amt = bet_size * nfl_utils.kelly_criterion(0.67, 1, fractional_odds)
    print(bet_amt)

182.12494649805447
184.0189389214967
