In [14]:
import numpy as np
import pandas as pd
import math
from datetime import datetime

# XGBoost
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, make_scorer, f1_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import seaborn as sns # confusion matrix

In [2]:
# Project-local helper class. The instance created here is used further down
# for get_current_date_games(), get_track_dict() and the track_cols attribute.
# NOTE(review): the previous comment on this cell read "Dead kernel?" -
# presumably this import/constructor has crashed the kernel before; confirm
# and document the cause if it is still reproducible.
from NFLUtils import NFLUtils
nfl_utils = NFLUtils()

## Global variables go here

In [3]:
# Season used both to look up the current date's games and to filter
# combined.csv down to a single season.
current_season = 2024

In [4]:
# array of { 'Season':<>, 'Home_Team':<>, 'Visitor_Team':<>, 'Date':<> }
# These entries are appended to df as extra rows and are the games the final
# cell prints predictions for, in this same order.
current_date_games = nfl_utils.get_current_date_games(current_season)
print(current_date_games)

[{'Season': 2024, 'Home_Team': 'NOR', 'Visitor_Team': 'WAS', 'Date': '2024-12-15'}, {'Season': 2024, 'Home_Team': 'JAX', 'Visitor_Team': 'NYJ', 'Date': '2024-12-15'}, {'Season': 2024, 'Home_Team': 'HOU', 'Visitor_Team': 'MIA', 'Date': '2024-12-15'}, {'Season': 2024, 'Home_Team': 'TEN', 'Visitor_Team': 'CIN', 'Date': '2024-12-15'}, {'Season': 2024, 'Home_Team': 'CLE', 'Visitor_Team': 'KAN', 'Date': '2024-12-15'}, {'Season': 2024, 'Home_Team': 'CAR', 'Visitor_Team': 'DAL', 'Date': '2024-12-15'}, {'Season': 2024, 'Home_Team': 'NYG', 'Visitor_Team': 'BAL', 'Date': '2024-12-15'}, {'Season': 2024, 'Home_Team': 'ARI', 'Visitor_Team': 'NWE', 'Date': '2024-12-15'}, {'Season': 2024, 'Home_Team': 'DET', 'Visitor_Team': 'BUF', 'Date': '2024-12-15'}, {'Season': 2024, 'Home_Team': 'LAC', 'Visitor_Team': 'TAM', 'Date': '2024-12-15'}, {'Season': 2024, 'Home_Team': 'PHI', 'Visitor_Team': 'PIT', 'Date': '2024-12-15'}, {'Season': 2024, 'Home_Team': 'DEN', 'Visitor_Team': 'IND', 'Date': '2024-12-15'}, {'S

## Define cont_cols, y_col

In [5]:
# Feature columns handed to the model, grouped by the kind of statistic.
# Each group is (description, column names); the order of groups and of the
# names inside them is the order of the columns in the feature matrix.
_CONT_COL_GROUPS = (
    ('Days since last game (Home - visitor)', ('D_datediff',)),
    ('first downs', ('D_First_Downs',)),
    ('Basic Stats', (
        'D_Rush', 'D_Yds', 'D_TDs', 'D_Cmp', 'D_Att', 'D_Yd', 'D_TD',
        'D_INT', 'D_Sacked', 'D_Yards', 'D_Net_Pass_Yards', 'D_Total_Yards',
        'D_Fumbles', 'D_Lost', 'D_Turnovers', 'D_Penalties',
    )),
    ('Passing Detailed', (
        'D_passing_att', 'D_passing_cmp', 'D_passing_int', 'D_passing_lng',
        'D_passing_sk', 'D_passing_td', 'D_passing_yds',
    )),
    ('Receiving', ('D_receiving_lng', 'D_receiving_td', 'D_receiving_yds')),
    ('Rushing Detailed', (
        'D_rushing_att', 'D_rushing_lng', 'D_rushing_td', 'D_rushing_yds',
    )),
    ('Defense interceptions', (
        'D_def_interceptions_int', 'D_def_interceptions_lng',
        'D_def_interceptions_pd', 'D_def_interceptions_td',
        'D_def_interceptions_yds',
    )),
    ('Defense fumbles', (
        'D_fumbles_ff', 'D_fumbles_fr', 'D_fumbles_td', 'D_fumbles_yds',
    )),
    ('Defense tackles', (
        'D_sk', 'D_tackles_ast', 'D_tackles_comb', 'D_tackles_qbhits',
        'D_tackles_solo', 'D_tackles_tfl',
    )),
    ('Kick Returns', (
        'D_kick_returns_lng', 'D_kick_returns_rt', 'D_kick_returns_td',
        'D_kick_returns_yds',
    )),
    ('Punt Returns', (
        'D_punt_returns_lng', 'D_punt_returns_ret', 'D_punt_returns_td',
        'D_punt_returns_yds',
    )),
    ('Punting / Scoring', (
        'D_punting_lng', 'D_punting_pnt', 'D_punting_yds', 'D_scoring_fga',
        'D_scoring_fgm', 'D_scoring_xpa', 'D_scoring_xpm',
    )),
)

# Flatten the groups into the single ordered list the rest of the notebook uses.
cont_cols = [
    column
    for _description, columns in _CONT_COL_GROUPS
    for column in columns
]

# Label column(s); kept as a list so df[y_col] yields a DataFrame.
y_col = ['H_Won']

## Create df from combined.csv
Then keep only the rows from the current season and append the games scheduled for the current date.

In [6]:
# Load every recorded game and order the rows by their 'Date' value.
df = pd.read_csv('footballData/combined.csv', index_col=False, low_memory=False)
df = df.sort_values(by='Date')

# Create the H_Won column: 1.0 when the home final score is strictly greater
# than the visitor's, otherwise 0.0 (ties and rows without scores get 0.0).
df['H_Won'] = np.where(df['H_Final'] > df['V_Final'], 1.0, 0.0)
print(df.shape)

# Filter out all but the current season
df = df[df['Season'] == current_season]
print(df.shape)
print(df.tail())

# Add games from current date as extra rows at the end of the frame.
# DataFrame.append was removed in pandas 2.0, and appending one row at a time
# copies the whole frame on every iteration, so all rows are added with a
# single concat. Columns missing from the game dicts are filled with NaN.
# The guard keeps an empty game list a no-op (index left untouched), exactly
# as the per-row loop behaved.
if current_date_games:
    df = pd.concat([df, pd.DataFrame(current_date_games)], ignore_index=True)


(6882, 151)
(178, 151)
      Season        Date Home_Team  H_Q1  H_Q2  H_Q3  H_Q4  H_OT  H_Final  \
6826    2024  2024-11-24       TAM     7    16     7     0     0       30   
6825    2024  2024-11-24       DAL     0     3     7    24     0       34   
6829    2024  2024-11-24       DEN     3     6     7    13     0       29   
6835    2024  2024-11-24       NWE     0     0     0    15     0       15   
6836    2024  2024-11-25       BAL     0    14     3    13     0       30   

     Visitor_Team  ...  V_punting_yds  V_scoring_fga  V_scoring_fgm  \
6826          NYG  ...          168.0            0.0            0.0   
6825          WAS  ...          247.0            3.0            2.0   
6829          LVR  ...          190.0            4.0            4.0   
6835          MIA  ...          240.0            2.0            2.0   
6836          LAC  ...          165.0            3.0            3.0   

      V_scoring_xpa  V_scoring_xpm  H_halftime_odds  V_halftime_odds  \
6826           

In [7]:
# Per-team history built by the project helper from df. The next cell looks
# entries up with keys of the form f'{Season}_{Team}_{column}' (for example
# '..._Date') and treats each value as a list (len(), .index(), slicing).
track_dict = nfl_utils.get_track_dict(df)
# print(track_dict)

In [8]:
# Feature engineering for every remaining row of df:
#   1. drop games that fall inside either team's first `minimum_window` games,
#   2. overwrite each tracked H_/V_ column with an exponential moving average
#      of that team's earlier games,
#   3. add H_datediff / V_datediff (days since the team's previous game),
#   4. derive the home-minus-visitor "D_" columns used as model features.
minimum_window = 4
print(df.shape)

track_cols = nfl_utils.track_cols


def _ema_last(values, window):
    """Return the last value of an exponential moving average over `values`.

    The span is capped at len(values) so short histories still work, and
    adjust=False selects pandas' recursive EMA form. An empty `values` makes
    the span 0, which pandas rejects with a ValueError.
    """
    span = min(window, len(values))
    return pd.Series(values).ewm(span=span, adjust=False).mean().iloc[-1]


def _days_since_previous_game(dates, date_index):
    """Return the days between dates[date_index] and the entry before it.

    `dates` holds 'YYYY-MM-DD' strings. The first entry has no predecessor,
    so 0 is returned for date_index 0.
    """
    if date_index <= 0:
        return 0
    current_game_date = datetime.strptime(dates[date_index], "%Y-%m-%d")
    previous_game_date = datetime.strptime(dates[date_index - 1], "%Y-%m-%d")
    return (current_game_date - previous_game_date).days


indices_to_drop = []

for current_count, row in enumerate(df.itertuples()):
    if current_count % 10 == 0:
        print(f'{current_count}/{df.shape[0]}')

    index = row.Index
    year = row.Season
    # (column prefix, team code) for the two sides; home is always handled first.
    sides = (('H', row.Home_Team), ('V', row.Visitor_Team))

    # Skip the row when it is not later than the (minimum_window + 1)-th game
    # date of either team. The len() test only protects the index lookup;
    # teams with fewer recorded games are never dropped by this check.
    in_warmup = False
    for _prefix, team in sides:
        team_dates = track_dict[f'{year}_{team}_Date']
        if len(team_dates) > minimum_window and row.Date <= team_dates[minimum_window]:
            in_warmup = True
            break
    if in_warmup:
        indices_to_drop.append(index)
        continue

    # Position of this game in each team's date list. Both are resolved before
    # anything is written so a failed lookup leaves the row untouched.
    date_index = {
        prefix: track_dict[f'{year}_{team}_Date'].index(row.Date)
        for prefix, team in sides
    }

    # Replace every tracked stat with the EMA of the team's earlier games.
    # track_cols[1:] skips only the first entry (per the original note, 'Date').
    # NOTE(review): the slice stops at date_index - 1, so it leaves out the
    # game immediately before this one as well as this one; with date_index 0
    # the stop becomes -1 (everything but the last entry). Kept unchanged
    # because the saved model was presumably trained on features built the
    # same way - confirm before altering.
    for col in track_cols[1:]:
        for prefix, team in sides:
            history = track_dict[f'{year}_{team}_{col}'][:date_index[prefix] - 1]
            df.at[index, f'{prefix}_{col}'] = _ema_last(history, minimum_window)

    # Add num days since last game for home, visitor
    for prefix, team in sides:
        team_dates = track_dict[f'{year}_{team}_Date']
        df.at[index, f'{prefix}_datediff'] = _days_since_previous_game(
            team_dates, date_index[prefix]
        )

df = df.drop(index=indices_to_drop)
print(df.shape)

# Home-minus-visitor difference for every tracked stat plus datediff.
# A local list is used instead of temporarily appending 'datediff' to
# nfl_utils.track_cols: that shared list stayed modified whenever the cell
# failed or was interrupted between the append and the pop.
diff_cols = list(track_cols[1:]) + ['datediff']
for col in diff_cols:
    df['D_' + col] = df['H_' + col] - df['V_' + col]

print(df.shape)

(191, 151)
0/191
10/191
20/191
30/191
40/191
50/191
60/191
70/191
80/191
90/191
100/191
110/191
120/191
130/191
140/191
150/191
160/191
170/191
180/191
190/191
(109, 153)
(109, 217)


In [9]:
# df.head()

### 3. Create an array of continuous values
Numpy array 'conts' containing stack of each continuous column

In [10]:
# (Disabled) de-duplication of repeated column names:
# df = df.loc[:, ~df.columns.duplicated()].copy()

# Split the engineered frame into the label column(s) and the feature columns.
y_df = df.loc[:, y_col]
cont_df = df.loc[:, cont_cols]

print(df.shape)

(109, 217)


In [11]:
# Echo each feature column name on its own line, then display the frame shape.
feature_names = cont_df.columns
for position in range(len(feature_names)):
    print(feature_names[position])
cont_df.shape

D_datediff
D_First_Downs
D_Rush
D_Yds
D_TDs
D_Cmp
D_Att
D_Yd
D_TD
D_INT
D_Sacked
D_Yards
D_Net_Pass_Yards
D_Total_Yards
D_Fumbles
D_Lost
D_Turnovers
D_Penalties
D_passing_att
D_passing_cmp
D_passing_int
D_passing_lng
D_passing_sk
D_passing_td
D_passing_yds
D_receiving_lng
D_receiving_td
D_receiving_yds
D_rushing_att
D_rushing_lng
D_rushing_td
D_rushing_yds
D_def_interceptions_int
D_def_interceptions_lng
D_def_interceptions_pd
D_def_interceptions_td
D_def_interceptions_yds
D_fumbles_ff
D_fumbles_fr
D_fumbles_td
D_fumbles_yds
D_sk
D_tackles_ast
D_tackles_comb
D_tackles_qbhits
D_tackles_solo
D_tackles_tfl
D_kick_returns_lng
D_kick_returns_rt
D_kick_returns_td
D_kick_returns_yds
D_punt_returns_lng
D_punt_returns_ret
D_punt_returns_td
D_punt_returns_yds
D_punting_lng
D_punting_pnt
D_punting_yds
D_scoring_fga
D_scoring_fgm
D_scoring_xpa
D_scoring_xpm


(109, 62)

In [12]:
# Feature matrix: one row per game, one column per entry of cont_df.columns.
conts = np.stack([cont_df[col].values for col in cont_df.columns], 1)

# Label matrix with one column per name in y_col. It is stored under its own
# name instead of overwriting y_col: rebinding y_col to an ndarray meant that
# running this cell a second time iterated over array rows and failed on
# y_df[col].
y_values = np.stack([y_df[col].values for col in y_col], 1)

conts_train = conts
y_train = y_values
print(conts.shape)
print(y_values.shape)

(109, 62)
(109, 1)


In [24]:
# The current-date games were appended after the historical rows, so their
# features are the trailing rows of the matrix: one row per scheduled game.
upcoming_count = len(current_date_games)
first_upcoming_row = len(conts) - upcoming_count
conts_current_date = conts[first_upcoming_row:]
print(conts_current_date.shape)

(13, 62)


# Load XGBoost model
The trained classifier is loaded from `xgboost_model.bin`.

In [15]:
# Create an unfitted classifier and load the saved model file into it.
# The path is relative to the notebook's working directory.
# NOTE(review): newer XGBoost releases deprecate the legacy binary model
# format in favour of JSON/UBJSON - confirm the installed version still
# reads this file, or re-save the model in a supported format.
model = xgb.XGBClassifier()
model.load_model('xgboost_model.bin')

In [32]:
# Predict each current-date game and report the predicted winner.
# A label of 1 means the home team is predicted to win; anything else is
# reported as a visitor win. Rows are matched to games by position.
y_pred = model.predict(conts_current_date)
for game_number, predicted_label in enumerate(y_pred):
    game = current_date_games[game_number]
    home_predicted = predicted_label == 1
    if not home_predicted:
        print(f"Prediction: {game['Visitor_Team']} won against {game['Home_Team']}")
    else:
        print(f"Prediction: {game['Home_Team']} won against {game['Visitor_Team']}")
    print(f"Prediction: {predicted_label}")

Prediction: WAS won against NOR
Prediction: 0
Prediction: NYJ won against JAX
Prediction: 0
Prediction: MIA won against HOU
Prediction: 0
Prediction: CIN won against TEN
Prediction: 0
Prediction: KAN won against CLE
Prediction: 0
Prediction: DAL won against CAR
Prediction: 0
Prediction: BAL won against NYG
Prediction: 0
Prediction: ARI won against NWE
Prediction: 1
Prediction: DET won against BUF
Prediction: 1
Prediction: LAC won against TAM
Prediction: 1
Prediction: PHI won against PIT
Prediction: 1
Prediction: DEN won against IND
Prediction: 1
Prediction: GNB won against SEA
Prediction: 0
