In [41]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from warnings import filterwarnings
filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

In [42]:
# Load the per-game attention data.
# NOTE(review): absolute, user-specific path — this cell only runs on the author's
# machine; consider a path relative to a configurable data directory.
games = pd.read_csv("/Users/elisabethkollrack/Thesis/EK-thesis/game_attention.csv")

# Drop every game that involves a team that changed / no longer exists
excluded_teams = ['LAC', 'STL', 'LA', 'SD']
involves_excluded = games['home_team'].isin(excluded_teams) | games['away_team'].isin(excluded_teams)
games = games[~involves_excluded]

# Drop games with zero attention (log10 is undefined there). The explicit copy makes
# the column assignments below operate on an independent frame rather than on a
# filtered slice, so they no longer rely on the globally silenced chained-assignment warning.
games = games[games['attention'] > 0].copy()

# Model target: base-10 log of attention
games['log_attention'] = np.log10(games['attention'])

# Turn the "HH:MM" kickoff string into an integer such as 1300 or 2030
games['gametime'] = games['gametime'].str.replace(':', '').astype(int)
games['date'] = pd.to_datetime(games['date'])

# Games in which NE is either the home or the away team (copied so that adding
# columns does not write into a slice of `games`)
ne_games = games[(games['home_team'] == 'NE') | (games['away_team'] == 'NE')].copy()

# 1 when NE is the home team, 0 when NE is the visitor
ne_games['is_home'] = np.where(ne_games['home_team'] == 'NE', 1, 0)

# Add a column for the team NE is playing against
ne_games['opponent'] = np.where(ne_games['is_home'] == 1, ne_games['away_team'], ne_games['home_team'])
ne_games = ne_games.drop(columns=['home_team', 'away_team'])

# One-hot encode weekday and opponent (first level of each dropped as the baseline)
ne_games = pd.get_dummies(ne_games, columns=['weekday', 'opponent'], drop_first=True)
ne_games.head()



Unnamed: 0,date,game_id,season,week,gametime,home_win_pct,away_win_pct,num_lead_changes,total_score,score_differential,...,opponent_NYG,opponent_NYJ,opponent_OAK,opponent_PHI,opponent_PIT,opponent_SEA,opponent_SF,opponent_TB,opponent_TEN,opponent_WAS
1,2013-09-08,2013_01_NE_BUF,2013,1,1300,0.0,0.0,2,44.0,2.0,...,False,False,False,False,False,False,False,False,False,False
16,2013-09-12,2013_02_NYJ_NE,2013,2,2025,1.0,1.0,0,23.0,3.0,...,False,True,False,False,False,False,False,False,False,False
38,2013-09-22,2013_03_TB_NE,2013,3,1300,1.0,0.0,1,26.0,20.0,...,False,False,False,False,False,False,False,True,False,False
61,2013-09-29,2013_04_NE_ATL,2013,4,2030,0.333333,1.0,1,53.0,7.0,...,False,False,False,False,False,False,False,False,False,False
65,2013-10-06,2013_05_NE_CIN,2013,5,1300,0.5,1.0,0,19.0,7.0,...,False,False,False,False,False,False,False,False,False,False


In [43]:
# XG BOOST to predict attention with only NE games
X = ne_games.drop(columns=['attention', 'log_attention', 'date', 'game_id'])
y = ne_games['log_attention']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = XGBRegressor(objective ='reg:squarederror', n_estimators=1000, learning_rate=0.01)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')


Mean Squared Error: 0.4009901476625819
R^2 Score: -0.021182923746200233


In [44]:
# Games in which DAL is either the home or the away team. The explicit copy keeps the
# column assignments below from writing into a filtered slice of `games`.
dal_games = games[(games['home_team'] == 'DAL') | (games['away_team'] == 'DAL')].copy()

# 1 when DAL is the home team, 0 when DAL is the visitor
dal_games['is_home'] = np.where(dal_games['home_team'] == 'DAL', 1, 0)

# Add a column for the team DAL is playing against
dal_games['opponent'] = np.where(dal_games['is_home'] == 1, dal_games['away_team'], dal_games['home_team'])
dal_games = dal_games.drop(columns=['home_team', 'away_team'])

# One-hot encode weekday and opponent (first level of each dropped as the baseline)
dal_games = pd.get_dummies(dal_games, columns=['weekday', 'opponent'], drop_first=True)
dal_games.head()


Unnamed: 0,date,game_id,season,week,gametime,home_win_pct,away_win_pct,num_lead_changes,total_score,score_differential,...,opponent_NYG,opponent_NYJ,opponent_OAK,opponent_PHI,opponent_PIT,opponent_SEA,opponent_SF,opponent_TB,opponent_TEN,opponent_WAS
13,2013-09-08,2013_01_NYG_DAL,2013,1,2030,0.0,0.0,0,67.0,5.0,...,True,False,False,False,False,False,False,False,False,False
24,2013-09-15,2013_02_DAL_KC,2013,2,1300,1.0,1.0,2,33.0,1.0,...,False,False,False,False,False,False,False,False,False,False
73,2013-10-06,2013_05_DEN_DAL,2013,5,1625,0.5,1.0,3,99.0,3.0,...,False,False,False,False,False,False,False,False,False,False
90,2013-10-13,2013_06_WAS_DAL,2013,6,2030,0.4,0.25,0,47.0,15.0,...,False,False,False,False,False,False,False,False,False,True
99,2013-10-20,2013_07_DAL_PHI,2013,7,1300,0.5,0.5,0,20.0,14.0,...,False,False,False,True,False,False,False,False,False,False


In [45]:
# XG BOOST to predict attention with only DAL games
X = dal_games.drop(columns=['attention', 'log_attention', 'date', 'game_id'])
y = dal_games['log_attention']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = XGBRegressor(objective ='reg:squarederror', n_estimators=1000, learning_rate=0.01)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')


Mean Squared Error: 0.29785034497250196
R^2 Score: 0.3070742221620145


In [46]:
# Games in which GB is either the home or the away team. The explicit copy keeps the
# column assignments below from writing into a filtered slice of `games`.
gb_games = games[(games['home_team'] == 'GB') | (games['away_team'] == 'GB')].copy()

# 1 when GB is the home team, 0 when GB is the visitor
gb_games['is_home'] = np.where(gb_games['home_team'] == 'GB', 1, 0)

# Add a column for the team GB is playing against
gb_games['opponent'] = np.where(gb_games['is_home'] == 1, gb_games['away_team'], gb_games['home_team'])
gb_games = gb_games.drop(columns=['home_team', 'away_team'])

# One-hot encode weekday and opponent (first level of each dropped as the baseline)
gb_games = pd.get_dummies(gb_games, columns=['weekday', 'opponent'], drop_first=True)
gb_games.head()


Unnamed: 0,date,game_id,season,week,gametime,home_win_pct,away_win_pct,num_lead_changes,total_score,score_differential,...,opponent_NYG,opponent_NYJ,opponent_OAK,opponent_PHI,opponent_PIT,opponent_SEA,opponent_SF,opponent_TB,opponent_TEN,opponent_WAS
11,2013-09-08,2013_01_GB_SF,2013,1,1625,0.0,0.0,2,62.0,6.0,...,False,False,False,False,False,False,True,False,False,False
21,2013-09-15,2013_02_WAS_GB,2013,2,1300,0.0,0.0,0,58.0,18.0,...,False,False,False,False,False,False,False,False,False,True
35,2013-09-22,2013_03_GB_CIN,2013,3,1300,0.5,0.5,2,64.0,4.0,...,False,False,False,False,False,False,False,False,False,False
66,2013-10-06,2013_05_DET_GB,2013,5,1300,0.333333,0.75,0,31.0,13.0,...,False,False,False,False,False,False,False,False,False,False
78,2013-10-13,2013_06_GB_BAL,2013,6,1300,0.6,0.5,0,36.0,2.0,...,False,False,False,False,False,False,False,False,False,False


In [47]:
# XGBoost: predict log attention using only GB games
# Features are every column except the raw/log target and the two identifier columns.
X = gb_games.drop(columns=['attention', 'log_attention', 'date', 'game_id'])
y = gb_games['log_attention']

# Fixed 80/20 split so the reported scores are repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# random_state is pinned to match the all-teams loop later in the notebook,
# so the single-team cell and the loop are configured identically.
model = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=1000,
    learning_rate=0.01,
    random_state=42,
)
model.fit(X_train, y_train)

# Score on the held-out games
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')


Mean Squared Error: 0.31028044978347236
R^2 Score: -0.24863279457705145


In [48]:
# XGBoost: fit one log-attention model per team and collect the held-out scores
# Every team that appears as a host or a visitor, in alphabetical order
teams = sorted(set(games['home_team']) | set(games['away_team']))

results = []

for team in teams:

    # All games this team took part in; copied so the edits below stay local to this frame
    team_games = games[(games['home_team'] == team) | (games['away_team'] == team)].copy()

    # Home/away flag and the name of the other team
    hosting = team_games['home_team'] == team
    team_games['is_home'] = np.where(hosting, 1, 0)
    team_games['opponent'] = np.where(hosting, team_games['away_team'], team_games['home_team'])

    # The raw team columns are now represented by is_home / opponent
    team_games = team_games.drop(columns=['home_team', 'away_team'])

    # Dummy-code the categorical columns, dropping the first level of each
    team_games = pd.get_dummies(team_games, columns=['weekday', 'opponent'], drop_first=True)

    # Target and feature matrix (targets and identifier columns excluded from the features)
    y = team_games['log_attention']
    X = team_games.drop(columns=['attention', 'log_attention', 'date', 'game_id'])

    # Fixed 80/20 hold-out split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the gradient-boosted regressor
    model = XGBRegressor(objective='reg:squarederror', n_estimators=1000, learning_rate=0.01, random_state=42)
    model.fit(X_train, y_train)

    # Score on the held-out games
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    results.append(dict(team=team, num_games=len(team_games), R2_score=r2, MSE=mse))


# One row per team, ordered by the absolute value of R² (largest magnitude first),
# so strongly negative and strongly positive scores both appear near the top
results_df = pd.DataFrame(results).sort_values(by='R2_score', key=abs, ascending=False)
print("\nResults Summary:")
print(results_df)



Results Summary:
   team  num_games  R2_score       MSE
13  IND         76 -1.315008  0.445084
14  JAX         73 -1.085922  0.203559
28  TEN         76 -1.002134  0.328498
3   BUF         77 -0.832060  0.133568
27   TB         75 -0.758405  0.261030
17  MIN         76 -0.658197  0.186551
25  SEA         69 -0.636905  0.377166
12  HOU         76 -0.509556  0.344553
29  WAS         75 -0.471117  0.194623
26   SF         69 -0.465675  0.703454
21  NYJ         77  0.432328  0.080762
1   ATL         77 -0.426942  0.272263
24  PIT         78  0.412167  0.156393
20  NYG         75 -0.385171  0.366912
2   BAL         77  0.379158  0.110940
6   CIN         77 -0.313391  0.125114
8   DAL         75  0.307074  0.297850
19   NO         76 -0.282337  0.256854
11   GB         78 -0.248633  0.310280
23  PHI         76 -0.245578  0.562306
15   KC         69  0.173162  0.119811
5   CHI         77 -0.166392  0.213433
9   DEN         69  0.131081  0.103806
22  OAK         69 -0.101564  0.288000
4   CAR