# Griffin Dobbins NFL Model
## Run all cells at once. If you want to change the hyperparameters, change them in the code and then run it all. Don't run the cells out of order since I reuse and modify variables.

In [1323]:
import pandas as pd
import numpy as np

In [1324]:
elo = pd.read_csv('nfl_elo.csv')

In [1325]:
elo['date'] = pd.to_datetime(elo['date'])

In [1326]:
spreadspoke = pd.read_csv('spreadspoke_scores.csv')

In [1327]:
spreadspoke['schedule_date'] = pd.to_datetime(spreadspoke['schedule_date'])

In [1328]:
teamMap = {
    'Arizona Cardinals': 'ARI',
    'Atlanta Falcons': 'ATL',
    'Baltimore Ravens': 'BAL',
    'Buffalo Bills': 'BUF',
    'Carolina Panthers': 'CAR',
    'Chicago Bears': 'CHI',
    'Cincinnati Bengals': 'CIN',
    'Cleveland Browns': 'CLE',
    'Dallas Cowboys': 'DAL',
    'Denver Broncos': 'DEN',
    'Detroit Lions': 'DET',
    'Green Bay Packers': 'GB',
    'Houston Texans': 'HOU',
    'Indianapolis Colts': 'IND',
    'Jacksonville Jaguars': 'JAX',
    'Kansas City Chiefs': 'KC',
    'Los Angeles Chargers': 'LAC',
    'Los Angeles Rams': 'LAR',
    'Miami Dolphins': 'MIA',
    'Minnesota Vikings': 'MIN',
    'New England Patriots': 'NE',
    'New Orleans Saints': 'NO',
    'New York Giants': 'NYG',
    'New York Jets': 'NYJ',
    'Las Vegas Raiders': 'LV',
    'Philadelphia Eagles': 'PHI',
    'Pittsburgh Steelers': 'PIT',
    'San Francisco 49ers': 'SF',
    'Seattle Seahawks': 'SEA',
    'Tampa Bay Buccaneers': 'TB',
    'Tennessee Titans': 'TEN',
    'Washington Commanders': 'WAS',
    'Oakland Raiders': 'OAK',
    'San Diego Chargers': 'SD',
    'St. Louis Rams': 'STL',
    'Washington Football Team': 'WAS',
    'Washington Redskins': 'WAS'
}


In [1329]:
spreadspoke['team_home'] = spreadspoke['team_home'].map(teamMap)
spreadspoke['team_away'] = spreadspoke['team_away'].map(teamMap)

df = pd.merge(elo, spreadspoke, left_on=['date', 'team1'], right_on=['schedule_date', 'team_home'], how='inner')

In [1330]:
df = df[['date', 'season', 'neutral', 'playoff', 'team1', 'team2', 'elo1_pre',
       'elo2_pre', 'qbelo1_pre', 'qbelo2_pre', 'score1', 'score2', 'schedule_week','team_favorite_id', 'spread_favorite',
       'over_under_line', 'stadium', 'stadium_neutral']]

In [1331]:
df['over_under_line'] = pd.to_numeric(df['over_under_line'], errors='coerce')
df.dropna(subset=['over_under_line'], inplace=True)
df['over_under_line'] = df['over_under_line'].astype(np.float64)

In [1332]:
df['homecover'] = np.where(
    ((df['score1'] - df['score2']) > -df['spread_favorite']) & (df['team_favorite_id'] == df['team1']) |
    ((df['score1'] - df['score2']) > df['spread_favorite']) & (df['team_favorite_id'] == df['team2']),
    1, np.where(((df['score1'] - df['score2']) == -df['spread_favorite']) & (df['team_favorite_id'] == df['team1']) |
    ((df['score1'] - df['score2']) == df['spread_favorite']) & (df['team_favorite_id'] == df['team2']), 0.5, 0)
)
df.dropna(subset=['homecover', 'spread_favorite'], inplace=True)

In [1333]:
df = df[df['homecover'] != 0.5] # ignore pushes

In [1334]:
df['home_points_scored_so_far'] = df.groupby(['team1', 'season'])['score1'].cumsum()
df['away_points_scored_so_far'] = df.groupby(['team2', 'season'])['score2'].cumsum()
df['home_points_allowed_so_far'] = df.groupby(['team1', 'season'])['score1'].cumsum()
df['away_points_allowed_so_far'] = df.groupby(['team2', 'season'])['score2'].cumsum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['home_points_scored_so_far'] = df.groupby(['team1', 'season'])['score1'].cumsum()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['away_points_scored_so_far'] = df.groupby(['team2', 'season'])['score2'].cumsum()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['home_points_allowed_so_far'] = 

# Dataset #1
### Using fewer features but getting more rows

In [1335]:
X = df[['neutral', 'elo1_pre', 'elo2_pre', 'qbelo1_pre', 'qbelo2_pre', 'spread_favorite', 'over_under_line','home_points_scored_so_far','away_points_scored_so_far', 'home_points_allowed_so_far',
       'away_points_allowed_so_far']]

In [1336]:
y = df['homecover']

In [1337]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=5000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6248004257583821


In [1338]:
from sklearn.neural_network import MLPClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

model = MLPClassifier(hidden_layer_sizes=(7,4), max_iter=500, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_dev)

accuracy = accuracy_score(y_dev, y_pred)
print("Accuracy:", accuracy)

y_test_pred = model.predict(X_test)
accuracy_test = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", accuracy_test)

Accuracy: 0.5042598509052183
Test Accuracy: 0.5300691857370942


In [1339]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_dev)
accuracy = accuracy_score(y_dev, y_pred)
print("Random Forest Accuracy (Dev):", accuracy)

y_test_pred = model.predict(X_test)
accuracy_test = accuracy_score(y_test, y_test_pred)
print("Random Forest Accuracy (Test):", accuracy_test)

Random Forest Accuracy (Dev): 0.6272630457933972
Random Forest Accuracy (Test): 0.6381053751995742


In [1340]:
# Baseline: always take the home team

y_pred_baseline = np.ones(len(y_test))

accuracy_baseline = accuracy_score(y_test, y_pred_baseline)
print(accuracy_baseline)

0.4699308142629058


# Dataset #2
### Using more features, but fewer rows

In [1341]:
weekly = pd.read_csv('weekly_team_data.csv')

In [1342]:
df = df.loc[~df['schedule_week'].isin(['Superbowl', 'Wildcard', 'Division', 'Conference'])]
df['schedule_week'] = df['schedule_week'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['schedule_week'] = df['schedule_week'].astype(int)


In [1343]:
df = df.merge(
    weekly,
    how='left',
    left_on=['team1', 'season', 'schedule_week'],
    right_on=['team', 'season', 'week'],
    suffixes=('', '_home')
)

df = df.merge(
    weekly,
    how='left',
    left_on=['team2', 'season', 'schedule_week'],
    right_on=['team', 'season', 'week'],
    suffixes=('', '_away')
)

In [1344]:
df.dropna(subset=['rushing_yards_away'], inplace=True)
df['home_yards_so_far'] = df.groupby(['team1', 'season'])['yards_gained'].cumsum()
df['away_yards_so_far'] = df.groupby(['team2', 'season'])['yards_gained_away'].cumsum()
df['home_touchdowns_so_far'] = df.groupby(['team1', 'season'])['touchdown'].cumsum()
df['away_touchdowns_so_far'] = df.groupby(['team2', 'season'])['touchdown_away'].cumsum()
df['home_interceptions_so_far'] = df.groupby(['team1', 'season'])['interception'].cumsum()
df['away_interceptions_so_far'] = df.groupby(['team2', 'season'])['interception_away'].cumsum()
df['home_air_yards_so_far'] = df.groupby(['team1', 'season'])['air_yards'].cumsum()
df['away_air_yards_so_far'] = df.groupby(['team2', 'season'])['air_yards_away'].cumsum()
df['home_fumbles_so_far'] = df.groupby(['team1', 'season'])['fumble'].cumsum()
df['away_fumbles_so_far'] = df.groupby(['team2', 'season'])['fumble_away'].cumsum()
df['home_qb_scrambled_so_far'] = df.groupby(['team1', 'season'])['qb_scramble'].cumsum()
df['away_qb_scrambled_so_far'] = df.groupby(['team2', 'season'])['qb_scramble_away'].cumsum()
df['home_pass_snaps_pct_so_far'] = df.groupby(['team1', 'season'])['pass_snaps_pct'].cumsum()
df['away_pass_snaps_pct_so_far'] = df.groupby(['team2', 'season'])['pass_snaps_pct_away'].cumsum()

In [1345]:
features = [
    'neutral','elo1_pre', 'elo2_pre', 'qbelo1_pre', 'qbelo2_pre', 'spread_favorite', 'over_under_line',
    'home_points_scored_so_far', 'away_points_scored_so_far',
    'home_points_allowed_so_far', 'away_points_allowed_so_far',
    'home_yards_so_far', 'away_yards_so_far', 'home_touchdowns_so_far', 'away_touchdowns_so_far',
    'home_interceptions_so_far', 'away_interceptions_so_far', 'home_air_yards_so_far', 'away_air_yards_so_far',
    'home_fumbles_so_far', 'away_fumbles_so_far', 'home_qb_scrambled_so_far', 'away_qb_scrambled_so_far',
    'home_pass_snaps_pct_so_far', 'away_pass_snaps_pct_so_far'
]

In [1346]:
for col in features:
  if col in df.columns and df[col].isnull().any():
    df[col] = df[col].fillna(df[col].mean())

X = df[features]
y = df['homecover']

In [1347]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(solver = 'liblinear', max_iter=10000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6673913043478261


In [1348]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

model = MLPClassifier(hidden_layer_sizes=(60,12), max_iter=500, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_dev)

accuracy = accuracy_score(y_dev, y_pred)
print("Dev Accuracy:", accuracy)

y_test_pred = model.predict(X_test)
accuracy_test = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", accuracy_test)

Dev Accuracy: 0.5804347826086956
Test Accuracy: 0.5739130434782609


In [1349]:
model = RandomForestClassifier(n_estimators=80, criterion='entropy', random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_dev)
accuracy = accuracy_score(y_dev, y_pred)
print("Random Forest Accuracy (Dev):", accuracy)

y_test_pred = model.predict(X_test)
accuracy_test = accuracy_score(y_test, y_test_pred)
print("Random Forest Accuracy (Test):", accuracy_test)

Random Forest Accuracy (Dev): 0.6521739130434783
Random Forest Accuracy (Test): 0.6804347826086956
