In [27]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [28]:
# Read the three source CSVs (team stats, game/betting data, team-name lookup)
# in one pass; files are read in the order listed.
team_stats, game_data, team_mapping = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/nfl_team_stats.csv',
        '../data/nfl_spreadspoke_data.csv',
        '../data/nfl_teams.csv',
    )
)


In [29]:
# Lookup table: value of `team_name` -> value of `team_name_short`,
# taken row by row from the team mapping file.
team_name_map = dict(zip(team_mapping['team_name'], team_mapping['team_name_short']))

def standardize_team_name(name):
    """Return the short form of ``name`` from ``team_name_map``.

    Names that are not keys of ``team_name_map`` are returned unchanged.
    """
    if name in team_name_map:
        return team_name_map[name]
    return name

# Rewrite the team columns of both frames so they use consistent team names.
team_stats['team'] = team_stats['team'].map(standardize_team_name)
for _team_col in ('team_home', 'team_away'):
    game_data[_team_col] = game_data[_team_col].map(standardize_team_name)


In [30]:
# Two views of the team stats whose column names carry a home_/away_ prefix,
# so both can be attached to the same game row without colliding.
home_stats, away_stats = (team_stats.add_prefix(side) for side in ('home_', 'away_'))

# Left-join each game to the stats of its home team, then of its away team,
# matching on season and team name. Games without matching stats are kept.
merged_data = (
    game_data
    .merge(
        home_stats,
        how='left',
        left_on=['schedule_season', 'team_home'],
        right_on=['home_season', 'home_team'],
    )
    .merge(
        away_stats,
        how='left',
        left_on=['schedule_season', 'team_away'],
        right_on=['away_season', 'away_team'],
    )
)


In [31]:
# Target: home score plus away score. Rows that lack either the target or the
# over/under line are discarded.
merged_data['total_points'] = merged_data['score_home'].add(merged_data['score_away'])
merged_data = merged_data.dropna(subset=['total_points', 'over_under_line'], how='any')

# Features: every home_/away_ prefixed column except the team and season
# identifiers that were only needed as join keys, followed by the two
# betting columns.
feature_columns = [
    col
    for col in merged_data.columns
    if col.startswith(('home_', 'away_'))
    and col not in ('home_team', 'away_team', 'home_season', 'away_season')
]
feature_columns.extend(['over_under_line', 'spread_favorite'])

# Missing feature values are replaced with 0 before modelling.
X = merged_data[feature_columns].fillna(0)
y = merged_data['total_points']


In [32]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Fit a 100-tree random forest on the training portion.
rf_model = RandomForestRegressor(
    random_state=42,
    n_estimators=100,
)
rf_model.fit(X_train, y_train)


In [33]:
# Score the held-out rows and report the mean absolute error, i.e. the
# average absolute gap between predicted and actual total points.
y_pred = rf_model.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Absolute Error on Test Set: {mae:.2f}")


Mean Absolute Error on Test Set: 10.06


In [34]:
def predict_total_points(home_team, away_team, over_under_line, spread_favorite, season):
    """Predict the total points of one matchup with the trained forest.

    Relies on notebook-level state: ``team_stats`` (per-season team
    statistics), ``X`` (the training feature frame, used only for its column
    names and their order), ``rf_model`` (the fitted regressor) and
    ``standardize_team_name``.

    Parameters
    ----------
    home_team, away_team :
        Team names; each is passed through ``standardize_team_name`` before
        being looked up in ``team_stats['team']``.
    over_under_line :
        Value placed in the ``over_under_line`` feature.
    spread_favorite :
        Value placed in the ``spread_favorite`` feature.
    season :
        Compared with ``team_stats['season']`` using ``==``.

    Returns
    -------
    The model's prediction for the matchup, or ``None`` (after printing a
    message) when either team has no stats row for ``season``.
    """
    # make sure team names are correct
    home_team_std = standardize_team_name(home_team)
    away_team_std = standardize_team_name(away_team)

    # team stats for season
    in_season = team_stats['season'] == season
    home_team_stats = team_stats[in_season & (team_stats['team'] == home_team_std)].add_prefix('home_')
    away_team_stats = team_stats[in_season & (team_stats['team'] == away_team_std)].add_prefix('away_')

    # make sure team has stats
    if home_team_stats.empty or away_team_stats.empty:
        print("Team stats for the given season are not available.")
        return None

    # Build exactly one input row. Only the first prediction is returned, so
    # only the first stats row of each side is used; this also keeps the two
    # sides aligned if one of them has duplicate rows for the season.
    input_data = pd.concat(
        [
            home_team_stats.head(1).reset_index(drop=True),
            away_team_stats.head(1).reset_index(drop=True),
        ],
        axis=1,
    )
    input_data['over_under_line'] = over_under_line
    input_data['spread_favorite'] = spread_favorite

    # Select the training columns in training order. Columns that are not
    # features (team/season identifiers) are dropped; features absent from
    # the input are created as NaN. NaN is then replaced with 0, which is the
    # same treatment the training matrix received (X.fillna(0)); without it a
    # missing stat would reach the model as NaN.
    input_data = input_data.reindex(columns=X.columns).fillna(0)

    # run data into model
    predicted_total = rf_model.predict(input_data)[0]
    return predicted_total


In [35]:
# Inputs for a single example matchup.
home_team, away_team = 'Pittsburgh Steelers', 'Cleveland Browns'
over_under_line = 43.5
# NOTE(review): the original comment says a negative value favors the home
# team -- confirm this matches the sign convention of the training data's
# `spread_favorite` column.
spread_favorite = -6.5
season = 2023  # season whose team stats are looked up

# Run the prediction; None means no stats were found for one of the teams.
predicted_points = predict_total_points(
    home_team=home_team,
    away_team=away_team,
    over_under_line=over_under_line,
    spread_favorite=spread_favorite,
    season=season,
)
if predicted_points is not None:
    print(f"Predicted Total Points: {predicted_points:.2f}")


Predicted Total Points: 43.88


In [36]:
import pickle

# Serialize the fitted model to disk with pickle (binary write mode).
with open("../pickles/rf_model.pkl", "wb") as model_out:
    pickle.dump(rf_model, model_out)

In [38]:
# Read the pickled model back into `loaded_rf`.
# NOTE: pickle.load can execute arbitrary code while unpickling -- only load
# files you produced yourself or otherwise trust.
with open("../pickles/rf_model.pkl", "rb") as model_in:
    loaded_rf = pickle.load(model_in)