In [19]:
# Imports
import numpy as np
import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

Preprocessing

In [20]:
df = pd.read_csv('train.csv')
df_raw = pd.read_csv('train.csv')

# drop unnecessary columns
df = df.drop(columns=['weather_temperature', 'weather_wind_mph', 'weather_humidity', 'weather_detail', 'stadium'])
df.head()

# adding total score columns
df['total_score'] = df['score_home'] + df['score_away']


# add column that represented current record for each team before each game of a season
# ensure games are sorted chronologically within each season
df["datetime"] = pd.to_datetime(df["schedule_date"])
df = df.sort_values(["schedule_season", "datetime"]).reset_index(drop=True)

# make output lists
home_records = []
away_records = []

# make dictionaries to track each team's W-L-T within the current season
team_wins = {}
team_losses = {}
team_ties = {}

current_season = None

for i, row in df.iterrows():
    season = row["schedule_season"]
    home = row["team_home"]
    away = row["team_away"]
    home_score = row["score_home"]
    away_score = row["score_away"]
   
    # new season,  reset all
    if season != current_season:
        team_wins = {}
        team_losses = {}
        team_ties = {}
        current_season = season

    # initialize teams for this season if needed
    for team in [home, away]:
        if team not in team_wins:
            team_wins[team] = 0
            team_losses[team] = 0
            team_ties[team] = 0

    # add current record before the game
    home_records.append(
        f"{team_wins[home]}-{team_losses[home]}-{team_ties[home]}"
    )
    away_records.append(
        f"{team_wins[away]}-{team_losses[away]}-{team_ties[away]}"
    )

    # update records after the game
    if home_score > away_score:
        team_wins[home] += 1
        team_losses[away] += 1
    elif away_score > home_score:
        team_wins[away] += 1
        team_losses[home] += 1
    else:
        # tie
        team_ties[home] += 1
        team_ties[away] += 1

# add results to dataframe
df["home_team_record"] = home_records
df["away_team_record"] = away_records


# make individual columns for wins, losses, and ties
df['home_wins'] = df['home_team_record'].apply(lambda x: int(x.split('-')[0]))
df['home_losses'] = df['home_team_record'].apply(lambda x: int(x.split('-')[1]))
df['home_ties'] = df['home_team_record'].apply(lambda x: int(x.split('-')[2]))
df['away_wins'] = df['away_team_record'].apply(lambda x: int(x.split('-')[0]))
df['away_losses'] = df['away_team_record'].apply(lambda x: int(x.split('-')[1]))
df['away_ties'] = df['away_team_record'].apply(lambda x: int(x.split('-')[2]))


# filter games that have already been recorded, no scheduled games
df = df[df["datetime"] <= "2025-11-04"]

In [21]:
# Compute each team's average score per season
team_season_avg = (
    df.groupby(["team_home", "schedule_season"])["score_home"].mean().reset_index()
)
team_season_avg.columns = ["team", "season", "avg_score"]

# also include away team scoring
team_season_avg_away = (
    df.groupby(["team_away", "schedule_season"])["score_away"].mean().reset_index()
)
team_season_avg_away.columns = ["team", "season", "avg_score"]

# combine home + away scoring for a true team season average
team_season_avg = pd.concat([team_season_avg, team_season_avg_away])
team_season_avg = team_season_avg.groupby(["team", "season"])["avg_score"].mean().reset_index()

# shift averages to represent previous season
team_season_avg["prev_season"] = team_season_avg["season"] + 1

# prev_season avg is used in the next year's games
team_prev = team_season_avg[["team", "prev_season", "avg_score"]]
team_prev.columns = ["team", "schedule_season", "prev_season_avg"]

# merge into main df
df = df.merge(team_prev, left_on=["team_home", "schedule_season"], right_on=["team", "schedule_season"], how="left")
df.rename(columns={"prev_season_avg": "home_prev_avg"}, inplace=True)
df = df.drop(columns=["team"])

df = df.merge(team_prev, left_on=["team_away", "schedule_season"], right_on=["team", "schedule_season"], how="left")
df.rename(columns={"prev_season_avg": "away_prev_avg"}, inplace=True)
df = df.drop(columns=["team"])

In [22]:
# Creating rolling avgs within each season

# separate into home, away dfs
home = df[["schedule_season", "datetime", "team_home", "score_home", "score_away"]].rename(
    columns={"team_home": "team", "score_home": "points_scored", "score_away": "points_allowed"}
)

away = df[["schedule_season", "datetime", "team_away", "score_away", "score_home"]].rename(
    columns={"team_away": "team", "score_away": "points_scored", "score_home": "points_allowed"}
)

# long df, duplicate games, sort by team, season, date
long_df = pd.concat([home, away])
long_df = long_df.sort_values(["team", "schedule_season", "datetime"]).reset_index(drop=True)

groups = long_df.groupby(["team", "schedule_season"])

# compute rolling averages
long_df["rolling_scored"] = groups["points_scored"].transform(
    lambda s: s.shift().expanding().mean()
)

long_df["rolling_allowed"] = groups["points_allowed"].transform(
    lambda s: s.shift().expanding().mean()
)

# merge back into original df
df = df.merge(
    long_df[["team", "schedule_season", "datetime", "rolling_scored", "rolling_allowed"]],
    left_on=["team_home", "schedule_season", "datetime"],
    right_on=["team", "schedule_season", "datetime"],
    how="left"
).rename(
    columns={
        "rolling_scored": "home_rolling_scored",
        "rolling_allowed": "home_rolling_allowed"
    }
).drop(columns=["team"])


df = df.merge(
    long_df[["team", "schedule_season", "datetime", "rolling_scored", "rolling_allowed"]],
    left_on=["team_away", "schedule_season", "datetime"],
    right_on=["team", "schedule_season", "datetime"],
    how="left"
).rename(
    columns={
        "rolling_scored": "away_rolling_scored",
        "rolling_allowed": "away_rolling_allowed"
    }
).drop(columns=["team"])

In [23]:
# Dropping variables that aren't informative to our model
df_filtered = df.drop(columns=['datetime', 'stadium_neutral', 'home_team_record', 'away_team_record',
                       'schedule_date', 'team_favorite_id', 'team_home', 'team_away', 'schedule_week'])
df_filtered = df_filtered.dropna(axis = 0)

In [24]:
# splitting into x and y
X = df_filtered.drop(columns=['total_score', 'over_under_line', 'score_home', 'score_away'])
y = df_filtered['total_score']

In [25]:
# Train/test splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state = 123)

In [26]:
class MyModel(nn.Module):
    def __init__(self, h):
        super().__init__()
        l1 = nn.Linear(15, h)
        a1 = nn.ReLU()
        l2 = nn.Linear(h,h//2)
        a2 = nn.ReLU()
        l3 = nn.Linear(h//2,1)
        l = [l1, a1, l2, a2, l3]
        self.module_list = nn.ModuleList(l)
    
    def forward(self, X):
        for f in self.module_list:
            X = f(X)
        return X

torch.manual_seed(42)
np.random.seed(42)

k_folds = 5
learning_rates = [0.0005, 0.001, 0.005]
hidden_sizes = [16, 32, 64]
epochs_options = [150, 300, 450]
batch_size = 32


def run_kfold(lr, h, epochs):
    n_epochs = epochs
    learning_rate = lr
    hidden_size = h
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)


    fold_mse = []
    fold_r2 = []

    for fold, (train_index, val_index) in enumerate(kf.split(X)):

        X_np = X.values
        y_np = y.values
        X_train_fold, X_val_fold = X_np[train_index], X_np[val_index]
        y_train_fold, y_val_fold = y_np[train_index], y_np[val_index]

        scaler = StandardScaler()
        X_train_fold = scaler.fit_transform(X_train_fold)
        X_val_fold = scaler.transform(X_val_fold)

        X_train_tensor = torch.tensor(X_train_fold, dtype=torch.float32)
        y_train_tensor = torch.tensor(y_train_fold, dtype=torch.float32).reshape(-1, 1)
        X_val_tensor = torch.tensor(X_val_fold, dtype=torch.float32)
        y_val_tensor = torch.tensor(y_val_fold, dtype=torch.float32).reshape(-1, 1)

        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        model = MyModel(hidden_size)

        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
        cost_function = nn.MSELoss()

        for epoch in range(n_epochs):
            model.train()
            for x_batch, y_batch in train_loader:
                optimizer.zero_grad()
                y_pred = model(x_batch)
                cost = cost_function(y_pred, y_batch)
                cost.backward()
                optimizer.step()

        model.eval()
        with torch.no_grad():
            val_pred = model(X_val_tensor)
            mse = cost_function(val_pred, y_val_tensor).item()
            r2 = r2_score(y_val_tensor.numpy(), val_pred.numpy())

        fold_mse.append(mse)
        fold_r2.append(r2)
    return np.mean(fold_mse), np.mean(fold_r2)


for lr in learning_rates:
    for h in hidden_sizes:
        for ep in epochs_options:
            mse, r2 = run_kfold(lr, h, ep)
            print(f"Learning rate: {lr}, Hidden units: {h}, Epochs: {ep}, MSE: {round(mse,3)}, R^2: {round(r2,3)}")

Learning rate: 0.0005, Hidden units: 16, Epochs: 150, MSE: 188.854, R^2: 0.021
Learning rate: 0.0005, Hidden units: 16, Epochs: 300, MSE: 189.209, R^2: 0.019
Learning rate: 0.0005, Hidden units: 16, Epochs: 450, MSE: 190.775, R^2: 0.011
Learning rate: 0.0005, Hidden units: 32, Epochs: 150, MSE: 189.158, R^2: 0.019
Learning rate: 0.0005, Hidden units: 32, Epochs: 300, MSE: 193.155, R^2: -0.001
Learning rate: 0.0005, Hidden units: 32, Epochs: 450, MSE: 195.067, R^2: -0.011
Learning rate: 0.0005, Hidden units: 64, Epochs: 150, MSE: 192.028, R^2: 0.004
Learning rate: 0.0005, Hidden units: 64, Epochs: 300, MSE: 202.265, R^2: -0.049
Learning rate: 0.0005, Hidden units: 64, Epochs: 450, MSE: 213.55, R^2: -0.107
Learning rate: 0.001, Hidden units: 16, Epochs: 150, MSE: 188.783, R^2: 0.021
Learning rate: 0.001, Hidden units: 16, Epochs: 300, MSE: 190.955, R^2: 0.01
Learning rate: 0.001, Hidden units: 16, Epochs: 450, MSE: 194.446, R^2: -0.007
Learning rate: 0.001, Hidden units: 32, Epochs: 150,

It can be seen that the neural network does not capture the underlying relationships within our data, with our R^2 being consistently around 0, and even negative at times. Increasing the number of epochs and hidden units did not improve performance. A neural network is not well suited for this data.