### Data Loading and Preprocessing

In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

df_train = pd.read_csv('../data/train.csv')

# drop unnecessary columns
df_train = df_train.drop(columns=['weather_temperature', 'weather_wind_mph', 'weather_humidity', 'weather_detail', 'stadium'])
df_train.head()

# first add total_score column
df_train['total_score'] = df_train['score_home'] + df_train['score_away']

# add column that represented current record for each team before each game of a season
# ensure games are sorted chronologically within each season
df_train["datetime"] = pd.to_datetime(df_train["schedule_date"])
df_train = df_train.sort_values(["schedule_season", "datetime"]).reset_index(drop=True)

# make output lists
home_records = []
away_records = []

# make dictionaries to track each team's W-L-T within the current season
team_wins = {}
team_losses = {}
team_ties = {}

current_season = None

for i, row in df_train.iterrows():
    season = row["schedule_season"]
    home = row["team_home"]
    away = row["team_away"]
    home_score = row["score_home"]
    away_score = row["score_away"]
   
    # new season, we reset all
    if season != current_season:
        team_wins = {}
        team_losses = {}
        team_ties = {}
        current_season = season

    # initialize teams for this season if needed
    for team in [home, away]:
        if team not in team_wins:
            team_wins[team] = 0
            team_losses[team] = 0
            team_ties[team] = 0

    # add current record before the game
    home_records.append(
        f"{team_wins[home]}-{team_losses[home]}-{team_ties[home]}"
    )
    away_records.append(
        f"{team_wins[away]}-{team_losses[away]}-{team_ties[away]}"
    )

    # update records after the game
    if home_score > away_score:
        team_wins[home] += 1
        team_losses[away] += 1
    elif away_score > home_score:
        team_wins[away] += 1
        team_losses[home] += 1
    else:
        # tie
        team_ties[home] += 1
        team_ties[away] += 1

# add results to dataframe
df_train["home_team_record"] = home_records
df_train["away_team_record"] = away_records

# make individual columns for wins, losses, and ties
df_train['home_wins'] = df_train['home_team_record'].apply(lambda x: int(x.split('-')[0]))
df_train['home_losses'] = df_train['home_team_record'].apply(lambda x: int(x.split('-')[1]))
df_train['home_ties'] = df_train['home_team_record'].apply(lambda x: int(x.split('-')[2]))
df_train['away_wins'] = df_train['away_team_record'].apply(lambda x: int(x.split('-')[0]))
df_train['away_losses'] = df_train['away_team_record'].apply(lambda x: int(x.split('-')[1]))
df_train['away_ties'] = df_train['away_team_record'].apply(lambda x: int(x.split('-')[2]))

# filter games that have already been recorded, no scheduled games
df_train = df_train[df_train["datetime"] <= "2025-11-04"]


In [7]:
# 1. Compute each team's average score per season
team_season_avg = (
    df_train.groupby(["team_home", "schedule_season"])["score_home"].mean().reset_index()
)
team_season_avg.columns = ["team", "season", "avg_score"]

# also include away team scoring
team_season_avg_away = (
    df_train.groupby(["team_away", "schedule_season"])["score_away"].mean().reset_index()
)
team_season_avg_away.columns = ["team", "season", "avg_score"]

# combine home + away scoring for a true team season average
team_season_avg = pd.concat([team_season_avg, team_season_avg_away])
team_season_avg = team_season_avg.groupby(["team", "season"])["avg_score"].mean().reset_index()

# 2. Shift averages to represent "previous season"
team_season_avg["prev_season"] = team_season_avg["season"] + 1

# this means: prev_season avg is used in the next year's games
team_prev = team_season_avg[["team", "prev_season", "avg_score"]]
team_prev.columns = ["team", "schedule_season", "prev_season_avg"]

# 3. Merge into main df_train
df_train = df_train.merge(
    team_prev,
    left_on=["team_home", "schedule_season"],
    right_on=["team", "schedule_season"],
    how="left"
)
df_train.rename(columns={"prev_season_avg": "home_prev_avg"}, inplace=True)
df_train = df_train.drop(columns=["team"])

df_train = df_train.merge(
    team_prev,
    left_on=["team_away", "schedule_season"],
    right_on=["team", "schedule_season"],
    how="left"
)
df_train.rename(columns={"prev_season_avg": "away_prev_avg"}, inplace=True)
df_train = df_train.drop(columns=["team"])



In [8]:
home = df_train[["schedule_season", "datetime", "team_home", "score_home", "score_away"]].rename(
    columns={"team_home": "team", "score_home": "points_scored", "score_away": "points_allowed"}
)

away = df_train[["schedule_season", "datetime", "team_away", "score_away", "score_home"]].rename(
    columns={"team_away": "team", "score_away": "points_scored", "score_home": "points_allowed"}
)

long_df = pd.concat([home, away])
long_df = long_df.sort_values(["team", "schedule_season", "datetime"]).reset_index(drop=True)

groups = long_df.groupby(["team", "schedule_season"])

long_df["rolling_scored"] = groups["points_scored"].transform(
    lambda s: s.shift().expanding().mean()
)

long_df["rolling_allowed"] = groups["points_allowed"].transform(
    lambda s: s.shift().expanding().mean()
)

df_train = df_train.merge(
    long_df[["team", "schedule_season", "datetime", "rolling_scored", "rolling_allowed"]],
    left_on=["team_home", "schedule_season", "datetime"],
    right_on=["team", "schedule_season", "datetime"],
    how="left"
).rename(
    columns={
        "rolling_scored": "home_rolling_scored",
        "rolling_allowed": "home_rolling_allowed"
    }
).drop(columns=["team"])


df_train = df_train.merge(
    long_df[["team", "schedule_season", "datetime", "rolling_scored", "rolling_allowed"]],
    left_on=["team_away", "schedule_season", "datetime"],
    right_on=["team", "schedule_season", "datetime"],
    how="left"
).rename(
    columns={
        "rolling_scored": "away_rolling_scored",
        "rolling_allowed": "away_rolling_allowed"
    }
).drop(columns=["team"])

# drop 2005 due to NaNs from prev season averages
df_train = df_train[df_train["schedule_season"] != 2005]

### Ridge Regression Model

In [10]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

Below are the features that will be used in the model:

In [11]:
features = [
    "over_under_line",
    "stadium_neutral",
    "home_wins", "home_losses", "home_ties",
    "away_wins", "away_losses", "away_ties",
    "home_prev_avg", "away_prev_avg",
    "home_rolling_scored", "home_rolling_allowed",
    "away_rolling_scored", "away_rolling_allowed"
]

X_df = df_train[features]
X = df_train[features].to_numpy()
y = df_train["total_score"].to_numpy()

In [12]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=123)

In [22]:
pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),  # fills NaNs
    ("scaler", StandardScaler()),
    ("ridge", Ridge(random_state=123))
])

In [23]:
# initiate param grid for sweep
param_grid = {
    "ridge__alpha": [0.01, 0.1, 1.0, 10.0, 100.0],
    "ridge__solver": ["auto", "svd", "cholesky"]
}

In [24]:
# cross validation and param sweep
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=5,
    scoring="neg_mean_squared_error",
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step..._state=123))])
,param_grid,"{'ridge__alpha': [0.01, 0.1, ...], 'ridge__solver': ['auto', 'svd', ...]}"
,scoring,'neg_mean_squared_error'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,alpha,100.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,123


In [25]:
best_model = grid_search.best_estimator_

print("Best hyperparameters:")
print(grid_search.best_params_)

Best hyperparameters:
{'ridge__alpha': 100.0, 'ridge__solver': 'auto'}


In [27]:
y_pred = best_model.predict(X_val)

mse  = mean_squared_error(y_val, y_pred)
r2   = r2_score(y_val, y_pred)

print(f"Val MSE:  {mse:.4f}")
print(f"Val R2:   {r2:.4f}")

Val MSE:  172.9768
Val R2:   0.0844
