Data Preprocessing


In [7]:
!pip install scikit-learn pandas




In [8]:
# imports
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np


In [9]:

df = pd.read_csv('/Users/dylandietrich/DS4021-Final-Project/data/train.csv')
df_raw = pd.read_csv('/Users/dylandietrich/DS4021-Final-Project/data/train.csv')

# drop unnecessary columns
df = df.drop(columns=['weather_temperature', 'weather_wind_mph', 'weather_humidity', 'weather_detail', 'stadium'])
df.head()

# adding total score columns
df['total_score'] = df['score_home'] + df['score_away']


# add column that represented current record for each team before each game of a season
# ensure games are sorted chronologically within each season
df["datetime"] = pd.to_datetime(df["schedule_date"])
df = df.sort_values(["schedule_season", "datetime"]).reset_index(drop=True)

# make output lists
home_records = []
away_records = []

# make dictionaries to track each team's W-L-T within the current season
team_wins = {}
team_losses = {}
team_ties = {}

current_season = None

for i, row in df.iterrows():
    season = row["schedule_season"]
    home = row["team_home"]
    away = row["team_away"]
    home_score = row["score_home"]
    away_score = row["score_away"]
   
    # new season,  reset all
    if season != current_season:
        team_wins = {}
        team_losses = {}
        team_ties = {}
        current_season = season

    # initialize teams for this season if needed
    for team in [home, away]:
        if team not in team_wins:
            team_wins[team] = 0
            team_losses[team] = 0
            team_ties[team] = 0

    # add current record before the game
    home_records.append(
        f"{team_wins[home]}-{team_losses[home]}-{team_ties[home]}"
    )
    away_records.append(
        f"{team_wins[away]}-{team_losses[away]}-{team_ties[away]}"
    )

    # update records after the game
    if home_score > away_score:
        team_wins[home] += 1
        team_losses[away] += 1
    elif away_score > home_score:
        team_wins[away] += 1
        team_losses[home] += 1
    else:
        # tie
        team_ties[home] += 1
        team_ties[away] += 1

# add results to dataframe
df["home_team_record"] = home_records
df["away_team_record"] = away_records


# make individual columns for wins, losses, and ties
df['home_wins'] = df['home_team_record'].apply(lambda x: int(x.split('-')[0]))
df['home_losses'] = df['home_team_record'].apply(lambda x: int(x.split('-')[1]))
df['home_ties'] = df['home_team_record'].apply(lambda x: int(x.split('-')[2]))
df['away_wins'] = df['away_team_record'].apply(lambda x: int(x.split('-')[0]))
df['away_losses'] = df['away_team_record'].apply(lambda x: int(x.split('-')[1]))
df['away_ties'] = df['away_team_record'].apply(lambda x: int(x.split('-')[2]))


# filter games that have already been recorded, no scheduled games
df = df[df["datetime"] <= "2025-11-04"]


In [10]:
# Compute each team's average score per season
team_season_avg = (
    df.groupby(["team_home", "schedule_season"])["score_home"].mean().reset_index()
)
team_season_avg.columns = ["team", "season", "avg_score"]

# also include away team scoring
team_season_avg_away = (
    df.groupby(["team_away", "schedule_season"])["score_away"].mean().reset_index()
)
team_season_avg_away.columns = ["team", "season", "avg_score"]

# combine home + away scoring for a true team season average
team_season_avg = pd.concat([team_season_avg, team_season_avg_away])
team_season_avg = team_season_avg.groupby(["team", "season"])["avg_score"].mean().reset_index()

# shift averages to represent previous season
team_season_avg["prev_season"] = team_season_avg["season"] + 1

# prev_season avg is used in the next year's games
team_prev = team_season_avg[["team", "prev_season", "avg_score"]]
team_prev.columns = ["team", "schedule_season", "prev_season_avg"]

# merge into main df
df = df.merge(team_prev, left_on=["team_home", "schedule_season"], right_on=["team", "schedule_season"], how="left")
df.rename(columns={"prev_season_avg": "home_prev_avg"}, inplace=True)
df = df.drop(columns=["team"])

df = df.merge(team_prev, left_on=["team_away", "schedule_season"], right_on=["team", "schedule_season"], how="left")
df.rename(columns={"prev_season_avg": "away_prev_avg"}, inplace=True)
df = df.drop(columns=["team"])


In [11]:
# Creating rolling avgs within each season

# separate into home, away dfs
home = df[["schedule_season", "datetime", "team_home", "score_home", "score_away"]].rename(
    columns={"team_home": "team", "score_home": "points_scored", "score_away": "points_allowed"}
)

away = df[["schedule_season", "datetime", "team_away", "score_away", "score_home"]].rename(
    columns={"team_away": "team", "score_away": "points_scored", "score_home": "points_allowed"}
)

# long df, duplicate games, sort by team, season, date
long_df = pd.concat([home, away])
long_df = long_df.sort_values(["team", "schedule_season", "datetime"]).reset_index(drop=True)

groups = long_df.groupby(["team", "schedule_season"])

# compute rolling averages
long_df["rolling_scored"] = groups["points_scored"].transform(
    lambda s: s.shift().expanding().mean()
)

long_df["rolling_allowed"] = groups["points_allowed"].transform(
    lambda s: s.shift().expanding().mean()
)

# merge back into original df
df = df.merge(
    long_df[["team", "schedule_season", "datetime", "rolling_scored", "rolling_allowed"]],
    left_on=["team_home", "schedule_season", "datetime"],
    right_on=["team", "schedule_season", "datetime"],
    how="left"
).rename(
    columns={
        "rolling_scored": "home_rolling_scored",
        "rolling_allowed": "home_rolling_allowed"
    }
).drop(columns=["team"])


df = df.merge(
    long_df[["team", "schedule_season", "datetime", "rolling_scored", "rolling_allowed"]],
    left_on=["team_away", "schedule_season", "datetime"],
    right_on=["team", "schedule_season", "datetime"],
    how="left"
).rename(
    columns={
        "rolling_scored": "away_rolling_scored",
        "rolling_allowed": "away_rolling_allowed"
    }
).drop(columns=["team"])


### Ensemble Model (Gradient Boosting)


In [12]:
# imports
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split


In [13]:
# Dropping variables that aren't informative to our model
df_filtered = df.drop(columns=['datetime', 'stadium_neutral', 'home_team_record', 'away_team_record',
                       'schedule_date', 'team_favorite_id', 'team_home', 'team_away', 'schedule_week'])
df_filtered = df_filtered.dropna(axis = 0)


In [14]:
# splitting into x and y
X = df_filtered.drop(columns=['total_score', 'over_under_line', 'score_home', 'score_away'])
y = df_filtered['total_score']


In [15]:
# Train/test splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state = 123)


In [16]:
# Identify categorical + numeric columns
categorical_cols = ['schedule_playoff']
numeric_cols = [col for col in X_train.columns if col not in categorical_cols]

# Column transformer to handle preprocessing
preprocess = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='if_binary'), categorical_cols),
        ('num', StandardScaler(), numeric_cols)])

# Build preprocess, Gradient Boosting pipeline
pip = Pipeline(steps=[('preprocess', preprocess),('gbr', GradientBoostingRegressor(random_state=123))])

# Parameter grid for Gradient Boosting
param_grid = {
    'gbr__n_estimators': [50, 100, 200],
    'gbr__learning_rate': [0.01, 0.1, 0.2],
    'gbr__max_depth': [3, 5, 7],
    'gbr__min_samples_split': [2, 5, 10],
    'gbr__min_samples_leaf': [1, 2, 4],
    'gbr__subsample': [0.8, 1.0]
}

# Grid search
grid_search = GridSearchCV(pip, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)

# Fit on raw X_train and y_train
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 486 candidates, totalling 2430 fits


0,1,2
,estimator,Pipeline(step..._state=123))])
,param_grid,"{'gbr__learning_rate': [0.01, 0.1, ...], 'gbr__max_depth': [3, 5, ...], 'gbr__min_samples_leaf': [1, 2, ...], 'gbr__min_samples_split': [2, 5, ...], ...}"
,scoring,'neg_mean_squared_error'
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'if_binary'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,loss,'squared_error'
,learning_rate,0.01
,n_estimators,200
,subsample,0.8
,criterion,'friedman_mse'
,min_samples_split,5
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [17]:
# Results
print("Best Parameters:", grid_search.best_params_)
print("Best CV MSE:", round(-grid_search.best_score_,3))
print("Best CV RMSE:", round(np.sqrt(-grid_search.best_score_),3))

best_gbr = grid_search.best_estimator_
y_pred = best_gbr.predict(X_test)

# Evaluation metrics
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

print("Test R²:", round(r2_score(y_test, y_pred),3))
print("Test MSE:", round(mean_squared_error(y_test, y_pred),3))
print("Test RMSE:", round(np.sqrt(mean_squared_error(y_test, y_pred)),3))
print("Test MAE:", round(mean_absolute_error(y_test, y_pred), 3))


Best Parameters: {'gbr__learning_rate': 0.01, 'gbr__max_depth': 3, 'gbr__min_samples_leaf': 2, 'gbr__min_samples_split': 5, 'gbr__n_estimators': 200, 'gbr__subsample': 0.8}
Best CV MSE: 179.758
Best CV RMSE: 13.407
Test R²: 0.031
Test MSE: 208.462
Test RMSE: 14.438
Test MAE: 11.519
