# Machine Learning Approach

In [1]:
# Local libraries
import Tools.ratings_utils as ru
import Tools.system_utils as sys

# Third party packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
import xgboost as xgb


YEAR = 2024
FILENAME = f"Data/Seasons/data_{YEAR}.json"
TOURNAMENT_FILENAME = f"Data/Tournaments/tournament_{YEAR}.csv"
PICKS_FILENAME = f"Data/Tournament Picks/picks_{YEAR}.csv"
RATINGS_FILENAME = f"Data/Season Ratings/data_{YEAR}.json"


# Create data frame for valid teams in the current season that can be used for tournament simulation
score_df = ru.set_rating_data_frame(filename=FILENAME)
rating_score_df = None



# Save Ratings to JSON
### (Skip if already run for this season)

In [2]:
rating_score_df = ru.add_ratings_per_game(score_df=score_df)

Complete: 62 / 6168 or 1.005%
Complete: 124 / 6168 or 2.01%
Complete: 186 / 6168 or 3.016%
Complete: 247 / 6168 or 4.005%
Complete: 309 / 6168 or 5.01%
Complete: 371 / 6168 or 6.015%
Complete: 432 / 6168 or 7.004%
Complete: 494 / 6168 or 8.009%
Complete: 556 / 6168 or 9.014%
Complete: 617 / 6168 or 10.003%
Complete: 679 / 6168 or 11.008%
Complete: 741 / 6168 or 12.014%
Complete: 802 / 6168 or 13.003%
Complete: 864 / 6168 or 14.008%
Complete: 926 / 6168 or 15.013%
Complete: 987 / 6168 or 16.002%
Complete: 1049 / 6168 or 17.007%
Complete: 1111 / 6168 or 18.012%
Complete: 1172 / 6168 or 19.001%
Complete: 1234 / 6168 or 20.006%
Complete: 1296 / 6168 or 21.012%
Complete: 1357 / 6168 or 22.001%
Complete: 1419 / 6168 or 23.006%
Complete: 1481 / 6168 or 24.011%
Complete: 1542 / 6168 or 25.0%
Complete: 1604 / 6168 or 26.005%
Complete: 1666 / 6168 or 27.01%
Complete: 1728 / 6168 or 28.016%
Complete: 1789 / 6168 or 29.005%
Complete: 1851 / 6168 or 30.01%
Complete: 1913 / 6168 or 31.015%
Complete:

In [3]:
rating_score_df.to_json(RATINGS_FILENAME, orient='records', indent=4) 

# Logistic Regression Model - Option #1
### Home team is winner -> 1
### Away team is winner -> 0

In [5]:
# Read data from JSON
rating_score_df = pd.read_json(RATINGS_FILENAME)

# Set data frame and target variable
df = rating_score_df.copy()
df["y"] = (df["Winner"] == df["Home"]).astype(int)

# Add feature columns
df["Massey_diff"] = df["Home_Massey"] - df["Away_Massey"]
df["Colley_diff"] = df["Home_Colley"] - df["Away_Colley"]
df["Elo_diff"] = df["Home_Elo"] - df["Away_Elo"]
df["Adj_Elo_diff"] = df["Home_Adj_Elo"] - df["Away_Adj_Elo"]

In [6]:
# Set features
features = [
    "Massey_diff",
    "Colley_diff",
    "Elo_diff",
    "Adj_Elo_diff"
]

# Create X, y data frames
X = df[features]
y = df["y"]

# Split train/test data sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
model = LogisticRegression()
model.fit(X_train, y_train)

In [8]:
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))

Accuracy: 0.7690437601296597
ROC AUC: 0.7462831328438266


In [9]:
coef_df = pd.DataFrame({
    "Feature": X.columns,
    "Coefficient": model.coef_[0]
})
print(coef_df)

        Feature  Coefficient
0   Massey_diff     0.026569
1   Colley_diff     0.000000
2      Elo_diff     0.001819
3  Adj_Elo_diff     0.002874


# Test Logistic Regression Model Against March Madness Tournament

In [10]:
ratings_dict = ru.compile_ratings_dict(score_df=score_df)

_, _, tourney_dict = ru.simulate_tournament_with_all_ratings(filename=TOURNAMENT_FILENAME,
                                                             ratings=ratings_dict,
                                                             model=model)
sys.write_tournament_to_csv(tourney_dict=tourney_dict,
                            filename=PICKS_FILENAME,
                            rating_type="log_model")

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- Adj_Elo_diff


# XGBoost Model - Option #2

In [None]:
# Read data from JSON
rating_score_df = pd.read_json(RATINGS_FILENAME)

# Set data frame and target variable
df = rating_score_df.copy()
df["y"] = (df["Winner"] == df["Home"]).astype(int)

# Add feature columns
df["Massey_diff"] = df["Home_Massey"] - df["Away_Massey"]
df["Colley_diff"] = df["Home_Colley"] - df["Away_Colley"]
df["Elo_diff"] = df["Home_Elo"] - df["Away_Elo"]

In [None]:
# Set features
features = [
    "Massey_diff",
    "Colley_diff",
    "Elo_diff"
]

# Create X, y data frames
X = df[features]
y = df["y"]

# Split train/test data sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Train XGBoost classifier
model = xgb.XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    use_label_encoder=False
)

model.fit(X_train, y_train)

In [None]:
# Evaluate
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

In [None]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_pred_proba))

In [None]:
xgb.plot_importance(model, importance_type="gain")

# Test XGBoost Model Against March Madness Tournament

In [None]:
ratings_dict = ru.compile_ratings_dict(score_df=score_df)

_, _, tourney_dict = ru.simulate_tournament_with_all_ratings(filename=TOURNAMENT_FILENAME,
                                                             ratings=ratings_dict,
                                                             model=model)
sys.write_tournament_to_csv(tourney_dict=tourney_dict,
                            filename=PICKS_FILENAME,
                            rating_type="xgb_model")