# Machine Learning Approach

In [1]:
# Local libraries
import Tools.ratings_utils as ru
import Tools.system_utils as sys
import Tools.season_utils as su

# Third party packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

# --- Notebook configuration -------------------------------------------------
# Season(s) to analyse. Swap in the multi-season list below to widen the data set.
YEARS = [2021]
# YEARS = [2021, 2022, 2023, 2024, 2025]

# When True, each model's simulated tournament picks are also exported
# through sys.write_tournament_to_csv (sys is Tools.system_utils here).
WRITE_TO_CSV = False

# create_filenames returns four values; the first one is not used in this notebook.
(
    _,
    tournament_filename,
    picks_filename,
    ratings_filename,
) = su.create_filenames(years=YEARS)

# Score data frame for the configured season(s); later cells feed it to
# ru.compile_ratings_dict and ru.add_ratings_per_game.
score_df = ru.create_score_df(years=YEARS)

# Placeholder: the per-game ratings frame is assigned by later cells.
rating_score_df = None



# Logistic Regression Model - Option #1
### Home team is winner -> 1
### Away team is winner -> 0

In [None]:
# Load the per-game ratings stored as JSON
rating_score_df = pd.read_json(ratings_filename)

# Build the modelling frame as a new DataFrame (assign does not touch
# rating_score_df) and attach the target: y = 1 when the home team won, else 0
df = rating_score_df.assign(
    y=lambda games: games["Winner"].eq(games["Home"]).astype(int)
)

# Append the derived feature columns
df = ru.derive_features(df=df)

In [None]:
# Model inputs are the columns listed in ru.ML_FEATURES; the target is the binary "y" column
features = ru.ML_FEATURES
X, y = df[features], df["y"]

# Hold out 20% of the rows for testing. The split is shuffled with a fixed seed so it
# is repeatable; use shuffle=False (without random_state) to split in row order instead.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

In [None]:
# Fit a logistic regression (library default settings) on the training split
log_model = LogisticRegression().fit(X_train, y_train)
log_model  # keep the fitted estimator as the cell output

In [None]:
# Hard class predictions for the held-out rows
y_pred = log_model.predict(X_test)

# Column 1 of predict_proba is taken as the probability of class 1 (home win)
class_probabilities = log_model.predict_proba(X_test)
y_prob = class_probabilities[:, 1]

In [None]:
# Score the held-out predictions
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# ROC AUC is computed from the class-1 probabilities rather than the hard labels
roc_auc = roc_auc_score(y_test, y_prob)

# Summary table; labels are pre-padded so the colons line up
print(f"Logistic Regression Model\nPerformance Metrics:")
print(f"---------------------------")
for label, score in (
    ("Accuracy ", accuracy),
    ("Precision", precision),
    ("Recall   ", recall),
    ("F1 Score ", f1),
    ("ROC AUC  ", roc_auc),
):
    print(f"{label}: {score:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Pair every feature name with its fitted coefficient (first row of coef_)
feature_names = X.columns
feature_weights = log_model.coef_[0]
coef_df = pd.DataFrame({"Feature": feature_names, "Coefficient": feature_weights})
print(coef_df)

# Test Logistic Regression Model Against March Madness Tournament

In [None]:
# Build the ratings lookup from the season score data.
# NOTE(review): the structure of ratings_dict is defined inside Tools.ratings_utils
# and is not visible from this notebook.
ratings_dict = ru.compile_ratings_dict(score_df=score_df)

# Simulate the tournament with the fitted logistic regression model.
# The helper returns four values; only the last two are used here.
_, _, tourney_dict, results = ru.simulate_tournament_with_all_ratings(
    filename=tournament_filename,
    ratings=ratings_dict,
    model=log_model)

print(results)

# Optional export of the simulated picks, tagged with this model's name.
# `sys` is the project module Tools.system_utils, not the standard-library sys.
if WRITE_TO_CSV:
    sys.write_tournament_to_csv(tourney_dict=tourney_dict,
                                filename=picks_filename,
                                rating_type="log_model")

# XGBoost Model - Option #2

In [None]:
# Load the per-game ratings stored as JSON
rating_score_df = pd.read_json(ratings_filename)

# New modelling frame with the target attached: y = 1 for a home win, 0 otherwise.
# assign returns a new DataFrame, so rating_score_df itself is left untouched.
df = rating_score_df.assign(
    y=lambda games: games["Winner"].eq(games["Home"]).astype(int)
)

# Append the derived feature columns
df = ru.derive_features(df=df)

In [None]:
# Inputs: the columns named in ru.ML_FEATURES; target: the binary "y" column
features = ru.ML_FEATURES
X, y = df[features], df["y"]

# 80/20 train/test split, shuffled with a fixed seed for repeatability
# (shuffle=False without random_state would split in row order instead)
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

In [None]:
# Train XGBoost classifier: 300 boosting rounds of depth-4 trees with a 0.05
# learning rate; each tree sees 80% of the rows and 80% of the columns.
# `use_label_encoder` is intentionally not passed: the option is deprecated and
# current XGBoost releases only emit an "unused parameter" warning for it. The
# target column is already built as integer 0/1, so no label encoding is needed.
xgb_model = xgb.XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
)

xgb_model.fit(X_train, y_train)

In [None]:
# Hard class predictions for the held-out rows
y_pred = xgb_model.predict(X_test)

# Column 1 of predict_proba is taken as the probability of class 1 (home win)
class_probabilities = xgb_model.predict_proba(X_test)
y_prob = class_probabilities[:, 1]

In [None]:
# Score the held-out predictions
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# ROC AUC is computed from the class-1 probabilities rather than the hard labels
roc_auc = roc_auc_score(y_test, y_prob)

# Summary table; labels are pre-padded so the colons line up
print(f"XGBoost Model\nPerformance Metrics:")
print(f"---------------------------")
for label, score in (
    ("Accuracy ", accuracy),
    ("Precision", precision),
    ("Recall   ", recall),
    ("F1 Score ", f1),
    ("ROC AUC  ", roc_auc),
):
    print(f"{label}: {score:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Plot the fitted XGBoost model's feature importance using the "gain" importance type
xgb.plot_importance(xgb_model, importance_type="gain")

# Test XGBoost Model Against March Madness Tournament

In [None]:
# Build the ratings lookup from the season score data.
# NOTE(review): the structure of ratings_dict is defined inside Tools.ratings_utils
# and is not visible from this notebook.
ratings_dict = ru.compile_ratings_dict(score_df=score_df)

# Simulate the tournament with the fitted XGBoost model.
# The helper returns four values; only the last two are used here.
_, _, tourney_dict, results = ru.simulate_tournament_with_all_ratings(
    filename=tournament_filename,
    ratings=ratings_dict,
    model=xgb_model)

print(results)

# Optional export of the simulated picks, tagged with this model's name.
# `sys` is the project module Tools.system_utils, not the standard-library sys.
if WRITE_TO_CSV:
    sys.write_tournament_to_csv(tourney_dict=tourney_dict,
                                filename=picks_filename,
                                rating_type="xgb_model")

# Random Forest - Option #3

In [None]:
# Load the per-game ratings stored as JSON
rating_score_df = pd.read_json(ratings_filename)

# New modelling frame with the target attached: y = 1 for a home win, 0 otherwise.
# assign returns a new DataFrame, so rating_score_df itself is left untouched.
df = rating_score_df.assign(
    y=lambda games: games["Winner"].eq(games["Home"]).astype(int)
)

# Append the derived feature columns
df = ru.derive_features(df=df)

In [None]:
# Inputs: the columns named in ru.ML_FEATURES; target: the binary "y" column
features = ru.ML_FEATURES
X, y = df[features], df["y"]

# 80/20 train/test split, shuffled with a fixed seed for repeatability
# (shuffle=False without random_state would split in row order instead)
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

In [None]:
# Random forest hyper-parameters, collected in one place for easy tuning
rf_params = {
    "n_estimators": 500,        # number of trees in the forest
    "max_depth": None,          # no depth cap: trees grow until leaves are pure
    "min_samples_split": 2,     # library default
    "min_samples_leaf": 1,      # library default
    "max_features": "sqrt",     # features considered per split
    "random_state": 42,         # fixed seed for repeatable forests
    "n_jobs": -1,               # train on all available cores
}
rf_model = RandomForestClassifier(**rf_params)

rf_model.fit(X_train, y_train)

In [None]:
# Hard class predictions for the held-out rows
y_pred = rf_model.predict(X_test)

# Column 1 of predict_proba is taken as the probability of class 1 (home win)
rf_class_probabilities = rf_model.predict_proba(X_test)
y_proba = rf_class_probabilities[:, 1]

In [None]:
# Score the random forest's held-out predictions
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# Use y_proba, the probabilities produced by the random-forest evaluation cell.
# The previous code passed y_prob, which is only assigned by the logistic
# regression / XGBoost cells, so the reported ROC AUC belonged to another model
# (or raised NameError on a fresh kernel if those cells were skipped).
roc_auc = roc_auc_score(y_test, y_proba)

# Summary table; labels are pre-padded so the colons line up
print(f"Random Forest Model\nPerformance Metrics:")
print(f"---------------------------")
for label, score in (
    ("Accuracy ", accuracy),
    ("Precision", precision),
    ("Recall   ", recall),
    ("F1 Score ", f1),
    ("ROC AUC  ", roc_auc),
):
    print(f"{label}: {score:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Label the forest's feature importances with the training column names,
# then print them from most to least important
importances = pd.Series(rf_model.feature_importances_, index=X_train.columns)
ranked_importances = importances.sort_values(ascending=False)
print(ranked_importances)

In [None]:
# Build the ratings lookup from the season score data.
# NOTE(review): the structure of ratings_dict is defined inside Tools.ratings_utils
# and is not visible from this notebook.
ratings_dict = ru.compile_ratings_dict(score_df=score_df)

# Simulate the tournament with the fitted random forest model.
# The helper returns four values; only the last two are used here.
_, _, tourney_dict, results = ru.simulate_tournament_with_all_ratings(
    filename=tournament_filename,
    ratings=ratings_dict,
    model=rf_model)

print(results)

# Optional export of the simulated picks, tagged with this model's name.
# `sys` is the project module Tools.system_utils, not the standard-library sys.
if WRITE_TO_CSV:
    sys.write_tournament_to_csv(tourney_dict=tourney_dict,
                                filename=picks_filename,
                                rating_type="rf_model")

## Exploratory Data Analysis per feature (Plotting)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Load the per-game ratings stored as JSON
rating_score_df = pd.read_json(ratings_filename)

# New frame for plotting with the target attached: y = 1 for a home win, 0 otherwise.
# assign returns a new DataFrame, so rating_score_df itself is left untouched.
df = rating_score_df.assign(
    y=lambda games: games["Winner"].eq(games["Home"]).astype(int)
)

# Append the derived feature columns
df = ru.derive_features(df=df)

In [None]:
def plot_feature_vs_win(df, feature_col, target_col='y'):
    """Scatter a feature against the binary outcome, jittering the outcome axis.

    Args:
        df: Data frame holding both columns.
        feature_col: Name of the column drawn on the x axis.
        target_col: Name of the 0/1 outcome column drawn on the y axis.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    outcomes = df[target_col].values
    feature_values = df[feature_col].values

    # Add small Gaussian noise (sd 0.03) to the outcomes so that overlapping
    # points do not collapse onto the two horizontal lines. The noise comes from
    # NumPy's global random state, so it differs between runs unless seeded.
    noise = np.random.normal(0, 0.03, size=len(outcomes))
    jittered_outcomes = outcomes + noise

    plt.figure(figsize=(6,4))
    plt.scatter(feature_values, jittered_outcomes, alpha=0.4)
    plt.title(f'Win vs {feature_col}')
    plt.xlabel(feature_col)
    plt.ylabel('Outcome')
    plt.yticks([0, 1], ['Loss', 'Win'])
    plt.grid(alpha=0.3)
    plt.show()


def plot_binned_win_rate(df, feature_col, target_col='y', bins=10):
    """Plot the mean outcome per quantile bin of a feature.

    The feature is cut into at most ``bins`` quantile-based bins with
    ``pd.qcut``; duplicate bin edges are dropped, so fewer bins can result.
    The mean of ``target_col`` inside each bin is drawn against the bin midpoint.

    Args:
        df: Data frame holding both columns. It is not modified.
        feature_col: Name of the numeric column to bin.
        target_col: Name of the outcome column that is averaged per bin.
        bins: Number of quantile bins requested.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    # Group the target directly by the binned feature; no copy of the whole
    # frame is needed just to hold a temporary bin column.
    quantile_bins = pd.qcut(df[feature_col], bins, duplicates='drop')

    # observed=False is spelled out on purpose: it keeps every bin category in
    # the result (the long-standing default) and avoids the pandas FutureWarning
    # about that default changing for categorical groupers.
    win_rate = df[target_col].groupby(quantile_bins, observed=False).mean()
    bin_mid = [interval.mid for interval in win_rate.index]

    plt.figure(figsize=(6,4))
    plt.plot(bin_mid, win_rate, marker='o')
    plt.xlabel(feature_col)
    plt.ylabel('Win Probability')
    plt.title(f'Binned Win Rate vs {feature_col}')
    plt.grid(alpha=0.3)
    plt.show()


def plot_binned_win_rate_fixed(df, feature_col, target_col='y', bins=10):
    """Plot the mean outcome per ``pd.cut`` bin of a feature.

    Unlike ``plot_binned_win_rate`` this uses ``pd.cut``: with an integer
    ``bins`` the feature range is divided into that many equal-width intervals,
    so some bins may contain no rows. Such bins have a NaN mean and show up as
    gaps in the line.

    Args:
        df: Data frame holding both columns. It is not modified.
        feature_col: Name of the numeric column to bin.
        target_col: Name of the outcome column that is averaged per bin.
        bins: Passed straight to ``pd.cut`` (number of bins or explicit edges).

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    # Group the target directly by the binned feature; no copy of the whole
    # frame is needed just to hold a temporary bin column.
    fixed_bins = pd.cut(df[feature_col], bins=bins)

    # observed=False is spelled out on purpose: it keeps empty bins in the result
    # (the long-standing default) and avoids the pandas FutureWarning about that
    # default changing for categorical groupers.
    win_rate = df[target_col].groupby(fixed_bins, observed=False).mean()
    bin_mid = [interval.mid for interval in win_rate.index]

    plt.figure(figsize=(6,4))
    plt.plot(bin_mid, win_rate, marker='o')
    plt.xlabel(feature_col)
    plt.ylabel('Home Win Probability')
    plt.title(f'Binned Win Rate vs {feature_col}')
    plt.grid(alpha=0.3)
    plt.show()


In [None]:
# Run plots for target outcome (home team wins) against all features
for feature in ru.ML_FEATURES:
    # plot_feature_vs_win(df, feature)
    # plot_binned_win_rate(df, feature)
    plot_binned_win_rate_fixed(df, feature)

## Save Ratings to JSON
### (Skip if already run for this season)

In [2]:
import os

# Rebuilding the per-game ratings is slow (the recorded output below reports
# progress over several thousand items), so reuse the saved JSON when it exists.
# Set REBUILD_RATINGS to True to force a fresh computation, for example after
# the underlying score data has changed.
REBUILD_RATINGS = False

if REBUILD_RATINGS or not os.path.exists(ratings_filename):
    rating_score_df = ru.add_ratings_per_game(score_df=score_df)
    rating_score_df.to_json(ratings_filename, orient='records', indent=4)
else:
    # Same load the modelling cells perform, so downstream code sees the same frame
    rating_score_df = pd.read_json(ratings_filename)

Complete: 43 / 4261 or 1.009%
Complete: 86 / 4261 or 2.018%
Complete: 128 / 4261 or 3.004%
Complete: 171 / 4261 or 4.013%
Complete: 214 / 4261 or 5.022%
Complete: 256 / 4261 or 6.008%
Complete: 299 / 4261 or 7.017%
Complete: 341 / 4261 or 8.003%
Complete: 384 / 4261 or 9.012%
Complete: 427 / 4261 or 10.021%
Complete: 469 / 4261 or 11.007%
Complete: 512 / 4261 or 12.016%
Complete: 554 / 4261 or 13.002%
Complete: 597 / 4261 or 14.011%
Complete: 640 / 4261 or 15.02%
Complete: 682 / 4261 or 16.006%
Complete: 725 / 4261 or 17.015%
Complete: 767 / 4261 or 18.0%
Complete: 810 / 4261 or 19.01%
Complete: 853 / 4261 or 20.019%
Complete: 895 / 4261 or 21.004%
Complete: 938 / 4261 or 22.014%
Complete: 981 / 4261 or 23.023%
Complete: 1023 / 4261 or 24.008%
Complete: 1066 / 4261 or 25.018%
Complete: 1108 / 4261 or 26.003%
Complete: 1151 / 4261 or 27.012%
Complete: 1194 / 4261 or 28.022%
Complete: 1236 / 4261 or 29.007%
Complete: 1279 / 4261 or 30.016%
Complete: 1321 / 4261 or 31.002%
Complete: 1364 