# Machine Learning Approach

In [1]:
# Local libraries
import Tools.ratings_utils as ru
import Tools.system_utils as sys
import Tools.season_utils as su

# Third party packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

# Season(s) to process. The commented line is the full multi-season setting.
YEARS = [2021]
# YEARS = [2021, 2022, 2023, 2024, 2025]

# When True, the tournament cells also write their picks via sys.write_tournament_to_csv.
WRITE_TO_CSV = False

# Filenames for the selected season(s); the first returned value is not used in this notebook.
season_files = su.create_filenames(years=YEARS)
_, tournament_filename, picks_filename, ratings_filename = season_files

# Data frame of valid teams in the current season, used later for tournament simulation.
score_df = ru.create_score_df(years=YEARS)

# Placeholder; assigned by the "Save Ratings to JSON" cells or by pd.read_json in the model sections.
rating_score_df = None



# Save Ratings to JSON
### (Skip these cells if the ratings JSON for this season has already been saved)

In [None]:
# Build the per-game ratings frame from score_df using the project helper.
# Per the section header, this can be skipped once the season's JSON exists.
rating_score_df = ru.add_ratings_per_game(
    score_df=score_df,
)

In [None]:
# Persist the ratings frame as a JSON array of row objects (orient="records"),
# pretty-printed with a 4-space indent, so the model sections can reload it.
rating_score_df.to_json(
    path_or_buf=ratings_filename,
    orient="records",
    indent=4,
)

# Logistic Regression Model - Option #1
### Home team is winner -> 1
### Away team is winner -> 0

In [2]:
# Reload the per-game ratings that were saved to JSON.
rating_score_df = pd.read_json(ratings_filename)

# Work on a copy so the loaded frame keeps only its original columns.
df = rating_score_df.copy()

# Target: 1 when the home team won, 0 when the away team won.
df["y"] = df["Winner"].eq(df["Home"]).astype(int)

# One feature per rating system: home rating minus away rating.
for rating in ("Massey", "Colley", "Elo", "Adj_Elo"):
    df[f"{rating}_diff"] = df[f"Home_{rating}"] - df[f"Away_{rating}"]

                 Date               Home  Home_Score                  Away  \
0 2020-11-25 11:00:00  Abilene Christian          70  East Tennessee State   
1 2020-11-25 11:30:00      South Florida          94       Florida College   
2 2020-11-25 12:00:00             Xavier         101               Oakland   
3 2020-11-25 12:00:00               Navy          78     George Washington   
4 2020-11-25 12:00:00           Nebraska         102         McNeese State   

   Away_Score             Winner  Home_Massey  Away_Massey  Home_Colley  \
0          47  Abilene Christian          0.0          0.0          0.0   
1          84      South Florida          0.0          0.0          0.0   
2          49             Xavier          0.0          0.0          0.0   
3          71               Navy          0.0          0.0          0.0   
4          55           Nebraska          0.0          0.0          0.0   

   Away_Colley  Home_Elo  Away_Elo  Home_Adj_Elo  Away_Adj_Elo  
0          0.0 

In [3]:
# Model inputs: the four home-minus-away rating differences.
features = ["Massey_diff", "Colley_diff", "Elo_diff", "Adj_Elo_diff"]

# Feature matrix and binary target (1 = home team won).
X = df[features]
y = df["y"]

# Hold out 20% of the games for evaluation. random_state pins the shuffle so the
# split is reproducible. Unshuffled alternative, kept for reference:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [4]:
# Logistic regression with scikit-learn's default settings, fitted on the training split.
log_model = LogisticRegression()
log_model.fit(X=X_train, y=y_train)

In [5]:
# Hard class predictions for the held-out games.
y_pred = log_model.predict(X_test)

# Column 1 of predict_proba is the probability of class 1 (home team wins).
log_class_probs = log_model.predict_proba(X_test)
y_prob = log_class_probs[:, 1]

In [6]:
# Threshold-based metrics use the hard predictions; ROC AUC uses the class-1 probabilities.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
precision = precision_score(y_true=y_test, y_pred=y_pred)
recall = recall_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred)
roc_auc = roc_auc_score(y_true=y_test, y_score=y_prob)

# Summary table: labels are left-padded to 9 characters so the colons line up.
print("Logistic Regression Model\nPerformance Metrics:")
print("---------------------------")
for label, score in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
    ("ROC AUC", roc_auc),
):
    print(f"{label:<9}: {score:.4f}")

# Per-class precision / recall / F1 breakdown.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Logistic Regression Model
Performance Metrics:
---------------------------
Accuracy : 0.7515
Precision: 0.7750
Recall   : 0.9178
F1 Score : 0.8404
ROC AUC  : 0.7234

Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.34      0.44       245
           1       0.78      0.92      0.84       608

    accuracy                           0.75       853
   macro avg       0.70      0.63      0.64       853
weighted avg       0.73      0.75      0.73       853



In [7]:
# Pair each feature name with its fitted weight. coef_ has a single row for a
# binary problem, so row 0 holds all of the coefficients.
feature_names = X.columns
fitted_weights = log_model.coef_[0]
coef_df = pd.DataFrame({"Feature": feature_names, "Coefficient": fitted_weights})
print(coef_df)

        Feature  Coefficient
0   Massey_diff     0.051924
1   Colley_diff    -0.914390
2      Elo_diff     0.000899
3  Adj_Elo_diff     0.003240


# Test Logistic Regression Model Against March Madness Tournament

In [8]:
# NOTE(review): the saved output of this cell ends in KeyError: 'Massey'.
# The lookup that fails is presumably inside the Tools helpers called below;
# confirm the structure that simulate_tournament_with_all_ratings expects for `ratings`.

# Ratings compiled from the season's score frame.
ratings_dict = ru.compile_ratings_dict(score_df=score_df)

# Run the tournament simulation with the logistic regression model; only the
# last two returned values are used here.
simulation = ru.simulate_tournament_with_all_ratings(
    filename=tournament_filename,
    ratings=ratings_dict,
    model=log_model,
)
_, _, tourney_dict, results = simulation

print(results)

# Optionally write the picks, tagged with this model's name.
if WRITE_TO_CSV:
    sys.write_tournament_to_csv(
        tourney_dict=tourney_dict,
        filename=picks_filename,
        rating_type="log_model",
    )

         Loyola (IL)  Montana Western   Cincinnati  Stony Brook   Penn State  \
Massey     23.641125       -25.702925    15.058259     6.156487    24.874560   
Colley      0.901125         0.352543     0.727658     0.382821     0.726397   
Elo      1710.993411      1484.465029  1551.039009  1442.537573  1496.747393   
Adj_Elo  2106.484237      1377.402783  1698.743737  1424.040790  1738.230442   

            Duquesne  Central Michigan    McKendree  Southern Utah  \
Massey     13.331669         -1.061759    -7.264776      10.343380   
Colley      0.590096          0.293702     0.334238       0.744327   
Elo      1506.715420       1404.336252  1473.293716    1640.872926   
Adj_Elo  1542.302357       1257.336178  1358.969728    1837.190825   

         Hawaii Pacific  ...       Marist      Wingate   Ohio State  \
Massey       -25.933740  ...     1.949622   -16.542332    30.162573   
Colley         0.352142  ...     0.499551     0.434282     0.962147   
Elo         1485.000000  ...  1514.

KeyError: 'Massey'

# XGBoost Model - Option #2

In [None]:
# Reload the per-game ratings that were saved to JSON.
rating_score_df = pd.read_json(ratings_filename)

# Derive the modelling frame as a new object: the target first, then one
# home-minus-away difference per rating system.
df = rating_score_df.assign(
    y=lambda games: (games["Winner"] == games["Home"]).astype(int),
    Massey_diff=lambda games: games["Home_Massey"] - games["Away_Massey"],
    Colley_diff=lambda games: games["Home_Colley"] - games["Away_Colley"],
    Elo_diff=lambda games: games["Home_Elo"] - games["Away_Elo"],
    Adj_Elo_diff=lambda games: games["Home_Adj_Elo"] - games["Away_Adj_Elo"],
)

In [None]:
# Model inputs: the four home-minus-away rating differences.
features = [
    "Massey_diff", "Colley_diff",
    "Elo_diff", "Adj_Elo_diff",
]

# Feature matrix and binary target (1 = home team won).
X, y = df[features], df["y"]

# 80/20 split with a fixed seed so the shuffle is reproducible.
# Unshuffled alternative, kept for reference:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train XGBoost classifier
# `use_label_encoder` is intentionally not passed: the option is deprecated and
# current XGBoost releases ignore it with a warning. The target is already
# encoded as integers 0/1, so no label encoding is needed.
xgb_model = xgb.XGBClassifier(
    n_estimators=300,       # number of boosting rounds
    learning_rate=0.05,     # shrinkage applied to each round
    max_depth=4,            # maximum depth of each tree
    subsample=0.8,          # fraction of rows sampled per tree
    colsample_bytree=0.8,   # fraction of features sampled per tree
    eval_metric="logloss",
)

xgb_model.fit(X_train, y_train)

In [None]:
# Hard class predictions for the held-out games.
y_pred = xgb_model.predict(X_test)

# Column 1 of predict_proba is the probability of class 1 (home team wins).
xgb_class_probs = xgb_model.predict_proba(X_test)
y_prob = xgb_class_probs[:, 1]

In [None]:
# Threshold-based metrics use the hard predictions; ROC AUC uses the class-1 probabilities.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)

# Assemble the summary as a list of lines and print it in one call.
summary_lines = [
    "XGBoost Model\nPerformance Metrics:",
    "---------------------------",
    f"Accuracy : {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall   : {recall:.4f}",
    f"F1 Score : {f1:.4f}",
    f"ROC AUC  : {roc_auc:.4f}",
]
print("\n".join(summary_lines))

# Per-class precision / recall / F1 breakdown.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Plot feature importance by "gain" for the fitted XGBoost model.
xgb.plot_importance(
    booster=xgb_model,
    importance_type="gain",
)

# Test XGBoost Model Against March Madness Tournament

In [None]:
# Ratings compiled from the season's score frame.
ratings_dict = ru.compile_ratings_dict(score_df=score_df)

# Run the tournament simulation with the XGBoost model; only the last two
# returned values are used here.
xgb_simulation = ru.simulate_tournament_with_all_ratings(
    filename=tournament_filename,
    ratings=ratings_dict,
    model=xgb_model,
)
_, _, tourney_dict, results = xgb_simulation

print(results)

# Optionally write the picks, tagged with this model's name.
if WRITE_TO_CSV:
    sys.write_tournament_to_csv(
        tourney_dict=tourney_dict,
        filename=picks_filename,
        rating_type="xgb_model",
    )

# Random Forest Model - Option #3

In [None]:
# Reload the per-game ratings that were saved to JSON.
rating_score_df = pd.read_json(ratings_filename)

# Work on a copy so the loaded frame keeps only its original columns.
df = rating_score_df.copy()

# Target: 1 when the home team won, 0 when the away team won.
home_won = df["Winner"] == df["Home"]
df["y"] = home_won.astype(int)

# One feature per rating system: home rating minus away rating.
rating_systems = ["Massey", "Colley", "Elo", "Adj_Elo"]
for system in rating_systems:
    home_col, away_col = f"Home_{system}", f"Away_{system}"
    df[f"{system}_diff"] = df[home_col] - df[away_col]

In [None]:
# Model inputs: the four home-minus-away rating differences.
features = list(("Massey_diff", "Colley_diff", "Elo_diff", "Adj_Elo_diff"))

# Feature matrix and binary target (1 = home team won).
X = df[features]
y = df["y"]

# 80/20 split with a fixed seed so the shuffle is reproducible.
# Unshuffled alternative, kept for reference:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Random Forest hyperparameters, collected in one place.
rf_params = {
    "n_estimators": 500,       # number of trees
    "max_depth": None,         # no depth limit
    "min_samples_split": 2,    # scikit-learn default
    "min_samples_leaf": 1,     # scikit-learn default
    "max_features": "sqrt",    # features considered at each split
    "random_state": 42,        # reproducible forest
    "n_jobs": -1,              # use all cores
}

# Build the classifier and fit it on the training split.
rf_model = RandomForestClassifier(**rf_params)

rf_model.fit(X_train, y_train)

In [None]:
# Hard class predictions for the held-out games.
y_pred = rf_model.predict(X_test)

# Column 1 of predict_proba is the probability of class 1 (home team wins).
rf_class_probs = rf_model.predict_proba(X_test)
y_proba = rf_class_probs[:, 1]

In [None]:
# Compute metrics
# Threshold-based metrics use the hard predictions (y_pred).
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# ROC AUC must be scored on the Random Forest probabilities, which the
# evaluation cell of this section stores in `y_proba`. `y_prob` is never
# assigned in this section, so using it would score another model's stale
# probabilities (or raise NameError on a fresh kernel).
roc_auc = roc_auc_score(y_test, y_proba)

# Print neatly
print("Random Forest Model\nPerformance Metrics:")
print("---------------------------")
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1 Score : {f1:.4f}")
print(f"ROC AUC  : {roc_auc:.4f}")

# Per-class precision / recall / F1 breakdown.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Label each importance score with its training column name, then list them
# from most to least important.
importances = pd.Series(
    data=rf_model.feature_importances_,
    index=X_train.columns,
)
ranked_importances = importances.sort_values(ascending=False)
print(ranked_importances)

In [None]:
# Ratings compiled from the season's score frame.
ratings_dict = ru.compile_ratings_dict(score_df=score_df)

# Run the tournament simulation with the Random Forest model; only the last two
# returned values are used here.
rf_simulation = ru.simulate_tournament_with_all_ratings(
    filename=tournament_filename,
    ratings=ratings_dict,
    model=rf_model,
)
_, _, tourney_dict, results = rf_simulation

print(results)

# Optionally write the picks, tagged with this model's name.
if WRITE_TO_CSV:
    sys.write_tournament_to_csv(
        tourney_dict=tourney_dict,
        filename=picks_filename,
        rating_type="rf_model",
    )