# Machine Learning Approach

In [1]:
# Local libraries
import Tools.ratings_utils as ru
import Tools.system_utils as sys

# Third party packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
import xgboost as xgb


# Single-season configuration: every path below is keyed on YEAR.
YEAR = 2025

# Season game data and the tournament file for that season.
FILENAME = f"Data/Seasons/data_{YEAR}.json"
TOURNAMENT_FILENAME = f"Data/Tournaments/tournament_{YEAR}.csv"

# Where tournament picks and the per-game ratings are stored.
PICKS_FILENAME = f"Data/Tournament Picks/picks_{YEAR}.csv"
RATINGS_FILENAME = f"Data/Season Ratings/data_{YEAR}.json"


# Placeholder; the per-game ratings frame is computed or loaded in later cells.
rating_score_df = None

# Frame of valid teams in the current season, used for tournament simulation.
score_df = ru.set_rating_data_frame(filename=FILENAME)



### Multiple Seasons

In [2]:
import pandas as pd

# Seasons pooled into one frame; the last entry is used as the tournament season.
# NOTE(review): the original cell first listed 2021-2025 and immediately
# overwrote it with the list below, so 2025 was never loaded. Confirm that
# leaving it out is intended.
YEARS = [2021, 2022, 2023, 2024]


# Load every season once and concatenate in a single call. The previous version
# special-cased the first season with an identity check (`YEAR is YEARS[0]`) and
# grew score_df pairwise inside the loop, re-copying earlier seasons each pass.
# With a single season this also resets the index, which the old code did not.
season_frames = []
for YEAR in YEARS:
    FILENAME = f"Data/Seasons/data_{YEAR}.json"
    season_frames.append(ru.set_rating_data_frame(filename=FILENAME))

score_df = pd.concat(season_frames, ignore_index=True)


# The tournament file is the one for the most recent season loaded. This is the
# same value the loop used to leave behind after its final iteration.
TOURNAMENT_YEAR = YEARS[-1]
TOURNAMENT_FILENAME = f"Data/Tournaments/tournament_{TOURNAMENT_YEAR}.csv"

# Picks and ratings paths are keyed on the full season range.
FILENAME_YEARS = f"{YEARS[0]}-{TOURNAMENT_YEAR}"
PICKS_FILENAME = f"Data/Tournament Picks/picks_{FILENAME_YEARS}.csv"
RATINGS_FILENAME = f"Data/Season Ratings/data_{FILENAME_YEARS}.json"

# Save Ratings to JSON
### (Skip this section if the ratings JSON for the selected season(s) already exists)

In [3]:
# Slow step: it prints periodic "Complete: i / N" progress lines while running.
rating_score_df = ru.add_ratings_per_game(
    score_df=score_df,
)

Complete: 226 / 22595 or 1.0%
Complete: 452 / 22595 or 2.0%
Complete: 678 / 22595 or 3.001%
Complete: 904 / 22595 or 4.001%
Complete: 1130 / 22595 or 5.001%
Complete: 1356 / 22595 or 6.001%
Complete: 1582 / 22595 or 7.002%
Complete: 1808 / 22595 or 8.002%
Complete: 2034 / 22595 or 9.002%
Complete: 2260 / 22595 or 10.002%
Complete: 2486 / 22595 or 11.002%
Complete: 2712 / 22595 or 12.003%
Complete: 2938 / 22595 or 13.003%
Complete: 3164 / 22595 or 14.003%
Complete: 3390 / 22595 or 15.003%
Complete: 3616 / 22595 or 16.004%
Complete: 3842 / 22595 or 17.004%
Complete: 4067 / 22595 or 18.0%
Complete: 4293 / 22595 or 19.0%
Complete: 4519 / 22595 or 20.0%
Complete: 4745 / 22595 or 21.0%
Complete: 4971 / 22595 or 22.0%
Complete: 5197 / 22595 or 23.001%
Complete: 5423 / 22595 or 24.001%
Complete: 5649 / 22595 or 25.001%
Complete: 5875 / 22595 or 26.001%
Complete: 6101 / 22595 or 27.002%
Complete: 6327 / 22595 or 28.002%
Complete: 6553 / 22595 or 29.002%
Complete: 6779 / 22595 or 30.002%
Complet

In [4]:
# Persist the rated frame as an indented JSON list with one record per row,
# so later cells can reload it instead of recomputing the ratings.
rating_score_df.to_json(
    RATINGS_FILENAME,
    orient='records',
    indent=4,
)

# Logistic Regression Model - Option #1
### Home team is winner -> 1
### Away team is winner -> 0

In [None]:
# Reload the per-game ratings that were saved to JSON.
rating_score_df = pd.read_json(RATINGS_FILENAME)

# Derive the modelling frame in one pass; `assign` returns a new frame, so
# rating_score_df itself is left untouched.
#   y       -> 1 when the home team is the winner, 0 otherwise
#   *_diff  -> home rating minus away rating, one column per rating system
df = rating_score_df.assign(
    y=lambda games: (games["Winner"] == games["Home"]).astype(int),
    Massey_diff=lambda games: games["Home_Massey"] - games["Away_Massey"],
    Colley_diff=lambda games: games["Home_Colley"] - games["Away_Colley"],
    Elo_diff=lambda games: games["Home_Elo"] - games["Away_Elo"],
    Adj_Elo_diff=lambda games: games["Home_Adj_Elo"] - games["Away_Adj_Elo"],
)

In [None]:
# Model inputs: the four home-minus-away rating differences.
features = ["Massey_diff", "Colley_diff", "Elo_diff", "Adj_Elo_diff"]

# Design matrix and binary target.
X = df.loc[:, features]
y = df["y"]

# Hold out 20% of the rows with a seeded shuffle. An unshuffled split
# (shuffle=False) would instead hold out the last 20% of rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Logistic regression with scikit-learn's default settings, fitted on the
# training split only.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# Hard class predictions for the held-out rows.
y_pred = model.predict(X_test)

# Column 1 of predict_proba is the probability of class 1 (home team wins).
class_probabilities = model.predict_proba(X_test)
y_prob = class_probabilities[:, 1]

accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)
print("Accuracy:", accuracy)
print("ROC AUC:", roc_auc)

In [None]:
# Pair each feature name with its fitted coefficient. coef_ holds a single
# row for a binary target, hence the [0].
coefficient_columns = {
    "Feature": X.columns,
    "Coefficient": model.coef_[0],
}
coef_df = pd.DataFrame(coefficient_columns)
print(coef_df)

# Test Logistic Regression Model Against March Madness Tournament

In [None]:
# Compile the ratings from score_df for the tournament simulation.
ratings_dict = ru.compile_ratings_dict(score_df=score_df)

# Simulate the tournament with the fitted logistic model. Three values come
# back; only the third (the tournament dict) is used here.
simulation_result = ru.simulate_tournament_with_all_ratings(
    filename=TOURNAMENT_FILENAME,
    ratings=ratings_dict,
    model=model,
)
_, _, tourney_dict = simulation_result

# Write the simulated tournament to the picks CSV, tagged as the logistic model.
sys.write_tournament_to_csv(
    tourney_dict=tourney_dict,
    filename=PICKS_FILENAME,
    rating_type="log_model",
)

# XGBoost Model - Option #2

In [None]:
# Reload the per-game ratings from JSON so this section can run on its own.
rating_score_df = pd.read_json(RATINGS_FILENAME)

# Work on a copy so the loaded frame stays as read from disk.
df = rating_score_df.copy()

# Target: 1 when the home team is the winner, 0 otherwise.
home_won = df["Winner"].eq(df["Home"])
df["y"] = home_won.astype(int)

# Features: home rating minus away rating for each rating system.
df["Massey_diff"] = df["Home_Massey"].sub(df["Away_Massey"])
df["Colley_diff"] = df["Home_Colley"].sub(df["Away_Colley"])
df["Elo_diff"] = df["Home_Elo"].sub(df["Away_Elo"])
df["Adj_Elo_diff"] = df["Home_Adj_Elo"].sub(df["Away_Adj_Elo"])

In [None]:
# Columns fed to the model (all home-minus-away rating differences).
features = [
    "Massey_diff",
    "Colley_diff",
    "Elo_diff",
    "Adj_Elo_diff",
]

# Feature matrix and target vector.
X, y = df[features], df["y"]

# Seeded 80/20 shuffled split. Passing shuffle=False instead would keep row
# order and hold out the final 20% of rows.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Train an XGBoost classifier on the training split.
# Each of the 300 trees is at most 4 levels deep and is built from an 80%
# sample of the rows and an 80% sample of the columns; log-loss is the
# evaluation metric.
# The deprecated `use_label_encoder=False` argument is no longer passed:
# current XGBoost releases ignore it and emit a "parameters not used" warning,
# and the target here is already encoded as integer 0/1.
model = xgb.XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
)

model.fit(X_train, y_train)

In [None]:
# Score the held-out rows: hard labels first, then the class-1 probability
# (second column of predict_proba), i.e. the home team winning.
y_pred = model.predict(X_test)
test_probabilities = model.predict_proba(X_test)
y_pred_proba = test_probabilities[:, 1]

In [None]:
# Report held-out accuracy (from hard labels) and ROC AUC (from probabilities).
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
print("Accuracy:", accuracy)
print("ROC AUC:", roc_auc)

In [None]:
# Plot feature importance for the fitted model, measured by gain.
xgb.plot_importance(
    model,
    importance_type="gain",
)

# Test XGBoost Model Against March Madness Tournament

In [None]:
# Compile the ratings from score_df, then simulate the tournament with the
# fitted XGBoost model. Of the three values returned, only the last one
# (the tournament dict) is kept.
ratings_dict = ru.compile_ratings_dict(score_df=score_df)

(_, _, tourney_dict) = ru.simulate_tournament_with_all_ratings(
    filename=TOURNAMENT_FILENAME,
    ratings=ratings_dict,
    model=model,
)

# Write the simulated tournament to the picks CSV, tagged as the XGBoost model.
sys.write_tournament_to_csv(
    tourney_dict=tourney_dict,
    filename=PICKS_FILENAME,
    rating_type="xgb_model",
)