# Machine Learning Approach

In [1]:
# Local libraries
import Tools.ratings_utils as ru
import Tools.system_utils as sys

# Third party packages
import pandas as pd

# Season to analyze; every data path below is derived from this value.
YEAR = 2024
# Season data file handed to ru.set_rating_data_frame below.
FILENAME = f"Data/Seasons/data_{YEAR}.json"
# Tournament and picks paths; not referenced in the cells visible here.
TOURNAMENT_FILENAME = f"Data/Tournaments/tournament_{YEAR}.csv"
PICKS_FILENAME = f"Data/Tournament Picks/picks_{YEAR}.csv"
# Per-game ratings file: written with to_json and read back with pd.read_json further down.
RATINGS_FILENAME = f"Data/Season Ratings/data_{YEAR}.json"


# Create data frame for valid teams in the current season that can be used for tournament simulation
score_df = ru.set_rating_data_frame(filename=FILENAME)
# Placeholder; assigned later by either the ratings computation cell or the JSON reload cell.
rating_score_df = None

# Save Ratings to JSON

In [None]:
# Build the per-game ratings frame from the season scores.
# NOTE(review): presumably this is what adds the Home_/Away_ Massey, Colley and Elo columns
# that the modelling cells read later — confirm against Tools.ratings_utils.
rating_score_df = ru.add_ratings_per_game(score_df=score_df)

In [None]:
# Preview the first rows of the freshly computed ratings using the notebook's rich
# display (last expression of the cell) instead of printing the frame as plain text.
rating_score_df.head()

In [None]:
# Persist the per-game ratings to RATINGS_FILENAME as row records with 4-space indentation.
rating_score_df.to_json(
    RATINGS_FILENAME,
    orient='records',
    indent=4,
)

# Logistic Regression Model

In [2]:
# Reload the ratings stored at RATINGS_FILENAME; presumably this lets the modelling
# cells start from the saved file without rerunning the ratings computation above.
rating_score_df = pd.read_json(RATINGS_FILENAME)

In [None]:
# Preview the first rows of the reloaded ratings using the notebook's rich display
# (last expression of the cell) instead of printing the frame as plain text.
rating_score_df.head()

## Target `y`: home team is winner -> 1, away team is winner -> 0

In [3]:
# Work on a copy so the loaded ratings frame itself is not modified.
df = rating_score_df.copy()

# Binary target: 1 when the winner is the home team, 0 otherwise.
home_team_won = df["Winner"].eq(df["Home"])
df["y"] = home_team_won.astype(int)

In [4]:
# One feature per rating system: home rating minus away rating for the same game.
df["Massey_diff"] = df["Home_Massey"].sub(df["Away_Massey"])
df["Colley_diff"] = df["Home_Colley"].sub(df["Away_Colley"])
df["Elo_diff"] = df["Home_Elo"].sub(df["Away_Elo"])

In [5]:
# Target vector: the 0/1 home-win label.
y = df["y"]

# Feature matrix: the three home-minus-away rating differences.
X = df[
    [
        "Massey_diff",
        "Colley_diff",
        "Elo_diff",
    ]
]

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

# Hold out 20% of the rows for testing. shuffle=False keeps the original row order,
# so the test set is the last block of rows rather than a random sample.
(X_train, X_test, y_train, y_test) = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)



In [7]:
# Fit a logistic regression with scikit-learn's default settings on the training split.
# NOTE(review): the three difference features are passed in unscaled, and the coefficient
# table printed further down shows 0.000000 for Massey_diff and Colley_diff — confirm those
# columns actually vary and check whether standardizing the features changes the fit.
model = LogisticRegression()
model.fit(X_train, y_train)

In [8]:
# Hard 0/1 predictions, plus the second predict_proba column (class 1, i.e. y == 1).
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Accuracy is scored on the hard labels; ROC AUC is scored on the class-1 probabilities.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
test_roc_auc = roc_auc_score(y_test, y_prob)
print("ROC AUC:", test_roc_auc)

Accuracy: 0.7277147487844409
ROC AUC: 0.7303617650684087


In [9]:
# Pair each feature name with its fitted weight; coef_[0] is the single row of
# coefficients the fitted model exposes for this binary problem.
feature_names = X.columns
fitted_weights = model.coef_[0]
coef_df = pd.DataFrame({"Feature": feature_names, "Coefficient": fitted_weights})
print(coef_df)

       Feature  Coefficient
0  Massey_diff     0.000000
1  Colley_diff     0.000000
2     Elo_diff     0.011682


In [10]:
# Show the predicted labels and then the class-1 probabilities, one array per line.
print(y_pred, y_prob, sep="\n")

[1 0 1 ... 1 1 1]
[0.95604606 0.41776405 0.53928426 ... 0.93047762 0.86490789 0.50880524]
