# Machine Learning Approach

In [1]:
# Local libraries
import Tools.ratings_utils as ru
import Tools.system_utils as sys

# Third party packages
import pandas as pd

# Season under analysis; every data path below is derived from it.
YEAR = 2024
# Season game data, read by ru.set_rating_data_frame below.
FILENAME = f"Data/Seasons/data_{YEAR}.json"
# Tournament file handed to the tournament simulation at the end of the notebook.
TOURNAMENT_FILENAME = f"Data/Tournaments/tournament_{YEAR}.csv"
# Path passed to the picks CSV writer. The recorded output shows the file being
# saved as picks_2024_log_model.csv, so the writer appears to insert the
# rating_type before the extension -- confirm in Tools.system_utils.
PICKS_FILENAME = f"Data/Tournament Picks/picks_{YEAR}.csv"
# JSON file the per-game ratings frame is saved to and later reloaded from.
RATINGS_FILENAME = f"Data/Season Ratings/data_{YEAR}.json"


# Create the data frame for valid teams in the current season; it feeds both the
# per-game rating computation and the tournament simulation further down.
score_df = ru.set_rating_data_frame(filename=FILENAME)
# Placeholder so the name exists before it is either computed from score_df or
# reloaded from RATINGS_FILENAME in a later cell.
rating_score_df = None

# Save Ratings to JSON

In [None]:
# Build the per-game ratings frame from the season frame. The exact columns are
# defined by ru.add_ratings_per_game (not shown here); later cells read the
# Home_/Away_ Massey, Colley and Elo columns from the saved copy of this frame.
rating_score_df = ru.add_ratings_per_game(score_df=score_df)

In [None]:
# Leave the frame as the cell's last expression so Jupyter renders it with the
# rich (HTML) table view; print() would emit a wrapped plain-text dump instead.
rating_score_df

In [None]:
# Persist the per-game ratings as a JSON array with one object per row, so the
# modelling section below can reload them from disk.
rating_score_df.to_json(
    RATINGS_FILENAME,
    orient="records",
    indent=4,
)

# Logistic Regression Model

In [2]:
# Reload the per-game ratings saved to RATINGS_FILENAME. The compute/save cells
# above carry no execution count, so this run presumably relied on a file
# written in an earlier session -- the file must exist before this cell runs.
rating_score_df = pd.read_json(RATINGS_FILENAME)

In [3]:
# Leave the frame as the cell's last expression so Jupyter renders it with the
# rich (HTML) table view; print() wraps this wide frame across several
# backslash-continued text blocks, which is hard to read.
rating_score_df

                    Date               Home  Home_Score  \
0    2023-11-06 11:00:00            IU Indy          70   
1    2023-11-06 11:30:00            Hofstra         101   
2    2023-11-06 12:00:00               Troy          92   
3    2023-11-06 12:30:00  Stephen F. Austin          96   
4    2023-11-06 13:00:00       Prairie View          89   
...                  ...                ...         ...   
6163 2024-03-27 19:00:00         Seton Hall          91   
6164 2024-03-27 21:00:00               Utah          74   
6165 2024-04-02 19:00:00      Indiana State         100   
6166 2024-04-02 21:30:00         Seton Hall          84   
6167 2024-04-04 19:00:00         Seton Hall          79   

                          Away  Away_Score             Winner  Home_Massey  \
0                     Spalding          63            IU Indy     0.000000   
1     St. Joseph's–Long Island          48            Hofstra     0.000000   
2      Univ. of Ft. Lauderdale          47               

## Target encoding
`y = 1` when the home team is the winner; `y = 0` when the away team is the winner.

In [4]:
# Work on a separate frame so rating_score_df stays untouched, and attach the
# binary target: 1 when the listed winner is the home team, otherwise 0.
df = rating_score_df.assign(
    y=lambda games: (games["Winner"] == games["Home"]).astype(int)
)

In [5]:
# One feature per rating system: home rating minus away rating, so a positive
# value means the home side is rated higher.
df["Massey_diff"] = df["Home_Massey"].sub(df["Away_Massey"])
df["Colley_diff"] = df["Home_Colley"].sub(df["Away_Colley"])
df["Elo_diff"] = df["Home_Elo"].sub(df["Away_Elo"])

In [6]:
# Design matrix: the three rating-difference features, in a fixed column order.
feature_columns = ["Massey_diff", "Colley_diff", "Elo_diff"]
X = df.loc[:, feature_columns]
# Target vector: 1 = home win, 0 = away win.
y = df.loc[:, "y"]

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

# shuffle=False keeps the rows in their existing order, so the final 20% of rows
# form the test set. The printed frame above looks date-ordered, which would
# make this a "train on earlier games, test on later games" split -- confirm
# the file is sorted by Date.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)



In [8]:
# Logistic regression with scikit-learn defaults, fitted on the training games.
# NOTE(review): the three difference features are passed in unscaled; if the fit
# ever reports a ConvergenceWarning, consider scaling them or raising max_iter.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

In [9]:
# Hard 0/1 predictions for the held-out games.
y_pred = model.predict(X_test)
# predict_proba columns follow model.classes_; with the 0/1 labels built above,
# column 1 is the predicted probability of a home win.
y_prob = model.predict_proba(X_test)[:, 1]

# Accuracy is scored on the hard predictions, ROC AUC on the probabilities.
test_accuracy = accuracy_score(y_test, y_pred)
test_roc_auc = roc_auc_score(y_test, y_prob)

print("Accuracy:", test_accuracy)
print("ROC AUC:", test_roc_auc)

Accuracy: 0.7277147487844409
ROC AUC: 0.7303617650684087


# Test Model Against March Madness Tournament

In [16]:
# Compile the ratings lookup from the season frame loaded in the first cell.
# NOTE(review): this cell's execution count (16) jumps from the previous cell's
# (9), so the recorded output may depend on state from cells that are no longer
# in the notebook -- re-run from a fresh kernel to confirm.
ratings_dict = ru.compile_ratings_dict(score_df=score_df)

# Simulate the bracket using the compiled ratings and the fitted logistic model;
# only the third returned value (the tournament dict) is kept.
# NOTE(review): the recorded run got 15 of 32 first-round picks right, which is
# below a coin flip. Worth checking how the helper orders the two teams when it
# builds the difference features for `model`, and whether the intercept learned
# from home/away regular-season games should apply to tournament games.
_, _, tourney_dict = ru.simulate_tournament_with_all_ratings(filename=TOURNAMENT_FILENAME,
                                                             ratings=ratings_dict,
                                                             model=model)
# `sys` here is Tools.system_utils (aliased in the import cell), not the
# standard-library sys module.
sys.write_tournament_to_csv(tourney_dict=tourney_dict,
                            filename=PICKS_FILENAME,
                            rating_type="log_model")

Round: 1 / Round of 64 - Correct picks: 15 out of 32 - Total Points: 150 out of 320
Round: 2 / Round of 32 - Correct picks: 9 out of 16 - Total Points: 180 out of 320
Round: 3 / Sweet 16 - Correct picks: 3 out of 8 - Total Points: 120 out of 320
Round: 4 / Elite 8 - Correct picks: 2 out of 4 - Total Points: 160 out of 320
Round: 5 / Final 4 - Correct picks: 0 out of 2 - Total Points: 0 out of 320
Round: 6 / Championship - Correct picks: 0 out of 1 - Total Points: 0 out of 320

Total correct picks in tournament: 29 out of 63

Total points in tournament: 610 out of 1920

CSV written to Data/Tournament Picks/picks_2024_log_model.csv
