# Machine Learning Approach

In [1]:
# Local libraries
import Tools.ratings_utils as ru
import Tools.system_utils as sys

# Third party packages
import pandas as pd

# Season under analysis; every data path below is derived from it.
YEAR = 2024
# Inputs: FILENAME feeds ru.set_rating_data_frame below, and TOURNAMENT_FILENAME
# feeds the tournament simulation near the end of the notebook.
FILENAME = f"Data/Seasons/data_{YEAR}.json"
TOURNAMENT_FILENAME = f"Data/Tournaments/tournament_{YEAR}.csv"
# Output: destination passed to sys.write_tournament_to_csv in the final cell.
PICKS_FILENAME = f"Data/Tournament Picks/picks_{YEAR}.csv"
# Ratings cache: written with DataFrame.to_json and read back with pd.read_json.
RATINGS_FILENAME = f"Data/Season Ratings/data_{YEAR}.json"


# Create data frame for valid teams in the current season that can be used for tournament simulation
score_df = ru.set_rating_data_frame(filename=FILENAME)
# Placeholder; later cells assign it either from ru.add_ratings_per_game or by
# reloading RATINGS_FILENAME.
rating_score_df = None

# Save Ratings to JSON

In [None]:
# Build the ratings frame from score_df (see ru.add_ratings_per_game for the
# details); the following cells print it and save it to RATINGS_FILENAME.
rating_score_df = ru.add_ratings_per_game(score_df=score_df)

In [None]:
# Inspect the freshly computed ratings frame before it is written to disk.
print(rating_score_df)

In [None]:
# Persist the ratings frame to RATINGS_FILENAME as indented, record-oriented JSON
# so later sessions can reload it with pd.read_json.
rating_score_df.to_json(
    RATINGS_FILENAME,
    orient='records',
    indent=4,
)

# Logistic Regression Model

In [2]:
# Reload the ratings frame from the JSON file written by the
# "Save Ratings to JSON" cells above.
rating_score_df = pd.read_json(RATINGS_FILENAME)

In [None]:
# Inspect the ratings frame as loaded from RATINGS_FILENAME.
print(rating_score_df)

## Home team is winner -> 1
## Away team is winner -> 0

In [3]:
# Work on a separate frame and add the binary target:
# y = 1 when the "Winner" column matches the "Home" column, otherwise 0.
df = rating_score_df.assign(
    y=lambda games: (games["Winner"] == games["Home"]).astype(int)
)

In [4]:
# Feature engineering: home-minus-away gap for each of the three rating columns.
df["Massey_diff"] = df["Home_Massey"].sub(df["Away_Massey"])
df["Colley_diff"] = df["Home_Colley"].sub(df["Away_Colley"])
df["Elo_diff"] = df["Home_Elo"].sub(df["Away_Elo"])

In [5]:
# Design matrix (the three rating gaps) and the binary target for the classifier.
X = df.loc[:, ["Massey_diff", "Colley_diff", "Elo_diff"]]
y = df.loc[:, "y"]

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

# Hold out the last 20% of rows as the test set. shuffle=False keeps the rows in
# their existing order, so the split is positional rather than random.
# NOTE(review): the recorded X_test preview begins with Massey_diff/Colley_diff
# equal to 0.0 and the fitted coefficients for both are 0.0 -- check whether
# these two columns carry any signal in the training rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)



In [7]:
# Fit a logistic regression with scikit-learn's default constructor settings
# on the training split (features are the three rating gaps, target is y).
model = LogisticRegression()
model.fit(X_train, y_train)

In [8]:
# Score the held-out rows: column 1 of predict_proba is the probability of
# class 1 (home team wins), and predict gives the hard 0/1 label.
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Accuracy uses the hard labels; ROC AUC uses the class-1 probabilities.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print("ROC AUC:", roc_auc_score(y_true=y_test, y_score=y_prob))

Accuracy: 0.7277147487844409
ROC AUC: 0.7303617650684087


In [9]:
# Pair every feature name with its fitted weight. model.coef_ holds a single
# row for this binary problem, hence the [0]. The intercept is not included.
coef_df = pd.DataFrame(
    data={"Feature": X.columns, "Coefficient": model.coef_[0]},
)
print(coef_df)

       Feature  Coefficient
0  Massey_diff     0.000000
1  Colley_diff     0.000000
2     Elo_diff     0.011682


In [10]:
# Show the hard predictions and the class-1 probabilities, one array per line.
print(y_pred, y_prob, sep="\n")

[1 0 1 ... 1 1 1]
[0.95604606 0.41776405 0.53928426 ... 0.93047762 0.86490789 0.50880524]


# Test Model Against March Madness Tournament

In [11]:
# Compute the three rating systems from score_df. A later cell merges them into
# ratings_dict by indexing all three results with the keys of massey_ratings.
massey_ratings = ru.calculate_massey_ratings(score_df=score_df,
                                             debug=False)
colley_ratings = ru.calculate_colley_ratings(score_df=score_df,
                                             debug=False)
# K and adjust_K are passed straight through to ru.calculate_elo_ratings;
# presumably K is the Elo update factor -- confirm against that helper.
elo_ratings = ru.calculate_elo_ratings(score_df=score_df,
                                       K=30,
                                       debug=False,
                                       adjust_K=False)

In [12]:
# Inspect the held-out feature rows (the last 20% of the frame, in original order).
print(X_test)

      Massey_diff  Colley_diff    Elo_diff
4934     0.000000          0.0  175.367250
4935     0.000000          0.0 -116.665845
4936     0.000000          0.0  -74.771709
4937     0.000000          0.0   86.852605
4938     0.000000          0.0  -92.912455
...           ...          ...         ...
6163     1.783608          0.0  -25.691186
6164     6.021434          0.0  -77.695032
6165    -3.033870          0.0  133.798903
6166     2.873843          0.0   70.679303
6167    -0.447248          0.0  -85.235127

[1234 rows x 3 columns]


In [19]:
# Merge the three rating lookups into one nested mapping:
# key -> {'Massey': ..., 'Colley': ..., 'Elo': ...}.
# Keys come from massey_ratings; the other two lookups must contain each of them.
ratings_dict = {
    team: {
        'Massey': massey_ratings[team],
        'Colley': colley_ratings[team],
        'Elo': elo_ratings[team],
    }
    for team in massey_ratings.keys()
}
    

{'Texas Tech': {'Massey': 36.52764691567818, 'Colley': 0.960888559991068, 'Elo': 1649.6150249738848}, 'Bridgewater (VA)': {'Massey': -10.68784027563178, 'Colley': 0.3165215097640698, 'Elo': 1473.7294773045273}, 'DePauw': {'Massey': -0.20385739888052612, 'Colley': 0.3759152849746552, 'Elo': 1483.7633887960574}, 'St. Andrews': {'Massey': -43.75495591276484, 'Colley': 0.23387944628218937, 'Elo': 1443.5827169036454}, 'UTEP': {'Massey': 18.76049569576048, 'Colley': 0.6471860690236025, 'Elo': 1534.694005094835}, 'Pittsburgh-Greensburg': {'Massey': -11.899889701692357, 'Colley': 0.25272327510316855, 'Elo': 1482.6945334005554}, 'Indiana State': {'Massey': 33.58860801911291, 'Colley': 0.9953572287890939, 'Elo': 1714.1203976149723}, 'San Diego': {'Massey': 15.137785572359194, 'Colley': 0.6718821462335582, 'Elo': 1519.2408165857169}, 'Santa Clara': {'Massey': 26.27499184591472, 'Colley': 0.7772762380126509, 'Elo': 1567.6369494081323}, 'Western New Mexico': {'Massey': -16.27324765004102, 'Colley':

In [30]:
# Sanity-check the fitted model on a single matchup, scored in both
# orientations: each team takes a turn in the "home" slot of the features.
team1 = 'Texas Tech'
team2 = 'North Carolina'

# Rating gaps from team1's side (team1 minus team2), mirroring the
# home-minus-away features the model was trained on.
Massey_diff = ratings_dict[team1]['Massey'] - ratings_dict[team2]['Massey']
Colley_diff = ratings_dict[team1]['Colley'] - ratings_dict[team2]['Colley']
Elo_diff = ratings_dict[team1]['Elo'] - ratings_dict[team2]['Elo']

# Orientation 1: team1 in the home slot.
x1_dict = {
    'Massey_diff': [Massey_diff],
    'Colley_diff': [Colley_diff],
    'Elo_diff': [Elo_diff]
}
# Orientation 2: team2 in the home slot, so every gap flips sign.
x2_dict = {
    'Massey_diff': [-Massey_diff],
    'Colley_diff': [-Colley_diff],
    'Elo_diff': [-Elo_diff]
}

x1 = pd.DataFrame(x1_dict)
x2 = pd.DataFrame(x2_dict)

print(x1)
print(x2)

# Score each orientation once and reuse the value for both the printout and
# the dictionary (previously every predict_proba call was made twice).
# Column 1 is the probability of class 1, i.e. the home-slot team winning.
team1_prob = model.predict_proba(x1)[:, 1][0]
team2_prob = model.predict_proba(x2)[:, 1][0]

print(team1_prob)
print(team2_prob)

# NOTE(review): these two values are not complementary (the recorded run shows
# 0.574 and 0.854). Each is the model's home-win probability with that team in
# the home slot -- confirm this is what is wanted for a tournament matchup.
model_ratings = {}
model_ratings[team1] = team1_prob
model_ratings[team2] = team2_prob

print(model_ratings)

   Massey_diff  Colley_diff   Elo_diff
0    -5.140963    -0.120023 -62.657739
   Massey_diff  Colley_diff   Elo_diff
0     5.140963     0.120023  62.657739
0.5741933393940849
0.8535794605119654
{'Texas Tech': 0.5741933393940849, 'North Carolina': 0.8535794605119654}


In [None]:
# Simulate the bracket in TOURNAMENT_FILENAME using the merged ratings and the
# fitted model. Only the third returned value is kept; the first two are
# discarded here (see ru.simulate_tournament_with_all_ratings for what they are).
_, _, tourney_dict = ru.simulate_tournament_with_all_ratings(filename=TOURNAMENT_FILENAME,
                                                             ratings=ratings_dict,
                                                             model=model)

In [None]:


# Write the simulated bracket to PICKS_FILENAME.
# `sys` here is Tools.system_utils (aliased in the imports cell), not the
# standard-library sys module.
# NOTE(review): rating_type="elo" is passed although the bracket above was
# simulated with the logistic model -- confirm this label is intended.
sys.write_tournament_to_csv(tourney_dict=tourney_dict,
                            filename=PICKS_FILENAME,
                            rating_type="elo")