# Compare to Other Models

- Robert Shaw
- Data Driven March Madness

In this file, we test code written in march_madness_classes. The code allows us to train a head-to-head model that predicts the probability of team 1 beating team 2 in a college basketball game, based on a logistic model of that probability. We run the tournament n times, counting the expected score of each team over the iterations. We then take the total score over the n iterations and predict head-to-head matchups as arg_max(points_1, points_2).

In [1]:
import march_madness_classes as mmc
import march_madness_games as mmg
import march_madness_models as mmm
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression as LogReg

In [2]:
# Load the raw Kaggle tournament tables. Each table is handed to
# mmg.filter_into_seasons right after it is read; later cells index the
# resulting *_arr objects by season offset (year - 1985).
seeds = pd.read_csv("datasets/kaggle_data/TourneySeeds.csv")
seeds_arr = mmg.filter_into_seasons(seeds)

slots = pd.read_csv("datasets/kaggle_data/TourneySlots.csv")
slots_arr = mmg.filter_into_seasons(slots)

games = pd.read_csv("datasets/kaggle_data/TourneyCompactResults.csv")
games_arr = mmg.filter_into_seasons(games)

In [3]:
def _read_indexed(path):
    """Read the CSV at *path*, using its first column as the row index."""
    return pd.read_csv(path, index_col=0)


# Pre-computed predictor tables, one file per predictor.
markov_data = _read_indexed("datasets/our_data/stationary")
consistency = _read_indexed("datasets/our_data/consistency")
dominance = _read_indexed("datasets/our_data/dominance")
past_resul = _read_indexed("datasets/our_data/past_results")
rpi = _read_indexed("datasets/our_data/rpi")
bad_losses = _read_indexed("datasets/our_data/bad_losses")
tough_wins = _read_indexed("datasets/our_data/tough_wins")
close_games = _read_indexed("datasets/our_data/close_games")
close_wins = _read_indexed("datasets/our_data/close_wins")
close_wins_perc = _read_indexed("datasets/our_data/close_wins_perc")
momentum = _read_indexed("datasets/our_data/momentum")
weighted_wins = _read_indexed("datasets/our_data/weighted_wins")

# Seed table lives in a separate sub-directory.
seed_matrix_df = _read_indexed("datasets/our_data/team_summary_data/seeds_matrix")

In [4]:
# Column labels and predictor tables handed to the game generator and to the
# model predictors. The first two labels have no counterpart in
# predictor_dfs; the remaining ten line up one-to-one, in order, with the
# tables listed in predictor_dfs.
predictor_names = [
    "min_index_id",
    "max_index_id",
    "markov",
    "dominance",
    "rpi",
    "bad_losses",
    "tough_wins",
    "close_wins",
    "close_wins_perc",
    "weighted_wins",
    "past_resul",
    "momentum",
]
predictor_dfs = [
    markov_data,
    dominance,
    rpi,
    bad_losses,
    tough_wins,
    close_wins,
    close_wins_perc,
    weighted_wins,
    past_resul,
    momentum,
]

In [5]:
# Build the training predictors/responses from the 1987-2000 tournaments
# (the range end, 2001, is exclusive).
# The label and table lists are the ones bound in the previous cell; passing
# the shared names instead of re-typing the literals keeps this call in sync
# with what the ModelPredictor instances receive further down.
pred, resp = mmg.generate_multiple_years_of_games(
    range(1987, 2001),
    seeds_arr,
    slots_arr,
    games_arr,
    predictor_names,
    predictor_dfs,
    scoring_dif=False
)


In [7]:
# Positional columns of `pred` used as model features.
# NOTE(review): presumably these positions follow the order of
# predictor_names (2 -> "markov", 4 -> "rpi", ...) — confirm against
# mmg.generate_multiple_years_of_games.
feature_cols = [2, 4, 5, 6, 7, 10]
train_features = pred.iloc[:, feature_cols]

# Fit the scaler on the training features; the same fitted scaler is passed
# to the model predictors in a later cell.
scaler = StandardScaler().fit(train_features)

# Logistic regression on the standardised features; the response is the
# first row of the transposed `resp` values.
log_reg_model = LogReg(C=10)
log_reg_model.fit(scaler.transform(train_features), resp.values.T[0])

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

#### Test on 2005

In [87]:
# Re-import the project modules so that edits made to their source files
# since the first import take effect in this session.
# NOTE(review): bare `reload` is a builtin on Python 2 only; on Python 3 it
# lives in importlib.
reload(mmc)
reload(mmm)

<module 'march_madness_models' from 'march_madness_models.py'>

In [95]:
# Select one season; the per-season arrays are indexed by offset from 1985.
year = 2005
season_index = year - 1985
seeds = seeds_arr[season_index]
slots = slots_arr[season_index]
resul = games_arr[season_index]

# Biased model: higher-seed bias enabled with delta = .05.
# (Original author's note: predicts upsets with probability p + .05, where p
# comes from the logistic regression model.)
head_to_head_biased_model = mmm.ModelPredictor(
    log_reg_model,
    scaler,
    predictor_dfs,
    year,
    seeds_df=seeds,
    simulation=False,
    higher_seed_bias=True,
    higher_seed_bias_delta=.05)

# Biased model with cooling: higher-seed bias enabled with delta = .01 and a
# cooling table. (Original author's note: predicts upsets with probability
# p + delta * cooling_factor.)
# NOTE(review): the original comment quoted a delta of .05 here while the
# code passes .01 — confirm which value is intended. The meaning of the
# cooling keys (presumably tournament rounds) is defined in
# mmm.ModelPredictor — confirm there.
cooling_schedule = {6: 10, 5: 10, 4: 10, 3: 10, 2: -5, 1: -5}
head_to_head_biased_model_cooling = mmm.ModelPredictor(
    log_reg_model,
    scaler,
    predictor_dfs,
    year,
    seeds_df=seeds,
    simulation=False,
    higher_seed_bias=True,
    higher_seed_bias_delta=.01,
    cooling=cooling_schedule)

# Unbiased model: higher-seed bias disabled, so predictions rest on the
# head-to-head logistic regression model alone.
head_to_head_unbiased_model = mmm.ModelPredictor(
    log_reg_model,
    scaler,
    predictor_dfs,
    year,
    seeds_df=seeds,
    simulation=False,
    higher_seed_bias=False)

# Reference brackets used for comparison: the actual results and the
# BasicPredictor bracket.
tourney_actual = mmc.Tournament(seeds, slots, mmm.ActualTournament(resul))
tourney_top_seed = mmc.Tournament(seeds, slots, mmm.BasicPredictor())

# Brackets predicted by each of the three head-to-head models.
tourney_biased_model = mmc.Tournament(seeds, slots, head_to_head_biased_model)
tourney_biased_model_cooling = mmc.Tournament(seeds, slots, head_to_head_biased_model_cooling)
tourney_unbiased_model = mmc.Tournament(seeds, slots, head_to_head_unbiased_model)

In [96]:
# Compare the biased model's bracket against the actual-results bracket and
# the BasicPredictor bracket built in the previous cell.
# NOTE(review): the recorded run of this cell ends with
# "NameError: global name 'a' is not defined". This line references no name
# `a`, so the error presumably originates inside compare_to_dif_tournament
# (or something it calls) — confirm in march_madness_classes.
tourney_biased_model.compare_to_dif_tournament(tourney_actual, tourney_top_seed)

7     1338
23    1280
30    1301
39    1104
48    1257
53    1328
55    1242
56    1257
60    1228
61    1314
62    1228
Name: Prediction, dtype: object
7     1334
23    1280
30    1301
39    1454
48    1257
53    1246
55    1458
56    1257
60    1228
61    1314
62    1314
Name: Prediction, dtype: object
6
2
11


NameError: global name 'a' is not defined