# Prediction
- Robert Shaw
- CS109a Project: Data Driven March Madness

In this file, we make predictions for years 2015-16 using the methods tuned on years 2001-2013 (see modeling files). We make both a single prediction bracket for an office pool type setting, as well as 10 brackets for ESPN bracket challenge. We compare our results to the ESPN field.

In [1]:
import march_madness_classes as mmc
import march_madness_games as mmg
import march_madness_models as mmm
import march_madness_train_and_tune as mmtt

import pandas as pd
import numpy as np
import importlib
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression as LogReg

In [2]:
# Raw Kaggle tables: team directory, tournament seeds, bracket slots and
# tournament game results (read in that order).
teams, seeds, slots, games = (
    pd.read_csv(csv_path)
    for csv_path in (
        "datasets/kaggle_data_2021/MTeams.csv",
        "datasets/kaggle_data_2021/MNCAATourneySeeds.csv",
        "datasets/kaggle_data_2021/MNCAATourneySlots.csv",
        "datasets/kaggle_data_2021/MNCAATourneyCompactResults.csv",
    )
)

# Split each table with mmg.filter_into_seasons; the result is indexed by
# season later in the notebook (index = year - 1985).
seeds_arr, slots_arr, games_arr = (
    mmg.filter_into_seasons(table) for table in (seeds, slots, games)
)

# NOTE: appending a separate 2016 results file
# (datasets/kaggle_data_2021/MNCAATourneyCompactResults2016.csv) to games_arr
# is left disabled.

In [23]:
# display the list of per-season game tables produced by filter_into_seasons
games_arr

[    Season  DayNum  WTeamID  WScore  LTeamID  LScore WLoc  NumOT
 0     1985     136     1116      63     1234      54    N      0
 1     1985     136     1120      59     1345      58    N      0
 2     1985     136     1207      68     1250      43    N      0
 3     1985     136     1229      58     1425      55    N      0
 4     1985     136     1242      49     1325      38    N      0
 ..     ...     ...      ...     ...      ...     ...  ...    ...
 58    1985     146     1385      69     1301      60    N      0
 59    1985     146     1437      56     1314      44    N      0
 60    1985     152     1207      77     1385      59    N      0
 61    1985     152     1437      52     1272      45    N      0
 62    1985     154     1437      66     1207      64    N      0
 
 [63 rows x 8 columns],
      Season  DayNum  WTeamID  WScore  LTeamID  LScore WLoc  NumOT
 63     1986     136     1133      83     1431      65    N      0
 64     1986     136     1177      72     1438  

In [3]:
# Predictor tables chosen in the variable selection notebook, loaded in the
# order the head to head model expects them: markov, rpi, bad losses.
predictor_dfs = [
    pd.read_csv(table_path, index_col=0)
    for table_path in (
        "datasets/our_data/stationary",
        "datasets/our_data/regular_season_rpi_matrix",
        "datasets/our_data/bad_losses_matrix",
    )
]
markov, rpi, bad_losses = predictor_dfs

# seed table
seed_matrix_df = pd.read_csv("datasets/our_data/team_summary_data/seeds_matrix", index_col=0)

# column names handed to mmtt.train_test_split together with predictor_dfs
predictor_names = ["min_index_id", "max_index_id", "markov", "rpi", "bad_losses"]

We have 3 predictors: markov, rpi, and bad losses. See our data exploration and model selection notebooks for more information on these.

In [27]:
# re-import the project modules so that edits to their source files take
# effect without restarting the kernel (mmc is not reloaded here)
importlib.reload(mmg)
importlib.reload(mmm)
importlib.reload(mmtt)

<module 'march_madness_train_and_tune' from '/Users/danielalpert/Projects/march_madness/march_madness_train_and_tune.py'>

### 1) Setup Head to Head Models

In [16]:
# display the first per-season game table
games_arr[0]

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,136,1116,63,1234,54,N,0
1,1985,136,1120,59,1345,58,N,0
2,1985,136,1207,68,1250,43,N,0
3,1985,136,1229,58,1425,55,N,0
4,1985,136,1242,49,1325,38,N,0
...,...,...,...,...,...,...,...,...
58,1985,146,1385,69,1301,60,N,0
59,1985,146,1437,56,1314,44,N,0
60,1985,152,1207,77,1385,59,N,0
61,1985,152,1437,52,1272,45,N,0


In [81]:
# window is passed straight to mmtt.train_test_split
window   = 5

# years to build a split for; max_year is exclusive, so 2014 through 2019
min_year = 2014
max_year = 2020
year_range = range(min_year, max_year)

# one train/test split per year, in year order
train_test_arr = [
    mmtt.train_test_split(
        window,
        year,
        seeds_arr,
        slots_arr,
        games_arr,
        predictor_names,
        predictor_dfs,
    )
    for year in year_range
]

24
2009
25
2010
26
2011
27
2012
28
2013
29
2014
25
2010
26
2011
27
2012
28
2013
29
2014
30
2015
26
2011
27
2012
28
2013
29
2014
30
2015
31
2016
27
2012
28
2013
29
2014
30
2015
31
2016
32
2017
28
2013
29
2014
30
2015
31
2016
32
2017
33
2018
29
2014
30
2015
31
2016
32
2017
33
2018
34
2019


Here we use a "windowing" approach: for each year, the head to head model is trained on a sliding window of the seasons leading up to it (`window = 5` in the cell above).

In [82]:
# C parameter of the logistic regression, cross validated in the variable
# selection notebook
c = 1
variables = ["markov", "rpi", "bad_losses"]

# fitted models and their scalers, one pair per split, in the same order as
# train_test_arr
models  = []
scalers = []

# Iterate the splits directly instead of indexing with `year - min_year`:
# year_range is reassigned further down the notebook, and re-running this cell
# afterwards would otherwise index past the end of train_test_arr.
for split in train_test_arr:
    # training predictors and labels (elements 0 and 1 of the split tuple);
    # the held-out elements are not needed to fit the model, so they are no
    # longer extracted and scaled here
    train_x = split[0][variables]
    train_y = split[1].values[:, 0]

    # standardise the predictors using the training data only
    scaler = StandardScaler().fit(train_x)
    scaled_train_x = scaler.transform(train_x)

    # fit the head to head model
    model = LogReg(C=c)
    model.fit(scaled_train_x, train_y)

    # keep the scaler with its model: the same scaling has to be applied at
    # prediction time
    models.append(model)
    scalers.append(scaler)



Fit head to head log reg model for predicting the outcomes of individual games.

---

### 2) Predict A Single Bracket

- a) Simulation, with Bias of .1 (Cross Validated in Other Files)

In [83]:
# one simulator per prediction year
simulators = []

# years to predict; this rebinds the year_range used by the cells above
year_range = range(2021,2022)

# NOTE(review): the model/scaler index is the position inside year_range, so
# 2021 uses models[0] -- the first fitted model, not the most recent one.
# Confirm this is intended.
for model_idx, year in enumerate(year_range):
    # season tables are indexed by offset from 1985
    season_idx = year - 1985
    seeds_year = seeds_arr[season_idx]
    slots_year = slots_arr[season_idx]

    # head to head model in simulation mode, with a higher seed bias of .1
    head_to_head_model_year = mmm.ModelPredictor(
        models[model_idx],
        scalers[model_idx],
        predictor_dfs,
        year,
        seeds_year,
        simulation=True,
        higher_seed_bias=True,
        higher_seed_bias_delta=.1,
    )

    simulators.append(mmc.Simulator(seeds_year, slots_year, head_to_head_model_year))

In [84]:
# Simulate each year's tournament, then derive the predicted bracket from the
# simulation results. `bracket` keeps the last year's prediction for the cells
# below.
for sim_idx, year in enumerate(year_range):
    simulator = simulators[sim_idx]

    # 300 is presumably the number of simulated tournaments -- TODO confirm
    points = simulator.simulate_tournament(300)

    bracket = simulator.predict_tournament()

    print(year)

2021


In [85]:
# Add integer-typed copies of the three team-id columns to each of the six
# round tables of the predicted bracket.
id_columns = (
    ('Strongseed Team', 'StrongSeedID'),
    ('Weakseed Team', 'WeakSeedID'),
    ('Prediction', 'PredictionID'),
)

for round_number in range(1, 7):
    round_df = getattr(bracket, 'round_{}_df'.format(round_number))
    for source_column, id_column in id_columns:
        round_df[id_column] = round_df[source_column].astype(int)

In [92]:
# Round 6 table with team names attached for the strong seed, the weak seed
# and the predicted winner.
team_lookup = teams[['TeamID', 'TeamName']]

championship = bracket.round_6_df
for id_column, suffix in (('StrongSeedID', '_strong'),
                          ('WeakSeedID', '_weak'),
                          ('PredictionID', '_pred')):
    # the suffix only applies when TeamID/TeamName already exist on the left,
    # so the first merge keeps the bare 'TeamName' column
    championship = championship.merge(team_lookup,
                                      left_on=id_column,
                                      right_on='TeamID',
                                      suffixes=['', suffix])

championship[['Slot', 'StrongSeed', 'Strongseed Team', 'TeamName',
              'WeakSeed', 'Weakseed Team', 'TeamName_weak', 'TeamName_pred']]

Unnamed: 0,Slot,StrongSeed,Strongseed Team,TeamName,WeakSeed,Weakseed Team,TeamName_weak,TeamName_pred
0,R6CH,R5WX,1211,Gonzaga,R5YZ,1228,Illinois,Gonzaga


- b) Results

In [None]:
# for comparison: build the actual bracket and the top-seed bracket of every
# year in year_range
top_seed_tourneys = []
actual_tourneys   = []

# The loop target supplies the year on every iteration, so the former
# `year = min_year` / `year = year + 1` bookkeeping (and the unused counter
# `i`) had no effect and has been dropped.
for year in year_range:
    # season tables are indexed by offset from 1985
    season_idx = year - 1985
    seeds_year = seeds_arr[season_idx]
    slots_year = slots_arr[season_idx]

    # bracket filled in from the recorded game results
    actual_model = mmm.ActualTournament(games_arr[season_idx])
    actual_tourneys.append(mmc.Tournament(seeds_year, slots_year, actual_model, include_scoring_dif=False))

    # baseline bracket from the basic (top seed) predictor
    top_seed_model = mmm.BasicPredictor()
    top_seed_tourneys.append(mmc.Tournament(seeds_year, slots_year, top_seed_model, include_scoring_dif=False))

In [None]:
# scores of the simulated bracket and of the top-seed baseline, one per year
simulator_scores = np.zeros(len(year_range))
top_seed_scores = np.zeros(len(year_range))

# Score every simulator against the actual bracket at the same position.
# The per-iteration seeds/slots/games lookups were never used, and they were
# keyed on a manual `year` counter starting at min_year, which does not follow
# year_range; they have been removed.
for idx, simulator in enumerate(simulators):
    # score_tournament returns a pair; only the first element (the score) is kept
    simulator_scores[idx], _unused = simulator.score_tournament(actual_tourneys[idx], print_res=True)
    top_seed_scores[idx], _unused = top_seed_tourneys[idx].score_tournament(actual_tourneys[idx], print_res=False)

In [None]:
# display the top-seed baseline scores (one entry per year in year_range)
top_seed_scores

In [None]:
# display the simulated-bracket scores (one entry per year in year_range)
simulator_scores

Our simulation method does better than just predicting the top seed, especially in 2016.

---

### 3) Predict 10 Brackets
- a) Ensemble Method

In [75]:
# One ensemble per year in year_range.
# NOTE(review): the model/scaler index is the position inside year_range, not
# the position of `year` among the fitted years -- confirm this is intended.
ensembles = [
    mmc.Ensemble(
        seeds_arr[year - 1985],
        slots_arr[year - 1985],
        models[model_idx],
        scalers[model_idx],
        predictor_dfs,
        year,
    )
    for model_idx, year in enumerate(year_range)
]

In [79]:
# buffers
# ensemble_scores holds one value per year; ind_bracket_scores holds one row
# per year with 10 columns (presumably one per bracket in the ensemble --
# TODO confirm)
ensemble_scores = np.zeros(len(year_range))
ind_bracket_scores = np.zeros((len(year_range), 10))

# per-year buffers for the bracket-difference statistics; the lines that would
# fill them are commented out below, so they stay all zeros
ensemble_dif_matrix = np.zeros((len(year_range), 10, 10))
ensemble_avg_dif = np.zeros((len(year_range), 10))
ensemble_dif_top_seed = np.zeros((len(year_range), 10))

i = 0
for year in year_range:
    # get data from our db
    # (seeds_year and slots_year are not used in this loop)
    seeds_year = seeds_arr[year-1985] 
    slots_year = slots_arr[year-1985] 
    #games_year = games_arr[year-1985]
    
    # setup ensembles
    ensemble_tournament = ensembles[i]
    
    # individual bracket scores
    # NOTE(review): this assignment is commented out, so row i of
    # ind_bracket_scores is still all zeros when the max is taken below
    #ind_bracket_scores[i, :] = ensemble_tournament.score_tournament(actual_tourneys[i])
    
    # ensemble score
    # best individual bracket score of the year; 0 while the scoring line above
    # is disabled
    ensemble_scores[i] = np.max(ind_bracket_scores[i, :])
    
    # difference between brackets
    #ensemble_dif_matrix[i, :, :] = ensemble_tournament.compute_dif_matrix(actual_tourneys[i]) 
    #ensemble_avg_dif[i, :] = ensemble_tournament.avg_game_dif()
    #ensemble_dif_top_seed[i, :] =  ensemble_tournament.compute_dif_vect(actual_tourneys[i], top_seed_tourneys[i])
    
    i = i + 1

In [80]:
# compare the three scoring approaches side by side
# NOTE: simulator_scores and top_seed_scores only exist once the scoring cell
# in section 2b has been run
print(ensemble_scores)
print(simulator_scores)
print(top_seed_scores)

[0.]


NameError: name 'simulator_scores' is not defined

We see that the ensemble gives us a big boost in 2016, showing that the diversity of brackets is helpful.

---

### 4) Compare Our Brackets to the Field

- 2016
    - Ensemble: 1260 is the 99.2 percentile for 2016.
    - Single Bracket: 1000 is the 95 percentile for 2016.

- 2015
    - Ensemble: 980 is the 76 percentile for 2015.
    - Single Bracket: 960 is the 75 percentile for 2015.

---
### 5) Save Our Predictions

Send Predictions to CSVs for use on the website.

In [190]:
def add_team_name(bracket, teams):
    """Attach human-readable team names to a bracket table.

    Adds three columns to ``bracket`` -- "Strongseed Team Name",
    "Weakseed Team Name" and "Prediction Team Name" -- by looking up the ids
    stored in "Strongseed Team", "Weakseed Team" and "Prediction".

    Args:
        bracket: DataFrame with the three id columns named above. Each id is
            converted with ``int`` before the lookup. Modified in place.
        teams: DataFrame mapping team ids to team names. Both the
            ``Team_Id``/``Team_Name`` column pair and the
            ``TeamID``/``TeamName`` pair (the one used by the teams table
            loaded in this notebook) are accepted.

    Returns:
        The same ``bracket`` object with the three name columns added.

    Raises:
        KeyError: if ``teams`` has neither supported column pair, or
            ``bracket`` lacks one of the id columns.
        IndexError: if an id in ``bracket`` has no entry in ``teams``
            (same exception type as the original per-row lookup).
    """
    # pick whichever id/name column pair this teams table provides
    for id_column, name_column in (("Team_Id", "Team_Name"), ("TeamID", "TeamName")):
        if id_column in teams.columns and name_column in teams.columns:
            break
    else:
        raise KeyError("teams must have Team_Id/Team_Name or TeamID/TeamName columns")

    # Build the id -> name map once instead of filtering the whole teams table
    # three times per bracket row. setdefault keeps the first name when an id
    # appears more than once, matching the former `.values[0]` lookup.
    name_by_id = {}
    for team_id, team_name in zip(teams[id_column].tolist(), teams[name_column].tolist()):
        name_by_id.setdefault(team_id, team_name)

    def names_for(column):
        """Return the team name for every id stored in bracket[column]."""
        names = []
        for raw_id in bracket[column]:
            team_id = int(raw_id)
            if team_id not in name_by_id:
                raise IndexError("no team with id {} in teams".format(team_id))
            names.append(name_by_id[team_id])
        return names

    # resolve all three columns before touching bracket, so a failed lookup
    # leaves it unmodified
    strong_seed_names = names_for("Strongseed Team")
    weak_seed_names   = names_for("Weakseed Team")
    prediction_names  = names_for("Prediction")

    bracket["Strongseed Team Name"] = strong_seed_names
    bracket["Weakseed Team Name"] = weak_seed_names
    bracket["Prediction Team Name"] = prediction_names

    return bracket

In [193]:
# save predictions

# Write one single-bracket prediction file per simulated year. The former
# hard-coded simulators[0]/simulators[1] with 2015/2016 file names only worked
# when exactly those two years had been simulated; pairing each simulator with
# its year in year_range produces the same file names for that case and keeps
# working for any other range.
for year, simulator in zip(year_range, simulators):
    prediction = simulator.tournament_prediction.entire_bracket
    prediction = add_team_name(prediction, teams)
    prediction.to_csv("datasets/predictions/{}_single_bracket_prediction.csv".format(year))

In [198]:
# Write every bracket of every year's ensemble to its own CSV file.
for ensemble_idx, year in enumerate(year_range):
    ensemble = ensembles[ensemble_idx]

    for bracket_number, tourney in enumerate(ensemble.tourney_arr):
        filepath = "datasets/predictions/{}_ensemble_bracket_{}_prediction.csv".format(year, bracket_number)

        # add team names, then save
        named_bracket = add_team_name(tourney.entire_bracket, teams)
        named_bracket.to_csv(filepath)

In [199]:
# Write the actual bracket of each year, with team names added.
for tourney_idx, year in enumerate(year_range):
    filepath = "datasets/predictions/{}_actual_results".format(year)

    actual_results = add_team_name(actual_tourneys[tourney_idx].entire_bracket, teams)
    actual_results.to_csv(filepath)

In [200]:
# Write the top-seed baseline bracket of each year, with team names added.
for tourney_idx, year in enumerate(year_range):
    filepath = "datasets/predictions/{}_low_seed_prediction".format(year)

    top_seed_results = add_team_name(top_seed_tourneys[tourney_idx].entire_bracket, teams)
    top_seed_results.to_csv(filepath)