In [1]:
import numpy as np
import pandas as pd
import itertools
from numpy.random import default_rng
from tqdm import tqdm

# Load data and models

In [2]:
# numpy random generator
rng = default_rng()

In [3]:
# Read in the kaggle data
seeds = pd.read_csv("data/kaggle_data/MNCAATourneySeeds.csv",dtype={'TeamID':str})
slots = pd.read_csv("data/kaggle_data/MNCAATourneySlots.csv")
games = pd.read_csv("data/kaggle_data/MNCAATourneyCompactResults.csv",dtype={'WTeamID':str,'LTeamID':str})
teams = pd.read_csv("data/kaggle_data/MTeams.csv",dtype={'TeamID':str})

In [4]:
# Read in predictors 
pred_df = pd.read_csv('data/generated_data/team_season_features.csv')

In [5]:
# Read in the model(s)... need to modify model notebook to save final one
# todo....

In [6]:
def sorted_ids(r,c1,c2):
    return ','.join([x for x in sorted([str(r[c1]),str(r[c2])])])

# need for 2021 game where VCU couldn't play due to COVID protocals
def fix_actual_winner(r):
    if r['Season'] == 2021 and r['sorted_ids'] == '1332,1433' :
        return '1332'
    return r['actual_winner']

# gets all possible tournament matchups in a season
def get_combos(season):
    team_ids = seeds[seeds.Season==season]['TeamID'].unique()
    return [comb for comb in itertools.combinations(team_ids,2)]


In [8]:
games['sorted_ids'] = games.apply(lambda r: sorted_ids(r,'WTeamID','LTeamID'),axis=1)
slots['Round'] = slots['Slot'].apply(lambda x:  int(x[1]) if x[0] == 'R' else 0)

# Make dataframe with predictions for every possible matchup in each tournament

In [62]:
# dummy data for now, need to read in real predictors and then apply best model

# precomputed dataframe
pred_df = pd.DataFrame([(season, team1, team2) for season in games[games.Season > 2001].Season.unique() for team1,team2 in get_combos(season)], columns=['Season','TeamID_1','TeamID_2'])
pred_df['team1_seed'] = pred_df.merge(seeds,how='left',left_on=['Season','TeamID_1'],right_on=['Season','TeamID'])['Seed']
pred_df['team2_seed'] = pred_df.merge(seeds,how='left',left_on=['Season','TeamID_2'],right_on=['Season','TeamID'])['Seed']
pred_df['team1_seed_num'] = pred_df['team1_seed'].str.extract('(\d+)', expand=False).map(int)
pred_df['team2_seed_num'] = pred_df['team2_seed'].str.extract('(\d+)', expand=False).map(int)

# calculate with model once have predictor data
pred_df['team1_win_prob'] = 1
#pred_df['team1_win_prob'] = pred_df.apply(lambda x: rng.random(), axis=1)
pred_df['team2_win_prob'] = 1-pred_df['team1_win_prob']
pred_df['team1_score_diff'] = pred_df['team1_win_prob'].apply(lambda x: (x-0.5)*30)

pred_df['model_winner'] = pred_df.apply(lambda x: x['TeamID_1'] if x['team1_win_prob'] > 0.5 else x['TeamID_2'], axis=1)
pred_df['seed_winner'] = pred_df.apply(lambda x: x['TeamID_1'] if x['team1_seed_num'] <= x['team2_seed_num'] else x['TeamID_2'], axis=1)
pred_df['model_winner_prob'] = pred_df.apply(lambda x: x['team1_win_prob'] if x['model_winner']==x['TeamID_1'] else x['team2_win_prob'], axis=1)
pred_df['seed_winner_prob'] = pred_df.apply(lambda x: x['team1_win_prob'] if x['seed_winner']==x['TeamID_1'] else x['team2_win_prob'], axis=1)
pred_df['actual_winner_prob'] = 1
pred_df['sorted_ids'] = pred_df.apply(lambda r: sorted_ids(r,'TeamID_1','TeamID_2'),axis=1)
pred_df['actual_winner'] = pred_df.merge(games,how='left',on=['Season','sorted_ids'])['WTeamID'].fillna('NA')
pred_df['actual_winner'] = pred_df.apply(lambda r: fix_actual_winner(r),axis=1)

In [63]:
# add simulation columns? may not be worth it since past work showed very similar to greedy

In [64]:
def run_sim(r):
    return np.random.choice([r['TeamID_1'],r['TeamID_2']],p=[r['team1_win_prob'],1-r['team1_win_prob']])

In [17]:
for i in tqdm(range(5)):
    pred_df['simulation_'+str(i)] = pred_df.apply(lambda r: run_sim(r),axis=1)

100%|██████████| 5/5 [00:10<00:00,  2.19s/it]


# Bracket class - may want to add something 

In [None]:
# need to add expected points

In [65]:
class Bracket:
    def __init__(self, season, seeds, slots, teams, prediction_df):
        self.season = season
        self.teams= teams
        self.prediction_df = prediction_df[prediction_df.Season==season].copy()
        self.seeds = seeds[seeds.Season==season].copy()
        self.slots = slots[slots.Season==season].copy()
        self.slots['Round'] = self.slots['Slot'].apply(lambda x:  int(x[1]) if x[0] == 'R' else 0)

        self.r0 = self.slots[self.slots['Round']==0].reset_index(drop=True)
        self.r1 = self.slots[self.slots['Round']==1].reset_index(drop=True)
        self.r2 = self.slots[self.slots['Round']==2].reset_index(drop=True)
        self.r3 = self.slots[self.slots['Round']==3].reset_index(drop=True)
        self.r4 = self.slots[self.slots['Round']==4].reset_index(drop=True)
        self.r5 = self.slots[self.slots['Round']==5].reset_index(drop=True)
        self.r6 = self.slots[self.slots['Round']==6].reset_index(drop=True)
        self.bracket = self.slots.copy()
        
    def fill_bracket(self,predictor):
        
        # playin round
        self.r0['TeamID_1'] = pd.merge(self.r0,self.seeds,how='left',left_on=['StrongSeed'],right_on=['Seed'])['TeamID']
        self.r0['team1_prob'] = 1
        self.r0['TeamID_2'] = pd.merge(self.r0,self.seeds,how='left',left_on=['WeakSeed'],right_on=['Seed'])['TeamID']
        self.r0['team2_prob'] = 1
        self.r0['sorted_ids'] = self.r0.apply(lambda r: sorted_ids(r,'TeamID_1','TeamID_2'),axis=1)
        self.r0[['Winner','curr_win_prob']] = pd.merge(self.r0,self.prediction_df,how='left',on=['sorted_ids'])[[predictor,predictor+'_prob']]
        self.r0['pre_win_prob'] = 1
        self.r0['cumm_win_prob'] = self.r0['curr_win_prob']
        
        playin_seeds = self.r0[['Season','Slot','Winner']].copy()
        playin_seeds.columns = ['Season','Seed','TeamID']
        self.new_seeds = pd.concat([self.seeds,playin_seeds],ignore_index=True)
                               
        self.r1['TeamID_1'] = pd.merge(self.r1,self.new_seeds,how='left',left_on=['StrongSeed'],right_on=['Seed'])['TeamID']
        self.r1['team1_prob'] = 1
        self.r1['TeamID_2'] = pd.merge(self.r1,self.new_seeds,how='left',left_on=['WeakSeed'],right_on=['Seed'])['TeamID']
        self.r1['team2_prob'] = 1
        self.r1['sorted_ids'] = self.r1.apply(lambda r: sorted_ids(r,'TeamID_1','TeamID_2'),axis=1)
        self.r1[['Winner','curr_win_prob']] = pd.merge(self.r1,self.prediction_df,how='left',on=['sorted_ids'])[[predictor,predictor+'_prob']]
        self.r1['pre_win_prob'] = 1
        self.r1['cumm_win_prob'] = self.r1['curr_win_prob']
        
        # can probably condense to for loop for subsequent rounds
        self.r2[['TeamID_1','team1_prob']] = pd.merge(self.r2,self.r1,how='left',left_on=['StrongSeed'],right_on=['Slot'])[['Winner','cumm_win_prob']]
        self.r2[['TeamID_2','team2_prob']] = pd.merge(self.r2,self.r1,how='left',left_on=['WeakSeed'],right_on=['Slot'])[['Winner','cumm_win_prob']]
        self.r2['sorted_ids'] = self.r2.apply(lambda r: sorted_ids(r,'TeamID_1','TeamID_2'),axis=1)
        self.r2[['Winner','curr_win_prob']] = pd.merge(self.r2,self.prediction_df,how='left',on=['sorted_ids'])[[predictor,predictor+'_prob']]
        self.r2['pre_win_prob'] = self.r2.apply(lambda r: r['team1_prob'] if r['Winner']==r['TeamID_1'] else r['team2_prob'],axis=1)
        self.r2['cumm_win_prob'] = self.r2['pre_win_prob']*self.r2['curr_win_prob']
        
        self.r3[['TeamID_1','team1_prob']] = pd.merge(self.r3,self.r2,how='left',left_on=['StrongSeed'],right_on=['Slot'])[['Winner','cumm_win_prob']]
        self.r3[['TeamID_2','team2_prob']] = pd.merge(self.r3,self.r2,how='left',left_on=['WeakSeed'],right_on=['Slot'])[['Winner','cumm_win_prob']]
        self.r3['sorted_ids'] = self.r3.apply(lambda r: sorted_ids(r,'TeamID_1','TeamID_2'),axis=1)
        self.r3[['Winner','curr_win_prob']] = pd.merge(self.r3,self.prediction_df,how='left',on=['sorted_ids'])[[predictor,predictor+'_prob']]
        self.r3['pre_win_prob'] = self.r3.apply(lambda r: r['team1_prob'] if r['Winner']==r['TeamID_1'] else r['team2_prob'],axis=1)
        self.r3['cumm_win_prob'] = self.r3['pre_win_prob']*self.r3['curr_win_prob']
        
        self.r4[['TeamID_1','team1_prob']] = pd.merge(self.r4,self.r3,how='left',left_on=['StrongSeed'],right_on=['Slot'])[['Winner','cumm_win_prob']]
        self.r4[['TeamID_2','team2_prob']] = pd.merge(self.r4,self.r3,how='left',left_on=['WeakSeed'],right_on=['Slot'])[['Winner','cumm_win_prob']]
        self.r4['sorted_ids'] = self.r4.apply(lambda r: sorted_ids(r,'TeamID_1','TeamID_2'),axis=1)
        self.r4[['Winner','curr_win_prob']] = pd.merge(self.r4,self.prediction_df,how='left',on=['sorted_ids'])[[predictor,predictor+'_prob']]
        self.r4['pre_win_prob'] = self.r4.apply(lambda r: r['team1_prob'] if r['Winner']==r['TeamID_1'] else r['team2_prob'],axis=1)
        self.r4['cumm_win_prob'] = self.r4['pre_win_prob']*self.r4['curr_win_prob']
        
        self.r5[['TeamID_1','team1_prob']] = pd.merge(self.r5,self.r4,how='left',left_on=['StrongSeed'],right_on=['Slot'])[['Winner','cumm_win_prob']]
        self.r5[['TeamID_2','team2_prob']] = pd.merge(self.r5,self.r4,how='left',left_on=['WeakSeed'],right_on=['Slot'])[['Winner','cumm_win_prob']]
        self.r5['sorted_ids'] = self.r5.apply(lambda r: sorted_ids(r,'TeamID_1','TeamID_2'),axis=1)
        self.r5[['Winner','curr_win_prob']] = pd.merge(self.r5,self.prediction_df,how='left',on=['sorted_ids'])[[predictor,predictor+'_prob']]
        self.r5['pre_win_prob'] = self.r5.apply(lambda r: r['team1_prob'] if r['Winner']==r['TeamID_1'] else r['team2_prob'],axis=1)
        self.r5['cumm_win_prob'] = self.r5['pre_win_prob']*self.r5['curr_win_prob']
        
        self.r6[['TeamID_1','team1_prob']] = pd.merge(self.r6,self.r5,how='left',left_on=['StrongSeed'],right_on=['Slot'])[['Winner','cumm_win_prob']]
        self.r6[['TeamID_2','team2_prob']] = pd.merge(self.r6,self.r5,how='left',left_on=['WeakSeed'],right_on=['Slot'])[['Winner','cumm_win_prob']]
        self.r6['sorted_ids'] = self.r6.apply(lambda r: sorted_ids(r,'TeamID_1','TeamID_2'),axis=1)
        self.r6[['Winner','curr_win_prob']] = pd.merge(self.r6,self.prediction_df,how='left',on=['sorted_ids'])[[predictor,predictor+'_prob']]
        self.r6['pre_win_prob'] = self.r6.apply(lambda r: r['team1_prob'] if r['Winner']==r['TeamID_1'] else r['team2_prob'],axis=1)
        self.r6['cumm_win_prob'] = self.r6['pre_win_prob']*self.r6['curr_win_prob']
        
        self.bracket = pd.concat([self.r0,self.r1,self.r2,self.r3,self.r4,self.r5,self.r6],ignore_index=True)
        self.bracket['TeamName_1'] = pd.merge(self.bracket,self.teams,how='left',left_on=['TeamID_1'],right_on=['TeamID'])['TeamName']
        self.bracket['TeamName_2'] = pd.merge(self.bracket,self.teams,how='left',left_on=['TeamID_2'],right_on=['TeamID'])['TeamName']
        self.bracket['WinnerName'] = pd.merge(self.bracket,self.teams,how='left',left_on=['Winner'],right_on=['TeamID'])['TeamName']
        self.bracket['potential_pts'] = self.bracket['Round'].apply(lambda x: 0 if x == 0 else 320/(64/(2**x)))
        self.bracket['expected_pts'] = self.bracket['cumm_win_prob']*self.bracket['potential_pts']

        return
    
    def score(self,actual):
        self.bracket[['Winner_actual','WinnerName_actual']] = pd.merge(self.bracket,actual.bracket[['Season','Slot','Winner','WinnerName']],how='left',on=['Season','Slot'],suffixes=['_predicted','_actual'])[['Winner_actual','WinnerName_actual']]
        self.bracket['correct'] = self.bracket.apply(lambda r: int(r['Winner']==r['Winner_actual']),axis=1)
        self.bracket['pts'] = self.bracket['potential_pts']*self.bracket['correct']
        return
    
    def get_team_scores(self):
        return self.bracket.groupby('Winner',as_index=False)['pts'].sum()
    
        

# Test out different strategies - need to incorporate people's bracket / who picked whom

In [66]:
a = Bracket(2021,seeds,slots,teams,pred_df)

In [67]:
a.fill_bracket('actual_winner')

In [68]:
a.bracket['expected_pts'].sum()

1920.0

In [69]:
b = Bracket(2021,seeds,slots,teams,pred_df)

In [70]:
b.fill_bracket('seed_winner')

In [71]:
b.bracket['expected_pts'].sum()

1920.0

In [32]:
def run_tourney(season,predictor,actual):
    b = Bracket(season,seeds,slots,teams,pred_df)
    b.fill_bracket(predictor)
    b.score(actual)
    round_score_df = b.bracket.groupby('Round')['pts'].sum()
    round_scores = {x:round_score_df[x] for x in range(1,7)}
    round_scores['total'] = b.bracket['pts'].sum()
    return round_scores
    

In [33]:
scores = {}
for season in range(2021,2022):
    scores[season] = {}
    actual = Bracket(season,seeds,slots,teams,pred_df)
    actual.fill_bracket('actual_winner')
    predictors = ['seed_winner','model_winner']
    for p in predictors:
        scores[season][p] = run_tourney(season,p,actual)

In [34]:
scores

{2021: {'seed_winner': {1: 220.0,
   2: 140.0,
   3: 160.0,
   4: 160.0,
   5: 0.0,
   6: 0.0,
   'total': 680.0},
  'model_winner': {1: 130.0,
   2: 80.0,
   3: 80.0,
   4: 0.0,
   5: 0.0,
   6: 0.0,
   'total': 290.0}}}

In [493]:
results = pd.DataFrame([x for x in range(2009,2022)],columns=['Season'])
predictors = ['seed_winner','model_winner']
for p in predictors:
    results[p] = results['Season'].apply(lambda x: scores[x][p]['total'])

In [None]:
# make some visualization of points for each strategy in each year

# Create 2022 Bracket

In [557]:
bracket_2022 = Bracket(2022,seeds,slots,teams,pred_df)
bracket_2022.fill_bracket()

In [558]:
# Look at bracket
bracket_2022.bracket

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed,Round
