In [283]:
import os
import time
import pandas as pd  
import numpy as np
import re
import sqlite3

#### Inputs  
  
prediction_probabilities.csv: this is a file with all possible 2017 matchups and a predicted probability that the stronger seed wins.  
slots_2017.csv: from Kaggle. Has the slots/matchups for the tournament.

#### Clean up the win probabilities data set

In [356]:
win_prob_var_touse = "rforest_prob" #set this to whatever variable has the win probabilities for the matchup

In [357]:
win_probs = pd.read_csv("Simulations/prediction_probabilities.csv")
slots_2017 = pd.read_csv("Simulations/slots_2017.csv").drop('index',axis = 1)

win_probs.loc[:,"Win_Prob"] = win_probs[win_prob_var_touse]
win_probs.loc[:,'SS_Win'] = np.where(win_probs['Win_Prob'] >= .5,1,0)
win_probs.loc[:,'WinTeam'] = np.where(win_probs.SS_Win == 1,win_probs.SS_team,win_probs.WS_team)
win_probs.loc[:,'Winning_Seed'] = np.where(win_probs.SS_Win == 1,win_probs.SS_Seed.str[0:3],win_probs.WS_Seed.str[0:3])
win_probs = win_probs.drop(['rforest_vals','rforest_prob'],axis=1)

#mt. st marys beat new orleans
win_probs.ix[((win_probs.SS_team == 1291)&(win_probs.WS_team == 1309)),'SS_Win'] = 1
#k state beat wake forest
win_probs.ix[((win_probs.SS_team == 1243)&(win_probs.WS_team == 1448)),'SS_Win'] = 1
#nc central (1300) and uc davis
win_probs.ix[((win_probs.SS_team == 1300)&(win_probs.WS_team == 1413)),'SS_Win'] = 0
#providence (1344) and usc
win_probs.ix[((win_probs.SS_team == 1344)&(win_probs.WS_team == 1425)),'SS_Win'] = 0

##### Reassign Round to 0 for play in games and remove the a/b from Seed for rounds 1 and later.

In [358]:
slots_2017.ix[slots_2017.Strongseed.str.contains("a|b",regex=True), 'Round'] = 0
slots_2017_noplayins = slots_2017[slots_2017.Round != 0]
slots_2017_noplayins.loc[:,"Strongseed"] = np.where(slots_2017_noplayins.Round == 1, slots_2017_noplayins.Strongseed.str[0:3],slots_2017_noplayins.Strongseed)
slots_2017_noplayins.loc[:,"Weakseed"] = np.where(slots_2017_noplayins.Round == 1, slots_2017_noplayins.Weakseed.str[0:3],slots_2017_noplayins.Weakseed)

In [359]:
slots_2017[slots_2017.Strongseed.str.contains("a|b",regex=True)]
slots_2017_noplayins[slots_2017_noplayins.Strongseed.str.contains("a|b",regex=True)]

Unnamed: 0,Season,Slot,Strongseed,Weakseed,Round


#### Remove the letter from the seed of the 4 play in game winners (in the win probs dataset)

In [360]:
play_in_winners = pd.merge(slots_2017[slots_2017.Round == 0],win_probs,
                           left_on=['Strongseed','Weakseed'],
                           right_on=['SS_Seed','WS_Seed'],how='inner').WinTeam.tolist()
play_in_winners

[1344, 1291, 1300, 1243]

In [361]:
for team in play_in_winners:
    win_probs.ix[win_probs.WS_team == team, 'WS_Seed'] = win_probs.WS_Seed.str[0:3]
    win_probs.ix[win_probs.SS_team == team, 'SS_Seed'] = win_probs.SS_Seed.str[0:3]

In [362]:
#delete the games that had the losing play-in game teams
playin_teams_to_exclude = [1309,1448,1300,1344]
win_probs = win_probs[~((win_probs.SS_team.isin(playin_teams_to_exclude))|(win_probs.WS_team.isin(playin_teams_to_exclude)))]
#strip the letters off
win_probs.loc[:,"SS_Seed"] = win_probs.SS_Seed.str[0:3]
win_probs.loc[:,"WS_Seed"] = win_probs.WS_Seed.str[0:3]

# Simulation Set Up

#### This function will take the previous round and output a dataset that is the current round matchups and winner

In [371]:
def get_round_results(previous_round,roundnum):

    current_round = slots_for_simulation[(slots_for_simulation.Round ==roundnum)].copy()
    current_round_long = pd.merge(pd.melt(current_round,id_vars=['Season','Slot'],value_vars=['Strongseed','Weakseed'],value_name='RSlot'),
            previous_round[['Season','Slot','Winning_Seed']],left_on=['Season','RSlot'],right_on=['Season','Slot'],how='left') 
    current_round_long = current_round_long[['Season','Slot_x','Winning_Seed']].copy()
    current_round_long = current_round_long.rename(columns={"Slot_x":"Slot"})
    current_round_long.loc[:,"SeedNum"] = current_round_long.Winning_Seed.str[1:3].astype(int)
    current_round_long.loc[:,"Region"] = current_round_long.Winning_Seed.str[0:1]
    current_round_long = current_round_long.sort_values(['Season','Slot','SeedNum','Region']).drop(['SeedNum','Region'],axis=1)
    current_round_long.loc[:,'Is_WS'] = current_round_long.groupby(['Season','Slot']).cumcount()
    current_round_long_0 = current_round_long[current_round_long.Is_WS == 0].copy().rename(columns={"Winning_Seed":"SS_Seed"}).drop(['Is_WS'],axis=1)
    current_round_long_1 = current_round_long[current_round_long.Is_WS == 1].copy().rename(columns={"Winning_Seed":"WS_Seed"}).drop(['Is_WS'],axis=1)
    current_round = pd.merge(current_round_long_0,current_round_long_1,on=['Season','Slot'])
    current_round.loc[:,"Round"] = roundnum
    return(pd.merge(current_round,win_probs_for_simulation,on=['SS_Seed','WS_Seed'],how='left'))

#### Helper function for sampling win/loss in the simulations

In [None]:
def get_randomized_prob(theta):
    return(np.random.binomial(1, theta,size=None))

#### Run through the bracket based on the win probability just to make sure it works before running the simulations

In [None]:
slots_r1 = pd.merge(slots_for_simulation[slots_for_simulation.Round == 1],win_probs_for_simulation,
         left_on=['Strongseed','Weakseed'],right_on=['SS_Seed','WS_Seed'],how='left')
slots_r2 = get_round_results(slots_r1,2)
slots_r3 = get_round_results(slots_r2,3)
slots_r4 = get_round_results(slots_r3,4)
slots_r5 = get_round_results(slots_r4,5)
slots_r6 = get_round_results(slots_r5,6)

slots_complete = pd.concat([slots_r1, slots_r2, slots_r3, slots_r4, slots_r5, slots_r6], 
                           ignore_index=False).reset_index().drop(['index'],axis=1).drop(['Strongseed','Weakseed'],axis=1)

# Start of simulation

In [418]:
N = 1000

# set up dataframes to hold results
champions = pd.DataFrame(columns=['n','Winner'])
final4winners = pd.DataFrame(columns=['n','Winner'])
elite8winners = pd.DataFrame(columns=['n','Winner'])
sweet16winners = pd.DataFrame(columns=['n','Winner'])
round32winners = pd.DataFrame(columns=['n','Winner'])
round64winners = pd.DataFrame(columns=['n','Winner'])

for n in range(N):

    win_probs_for_simulation = win_probs.copy()
    slots_for_simulation = slots_2017_noplayins.copy()
    
    #draw random win/losses based on the win probablitity and set up the "winner" variables
    win_probs_for_simulation.loc[:,"n_win_prob"] = win_probs_for_simulation.Win_Prob.apply(get_randomized_prob)
    win_probs_for_simulation.loc[:,"Win_Prob"] = win_probs_for_simulation["n_win_prob"]
    win_probs_for_simulation.loc[:,'SS_Win'] = np.where(win_probs_for_simulation['Win_Prob'] >= .5,1,0)
    win_probs_for_simulation.loc[:,'WinTeam'] = np.where(win_probs_for_simulation.SS_Win == 1,win_probs_for_simulation.SS_team,win_probs_for_simulation.WS_team)
    win_probs_for_simulation.loc[:,'Winning_Seed'] = np.where(win_probs_for_simulation.SS_Win == 1,win_probs_for_simulation.SS_Seed.str[0:3],win_probs_for_simulation.WS_Seed.str[0:3])
    
    #run through the bracket to find the matchups that occurred and their winner
    slots_r1 = pd.merge(slots_for_simulation[slots_for_simulation.Round == 1],win_probs_for_simulation,
             left_on=['Strongseed','Weakseed'],right_on=['SS_Seed','WS_Seed'],how='left')
    slots_r2 = get_round_results(slots_r1,2)
    slots_r3 = get_round_results(slots_r2,3)
    slots_r4 = get_round_results(slots_r3,4)
    slots_r5 = get_round_results(slots_r4,5)
    slots_r6 = get_round_results(slots_r5,6)

    slots_complete = pd.concat([slots_r1, slots_r2, slots_r3, slots_r4, slots_r5, slots_r6], 
                               ignore_index=False).reset_index().drop(['index'],axis=1).drop(['Strongseed','Weakseed'],axis=1)
    
    #set up for storing results
    slots_complete.loc[:,"Winner"] = np.where(slots_complete.WinTeam == slots_complete.WS_team,slots_complete.WS_Name,slots_complete.SS_Name)
    slots_complete.loc[:,"n"] = n
    
    #add winners of each round to the appropriate data frame
    champions = champions.append(slots_complete[slots_complete.Round == 6][['Winner','n']], ignore_index=True)
    final4winners = final4winners.append(slots_complete[slots_complete.Round == 5][['Winner','n']], ignore_index=True)
    elite8winners = elite8winners.append(slots_complete[slots_complete.Round == 4][['Winner','n']], ignore_index=True)
    sweet16winners = sweet16winners.append(slots_complete[slots_complete.Round == 3][['Winner','n']], ignore_index=True)
    round32winners = round32winners.append(slots_complete[slots_complete.Round == 2][['Winner','n']], ignore_index=True)
    round64winners = round64winners.append(slots_complete[slots_complete.Round == 1][['Winner','n']], ignore_index=True)

# get counts. These are the number of times a team made it to that round and won. The pct is the predicted chance of winning in that round.
champions.loc[:,'counter'] = 1
champions_pcts = champions[['Winner','counter']].groupby('Winner').sum().reset_index().sort_values('counter',ascending=False)
champions_pcts.loc[:,"Chance_of_Winning"] = champions_pcts.counter / N
final4winners.loc[:,'counter'] = 1
final4winners_pcts = final4winners[['Winner','counter']].groupby('Winner').sum().reset_index().sort_values('counter',ascending=False)
final4winners_pcts.loc[:,"Chance_of_Winning"] = final4winners_pcts.counter / N
elite8winners.loc[:,'counter'] = 1
elite8winners_pcts = elite8winners[['Winner','counter']].groupby('Winner').sum().reset_index().sort_values('counter',ascending=False)
elite8winners_pcts.loc[:,"Chance_of_Winning"] = elite8winners_pcts.counter / N
sweet16winners.loc[:,'counter'] = 1
sweet16winners_pcts = sweet16winners[['Winner','counter']].groupby('Winner').sum().reset_index().sort_values('counter',ascending=False)
sweet16winners_pcts.loc[:,"Chance_of_Winning"] = sweet16winners_pcts.counter / N
round32winners.loc[:,'counter'] = 1
round32winners_pcts = round32winners[['Winner','counter']].groupby('Winner').sum().reset_index().sort_values('counter',ascending=False)
round32winners_pcts.loc[:,"Chance_of_Winning"] = round32winners_pcts.counter / N
round64winners.loc[:,'counter'] = 1
round64winners_pcts = round64winners[['Winner','counter']].groupby('Winner').sum().reset_index().sort_values('counter',ascending=False)
round64winners_pcts.loc[:,"Chance_of_Winning"] = round64winners_pcts.counter / N

#### Villanova is predicted to be the overall winner with a 19% chance of winning the tournament.  
  
To fill out a bracket, start at the championship, filling in the team with the highest chance of winning. Then look at the final4winners dataframe and fill in the top teams. Then move on to elite8, etc. 

In [434]:
champions_pcts

Unnamed: 0,Winner,counter,Chance_of_Winning
41,VILLANOVA,191,0.191
23,NORTH CAROLINA,88,0.088
0,ARIZONA,70,0.07
11,KANSAS,56,0.056
28,OREGON,50,0.05
2,BUTLER,49,0.049
6,DUKE,48,0.048
1,BAYLOR,47,0.047
9,GONZAGA,46,0.046
13,KENTUCKY,29,0.029
