
### NCAA Tournament Simulation using historical probabilities

- https://en.wikipedia.org/wiki/NCAA_Division_I_Men%27s_Basketball_Tournament
- https://www.betfirm.com/seeds-national-championship-odds/

In [1]:
import json
import random
import pandas as pd

In [2]:
# Round names in bracket order. play_regional() plays every round except
# the last one; the last name labels the single team left at the end.
# The full list is also used as the column labels of the summary table.
rounds = [
    '1st Round',
    '2nd Round',
    'Sweet Sixteen',
    'Regional Finals',
    'Final Four',
]

# Historical win percentages. get_probability() reads this as
# historical_probabilities[str(seed)][round name][str(opponent seed)].
#
# The name is deliberately NOT pre-bound to an empty dict: if the file is
# missing or malformed, a leftover {} would make get_probability() fall back
# to a weight of 1 for every matchup, so later cells would silently simulate
# coin flips instead of failing.
with open('../../data/ncaa/historical_tourny_win_perc.json', 'rb') as tourny_percs:
    historical_probabilities = json.load(tourny_percs)

In [3]:
def chunk(items, chunk_size):
    """Split ``items`` into consecutive groups of ``chunk_size`` elements.

    Each group is sorted in ascending order and returned as a tuple. The
    final group is shorter when ``len(items)`` is not a multiple of
    ``chunk_size``.

    Args:
        items: Sliceable sequence of mutually comparable values.
        chunk_size: Number of elements per group (used as the ``range`` step).

    Returns:
        A list of tuples, in the same left-to-right order as the groups
        appear in ``items``.
    """
    group_starts = range(0, len(items), chunk_size)
    return [
        tuple(sorted(items[start:start + chunk_size]))
        for start in group_starts
    ]

def get_probability(tournament_round, seed1, seed2):
    """Return the sampling weight for ``seed1`` against ``seed2`` in a round.

    The value is looked up as
    ``historical_probabilities[str(seed1)][tournament_round][str(seed2)]``.

    Args:
        tournament_round: Round name used as the second-level key.
        seed1: Seed whose weight is requested (converted with ``str``).
        seed2: Opposing seed (converted with ``str``).

    Returns:
        The stored value plus a random jitter in ``[0, 0.0001)`` when all
        three keys are present. ``1`` when any key is missing, or when the
        jittered value is exactly 0.
    """
    seed1_key, seed2_key = str(seed1), str(seed2)

    # Guard clauses: any missing level of the table means "no data".
    if seed1_key not in historical_probabilities:
        return 1
    by_round = historical_probabilities[seed1_key]
    if tournament_round not in by_round:
        return 1
    by_opponent = by_round[tournament_round]
    if seed2_key not in by_opponent:
        return 1

    # The jitter is drawn only when a value exists. Presumably it keeps a
    # stored probability of 0 from producing a zero weight -- confirm intent.
    jitter = random.random() / 10000
    probability = jitter + by_opponent[seed2_key]

    return 1 if probability == 0 else probability

def play_game(tournament_round, seed1, seed2):
    """Simulate a single game and return the winning seed.

    Each seed's weight comes from ``get_probability`` called from that
    seed's own perspective. The winner is then drawn with
    ``random.choices`` using those two weights.

    Args:
        tournament_round: Round name forwarded to ``get_probability``.
        seed1: First seed in the matchup.
        seed2: Second seed in the matchup.

    Returns:
        Either ``seed1`` or ``seed2``.
    """
    # seed1's weight is looked up before seed2's, so the random jitter
    # inside get_probability is consumed in a fixed order.
    seed1_weight = get_probability(tournament_round, seed1, seed2)
    seed2_weight = get_probability(tournament_round, seed2, seed1)

    (winner,) = random.choices(
        [seed1, seed2],
        weights=[seed1_weight, seed2_weight],
    )
    return winner

def play_regional():
    """Simulate one 16-seed regional and return the pairings of every round.

    Starting from the fixed first-round matchups, each round in
    ``rounds[:-1]`` is played with ``play_game``. Adjacent winners are then
    paired up with ``chunk(..., 2)`` to form the next round.

    Returns:
        A list whose first element is the list of first-round matchups,
        followed by one list of tuples per round played. The final element
        holds a single 1-tuple containing the seed that won the regional.
    """
    first_round = [
        (1, 16), (8, 9),
        (5, 12), (4, 13),
        (6, 11), (3, 14),
        (7, 10), (2, 15),
    ]
    bracket = [first_round]

    for tournament_round in rounds[:-1]:
        # The most recently appended entry holds the games still to play.
        winners = []
        for t1, t2 in bracket[-1]:
            winners.append(play_game(tournament_round, t1, t2))

        bracket.append(chunk(winners, 2))

    return bracket


In [4]:
# Number of simulated regionals.
n = 10000
# Positions in play_regional()'s result list, one per entry in `rounds`.
rds = list(range(5))
# output[rd][seed] counts how often `seed` appears in entry `rd`
# of a simulated regional.
output = {rd: dict.fromkeys(range(1, 17), 0) for rd in rds}

for _ in range(n):
    regional = play_regional()
    for rd_index, pairings in enumerate(regional):
        for pairing in pairings:
            # A pairing holds two seeds, or one for the regional winner.
            for seed in pairing[:2]:
                output[rd_index][seed] += 1

In [5]:
import pandas as pd

# Rows are seeds, columns are round positions. Dividing by the number of
# simulations turns the raw counts into the fraction of runs in which each
# seed appeared in that round.
df = pd.DataFrame(output) / n

# Swap the positional column labels for the round names.
df.columns = rounds
df

Unnamed: 0,1st Round,2nd Round,Sweet Sixteen,Regional Finals,Final Four
1,1.0,0.9935,0.8565,0.689,0.4234
2,1.0,0.94,0.6315,0.4484,0.195
3,1.0,0.8552,0.5251,0.2518,0.1118
4,1.0,0.7838,0.467,0.1487,0.0946
5,1.0,0.6436,0.3426,0.0584,0.0443
6,1.0,0.6317,0.2957,0.1054,0.0202
7,1.0,0.6089,0.1977,0.0654,0.0085
8,1.0,0.4834,0.0982,0.0627,0.0332
9,1.0,0.5166,0.0442,0.0216,0.0128
10,1.0,0.3911,0.1531,0.0513,0.0075


### Simulation of a Regional

In [6]:
# Play one regional and print the pairings of every round, first round first.
regional_results = play_regional()

for i, regional_round in enumerate(rounds):
    # Round name, its pairings, then an empty line as a separator.
    print(regional_round, regional_results[i], '', sep='\n')


1st Round
[(1, 16), (8, 9), (5, 12), (4, 13), (6, 11), (3, 14), (7, 10), (2, 15)]

2nd Round
[(1, 9), (4, 12), (3, 6), (2, 10)]

Sweet Sixteen
[(1, 4), (2, 6)]

Regional Finals
[(2, 4)]

Final Four
[(4,)]



### Simulate Final Four Teams

In [7]:
# Display labels for four independently simulated regionals.
# NOTE(review): the real tournament's regions are, as far as I know, East,
# West, South and Midwest -- confirm whether 'North' should be renamed.
regions = ['North', 'South', 'East', 'West']

for region in regions:
    print(region)
    regional_results = play_regional()

    # Only entries from index 3 onwards are shown: with the current `rounds`
    # list these are 'Regional Finals' and 'Final Four'.
    for i in range(3, len(rounds)):
        print(rounds[i], regional_results[i], '', sep='\n')

North
Regional Finals
[(1, 7)]

Final Four
[(1,)]

South
Regional Finals
[(1, 2)]

Final Four
[(2,)]

East
Regional Finals
[(1, 6)]

Final Four
[(1,)]

West
Regional Finals
[(2, 4)]

Final Four
[(4,)]

