People often say that hindsight is 20/20. In fact, whenever there is a winner or a loser, you can count on sportswriters to find reasons to assert that the conclusion was inevitable: the three-point prowess of the winners or the susceptibility to turnovers of the losers. They often remark that the game was similar to another historical bout, or that the winnining team had favorable matchups to exploit the weaknesses of the other. While the outcome of the game may have been different if the two teams played on another day, these types of "postmortem" conclusions are ones which would be useful to draw from our model to ensure that it is sufficiently interpretable.

Obviously, the seeding of this tournament is not perfect. There are upsets every year. But even a model with an improved ranking system can fail to account for a lower ranked team with certain strengths which can play up to a higher ranked team's weaknesses. In fact, real sports have "rock-paper-scissors" interactions with cycles containing as many as all the teams in the league. This demonstrates the shortcomings of trying to apply the transitive property to sports rankings. Our model seeks to avoid this problem.

In [2]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier

fdr = 'ncaam-march-mania-2021/'
def load(fname):
    return pd.read_csv(f'{fdr}{fname}')

seeds = load('MNCAATourneySeeds.csv')
all_stats = pd.read_csv('all_stats.csv', index_col=0)
tourney = pd.read_csv('TourneyCompactAugmented.csv', index_col=0)
tourney

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WSeed,LSeed,Round
0,1985,136,1116,63,1234,54,N,0,X09,X08,1
1,1985,136,1120,59,1345,58,N,0,Z11,Z06,1
2,1985,136,1207,68,1250,43,N,0,W01,W16,1
3,1985,136,1229,58,1425,55,N,0,Y09,Y08,1
4,1985,136,1242,49,1325,38,N,0,Z03,Z14,1
...,...,...,...,...,...,...,...,...,...,...,...
2246,2019,146,1120,77,1246,71,N,1,Y05,Y02,4
2247,2019,146,1277,68,1181,67,N,0,W02,W01,4
2248,2019,152,1403,61,1277,51,N,0,X03,W02,5
2249,2019,152,1438,63,1120,62,N,0,Z01,Y05,5


In [3]:
def get_seed(seed_str):
    return int(''.join(char for char in seed_str if char.isdigit()))

tourney['WSeedNum'] = [get_seed(seed_str) for seed_str in tourney.WSeed]
tourney['LSeedNum'] = [get_seed(seed_str) for seed_str in tourney.LSeed]
tourney[tourney.LSeedNum == get_seed('X16')]

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WSeed,LSeed,Round,WSeedNum,LSeedNum
2,1985,136,1207,68,1250,43,N,0,W01,W16,1,1,16
11,1985,136,1328,96,1299,83,N,0,Y01,Y16,1,1,16
13,1985,136,1385,83,1380,59,N,0,X01,X16,1,1,16
24,1985,137,1276,59,1192,55,N,0,Z01,Z16,1,1,16
65,1986,136,1181,85,1290,78,N,0,W01,W16,1,1,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2187,2019,135,1295,78,1300,74,N,0,W16a,W16b,0,16,16
2192,2019,136,1211,87,1192,49,N,0,X01,X16a,1,1,16
2205,2019,137,1181,85,1295,62,N,0,W01,W16a,1,1,16
2209,2019,137,1314,88,1233,73,N,0,Y01,Y16,1,1,16


In [4]:
# all_stats[(all_stats.season == 1985)&(all_stats.index == 1116)]
all_stats.loc[1116]

Unnamed: 0,season,seed,wp,papg,ppg,fgp,fgp3,orpg,drpg,ftp,apg,bpg,spg,fpg,topg
1116,2006,8,0.709677,65.064516,74.322581,0.459218,0.339731,12.193548,22.83871,0.693966,15.548387,6.129032,8.451613,16.935484,13.387097
1116,2007,12,0.617647,65.705882,71.382353,0.464112,0.352445,11.235294,24.294118,0.701408,15.176471,6.411765,6.911765,17.382353,15.323529
1116,2008,9,0.666667,67.030303,73.636364,0.465681,0.339161,12.818182,23.939394,0.672365,14.545455,5.30303,7.0,18.969697,15.454545
1116,2015,5,0.764706,70.147059,78.029412,0.447432,0.350309,13.0,22.764706,0.724227,16.147059,4.764706,7.764706,18.970588,11.735294
1116,2017,8,0.735294,74.0,79.794118,0.460899,0.363934,11.323529,24.941176,0.761965,14.088235,4.735294,7.529412,20.588235,11.735294
1116,2018,7,0.676471,75.529412,81.088235,0.47746,0.400908,10.176471,24.588235,0.677922,14.470588,4.617647,6.235294,20.205882,11.0


Let's try to predict the 2019 bracket with the 2018 regular season statistics.

In [23]:
def get_seed(seed_str):
    return int(''.join(char for char in seed_str if char.isdigit()))

def predict(yr, rnd):
    predictions = []
    games = tourney[(tourney.Season == yr)&(tourney.Round == rnd)]
    for _, game in games.iterrows():
        diff = all_stats[(all_stats.season == yr)&(all_stats.index == game.WTeamID)].drop(['season', 'seed'], axis=1).values\
                - all_stats[(all_stats.season == yr)&(all_stats.index == game.LTeamID)].drop(['season', 'seed'], axis=1).values
        relevant_matchups = tourney[((tourney.WSeedNum == get_seed(game.WSeed))\
                                    &(tourney.LSeedNum == get_seed(game.LSeed)))\
                                    |(tourney.WSeedNum == get_seed(game.LSeed))\
                                    &(tourney.LSeedNum == get_seed(game.WSeed))]
        X = np.array([all_stats[(all_stats.season == matchup.Season)&(all_stats.index == matchup.WTeamID)]\
                       .drop(['season', 'seed'], axis=1).values - all_stats[(all_stats.season == matchup.Season)\
                        &(all_stats.index == matchup.LTeamID)].drop(['season', 'seed'], axis=1).values\
                        for _, matchup in relevant_matchups.iterrows()], dtype=object)
        idx = [matchup.size > 0 for matchup in X]
        X = np.vstack(X[np.array(idx)])
        # 1 for upset, 0 for non-upset
        y = np.array([0 if matchup.WSeedNum > matchup.LSeedNum else 1 for _, matchup in relevant_matchups.iterrows()])
        y = y[np.array(idx)]
        model = KNeighborsClassifier()
        model.fit(X, y)
        predictions += [model.predict(diff).item()]
    
    return np.array(predictions)

def actual(yr, rnd):
    games = tourney[(tourney.Season == yr)&(tourney.Round == rnd)]
    return np.array([0 if game.WSeedNum > game.LSeedNum else 1 for _, game in games.iterrows()])

def evaluate(yr, rnd):
    bools = predict(yr, rnd) == actual(yr, rnd)
    return np.sum(bools)/bools.size

for yr in range(2006, 2018):
    print(evaluate(yr, 1))

0.71875
0.84375
0.78125
0.75
0.84375
0.8125
0.8125
0.78125
0.8125
0.78125
0.75
0.9375
