Download the [dataset from kaggle](https://www.kaggle.com/c/ncaam-march-mania-2021/data).

In [1]:
import numpy as np
import pandas as pd

In [2]:
fdr = 'ncaam-march-mania-2021/'
def load(fname):
    return pd.read_csv(f'{fdr}{fname}')
seeds = load('MNCAATourneySeeds.csv')
seeds

Unnamed: 0,Season,Seed,TeamID
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374
...,...,...,...
2281,2019,Z12,1332
2282,2019,Z13,1414
2283,2019,Z14,1330
2284,2019,Z15,1159


In [3]:
tourney = load('MNCAATourneyCompactResults.csv')
tourney

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,136,1116,63,1234,54,N,0
1,1985,136,1120,59,1345,58,N,0
2,1985,136,1207,68,1250,43,N,0
3,1985,136,1229,58,1425,55,N,0
4,1985,136,1242,49,1325,38,N,0
...,...,...,...,...,...,...,...,...
2246,2019,146,1120,77,1246,71,N,1
2247,2019,146,1277,68,1181,67,N,0
2248,2019,152,1403,61,1277,51,N,0
2249,2019,152,1438,63,1120,62,N,0


In [4]:
regular_season = load('MRegularSeasonDetailedResults.csv')
# regular_season
regular_season[regular_season.Season == 2019]

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
82041,2019,1,1104,82,1380,62,H,0,27,55,...,17,17,30,11,20,8,15,7,2,28
82042,2019,1,1113,102,1168,94,H,0,33,78,...,24,19,26,7,35,7,17,7,3,36
82043,2019,1,1119,73,1265,69,H,0,22,51,...,15,12,24,9,27,8,12,4,0,23
82044,2019,1,1120,101,1375,58,H,0,38,74,...,22,17,22,8,19,9,22,4,5,17
82045,2019,1,1123,86,1232,69,H,0,32,65,...,12,11,18,18,20,11,11,6,5,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87499,2019,132,1153,69,1222,57,N,0,22,50,...,33,11,18,17,16,8,7,2,4,19
87500,2019,132,1209,73,1426,64,N,0,20,50,...,33,11,17,13,28,12,14,5,2,24
87501,2019,132,1277,65,1276,60,N,0,22,55,...,25,10,12,3,26,17,6,5,5,11
87502,2019,132,1387,55,1382,53,N,0,22,59,...,19,8,10,13,30,9,11,2,7,16


In [5]:
regular_season.columns

Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc',
       'NumOT', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR',
       'WAst', 'WTO', 'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3',
       'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF'],
      dtype='object')

Let's compile season, seeds, winning percentage, points allowed per game, points per game, FG%, FG3%, offensive rebounds per game, defensive rebounds per game, FT%, assists per game, blocks per game, steals per game, fouls per game, and turnovers per game for each team. 
We also add per-posession statistics for each team, including offensive and defensive [rating](https://en.wikipedia.org/wiki/Advanced_statistics_in_basketball#Common_statistics).

In [6]:
class SeasonStats:
    def __init__(self, year):
        self.year = year
        self.data = regular_season[regular_season.Season == year]
        self.stats = pd.DataFrame(columns=['season', 'seed', 'wp', 'papg', 'ppg', 'fgp', 'fgp3', 'orpg', \
                                          'drpg', 'ftp', 'apg', 'bpg', \
                                          'spg', 'fpg', 'topg', 'offensive_rating', 'defensive_rating', 'efgp', 'true_shooting_percentage'])
        self.relevant_teams = pd.unique(tourney[tourney.Season == year] \
                                       [['WTeamID', 'LTeamID']].stack())
        
    def get_stats(self):
        for team in self.relevant_teams:
            losing_data = self.data[self.data.LTeamID == team]
            winning_data = self.data[self.data.WTeamID == team]
            n = len(losing_data)+len(winning_data)
            def pg_stat(abbr):
                return (losing_data[f'L{abbr}'].sum() \
                        +winning_data[f'W{abbr}'].sum())/n
            def total_stat(abbr):
                return losing_data[f'L{abbr}'].sum() + winning_data[f'W{abbr}'].sum()

            season = self.year
            seed = int(''.join(char for char in seeds[(seeds.Season == self.year)&(seeds.TeamID == team)]['Seed'] \
                            .item() if char.isdigit()))
            wp = len(winning_data)/n
            papg = (losing_data['WScore'].sum()+winning_data['LScore'].sum())/n
            ppg = pg_stat('Score')
            fgp = (losing_data['LFGM'].sum()+winning_data['WFGM'].sum())/ \
                    (losing_data['LFGA'].sum()+winning_data['WFGA'].sum())
            fgp3 = (losing_data['LFGM3'].sum()+winning_data['WFGM3'].sum())/ \
                    (losing_data['LFGA3'].sum()+winning_data['WFGA3'].sum())
            orpg = pg_stat('OR')
            drpg = pg_stat('DR')
            ftp = (losing_data['LFTM'].sum()+winning_data['WFTM'].sum())/ \
                    (losing_data['LFTA'].sum()+winning_data['WFTA'].sum())
            apg = pg_stat('Ast')
            bpg = pg_stat('Blk')
            spg = pg_stat('Stl')
            fpg = pg_stat('PF')
            topg = pg_stat('TO')

            possessions = 0.96 * (total_stat('FGA') - total_stat('OR') + total_stat('TO') + (0.475 * total_stat('FTA')))
            offensive_rating = (total_stat('Score') * 100) / possessions
            defensive_rating = ((losing_data['WScore'].sum() + winning_data['LScore'].sum()) * 100) / possessions
            efgp = (total_stat('FGM') + 0.5 * 3 * total_stat('FGM3')) / total_stat('FGA')
            true_shooting_percentage = total_stat('Score') / (2 * (total_stat('FGA') + (0.475 * total_stat('FTA'))))

            to_add = pd.DataFrame([[season, seed, wp, papg, ppg, fgp, fgp3, orpg, drpg, ftp, apg, \
                                    bpg, spg, fpg, topg, offensive_rating, defensive_rating, efgp, true_shooting_percentage]], \
                                      columns=self.stats.columns)
            self.stats = self.stats.append(to_add, ignore_index=True)
        self.stats.index = self.relevant_teams
        return self.stats

all_stats = pd.DataFrame(columns=['season', 'seed', 'wp', 'papg', 'ppg', 'fgp', 'fgp3', 'orpg', \
                                          'drpg', 'ftp', 'apg', 'bpg', \
                                          'spg', 'fpg', 'topg', 'offensive_rating', 'defensive_rating', 'efgp', 'true_shooting_percentage'])
for yr in range(2003, 2020):
    all_stats = all_stats.append(SeasonStats(yr).get_stats())
all_stats.to_csv('all_stats.csv')
all_stats
# We need to wait until Sunday on March 14 to know the relevant teams for 2020
# SeasonStats(2020).save_stats()

Unnamed: 0,season,seed,wp,papg,ppg,fgp,fgp3,orpg,drpg,ftp,apg,bpg,spg,fpg,topg,offensive_rating,defensive_rating,efgp,true_shooting_percentage
1421,2003,16,0.448276,78.448276,71.206897,0.429265,0.360153,12.275862,23.172414,0.762768,13.034483,3.000000,7.068966,19.103448,16.206897,104.963423,115.637669,0.600486,0.533502
1411,2003,16,0.600000,70.833333,72.800000,0.447527,0.320721,13.166667,24.800000,0.619952,14.200000,2.233333,6.433333,18.300000,15.233333,107.313852,104.414805,0.608565,0.530625
1112,2003,1,0.892857,70.250000,85.214286,0.461413,0.350534,15.178571,27.642857,0.701429,17.642857,4.214286,8.464286,17.750000,14.785714,114.985735,94.793353,0.622011,0.549137
1436,2003,16,0.655172,63.137931,67.793103,0.444444,0.340757,12.965517,25.724138,0.657848,14.206897,2.965517,6.862069,15.896552,14.068966,106.588769,99.269601,0.586111,0.520292
1113,2003,10,0.620690,69.172414,75.965517,0.478182,0.317808,13.689655,23.310345,0.669737,15.551724,4.241379,5.206897,19.413793,14.000000,113.603548,103.444719,0.583636,0.547737
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1205,2019,16,0.645161,73.419355,75.774194,0.477726,0.376766,7.129032,25.741935,0.715994,13.612903,2.483871,6.225806,16.193548,11.774194,112.874952,109.367131,0.688746,0.580351
1439,2019,4,0.750000,62.093750,74.000000,0.475476,0.394095,8.906250,23.687500,0.757798,15.343750,2.312500,6.656250,15.281250,11.375000,119.112292,99.947687,0.741200,0.594415
1387,2019,13,0.657143,63.742857,67.057143,0.418000,0.308282,13.828571,25.942857,0.597730,12.971429,4.057143,7.085714,17.485714,12.714286,104.582188,99.413235,0.568750,0.493757
1449,2019,9,0.764706,64.382353,69.823529,0.452575,0.346049,9.470588,22.117647,0.694444,11.676471,5.735294,9.000000,18.411765,13.352941,108.233398,99.799034,0.659079,0.551375


Let's also add round, winning seed, and losing seed to the tourney results and remove everything else.

In [7]:
tourney['WSeed'] = [seeds[(seeds.TeamID == id)&(seeds.Season == season)]['Seed'].item() \
 for (id, season) in tourney[['WTeamID', 'Season']].values]
tourney['LSeed'] = [seeds[(seeds.TeamID == id)&(seeds.Season == season)]['Seed'].item() \
 for (id, season) in tourney[['LTeamID', 'Season']].values]
def tourney_round(day_num):
    if day_num == 134 or day_num == 135:
        return 0 # play-in
    elif day_num == 136 or day_num == 137:
        return 1 # round of 64
    elif day_num == 138 or day_num == 139:
        return 2 # round of 32
    elif day_num == 143 or day_num == 144:
        return 3 # Sweet Sixteen
    elif day_num == 145 or day_num == 146:
        return 4 # Elite Eight
    elif day_num == 152:
        return 5 # Final Four
    else:
        return 6 # Championship
tourney['Round'] = [tourney_round(day_num) for day_num in tourney['DayNum']]
tourney.to_csv('TourneyCompactAugmented.csv')
tourney

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WSeed,LSeed,Round
0,1985,136,1116,63,1234,54,N,0,X09,X08,1
1,1985,136,1120,59,1345,58,N,0,Z11,Z06,1
2,1985,136,1207,68,1250,43,N,0,W01,W16,1
3,1985,136,1229,58,1425,55,N,0,Y09,Y08,1
4,1985,136,1242,49,1325,38,N,0,Z03,Z14,1
...,...,...,...,...,...,...,...,...,...,...,...
2246,2019,146,1120,77,1246,71,N,1,Y05,Y02,4
2247,2019,146,1277,68,1181,67,N,0,W02,W01,4
2248,2019,152,1403,61,1277,51,N,0,X03,W02,5
2249,2019,152,1438,63,1120,62,N,0,Z01,Y05,5
