In [1]:
import numpy as np
import pandas as pd

## Reading files

In [21]:
# Load the CSV files needed for feature engineering.
# DATA_DIR is the single place to change when the files live somewhere else;
# its default reproduces the original relative 'data/' paths exactly.
DATA_DIR = 'data'

regular_season_compact_all = pd.read_csv(f'{DATA_DIR}/MRegularSeasonCompactResults.csv')
regular_season_detailed = pd.read_csv(f'{DATA_DIR}/MRegularSeasonDetailedResults.csv')
tourney_compact_all = pd.read_csv(f'{DATA_DIR}/MNCAATourneyCompactResults.csv')
tourney_detailed = pd.read_csv(f'{DATA_DIR}/MNCAATourneyDetailedResults.csv')
teams = pd.read_csv(f'{DATA_DIR}/MTeams.csv')
seeds_all = pd.read_csv(f'{DATA_DIR}/MNCAATourneySeeds.csv')
slots_all = pd.read_csv(f'{DATA_DIR}/MNCAATourneySlots.csv')

In [22]:
# Trim the "*_all" tables so they start at min_year.
# The two detailed-results tables are not filtered in this cell.
min_year = 2003


def _seasons_from(frame, first_season):
    """Rows of `frame` with Season >= first_season, with a fresh 0..n-1 index."""
    keep = frame['Season'] >= first_season
    return frame[keep].reset_index(drop=True)


regular_season_compact = _seasons_from(regular_season_compact_all, min_year)
tourney_compact = _seasons_from(tourney_compact_all, min_year)
seeds = _seasons_from(seeds_all, min_year)
slots = _seasons_from(slots_all, min_year)

## Look at data

In [23]:
# Preview the first five rows of the trimmed regular-season compact results.
regular_season_compact.head(n=5)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,2003,10,1104,68,1328,62,N,0
1,2003,10,1272,70,1393,63,N,0
2,2003,11,1266,73,1437,61,N,0
3,2003,11,1296,56,1457,50,N,0
4,2003,11,1400,77,1208,71,N,0


In [24]:
# Preview the first five rows of the regular-season detailed results.
regular_season_detailed.head(n=5)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


In [25]:
# Preview the first five rows of the trimmed tournament compact results.
tourney_compact.head(n=5)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,2003,134,1421,92,1411,84,N,1
1,2003,136,1112,80,1436,51,N,0
2,2003,136,1113,84,1272,71,N,0
3,2003,136,1141,79,1166,73,N,0
4,2003,136,1143,76,1301,74,N,1


In [26]:
# Preview the first five rows of the tournament detailed results.
tourney_detailed.head(n=5)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,134,1421,92,1411,84,N,1,32,69,...,31,14,31,17,28,16,15,5,0,22
1,2003,136,1112,80,1436,51,N,0,31,66,...,16,7,7,8,26,12,17,10,3,15
2,2003,136,1113,84,1272,71,N,0,31,59,...,28,14,21,20,22,11,12,2,5,18
3,2003,136,1141,79,1166,73,N,0,29,53,...,17,12,17,14,17,20,21,6,6,21
4,2003,136,1143,76,1301,74,N,1,27,64,...,21,15,20,10,26,16,14,5,8,19


In [27]:
# Preview the first five rows of the team lookup table.
teams.head(n=5)

Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season
0,1101,Abilene Chr,2014,2022
1,1102,Air Force,1985,2022
2,1103,Akron,1985,2022
3,1104,Alabama,1985,2022
4,1105,Alabama A&M,2000,2022


In [51]:
# Spot-check a single team's row by its exact TeamName.
teams.loc[teams['TeamName'] == 'Savannah St']

Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season
265,1366,Savannah St,2003,2019


In [28]:
# Preview the first five rows of the trimmed tournament seeds.
seeds.head(n=5)

Unnamed: 0,Season,Seed,TeamID
0,2003,W01,1328
1,2003,W02,1448
2,2003,W03,1393
3,2003,W04,1257
4,2003,W05,1280


In [30]:
# Preview the first five rows of the trimmed tournament slots.
slots.head(n=5)

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed
0,2003,R1W1,W01,W16
1,2003,R1W2,W02,W15
2,2003,R1W3,W03,W14
3,2003,R1W4,W04,W13
4,2003,R1W5,W05,W12


## Helper functions

In [85]:
def team_lookup(id_num, teams_df=None):
    """Return the TeamName that belongs to a TeamID.

    Parameters
    ----------
    id_num : int
        Value matched against the ``TeamID`` column.
    teams_df : pandas.DataFrame, optional
        Table with ``TeamID`` and ``TeamName`` columns. When omitted, the
        notebook-level ``teams`` frame is used, so existing one-argument
        calls behave as before.

    Returns
    -------
    The ``TeamName`` of the first row whose ``TeamID`` equals ``id_num``.

    Raises
    ------
    IndexError
        If no row matches ``id_num``. The exception type is the same one the
        previous ``.values[0]`` lookup produced; the message now names the id.
    """
    if teams_df is None:
        teams_df = teams
    names = teams_df.loc[teams_df['TeamID'] == id_num, 'TeamName']
    if names.empty:
        raise IndexError(f'no team with TeamID {id_num!r}')
    return names.iloc[0]

In [86]:
# Example lookup: TeamID 1102.
team_lookup(id_num=1102)

'Air Force'

## Building source of truth table

In [120]:
### Regular-season win/loss record

In [121]:
# Regular-season wins per (Season, team): count the rows in which the team
# appears as WTeamID. Only DayNum is counted, so no other column is aggregated.
wins = (
    regular_season_detailed
    .groupby(['Season', 'WTeamID'], as_index=False)['DayNum']
    .count()
    .rename(columns={'WTeamID': 'TeamID', 'DayNum': 'wins'})
)
wins.head()

Unnamed: 0,Season,TeamID,wins
0,2003,1102,12
1,2003,1103,13
2,2003,1104,17
3,2003,1105,7
4,2003,1106,13


In [122]:
# Regular-season losses per (Season, team): count the rows in which the team
# appears as LTeamID. Only DayNum is counted, so no other column is aggregated.
losses = (
    regular_season_detailed
    .groupby(['Season', 'LTeamID'], as_index=False)['DayNum']
    .count()
    .rename(columns={'LTeamID': 'TeamID', 'DayNum': 'losses'})
)
losses.head()

Unnamed: 0,Season,TeamID,losses
0,2003,1102,16
1,2003,1103,14
2,2003,1104,11
3,2003,1105,19
4,2003,1106,15


In [123]:
# outer join wins and losses to get all teams W/L record each season
WL_df = wins.merge(losses, on=['Season', 'TeamID'], how='outer')
WL_df.fillna(0, inplace=True)
WL_df[['wins','losses']] = WL_df[['wins','losses']].astype(int)
WL_df.head()

Unnamed: 0,Season,TeamID,wins,losses
0,2003,1102,12,16
1,2003,1103,13,14
2,2003,1104,17,11
3,2003,1105,7,19
4,2003,1106,13,15


In [124]:
### Tournament win/loss record

In [125]:
# Tournament wins per (Season, team): count the rows in which the team
# appears as WTeamID in the tournament detailed results.
tourney_wins = (
    tourney_detailed
    .groupby(['Season', 'WTeamID'], as_index=False)['DayNum']
    .count()
    .rename(columns={'WTeamID': 'TeamID', 'DayNum': 'tourney_wins'})
)
tourney_wins.head()

Unnamed: 0,Season,TeamID,tourney_wins
0,2003,1112,3
1,2003,1113,1
2,2003,1120,2
3,2003,1139,2
4,2003,1141,1


In [126]:
# Tournament losses per (Season, team): count the rows in which the team
# appears as LTeamID in the tournament detailed results.
tourney_losses = (
    tourney_detailed
    .groupby(['Season', 'LTeamID'], as_index=False)['DayNum']
    .count()
    .rename(columns={'LTeamID': 'TeamID', 'DayNum': 'tourney_losses'})
)
tourney_losses.head()

Unnamed: 0,Season,TeamID,tourney_losses
0,2003,1104,1
1,2003,1112,1
2,2003,1113,1
3,2003,1120,1
4,2003,1122,1


In [127]:
# outer join wins and losses to get all teams W/L record each tourney
tourney_WL_df = tourney_wins.merge(tourney_losses, on=['Season', 'TeamID'], how='outer')
tourney_WL_df.fillna(0, inplace=True)
tourney_WL_df[['tourney_wins','tourney_losses']] = tourney_WL_df[['tourney_wins','tourney_losses']].astype(int)
tourney_WL_df.head()

Unnamed: 0,Season,TeamID,tourney_wins,tourney_losses
0,2003,1112,3,1
1,2003,1113,1,1
2,2003,1120,2,1
3,2003,1139,2,1
4,2003,1141,1,1


In [128]:
### Merge columns

In [129]:
# merge all columns together
# Attach the tournament seed and tournament W/L to each regular-season record.
# Both joins are left joins on (Season, TeamID), so every WL_df row is kept and
# rows without a match get NaN in the added columns.
season_stats_basic = (
    WL_df
    .merge(seeds, how='left', on=['Season', 'TeamID'])
    .merge(tourney_WL_df, how='left', on=['Season', 'TeamID'])
)

In [130]:
# Display the merged team-season table (the rendered output elides the middle rows).
season_stats_basic

Unnamed: 0,Season,TeamID,wins,losses,Seed,tourney_wins,tourney_losses
0,2003,1102,12,16,,,
1,2003,1103,13,14,,,
2,2003,1104,17,11,Y10,0.0,1.0
3,2003,1105,7,19,,,
4,2003,1106,13,15,,,
...,...,...,...,...,...,...,...
6887,2015,1363,0,28,,,
6888,2021,1152,0,9,,,
6889,2022,1175,0,16,,,
6890,2022,1237,0,19,,,


In [131]:
# Sanity check: within a season, total wins should equal total losses, both for
# the regular season and for the tournament.
# numeric_only=True excludes the string 'Seed' column explicitly. Older pandas
# dropped it silently; pandas >= 2.0 defaults to numeric_only=False and would
# try to sum it. TeamID is still summed (as in the original output), but that
# total carries no meaning.
season_stats_basic.groupby('Season').sum(numeric_only=True)

Unnamed: 0_level_0,TeamID,wins,losses,tourney_wins,tourney_losses
Season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2003,419735,4616,4616,64.0,64.0
2004,418446,4571,4571,64.0,64.0
2005,423838,4675,4675,64.0,64.0
2006,429048,4757,4757,64.0,64.0
2007,431823,5043,5043,64.0,64.0
2008,439417,5163,5163,64.0,64.0
2009,445803,5249,5249,64.0,64.0
2010,445803,5263,5263,64.0,64.0
2011,443049,5246,5246,67.0,67.0
2012,443205,5253,5253,67.0,67.0
