In [34]:
import numpy as np
import pandas as pd
from pandasql import sqldf 
from ast import literal_eval
import helper_functions as hf

## Reading files

In [295]:
# read in files needed for feature engineering
regular_season_compact_all = pd.read_csv('data/MRegularSeasonCompactResults.csv')
regular_season_detailed = pd.read_csv('data/MRegularSeasonDetailedResults.csv')
tourney_compact_all = pd.read_csv('data/MNCAATourneyCompactResults.csv')
tourney_detailed = pd.read_csv('data/MNCAATourneyDetailedResults.csv')
teams = pd.read_csv('data/MTeams.csv')
seeds_all = pd.read_csv('data/MNCAATourneySeeds.csv')
slots_all = pd.read_csv('data/MNCAATourneySlots.csv')
ordinals = pd.read_csv('data/MMasseyOrdinals.csv')

In [3]:
# trim everything to start in 2003
min_year = 2003
regular_season_compact = regular_season_compact_all[regular_season_compact_all['Season'] >= min_year].reset_index(drop=True)
tourney_compact = tourney_compact_all[tourney_compact_all['Season'] >= min_year].reset_index(drop=True)
seeds = seeds_all[seeds_all['Season'] >= min_year].reset_index(drop=True)
slots = slots_all[slots_all['Season'] >= min_year].reset_index(drop=True)

## Look at data

In [4]:
regular_season_compact.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,2003,10,1104,68,1328,62,N,0
1,2003,10,1272,70,1393,63,N,0
2,2003,11,1266,73,1437,61,N,0
3,2003,11,1296,56,1457,50,N,0
4,2003,11,1400,77,1208,71,N,0


In [5]:
regular_season_detailed.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


In [6]:
tourney_compact.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,2003,134,1421,92,1411,84,N,1
1,2003,136,1112,80,1436,51,N,0
2,2003,136,1113,84,1272,71,N,0
3,2003,136,1141,79,1166,73,N,0
4,2003,136,1143,76,1301,74,N,1


In [7]:
tourney_detailed.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,134,1421,92,1411,84,N,1,32,69,...,31,14,31,17,28,16,15,5,0,22
1,2003,136,1112,80,1436,51,N,0,31,66,...,16,7,7,8,26,12,17,10,3,15
2,2003,136,1113,84,1272,71,N,0,31,59,...,28,14,21,20,22,11,12,2,5,18
3,2003,136,1141,79,1166,73,N,0,29,53,...,17,12,17,14,17,20,21,6,6,21
4,2003,136,1143,76,1301,74,N,1,27,64,...,21,15,20,10,26,16,14,5,8,19


In [8]:
teams.head()

Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season
0,1101,Abilene Chr,2014,2022
1,1102,Air Force,1985,2022
2,1103,Akron,1985,2022
3,1104,Alabama,1985,2022
4,1105,Alabama A&M,2000,2022


In [11]:
seeds.head()

Unnamed: 0,Season,Seed,TeamID
0,2003,W01,1328
1,2003,W02,1448
2,2003,W03,1393
3,2003,W04,1257
4,2003,W05,1280


In [12]:
slots.head()

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed
0,2003,R1W1,W01,W16
1,2003,R1W2,W02,W15
2,2003,R1W3,W03,W14
3,2003,R1W4,W04,W13
4,2003,R1W5,W05,W12


## Feature engineering

### Adding derived game stats to regular_season_detailed

Create:
- possessions

Which will then be used to calculate:
- pace
- offensive efficiency
- defensive efficiency

Basic Possession Formula = 0.96*[(Field Goal Attempts)+(Turnovers)+0.44*(Free Throw Attempts)-(Offensive Rebounds)]

Pace Calculation Formula = [200/(Team Minutes)]*(Possession_team+Possession_opponent)/2

Offensive Efficiency Formula = 100*(Points Scored / Possessions)

Defensive Efficiency Formula = 100*(Opp Points Scored / Opp Possessions) = Opponent Offensive Efficiency

In [277]:
# winning team possessions
regular_season_detailed['WPos'] = \
    0.96 * (regular_season_detailed['WFGA'] + regular_season_detailed['WTO'] + \
     0.44 * regular_season_detailed['WFTA'] - regular_season_detailed['WOR'])

# losing team possessions
regular_season_detailed['LPos'] = \
    0.96 * (regular_season_detailed['LFGA'] + regular_season_detailed['LTO'] + \
     0.44 * regular_season_detailed['LFTA'] - regular_season_detailed['LOR'])

# pace
regular_season_detailed['Pace'] =  \
    (200/(200 + 5 * 5 * regular_season_detailed['NumOT'])) * \
    (regular_season_detailed['WPos'] + regular_season_detailed['LPos']) / 2

# winning team offensive efficiency, losing team defensive efficiency
regular_season_detailed['WOffEff'] = \
    100*(regular_season_detailed['WScore'] / regular_season_detailed['WPos'])

# losing team offensive efficiency, winning team defensive efficiency
regular_season_detailed['LOffEff'] = \
    100*(regular_season_detailed['LScore'] / regular_season_detailed['LPos'])

### General season and per-game team stats

Create: 
- games played
- regular season wins
- reg. season losses
- avg. game margin
- std. dev. game margin
- win %
- away/neutral wins
- home losses
- close wins
- close losses
- avg. points for
- avg. points against
- total 3 pointers made 
- total 3 pointers attempted
- 3 pt. %
- total FT made
- total FT attempted
- FT %
- avg. TOVs
- avg. offensive efficiency
- avg. defensive efficiency
- avg. pace

In [279]:
regular_season_detailed.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LAst,LTO,LStl,LBlk,LPF,WPos,LPos,Pace,WOffEff,LOffEff
0,2003,10,1104,68,1328,62,N,0,27,58,...,8,18,9,2,20,71.9232,67.8528,69.888,94.545293,91.374269
1,2003,10,1272,70,1393,63,N,0,26,62,...,7,12,8,6,16,65.6256,65.088,65.3568,106.665691,96.792035
2,2003,11,1266,73,1437,61,N,0,24,58,...,9,12,2,5,23,61.2096,61.5552,61.3824,119.262338,99.098045
3,2003,11,1296,56,1457,50,N,0,18,38,...,9,19,4,3,23,55.3344,55.296,55.3152,101.202868,90.422454
4,2003,11,1400,77,1208,71,N,0,30,61,...,12,10,7,1,14,61.1712,60.3648,60.768,125.876229,117.618215


In [280]:
regular_season_detailed.columns

Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc',
       'NumOT', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR',
       'WAst', 'WTO', 'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3',
       'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF',
       'WPos', 'LPos', 'Pace', 'WOffEff', 'LOffEff'],
      dtype='object')

In [281]:
# aggregate data for when team is the winning team
wins_q = '''
SELECT 
    Season, WTeamID AS TeamID, 
    COUNT() as num_wins,
    GROUP_CONCAT(WScore - LScore) as winning_margins,
    SUM(WScore) sum_score,
    SUM(LScore) sum_score_of_opponents,
    SUM(CASE WHEN WLoc == 'A' OR WLoc == 'N' THEN 1 ELSE 0 END) non_home_wins,
    SUM(CASE WHEN WScore - LScore <= 3 OR NumOT > 0 THEN 1 ELSE 0 END) close_wins,
    SUM(WFGA3) sum_3FGA,
    SUM(WFGM3) sum_3FGM,
    SUM(WFTA) sum_FTA,
    SUM(WFTM) sum_FTM,
    SUM(WTO) sum_TOV,
    SUM(Pace) sum_pace,
    SUM(WOffEff) sum_off_eff,
    SUM(LOffEff) sum_def_eff
FROM regular_season_detailed
GROUP BY 1, 2
'''
wins = sqldf(wins_q)
wins.head()

Unnamed: 0,Season,TeamID,num_wins,winning_margins,sum_score,sum_score_of_opponents,non_home_wins,close_wins,sum_3FGA,sum_3FGM,sum_FTA,sum_FTM,sum_TOV,sum_pace,sum_off_eff,sum_def_eff
0,2003,1102,12,2952114918381323458,825,638,3,0,259,120,249,163,133,650.9376,1530.926442,1170.195361
1,2003,1103,13,473351181161121158,1141,1019,4,4,187,71,402,290,163,873.817173,1649.785724,1459.595972
2,2003,1104,17,6261562899818710691652323,1270,1046,4,1,354,120,383,272,222,1104.100267,1953.578718,1596.765971
3,2003,1105,7,1974118348,556,465,2,1,157,64,180,134,126,515.424,755.667122,631.810684
4,2003,1106,13,5512910611826241144,888,753,5,2,207,76,270,168,230,844.194133,1362.130906,1140.286408


In [282]:
# aggregate data for when team is the losing team
losses_q = '''
SELECT 
    Season, LTeamID AS TeamID, 
    COUNT() as num_losses,
    GROUP_CONCAT(LScore - WScore) as losing_margins,
    SUM(LScore) sum_score,
    SUM(WScore) sum_score_of_opponents,
    SUM(CASE WHEN WLoc == 'A' THEN 1 ELSE 0 END) home_losses,
    SUM(CASE WHEN WScore - LScore <= 3 OR NumOT > 0 THEN 1 ELSE 0 END) close_losses,
    SUM(LFGA3) sum_3FGA,
    SUM(LFGM3) sum_3FGM,
    SUM(LFTA) sum_FTA,
    SUM(LFTM) sum_FTM,
    SUM(LTO) sum_TOV,
    SUM(Pace) sum_pace,
    SUM(LOffEff) sum_off_eff,
    SUM(WOffEff) sum_def_eff
FROM regular_season_detailed
GROUP BY 1, 2
'''
losses = sqldf(losses_q)
losses.head()

Unnamed: 0,Season,TeamID,num_losses,losing_margins,sum_score,sum_score_of_opponents,home_losses,close_losses,sum_3FGA,sum_3FGM,sum_FTA,sum_FTM,sum_TOV,sum_pace,sum_off_eff,sum_def_eff
0,2003,1102,16,"-18,-13,-17,-10,-15,-2,-2,-10,-32,-14,-7,-14,-...",778,958,4,2,324,99,230,149,187,811.584,1528.733279,1896.446461
1,2003,1103,14,"-3,-3,-2,-6,-8,-14,-15,-7,-16,-7,-16,-2,-4,-2",986,1091,5,6,247,76,296,224,178,878.9632,1499.794451,1685.607938
2,2003,1104,11,"-2,-1,-9,-19,-17,-11,-19,-5,-4,-4,-13",670,774,2,2,202,58,203,144,150,664.608,1095.144346,1290.023656
3,2003,1105,19,"-28,-42,-13,-2,-22,-14,-10,-1,-1,-6,-5,-12,-31...",1310,1528,7,7,383,133,388,267,359,1342.574933,1788.600451,2134.194163
4,2003,1106,15,"-1,-12,-26,-16,-1,-9,-20,-6,-6,-1,-3,-19,-6,-1...",893,1032,4,5,287,95,191,130,247,949.8048,1393.217658,1646.962777


In [289]:
# outer join wins and losses to get all teams W/L record each season
team_season_stats = wins.merge(losses, on=['Season', 'TeamID'], how='outer', suffixes=('_w', '_l'))
team_season_stats.fillna(0, inplace=True)

# typecast columns
team_season_stats[['num_wins','num_losses']] = team_season_stats[['num_wins','num_losses']].astype(int)
team_season_stats[['winning_margins','losing_margins']] = team_season_stats[['winning_margins','losing_margins']].astype(str)

# total games
team_season_stats['total_games'] = team_season_stats['num_wins'] + team_season_stats['num_losses']

# total points for
team_season_stats['total_points_for'] = team_season_stats['sum_score_w'] + team_season_stats['sum_score_l']

# total points against
team_season_stats['total_points_against'] = team_season_stats['sum_score_of_opponents_w'] + team_season_stats['sum_score_of_opponents_l']

# win pct.
team_season_stats['win_pct'] = team_season_stats['num_wins'] / team_season_stats['total_games']

# avg. points for
team_season_stats['avg_points_for'] = team_season_stats['total_points_for'] / team_season_stats['total_games']

# avg. points against
team_season_stats['avg_points_against'] = team_season_stats['total_points_against'] / team_season_stats['total_games']

# 3FGA per game
team_season_stats['3FGA_pg'] = (team_season_stats['sum_3FGA_w'] + team_season_stats['sum_3FGA_l']) / team_season_stats['total_games']

# 3FGM per game
team_season_stats['3FGM_pg'] = (team_season_stats['sum_3FGM_w'] + team_season_stats['sum_3FGM_l']) / team_season_stats['total_games']

# 3 pt. pct.
team_season_stats['3_pct'] = team_season_stats['3FGM_pg'] / team_season_stats['3FGA_pg']

# FTA per game
team_season_stats['FTA_pg'] = (team_season_stats['sum_FTA_w'] + team_season_stats['sum_FTA_l']) / team_season_stats['total_games']

# FTM per game
team_season_stats['FTM_pg'] = (team_season_stats['sum_FTM_w'] + team_season_stats['sum_FTM_l']) / team_season_stats['total_games']

# FT pct.
team_season_stats['FT_pct'] = team_season_stats['FTM_pg'] / team_season_stats['FTA_pg']

# TOV per game
team_season_stats['TOV_pg'] = (team_season_stats['sum_TOV_w'] + team_season_stats['sum_TOV_l']) / team_season_stats['total_games']

# turn strings of winning and losing margins into one tuple
team_season_stats['differential_arr'] = [literal_eval(wm + ',' + lm) for wm, lm in zip(team_season_stats['winning_margins'], team_season_stats['losing_margins'])]

# avg. score differential
team_season_stats['avg_game_margin'] = [np.mean(arr) for arr in team_season_stats['differential_arr']]

# std. dev. of score differential
team_season_stats['std_game_margin'] = [np.std(arr) for arr in team_season_stats['differential_arr']]

# avg. pace
team_season_stats['avg_pace'] = (team_season_stats['sum_pace_w'] + team_season_stats['sum_pace_l']) / team_season_stats['total_games']

# avg. off. efficiency
team_season_stats['off_eff'] = (team_season_stats['sum_off_eff_w'] + team_season_stats['sum_off_eff_l']) / team_season_stats['total_games']

# avg. def. efficiency
team_season_stats['def_eff'] = (team_season_stats['sum_def_eff_w'] + team_season_stats['sum_def_eff_l']) / team_season_stats['total_games']

# pythagorean win pct. (Morey version: k=13.91)
k = 13.91
team_season_stats['pythag_win_pct'] = (team_season_stats['total_points_for'] ** k) / \
    (team_season_stats['total_points_for'] ** k + team_season_stats['total_points_against'] ** k)

# luck
team_season_stats['luck'] = team_season_stats['win_pct'] - team_season_stats['pythag_win_pct']

In [290]:
team_season_stats_compact = team_season_stats[['Season', 'TeamID', 'total_games', 
                     'num_wins', 'num_losses', 'win_pct',
                     'avg_points_for', 'avg_points_against',
                     'avg_game_margin', 'std_game_margin',
                     'non_home_wins', 'home_losses',
                     'close_wins', 'close_losses',
                     '3FGA_pg', '3FGM_pg', '3_pct',
                     'FTM_pg', 'FTA_pg', 'FT_pct', 'TOV_pg',
                     'avg_pace', 'off_eff', 'def_eff',
                     'pythag_win_pct', 'luck']]
team_season_stats_compact.head()

Unnamed: 0,Season,TeamID,total_games,num_wins,num_losses,win_pct,avg_points_for,avg_points_against,avg_game_margin,std_game_margin,...,3_pct,FTM_pg,FTA_pg,FT_pct,TOV_pg,avg_pace,off_eff,def_eff,pythag_win_pct,luck
0,2003,1102,28,12,16,0.428571,57.25,57.0,0.25,15.878163,...,0.375643,11.142857,17.107143,0.651357,11.428571,52.232914,109.273561,109.522922,0.515214,-0.086643
1,2003,1103,27,13,14,0.481481,78.777778,78.148148,0.62963,11.126039,...,0.33871,19.037037,25.851852,0.73639,12.62963,64.917792,116.651118,116.489034,0.527877,-0.046395
2,2003,1104,28,17,11,0.607143,69.285714,65.0,4.285714,13.149843,...,0.320144,14.857143,20.928571,0.709898,13.285714,63.168152,108.882967,103.09963,0.708513,-0.10137
3,2003,1105,26,7,19,0.269231,71.769231,76.653846,-4.884615,15.552212,...,0.364815,15.423077,21.846154,0.705986,18.653846,71.461497,97.856445,106.384802,0.285796,-0.016565
4,2003,1106,28,13,15,0.464286,63.607143,63.75,-0.142857,12.374266,...,0.346154,10.642857,16.464286,0.646421,17.035714,64.07139,98.405306,99.544614,0.492199,-0.027913


### Results vs. tourney and non-tourney teams

Create:
- seed
- games vs. tourney teams
- wins vs. tourney teams (good wins)
- losses vs. tourney teams (good losses)
- away wins vs. tourney teams (tough wins)
- losses vs. non-tourney teams (bad losses)
- win % vs. tourney teams

In [222]:
# join game results with seeds
team_seeds_q = '''
SELECT 
    a.*, b.Seed WTeamSeed, c.Seed LTeamSeed
FROM regular_season_compact a
LEFT JOIN seeds b
    ON a.WTeamID = b.TeamID
    AND a.Season = b.Season
LEFT JOIN seeds c
    ON a.LTeamID = c.TeamID
    AND a.Season = c.Season
'''
team_seeds = sqldf(team_seeds_q)
# make sure both columns are strings
team_seeds[['WTeamSeed','LTeamSeed']] = team_seeds[['WTeamSeed','LTeamSeed']].astype(str)
team_seeds.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WTeamSeed,LTeamSeed
0,2003,10,1104,68,1328,62,N,0,Y10,W01
1,2003,10,1272,70,1393,63,N,0,Z07,W03
2,2003,11,1266,73,1437,61,N,0,Y03,
3,2003,11,1296,56,1457,50,N,0,,
4,2003,11,1400,77,1208,71,N,0,X01,


In [214]:
wins_tourney_q = '''
SELECT
    Season, WTeamID AS TeamID, WTeamSeed AS Seed,
    SUM(CASE WHEN LTeamSeed != 'None' THEN 1 ELSE 0 END) wins_vs_tourney_teams,
    SUM(CASE WHEN LTeamSeed != 'None' AND WLoc != 'H' THEN 1 ELSE 0 END) away_wins_vs_tourney_teams
FROM team_seeds
GROUP BY 1, 2
'''
wins_tourney = sqldf(wins_tourney_q)
wins_tourney.head()

Unnamed: 0,Season,TeamID,Seed,wins_vs_tourney_teams,away_wins_vs_tourney_teams
0,2003,1102,,1,0
1,2003,1103,,1,1
2,2003,1104,Y10,5,1
3,2003,1105,,1,0
4,2003,1106,,1,0


In [221]:
losses_tourney_q = '''
SELECT
    Season, LTeamID AS TeamID, LTeamSeed AS Seed,
    SUM(CASE WHEN WTeamSeed != 'None' THEN 1 ELSE 0 END) losses_vs_tourney_teams,
    SUM(CASE WHEN WTeamSeed = 'None' THEN 1 ELSE 0 END) losses_vs_non_tourney_teams
FROM team_seeds
GROUP BY 1, 2
'''
losses_tourney = sqldf(losses_tourney_q)
losses_tourney.head()

Unnamed: 0,Season,TeamID,Seed,losses_vs_tourney_teams,losses_vs_non_tourney_teams
0,2003,1102,,7,9
1,2003,1103,,1,13
2,2003,1104,Y10,6,5
3,2003,1105,,3,16
4,2003,1106,,4,11


In [224]:
team_season_stats_tourney = wins_tourney.merge(losses_tourney, on=['Season', 'TeamID'], how='outer', suffixes=('_w', '_l'))
team_season_stats_tourney

Unnamed: 0,Season,TeamID,Seed_w,wins_vs_tourney_teams,away_wins_vs_tourney_teams,Seed_l,losses_vs_tourney_teams,losses_vs_non_tourney_teams
0,2003,1102,,1.0,0.0,,7.0,9.0
1,2003,1103,,1.0,1.0,,1.0,13.0
2,2003,1104,Y10,5.0,1.0,Y10,6.0,5.0
3,2003,1105,,1.0,0.0,,3.0,16.0
4,2003,1106,,1.0,0.0,,4.0,11.0
...,...,...,...,...,...,...,...,...
6887,2015,1363,,,,,5.0,23.0
6888,2021,1152,,,,,4.0,5.0
6889,2022,1175,,,,,0.0,16.0
6890,2022,1237,,,,,0.0,19.0


In [225]:
team_season_stats_vs_tourney_teams_q = '''
SELECT
    Season, TeamID,
    COALESCE(Seed_w, Seed_l) Seed,
    COALESCE(wins_vs_tourney_teams, 0) wins_vs_tourney_teams,
    COALESCE(away_wins_vs_tourney_teams, 0) away_wins_vs_tourney_teams,
    COALESCE(losses_vs_tourney_teams, 0) losses_vs_tourney_teams,
    COALESCE(losses_vs_non_tourney_teams, 0) losses_vs_non_tourney_teams
FROM team_season_stats_tourney
GROUP BY 1, 2
'''
team_season_stats_vs_tourney_teams = sqldf(team_season_stats_vs_tourney_teams_q)
team_season_stats_vs_tourney_teams.head()

Unnamed: 0,Season,TeamID,Seed,wins_vs_tourney_teams,away_wins_vs_tourney_teams,losses_vs_tourney_teams,losses_vs_non_tourney_teams
0,2003,1102,,1.0,0.0,7.0,9.0
1,2003,1103,,1.0,1.0,1.0,13.0
2,2003,1104,Y10,5.0,1.0,6.0,5.0
3,2003,1105,,1.0,0.0,3.0,16.0
4,2003,1106,,1.0,0.0,4.0,11.0


In [231]:
team_season_stats_vs_tourney_teams['games_vs_tourney_teams'] = \
    team_season_stats_vs_tourney_teams['wins_vs_tourney_teams'] + team_season_stats_vs_tourney_teams['losses_vs_tourney_teams']
team_season_stats_vs_tourney_teams['win_pct_vs_tourney_teams'] = \
    team_season_stats_vs_tourney_teams['wins_vs_tourney_teams'] / team_season_stats_vs_tourney_teams['games_vs_tourney_teams']

In [233]:
team_season_stats_vs_tourney_teams.head()

Unnamed: 0,Season,TeamID,Seed,wins_vs_tourney_teams,away_wins_vs_tourney_teams,losses_vs_tourney_teams,losses_vs_non_tourney_teams,games_vs_tourney_teams,win_pct_vs_tourney_teams
0,2003,1102,,1.0,0.0,7.0,9.0,8.0,0.125
1,2003,1103,,1.0,1.0,1.0,13.0,2.0,0.5
2,2003,1104,Y10,5.0,1.0,6.0,5.0,11.0,0.454545
3,2003,1105,,1.0,0.0,3.0,16.0,4.0,0.25
4,2003,1106,,1.0,0.0,4.0,11.0,5.0,0.2


### Advanced features

Create:
- RPI ranking
- momentum (RPI ranking % change over last n days)
- wins in previous x tourneys
- wins in previous x seasons
- weighted wins?
- close wins
- close losses

In [296]:
ordinals.head()

Unnamed: 0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank
0,2003,35,SEL,1102,159
1,2003,35,SEL,1103,229
2,2003,35,SEL,1104,12
3,2003,35,SEL,1105,314
4,2003,35,SEL,1106,260


In [314]:
rpi_rank = ordinals[ordinals['SystemName'] == 'RPI']
rpi_rank_yr = rpi_rank[rpi_rank['Season'] == 2018]
np.unique(rpi_rank_yr['RankingDayNum'])

array([ 44,  51,  58,  65,  72,  79,  86,  93, 100, 107, 114, 121, 128,
       133])

In [None]:
rpi_ = '''
SELECT
    *,
    ROW_NUMBER() OVER(PARTITION BY Season, TeamID ORDER BY RankingDayNum DESC) AS RankingNum
FROM rpi_rank
'''
rpi_table = sqldf(rpi_q)
rpi_table.head()

In [320]:
rpi_q = '''
SELECT 
    Season, TeamID,
    CASE WHEN RankingNum = 1 THEN OrdinalRank END final_rpi,
    CASE WHEN RankingNum = 4 THEN OrdinalRank END last_month_rpi
FROM (
    SELECT
        *,
        ROW_NUMBER() OVER(PARTITION BY Season, TeamID ORDER BY RankingDayNum DESC) AS RankingNum
    FROM rpi_rank
) a
GROUP BY 1, 2
'''
rpi_table = sqldf(rpi_q)
rpi_table.head()

Unnamed: 0,Season,TeamID,final_rpi,last_month_rpi
0,2003,1102,158,
1,2003,1103,182,
2,2003,1104,38,
3,2003,1105,313,
4,2003,1106,248,


In [321]:
rpi_table[rpi_table['Season'] == 2018]

Unnamed: 0,Season,TeamID,final_rpi,last_month_rpi
5122,2018,1101,260,
5123,2018,1102,253,
5124,2018,1103,227,
5125,2018,1104,42,
5126,2018,1105,350,
...,...,...,...,...
5468,2018,1460,99,
5469,2018,1461,98,
5470,2018,1462,3,
5471,2018,1463,183,


In [304]:
rpi_q = '''
SELECT 
    RankingDayNum, COUNT(*)
FROM rpi_rank_yr
GROUP BY 1
'''
sqldf(rpi_q)

Unnamed: 0,RankingDayNum,COUNT(*)
0,44,351
1,51,351
2,58,351
3,65,351
4,72,351
5,79,351
6,86,351
7,93,351
8,100,351
9,107,351


## All together

In [291]:
team_season_stats_compact.merge(team_season_stats_vs_tourney_teams, on=['Season', 'TeamID'], how='left')

Unnamed: 0,Season,TeamID,total_games,num_wins,num_losses,win_pct,avg_points_for,avg_points_against,avg_game_margin,std_game_margin,...,def_eff,pythag_win_pct,luck,Seed,wins_vs_tourney_teams,away_wins_vs_tourney_teams,losses_vs_tourney_teams,losses_vs_non_tourney_teams,games_vs_tourney_teams,win_pct_vs_tourney_teams
0,2003,1102,28,12,16,0.428571,57.250000,57.000000,0.250000,15.878163,...,109.522922,0.515214,-0.086643,,1.0,0.0,7.0,9.0,8.0,0.125000
1,2003,1103,27,13,14,0.481481,78.777778,78.148148,0.629630,11.126039,...,116.489034,0.527877,-0.046395,,1.0,1.0,1.0,13.0,2.0,0.500000
2,2003,1104,28,17,11,0.607143,69.285714,65.000000,4.285714,13.149843,...,103.099630,0.708513,-0.101370,Y10,5.0,1.0,6.0,5.0,11.0,0.454545
3,2003,1105,26,7,19,0.269231,71.769231,76.653846,-4.884615,15.552212,...,106.384802,0.285796,-0.016565,,1.0,0.0,3.0,16.0,4.0,0.250000
4,2003,1106,28,13,15,0.464286,63.607143,63.750000,-0.142857,12.374266,...,99.544614,0.492199,-0.027913,,1.0,0.0,4.0,11.0,5.0,0.200000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6887,2015,1363,28,0,28,0.000000,52.535714,70.678571,-17.517241,10.230518,...,116.939425,0.015885,-0.015885,,0.0,0.0,5.0,23.0,5.0,0.000000
6888,2021,1152,9,0,9,0.000000,55.444444,88.666667,-29.900000,16.507271,...,128.972550,0.001456,-0.001456,,0.0,0.0,4.0,5.0,4.0,0.000000
6889,2022,1175,16,0,16,0.000000,57.062500,76.750000,-18.529412,13.856906,...,113.652642,0.015938,-0.015938,,0.0,0.0,0.0,16.0,0.0,
6890,2022,1237,19,0,19,0.000000,49.947368,68.526316,-17.650000,12.141973,...,114.044491,0.012140,-0.012140,,0.0,0.0,0.0,19.0,0.0,
