In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

In [4]:
# Load the regular-season and tournament result files and stack them
# row-wise into a single frame.
rdat = pd.read_csv('RegularSeasonCompactResults.csv')
tdat = pd.read_csv('TourneyCompactResults.csv')
# NOTE: concat is called without ignore_index, so every row keeps the label
# it had in its source frame and labels can repeat in `dat`.
dat = pd.concat([rdat, tdat])
dat.head()

Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot
0,1985,20,1228,81,1328,64,N,0
1,1985,25,1106,77,1354,70,H,0
2,1985,25,1112,63,1223,56,H,0
3,1985,25,1165,70,1432,54,H,0
4,1985,25,1192,86,1447,74,H,0


In [43]:
# Summary statistics of the Season column in the tournament results.
tdat.Season.describe()

count    2050.000000
mean     2000.676585
std         9.274791
min      1985.000000
25%      1993.000000
50%      2001.000000
75%      2009.000000
max      2016.000000
Name: Season, dtype: float64

In [25]:
# Independent copy of the four columns needed to identify a game and its
# outcome, so later column additions do not touch `dat`.
games = dat[['Season', 'Daynum', 'Wteam', 'Lteam']].copy()

In [26]:
# Row-wise smaller and larger of the two team ids, giving every matchup a
# representation that does not depend on which side won.
games['Min_team'] = games[['Wteam', 'Lteam']].min(axis=1)
games['Max_team'] = games[['Wteam', 'Lteam']].max(axis=1)

In [27]:
# Inspect the new Min_team / Max_team columns.
games.head()

Unnamed: 0,Season,Daynum,Wteam,Lteam,Min_team,Max_team
0,1985,20,1228,1328,1228,1328
1,1985,25,1106,1354,1106,1354
2,1985,25,1112,1223,1112,1223
3,1985,25,1165,1432,1165,1432
4,1985,25,1192,1447,1192,1447


In [28]:
# Matchup key "<Min_team>_<Max_team>", identical regardless of which team won.
# Built with vectorised casts instead of a per-element Python lambda.
games.loc[:, 'game'] = (
    games['Min_team'].astype(int).astype(str)
    + '_'
    + games['Max_team'].astype(int).astype(str)
)

In [29]:
# Inspect the new `game` key column.
games.head()

Unnamed: 0,Season,Daynum,Wteam,Lteam,Min_team,Max_team,game
0,1985,20,1228,1328,1228,1328,1228_1328
1,1985,25,1106,1354,1106,1354,1106_1354
2,1985,25,1112,1223,1112,1223,1112_1223
3,1985,25,1165,1432,1165,1432,1165_1432
4,1985,25,1192,1447,1192,1447,1192_1447


In [30]:
# Outcome flag: True when the winner is the team with the smaller id.
games['Min_team_win'] = games['Wteam'].eq(games['Min_team'])

In [31]:
# Inspect the new Min_team_win column.
games.head()

Unnamed: 0,Season,Daynum,Wteam,Lteam,Min_team,Max_team,game,Min_team_win
0,1985,20,1228,1328,1228,1328,1228_1328,True
1,1985,25,1106,1354,1106,1354,1106_1354,True
2,1985,25,1112,1223,1112,1223,1112_1223,True
3,1985,25,1165,1432,1165,1432,1165_1432,True
4,1985,25,1192,1447,1192,1447,1192_1447,True


In [32]:
# Inspect the last rows of the combined frame.
games.tail()

Unnamed: 0,Season,Daynum,Wteam,Lteam,Min_team,Max_team,game,Min_team_win
2045,2016,146,1314,1323,1314,1323,1314_1323,True
2046,2016,146,1393,1438,1393,1438,1393_1438,True
2047,2016,152,1314,1393,1314,1393,1314_1393,True
2048,2016,152,1437,1328,1328,1437,1328_1437,False
2049,2016,154,1437,1314,1314,1437,1314_1437,False


In [81]:
# All recorded meetings of one matchup, in chronological order.
sample = (
    games[games['game'] == '1276_1409']
    .sort_values(['Season', 'Daynum'])
)

In [82]:
# Display the rows selected for the sample matchup.
sample

Unnamed: 0,Season,Daynum,Wteam,Lteam,Min_team,Max_team,game,Min_team_win
1986,2016,135,1276,1409,1276,1409,1276_1409,True


In [41]:
# Rows of `sample` played after day 132 (the same value that is assigned to
# last_reg_day elsewhere in this notebook).
# NOTE(review): the saved output below lists game 1314_1323, whereas the cell
# that defines `sample` selects '1276_1409' -- the output comes from an
# earlier kernel state and will differ after Restart & Run All.
sample.loc[sample.Daynum > 132]

Unnamed: 0,Season,Daynum,Wteam,Lteam,Min_team,Max_team,game,Min_team_win
37,1985,138,1314,1323,1314,1323,1314_1323,True
175,1987,143,1314,1323,1314,1323,1314_1323,True
2045,2016,146,1314,1323,1314,1323,1314_1323,True


In [42]:
# Daynum cutoff read by make_trainset: games of the target season with
# Daynum <= this value feed the features, games with Daynum > this value
# become the labels. Presumably the last day of the regular season -- confirm
# against the data description.
last_reg_day = 132

In [45]:
def get_winning_index(win_rate, n_games):
    """Combine a win rate and a game count into a single score.

    The score is ``(win_rate + 0.01) ** (n_games + 1)``. Both arguments may be
    scalars or element-wise compatible objects such as pandas Series.
    """
    shifted_rate = win_rate + 0.01  # offset keeps a zero win rate above 0
    exponent = n_games + 1
    return shifted_rate ** exponent

In [70]:
def _matchup_index(frame, name):
    """Return the winning index of every 'game' in ``frame`` as a Series named ``name``."""
    grouped = frame.groupby('game')
    n_games = grouped.size()
    # Share of the meetings won by the lower-id team.
    win_rate = grouped['Min_team_win'].sum() / n_games
    return get_winning_index(win_rate, n_games).rename(name)


def make_trainset(dat, year, last_day=None):
    """Build the labelled rows for one season.

    Parameters
    ----------
    dat : DataFrame
        Must provide the columns 'Season', 'Daynum', 'game' and 'Min_team_win'.
    year : int
        Season whose games after the cutoff become the labelled rows.
    last_day : int, optional
        Daynum cutoff. Defaults to the notebook-level ``last_reg_day``.

    Returns
    -------
    DataFrame
        Columns 'Season', 'game', 'Min_team_win', 'winning_index' and
        'twinning_index', one row per game of ``year`` with Daynum > cutoff.
        'winning_index' is computed from every earlier game of the matchup
        (all previous seasons plus ``year`` up to the cutoff);
        'twinning_index' uses only those earlier games played after the
        cutoff day. Matchups with no history get ``get_winning_index(0, 1)``.
    """
    if last_day is None:
        last_day = last_reg_day

    in_year = dat.Season == year
    feature_dat = dat.loc[(dat.Season < year) | (in_year & (dat.Daynum <= last_day))]
    label_dat = dat.loc[in_year & (dat.Daynum > last_day), ['Season', 'game', 'Min_team_win']]

    features = pd.concat(
        [
            _matchup_index(feature_dat, 'winning_index'),
            _matchup_index(feature_dat.loc[feature_dat.Daynum > last_day], 'twinning_index'),
        ],
        axis=1,
    )

    default_index = get_winning_index(0, 1)
    res = label_dat.join(features, on='game', how='left')
    # Fill only the feature columns, so a missing Season or label value is
    # never silently replaced by the default index.
    return res.fillna({'winning_index': default_index, 'twinning_index': default_index})

In [71]:
# Smoke-test make_trainset on the single-matchup `sample` frame.
# NOTE(review): the saved output below is for game 1314_1323, not the
# '1276_1409' matchup that `sample` is assigned in this notebook, so it was
# produced by an earlier kernel state.
make_trainset(sample, 2016)

Unnamed: 0,Season,game,Min_team_win,winning_index,twinning_index
2045,2016,1314_1323,True,0.001932,1.030301


In [68]:
# Same feature-window filter that make_trainset applies, written out for
# season 1986: every earlier season plus 1986 games up to last_reg_day.
feature_dat = games.loc[
    (games['Season'] < 1986)
    | ((games['Season'] == 1986) & (games['Daynum'] <= last_reg_day))
]

In [69]:
# Number of rows and columns in the 1986 feature window.
feature_dat.shape

(7583, 8)

In [83]:
# Build the labelled rows for each season 1986-2015 and concatenate them once.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside the
# loop re-copied all accumulated rows on every iteration.
season_frames = []
for year in range(1986, 2016):
    season_frames.append(make_trainset(games, year))
train_set = pd.concat(season_frames)

In [84]:
# Inspect the first labelled rows of the assembled training set.
train_set.head()

Unnamed: 0,Season,game,Min_team_win,winning_index,twinning_index
63,1986,1133_1431,True,0.0001,0.0001
64,1986,1177_1438,True,0.0001,0.0001
65,1986,1181_1290,True,0.0001,0.0001
66,1986,1207_1403,True,0.0001,0.0001
67,1986,1210_1265,True,0.0001,0.0001


In [85]:
# Summary statistics of the numeric columns of the training set.
train_set.describe()

Unnamed: 0,Season,winning_index,twinning_index
count,1920.0,1920.0,1920.0
mean,2000.65625,0.1472591,0.06318838
std,8.693829,0.3428586,0.2417776
min,1986.0,6.16281e-22,1e-08
25%,1993.0,0.0001,0.0001
50%,2001.0,0.0001,0.0001
75%,2008.0,0.001188138,0.0001
max,2015.0,1.104622,1.06152
