In [1]:
import math
import optuna
import numpy as np
import pandas as pd
import seaborn as sns
from numba import jit

from tqdm.notebook import tqdm
from joblib import dump, load
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

import lightgbm as lgb

## Glicko Functions

In [2]:
@jit(nopython=True)
def get_rd(rd,t,c):
    """Widen a rating deviation before an update, capped at 350.

    rd: current rating deviation.
    t:  multiplier applied to c**2 (glicko_compute in this notebook passes 1).
    c:  growth constant; its square is added to rd**2 once per unit of t.

    Returns min(sqrt(rd**2 + t*c**2), 350).
    """
    widened = np.sqrt(rd**2+t*c**2)
    return min(widened,350)
@jit(nopython=True)
def get_grd(rd):
    """Return the attenuation factor 1 / sqrt(1 + 3*q2*rd**2 / pi2).

    The result is 1.0 when rd is 0 and shrinks towards 0 as rd grows.
    """
    q2 = 0.00003313686   # square of 0.00575646273 (the q constant used in get_r_rd)
    pi2 = 9.86960440109  # pi squared
    spread = (3*q2*(rd**2))/pi2
    return 1.0/np.sqrt(1+spread)
@jit(nopython=True)
def get_erd(rd,r0,ri,div):
    """Expected score of a side rated r0 against a side rated ri.

    rd:  rating deviation fed to get_grd to damp the rating gap.
    div: scale divisor of the base-10 logistic curve.

    The value is floored at 0.0001; there is no matching upper clamp.
    """
    exponent = (get_grd(rd)*(r0-ri))/-div
    expected = 1.0/(1.0+10**exponent)
    return max(expected,0.0001)
@jit(nopython=True)
def get_d2(rd,r0,ri,div):
    """Return 1 / (q2 * g**2 * E * (1 - E)).

    g is get_grd(rd) and E is get_erd(rd, r0, ri, div); the result is the
    variance-like term that get_r_rd and get_rd_final invert.
    """
    q2 = 0.00003313686
    g = get_grd(rd)
    e = get_erd(rd,r0,ri,div)
    return 1.0/(q2*(g**2)*e*(1-e))
@jit(nopython=True)
def get_r_rd(rd,rdi,r0,ri,si,t,c,div):
    """Apply one game result to a side's rating and rating deviation.

    rd, r0:  the side's deviation and rating before the game.
    rdi, ri: the opponent's deviation and rating.
    si:      outcome for this side (callers pass 1, 0 or 0.5).
    t, c:    forwarded to get_rd to widen rd before the update.
    div:     logistic scale divisor forwarded to get_erd / get_d2.

    Returns the tuple (new_rating, new_deviation).
    """
    q = 0.00575646273
    widened_rd = get_rd(rd,t,c)
    precision = (1/(widened_rd**2))+(1/get_d2(rdi,r0,ri,div))
    scale = (q/precision)
    surprise = si-get_erd(rdi,r0,ri,div)
    r = r0 + scale * get_grd(rdi)*surprise
    # NOTE(review): the final deviation is evaluated with the *updated* rating r,
    # not with r0 - confirm this is intended.
    return  r,get_rd_final(widened_rd,rdi,r,ri,div)
@jit(nopython=True)
def get_rd_final(rd,rdi,r0,ri,div):
    """Return sqrt(1 / (1/rd**2 + 1/d2)), with d2 = get_d2(rdi, r0, ri, div)."""
    precision = (1/rd**2)+(1/get_d2(rdi,r0,ri,div))
    return np.sqrt(precision**-1)

## Additional Features

In [3]:
def FE(path='/home/dominique/Projects/march_madness/ncaaw-march-mania-2021/ncaaw-march-mania-2021/WDataFiles_Stage2/WRegularSeasonDetailedResults.csv'):
    """Load the detailed regular-season results and prepare them for rating.

    Parameters
    ----------
    path : str, optional
        CSV file to read. Defaults to the location this notebook was
        originally run against, so ``FE()`` keeps working unchanged.

    Returns
    -------
    detail_season : pandas.DataFrame
        The games with ``WTeamID``/``LTeamID``/``WLoc`` renamed to
        ``TeamID0``/``TeamID1``/``GLoc``, every remaining ``W<stat>`` column
        renamed to ``<stat>0`` and every ``L<stat>`` column to ``<stat>1``,
        plus zero-filled ``<stat>{0,1}_r`` and ``<stat>{0,1}_rd`` columns.
    gvars : list of str
        The ``<stat>`` names, in file order, taken from the ``W<stat>`` columns.
    """
    detail_season = pd.read_csv(path)
    #FGM - field goals made (by the winning team)
    #FGA - field goals attempted (by the winning team)
    #FGM3 - three pointers made (by the winning team)
    #FGA3 - three pointers attempted (by the winning team)
    #FTM - free throws made (by the winning team)
    #FTA - free throws attempted (by the winning team)
    #OR - offensive rebounds (pulled by the winning team)
    #DR - defensive rebounds (pulled by the winning team)
    #Ast - assists (by the winning team)
    #TO - turnovers committed (by the winning team)
    #Stl - steals (accomplished by the winning team)
    #Blk - blocks (accomplished by the winning team)
    #PF - personal fouls committed (by the winning team)

    # numeric_only=True: WLoc holds strings, and DataFrame.mean() over a string
    # column raises TypeError on pandas >= 2.0. Only numeric gaps are imputed.
    column_means = detail_season.mean(numeric_only=True)
    detail_season = detail_season.fillna(column_means)
    detail_season = detail_season.rename(
        columns={"WTeamID": "TeamID0", "LTeamID": "TeamID1", "WLoc": "GLoc"}
    )

    # The three renames above run first so that only per-game statistics still
    # start with 'W' / 'L' when the prefixes are turned into 0 / 1 suffixes.
    new_cols = []
    gvars = []
    for c in detail_season.columns:
        if c.startswith('W'):
            new_cols.append(c[1:]+'0')
            gvars.append(c[1:])
        elif c.startswith('L'):
            new_cols.append(c[1:]+'1')
        else:
            new_cols.append(c)
    detail_season.columns = new_cols

    # Placeholder columns; glicko_compute later writes each side's pre-game
    # rating (_r) and rating deviation (_rd) into them.
    for v in gvars:
        for i in [0,1]:
            detail_season[v+str(i)+'_r'] = np.zeros(len(detail_season))
            detail_season[v+str(i)+'_rd'] = np.zeros(len(detail_season))

    return detail_season,gvars

In [4]:
def glicko_setup(gvars,detail_season):
    """Create the initial rating tables for every team found in the games.

    gvars:         statistic names; each gets a '<stat>_r' / '<stat>_rd' pair.
    detail_season: frame with 'TeamID0' and 'TeamID1' columns.

    Returns (team_glicko, team_season_glicko):
      team_glicko[team]                 -> {'<stat>_r': 1000, '<stat>_rd': 350, ...}
      team_season_glicko[season][team]  -> the same keys plus 'cov_mat': [[]],
                                           for every season in 2003..2021.
    """
    team0 = pd.unique(detail_season['TeamID0'])
    team1 = pd.unique(detail_season['TeamID1'])
    teams = list(set(team0)|set(team1))

    def fresh_ratings():
        # A new dict on every call so no two teams/seasons share state.
        ratings = {}
        for v in gvars:
            ratings[v +'_r'] = 1000
            ratings[v +'_rd'] = 350
        return ratings

    team_glicko = {t: fresh_ratings() for t in teams}

    team_season_glicko = {
        season: {t: dict(fresh_ratings(), cov_mat=[[]]) for t in teams}
        for season in range(2003,2022)
    }

    return team_glicko,team_season_glicko

In [5]:
def glicko_compute(team_glicko,team_season_glicko,detail_season,gvars,c,div):
    """Replay every game and rate each statistic as its own contest.

    For each row of ``detail_season`` and each statistic ``v`` in ``gvars``,
    the side with the larger ``v`` value is scored 1, the other 0, and equal
    values (or values that cannot be ordered, e.g. NaN) are scored 0.5 each.
    Both sides are then updated with ``get_r_rd`` using t=1.

    Parameters
    ----------
    team_glicko : dict
        ``team -> {'<v>_r', '<v>_rd'}`` running ratings; updated in place.
    team_season_glicko : dict
        ``season -> team -> {...}``; receives the post-game ratings of the
        row's season, so it ends up holding each team's last value per season.
    detail_season : pandas.DataFrame
        Games with ``TeamID0``, ``TeamID1``, ``Season``, ``<v>0``/``<v>1`` and
        the ``<v>{0,1}_r`` / ``<v>{0,1}_rd`` columns, which are overwritten
        with the *pre-game* ratings of each side.
    gvars : list of str
        Statistic names.
    c, div : numbers
        Forwarded to ``get_r_rd``.

    Returns
    -------
    (team_glicko, team_season_glicko, detail_season) - the same objects.
    """
    # Iterate over index labels: .at is label-based, so range(len(...)) was
    # only correct for a default RangeIndex.
    for i in tqdm(detail_season.index):

        # Per-game lookups, hoisted out of the per-statistic loop.
        team0 = detail_season.at[i,'TeamID0']
        team1 = detail_season.at[i,'TeamID1']
        season = detail_season.at[i,'Season']
        current0 = team_glicko[team0]
        current1 = team_glicko[team1]
        season0 = team_season_glicko[season][team0]
        season1 = team_season_glicko[season][team1]

        for v in gvars:
            r_key = v+'_r'
            rd_key = v+'_rd'

            team0r = current0[r_key]
            team0rd = current0[rd_key]
            team1r = current1[r_key]
            team1rd = current1[rd_key]

            var0 = detail_season.at[i,v+'0']
            var1 = detail_season.at[i,v+'1']

            # Outcome from team0's point of view; team1 gets the complement.
            if var0 > var1:
                score0 = 1
            elif var1 > var0:
                score0 = 0
            else:
                score0 = 0.5

            r0,rd0 = get_r_rd(team0rd,team1rd,team0r,team1r,score0,1,c,div)
            r1,rd1 = get_r_rd(team1rd,team0rd,team1r,team0r,1-score0,1,c,div)

            # The frame records what was known *before* the game.
            detail_season.at[i, v+'0_r'] = team0r
            detail_season.at[i, v+'0_rd'] = team0rd
            detail_season.at[i, v+'1_r'] = team1r
            detail_season.at[i, v+'1_rd'] = team1rd

            current0[r_key] = r0
            current0[rd_key] = rd0
            current1[r_key] = r1
            current1[rd_key] = rd1

            season0[r_key] = r0
            season0[rd_key] = rd0
            season1[r_key] = r1
            season1[rd_key] = rd1

    return team_glicko,team_season_glicko,detail_season

In [6]:
def compute_covariate_matrix(detail_season,team_season_glicko,gvars):
    """Store, per season and team, the covariance of its pre-game ratings.

    For every season in 2003..2021 and every team that appears in that
    season's rows, the '<v>0_r' / '<v>1_r' columns of the team's games are
    gathered in (own, opponent) order - swapped for rows where the team is
    TeamID1 - and their covariance matrix (variables in columns) is written
    to team_season_glicko[season][team]['cov_mat'].

    Returns the same team_season_glicko object.
    """
    own_then_opp = [v+side+'_r' for v in gvars for side in ('0','1')]
    opp_then_own = [v+side+'_r' for v in gvars for side in ('1','0')]
    # Relabels a TeamID1 row so its columns read from that team's perspective.
    to_own_view = dict(zip(opp_then_own, own_then_opp))

    for season in range(2003,2022):
        season_df = detail_season.loc[detail_season['Season'] == season]
        first_listed = pd.unique(season_df['TeamID0'])
        second_listed = pd.unique(season_df['TeamID1'])
        teams = list(set(first_listed)|set(second_listed))

        for t in teams:
            as_team0 = season_df.loc[season_df['TeamID0'] == t, own_then_opp]
            as_team1 = season_df.loc[season_df['TeamID1'] == t, opp_then_own]
            as_team1 = as_team1.rename(columns=to_own_view)
            # Restore the original row order before taking the covariance.
            X = pd.concat([as_team0, as_team1]).sort_index(kind='merge')
            team_season_glicko[season][t]['cov_mat'] = np.cov(X,rowvar=False)

    return team_season_glicko

In [7]:
def parse_string(string):
    """Split an underscore-separated ID and return its first three fields as ints.

    Example: '2021_3101_3102' -> (2021, 3101, 3102). Fields after the third
    are ignored.
    """
    fields = string.split("_")
    return tuple(int(fields[position]) for position in range(3))

In [8]:
def optuna_gamescore(year,team1,team2,gvars,team_season_glicko,params,ngames):
    """Simulate ngames match-ups and return the share that team1 wins.

    For each team, ngames rating vectors are drawn from a multivariate normal
    centred on the (own, opponent) '<g>_r' ratings of that season, using the
    team's own 'cov_mat'. A weighted game score is built from each draw's own
    ('...0') columns; team1 wins a simulation when its score is strictly larger.

    params holds 11 weights, in order: Score, FGM, FGA (subtracted),
    missed free throws FTA-FTM (subtracted), OR, DR, Stl, Ast, Blk,
    PF (subtracted), TO (subtracted).

    Draws come from NumPy's global random state, so results vary between
    calls unless the caller seeds it.
    """
    ratings1 = team_season_glicko[year][team1]
    ratings2 = team_season_glicko[year][team2]

    t1means = []
    t2means = []
    cols = []
    for g in gvars:
        own, other = ratings1[g+'_r'], ratings2[g+'_r']
        t1means.extend([own, other])
        t2means.extend([other, own])
        cols.extend([g+'0', g+'1'])

    # Order matters: team1 is sampled first, then team2.
    games0 = pd.DataFrame(data=np.random.multivariate_normal(t1means, ratings1['cov_mat'], ngames),columns=cols)
    games1 = pd.DataFrame(data=np.random.multivariate_normal(t2means, ratings2['cov_mat'], ngames),columns=cols)

    def weighted_score(games):
        # Terms are accumulated in the same left-to-right order for both teams.
        total = params[0]*games['Score0'] + params[1]*games['FGM0']
        total = total - params[2]*games['FGA0']
        total = total - params[3]*(games['FTA0']-games['FTM0'])
        total = total + params[4]*games['OR0']
        total = total + params[5]*games['DR0']
        total = total + params[6]*games['Stl0']
        total = total + params[7]*games['Ast0']
        total = total + params[8]*games['Blk0']
        total = total - params[9]*games['PF0']
        total = total - params[10]*games['TO0']
        return total

    margin = weighted_score(games0) - weighted_score(games1)

    return np.sum(margin.to_numpy() > 0)/ngames

In [9]:
# glicko_list = []
# for c in range(10,300,10):
#     div = 400
#     detail_season,gvars = FE()
#     team_glicko,team_season_glicko = glicko_setup(gvars,detail_season)
#     team_glicko,team_season_glicko,detail_season = glicko_compute(team_glicko,team_season_glicko,detail_season,gvars,c,div)
#     team_season_glicko = compute_covariate_matrix(detail_season,team_season_glicko,gvars)
#     glicko_list.append(team_season_glicko)

In [10]:
# dump(glicko_list,'/home/dominique/Projects/march_madness/ncaaw-march-mania-2021/ncaaw-march-mania-2021/WDataFiles_Stage2/glicko_list.pkl')
# dump(gvars,'/home/dominique/Projects/march_madness/ncaaw-march-mania-2021/ncaaw-march-mania-2021/WDataFiles_Stage2/gvars.pkl')

In [11]:
# Cached artefacts written by the (commented-out) build cells above.
# joblib.load unpickles its input, so only point it at files you produced yourself.
glicko_list = load('/home/dominique/Projects/march_madness/ncaaw-march-mania-2021/ncaaw-march-mania-2021/WDataFiles_Stage2/glicko_list.pkl')
gvars = load('/home/dominique/Projects/march_madness/ncaaw-march-mania-2021/ncaaw-march-mania-2021/WDataFiles_Stage2/gvars.pkl')

# One parameter list per cached rating table, all starting empty.
# NOTE(review): optuna_gamescore reads params[0]..params[10], so these lists must be
# filled before the cells below can run - the code that does so is not in this notebook.
global_params = [[] for _ in glicko_list]

In [12]:
file = '/home/dominique/Projects/march_madness/ncaaw-march-mania-2021/ncaaw-march-mania-2021/WDataFiles_Stage2/decade.csv'
sub = pd.read_csv(file)

# One simulated win-rate feature per cached rating table; the column suffix is
# (position + 1) * 10.
for ind,g in enumerate(glicko_list):
    name = 'glicko_gamescore_' + str((ind+1)*10)
    simulated = []
    for ID in sub['ID']:
        year,team1,team2 = parse_string(ID)
        simulated.append(optuna_gamescore(year,team1,team2,gvars,g,global_params[ind],10000))
    sub[name] = simulated

# Mirror every match-up (features -> 1 - features, label flipped) so the
# training set contains both orderings of each pair.
X = sub.drop(columns=['ID','Pred'])
X_opp = 1 - X
X = pd.concat([X, X_opp])
Y = sub['Pred']
Y_opp = (1 + Y) % 2
Y = pd.concat([Y, Y_opp])

model = LogisticRegressionCV(scoring='neg_log_loss',max_iter=1000,n_jobs=-1).fit(X,Y)
# The loss below is measured on the same rows the model was fitted on.
pred = model.predict_proba(X)
best = log_loss(Y,pred)
print(best)
dump(model,'/home/dominique/Projects/march_madness/ncaaw-march-mania-2021/ncaaw-march-mania-2021/WDataFiles_Stage2/model1.pkl')

0.3577408175931496


['/home/dominique/Projects/march_madness/ncaaw-march-mania-2021/ncaaw-march-mania-2021/WDataFiles_Stage2/model1.pkl']

In [16]:
# joblib.load unpickles its input - only load a model file you wrote yourself.
model1 = load('/home/dominique/Projects/march_madness/ncaaw-march-mania-2021/ncaaw-march-mania-2021/WDataFiles_Stage2/model1.pkl')
file = '/home/dominique/Projects/march_madness/ncaaw-march-mania-2021/ncaaw-march-mania-2021/WDataFiles_Stage2/WSampleSubmissionStage2.csv'
model_comp1 = pd.read_csv(file)

# Placeholder; replaced by the model output in the next cell.
pred = np.zeros(len(model_comp1))

# Same feature construction as for the training frame: one simulated
# win-rate column per cached rating table.
for ind,g in enumerate(glicko_list):
    name = 'glicko_gamescore_' + str((ind+1)*10)
    simulated = []
    for ID in model_comp1['ID']:
        year,team1,team2 = parse_string(ID)
        simulated.append(optuna_gamescore(year,team1,team2,gvars,g,global_params[ind],10000))
    model_comp1[name] = simulated

In [17]:
# Feature matrix for the submission rows (everything except ID and the placeholder Pred).
X = model_comp1.drop(columns=['ID','Pred'])
Y = model_comp1['Pred']

pred = model1.predict_proba(X)

# Re-read the sample submission and overwrite Pred with the probability in the
# second predict_proba column, row for row.
file = '/home/dominique/Projects/march_madness/ncaaw-march-mania-2021/ncaaw-march-mania-2021/WDataFiles_Stage2/WSampleSubmissionStage2.csv'
test = pd.read_csv(file)
test['Pred'] = pred[:,1]
test.to_csv('/home/dominique/Projects/march_madness/ncaaw-march-mania-2021/ncaaw-march-mania-2021/WDataFiles_Stage2/Submission1.csv',index=False)