In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import copy
pd.set_option('max_columns',None)

In [2]:
tourney_res = pd.read_csv(r'G:\machine learning\kaggle\NACCM-2020\MDataFiles_Stage1\MNCAATourneyDetailedResults.csv')
season_res = pd.read_csv(r'G:\machine learning\kaggle\NACCM-2020\MDataFiles_Stage1\MRegularSeasonDetailedResults.csv')
seeds = pd.read_csv(r'G:\machine learning\kaggle\NACCM-2020\MDataFiles_Stage1\MNCAATourneySeeds.csv')
sample_sub = pd.read_csv(r'G:\machine learning\kaggle\NACCM-2020\MSampleSubmissionStage1_2020.csv')

In [3]:
def fix_data(df):
    df['WOR'] = df['WOR'] + df['WDR']
    df['LOR'] = df['LOR'] + df['LDR']
    df = df.rename(columns={'WOR':'WR','LOR':'LR'})
    df = df.drop(['WFGM3','WFGA3','WFTM','WFTA','WDR','WTO','WPF','LFGM3','LFGA3','LFTM','LFTA','LDR','LTO','LPF'],axis=1)
    return df
tourney_res = fix_data(tourney_res)
season_res = fix_data(season_res)

In [4]:
columns = ['WScore','LScore','WFGM','WFGA','WR','WAst','WStl','WBlk','LFGM','LFGA','LR','LAst','LStl','LBlk']
for column in columns:
    season_res.loc[season_res['NumOT']!=0, column] = season_res.loc[season_res['NumOT']!=0, column]*(40/(40+season_res['NumOT']*5))

In [5]:
def double_data(df):
    df_swap = df[['Season','DayNum','LTeamID','LScore','WTeamID','WScore','WLoc','NumOT','LFGM','LFGA','LR','LAst','LStl','LBlk','WFGM','WFGA','WR','WAst','WStl','WBlk']]
    df_swap.loc[df['WLoc']=='A','WLoc']='H'
    df_swap.loc[df['WLoc']=='H','WLoc']='A'
    df = df.rename(columns={'WLoc':'location'})
    df_swap = df_swap.rename(columns={'WLoc':'location'})
    df.columns=[x.replace('W','T1_').replace('L','T2_') for x in list(df.columns)]
    df_swap.columns=[x.replace('L','T1_').replace('W','T2_') for x in list(df_swap.columns)]
    res = pd.concat([df,df_swap]).sort_index(axis=0).reset_index(drop=True)
    return res
tourney_res = double_data(tourney_res)
season_res = double_data(season_res)

In [6]:
season_res['T1_TeamID'] = season_res['T1_TeamID'].astype(str)
season_res['T2_TeamID'] = season_res['T2_TeamID'].astype(str)

season_res['win'] = np.where(season_res['T1_Score']>season_res['T2_Score'], 1, 0)

def team_quality(season):
    formula = 'win~-1+T1_TeamID+T2_TeamID'
    glm = sm.GLM.from_formula(formula=formula, 
                              data=season_res.loc[season_res.Season==season,:], 
                              family=sm.families.Binomial()).fit()
    quality = pd.DataFrame(glm.params).reset_index()
    quality.columns = ['TeamID','beta']
    quality['Season'] = season
    quality['quality'] = np.exp(quality['beta'])
    quality = quality.loc[quality.TeamID.str.contains('T1_')].reset_index(drop=True)
    quality['TeamID'] = quality['TeamID'].apply(lambda x: x[10:14]).astype(int)
    return quality


team_quality = pd.concat([team_quality(2003),team_quality(2004),team_quality(2005),team_quality(2006),team_quality(2007),
                          team_quality(2008),team_quality(2009),team_quality(2010),team_quality(2011),team_quality(2012),
                          team_quality(2013),team_quality(2014),team_quality(2015),team_quality(2016),team_quality(2017),
                          team_quality(2018),team_quality(2019)]).reset_index(drop=True)
season_res['T1_TeamID'] = season_res['T1_TeamID'].astype(int)
season_res['T2_TeamID'] = season_res['T2_TeamID'].astype(int)

In [7]:
team_quality_T1 = team_quality[['TeamID','Season','quality']]
team_quality_T1.columns = ['T1_TeamID','Season','T1_quality']
team_quality_T2 = team_quality[['TeamID','Season','quality']]
team_quality_T2.columns = ['T2_TeamID','Season','T2_quality']

tourney_res['T1_TeamID'] = tourney_res['T1_TeamID'].astype(int)
tourney_res['T2_TeamID'] = tourney_res['T2_TeamID'].astype(int)
tourney_res = pd.merge(tourney_res,team_quality_T1, on = ['T1_TeamID','Season'], how = 'left')
tourney_res = pd.merge(tourney_res,team_quality_T2, on = ['T2_TeamID','Season'], how = 'left')

In [8]:
seeds['seed'] = seeds['Seed'].apply(lambda x: int(x[1:3]))
seeds['division'] = seeds['Seed'].apply(lambda x: x[0])

seeds_T1 = seeds[['Season','TeamID','seed','division']].copy()
seeds_T2 = seeds[['Season','TeamID','seed','division']].copy()
seeds_T1.columns = ['Season','T1_TeamID','T1_seed','T1_division']
seeds_T2.columns = ['Season','T2_TeamID','T2_seed','T2_division']

tourney_res = pd.merge(tourney_res,seeds_T1, on = ['Season', 'T1_TeamID'], how = 'left')
tourney_res = pd.merge(tourney_res,seeds_T2, on = ['Season', 'T2_TeamID'], how = 'left')
tourney_res['diff_seed'] = tourney_res['T1_seed'] - tourney_res['T2_seed']
tourney_res = tourney_res.drop(['T1_seed','T2_seed'], axis=1)

In [9]:
tourney_res['T1_powerrank'] = tourney_res.groupby(['Season','T1_division'])['T1_quality'].rank(method='dense', ascending=False).astype(int)
tourney_res['T2_powerrank'] = tourney_res.groupby(['Season','T2_division'])['T2_quality'].rank(method='dense', ascending=False).astype(int)

In [10]:
season_res['win'] = np.where((season_res['DayNum']>98)&(season_res['T1_Score']>season_res['T2_Score']), 1, 0)
season_res['games'] = np.where(season_res['DayNum']>98, 1, 0)
T1_season_summary = season_res.groupby(['Season','T1_TeamID']).agg({'win':'sum'}).reset_index()

group = season_res.groupby(['Season','T1_TeamID']).agg({'games':'sum'})
group.columns = ['play_games']
T1_season_summary = pd.merge(T1_season_summary, group, on=['Season','T1_TeamID'],how='left')

T1_season_summary['T1_win30days_ratio'] = T1_season_summary['win']/T1_season_summary['play_games']
season_res = season_res.drop(['win','games'],axis=1)
T1_season_summary = T1_season_summary.drop(['win','play_games'],axis=1)

In [11]:
group = season_res.groupby(['Season','T1_TeamID']).agg({'T1_Score':'mean'})
group.columns = ['T1_mean_Score']
T1_season_summary = pd.merge(T1_season_summary, group, on=['Season','T1_TeamID'], how='left')

group = season_res.groupby(['Season','T1_TeamID']).agg({'T1_Score':'median'})
group.columns = ['T1_median_Score']
T1_season_summary = pd.merge(T1_season_summary, group, on=['Season','T1_TeamID'], how='left')

season_res['T1_Diff_Score'] = season_res['T1_Score'] - season_res['T2_Score']
group = season_res.groupby(['Season','T1_TeamID']).agg({'T1_Diff_Score':'mean'})
group.columns = ['T1_mean_diff_score']
T1_season_summary = pd.merge(T1_season_summary, group, on=['Season', 'T1_TeamID'], how='left')
season_res = season_res.drop(['T1_Diff_Score'],axis=1)

In [12]:
season_res['T1_PFG'] = round(season_res['T1_FGM']/season_res['T1_FGA'],4)
season_res['T2_PFG'] = round(season_res['T2_FGM']/season_res['T2_FGA'],4)

group = season_res.groupby(['Season','T1_TeamID']).agg({'T1_PFG':'min'})
group.columns = ['T1_min_PFG']
T1_season_summary = pd.merge(T1_season_summary, group, on=['Season','T1_TeamID'], how='left')

group = season_res.groupby(['Season','T1_TeamID']).agg({'T1_PFG':'median'})
group.columns = ['T1_median_PFG']
T1_season_summary = pd.merge(T1_season_summary, group, on=['Season','T1_TeamID'], how='left')

group = season_res.groupby(['Season','T1_TeamID']).agg({'T1_PFG':'max'})
group.columns = ['T1_max_PFG']
T1_season_summary = pd.merge(T1_season_summary, group, on=['Season','T1_TeamID'], how='left')

group = season_res.groupby(['Season','T1_TeamID']).agg({'T2_PFG':'min'})
group.columns = ['T1_opposite_min_PFG']
T1_season_summary = pd.merge(T1_season_summary, group, on=['Season','T1_TeamID'], how='left')

In [13]:
group = season_res.groupby(['Season','T1_TeamID']).agg({'T1_FGM':'sum'})
group.columns = ['T1_season_FGM']
T1_season_summary = pd.merge(T1_season_summary, group, on=['Season','T1_TeamID'], how='left')
group = season_res.groupby(['Season','T1_TeamID']).agg({'T1_FGA':'sum'})
group.columns = ['T1_season_FGA']
T1_season_summary = pd.merge(T1_season_summary, group, on=['Season','T1_TeamID'], how='left')
T1_season_summary['T1_season_PFG'] = round(T1_season_summary['T1_season_FGM']/T1_season_summary['T1_season_FGA'], 4)
T1_season_summary = T1_season_summary.drop(['T1_season_FGM', 'T1_season_FGA'], axis=1)

group = season_res.groupby(['Season','T1_TeamID']).agg({'T2_FGM':'sum'})
group.columns = ['T2_season_FGM']
T1_season_summary = pd.merge(T1_season_summary, group, on=['Season','T1_TeamID'], how='left')
group = season_res.groupby(['Season','T1_TeamID']).agg({'T2_FGA':'sum'})
group.columns = ['T2_season_FGA']
T1_season_summary = pd.merge(T1_season_summary, group, on=['Season','T1_TeamID'], how='left')
T1_season_summary['T1_opposite_season_PFG'] = round(T1_season_summary['T2_season_FGM']/T1_season_summary['T2_season_FGA'], 4)
T1_season_summary = T1_season_summary.drop(['T2_season_FGM', 'T2_season_FGA'], axis=1)

In [14]:
group = season_res.groupby(['Season','T1_TeamID']).agg({'T1_R':'mean'})
group.columns = ['T1_season_R']
T1_season_summary = pd.merge(T1_season_summary, group, on=['Season','T1_TeamID'], how='left')

group = season_res.groupby(['Season','T1_TeamID']).agg({'T1_Ast':'mean'})
group.columns = ['T1_season_Ast']
T1_season_summary = pd.merge(T1_season_summary, group, on=['Season','T1_TeamID'], how='left')

group = season_res.groupby(['Season','T1_TeamID']).agg({'T1_Stl':'mean'})
group.columns = ['T1_season_Stl']
T1_season_summary = pd.merge(T1_season_summary, group, on=['Season','T1_TeamID'], how='left')

group = season_res.groupby(['Season','T1_TeamID']).agg({'T1_Blk':'mean'})
group.columns = ['T1_season_Blk']
T1_season_summary = pd.merge(T1_season_summary, group, on=['Season','T1_TeamID'], how='left')

In [15]:
T2_season_summary = copy.deepcopy(T1_season_summary)
T2_season_summary.columns = [x.replace('T1_','T2_') for x in list(T2_season_summary.columns)]

In [16]:
tourney_res['result'] = np.where(tourney_res['T1_Score']>tourney_res['T2_Score'], 1, 0)
tourney_res = tourney_res.drop(['DayNum','T1_Score','T2_Score','NumOT','T1_FGM','T1_FGA','T1_R','T1_Ast','T1_Stl',
                             'T1_Blk','T2_FGM','T2_FGA','T2_R','T2_Ast','T2_Stl','T2_Blk'],axis=1)

tourney_res = pd.merge(tourney_res, T1_season_summary, on=['Season','T1_TeamID'], how='left')
tourney_res = pd.merge(tourney_res, T2_season_summary, on=['Season','T2_TeamID'], how='left')

In [17]:
sample_sub['Season'] = sample_sub['ID'].apply(lambda x:int(x[0:4]))
sample_sub['T1_TeamID'] = sample_sub['ID'].apply(lambda x:int(x[5:9]))
sample_sub['T2_TeamID'] = sample_sub['ID'].apply(lambda x:int(x[-4:]))

sub = pd.DataFrame()
sub['ID'] = sample_sub['ID']
sample_sub = sample_sub.drop(['ID','Pred'], axis=1)

# sample_sub1 = copy.deepcopy(sample_sub)
# sample_sub2 = copy.deepcopy(sample_sub)
# sample_sub3 = copy.deepcopy(sample_sub)

sample_sub = pd.merge(sample_sub,team_quality_T1, on = ['T1_TeamID','Season'], how = 'left')
sample_sub = pd.merge(sample_sub,team_quality_T2, on = ['T2_TeamID','Season'], how = 'left')

sample_sub = pd.merge(sample_sub,seeds_T1, on = ['Season', 'T1_TeamID'], how = 'left')
sample_sub = pd.merge(sample_sub,seeds_T2, on = ['Season', 'T2_TeamID'], how = 'left')
sample_sub['diff_seed'] = sample_sub['T1_seed'] - sample_sub['T2_seed']
sample_sub = sample_sub.drop(['T1_seed','T2_seed'], axis=1)

sample_sub['T1_powerrank'] = tourney_res.groupby(['Season','T1_division'])['T1_quality'].rank(method='dense', ascending=False).astype(int)
sample_sub['T2_powerrank'] = tourney_res.groupby(['Season','T2_division'])['T2_quality'].rank(method='dense', ascending=False).astype(int)

sample_sub = pd.merge(sample_sub, T1_season_summary, on=['Season','T1_TeamID'], how='left')
sample_sub = pd.merge(sample_sub, T2_season_summary, on=['Season','T2_TeamID'], how='left')

In [18]:
train_y = tourney_res['result']
tourney_res = tourney_res.drop(['result','location'], axis=1)
train_x = tourney_res
test_data = sample_sub
train_x = pd.get_dummies(train_x)
test_data = pd.get_dummies(test_data)

In [19]:
from sklearn.model_selection import KFold,cross_val_score,train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb
from lightgbm.sklearn import LGBMRegressor

In [54]:
train_x = pd.get_dummies(train_x)
test_data = pd.get_dummies(test_data)

sub['pred'] = 0
temp = np.zeros(test_data.shape[0])

kfold = KFold(n_splits=10, shuffle = True, random_state= 12)
params = {'n_jobs':-1, 'learning_rate':0.01, 'n_estimators':1300, 'max_depth':8,
          'num_leaves':31, 'reg_alpha':1, 'reg_lambda':1, 'min_child_samples':20,
          'min_split_gain':0.7, 'colsample_bytree':0.3}
for train, test in kfold.split(train_x):
    X_train = train_x.iloc[train]
    y_train = train_y.iloc[train]
    X_test = train_x.iloc[test]
    y_test = train_y.iloc[test]
    lgb_train = lgb.Dataset(X_train, label=y_train, categorical_feature=['T1_division_W','T1_division_X','T1_division_Y','T1_division_Z',
                                                                         'T2_division_W','T2_division_X','T2_division_Y','T2_division_Z'])
    lgb_eval = lgb.Dataset(X_test, label=y_test, categorical_feature=['T1_division_W','T1_division_X','T1_division_Y','T1_division_Z',
                                                                         'T2_division_W','T2_division_X','T2_division_Y','T2_division_Z'])
    gbm = lgb.train(params, lgb_train, num_boost_round=500, valid_sets=lgb_eval)
    y_pred = gbm.predict(test_data)
    temp += y_pred/10
sub['pred'] = temp
sub.to_csv('lgb2_submission.csv', index=False)



In [38]:
train_x = pd.get_dummies(train_x)
test_data = pd.get_dummies(test_data)

sub['pred'] = 0
temp = np.zeros(test_data.shape[0])
kfold = KFold(n_splits=10, shuffle = True, random_state= 12)
lgb_model = LGBMRegressor(n_jobs=-1,
                          learning_rate=0.01,
                          n_estimators=1300,
                          max_depth=8,
                          num_leaves=31,
                          reg_alpha=1, 
                          reg_lambda=1,
                          min_child_samples=20,
                          min_split_gain=0.7,
                          colsample_bytree=0.3)
for train, test in kfold.split(train_x):
    X_train = train_x.iloc[train]
    y_train = train_y.iloc[train]
    X_test = train_x.iloc[test]
    y_test = train_y.iloc[test]
    lgb_model.fit(X_train,y_train)
    y_pred = lgb_model.predict(X=X_test)
    e = mean_squared_error(y_true=y_test,y_pred=y_pred)
    print(e)
lgb_pred = lgb_model.predict(test_data)
sub['pred'] = lgb_pred
sub.to_csv('lgb_submission2.csv', index=False)

0.1811592398939186
0.18691280303307936
0.1902236714284837
0.17197311671038196
0.18201892986907375
0.19328035409204
0.20176587229602908
0.19924814851806535
0.20510735399705904
0.18401838384085295


In [29]:
from sklearn.model_selection import GridSearchCV
cv_params = {'colsample_bytree':[0.7,0.8,0.9],
             'max_depth':[10,20,30],
             'learning_rate':[0.0001,0.0005,0.001]}
xgb_cv_model = xgb.XGBRegressor(colsample_bytree=0.9,
                          learning_rate=0.0005,
                          max_depth=10,
                          subsample=0.5,
                          objective='binary:logistic',
                          eval_metric='logloss',
                          min_child_weight=20,
                          gamma=0.25,
                          n_estimators=5000,
                          verbosity=0)
gs = GridSearchCV(xgb_cv_model,cv_params,scoring='r2',cv=4,verbose=1)
gs_result = gs.fit(X_train,y_train)
gs_result.best_params_

Fitting 4 folds for each of 27 candidates, totalling 108 fits


[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed: 25.1min finished


{'colsample_bytree': 0.9, 'learning_rate': 0.0005, 'max_depth': 10}

In [30]:
sub['pred'] = 0
kfold = KFold(n_splits=10, shuffle = True, random_state= 42)
xgb_model = xgb.XGBRegressor(colsample_bytree=0.98,
                             learning_rate=0.005,
                             max_depth=31,
                             subsample=1,
                             objective='binary:logistic',
                             eval_metric='logloss',
                             min_child_weight=3,
                             gamma=0.25,
                             n_estimators=5000,
                             verbosity=0)
for train, test in kfold.split(train_x):
    X_train = train_x.iloc[train]
    y_train = train_y.iloc[train]
    X_test = train_x.iloc[test]
    y_test = train_y.iloc[test]
    xgb_model.fit(X_train,y_train)
    y_pred = xgb_model.predict(X_test)
    e = mean_squared_error(y_true=y_test,y_pred=y_pred)
    print(e)
sub['pred'] = xgb_model.predict(test_data)
sub.to_csv('xgb_submission5.csv', index=False)

0.19744229108336459
0.216075413049697
0.19536075681035325
0.2453998039855302
0.23553662515278298
0.2337900119355137
0.20195461370685694
0.1835823207066073
0.20741240072745667
0.20795498308960844


In [None]:
# sub['pred'] = 0
# temp = np.zeros(test_data.shape[0])

# kfold = KFold(n_splits=10, shuffle = True, random_state= 42)
# params = {'colsample_bytree':0.8,'learning_rate':0.0003,'max_depth':31,'subsample':1,'objective':'binary:logistic','eval_metric':'logloss',
#           'min_child_weight':3,'gamma':0.25,'n_estimators':5000,'verbosity':0}
# for train, test in kfold.split(train_x):
#     X_train = train_x.iloc[train]
#     y_train = train_y.iloc[train]
#     X_test = train_x.iloc[test]
#     y_test = train_y.iloc[test]
#     lgb_train = lgb.Dataset(X_train, label=y_train, categorical_feature=['T1_division_W','T1_division_X','T1_division_Y','T1_division_Z',
#                                                                          'T2_division_W','T2_division_X','T2_division_Y','T2_division_Z'])
#     lgb_eval = lgb.Dataset(X_test, label=y_test, categorical_feature=['T1_division_W','T1_division_X','T1_division_Y','T1_division_Z',
#                                                                          'T2_division_W','T2_division_X','T2_division_Y','T2_division_Z'])
#     gbm = lgb.train(params, lgb_train, num_boost_round=500, valid_sets=lgb_eval)
#     y_pred = gbm.predict(test_data)
#     temp += y_pred/10
# sub['pred'] = temp
# sub.to_csv('lgb2_submission.csv', index=False)

In [31]:
# from sklearn.model_selection import GridSearchCV
# cv_params = {'min_child_sample':[20,30,40],
#             'min_split_gain':[0.3,0.5,0.7],
#             'colsample_bytree':[0.1,0.2,0.3]}
# lgb_cv_model = LGBMRegressor(n_jobs=-1,
#                           learning_rate=0.01,
#                           n_estimators=1300,
#                           max_depth=8,
#                           num_leaves=31,
#                           reg_alpha=1, 
#                           reg_lambda=1,
#                           min_child_samples=20,
#                           min_split_gain=0.7,
#                           colsample_bytree=0.3)
# gs = GridSearchCV(lgb_cv_model,cv_params,scoring='r2',cv=4,verbose=1)
# gs_result = gs.fit(X_train,y_train)
# gs_result.best_params_

Fitting 4 folds for each of 27 candidates, totalling 108 fits


[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed:   32.8s finished


{'colsample_bytree': 0.3, 'min_child_sample': 20, 'min_split_gain': 0.7}