# Imports

In [None]:
import pandas as pd
import numpy as np
import re
import random
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel


# Use only season data

In [None]:
# Regular-season detailed results: the source table for the box-score based
# features built in the following cells.
season = pd.read_csv(
    '../input/ncaaw-march-mania-2021/WDataFiles_Stage2/WRegularSeasonDetailedResults.csv'
)

# Feature Engineering

### Possession features

In [None]:
# Possession estimate per team and game (formula taken from basketball
# analytics blogs): FGA - OR + TO + 0.475 * FTA.
season['Wpossession'] = season.WFGA - season.WOR + season.WTO + (0.475 * season.WFTA)
season['Lpossession'] = season.LFGA - season.LOR + season.LTO + (0.475 * season.LFTA)

# Per (Season, team): number of games and mean possessions, computed
# separately over the games the team won and the games it lost.
Wposs = season.groupby(['Season', 'WTeamID']).Wpossession.agg(['count', 'mean']).reset_index()
Wposs = Wposs.rename(columns={'count': 'nb_win', 'mean': 'Wposs'})
Lposs = season.groupby(['Season', 'LTeamID']).Lpossession.agg(['count', 'mean']).reset_index()
Lposs = Lposs.rename(columns={'count': 'nb_loss', 'mean': 'Lposs'})

# Outer merge on a shared TeamID key: a left merge from the wins table would
# drop winless teams and leave NaN counts (hence a NaN average) for
# undefeated teams, which have no row in the losses table.
poss = Wposs.rename(columns={'WTeamID': 'TeamID'}).merge(
    Lposs.rename(columns={'LTeamID': 'TeamID'}),
    on=['Season', 'TeamID'],
    how='outer',
)
# A missing side means "zero games of that kind", so it must weigh nothing.
poss = poss.fillna({'nb_win': 0, 'Wposs': 0, 'nb_loss': 0, 'Lposs': 0})

# Game-weighted mean of the two per-outcome averages = mean over all games.
poss['avg_poss'] = (poss.nb_win * poss.Wposs + poss.nb_loss * poss.Lposs) / (poss.nb_win + poss.nb_loss)
poss = poss[['Season', 'TeamID', 'avg_poss']]

### Efficiency Features

In [None]:
# Efficiency = points per estimated possession (formulas taken from
# basketball analytics blogs). Points are rebuilt from the box score as
# 2 * FGM + FGM3 + FTM.
season['WOE'] = (2 * season.WFGM + season.WFGM3 + season.WFTM) / season.Wpossession
season['LOE'] = (2 * season.LFGM + season.LFGM3 + season.LFTM) / season.Lpossession
# A team's defensive efficiency is the offensive efficiency of its opponent.
season['WDE'] = season['LOE']
season['LDE'] = season['WOE']

# One row per (game, team): winner rows first, then loser rows.
team1 = season[['Season', 'WTeamID', 'WOE', 'WDE']].rename(
    columns={'WTeamID': 'TeamID', 'WOE': 'OE', 'WDE': 'DE'}
)
team2 = season[['Season', 'LTeamID', 'LOE', 'LDE']].rename(
    columns={'LTeamID': 'TeamID', 'LOE': 'OE', 'LDE': 'DE'}
)
oe = pd.concat([team1, team2])

# offensive and defensive efficiency, averaged per team and season
oe = oe.groupby(['Season', 'TeamID'])[['OE', 'DE']].mean().reset_index()

In [None]:
# Box-score statistics shared by the winner (W-prefixed) and loser
# (L-prefixed) columns of the detailed results.
_stat_names = ['FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR',
               'Ast', 'TO', 'Stl', 'Blk', 'PF']

# Winner rows, with the W prefix stripped from the column names.
team1 = season[['Season', 'WTeamID'] + ['W' + stat for stat in _stat_names]]
team1.columns = ['Season', 'TeamID'] + _stat_names

# Loser rows, with the L prefix stripped from the column names.
team2 = season[['Season', 'LTeamID'] + ['L' + stat for stat in _stat_names]]
team2.columns = ['Season', 'TeamID'] + _stat_names

# One row per (game, team).
detailed = pd.concat([team1, team2])
detailed['FGM2'] = detailed.FGM - detailed.FGM3

# shooting features: made / attempted, per game
detailed['shooting'] = detailed.FGM / detailed.FGA
detailed['three_shooting'] = detailed.FGM3 / detailed.FGA3

# Average the per-game percentages for each team and season.
shoot = (
    detailed
    .groupby(['Season', 'TeamID'])[['shooting', 'three_shooting']]
    .mean()
    .reset_index()
)
shoot.head()

### Rebounds Features

In [None]:
# Offensive rebound rate: own offensive rebounds divided by all rebounds
# available on that end (own offensive + opponent defensive).
season['WOFR'] = season.WOR / (season.WOR + season.LDR)
season['LOFR'] = season.LOR / (season.LOR + season.WDR)
# Defensive rebound rate is the complement of the opponent's offensive rate.
season['WDFR'] = 1 - season.LOFR
season['LDFR'] = 1 - season.WOFR

# Per (Season, team): number of games and mean rate, separately for the
# games the team won and the games it lost.
wofr = season.groupby(['Season', 'WTeamID']).WOFR.agg(['count', 'mean']).reset_index()
wofr = wofr.rename(columns={'count': 'nb_win', 'mean': 'WOFR'})
lofr = season.groupby(['Season', 'LTeamID']).LOFR.agg(['count', 'mean']).reset_index()
lofr = lofr.rename(columns={'count': 'nb_loss', 'mean': 'LOFR'})

# Outer merge on a shared TeamID key: a left merge from the wins table would
# drop winless teams and leave NaN counts (hence a NaN average) for
# undefeated teams, which have no row in the losses table.
ofr = wofr.rename(columns={'WTeamID': 'TeamID'}).merge(
    lofr.rename(columns={'LTeamID': 'TeamID'}),
    on=['Season', 'TeamID'],
    how='outer',
)
# A missing side means "zero games of that kind", so it must weigh nothing.
ofr = ofr.fillna({'nb_win': 0, 'WOFR': 0, 'nb_loss': 0, 'LOFR': 0})

# Game-weighted mean of the two per-outcome averages.
ofr['avg_ofr'] = (ofr.nb_win * ofr.WOFR + ofr.nb_loss * ofr.LOFR) / (ofr.nb_win + ofr.nb_loss)
ofr = ofr[['Season', 'TeamID', 'avg_ofr']]
ofr.head()

### Turnover Features

In [None]:
# Reload the detailed results so this section starts from the raw columns,
# then recompute the per-game possession estimates used for the turnover rate.
season = pd.read_csv(
    '../input/ncaaw-march-mania-2021/WDataFiles_Stage2/WRegularSeasonDetailedResults.csv'
)
season.columns  # bare expression; nothing is displayed from the middle of a cell
# possessions = FGA - OR + TO + 0.475 * FTA
season['Wpossession'] = season.WFGA - season.WOR + season.WTO + 0.475 * season.WFTA
season['Lpossession'] = season.LFGA - season.LOR + season.LTO + 0.475 * season.LFTA

In [None]:
# Stack winner and loser rows into one (game, team) table.
team1 = season[['Season', 'WTeamID', 'WTO', 'Wpossession']].rename(
    columns={'WTeamID': 'TeamID', 'WTO': 'TO', 'Wpossession': 'possession'}
)
team2 = season[['Season', 'LTeamID', 'LTO', 'Lpossession']].rename(
    columns={'LTeamID': 'TeamID', 'LTO': 'TO', 'Lpossession': 'possession'}
)
detailed = pd.concat([team1, team2])

# turnover features: turnovers per estimated possession, per game
detailed['avg_turnover'] = detailed.TO / detailed.possession

# Average of the per-game turnover rates for each team and season.
turnover = detailed.groupby(['Season', 'TeamID']).avg_turnover.mean().reset_index()
turnover.head()

### Number of win and score features

In [None]:
# Compact regular-season results: one row per game with winner/loser ids and
# scores.
season = pd.read_csv('../input/ncaaw-march-mania-2021/WDataFiles_Stage2/WRegularSeasonCompactResults.csv')
# Winner minus loser, so this margin is positive for every game.
season['score_gap'] = season.WScore - season.LScore

# Number of wins / losses per team and season.
nb_win = (
    season.groupby(['Season', 'WTeamID']).size()
    .reset_index(name='nbWin')
    .rename(columns={'WTeamID': 'TeamID'})
)
nb_loss = (
    season.groupby(['Season', 'LTeamID']).size()
    .reset_index(name='nbLoss')
    .rename(columns={'LTeamID': 'TeamID'})
)

# Outer merge keeps undefeated teams (no losses row) and winless teams
# (no wins row); the missing count is a genuine zero.
winRate = nb_win.merge(nb_loss, on=['Season', 'TeamID'], how='outer').fillna(0)
winRate['win_ratio'] = winRate.nbWin / (winRate.nbWin + winRate.nbLoss)

# Average margin in wins and in losses. The column is selected before
# aggregating so that non-numeric columns of the file cannot break the mean.
gap_win = (
    season.groupby(['Season', 'WTeamID']).score_gap.mean()
    .reset_index()
    .rename(columns={'WTeamID': 'TeamID', 'score_gap': 'avg_score_gapW'})
)
gap_loss = (
    season.groupby(['Season', 'LTeamID']).score_gap.mean()
    .reset_index()
    .rename(columns={'LTeamID': 'TeamID', 'score_gap': 'avg_score_gapL'})
)

avg_gap = gap_win.merge(gap_loss, on=['Season', 'TeamID'], how='outer').fillna(0)

# From here on `season` is the per-team summary table read by merge_all.
season = winRate.merge(avg_gap, on=['Season', 'TeamID'], how='left')
# avg_score_gapL is a positive losing margin (winner minus loser), so it is
# subtracted: losses must pull the average point differential down.
season['avg_score_gap'] = (
    season.avg_score_gapW * season.nbWin - season.avg_score_gapL * season.nbLoss
) / (season.nbWin + season.nbLoss)

### Team Seed

In [None]:
def extract_seed(seed):
    """
    Return the numeric part of a seed label.

    Every character that is not an ASCII digit is discarded and the remaining
    digits are parsed together as one integer.

    Args:
    -------------
    seed (str): raw seed label

    Returns:
    -------------
    int: the digits found in ``seed``, read as a single number
    """
    digits = "".join(re.findall('[0-9]', seed))
    return int(digits)

# Tournament seeds per team and season.
seed = pd.read_csv('../input/ncaaw-march-mania-2021/WDataFiles_Stage2/WNCAATourneySeeds.csv')

# Replace the raw seed label by its numeric value.
seed['Seed'] = seed['Seed'].map(extract_seed)
seed.head()

### Functions

In [None]:
def merge_all(df):
    """
    Attach every engineered team feature to a table of match-ups.

    Each feature table is left-merged twice, once on (Season, Team1) and once
    on (Season, Team2); the merged feature columns get the suffix ``1`` or
    ``2`` accordingly. Three difference columns (WinRatioDiff, GapAvgDiff,
    SeedDiff) are appended at the end.

    The feature tables are the module-level frames ``oe``, ``turnover``,
    ``ofr``, ``shoot``, ``poss``, ``season`` and ``seed``.

    Args:
    ----------------
    df (dataframe): match-ups with at least the columns Season, Team1, Team2

    Returns:
    ----------------
    dataframe: ``df`` with the feature columns added
    """
    # (feature table, columns that receive the team suffix), in merge order.
    feature_tables = [
        (oe, ['OE', 'DE']),
        (turnover, ['avg_turnover']),
        (ofr, ['avg_ofr']),
        (shoot, ['shooting', 'three_shooting']),
        (poss[['Season', 'TeamID', 'avg_poss']], ['avg_poss']),
        (season[['Season', 'TeamID', 'win_ratio', 'avg_score_gap']],
         ['win_ratio', 'avg_score_gap']),
        (seed, ['Seed']),
    ]

    for table, feature_cols in feature_tables:
        for slot in ('1', '2'):
            df = df.merge(
                table,
                left_on=['Season', 'Team' + slot],
                right_on=['Season', 'TeamID'],
                how='left',
            ).drop(columns='TeamID')
            df = df.rename(columns={col: col + slot for col in feature_cols})

    # Team1-minus-Team2 differences.
    df['WinRatioDiff'] = df['win_ratio1'] - df['win_ratio2']
    df['GapAvgDiff'] = df['avg_score_gap1'] - df['avg_score_gap2']
    df['SeedDiff'] = df['Seed1'] - df['Seed2']

    return df

In [None]:
def add_loosing_matches(win_df):
    """
    Double the dataset by listing every game from both teams' points of view.

    The first half of the result has the winner as Team1/Score1 and the loser
    as Team2/Score2; the second half has the roles swapped. The original row
    index is kept, so every index value appears twice.

    Args:
    -----------------
    win_df (dataframe): tourney results with the columns WTeamID, WScore,
        LTeamID and LScore. Needs to be done before merge_all.

    Returns:
    -----------------
    dataframe: new dataframe with twice as many rows; ``win_df`` is not
        modified.
    """
    win_rename = {
        "WTeamID": "Team1",
        "WScore": "Score1",
        "LTeamID": "Team2",
        "LScore": "Score2",
    }

    lose_rename = {
        "WTeamID": "Team2",
        "WScore": "Score2",
        "LTeamID": "Team1",
        "LScore": "Score1",
    }

    # rename() returns new frames, so the caller's dataframe stays untouched.
    as_winner = win_df.rename(columns=win_rename)
    as_loser = win_df.rename(columns=lose_rename)

    # `axis` is passed by keyword: pandas 2.0 accepts only `objs` positionally.
    return pd.concat([as_winner, as_loser], axis=0, sort=False)

In [None]:
# Tournament results from 2010 onwards, reduced to the ids and scores.
df = pd.read_csv('../input/ncaaw-march-mania-2021/WDataFiles_Stage2/WNCAATourneyCompactResults.csv')
df = df[['Season', 'WTeamID', 'LTeamID', 'WScore', 'LScore']]
df = df[df.Season >= 2010].reset_index(drop=True)

# Every game twice: once with the winner as Team1, once with the loser.
df = add_loosing_matches(df)
# target = 1 when Team1 outscored Team2, else 0
df['target'] = (df.Score1 - df.Score2 > 0).astype(int)

df = df[['Season', 'Team1', 'Team2', 'target']]

In [None]:
# Attach the engineered features to each match-up.
df = merge_all(df)
# Everything except the identifiers and the label is a candidate feature.
_non_feature_cols = ('Season', 'Team1', 'Team2', 'target')
columns = [name for name in df.columns if name not in _non_feature_cols]

# Model Training

### The following lgb parameters were obtained using Optuna.

In [None]:
# LightGBM hyper-parameters. The last two entries (metric, n_jobs) are fixed
# settings; the others correspond to the values searched in the Optuna section.
lgb_params = dict(
    learning_rate=0.009828949477606631,
    max_depth=54,
    num_leaves=110,
    reg_alpha=0.1737439152261937,
    reg_lambda=0.0057611709106477615,
    n_estimators=389,
    colsample_bytree=0.3219603496785809,
    min_child_samples=34,
    subsample_freq=2,
    subsample=0.8621417360868591,
    max_bin=143,
    min_data_per_group=177,
    cat_smooth=87,
    cat_l2=13,
    metric='binary_logloss',
    n_jobs=-1,
)

In [None]:
# Features kept after a feature-importance based selection. This replaces
# any earlier value of `columns`.
columns = [
    # offensive / defensive efficiency
    'OE1', 'DE1', 'OE2', 'DE2',
    # turnover rate
    'avg_turnover1', 'avg_turnover2',
    # shooting percentages
    'shooting1', 'three_shooting1', 'shooting2', 'three_shooting2',
    # win ratio
    'win_ratio1', 'win_ratio2', 'WinRatioDiff',
    # seeds and differences
    'Seed1', 'Seed2', 'GapAvgDiff', 'SeedDiff',
]

In [None]:
def cross_val(df,test=None,lgb_params=lgb_params,min_train_seasons=5):

    """
    Time-ordered cross validation: for each validation season the model is
    trained on all strictly earlier seasons and scored (log loss) on that
    season.

    For example: to validate on season 2018, the model is trained on every
    season before 2018. The first ``min_train_seasons`` seasons are only ever
    used for training.

    The feature set is the module-level list ``columns``.

    Args:
    ---------------------
    df (dataframe): dataframe resulted of the feature engineering
    test (dataframe): dataframe resulted of the feature engineering on the
        test set; when given, each fold's model also predicts on it
    lgb_params (dict): params for the LGBM model
    min_train_seasons (int): number of earliest seasons reserved for training

    Returns:
    ---------------------
    tuple: (list with one array of test predictions per fold, empty when
        ``test`` is None; the model fitted in the last fold; mean log loss
        over the folds)

    Raises:
    ---------------------
    ValueError: when ``df`` has no season left to validate on
    """

    # unique() keeps order of appearance; sort so the slice below really
    # skips the earliest seasons.
    seasons = np.sort(df.Season.unique())
    if len(seasons) <= min_train_seasons:
        raise ValueError(
            f'Need more than {min_train_seasons} seasons to cross validate, '
            f'got {len(seasons)}.'
        )

    metrics = []
    pred_tests = []

    for val_season in seasons[min_train_seasons:]:
        print(f'Season {val_season} results calculation:')

        train_rows = df.Season < val_season
        valid_rows = df.Season == val_season

        X_train = df.loc[train_rows, columns]
        y_train = df.loc[train_rows, 'target']
        X_valid = df.loc[valid_rows, columns]
        y_valid = df.loc[valid_rows, 'target']

        model = LGBMClassifier(**lgb_params)
        # No `verbose` argument: LightGBM 4 removed it from fit().
        model.fit(X_train, y_train)

        if test is not None:
            pred_tests.append(model.predict_proba(test[columns])[:, 1])

        predictions = model.predict_proba(X_valid)[:, 1]
        metrics.append(log_loss(y_valid, predictions))

        print(metrics[-1])
        print('\n')

    cv_score = np.mean(metrics)
    print('Cross Validation Score:')
    print(cv_score)

    return pred_tests, model, cv_score

In [None]:
# LGBM model: cross validate with the default parameters and no test set,
# keeping the last fold's model and discarding the score.
pred_tests, model, _ = cross_val(df)

# Optuna Tuning to find LGBM best hyper parameters

In [None]:
def objective(trial,df=df):
    """
    Optuna objective: sample one set of LGBM parameters and return its
    cross-validation score from ``cross_val`` (lower is better).

    Every sampled value is registered under the same name as the LGBM
    parameter it feeds, so the keys of ``study.best_params`` match the
    keyword names used in ``lgb_params``.

    Args:
    ---------------------
    trial (optuna trial): trial used to sample the parameters
    df (dataframe): dataframe resulted of the feature engineering

    Returns:
    ---------------------
    float: mean cross-validation log loss for the sampled parameters
    """

    lgb_params={
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-2),
        'max_depth': trial.suggest_int('max_depth', 6, 127),
        'num_leaves': trial.suggest_int('num_leaves', 31, 128),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0),
        'metric': 'binary_logloss',
        # Registered as 'n_estimators' (not 'estimators') so that the key in
        # study.best_params is a valid LGBM keyword.
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'n_jobs': -1,
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 0.9),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.3, 0.9),
        'max_bin': trial.suggest_int('max_bin', 128, 1024),
        'min_data_per_group': trial.suggest_int('min_data_per_group', 50, 200),
        'cat_smooth': trial.suggest_int('cat_smooth', 10, 100),
        'cat_l2': trial.suggest_int('cat_l2', 1, 20)}

    # Only the mean score is needed; predictions and model are discarded.
    _,_,loss=cross_val(df,test=None,lgb_params=lgb_params)

    return loss

In [None]:
# Search for the parameters with the lowest cross-validation log loss,
# stopping after a six-hour time budget.
_tuning_budget_seconds = 3600 * 6
study = optuna.create_study(direction='minimize')
study.optimize(objective, timeout=_tuning_budget_seconds)
study.best_params

# Make predictions

In [None]:
# train model: fit on every season before 2019 and hold out 2019
_train_rows = df.Season < 2019
_holdout_rows = df.Season == 2019

X_train = df.loc[_train_rows, columns]
X_test = df.loc[_holdout_rows, columns]
y_train = df.loc[_train_rows, 'target']
y_test = df.loc[_holdout_rows, 'target']

model = LGBMClassifier(**lgb_params)

# No `verbose` argument: LightGBM 4 removed it from fit().
model.fit(X_train, y_train)

In [None]:
# Keep the second probability column (index 1) as the prediction.
_holdout_proba = model.predict_proba(X_test)
predictions = _holdout_proba[:, 1]

In [None]:
# Log loss of the predictions against y_test (displayed as the cell output).
log_loss(y_test,predictions)

# Test preds

In [None]:
# train model: final fit on every season up to and including 2019
_final_train_rows = df.Season <= 2019
_season_2019_rows = df.Season == 2019

X_train = df.loc[_final_train_rows, columns]
y_train = df.loc[_final_train_rows, 'target']
# NOTE: season 2019 is part of the training data here, so a score computed
# on X_test / y_test with this model would be optimistic.
X_test = df.loc[_season_2019_rows, columns]
y_test = df.loc[_season_2019_rows, 'target']

model = LGBMClassifier(**lgb_params)

# No `verbose` argument: LightGBM 4 removed it from fit().
model.fit(X_train, y_train)

In [None]:
_submission_path = '../input/ncaaw-march-mania-2021/WDataFiles_Stage2/WSampleSubmissionStage2.csv'
test = pd.read_csv(_submission_path)

# The ID is split on '_' into season, first team and second team.
_id_parts = test.ID.str.split('_', expand=True)
test['Season'] = _id_parts[0].astype(int)
test['Team1'] = _id_parts[1].astype(int)
test['Team2'] = _id_parts[2].astype(int)

# Same feature engineering as for the training match-ups.
test = merge_all(test)

# make predictions
predictions = model.predict_proba(test[columns])[:, 1]

# Reload the untouched sample file so only its original columns are written;
# predictions are assigned by position, in the row order of the file.
test = pd.read_csv(_submission_path)
test['Pred'] = predictions

test.to_csv('submission.csv', index=False)