## Import libraries

In [1]:
import warnings
warnings.filterwarnings("ignore")

import gc
import pickle
import optuna
import numpy as np
import pandas as pd
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

## Prepare data for model training

In [2]:
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source (presumably the output of the data-prep notebook).
with open("../input/mh-eda-data-prep/MH_Processed_Data.txt", 'rb') as handle:
    # Deserialize straight from the file handle so no separate raw-bytes copy
    # of the payload is left alive in the kernel after this cell.
    processed_data = pickle.load(handle)

train_df = processed_data['train_df']
test_df = processed_data['test_df']

# Only the two frames are needed from here on; drop the container dict.
del processed_data
gc.collect()

21

In [3]:
# Columns handed to LightGBM as categorical features (by position).
cat_cols = ['season','league','Team 1','Team2','year','quarter','month',
            'week','day_month','day_week','day_weekend','year_eq_season']

# Categorical columns are cast to plain integers in both frames.
train_df[cat_cols] = train_df[cat_cols].astype(int)
test_df[cat_cols] = test_df[cat_cols].astype(int)

# The positions are applied to the feature matrix, which does not contain the
# 'Outcome' target. Look them up with the target removed so the indices stay
# correct wherever 'Outcome' happens to sit in train_df.
feature_columns = train_df.columns.drop('Outcome')
cat_cols_indices = [feature_columns.get_loc(col) for col in cat_cols]
print(cat_cols_indices)

[0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15]


In [4]:
# Count the rows per 'Outcome' value; the unpacking expects exactly two
# classes (index 0 -> negatives, index 1 -> positives).
outcome_counts = np.bincount(train_df['Outcome'])
neg, pos = outcome_counts

# Negative-to-positive ratio, used below as LightGBM's scale_pos_weight.
class_weight = neg / pos
class_weight

0.4679423949496942

In [5]:
# Split the training frame into the target and every remaining column.
Ytrain = train_df['Outcome'].copy()
is_feature = train_df.columns != 'Outcome'
Xtrain = train_df.loc[:, is_feature].copy()
Xtest = test_df.copy()

shape_report = f"Xtrain: {Xtrain.shape} \nYtrain: {Ytrain.shape} \nXtest: {Xtest.shape}"
print(shape_report)

# The copies above are all that is used from here on; release the originals.
del train_df, test_df
gc.collect()

Xtrain: (7441, 152) 
Ytrain: (7441,) 
Xtest: (4008, 152)


0

## Hyperparameter search

In [6]:
# Disabled hyperparameter search: the whole cell is a single triple-quoted
# string, so none of the code inside it executes; the notebook merely echoes
# the text back as the cell output. The quoted code defines an Optuna objective
# that trains LightGBM with 5-fold stratified CV and returns the mean
# validation log loss, then runs a 100-trial study. Remove the surrounding
# quotes to run the search again.
'''
def objective(trial):
    
    params = {}
    params["objective"] = 'binary'
    params['metric'] = 'binary_logloss'
    params['boosting'] = 'gbdt'
    params['scale_pos_weight'] = class_weight
    params["learning_rate"] = trial.suggest_loguniform("learning_rate", 1e-2, 1e-1)
    params["lambda_l2"] = trial.suggest_loguniform("lambda_l2", 1e-3, 1)
    params["num_leaves"] = trial.suggest_int("num_leaves", 40, 1500)
    params["max_depth"] = trial.suggest_int("max_depth", 4, 12)
    params["feature_fraction"] = trial.suggest_uniform("feature_fraction", 0.45, 1)
    params["bagging_fraction"] = trial.suggest_uniform("bagging_fraction", 0.45, 1)
    params["bagging_freq"] = trial.suggest_int("bagging_freq", 1, 30)
    params["bagging_seed"] = 10
    params["min_data_in_leaf"] = trial.suggest_int("min_data_in_leaf", 1, 30)
    params["saved_feature_importance_type"] = 1
    params["random_state"] = 0
    params["verbosity"] = -1
    num_rounds = 5000

    score = 0
    counter = 0
    
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2010)
    
    for idx, (train, val) in enumerate(kfold.split(Xtrain, Ytrain)):
        counter += 1

        train_x, train_y = Xtrain.iloc[train], Ytrain.iloc[train]
        val_x, val_y = Xtrain.iloc[val], Ytrain.iloc[val]
    
        lgtrain = lgb.Dataset(train_x, label=train_y.ravel())
        lgvalidation = lgb.Dataset(val_x, label=val_y.ravel())

        model = lgb.train(params, lgtrain, num_rounds, 
                          valid_sets=[lgtrain, lgvalidation], 
                          categorical_feature=cat_cols_indices,
                          early_stopping_rounds=200, verbose_eval=0)

        y_pred = model.predict(val_x, num_iteration=model.best_iteration)
        score += log_loss(val_y, y_pred)
    
    score /= float(counter)
    return score


study = optuna.create_study()
study.optimize(objective, n_trials=100)
'''

'\ndef objective(trial):\n    \n    params = {}\n    params["objective"] = \'binary\'\n    params[\'metric\'] = \'binary_logloss\'\n    params[\'boosting\'] = \'gbdt\'\n    params[\'scale_pos_weight\'] = class_weight\n    params["learning_rate"] = trial.suggest_loguniform("learning_rate", 1e-2, 1e-1)\n    params["lambda_l2"] = trial.suggest_loguniform("lambda_l2", 1e-3, 1)\n    params["num_leaves"] = trial.suggest_int("num_leaves", 40, 1500)\n    params["max_depth"] = trial.suggest_int("max_depth", 4, 12)\n    params["feature_fraction"] = trial.suggest_uniform("feature_fraction", 0.45, 1)\n    params["bagging_fraction"] = trial.suggest_uniform("bagging_fraction", 0.45, 1)\n    params["bagging_freq"] = trial.suggest_int("bagging_freq", 1, 30)\n    params["bagging_seed"] = 10\n    params["min_data_in_leaf"] = trial.suggest_int("min_data_in_leaf", 1, 30)\n    params["saved_feature_importance_type"] = 1\n    params["random_state"] = 0\n    params["verbosity"] = -1\n    num_rounds = 5000\n\

In [7]:
# Disabled along with the search cell: this is a bare string literal, so the
# code inside never runs. If enabled, it would print the number of finished
# trials, the best trial's value and that trial's parameters from `study`.
'''
print("Number of finished trials: {}".format(len(study.trials)))

trial = study.best_trial
print("Best trial value: {}".format(trial.value))

print("Params: ")
for key, value in trial.params.items():
    print(" {}: {}".format(key, value))
'''

'\nprint("Number of finished trials: {}".format(len(study.trials)))\n\ntrial = study.best_trial\nprint("Best trial value: {}".format(trial.value))\n\nprint("Params: ")\nfor key, value in trial.params.items():\n    print(" {}: {}".format(key, value))\n'

## Build and validate the model

In [8]:
# Train one LightGBM model per (seed, fold) pair and collect:
#   * y_pred_meta_lgb  - out-of-fold predictions on the training rows,
#                        averaged over the seeds
#   * y_pred_final_lgb - test-set predictions averaged over every model
#   * oof_score        - mean validation log loss over every model
FOLD = 5
SEEDS = [2018, 2020]
num_rounds = 5000  # upper bound; early stopping normally ends training sooner

# LightGBM settings shared by every model trained in this cell.
BASE_PARAMS = {
    "objective": 'binary',
    'metric': 'binary_logloss',
    'boosting': 'gbdt',
    'scale_pos_weight': class_weight,
    "bagging_seed": 10,
    "saved_feature_importance_type": 1,
    "random_state": 0,
    "verbosity": -1,
}

# Hyperparameters for any CV seed without its own entry in
# TUNED_PARAMS_BY_SEED (with the current SEEDS list that is seed 2018).
# NOTE(review): values presumably come from the Optuna search - confirm.
DEFAULT_TUNED_PARAMS = {
    "learning_rate": 0.0567,
    "lambda_l2": 0.49,
    "num_leaves": 384,
    "max_depth": 9,
    "feature_fraction": 0.6,
    "bagging_fraction": 0.94,
    "bagging_freq": 10,
    "min_data_in_leaf": 22,
}

# Seed-specific hyperparameters; keeps the seed -> configuration mapping in
# one place instead of an if/else inside the fold loop.
TUNED_PARAMS_BY_SEED = {
    2020: {
        "learning_rate": 0.0555,
        "lambda_l2": 0.508,
        "num_leaves": 278,
        "max_depth": 4,
        "feature_fraction": 0.737,
        "bagging_fraction": 0.886,
        "bagging_freq": 27,
        "min_data_in_leaf": 21,
    },
}

counter = 0
oof_score = 0
y_pred_final_lgb = 0
y_pred_meta_lgb = np.zeros((Ytrain.shape[0], 1))


for seed in SEEDS:
    seed_score = 0
    tuned_params = TUNED_PARAMS_BY_SEED.get(seed, DEFAULT_TUNED_PARAMS)

    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (train, val) in enumerate(kfold.split(Xtrain, Ytrain)):
        counter += 1

        train_x, train_y = Xtrain.iloc[train], Ytrain.iloc[train]
        val_x, val_y = Xtrain.iloc[val], Ytrain.iloc[val]

        # A fresh dict per model, so every training call starts from the
        # same untouched configuration.
        params = {**BASE_PARAMS, **tuned_params}

        lgtrain = lgb.Dataset(train_x, label=train_y.to_numpy())
        lgvalidation = lgb.Dataset(val_x, label=val_y.to_numpy())

        model = lgb.train(params, lgtrain, num_rounds, 
                          valid_sets=[lgtrain, lgvalidation], 
                          categorical_feature=cat_cols_indices,
                          early_stopping_rounds=200, verbose_eval=100)

        # Predict with the best iteration found by early stopping.
        y_pred = model.predict(val_x, num_iteration=model.best_iteration)
        # Each training row is in the validation fold once per seed, so the
        # sums are divided by len(SEEDS) after the loops.
        y_pred_meta_lgb[val] += np.array([y_pred]).T
        y_pred_final_lgb += model.predict(Xtest, num_iteration=model.best_iteration)
        
        score = log_loss(val_y, y_pred)
        oof_score += score
        seed_score += score
        print("\nSeed-{} | Fold-{} | OOF Score: {}\n".format(seed, idx, score))
    
    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


# Turn the accumulated sums into averages.
y_pred_meta_lgb = y_pred_meta_lgb / float(len(SEEDS))
y_pred_final_lgb = y_pred_final_lgb / float(counter)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

Training until validation scores don't improve for 200 rounds
[100]	training's binary_logloss: 0.00495007	valid_1's binary_logloss: 0.015844
[200]	training's binary_logloss: 0.000636065	valid_1's binary_logloss: 0.0155983
[300]	training's binary_logloss: 0.000242511	valid_1's binary_logloss: 0.0162556
Early stopping, best iteration is:
[143]	training's binary_logloss: 0.0016447	valid_1's binary_logloss: 0.0147933

Seed-2018 | Fold-0 | OOF Score: 0.014793312367292545

Training until validation scores don't improve for 200 rounds
[100]	training's binary_logloss: 0.00476271	valid_1's binary_logloss: 0.0197415
[200]	training's binary_logloss: 0.000557468	valid_1's binary_logloss: 0.0222019
[300]	training's binary_logloss: 0.000215488	valid_1's binary_logloss: 0.0244603
Early stopping, best iteration is:
[112]	training's binary_logloss: 0.00336636	valid_1's binary_logloss: 0.0191796

Seed-2018 | Fold-1 | OOF Score: 0.019179555583987266

Training until validation scores don't improve for 200

In [9]:
# Save the out-of-fold predictions, the aggregate CV score and the averaged
# test predictions into one compressed archive, keyed by variable name.
meta_arrays = {
    'y_pred_meta_lgb': y_pred_meta_lgb,
    'oof_score': oof_score,
    'y_pred_final_lgb': y_pred_final_lgb,
}
np.savez_compressed('./LGB_Meta_Features.npz', **meta_arrays)

## Create submission file

In [10]:
# One-column frame holding the averaged test-set predictions.
submit_df = pd.DataFrame({'Outcome': y_pred_final_lgb})

# Written without the index, so the CSV contains only the 'Outcome' column.
submission_path = "LGB_Submission.csv"
submit_df.to_csv(submission_path, index=False)
submit_df.head(10)

Unnamed: 0,Outcome
0,0.999583
1,0.999625
2,0.000941
3,0.998669
4,0.000539
5,0.999586
6,0.999238
7,0.999473
8,0.999125
9,0.999474
