## Import libraries

In [1]:
import gc
import pickle
import optuna
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

## Prepare data for model training

In [2]:
# Load the pre-processed train/test frames produced by the data-prep notebook.
# NOTE(review): pickle can execute arbitrary code while deserializing; only
# point this at a file from a trusted source.
with open("../input/mh-eda-data-prep/MH_Processed_Data.txt", 'rb') as handle:
    # Deserialize straight from the file handle so no separate copy of the raw
    # bytes stays bound in the notebook namespace after this cell.
    processed_data = pickle.load(handle)

train_df = processed_data['train_df']
test_df = processed_data['test_df']

# The container dict is no longer needed once both frames are extracted.
del processed_data
gc.collect()

20

In [3]:
# Separate the 'Outcome' target from the training features; the test frame is
# used as-is for the feature matrix.
Ytrain = train_df['Outcome'].copy()
Xtrain = train_df.drop(columns=['Outcome'])
Xtest = test_df.copy()

print(f"Xtrain: {Xtrain.shape} \nYtrain: {Ytrain.shape} \nXtest: {Xtest.shape}")

# Independent copies were taken above, so the source frames can be released.
del train_df, test_df
gc.collect()

Xtrain: (7441, 133) 
Ytrain: (7441,) 
Xtest: (4008, 133)


0

## Hyperparameter search

In [4]:
# Disabled cell: the whole Optuna search below is wrapped in a string literal,
# so running this cell executes none of it and simply echoes the text back as
# the cell output. Remove the surrounding triple quotes to re-run the search.
#
# What the quoted code does when enabled:
#   * objective(trial) builds a HistGradientBoostingClassifier whose
#     learning_rate, max_leaf_nodes, max_depth, min_samples_leaf and
#     l2_regularization are sampled from the trial, fits it on each of 5
#     stratified folds of (Xtrain, Ytrain), and returns the mean validation
#     log loss across the folds.
#   * A study is created with optuna's defaults and optimized for 100 trials.
# NOTE(review): presumably this search produced the tuned values hard-coded in
# the training cell further down -- confirm before changing either.
'''
def objective(trial):
    
    score = 0
    counter = 0
    
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2018)
    
    for idx, (train, val) in enumerate(kfold.split(Xtrain, Ytrain)):
        counter += 1

        train_x, train_y = Xtrain.iloc[train], Ytrain.iloc[train]
        val_x, val_y = Xtrain.iloc[val], Ytrain.iloc[val]
    
        model = HistGradientBoostingClassifier(
            loss='binary_crossentropy', 
            learning_rate=trial.suggest_loguniform("learning_rate", 5e-3, 1e-1), 
            max_iter=3000, 
            max_leaf_nodes=trial.suggest_int("max_leaf_nodes", 40, 1500), 
            max_depth=trial.suggest_int("max_depth", 4, 14), 
            min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 35), 
            l2_regularization=trial.suggest_loguniform("l2_regularization", 1e-3, 1), 
            early_stopping=True, 
            validation_fraction=0.1, 
            n_iter_no_change=100,
            random_state=0
            )
    
        model.fit(train_x, train_y.ravel())

        y_pred = model.predict_proba(val_x)
        score += log_loss(val_y, y_pred)
    
    score /= float(counter)
    return score


study = optuna.create_study()
study.optimize(objective, n_trials=100)
'''

'\ndef objective(trial):\n    \n    score = 0\n    counter = 0\n    \n    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2018)\n    \n    for idx, (train, val) in enumerate(kfold.split(Xtrain, Ytrain)):\n        counter += 1\n\n        train_x, train_y = Xtrain.iloc[train], Ytrain.iloc[train]\n        val_x, val_y = Xtrain.iloc[val], Ytrain.iloc[val]\n    \n        model = HistGradientBoostingClassifier(\n            loss=\'binary_crossentropy\', \n            learning_rate=trial.suggest_loguniform("learning_rate", 5e-3, 1e-1), \n            max_iter=3000, \n            max_leaf_nodes=trial.suggest_int("max_leaf_nodes", 40, 1500), \n            max_depth=trial.suggest_int("max_depth", 4, 14), \n            min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 35), \n            l2_regularization=trial.suggest_loguniform("l2_regularization", 1e-3, 1), \n            early_stopping=True, \n            validation_fraction=0.1, \n            n_iter_no_change=100,\n      

In [5]:
# Disabled cell: the reporting code below is wrapped in a string literal, so
# nothing here runs; the cell only echoes the text as its output.
# When enabled it prints the number of finished trials, the best trial's
# objective value, and each of that trial's parameters. It reads `study`,
# which is only created by the (also disabled) search cell, so both cells
# must be re-enabled together.
'''
print("Number of finished trials: {}".format(len(study.trials)))

trial = study.best_trial
print("Best trial value: {}".format(trial.value))

print("Params: ")
for key, value in trial.params.items():
    print(" {}: {}".format(key, value))
'''

'\nprint("Number of finished trials: {}".format(len(study.trials)))\n\ntrial = study.best_trial\nprint("Best trial value: {}".format(trial.value))\n\nprint("Params: ")\nfor key, value in trial.params.items():\n    print(" {}: {}".format(key, value))\n'

## Build and validate the model

In [6]:
FOLD = 5
SEEDS = [2018, 2020]

# Estimator settings that are identical for every fit.
SHARED_PARAMS = dict(
    loss='binary_crossentropy',
    max_iter=3000,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=100,
    random_state=0,
)

# Hyperparameters applied when the CV seed is 2020.
PARAMS_SEED_2020 = dict(
    learning_rate=0.0058,
    max_leaf_nodes=981,
    max_depth=11,
    min_samples_leaf=34,
    l2_regularization=0.0015,
)

# Hyperparameters applied for every other CV seed.
PARAMS_OTHER_SEEDS = dict(
    learning_rate=0.0052,
    max_leaf_nodes=499,
    max_depth=10,
    min_samples_leaf=18,
    l2_regularization=0.3082,
)

# Running totals across all seeds and folds:
#   oof_score        - sum of per-fold validation log losses
#   counter          - number of models fitted so far
#   y_pred_meta_gbc  - per-row sum of column 1 of the validation probabilities
#   y_pred_final_gbc - sum of the test-set probability matrices
oof_score = 0
counter = 0
y_pred_meta_gbc = np.zeros((Ytrain.shape[0], 1))
y_pred_final_gbc = 0


for seed in SEEDS:
    seed_score = 0
    tuned_params = PARAMS_SEED_2020 if seed == 2020 else PARAMS_OTHER_SEEDS
    splitter = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (train, val) in enumerate(splitter.split(Xtrain, Ytrain)):
        counter += 1

        model = HistGradientBoostingClassifier(**SHARED_PARAMS, **tuned_params)
        model.fit(Xtrain.iloc[train], Ytrain.iloc[train])

        # Out-of-fold probabilities feed both the meta-feature column and the
        # fold score; test probabilities are accumulated for later averaging.
        val_proba = model.predict_proba(Xtrain.iloc[val])
        y_pred_meta_gbc[val] += val_proba[:, 1].reshape(-1, 1)
        y_pred_final_gbc += model.predict_proba(Xtest)

        score = log_loss(Ytrain.iloc[val], val_proba)
        oof_score += score
        seed_score += score
        print("Seed-{} | Fold-{} | OOF Score: {}".format(seed, idx, score))

    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


# Each training row is validated once per seed, so the meta feature is averaged
# over seeds; test predictions and the score are averaged over all fitted models.
y_pred_meta_gbc = y_pred_meta_gbc / len(SEEDS)
y_pred_final_gbc = y_pred_final_gbc / counter
oof_score /= counter
print("Aggregate OOF Score: {}".format(oof_score))

Seed-2018 | Fold-0 | OOF Score: 0.013636819643628935
Seed-2018 | Fold-1 | OOF Score: 0.021573215563891496
Seed-2018 | Fold-2 | OOF Score: 0.008587470639905934
Seed-2018 | Fold-3 | OOF Score: 0.0072755895502901805
Seed-2018 | Fold-4 | OOF Score: 0.01219672520453355

Seed: 2018 | Aggregate OOF Score: 0.01265396412045002


Seed-2020 | Fold-0 | OOF Score: 0.010256779810374556
Seed-2020 | Fold-1 | OOF Score: 0.020008154170835093
Seed-2020 | Fold-2 | OOF Score: 0.007133559419802397
Seed-2020 | Fold-3 | OOF Score: 0.010181797031774017
Seed-2020 | Fold-4 | OOF Score: 0.019953284324890512

Seed: 2020 | Aggregate OOF Score: 0.013506714951535315


Aggregate OOF Score: 0.013080339535992669


In [7]:
# Persist the out-of-fold predictions, the aggregate score and the averaged
# test predictions in one compressed archive.
meta_outputs = {
    'y_pred_meta_gbc': y_pred_meta_gbc,
    'oof_score': oof_score,
    'y_pred_final_gbc': y_pred_final_gbc,
}
np.savez_compressed('./GBC_Meta_Features.npz', **meta_outputs)

## Create submission file

In [8]:
# Column 1 of the averaged test probabilities becomes the submitted 'Outcome'.
submit_df = pd.DataFrame({'Outcome': y_pred_final_gbc[:, 1]})
submit_df.to_csv("GBC_Submission.csv", index=False)
submit_df.head(10)

Unnamed: 0,Outcome
0,0.999378
1,0.999381
2,0.001612
3,0.998861
4,0.001172
5,0.999354
6,0.999104
7,0.998934
8,0.999055
9,0.99894
