In [13]:
"""
[V3]
Blend Models:
* tabnet 10 folds 3 seeds using non scored
* 2 stage NN
* fork-of-2heads-looper-super-puper-markpeng-infer
* deepinsight-efficientnet-v7-b3-infer
* deepinsight-restnest-v2-infer

"""

# True when running inside a Kaggle kernel; selects the dataset and script
# locations used by the path configuration cell further down.
kernel_mode = False

import os
import numpy as np
import pandas as pd
import time
import random
import math
import datetime
import glob

from numba import njit
from scipy.optimize import minimize, fsolve

import optuna

import warnings
warnings.filterwarnings('ignore')

import gc
gc.enable()

# Seed handed to the Optuna TPE sampler in the weight-search branch.
rand_seed = 1120

# False: run every model's inference script and accumulate the weighted blend.
# True:  search blend weights on the out-of-fold predictions with Optuna.
optuna_mode = False
# Used both as the Optuna study name and as the stem of its SQLite file.
study_name = "moa_blend_team_v3"
# n_trials = 500
# Number of trials passed to study.optimize.
n_trials = 3000

In [2]:
# !mkdir -p /root/.cache/torch/hub/checkpoints/
# !cp ../input/gen-efficientnet-pretrained/tf_efficientnet_*.pth /root/.cache/torch/hub/checkpoints/
# !cp ../input/deepinsight-resnest-v1-resnest50/*.pth /root/.cache/torch/hub/checkpoints/
# !cp ../input/deepinsight-resnest-v2-resnest50-output/*.pth /root/.cache/torch/hub/checkpoints/
# !ls -la /root/.cache/torch/hub/checkpoints/

In [3]:
# !cp ../input/kaggle-moa-team/scripts/* .
# !ls -la

In [4]:
# Root folder for the competition CSVs and (outside kernel mode) for the
# OOF .npy files and inference scripts referenced below.
dataset_folder = "../input/lish-moa" if kernel_mode else "/workspace/Kaggle/MoA/"

# Add your model inference script here
# Tuple Format: (script, oof_filename, output_filename, weight)
#   script          - path (relative to model_path) of the inference script
#   oof_filename    - path (relative to dataset_folder) of the OOF predictions
#   output_filename - CSV read back after the inference script has run
#   weight          - blend weight applied to that CSV in non-Optuna mode
# NOTE(review): every weight below is 0, so the non-Optuna blend stays all
# zeros until the optimised weights are filled in here.
model_list = [
    ("moa-inference-data/script_tabnet_ns_oldcv.py",
     "moa-inference-data/oof_tabnet_ns_oldcv.npy",
     "submission_tabnet_ns_oldcv.csv", 0),
    
    ("moa-inference-data/2stageNN_with_ns_oldcv.py",
     "moa-inference-data/oof_2stageNN_ns_oldcv.npy",
     "submission_2stageNN_ns_oldcv.csv", 0),
    
    ("improving-mark-s-2-heads-model-infer.py",
     "completed/improving-mark-s-2-heads-model/oof_0.015660083675738144.npy",
     "submission_improving-mark-s-2-heads-model.csv", 0),
#     ("fork-of-2heads-looper-super-puper-markpeng-infer.py",
#      "completed/2heads-looper-super-puper-markpeng/oof_0.015886529391963274.npy",
#      "submission_2heads-looper-super-puper.csv", 0),
    
    #     ("moa-inference-data/script_nn_ns_newcv.py",
    #      "moa-inference-data/oof_nn_ns_newcv.npy",
    #      "submission_nn_ns_newcv.csv", 0),
    
#     ("deepinsight_efficientnet_lightning_v7_b3_infer.py",
#      "oof_deepinsight_efficientnet_v7_b3_0.014802440208660929.npy",
#      "submission_effnet_v7_b3.csv", 0),
    
#     ("deepinsight_resnest_lightning_v2_infer.py",
#      "oof_deepinsight_ResNeSt_v2_resnest50_0.01455961217985703.npy",
#      "submission_resnest_v2.csv", 0),
]

# Folder the inference scripts are launched from: the working directory in
# kernel mode, otherwise the dataset folder.
model_path = "." if kernel_mode else dataset_folder

In [5]:
# Competition tables: per-sample features and the scored target matrix.
train_features = pd.read_csv(
    f"{dataset_folder}/train_features.csv", engine='c')
train_labels = pd.read_csv(
    f'{dataset_folder}/train_targets_scored.csv', engine='c')

# Every label column except the sample identifier.
train_classes = [
    column for column in train_labels.columns if column != "sig_id"
]

# Boolean mask over the training rows whose cp_type is "trt_cp"; the labels
# are restricted to those rows and re-indexed from zero.
non_control_group_rows = train_features["cp_type"] == "trt_cp"
non_control_group_train_labels = (
    train_labels[non_control_group_rows]
    .copy()
    .reset_index(drop=True)
)

# Submission template with every column after the first reset to zero, ready
# to accumulate weighted model predictions.
submission = pd.read_csv(f'{dataset_folder}/sample_submission.csv')
submission.iloc[:, 1:] = 0

In [6]:
def mean_logloss(y_pred, y_true):
    """Return the mean binary cross-entropy between predictions and labels.

    A small epsilon (1e-15) is added inside each logarithm rather than
    clipping the predictions. The mean is taken with ``np.nanmean``, so
    elements whose loss is NaN are left out of the average.

    Args:
        y_pred: Predicted probabilities (array-like, same shape as ``y_true``).
        y_true: Ground-truth labels.

    Returns:
        The NaN-ignoring mean of the element-wise negative log-likelihood.
    """
    eps = 1e-15
    negative_part = (1 - y_true) * np.log(1 - y_pred + eps)
    positive_part = y_true * np.log(y_pred + eps)
    return np.nanmean(-(negative_part + positive_part))

## Bayesian Optimization

In [None]:
# Wall-clock start for the whole cell; read by the timing cell that follows.
total_start = time.time()
if not optuna_mode:
    # Inference mode: run each model's script, read the CSV it is expected to
    # produce, and add it into `submission` scaled by its configured weight.
    for i, (script, oof_filename, output_filename, weight) in enumerate(model_list):
        print(f"Generating submission file from {script} ......")
        infer_start = time.time()
        # IPython shell escape: launches the script in a subprocess.
        # Presumably the script writes `output_filename` into the current
        # working directory, since that is where it is read from next.
        !python {model_path}/{script}
        infer_elapsed = time.time() - infer_start
        print(f"Time spent on inference: {infer_elapsed/60:.2f} minutes.")

        model_submit = pd.read_csv(output_filename, engine='c')
        print(model_submit.head(5))
        print(model_submit.shape)
        # Column 0 is skipped; only the remaining columns are accumulated.
        submission.iloc[:, 1:] += weight * model_submit.iloc[:, 1:]
else:
    ## Search Best Blend Weights by Optuna ##
    model_oofs = []

    for i, (script, oof_filename, output_filename, weight) in enumerate(model_list):
        print(f"Loading OOF from {oof_filename} ......")
        oof = np.load(f"{dataset_folder}/{oof_filename}")
        
        # Boolean-mask indexing needs matching lengths, so 23814 is expected
        # to be the row count of train_features - TODO confirm.
        # OOF files of that length are cut down to the treated rows only.
        if oof.shape[0] == 23814:
            oof = oof[non_control_group_rows, :]

        oof_loss = mean_logloss(oof, non_control_group_train_labels[train_classes].values)
        print(f"OOF Validation Loss of {script}: {oof_loss:.6f}\n")
        model_oofs.append(oof)

    def objective(trial):
        """Optuna objective: mean log loss of a weighted sum of the OOFs.

        One weight per model is sampled independently from [0, 1]; the weights
        are not normalised, so the blended values are clipped to [0, 1] before
        scoring.
        """
        weights = []
        for i in range(len(model_list)):
            weights.append(trial.suggest_float(f"w{i}", 0, 1.0))

        blend = np.zeros(model_oofs[0].shape)
        for i in range(len(model_list)):
            blend += weights[i] * model_oofs[i]
        blend = np.clip(blend, 0, 1.0)

        loss = mean_logloss(blend, non_control_group_train_labels[train_classes].values)
        return loss

    # NOTE(review): `objective` never reports intermediate values, so this
    # pruner appears to have nothing to act on - confirm it is needed.
    pruner = optuna.pruners.MedianPruner(
        n_startup_trials=5,
        n_warmup_steps=0,
        interval_steps=1,
    )
    sampler = optuna.samplers.TPESampler(seed=rand_seed)
    # The study is stored in a local SQLite file named after the study and is
    # resumed if it already exists (load_if_exists=True).
    study = optuna.create_study(direction="minimize",
                                pruner=pruner,
                                sampler=sampler,
                                study_name=study_name,
                                storage=f'sqlite:///{study_name}.db',
                                load_if_exists=True)

    # NOTE(review): n_jobs=-1 runs trials in parallel; trial order may then
    # differ between runs even though the sampler is seeded - confirm.
    study.optimize(objective,
                   n_trials=n_trials,
                   timeout=None,
                   gc_after_trial=True,
                   n_jobs=-1)

    trial = study.best_trial

#     for i, (script, oof_filename, output_filename, _) in enumerate(model_list):
#         optimal_weight = trial.params[f"w{i}"]
#         print(f"Generating submission file from {script} ...... (Weight: {optimal_weight})")
#         infer_start = time.time()
#         !python {model_path}/{script}
#         infer_elapsed = time.time() - infer_start
#         print(f"Time spent on inference: {infer_elapsed/60:.2f} minutes.")

#         model_submit = pd.read_csv(output_filename, engine='c')
#         print(model_submit.head(5))
#         print(model_submit.shape)
#         submission.iloc[:, 1:] += optimal_weight * model_submit.iloc[:, 1:]

    # Report the best trial found so far, including trials loaded from storage.
    print("Number of finished trials: {}".format(len(study.trials)))
    print("Best trial:")
    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

In [None]:
# Elapsed wall-clock time since `total_start`, which is set at the top of the
# inference / Optuna cell; that cell must have run in this kernel first.
total_elapsed = time.time() - total_start
print(f"Total time spent: {total_elapsed/60:.2f} minutes.")

In [None]:
# Loading OOF from moa-inference-data/oof_tabnet_ns_oldcv.npy ......
# OOF Validation Loss of moa-inference-data/script_tabnet_ns_oldcv.py: 0.016234

# Loading OOF from moa-inference-data/oof_2stageNN_ns_oldcv.npy ......
# OOF Validation Loss of moa-inference-data/2stageNN_with_ns_oldcv.py: 0.015599

# Loading OOF from completed/improving-mark-s-2-heads-model/oof_0.015660083675738144.npy ......
# OOF Validation Loss of improving-mark-s-2-heads-model-infer.py: 0.015660

# Number of finished trials: 3000
# Best trial:
#   Value: 0.015443316097753039
#   Params: 
#     w0: 0.0003278730994472901
#     w1: 0.5138464744637067
#     w2: 0.48704428642636544

In [None]:
# [V3]
# improving-mark-s-2-heads-model-infer
# Number of finished trials: 3000
# Best trial:
#   Value: 0.01515466145873492
#   Params: 
#     w0: 0.0002980690037490555
#     w1: 0.29771381784976886
#     w2: 0.1569191862042946
#     w3: 0.18156875605872544
#     w4: 0.36371774630338105

In [None]:
# [V3]
# fork-of-2heads-looper-super-puper-markpeng-infer
# Number of finished trials: 3000
# Best trial:
#   Value: 0.015170138066049686
#   Params: 
#     w0: 0.00019903389488299251
#     w1: 0.3853752127955825
#     w2: 0.015968332256452233
#     w3: 0.22945916769823432
#     w4: 0.3711290150522236

In [None]:
# [V2]
# Number of finished trials: 3000
# Best trial:
#   Value: 0.015169987138880783
#   Params: 
#     w0: 0.00012453900174446623
#     w1: 0.38385943982932813
#     w2: 0.23167342990923373
#     w3: 0.384803293338815

## Optimise Blending Weights with Bonus
https://www.kaggle.com/gogo827jz/optimise-blending-weights-with-bonus-0/notebook

In [7]:
# Ground-truth labels of the treated samples, shape (n_samples, n_targets).
y_true = non_control_group_train_labels[train_classes].values

# One slab per model, stacked as (n_models, n_samples, n_targets). The sample
# and target dimensions are taken from y_true instead of being hard-coded, so
# the array always matches the label matrix it is scored against.
all_oof = np.zeros((len(model_list),) + y_true.shape)
print(all_oof.shape)

model_oofs = []
for i, (script, oof_filename, output_filename,
        weight) in enumerate(model_list):
    print(f"Loading OOF from {oof_filename} ......")
    oof = np.load(f"{dataset_folder}/{oof_filename}")

    # An OOF file covering every training row (as many rows as the boolean
    # mask) is cut down to the treated rows; files that already contain only
    # treated rows pass through unchanged.
    if oof.shape[0] == len(non_control_group_rows):
        oof = oof[non_control_group_rows, :]

    # Fail early with a readable message rather than letting a mismatched
    # array broadcast silently or raise an opaque error during assignment.
    if oof.shape != y_true.shape:
        raise ValueError(
            f"OOF array from {oof_filename} has shape {oof.shape}, "
            f"expected {y_true.shape}")

    all_oof[i, :, :] = oof

    oof_loss = mean_logloss(oof, y_true)
    print(f"OOF Validation Loss of {script}: {oof_loss:.6f}\n")
    model_oofs.append(oof)

(3, 21948, 206)
Loading OOF from moa-inference-data/oof_tabnet_ns_oldcv.npy ......
OOF Validation Loss of moa-inference-data/script_tabnet_ns_oldcv.py: 0.016234

Loading OOF from moa-inference-data/oof_2stageNN_ns_oldcv.npy ......
OOF Validation Loss of moa-inference-data/2stageNN_with_ns_oldcv.py: 0.015599

Loading OOF from completed/improving-mark-s-2-heads-model/oof_0.015660083675738144.npy ......
OOF Validation Loss of improving-mark-s-2-heads-model-infer.py: 0.015660



In [10]:
# Reference: https://www.kaggle.com/gogo827jz/optimise-blending-weights-with-bonus-0/notebook
# CPMP's logloss from https://www.kaggle.com/c/lish-moa/discussion/183010
def log_loss_numpy(y_pred, y_true):
    """Return the mean binary log loss over all flattened elements.

    Predictions are clipped to [1e-15, 1 - 1e-15] before the logarithm. An
    element counts as positive only where the label equals 1; every other
    label value is scored as a negative.

    Args:
        y_pred: Predicted probabilities (array-like, any shape).
        y_true: Labels with the same number of elements as ``y_pred``.

    Returns:
        The mean negative log-likelihood as a float.
    """
    eps = 1e-15
    labels = np.asarray(y_true).ravel()
    probs = np.clip(np.asarray(y_pred).ravel(), eps, 1 - eps)
    # Probability the model assigned to the outcome that actually occurred.
    prob_of_outcome = np.where(labels == 1, probs, 1 - probs)
    return (-np.log(prob_of_outcome)).mean()


def func_numpy_metric(weights):
    """Objective for the weight optimiser: log loss of the weighted OOF blend.

    Args:
        weights: One blend weight per model, in the order of ``all_oof``'s
            first axis.

    Returns:
        ``log_loss_numpy`` of the blended predictions against ``y_true``.
    """
    # Contract the weight vector with the model axis (axis 0) of all_oof,
    # giving the weighted sum of the per-model prediction matrices.
    blended = np.tensordot(weights, all_oof, axes=(0, 0))
    return log_loss_numpy(blended, y_true)

In [14]:
# Tolerance handed to the optimiser.
tol = 1e-10

# Start from a uniform blend: every model gets weight 1 / n_models.
init_guess = [1 / all_oof.shape[0] for _ in range(all_oof.shape[0])]

# Each weight is confined to [0, 1].
bnds = [(0, 1)] * all_oof.shape[0]

# Equality constraint: the weights sum to 1. Its Jacobian is a vector of ones.
cons = dict(
    type='eq',
    fun=lambda x: np.sum(x) - 1,
    jac=lambda x: [1] * len(x),
)

print('Inital Blend OOF:', func_numpy_metric(init_guess))

Inital Blend OOF: 0.015555036537025642


In [15]:
@njit
def grad_func_jit(weights):
    """Analytic gradient of the blended mean log loss w.r.t. each weight.

    For model ``i`` let ``b`` be its clipped OOF predictions, ``c`` the
    weighted sum of all *other* models' clipped predictions, and
    ``p = weights[i] * b + c`` the blend. The expression below is algebraically
    ``mean(b * (p - y_true) / (p * (1 - p)))``, the derivative of the mean
    binary cross-entropy with respect to ``weights[i]``.

    Reads the module-level ``all_oof`` (n_models, n_samples, n_targets) and
    ``y_true`` arrays.

    NOTE(review): each model's predictions are clipped here, while
    ``log_loss_numpy`` clips the blended prediction; the two agree only when
    no clipping is triggered.

    Args:
        weights: One blend weight per model.

    Returns:
        1-D array with one partial derivative per model.
    """
    n_models = all_oof.shape[0]
    oof_clip = np.minimum(1 - 1e-15, np.maximum(all_oof, 1e-15))
    gradients = np.zeros(n_models)
    for i in range(n_models):
        a, b, c = y_true, oof_clip[i], np.zeros(
                (all_oof.shape[1], all_oof.shape[2]))
        # Iterate over the models (axis 0 of all_oof). The original looped over
        # `oof.shape[0]`, the row count of the last loaded 2-D OOF array, which
        # indexes `weights` and `oof_clip` far out of bounds.
        for j in range(n_models):
            if j != i:
                c += weights[j] * oof_clip[j]
        gradients[i] = -np.mean(
            (-a * b + (b**2) * weights[i] + b * c) /
            ((b**2) *
             (weights[i]**2) + 2 * b * c * weights[i] - b * weights[i] +
             (c**2) - c))
    return gradients


# Constrained search for the blend weights: minimise the OOF log loss with
# SLSQP, starting from `init_guess`, subject to the per-weight bounds `bnds`
# and the equality constraint `cons`.
start_time = time.time()
res_scipy = minimize(
    fun=func_numpy_metric,
    x0=init_guess,
    method='SLSQP',
    # Analytic gradient left disabled, so no `jac` is passed to minimize.
    # jac=grad_func_jit,  # grad_func 
    bounds=bnds,
    constraints=cons,
    tol=tol)
# The slice [2:7] keeps the "MM:SS" part of the timedelta's string form.
print(
    f'[{str(datetime.timedelta(seconds = time.time() - start_time))[2:7]}] Optimised Blend OOF:',
    res_scipy.fun)
print('Optimised Weights:', res_scipy.x)

[00:10] Optimised Blend OOF: 0.015443398897897863
Optimised Weights: [0.         0.51297486 0.48702514]


In [None]:
# print(submission.shape)
# submission

In [None]:
# submission.to_csv('submission.csv', index=False)

## EOF

In [None]:
# !rm ./*.py
# !ls -la