In [1]:
"""
[V4]
Blend Models:
* kibuna-nn-hs-1024-last-train (aka. 2stage-NN, LB: 0.01822)
* simple-nn-using-old-cv (LB: 0.01836)
* fork-of-2heads-looper-super-puper-markpeng-infer (LB: 0.01836)
* deepinsight-efficientnet-v7-b3-infer (LB: 0.01850)

"""

# Switches dataset and script paths between the `../input/...` layout (True)
# and the local `/workspace/...` layout (False). When True, the final cell
# also deletes ./*.py from the working directory.
kernel_mode = False

import os
import numpy as np
import pandas as pd
import time
import random
import math
import datetime
import glob

from numba import njit
from scipy.optimize import minimize, fsolve

import optuna

import warnings
warnings.filterwarnings('ignore')

import gc
gc.enable()

# Seed handed to the Optuna TPE sampler.
rand_seed = 1120

# search_mode=False: run the inference scripts and blend with the fixed
# weights stored in model_list.
# search_mode=True: search blend weights on the OOF predictions instead.
search_mode = True
# Only read in search mode: when True, the inference scripts are run afterwards
# and blended with the weights that the search found.
run_submit_script = False

# Weight-search backend; the search cell handles "scipy" (SLSQP) and "optuna".
method = "scipy"

# method = "optuna"
# Optuna study name; also used to build the SQLite storage file name.
study_name = "moa_blend_team_v4"
# n_trials = 500
# Number of trials passed to study.optimize (Optuna backend only).
n_trials = 3000

In [2]:
# !mkdir -p /root/.cache/torch/hub/checkpoints/
# !cp ../input/gen-efficientnet-pretrained/tf_efficientnet_*.pth /root/.cache/torch/hub/checkpoints/
# !cp ../input/deepinsight-resnest-v1-resnest50/*.pth /root/.cache/torch/hub/checkpoints/
# !cp ../input/deepinsight-resnest-v2-resnest50-output/*.pth /root/.cache/torch/hub/checkpoints/
# !ls -la /root/.cache/torch/hub/checkpoints/

In [3]:
# !cp ../input/kaggle-moa-team/scripts/* .
# !ls -la

In [4]:
dataset_folder = "../input/lish-moa" if kernel_mode else "/workspace/Kaggle/MoA/"

# In kernel mode the scripts are addressed by bare file name; otherwise they
# are addressed inside the team repository. OOF files always use the
# repository-relative path.
_script_prefix = "" if kernel_mode else "../../Github/kaggle_moa_team/scripts/"
_oof_prefix = "../../Github/kaggle_moa_team/oof/"

# Add your model inference script here
# Tuple Format: (script, oof_filename, output_filename, weight)
model_list = [
    (_script_prefix + script_name, _oof_prefix + oof_name, submission_name,
     blend_weight)
    for script_name, oof_name, submission_name, blend_weight in (
        ("2stageNN_with_ns_oldcv.py",
         "oof_2stageNN_ns_oldcv.npy",
         "submission_2stageNN_with_ns_oldcv_0.01822.csv",
         0.4),
        ("script_simpleNN_oldcv.py",
         "oof_script_simpleNN_oldcv.npy",
         "submission_script_simpleNN_oldcv_0.01836.csv",
         0.2),
        ("fork-of-2heads-looper-super-puper-markpeng-infer.py",
         "oof_fork-of-2heads-looper-super-puper-markpeng.npy",
         "submission_2heads-looper-super-puper_0.01836.csv",
         0.2),
        ("deepinsight_efficientnet_lightning_v7_b3_infer.py",
         "oof_deepinsight_efficientnet_lightning_v7_b3_0.01850.npy",
         "submission_effnet_v7_b3_0.01850.csv",
         0.2),
        # Disabled entry, kept for reference (originally listed without any
        # path prefix):
        #     ("deepinsight_resnest_lightning_v2_infer.py",
        #      "oof_deepinsight_ResNeSt_v2_resnest50_0.01455961217985703.npy",
        #      "submission_resnest_v2.csv", 0),
    )
]

# Directory prefix used when launching the inference scripts.
if kernel_mode:
    model_path = "."
else:
    model_path = dataset_folder

In [5]:
# Training features and scored targets from the dataset folder.
train_features = pd.read_csv(f"{dataset_folder}/train_features.csv", engine='c')
train_labels = pd.read_csv(f'{dataset_folder}/train_targets_scored.csv', engine='c')

# Every label column except the row identifier is a target class.
train_classes = train_labels.columns[train_labels.columns != "sig_id"].tolist()

# Boolean mask over training rows: True where cp_type is "trt_cp".
non_control_group_rows = train_features["cp_type"].eq("trt_cp")
non_control_group_train_labels = (
    train_labels[non_control_group_rows].copy().reset_index(drop=True))

# Blend accumulator: sample submission with every column after the first set to zero.
submission = pd.read_csv(f'{dataset_folder}/sample_submission.csv')
submission.iloc[:, 1:] = 0

In [6]:
def mean_logloss(y_pred, y_true):
    """Mean binary cross-entropy over all elements, ignoring NaN entries.

    A constant 1e-15 is added inside each logarithm (no clipping is applied),
    and the element-wise losses are averaged with ``np.nanmean``.

    Args:
        y_pred: array of predicted probabilities.
        y_true: array of targets, broadcastable against ``y_pred``.

    Returns:
        The scalar mean of the negated log-likelihood terms.
    """
    eps = 1e-15
    negative_term = (1 - y_true) * np.log(1 - y_pred + eps)
    positive_term = y_true * np.log(y_pred + eps)
    return np.nanmean(-(negative_term + positive_term))

## Bayesian Optimization and Sequential Least Squares Programming (SLSQP)

In [7]:
def run_inference_scripts(submission, weights=None):
    for i, (script, oof_filename, output_filename, weight) in enumerate(model_list):
        print(f"Generating submission file from {script} ......")
        infer_start = time.time()
        !python {model_path}/{script}
        infer_elapsed = time.time() - infer_start
        print(f"Time spent on inference: {infer_elapsed/60:.2f} minutes.")

        model_submit = pd.read_csv(output_filename, engine='c')
        print(model_submit.head(5))
        print(model_submit.shape)
        if weights is None:
            print(f"Blending {script} with weight: {weight} ......")
            submission.iloc[:, 1:] += weight * model_submit.iloc[:, 1:]
        else:
            print(f"Blending {script} with weight: {weights[i]} ......")
            submission.iloc[:, 1:] += weights[i] * model_submit.iloc[:, 1:]

    return submission

In [None]:
# Wall-clock start for the whole blend/search step (read by a later cell).
total_start = time.time()

if not search_mode:
    # Fixed-weight blend using the weights stored in model_list.
    submission = run_inference_scripts(submission)
elif search_mode and method == "optuna":
    ## Search Best Blend Weights by Optuna ##
    model_oofs = []

    for i, (script, oof_filename, output_filename,
            weight) in enumerate(model_list):
        print(f"Loading OOF from {oof_filename} ......")
        oof = np.load(f"{dataset_folder}/{oof_filename}")

        # An OOF array with one row per training row still contains the
        # control rows; drop them so it lines up with the non-control labels.
        # The row count is taken from the mask itself instead of a hard-coded
        # 23814, so the check always matches the loaded training set.
        if oof.shape[0] == len(non_control_group_rows):
            oof = oof[non_control_group_rows.values, :]

        oof_loss = mean_logloss(
            oof, non_control_group_train_labels[train_classes].values)
        print(f"OOF Validation Loss of {script}: {oof_loss:.6f}\n")
        model_oofs.append(oof)

    def objective(trial):
        """Optuna objective: log loss of a weighted sum of the OOF predictions.

        One weight per model is sampled from [0, 1] under the names ``w0``,
        ``w1``, ... The weights are not normalised; the blended prediction is
        clipped to [0, 1] before scoring against the non-control labels.
        """
        weights = [
            trial.suggest_float(f"w{k}", 0, 1.0)
            for k in range(len(model_list))
        ]

        blend = np.zeros(model_oofs[0].shape)
        for model_weight, model_oof in zip(weights, model_oofs):
            blend += model_weight * model_oof
        blend = np.clip(blend, 0, 1.0)

        targets = non_control_group_train_labels[train_classes].values
        return mean_logloss(blend, targets)

    # Median pruner and seeded TPE sampler. The study is stored in an SQLite
    # file named after study_name and created with load_if_exists=True.
    pruner = optuna.pruners.MedianPruner(n_startup_trials=5,
                                         n_warmup_steps=0,
                                         interval_steps=1)
    sampler = optuna.samplers.TPESampler(seed=rand_seed)
    study = optuna.create_study(
        study_name=study_name,
        storage=f'sqlite:///{study_name}.db',
        load_if_exists=True,
        direction="minimize",
        sampler=sampler,
        pruner=pruner,
    )

    study.optimize(
        objective,
        n_trials=n_trials,
        timeout=None,
        gc_after_trial=True,
        n_jobs=-1,
    )

    trial = study.best_trial

    if run_submit_script:
        # Best-trial weights in model_list order (parameters are named w0, w1, ...).
        optimal_weights = [
            trial.params[f"w{i}"] for i in range(len(model_list))
        ]
        submission = run_inference_scripts(submission, weights=optimal_weights)

    print(f"Number of finished trials: {len(study.trials)}")
    print("Best trial:")
    print(f"  Value: {trial.value}")

    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

elif search_mode and method == "scipy":
    # Optimise Blending Weights with Bonus
    # https://www.kaggle.com/gogo827jz/optimise-blending-weights-with-bonus-0/notebook
    model_oofs = []
    y_true = non_control_group_train_labels[train_classes].values

    all_oof = np.zeros(
        (len(model_list), non_control_group_train_labels.shape[0], 206))
    print(all_oof.shape)
    for i, (script, oof_filename, output_filename,
            weight) in enumerate(model_list):
        print(f"Loading OOF from {oof_filename} ......")
        oof = np.load(f"{dataset_folder}/{oof_filename}")

        if oof.shape[0] == 23814:
            oof = oof[non_control_group_rows, :]

        all_oof[i, :, :] = oof

        oof_loss = mean_logloss(oof, y_true)
        print(f"OOF Validation Loss of {script}: {oof_loss:.6f}\n")
        model_oofs.append(oof)

    # Reference: https://www.kaggle.com/gogo827jz/optimise-blending-weights-with-bonus-0/notebook
    # CPMP's logloss from https://www.kaggle.com/c/lish-moa/discussion/183010
    def log_loss_numpy(y_pred, y_true):
        """Mean binary log loss over the flattened inputs.

        Predictions are clipped to [1e-15, 1 - 1e-15]. Elements whose label
        equals 1 contribute -log(p); all other elements contribute -log(1 - p).
        """
        labels = np.asarray(y_true).ravel()
        probs = np.clip(np.asarray(y_pred).ravel(), 1e-15, 1 - 1e-15)
        # Pick the probability assigned to the observed outcome, then take -log.
        per_element = -np.log(np.where(labels == 1, probs, 1 - probs))
        return per_element.mean()

    def func_numpy_metric(weights):
        """Log loss of the weighted sum of the stacked OOF predictions.

        Contracts ``weights`` with the first (model) axis of ``all_oof`` and
        scores the result against ``y_true``; both come from the enclosing scope.
        """
        blended = np.tensordot(weights, all_oof, axes=([0], [0]))
        return log_loss_numpy(blended, y_true)

    @njit
    def grad_func_jit(weights):
        """Analytic gradient of the blended log loss w.r.t. each model weight.

        Reads ``all_oof`` (stacked OOF predictions, model axis first) and
        ``y_true`` from the enclosing scope. For model ``i`` the blended
        prediction is ``weights[i] * b + c``, where ``b`` is that model's
        clipped prediction and ``c`` is the weighted sum of all other models.

        Args:
            weights: one weight per model, indexed like the first axis of
                ``all_oof``.

        Returns:
            1-D array holding one partial derivative per model.
        """
        n_models = all_oof.shape[0]
        oof_clip = np.minimum(1 - 1e-15, np.maximum(all_oof, 1e-15))
        gradients = np.zeros(n_models)
        for i in range(n_models):
            a = y_true
            b = oof_clip[i]
            c = np.zeros((all_oof.shape[1], all_oof.shape[2]))
            # Sum the contribution of every *other model*. The loop must run
            # over the model axis; iterating over oof.shape[0] (the row count
            # of the last loaded OOF array) would index weights and oof_clip
            # far out of range.
            for j in range(n_models):
                if j != i:
                    c += weights[j] * oof_clip[j]
            gradients[i] = -np.mean(
                (-a * b + (b**2) * weights[i] + b * c) /
                ((b**2) *
                 (weights[i]**2) + 2 * b * c * weights[i] - b * weights[i] +
                 (c**2) - c))
        return gradients

    # SLSQP setup: start from equal weights, keep each weight in [0, 1] and
    # constrain the weights to sum to 1.
    tol = 1e-10
    init_guess = [1 / len(all_oof)] * len(all_oof)
    bnds = [(0, 1)] * len(all_oof)
    cons = dict(
        type='eq',
        fun=lambda x: np.sum(x) - 1,
        jac=lambda x: [1] * len(x),
    )

    print('Inital Blend OOF:', func_numpy_metric(init_guess))

    start_time = time.time()
    # No objective jacobian is supplied (jac=grad_func_jit is left disabled).
    res_scipy = minimize(func_numpy_metric,
                         init_guess,
                         method='SLSQP',
                         bounds=bnds,
                         constraints=cons,
                         tol=tol)
    # Characters 2..6 of the timedelta string give the MM:SS part.
    elapsed_label = str(
        datetime.timedelta(seconds=time.time() - start_time))[2:7]
    print(f'[{elapsed_label}] Optimised Blend OOF:', res_scipy.fun)
    print(f'Optimised Weights: {res_scipy.x}\n')

    if run_submit_script:
        submission = run_inference_scripts(submission, weights=res_scipy.x)

(4, 21948, 206)
Loading OOF from ../../Github/kaggle_moa_team/oof/oof_2stageNN_ns_oldcv.npy ......
OOF Validation Loss of ../../Github/kaggle_moa_team/scripts/2stageNN_with_ns_oldcv.py: 0.015606

Loading OOF from ../../Github/kaggle_moa_team/oof/oof_script_simpleNN_oldcv.npy ......
OOF Validation Loss of ../../Github/kaggle_moa_team/scripts/script_simpleNN_oldcv.py: 0.015846

Loading OOF from ../../Github/kaggle_moa_team/oof/oof_fork-of-2heads-looper-super-puper-markpeng.npy ......
OOF Validation Loss of ../../Github/kaggle_moa_team/scripts/fork-of-2heads-looper-super-puper-markpeng-infer.py: 0.015887

Loading OOF from ../../Github/kaggle_moa_team/oof/oof_deepinsight_efficientnet_lightning_v7_b3_0.01850.npy ......
OOF Validation Loss of ../../Github/kaggle_moa_team/scripts/deepinsight_efficientnet_lightning_v7_b3_infer.py: 0.016016

Inital Blend OOF: 0.015370944555914716
[00:23] Optimised Blend OOF: 0.015331777381591449
Optimised Weights: [0.44090106 0.14508641 0.05945655 0.35455598]



Inferencing on seed23 fold0 ......
2020-11-28 18:51:31.584126: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-11-28 18:51:32.391611: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-11-28 18:51:32.392934: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce RTX 2080 Ti computeCapability: 7.5
coreClock: 1.545GHz coreCount: 68 deviceMemorySize: 10.76GiB deviceMemoryBandwidth: 573.69GiB/s
2020-11-28 18:51:32.392999: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-11-28 18:51:32.393910: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 

2020-11-28 18:51:33.490549: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
[[0.00062638 0.00139738 0.00256215 ... 0.00215134 0.00118474 0.00168681]
 [0.00035632 0.00282041 0.00062197 ... 0.00116035 0.00233351 0.00359165]
 [0.00111823 0.00116571 0.00234077 ... 0.00260893 0.00513932 0.00421351]
 [0.00030748 0.00038197 0.00049237 ... 0.00021168 0.0007948  0.00052682]
 [0.00435118 0.00289444 0.00288538 ... 0.00288126 0.0007065  0.00149353]]


Inferencing on seed23 fold1 ......
[[0.00043796 0.00087135 0.00177629 ... 0.0059837  0.00603774 0.00380702]
 [0.00099434 0.00191273 0.00115954 ... 0.00050997 0.01455157 0.00108468]
 [0.00072859 0.00028989 0.00163209 ... 0.00377872 0.00062831 0.00547929]
 [0.00150527 0.00081846 0.00254885 ... 0.00238689 0.00355374 0.00205896]
 [0.00068232 0.0011047  0.00187454 ... 0.00261347 0.00064067 0.00329006]]


Inferencing on seed23 fold2 ......
[[0.00078193 0.00042732 0.00251405 ... 0.00410442 

[[0.00083112 0.00102997 0.00112913 ... 0.00218203 0.01007643 0.00139813]
 [0.00130403 0.00040868 0.00049331 ... 0.00123464 0.00721349 0.00239361]
 [0.00082362 0.00055569 0.00603769 ... 0.00258942 0.00045117 0.00122529]
 [0.00164426 0.00084203 0.00287178 ... 0.00109287 0.00137246 0.0044546 ]
 [0.00104206 0.00219211 0.00291657 ... 0.00119398 0.00040803 0.00113686]]


Inferencing on seed1488 fold0 ......
[[0.00041637 0.00076366 0.00185131 ... 0.00074927 0.00080906 0.00062757]
 [0.00068896 0.00121188 0.00055628 ... 0.00127395 0.0001347  0.00816328]
 [0.0011636  0.00084701 0.00332347 ... 0.00686609 0.00115232 0.00409443]
 [0.00074673 0.00083443 0.00125868 ... 0.00120647 0.00032822 0.00361357]
 [0.00122167 0.00144062 0.00319215 ... 0.00267271 0.00139986 0.0015122 ]]


Inferencing on seed1488 fold1 ......
[[0.00053722 0.00158904 0.00302694 ... 0.00326372 0.00553973 0.00121735]
 [0.00055014 0.00126293 0.00050249 ... 0.00037031 0.00032316 0.00120134]
 [0.00172001 0.00094465 0.00868401 ... 0.005

[[0.00220144 0.0030131  0.00270208 ... 0.00153642 0.00093725 0.00144432]
 [0.00115876 0.00378029 0.0016098  ... 0.00088771 0.01036188 0.01074338]
 [0.00213321 0.00195554 0.00713593 ... 0.01207241 0.0053597  0.00399447]
 [0.00123775 0.00196412 0.00254078 ... 0.00322822 0.0017693  0.00369861]
 [0.00237524 0.00098375 0.00096843 ... 0.0028148  0.00053063 0.00137608]]


Inferencing on seed2208 fold0 ......
[[0.00204638 0.00113798 0.00063891 ... 0.00148389 0.00233765 0.00109445]
 [0.0019779  0.00084201 0.00062849 ... 0.00064207 0.00840471 0.00084796]
 [0.00120602 0.00119492 0.00185022 ... 0.00443862 0.00388473 0.00302889]
 [0.00219935 0.00347256 0.00602429 ... 0.00176903 0.00859255 0.00754905]
 [0.00093299 0.00074737 0.00156016 ... 0.00177619 0.00116859 0.00111089]]


Inferencing on seed2208 fold1 ......
[[0.0009003  0.00042809 0.00075695 ... 0.00067307 0.00324794 0.00145146]
 [0.00031122 0.00011111 0.00053146 ... 0.00031583 0.00143036 0.00114015]
 [0.00158012 0.00060916 0.00282818 ... 0.003

[[0.00088812 0.00102286 0.00188788 ... 0.00186495 0.00754405 0.00278977]
 [0.00124838 0.0007562  0.00090979 ... 0.00049957 0.02061236 0.00324751]
 [0.00113308 0.00048514 0.00096171 ... 0.00639555 0.0015716  0.0039128 ]
 [0.00153632 0.0018115  0.00634292 ... 0.00198243 0.00080375 0.00383651]
 [0.00056622 0.00050676 0.00233097 ... 0.00198139 0.00052732 0.00197423]]


Inferencing on seed404 fold0 ......
[[0.00557549 0.00195153 0.00248652 ... 0.00342367 0.00178767 0.0018387 ]
 [0.00081232 0.0019375  0.00098008 ... 0.00041475 0.00204593 0.00087728]
 [0.00061184 0.00046793 0.00204296 ... 0.00390025 0.00095449 0.00657   ]
 [0.0014319  0.00282594 0.00390048 ... 0.00134808 0.00097792 0.00351321]
 [0.00083724 0.00269259 0.00250249 ... 0.00103952 0.000378   0.00402767]]


Inferencing on seed404 fold1 ......
[[0.00118746 0.00261698 0.00133018 ... 0.00185213 0.00260876 0.00411207]
 [0.00020632 0.0003831  0.00087567 ... 0.00031728 0.0105675  0.00103415]
 [0.00052676 0.0003387  0.00089782 ... 0.00713

In [None]:
# Wall-clock time since total_start, which is set at the top of the
# blend/search cell; that cell must have run in this kernel first.
total_elapsed = time.time() - total_start
print(f"Total time spent: {total_elapsed/60:.2f} minutes.")

In [None]:
# [V4]
# [00:23] Optimised Blend OOF: 0.015331777381591449
# Optimised Weights: [0.44090106 0.14508641 0.05945655 0.35455598]

In [None]:
# [V3]
# improving-mark-s-2-heads-model-infer
# Number of finished trials: 3000
# Best trial:
#   Value: 0.01515466145873492
#   Params: 
#     w0: 0.0002980690037490555
#     w1: 0.29771381784976886
#     w2: 0.1569191862042946
#     w3: 0.18156875605872544
#     w4: 0.36371774630338105

In [None]:
# [V3]
# fork-of-2heads-looper-super-puper-markpeng-infer
# Number of finished trials: 3000
# Best trial:
#   Value: 0.015170138066049686
#   Params: 
#     w0: 0.00019903389488299251
#     w1: 0.3853752127955825
#     w2: 0.015968332256452233
#     w3: 0.22945916769823432
#     w4: 0.3711290150522236

In [None]:
# Shape check followed by a rich display of the blended submission frame.
print(submission.shape)
submission

In [None]:
# NOTE(review): `submission` is zeroed right after loading and is only filled
# by run_inference_scripts. With search_mode=True and run_submit_script=False
# this looks like it writes an all-zero file; confirm before submitting.
submission.to_csv('submission.csv', index=False)

## EOF

In [None]:
# In kernel mode, delete the Python scripts from the working directory and
# list what remains.
if kernel_mode:
    !rm ./*.py
    !ls -la