In [4]:
import os
import sys
import itertools
import pickle
from glob import glob
from tqdm import tqdm_notebook as tqdm

import numpy as np
import pandas as pd
from scipy.stats import spearmanr

from matplotlib import pyplot as plt
from matplotlib_venn import venn2, venn3
import seaborn as sns

import torch
from torch import nn, optim
from transformers import BertConfig, BertTokenizer, BertModel, BertForMaskedLM#, BertLayer, BertEmbeddings
from transformers.modeling_bert import BertLayer, BertEmbeddings

In [2]:
# Let pandas render wide/long frames: up to 500 columns and 500 rows.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# Enable the autoreload extension in mode 2 so that imported modules are
# reloaded before each cell runs (edits to project scripts take effect
# without restarting the kernel).
%load_ext autoreload
%autoreload 2

# Widen the notebook container to 90% of the browser window.
# NOTE(review): recent IPython releases expose `display` / `HTML` from
# `IPython.display`; the `IPython.core.display` path is deprecated there --
# confirm against the installed IPython version.
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
# Figure formats used by the inline matplotlib backend.
%config InlineBackend.figure_formats = {'png', 'retina'}

In [3]:
DEVICE = 'cpu'

In [37]:
import sys
import pickle
from functools import partial
from glob import glob

import numpy as np
import pandas as pd
import scipy as sp
import torch
from scipy.stats import spearmanr
from tqdm import tqdm

class OptimizedRounder(object):
    """
    Optimizer for rounding thresholds that maximizes the Spearman rank
    correlation between the discretized predictions and the ground truth.

    The structure follows the QWK threshold optimizer referenced below, but
    the loss used here is Spearman's rho, not Quadratic Weighted Kappa.
    # https://www.kaggle.com/naveenasaithambi/optimizedrounder-improved

    Usage: ``set_labels(labels)`` -> ``fit(X, y, initial_coef)`` ->
    ``predict(X, coefficients())``.
    """

    def __init__(self):
        # Result object of the optimizer; stays None until fit() has run.
        self.coef_ = None
        # Values assigned to the bins; must be provided through set_labels().
        self.labels = None

    def _require_labels(self):
        """Return the configured labels or fail with an explicit message."""
        if self.labels is None:
            raise RuntimeError(
                'labels are not set; call set_labels() before fit()/predict().')
        return self.labels

    @staticmethod
    def _bin_edges(coef):
        """Build the bin edges for pd.cut: -inf, the sorted thresholds, +inf."""
        return [-np.inf] + list(np.sort(coef)) + [np.inf]

    def _spearmanr_loss(self, coef, X, y, labels):
        """
        Negative Spearman correlation obtained with the given thresholds
        :param coef: A list of coefficients that will be used for rounding
        :param X: The raw predictions
        :param y: The ground truth labels
        :param labels: The values assigned to the bins (one more than coef)
        :return: -correlation, so that minimizing it maximizes the correlation
        """
        X_p = pd.cut(X, self._bin_edges(coef), labels=labels)
        return -spearmanr(y, X_p).correlation

    def fit(self, X, y, initial_coef):
        """
        Optimize rounding thresholds with Nelder-Mead
        :param X: The raw predictions
        :param y: The ground truth labels
        :param initial_coef: Starting thresholds for the optimizer
        :raises RuntimeError: if set_labels() has not been called
        """
        # Imported here so the call does not depend on some other import
        # having loaded the scipy.optimize submodule as a side effect.
        from scipy.optimize import minimize

        labels = self._require_labels()
        loss_partial = partial(self._spearmanr_loss, X=X, y=y, labels=labels)
        self.coef_ = minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef=None):
        """
        Make predictions with specified thresholds
        :param X: The raw predictions
        :param coef: A list of coefficients that will be used for rounding;
                     when omitted, the thresholds found by fit() are used
        :return: The result of pd.cut, i.e. one label per element of X
        :raises RuntimeError: if labels are not set, or if coef is omitted
                              and fit() has not been called
        """
        if coef is None:
            coef = self.coefficients()
        labels = self._require_labels()
        return pd.cut(X, self._bin_edges(coef), labels=labels)

    def coefficients(self):
        """
        Return the optimized coefficients
        :raises RuntimeError: if fit() has not been called
        """
        if self.coef_ is None:
            raise RuntimeError(
                'OptimizedRounder is not fitted; call fit() first.')
        return self.coef_['x']

    def set_labels(self, labels):
        """
        Set the values assigned to the bins
        :param labels: One label per bin, i.e. len(thresholds) + 1 entries
        """
        self.labels = labels

In [45]:
sys.path.append('../scripts/')
from get_optR3 import compute_spearmanr, get_opt_y_pred

In [39]:
class histogramBasedCoefInitializer:
    """
    Propose initial rounding thresholds from the label histogram.

    ``fit`` stores the cumulative count of each distinct label value (in
    ascending label order). ``predict`` sorts the predictions and, at every
    cumulative count except the last, returns the midpoint between the two
    neighbouring sorted predictions, so that the bins hold as many
    predictions as there are labels of each value.
    """

    def __init__(self):
        # Cumulative label counts; None until fit() is called.
        self.bins = None

    def fit(self, labels):
        """Store the cumulative counts of the label values and return self."""
        counts_per_value = pd.Series(labels).value_counts().sort_index()
        self.bins = counts_per_value.cumsum().values
        return self

    def predict(self, preds):
        """Return one threshold per boundary between consecutive label values."""
        ordered = list(preds)
        ordered.sort()
        if self.bins is None:
            raise Exception('plz fit at first.')
        # The last cumulative count equals the total and marks no boundary.
        return [(ordered[cut - 1] + ordered[cut]) / 2 for cut in self.bins[:-1]]

## snapshot 済みの model を load

In [6]:
CKPT_DIR = '../mnt/checkpoints'


def _load_snapshot_dicts(exp_id, ckpt_dir=CKPT_DIR):
    """Load `<ckpt_dir>/<exp_id>/snapshot_dicts.pkl` and return its content.

    :param exp_id: Experiment directory name, e.g. 'e059'
    :param ckpt_dir: Directory that holds the experiment directories
    :return: The unpickled object
    """
    with open(f'{ckpt_dir}/{exp_id}/snapshot_dicts.pkl', 'rb') as fin:
        # pickle can execute arbitrary code while loading: only point this
        # at files produced by our own training runs.
        return pickle.load(fin)


# One experiment per (architecture, question/answer target group) pair.
bert_question_dict = _load_snapshot_dicts('e059')
bert_answer_dict = _load_snapshot_dicts('e060')

roberta_question_dict = _load_snapshot_dicts('e068')
roberta_answer_dict = _load_snapshot_dicts('e070')

gpt2_question_dict = _load_snapshot_dicts('e072')
gpt2_answer_dict = _load_snapshot_dicts('e073')

xlnet_question_dict = _load_snapshot_dicts('e074')
xlnet_answer_dict = _load_snapshot_dicts('e075')

In [8]:
bert_question_dict.keys()

dict_keys([0, 1, 2, 3, 4])

In [10]:
bert_question_dict[0].keys()

dict_keys(['y_trues', 'y_preds'])

In [11]:
xlnet_answer_dict.keys()

dict_keys([0, 1, 2, 3, 4])

In [33]:
def _get_y_trues_and_y_preds_from_snapshot_dicts(snapshot_dicts, single):
    y_trues, y_preds = [], []
    for fold in range(5):
        if single:
            y_trues.append(snapshot_dicts[fold]['y_trues'][0])
            y_preds.append(snapshot_dicts[fold]['y_preds'][0])
        else:
            y_trues.append(np.average(snapshot_dicts[fold]['y_trues'], axis=0))
            y_preds.append(np.average(snapshot_dicts[fold]['y_preds'], axis=0))
    y_trues = np.concatenate(y_trues)
    y_preds = np.concatenate(y_preds)
    return y_trues, y_preds

def get_y_trues_and_y_preds_from_QA_snapshota_dicts(Q_snapshot_dicts, A_snapshot_dicts, single=False):
    """Join the question-side and answer-side out-of-fold arrays column-wise.

    :param Q_snapshot_dicts: Snapshot dicts of the question model
    :param A_snapshot_dicts: Snapshot dicts of the answer model
    :param single: Forwarded to the per-model helper (first snapshot only
                   when truthy, snapshot average otherwise)
    :return: (y_trues, y_preds) with the question columns first, followed by
             the answer columns
    """
    (question_trues, question_preds), (answer_trues, answer_preds) = (
        _get_y_trues_and_y_preds_from_snapshot_dicts(dicts, single)
        for dicts in (Q_snapshot_dicts, A_snapshot_dicts)
    )
    joined_trues = np.concatenate([question_trues, answer_trues], axis=1)
    joined_preds = np.concatenate([question_preds, answer_preds], axis=1)
    return joined_trues, joined_preds

In [34]:
# Out-of-fold targets/predictions per architecture, with the predictions
# averaged over all snapshots stored for each fold (single=False default).
bert_y_trues, bert_y_preds = get_y_trues_and_y_preds_from_QA_snapshota_dicts(bert_question_dict, bert_answer_dict)
roberta_y_trues, roberta_y_preds = get_y_trues_and_y_preds_from_QA_snapshota_dicts(roberta_question_dict, roberta_answer_dict)
gpt2_y_trues, gpt2_y_preds = get_y_trues_and_y_preds_from_QA_snapshota_dicts(gpt2_question_dict, gpt2_answer_dict)
xlnet_y_trues, xlnet_y_preds = get_y_trues_and_y_preds_from_QA_snapshota_dicts(xlnet_question_dict, xlnet_answer_dict)

In [35]:
# Same as the snapshot-averaged variables, but using only the first stored
# snapshot of every fold (single=True).
single_bert_y_trues, single_bert_y_preds = get_y_trues_and_y_preds_from_QA_snapshota_dicts(bert_question_dict, bert_answer_dict, single=True)
single_roberta_y_trues, single_roberta_y_preds = get_y_trues_and_y_preds_from_QA_snapshota_dicts(roberta_question_dict, roberta_answer_dict, single=True)
single_gpt2_y_trues, single_gpt2_y_preds = get_y_trues_and_y_preds_from_QA_snapshota_dicts(gpt2_question_dict, gpt2_answer_dict, single=True)
single_xlnet_y_trues, single_xlnet_y_preds = get_y_trues_and_y_preds_from_QA_snapshota_dicts(xlnet_question_dict, xlnet_answer_dict, single=True)

In [31]:
# Sanity check (looks good): the targets of every architecture must be
# element-wise identical, otherwise the rows are not aligned and the
# predictions cannot be blended.
(bert_y_trues == roberta_y_trues).all(), (bert_y_trues == gpt2_y_trues).all(), (bert_y_trues == xlnet_y_trues).all()

(True, True, True)

In [32]:
# Counter-check: the predictions are expected to differ between architectures.
(bert_y_preds == roberta_y_preds).all(), (bert_y_preds == gpt2_y_preds).all(), (bert_y_preds == xlnet_y_preds).all()

(False, False, False)

## 混ぜてみる
 - 4 つ全て混ぜてみる
 - 4 モデルのうち 3 つ (3/4) の snapshot を混ぜてみる？

In [52]:
def blend_and_evaluate(y_trues, y_preds_list, eval_func, num_labels=30):
    """Average several prediction arrays and score the blend.

    The blend is scored twice: as is, and after threshold-optimized rounding
    via ``get_opt_y_pred``. Both mean scores are printed.

    :param y_trues: Ground truth array
    :param y_preds_list: Non-empty sequence of prediction arrays of identical
                         shape; they are averaged with equal weights
    :param eval_func: Called as ``eval_func(y_trues, y_preds)``; its result is
                      reduced with ``np.mean`` for printing
    :param num_labels: Number of target columns forwarded to
                       ``get_opt_y_pred`` (defaults to the previously
                       hard-coded 30)
    :return: ``(eval_scores, opt_eval_scores)`` as returned by ``eval_func``
    :raises ValueError: if ``y_preds_list`` is empty
    """
    if len(y_preds_list) == 0:
        # np.average over an empty list would silently produce NaN scores.
        raise ValueError('y_preds_list must contain at least one prediction array.')

    y_preds = np.average(y_preds_list, axis=0)
    eval_scores = eval_func(y_trues, y_preds)
    # NOTE(review): this expects the two-value return of the get_opt_y_pred
    # imported from get_optR3 (first element is discarded) -- presumably
    # (thresholds, rounded predictions); confirm against that script.
    _, opt_y_preds = get_opt_y_pred(y_trues, y_preds, num_labels=num_labels)
    opt_eval_scores = eval_func(y_trues, opt_y_preds)
    print(f'original_score: {np.mean(eval_scores)}')
    print(f'opt_score: {np.mean(opt_eval_scores)}')
    return eval_scores, opt_eval_scores

#### 試しに single, top-2 snapshots を見てみる

In [54]:
# Baseline per architecture: a one-element list makes the "blend" a no-op,
# so each call scores that model's single-snapshot predictions on their own.
# Order of the printed score pairs: bert, roberta, gpt2, xlnet.
blend_and_evaluate(single_bert_y_trues, [single_bert_y_preds,  ], compute_spearmanr)
blend_and_evaluate(single_bert_y_trues, [single_roberta_y_preds, ], compute_spearmanr)
blend_and_evaluate(single_bert_y_trues, [single_gpt2_y_preds, ], compute_spearmanr)
blend_and_evaluate(single_bert_y_trues, [single_xlnet_y_preds], compute_spearmanr)
print('fini!')

original_score: 0.39787353711376366
opt_score: 0.4261143948591801
original_score: 0.3917349489436032
opt_score: 0.4159272241610885
original_score: 0.38234367637291916
opt_score: 0.4048386132780595
original_score: 0.3906094115371919
opt_score: 0.4208016538440192
fini!


In [55]:
# Same per-architecture baseline, but with the snapshot-averaged predictions.
# Order of the printed score pairs: bert, roberta, gpt2, xlnet.
blend_and_evaluate(single_bert_y_trues, [bert_y_preds,  ], compute_spearmanr)
blend_and_evaluate(single_bert_y_trues, [roberta_y_preds, ], compute_spearmanr)
blend_and_evaluate(single_bert_y_trues, [gpt2_y_preds, ], compute_spearmanr)
blend_and_evaluate(single_bert_y_trues, [xlnet_y_preds], compute_spearmanr)
print('fini!')

original_score: 0.40377496605021973
opt_score: 0.44007917576260047
original_score: 0.3993632474521848
opt_score: 0.4290080945752392


  c /= stddev[:, None]
  c /= stddev[None, :]
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


original_score: 0.3854747911354585
opt_score: 0.4121866682795468
original_score: 0.3977770316464774
opt_score: 0.42765094537546605
fini!


#### all blends がどうなるかを見てみる

In [53]:
# Equal-weight blend of the single-snapshot predictions of all four architectures.
eval_scores, opt_eval_scores = blend_and_evaluate(single_bert_y_trues, [single_bert_y_preds,  single_roberta_y_preds, single_gpt2_y_preds, single_xlnet_y_preds], compute_spearmanr)

original_score: 0.4180430600394728
opt_score: 0.45011560307585413


#### 組み合わせがどうなるかを見てみる

In [56]:
# Leave-one-out blends of the single-snapshot predictions.
# Dropped architecture per call, in order: bert, roberta, gpt2, xlnet.
blend_and_evaluate(single_bert_y_trues, [ single_roberta_y_preds, single_gpt2_y_preds, single_xlnet_y_preds], compute_spearmanr)
blend_and_evaluate(single_bert_y_trues, [single_bert_y_preds, single_gpt2_y_preds, single_xlnet_y_preds], compute_spearmanr)
blend_and_evaluate(single_bert_y_trues, [single_bert_y_preds,  single_roberta_y_preds, single_xlnet_y_preds], compute_spearmanr)
blend_and_evaluate(single_bert_y_trues, [single_bert_y_preds,  single_roberta_y_preds, single_gpt2_y_preds], compute_spearmanr)
print('fini!')

original_score: 0.4123991396964391
opt_score: 0.44228018730704144
original_score: 0.41487735800374637
opt_score: 0.44704608097446064
original_score: 0.4165326987734412
opt_score: 0.4536773141844584
original_score: 0.4142418328259677
opt_score: 0.44462524506286166
fini!


In [58]:
# Leave-one-out blends of the snapshot-averaged predictions.
# Dropped architecture per call, in order: bert, roberta, gpt2, xlnet.
blend_and_evaluate(bert_y_trues, [ roberta_y_preds, gpt2_y_preds, xlnet_y_preds], compute_spearmanr)
blend_and_evaluate(bert_y_trues, [bert_y_preds, gpt2_y_preds, xlnet_y_preds], compute_spearmanr)
blend_and_evaluate(bert_y_trues, [bert_y_preds,  roberta_y_preds, xlnet_y_preds], compute_spearmanr)
blend_and_evaluate(bert_y_trues, [bert_y_preds,  roberta_y_preds, gpt2_y_preds], compute_spearmanr)
print('fini!')

original_score: 0.41494955942931444
opt_score: 0.4396218163436752
original_score: 0.41725360795561056
opt_score: 0.45043969072616535
original_score: 0.41943000297059607
opt_score: 0.455408513948017
original_score: 0.4165551766237184
opt_score: 0.44958809940315647
fini!


#### 一番良さそうな gpt2 抜きの場合にどれを single にするとよいか見てみる

In [59]:
# gpt2 is left out. In turn, one architecture contributes its single-snapshot
# predictions while the other two contribute their snapshot averages.
# Single-snapshot architecture per call, in order: bert, roberta, xlnet.
blend_and_evaluate(bert_y_trues, [single_bert_y_preds,  roberta_y_preds, xlnet_y_preds], compute_spearmanr)
blend_and_evaluate(bert_y_trues, [bert_y_preds,  single_roberta_y_preds, xlnet_y_preds], compute_spearmanr)
blend_and_evaluate(bert_y_trues, [bert_y_preds,  roberta_y_preds, single_xlnet_y_preds], compute_spearmanr)
print('fini!')

original_score: 0.4186640931608012
opt_score: 0.45243258027214034
original_score: 0.4186459442933615
opt_score: 0.4563528045631384
original_score: 0.41862052031267105
opt_score: 0.4571952857338109
fini!


## まとめ
(ちょっと気に入らんけど) bert * 2, roberta * 2, xlnet * 1 の average が良さそう (4 モデルすべて * 1 の average よりも)。ここで「* n」は各モデルで平均する snapshot の数を指す (weight ではない)。

## model を load して snapshot ensemble 試してみる

In [4]:
def get_snapshot_info_df(base_dir, n_folds=5):
    """Index the checkpoint files of one experiment by parsing their names.

    Looks for ``<base_dir>/<fold>/*.pth`` and splits every file name on '_'.
    The tokens at positions 1, 3, 4 and 5 are read as fold, epoch, validation
    loss and validation metric, i.e. names shaped like
    ``fold_<fold>_epoch_<epoch>_<val_loss>_<val_metric>_...``.

    :param base_dir: Experiment directory containing one sub-directory per fold
    :param n_folds: Number of fold directories to scan (0 .. n_folds - 1)
    :return: DataFrame with one row per checkpoint and the columns
             ckpt_filename, fold, epoch, val_loss, val_metric; the columns
             are present even when no checkpoint is found
    :raises ValueError / IndexError: if a file name does not follow the
             expected pattern
    """
    columns = ['ckpt_filename', 'fold', 'epoch', 'val_loss', 'val_metric']
    res_dicts = []
    for fold in tqdm(list(range(n_folds))):
        for ckpt in glob(f'{base_dir}/{fold}/*.pth'):
            name_tokens = os.path.basename(ckpt).split('_')
            res_dicts.append({
                'ckpt_filename': ckpt,
                'fold': int(name_tokens[1]),
                'epoch': int(name_tokens[3]),
                'val_loss': float(name_tokens[4]),
                'val_metric': float(name_tokens[5]),
            })
    # Explicit columns keep downstream sort/groupby calls working on an
    # empty result instead of failing with a KeyError.
    return pd.DataFrame(res_dicts, columns=columns)

In [38]:
# Index the checkpoints of experiment e060 (the one loaded into
# bert_answer_dict earlier in this notebook). Swap in the commented line
# to inspect e059 instead.
# snapshot_df = get_snapshot_info_df('../mnt/checkpoints/e059')
snapshot_df = get_snapshot_info_df('../mnt/checkpoints/e060')
# Display in training order; snapshot_df itself stays unsorted.
snapshot_df.sort_values(['fold', 'epoch'])

100%|██████████| 5/5 [00:00<00:00, 148.46it/s]


Unnamed: 0,ckpt_filename,fold,epoch,val_loss,val_metric
1,../mnt/checkpoints/e060/0/fold_0_epoch_0_0.444...,0,0,0.44409,0.15826
0,../mnt/checkpoints/e060/0/fold_0_epoch_1_0.366...,0,1,0.3664,0.33436
5,../mnt/checkpoints/e060/0/fold_0_epoch_2_0.364...,0,2,0.36413,0.36288
4,../mnt/checkpoints/e060/0/fold_0_epoch_3_0.372...,0,3,0.3726,0.34227
3,../mnt/checkpoints/e060/0/fold_0_epoch_4_0.375...,0,4,0.37538,0.34899
2,../mnt/checkpoints/e060/0/fold_0_epoch_5_0.380...,0,5,0.38036,0.34078
11,../mnt/checkpoints/e060/1/fold_1_epoch_0_0.441...,1,0,0.44151,0.14596
10,../mnt/checkpoints/e060/1/fold_1_epoch_1_0.374...,1,1,0.37413,0.32893
9,../mnt/checkpoints/e060/1/fold_1_epoch_2_0.372...,1,2,0.37233,0.34316
6,../mnt/checkpoints/e060/1/fold_1_epoch_3_0.380...,1,3,0.38092,0.33905


#### 上位 k 個の snapshot を使った場合にどういう成果が得られるかを見てみる

In [42]:
# Rank the snapshots of each fold by validation metric. rank() is ascending,
# so the largest rank marks the snapshot with the highest val_metric.
# Note: this adds the column to snapshot_df in place.
snapshot_df['rank'] = snapshot_df.groupby(['fold']).val_metric.rank()
snapshot_df

Unnamed: 0,ckpt_filename,fold,epoch,val_loss,val_metric,rank
0,../mnt/checkpoints/e060/0/fold_0_epoch_1_0.366...,0,1,0.3664,0.33436,2.0
1,../mnt/checkpoints/e060/0/fold_0_epoch_0_0.444...,0,0,0.44409,0.15826,1.0
2,../mnt/checkpoints/e060/0/fold_0_epoch_5_0.380...,0,5,0.38036,0.34078,3.0
3,../mnt/checkpoints/e060/0/fold_0_epoch_4_0.375...,0,4,0.37538,0.34899,5.0
4,../mnt/checkpoints/e060/0/fold_0_epoch_3_0.372...,0,3,0.3726,0.34227,4.0
5,../mnt/checkpoints/e060/0/fold_0_epoch_2_0.364...,0,2,0.36413,0.36288,6.0
6,../mnt/checkpoints/e060/1/fold_1_epoch_3_0.380...,1,3,0.38092,0.33905,4.0
7,../mnt/checkpoints/e060/1/fold_1_epoch_4_0.384...,1,4,0.38446,0.35154,6.0
8,../mnt/checkpoints/e060/1/fold_1_epoch_5_0.388...,1,5,0.38884,0.33372,3.0
9,../mnt/checkpoints/e060/1/fold_1_epoch_2_0.372...,1,2,0.37233,0.34316,5.0


In [67]:
snapshot_dicts = {}

for fold in tqdm(list(range(5))):
    # Snapshots of this fold, highest rank (= highest val_metric) first.
    fold_rows = (
        snapshot_df.query(f'fold == {fold}')
        .sort_values('rank', ascending=False)
        .reset_index(drop=True)
    )
    snapshot_dict = {'y_trues': [], 'y_preds': [], 'model_state_dict': []}
    for _, row in fold_rows.iterrows():
        # map_location keeps the load working on a machine without the
        # device the checkpoint was saved from.
        ckpt = torch.load(row['ckpt_filename'], map_location=DEVICE)
        # Order every snapshot by qa id so rows line up across snapshots.
        qa_ids = ckpt['val_qa_ids']
        qa_ids_argsort = np.argsort(qa_ids)
        snapshot_dict['y_trues'].append(ckpt['val_y_trues'][qa_ids_argsort])
        snapshot_dict['y_preds'].append(ckpt['val_y_preds'][qa_ids_argsort])
        snapshot_dict['model_state_dict'].append(ckpt['model_state_dict'])
    snapshot_dicts[fold] = snapshot_dict
    # Only fold 0 is loaded: the loop stops after its first iteration.
    break

  0%|          | 0/5 [00:03<?, ?it/s]


In [68]:
snapshot_dict = snapshot_dicts[0]

In [47]:
def get_opt_y_pred(y_true, y_pred, num_labels=21):
    """Round the first ``num_labels`` prediction columns with fitted thresholds.

    For every column, the labels are the sorted unique values of the ground
    truth, the starting thresholds come from histogramBasedCoefInitializer,
    and OptimizedRounder refines them before discretizing the predictions.

    NOTE: this definition rebinds the name ``get_opt_y_pred`` that an earlier
    cell imports from get_optR3, and it returns a single array. Code that
    unpacks two return values (as blend_and_evaluate does) relies on the
    imported version and must run before this cell.

    :param y_true: 2-D ground truth, indexed as ``y_true[:, i]``
    :param y_pred: 2-D raw predictions, indexed as ``y_pred[:, i]``
    :param num_labels: Number of leading columns to process
    :return: Array of rounded predictions, one column per processed column
    """
    def _round_column(col_idx):
        truth_col = y_true[:, col_idx]
        pred_col = y_pred[:, col_idx]

        rounder = OptimizedRounder()
        rounder.set_labels(np.sort(np.unique(truth_col)))

        start_thresholds = histogramBasedCoefInitializer().fit(truth_col).predict(pred_col)
        rounder.fit(pred_col, truth_col, start_thresholds)
        return rounder.predict(pred_col, rounder.coefficients())

    rounded_columns = [_round_column(col_idx) for col_idx in range(num_labels)]
    # The list is column-major; transpose back to (rows, columns).
    return np.asarray(rounded_columns).T

In [69]:
# Ground truth as stored with the first, i.e. highest-ranked, snapshot.
y_true = snapshot_dict['y_trues'][0]

# k is zero-based: "top k" averages the k + 1 highest-ranked snapshots
# with equal weights. Six iterations match the six checkpoints per fold
# listed in snapshot_df.
for k in range(6):
    print(f' --------- top {k} --------- ')
    y_pred = np.mean(snapshot_dict['y_preds'][:k+1], axis=0)
    original_score = np.mean(compute_spearmanr(y_true, y_pred))
    print(f'original_score : {original_score}')
    # Only the first 9 columns are threshold-rounded.
    opt_y_pred = get_opt_y_pred(y_true, y_pred, num_labels=9)
    opt_score = np.mean(compute_spearmanr(y_true, opt_y_pred))
    print(f'opt_score : {opt_score}')

 --------- top 0 --------- 
original_score : 0.3628847311147994
opt_score : 0.370105983592996
 --------- top 1 --------- 
original_score : 0.36359187565200396
opt_score : 0.37941430977945306
 --------- top 2 --------- 
original_score : 0.36105658303060834
opt_score : 0.36472392967197254
 --------- top 3 --------- 
original_score : 0.35850096303446816
opt_score : 0.36183891720388417
 --------- top 4 --------- 
original_score : 0.36059427483389317
opt_score : 0.37403301456613636
 --------- top 5 --------- 
original_score : 0.3549419490561672
opt_score : 0.3734286936041146


In [88]:
# Ground truth as stored with the first, i.e. highest-ranked, snapshot.
y_true = snapshot_dict['y_trues'][0]

# Same top-(k + 1) sweep as the equal-weight version, but with linearly
# decreasing weights k + 1, k, ..., 1: the best snapshot weighs the most.
for k in range(6):
    print(f' --------- top {k} --------- ')
    y_pred = np.average(snapshot_dict['y_preds'][:k+1], weights=[i+1 for i in range(k+1)][::-1], axis=0)
    original_score = np.mean(compute_spearmanr(y_true, y_pred))
    print(f'original_score : {original_score}')
    # Only the first 9 columns are threshold-rounded.
    opt_y_pred = get_opt_y_pred(y_true, y_pred, num_labels=9)
    opt_score = np.mean(compute_spearmanr(y_true, opt_y_pred))
    print(f'opt_score : {opt_score}')

 --------- top 0 --------- 
original_score : 0.3628847311147994
opt_score : 0.370105983592996
 --------- top 1 --------- 
original_score : 0.3656735783973128
opt_score : 0.37947199914484453
 --------- top 2 --------- 
original_score : 0.36423148282519535
opt_score : 0.37760377725035343
 --------- top 3 --------- 
original_score : 0.3625230419023142
opt_score : 0.37541032775707417
 --------- top 4 --------- 
original_score : 0.362327112629013
opt_score : 0.36597550735344964
 --------- top 5 --------- 
original_score : 0.3617967440746636
opt_score : 0.3804809949672511


#### first epoch の混ぜ具合を見てみる

In [104]:
# Ground truth as stored with the first, i.e. highest-ranked, snapshot.
y_true = snapshot_dict['y_trues'][0]

# Blend the two best snapshots (weight 1 each) with the lowest-ranked one
# (epoch 0 for fold 0 in the rank table above) at a small weight, swept
# from 0 to 0.10 in steps of 0.01.
for weight in np.arange(0, 0.11, 0.01):
    print(f' --------- weight {weight} --------- ')
    y_pred = np.average(snapshot_dict['y_preds'][0:2] + snapshot_dict['y_preds'][-1:], weights=[1, 1, weight], axis=0)
    original_score = np.mean(compute_spearmanr(y_true, y_pred))
    print(f'original_score : {original_score}')
    # Only the first 9 columns are threshold-rounded.
    opt_y_pred = get_opt_y_pred(y_true, y_pred, num_labels=9)
    opt_score = np.mean(compute_spearmanr(y_true, opt_y_pred))
    print(f'opt_score : {opt_score}')

 --------- weight 0.0 --------- 
original_score : 0.3635917640627021
opt_score : 0.37941430977945306
 --------- weight 0.01 --------- 
original_score : 0.36355606071084945
opt_score : 0.36167292572859655
 --------- weight 0.02 --------- 
original_score : 0.3634983037411785
opt_score : 0.3746594127499283
 --------- weight 0.03 --------- 
original_score : 0.3634193524223484
opt_score : 0.3808735825562981
 --------- weight 0.04 --------- 
original_score : 0.3633990360391487
opt_score : 0.3778591130797972
 --------- weight 0.05 --------- 
original_score : 0.363302796084233
opt_score : 0.376773620640199
 --------- weight 0.06 --------- 
original_score : 0.3632906893921314
opt_score : 0.37499524925388805
 --------- weight 0.07 --------- 
original_score : 0.3631440929133196
opt_score : 0.37813004290115915
 --------- weight 0.08 --------- 
original_score : 0.36307601963689096
opt_score : 0.3757001518329829
 --------- weight 0.09 --------- 
original_score : 0.3629490544712056
opt_score : 0.3780

#### わかったこと
 - 1 epoch 目 (warmup) をちょっと weight かけて混ぜると良い
     - weight は 0.03 が最適...？でもこれは上振れひいてるだけにも見えるので混ぜなくて良さそう
 - top2 snapshot average は結構良さそう (weight 1 : 1 でも)

In [105]:
# Repeat the snapshot analysis for experiment e059 (the one loaded into
# bert_question_dict earlier in this notebook). This rebinds snapshot_df,
# replacing the e060 index.
snapshot_df = get_snapshot_info_df('../mnt/checkpoints/e059')
# snapshot_df = get_snapshot_info_df('../mnt/checkpoints/e060')
# Display in training order; snapshot_df itself stays unsorted.
snapshot_df.sort_values(['fold', 'epoch'])

100%|██████████| 5/5 [00:00<00:00, 2810.82it/s]


Unnamed: 0,ckpt_filename,fold,epoch,val_loss,val_metric
1,../mnt/checkpoints/e059/0/fold_0_epoch_0_0.468...,0,0,0.46894,0.09985
4,../mnt/checkpoints/e059/0/fold_0_epoch_1_0.375...,0,1,0.37514,0.40437
2,../mnt/checkpoints/e059/0/fold_0_epoch_2_0.370...,0,2,0.37055,0.41944
3,../mnt/checkpoints/e059/0/fold_0_epoch_3_0.369...,0,3,0.36953,0.42207
5,../mnt/checkpoints/e059/0/fold_0_epoch_4_0.372...,0,4,0.37282,0.42366
0,../mnt/checkpoints/e059/0/fold_0_epoch_5_0.374...,0,5,0.37481,0.42
6,../mnt/checkpoints/e059/1/fold_1_epoch_0_0.469...,1,0,0.46931,0.10207
11,../mnt/checkpoints/e059/1/fold_1_epoch_1_0.377...,1,1,0.37764,0.38441
9,../mnt/checkpoints/e059/1/fold_1_epoch_2_0.367...,1,2,0.36728,0.41026
10,../mnt/checkpoints/e059/1/fold_1_epoch_3_0.369...,1,3,0.36979,0.40877


In [107]:
# Rank the snapshots of each fold by validation metric. rank() is ascending,
# so the largest rank marks the snapshot with the highest val_metric.
# Note: this adds the column to snapshot_df in place.
snapshot_df['rank'] = snapshot_df.groupby(['fold']).val_metric.rank()
snapshot_df

Unnamed: 0,ckpt_filename,fold,epoch,val_loss,val_metric,rank
0,../mnt/checkpoints/e059/0/fold_0_epoch_5_0.374...,0,5,0.37481,0.42,4.0
1,../mnt/checkpoints/e059/0/fold_0_epoch_0_0.468...,0,0,0.46894,0.09985,1.0
2,../mnt/checkpoints/e059/0/fold_0_epoch_2_0.370...,0,2,0.37055,0.41944,3.0
3,../mnt/checkpoints/e059/0/fold_0_epoch_3_0.369...,0,3,0.36953,0.42207,5.0
4,../mnt/checkpoints/e059/0/fold_0_epoch_1_0.375...,0,1,0.37514,0.40437,2.0
5,../mnt/checkpoints/e059/0/fold_0_epoch_4_0.372...,0,4,0.37282,0.42366,6.0
6,../mnt/checkpoints/e059/1/fold_1_epoch_0_0.469...,1,0,0.46931,0.10207,1.0
7,../mnt/checkpoints/e059/1/fold_1_epoch_5_0.372...,1,5,0.37296,0.40772,3.0
8,../mnt/checkpoints/e059/1/fold_1_epoch_4_0.367...,1,4,0.36793,0.41268,6.0
9,../mnt/checkpoints/e059/1/fold_1_epoch_2_0.367...,1,2,0.36728,0.41026,5.0


In [108]:
snapshot_dicts = {}

for fold in tqdm(list(range(5))):
    # Snapshots of this fold, highest rank (= highest val_metric) first.
    fold_rows = (
        snapshot_df.query(f'fold == {fold}')
        .sort_values('rank', ascending=False)
        .reset_index(drop=True)
    )
    snapshot_dict = {'y_trues': [], 'y_preds': [], 'model_state_dict': []}
    for _, row in fold_rows.iterrows():
        # map_location keeps the load working on a machine without the
        # device the checkpoint was saved from.
        ckpt = torch.load(row['ckpt_filename'], map_location=DEVICE)
        # Order every snapshot by qa id so rows line up across snapshots.
        qa_ids = ckpt['val_qa_ids']
        qa_ids_argsort = np.argsort(qa_ids)
        snapshot_dict['y_trues'].append(ckpt['val_y_trues'][qa_ids_argsort])
        snapshot_dict['y_preds'].append(ckpt['val_y_preds'][qa_ids_argsort])
        snapshot_dict['model_state_dict'].append(ckpt['model_state_dict'])
    snapshot_dicts[fold] = snapshot_dict
    # Only fold 0 is loaded: the loop stops after its first iteration.
    break

  0%|          | 0/5 [00:00<?, ?it/s]

In [109]:
snapshot_dict = snapshot_dicts[0]

In [110]:
# Ground truth as stored with the first, i.e. highest-ranked, snapshot.
y_true = snapshot_dict['y_trues'][0]

# k is zero-based: "top k" averages the k + 1 highest-ranked snapshots
# with equal weights.
for k in range(6):
    print(f' --------- top {k} --------- ')
    y_pred = np.mean(snapshot_dict['y_preds'][:k+1], axis=0)
    original_score = np.mean(compute_spearmanr(y_true, y_pred))
    print(f'original_score : {original_score}')
    # NOTE(review): snapshot_df now points at e059, which this notebook loads
    # as the *question* model, while num_labels=9 is carried over from the
    # e060 (answer) run and get_opt_y_pred defaults to 21 -- confirm that 9
    # is the intended number of columns here.
    opt_y_pred = get_opt_y_pred(y_true, y_pred, num_labels=9)
    opt_score = np.mean(compute_spearmanr(y_true, opt_y_pred))
    print(f'opt_score : {opt_score}')

 --------- top 0 --------- 
original_score : 0.42365526733188663
opt_score : 0.44326111287788406
 --------- top 1 --------- 
original_score : 0.42613238607464965
opt_score : 0.4552375961182593
 --------- top 2 --------- 
original_score : 0.42590249083662846
opt_score : 0.4531235660531298
 --------- top 3 --------- 
original_score : 0.42741705871314717
opt_score : 0.45201576355798473
 --------- top 4 --------- 
original_score : 0.4279312302779748
opt_score : 0.4549547351653802
 --------- top 5 --------- 
original_score : 0.4214820777962884
opt_score : 0.4545861528795436


In [112]:
# Ground truth as stored with the first, i.e. highest-ranked, snapshot.
y_true = snapshot_dict['y_trues'][0]

# Coarse sweep: two best snapshots (weight 1 each) plus the lowest-ranked
# snapshot at a weight from 0 to 1.0 in steps of 0.1.
for weight in np.arange(0, 1.1, 0.1):
    print(f' --------- weight {weight} --------- ')
    y_pred = np.average(snapshot_dict['y_preds'][0:2] + snapshot_dict['y_preds'][-1:], weights=[1, 1, weight], axis=0)
    original_score = np.mean(compute_spearmanr(y_true, y_pred))
    print(f'original_score : {original_score}')
    # NOTE(review): e059 is loaded as the *question* model in this notebook;
    # num_labels=9 looks carried over from the e060 (answer) run -- confirm.
    opt_y_pred = get_opt_y_pred(y_true, y_pred, num_labels=9)
    opt_score = np.mean(compute_spearmanr(y_true, opt_y_pred))
    print(f'opt_score : {opt_score}')

 --------- weight 0.0 --------- 
original_score : 0.4261325552996813
opt_score : 0.4552375961182593
 --------- weight 0.1 --------- 
original_score : 0.4244973894673522
opt_score : 0.4562121086353901
 --------- weight 0.2 --------- 
original_score : 0.42273399753463825
opt_score : 0.45395505221547316
 --------- weight 0.30000000000000004 --------- 
original_score : 0.4210080009295397
opt_score : 0.45418838584223575
 --------- weight 0.4 --------- 
original_score : 0.4194577079653098
opt_score : 0.4516499488728317
 --------- weight 0.5 --------- 
original_score : 0.4180597538591983
opt_score : 0.4521816601321864
 --------- weight 0.6000000000000001 --------- 
original_score : 0.41671158813772785
opt_score : 0.451251374536033
 --------- weight 0.7000000000000001 --------- 
original_score : 0.4154077644118532
opt_score : 0.45254678969892453
 --------- weight 0.8 --------- 
original_score : 0.4142088151564722
opt_score : 0.453056072286838
 --------- weight 0.9 --------- 
original_score : 0

In [111]:
# Ground truth as stored with the first, i.e. highest-ranked, snapshot.
y_true = snapshot_dict['y_trues'][0]

# Fine sweep: two best snapshots (weight 1 each) plus the lowest-ranked
# snapshot at a weight from 0 to 0.10 in steps of 0.01.
for weight in np.arange(0, 0.11, 0.01):
    print(f' --------- weight {weight} --------- ')
    y_pred = np.average(snapshot_dict['y_preds'][0:2] + snapshot_dict['y_preds'][-1:], weights=[1, 1, weight], axis=0)
    original_score = np.mean(compute_spearmanr(y_true, y_pred))
    print(f'original_score : {original_score}')
    # NOTE(review): e059 is loaded as the *question* model in this notebook;
    # num_labels=9 looks carried over from the e060 (answer) run -- confirm.
    opt_y_pred = get_opt_y_pred(y_true, y_pred, num_labels=9)
    opt_score = np.mean(compute_spearmanr(y_true, opt_y_pred))
    print(f'opt_score : {opt_score}')

 --------- weight 0.0 --------- 
original_score : 0.4261325552996813
opt_score : 0.4552375961182593
 --------- weight 0.01 --------- 
original_score : 0.42592114663360175
opt_score : 0.45382356990600886
 --------- weight 0.02 --------- 
original_score : 0.42576380965601135
opt_score : 0.4544389098496915
 --------- weight 0.03 --------- 
original_score : 0.42561421200654753
opt_score : 0.45427744444812784
 --------- weight 0.04 --------- 
original_score : 0.4253993607589279
opt_score : 0.45425383926025564
 --------- weight 0.05 --------- 
original_score : 0.4252744412605826
opt_score : 0.45465569715377363
 --------- weight 0.06 --------- 
original_score : 0.4250820441050089
opt_score : 0.45355107173162484
 --------- weight 0.07 --------- 
original_score : 0.4249225326086575
opt_score : 0.45403332728627593
 --------- weight 0.08 --------- 
original_score : 0.42479401482426304
opt_score : 0.4552623797454474
 --------- weight 0.09 --------- 
original_score : 0.4246539763016442
opt_score : 

#### question 側でもほぼ同じ傾向っぽい