In [4]:
import os
import sys
import itertools
import pickle
from glob import glob
from tqdm import tqdm_notebook as tqdm

import numpy as np
import pandas as pd
from scipy.stats import spearmanr

from matplotlib import pyplot as plt
from matplotlib_venn import venn2, venn3
import seaborn as sns

import torch
from torch import nn, optim
from transformers import BertConfig, BertTokenizer, BertModel, BertForMaskedLM#, BertLayer, BertEmbeddings
from transformers.modeling_bert import BertLayer, BertEmbeddings

In [2]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# re-load functions
%load_ext autoreload
%autoreload 2

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
%config InlineBackend.figure_formats = {'png', 'retina'}

In [3]:
# Torch device name; not referenced by any of the cells visible in this notebook section.
DEVICE = 'cpu'

In [37]:
import sys
import pickle
from functools import partial
from glob import glob

import numpy as np
import pandas as pd
import scipy as sp
import torch
from scipy.stats import spearmanr
from tqdm import tqdm

class OptimizedRounder(object):
    """
    Search for bin thresholds that map continuous predictions onto a fixed
    set of label values so that the Spearman rank correlation with the
    ground truth is maximised.

    Adapted from the threshold optimizer at
    https://www.kaggle.com/naveenasaithambi/optimizedrounder-improved
    (that version optimises QWK; the objective here is Spearman correlation).

    Expected call order: ``set_labels`` -> ``fit`` -> ``coefficients`` /
    ``predict``.
    """

    def __init__(self):
        # Placeholder until fit() stores the scipy optimisation result.
        self.coef_ = 0

    def _require_labels(self):
        """Return the configured labels, or fail with an actionable message."""
        if not hasattr(self, 'labels'):
            # Same exception type as the implicit failure, clearer message.
            raise AttributeError(
                'labels are not set; call set_labels() before fit() or predict().')
        return self.labels

    def _spearmanr_loss(self, coef, X, y, labels):
        """
        Negative Spearman correlation obtained with the given thresholds.

        :param coef: Thresholds used for binning (sorted internally)
        :param X: The raw predictions
        :param y: The ground truth labels
        :param labels: Value assigned to each bin; pd.cut requires one label
                       per bin, i.e. len(coef) + 1 entries
        :return: ``-spearmanr(y, binned X).correlation``; this is NaN when the
                 correlation is undefined (e.g. every prediction falls into
                 the same bin)
        """
        X_p = pd.cut(X, [-np.inf] + list(np.sort(coef)) +
                     [np.inf], labels=labels)

        return -spearmanr(y, X_p).correlation

    def fit(self, X, y, initial_coef):
        """
        Optimize rounding thresholds with the Nelder-Mead method.

        :param X: The raw predictions
        :param y: The ground truth labels
        :param initial_coef: Starting thresholds for the optimizer
        :raises AttributeError: if set_labels() has not been called
        """
        # Imported explicitly: `import scipy as sp` alone does not guarantee
        # that the `sp.optimize` submodule has been loaded.
        from scipy.optimize import minimize

        labels = self._require_labels()
        loss_partial = partial(self._spearmanr_loss, X=X, y=y, labels=labels)
        self.coef_ = minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef):
        """
        Make predictions with specified thresholds

        :param X: The raw predictions
        :param coef: Thresholds used for binning (sorted internally)
        :return: pd.cut result whose categories are the configured labels
        :raises AttributeError: if set_labels() has not been called
        """
        labels = self._require_labels()
        return pd.cut(X, [-np.inf] + list(np.sort(coef)) +
                      [np.inf], labels=labels)

    def coefficients(self):
        """
        Return the optimized coefficients (the ``x`` entry of the scipy
        optimisation result stored by fit()).

        :raises TypeError: if fit() has not been called yet
        """
        try:
            return self.coef_['x']
        except TypeError as exc:
            # coef_ still holds the numeric placeholder from __init__.
            raise TypeError(
                'coefficients() called before fit(); no optimisation result is available.'
            ) from exc

    def set_labels(self, labels):
        """Set the values assigned to the bins (one per bin, in ascending bin order)."""
        self.labels = labels

In [45]:
# Make the project's helper scripts importable; the relative entry is resolved
# against the kernel's current working directory.
sys.path.append('../scripts/')
from get_optR3 import compute_spearmanr, get_opt_y_pred

In [39]:
class histogramBasedCoefInitializer:
    """
    Derive initial thresholds for predictions from the histogram of labels.

    ``fit`` stores the cumulative count of each distinct label value (in
    ascending label order); ``predict`` places a threshold halfway between
    the two sorted predictions that straddle each cumulative count.
    """

    def __init__(self):
        # Cumulative label counts; None until fit() is called.
        self.bins = None

    def fit(self, labels):
        """Store cumulative counts of the distinct label values and return self."""
        label_counts = pd.Series(labels).value_counts()
        self.bins = label_counts.sort_index().cumsum().values
        return self

    def predict(self, preds):
        """
        Return one threshold per boundary between consecutive label values.

        The last cumulative count (the total) is skipped, so the result has
        ``len(self.bins) - 1`` entries. The cumulative counts are used as
        positions into the sorted predictions.

        :raises Exception: if fit() has not been called
        """
        ordered = list(preds)
        ordered.sort()
        if self.bins is None:
            raise Exception('plz fit at first.')
        return [
            (ordered[boundary - 1] + ordered[boundary]) / 2
            for boundary in self.bins[:-1]
        ]

## snapshot 済みの model を load

In [6]:
CKPT_DIR = '../mnt/checkpoints'


def _load_snapshot_dicts(exp_id):
    """Unpickle ``{CKPT_DIR}/{exp_id}/snapshot_dicts.pkl`` and return its content."""
    # NOTE: pickle.load can execute arbitrary code stored in the file;
    # only load checkpoints produced by trusted training runs.
    with open(f'{CKPT_DIR}/{exp_id}/snapshot_dicts.pkl', 'rb') as fin:
        return pickle.load(fin)


# One experiment directory per (architecture, question/answer head) pair.
bert_question_dict = _load_snapshot_dicts('e059')
bert_answer_dict = _load_snapshot_dicts('e060')

roberta_question_dict = _load_snapshot_dicts('e068')
roberta_answer_dict = _load_snapshot_dicts('e070')

gpt2_question_dict = _load_snapshot_dicts('e072')
gpt2_answer_dict = _load_snapshot_dicts('e073')

xlnet_question_dict = _load_snapshot_dicts('e074')
xlnet_answer_dict = _load_snapshot_dicts('e075')

In [8]:
# Top-level keys of the loaded snapshot dict.
bert_question_dict.keys()

dict_keys([0, 1, 2, 3, 4])

In [10]:
# Keys stored under entry 0 of the snapshot dict.
bert_question_dict[0].keys()

dict_keys(['y_trues', 'y_preds'])

In [11]:
# Top-level keys of another loaded snapshot dict, for comparison.
xlnet_answer_dict.keys()

dict_keys([0, 1, 2, 3, 4])

In [33]:
def _get_y_trues_and_y_preds_from_snapshot_dicts(snapshot_dicts, single):
    y_trues, y_preds = [], []
    for fold in range(5):
        if single:
            y_trues.append(snapshot_dicts[fold]['y_trues'][0])
            y_preds.append(snapshot_dicts[fold]['y_preds'][0])
        else:
            y_trues.append(np.average(snapshot_dicts[fold]['y_trues'], axis=0))
            y_preds.append(np.average(snapshot_dicts[fold]['y_preds'], axis=0))
    y_trues = np.concatenate(y_trues)
    y_preds = np.concatenate(y_preds)
    return y_trues, y_preds

def get_y_trues_and_y_preds_from_QA_snapshota_dicts(Q_snapshot_dicts, A_snapshot_dicts, single=False):
    q_y_trues, q_y_preds = _get_y_trues_and_y_preds_from_snapshot_dicts(Q_snapshot_dicts, single)
    a_y_trues, a_y_preds = _get_y_trues_and_y_preds_from_snapshot_dicts(A_snapshot_dicts, single)
    y_trues = np.concatenate([q_y_trues, a_y_trues], axis=1)
    y_preds = np.concatenate([q_y_preds, a_y_preds], axis=1)
    return y_trues, y_preds

In [34]:
# Per architecture: targets and predictions averaged over all snapshots of each fold.
bert_y_trues, bert_y_preds = get_y_trues_and_y_preds_from_QA_snapshota_dicts(
    Q_snapshot_dicts=bert_question_dict, A_snapshot_dicts=bert_answer_dict)
roberta_y_trues, roberta_y_preds = get_y_trues_and_y_preds_from_QA_snapshota_dicts(
    Q_snapshot_dicts=roberta_question_dict, A_snapshot_dicts=roberta_answer_dict)
gpt2_y_trues, gpt2_y_preds = get_y_trues_and_y_preds_from_QA_snapshota_dicts(
    Q_snapshot_dicts=gpt2_question_dict, A_snapshot_dicts=gpt2_answer_dict)
xlnet_y_trues, xlnet_y_preds = get_y_trues_and_y_preds_from_QA_snapshota_dicts(
    Q_snapshot_dicts=xlnet_question_dict, A_snapshot_dicts=xlnet_answer_dict)

In [35]:
# Per architecture: targets and predictions taken from element 0 of each fold's snapshot list only.
single_bert_y_trues, single_bert_y_preds = get_y_trues_and_y_preds_from_QA_snapshota_dicts(
    Q_snapshot_dicts=bert_question_dict, A_snapshot_dicts=bert_answer_dict, single=True)
single_roberta_y_trues, single_roberta_y_preds = get_y_trues_and_y_preds_from_QA_snapshota_dicts(
    Q_snapshot_dicts=roberta_question_dict, A_snapshot_dicts=roberta_answer_dict, single=True)
single_gpt2_y_trues, single_gpt2_y_preds = get_y_trues_and_y_preds_from_QA_snapshota_dicts(
    Q_snapshot_dicts=gpt2_question_dict, A_snapshot_dicts=gpt2_answer_dict, single=True)
single_xlnet_y_trues, single_xlnet_y_preds = get_y_trues_and_y_preds_from_QA_snapshota_dicts(
    Q_snapshot_dicts=xlnet_question_dict, A_snapshot_dicts=xlnet_answer_dict, single=True)

In [31]:
# Looks good: every architecture was evaluated against identical targets.
tuple(
    (bert_y_trues == other_y_trues).all()
    for other_y_trues in (roberta_y_trues, gpt2_y_trues, xlnet_y_trues)
)

(True, True, True)

In [32]:
# Checks whether any other architecture's predictions are element-wise identical to BERT's.
tuple(
    (bert_y_preds == other_y_preds).all()
    for other_y_preds in (roberta_y_preds, gpt2_y_preds, xlnet_y_preds)
)

(False, False, False)

## 混ぜてみる
 - 4 つ全て混ぜてみる
 - 3/4 の snapshot を混ぜてみる？

In [65]:
def blend_and_evaluate(y_trues, y_preds_list, eval_func, weights=None, num_labels=30):
    """
    Blend several prediction arrays and score the blend before and after
    the ``get_opt_y_pred`` post-processing.

    :param y_trues: ground-truth array passed to ``eval_func`` and ``get_opt_y_pred``
    :param y_preds_list: sequence of equally shaped prediction arrays to blend
    :param eval_func: callable ``(y_trues, y_preds) -> scores``; the mean of
        its result is printed
    :param weights: optional per-model weights (list, tuple or ndarray, one
        entry per element of ``y_preds_list``); None or empty means a plain mean
    :param num_labels: forwarded to ``get_opt_y_pred`` (default 30)
    :return: (eval_scores, opt_eval_scores, optRs) where ``optRs`` is the
        first value returned by ``get_opt_y_pred`` (defined in get_optR3,
        not visible here; presumably the fitted rounders, TODO confirm)
    """
    # `if weights:` raised "truth value of an array is ambiguous" for ndarray
    # weights; test for presence explicitly while keeping "empty means unweighted".
    use_weights = weights is not None and len(weights) > 0
    y_preds = np.average(y_preds_list, axis=0,
                         weights=weights if use_weights else None)
    eval_scores = eval_func(y_trues, y_preds)
    optRs, opt_y_preds = get_opt_y_pred(y_trues, y_preds, num_labels=num_labels)
    opt_eval_scores = eval_func(y_trues, opt_y_preds)
    print(f'original_score: {np.mean(eval_scores)}')
    print(f'opt_score: {np.mean(opt_eval_scores)}')
    return eval_scores, opt_eval_scores, optRs

#### 試しに single, top-2 snapshots を見てみる

In [54]:
# Score each architecture on its own, using its single-snapshot predictions.
for _model_preds in (
    single_bert_y_preds,
    single_roberta_y_preds,
    single_gpt2_y_preds,
    single_xlnet_y_preds,
):
    blend_and_evaluate(single_bert_y_trues, [_model_preds], compute_spearmanr)
print('fini!')

original_score: 0.39787353711376366
opt_score: 0.4261143948591801
original_score: 0.3917349489436032
opt_score: 0.4159272241610885
original_score: 0.38234367637291916
opt_score: 0.4048386132780595
original_score: 0.3906094115371919
opt_score: 0.4208016538440192
fini!


In [55]:
# Score each architecture on its own, using its snapshot-averaged predictions.
for _model_preds in (
    bert_y_preds,
    roberta_y_preds,
    gpt2_y_preds,
    xlnet_y_preds,
):
    blend_and_evaluate(single_bert_y_trues, [_model_preds], compute_spearmanr)
print('fini!')

original_score: 0.40377496605021973
opt_score: 0.44007917576260047
original_score: 0.3993632474521848
opt_score: 0.4290080945752392


  c /= stddev[:, None]
  c /= stddev[None, :]
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


original_score: 0.3854747911354585
opt_score: 0.4121866682795468
original_score: 0.3977770316464774
opt_score: 0.42765094537546605
fini!


#### all blends がどうなるかを見てみる

In [53]:
# Blend all four single-snapshot models. blend_and_evaluate returns
# (eval_scores, opt_eval_scores, optRs); only the two score arrays are kept
# here, so slice the result instead of unpacking three values into two names.
eval_scores, opt_eval_scores = blend_and_evaluate(
    single_bert_y_trues,
    [single_bert_y_preds, single_roberta_y_preds, single_gpt2_y_preds, single_xlnet_y_preds],
    compute_spearmanr,
)[:2]

original_score: 0.4180430600394728
opt_score: 0.45011560307585413


#### 組み合わせがどうなるかを見てみる

In [56]:
# Leave-one-out over the single-snapshot models: drop bert, roberta, gpt2, xlnet in turn.
_single_models = [single_bert_y_preds, single_roberta_y_preds, single_gpt2_y_preds, single_xlnet_y_preds]
for _dropped in range(len(_single_models)):
    _kept = [preds for idx, preds in enumerate(_single_models) if idx != _dropped]
    blend_and_evaluate(single_bert_y_trues, _kept, compute_spearmanr)
print('fini!')

original_score: 0.4123991396964391
opt_score: 0.44228018730704144
original_score: 0.41487735800374637
opt_score: 0.44704608097446064
original_score: 0.4165326987734412
opt_score: 0.4536773141844584
original_score: 0.4142418328259677
opt_score: 0.44462524506286166
fini!


In [58]:
# Leave-one-out over the snapshot-averaged models: drop bert, roberta, gpt2, xlnet in turn.
_averaged_models = [bert_y_preds, roberta_y_preds, gpt2_y_preds, xlnet_y_preds]
for _dropped in range(len(_averaged_models)):
    _kept = [preds for idx, preds in enumerate(_averaged_models) if idx != _dropped]
    blend_and_evaluate(bert_y_trues, _kept, compute_spearmanr)
print('fini!')

original_score: 0.41494955942931444
opt_score: 0.4396218163436752
original_score: 0.41725360795561056
opt_score: 0.45043969072616535
original_score: 0.41943000297059607
opt_score: 0.455408513948017
original_score: 0.4165551766237184
opt_score: 0.44958809940315647
fini!


#### 一番良さそうな gpt2 抜きの場合にどれを single にするとよいか見てみる

In [59]:
# Without gpt2: switch exactly one of the three models to its single-snapshot predictions.
for _blend in (
    [single_bert_y_preds, roberta_y_preds, xlnet_y_preds],
    [bert_y_preds, single_roberta_y_preds, xlnet_y_preds],
    [bert_y_preds, roberta_y_preds, single_xlnet_y_preds],
):
    blend_and_evaluate(bert_y_trues, _blend, compute_spearmanr)
print('fini!')

original_score: 0.4186640931608012
opt_score: 0.45243258027214034
original_score: 0.4186459442933615
opt_score: 0.4563528045631384
original_score: 0.41862052031267105
opt_score: 0.4571952857338109
fini!


## 2, 2, 1 じゃ disk 足りなかった...

In [64]:
# Without gpt2: keep exactly one of the three models snapshot-averaged, the other two single.
for _blend in (
    [single_bert_y_preds, single_roberta_y_preds, xlnet_y_preds],
    [bert_y_preds, single_roberta_y_preds, single_xlnet_y_preds],
    [single_bert_y_preds, roberta_y_preds, single_xlnet_y_preds],
):
    blend_and_evaluate(bert_y_trues, _blend, compute_spearmanr)
print('fini!')

original_score: 0.41766799758567713
opt_score: 0.4527705536288453
original_score: 0.41742998990308045
opt_score: 0.4525666155163446
original_score: 0.41790617961638554
opt_score: 0.45475244816683874
fini!


#### いったん 1, 2, 0, 1 で行くことにするので optR を取得

In [66]:
# Chosen configuration: single bert, snapshot-averaged roberta, no gpt2, single xlnet.
# Keep optRs so it can be saved for the submission.
eval_scores, opt_eval_scores, optRs = blend_and_evaluate(
    y_trues=bert_y_trues,
    y_preds_list=[single_bert_y_preds, roberta_y_preds, single_xlnet_y_preds],
    eval_func=compute_spearmanr,
)

original_score: 0.41790617961638554
opt_score: 0.45475244816683874


In [68]:
# Persist the optRs object of the chosen blend next to the submission dataset.
with open('../mnt/datasets/sub_bert1_roberta2_gpt0_xlnet1/optRs.pkl', mode='wb') as optRs_file:
    pickle.dump(optRs, optRs_file)

#### weights どうすればよいか見てみる (TODO: 下の cell はまだ `weights` を渡しておらず、上の「どれを single にするか」の比較と同じ内容)

In [59]:
# Same three blends as the earlier "one model single" comparison; no `weights` argument is passed.
for _blend in (
    [single_bert_y_preds, roberta_y_preds, xlnet_y_preds],
    [bert_y_preds, single_roberta_y_preds, xlnet_y_preds],
    [bert_y_preds, roberta_y_preds, single_xlnet_y_preds],
):
    blend_and_evaluate(bert_y_trues, _blend, compute_spearmanr)
print('fini!')

original_score: 0.4186640931608012
opt_score: 0.45243258027214034
original_score: 0.4186459442933615
opt_score: 0.4563528045631384
original_score: 0.41862052031267105
opt_score: 0.4571952857338109
fini!


## まとめ
 - (ちょっと気に入らんけど) bert * 2, roberta * 2, xlnet * 1 の average が良さそう (* 1, * 1, * 1, * 1 よりも)
 - 多分もう一つ別の良いモデルを入れられれば伸びる