In [1]:
import os
import sys
import itertools
import pickle
from glob import glob
from tqdm import tqdm_notebook as tqdm

import numpy as np
import pandas as pd
from scipy.stats import spearmanr

from matplotlib import pyplot as plt
from matplotlib_venn import venn2, venn3
import seaborn as sns

import torch
from torch import nn, optim
from transformers import BertConfig, BertTokenizer, BertModel, BertForMaskedLM#, BertLayer, BertEmbeddings
from transformers.modeling_bert import BertLayer, BertEmbeddings

In [2]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# re-load functions
%load_ext autoreload
%autoreload 2

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
%config InlineBackend.figure_formats = {'png', 'retina'}

In [3]:
DEVICE = 'cpu'

In [4]:
import sys
import pickle
from functools import partial
from glob import glob

import numpy as np
import pandas as pd
import scipy as sp
import torch
from scipy.stats import spearmanr
from tqdm import tqdm

class OptimizedRounder(object):
    """
    An optimizer for rounding thresholds
    to maximize Quadratic Weighted Kappa (QWK) score
    # https://www.kaggle.com/naveenasaithambi/optimizedrounder-improved
    """

    def __init__(self):
        self.coef_ = 0

    def _spearmanr_loss(self, coef, X, y, labels):
        """
        Get loss according to
        using current coefficients
        :param coef: A list of coefficients that will be used for rounding
        :param X: The raw predictions
        :param y: The ground truth labels
        """
        X_p = pd.cut(X, [-np.inf] + list(np.sort(coef)) +
                     [np.inf], labels=labels)

        # return -np.mean(spearmanr(y, X_p).correlation)
        return -spearmanr(y, X_p).correlation

    def fit(self, X, y, initial_coef):
        """
        Optimize rounding thresholds
        :param X: The raw predictions
        :param y: The ground truth labels
        """
        labels = self.labels
        loss_partial = partial(self._spearmanr_loss, X=X, y=y, labels=labels)
        self.coef_ = sp.optimize.minimize(
            # loss_partial, initial_coef, method='Powell')
            loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef):
        """
        Make predictions with specified thresholds
        :param X: The raw predictions
        :param coef: A list of coefficients that will be used for rounding
        """
        labels = self.labels
        return pd.cut(X, [-np.inf] + list(np.sort(coef)) +
                      [np.inf], labels=labels)
        # [np.inf], labels=[0, 1, 2, 3])

    def coefficients(self):
        """
        Return the optimized coefficients
        """
        return self.coef_['x']

    def set_labels(self, labels):
        self.labels = labels

In [5]:
# sys.path.append('../scripts/')
# from get_optR3 import compute_spearmanr, get_opt_y_pred

import os
import pickle
import sys
from functools import partial
from glob import glob

import numpy as np
import pandas as pd
import scipy as sp
import torch
from scipy.stats import spearmanr
from tqdm import tqdm


class histogramBasedCoefInitializer:
    def __init__(self):
        self.bins = None

    def fit(self, labels):
        self.bins = pd.Series(
            labels).value_counts().sort_index().cumsum().values
        return self

    def predict(self, preds):
        preds = sorted(preds)
        res_threshs = []
        if self.bins is None:
            raise Exception('plz fit at first.')
        for _bin in self.bins[:-1]:
            res_threshs.append((preds[_bin - 1] + preds[_bin]) / 2)
        return res_threshs


class OptimizedRounder(object):
    """
    An optimizer for rounding thresholds
    to maximize Quadratic Weighted Kappa (QWK) score
    # https://www.kaggle.com/naveenasaithambi/optimizedrounder-improved
    """

    def __init__(self):
        self.coef_ = 0

    def _spearmanr_loss(self, coef, X, y, labels):
        """
        Get loss according to
        using current coefficients
        :param coef: A list of coefficients that will be used for rounding
        :param X: The raw predictions
        :param y: The ground truth labels
        """
        X_p = pd.cut(X, [-np.inf] + list(np.sort(coef)) +
                     [np.inf], labels=labels)

        # return -np.mean(spearmanr(y, X_p).correlation)
        return -spearmanr(y, X_p).correlation

    def fit(self, X, y, initial_coef):
        """
        Optimize rounding thresholds
        :param X: The raw predictions
        :param y: The ground truth labels
        """
        labels = self.labels
        loss_partial = partial(self._spearmanr_loss, X=X, y=y, labels=labels)
        self.coef_ = sp.optimize.minimize(
            loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef):
        """
        Make predictions with specified thresholds
        :param X: The raw predictions
        :param coef: A list of coefficients that will be used for rounding
        """
        labels = self.labels
        return pd.cut(X, [-np.inf] + list(np.sort(coef)) +
                      [np.inf], labels=labels)
        # [np.inf], labels=[0, 1, 2, 3])

    def coefficients(self):
        """
        Return the optimized coefficients
        """
        return self.coef_['x']

    def set_labels(self, labels):
        self.labels = labels


def compute_spearmanr(trues, preds):
    rhos = []
    for col_trues, col_pred in zip(trues.T, preds.T):
        if len(np.unique(col_pred)) == 1:
            if col_pred[0] == np.max(col_trues):
                col_pred[np.argmin(
                    col_pred)] = np.min(col_trues)
            else:
                col_pred[np.argmax(
                    col_pred)] = np.max(col_trues)
        rhos.append(
            spearmanr(
                col_trues,
                col_pred
                #                  + np.random.normal(
                #                     0,
                #                     1e-7,
                #                     col_pred.shape[0])
            ).correlation)
    return rhos


def get_best_ckpt(ckpts):
    ckpt_dicts = []
    for ckpt in ckpts:
        ckpt_dict = {}
        ckpt_dict['ckpt'] = ckpt
        splitted_ckpt = ckpt.split('/')[-1].split('_')
        ckpt_dict['val_metric'] = float(splitted_ckpt[5])
        ckpt_dicts.append(ckpt_dict)
    ckpt_df = pd.DataFrame(ckpt_dicts)
    return ckpt_df.sort_values('val_metric', ascending=False).ckpt.iloc[0]


def get_snapshot_info_df(base_dir):
    res_dicts = []
    for fold in tqdm(list(range(5))):
        fold_ckpts = glob(f'{base_dir}/{fold}/*.pth')
        for ckpt in fold_ckpts:
            res_dict = {}
            splitted_ckpt = ckpt.split('/')[-1].split('_')
            res_dict['ckpt_filename'] = ckpt
            res_dict['fold'] = int(splitted_ckpt[1])
            res_dict['epoch'] = int(splitted_ckpt[3])
            res_dict['val_loss'] = float(splitted_ckpt[4])
            res_dict['val_metric'] = float(splitted_ckpt[5])
            res_dicts.append(res_dict)
    res_df = pd.DataFrame(res_dicts)
    res_df['rank'] = res_df.groupby(['fold']).val_metric.rank()
    return res_df


def get_opt_y_pred(y_true, y_pred, num_labels):
    optRs = []
    opt_y_preds = []

    # for i in tqdm(list(range(21))):
    for i in range(num_labels):
        optR = OptimizedRounder()
        labels = np.sort(np.unique(y_true[:, i]))
        optR.set_labels(labels)
        initer = histogramBasedCoefInitializer().fit(y_true[:, i])
        opt_thresh = initer.predict(y_pred[:, i])
        optR.fit(y_pred[:, i], y_true[:, i], opt_thresh)
        # opt_threshs.append(optR.coefficients())
        # opt_threshs[i] = optR.coefficients()
        optRs.append(optR)
        opt_y_preds.append((optR.predict(y_pred[:, i], optR.coefficients())))

    opt_y_preds = np.asarray(opt_y_preds).T
    return optRs, opt_y_preds


def opt(BASE_PATH, num_labels=30, snapshot_num=2):
    snapshot_df = get_snapshot_info_df(BASE_PATH)

    snapshot_dicts = {}
    state_dict_dicts = {}
    for fold in tqdm(list(range(5))):
        snapshot_dict = {}
        state_dict_dict = {}
        for i, row in snapshot_df.query(f'fold == {fold}').sort_values(
                'rank', ascending=False).reset_index(drop=True).iterrows():
            if i >= snapshot_num:
                continue
            ckpt = torch.load(row['ckpt_filename'])
            state_dict_dict[i] = ckpt['model_state_dict']
            if i == 0:
                qa_ids = ckpt['val_qa_ids']
                qa_ids_argsort = np.argsort(qa_ids)
                snapshot_dict['y_trues'] = [
                    ckpt['val_y_trues'][qa_ids_argsort]]
                snapshot_dict['y_preds'] = [
                    ckpt['val_y_preds'][qa_ids_argsort]]
            else:
                qa_ids = ckpt['val_qa_ids']
                qa_ids_argsort = np.argsort(qa_ids)
                snapshot_dict['y_trues'].append(
                    ckpt['val_y_trues'][qa_ids_argsort])
                snapshot_dict['y_preds'].append(
                    ckpt['val_y_preds'][qa_ids_argsort])
        snapshot_dicts[fold] = snapshot_dict
        state_dict_dicts[fold] = state_dict_dict

    y_preds = np.concatenate(
        [np.average(snapshot_dicts[fold]['y_preds'][:snapshot_num], axis=0)
         for fold in range(5)])
    y_trues = np.concatenate(
        [snapshot_dicts[fold]['y_trues'][0]
         for fold in range(5)])

    reses = []
    optRs = []

    for i in tqdm(list(range(num_labels))):
        y_pred = y_preds[:, i]
        y_true = y_trues[:, i]

        y_pred_argmax = np.argmax(y_pred)
        y_pred_argmin = np.argmin(y_pred)

        optR = OptimizedRounder()
        labels = np.sort(np.unique(y_true))
        optR.set_labels(labels)
        initer = histogramBasedCoefInitializer().fit(y_true)
        initial_coef = initer.predict(y_pred)
        optR.fit(y_pred, y_true, initial_coef=initial_coef)
        optRs.append(optR)
        res = optR.predict(y_pred, optR.coefficients())

        if len(np.unique(res)) == 1:
            if np.unique(res) == res[y_pred_argmax]:
                res[y_pred_argmin] = np.min(y_true)
            else:
                res[y_pred_argmax] = np.max(y_true)

        reses.append(res)
    reses = np.asarray(reses).T

    with open(f'{BASE_PATH}/optRs.pkl', 'wb') as fout:
        pickle.dump(optRs, fout)
    with open(f'{BASE_PATH}/snapshot_dicts.pkl', 'wb') as fout:
        pickle.dump(snapshot_dicts, fout)
    if not os.path.exists(f'{BASE_PATH}/state_dicts'):
        os.mkdir(f'{BASE_PATH}/state_dicts')
    for fold in range(5):
        for rank in range(snapshot_num):
            with open(f'{BASE_PATH}/state_dicts/fold_{fold}_rank_{rank}_state_dict.pkl', 'wb') as fout:
                pickle.dump(state_dict_dicts[fold][rank], fout)
    # with open(f'{BASE_PATH}/state_dict_dicts.pkl', 'wb') as fout:
    #     pickle.dump(state_dict_dicts, fout)

    original_score = compute_spearmanr(y_trues, y_preds)
    print(f'original_score: {original_score}')
    print(f'original_score: {np.mean(original_score)}')

    res_score = compute_spearmanr(y_trues, reses)
    print(f'res_score: {res_score}')
    print(f'res_score_mean: {np.mean(res_score)}')

    return res_score

In [6]:
class histogramBasedCoefInitializer:
    def __init__(self):
        self.bins = None

    def fit(self, labels):
        self.bins = pd.Series(labels).value_counts().sort_index().cumsum().values
        return self

    def predict(self, preds):
        preds = sorted(preds)
        res_threshs = []
        if self.bins is None:
            raise Exception('plz fit at first.')
        for _bin in self.bins[:-1]:
            res_threshs.append((preds[_bin - 1] + preds[_bin]) / 2)
        return res_threshs

In [16]:
def blend_and_evaluate(y_trues, y_preds_list, eval_func, weights=None):
    if weights:
        y_preds = np.average(y_preds_list, axis=0, weights=weights)
    else:
        y_preds = np.average(y_preds_list, axis=0)
    eval_scores = eval_func(y_trues, y_preds)
    optRs, opt_y_preds = get_opt_y_pred(y_trues, y_preds, num_labels=30)
    opt_eval_scores = eval_func(y_trues, opt_y_preds)
    print(f'original_score: {np.mean(eval_scores)}')
    print(f'opt_score: {np.mean(opt_eval_scores)}')
    return eval_scores, opt_eval_scores, optRs

## まずは top2 optRs を作る

## snapshot 済みの model を load

In [9]:
CKPT_DIR = '../mnt/checkpoints'

with open(f'{CKPT_DIR}/e078/snapshot_dicts.pkl', 'rb') as fin:
    bert_question_dict = pickle.load(fin)
with open(f'{CKPT_DIR}/e079/snapshot_dicts.pkl', 'rb') as fin:
    bert_answer_dict = pickle.load(fin)
    
with open(f'{CKPT_DIR}/e080/snapshot_dicts.pkl', 'rb') as fin:
    roberta_question_dict = pickle.load(fin)
with open(f'{CKPT_DIR}/e081/snapshot_dicts.pkl', 'rb') as fin:
    roberta_answer_dict = pickle.load(fin)
    
# with open(f'{CKPT_DIR}/e072/snapshot_dicts.pkl', 'rb') as fin:
#     gpt2_question_dict = pickle.load(fin)
# with open(f'{CKPT_DIR}/e073/snapshot_dicts.pkl', 'rb') as fin:
#     gpt2_answer_dict = pickle.load(fin)
    
with open(f'{CKPT_DIR}/e082/snapshot_dicts.pkl', 'rb') as fin:
    xlnet_question_dict = pickle.load(fin)
with open(f'{CKPT_DIR}/e083/snapshot_dicts.pkl', 'rb') as fin:
    xlnet_answer_dict = pickle.load(fin)

In [10]:
def _get_y_trues_and_y_preds_from_snapshot_dicts(snapshot_dicts, single, avg):
    y_trues, y_preds = [], []
    for fold in range(5):
        if single:
            y_trues.append(snapshot_dicts[fold]['y_trues'][0])
            y_preds.append(snapshot_dicts[fold]['y_preds'][0])
        else:
            if avg:
                y_trues.append(np.average(snapshot_dicts[fold]['y_trues'], axis=0))
                y_preds.append(np.average(snapshot_dicts[fold]['y_preds'], axis=0))
            else:
                y_trues.append(np.concatenate(snapshot_dicts[fold]['y_trues'], axis=1))
                y_preds.append(np.concatenate(snapshot_dicts[fold]['y_preds'], axis=1))
    y_trues = np.concatenate(y_trues)
    y_preds = np.concatenate(y_preds)
    return y_trues, y_preds

def get_y_trues_and_y_preds_from_QA_snapshota_dicts(Q_snapshot_dicts, A_snapshot_dicts, single=False, avg=True, model_num=2):
    q_y_trues, q_y_preds = _get_y_trues_and_y_preds_from_snapshot_dicts(Q_snapshot_dicts, single, avg)
    a_y_trues, a_y_preds = _get_y_trues_and_y_preds_from_snapshot_dicts(A_snapshot_dicts, single, avg)
    if avg:
        y_trues = np.concatenate([q_y_trues, a_y_trues], axis=1)
        y_preds = np.concatenate([q_y_preds, a_y_preds], axis=1)
    else:
        y_trues = np.concatenate([np.concatenate([q_y_trues[:, i*21:(i+1)*21], a_y_trues[:, i*9:(i+1)*9]], axis=1) for i in range(model_num)], axis=1)
        y_preds = np.concatenate([np.concatenate([q_y_preds[:, i*21:(i+1)*21], a_y_preds[:, i*9:(i+1)*9]], axis=1) for i in range(model_num)], axis=1)
        # y_preds = np.concatenate([q_y_preds[:, i*21:(i+1)*21] for i in range(model_num)] + [a_y_preds[:, i*9:(i+1)*9] for i in range(model_num)], axis=1)
    return y_trues, y_preds

In [11]:
# %debug
bert_y_trues, bert_y_preds = get_y_trues_and_y_preds_from_QA_snapshota_dicts(bert_question_dict, bert_answer_dict)
roberta_y_trues, roberta_y_preds = get_y_trues_and_y_preds_from_QA_snapshota_dicts(roberta_question_dict, roberta_answer_dict)
# gpt2_y_trues, gpt2_y_preds = get_y_trues_and_y_preds_from_QA_snapshota_dicts(gpt2_question_dict, gpt2_answer_dict)
xlnet_y_trues, xlnet_y_preds = get_y_trues_and_y_preds_from_QA_snapshota_dicts(xlnet_question_dict, xlnet_answer_dict)

In [12]:
# %debug
single_bert_y_trues, single_bert_y_preds = get_y_trues_and_y_preds_from_QA_snapshota_dicts(bert_question_dict, bert_answer_dict, single=True)
single_roberta_y_trues, single_roberta_y_preds = get_y_trues_and_y_preds_from_QA_snapshota_dicts(roberta_question_dict, roberta_answer_dict, single=True)
# single_gpt2_y_trues, single_gpt2_y_preds = get_y_trues_and_y_preds_from_QA_snapshota_dicts(gpt2_question_dict, gpt2_answer_dict, single=True)
single_xlnet_y_trues, single_xlnet_y_preds = get_y_trues_and_y_preds_from_QA_snapshota_dicts(xlnet_question_dict, xlnet_answer_dict, single=True)

In [13]:
# 良さそう
(bert_y_trues == roberta_y_trues).all(), (bert_y_trues == xlnet_y_trues).all()

(True, True)

In [17]:
blend_and_evaluate(single_bert_y_trues, [single_bert_y_preds,  ], compute_spearmanr)
blend_and_evaluate(single_bert_y_trues, [single_roberta_y_preds, ], compute_spearmanr)
# blend_and_evaluate(single_bert_y_trues, [single_gpt2_y_preds, ], compute_spearmanr)
blend_and_evaluate(single_bert_y_trues, [single_xlnet_y_preds], compute_spearmanr)
print('fini!')

original_score: 0.39658756843296844
opt_score: 0.4240074936181189


  c /= stddev[:, None]
  c /= stddev[None, :]
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


original_score: 0.3952086168898242
opt_score: 0.417814664868746
original_score: 0.39510187276130454
opt_score: 0.42243226674838147
fini!


In [18]:
blend_and_evaluate(single_bert_y_trues, [bert_y_preds,  ], compute_spearmanr)
blend_and_evaluate(single_bert_y_trues, [roberta_y_preds, ], compute_spearmanr)
# blend_and_evaluate(single_bert_y_trues, [gpt2_y_preds, ], compute_spearmanr)
blend_and_evaluate(single_bert_y_trues, [xlnet_y_preds], compute_spearmanr)
print('fini!')

original_score: 0.40437836760234475
opt_score: 0.4374389165251154
original_score: 0.40003269950864506
opt_score: 0.4242294402147104
original_score: 0.4032963844416211
opt_score: 0.427509974207624
fini!


In [19]:
eval_scores, opt_eval_scores, optRs = blend_and_evaluate(single_bert_y_trues, [bert_y_preds, roberta_y_preds, xlnet_y_preds], compute_spearmanr)

original_score: 0.42022642838382596
opt_score: 0.4540915450569295


In [22]:
with open('../mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/optRs.pkl', 'wb') as fout:
    pickle.dump(optRs, fout)

## 関数設計 (path を設定すると model を load して prediction を返してくる)

In [28]:
sys.path.append('../scripts/')
from refactor.datasets import QUESTDataset
from refactor.models import BertModelForBinaryMultiLabelClassifier, RobertaModelForBinaryMultiLabelClassifier, XLNetModelForBinaryMultiLabelClassifier
from refactor.utils import test
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SequentialSampler

In [None]:
def predict_from_ckpt(df, ckpt, loader, TOKENIZER_TYPE, DO_LOWER_CASE, T_MAX_LEN, Q_MAX_LEN, A_MAX_LEN, TQA_MODE):
    if TOKENIZER_TYPE == 'bert':
        state_dict = BertModel.from_pretrained('bert-base-uncased').state_dict()
        model = BertModelForBinaryMultiLabelClassifier(num_labels=, )
        ckpt = 
    dataset = QUESTDataset(
                                df=df,
                                mode='test',
                                tokens=[],
                                augment=[],
                                tokenizer_type=TOKENIZER_TYPE,
                                pretrained_model_name_or_path='../mnt/checkpoints/e078/datasets/',
                                do_lower_case=DO_LOWER_CASE,
                                LABEL_COL=LABEL_COL,
                                t_max_len=T_MAX_LEN,
                                q_max_len=Q_MAX_LEN,
                                a_max_len=A_MAX_LEN,
                                tqa_mode=TQA_MODE,
                                TBSEP='[TBSEP]',
                                pos_id_type='arange',
                                MAX_SEQUENCE_LENGTH=512,
                            )
    sampler = RandomSampler(data_source=dataset)
    loader = DataLoader(dataset,
                            batch_size=8,
                            sampler=sampler,
                            num_workers=os.cpu_count(),
                            worker_init_fn=lambda x: np.random.seed(),
                            drop_last=False,
                            pin_memory=True)    
    return prediction

#### まず Q 系を潰す

In [26]:
labels = [
    'question_asker_intent_understanding',
    'question_body_critical',
    'question_conversational',
    'question_expect_short_answer',
    'question_fact_seeking',
    'question_has_commonly_accepted_answer',
    'question_interestingness_others',
    'question_interestingness_self',
    'question_multi_intent',
    'question_not_really_a_question',
    'question_opinion_seeking',
    'question_type_choice',
    'question_type_compare',
    'question_type_consequence',
    'question_type_definition',
    'question_type_entity',
    'question_type_instructions',
    'question_type_procedure',
    'question_type_reason_explanation',
    'question_type_spelling',
    'question_well_written',
#     'answer_helpful',
#     'answer_level_of_information',
#     'answer_plausible',
#     'answer_relevance',
#     'answer_satisfaction',
#     'answer_type_instructions',
#     'answer_type_procedure',
#     'answer_type_reason_explanation',
#     'answer_well_written'
]

## feature engineering

In [293]:
trn_df = pd.read_csv('../mnt/inputs/origin/train.csv')

In [294]:
from sklearn.model_selection import GroupKFold as GKF

gkf = GKF(5)
fold = list(gkf.split(trn_df.qa_id, groups=trn_df.question_body))

In [295]:
val_idxes = [i[1] for i in fold]
val_idx = np.concatenate(val_idxes)

In [296]:
trn_df['fold'] = None
for i, _val_idx in enumerate(val_idxes):
    trn_df.loc[_val_idx, 'fold'] = i

In [297]:
trn_df['fold'].fillna(-1).value_counts()

3    1216
2    1216
1    1216
0    1216
4    1215
Name: fold, dtype: int64

In [298]:
(trn_df.iloc[val_idx].iloc[:, 11:-1].values == bert_y_trues[:, :30]).all()

True

In [299]:
trn_df = trn_df.iloc[val_idx].reset_index(drop=True)

In [301]:
# 置換完了
(trn_df.iloc[:, 11:-1].values == bert_y_trues[:, :30]).all()

True

In [442]:
from urllib.parse import urlparse
import re
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import MultiTaskElasticNet, Lasso, LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

import tqdm
tqdm.tqdm.pandas()
from tqdm import tqdm_notebook as tqdm

from transformers import BertTokenizer, RobertaTokenizer, XLNetTokenizer

def mk_features(df, bert_preds, roberta_preds, xlnet_preds,
                              bert_tokenizer_path, ohe=None, scaler=None, labels=[]):
    base_cols = ['qa_id', 'category', 'fold'] + labels
    features_df = df[base_cols]

    # categorize host
    HOSTs = ['stackexchange', 'askubuntu', 'mathoverflow', 'serverfault', 'stackoverflow', 'superuser']
    for HOST in HOSTs:
        features_df.loc[df.host.str.contains(HOST).values, 'host'] = f'HOST_{HOST}'.casefold()

    # url features
    MAP_DICT = {'cs': 'programmers', 'softwarerecs': 'programmers', 'robotics': 'electronics'}
    find = re.compile(r"^[^.]*")
    features_df['netloc'] = df['url'].apply(lambda x: re.findall(find, urlparse(x).netloc)[0]).apply(lambda x: MAP_DICT[x] if x in MAP_DICT else x)
    
    # len types features
    features_df['title_len'] = df.question_title.apply(lambda x: len(x.casefold().split()))
    features_df['body_len'] = df.question_body.apply(lambda x: len(x.casefold().split()))
    features_df['answer_len'] = df.answer.apply(lambda x: len(x.casefold().split()))
    
    bert_tokenizer = BertTokenizer.from_pretrained(bert_tokenizer_path)
    features_df['bert_title_len'] = df.question_title.progress_apply(lambda x: len(bert_tokenizer.tokenize(x)))
    features_df['bert_body_len'] = df.question_body.progress_apply(lambda x: len(bert_tokenizer.tokenize(x)))
    features_df['bert_answer_len'] = df.answer.progress_apply(lambda x: len(bert_tokenizer.tokenize(x)))
    
    # res_ohes = []
    # for i, feature in enumerate(['category', 'host', 'netloc']):
    #     if not ohes:
    #         ohe = OneHotEncoder()
    #         ohe.fit(features_df[feature])
    #         res_ohes.append(ohe)
    #     else:
    #         ohe = ohes[i]
    #         res_ohes.append(ohe)
    #     features_df[feature] = ohe.transform(features_df[feature])
    cat_cols = ['category', 'host', 'netloc']
    if not ohe:
        ohe = OneHotEncoder()
        ohe.fit(features_df[cat_cols])
    features_df = pd.concat([features_df, pd.DataFrame(ohe.transform(features_df[cat_cols]).toarray())], axis=1)
     
    # res_scalers = ['title_len', 'body_len', 'answer_len', 'bert_title_len', 'bert_body_len', 'bert_answer_len']
    # for i, feature in enumerate([]):
    #     if not scalers:
    #         scaler = StandardScaler()
    #         scaler.fit(features_df[feature])
    #         res_scalers.append(scaler)
    #     else:
    #         scaler = scalers[i]
    #         res_scalers.append(scaler)
    #     features_df[feature] = scaler.transform(features_df[feature])

    num_cols = ['title_len', 'body_len', 'answer_len', 'bert_title_len', 'bert_body_len', 'bert_answer_len']
    if not scaler:
        scaler = StandardScaler()
        scaler.fit(features_df[num_cols])
    features_df.loc[:, num_cols] = scaler.transform(features_df[num_cols])
    
    bert_df = pd.DataFrame(bert_preds).add_prefix('bert_')
    roberta_df = pd.DataFrame(roberta_preds).add_prefix('roberta_')
    xlnet_df = pd.DataFrame(xlnet_preds).add_prefix('xlnet_')
    
    features_df = pd.concat([features_df, bert_df, roberta_df, xlnet_df], axis=1)
    
    return features_df, ohe, scaler

In [306]:
labels = [
    'question_asker_intent_understanding',
    'question_body_critical',
    'question_conversational',
    'question_expect_short_answer',
    'question_fact_seeking',
    'question_has_commonly_accepted_answer',
    'question_interestingness_others',
    'question_interestingness_self',
    'question_multi_intent',
    'question_not_really_a_question',
    'question_opinion_seeking',
    'question_type_choice',
    'question_type_compare',
    'question_type_consequence',
    'question_type_definition',
    'question_type_entity',
    'question_type_instructions',
    'question_type_procedure',
    'question_type_reason_explanation',
    'question_type_spelling',
    'question_well_written',
    'answer_helpful',
    'answer_level_of_information',
    'answer_plausible',
    'answer_relevance',
    'answer_satisfaction',
    'answer_type_instructions',
    'answer_type_procedure',
    'answer_type_reason_explanation',
    'answer_well_written'
]

In [310]:
# %debug
features_df, ohes, scalers = mk_features(trn_df, bert_y_preds, roberta_y_preds, xlnet_y_preds, '../mnt/checkpoints/e078/datasets/', ohe=None, scaler=None, labels=labels)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the doc

 65%|██████▍   | 3934/6079 [00:25<00:14, 147.70it/s][A
 65%|██████▌   | 3954/6079 [00:25<00:13, 159.59it/s][A
 65%|██████▌   | 3971/6079 [00:25<00:14, 140.75it/s][A
 66%|██████▌   | 3992/6079 [00:25<00:13, 154.27it/s][A
 66%|██████▌   | 4016/6079 [00:26<00:12, 169.83it/s][A
 66%|██████▋   | 4035/6079 [00:26<00:12, 161.05it/s][A
 67%|██████▋   | 4053/6079 [00:26<00:14, 144.34it/s][A
 67%|██████▋   | 4071/6079 [00:26<00:13, 152.71it/s][A
 67%|██████▋   | 4088/6079 [00:26<00:13, 146.74it/s][A
 68%|██████▊   | 4109/6079 [00:26<00:12, 157.13it/s][A
 68%|██████▊   | 4126/6079 [00:26<00:12, 159.74it/s][A
 68%|██████▊   | 4144/6079 [00:26<00:11, 165.17it/s][A
 69%|██████▊   | 4168/6079 [00:27<00:10, 181.51it/s][A
 69%|██████▉   | 4187/6079 [00:27<00:11, 166.29it/s][A
 69%|██████▉   | 4205/6079 [00:27<00:11, 168.86it/s][A
 69%|██████▉   | 4223/6079 [00:27<00:11, 165.94it/s][A
 70%|██████▉   | 4240/6079 [00:27<00:11, 159.17it/s][A
 70%|███████   | 4257/6079 [00:27<00:12, 150.67i

 51%|█████     | 3106/6079 [00:20<00:19, 153.46it/s][A
 51%|█████▏    | 3126/6079 [00:20<00:18, 163.57it/s][A
 52%|█████▏    | 3152/6079 [00:20<00:15, 182.99it/s][A
 52%|█████▏    | 3172/6079 [00:20<00:18, 160.35it/s][A
 52%|█████▏    | 3190/6079 [00:20<00:18, 156.23it/s][A
 53%|█████▎    | 3207/6079 [00:20<00:21, 135.97it/s][A
 53%|█████▎    | 3223/6079 [00:20<00:20, 141.46it/s][A
 53%|█████▎    | 3239/6079 [00:20<00:19, 145.92it/s][A
 54%|█████▎    | 3258/6079 [00:21<00:18, 155.07it/s][A
 54%|█████▍    | 3280/6079 [00:21<00:16, 168.53it/s][A
 54%|█████▍    | 3298/6079 [00:21<00:18, 148.19it/s][A
 55%|█████▍    | 3314/6079 [00:21<00:19, 139.27it/s][A
 55%|█████▍    | 3332/6079 [00:21<00:18, 146.43it/s][A
 55%|█████▌    | 3348/6079 [00:21<00:21, 125.93it/s][A
 55%|█████▌    | 3369/6079 [00:21<00:19, 139.20it/s][A
 56%|█████▌    | 3393/6079 [00:21<00:17, 156.82it/s][A
 56%|█████▌    | 3412/6079 [00:22<00:16, 162.65it/s][A
 56%|█████▋    | 3430/6079 [00:22<00:16, 164.62i

In [311]:
features_df

Unnamed: 0,qa_id,category,fold,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,question_opinion_seeking,question_type_choice,question_type_compare,question_type_consequence,question_type_definition,question_type_entity,question_type_instructions,question_type_procedure,question_type_reason_explanation,question_type_spelling,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written,host,netloc,title_len,body_len,answer_len,bert_title_len,bert_body_len,bert_answer_len,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,bert_0,bert_1,bert_2,bert_3,bert_4,bert_5,bert_6,bert_7,bert_8,bert_9,bert_10,bert_11,bert_12,bert_13,bert_14,bert_15,bert_16,bert_17,bert_18,bert_19,bert_20,bert_21,bert_22,bert_23,bert_24,bert_25,bert_26,bert_27,bert_28,bert_29,bert_30,bert_31,bert_32,bert_33,bert_34,bert_35,bert_36,bert_37,bert_38,bert_39,bert_40,bert_41,bert_42,bert_43,bert_44,bert_45,bert_46,bert_47,bert_48,bert_49,bert_50,bert_51,bert_52,bert_53,bert_54,bert_55,bert_56,bert_57,bert_58,bert_59,roberta_0,roberta_1,roberta_2,roberta_3,roberta_4,roberta_5,roberta_6,roberta_7,roberta_8,roberta_9,roberta_10,roberta_11,roberta_12,roberta_13,roberta_14,roberta_15,roberta_16,roberta_17,roberta_18,roberta_19,roberta_20,roberta_21,roberta_22,roberta_23,roberta_24,roberta_25,roberta_26,roberta_27,roberta_28,roberta_29,roberta_30,roberta_31,roberta_32,roberta_33,roberta_34,roberta_35,roberta_36,roberta_37,roberta_38,roberta_39,roberta_40,roberta_41,roberta_42,roberta_43,roberta_44,roberta_45,roberta_46,roberta_47,roberta_48,roberta_49,roberta_50,roberta_51,roberta_52,roberta_53,roberta_54,roberta_55,roberta_56,roberta_57,roberta_58,roberta_59,xlnet_0,xlnet_1,xlnet_2,xlnet_3,xlnet_4,xlnet_5,xlnet_6,xlnet_7,xlnet_8,xlnet_9,xlnet_10,xlnet_11,xlnet_12,xlnet_13,xlnet_14,xlnet_15,xlnet_16,xlnet_17,xlnet_18,xlnet_19,xlnet_20,xlnet_21,xlnet_22,xlnet_23,xlnet_24,xlnet_25,xlnet_26,xlnet_27,xlnet_28,xlnet_29,xlnet_30,xlnet_31,xlnet_32,xlnet_33,xlnet_34,xlnet_35,xlnet_36,xlnet_37,xlnet_38,xlnet_39,xlnet_40,xlnet_41,xlnet_42,xlnet_43,xlnet_44,xlnet_45,xlnet_46,xlnet_47,xlnet_48,xlnet_49,xlnet_50,xlnet_51,xlnet_52,xlnet_53,xlnet_54,xlnet_55,xlnet_56,xlnet_57,xlnet_58,xlnet_59
0,0,LIFE_ARTS,0,1.000000,0.333333,0.000000,0.000000,0.000000,0.0,1.000000,1.000000,0.000000,0.0,1.000000,0.000000,0.0,0.0,0.0,0.000000,1.000000,0.000000,0.000000,0.0,1.000000,1.000000,0.666667,1.000000,1.000000,0.800000,1.000000,0.000000,0.000000,1.000000,host_stackexchange,photo,1.050273,0.135656,0.109141,0.317333,-0.120117,-0.067228,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.937834,0.633688,0.054328,0.559676,0.682502,0.403668,0.635739,0.591393,0.394091,0.004519,0.719388,0.044919,0.129137,0.026241,0.005142,0.123739,0.393864,0.079044,0.507497,0.002152,0.921155,0.978332,0.690518,0.990210,0.988864,0.925071,0.084333,0.060422,0.917059,0.938186,0.948040,0.693285,0.052300,0.539415,0.674708,0.225477,0.681708,0.597191,0.194366,0.005146,0.792736,0.041187,0.077567,0.026021,0.004593,0.065753,0.361647,0.067497,0.510446,0.001913,0.948859,0.970516,0.714862,0.986803,0.985297,0.923511,0.126159,0.059128,0.868586,0.936540,0.939356,0.690926,0.140137,0.456486,0.695388,0.427599,0.675041,0.654015,0.326620,0.005562,0.802528,0.071001,0.165792,0.025142,0.006687,0.034890,0.074643,0.022397,0.893541,0.002120,0.907167,0.936800,0.635479,0.976267,0.974464,0.847127,0.081082,0.087312,0.935919,0.917832,0.943043,0.677397,0.170254,0.436276,0.697599,0.334588,0.728414,0.615993,0.308374,0.004337,0.807897,0.028765,0.137566,0.018854,0.004842,0.045164,0.077641,0.030107,0.837298,0.001631,0.946298,0.931054,0.639547,0.972470,0.978804,0.889602,0.060267,0.043313,0.904865,0.929291,0.973829,0.905940,0.082676,0.531691,0.627554,0.172224,0.787267,0.630674,0.034848,0.003968,0.720875,0.017180,0.176688,0.018778,0.001952,0.013320,0.062216,0.035129,0.679391,0.001523,0.966897,0.952973,0.673146,0.978761,0.979357,0.892544,0.036530,0.026685,0.749635,0.924106,0.955310,0.769556,0.098012,0.399802,0.675737,0.294906,0.731433,0.631878,0.105315,0.004851,0.725987,0.036107,0.143083,0.012034,0.006219,0.025864,0.170231,0.064661,0.651337,0.001928,0.945690,0.949768,0.679223,0.970614,0.968133,0.882259,0.083363,0.071621,0.681534,0.909260
1,2,SCIENCE,0,0.888889,0.666667,0.000000,1.000000,1.000000,1.0,0.666667,0.444444,0.333333,0.0,0.333333,0.000000,0.0,0.0,0.0,0.000000,1.000000,0.333333,0.333333,0.0,0.777778,0.777778,0.555556,1.000000,1.000000,0.666667,0.000000,0.333333,1.000000,0.888889,host_stackexchange,electronics,-0.538993,-0.062617,0.321281,-0.229507,-0.215080,0.040466,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.862143,0.427385,0.005301,0.763975,0.906572,0.878436,0.602152,0.495679,0.406346,0.004009,0.284857,0.174727,0.019744,0.008480,0.003340,0.357163,0.587298,0.156908,0.266454,0.001586,0.831538,0.978350,0.714989,0.989560,0.987037,0.934199,0.617221,0.205096,0.587093,0.956661,0.876384,0.407436,0.004039,0.797632,0.902393,0.854854,0.610949,0.478781,0.370515,0.003568,0.361926,0.170642,0.012575,0.006540,0.003083,0.331551,0.571259,0.155808,0.221593,0.001189,0.827448,0.971369,0.688888,0.984001,0.983884,0.921705,0.385440,0.130536,0.608682,0.950625,0.877547,0.437875,0.008237,0.730027,0.826313,0.828643,0.580050,0.486645,0.314219,0.003656,0.481191,0.397303,0.012055,0.005897,0.003483,0.154600,0.648417,0.171914,0.139646,0.001242,0.753731,0.978550,0.705715,0.987457,0.989491,0.912375,0.539887,0.313191,0.794380,0.960950,0.896145,0.477015,0.009453,0.799165,0.862492,0.858080,0.599161,0.509604,0.294127,0.002733,0.442037,0.408057,0.012404,0.006706,0.003225,0.267228,0.403380,0.166591,0.115688,0.001482,0.791144,0.943950,0.667137,0.968254,0.978777,0.900755,0.232618,0.122150,0.778061,0.943902,0.891846,0.454563,0.004302,0.797033,0.904814,0.949496,0.532095,0.455181,0.327094,0.002800,0.352610,0.567057,0.008251,0.007538,0.002314,0.078818,0.603917,0.123678,0.306348,0.001484,0.830631,0.973851,0.717925,0.985332,0.985919,0.924478,0.192462,0.130445,0.865155,0.960773,0.885473,0.469644,0.010380,0.656595,0.854381,0.883974,0.569868,0.513995,0.311041,0.005261,0.376742,0.387319,0.017164,0.009074,0.006241,0.179836,0.698789,0.194993,0.164836,0.002510,0.820787,0.969116,0.694305,0.979944,0.980628,0.911874,0.270408,0.185028,0.665707,0.943955
2,5,LIFE_ARTS,0,1.000000,0.666667,0.000000,1.000000,1.000000,1.0,0.666667,0.666667,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,1.000000,0.000000,1.000000,0.0,1.000000,1.000000,0.666667,1.000000,1.000000,0.800000,1.000000,0.000000,1.000000,1.000000,host_stackexchange,graphicdesign,-0.009237,-0.381578,-0.681561,-0.229507,-0.378299,-0.511900,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.940273,0.809567,0.003115,0.742909,0.934072,0.958034,0.601563,0.519815,0.329866,0.001682,0.156734,0.404634,0.002104,0.003643,0.001797,0.009099,0.898314,0.204224,0.080918,0.001160,0.924907,0.938400,0.625094,0.973249,0.981240,0.894752,0.952295,0.071493,0.030456,0.915926,0.945112,0.842681,0.003052,0.723989,0.939539,0.953104,0.620866,0.515534,0.418432,0.001308,0.169254,0.439473,0.001934,0.003295,0.002002,0.006600,0.922226,0.223547,0.082051,0.001258,0.924779,0.936383,0.602553,0.957955,0.975175,0.875395,0.923964,0.091445,0.051407,0.901083,0.949596,0.745342,0.002595,0.723586,0.861962,0.947284,0.594686,0.523254,0.156604,0.001269,0.275957,0.574182,0.002246,0.000982,0.000871,0.002799,0.909478,0.170792,0.068408,0.000569,0.883512,0.933770,0.636178,0.966974,0.975628,0.859103,0.948309,0.119945,0.013637,0.894299,0.948401,0.810106,0.003370,0.734654,0.881596,0.942731,0.635351,0.542594,0.173564,0.000756,0.304521,0.533880,0.001673,0.000758,0.000816,0.002403,0.832006,0.195176,0.066425,0.000487,0.925106,0.889476,0.585075,0.938949,0.953297,0.813369,0.820268,0.106973,0.034179,0.889998,0.929744,0.795795,0.004567,0.755164,0.874132,0.868303,0.594423,0.536195,0.309727,0.001649,0.389914,0.476216,0.002519,0.001183,0.001317,0.004455,0.891083,0.228527,0.114345,0.000983,0.906356,0.941633,0.654082,0.967575,0.981958,0.849379,0.883231,0.083815,0.022141,0.932206,0.935037,0.806288,0.004375,0.724833,0.846311,0.927419,0.578390,0.594729,0.175808,0.002127,0.366523,0.307861,0.002644,0.000987,0.001513,0.004042,0.918660,0.252719,0.044948,0.000847,0.910946,0.931579,0.640613,0.957352,0.969075,0.846439,0.819772,0.149272,0.097381,0.902589
3,7,TECHNOLOGY,0,0.888889,0.333333,0.000000,0.000000,1.000000,1.0,0.444444,0.333333,1.000000,0.0,0.666667,0.666667,0.0,0.0,0.0,0.000000,1.000000,0.000000,0.333333,0.0,0.666667,1.000000,0.777778,1.000000,0.888889,0.866667,1.000000,0.000000,1.000000,0.888889,host_askubuntu,askubuntu,-1.068748,1.420120,-0.180140,-0.958627,0.782040,-0.247876,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.853799,0.418506,0.001864,0.752759,0.893811,0.962455,0.532325,0.400307,0.508213,0.002638,0.269419,0.525941,0.002798,0.003367,0.001798,0.004662,0.870421,0.226206,0.391221,0.000839,0.713590,0.964849,0.659576,0.983061,0.985660,0.891935,0.536711,0.184633,0.777613,0.913628,0.879365,0.448697,0.001749,0.835119,0.834409,0.969971,0.536086,0.421664,0.470135,0.002651,0.403169,0.567991,0.002399,0.002710,0.001726,0.003595,0.893184,0.178926,0.324915,0.000768,0.659846,0.960969,0.678831,0.979070,0.984961,0.892323,0.493353,0.128955,0.833424,0.926204,0.816392,0.406392,0.003423,0.605323,0.837362,0.921224,0.500163,0.393723,0.817738,0.002962,0.362719,0.509518,0.004134,0.003307,0.004257,0.008426,0.842403,0.249213,0.472432,0.001107,0.683468,0.961939,0.655792,0.980082,0.989784,0.893065,0.788369,0.251071,0.623797,0.913681,0.827540,0.371253,0.003102,0.642540,0.823013,0.934991,0.529688,0.396349,0.806316,0.002757,0.474001,0.434663,0.003607,0.002997,0.003031,0.006445,0.808501,0.254233,0.500927,0.001005,0.689030,0.921966,0.623066,0.955658,0.976179,0.851583,0.681994,0.190042,0.579048,0.902361,0.847718,0.500827,0.002943,0.739842,0.850472,0.933476,0.532534,0.430573,0.636995,0.003336,0.435143,0.474098,0.004293,0.003891,0.002101,0.005904,0.886643,0.271721,0.316003,0.001964,0.781127,0.957247,0.699371,0.981809,0.987001,0.914018,0.499482,0.124132,0.824901,0.884441,0.845501,0.485216,0.005474,0.662840,0.816937,0.921991,0.529743,0.460330,0.631977,0.004764,0.396782,0.428459,0.007880,0.005384,0.005163,0.013922,0.875551,0.313106,0.266314,0.002812,0.770087,0.961276,0.666798,0.977307,0.984410,0.905740,0.621227,0.181948,0.651063,0.907770
4,22,TECHNOLOGY,0,1.000000,0.777778,0.000000,1.000000,1.000000,1.0,0.555556,0.333333,0.000000,0.0,0.000000,0.333333,0.0,0.0,0.0,0.333333,0.666667,0.000000,0.000000,0.0,0.777778,0.555556,0.777778,0.777778,1.000000,0.933333,1.000000,0.000000,0.000000,1.000000,host_stackexchange,tex,-0.803870,-0.407440,-0.231568,-0.047227,-0.280368,-0.258298,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.860635,0.509862,0.005171,0.708300,0.877918,0.891856,0.498767,0.424163,0.107929,0.002586,0.317413,0.280515,0.005285,0.002985,0.001886,0.102649,0.830920,0.262429,0.020658,0.001494,0.763261,0.958601,0.680482,0.980309,0.985464,0.902712,0.845117,0.214321,0.236299,0.937493,0.870161,0.519534,0.003885,0.715089,0.858416,0.898430,0.494073,0.407960,0.178501,0.001987,0.384186,0.530557,0.004464,0.002707,0.001717,0.075149,0.851929,0.290458,0.017960,0.001301,0.737851,0.960159,0.676242,0.978570,0.985384,0.908109,0.829640,0.162296,0.363657,0.939053,0.871993,0.546429,0.003589,0.692251,0.873311,0.873898,0.500748,0.451323,0.184250,0.002667,0.321352,0.395656,0.008200,0.001695,0.002152,0.106612,0.851600,0.258965,0.042438,0.000795,0.727710,0.965490,0.665667,0.981747,0.988242,0.883523,0.876985,0.288451,0.169927,0.929777,0.868578,0.536922,0.003924,0.750173,0.901626,0.881564,0.511320,0.455085,0.177141,0.002621,0.291164,0.361100,0.008343,0.001595,0.002240,0.144747,0.765796,0.306835,0.037416,0.000941,0.718317,0.920787,0.612864,0.955546,0.969192,0.838491,0.701821,0.228713,0.242511,0.909499,0.818954,0.434231,0.010632,0.782810,0.879400,0.790347,0.522881,0.436751,0.310704,0.003490,0.350880,0.549025,0.021179,0.002189,0.004197,0.242415,0.758011,0.307041,0.078162,0.002282,0.762996,0.959454,0.677590,0.976629,0.986558,0.893032,0.804009,0.192058,0.128704,0.944631,0.841656,0.556763,0.009558,0.752193,0.859023,0.869228,0.547533,0.467144,0.191397,0.004927,0.278111,0.436358,0.028016,0.002705,0.007647,0.347120,0.761757,0.222398,0.044187,0.002657,0.748336,0.960435,0.679584,0.973362,0.981628,0.893759,0.723598,0.237019,0.314737,0.920138
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6074,9608,TECHNOLOGY,4,0.777778,0.777778,0.000000,0.000000,0.333333,0.0,0.333333,0.333333,0.000000,0.0,0.666667,0.000000,0.0,0.0,0.0,0.000000,0.333333,0.666667,0.000000,0.0,0.777778,0.666667,0.666667,0.666667,0.777778,0.600000,0.333333,0.666667,0.000000,0.777778,host_stackexchange,webapps,0.255640,-0.735021,-0.790845,-0.229507,-0.523713,-0.702970,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.945020,0.890636,0.007168,0.879313,0.828783,0.941287,0.596020,0.433300,0.033273,0.005688,0.287404,0.023560,0.003612,0.002565,0.003162,0.007115,0.950091,0.111666,0.033186,0.003266,0.898956,0.902152,0.596175,0.945968,0.953084,0.829602,0.867170,0.120599,0.062336,0.853323,0.961746,0.894130,0.005808,0.882440,0.859244,0.947367,0.596898,0.466307,0.020476,0.004206,0.228542,0.008926,0.002025,0.001842,0.002621,0.003809,0.907377,0.129564,0.048756,0.002190,0.934412,0.801718,0.490590,0.912695,0.924825,0.781649,0.626061,0.027069,0.010575,0.832931,0.960679,0.848956,0.001726,0.836610,0.826023,0.960006,0.601383,0.496998,0.019130,0.002159,0.368559,0.006081,0.001247,0.000541,0.000760,0.002223,0.962812,0.125696,0.024473,0.000401,0.878837,0.881803,0.553945,0.941968,0.956882,0.759049,0.899407,0.067809,0.015718,0.889274,0.969562,0.855042,0.001625,0.836879,0.866055,0.958470,0.586410,0.512822,0.020393,0.001448,0.300338,0.005012,0.000841,0.000486,0.000876,0.002045,0.940238,0.143405,0.018471,0.000262,0.908841,0.867241,0.576438,0.937920,0.913687,0.767405,0.646847,0.186227,0.119737,0.882905,0.983471,0.929312,0.002497,0.829348,0.892678,0.936112,0.688765,0.595861,0.012791,0.001951,0.295112,0.002621,0.000878,0.000523,0.001305,0.003029,0.958513,0.184414,0.028823,0.000327,0.952064,0.937327,0.614721,0.964238,0.980368,0.896706,0.916240,0.079634,0.025455,0.907762,0.981051,0.930698,0.003213,0.906900,0.787809,0.952751,0.650892,0.534128,0.013359,0.002164,0.412436,0.003033,0.000859,0.000542,0.000995,0.002706,0.947300,0.159520,0.018847,0.000254,0.948970,0.880992,0.543861,0.928073,0.931232,0.762942,0.772663,0.156815,0.114902,0.850272
6075,9615,TECHNOLOGY,4,0.888889,0.555556,0.333333,1.000000,1.000000,0.0,0.666667,0.555556,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,1.000000,0.000000,0.000000,0.000000,0.0,0.888889,0.833333,0.666667,0.833333,1.000000,0.900000,0.000000,0.000000,0.000000,0.888889,host_stackexchange,gamedev,-1.068748,-0.640195,-0.617276,-1.140907,-0.491069,-0.595276,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.938369,0.744309,0.198311,0.925503,0.417933,0.388678,0.652661,0.498120,0.034421,0.009387,0.757291,0.227974,0.022438,0.009711,0.004984,0.619206,0.138890,0.052023,0.064394,0.003515,0.830620,0.886825,0.649844,0.956831,0.946714,0.857600,0.487293,0.067495,0.097196,0.884678,0.921786,0.630113,0.132557,0.945482,0.454810,0.438256,0.561050,0.419876,0.032255,0.008172,0.750938,0.174508,0.025347,0.005623,0.003760,0.580283,0.182875,0.064270,0.049468,0.002456,0.764197,0.927998,0.670508,0.979163,0.974889,0.881593,0.448498,0.039891,0.049127,0.882520,0.882383,0.498312,0.054546,0.901974,0.764099,0.755773,0.538805,0.522316,0.074913,0.011000,0.383075,0.075086,0.025589,0.004214,0.006901,0.816039,0.406000,0.106176,0.052062,0.003638,0.693786,0.796977,0.532875,0.889630,0.894424,0.689759,0.585953,0.071361,0.027584,0.845370,0.902207,0.488740,0.069467,0.900057,0.741072,0.693909,0.553558,0.540691,0.085852,0.008033,0.428672,0.086941,0.021811,0.003880,0.004870,0.783782,0.287722,0.108318,0.059840,0.003144,0.701541,0.885151,0.586975,0.942997,0.922319,0.780511,0.318240,0.070611,0.135648,0.880940,0.903849,0.714196,0.099916,0.918454,0.647855,0.603163,0.592879,0.545030,0.077179,0.008571,0.543347,0.159827,0.022085,0.003626,0.004959,0.739792,0.376900,0.121703,0.059179,0.002099,0.808251,0.916946,0.596607,0.957839,0.963604,0.878757,0.888262,0.118258,0.048626,0.854181,0.868449,0.640771,0.097377,0.948998,0.695785,0.487563,0.512601,0.523008,0.121765,0.010444,0.538255,0.083222,0.024096,0.003584,0.003525,0.848797,0.352080,0.140754,0.030719,0.001344,0.798420,0.893715,0.593973,0.935181,0.929859,0.805541,0.737991,0.162532,0.147062,0.832221
6076,9620,TECHNOLOGY,4,0.777778,0.333333,0.000000,0.333333,0.666667,1.0,0.444444,0.333333,0.000000,0.0,1.000000,0.000000,0.0,0.0,0.0,0.000000,1.000000,0.000000,0.000000,0.0,0.555556,0.888889,0.555556,1.000000,1.000000,0.733333,1.000000,0.000000,0.333333,0.888889,host_askubuntu,askubuntu,-1.068748,5.126962,-0.739417,-0.958627,8.447401,-0.664756,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.789185,0.478820,0.002557,0.714757,0.754894,0.962344,0.524349,0.336639,0.145052,0.007236,0.400635,0.265209,0.002495,0.004413,0.001885,0.003219,0.817926,0.159544,0.430413,0.001747,0.600207,0.936448,0.689002,0.966172,0.982467,0.896367,0.836083,0.100895,0.323357,0.917629,0.822574,0.442314,0.002815,0.656890,0.776628,0.951822,0.523937,0.355077,0.054227,0.009475,0.426047,0.042828,0.001554,0.002415,0.001901,0.002716,0.772997,0.197272,0.514012,0.001609,0.659491,0.927434,0.630957,0.971979,0.978133,0.848875,0.741099,0.082199,0.613503,0.927468,0.859877,0.535695,0.001612,0.742894,0.760527,0.947612,0.522766,0.430478,0.093761,0.002976,0.444345,0.157988,0.001073,0.002088,0.000779,0.001962,0.895214,0.164565,0.295604,0.000334,0.717660,0.947658,0.703412,0.962765,0.978938,0.886947,0.811829,0.153621,0.463874,0.914900,0.835757,0.527603,0.001837,0.699537,0.807397,0.946325,0.539940,0.457635,0.124042,0.003236,0.286164,0.291983,0.001304,0.002562,0.001170,0.003034,0.801764,0.221386,0.274121,0.000395,0.729963,0.925703,0.664402,0.954914,0.961917,0.853537,0.621297,0.201935,0.712606,0.886061,0.899352,0.662624,0.002176,0.735490,0.821115,0.936916,0.618762,0.490514,0.041043,0.006311,0.424292,0.032199,0.001693,0.002317,0.001529,0.003247,0.923778,0.166201,0.390509,0.000983,0.780819,0.942603,0.656558,0.965070,0.980537,0.887025,0.832672,0.130486,0.636155,0.898163,0.883528,0.629372,0.001553,0.797396,0.749492,0.959496,0.564451,0.429575,0.038811,0.007312,0.434988,0.051323,0.001504,0.002373,0.001100,0.002592,0.890999,0.160384,0.449894,0.000915,0.729617,0.938816,0.662567,0.960706,0.968107,0.891288,0.812692,0.178095,0.379385,0.891256
6077,9627,TECHNOLOGY,4,1.000000,0.666667,0.000000,1.000000,1.000000,0.0,0.666667,0.666667,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,1.000000,0.000000,0.000000,0.000000,0.0,1.000000,0.833333,0.666667,1.000000,1.000000,1.000000,0.000000,0.000000,0.000000,1.000000,host_stackexchange,apple,-1.068748,-0.122961,-0.604419,-0.958627,-0.274433,-0.522322,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.927152,0.605351,0.054458,0.835438,0.776664,0.517339,0.607835,0.548088,0.044885,0.005517,0.496278,0.036246,0.032107,0.003881,0.005339,0.650952,0.375845,0.118151,0.054270,0.004177,0.836568,0.911883,0.654665,0.968649,0.967049,0.835066,0.594453,0.106484,0.204066,0.930919,0.936851,0.558253,0.029042,0.873865,0.817033,0.585486,0.580275,0.509221,0.026473,0.003789,0.454168,0.022612,0.027160,0.002894,0.003447,0.459932,0.412284,0.159937,0.052136,0.002546,0.864783,0.958875,0.682133,0.993101,0.989048,0.915223,0.105392,0.024786,0.221488,0.962837,0.907170,0.526672,0.081358,0.887495,0.630876,0.575749,0.569774,0.519094,0.021385,0.012793,0.588379,0.050328,0.057773,0.003701,0.005895,0.613546,0.346707,0.085192,0.058595,0.002746,0.774637,0.916121,0.673391,0.970963,0.967815,0.836476,0.275788,0.036840,0.108298,0.893958,0.919939,0.549421,0.051631,0.874103,0.729902,0.663199,0.584625,0.510344,0.028677,0.007297,0.505329,0.049622,0.058621,0.002788,0.004544,0.497107,0.353651,0.111044,0.064529,0.001795,0.800867,0.932307,0.627007,0.978468,0.969290,0.841661,0.128861,0.038704,0.419330,0.917167,0.927671,0.622088,0.041677,0.845907,0.758318,0.733036,0.586232,0.549805,0.053390,0.004888,0.535032,0.162639,0.021822,0.002416,0.004064,0.282074,0.510073,0.140698,0.092283,0.001375,0.829874,0.928586,0.677141,0.968687,0.971740,0.857305,0.436751,0.112967,0.305608,0.926620,0.926537,0.521300,0.053805,0.839922,0.710443,0.680117,0.561948,0.566100,0.081585,0.005643,0.679490,0.147413,0.035171,0.003135,0.002597,0.229172,0.426230,0.152076,0.087098,0.000998,0.835918,0.887308,0.612612,0.930565,0.925959,0.803360,0.497693,0.142454,0.355588,0.871720


In [318]:
drop_cols = ['qa_id', 'category', 'fold', 'host', 'netloc']

reses = []
for fold in range(5):
    trn_x = features_df.query(f'fold != {fold}').drop(drop_cols+labels, axis=1).values
    trn_y = features_df.query(f'fold != {fold}')[labels].values
    val_x = features_df.query(f'fold == {fold}').drop(drop_cols+labels, axis=1).values
    val_y = features_df.query(f'fold == {fold}')[labels].values
    
    model = MultiTaskElasticNet(alpha=0.001, random_state=42, l1_ratio=0.5)
    model.fit(trn_x, trn_y)
    y_pred = model.predict(val_x)
    res = compute_spearmanr(val_y, y_pred)
    print(np.mean(res))
    reses.append(res)
print(np.mean(reses))
# model = MultiTaskElasticNet(alpha=0.001, random_state=42, l1_ratio=0.5)
# model.fit(features_df.drop(drop_cols+labels, axis=1).values)
# model.predict

0.41041107804619154
0.40336732085623583
0.4204717163820624
0.407737655696662
0.39578742543838225


In [319]:
np.mean([0.41041107804619154,
0.40336732085623583,
0.4204717163820624,
0.407737655696662,
0.39578742543838225])

0.4075550392839068

In [333]:
drop_cols = ['qa_id', 'category', 'fold', 'host', 'netloc']

reses = []
for fold in range(5):
    trn_x = features_df.query(f'fold != {fold}').drop(drop_cols+labels, axis=1).values
    trn_y = features_df.query(f'fold != {fold}')[labels].values
    val_x = features_df.query(f'fold == {fold}').drop(drop_cols+labels, axis=1).values
    val_y = features_df.query(f'fold == {fold}')[labels].values
    
    model = MultiTaskElasticNet(alpha=0.001, random_state=42, l1_ratio=0.5)
    model.fit(trn_x, trn_y)
    y_pred = model.predict(val_x)
    res = compute_spearmanr(val_y, y_pred)
    print(np.mean(res))
    reses.append(res)
print(np.mean(reses))
# model = MultiTaskElasticNet(alpha=0.001, random_state=42, l1_ratio=0.5)
# model.fit(features_df.drop(drop_cols+labels, axis=1).values)
# model.predict

0.39832241941874347
0.39039939293439097
0.41126443142167607
0.39548347638047676
0.3887355061777134
0.39684104526660013


In [322]:
model = MultiTaskElasticNet(alpha=0.001, random_state=42, l1_ratio=0.5)
model.fit(features_df.drop(drop_cols+labels, axis=1).values, features_df[labels].values)
y_pred = model.predict(features_df.drop(drop_cols+labels, axis=1).values)
res = compute_spearmanr(features_df[labels].values, y_pred)

In [323]:
np.mean(res)

0.44343593128277276

In [329]:
drop_cols = ['qa_id', 'category', 'fold', 'host', 'netloc']

reses = []
for fold in range(5):
    trn_x = features_df.query(f'fold != {fold}').drop(drop_cols+labels, axis=1).values
    trn_y = features_df.query(f'fold != {fold}')[labels].values
    val_x = features_df.query(f'fold == {fold}').drop(drop_cols+labels, axis=1).values
    val_y = features_df.query(f'fold == {fold}')[labels].values
    
    model = Lasso(alpha=0.0001, random_state=42)
    model.fit(trn_x, trn_y)
    y_pred = model.predict(val_x)
    res = compute_spearmanr(val_y, y_pred)
    print(np.mean(res))
    reses.append(res)
print(np.mean(reses))
# model = MultiTaskElasticNet(alpha=0.001, random_state=42, l1_ratio=0.5)
# model.fit(features_df.drop(drop_cols+labels, axis=1).values)
# model.predict

0.41087039287624344
0.4058923725171438
0.4209248201796327
0.40854678227492225
0.3995903393449829
0.40916494143858506


In [443]:
drop_cols = ['qa_id', 'category', 'fold', 'host', 'netloc']

reses = []
for fold in range(5):
    trn_x = features_df.query(f'fold != {fold}').drop(drop_cols+labels, axis=1).values
    trn_y = features_df.query(f'fold != {fold}')[labels].values
    val_x = features_df.query(f'fold == {fold}').drop(drop_cols+labels, axis=1).values
    val_y = features_df.query(f'fold == {fold}')[labels].values
    
    model = OneVsRestClassifier(LogisticRegression(C=0.01, random_state=42, multi_class='ovr'))
    model.fit(trn_x, trn_y)
    y_pred = model.predict(val_x)
    res = compute_spearmanr(val_y, y_pred)
    print(np.mean(res))
    reses.append(res)
print(np.mean(reses))
# model = MultiTaskElasticNet(alpha=0.001, random_state=42, l1_ratio=0.5)
# model.fit(features_df.drop(drop_cols+labels, axis=1).values)
# model.predict

ValueError: Multioutput target data is not supported with label binarization

In [336]:
model = Lasso(alpha=0.0001, random_state=42)
model.fit(features_df.drop(drop_cols+labels, axis=1).values, features_df[labels].values)
y_pred = model.predict(features_df.drop(drop_cols+labels, axis=1).values)
res = compute_spearmanr(features_df[labels].values, y_pred)

In [337]:
np.mean(res)

0.4432701170724508

## 試しに whole data train の stacking やってみる
 - ※ 2 : 1 : 1
 - 2 は avg

In [402]:
# %debug
bert_y_trues, bert_y_preds = get_y_trues_and_y_preds_from_QA_snapshota_dicts(bert_question_dict, bert_answer_dict, avg=True)
roberta_y_trues, roberta_y_preds = get_y_trues_and_y_preds_from_QA_snapshota_dicts(roberta_question_dict, roberta_answer_dict, avg=True)
xlnet_y_trues, xlnet_y_preds = get_y_trues_and_y_preds_from_QA_snapshota_dicts(xlnet_question_dict, xlnet_answer_dict, avg=True)

In [404]:
# %debug
single_bert_y_trues, single_bert_y_preds = get_y_trues_and_y_preds_from_QA_snapshota_dicts(bert_question_dict, bert_answer_dict, single=True, avg=False)
single_roberta_y_trues, single_roberta_y_preds = get_y_trues_and_y_preds_from_QA_snapshota_dicts(roberta_question_dict, roberta_answer_dict, single=True, avg=False)
single_xlnet_y_trues, single_xlnet_y_preds = get_y_trues_and_y_preds_from_QA_snapshota_dicts(xlnet_question_dict, xlnet_answer_dict, single=True, avg=False)

In [405]:
trn_df = pd.read_csv('../mnt/inputs/origin/train.csv')
tst_df = pd.read_csv('../mnt/inputs/origin/test.csv')

In [406]:
from sklearn.model_selection import GroupKFold as GKF

gkf = GKF(5)
fold = list(gkf.split(trn_df.qa_id, groups=trn_df.question_body))

In [407]:
val_idxes = [i[1] for i in fold]
val_idx = np.concatenate(val_idxes)

In [408]:
trn_df['fold'] = None
for i, _val_idx in enumerate(val_idxes):
    trn_df.loc[_val_idx, 'fold'] = i

In [409]:
trn_df['fold'].fillna(-1).value_counts()

3    1216
2    1216
1    1216
0    1216
4    1215
Name: fold, dtype: int64

In [410]:
(trn_df.iloc[val_idx].iloc[:, 11:-1].values == bert_y_trues[:, :30]).all()

True

In [411]:
trn_df = trn_df.iloc[val_idx].reset_index(drop=True)

In [412]:
# 置換完了
(trn_df.iloc[:, 11:-1].values == bert_y_trues[:, :30]).all()

True

In [413]:
# %debug
features_df, ohes, scalers = mk_features(trn_df, bert_y_preds, single_roberta_y_preds, single_xlnet_y_preds, '../mnt/checkpoints/e078/datasets/', ohe=None, scaler=None, labels=labels)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the doc

 68%|██████▊   | 4150/6079 [00:25<00:11, 174.83it/s][A
 69%|██████▊   | 4175/6079 [00:25<00:09, 191.06it/s][A
 69%|██████▉   | 4195/6079 [00:25<00:10, 174.76it/s][A
 69%|██████▉   | 4214/6079 [00:26<00:10, 173.99it/s][A
 70%|██████▉   | 4237/6079 [00:26<00:11, 166.09it/s][A
 70%|██████▉   | 4255/6079 [00:26<00:12, 150.83it/s][A
 70%|███████   | 4272/6079 [00:26<00:11, 151.34it/s][A
 71%|███████   | 4290/6079 [00:26<00:11, 155.57it/s][A
 71%|███████   | 4306/6079 [00:26<00:11, 149.65it/s][A
 71%|███████   | 4322/6079 [00:26<00:12, 142.15it/s][A
 71%|███████▏  | 4346/6079 [00:26<00:10, 161.22it/s][A
 72%|███████▏  | 4364/6079 [00:27<00:11, 153.97it/s][A
 72%|███████▏  | 4387/6079 [00:27<00:09, 169.29it/s][A
 73%|███████▎  | 4408/6079 [00:27<00:09, 177.10it/s][A
 73%|███████▎  | 4427/6079 [00:27<00:09, 169.41it/s][A
 73%|███████▎  | 4445/6079 [00:27<00:09, 164.42it/s][A
 74%|███████▎  | 4469/6079 [00:27<00:08, 180.45it/s][A
 74%|███████▍  | 4490/6079 [00:27<00:08, 186.04i

 57%|█████▋    | 3480/6079 [00:21<00:15, 170.22it/s][A
 58%|█████▊    | 3498/6079 [00:22<00:15, 167.23it/s][A
 58%|█████▊    | 3516/6079 [00:22<00:16, 151.41it/s][A
 58%|█████▊    | 3536/6079 [00:22<00:15, 158.96it/s][A
 58%|█████▊    | 3554/6079 [00:22<00:15, 162.96it/s][A
 59%|█████▉    | 3572/6079 [00:22<00:15, 166.25it/s][A
 59%|█████▉    | 3594/6079 [00:22<00:14, 177.25it/s][A
 59%|█████▉    | 3613/6079 [00:22<00:14, 166.23it/s][A
 60%|█████▉    | 3635/6079 [00:22<00:13, 179.31it/s][A
 60%|██████    | 3654/6079 [00:22<00:13, 180.63it/s][A
 60%|██████    | 3673/6079 [00:23<00:14, 161.62it/s][A
 61%|██████    | 3691/6079 [00:23<00:14, 165.36it/s][A
 61%|██████    | 3716/6079 [00:23<00:13, 174.08it/s][A
 61%|██████▏   | 3734/6079 [00:23<00:13, 169.94it/s][A
 62%|██████▏   | 3752/6079 [00:23<00:13, 170.65it/s][A
 62%|██████▏   | 3774/6079 [00:23<00:12, 179.47it/s][A
 62%|██████▏   | 3793/6079 [00:23<00:13, 170.86it/s][A
 63%|██████▎   | 3811/6079 [00:23<00:13, 167.74i

In [423]:
bert_y_preds.shape

(6079, 30)

In [414]:
tst_df['fold'] = -1
tst_features, _, _ = mk_features(tst_df, bert_y_preds, single_roberta_y_preds, single_xlnet_y_preds, '../mnt/checkpoints/e078/datasets/', ohe=ohes, scaler=scalers, labels=[])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the doc

In [424]:
model = MultiTaskElasticNet(alpha=0.001, random_state=42, l1_ratio=0.5)
model.fit(features_df.drop(drop_cols+labels, axis=1).values, features_df[labels].values)
y_pred = model.predict(features_df.drop(drop_cols+labels, axis=1).values)
res = compute_spearmanr(features_df[labels].values, y_pred)
np.mean(res)

0.438265342954586

In [425]:
eval_scores, opt_eval_scores, optRs = blend_and_evaluate(features_df[labels].values, [y_pred, ], compute_spearmanr)

original_score: 0.438265342954586
opt_score: 0.4881796345764086


In [426]:
eval_scores

[0.41753112199777104,
 0.6764953319096272,
 0.42041124578475975,
 0.3445646646410174,
 0.38704381965286866,
 0.45130652167676083,
 0.3961194908046353,
 0.529847909439801,
 0.6119612323931802,
 0.12516425938870868,
 0.510398186761741,
 0.7605434726794571,
 0.37125442681495224,
 0.19413188857848201,
 0.35777555752374846,
 0.46217239384555947,
 0.7923505527480342,
 0.3938071010494091,
 0.6946415350604473,
 0.07257527239457269,
 0.5285877260802457,
 0.2948007237476026,
 0.4851563812807026,
 0.21496838861715717,
 0.22525026989531646,
 0.38443902552982523,
 0.7721339186049994,
 0.332985963069032,
 0.7071401193763913,
 0.2324017872907762]

In [427]:
opt_eval_scores

[0.41343022683327774,
 0.6720030412486051,
 0.4938149372930572,
 0.3391910490029068,
 0.39187799667690215,
 0.5072403885501177,
 0.39808415086767984,
 0.5371936935309224,
 0.6192335166138851,
 0.18109855772262892,
 0.4967482171718512,
 0.772318523480993,
 0.5911052329202999,
 0.33529408603749244,
 0.659875548303857,
 0.6177865992485571,
 0.79790056640927,
 0.3745143276058302,
 0.6894973122086349,
 0.6151022328164457,
 0.5296435154397106,
 0.29602617927826935,
 0.4834727079718044,
 0.21961170063223812,
 0.22664964858647108,
 0.3847117607697327,
 0.7680312871946515,
 0.3033595906063066,
 0.6962289343060919,
 0.23434350796376646]

In [428]:
nes_dict = {
    'model': model, 
    'ohe': ohes,
    'scaler': scalers,
}

In [429]:
with open('../mnt/datasets/stackings/e10X_stacking/e10X_stacking_dict.pkl', 'wb') as fout:
    pickle.dump(nes_dict, fout)

In [430]:
with open('../mnt/datasets/stackings/e10X_stacking/optRs.pkl', 'wb') as fout:
    pickle.dump(optRs, fout)

#### ちなみに...？

In [421]:
drop_cols = ['qa_id', 'category', 'fold', 'host', 'netloc']

reses = []
temp_y_preds = []
temp_y_trues = []
for fold in range(5):
    trn_x = features_df.query(f'fold != {fold}').drop(drop_cols+labels, axis=1).values
    trn_y = features_df.query(f'fold != {fold}')[labels].values
    val_x = features_df.query(f'fold == {fold}').drop(drop_cols+labels, axis=1).values
    val_y = features_df.query(f'fold == {fold}')[labels].values
    
    model = MultiTaskElasticNet(alpha=0.001, random_state=42, l1_ratio=0.5)
    model.fit(trn_x, trn_y)
    y_pred = model.predict(val_x)
    res = compute_spearmanr(val_y, y_pred)
    temp_y_preds.append(y_pred)
    temp_y_trues.append(val_y)
    print(np.mean(res))
    reses.append(res)
print(np.mean(reses))
# model = MultiTaskElasticNet(alpha=0.001, random_state=42, l1_ratio=0.5)
# model.fit(features_df.drop(drop_cols+labels, axis=1).values)
# model.predict

0.40981280341747495
0.40372851673832894
0.4206026783853577
0.40815010880027536
0.39626454923180715
0.4077117313146489


In [422]:
_, _, _ = blend_and_evaluate(np.concatenate(temp_y_trues, axis=0), [np.concatenate(temp_y_preds, axis=0), ], compute_spearmanr)

  c /= stddev[:, None]
  c /= stddev[None, :]
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


original_score: 0.4053620624917785
opt_score: 0.4309404384262748


In [432]:
tst_features.shape, features_df.shape

((6079, 168), (6079, 198))

In [434]:
features_df.drop(drop_cols+labels, axis=1).columns.tolist()

['title_len',
 'body_len',
 'answer_len',
 'bert_title_len',
 'bert_body_len',
 'bert_answer_len',
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 'bert_0',
 'bert_1',
 'bert_2',
 'bert_3',
 'bert_4',
 'bert_5',
 'bert_6',
 'bert_7',
 'bert_8',
 'bert_9',
 'bert_10',
 'bert_11',
 'bert_12',
 'bert_13',
 'bert_14',
 'bert_15',
 'bert_16',
 'bert_17',
 'bert_18',
 'bert_19',
 'bert_20',
 'bert_21',
 'bert_22',
 'bert_23',
 'bert_24',
 'bert_25',
 'bert_26',
 'bert_27',
 'bert_28',
 'bert_29',
 'roberta_0',
 'roberta_1',
 'roberta_2',
 'roberta_3',
 'roberta_4',
 'roberta_5',
 'roberta_6',
 'roberta_7',
 'roberta_8',
 'roberta_9',
 'roberta_10',
 'roberta_11',
 'roberta_12',
 'roberta_13',
 'roberta_14',
 'roberta_1

## 混ぜてみる
 - 4 つ全て混ぜてみる
 - 3/4 の　snapshot 混ぜてみる？

#### 試しに single, top-2 snapshots を見てみる

In [340]:
blend_and_evaluate(single_bert_y_trues, [single_bert_y_preds,  ], compute_spearmanr)
blend_and_evaluate(single_bert_y_trues, [single_roberta_y_preds, ], compute_spearmanr)
# blend_and_evaluate(single_bert_y_trues, [single_gpt2_y_preds, ], compute_spearmanr)
blend_and_evaluate(single_bert_y_trues, [single_xlnet_y_preds], compute_spearmanr)
print('fini!')

original_score: 0.39658756843296844
opt_score: 0.4240074936181189


  c /= stddev[:, None]
  c /= stddev[None, :]
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


original_score: 0.3952086168898242
opt_score: 0.417814664868746
original_score: 0.39510187276130454
opt_score: 0.42243226674838147
fini!


In [341]:
blend_and_evaluate(single_bert_y_trues, [bert_y_preds,  ], compute_spearmanr)
blend_and_evaluate(single_bert_y_trues, [roberta_y_preds, ], compute_spearmanr)
# blend_and_evaluate(single_bert_y_trues, [gpt2_y_preds, ], compute_spearmanr)
blend_and_evaluate(single_bert_y_trues, [xlnet_y_preds], compute_spearmanr)
print('fini!')

original_score: 0.39658756843296844
opt_score: 0.4240074936181189
original_score: 0.3952086168898242
opt_score: 0.417814664868746
original_score: 0.39510187276130454
opt_score: 0.42243226674838147
fini!


#### all blends がどうなるかを見てみる

In [343]:
# eval_scores, opt_eval_scores = blend_and_evaluate(single_bert_y_trues, [single_bert_y_preds,  single_roberta_y_preds, single_gpt2_y_preds, single_xlnet_y_preds], compute_spearmanr)
blend_and_evaluate(single_bert_y_trues, [single_bert_y_preds,  single_roberta_y_preds, single_xlnet_y_preds], compute_spearmanr)
print('fini!')

original_score: 0.416481507046034
opt_score: 0.4491590833618151
fini!


## 2, 2, 1 じゃ disk 足りなかった...

In [20]:
blend_and_evaluate(bert_y_trues, [single_bert_y_preds,  single_roberta_y_preds, xlnet_y_preds], compute_spearmanr)
blend_and_evaluate(bert_y_trues, [bert_y_preds,  single_roberta_y_preds, single_xlnet_y_preds], compute_spearmanr)
blend_and_evaluate(bert_y_trues, [single_bert_y_preds,  roberta_y_preds, single_xlnet_y_preds], compute_spearmanr)
print('fini!')

original_score: 0.41745646679452425
opt_score: 0.4494514724953275
original_score: 0.41786576946688975
opt_score: 0.4513792627190148
original_score: 0.4181574464924081
opt_score: 0.44796109262403244
fini!


#### 2, 1, 1 で行くことにするので optR を取得

In [21]:
eval_scores, opt_eval_scores, optRs = blend_and_evaluate(bert_y_trues, [bert_y_preds,  single_roberta_y_preds, single_xlnet_y_preds], compute_spearmanr)

original_score: 0.41786576946688975
opt_score: 0.4513792627190148


In [23]:
!mkdir ../mnt/datasets/sub_e078_e083_bert2_roberta1_xlnet1

In [24]:
with open('../mnt/datasets/sub_e078_e083_bert2_roberta1_xlnet1/optRs.pkl', 'wb') as fout:
    pickle.dump(optRs, fout)

In [27]:
','.join(([str(i) for i in eval_scores]))

'0.3888662513004357,0.6264747430024324,0.417198052103926,0.31356995786700836,0.36621009889416495,0.43161328159188245,0.35960846883953307,0.5044099886093536,0.604524708512999,0.09185984413637717,0.4871602561126112,0.7588073461879603,0.3687724113728429,0.1878256807431471,0.3623045162839472,0.46221497917924714,0.7845698434998972,0.37561304685622554,0.685135875113699,0.06569265158283247,0.509277164422869,0.26846500559030767,0.44030220154571237,0.17059028253021058,0.18652642117523097,0.35943031009217935,0.7627075286902857,0.28993040895144306,0.6928979913635044,0.21341376785442578'

In [28]:
','.join(([str(i) for i in opt_eval_scores]))

'0.38647035222851883,0.6251840213259875,0.503341694761477,0.3140683242310781,0.37238351698553224,0.48891838338012195,0.3688643054450728,0.5174029320574922,0.6075334223395167,0.11234184006252432,0.4734302164363926,0.7724553926978851,0.5679733516339525,0.3086234975886461,0.6273452824025416,0.6104216226094872,0.7913164255678906,0.35227648410197876,0.6743979217445293,0.21251606433106468,0.5092991566008891,0.25989741044634646,0.4419292131037434,0.172812412462297,0.1891229575649494,0.360249882795264,0.7579315945321048,0.2690066667578972,0.6818714797543032,0.21199205562095982'

In [84]:
','.join([str(i) for i in eval_scores])

'0.38753764855932404,0.624591236065633,0.41612521851465645,0.31631905154404777,0.36958084656834655,0.4333397143525736,0.35647225646179814,0.5053740180720495,0.6024717179943407,0.09343879770080012,0.48788970013367006,0.7554768784733894,0.37043040207462113,0.19175643232185877,0.3632398123928754,0.4644149810472098,0.7882689049397114,0.37508934129137833,0.6875082798335962,0.06859634383678749,0.5058311174919892,0.26776035706473816,0.42468236885314736,0.18264774434316508,0.1960057923938459,0.3571316272381341,0.7629198053679217,0.29516358912502183,0.6885185616941287,0.1986028427408061'

In [85]:
','.join([str(i) for i in opt_eval_scores])

'0.38610697100222874,0.6220378810356756,0.4803108598080848,0.3087254707253665,0.3767010478726585,0.48153256901696284,0.3623044669561681,0.5109740248841105,0.6079582135843575,0.10998964284517494,0.4870548021086867,0.7644745305398798,0.5468575104750577,0.2850693099375624,0.6439619945737882,0.6181433940398664,0.7950887499733366,0.3480114972488913,0.6853066014210485,0.36239255378317503,0.503890831161848,0.26595541365451547,0.4260248516788567,0.18501720906281013,0.20022371905506964,0.3569060052051175,0.7579715572197582,0.27970705751496816,0.6821275873309829,0.20174712128915573'

#### weights どうすればよいか見てみる

In [86]:
np.arange(0, 1.1, 0.1)

array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])

In [None]:
for i in np.arange(0, 1.1, 0.1):
    for j in np.arange(0, 1.01- i, 0.1):
        weights = [i, j, 1-i-j]
        print(f' ---------- {weights} -----------')
        eval_scores, opt_eval_scores, optRs = blend_and_evaluate(bert_y_trues,
                                                                                                                 [single_bert_y_preds,  roberta_y_preds, single_xlnet_y_preds],
                                                                                                                 compute_spearmanr,
                                                                                                                 weights=weights)

 ---------- [0.0, 0.0, 1.0] -----------
original_score: 0.3906094115371919
opt_score: 0.4208016538440192
 ---------- [0.0, 0.1, 0.9] -----------
original_score: 0.39690428620399504
opt_score: 0.42660201555875216
 ---------- [0.0, 0.2, 0.8] -----------
original_score: 0.40194251860786706
opt_score: 0.4322132022423592
 ---------- [0.0, 0.30000000000000004, 0.7] -----------
original_score: 0.4058057699680523
opt_score: 0.4386623102560342
 ---------- [0.0, 0.4, 0.6] -----------
original_score: 0.40850785092762126
opt_score: 0.44218874682162584
 ---------- [0.0, 0.5, 0.5] -----------
original_score: 0.41002182412980964
opt_score: 0.4462993778180665
 ---------- [0.0, 0.6000000000000001, 0.3999999999999999] -----------
original_score: 0.41026148164290954
opt_score: 0.4472571871083271
 ---------- [0.0, 0.7000000000000001, 0.29999999999999993] -----------
original_score: 0.4092935254165069
opt_score: 0.4447945948297066
 ---------- [0.0, 0.8, 0.19999999999999996] -----------
original_score: 0.40

## まとめ
 - (ちょっと気に入らんけど) bert * 2, roberta * 2, xlnet * 1 の average が良さそう (* 1, * 1, * 1, * 1 よりも)
 - 多分もう一つ別の良いモデルを入れられれば伸びる