In [1]:
# ------ LIBRARY -------#
import numpy as np
import os
import pickle
import sys
import pandas as pd
import re
import cv2
import json
from functools import partial
from typing import List

# torch
import torch
import torch.cuda.amp as amp
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
from torch.utils.data.sampler import *


import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau, MultiStepLR, OneCycleLR
#

import math
from torch.optim.optimizer import Optimizer, required
import torch_optimizer as optim
from collections import defaultdict
import itertools as it

import tqdm
import random
#import time
import matplotlib.pyplot as plt
from timeit import default_timer as timer
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# transformer
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, AutoConfig
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
# class args
class args:
    # ---- factor ---- #
    debug=False
    amp = True
    gpu = '0'
    
    epochs=30
    batch_size=32
    weight_decay=1e-5
    n_fold=5
    fold=5 # [0, 1, 2, 3, 4] # 원래는 3
    patience = 7
    
    exp_name = 'experiment_name_folder'
    dir_ = f'./saved_models/'
    pt = 'xlm-roberta-large'
    max_len = 193
    
    start_lr = 2e-5#1e-3,5e-5
    min_lr=1e-6
    # ---- Dataset ---- #

    # ---- Else ---- #
    num_workers=8
    seed=222
    scheduler = None#'get_linear_schedule_with_warmup'


data_dir = './'
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
device = torch.device(f"cuda" if torch.cuda.is_available() else "cpu")
print(device)

##----------------
def set_seeds(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False # for faster training, but not deterministic

set_seeds(seed=args.seed)    

cuda


In [4]:
train = pd.read_csv("data/new_train_data.csv")
test = pd.read_csv("data/test_data.csv")
submission = pd.read_csv("data/sample_submission.csv")

In [5]:
print(pd.unique(train["label"]))

label_dict = {"entailment" : 0, "contradiction" : 1, "neutral" : 2}

['contradiction' 'entailment' 'neutral']


In [6]:
for i, text in enumerate(train.label):
    train.label[i] = label_dict[text]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [7]:
train

Unnamed: 0,index,premise,hypothesis,label
0,0,"씨름은 상고시대로부터 전해져 내려오는 남자들의 대표적인 놀이로서, 소년이나 장정들이...",씨름의 여자들의 놀이이다.,1
1,1,"삼성은 자작극을 벌인 2명에게 형사 고소 등의 법적 대응을 검토 중이라고 하였으나,...",자작극을 벌인 이는 3명이다.,1
2,2,이를 위해 예측적 범죄예방 시스템을 구축하고 고도화한다.,예측적 범죄예방 시스템 구축하고 고도화하는 것은 목적이 있기 때문이다.,0
3,3,광주광역시가 재개발 정비사업 원주민들에 대한 종합대책을 마련하는 등 원주민 보호에 ...,원주민들은 종합대책에 만족했다.,2
4,4,"진정 소비자와 직원들에게 사랑 받는 기업으로 오래 지속되고 싶으면, 이런 상황에서는...",이런 상황에서 책임 있는 모습을 보여주는 기업은 아주 드물다.,2
...,...,...,...,...
27993,2995,흔히 비자림로라고 불리는 지방도 제1112호선을 넓히는 공사가 1년만에 재개되었다가...,지방도 제1112호선을 넓히는 공사는 중단없이 마무리 되었다.,1
27994,2996,흔히 비자림로라고 불리는 지방도 제1112호선을 넓히는 공사가 1년만에 재개되었다가...,지방도 제1112호선을 넓히는 공사가 중단된 건 세 번째이다.,2
27995,2997,흔히 비자림로라고 불리는 지방도 제1112호선을 넓히는 공사가 1년만에 재개되었다가...,지방도 제1112호선은 흔히 비자림로라고 불린다.,0
27996,2998,흡연자분들은 발코니가 있는 방이면 발코니에서 흡연이 가능합니다.,비흡연자는 발코니 있는 방이 필요없습니다.,2


In [8]:
def collate_to_max_length(batch: List[List[torch.Tensor]], max_len: int = None, fill_values: List[float] = None) -> \
    List[torch.Tensor]:
    """
    pad to maximum length of this batch
    Args:
        batch: a batch of samples, each contains a list of field data(Tensor), which shape is [seq_length]
        max_len: specify max length
        fill_values: specify filled values of each field
    Returns:
        output: list of field batched data, which shape is [batch, max_length]
    """
    # [batch, num_fields]
    lengths = np.array([[len(field_data) for field_data in sample] for sample in batch])
    batch_size, num_fields = lengths.shape
    fill_values = fill_values or [0.0] * num_fields
    # [num_fields]
    max_lengths = lengths.max(axis=0)
    if max_len:
        assert max_lengths.max() <= max_len
        max_lengths = np.ones_like(max_lengths) * max_len

    output = [torch.full([batch_size, max_lengths[field_idx]],
                         fill_value=fill_values[field_idx],
                         dtype=batch[0][field_idx].dtype)
              for field_idx in range(num_fields)]
    for sample_idx in range(batch_size):
        for field_idx in range(num_fields):
            # seq_length
            data = batch[sample_idx][field_idx]
            output[field_idx][sample_idx][: data.shape[0]] = data
    # generate span_index and span_mask
    max_sentence_length = max_lengths[0]
    start_indexs = []
    end_indexs = []
    for i in range(1, max_sentence_length - 1):
        for j in range(i, max_sentence_length - 1):
            # # span大小为10
            # if j - i > 10:
            #     continue
            start_indexs.append(i)
            end_indexs.append(j)
    # generate span mask
    span_masks = []
    for input_ids, label, length in batch:
        span_mask = []
        middle_index = input_ids.tolist().index(2)
        for start_index, end_index in zip(start_indexs, end_indexs):
            if 1 <= start_index <= length.item() - 2 and 1 <= end_index <= length.item() - 2 and (
                start_index > middle_index or end_index < middle_index):
                span_mask.append(0)
            else:
                span_mask.append(1e6)
        span_masks.append(span_mask)
    # add to output
    output.append(torch.LongTensor(start_indexs))
    output.append(torch.LongTensor(end_indexs))
    output.append(torch.LongTensor(span_masks))
    return output  # (input_ids, labels, length, start_indexs, end_indexs, span_masks)

In [9]:
def collate_to_max_length_test(batch: List[List[torch.Tensor]], max_len: int = None, fill_values: List[float] = None) -> \
    List[torch.Tensor]:
    """
    pad to maximum length of this batch
    Args:
        batch: a batch of samples, each contains a list of field data(Tensor), which shape is [seq_length]
        max_len: specify max length
        fill_values: specify filled values of each field
    Returns:
        output: list of field batched data, which shape is [batch, max_length]
    """
    # [batch, num_fields]
    lengths = np.array([[len(field_data) for field_data in sample] for sample in batch])
    batch_size, num_fields = lengths.shape
    fill_values = fill_values or [0.0] * num_fields
    # [num_fields]
    max_lengths = lengths.max(axis=0)
    if max_len:
        assert max_lengths.max() <= max_len
        max_lengths = np.ones_like(max_lengths) * max_len

    output = [torch.full([batch_size, max_lengths[field_idx]],
                         fill_value=fill_values[field_idx],
                         dtype=batch[0][field_idx].dtype)
              for field_idx in range(num_fields)]
    for sample_idx in range(batch_size):
        for field_idx in range(num_fields):
            # seq_length
            data = batch[sample_idx][field_idx]
            output[field_idx][sample_idx][: data.shape[0]] = data
    # generate span_index and span_mask
    max_sentence_length = max_lengths[0]
    start_indexs = []
    end_indexs = []
    for i in range(1, max_sentence_length - 1):
        for j in range(i, max_sentence_length - 1):
            # # span大小为10
            # if j - i > 10:
            #     continue
            start_indexs.append(i)
            end_indexs.append(j)
    # generate span mask
    span_masks = []
    for input_ids, length in batch:
        span_mask = []
        middle_index = input_ids.tolist().index(2)
        for start_index, end_index in zip(start_indexs, end_indexs):
            if 1 <= start_index <= length.item() - 2 and 1 <= end_index <= length.item() - 2 and (
                start_index > middle_index or end_index < middle_index):
                span_mask.append(0)
            else:
                span_mask.append(1e6)
        span_masks.append(span_mask)
    # add to output
    output.append(torch.LongTensor(start_indexs))
    output.append(torch.LongTensor(end_indexs))
    output.append(torch.LongTensor(span_masks))
    return output  # (input_ids, labels, length, start_indexs, end_indexs, span_masks)

In [10]:
AutoTokenizer.from_pretrained(args.pt)

PreTrainedTokenizerFast(name_or_path='xlm-roberta-large', vocab_size=250002, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})

In [11]:
class SNLIDataset(Dataset):

    def __init__(self, data, bert_path, max_length):
        super().__init__()
        self.max_length = max_length
        self.result = []
        for train_premise, train_hypothesis, train_label in tqdm.tqdm(zip(data['premise'], data['hypothesis'], data['label'])):
                    self.result.append((train_premise, train_hypothesis, train_label))
        self.tokenizer = AutoTokenizer.from_pretrained(bert_path)

    def __len__(self):
        return len(self.result)

    def __getitem__(self, idx):
        sentence_1, sentence_2, label = self.result[idx]
        # remove .
        if sentence_1.endswith("."):
            sentence_1 = sentence_1[:-1]
        if sentence_2.endswith("."):
            sentence_2 = sentence_2[:-1]
        sentence_1_input_ids = self.tokenizer.encode(sentence_1, add_special_tokens=False)
        sentence_2_input_ids = self.tokenizer.encode(sentence_2, add_special_tokens=False)
        input_ids = sentence_1_input_ids + [2] + sentence_2_input_ids
        if len(input_ids) > self.max_length - 2:
            input_ids = input_ids[:self.max_length - 2]
        # convert list to tensor
        length = torch.LongTensor([len(input_ids) + 2])
        input_ids = torch.LongTensor([0] + input_ids + [2])
        label = torch.LongTensor([label])
        
        return input_ids, label, length

In [12]:
class SNLIDataset_test(Dataset):

    def __init__(self, data, bert_path, max_length: int = 512):
        super().__init__()
        self.max_length = max_length
        self.result = []
        for test_premise, test_hypothesis in tqdm.tqdm(zip(data['premise'], data['hypothesis'])):
                    self.result.append((test_premise, test_hypothesis))
        self.tokenizer = AutoTokenizer.from_pretrained(bert_path)

    def __len__(self):
        return len(self.result)

    def __getitem__(self, idx):
        sentence_1, sentence_2 = self.result[idx]
        # remove .
        if sentence_1.endswith("."):
            sentence_1 = sentence_1[:-1]
        if sentence_2.endswith("."):
            sentence_2 = sentence_2[:-1]
        sentence_1_input_ids = self.tokenizer.encode(sentence_1, add_special_tokens=False)
        sentence_2_input_ids = self.tokenizer.encode(sentence_2, add_special_tokens=False)
        input_ids = sentence_1_input_ids + [2] + sentence_2_input_ids
        if len(input_ids) > self.max_length - 2:
            input_ids = input_ids[:self.max_length - 2]
        # convert list to tensor
        length = torch.LongTensor([len(input_ids) + 2])
        input_ids = torch.LongTensor([0] + input_ids + [2])
        
        return input_ids, length

In [13]:
def unit_test():
    
    dataset = SNLIDataset(data=train[:1], bert_path=args.pt, max_length=args.max_len)

    dataloader = DataLoader(
        dataset=dataset,
        batch_size=10,
        num_workers=0,
        shuffle=False,
        collate_fn=partial(collate_to_max_length, fill_values=[1, 0, 0])
    )
    for input_ids, label, length, start_index, end_index, span_mask in dataloader:
        print(input_ids.shape, input_ids)
        print(start_index.shape, start_index)
        print(end_index.shape, end_index)
        print(span_mask.shape, span_mask)
        print(label.view(-1).shape, label)
        print()
        
    for t, data in enumerate(tqdm.tqdm(dataloader)):
        print(data[0])

In [14]:
unit_test()

1it [00:00, 7724.32it/s]


torch.Size([1, 59]) tensor([[     0, 105051,  28913,    697,  11031,   1077, 128161,  84802,   3626,
           1963,  25436, 105646, 119686,  64757,  17862, 223713,      6, 145726,
          67520,      4,   6705,   2680,  16632,  11619,   2905,   7593,      6,
         154848,   1077,  51851,  27815,    993,  36372, 102102,  16632,      6,
         202577,   1180, 209750,  77442,  66127,   1291,  64730,  32685,      6,
          23854,  14413,    769,  15710,      2, 105051,  28913,    367,  52340,
          17862,      6, 145726,   5769,      2]])
torch.Size([1653]) tensor([ 1,  1,  1,  ..., 56, 56, 57])
torch.Size([1653]) tensor([ 1,  2,  3,  ..., 56, 57, 57])
torch.Size([1, 1653]) tensor([[0, 0, 0,  ..., 0, 0, 0]])
torch.Size([1]) tensor([[1]])



100%|██████████| 1/1 [00:00<00:00, 214.50it/s]

tensor([[     0, 105051,  28913,    697,  11031,   1077, 128161,  84802,   3626,
           1963,  25436, 105646, 119686,  64757,  17862, 223713,      6, 145726,
          67520,      4,   6705,   2680,  16632,  11619,   2905,   7593,      6,
         154848,   1077,  51851,  27815,    993,  36372, 102102,  16632,      6,
         202577,   1180, 209750,  77442,  66127,   1291,  64730,  32685,      6,
          23854,  14413,    769,  15710,      2, 105051,  28913,    367,  52340,
          17862,      6, 145726,   5769,      2]])





In [15]:
class ExplainableModel(nn.Module):
    def __init__(self, bert_dir):
        super().__init__()
        self.bert_config = AutoConfig.from_pretrained(bert_dir, output_hidden_states=False)
        self.intermediate = AutoModel.from_pretrained(bert_dir, return_dict=False)
        self.span_info_collect = SICModel(self.bert_config.hidden_size)
        self.interpretation = InterpretationModel(self.bert_config.hidden_size)
        self.output = nn.Linear(self.bert_config.hidden_size, 3)

    def forward(self, input_ids, start_indexs, end_indexs, span_masks):
        # generate mask
        attention_mask = (input_ids != 1).long()
        # intermediate layer
        hidden_states, first_token = self.intermediate(input_ids, attention_mask=attention_mask)  # output.shape = (bs, length, hidden_size)
        # span info collecting layer(SIC)
        h_ij = self.span_info_collect(hidden_states, start_indexs, end_indexs)
        # interpretation layer
        H, a_ij = self.interpretation(h_ij, span_masks)
        # output layer
        out = self.output(H)
        return out, a_ij


class SICModel(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.W_1 = nn.Linear(hidden_size, hidden_size)
        self.W_2 = nn.Linear(hidden_size, hidden_size)
        self.W_3 = nn.Linear(hidden_size, hidden_size)
        self.W_4 = nn.Linear(hidden_size, hidden_size)

    def forward(self, hidden_states, start_indexs, end_indexs):
        W1_h = self.W_1(hidden_states)  # (bs, length, hidden_size)
        W2_h = self.W_2(hidden_states)
        W3_h = self.W_3(hidden_states)
        W4_h = self.W_4(hidden_states)

        W1_hi_emb = torch.index_select(W1_h, 1, start_indexs)  # (bs, span_num, hidden_size)
        W2_hj_emb = torch.index_select(W2_h, 1, end_indexs)
        W3_hi_start_emb = torch.index_select(W3_h, 1, start_indexs)
        W3_hi_end_emb = torch.index_select(W3_h, 1, end_indexs)
        W4_hj_start_emb = torch.index_select(W4_h, 1, start_indexs)
        W4_hj_end_emb = torch.index_select(W4_h, 1, end_indexs)

        # [w1*hi, w2*hj, w3(hi-hj), w4(hi⊗hj)]
        span = W1_hi_emb + W2_hj_emb + (W3_hi_start_emb - W3_hi_end_emb) + torch.mul(W4_hj_start_emb, W4_hj_end_emb)
        h_ij = torch.tanh(span)
        return h_ij


class InterpretationModel(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.h_t = nn.Linear(hidden_size, 1)

    def forward(self, h_ij, span_masks):
        o_ij = self.h_t(h_ij).squeeze(-1)  # (ba, span_num)
        # mask illegal span
        o_ij = o_ij - span_masks
        # normalize all a_ij, a_ij sum = 1
        a_ij = nn.functional.softmax(o_ij, dim=1)
        # weight average span representation to get H
        H = (a_ij.unsqueeze(-1) * h_ij).sum(dim=1)  # (bs, hidden_size)
        return H, a_ij

In [16]:
# - util - #
def get_learning_rate(optimizer):
    lr=[]
    for param_group in optimizer.param_groups:
        lr +=[ param_group['lr'] ]

    assert(len(lr)==1) #we support only one param_group
    lr = lr[0]

    return lr

def load_data():
    train=pd.read_csv('data/new_train_data.csv')
    test=pd.read_csv('data/test_data.csv')
    
    #
    train=train[['premise', 'hypothesis', 'label']]
    test=test[['premise', 'hypothesis']]
    
    #
    from sklearn.model_selection import StratifiedKFold
    skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    train['fold'] = -1
    for n_fold, (_,v_idx) in enumerate(skf.split(train, train['label'])):
        train.loc[v_idx, 'fold']  = n_fold
    train['id'] = [x for x in range(len(train))]
    
    for i, text in enumerate(train.label):
        train.label[i] = label_dict[text]
    
    return train, test

In [17]:
# ------------------------
#  scheduler
# ------------------------

def do_valid(net, valid_loader):

    val_loss = 0
    target_lst = []
    pred_lst = []
    logit = []
    loss_fn = nn.CrossEntropyLoss()

    net.eval()
    start_timer = timer()
    for t, data in enumerate(tqdm.tqdm(valid_loader)):

        # (input_ids, labels, length, start_indexs, end_indexs, span_masks)
        input_ids  = data[0].to(device)
        start_index  = data[3].to(device)
        end_index = data[4].to(device)
        span_mask = data[5].to(device)
        target = data[1].to(device).view(-1)

        with torch.no_grad():
            if args.amp:
                with amp.autocast():
                    # output
                    output, a_ij = net(input_ids, start_index, end_index, span_mask)
#                     output = output[0]

                    # loss
                    loss = loss_fn(output, target)

            else:
                output = net(ids, mask)#.squeeze(0)
                loss = loss_fn(output, target)
            
            val_loss += loss
            target_lst.extend(target.detach().cpu().numpy())
            pred_lst.extend(output.argmax(dim=1).tolist())
            logit.extend(output.tolist())
            
        val_mean_loss = val_loss / len(valid_loader)
        validation_score = f1_score(y_true=target_lst, y_pred=pred_lst, average='macro')
        validation_acc = accuracy_score(y_true=target_lst, y_pred=pred_lst)
        

    return val_mean_loss, validation_score, validation_acc, logit

def do_predict(net, valid_loader):
    
    val_loss = 0
    pred_lst = []
    logit=[]
    net.eval()
    for t, data in enumerate(tqdm.tqdm(valid_loader)):
        
        # (input_ids, length, start_indexs, end_indexs, span_masks)
        input_ids  = data[0].to(device)
        start_index  = data[2].to(device)
        end_index = data[3].to(device)
        span_mask = data[4].to(device)

        with torch.no_grad():
            if args.amp:
                with amp.autocast():
                    # output
                    output = net(input_ids, start_index, end_index, span_mask)[0]

            else:
                output = net(input_ids, start_index, end_index, span_mask)
             
            pred_lst.extend(output.argmax(dim=1).tolist())
            logit.extend(output.tolist())
            
    return pred_lst,logit

def run_train(folds=3):
    out_dir = args.dir_+ f'/fold{args.fold}/{args.exp_name}/'
    os.makedirs(out_dir, exist_ok=True)
    
    # load dataset
    train, test = load_data()    
 
    
    # split fold
    for n_fold in range(5):
        
        print(n_fold)
        if n_fold != folds:
            print(f'{n_fold} fold pass'+'\n')
            continue
            
        if args.debug:
            train = train.sample(1000).copy()
            
        print(n_fold)
        
        trn_idx = train[train['fold']!=n_fold]['id'].values
        val_idx = train[train['fold']==n_fold]['id'].values

        ## dataset ------------------------------------
        train_dataset = SNLIDataset(data = train.iloc[trn_idx], bert_path=args.pt, max_length=args.max_len)
        valid_dataset = SNLIDataset(data = train.iloc[val_idx], bert_path=args.pt, max_length=args.max_len)
        trainloader = DataLoader(dataset=train_dataset, batch_size=args.batch_size,
                                 num_workers=8, shuffle=True, pin_memory=True, collate_fn=partial(collate_to_max_length, fill_values=[1, 0, 0]))
        validloader = DataLoader(dataset=valid_dataset, batch_size=args.batch_size, 
                                 num_workers=8, shuffle=False, pin_memory=True, collate_fn=partial(collate_to_max_length, fill_values=[1, 0, 0]))

        ## net ----------------------------------------
        scaler = amp.GradScaler()
        net = ExplainableModel(args.pt)

        net.to(device)
        if len(args.gpu)>1:
            net = nn.DataParallel(net)

        # ------------------------
        # loss
        # ------------------------
        loss_fn = nn.CrossEntropyLoss()

        # ------------------------
        #  Optimizer
        # ------------------------
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in net.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": args.weight_decay,
            },
            {
                "params": [p for n, p in net.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters,
                          betas=(0.9, 0.98),  # according to RoBERTa paper
                          lr=args.start_lr,
                          eps=1e-9)

        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = len(trainloader)*args.epochs)
        
        
        # ----
        start_timer = timer()
        best_score = 0
        early_stopping = 0
        
        for epoch in range(1, args.epochs+1):
            train_loss = 0
            valid_loss = 0

            target_lst = []
            pred_lst = []
#             lr = get_learning_rate(optimizer)
            print(f'-------------------')
            print(f'{epoch}epoch start')
            print(f'-------------------'+'\n')
#             print(f'learning rate : {lr : .6f}')
            for t, data in enumerate(tqdm.tqdm(trainloader)):

                # one iteration update  -------------
                input_ids  = data[0].to(device)
                start_index  = data[3].to(device)
                end_index = data[4].to(device)
                span_mask = data[5].to(device)
                target = data[1].to(device).view(-1)

                # ------------
#                 net.train()
                optimizer.zero_grad()


                if args.amp:
                    with amp.autocast():
                        # output
                        output, a_ij = net(input_ids, start_index, end_index, span_mask)
#                         output = output[0]

                        # loss
                        loss = loss_fn(output, target)
                        train_loss += loss


                    scaler.scale(loss).backward()
                    scaler.step(optimizer)
                    scaler.update()

                else:
                    # output
                    output = net(ids, mask)

                    # loss
                    loss = loss_fn(output, target)
                    train_loss += loss

                    # update
                    loss.backward()
                    optimizer.step()


                # for calculate f1 score
                target_lst.extend(target.detach().cpu().numpy())
                pred_lst.extend(output.argmax(dim=1).tolist())


                if scheduler is not None:
                    scheduler.step() 
            train_loss = train_loss / len(trainloader)
            train_score = f1_score(y_true=target_lst, y_pred=pred_lst, average='macro')
            train_acc = accuracy_score(y_true=target_lst, y_pred=pred_lst)

            # validation
            valid_loss, valid_score, valid_acc, _ = do_valid(net, validloader)


            if valid_acc > best_score:
                best_score = valid_acc
                best_epoch = epoch
                best_loss = valid_loss

                torch.save(net.state_dict(), out_dir + f'/{folds}f_explain.pth')
                print('best model saved'+'\n')
            else:
                early_stopping += 1

            # Early Stopping
            if early_stopping == args.patience:
                break


            print(f'train loss : {train_loss:.4f}, train f1 score : {train_score : .4f}, train acc : {train_acc : .4f}'+'\n')
            print(f'valid loss : {valid_loss:.4f}, valid f1 score : {valid_score : .4f}, valid acc : {valid_acc : .4f}'+'\n')


        print(f'best valid loss : {best_loss : .4f}'+'\n')
        print(f'best epoch : {best_epoch }'+'\n')
        print(f'best accuracy : {best_score : .4f}'+'\n')
        
def run_predict(model_path):
    ## dataset ------------------------------------
    # load
        
    train, test = load_data()
    print('test load')

    test_dataset = SNLIDataset_test(data = test, bert_path=args.pt, max_length=args.max_len)
    testloader = DataLoader(dataset=test_dataset, batch_size=args.batch_size, 
                             num_workers=8, shuffle=False, pin_memory=True, collate_fn=partial(collate_to_max_length_test, fill_values=[1, 0, 0]))
    print('set testloader')
    ## net ----------------------------------------
    scaler = amp.GradScaler()
    net = ExplainableModel(args.pt)
        
    net.to(device)
    
    if len(args.gpu)>1:
        net = nn.DataParallel(net)

    f = torch.load(model_path)
    net.load_state_dict(f, strict=True)  # True
    print('load saved models')
    # ------------------------
    # validation
    preds, logit = do_predict(net, testloader) #outputs
           
    print('complete predict')
    
    return preds, np.array(logit)
     

In [18]:
"""5fold 전용"""
if __name__ == '__main__':

    for pt, max_len in zip(['xlm-roberta-large', 'klue/roberta-large'],[193, 193]):
        
        args.max_len = max_len
        args.pt = pt
        args.exp_name = str(args.pt) + '_' + str(args.max_len)
    
        for i in [0,1,2,3,4]: # 5fold
            run_train(folds=i)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0
0


22398it [00:00, 839400.46it/s]
5600it [00:00, 808067.65it/s]
Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


-------------------
1epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.18it/s]
100%|██████████| 175/175 [00:13<00:00, 12.95it/s]


best model saved

train loss : 0.4921, train f1 score :  0.8065, train acc :  0.8075

valid loss : 0.3948, valid f1 score :  0.8557, valid acc :  0.8564

-------------------
2epoch start
-------------------



100%|██████████| 700/700 [02:48<00:00,  4.16it/s]
100%|██████████| 175/175 [00:13<00:00, 12.82it/s]


best model saved

train loss : 0.2419, train f1 score :  0.9184, train acc :  0.9187

valid loss : 0.3756, valid f1 score :  0.8639, valid acc :  0.8636

-------------------
3epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.18it/s]
100%|██████████| 175/175 [00:13<00:00, 12.80it/s]


best model saved

train loss : 0.1288, train f1 score :  0.9580, train acc :  0.9581

valid loss : 0.3935, valid f1 score :  0.8743, valid acc :  0.8754

-------------------
4epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.19it/s]
100%|██████████| 175/175 [00:13<00:00, 12.69it/s]


train loss : 0.0772, train f1 score :  0.9760, train acc :  0.9761

valid loss : 0.5117, valid f1 score :  0.8624, valid acc :  0.8630

-------------------
5epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.17it/s]
100%|██████████| 175/175 [00:13<00:00, 12.90it/s]


train loss : 0.0537, train f1 score :  0.9838, train acc :  0.9839

valid loss : 0.5314, valid f1 score :  0.8710, valid acc :  0.8718

-------------------
6epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.18it/s]
100%|██████████| 175/175 [00:13<00:00, 12.84it/s]


train loss : 0.0400, train f1 score :  0.9879, train acc :  0.9879

valid loss : 0.5946, valid f1 score :  0.8677, valid acc :  0.8682

-------------------
7epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.17it/s]
100%|██████████| 175/175 [00:13<00:00, 12.76it/s]


train loss : 0.0354, train f1 score :  0.9897, train acc :  0.9897

valid loss : 0.5624, valid f1 score :  0.8698, valid acc :  0.8709

-------------------
8epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.19it/s]
100%|██████████| 175/175 [00:13<00:00, 13.01it/s]


train loss : 0.0256, train f1 score :  0.9928, train acc :  0.9928

valid loss : 0.5750, valid f1 score :  0.8698, valid acc :  0.8716

-------------------
9epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.18it/s]
100%|██████████| 175/175 [00:13<00:00, 12.90it/s]


train loss : 0.0216, train f1 score :  0.9940, train acc :  0.9940

valid loss : 0.6234, valid f1 score :  0.8732, valid acc :  0.8738

-------------------
10epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.18it/s]
100%|██████████| 175/175 [00:13<00:00, 13.07it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


best valid loss :  0.3935

best epoch : 3

best accuracy :  0.8754

1
1 fold pass

2
2 fold pass

3
3 fold pass

4
4 fold pass

0
0 fold pass

1
1


22398it [00:00, 849879.87it/s]
5600it [00:00, 828767.59it/s]
Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


-------------------
1epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.18it/s]
100%|██████████| 175/175 [00:13<00:00, 12.72it/s]


best model saved

train loss : 0.4870, train f1 score :  0.8070, train acc :  0.8078

valid loss : 0.3667, valid f1 score :  0.8667, valid acc :  0.8670

-------------------
2epoch start
-------------------



100%|██████████| 700/700 [02:48<00:00,  4.16it/s]
100%|██████████| 175/175 [00:13<00:00, 12.74it/s]


best model saved

train loss : 0.2252, train f1 score :  0.9228, train acc :  0.9232

valid loss : 0.3915, valid f1 score :  0.8738, valid acc :  0.8741

-------------------
3epoch start
-------------------



100%|██████████| 700/700 [02:48<00:00,  4.15it/s]
100%|██████████| 175/175 [00:13<00:00, 13.13it/s]


best model saved

train loss : 0.1160, train f1 score :  0.9623, train acc :  0.9624

valid loss : 0.3725, valid f1 score :  0.8756, valid acc :  0.8761

-------------------
4epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.17it/s]
100%|██████████| 175/175 [00:13<00:00, 12.80it/s]


train loss : 0.0711, train f1 score :  0.9773, train acc :  0.9774

valid loss : 0.5065, valid f1 score :  0.8600, valid acc :  0.8609

-------------------
5epoch start
-------------------



100%|██████████| 700/700 [02:48<00:00,  4.16it/s]
100%|██████████| 175/175 [00:13<00:00, 12.79it/s]


best model saved

train loss : 0.0532, train f1 score :  0.9832, train acc :  0.9832

valid loss : 0.4937, valid f1 score :  0.8778, valid acc :  0.8780

-------------------
6epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.17it/s]
100%|██████████| 175/175 [00:13<00:00, 12.95it/s]


train loss : 0.0391, train f1 score :  0.9885, train acc :  0.9885

valid loss : 0.5306, valid f1 score :  0.8755, valid acc :  0.8761

-------------------
7epoch start
-------------------



100%|██████████| 700/700 [02:48<00:00,  4.16it/s]
100%|██████████| 175/175 [00:13<00:00, 12.81it/s]


train loss : 0.0294, train f1 score :  0.9905, train acc :  0.9905

valid loss : 0.5278, valid f1 score :  0.8738, valid acc :  0.8743

-------------------
8epoch start
-------------------



100%|██████████| 700/700 [02:48<00:00,  4.16it/s]
100%|██████████| 175/175 [00:13<00:00, 13.05it/s]


train loss : 0.0280, train f1 score :  0.9917, train acc :  0.9917

valid loss : 0.5727, valid f1 score :  0.8773, valid acc :  0.8773

-------------------
9epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.17it/s]
100%|██████████| 175/175 [00:13<00:00, 12.70it/s]


train loss : 0.0220, train f1 score :  0.9934, train acc :  0.9934

valid loss : 0.5883, valid f1 score :  0.8753, valid acc :  0.8759

-------------------
10epoch start
-------------------



100%|██████████| 700/700 [02:48<00:00,  4.16it/s]
100%|██████████| 175/175 [00:13<00:00, 12.92it/s]


best model saved

train loss : 0.0208, train f1 score :  0.9941, train acc :  0.9942

valid loss : 0.5151, valid f1 score :  0.8801, valid acc :  0.8807

-------------------
11epoch start
-------------------



100%|██████████| 700/700 [02:48<00:00,  4.14it/s]
100%|██████████| 175/175 [00:13<00:00, 13.07it/s]


train loss : 0.0169, train f1 score :  0.9954, train acc :  0.9954

valid loss : 0.6148, valid f1 score :  0.8764, valid acc :  0.8773

-------------------
12epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.17it/s]
100%|██████████| 175/175 [00:13<00:00, 13.08it/s]


best valid loss :  0.5151

best epoch : 10

best accuracy :  0.8807

2
2 fold pass

3
3 fold pass

4
4 fold pass



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0
0 fold pass

1
1 fold pass

2
2


22398it [00:00, 826782.79it/s]
5600it [00:00, 819743.22it/s]
Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


-------------------
1epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.19it/s]
100%|██████████| 175/175 [00:13<00:00, 12.69it/s]


best model saved

train loss : 0.4883, train f1 score :  0.8055, train acc :  0.8065

valid loss : 0.3705, valid f1 score :  0.8660, valid acc :  0.8668

-------------------
2epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.17it/s]
100%|██████████| 175/175 [00:13<00:00, 12.91it/s]


best model saved

train loss : 0.2328, train f1 score :  0.9216, train acc :  0.9219

valid loss : 0.3704, valid f1 score :  0.8669, valid acc :  0.8671

-------------------
3epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.19it/s]
100%|██████████| 175/175 [00:13<00:00, 12.65it/s]


best model saved

train loss : 0.1170, train f1 score :  0.9639, train acc :  0.9641

valid loss : 0.3659, valid f1 score :  0.8834, valid acc :  0.8838

-------------------
4epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.18it/s]
100%|██████████| 175/175 [00:13<00:00, 12.69it/s]


train loss : 0.0692, train f1 score :  0.9785, train acc :  0.9785

valid loss : 0.4068, valid f1 score :  0.8770, valid acc :  0.8777

-------------------
5epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.18it/s]
100%|██████████| 175/175 [00:13<00:00, 12.60it/s]


train loss : 0.0505, train f1 score :  0.9850, train acc :  0.9851

valid loss : 0.5320, valid f1 score :  0.8719, valid acc :  0.8723

-------------------
6epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.18it/s]
100%|██████████| 175/175 [00:13<00:00, 12.63it/s]


train loss : 0.0390, train f1 score :  0.9881, train acc :  0.9882

valid loss : 0.5952, valid f1 score :  0.8739, valid acc :  0.8741

-------------------
7epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.18it/s]
100%|██████████| 175/175 [00:13<00:00, 12.77it/s]


train loss : 0.0331, train f1 score :  0.9905, train acc :  0.9905

valid loss : 0.6447, valid f1 score :  0.8684, valid acc :  0.8688

-------------------
8epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.19it/s]
100%|██████████| 175/175 [00:13<00:00, 12.66it/s]


train loss : 0.0289, train f1 score :  0.9921, train acc :  0.9921

valid loss : 0.5374, valid f1 score :  0.8793, valid acc :  0.8796

-------------------
9epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.18it/s]
100%|██████████| 175/175 [00:13<00:00, 12.58it/s]


train loss : 0.0258, train f1 score :  0.9926, train acc :  0.9926

valid loss : 0.5821, valid f1 score :  0.8817, valid acc :  0.8821

-------------------
10epoch start
-------------------



100%|██████████| 700/700 [02:48<00:00,  4.16it/s]
100%|██████████| 175/175 [00:13<00:00, 12.77it/s]


best valid loss :  0.3659

best epoch : 3

best accuracy :  0.8838

3
3 fold pass

4
4 fold pass



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0
0 fold pass

1
1 fold pass

2
2 fold pass

3
3


22399it [00:00, 71351.81it/s]
5599it [00:00, 807589.95it/s]
Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


-------------------
1epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.19it/s]
100%|██████████| 175/175 [00:13<00:00, 12.74it/s]


best model saved

train loss : 0.5009, train f1 score :  0.7960, train acc :  0.7970

valid loss : 0.4167, valid f1 score :  0.8395, valid acc :  0.8398

-------------------
2epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.17it/s]
100%|██████████| 175/175 [00:13<00:00, 12.90it/s]


best model saved

train loss : 0.2457, train f1 score :  0.9166, train acc :  0.9169

valid loss : 0.3390, valid f1 score :  0.8756, valid acc :  0.8759

-------------------
3epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.17it/s]
100%|██████████| 175/175 [00:13<00:00, 12.85it/s]


train loss : 0.1127, train f1 score :  0.9646, train acc :  0.9647

valid loss : 0.4051, valid f1 score :  0.8749, valid acc :  0.8755

-------------------
4epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.18it/s]
100%|██████████| 175/175 [00:13<00:00, 12.63it/s]


train loss : 0.0722, train f1 score :  0.9773, train acc :  0.9774

valid loss : 0.4811, valid f1 score :  0.8755, valid acc :  0.8757

-------------------
5epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.18it/s]
100%|██████████| 175/175 [00:13<00:00, 12.75it/s]


train loss : 0.0483, train f1 score :  0.9864, train acc :  0.9865

valid loss : 0.5039, valid f1 score :  0.8664, valid acc :  0.8664

-------------------
6epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.18it/s]
100%|██████████| 175/175 [00:13<00:00, 12.66it/s]


train loss : 0.0424, train f1 score :  0.9865, train acc :  0.9865

valid loss : 0.5252, valid f1 score :  0.8717, valid acc :  0.8721

-------------------
7epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.18it/s]
100%|██████████| 175/175 [00:13<00:00, 12.62it/s]


train loss : 0.0321, train f1 score :  0.9906, train acc :  0.9907

valid loss : 0.6209, valid f1 score :  0.8660, valid acc :  0.8664

-------------------
8epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.18it/s]
100%|██████████| 175/175 [00:13<00:00, 12.57it/s]


train loss : 0.0243, train f1 score :  0.9925, train acc :  0.9925

valid loss : 0.7663, valid f1 score :  0.8657, valid acc :  0.8666

-------------------
9epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.17it/s]
100%|██████████| 175/175 [00:14<00:00, 12.50it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


best valid loss :  0.3390

best epoch : 2

best accuracy :  0.8759

4
4 fold pass

0
0 fold pass

1
1 fold pass

2
2 fold pass

3
3 fold pass

4
4


22399it [00:00, 817851.31it/s]
5599it [00:00, 780040.79it/s]
Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


-------------------
1epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.18it/s]
100%|██████████| 175/175 [00:13<00:00, 12.56it/s]


best model saved

train loss : 0.5324, train f1 score :  0.7780, train acc :  0.7787

valid loss : 0.4274, valid f1 score :  0.8366, valid acc :  0.8378

-------------------
2epoch start
-------------------



100%|██████████| 700/700 [02:48<00:00,  4.14it/s]
100%|██████████| 175/175 [00:13<00:00, 12.61it/s]


best model saved

train loss : 0.2506, train f1 score :  0.9116, train acc :  0.9120

valid loss : 0.3666, valid f1 score :  0.8747, valid acc :  0.8753

-------------------
3epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.18it/s]
100%|██████████| 175/175 [00:13<00:00, 12.62it/s]


best model saved

train loss : 0.1234, train f1 score :  0.9584, train acc :  0.9586

valid loss : 0.3756, valid f1 score :  0.8795, valid acc :  0.8798

-------------------
4epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.18it/s]
100%|██████████| 175/175 [00:14<00:00, 12.36it/s]


train loss : 0.0749, train f1 score :  0.9772, train acc :  0.9773

valid loss : 0.4289, valid f1 score :  0.8726, valid acc :  0.8730

-------------------
5epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.18it/s]
100%|██████████| 175/175 [00:13<00:00, 12.58it/s]


train loss : 0.0586, train f1 score :  0.9820, train acc :  0.9820

valid loss : 0.4856, valid f1 score :  0.8614, valid acc :  0.8621

-------------------
6epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.19it/s]
100%|██████████| 175/175 [00:13<00:00, 12.73it/s]


train loss : 0.0418, train f1 score :  0.9869, train acc :  0.9869

valid loss : 0.5610, valid f1 score :  0.8680, valid acc :  0.8687

-------------------
7epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.18it/s]
100%|██████████| 175/175 [00:14<00:00, 12.35it/s]


train loss : 0.0356, train f1 score :  0.9894, train acc :  0.9895

valid loss : 0.5624, valid f1 score :  0.8742, valid acc :  0.8748

-------------------
8epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.18it/s]
100%|██████████| 175/175 [00:13<00:00, 12.60it/s]


train loss : 0.0279, train f1 score :  0.9914, train acc :  0.9915

valid loss : 0.4990, valid f1 score :  0.8715, valid acc :  0.8723

-------------------
9epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.19it/s]
100%|██████████| 175/175 [00:13<00:00, 12.59it/s]


train loss : 0.0262, train f1 score :  0.9923, train acc :  0.9923

valid loss : 0.5636, valid f1 score :  0.8716, valid acc :  0.8725

-------------------
10epoch start
-------------------



100%|██████████| 700/700 [02:47<00:00,  4.18it/s]
100%|██████████| 175/175 [00:14<00:00, 12.42it/s]


best valid loss :  0.3756

best epoch : 3

best accuracy :  0.8798



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0
0


22398it [00:00, 780499.33it/s]
5600it [00:00, 769395.39it/s]
Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably 

-------------------
1epoch start
-------------------



100%|██████████| 700/700 [02:18<00:00,  5.07it/s]
100%|██████████| 175/175 [00:12<00:00, 13.57it/s]


best model saved

train loss : 0.4262, train f1 score :  0.8328, train acc :  0.8337

valid loss : 0.2974, valid f1 score :  0.8919, valid acc :  0.8923

-------------------
2epoch start
-------------------



100%|██████████| 700/700 [02:18<00:00,  5.07it/s]
100%|██████████| 175/175 [00:12<00:00, 13.72it/s]


train loss : 0.1497, train f1 score :  0.9498, train acc :  0.9500

valid loss : 0.3429, valid f1 score :  0.8846, valid acc :  0.8868

-------------------
3epoch start
-------------------



100%|██████████| 700/700 [02:18<00:00,  5.06it/s]
100%|██████████| 175/175 [00:12<00:00, 13.75it/s]


train loss : 0.0685, train f1 score :  0.9787, train acc :  0.9788

valid loss : 0.3434, valid f1 score :  0.8872, valid acc :  0.8880

-------------------
4epoch start
-------------------



100%|██████████| 700/700 [02:19<00:00,  5.01it/s]
100%|██████████| 175/175 [00:12<00:00, 13.63it/s]


train loss : 0.0422, train f1 score :  0.9866, train acc :  0.9866

valid loss : 0.4402, valid f1 score :  0.8863, valid acc :  0.8866

-------------------
5epoch start
-------------------



100%|██████████| 700/700 [02:18<00:00,  5.07it/s]
100%|██████████| 175/175 [00:12<00:00, 13.77it/s]


best model saved

train loss : 0.0316, train f1 score :  0.9904, train acc :  0.9904

valid loss : 0.4461, valid f1 score :  0.8949, valid acc :  0.8955

-------------------
6epoch start
-------------------



100%|██████████| 700/700 [02:18<00:00,  5.05it/s]
100%|██████████| 175/175 [00:12<00:00, 13.87it/s]


train loss : 0.0257, train f1 score :  0.9927, train acc :  0.9928

valid loss : 0.4325, valid f1 score :  0.8920, valid acc :  0.8930

-------------------
7epoch start
-------------------



100%|██████████| 700/700 [02:18<00:00,  5.05it/s]
100%|██████████| 175/175 [00:12<00:00, 13.70it/s]


best model saved

train loss : 0.0230, train f1 score :  0.9935, train acc :  0.9935

valid loss : 0.5034, valid f1 score :  0.8972, valid acc :  0.8979

-------------------
8epoch start
-------------------



100%|██████████| 700/700 [02:18<00:00,  5.04it/s]
100%|██████████| 175/175 [00:12<00:00, 13.63it/s]


train loss : 0.0164, train f1 score :  0.9957, train acc :  0.9957

valid loss : 0.5028, valid f1 score :  0.8968, valid acc :  0.8977

-------------------
9epoch start
-------------------



100%|██████████| 700/700 [02:18<00:00,  5.05it/s]
100%|██████████| 175/175 [00:12<00:00, 13.61it/s]


train loss : 0.0149, train f1 score :  0.9960, train acc :  0.9961

valid loss : 0.5052, valid f1 score :  0.8912, valid acc :  0.8920

-------------------
10epoch start
-------------------



100%|██████████| 700/700 [02:18<00:00,  5.06it/s]
100%|██████████| 175/175 [00:12<00:00, 13.51it/s]


best valid loss :  0.5034

best epoch : 7

best accuracy :  0.8979

1
1 fold pass

2
2 fold pass

3
3 fold pass

4
4 fold pass



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0
0 fold pass

1
1


22398it [00:00, 847824.31it/s]
5600it [00:00, 787847.66it/s]
Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably 

-------------------
1epoch start
-------------------



100%|██████████| 700/700 [02:18<00:00,  5.06it/s]
100%|██████████| 175/175 [00:12<00:00, 13.55it/s]


best model saved

train loss : 0.4190, train f1 score :  0.8409, train acc :  0.8416

valid loss : 0.2897, valid f1 score :  0.8979, valid acc :  0.8980

-------------------
2epoch start
-------------------



100%|██████████| 700/700 [02:18<00:00,  5.04it/s]
100%|██████████| 175/175 [00:12<00:00, 13.69it/s]


train loss : 0.1484, train f1 score :  0.9507, train acc :  0.9509

valid loss : 0.3485, valid f1 score :  0.8910, valid acc :  0.8921

-------------------
3epoch start
-------------------



100%|██████████| 700/700 [02:18<00:00,  5.05it/s]
100%|██████████| 175/175 [00:12<00:00, 13.72it/s]


best model saved

train loss : 0.0624, train f1 score :  0.9810, train acc :  0.9811

valid loss : 0.3382, valid f1 score :  0.9013, valid acc :  0.9016

-------------------
4epoch start
-------------------



100%|██████████| 700/700 [02:18<00:00,  5.04it/s]
100%|██████████| 175/175 [00:12<00:00, 13.82it/s]


train loss : 0.0431, train f1 score :  0.9868, train acc :  0.9869

valid loss : 0.3989, valid f1 score :  0.8967, valid acc :  0.8970

-------------------
5epoch start
-------------------



100%|██████████| 700/700 [02:19<00:00,  5.03it/s]
100%|██████████| 175/175 [00:13<00:00, 13.39it/s]


train loss : 0.0313, train f1 score :  0.9904, train acc :  0.9904

valid loss : 0.4242, valid f1 score :  0.8968, valid acc :  0.8975

-------------------
6epoch start
-------------------



100%|██████████| 700/700 [02:18<00:00,  5.04it/s]
100%|██████████| 175/175 [00:12<00:00, 13.53it/s]


train loss : 0.0240, train f1 score :  0.9926, train acc :  0.9926

valid loss : 0.4525, valid f1 score :  0.8984, valid acc :  0.8989

-------------------
7epoch start
-------------------



100%|██████████| 700/700 [02:18<00:00,  5.05it/s]
100%|██████████| 175/175 [00:12<00:00, 13.77it/s]


train loss : 0.0218, train f1 score :  0.9938, train acc :  0.9938

valid loss : 0.5779, valid f1 score :  0.8964, valid acc :  0.8968

-------------------
8epoch start
-------------------



100%|██████████| 700/700 [02:17<00:00,  5.08it/s]
100%|██████████| 175/175 [00:12<00:00, 13.64it/s]


train loss : 0.0169, train f1 score :  0.9951, train acc :  0.9951

valid loss : 0.5213, valid f1 score :  0.8975, valid acc :  0.8982

-------------------
9epoch start
-------------------



100%|██████████| 700/700 [02:18<00:00,  5.07it/s]
100%|██████████| 175/175 [00:12<00:00, 13.56it/s]


best valid loss :  0.3382

best epoch : 3

best accuracy :  0.9016

2
2 fold pass

3
3 fold pass

4
4 fold pass



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0
0 fold pass

1
1 fold pass

2
2


22398it [00:00, 824996.67it/s]
5600it [00:00, 772330.08it/s]
Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably 

-------------------
1epoch start
-------------------



100%|██████████| 700/700 [02:18<00:00,  5.05it/s]
100%|██████████| 175/175 [00:12<00:00, 13.64it/s]


best model saved

train loss : 0.4135, train f1 score :  0.8367, train acc :  0.8375

valid loss : 0.2828, valid f1 score :  0.8977, valid acc :  0.8979

-------------------
2epoch start
-------------------



100%|██████████| 700/700 [02:18<00:00,  5.07it/s]
100%|██████████| 175/175 [00:12<00:00, 13.61it/s]


train loss : 0.1408, train f1 score :  0.9519, train acc :  0.9522

valid loss : 0.2929, valid f1 score :  0.8949, valid acc :  0.8952

-------------------
3epoch start
-------------------



100%|██████████| 700/700 [02:18<00:00,  5.06it/s]
100%|██████████| 175/175 [00:12<00:00, 13.70it/s]


train loss : 0.0660, train f1 score :  0.9794, train acc :  0.9795

valid loss : 0.3238, valid f1 score :  0.8884, valid acc :  0.8891

-------------------
4epoch start
-------------------



100%|██████████| 700/700 [02:17<00:00,  5.07it/s]
100%|██████████| 175/175 [00:12<00:00, 13.65it/s]


train loss : 0.0393, train f1 score :  0.9887, train acc :  0.9887

valid loss : 0.4326, valid f1 score :  0.8968, valid acc :  0.8973

-------------------
5epoch start
-------------------



100%|██████████| 700/700 [02:18<00:00,  5.06it/s]
100%|██████████| 175/175 [00:12<00:00, 13.72it/s]


train loss : 0.0343, train f1 score :  0.9910, train acc :  0.9910

valid loss : 0.4531, valid f1 score :  0.8964, valid acc :  0.8968

-------------------
6epoch start
-------------------



100%|██████████| 700/700 [02:18<00:00,  5.07it/s]
100%|██████████| 175/175 [00:12<00:00, 13.74it/s]


train loss : 0.0276, train f1 score :  0.9921, train acc :  0.9921

valid loss : 0.4932, valid f1 score :  0.8919, valid acc :  0.8929

-------------------
7epoch start
-------------------



100%|██████████| 700/700 [02:18<00:00,  5.06it/s]
100%|██████████| 175/175 [00:13<00:00, 13.43it/s]


train loss : 0.0223, train f1 score :  0.9940, train acc :  0.9940

valid loss : 0.5443, valid f1 score :  0.8859, valid acc :  0.8870

-------------------
8epoch start
-------------------



100%|██████████| 700/700 [02:18<00:00,  5.07it/s]
100%|██████████| 175/175 [00:12<00:00, 13.73it/s]


best valid loss :  0.2828

best epoch : 1

best accuracy :  0.8979

3
3 fold pass

4
4 fold pass



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0
0 fold pass

1
1 fold pass

2
2 fold pass

3
3


22399it [00:00, 820529.93it/s]
5599it [00:00, 794099.62it/s]
Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably 

-------------------
1epoch start
-------------------



100%|██████████| 700/700 [02:17<00:00,  5.09it/s]
100%|██████████| 175/175 [00:13<00:00, 13.35it/s]


best model saved

train loss : 0.4300, train f1 score :  0.8353, train acc :  0.8360

valid loss : 0.3215, valid f1 score :  0.8870, valid acc :  0.8868

-------------------
2epoch start
-------------------



100%|██████████| 700/700 [02:17<00:00,  5.09it/s]
100%|██████████| 175/175 [00:12<00:00, 13.65it/s]


train loss : 0.1557, train f1 score :  0.9482, train acc :  0.9484

valid loss : 0.3456, valid f1 score :  0.8814, valid acc :  0.8812

-------------------
3epoch start
-------------------



100%|██████████| 700/700 [02:18<00:00,  5.06it/s]
100%|██████████| 175/175 [00:12<00:00, 13.47it/s]


best model saved

train loss : 0.0713, train f1 score :  0.9790, train acc :  0.9791

valid loss : 0.3607, valid f1 score :  0.8915, valid acc :  0.8923

-------------------
4epoch start
-------------------



100%|██████████| 700/700 [02:17<00:00,  5.08it/s]
100%|██████████| 175/175 [00:12<00:00, 13.64it/s]


best model saved

train loss : 0.0366, train f1 score :  0.9889, train acc :  0.9889

valid loss : 0.4245, valid f1 score :  0.8921, valid acc :  0.8927

-------------------
5epoch start
-------------------



100%|██████████| 700/700 [02:17<00:00,  5.08it/s]
100%|██████████| 175/175 [00:12<00:00, 13.78it/s]


best model saved

train loss : 0.0328, train f1 score :  0.9905, train acc :  0.9906

valid loss : 0.4025, valid f1 score :  0.8947, valid acc :  0.8952

-------------------
6epoch start
-------------------



100%|██████████| 700/700 [02:17<00:00,  5.08it/s]
100%|██████████| 175/175 [00:13<00:00, 13.43it/s]


train loss : 0.0247, train f1 score :  0.9923, train acc :  0.9923

valid loss : 0.4970, valid f1 score :  0.8922, valid acc :  0.8927

-------------------
7epoch start
-------------------



100%|██████████| 700/700 [02:17<00:00,  5.08it/s]
100%|██████████| 175/175 [00:12<00:00, 13.53it/s]


train loss : 0.0210, train f1 score :  0.9942, train acc :  0.9942

valid loss : 0.5531, valid f1 score :  0.8947, valid acc :  0.8950

-------------------
8epoch start
-------------------



100%|██████████| 700/700 [02:18<00:00,  5.07it/s]
100%|██████████| 175/175 [00:12<00:00, 13.62it/s]


train loss : 0.0163, train f1 score :  0.9953, train acc :  0.9953

valid loss : 0.5573, valid f1 score :  0.8910, valid acc :  0.8923

-------------------
9epoch start
-------------------



100%|██████████| 700/700 [02:17<00:00,  5.08it/s]
100%|██████████| 175/175 [00:13<00:00, 13.44it/s]


train loss : 0.0142, train f1 score :  0.9963, train acc :  0.9963

valid loss : 0.5885, valid f1 score :  0.8865, valid acc :  0.8869

-------------------
10epoch start
-------------------



100%|██████████| 700/700 [02:18<00:00,  5.07it/s]
100%|██████████| 175/175 [00:12<00:00, 13.62it/s]


train loss : 0.0139, train f1 score :  0.9968, train acc :  0.9968

valid loss : 0.5319, valid f1 score :  0.8919, valid acc :  0.8925

-------------------
11epoch start
-------------------



100%|██████████| 700/700 [02:17<00:00,  5.07it/s]
100%|██████████| 175/175 [00:13<00:00, 13.42it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


best valid loss :  0.4025

best epoch : 5

best accuracy :  0.8952

4
4 fold pass

0
0 fold pass

1
1 fold pass

2
2 fold pass

3
3 fold pass

4
4


22399it [00:00, 802715.49it/s]
5599it [00:00, 769737.06it/s]
Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably 

-------------------
1epoch start
-------------------



100%|██████████| 700/700 [02:18<00:00,  5.07it/s]
100%|██████████| 175/175 [00:12<00:00, 13.70it/s]


best model saved

train loss : 0.4091, train f1 score :  0.8427, train acc :  0.8434

valid loss : 0.2793, valid f1 score :  0.8983, valid acc :  0.8993

-------------------
2epoch start
-------------------



100%|██████████| 700/700 [02:18<00:00,  5.06it/s]
100%|██████████| 175/175 [00:12<00:00, 13.56it/s]


train loss : 0.1481, train f1 score :  0.9506, train acc :  0.9508

valid loss : 0.3065, valid f1 score :  0.8925, valid acc :  0.8934

-------------------
3epoch start
-------------------



100%|██████████| 700/700 [02:18<00:00,  5.06it/s]
100%|██████████| 175/175 [00:12<00:00, 13.51it/s]


train loss : 0.0652, train f1 score :  0.9804, train acc :  0.9804

valid loss : 0.3794, valid f1 score :  0.8900, valid acc :  0.8911

-------------------
4epoch start
-------------------



100%|██████████| 700/700 [02:18<00:00,  5.05it/s]
100%|██████████| 175/175 [00:12<00:00, 13.60it/s]


best model saved

train loss : 0.0433, train f1 score :  0.9870, train acc :  0.9871

valid loss : 0.3407, valid f1 score :  0.8998, valid acc :  0.9000

-------------------
5epoch start
-------------------



100%|██████████| 700/700 [02:17<00:00,  5.07it/s]
100%|██████████| 175/175 [00:12<00:00, 13.52it/s]


train loss : 0.0299, train f1 score :  0.9908, train acc :  0.9908

valid loss : 0.4859, valid f1 score :  0.8989, valid acc :  0.8996

-------------------
6epoch start
-------------------



100%|██████████| 700/700 [02:18<00:00,  5.06it/s]
100%|██████████| 175/175 [00:13<00:00, 13.45it/s]


train loss : 0.0267, train f1 score :  0.9922, train acc :  0.9923

valid loss : 0.4226, valid f1 score :  0.8905, valid acc :  0.8918

-------------------
7epoch start
-------------------



100%|██████████| 700/700 [02:18<00:00,  5.06it/s]
100%|██████████| 175/175 [00:12<00:00, 13.75it/s]


train loss : 0.0179, train f1 score :  0.9949, train acc :  0.9949

valid loss : 0.4351, valid f1 score :  0.8951, valid acc :  0.8952

-------------------
8epoch start
-------------------



100%|██████████| 700/700 [02:18<00:00,  5.05it/s]
100%|██████████| 175/175 [00:12<00:00, 13.46it/s]


train loss : 0.0154, train f1 score :  0.9960, train acc :  0.9960

valid loss : 0.4541, valid f1 score :  0.8926, valid acc :  0.8928

-------------------
9epoch start
-------------------



100%|██████████| 700/700 [02:18<00:00,  5.07it/s]
100%|██████████| 175/175 [00:12<00:00, 13.62it/s]


best valid loss :  0.3407

best epoch : 4

best accuracy :  0.9000



In [26]:
def ensemble():
    final_logit=0
    
    args.pt = 'klue/roberta-large'
    _, logit1 = run_predict("./saved_models/fold5/klue/roberta-large_193/0f_explain.pth")
    _, logit2 = run_predict("./saved_models/fold5/klue/roberta-large_193/1f_explain.pth")
    _, logit3 = run_predict("./saved_models/fold5/klue/roberta-large_193/2f_explain.pth")
    _, logit4 = run_predict("./saved_models/fold5/klue/roberta-large_193/3f_explain.pth")
    _, logit5 = run_predict("./saved_models/fold5/klue/roberta-large_193/4f_explain.pth")
    final_logit += (logit1+logit2+logit3+logit4+logit5)/5
    
    args.pt = 'xlm-roberta-large'
    _, logit1 = run_predict("./saved_models/fold5/xlm-roberta-large_193/0f_explain.pth")
    _, logit2 = run_predict("./saved_models/fold5/xlm-roberta-large_193/1f_explain.pth")
    _, logit3 = run_predict("./saved_models/fold5/xlm-roberta-large_193/2f_explain.pth")
    _, logit4 = run_predict("./saved_models/fold5/xlm-roberta-large_193/3f_explain.pth")
    _, logit5 = run_predict("./saved_models/fold5/xlm-roberta-large_193/4f_explain.pth")
    final_logit += (logit1+logit2+logit3+logit4+logit5)/5
    
    
    return final_logit


In [27]:
final_logit = ensemble()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


test load


1666it [00:00, 804943.03it/s]


set testloader


Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it f

load saved models


100%|██████████| 53/53 [00:04<00:00, 10.62it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


complete predict
test load


1666it [00:00, 821793.54it/s]


set testloader


Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it f

load saved models


100%|██████████| 53/53 [00:04<00:00, 10.81it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


complete predict
test load


1666it [00:00, 779356.51it/s]


set testloader


Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it f

load saved models


100%|██████████| 53/53 [00:05<00:00, 10.28it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


complete predict
test load


1666it [00:00, 823633.95it/s]


set testloader


Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it f

load saved models


100%|██████████| 53/53 [00:05<00:00, 10.36it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


complete predict
test load


1666it [00:00, 756409.45it/s]


set testloader


Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it f

load saved models


100%|██████████| 53/53 [00:05<00:00, 10.57it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


complete predict
test load


1666it [00:00, 822373.83it/s]


set testloader


Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


load saved models


100%|██████████| 53/53 [00:05<00:00,  9.64it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


complete predict
test load


1666it [00:00, 829303.40it/s]


set testloader


Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


load saved models


100%|██████████| 53/53 [00:05<00:00,  9.71it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


complete predict
test load


1666it [00:00, 756819.07it/s]


set testloader


Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


load saved models


100%|██████████| 53/53 [00:05<00:00,  9.59it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


complete predict
test load


1666it [00:00, 673124.98it/s]


set testloader


Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


load saved models


100%|██████████| 53/53 [00:05<00:00,  9.95it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


complete predict
test load


1666it [00:00, 701154.97it/s]


set testloader


Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


load saved models


100%|██████████| 53/53 [00:05<00:00,  9.29it/s]

complete predict





In [28]:
final_logit

array([[-6.13554687, 10.23984375, -3.91015625],
       [ 1.14471436, -5.20625   ,  4.35      ],
       [ 4.13775635, -3.88855591, -0.13886719],
       ...,
       [-3.69506836, -4.94394531,  9.48632812],
       [-3.22539063, -5.56796875,  9.58984375],
       [-1.17011719, -0.66969128,  1.91318359]])

In [29]:
np.save('./explain_npy', final_logit)

In [33]:
robert_logit = np.load('./robert_npy.npy')

In [34]:
final = (final_logit + robert_logit) / 2

In [35]:
sub = pd.read_csv("./data/sample_submission.csv")
out = [list(label_dict.keys())[_] for _ in final.argmax(1)]

sub['label'] = out
print(sub)
# preds
sub.to_csv(f'./submission/final_submission_xlm_klu_roberta-large_194_explain_193_esemble_all.csv', index=False)


      index          label
0         0  contradiction
1         1        neutral
2         2     entailment
3         3  contradiction
4         4  contradiction
...     ...            ...
1661   1661        neutral
1662   1662     entailment
1663   1663        neutral
1664   1664        neutral
1665   1665        neutral

[1666 rows x 2 columns]
