In [1]:
# ------ LIBRARY -------#
import numpy as np
import os
import pickle
import sys
import pandas as pd
import re
import cv2
import json
from functools import partial
from typing import List

# torch
import torch
import torch.cuda.amp as amp
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
from torch.utils.data.sampler import *


import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau, MultiStepLR, OneCycleLR
#

import math
from torch.optim.optimizer import Optimizer, required
import torch_optimizer as optim
from collections import defaultdict
import itertools as it

import tqdm
import random
#import time
import matplotlib.pyplot as plt
from timeit import default_timer as timer
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# transformer
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, AutoConfig
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
# class args
class args:
    # ---- factor ---- #
    debug=False
    amp = True
    gpu = '0'
    
    epochs=20
    batch_size=10
    weight_decay=0.0
    n_fold=5
    fold=5 # [0, 1, 2, 3, 4] # 원래는 3
    patience = 7
    
    exp_name = 'experiment_name_folder'
    dir_ = f'./saved_models/'
    pt = 'xlm-roberta-large'
    max_len = 200
    
    start_lr = 2e-5#1e-3,5e-5
    min_lr=1e-6
    lamb = 1.0
    # ---- Dataset ---- #

    # ---- Else ---- #
    num_workers=8
    seed=222
    scheduler = None#'get_linear_schedule_with_warmup'


data_dir = './'
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
device = torch.device(f"cuda" if torch.cuda.is_available() else "cpu")
print(device)

##----------------
def set_seeds(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False # for faster training, but not deterministic

set_seeds(seed=args.seed)    

cuda


In [4]:
train = pd.read_csv("data/new_train_data.csv")
test = pd.read_csv("data/test_data.csv")
submission = pd.read_csv("data/sample_submission.csv")

In [5]:
print(pd.unique(train["label"]))

label_dict = {"entailment" : 0, "contradiction" : 1, "neutral" : 2}

['contradiction' 'entailment' 'neutral']


In [6]:
for i, text in enumerate(train.label):
    train.label[i] = label_dict[text]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [7]:
train

Unnamed: 0,index,premise,hypothesis,label
0,0,"씨름은 상고시대로부터 전해져 내려오는 남자들의 대표적인 놀이로서, 소년이나 장정들이...",씨름의 여자들의 놀이이다.,1
1,1,"삼성은 자작극을 벌인 2명에게 형사 고소 등의 법적 대응을 검토 중이라고 하였으나,...",자작극을 벌인 이는 3명이다.,1
2,2,이를 위해 예측적 범죄예방 시스템을 구축하고 고도화한다.,예측적 범죄예방 시스템 구축하고 고도화하는 것은 목적이 있기 때문이다.,0
3,3,광주광역시가 재개발 정비사업 원주민들에 대한 종합대책을 마련하는 등 원주민 보호에 ...,원주민들은 종합대책에 만족했다.,2
4,4,"진정 소비자와 직원들에게 사랑 받는 기업으로 오래 지속되고 싶으면, 이런 상황에서는...",이런 상황에서 책임 있는 모습을 보여주는 기업은 아주 드물다.,2
...,...,...,...,...
27993,2995,흔히 비자림로라고 불리는 지방도 제1112호선을 넓히는 공사가 1년만에 재개되었다가...,지방도 제1112호선을 넓히는 공사는 중단없이 마무리 되었다.,1
27994,2996,흔히 비자림로라고 불리는 지방도 제1112호선을 넓히는 공사가 1년만에 재개되었다가...,지방도 제1112호선을 넓히는 공사가 중단된 건 세 번째이다.,2
27995,2997,흔히 비자림로라고 불리는 지방도 제1112호선을 넓히는 공사가 1년만에 재개되었다가...,지방도 제1112호선은 흔히 비자림로라고 불린다.,0
27996,2998,흡연자분들은 발코니가 있는 방이면 발코니에서 흡연이 가능합니다.,비흡연자는 발코니 있는 방이 필요없습니다.,2


In [8]:
def collate_to_max_length(batch: List[List[torch.Tensor]], max_len: int = None, fill_values: List[float] = None) -> \
    List[torch.Tensor]:
    """
    pad to maximum length of this batch
    Args:
        batch: a batch of samples, each contains a list of field data(Tensor), which shape is [seq_length]
        max_len: specify max length
        fill_values: specify filled values of each field
    Returns:
        output: list of field batched data, which shape is [batch, max_length]
    """
    # [batch, num_fields]
    lengths = np.array([[len(field_data) for field_data in sample] for sample in batch])
    batch_size, num_fields = lengths.shape
    fill_values = fill_values or [0.0] * num_fields
    # [num_fields]
    max_lengths = lengths.max(axis=0)
    if max_len:
        assert max_lengths.max() <= max_len
        max_lengths = np.ones_like(max_lengths) * max_len

    output = [torch.full([batch_size, max_lengths[field_idx]],
                         fill_value=fill_values[field_idx],
                         dtype=batch[0][field_idx].dtype)
              for field_idx in range(num_fields)]
    for sample_idx in range(batch_size):
        for field_idx in range(num_fields):
            # seq_length
            data = batch[sample_idx][field_idx]
            output[field_idx][sample_idx][: data.shape[0]] = data
    # generate span_index and span_mask
    max_sentence_length = max_lengths[0]
    start_indexs = []
    end_indexs = []
    for i in range(1, max_sentence_length - 1):
        for j in range(i, max_sentence_length - 1):
            # # span大小为10
            # if j - i > 10:
            #     continue
            start_indexs.append(i)
            end_indexs.append(j)
    # generate span mask
    span_masks = []
    for input_ids, label, length in batch:
        span_mask = []
        middle_index = input_ids.tolist().index(2)
        for start_index, end_index in zip(start_indexs, end_indexs):
            if 1 <= start_index <= length.item() - 2 and 1 <= end_index <= length.item() - 2 and (
                start_index > middle_index or end_index < middle_index):
                span_mask.append(0)
            else:
                span_mask.append(1e6)
        span_masks.append(span_mask)
    # add to output
    output.append(torch.LongTensor(start_indexs))
    output.append(torch.LongTensor(end_indexs))
    output.append(torch.LongTensor(span_masks))
    return output  # (input_ids, labels, length, start_indexs, end_indexs, span_masks)

In [9]:
def collate_to_max_length_test(batch: List[List[torch.Tensor]], max_len: int = None, fill_values: List[float] = None) -> \
    List[torch.Tensor]:
    """
    pad to maximum length of this batch
    Args:
        batch: a batch of samples, each contains a list of field data(Tensor), which shape is [seq_length]
        max_len: specify max length
        fill_values: specify filled values of each field
    Returns:
        output: list of field batched data, which shape is [batch, max_length]
    """
    # [batch, num_fields]
    lengths = np.array([[len(field_data) for field_data in sample] for sample in batch])
    batch_size, num_fields = lengths.shape
    fill_values = fill_values or [0.0] * num_fields
    # [num_fields]
    max_lengths = lengths.max(axis=0)
    if max_len:
        assert max_lengths.max() <= max_len
        max_lengths = np.ones_like(max_lengths) * max_len

    output = [torch.full([batch_size, max_lengths[field_idx]],
                         fill_value=fill_values[field_idx],
                         dtype=batch[0][field_idx].dtype)
              for field_idx in range(num_fields)]
    for sample_idx in range(batch_size):
        for field_idx in range(num_fields):
            # seq_length
            data = batch[sample_idx][field_idx]
            output[field_idx][sample_idx][: data.shape[0]] = data
    # generate span_index and span_mask
    max_sentence_length = max_lengths[0]
    start_indexs = []
    end_indexs = []
    for i in range(1, max_sentence_length - 1):
        for j in range(i, max_sentence_length - 1):
            # # span大小为10
            # if j - i > 10:
            #     continue
            start_indexs.append(i)
            end_indexs.append(j)
    # generate span mask
    span_masks = []
    for input_ids, length in batch:
        span_mask = []
        middle_index = input_ids.tolist().index(2)
        for start_index, end_index in zip(start_indexs, end_indexs):
            if 1 <= start_index <= length.item() - 2 and 1 <= end_index <= length.item() - 2 and (
                start_index > middle_index or end_index < middle_index):
                span_mask.append(0)
            else:
                span_mask.append(1e6)
        span_masks.append(span_mask)
    # add to output
    output.append(torch.LongTensor(start_indexs))
    output.append(torch.LongTensor(end_indexs))
    output.append(torch.LongTensor(span_masks))
    return output  # (input_ids, labels, length, start_indexs, end_indexs, span_masks)

In [10]:
AutoTokenizer.from_pretrained(args.pt)

PreTrainedTokenizerFast(name_or_path='xlm-roberta-large', vocab_size=250002, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})

In [11]:
class NLIDataset(Dataset):

    def __init__(self, data, bert_path, max_length):
        super().__init__()
        self.max_length = max_length
        self.result = []

        train=data[['premise', 'hypothesis', 'label']]

#         train['premise'] = train['premise'].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣 0-9]', '')
#         train['hypothesis'] = train['hypothesis'].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣 0-9]', '')
        
        for train_premise, train_hypothesis, train_label in tqdm.tqdm(zip(train['premise'], train['hypothesis'], train['label'])):
                    self.result.append((train_premise, train_hypothesis, train_label))
        self.tokenizer = AutoTokenizer.from_pretrained(bert_path)

    def __len__(self):
        return len(self.result)

    def __getitem__(self, idx):
        sentence_1, sentence_2, label = self.result[idx]
        # remove .
        if sentence_1.endswith("."):
            sentence_1 = sentence_1[:-1]
        if sentence_2.endswith("."):
            sentence_2 = sentence_2[:-1]
        sentence_1_input_ids = self.tokenizer.encode(sentence_1, add_special_tokens=False)
        sentence_2_input_ids = self.tokenizer.encode(sentence_2, add_special_tokens=False)
        input_ids = sentence_1_input_ids + [2] + sentence_2_input_ids
        if len(input_ids) > self.max_length - 2:
            input_ids = input_ids[:self.max_length - 2]
        # convert list to tensor
        length = torch.LongTensor([len(input_ids) + 2])
        input_ids = torch.LongTensor([0] + input_ids + [2])
        label = torch.LongTensor([label])
        
        return input_ids, label, length

In [12]:
class NLIDataset_test(Dataset):

    def __init__(self, data, bert_path, max_length: int = 512):
        super().__init__()
        self.max_length = max_length
        self.result = []
        
        test=data[['premise', 'hypothesis']]
    
#         test['premise'] = test['premise'].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣 0-9]', '')
#         test['hypothesis'] = test['hypothesis'].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣 0-9]', '')
    
        for test_premise, test_hypothesis in tqdm.tqdm(zip(test['premise'], test['hypothesis'])):
                    self.result.append((test_premise, test_hypothesis))
        self.tokenizer = AutoTokenizer.from_pretrained(bert_path)

    def __len__(self):
        return len(self.result)

    def __getitem__(self, idx):
        sentence_1, sentence_2 = self.result[idx]
        # remove .
        if sentence_1.endswith("."):
            sentence_1 = sentence_1[:-1]
        if sentence_2.endswith("."):
            sentence_2 = sentence_2[:-1]
        sentence_1_input_ids = self.tokenizer.encode(sentence_1, add_special_tokens=False)
        sentence_2_input_ids = self.tokenizer.encode(sentence_2, add_special_tokens=False)
        input_ids = sentence_1_input_ids + [2] + sentence_2_input_ids
        if len(input_ids) > self.max_length - 2:
            input_ids = input_ids[:self.max_length - 2]
        # convert list to tensor
        length = torch.LongTensor([len(input_ids) + 2])
        input_ids = torch.LongTensor([0] + input_ids + [2])
        
        return input_ids, length

In [13]:
def unit_test():
    
    dataset = SNLIDataset(data=train[:1], bert_path=args.pt, max_length=args.max_len)

    dataloader = DataLoader(
        dataset=dataset,
        batch_size=10,
        num_workers=0,
        shuffle=False,
        collate_fn=partial(collate_to_max_length, fill_values=[1, 0, 0])
    )
    for input_ids, label, length, start_index, end_index, span_mask in dataloader:
        print(input_ids.shape, input_ids)
        print(start_index.shape, start_index)
        print(end_index.shape, end_index)
        print(span_mask.shape, span_mask)
        print(label.view(-1).shape, label)
        print()
        
    for t, data in enumerate(tqdm.tqdm(dataloader)):
        print(data[0])

In [14]:
unit_test()

1it [00:00, 8144.28it/s]


torch.Size([1, 59]) tensor([[     0, 105051,  28913,    697,  11031,   1077, 128161,  84802,   3626,
           1963,  25436, 105646, 119686,  64757,  17862, 223713,      6, 145726,
          67520,      4,   6705,   2680,  16632,  11619,   2905,   7593,      6,
         154848,   1077,  51851,  27815,    993,  36372, 102102,  16632,      6,
         202577,   1180, 209750,  77442,  66127,   1291,  64730,  32685,      6,
          23854,  14413,    769,  15710,      2, 105051,  28913,    367,  52340,
          17862,      6, 145726,   5769,      2]])
torch.Size([1653]) tensor([ 1,  1,  1,  ..., 56, 56, 57])
torch.Size([1653]) tensor([ 1,  2,  3,  ..., 56, 57, 57])
torch.Size([1, 1653]) tensor([[0, 0, 0,  ..., 0, 0, 0]])
torch.Size([1]) tensor([[1]])



100%|██████████| 1/1 [00:00<00:00, 198.12it/s]

tensor([[     0, 105051,  28913,    697,  11031,   1077, 128161,  84802,   3626,
           1963,  25436, 105646, 119686,  64757,  17862, 223713,      6, 145726,
          67520,      4,   6705,   2680,  16632,  11619,   2905,   7593,      6,
         154848,   1077,  51851,  27815,    993,  36372, 102102,  16632,      6,
         202577,   1180, 209750,  77442,  66127,   1291,  64730,  32685,      6,
          23854,  14413,    769,  15710,      2, 105051,  28913,    367,  52340,
          17862,      6, 145726,   5769,      2]])





In [15]:
class ExplainableModel(nn.Module):
    def __init__(self, bert_dir):
        super().__init__()
        self.bert_config = AutoConfig.from_pretrained(bert_dir, output_hidden_states=False)
        self.intermediate = AutoModel.from_pretrained(bert_dir, return_dict=False)
        self.span_info_collect = SICModel(self.bert_config.hidden_size)
        self.interpretation = InterpretationModel(self.bert_config.hidden_size)
        self.output = nn.Linear(self.bert_config.hidden_size, 3)

    def forward(self, input_ids, start_indexs, end_indexs, span_masks):
        # generate mask
        attention_mask = (input_ids != 1).long()
        # intermediate layer
        hidden_states, first_token = self.intermediate(input_ids, attention_mask=attention_mask)  # output.shape = (bs, length, hidden_size)
        # span info collecting layer(SIC)
        h_ij = self.span_info_collect(hidden_states, start_indexs, end_indexs)
        # interpretation layer
        H, a_ij = self.interpretation(h_ij, span_masks)
        # output layer
        out = self.output(H)
        return out, a_ij


class SICModel(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.W_1 = nn.Linear(hidden_size, hidden_size)
        self.W_2 = nn.Linear(hidden_size, hidden_size)
        self.W_3 = nn.Linear(hidden_size, hidden_size)
        self.W_4 = nn.Linear(hidden_size, hidden_size)

    def forward(self, hidden_states, start_indexs, end_indexs):
        W1_h = self.W_1(hidden_states)  # (bs, length, hidden_size)
        W2_h = self.W_2(hidden_states)
        W3_h = self.W_3(hidden_states)
        W4_h = self.W_4(hidden_states)

        W1_hi_emb = torch.index_select(W1_h, 1, start_indexs)  # (bs, span_num, hidden_size)
        W2_hj_emb = torch.index_select(W2_h, 1, end_indexs)
        W3_hi_start_emb = torch.index_select(W3_h, 1, start_indexs)
        W3_hi_end_emb = torch.index_select(W3_h, 1, end_indexs)
        W4_hj_start_emb = torch.index_select(W4_h, 1, start_indexs)
        W4_hj_end_emb = torch.index_select(W4_h, 1, end_indexs)

        # [w1*hi, w2*hj, w3(hi-hj), w4(hi⊗hj)]
        span = W1_hi_emb + W2_hj_emb + (W3_hi_start_emb - W3_hi_end_emb) + torch.mul(W4_hj_start_emb, W4_hj_end_emb)
        h_ij = torch.tanh(span)
        return h_ij


class InterpretationModel(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.h_t = nn.Linear(hidden_size, 1)

    def forward(self, h_ij, span_masks):
        o_ij = self.h_t(h_ij).squeeze(-1)  # (ba, span_num)
        # mask illegal span
        o_ij = o_ij - span_masks
        # normalize all a_ij, a_ij sum = 1
        a_ij = nn.functional.softmax(o_ij, dim=1)
        # weight average span representation to get H
        H = (a_ij.unsqueeze(-1) * h_ij).sum(dim=1)  # (bs, hidden_size)
        return H, a_ij

In [16]:
# - util - #
def get_learning_rate(optimizer):
    lr=[]
    for param_group in optimizer.param_groups:
        lr +=[ param_group['lr'] ]

    assert(len(lr)==1) #we support only one param_group
    lr = lr[0]

    return lr

def load_data():
    train=pd.read_csv('data/new_train_data.csv')
    test=pd.read_csv('data/test_data.csv')
    
    #
    train=train[['premise', 'hypothesis', 'label']]
    test=test[['premise', 'hypothesis']]
    
    #
    from sklearn.model_selection import StratifiedKFold
    skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    train['fold'] = -1
    for n_fold, (_,v_idx) in enumerate(skf.split(train, train['label'])):
        train.loc[v_idx, 'fold']  = n_fold
    train['id'] = [x for x in range(len(train))]
    
    for i, text in enumerate(train.label):
        train.label[i] = label_dict[text]
    
    return train, test

In [17]:
# ------------------------
#  scheduler
# ------------------------

def do_valid(net, valid_loader):

    val_loss = 0
    target_lst = []
    pred_lst = []
    logit = []
    loss_fn = nn.CrossEntropyLoss()

    net.eval()
    start_timer = timer()
    for t, data in enumerate(tqdm.tqdm(valid_loader)):

        # (input_ids, labels, length, start_indexs, end_indexs, span_masks)
        input_ids  = data[0].to(device)
        start_index  = data[3].to(device)
        end_index = data[4].to(device)
        span_mask = data[5].to(device)
        target = data[1].to(device).view(-1)

        with torch.no_grad():
            if args.amp:
                with amp.autocast():
                    # output
                    output, a_ij = net(input_ids, start_index, end_index, span_mask)
#                     output = output[0]

                    # loss
                    ce_loss  = loss_fn(output, target)
                    reg_loss = args.lamb * a_ij.pow(2).sum(dim=1).mean()
                    loss = ce_loss + reg_loss

            else:
                output = net(ids, mask)#.squeeze(0)
                loss = loss_fn(output, target)
            
            val_loss += loss
            target_lst.extend(target.detach().cpu().numpy())
            pred_lst.extend(output.argmax(dim=1).tolist())
            logit.extend(output.tolist())
            
        val_mean_loss = val_loss / len(valid_loader)
        validation_score = f1_score(y_true=target_lst, y_pred=pred_lst, average='macro')
        validation_acc = accuracy_score(y_true=target_lst, y_pred=pred_lst)
        

    return val_mean_loss, validation_score, validation_acc, logit

def do_predict(net, valid_loader):
    
    val_loss = 0
    pred_lst = []
    logit=[]
    net.eval()
    for t, data in enumerate(tqdm.tqdm(valid_loader)):
        
        # (input_ids, length, start_indexs, end_indexs, span_masks)
        input_ids  = data[0].to(device)
        start_index  = data[2].to(device)
        end_index = data[3].to(device)
        span_mask = data[4].to(device)

        with torch.no_grad():
            if args.amp:
                with amp.autocast():
                    # output
                    output = net(input_ids, start_index, end_index, span_mask)[0]

            else:
                output = net(input_ids, start_index, end_index, span_mask)
             
            pred_lst.extend(output.argmax(dim=1).tolist())
            logit.extend(output.tolist())
            
    return pred_lst,logit

def run_train(folds=3):
    out_dir = args.dir_+ f'/fold{args.fold}/{args.exp_name}/'
    os.makedirs(out_dir, exist_ok=True)
    
    # load dataset
    train, test = load_data()    
 
    
    # split fold
    for n_fold in range(5):
        
        print(n_fold)
        if n_fold != folds:
            print(f'{n_fold} fold pass'+'\n')
            continue
            
        if args.debug:
            train = train.sample(1000).copy()
            
        print(n_fold)
        
        trn_idx = train[train['fold']!=n_fold]['id'].values
        val_idx = train[train['fold']==n_fold]['id'].values

        ## dataset ------------------------------------
        train_dataset = NLIDataset(data = train.iloc[trn_idx], bert_path=args.pt, max_length=args.max_len)
        valid_dataset = NLIDataset(data = train.iloc[val_idx], bert_path=args.pt, max_length=args.max_len)
        trainloader = DataLoader(dataset=train_dataset, batch_size=args.batch_size,
                                 num_workers=8, shuffle=True, pin_memory=True, 
                                 collate_fn=partial(collate_to_max_length, fill_values=[1, 0, 0]),
                                drop_last=False)
        validloader = DataLoader(dataset=valid_dataset, batch_size=args.batch_size, 
                                 num_workers=8, shuffle=False, pin_memory=True, 
                                 collate_fn=partial(collate_to_max_length, fill_values=[1, 0, 0]),
                                drop_last=False)

        ## net ----------------------------------------
        scaler = amp.GradScaler()
        net = ExplainableModel(args.pt)

        net.to(device)
        if len(args.gpu)>1:
            net = nn.DataParallel(net)

        # ------------------------
        # loss
        # ------------------------
        loss_fn = nn.CrossEntropyLoss()

        # ------------------------
        #  Optimizer
        # ------------------------
#         no_decay = ["bias", "LayerNorm.weight"]
#         optimizer_grouped_parameters = [
#             {
#                 "params": [p for n, p in net.named_parameters() if not any(nd in n for nd in no_decay)],
#                 "weight_decay": args.weight_decay,
#             },
#             {
#                 "params": [p for n, p in net.named_parameters() if any(nd in n for nd in no_decay)],
#                 "weight_decay": 0.0,
#             },
#         ]
#         optimizer = AdamW(optimizer_grouped_parameters,
#                           betas=(0.9, 0.98),  # according to RoBERTa paper
#                           lr=args.start_lr,
#                           eps=1e-9)

        optimizer = optim.Lookahead(optim.RAdam(filter(lambda p: p.requires_grad,net.parameters()), lr=args.start_lr), alpha=0.5, k=5)
    
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = len(trainloader)*args.epochs)
        
        
        # ----
        start_timer = timer()
        best_score = 0
        early_stopping = 0
        
        for epoch in range(1, args.epochs+1):
            train_loss = 0
            valid_loss = 0

            target_lst = []
            pred_lst = []
#             lr = get_learning_rate(optimizer)
            print(f'-------------------')
            print(f'{epoch}epoch start')
            print(f'-------------------'+'\n')
#             print(f'learning rate : {lr : .6f}')
            for t, data in enumerate(tqdm.tqdm(trainloader)):

                # one iteration update  -------------
                input_ids  = data[0].to(device)
                start_index  = data[3].to(device)
                end_index = data[4].to(device)
                span_mask = data[5].to(device)
                target = data[1].to(device).view(-1)

                # ------------
#                 net.train()
                optimizer.zero_grad()


                if args.amp:
                    with amp.autocast():
                        # output
                        output, a_ij = net(input_ids, start_index, end_index, span_mask)
#                         output = output[0]

                        # loss
                        ce_loss  = loss_fn(output, target)
                        reg_loss = args.lamb * a_ij.pow(2).sum(dim=1).mean()
                        loss = ce_loss + reg_loss
                        train_loss += loss


                    scaler.scale(loss).backward()
                    scaler.step(optimizer)
                    scaler.update()

                else:
                    # output
                    output = net(ids, mask)

                    # loss
                    loss = loss_fn(output, target)
                    train_loss += loss

                    # update
                    loss.backward()
                    optimizer.step()


                # for calculate f1 score
                target_lst.extend(target.detach().cpu().numpy())
                pred_lst.extend(output.argmax(dim=1).tolist())


                if scheduler is not None:
                    scheduler.step() 
            train_loss = train_loss / len(trainloader)
            train_score = f1_score(y_true=target_lst, y_pred=pred_lst, average='macro')
            train_acc = accuracy_score(y_true=target_lst, y_pred=pred_lst)

            # validation
            valid_loss, valid_score, valid_acc, _ = do_valid(net, validloader)


            if valid_acc > best_score:
                best_score = valid_acc
                best_epoch = epoch
                best_loss = valid_loss

                torch.save(net.state_dict(), out_dir + f'/{folds}f_explain.pth')
                print('best model saved'+'\n')
            else:
                early_stopping += 1

            # Early Stopping
            if early_stopping == args.patience:
                break


            print(f'train loss : {train_loss:.4f}, train f1 score : {train_score : .4f}, train acc : {train_acc : .4f}'+'\n')
            print(f'valid loss : {valid_loss:.4f}, valid f1 score : {valid_score : .4f}, valid acc : {valid_acc : .4f}'+'\n')


        print(f'best valid loss : {best_loss : .4f}'+'\n')
        print(f'best epoch : {best_epoch }'+'\n')
        print(f'best accuracy : {best_score : .4f}'+'\n')
        
def run_predict(model_path):
    ## dataset ------------------------------------
    # load
        
    train, test = load_data()
    print('test load')

    test_dataset = NLIDataset_test(data = test, bert_path=args.pt, max_length=args.max_len)
    testloader = DataLoader(dataset=test_dataset, batch_size=args.batch_size, 
                             num_workers=8, shuffle=False, pin_memory=True, collate_fn=partial(collate_to_max_length_test, fill_values=[1, 0, 0]))
    print('set testloader')
    ## net ----------------------------------------
    scaler = amp.GradScaler()
    net = ExplainableModel(args.pt)
        
    net.to(device)
    
    if len(args.gpu)>1:
        net = nn.DataParallel(net)

    f = torch.load(model_path)
    net.load_state_dict(f, strict=True)  # True
    print('load saved models')
    # ------------------------
    # validation
    preds, logit = do_predict(net, testloader) #outputs
           
    print('complete predict')
    
    return preds, np.array(logit)
     

In [19]:
"""5fold 전용"""
if __name__ == '__main__':

    for pt, max_len in zip(['klue/roberta-large'],[200]):
        
        args.max_len = max_len
        args.pt = pt
        args.exp_name = str(args.pt) + '_' + str(args.max_len)
    
        for i in [0,1,2,3,4]: # 5fold
            run_train(folds=i)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0
0


22398it [00:00, 836254.74it/s]
5600it [00:00, 829294.30it/s]
Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably 

-------------------
1epoch start
-------------------



100%|██████████| 2240/2240 [05:26<00:00,  6.85it/s]
100%|██████████| 560/560 [00:27<00:00, 20.15it/s]


best model saved

train loss : 0.4369, train f1 score :  0.8280, train acc :  0.8288

valid loss : 0.3058, valid f1 score :  0.8933, valid acc :  0.8938

-------------------
2epoch start
-------------------



100%|██████████| 2240/2240 [05:28<00:00,  6.82it/s]
100%|██████████| 560/560 [00:27<00:00, 20.27it/s]


best model saved

train loss : 0.1782, train f1 score :  0.9413, train acc :  0.9416

valid loss : 0.2938, valid f1 score :  0.8999, valid acc :  0.9002

-------------------
3epoch start
-------------------



100%|██████████| 2240/2240 [05:34<00:00,  6.70it/s]
100%|██████████| 560/560 [00:26<00:00, 20.75it/s]


train loss : 0.0793, train f1 score :  0.9771, train acc :  0.9772

valid loss : 0.4495, valid f1 score :  0.8787, valid acc :  0.8812

-------------------
4epoch start
-------------------



100%|██████████| 2240/2240 [05:33<00:00,  6.72it/s]
100%|██████████| 560/560 [00:27<00:00, 20.71it/s]


train loss : 0.0487, train f1 score :  0.9867, train acc :  0.9867

valid loss : 0.4674, valid f1 score :  0.8866, valid acc :  0.8871

-------------------
5epoch start
-------------------



100%|██████████| 2240/2240 [05:31<00:00,  6.76it/s]
100%|██████████| 560/560 [00:26<00:00, 20.88it/s]


train loss : 0.0363, train f1 score :  0.9914, train acc :  0.9915

valid loss : 0.5153, valid f1 score :  0.8905, valid acc :  0.8907

-------------------
6epoch start
-------------------



100%|██████████| 2240/2240 [05:30<00:00,  6.79it/s]
100%|██████████| 560/560 [00:27<00:00, 20.72it/s]


train loss : 0.0283, train f1 score :  0.9930, train acc :  0.9930

valid loss : 0.4803, valid f1 score :  0.8925, valid acc :  0.8927

-------------------
7epoch start
-------------------



100%|██████████| 2240/2240 [05:29<00:00,  6.80it/s]
100%|██████████| 560/560 [00:27<00:00, 20.29it/s]


train loss : 0.0213, train f1 score :  0.9953, train acc :  0.9954

valid loss : 0.6208, valid f1 score :  0.8958, valid acc :  0.8962

-------------------
8epoch start
-------------------



100%|██████████| 2240/2240 [05:28<00:00,  6.82it/s]
100%|██████████| 560/560 [00:26<00:00, 20.97it/s]


train loss : 0.0203, train f1 score :  0.9960, train acc :  0.9960

valid loss : 0.5165, valid f1 score :  0.8873, valid acc :  0.8875

-------------------
9epoch start
-------------------



100%|██████████| 2240/2240 [05:33<00:00,  6.71it/s]
100%|██████████| 560/560 [00:26<00:00, 21.12it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


best valid loss :  0.2938

best epoch : 2

best accuracy :  0.9002

1
1 fold pass

2
2 fold pass

3
3 fold pass

4
4 fold pass

0
0 fold pass

1
1


22398it [00:00, 851922.24it/s]
5600it [00:00, 740856.12it/s]
Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably 

-------------------
1epoch start
-------------------



100%|██████████| 2240/2240 [05:28<00:00,  6.82it/s]
100%|██████████| 560/560 [00:26<00:00, 20.86it/s]


best model saved

train loss : 0.4548, train f1 score :  0.8187, train acc :  0.8195

valid loss : 0.2953, valid f1 score :  0.8950, valid acc :  0.8954

-------------------
2epoch start
-------------------



100%|██████████| 2240/2240 [05:31<00:00,  6.77it/s]
100%|██████████| 560/560 [00:26<00:00, 21.43it/s]


best model saved

train loss : 0.1788, train f1 score :  0.9424, train acc :  0.9427

valid loss : 0.3100, valid f1 score :  0.8988, valid acc :  0.8993

-------------------
3epoch start
-------------------



100%|██████████| 2240/2240 [05:30<00:00,  6.78it/s]
100%|██████████| 560/560 [00:26<00:00, 20.79it/s]


train loss : 0.0792, train f1 score :  0.9758, train acc :  0.9759

valid loss : 0.4042, valid f1 score :  0.8914, valid acc :  0.8911

-------------------
4epoch start
-------------------



100%|██████████| 2240/2240 [05:30<00:00,  6.78it/s]
100%|██████████| 560/560 [00:26<00:00, 21.11it/s]


train loss : 0.0492, train f1 score :  0.9860, train acc :  0.9861

valid loss : 0.4683, valid f1 score :  0.8838, valid acc :  0.8836

-------------------
5epoch start
-------------------



100%|██████████| 2240/2240 [05:30<00:00,  6.79it/s]
100%|██████████| 560/560 [00:27<00:00, 20.55it/s]


train loss : 0.0364, train f1 score :  0.9901, train acc :  0.9901

valid loss : 0.4287, valid f1 score :  0.8953, valid acc :  0.8957

-------------------
6epoch start
-------------------



100%|██████████| 2240/2240 [05:29<00:00,  6.80it/s]
100%|██████████| 560/560 [00:26<00:00, 21.37it/s]


train loss : 0.0297, train f1 score :  0.9926, train acc :  0.9926

valid loss : 0.4964, valid f1 score :  0.8913, valid acc :  0.8912

-------------------
7epoch start
-------------------



100%|██████████| 2240/2240 [05:31<00:00,  6.77it/s]
100%|██████████| 560/560 [00:27<00:00, 20.54it/s]


train loss : 0.0283, train f1 score :  0.9930, train acc :  0.9931

valid loss : 0.4455, valid f1 score :  0.8981, valid acc :  0.8982

-------------------
8epoch start
-------------------



100%|██████████| 2240/2240 [05:35<00:00,  6.68it/s]
100%|██████████| 560/560 [00:27<00:00, 20.42it/s]


best model saved

train loss : 0.0176, train f1 score :  0.9957, train acc :  0.9957

valid loss : 0.4659, valid f1 score :  0.9026, valid acc :  0.9029

-------------------
9epoch start
-------------------



100%|██████████| 2240/2240 [05:38<00:00,  6.61it/s]
100%|██████████| 560/560 [00:28<00:00, 19.94it/s]


best model saved

train loss : 0.0189, train f1 score :  0.9962, train acc :  0.9962

valid loss : 0.4715, valid f1 score :  0.9043, valid acc :  0.9046

-------------------
10epoch start
-------------------



100%|██████████| 2240/2240 [05:40<00:00,  6.58it/s]
100%|██████████| 560/560 [00:26<00:00, 20.88it/s]


train loss : 0.0126, train f1 score :  0.9976, train acc :  0.9976

valid loss : 0.5198, valid f1 score :  0.9013, valid acc :  0.9016

-------------------
11epoch start
-------------------



100%|██████████| 2240/2240 [05:40<00:00,  6.57it/s]
100%|██████████| 560/560 [00:27<00:00, 20.58it/s]


best valid loss :  0.4715

best epoch : 9

best accuracy :  0.9046

2
2 fold pass

3
3 fold pass

4
4 fold pass



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0
0 fold pass

1
1 fold pass

2
2


22398it [00:00, 838351.76it/s]
5600it [00:00, 763170.63it/s]
Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably 

-------------------
1epoch start
-------------------



100%|██████████| 2240/2240 [05:40<00:00,  6.57it/s]
100%|██████████| 560/560 [00:27<00:00, 20.51it/s]


best model saved

train loss : 0.4561, train f1 score :  0.8174, train acc :  0.8181

valid loss : 0.3000, valid f1 score :  0.8947, valid acc :  0.8955

-------------------
2epoch start
-------------------



100%|██████████| 2240/2240 [05:38<00:00,  6.61it/s]
100%|██████████| 560/560 [00:27<00:00, 20.51it/s]


best model saved

train loss : 0.1754, train f1 score :  0.9437, train acc :  0.9440

valid loss : 0.3031, valid f1 score :  0.8998, valid acc :  0.9004

-------------------
3epoch start
-------------------



100%|██████████| 2240/2240 [05:42<00:00,  6.54it/s]
100%|██████████| 560/560 [00:27<00:00, 20.02it/s]


best model saved

train loss : 0.0792, train f1 score :  0.9773, train acc :  0.9774

valid loss : 0.3365, valid f1 score :  0.9014, valid acc :  0.9020

-------------------
4epoch start
-------------------



100%|██████████| 2240/2240 [05:39<00:00,  6.60it/s]
100%|██████████| 560/560 [00:27<00:00, 20.44it/s]


train loss : 0.0479, train f1 score :  0.9869, train acc :  0.9869

valid loss : 0.4634, valid f1 score :  0.8959, valid acc :  0.8966

-------------------
5epoch start
-------------------



100%|██████████| 2240/2240 [05:39<00:00,  6.61it/s]
100%|██████████| 560/560 [00:27<00:00, 20.22it/s]


train loss : 0.0364, train f1 score :  0.9899, train acc :  0.9900

valid loss : 0.4771, valid f1 score :  0.8904, valid acc :  0.8902

-------------------
6epoch start
-------------------



100%|██████████| 2240/2240 [05:39<00:00,  6.60it/s]
100%|██████████| 560/560 [00:27<00:00, 20.54it/s]


train loss : 0.0291, train f1 score :  0.9922, train acc :  0.9922

valid loss : 0.4568, valid f1 score :  0.9011, valid acc :  0.9018

-------------------
7epoch start
-------------------



100%|██████████| 2240/2240 [05:38<00:00,  6.62it/s]
100%|██████████| 560/560 [00:27<00:00, 20.25it/s]


train loss : 0.0223, train f1 score :  0.9950, train acc :  0.9950

valid loss : 0.5047, valid f1 score :  0.9009, valid acc :  0.9014

-------------------
8epoch start
-------------------



100%|██████████| 2240/2240 [05:43<00:00,  6.52it/s]
100%|██████████| 560/560 [00:29<00:00, 19.24it/s]


train loss : 0.0202, train f1 score :  0.9952, train acc :  0.9952

valid loss : 0.4362, valid f1 score :  0.8992, valid acc :  0.8996

-------------------
9epoch start
-------------------



100%|██████████| 2240/2240 [05:42<00:00,  6.54it/s]
100%|██████████| 560/560 [00:27<00:00, 20.32it/s]


train loss : 0.0133, train f1 score :  0.9978, train acc :  0.9978

valid loss : 0.5608, valid f1 score :  0.8970, valid acc :  0.8977

-------------------
10epoch start
-------------------



100%|██████████| 2240/2240 [05:43<00:00,  6.53it/s]
100%|██████████| 560/560 [00:27<00:00, 20.22it/s]


best valid loss :  0.3365

best epoch : 3

best accuracy :  0.9020

3
3 fold pass

4
4 fold pass



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0
0 fold pass

1
1 fold pass

2
2 fold pass

3
3


22399it [00:00, 806443.22it/s]
5599it [00:00, 764126.77it/s]
Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably 

-------------------
1epoch start
-------------------



100%|██████████| 2240/2240 [05:35<00:00,  6.68it/s]
100%|██████████| 560/560 [00:28<00:00, 19.82it/s]


best model saved

train loss : 0.4502, train f1 score :  0.8160, train acc :  0.8166

valid loss : 0.3303, valid f1 score :  0.8797, valid acc :  0.8818

-------------------
2epoch start
-------------------



100%|██████████| 2240/2240 [05:35<00:00,  6.67it/s]
100%|██████████| 560/560 [00:27<00:00, 20.23it/s]


best model saved

train loss : 0.1751, train f1 score :  0.9439, train acc :  0.9442

valid loss : 0.3088, valid f1 score :  0.8961, valid acc :  0.8966

-------------------
3epoch start
-------------------



100%|██████████| 2240/2240 [05:36<00:00,  6.66it/s]
100%|██████████| 560/560 [00:27<00:00, 20.30it/s]


train loss : 0.0822, train f1 score :  0.9756, train acc :  0.9757

valid loss : 0.3809, valid f1 score :  0.8863, valid acc :  0.8875

-------------------
4epoch start
-------------------



100%|██████████| 2240/2240 [05:41<00:00,  6.57it/s]
100%|██████████| 560/560 [00:29<00:00, 19.23it/s]


train loss : 0.0532, train f1 score :  0.9858, train acc :  0.9858

valid loss : 0.4381, valid f1 score :  0.8880, valid acc :  0.8882

-------------------
5epoch start
-------------------



100%|██████████| 2240/2240 [05:46<00:00,  6.47it/s]
100%|██████████| 560/560 [00:27<00:00, 20.61it/s]


train loss : 0.0352, train f1 score :  0.9914, train acc :  0.9914

valid loss : 0.4631, valid f1 score :  0.8911, valid acc :  0.8921

-------------------
6epoch start
-------------------



100%|██████████| 2240/2240 [05:45<00:00,  6.49it/s]
100%|██████████| 560/560 [00:28<00:00, 19.88it/s]


train loss : 0.0317, train f1 score :  0.9918, train acc :  0.9918

valid loss : 0.5780, valid f1 score :  0.8873, valid acc :  0.8889

-------------------
7epoch start
-------------------



100%|██████████| 2240/2240 [05:41<00:00,  6.56it/s]
100%|██████████| 560/560 [00:27<00:00, 20.56it/s]


train loss : 0.0226, train f1 score :  0.9942, train acc :  0.9942

valid loss : 0.4843, valid f1 score :  0.8955, valid acc :  0.8957

-------------------
8epoch start
-------------------



100%|██████████| 2240/2240 [05:32<00:00,  6.73it/s]
100%|██████████| 560/560 [00:27<00:00, 20.20it/s]


best model saved

train loss : 0.0194, train f1 score :  0.9958, train acc :  0.9958

valid loss : 0.4540, valid f1 score :  0.9007, valid acc :  0.9011

-------------------
9epoch start
-------------------



100%|██████████| 2240/2240 [05:39<00:00,  6.60it/s]
100%|██████████| 560/560 [00:27<00:00, 20.52it/s]


train loss : 0.0164, train f1 score :  0.9967, train acc :  0.9967

valid loss : 0.5202, valid f1 score :  0.8920, valid acc :  0.8930

-------------------
10epoch start
-------------------



100%|██████████| 2240/2240 [05:34<00:00,  6.69it/s]
100%|██████████| 560/560 [00:27<00:00, 20.66it/s]


best valid loss :  0.4540

best epoch : 8

best accuracy :  0.9011

4
4 fold pass



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0
0 fold pass

1
1 fold pass

2
2 fold pass

3
3 fold pass

4
4


22399it [00:00, 804743.88it/s]
5599it [00:00, 719945.68it/s]
Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably 

-------------------
1epoch start
-------------------



100%|██████████| 2240/2240 [05:30<00:00,  6.78it/s]
100%|██████████| 560/560 [00:26<00:00, 20.96it/s]


best model saved

train loss : 0.4522, train f1 score :  0.8164, train acc :  0.8170

valid loss : 0.3056, valid f1 score :  0.8926, valid acc :  0.8930

-------------------
2epoch start
-------------------



100%|██████████| 2240/2240 [05:30<00:00,  6.79it/s]
100%|██████████| 560/560 [00:27<00:00, 20.48it/s]


best model saved

train loss : 0.1742, train f1 score :  0.9432, train acc :  0.9434

valid loss : 0.3266, valid f1 score :  0.8933, valid acc :  0.8948

-------------------
3epoch start
-------------------



100%|██████████| 2240/2240 [05:38<00:00,  6.61it/s]
100%|██████████| 560/560 [00:27<00:00, 20.72it/s]


best model saved

train loss : 0.0811, train f1 score :  0.9750, train acc :  0.9751

valid loss : 0.3449, valid f1 score :  0.8941, valid acc :  0.8952

-------------------
4epoch start
-------------------



100%|██████████| 2240/2240 [05:32<00:00,  6.75it/s]
100%|██████████| 560/560 [00:26<00:00, 20.85it/s]


best model saved

train loss : 0.0532, train f1 score :  0.9850, train acc :  0.9850

valid loss : 0.3455, valid f1 score :  0.8990, valid acc :  0.8994

-------------------
5epoch start
-------------------



100%|██████████| 2240/2240 [05:27<00:00,  6.84it/s]
100%|██████████| 560/560 [00:26<00:00, 20.87it/s]


train loss : 0.0328, train f1 score :  0.9912, train acc :  0.9912

valid loss : 0.4214, valid f1 score :  0.8933, valid acc :  0.8946

-------------------
6epoch start
-------------------



100%|██████████| 2240/2240 [05:27<00:00,  6.85it/s]
100%|██████████| 560/560 [00:27<00:00, 20.32it/s]


train loss : 0.0307, train f1 score :  0.9927, train acc :  0.9927

valid loss : 0.4631, valid f1 score :  0.8927, valid acc :  0.8930

-------------------
7epoch start
-------------------



100%|██████████| 2240/2240 [05:29<00:00,  6.80it/s]
100%|██████████| 560/560 [00:26<00:00, 21.22it/s]


best model saved

train loss : 0.0231, train f1 score :  0.9951, train acc :  0.9951

valid loss : 0.4868, valid f1 score :  0.9035, valid acc :  0.9041

-------------------
8epoch start
-------------------



100%|██████████| 2240/2240 [05:35<00:00,  6.67it/s]
100%|██████████| 560/560 [00:26<00:00, 20.74it/s]


best model saved

train loss : 0.0201, train f1 score :  0.9954, train acc :  0.9954

valid loss : 0.4134, valid f1 score :  0.9067, valid acc :  0.9073

-------------------
9epoch start
-------------------



100%|██████████| 2240/2240 [05:41<00:00,  6.55it/s]
100%|██████████| 560/560 [00:29<00:00, 19.26it/s]


train loss : 0.0154, train f1 score :  0.9968, train acc :  0.9968

valid loss : 0.5205, valid f1 score :  0.9009, valid acc :  0.9016

-------------------
10epoch start
-------------------



100%|██████████| 2240/2240 [05:51<00:00,  6.38it/s]
100%|██████████| 560/560 [00:30<00:00, 18.16it/s]


train loss : 0.0148, train f1 score :  0.9970, train acc :  0.9971

valid loss : 0.4564, valid f1 score :  0.9041, valid acc :  0.9050

-------------------
11epoch start
-------------------



100%|██████████| 2240/2240 [05:55<00:00,  6.30it/s]
100%|██████████| 560/560 [00:28<00:00, 19.60it/s]


train loss : 0.0131, train f1 score :  0.9983, train acc :  0.9983

valid loss : 0.5106, valid f1 score :  0.8996, valid acc :  0.9009

-------------------
12epoch start
-------------------



100%|██████████| 2240/2240 [06:03<00:00,  6.17it/s]
100%|██████████| 560/560 [00:29<00:00, 18.88it/s]


train loss : 0.0114, train f1 score :  0.9982, train acc :  0.9983

valid loss : 0.5572, valid f1 score :  0.9027, valid acc :  0.9032

-------------------
13epoch start
-------------------



100%|██████████| 2240/2240 [05:52<00:00,  6.35it/s]
100%|██████████| 560/560 [00:28<00:00, 19.39it/s]


best model saved

train loss : 0.0071, train f1 score :  0.9994, train acc :  0.9994

valid loss : 0.5951, valid f1 score :  0.9067, valid acc :  0.9075

-------------------
14epoch start
-------------------



100%|██████████| 2240/2240 [05:44<00:00,  6.50it/s]
100%|██████████| 560/560 [00:29<00:00, 19.20it/s]


best valid loss :  0.5951

best epoch : 13

best accuracy :  0.9075



In [20]:
def ensemble():
    final_logit=0
    
    args.pt = 'klue/roberta-large'
    _, logit1 = run_predict("./saved_models/fold5/klue/roberta-large_200/0f_explain.pth")
    _, logit2 = run_predict("./saved_models/fold5/klue/roberta-large_200/1f_explain.pth")
    _, logit3 = run_predict("./saved_models/fold5/klue/roberta-large_200/2f_explain.pth")
    _, logit4 = run_predict("./saved_models/fold5/klue/roberta-large_200/3f_explain.pth")
    _, logit5 = run_predict("./saved_models/fold5/klue/roberta-large_200/4f_explain.pth")
    final_logit += (logit1+logit2+logit3+logit4+logit5)/5
    
#     args.pt = 'xlm-roberta-large'
#     _, logit1 = run_predict("./saved_models/fold5/xlm-roberta-large_200/0f_explain.pth")
#     _, logit2 = run_predict("./saved_models/fold5/xlm-roberta-large_200/1f_explain.pth")
#     _, logit3 = run_predict("./saved_models/fold5/xlm-roberta-large_200/2f_explain.pth")
#     _, logit4 = run_predict("./saved_models/fold5/xlm-roberta-large_200/3f_explain.pth")
#     _, logit5 = run_predict("./saved_models/fold5/xlm-roberta-large_200/4f_explain.pth")
#     final_logit += (logit1+logit2+logit3+logit4+logit5)/5
    
    
    return final_logit


In [21]:
final_logit = ensemble()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


test load


1666it [00:00, 765776.49it/s]


set testloader


Downloading:   0%|          | 0.00/547 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it f

load saved models


100%|██████████| 167/167 [00:08<00:00, 19.16it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


complete predict
test load


1666it [00:00, 767627.21it/s]


set testloader


Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it f

load saved models


100%|██████████| 167/167 [00:08<00:00, 20.01it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


complete predict
test load


1666it [00:00, 813565.08it/s]


set testloader


Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it f

load saved models


100%|██████████| 167/167 [00:08<00:00, 18.77it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


complete predict
test load


1666it [00:00, 767880.27it/s]


set testloader


Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it f

load saved models


100%|██████████| 167/167 [00:09<00:00, 18.00it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


complete predict
test load


1666it [00:00, 744640.93it/s]


set testloader


Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it f

load saved models


100%|██████████| 167/167 [00:08<00:00, 19.23it/s]

complete predict





In [22]:
final_logit

array([[-3.42148437,  6.121875  , -2.7796875 ],
       [-1.14521484, -3.11445312,  4.28828125],
       [ 4.20117188, -3.57070313, -0.59492188],
       ...,
       [-1.78164063, -3.45117188,  5.375     ],
       [-1.95390625, -3.7       ,  5.8640625 ],
       [-0.75438232, -0.95178223,  1.52290039]])

In [None]:
# np.save('./explain_npy', final_logit)

In [None]:
# robert_logit = np.load('./robert_npy.npy')

In [None]:
# final = final_logit + robert_logit

In [None]:
sub = pd.read_csv("./data/sample_submission.csv")
out = [list(label_dict.keys())[_] for _ in final_logit.argmax(1)]

sub['label'] = out
print(sub)
# preds
sub.to_csv(f'./submission/final_submission_klu_roberta-large_explain_200.csv', index=False)
