In [None]:
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import transformers
import pandas
from transformers import AutoTokenizer
from transformers import RobertaModel, RobertaConfig
from transformers import AdamW
import time
import argparse

# Only emit transformers log records at ERROR level or above (hides warnings).
# Uses the named helper instead of the bare numeric level 40.
transformers.logging.set_verbosity_error()
# Root directory under which each trained model's checkpoint folder is written.
save_dir = './result_qa'

## Load Data

In [None]:
class TensorDataset(Dataset):
    """Map-style dataset that pairs column-wise features with per-example labels.

    Args:
        tokenized_dataset: Mapping from feature name to an indexable column;
            ``column[idx]`` must yield the value for example ``idx``.
        labels: Indexable, sized container holding one label per example.
    """

    def __init__(self, tokenized_dataset, labels):
        self.labels = labels
        self.tokenized_dataset = tokenized_dataset

    def __len__(self):
        # The label container defines how many examples exist.
        return len(self.labels)

    def __getitem__(self, idx):
        # Slice every feature column at the same position, then attach the label.
        features = {}
        for name, column in self.tokenized_dataset.items():
            features[name] = column[idx]
        return features, self.labels[idx]

In [None]:
def load_data(dataset_dir):
    """Read a tab-separated COPA file and build ordered sentence pairs.

    The file's first row is treated as a header and replaced by the column
    names ``ID, sentence, question, 1, 2, answer``.

    Added columns:
        label: ``answer`` cast to int and shifted to start at 0.
        new_sentence{k}_1 / new_sentence{k}_2 (k = 1, 2): the pair formed by
            ``sentence`` and candidate ``k``. When ``question`` equals "결과"
            (Korean for "result") the order is (sentence, candidate); for any
            other question value the order is (candidate, sentence).

    Args:
        dataset_dir: Path (or anything ``pandas.read_csv`` accepts) of the TSV file.

    Returns:
        pandas.DataFrame with the original columns plus the five added ones.
    """
    dataset = pd.read_csv(
        dataset_dir,
        delimiter='\t',
        names=['ID', 'sentence', 'question', '1', '2', 'answer'],
        header=0,
    )
    dataset["label"] = dataset["answer"].astype(int) - 1

    # Vectorised replacement for the former row-by-row loop: choose the
    # ordering for every row at once based on the question type.
    is_result_question = dataset["question"] == "결과"
    premise = dataset["sentence"]
    for choice in ("1", "2"):
        candidate = dataset[choice]
        # "Result" question: sentence first, candidate second; otherwise swapped.
        dataset[f"new_sentence{choice}_1"] = premise.where(is_result_question, candidate)
        dataset[f"new_sentence{choice}_2"] = candidate.where(is_result_question, premise)

    return dataset

In [None]:
def tokenized_dataset(dataset, tokenizer):
    """Tokenize both candidate sentence pairs and merge them into one mapping.

    Args:
        dataset: Table exposing the columns ``new_sentence1_1``,
            ``new_sentence1_2``, ``new_sentence2_1`` and ``new_sentence2_2``
            (each supporting ``.tolist()``).
        tokenizer: Callable invoked as ``tokenizer(first_texts, second_texts, **options)``
            that returns a mutable mapping.

    Returns:
        The mapping produced for the candidate-1 pairs, extended with every
        entry of the candidate-2 mapping stored under its key suffixed with ``'2'``.
    """
    pair1_first = dataset['new_sentence1_1'].tolist()
    pair1_second = dataset['new_sentence1_2'].tolist()
    pair2_first = dataset["new_sentence2_1"].tolist()
    pair2_second = dataset["new_sentence2_2"].tolist()

    # Both pair sets are encoded with exactly the same tokenizer options.
    encode_options = dict(
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=100,
        add_special_tokens=True,
        return_token_type_ids=True,
    )

    # Pairs built from candidate 1.
    encoded = tokenizer(pair1_first, pair1_second, **encode_options)
    # Pairs built from candidate 2.
    encoded_candidate2 = tokenizer(pair2_first, pair2_second, **encode_options)

    # Fold the second encoding into the first under '<key>2' names.
    for key, value in encoded_candidate2.items():
        encoded[key + '2'] = value

    return encoded

In [None]:
# Prefer the first CUDA device; fall back to the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Checkpoint selection; with size = 'large' the name is "klue/roberta-large".
model_type = "Roberta"
size = 'large'
model_name = "klue/roberta-{}".format(size)

# Tokenizer matching the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/375 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/243k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/734k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/173 [00:00<?, ?B/s]

In [None]:
import pandas as pd

# The 'base' checkpoint uses batches of 32; every other size uses 16.
batch_size = 32 if size == 'base' else 16

base_path = '/content/drive/MyDrive/NLP_Final/'

# Parse both TSV splits into DataFrames (labels plus reordered sentence pairs).
train_dataset = load_data(os.path.join(base_path, 'SKT_COPA_Train.tsv'))
val_dataset = load_data(os.path.join(base_path, 'SKT_COPA_Dev.tsv'))

# Tokenize each split while the names still refer to the DataFrames.
train_tokenized = tokenized_dataset(train_dataset, tokenizer)
val_tokenized = tokenized_dataset(val_dataset, tokenizer)

# From here on the split names refer to TensorDataset wrappers, not DataFrames.
train_dataset = TensorDataset(train_tokenized, train_dataset['label'])
val_dataset = TensorDataset(val_tokenized, val_dataset['label'])

# Training batches are shuffled and the trailing partial batch is dropped;
# validation keeps every example in its original order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, drop_last=False)

In [None]:
class Roberta(RobertaModel):
    """Two-candidate scorer: one shared encoder plus one linear scoring head.

    Each candidate's token ids are encoded separately by ``self.roberta``; the
    vector at position 0 of the encoder's first output is passed through
    ``self.classifier`` and the per-candidate scores are concatenated along
    dim 1 to form the logits.

    NOTE(review): the parent ``RobertaModel.__init__`` presumably builds its
    own encoder weights in addition to ``self.roberta`` — confirm whether that
    second copy is intended, since ``forward`` only uses ``self.roberta``.

    Args:
        config: Model configuration; must also carry a custom ``nclass`` attribute.
        model_name: Checkpoint identifier passed to ``RobertaModel.from_pretrained``.
    """

    def __init__(self, config, model_name):
        super().__init__(config)
        self.roberta = RobertaModel.from_pretrained(model_name, config=config)
        self.hdim = config.hidden_size
        self.nclass = config.nclass
        # The head emits nclass - 1 values per candidate (one value when nclass is 2).
        self.classifier = nn.Linear(self.hdim, self.nclass-1)

    def forward(self, input_ids, input_ids2, attention_mask, attention_mask2, **kwargs):
        """Return the candidate scores concatenated along dim 1.

        Extra keyword arguments are accepted and ignored.
        """
        candidate_inputs = (
            (input_ids, attention_mask),
            (input_ids2, attention_mask2),
        )
        per_candidate = []
        for ids, mask in candidate_inputs:
            hidden_states = self.roberta(ids, attention_mask=mask)[0]
            # Position 0 along the second axis is the representation that gets scored.
            per_candidate.append(self.classifier(hidden_states[:, 0, :]))
        return torch.cat(per_candidate, dim=1)

# Load the checkpoint's configuration and attach the custom `nclass` field
# that Roberta.__init__ reads to size its classifier head.
config = RobertaConfig.from_pretrained(model_name)
config.nclass = 2

Downloading:   0%|          | 0.00/547 [00:00<?, ?B/s]

In [None]:
def train_epoch(epoch, model, train_loader, optimizer, scheduler):
    """Run one training pass over ``train_loader`` and print loss/accuracy.

    Args:
        epoch: Epoch index, used only in the printed summary.
        model: Module called as ``model(**batch)`` that returns class logits.
        train_loader: Iterable of ``(data, target)`` batches, where ``data`` is a
            mapping of tensors and ``target`` a tensor of class indices.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.

    Returns:
        float: Fraction of correctly classified training samples
        (0.0 when the loader yields no batches).

    Relies on the notebook-level ``device`` for tensor placement.
    """
    model.train()
    total_loss = 0.0
    cor = 0
    n_sample = 0
    s = time.time()
    criterion = nn.CrossEntropyLoss()

    for data, target in train_loader:
        item = {key: val.to(device) for key, val in data.items()}
        target = target.to(device)

        logits = model(**item)
        loss = criterion(logits, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        with torch.no_grad():
            preds = torch.argmax(logits, dim=-1)

        batch_n = len(target)
        # CrossEntropyLoss returns the batch *mean*; weight it by the batch size
        # so that dividing by the sample count below yields a per-sample average.
        total_loss += loss.item() * batch_n
        cor += (preds == target).sum().item()
        n_sample += batch_n

        # Running counter, overwritten in place via the carriage return.
        print(f"{cor}/{n_sample}", end='\r')

    # Guard against an empty loader (e.g. drop_last with too few samples).
    denom = max(n_sample, 1)
    loss_avg = total_loss / denom
    acc = cor / denom
    print(
        f"[Epoch {epoch}] Train loss: {loss_avg:.3f}, acc: {acc*100:.2f}, time: {time.time()-s:.1f}s"
    )
    return acc


def validate(epoch, model, val_loader, verbose=True):
    """Evaluate ``model`` on ``val_loader`` without gradient tracking.

    Args:
        epoch: Label used only in the printed summary.
        model: Module called as ``model(**batch)`` that returns class logits.
        val_loader: Iterable of ``(data, target)`` batches, where ``data`` is a
            mapping of tensors and ``target`` a tensor of class indices.
        verbose: When true, print the per-sample loss and the accuracy.

    Returns:
        tuple: ``(acc, pred_all)`` where ``acc`` is the fraction of correct
        predictions and ``pred_all`` is a 1-D tensor with the predicted class of
        every sample in loader order (empty when the loader yields no batches).

    Relies on the notebook-level ``device`` for tensor placement.
    """
    model.eval()
    total_loss = 0.0
    cor = 0
    n_sample = 0
    criterion = nn.CrossEntropyLoss()
    pred_all = []

    with torch.no_grad():
        for data, target in val_loader:
            item = {key: val.to(device) for key, val in data.items()}
            target = target.to(device)

            logits = model(**item)
            loss = criterion(logits, target)
            preds = torch.argmax(logits, dim=-1)
            pred_all.append(preds)

            batch_n = len(target)
            # CrossEntropyLoss returns the batch *mean*; weight it by the batch
            # size so a smaller final batch does not skew the per-sample average.
            total_loss += loss.item() * batch_n
            cor += (preds == target).sum().item()
            n_sample += batch_n

    # Guard against an empty loader instead of dividing by zero.
    denom = max(n_sample, 1)
    loss_avg = total_loss / denom
    acc = cor / denom
    if pred_all:
        pred_all = torch.cat(pred_all)
    else:
        # torch.cat rejects an empty list; return an empty prediction tensor.
        pred_all = torch.empty(0, dtype=torch.long, device=device)

    if verbose:
        print(f"[Epoch {epoch}] Valid loss: {loss_avg:.3f}, acc: {acc*100:.2f}")
    return acc, pred_all


def train(idx, num_epochs, lr, train_loader, val_loader):
    """Fine-tune a fresh ``Roberta`` model and keep its best checkpoint.

    A new model, AdamW optimizer and linear schedule (warm-up over the first
    tenth of all steps) are created on every call. After each epoch the model
    is validated; whenever validation accuracy strictly improves, the model is
    saved under ``save_dir/<idx>``.

    Args:
        idx: Identifier of this run; used in log lines and as the checkpoint folder name.
        num_epochs: Number of passes over ``train_loader``.
        lr: Learning rate handed to the optimizer.
        train_loader: Batches used for optimisation.
        val_loader: Batches used for model selection.
    """
    print(f"Start trining {idx}th model")
    model = Roberta(config, model_name).to(device)
    optimizer = AdamW(model.parameters(), lr=lr)

    # One scheduler step is taken per batch, so the schedule spans every batch of every epoch.
    total_steps = num_epochs * len(train_loader)
    scheduler = transformers.get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=total_steps // 10,
        num_training_steps=total_steps,
    )

    checkpoint_dir = os.path.join(save_dir, f'{idx}')
    best_acc = 0
    for epoch in range(num_epochs):
        train_epoch(epoch, model, train_loader, optimizer, scheduler)
        val_acc, _ = validate(epoch, model, val_loader)
        if val_acc > best_acc:
            best_acc = val_acc
            # Save the wrapped module when the model exposes one via `.module`.
            getattr(model, "module", model).save_pretrained(checkpoint_dir)

    print(f"Training finish! Best validation accuracy: {best_acc*100:.2f}\n")

In [None]:
# Hyperparameters passed to every train() call below.
lr = 8e-6
num_epochs = 10

In [None]:
# Train ten independent models; train() saves model i's best checkpoint under
# save_dir/<i>, which validate_ensemble later loads by the same index.
for i in range(10):
    train(i, num_epochs, lr, train_loader, val_loader)

Start trining 0th model


Downloading:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

[Epoch 0] Train loss: 0.038, acc: 63.44, time: 83.8s
[Epoch 0] Valid loss: 0.019, acc: 88.40
[Epoch 1] Train loss: 0.017, acc: 88.25, time: 83.4s
[Epoch 1] Valid loss: 0.017, acc: 91.80
[Epoch 2] Train loss: 0.006, acc: 96.26, time: 83.4s
[Epoch 2] Valid loss: 0.019, acc: 91.00
[Epoch 3] Train loss: 0.003, acc: 98.60, time: 83.3s
[Epoch 3] Valid loss: 0.024, acc: 91.60
[Epoch 4] Train loss: 0.002, acc: 99.35, time: 83.3s
[Epoch 4] Valid loss: 0.022, acc: 91.80
[Epoch 5] Train loss: 0.001, acc: 99.67, time: 83.3s
[Epoch 5] Valid loss: 0.024, acc: 92.20
[Epoch 6] Train loss: 0.001, acc: 99.84, time: 83.4s
[Epoch 6] Valid loss: 0.025, acc: 92.00
[Epoch 7] Train loss: 0.001, acc: 99.74, time: 83.3s
[Epoch 7] Valid loss: 0.024, acc: 91.60
[Epoch 8] Train loss: 0.000, acc: 99.84, time: 83.3s
[Epoch 8] Valid loss: 0.025, acc: 91.40
[Epoch 9] Train loss: 0.000, acc: 99.90, time: 83.3s
[Epoch 9] Valid loss: 0.025, acc: 91.60
Training finish! Best validation accuracy: 92.20

Start trining 1th mo

In [None]:
def validate_ensemble(val_loader, answer, idx_max=10, min_acc=0.85):
    """Majority-vote the saved models and print the ensemble accuracy.

    Checkpoints ``save_dir/0`` … ``save_dir/<idx_max - 1>`` are loaded one at a
    time and evaluated with ``validate``. Models whose accuracy is below
    ``min_acc`` are left out of the vote.

    Args:
        val_loader: Batches passed to ``validate`` for every model.
        answer: 1-D tensor of reference labels, aligned with the loader order.
        idx_max: Number of checkpoints to load (indices ``0 .. idx_max - 1``).
        min_acc: Minimum individual accuracy (as a fraction) a model needs in
            order to vote. Defaults to the previously hard-coded 0.85.

    Raises:
        RuntimeError: If no model reaches ``min_acc``, so there is nothing to vote with.

    Assumes binary 0/1 predictions: the vote is the mean prediction thresholded
    at 0.5, so an exact tie resolves to class 1.
    """
    pred_ensemble = []
    for idx in range(idx_max):
        model = Roberta.from_pretrained(os.path.join(save_dir, f'{idx}'), model_name)
        model.to(device)
        acc, pred_all = validate('best', model, val_loader, verbose=False)
        print(f"Load {idx}th model (acc: {acc*100:.2f})")
        if acc >= min_acc:
            pred_ensemble.append(pred_all)

    if not pred_ensemble:
        # torch.stack on an empty list fails with an opaque message; be explicit.
        raise RuntimeError(
            f"No model out of {idx_max} reached the minimum accuracy {min_acc}; "
            "cannot build an ensemble."
        )

    # Shape (n_samples, n_models): one column of predictions per voting model.
    pred_ensemble = torch.stack(pred_ensemble, dim=-1).float()
    pred_ensemble = (pred_ensemble.mean(-1) >= 0.5).long().to(answer.device)
    acc_ensemble = (pred_ensemble == answer).sum() / len(answer)
    print(f"\nEnsemble accuracy: {acc_ensemble*100:.2f}")

In [None]:
# Reference labels of the validation split as a tensor, then score the
# majority vote of the ten saved models against them.
answer = torch.tensor(val_dataset.labels)
validate_ensemble(val_loader, answer, idx_max=10)

Load 0th model (acc: 92.20)
Load 1th model (acc: 91.40)
Load 2th model (acc: 92.20)
Load 3th model (acc: 93.20)
Load 4th model (acc: 91.60)
Load 5th model (acc: 91.40)
Load 6th model (acc: 91.80)
Load 7th model (acc: 92.20)
Load 8th model (acc: 92.20)
Load 9th model (acc: 92.80)

Ensemble accuracy: 92.80
