## [문법성 판단] 양서연 (2021-33205)

In [1]:
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import transformers
import pandas as pd
from transformers import AutoTokenizer
from transformers import ElectraConfig, ElectraModel
from transformers import AdamW
import time
import argparse

transformers.logging.set_verbosity(40) # Turn off warning

In [2]:
save_dir = './result_gram'

### Load Grammer data
- Train, Dev 데이터가 base_path에 들어있어야 합니다. (default: './data')

In [3]:
def load_data(path, tokenizer):
    dataset = pd.read_csv(path, delimiter='\t', names=['source', 'acceptability_label', 'source_annotation', 'sentence'], header=0)
    dataset["label"] = dataset["acceptability_label"].astype(int)
    
    sentence = dataset['sentence'].tolist()
    
    tokenized = tokenizer(sentence,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=150,
    add_special_tokens=True,
    return_token_type_ids = True)
    
    return dataset, tokenized


class TensorDataset(Dataset):
    def __init__(self, tokenized_dataset, labels):
        self.tokenized_dataset = tokenized_dataset
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.tokenized_dataset.items()}
        label = self.labels[idx]
        return item, label

    def __len__(self):
        return len(self.labels)

In [4]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

model_type = "Electra"
#size = 'large'
#model_name = f"klue/roberta-{size}"
model_name = f"monologg/koelectra-base-v3-discriminator"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [5]:
base_path = '/home/stella/NIKL-KLUE/Cola/cola_data_results/data'
train_dataset, train_tokenized = load_data(os.path.join(base_path, 'NIKL_CoLA_train.tsv'), tokenizer)
val_dataset, val_tokenized = load_data(os.path.join(base_path, 'NIKL_CoLA_dev.tsv'), tokenizer)

train_dataset = TensorDataset(train_tokenized, train_dataset['label'])
val_dataset = TensorDataset(val_tokenized, val_dataset['label'])

batch_size = 32

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [6]:
# 데이터 예시
tokenizer.decode(train_tokenized['input_ids'][0])

'[CLS] 높은 달이 떴다. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

### Load pretrained model
- 모델은 monologg/koelectra-base-v3-discriminator를 사용하였습니다. (https://github.com/monologg/KoELECTRA) 
- cls token에 해당하는 hidden feature에 linear classifier를 추가했습니다.

In [7]:
class Electra(ElectraModel):
    # Add classification layer to Roberta model
    def __init__(self, config, model_name):
        super(Electra, self).__init__(config)
        self.electra = ElectraModel.from_pretrained(model_name, config=config)
        self.hdim = config.hidden_size
        self.nclass = config.nclass
        self.classifier = nn.Linear(self.hdim, self.nclass)

    def forward(self, input_ids, attention_mask, **kwargs):
        outputs = self.electra(input_ids, attention_mask=attention_mask)
        h = outputs[0][:, 0, :]
        logits = self.classifier(h)
        return logits


config = ElectraConfig.from_pretrained(model_name)
config.nclass = 2

### Finetune model
- Size: **Large** (\~85%) / Base (\~79%)
- Epoch: 10
- warm up: 10% training step (없으면 불안정)
- Learning rate: 1e-5, **8e-6**, 5e-6 (큰 차이는 없으나, 커지면 불안정)
- Batch size: **5**, 20, 60
- Finetuning: **All**, Only classifier (\~57%)
- classifier: 전체를 업데이트하는 경우 multi-layer 효과 작음

In [8]:
def train_epoch(epoch, model, train_loader, optimizer, scheduler):
    model.train()
    total_loss = 0
    cor = 0
    n_sample = 0
    s = time.time()
    criterion = nn.CrossEntropyLoss()

    for data, target in train_loader:
        item = {key: val.to(device) for key, val in data.items()}
        target = target.to(device)

        logits = model(**item)
        loss = criterion(logits, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        with torch.no_grad():
            preds = torch.argmax(logits, dim=-1)

        total_loss += loss.item()
        cor += (preds == target).sum().item()
        n_sample += len(target)

        print(f"{cor}/{n_sample}", end='\r')

    loss_avg = total_loss / n_sample
    acc = cor / n_sample
    print(
        f"[Epoch {epoch}] Train loss: {loss_avg:.3f}, acc: {acc*100:.2f}, time: {time.time()-s:.1f}s"
    )
    return acc


def validate(epoch, model, val_loader, verbose=True):
    model.eval()
    total_loss = 0
    cor = 0
    n_sample = 0
    criterion = nn.CrossEntropyLoss()
    pred_all = []
    
    with torch.no_grad():
        for data, target in val_loader:
            item = {key: val.to(device) for key, val in data.items()}
            target = target.to(device)

            logits = model(**item)
            loss = criterion(logits, target)
            preds = torch.argmax(logits, dim=-1)
            pred_all.append(preds)

            total_loss += loss.item()
            cor += (preds == target).sum().item()
            n_sample += len(target)

    loss_avg = total_loss / n_sample
    acc = cor / n_sample
    pred_all = torch.cat(pred_all)
    
    if verbose:
        print(f"[Epoch {epoch}] Valid loss: {loss_avg:.3f}, acc: {acc*100:.2f}")
    return acc, pred_all


def train(idx, num_epochs, lr, train_loader, val_loader):
    print(f"Start trining {idx}th model")
    model = Electra(config, model_name).to(device)
    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = transformers.get_scheduler("linear",
                                           optimizer=optimizer,
                                           num_warmup_steps=num_epochs * len(train_loader) // 10,
                                           num_training_steps=num_epochs * len(train_loader))
    best_acc = 0
    for epoch in range(num_epochs):
        train_acc = train_epoch(epoch, model, train_loader, optimizer, scheduler)
        val_acc, _ = validate(epoch, model, val_loader)
        if val_acc > best_acc:
            best_acc = val_acc

            model_to_save = model.module if hasattr(model, "module") else model
            model_to_save.save_pretrained(os.path.join(save_dir, f'{idx}'))
            
    print(f"Training finish! Best validation accuracy: {best_acc*100:.2f}\n")

In [9]:
lr = 1e-6 #8e-6
num_epochs = 10

####  최적 세팅에서 10번 반복한 결과 74% \~ 75% 의 결과를 얻음
- 각 실험은 1시간 정도 소요 (6min/epoch) 
- (참고) jupyter를 서버에서 돌려서 컴퓨터 연결이 끊겼을때 print가 잘 되지 않은 경우가 있으나, 모델 학습 및 저장은 이상 없음.

In [10]:
for i in range(10):
    train(i, num_epochs, lr, train_loader, val_loader)

Start trining 0th model
[Epoch 0] Train loss: 0.022, acc: 52.40, time: 90.8s
[Epoch 0] Valid loss: 0.021, acc: 64.03
[Epoch 1] Train loss: 0.020, acc: 67.38, time: 90.5s
[Epoch 1] Valid loss: 0.018, acc: 71.26
[Epoch 2] Train loss: 0.017, acc: 72.66, time: 90.2s
[Epoch 2] Valid loss: 0.017, acc: 72.79
[Epoch 3] Train loss: 0.016, acc: 74.96, time: 90.3s
[Epoch 3] Valid loss: 0.017, acc: 73.57
[Epoch 4] Train loss: 0.016, acc: 76.03, time: 90.1s
[Epoch 4] Valid loss: 0.017, acc: 74.02
[Epoch 5] Train loss: 0.015, acc: 77.48, time: 90.3s
[Epoch 5] Valid loss: 0.016, acc: 74.21
[Epoch 6] Train loss: 0.015, acc: 77.55, time: 90.2s
[Epoch 6] Valid loss: 0.017, acc: 74.16
[Epoch 7] Train loss: 0.015, acc: 77.75, time: 85.0s
[Epoch 7] Valid loss: 0.017, acc: 74.31
[Epoch 8] Train loss: 0.015, acc: 78.28, time: 72.2s
[Epoch 8] Valid loss: 0.017, acc: 74.21
[Epoch 9] Train loss: 0.015, acc: 78.27, time: 80.4s
[Epoch 9] Valid loss: 0.017, acc: 74.36
Training finish! Best validation accuracy: 74.

[Epoch 1] Valid loss: 0.017, acc: 71.56
[Epoch 2] Train loss: 0.017, acc: 73.53, time: 90.1s
[Epoch 2] Valid loss: 0.017, acc: 73.23
[Epoch 3] Train loss: 0.016, acc: 76.05, time: 90.0s
[Epoch 3] Valid loss: 0.016, acc: 74.41
[Epoch 4] Train loss: 0.015, acc: 77.02, time: 83.3s
[Epoch 4] Valid loss: 0.016, acc: 74.21
[Epoch 5] Train loss: 0.015, acc: 77.78, time: 72.2s
[Epoch 5] Valid loss: 0.016, acc: 74.90
[Epoch 6] Train loss: 0.015, acc: 77.95, time: 82.0s
[Epoch 6] Valid loss: 0.016, acc: 74.85
[Epoch 7] Train loss: 0.015, acc: 78.62, time: 90.0s
[Epoch 7] Valid loss: 0.016, acc: 75.05
[Epoch 8] Train loss: 0.014, acc: 78.71, time: 90.2s
[Epoch 8] Valid loss: 0.016, acc: 75.10
[Epoch 9] Train loss: 0.014, acc: 79.32, time: 90.1s
[Epoch 9] Valid loss: 0.016, acc: 75.10
Training finish! Best validation accuracy: 75.10

Start trining 9th model
[Epoch 0] Train loss: 0.022, acc: 50.61, time: 89.9s
[Epoch 0] Valid loss: 0.021, acc: 65.75
[Epoch 1] Train loss: 0.019, acc: 67.39, time: 90

### Test models (validation: 74.85%)
- 최종 모델은 위 개별 모델을 ensemble해서 얻음 
- Ensemble에서 학습 성능 하위 74% 모델들은 배제

In [11]:
def validate_ensemble(val_loader, answer, idx_max=10):
    pred_ensemble = []
    for idx in range(idx_max):
        model = Electra.from_pretrained(os.path.join(save_dir, f'{idx}'), model_name)
        model.to(device)
        acc, pred_all = validate('best', model, val_loader, verbose=False)
        print(f"Load {idx}th model (acc: {acc*100:.2f})")
        if acc >= 0.74:
            pred_ensemble.append(pred_all)
        
    pred_ensemble = torch.stack(pred_ensemble, dim=-1).float()
    pred_ensemble = (pred_ensemble.mean(-1) >= 0.5).long().to(answer.device)
    acc_ensemble = (pred_ensemble == answer).sum() / len(answer)
    print(f"\nEnsemble accuracy: {acc_ensemble*100:.2f}")

In [12]:
answer = torch.tensor(val_dataset.labels)

validate_ensemble(val_loader, answer, idx_max=10)

Load 0th model (acc: 74.36)
Load 1th model (acc: 74.51)
Load 2th model (acc: 75.05)
Load 3th model (acc: 74.41)
Load 4th model (acc: 75.15)
Load 5th model (acc: 74.61)
Load 6th model (acc: 75.00)
Load 7th model (acc: 75.25)
Load 8th model (acc: 75.10)
Load 9th model (acc: 75.15)

Ensemble accuracy: 74.85


### Binary loss + 3 layer
Binay cross entropy loss 를 사용하고
fine tunning 하는 structure 를 layer 를 1->3 로 올렸습니다.

In [41]:
class Electra2(ElectraModel):
    # Add classification layer to Roberta model
    def __init__(self, config, model_name):
        super(Electra2, self).__init__(config)
        self.electra = ElectraModel.from_pretrained(model_name, config=config)
        self.hdim = config.hidden_size
        self.nclass = config.nclass
        self.layer1 = nn.Linear(self.hdim, 256)
        self.layer2 = nn.Linear(256, 128)
        self.layer3 = nn.Linear(128, self.nclass)

    def forward(self, input_ids, attention_mask, **kwargs):
        outputs = self.electra(input_ids, attention_mask=attention_mask)
        h = outputs[0][:, 0, :]
        h = self.layer1(h)
        h = self.layer2(h)
        logits = self.layer3(h)
        return logits


config = ElectraConfig.from_pretrained(model_name)
config.nclass = 2

In [42]:
def train_epoch2(epoch, model, train_loader, optimizer, scheduler):
    model.train()
    total_loss = 0
    cor = 0
    n_sample = 0
    s = time.time()
    criterion = nn.BCELoss()

    for data, target in train_loader:
        item = {key: val.to(device) for key, val in data.items()}
        target = target.to(device)

        logits = model(**item)
        loss = criterion(logits, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        with torch.no_grad():
            preds = torch.argmax(logits, dim=-1)

        total_loss += loss.item()
        cor += (preds == target).sum().item()
        n_sample += len(target)

        print(f"{cor}/{n_sample}", end='\r')

    loss_avg = total_loss / n_sample
    acc = cor / n_sample
    print(
        f"[Epoch {epoch}] Train loss: {loss_avg:.3f}, acc: {acc*100:.2f}, time: {time.time()-s:.1f}s"
    )
    return acc


def validate2(epoch, model, val_loader, verbose=True):
    model.eval()
    total_loss = 0
    cor = 0
    n_sample = 0
    criterion = nn.BCELoss()
    pred_all = []
    
    with torch.no_grad():
        for data, target in val_loader:
            item = {key: val.to(device) for key, val in data.items()}
            target = target.to(device)

            logits = model(**item)
            loss = criterion(logits, target)
            preds = torch.argmax(logits, dim=-1)
            pred_all.append(preds)

            total_loss += loss.item()
            cor += (preds == target).sum().item()
            n_sample += len(target)

    loss_avg = total_loss / n_sample
    acc = cor / n_sample
    pred_all = torch.cat(pred_all)
    
    if verbose:
        print(f"[Epoch {epoch}] Valid loss: {loss_avg:.3f}, acc: {acc*100:.2f}")
    return acc, pred_all


def train2(idx, num_epochs, lr, train_loader, val_loader):
    print(f"Start trining {idx}th model")
    model = Electra2(config, model_name).to(device)
    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = transformers.get_scheduler("linear",
                                           optimizer=optimizer,
                                           num_warmup_steps=num_epochs * len(train_loader) // 10,
                                           num_training_steps=num_epochs * len(train_loader))
    best_acc = 0
    for epoch in range(num_epochs):
        train_acc = train_epoch(epoch, model, train_loader, optimizer, scheduler)
        val_acc, _ = validate(epoch, model, val_loader)
        if val_acc > best_acc:
            best_acc = val_acc

            model_to_save = model.module if hasattr(model, "module") else model
            model_to_save.save_pretrained(os.path.join(save_dir, f'{idx}_2'))
            
    print(f"Training finish! Best validation accuracy: {best_acc*100:.2f}\n")

In [43]:
lr = 1e-6 #8e-6
num_epochs = 10

In [48]:
!ls result_gram

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
0  0_2	1  1_2	2  2_2	3  3_2	4  4_2	5  5_2	6  6_2	7  7_2	8  8_2	9  9_2


In [44]:
for i in range(10):
    train2(i, num_epochs, lr, train_loader, val_loader)

Start trining 0th model
[Epoch 0] Train loss: 0.022, acc: 53.44, time: 43.6s
[Epoch 0] Valid loss: 0.021, acc: 66.83
[Epoch 1] Train loss: 0.020, acc: 68.96, time: 44.3s
[Epoch 1] Valid loss: 0.018, acc: 72.19
[Epoch 2] Train loss: 0.017, acc: 73.82, time: 43.0s
[Epoch 2] Valid loss: 0.017, acc: 73.08
[Epoch 3] Train loss: 0.016, acc: 75.93, time: 43.8s
[Epoch 3] Valid loss: 0.016, acc: 73.23
[Epoch 4] Train loss: 0.015, acc: 77.19, time: 44.1s
[Epoch 4] Valid loss: 0.016, acc: 74.51
[Epoch 5] Train loss: 0.015, acc: 77.87, time: 44.1s
[Epoch 5] Valid loss: 0.017, acc: 74.61
[Epoch 6] Train loss: 0.015, acc: 78.48, time: 44.4s
[Epoch 6] Valid loss: 0.016, acc: 74.85
[Epoch 7] Train loss: 0.015, acc: 79.00, time: 44.8s
[Epoch 7] Valid loss: 0.017, acc: 74.85
[Epoch 8] Train loss: 0.014, acc: 79.33, time: 43.9s
[Epoch 8] Valid loss: 0.017, acc: 75.00
[Epoch 9] Train loss: 0.014, acc: 79.04, time: 44.0s
[Epoch 9] Valid loss: 0.017, acc: 75.05
Training finish! Best validation accuracy: 75.

[Epoch 1] Valid loss: 0.018, acc: 71.85
[Epoch 2] Train loss: 0.017, acc: 74.14, time: 43.4s
[Epoch 2] Valid loss: 0.017, acc: 72.44
[Epoch 3] Train loss: 0.016, acc: 76.13, time: 43.7s
[Epoch 3] Valid loss: 0.017, acc: 73.38
[Epoch 4] Train loss: 0.015, acc: 77.54, time: 44.1s
[Epoch 4] Valid loss: 0.017, acc: 74.41
[Epoch 5] Train loss: 0.015, acc: 78.02, time: 44.1s
[Epoch 5] Valid loss: 0.017, acc: 74.46
[Epoch 6] Train loss: 0.015, acc: 78.55, time: 44.1s
[Epoch 6] Valid loss: 0.017, acc: 74.46
[Epoch 7] Train loss: 0.014, acc: 78.91, time: 43.8s
[Epoch 7] Valid loss: 0.017, acc: 74.75
[Epoch 8] Train loss: 0.014, acc: 79.31, time: 43.2s
[Epoch 8] Valid loss: 0.017, acc: 74.75
[Epoch 9] Train loss: 0.014, acc: 79.54, time: 43.3s
[Epoch 9] Valid loss: 0.017, acc: 74.75
Training finish! Best validation accuracy: 74.75

Start trining 9th model
[Epoch 0] Train loss: 0.022, acc: 52.78, time: 44.1s
[Epoch 0] Valid loss: 0.021, acc: 67.13
[Epoch 1] Train loss: 0.020, acc: 69.53, time: 43

### Test models (validation: 75%)
- 최종 모델은 위 개별 모델을 ensemble해서 얻음 
- Ensemble에서 학습 성능 하위 74% 모델들은 배제

In [1]:
def validate_ensemble2(val_loader, answer, idx_max=10):
    pred_ensemble = []
    for idx in range(idx_max):
        model = Electra2.from_pretrained(os.path.join(save_dir, f'{idx}_2'), model_name)
        model.to(device)
        acc, pred_all = validate('best', model, val_loader, verbose=False)
        print(f"Load {idx}th model (acc: {acc*100:.2f})")
        if acc >= 0.74:
            pred_ensemble.append(pred_all)
        
    pred_ensemble = torch.stack(pred_ensemble, dim=-1).float()
    pred_ensemble = (pred_ensemble.mean(-1) >= 0.5).long().to(answer.device)
    acc_ensemble = (pred_ensemble == answer).sum() / len(answer)
    print(f"\nEnsemble accuracy: {acc_ensemble*100:.2f}")

In [50]:
answer = torch.tensor(val_dataset.labels)

validate_ensemble2(val_loader, answer, idx_max=10)

Load 0th model (acc: 75.05)
Load 1th model (acc: 74.70)
Load 2th model (acc: 74.75)
Load 3th model (acc: 75.10)
Load 4th model (acc: 74.80)
Load 5th model (acc: 74.56)
Load 6th model (acc: 75.15)
Load 7th model (acc: 74.75)
Load 8th model (acc: 74.75)
Load 9th model (acc: 75.00)

Ensemble accuracy: 75.05
