In [None]:
%pip install transformers sentencepiece datasets asian-bart wandb datasets

In [None]:
import os

# Mount Google Drive only when "/content" does not already contain a "drive"
# entry, then switch the working directory to the project folder so that the
# relative paths used in later cells resolve from there.
content_entries = os.listdir("/content")
if "drive" not in content_entries:
    from google.colab import drive

    drive.mount('/content/drive')

os.chdir("/content/drive/MyDrive/NLP_Project_3")

In [None]:
!wandb login

fine_tuned_model_name = "tagged_back_translation_eng2kor"

import wandb
wandb.init(project = "Goorm_3rd_project", entity = "2nd_group", name = fine_tuned_model_name)

In [None]:
import pandas as pd
import numpy as np
import torch
import json
import datasets
import random

from transformers import MBartForConditionalGeneration, MBartTokenizer, DataCollatorForSeq2Seq, AutoTokenizer, get_cosine_schedule_with_warmup
from sklearn.model_selection import train_test_split
from collections import defaultdict, Counter, deque
from tqdm import tqdm

from asian_bart import AsianBartTokenizer, AsianBartForConditionalGeneration
from transformers.models.bart.modeling_bart import shift_tokens_right

SEED = 20220819
BACKBONE = "hyunwoongko/asian-bart-ecjk"

# Ask cuDNN for deterministic kernels and turn off its autotuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Seed every random number generator used in this notebook with the same value
# (torch CPU, torch CUDA current device, torch CUDA all devices, numpy, random).
for _seed_fn in (
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
    np.random.seed,
    random.seed,
):
    _seed_fn(SEED)

# Use the first GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
if DEVICE.type == "cuda":
    torch.cuda.set_device(DEVICE)
print(DEVICE)

# Load the previously fine-tuned eng->kor checkpoint, switch it to training
# mode and move it onto the selected device.
model = (
    AsianBartForConditionalGeneration
    .from_pretrained("../Model/large_batch_eng2kor")
    .train()
    .to(DEVICE)
)

# The tokenizer comes from the backbone repository, configured for
# English source text and Korean target text.
tokenizer = AutoTokenizer.from_pretrained(BACKBONE, src_lang="en_XX", tgt_lang="ko_KR")

# Count the trainable parameters (those with requires_grad set).
model_parameters = (p for p in model.parameters() if p.requires_grad)
params = sum(np.prod(p.size()) for p in model_parameters)
print("# of params in model :", params)


# Add a new token to the tokenizer and resize the model's embedding matrix to match.
# This uses the method known as tagged back-translation. It is very easy to apply and
# appears to work well, so we are trying it here.
# (Reference: https://kh-kim.github.io/blog/2020/09/30/Back-Translation-Review.html)
# Its advantage is that the <tag> token lets us control the type of sentence being translated.
# At decoding time, feeding the source sentence with <tag> prepended yields
# neologism-flavoured(?) output.
# In the cell below that loads translate_data, <tag> is prepended to every sentence, which
# inserts this token into the back-translated sentences. (It is NOT inserted into the parallel data.)

tokenizer.add_tokens("<tag>")
vocab_size_with_tag = len(tokenizer)
model.resize_token_embeddings(vocab_size_with_tag)

cuda:0


Downloading config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

Downloading sentencepiece.bpe.model:   0%|          | 0.00/1.24M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.24M [00:00<?, ?B/s]

# of params in model : 413730816


Embedding(57548, 1024)

In [None]:
# Build the training frame: a sample of the original parallel corpus plus the
# back-translated sentences, the latter marked with a leading "<tag>".
#
# NOTE(review): `utils` is not imported in any visible cell — confirm it is
# imported (e.g. from the project folder) before this cell runs.
# NOTE(review): this path is "./RawData" while the CSV below is read from
# "../RawData" — confirm which location is intended.
original_data = utils.load_parallel("./RawData")

# Back-translated data: Korean sentences with machine-generated English.
# Rename the columns to the "ko"/"en" names used downstream, label the rows,
# and drop the columns that are not needed for training.
translation = pd.read_csv("../RawData/generated_eng_data_from_kor.csv")
translation = translation.rename({"sentence" : "ko",
                                  "inferenced" : "en"},
                                 axis = "columns")
translation.loc[:, "type"] = "new_word"
translate_data = translation.drop(["nw", "tag"], axis = "columns")

# Prefix every back-translated English sentence with the tag token.
translate_data.loc[:, "en"] = "<tag>" + translate_data.en

# Fix: the original sampled from `original_train`, a name that no visible cell
# defines (NameError on a fresh kernel). Sample from the frame loaded above.
# Assumes utils.load_parallel returns a pandas DataFrame — TODO confirm; if a
# train/validation split of the parallel data was intended, apply it here.
# The sample has as many rows as the back-translated data, drawn without replacement.
sampled_train = original_data.sample(n = len(translate_data), random_state = SEED, replace = False).reset_index(drop = True)
total_train = pd.concat([sampled_train, translate_data]).reset_index(drop = True)

In [None]:
batch_size = 16
collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    return_tensors="pt",
)

# Hold out 30% of the combined frame for validation (seeded split).
train_pd, valid_pd = train_test_split(total_train, test_size=.3, random_state=SEED)

# Validation uses twice the training batch size.
# NOTE(review): the meaning of the positional `True` flag is defined in
# utils.get_dataset, which is not visible here — verify (it is passed for both splits).
train_data = utils.get_dataset(train_pd, tokenizer, collator, batch_size, True, "en", "ko")
valid_data = utils.get_dataset(valid_pd, tokenizer, collator, 2 * batch_size, True, "en", "ko")

  0%|          | 0/174 [00:00<?, ?ba/s]

  0%|          | 0/75 [00:00<?, ?ba/s]

In [None]:
# Training hyper-parameters.
epochs = 3
learning_rate = 1e-4

# AdamW over all model parameters (created after the embedding resize above,
# so the resized embedding matrix is included).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    weight_decay=0.02,
    eps=1e-6,
)

In [None]:
# Record the run's hyper-parameters in the Weights & Biases run config.
wandb_config = dict(
    learning_rate=learning_rate,
    batch_size=batch_size,
    backbone=BACKBONE,
    epochs=epochs,
)

wandb.config.update(wandb_config)

In [None]:
# Mixed-precision training loop with periodic validation and early stopping.
#
# Every `valid_check_period` optimizer steps the model is evaluated on
# `valid_data`; the mean validation loss is handed to `early_stopping`, and
# training halts as soon as its `early_stop` flag is set.
scaler = torch.cuda.amp.GradScaler()
wandb.watch(model, log = "all", log_freq = 500)

valid_check_period = 5000  # number of training steps between validation runs
# NOTE(review): `utils` is not imported in any visible cell — confirm it is in scope.
early_stopping = utils.EarlyStopping(path = "../Model/tagged_bt_eng2kor_checkpoint", patience = 1, verbose = True)
halt = False

step = 0  # global step counter across epochs
for epoch in range(epochs) :
    cum_loss = deque(maxlen = 20)  # window of the last 20 losses, used for the smoothed display
    curr_loss = []                 # every loss of the current epoch, used for the epoch summary

    with tqdm(train_data, unit = " batch") as tepoch :
        # The description only depends on the epoch, so set it once per epoch.
        tepoch.set_description(f"Train Epoch {epoch}")
        model.train()

        for batch in tepoch :
            step += 1
            optimizer.zero_grad()

            batch = {k : v.to(DEVICE) for k, v in batch.items()}

            with torch.cuda.amp.autocast() :
                outputs = model(**batch)
                loss = outputs["loss"]

            # Scale the loss before backward; the scaler then steps the
            # optimizer and updates its scale factor.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            # NOTE: no LR scheduler is stepped here (the call was commented out
            # in the original), so the optimizer keeps the learning rate it was
            # created with.

            # Read the scalar once; each .item() call synchronizes with the device.
            loss_value = loss.item()
            cum_loss.append(loss_value)
            curr_loss.append(loss_value)

            del batch, outputs, loss

            smoothed_loss = sum(cum_loss) / len(cum_loss)
            tepoch.set_postfix(loss = smoothed_loss)

            # param_groups exposes the current LR directly; building a full
            # optimizer.state_dict() every step is unnecessary work.
            wandb.log({"train_loss" : smoothed_loss,
                       "lr" : optimizer.param_groups[0]["lr"],
                       "train_step" : step})

            if step % valid_check_period == 0 :
                model.eval()
                val_losses = []
                with torch.no_grad() :
                    for val_batch in valid_data :
                        val_batch = {k : v.to(DEVICE) for k, v in val_batch.items()}
                        with torch.cuda.amp.autocast() :
                            val_outputs = model(**val_batch)
                            val_loss = val_outputs["loss"]
                        val_losses.append(val_loss.item())

                        del val_batch, val_outputs, val_loss

                # Unweighted mean over validation batches.
                mean_val_loss = sum(val_losses) / len(val_losses)

                wandb.log({"valid_loss" : mean_val_loss,
                           "valid_step" : step // valid_check_period})

                early_stopping(mean_val_loss, model)

                if early_stopping.early_stop :
                    print("Early stopping")
                    halt = True
                    break

                # Training continues: switch back from eval mode.
                model.train()

        # Epoch summary; skipped when the loader produced no batches so the
        # mean does not divide by zero.
        if curr_loss :
            print("Train loss : ", sum(curr_loss) / len(curr_loss))

    if halt :
        break
    curr_loss.clear()
    cum_loss.clear()

In [None]:
# Export the fine-tuned model together with its tokenizer.
# The tokenizer was extended with the "<tag>" token and the model's embedding
# matrix was resized to match, so the backbone tokenizer alone no longer has
# the same vocabulary size as this checkpoint; saving both keeps the export
# loadable without repeating the add_tokens step.
# NOTE(review): this saves the weights as they are when training ends. The
# early-stopping helper was given the path "../Model/tagged_bt_eng2kor_checkpoint";
# if it stores a better checkpoint there, confirm which one should be exported.
model.save_pretrained("../Model/tagged_bt_eng2kor")
tokenizer.save_pretrained("../Model/tagged_bt_eng2kor")