In [None]:
# Install the third-party packages this notebook depends on into the kernel's environment.
# NOTE(review): versions are unpinned, so a later run may resolve different releases — consider pinning.
%pip install transformers sentencepiece datasets asian-bart

In [None]:
import os

# Mount Google Drive only when /content has no "drive" entry yet, so that
# re-running this cell does not trigger a second mount.
drive_already_mounted = "drive" in os.listdir("/content")
if not drive_already_mounted:
    from google.colab import drive
    drive.mount('/content/drive')

# Later cells use relative paths ("../Model", "../RawData"); they resolve
# against this working directory.
os.chdir("/content/drive/MyDrive/NLP_Project_3")

In [None]:
import pandas as pd
import numpy as np
import torch
import json
import datasets
import random

from transformers import DataCollatorForSeq2Seq, AutoTokenizer
from collections import defaultdict, Counter, deque
from tqdm import tqdm
from asian_bart import AsianBartTokenizer, AsianBartForConditionalGeneration

# Run-wide constants: the RNG seed and the pretrained backbone identifier
# (used further down to load the tokenizer).
SEED = 20220819
BACKBONE = "hyunwoongko/asian-bart-ecjk"

# Ask cuDNN for deterministic kernels and switch off its autotuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Seed every random number generator in use (Python, NumPy, PyTorch CPU/CUDA).
for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(SEED)

# Use the first GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
if DEVICE.type == "cuda":
    torch.cuda.set_device(DEVICE)
print(DEVICE)

# Load the locally saved checkpoint and place it on the selected device.
# NOTE(review): the directory name suggests a Korean->English fine-tune — confirm.
model = AsianBartForConditionalGeneration.from_pretrained("../Model/large_batch_kor2eng/").to(DEVICE)
# The model is left in training mode here; the generation cell calls
# model.eval() before producing any output.
model.train()

# Tokenizer of the pretrained backbone, configured with Korean as the source
# language code and English as the target language code.
tokenizer = AutoTokenizer.from_pretrained(
    BACKBONE,
    src_lang="ko_KR",
    tgt_lang="en_XX",
)

In [None]:
# Read the monolingual corpus, keep only sentences shorter than 100 characters,
# and expose the text column under the name "ko".
monolingual_raw = pd.read_csv("../RawData/monolingual.csv")
is_short_sentence = monolingual_raw["sentence"].str.len() < 100
back_translation_data = monolingual_raw[is_short_sentence].rename(columns={"sentence": "ko"})

In [None]:
# Number of sentences handed to the dataset builder per batch.
batch_size = 32

# Collator that returns PyTorch tensors ("pt") for seq2seq batches.
collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    return_tensors="pt",
)

# NOTE(review): `utils` is not imported in any visible cell; on a fresh kernel
# this line raises NameError unless it is imported elsewhere — confirm.
# The meaning of the positional False / "ko" / None arguments is defined by
# utils.get_dataset; "ko" presumably names the source-text column — TODO confirm.
inference_data = utils.get_dataset(
    back_translation_data,
    tokenizer,
    collator,
    batch_size,
    False,
    "ko",
    None,
)

In [None]:
# Generate one decoded string per input row with beam search.
model.eval()  # switch to inference mode before generating

# Generation settings are identical for every batch, so build them once
# instead of re-evaluating the language-code lookup inside the loop.
generation_kwargs = dict(
    max_length = 100,
    num_beams = 7,
    no_repeat_ngram_size = 2,
    decoder_start_token_id = tokenizer.lang_code_to_id["en_XX"],
)

# Mixed precision only makes sense on the GPU; on the CPU fallback the
# autocast context is entered but disabled, which avoids the warning that
# the CUDA-specific autocast emits when no GPU is present.
use_amp = DEVICE.type == "cuda"

generated = []
with torch.no_grad(), tqdm(inference_data, unit = " batch") as tepoch :
    for inference_batch in tepoch :
        # Every value of the batch dict is moved to the target device.
        inference_batch = {k : v.to(DEVICE) for k, v in inference_batch.items()}
        with torch.autocast(device_type = DEVICE.type, enabled = use_amp) :
            inference_output = model.generate(inference_batch["input_ids"], **generation_kwargs)
        # Decoded strings are appended in batch order, so `generated` lines up
        # with the iteration order of `inference_data`.
        generated.extend(
            tokenizer.batch_decode(inference_output, skip_special_tokens = True, clean_up_tokenization_spaces = True)
        )

In [None]:
# Store the raw decoded strings next to their source sentences.
back_translation_data.loc[:, "inferenced"] = generated

# Normalise the decoded text: drop every ordinary space, turn each "▁" into a
# space (presumably the SentencePiece word-boundary marker — confirm), then
# trim leading/trailing whitespace.
inferenced_text = back_translation_data["inferenced"].str.replace(" ", '')
inferenced_text = inferenced_text.str.replace("▁", ' ')
back_translation_data.loc[:, "inferenced"] = inferenced_text.str.strip()

In [None]:
# Persist the source sentences together with their generated text; the
# DataFrame index is not written to the file.
back_translation_data.to_csv(
    path_or_buf="../RawData/generated_eng_data_from_kor.csv",
    index=False,
)