## install and load

In [18]:
!pip install transformers
!pip install sentencepiece
!pip install wandb
!pip install torch



In [2]:
from typing import Dict, List
import argparse
import csv
import os
import random
from tqdm.notebook import tqdm
import wandb
from easydict import EasyDict as edict
import numpy as np

from IPython.display import display
import ipywidgets as widgets

from transformers import (
    EncoderDecoderModel,
    #GPT2Tokenizer as BaseGPT2Tokenizer,
    PreTrainedTokenizer,
    PreTrainedTokenizerFast,
    BertTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Trainer,

)

import torch

from transformers.models.encoder_decoder.modeling_encoder_decoder import EncoderDecoderModel


## Setting

In [3]:
#%cd /content/drive/MyDrive/GoormProject/GoormProject3

In [4]:
args = edict({'do_wandb' : False,
              'w_project': 'NMT_enko',
              'w_entity': 'goorm-project-nlp-team-1', # WandB ID
              'learning_rate': 2e-4,
              'batch_size': 16,
              'accumulate': 16,
              'epochs': 10,
              'seed': 42,
              'src_pt' : 'bert-base-cased',
              'trg_pt': 'skt/kogpt2-base-v2', 
              'max_length': 512,
              'earlystopping' : True,
              'warmup_proportion' : 0.1,
              'patience' : 0.5,
              })
args['NAME'] = ''f'{args.w_project}_{random.randrange(0, 1024)}'
print(args.NAME)

NMT_enko_999


In [5]:
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    torch.backends.cudnn.benchmark = True  # type: ignore

seed_everything(args.seed)

## Dataset

In [6]:
class PairedDataset:
    def __init__(self, 
        src_tokenizer: PreTrainedTokenizer, tgt_tokenizer: PreTrainedTokenizer,
        file_path: str
    ):
        self.src_tokenizer = src_tokenizer
        self.trg_tokenizer = tgt_tokenizer
        with open(file_path, 'r') as fd:
            self.data = [row[1:] for row in csv.reader(fd)][1:]

    def __getitem__(self, index: int) -> Dict[str, torch.Tensor]:
        src, trg = self.data[index]
        embeddings = self.src_tokenizer(src, return_attention_mask=False, return_token_type_ids=False)
        embeddings['labels'] = self.trg_tokenizer(trg, return_attention_mask=False)['input_ids']

        return embeddings

    def __len__(self):
        return len(self.data)


## Tokenizer, dataset, model

In [7]:
class KoGPT2Tokenizer(PreTrainedTokenizerFast):
    def build_inputs_with_special_tokens(self, token_ids: List[int], _) -> List[int]:
        return token_ids + [self.eos_token_id]

In [8]:
src_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
trg_tokenizer = KoGPT2Tokenizer.from_pretrained('skt/kogpt2-base-v2')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'KoGPT2Tokenizer'.


In [9]:
dataset = PairedDataset(src_tokenizer, trg_tokenizer, 'data/기술과학_train_en-ko.csv')
eval_dataset = PairedDataset(src_tokenizer, trg_tokenizer, 'data/기술과학_valid_en-ko.csv')

In [10]:
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    'bert-base-cased',
    'skt/kogpt2-base-v2',
    pad_token_id=trg_tokenizer.bos_token_id
)
model.config.decoder_start_token_id = trg_tokenizer.bos_token_id
model.config.early_stopping = True
#model.config.max_length = 512

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at skt/kogpt2-base-v2 and are newly initialized: ['transformer.h.11.crossatten

In [11]:
collator = DataCollatorForSeq2Seq(src_tokenizer, model)


In [12]:
arguments = Seq2SeqTrainingArguments(
    output_dir='dump',
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=30,
    # per_device_train_batch_size=64,
    # per_device_eval_batch_size=64,
    auto_find_batch_size = True,
    warmup_ratio=0.1,
    gradient_accumulation_steps=8,
    save_total_limit=5,
    dataloader_num_workers=1,
    fp16=False, # True only CUDA
    load_best_model_at_end=True,
)

trainer = Trainer(
    model,
    arguments,
    data_collator=collator,
    train_dataset=dataset,
    eval_dataset=eval_dataset
)

## Wandb

In [13]:
if args.do_wandb :
  wandb.login()
  run = wandb.init(project = args.w_project, entity = args.w_entity)
  wandb.run.name = args.NAME
  wandb.config.learning_rate = args.learning_rate
  wandb.config.epochs = args.epochs
  wandb.config.batch_size = args.batch_size

## cuda setting and train

In [14]:
# cuda memory error 피하기
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import gc
gc.collect()
torch.cuda.empty_cache()
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [15]:
trainer.train()
model.save_pretrained(f"model/{args.NAME}_best_model")
run.finish()

ImportError: 
find_executable_batch_size requires the accelerate library but it was not found in your environment. You can install it with pip:
`pip install accelerate`


## test

In [None]:
model = EncoderDecoderModel.from_pretrained(f"model/{args.NAME}_best_model")
model.eval()
model.cuda()
model.config.decoder_start_token_id = trg_tokenizer.bos_token_id
model.config.early_stopping = True
model.config.max_length = 50

In [17]:
testset = pd.read_csv('data/test_investing.csv')

testset_mt = []
for i in range(len(testset)) :
    text = testset['내용'][i]
    embeddings = src_tokenizer(text, return_attention_mask=False, return_token_type_ids=False, return_tensors='pt')
    embeddings = {k: v.cuda() for k, v in embeddings.items()}
    output = model.generate(**embeddings)[0, 1:-1]
    text_mt = trg_tokenizer.decode(output.cpu())
    testset_mt.append(text_mt)

NameError: name 'pd' is not defined

## demo

In [None]:
src_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
trg_tokenizer = PreTrainedTokenizerFast.from_pretrained('skt/kogpt2-base-v2')

model = EncoderDecoderModel.from_pretrained(f"model/{args.NAME}_best_model")
model.eval()
model.cuda()
model.config.decoder_start_token_id = trg_tokenizer.bos_token_id
model.config.early_stopping = True
model.config.max_length = 512

In [None]:
from IPython.display import display
import ipywidgets as widgets

eng = widgets.Textarea(
    placeholder='번역할 영어',
    description="입력",
    disabled=False
)

button = widgets.Button(
    description='번역!',
    disabled=False,
    tooltip='해당 기사를 번역합니다.'
)

kor = widgets.Textarea(
    description="출력",
    disabled=True
)

def translate(_):
    eng.value = ""
    text = kor.get_interact_value()
    embeddings = src_tokenizer(text, return_attention_mask=False, return_token_type_ids=False, return_tensors='pt')
    embeddings = {k: v.cuda() for k, v in embeddings.items()}
    output = model.generate(**embeddings)[0, 1:-1]
    eng.value = trg_tokenizer.decode(output.cpu())

button.on_click(translate)
display(eng, button, kor)