In [None]:
# Mount Google Drive at /content/drive so that later cells can work from files stored there.
from google.colab import drive
# force_remount=True requests a fresh mount even when the mount point is already in use.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## install and load

In [None]:
!pip install transformers
!pip install sentencepiece
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 11.7 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 52.5 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 45.8 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 10.7 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    U

In [None]:
import csv
import os
import random
from typing import Dict, List, Optional

import ipywidgets as widgets
import numpy as np
import pandas as pd
import torch
from easydict import EasyDict as edict
from IPython.display import display
from tqdm.notebook import tqdm
from transformers import (
    EncoderDecoderModel,
    PreTrainedTokenizer,
    PreTrainedTokenizerFast,
    BertTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Trainer,
)
from transformers.models.encoder_decoder.modeling_encoder_decoder import EncoderDecoderModel


## Settings

In [None]:
# Switch to the project folder on the mounted Drive; the relative `data/...` and `model/...`
# paths used in later cells are resolved against this directory.
%cd /content/drive/MyDrive/GoormProject/GoormProject3

/content/drive/.shortcut-targets-by-id/1ovgSHdL_LDsDV-KWBQ2NNEs2v8Mpi0fm/GoormProject/GoormProject3


In [None]:
# Run configuration, wrapped in EasyDict so entries can be read as attributes (args.seed, ...).
args = edict(
    do_wandb=False,
    w_project='NMT_enko',
    w_entity='goorm-project-nlp-team-1',  # WandB ID
    learning_rate=2e-4,
    batch_size=8,
    accumulate=8,
    epochs=3,
    seed=42,
    src_pt='bert-base-cased',
    trg_pt='skt/kogpt2-base-v2',
    max_length=50,
    earlystopping=True,
    warmup_proportion=0.1,
    patience=0.5,
)

# Run name: the project name followed by a random integer drawn from [0, 1024).
# It is drawn before seed_everything() runs, so it differs from run to run.
args.NAME = f'{args.w_project}_{random.randrange(0, 1024)}'
print(args.NAME)

NMT_enko_228


In [None]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy, and PyTorch (CPU and all CUDA devices), and
    configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Exported for any subprocesses; the hash seed of the already-running
    # interpreter is fixed at start-up and is not changed by this assignment.
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.cuda.manual_seed_all(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    # The cuDNN autotuner may select different kernels from run to run, which
    # defeats `deterministic = True`; it has to be off for reproducible results.
    torch.backends.cudnn.benchmark = False  # type: ignore

# Apply the configured seed before the datasets, model and trainer are created in later cells.
seed_everything(args.seed)

## Dataset

In [None]:
class PairedDataset:
    """Indexable dataset of (source, target) sentence pairs read from a CSV file.

    The first CSV row is treated as a header and skipped. In every remaining
    row the first column is dropped; the rest must be exactly two cells,
    the source text followed by the target text.

    Args:
        src_tokenizer: Callable tokenizer applied to the source text.
        tgt_tokenizer: Callable tokenizer applied to the target text.
        file_path: Path of the CSV file to load.
        max_length: Optional cap on the number of tokens per side. When None
            (the default) no truncation arguments are passed to the tokenizers.
    """

    def __init__(self,
        src_tokenizer: "PreTrainedTokenizer", tgt_tokenizer: "PreTrainedTokenizer",
        file_path: str,
        max_length: Optional[int] = None,
    ):
        self.src_tokenizer = src_tokenizer
        self.trg_tokenizer = tgt_tokenizer
        self.max_length = max_length
        # Explicit UTF-8 so non-ASCII text is read identically regardless of the
        # platform's default encoding; newline='' is what the csv module expects.
        with open(file_path, 'r', encoding='utf-8', newline='') as fd:
            self.data = [row[1:] for row in csv.reader(fd)][1:]

    def __getitem__(self, index: int) -> Dict[str, List[int]]:
        """Tokenize the pair at `index`.

        Returns:
            The source tokenizer's output with the target token ids stored
            under the extra key 'labels'.
        """
        src, trg = self.data[index]
        truncation_kwargs = {}
        if self.max_length is not None:
            truncation_kwargs = {'truncation': True, 'max_length': self.max_length}
        embeddings = self.src_tokenizer(
            src, return_attention_mask=False, return_token_type_ids=False, **truncation_kwargs
        )
        embeddings['labels'] = self.trg_tokenizer(
            trg, return_attention_mask=False, **truncation_kwargs
        )['input_ids']

        return embeddings

    def __len__(self):
        """Number of sentence pairs loaded from the file."""
        return len(self.data)


## Tokenizer, dataset, model

In [None]:
class KoGPT2Tokenizer(PreTrainedTokenizerFast):
    """Fast tokenizer variant whose special-token hook appends the EOS id to a sequence."""

    def build_inputs_with_special_tokens(self, token_ids: List[int], _=None) -> List[int]:
        """Return `token_ids` followed by the EOS token id.

        The second argument (a paired sequence in the base-class signature) is
        ignored; it defaults to None so the method can also be called with a
        single sequence.

        NOTE(review): fast tokenizers usually add special tokens through their
        backend post-processor — confirm that `__call__` actually goes through
        this override, otherwise the labels may lack the trailing EOS.
        """
        return token_ids + [self.eos_token_id]

In [None]:
# Load both tokenizers from the checkpoints named in the run configuration so that
# changing `args.src_pt` / `args.trg_pt` is enough to switch models.
src_tokenizer = BertTokenizer.from_pretrained(args.src_pt)
# The target tokenizer is given its special tokens explicitly; BOS and EOS share '</s>'.
trg_tokenizer = KoGPT2Tokenizer.from_pretrained(
    args.trg_pt,
    bos_token='</s>', eos_token='</s>', unk_token='<unk>',
    pad_token='<pad>', mask_token='<mask>',
)

loading file https://huggingface.co/bert-base-cased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/6508e60ab3c1200bffa26c95f4b58ac6b6d95fba4db1f195f632fa3cd7bc64cc.437aa611e89f6fc6675a049d2b5545390adbc617e7d655286421c191d2be2791
loading file https://huggingface.co/bert-base-cased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-cased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-cased/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/ec84e86ee39bfe112543192cf981deebf7e6cbe8c91b8f7f8f63c9be44366158.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f
loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217

In [None]:
# Training and validation splits; both paths are relative to the current working directory.
dataset = PairedDataset(
    src_tokenizer,
    trg_tokenizer,
    file_path='data/기술과학_train_en-ko.csv',
)
eval_dataset = PairedDataset(
    src_tokenizer,
    trg_tokenizer,
    file_path='data/기술과학_valid_en-ko.csv',
)

In [None]:
# Build the encoder-decoder from the two pretrained checkpoints named in the run
# configuration (previously repeated here as string literals).
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    args.src_pt,
    args.trg_pt,
    # The model's pad id is set to the target tokenizer's BOS id ('</s>', which is also its EOS token).
    pad_token_id=trg_tokenizer.bos_token_id
)
# Decoding starts from the target tokenizer's BOS token.
model.config.decoder_start_token_id = trg_tokenizer.bos_token_id
model.config.early_stopping = True

loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading weights file https://huggingface.co/bert-base-cased/resolve/

In [None]:
# Batch collator: receives the source tokenizer and the model, and is handed to the Trainer below.
collator = DataCollatorForSeq2Seq(tokenizer=src_tokenizer, model=model)


In [None]:
# Training configuration. The hyper-parameters declared in `args` are passed through
# explicitly; previously learning_rate, warmup_proportion and seed were declared there
# but never reached the trainer, so the library defaults were used silently.
arguments = Seq2SeqTrainingArguments(
    output_dir='/content/drive/MyDrive/KDT_goorm/prj3_NMT/test-1/baseline_rev/dump2',
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=args.epochs,
    learning_rate=args.learning_rate,
    per_device_train_batch_size=args.batch_size,
    gradient_accumulation_steps=args.accumulate,
    warmup_ratio=args.warmup_proportion,
    seed=args.seed,
    save_total_limit=5,
    dataloader_num_workers=0,
    # Mixed precision needs a CUDA device; fall back to full precision on CPU-only runtimes.
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,
    # Generation-based evaluation is left disabled:
    # predict_with_generate = True,
    # generation_max_length = args.max_length,
)

trainer = Trainer(
    model=model,
    args=arguments,
    data_collator=collator,
    train_dataset=dataset,
    eval_dataset=eval_dataset,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using cuda_amp half precision backend


## CUDA settings and training

In [None]:
# Avoid CUDA memory errors: set the CUDA/tokenizer environment flags, then release
# unreferenced Python objects and PyTorch's cached GPU memory before training starts.
import gc
import os

os.environ.update({
    'CUDA_LAUNCH_BLOCKING': "1",
    "CUDA_VISIBLE_DEVICES": "0",
    "TOKENIZERS_PARALLELISM": "false",
})

gc.collect()
torch.cuda.empty_cache()


In [None]:
# Fine-tune the model (the trainer is configured with load_best_model_at_end=True).
trainer.train()
# Save under `model/`, which is where the test and demo cells load the checkpoint from;
# the previous target omitted that folder, so those later loads could not find the weights.
model.save_pretrained(f"model/{args.NAME}_best_model")

***** Running training *****
  Num examples = 240439
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 8
  Total optimization steps = 11268


Epoch,Training Loss,Validation Loss


In [None]:
# run.finish()

## test

In [None]:
# Reload the saved checkpoint for inference on the GPU.
model = EncoderDecoderModel.from_pretrained(f"model/{args.NAME}_best_model")
model.eval()
model.cuda()
# Generation settings: start decoding from the target BOS token and cap the output
# length with the configured `args.max_length` instead of a repeated literal.
model.config.decoder_start_token_id = trg_tokenizer.bos_token_id
model.config.early_stopping = True
model.config.max_length = args.max_length

In [None]:
# Translate every entry of the '내용' column of the test CSV and collect the results
# in `testset_mt`, in the same order as the rows.
testset = pd.read_csv('data/test_investing.csv')

testset_mt = []
for text in tqdm(testset['내용']):
    # truncation=True keeps over-long inputs within the source tokenizer's maximum length
    # instead of failing inside the encoder.
    embeddings = src_tokenizer(
        text,
        return_attention_mask=False,
        return_token_type_ids=False,
        truncation=True,
        return_tensors='pt',
    )
    embeddings = {k: v.cuda() for k, v in embeddings.items()}
    # Drop the first and last generated ids before decoding.
    # NOTE(review): this presumably strips the start and EOS tokens; if generation stops at
    # max_length without an EOS, the final real token is dropped too — confirm.
    output = model.generate(**embeddings)[0, 1:-1]
    text_mt = trg_tokenizer.decode(output.cpu())
    testset_mt.append(text_mt)

NameError: name 'pd' is not defined

## demo

In [None]:
# Reload tokenizers and model for the interactive demo.
src_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
# Load the target tokenizer exactly as it was loaded for training (same class, same
# special tokens) so that `bos_token_id` below refers to the token the model was trained with.
trg_tokenizer = KoGPT2Tokenizer.from_pretrained('skt/kogpt2-base-v2',
  bos_token='</s>', eos_token='</s>', unk_token='<unk>',
  pad_token='<pad>', mask_token='<mask>')

model = EncoderDecoderModel.from_pretrained(f"model/{args.NAME}_best_model")
model.eval()
model.cuda()
model.config.decoder_start_token_id = trg_tokenizer.bos_token_id
model.config.early_stopping = True
# The demo allows longer outputs than the test cell.
model.config.max_length = 512

In [None]:
from IPython.display import display
import ipywidgets as widgets

# Input box for the English text to translate.
eng = widgets.Textarea(
    placeholder='번역할 영어',
    description="입력",
    disabled=False
)

button = widgets.Button(
    description='번역!',
    disabled=False,
    tooltip='해당 기사를 번역합니다.'
)

# Read-only box that shows the translation.
kor = widgets.Textarea(
    description="출력",
    disabled=True
)

def translate(_):
    """Button callback: translate the text typed into `eng` and show the result in `kor`.

    The click event passed by ipywidgets is not used. The user's input is left
    untouched; only the output box is cleared and rewritten.
    """
    text = eng.value
    kor.value = ""
    if not text.strip():
        # Nothing to translate.
        return
    embeddings = src_tokenizer(
        text,
        return_attention_mask=False,
        return_token_type_ids=False,
        truncation=True,  # keep long inputs within the source tokenizer's maximum length
        return_tensors='pt',
    )
    embeddings = {k: v.cuda() for k, v in embeddings.items()}
    # Drop the first and last generated ids before decoding (same convention as the test cell).
    output = model.generate(**embeddings)[0, 1:-1]
    kor.value = trg_tokenizer.decode(output.cpu())

button.on_click(translate)
display(eng, button, kor)

In [None]:
# Stage and commit everything from inside the NLP_Project_3 folder, then create a new branch.
%cd NLP_Project_3
!git config --global user.email "hs.hyein@gmail.com" # the e-mail address used when signing up for GitHub
!git config --global user.name "hyeshin3" # the name shown in bold after clicking the profile icon at the top right
!git add .
!git commit -m "model config"
# NOTE(review): the branch below is created from `develop` after the commit above was made on
# the currently checked-out branch — confirm that this ordering is intended.
!git checkout -b model/en-ko_test develop