##Drive mount

In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Install and Load

In [135]:
!pip install transformers
!pip install sentencepiece
!pip install torch
!pip install wandb
!pip install kss
!pip install nltk
!pip install bert_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [137]:
from typing import Dict, List
import csv
import os
import random
import wandb
from tqdm.notebook import tqdm
from easydict import EasyDict as edict
import numpy as np
import pandas as pd
from IPython.display import display
import ipywidgets as widgets
from pprint import pprint
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize


from transformers import (
    EncoderDecoderModel,
    PreTrainedTokenizerFast,
    DistilBertTokenizerFast,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Trainer,
)

import torch

from transformers.models.encoder_decoder.modeling_encoder_decoder import EncoderDecoderModel


## setting

In [6]:
# %cd /content/drive/MyDrive/GoormProject/GoormProject3

In [7]:
args = edict({'do_wandb' : False,
              'w_project': 'NMT_enko',
              'w_entity': 'goorm-project-nlp-team-1', # WandB ID
              'batch_size': 8,
              'accumulate': 8,
              'epochs': 5,
              'seed': 42,
              })
args['NAME'] = ''f'{args.w_project}_{random.randrange(0, 1024)}'
print(args.NAME)

NMT_enko_213


In [8]:
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    torch.backends.cudnn.benchmark = True  # type: ignore

seed_everything(args.seed)

## Dataset, Tokenizer, Model

In [9]:
class PairedDataset:
    def __init__(self, data) :
        self.data = data
    
    @classmethod
    def loads(cls, file_path) :
        with open(file_path, 'r') as fd:
            data = [row[1:] for row in csv.reader(fd)][1:]

        return cls(data)

    @classmethod
    def split(cls, datasets, ratio=0.2) :
        valid_length = int(len(datasets) * ratio)
        valid = datasets[:valid_length]
        train = datasets[valid_length:]

        return cls(train), cls(valid)


    def __getitem__(self, index: int) -> List[str]:
        return self.data[index]
        
    def __len__(self):
        return len(self.data)


In [128]:
src_tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
trg_tokenizer = PreTrainedTokenizerFast.from_pretrained('skt/kogpt2-base-v2')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [129]:
# special tokens 설정
special_tokens_dict = {'bos_token': '<BOS>', 'eos_token': '<EOS>', 'pad_token': '<PAD>'}
num_added_toks = trg_tokenizer.add_special_tokens(special_tokens_dict)

In [138]:
class TokenizeDataset:
    def __init__(self, dataset, src_tokenizer, trg_tokenizer):
        self.dataset = dataset
        self.src_tokenizer = src_tokenizer
        self.trg_tokenizer = trg_tokenizer
    
    def __getitem__(self, index: int) -> Dict[str, torch.Tensor]:
        src, trg = self.dataset[index]
        embeddings = self.src_tokenizer(src, return_attention_mask=False, return_token_type_ids=False,
                                        truncation=True, max_length=512)
        embeddings['labels'] = [trg_tokenizer.bos_token_id] + self.trg_tokenizer(trg,
                                                                                 return_attention_mask=False)['input_ids']+[trg_tokenizer.eos_token_id]

        return embeddings
    
    def __len__(self):
        return len(self.dataset)


# Train

In [133]:
# 경로 설정
%cd <설정 경로>

/content/drive/MyDrive/KDT_goorm/prj3_NMT/test-1/baseline_rev


In [140]:
dataset = PairedDataset.loads('data/기술과학혼합_train_en-ko.csv')
train_dataset_, valid_dataset_ = PairedDataset.split(dataset, ratio=0.2)
test_dataset = PairedDataset.loads('data/기술과학_valid_en-ko.csv')

print(len(train_dataset_))
print(len(valid_dataset_))
print(len(test_dataset))

352352
88087
29705


In [143]:
train_dataset = TokenizeDataset(train_dataset_, src_tokenizer, trg_tokenizer)
valid_dataset = TokenizeDataset(valid_dataset_, src_tokenizer, trg_tokenizer)

In [144]:
trg_tokenizer.decode(train_dataset[0]['labels'])

'<BOS> 1957 년 6월 21일에는 제75차 군사정전위원회에서 유엔군은 북한 측이 작전 물자 반입 관련 협정 준수를 행동으로 보일 때까지 제13항 ᄅ목을 잠정 폐기할 것을 선언했다.<EOS>'

In [146]:
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    'distilbert-base-cased',
    'skt/kogpt2-base-v2',
    pad_token_id=trg_tokenizer.bos_token_id
)
model.decoder.resize_token_embeddings(len(trg_tokenizer))
model.config.decoder_start_token_id = trg_tokenizer.bos_token_id
model.config.eos_token_id = src_tokenizer.sep_token_id
model.config.pad_token_id = src_tokenizer.pad_token_id
model.config.vocab_size = model.config.encoder.vocab_size
model.config.early_stopping = True

Downloading pytorch_model.bin:   0%|          | 0.00/251M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading pytorch_model.bin:   0%|          | 0.00/490M [00:00<?, ?B/s]

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at skt/kogpt2-base-v2 and are newly initialized: ['transformer.h.2.crossattention.q_attn.weight', 'transformer.h.9.crossattention.masked_bias', 'transformer.h.3.ln_cross_attn.weight', 'transformer.h.7.crossattention.c_proj.weight', 'transformer.h.6.crossattention.bias', 'transformer.h.2.crossattention.c_proj.weight', 'transformer.h.7.crossattention.masked_bias', 'transformer.h.5.crossattention.c_proj.weight', 'transformer.h.1.crossattention.c_attn.weight', 'transformer.h.0.crossattention.c_proj.bias', 'transformer.h.1.crossattention.c_proj.bias', 'transformer.h.8.crossattention.q_attn.weight', 'transformer.h.9.crossattention.c_proj.weight', 'transformer.h.4.crossattention.bias', 'transformer.h.5.ln_cross_attn.weight', 'transformer.h.3.crossattention.bias', 'transformer.h.10.crossattention.masked_bias', 'transformer.h.8.crossattention.c_attn.weight', 'transformer.h.1.crossattention.c_proj.weight', 'transforme

In [147]:
collator = DataCollatorForSeq2Seq(src_tokenizer, model)

In [None]:
arguments = Seq2SeqTrainingArguments(
    output_dir='/content/drive/MyDrive/KDT_goorm/prj3_NMT/test-1/baseline_rev/dump3',
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size = 16,
    gradient_accumulation_steps= 4,
    warmup_ratio=0.1,
    save_total_limit=5,
    fp16=True, # True only CUDA
    load_best_model_at_end=True,    
)

trainer = Trainer(
    model,
    arguments,
    data_collator=collator,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

Using cuda_amp half precision backend


## cuda setting and train

In [161]:
# cuda memory error 피하기
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import gc
gc.collect()
torch.cuda.empty_cache()
os.environ["TOKENIZERS_PARALLELISM"] = "false"


In [None]:
trainer.train()
model.save_pretrained(f"/content/drive/MyDrive/KDT_goorm/prj3_NMT/test-1/{args.NAME}_best_model")

***** Running training *****
  Num examples = 352352
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 4
  Total optimization steps = 27525
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Epoch,Training Loss,Validation Loss
0,2.0672,1.890211
1,1.5964,1.535208
2,1.3583,1.431638
3,1.1822,1.386823
4,1.059,1.37145


***** Running Evaluation *****
  Num examples = 88087
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/KDT_goorm/prj3_NMT/test-1/baseline_rev/dump3/checkpoint-5505
Configuration saved in /content/drive/MyDrive/KDT_goorm/prj3_NMT/test-1/baseline_rev/dump3/checkpoint-5505/config.json
Model weights saved in /content/drive/MyDrive/KDT_goorm/prj3_NMT/test-1/baseline_rev/dump3/checkpoint-5505/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 88087
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/KDT_goorm/prj3_NMT/test-1/baseline_rev/dump3/checkpoint-11010
Configuration saved in /content/drive/MyDrive/KDT_goorm/prj3_NMT/test-1/baseline_rev/dump3/checkpoint-11010/config.json
Model weights saved in /content/drive/MyDrive/KDT_goorm/prj3_NMT/test-1/baseline_rev/dump3/checkpoint-11010/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 88087
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/KDT_goorm/prj3_NMT/test-

# Test

In [148]:
# test dataset 
test_dataset = pd.read_csv('data/기술과학_valid_en-ko.csv')
test_dataset = test_dataset.sample(2000) # 테스트용 sampling
test_dataset.reset_index(drop=True, inplace=True)
test_dataset

Unnamed: 0.1,Unnamed: 0,en,ko
0,26628,The analysis period of this study was from 200...,"본 연구의 분석 대상 기간은 2000~2015년으로, 주요 분석 대상은 BBB 3대..."
1,10686,Technological innovation combines the meaning ...,"기술 혁신은 이런 혁신의 의미를 기술과 결합하여 기존의 기술보다 개선되거나, 대체 ..."
2,9782,Although a causal relationship is not establis...,소의 산지도매 가격 간에는 인과관계가 성립하지 않지만 장기 균형 관계를 포함하는 오...
3,14413,Support from Japan was important to Korea base...,한국에는 일본과의 관계를 바탕으로 일본으로부터 얻을 수 있는 지원이 중요했다.
4,3737,This study analyzed the effectiveness of feedb...,본 연구는 AAA가 부정적 피드백을 건설적 비판과 파괴적 비판으로 구분한 이론적 개...
...,...,...,...
1995,21163,"In line with the design, regional dummy, biddi...","모형에 따라 지역 더미, 입찰 제도와 공종별 구분 등을 설명 변수로 추가하여 담합의..."
1996,25577,This is because the econometric analysis is ev...,이는 계량 경제학적 분석이 다른 방법에 비해 더 정확하고 객관적이라는 평가를 받고 ...
1997,16115,This paper theoretically examines the behavior...,본 논문은 비용 측면에 불확실성이 존재하는 과점 시장에서 기업들의 합병에 대한 행태...
1998,10383,The best local income tax faculties could be o...,이상적인 지방세 특성에 대해 중앙 정부와 지방 정부가 서로 달리 인식할 수 있다.


In [169]:
# best model load
model = EncoderDecoderModel.from_pretrained('/content/drive/MyDrive/KDT_goorm/prj3_NMT/test-1/NMT_enko_353_best_model')
model.eval()
model.cuda()
model.config.decoder_start_token_id = trg_tokenizer.bos_token_id

In [149]:
test_dataset['en'][0]

'The analysis period of this study was from 2000 to 2015, as well as the main analysis target ended up being the coordination procedure 3 (M3) created within the environment and resources centered on the AAA president, the three major BBB managers.'

In [61]:
results = pd.DataFrame()
texts_en = []
texts_komt = []
texts_ko = []

for i in range(len(test_dataset)):
    text = test_dataset['en'][i]

    texts = sent_tokenize(text)
    for text in texts :
        embeddings = src_tokenizer(text, return_attention_mask=False, return_token_type_ids=False, return_tensors='pt')
        embeddings = {k: v.cuda() for k, v in embeddings.items()}
        output = model.generate(**embeddings, 
                            max_length=50,
                            num_return_sequences=1,
                            no_repeat_ngram_size=2,
                            num_beams=5,
                            length_penalty = 3
                            )[0,1:-1]
        ko_text = trg_tokenizer.decode(output.cpu(), skip_special_tokens=True)
        tokenized = kss.split_sentences(ko_text)[0]
   
        texts_en.append(text)
        texts_komt.append(tokenized) 
        texts_ko.append(test_dataset['ko'][i])
        
    
results['en'] = texts_en
results['ko_mt'] = texts_komt
results['ko'] = texts_ko


In [201]:
results

Unnamed: 0,en,ko_mt,ko
0,Although the types or standards of actions are...,"그 행위의 유형이나 기준이 시행령을 통해 구체적으로 정해지는 것은 아니지만, ""중요...","비록 시행령을 통하여 행위의 유형 또는 기준을 구체적으로 정하고 있지만, ""상당성""..."
1,"If this happens, companies will reduce domesti...",이렇게 되면 기업은 국내 생산을 줄이고 해외 투자를 더 늘리게 된다.,"이렇게 되면, 기업들이 국내 생산을 줄이고 해외 투자를 더욱 확대할 것이다."
2,Hypothesis 5-3 that strategic distance include...,전략적 거리가 기업의 질적 성과에 부( 의 영향을 미친다는 가설 5-3은 또한 지지...,전략적 거리가 기업의 질적 성과에 (-) 영향을 미친다는 가설 5-3도 지지되었다.
3,"Introverts are less friendly, more independent...","인플루언서는 친화성이 낮고, 그룹 중심보다 독립적이며, 느린 것보다 안정적이다.","내향적인 사람들은 덜 친화적인 쪽이며, 집단 지향적이기보다는 독립적이고, 느릿느릿하..."
4,"In particular, interest rates and average mont...",특히 대출 그 자체에 대한 위험 요인인 금리와 월평균 상환액은 차주의 상환 부담과 ...,특히 대출 자체의 리스크 요인인 금리와 월평균 상환액은 차주의 상환부담과 정의 관계...
...,...,...,...
2024,The larger the number of farms for a specific ...,해당 지역의 특정 품목에 대한 농가들의 수가 많을수록 특화도가 높고 규모의 경제도 ...,지역 내 특정 품목의 농가 수가 많을수록 해당 품목에 특화되어 있고 따라서 규모의 ...
2025,It must be considered that non-reported credit...,신고하지 않은 채권자는 회생 계획안을 선택할 수 있도록 단순히 관계자 회사로만 허용...,미신고 채권자의 추완 신고는 회생 계획안 결의를 위한 관계인 집회까지만 허용된다고 ...
2026,"In all four periods, the frequency belonging t...",4개 기간 모두에서 1사분면에 속하는 빈도가 일본이 한국보다 높은 것으로 나타났다.,4개의 기간 전체에 걸쳐 1분면에 속한 빈도는 일본이 한국에 비해 많은 편이다.
2027,The Contingent Valuation Method was first prop...,"조건부 가치 평가법은 토양 오염 방지를 연구한 AAA1에 의해 처음 제안되었고, A...",조건부 가치 평가법(Contingent Valuation Method)은 최초 토양...


In [202]:
results.ko_mt[0]

'그 행위의 유형이나 기준이 시행령을 통해 구체적으로 정해지는 것은 아니지만, "중요성" 또는 "사업 기회"의 개념에 대한 정의는 구체성을 충족시키지 못하고 다른 법령과의 정합성 문제도 존재한다.'

### **Bert Score**


In [68]:
# bert score 넣기
from bert_score import score

candidates = results['ko_mt'].values.tolist() #번역 문장 중 첫번째
references = results['ko'].values.tolist() #원본 한국어 문장

Precision, Recall, F1 = score(candidates, references, lang="ko", verbose=False)
P, R, F = [x.tolist() for x in [Precision, Recall, F1]]

Downloading tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [69]:
print(f'Precision : {np.mean(P)}')
print(f'Recall :{np.mean(R)}')
print(f'F1 : {np.mean(F)}')

Precision : 0.8812663388875037
Recall :0.8815086745111269
F1 : 0.8811229916262826


# Demo

In [74]:
src_tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
trg_tokenizer = PreTrainedTokenizerFast.from_pretrained('skt/kogpt2-base-v2')

model = EncoderDecoderModel.from_pretrained('/content/drive/MyDrive/KDT_goorm/prj3_NMT/test-1/NMT_enko_353_best_model')
model.eval()
model.cuda()
model.config.decoder_start_token_id = trg_tokenizer.bos_token_id

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [184]:
def translate_title(eng_title:str) :
    text = eng_title
    embeddings = src_tokenizer(text, return_attention_mask=False, return_token_type_ids=False, return_tensors='pt')
    embeddings = {k: v.cuda() for k, v in embeddings.items()}
    output = model.generate(**embeddings, 
                        max_length=30,
                        num_return_sequences=1,
                        no_repeat_ngram_size=2,
                        num_beams=3,
                        length_penalty=10
                        )[0,1:-1]
    ko_title = trg_tokenizer.decode(output.cpu(), skip_special_tokens=True)
    ko_title = kss.split_sentences(ko_title)[0]

    return ko_title

In [197]:
def translate_text(eng_text:str) :
    ko_texts = ''
    texts = sent_tokenize(eng_text)
    for text in texts :
        embeddings = src_tokenizer(text, return_attention_mask=False, return_token_type_ids=False, return_tensors='pt')
        embeddings = {k: v.cuda() for k, v in embeddings.items()}
        output = model.generate(**embeddings, 
                            max_length=50,
                            num_return_sequences=1,
                            no_repeat_ngram_size=3,
                            num_beams=7,
                            length_penalty=5
                            )[0,1:-1]
        ko_text = trg_tokenizer.decode(output.cpu(), skip_special_tokens=True)
        ko_text = kss.split_sentences(ko_text)[0]
        ko_texts += f' {ko_text}'
    return ko_texts

In [198]:
eng_text = '''Ukraine has gained the necessary approval from holders of its GDP warrants to its request for proposed changes to the securities, preliminary results of a vote released in a filing on Tuesday showed. The Ukrainian government launched a proposal in July to change conditions on its $2.6 billion of outstanding GDP warrants, a derivative security that triggers payments linked to economic growth. "Approximately 93% of Holders of the Notional Amount of Securities outstanding were represented for quorum purposes and approximately 91% of such Holders had voted in favour of the Extraordinary Resolution," Tuesday's statement said. Kyiv is also asking creditors to defer payments on the war-torn country's international bonds for 24 months as Ukraine seeks to avoid a potential $20 billion debt default. Ukraine said that final results will be announced after a meeting on Wednesday.'''
eng_title = '''Problems remain with Kaliningrad transit despite EU deal - Russia'''

In [199]:
translate_title(eng_title)

'EU의 거래에도 불구하고 칼리닌그라드의 통과 문제는 여전히 남아있다.'

In [200]:
translate_text(eng_text)

' 우크라이나는 유가 증권의 제안 변경 요구에 대해 자국의 GDP 보증금 보유국으로부터 필요한 승인을 얻었으며 지난주 제소에 부쳐진 투표 결과는 예상을 깨고 나왔다.. 우크라이나 정부는 지난 7월 경제 성장과 연계된 대금 결제를 촉발하는 파생금융상품인 160억 달러 상당의 채무 불이행 관련 조건 변경안을 내놓았다. 주간의 성명은 "증권거래세 예탁금 보유자의 약 93%가 정족수로 대표되고, 이와 같은 보유자 중 약 91%가 보충성 결의안에 찬성했다"고 밝혔다. 키프로스는 또 우크라이나가 20억 달러 규모의 채무불이행 사태를 피하기 위해 24개월 동안 전쟁 중인 자국의 국제채권에 대한 지급을 연기해줄 것을 채권단에 요구하고 있다. 우크라이나는 이날 오전 회의를 거쳐 최종 결과를 발표할 예정이라고 밝혔다.'