In [13]:
#%%
import csv
from tqdm import tqdm

from typing import List

from easydict import EasyDict as edict

from nltk.translate.bleu_score import sentence_bleu

from lib.tokenization_kobert import KoBertTokenizer

from transformers import (
    M2M100ForConditionalGeneration,
    M2M100Tokenizer,
    EncoderDecoderModel,
    PreTrainedTokenizerFast as BaseKoGPT2Tokenizer,
    GPT2Tokenizer as BaseGPT2Tokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Trainer,
    DistilBertTokenizer,
    
)

from googletrans import Translator

In [14]:
# Evaluation CSV. The first column of every row is dropped, so each entry of
# `datas` starts with the reference text followed by the source text.
file_name = 'data/kor2en_test.csv'

# Try CP949 first and fall back to UTF-8 only when decoding actually fails.
# Catching UnicodeDecodeError (instead of a bare `except:`) lets unrelated
# problems such as a missing file or a KeyboardInterrupt surface immediately.
# newline='' is the mode the csv module asks for when reading from a file.
try:
    with open(file_name, 'r', encoding='cp949', newline='') as fd:
        datas = [row[1:] for row in csv.reader(fd)]
except UnicodeDecodeError:
    with open(file_name, 'r', encoding='utf-8', newline='') as fd:
        datas = [row[1:] for row in csv.reader(fd)]

# Model inputs: second remaining column of each row.
test_datasets = [row[1] for row in datas]
# sentence_bleu expects a list of tokenized references per hypothesis, hence
# the extra list around each whitespace-split reference sentence.
references = [[row[0].split()] for row in datas]

# bert-kogpt2: DistilBERT encoder + KoGPT2 decoder (en → ko)

In [15]:
class PreTrainedTokenizerFast(BaseKoGPT2Tokenizer):
    """KoGPT2 tokenizer whose special-token builder appends EOS to the ids."""

    def build_inputs_with_special_tokens(self, token_ids: List[int], _=None) -> List[int]:
        """Return ``token_ids`` followed by ``self.eos_token_id``.

        The second positional argument is ignored (presumably the paired
        sequence some callers pass -- confirm against the base class). It
        defaults to ``None`` so single-sequence calls no longer raise
        ``TypeError``.
        """
        return token_ids + [self.eos_token_id]


# Encoder-side tokenizer for the source text.
enc_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Decoder-side tokenizer, loaded through the subclass above with the special
# tokens spelled out explicitly.
dec_tokenizer = PreTrainedTokenizerFast.from_pretrained(
    'skt/kogpt2-base-v2',
    bos_token='</s>',
    eos_token='</s>',
    unk_token='<unk>',
    pad_token='<pad>',
    mask_token='<mask>',
)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.69M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.98k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [16]:
# Fine-tuned encoder-decoder checkpoint for this section (plain string: the
# original f-prefix had no placeholders to interpolate).
model = EncoderDecoderModel.from_pretrained('./checkpoints/bert_kogpt2_ep15_lr0.0001_692_fine2')

In [None]:
# Translate every test sentence with the loaded model and keep the output as a
# whitespace-tokenized list, which is the hypothesis format sentence_bleu takes.
# Beam-search options (num_beams / num_return_sequences) are left disabled.
predictions = []
for source_text in tqdm(test_datasets):
    encoded = enc_tokenizer.encode(source_text, return_tensors='pt')
    generated = model.generate(
        encoded,
        max_length=50,
        no_repeat_ngram_size=2,
    )
    decoded = dec_tokenizer.decode(generated[0], skip_special_tokens=True)
    predictions.append(decoded.split())

In [None]:
# Mean sentence-level BLEU of the model predictions against the references,
# with equal weights on the four n-gram orders passed to sentence_bleu.
cnt = len(predictions)
score = 0
for idx, hypothesis in enumerate(predictions):
    sentence_score = sentence_bleu(references[idx], hypothesis, weights=(0.25, 0.25, 0.25, 0.25))
    print(sentence_score)
    score += sentence_score
# With no predictions the mean is undefined: report NaN instead of raising
# ZeroDivisionError.
score = score / cnt if cnt else float('nan')
print('-'*100)
print(f'Bleu Score: {score}')
print('-'*100)

0.0909090909090909
0.002804892496400388
0.029391225946319512
----------------------------------------------------------------------------------------------------
Bleu Score: 0.04103506978393693
----------------------------------------------------------------------------------------------------


Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


In [None]:
translator = Translator()

# Baseline: translate each test sentence from English to Korean with Google
# Translate and keep the whitespace-split tokens for BLEU scoring.
google_predictions = []
for source_text in tqdm(test_datasets):
    translation = translator.translate(source_text, src='en', dest="ko")
    tokens = translation.text.split()
    google_predictions.append(tokens)

hello.


In [None]:
# Mean sentence-level BLEU of the Google Translate baseline against the
# references, with equal weights on the four n-gram orders.
cnt = len(google_predictions)
score = 0
for idx, hypothesis in enumerate(google_predictions):
    sentence_score = sentence_bleu(references[idx], hypothesis, weights=(0.25, 0.25, 0.25, 0.25))
    print(sentence_score)
    score += sentence_score
# With no predictions the mean is undefined: report NaN instead of raising
# ZeroDivisionError.
score = score / cnt if cnt else float('nan')
print('-'*100)
print(f'Google Bleu Score: {score}')
print('-'*100)

0
0
0
----------------------------------------------------------------------------------------------------
Google Bleu Score: 0.0
----------------------------------------------------------------------------------------------------


# kobert-gpt2: KoBERT encoder + DistilGPT2 decoder (ko → en)

In [None]:
class GPT2Tokenizer(BaseGPT2Tokenizer):
    """GPT-2 tokenizer whose special-token builder appends EOS to the ids."""

    def build_inputs_with_special_tokens(self, token_ids: List[int], _=None) -> List[int]:
        """Return ``token_ids`` followed by ``self.eos_token_id``.

        The second positional argument is ignored (presumably the paired
        sequence some callers pass -- confirm against the base class). It
        defaults to ``None`` so single-sequence calls no longer raise
        ``TypeError``.
        """
        return token_ids + [self.eos_token_id]


# Encoder-side tokenizer for the source text.
enc_tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
# Decoder-side tokenizer, loaded through the subclass above.
dec_tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')

In [None]:
# Fine-tuned encoder-decoder checkpoint for this section (plain string: the
# original f-prefix had no placeholders to interpolate).
model = EncoderDecoderModel.from_pretrained('./checkpoints/kobert_gpt2_ep15_lr0.0001_793_fine2')

In [None]:
# Translate every test sentence with the loaded model and keep the output as a
# whitespace-tokenized list, which is the hypothesis format sentence_bleu takes.
# Beam-search options (num_beams / num_return_sequences) are left disabled.
predictions = []
for source_text in tqdm(test_datasets):
    encoded = enc_tokenizer.encode(source_text, return_tensors='pt')
    generated = model.generate(
        encoded,
        max_length=50,
        no_repeat_ngram_size=2,
    )
    decoded = dec_tokenizer.decode(generated[0], skip_special_tokens=True)
    predictions.append(decoded.split())

In [None]:
# Mean sentence-level BLEU of the model predictions against the references,
# with equal weights on the four n-gram orders passed to sentence_bleu.
cnt = len(predictions)
score = 0
for idx, hypothesis in enumerate(predictions):
    sentence_score = sentence_bleu(references[idx], hypothesis, weights=(0.25, 0.25, 0.25, 0.25))
    print(sentence_score)
    score += sentence_score
# With no predictions the mean is undefined: report NaN instead of raising
# ZeroDivisionError.
score = score / cnt if cnt else float('nan')
print('-'*100)
print(f'Bleu Score: {score}')
print('-'*100)

In [None]:
translator = Translator()

# Baseline: translate each test sentence from Korean to English with Google
# Translate and keep the whitespace-split tokens for BLEU scoring.
google_predictions = []
for source_text in tqdm(test_datasets):
    translation = translator.translate(source_text, src='ko', dest="en")
    tokens = translation.text.split()
    google_predictions.append(tokens)

In [None]:
# Mean sentence-level BLEU of the Google Translate baseline against the
# references, with equal weights on the four n-gram orders.
cnt = len(google_predictions)
score = 0
for idx, hypothesis in enumerate(google_predictions):
    sentence_score = sentence_bleu(references[idx], hypothesis, weights=(0.25, 0.25, 0.25, 0.25))
    print(sentence_score)
    score += sentence_score
# With no predictions the mean is undefined: report NaN instead of raising
# ZeroDivisionError.
score = score / cnt if cnt else float('nan')
print('-'*100)
print(f'Google Bleu Score: {score}')
print('-'*100)

# m2m100 en2kor

In [None]:
# Fine-tuned M2M100 checkpoint for English -> Korean.
model = M2M100ForConditionalGeneration.from_pretrained(
    'checkpoints/m2m100_en2kor_ep15_lr0.0001_465_fine2'
)

# Tokenizer comes from the base M2M100 release; the source/target languages
# are supplied at load time instead of being assigned afterwards.
tokenizer = M2M100Tokenizer.from_pretrained(
    'facebook/m2m100_418M',
    src_lang="en",
    tgt_lang="ko",
)

Downloading:   0%|          | 0.00/3.54M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/272 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/908 [00:00<?, ?B/s]

In [None]:
# Language id forced as the first generated token; it is the same for every
# sentence, so look it up once.
target_lang_id = tokenizer.get_lang_id("ko")

# Translate every test sentence and keep the whitespace-split tokens of the
# first decoded sequence. Beam-search options (num_beams /
# num_return_sequences) are left disabled.
predictions = []
for source_text in tqdm(test_datasets):
    encoded = tokenizer.encode(source_text, return_tensors='pt')
    generated = model.generate(
        encoded,
        max_length=512,
        no_repeat_ngram_size=2,
        forced_bos_token_id=target_lang_id,
    )
    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
    predictions.append(decoded[0].split())

In [None]:
# Mean sentence-level BLEU of the model predictions against the references,
# with equal weights on the four n-gram orders passed to sentence_bleu.
cnt = len(predictions)
score = 0
for idx, hypothesis in enumerate(predictions):
    sentence_score = sentence_bleu(references[idx], hypothesis, weights=(0.25, 0.25, 0.25, 0.25))
    print(sentence_score)
    score += sentence_score
# With no predictions the mean is undefined: report NaN instead of raising
# ZeroDivisionError.
score = score / cnt if cnt else float('nan')
print('-'*100)
print(f'Bleu Score: {score}')
print('-'*100)

In [6]:
translator = Translator()

# Baseline: translate each test sentence from English to Korean with Google
# Translate and keep the whitespace-split tokens for BLEU scoring.
google_predictions = []
for input_prompt in tqdm(test_datasets):
    # `src` (previously misspelled `scr`) pins the source language; with the
    # typo the keyword was not the one translate() uses for the source language.
    result = translator.translate(input_prompt, src='en', dest="ko")
    google_predictions.append(result.text.split())

In [7]:
# Mean sentence-level BLEU of the Google Translate baseline against the
# references, with equal weights on the four n-gram orders.
cnt = len(google_predictions)
score = 0
for idx, hypothesis in enumerate(google_predictions):
    sentence_score = sentence_bleu(references[idx], hypothesis, weights=(0.25, 0.25, 0.25, 0.25))
    print(sentence_score)
    score += sentence_score
# With no predictions the mean is undefined: report NaN instead of raising
# ZeroDivisionError.
score = score / cnt if cnt else float('nan')
print('-'*100)
print(f'Google Bleu Score: {score}')
print('-'*100)

0.537284965911771
0.03257862486785316
0.0552879052829212
0.1713410206906127
0.477999608304783
0.022318050731328917
0.16620830006469267
0.18653674746226495
----------------------------------------------------------------------------------------------------
Google Bleu Score: 0.20619440291452848
----------------------------------------------------------------------------------------------------


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


# m2m100 kor2en

In [None]:
# Fine-tuned M2M100 checkpoint for Korean -> English.
model = M2M100ForConditionalGeneration.from_pretrained(
    'checkpoints/m2m100_kor2en_ep15_lr0.0001_478_fine2'
)

# Tokenizer comes from the base M2M100 release; the source/target languages
# are supplied at load time instead of being assigned afterwards.
tokenizer = M2M100Tokenizer.from_pretrained(
    'facebook/m2m100_418M',
    src_lang="ko",
    tgt_lang="en",
)

In [None]:
# Language id forced as the first generated token; it is the same for every
# sentence, so look it up once.
target_lang_id = tokenizer.get_lang_id("en")

# Translate every test sentence and keep the whitespace-split tokens of the
# first decoded sequence. Beam-search options (num_beams /
# num_return_sequences) are left disabled.
predictions = []
for source_text in tqdm(test_datasets):
    encoded = tokenizer.encode(source_text, return_tensors='pt')
    generated = model.generate(
        encoded,
        max_length=512,
        no_repeat_ngram_size=2,
        forced_bos_token_id=target_lang_id,
    )
    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
    predictions.append(decoded[0].split())

In [None]:
# Mean sentence-level BLEU of the model predictions against the references,
# with equal weights on the four n-gram orders passed to sentence_bleu.
cnt = len(predictions)
score = 0
for idx, hypothesis in enumerate(predictions):
    sentence_score = sentence_bleu(references[idx], hypothesis, weights=(0.25, 0.25, 0.25, 0.25))
    print(sentence_score)
    score += sentence_score
# With no predictions the mean is undefined: report NaN instead of raising
# ZeroDivisionError.
score = score / cnt if cnt else float('nan')
print('-'*100)
print(f'Bleu Score: {score}')
print('-'*100)

In [None]:
translator = Translator()

# Baseline: translate each test sentence from Korean to English with Google
# Translate and keep the whitespace-split tokens for BLEU scoring.
google_predictions = []
for source_text in tqdm(test_datasets):
    translation = translator.translate(source_text, src="ko", dest="en")
    tokens = translation.text.split()
    google_predictions.append(tokens)

In [None]:
# Mean sentence-level BLEU of the Google Translate baseline against the
# references, with equal weights on the four n-gram orders.
cnt = len(google_predictions)
score = 0
for idx, hypothesis in enumerate(google_predictions):
    sentence_score = sentence_bleu(references[idx], hypothesis, weights=(0.25, 0.25, 0.25, 0.25))
    print(sentence_score)
    score += sentence_score
# With no predictions the mean is undefined: report NaN instead of raising
# ZeroDivisionError.
score = score / cnt if cnt else float('nan')
print('-'*100)
print(f'Google Bleu Score: {score}')
print('-'*100)

In [8]:
# Spot check: source sentence at index 4 of the test set.
test_datasets[4]

"I knew I hadn't met my match but every moment we could snatch I don't know why I got so attached It's my responsibility, you don't owe nothing to me But to walk away I have no capacity"

In [9]:
# Spot check: tokenized reference (wrapped in a one-element list) for index 4.
print(references[4])

[['당신이', '나에게', '맞지', '않는', '걸', '알고', '있었지만', '당신이', '나를', '붙잡았을', '때', '내가', '당신에게', '끌린', '이유를', '모르겠어요', '내', '책임이에요,', '당신은', '아무것도', '나에게', '빚진', '게', '없어요', '당신에게서', '멀어지려고', '하면', '나는', '힘이', '없어서', '멀어지지', '못해요']]


In [11]:
# Spot check: model output tokens for index 4. `predictions` only exists after
# one of the generation cells has run in the current kernel; the saved output
# of this cell is a NameError.
print(predictions[4])

NameError: ignored

In [12]:
# Spot check: Google Translate output tokens for index 4.
print(google_predictions[4])

['나는', '내가', '경기를', '만났다는', '것을', '알았지', '만', '우리가', '잡아', '당길', '수있는', '순간', '나는', '왜', '내가', '첨부', '된', '이유를', '모른다.', '그것은', '내', '책임이라는', '것을', '알지', '못한다.']
