In [1]:
from Seq2SeqModel.Seq2SeqModel import Seq2SeqModel

## Text 전처리

In [2]:
from datasets import load_dataset, load_metric

raw_datasets = load_dataset("kde4", lang1="en", lang2="fr")


In [3]:
split_datasets = raw_datasets["train"].train_test_split(train_size=0.9, seed=20)
split_datasets["validation"] = split_datasets.pop("test")



In [4]:
split_datasets["train"][3]["translation"]


{'en': 'New Action', 'fr': 'Nouvelle action'}

In [5]:
from transformers import AutoTokenizer

model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")
tokenizer.add_special_tokens({"bos_token": "<bos>"})




1

In [6]:
en_sentence = split_datasets["train"][1]["translation"]["en"]
fr_sentence = split_datasets["train"][1]["translation"]["fr"]

inputs = tokenizer(en_sentence)
with tokenizer.as_target_tokenizer():
    targets = tokenizer(fr_sentence)




In [7]:
wrong_targets = tokenizer(fr_sentence)
print(tokenizer.convert_ids_to_tokens(wrong_targets["input_ids"]))
print(tokenizer.convert_ids_to_tokens(targets["input_ids"]))

['▁Par', '▁dé', 'f', 'aut', ',', '▁dé', 've', 'lop', 'per', '▁les', '▁fil', 's', '▁de', '▁discussion', '</s>']
['▁Par', '▁défaut', ',', '▁développer', '▁les', '▁fils', '▁de', '▁discussion', '</s>']


In [8]:
max_input_length = 128
max_target_length = 128


def preprocess_function(examples):
    inputs = ["<bos> "+ex["en"] for ex in examples["translation"]]
    targets = ["<bos> "+ex["fr"] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # 타겟을 위한 토크나이저 셋업
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [9]:
tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
)


In [10]:
train_dataset = tokenized_datasets["train"]
validation_dataset = tokenized_datasets["validation"]

X_train = train_dataset["input_ids"]
y_train = train_dataset["labels"]

X_val = validation_dataset["input_ids"]
y_val = validation_dataset["labels"]


In [11]:
len(tokenizer.get_vocab())

59515

In [12]:

tokenizer.special_tokens_map

{'bos_token': '<bos>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'pad_token': '<pad>'}

In [13]:
print(tokenizer.convert_tokens_to_string([i for i in tokenizer.convert_ids_to_tokens(X_train[0]) if i != tokenizer.eos_token]))
print(tokenizer.convert_tokens_to_string([i for i in tokenizer.convert_ids_to_tokens(y_train[0]) if i != tokenizer.eos_token]))

<bos> ▁Calibration is▁about to check the▁value▁range your▁device▁delivers.▁Please▁move▁axis %1 %2 on your▁device to the maximum position. Press▁any▁button on the▁device or▁click on the'Next 'button to continue with the▁next▁step.
<bos> Le calibrage va vérifier la plage de valeurs que votre matériel produit. Veuillez déplacer l'axe %1 %2 de votre périphérique à la position maximale. Appuyez sur n'importe quel bouton du périphérique ou sur le bouton « & #160; Suivant & #160; » pour la prochaine étape.


## 1

In [14]:
from Transformer.Config import Config
from Transformer.Model import Transformer
import torch.nn as nn
from torch import optim

config=Config(3)
model=Transformer(config)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())
batchsize=64

In [15]:
module=Seq2SeqModel(model, tokenizer,optimizer,loss_fn,
                    X_train,y_train,X_val,y_val)

In [16]:

module.model.config

{'n_enc_vocab': 59515,
 'n_dec_vocab': 59515,
 'n_enc_seq': 128,
 'n_dec_seq': 128,
 'n_layer': 2,
 'd_hidn': 128,
 'i_pad': 0,
 'd_ff': 256,
 'n_head': 4,
 'd_head': 64,
 'dropout': 0.1,
 'layer_norm_epsilon': 1e-12}

In [17]:
module.train_main(5)

(Epoch   1/5)   Batch:1/2956   Cost:11.084811