In [17]:
from Transformer.Config import Config
import torch
import torch.nn as nn
from torch import optim
import pandas as pd
import matplotlib.pyplot as plt

from Transformer.Trainer.Tokenizer import TokenizerPlus
from Transformer.Trainer.Trainer import Trainer
from Transformer.Trainer.decoding import greedy_decoding
from Transformer.Config import Config
from Transformer.Model import Transformer

from datasets import load_dataset

from konlpy.tag import Okt




## Text 전처리

In [18]:
from datasets import load_dataset, load_metric

raw_datasets = load_dataset("kde4", lang1="en", lang2="fr")


Using the latest cached version of the module from /Users/ichan-u/.cache/huggingface/modules/datasets_modules/datasets/kde4/243129fb2398d5b0b4f7f6831ab27ad84774b7ce374cf10f60f6e1ff331648ac (last modified on Wed Aug 16 20:12:54 2023) since it couldn't be found locally at kde4., or remotely on the Hugging Face Hub.


In [19]:
split_datasets = raw_datasets["train"].train_test_split(train_size=0.9, seed=20)
split_datasets["validation"] = split_datasets.pop("test")



In [20]:
split_datasets["train"][3]["translation"]


{'en': 'New Action', 'fr': 'Nouvelle action'}

In [21]:
from transformers import AutoTokenizer

model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")


In [22]:
en_sentence = split_datasets["train"][1]["translation"]["en"]
fr_sentence = split_datasets["train"][1]["translation"]["fr"]

inputs = tokenizer(en_sentence)
with tokenizer.as_target_tokenizer():
    targets = tokenizer(fr_sentence)


In [23]:
wrong_targets = tokenizer(fr_sentence)
print(tokenizer.convert_ids_to_tokens(wrong_targets["input_ids"]))
print(tokenizer.convert_ids_to_tokens(targets["input_ids"]))

['▁Par', '▁dé', 'f', 'aut', ',', '▁dé', 've', 'lop', 'per', '▁les', '▁fil', 's', '▁de', '▁discussion', '</s>']
['▁Par', '▁défaut', ',', '▁développer', '▁les', '▁fils', '▁de', '▁discussion', '</s>']


In [24]:
max_input_length = 128
max_target_length = 128


def preprocess_function(examples):
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["fr"] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # 타겟을 위한 토크나이저 셋업
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [25]:
tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
)


In [26]:
train_dataset = tokenized_datasets["train"]
validation_dataset = tokenized_datasets["validation"]

X_train = train_dataset["input_ids"]
y_train = train_dataset["labels"]

X_val = validation_dataset["input_ids"]
y_val = validation_dataset["labels"]


## 1

In [27]:
from torch.nn.utils.rnn import pad_sequence
X_train = [torch.tensor(seq) for seq in X_train]
y_train = [torch.tensor(seq) for seq in y_train]
X_train= pad_sequence([seq.flip(0) for seq in X_train], batch_first=True, padding_value=tokenizer.pad_token_id).flip(1)
y_train= pad_sequence([seq.flip(0) for seq in y_train], batch_first=True, padding_value=tokenizer.pad_token_id).flip(1)
print(X_train.shape,y_train.shape)

torch.Size([189155, 128]) torch.Size([189155, 128])


In [28]:

X_val = [torch.tensor(seq) for seq in X_val]
y_val = [torch.tensor(seq) for seq in y_val]
X_val= pad_sequence([seq.flip(0) for seq in X_val], batch_first=True, padding_value=tokenizer.pad_token_id).flip(1)
y_val= pad_sequence([seq.flip(0) for seq in y_val], batch_first=True, padding_value=tokenizer.pad_token_id).flip(1)
print(X_val.shape,y_val.shape)


torch.Size([21018, 128]) torch.Size([21018, 128])


In [29]:
len(tokenizer.get_vocab())

59514

In [30]:
n=128
config=Config(len(tokenizer.get_vocab())+1)
config.n_enc_seq=n
config.n_dec_seq=n
config.d_hidn=n
config.d_ff=n*2
config.d_head=n
config.n_layer=4
print(config)

{'n_enc_vocab': 59515, 'n_dec_vocab': 59515, 'n_enc_seq': 128, 'n_dec_seq': 128, 'n_layer': 4, 'd_hidn': 128, 'i_pad': 0, 'd_ff': 256, 'n_head': 4, 'd_head': 128, 'dropout': 0.1, 'layer_norm_epsilon': 1e-12}


In [31]:
print(type(X_train),X_train.shape)
print(type(y_train),y_train.shape)
print(type(X_val),X_val.shape)
print(type(y_val),y_val.shape)
print(tokenizer.convert_tokens_to_string([i for i in tokenizer.convert_ids_to_tokens(X_train[0]) if i != tokenizer.eos_token]))
print(tokenizer.convert_tokens_to_string([i for i in tokenizer.convert_ids_to_tokens(y_train[0]) if i != tokenizer.eos_token]))

<class 'torch.Tensor'> torch.Size([189155, 128])
<class 'torch.Tensor'> torch.Size([189155, 128])
<class 'torch.Tensor'> torch.Size([21018, 128])
<class 'torch.Tensor'> torch.Size([21018, 128])
<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> ▁Calibration is▁about to check the▁value▁range your▁device▁delivers.▁Please▁move▁axis %1 %2 on your▁device to the maximum position. Press▁any▁button on the▁device or▁click on the'Next 'button to continue with the▁next▁step.
<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> 

In [32]:
tokenizer.convert_ids_to_tokens
model = Transformer(config)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

trainer = Trainer(model, loss_fn, optimizer,dec_fnc=greedy_decoding,tokenizer=tokenizer)
trainer.train(src=X_train, tgt=y_train, 
              val_src=X_val[:200], val_tgt= y_val[:200], max_epoch=4)


(Epoch   1/4)   Batch:1/2956   Cost:11.651671

KeyboardInterrupt: 

In [None]:

torch.save(model.state_dict(), 'complete_model.pth')
