In [1]:
from Transformer.Config import Config
import torch
import torch.nn as nn
from torch import optim
import pandas as pd
import matplotlib.pyplot as plt

from Transformer.Trainer.Tokenizer import TokenizerPlus
from Transformer.Trainer.Trainer import Trainer
from Transformer.Trainer.decoding import greedy_decoding
from Transformer.Config import Config
from Transformer.Model import Transformer

from datasets import load_dataset

from konlpy.tag import Okt




## Text 전처리

In [2]:
from datasets import load_dataset, load_metric

raw_datasets = load_dataset("kde4", lang1="en", lang2="fr")


In [3]:
split_datasets = raw_datasets["train"].train_test_split(train_size=0.9, seed=20)
split_datasets["validation"] = split_datasets.pop("test")



In [4]:
split_datasets["train"][3]["translation"]


{'en': 'New Action', 'fr': 'Nouvelle action'}

In [5]:
from transformers import AutoTokenizer

model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")




In [6]:
en_sentence = split_datasets["train"][1]["translation"]["en"]
fr_sentence = split_datasets["train"][1]["translation"]["fr"]

inputs = tokenizer(en_sentence)
with tokenizer.as_target_tokenizer():
    targets = tokenizer(fr_sentence)




In [7]:
wrong_targets = tokenizer(fr_sentence)
print(tokenizer.convert_ids_to_tokens(wrong_targets["input_ids"]))
print(tokenizer.convert_ids_to_tokens(targets["input_ids"]))

['▁Par', '▁dé', 'f', 'aut', ',', '▁dé', 've', 'lop', 'per', '▁les', '▁fil', 's', '▁de', '▁discussion', '</s>']
['▁Par', '▁défaut', ',', '▁développer', '▁les', '▁fils', '▁de', '▁discussion', '</s>']


In [8]:
max_input_length = 128
max_target_length = 128


def preprocess_function(examples):
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["fr"] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # 타겟을 위한 토크나이저 셋업
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [9]:
tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
)


In [10]:
train_dataset = tokenized_datasets["train"]
validation_dataset = tokenized_datasets["validation"]

X_train = train_dataset["input_ids"]
y_train = train_dataset["labels"]

X_val = validation_dataset["input_ids"]
y_val = validation_dataset["labels"]


## 1

In [11]:
from torch.nn.utils.rnn import pad_sequence
X_train = [torch.tensor(seq) for seq in X_train]
y_train = [torch.tensor(seq) for seq in y_train]
X_train= pad_sequence([seq.flip(0) for seq in X_train], batch_first=True, padding_value=0).flip(1)
y_train= pad_sequence([seq.flip(0) for seq in y_train], batch_first=True, padding_value=0).flip(1)
print(X_train.shape,y_train.shape)

torch.Size([189155, 128]) torch.Size([189155, 128])


In [12]:

X_val = [torch.tensor(seq) for seq in X_val]
y_val = [torch.tensor(seq) for seq in y_val]
X_val= pad_sequence([seq.flip(0) for seq in X_val], batch_first=True, padding_value=0).flip(1)
y_val= pad_sequence([seq.flip(0) for seq in y_val], batch_first=True, padding_value=0).flip(1)
print(X_val.shape,y_val.shape)


torch.Size([21018, 128]) torch.Size([21018, 128])


In [13]:
len(tokenizer.get_vocab())

59514

In [14]:
n=128
config=Config(len(tokenizer.get_vocab())+1)
config.n_enc_seq=n
config.n_dec_seq=n
config.d_hidn=n
config.d_ff=n*2
config.d_head=n
config.n_layer=4
print(config)

{'n_enc_vocab': 59515, 'n_dec_vocab': 59515, 'n_enc_seq': 128, 'n_dec_seq': 128, 'n_layer': 4, 'd_hidn': 128, 'i_pad': 0, 'd_ff': 256, 'n_head': 4, 'd_head': 128, 'dropout': 0.1, 'layer_norm_epsilon': 1e-12}


In [15]:
model = Transformer(config)  # 모델 클래스를 인스턴스화
model.load_state_dict(torch.load('complete_model.pth'))
model.eval() 


Transformer(
  (encoder): Encoder(
    (emb): Embeding(
      (emb): Embedding(59515, 128)
      (pos_emb): Embedding(129, 128)
    )
    (layers): ModuleList(
      (0-3): 4 x EncoderLayer(
        (self_attn): MultiHeadAttention(
          (W_Q): Linear(in_features=128, out_features=512, bias=True)
          (W_K): Linear(in_features=128, out_features=512, bias=True)
          (W_V): Linear(in_features=128, out_features=512, bias=True)
          (scaled_dot_attn): ScaledDotProductAttention(
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (linear): Linear(in_features=512, out_features=128, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (layer_norm1): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
        (pos_ffn): PoswiseFeedForwardNet(
          (conv1): Conv1d(128, 256, kernel_size=(1,), stride=(1,))
          (conv2): Conv1d(256, 128, kernel_size=(1,), stride=(1,))
          (dropout): Dropout(p=0.1, inplace=False)

In [19]:
print(X_train[0].shape)
print(X_train[0].view(1,-1).shape)
print(X_train.shape)
print(X_train.view(X_train.size(0),1,-1)[0].shape)

torch.Size([128])
torch.Size([1, 128])
torch.Size([189155, 128])
torch.Size([1, 128])


In [18]:
r=greedy_decoding(model, X_train.view(X_train.size(0),1,-1).to(torch.int64)[0] ,tokenizer)


----
0/
----
1/
----
2/
----
3/
----
4/
----
5/
----
6/
----
7/
----
8/
----
9/
----
10/
----
11/
----
12/
----
13/
----
14/
----
15/
----
16/
----
17/
----
18/
----
19/
----
20/
----
21/
----
22/
----
23/
----
24/
----
25/
----
26/
----
27/
----
28/
----
29/
----
30/
----
31/
----
32/
----
33/
----
34/
----
35/
----
36/
----
37/
----
38/
----
39/
----
40/
----
41/
----
42/
----
43/
----
44/
----
45/
----
46/
----
47/
----
48/
----
49/
----
50/
----
51/
----
52/
----
53/
----
54/
----
55/
----
56/
----
57/
----
58/
----
59/
----
60/
----
61/
----
62/
----
63/
----
64/
----
65/
----
66/
----
67/
----
68/
----
69/
----
70/
----
71/
----
72/
----
73/
----
74/
----
75/
----
76/
----
77/
----
78/
----
79/
----
80/
----
81/
----
82/
----
83/
----
84/
----
85/
----
86/
----
87/
----
88/
----
89/
----
90/
----
91/
----
92/
----
93/
----
94/
----
95/
----
96/
----
97/
----
98/
----
99/
----
100/
----
101/
----
102/
----
103/
----
104/
----
105/
----
106/
----
107/
----
108/
----
109/
----
110/

In [None]:
print(r)

tensor([[817,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0]])


In [None]:
print( X_train[0].view(1,-1).to(torch.int64))
x= X_train[0].view(1,-1).to(torch.int64)

tensor([[    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0, 34378,   226,  5783,    32,
           200,    12,  3647,     4,  1223,  1628,   117,  4923, 23608,     3,
          1789,  2942, 20059,   301,   548,   301,   331,    30,   117,  4923,
            12,     4,  1528,   668,     3,  5734,   212,  9319,    30,     4,
          4923,    57,  5487,    30,     4,     6, 32712,    25,  7243,  1160,
            12,   621,    42,     4,  1156,  3009,  

In [None]:
torch.tensor(tokenizer.convert_ids_to_tokens(x[0]))

ValueError: too many dimensions 'str'

In [None]:
print(y_val[0:3])

tensor([[    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0, 10773,    20,     6,  1549,
             5,    14,     6,  8543,    11,    22,  