In [1]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [2]:
text = """가는 말이 고와야 오는 말이 곱다\n"""

In [3]:
# Learn the vocabulary from the corpus. Word indices start at 1 (0 is the
# padding value), so the vocabulary size is the number of distinct words + 1.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
vocab_size = 1 + len(tokenizer.word_index)
print('단어 집합의 크기 : %d' % vocab_size)

단어 집합의 크기 : 6


In [4]:
print(tokenizer.word_index)

{'말이': 1, '가는': 2, '고와야': 3, '오는': 4, '곱다': 5}


In [5]:
# Turn every line into training samples: each prefix of the encoded line with
# at least two tokens becomes one sample (context tokens + next-word label).
sequences = []
for line in text.split('\n'):  # sentence-tokenize on the newline character
    token_ids = tokenizer.texts_to_sequences([line])[0]
    sequences.extend(token_ids[:end] for end in range(2, len(token_ids) + 1))

print('학습에 사용할 샘플의 개수: %d' % len(sequences))

학습에 사용할 샘플의 개수: 5


In [6]:
print(sequences)

[[2, 1], [2, 1, 3], [2, 1, 3, 4], [2, 1, 3, 4, 1], [2, 1, 3, 4, 1, 5]]


In [7]:
# Length of the longest sample; used as the common length when padding.
max_len = max(map(len, sequences))
print('샘플의 최대 길이 : {}'.format(max_len))

샘플의 최대 길이 : 6


In [8]:
# Left-pad ('pre') every sample with zeros up to max_len so that all samples
# share one length and the label stays in the last column.
sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')

In [9]:
sequences

array([[0, 0, 0, 0, 2, 1],
       [0, 0, 0, 2, 1, 3],
       [0, 0, 2, 1, 3, 4],
       [0, 2, 1, 3, 4, 1],
       [2, 1, 3, 4, 1, 5]], dtype=int32)

In [10]:
# Split each padded sample into the model input (every token except the last)
# and the target (the last token).
sequences = np.array(sequences)
X, y = sequences[:, :-1], sequences[:, -1]

In [11]:
X

array([[0, 0, 0, 0, 2],
       [0, 0, 0, 2, 1],
       [0, 0, 2, 1, 3],
       [0, 2, 1, 3, 4],
       [2, 1, 3, 4, 1]], dtype=int32)

In [12]:
y

array([1, 3, 4, 1, 5], dtype=int32)

In [13]:
# One-hot encode the labels. They are taken from the last column of
# `sequences` instead of from `y` itself, so re-running this cell yields the
# same result rather than one-hot encoding an already one-hot array.
y = to_categorical(sequences[:, -1], num_classes=vocab_size)

In [14]:
y

array([[0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.]], dtype=float32)

## 문장을 생성하는 함수

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN

In [16]:
# Hyperparameters
embedding_dim = 10  # dimensionality of each word embedding
hidden_units = 32   # number of units in the SimpleRNN layer

# Embedding -> SimpleRNN -> softmax over the vocabulary (next-word prediction).
model = Sequential([
    Embedding(vocab_size, embedding_dim),
    SimpleRNN(hidden_units),
    Dense(vocab_size, activation='softmax'),
])
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


2022-05-21 19:45:39.377975: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-21 19:45:39.397532: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-21 19:45:39.397633: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-21 19:45:39.397991: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [20]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 10)          60        
                                                                 
 simple_rnn (SimpleRNN)      (None, 32)                1376      
                                                                 
 dense (Dense)               (None, 6)                 198       
                                                                 
Total params: 1,634
Trainable params: 1,634
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Train for 200 epochs; verbose=2 prints one summary line per epoch.
model.fit(X, y, epochs=200, verbose=2)

Epoch 1/200
1/1 - 1s - loss: 1.8112 - accuracy: 0.2000 - 906ms/epoch - 906ms/step
Epoch 2/200
1/1 - 0s - loss: 1.7924 - accuracy: 0.4000 - 3ms/epoch - 3ms/step
Epoch 3/200
1/1 - 0s - loss: 1.7739 - accuracy: 0.4000 - 3ms/epoch - 3ms/step
Epoch 4/200
1/1 - 0s - loss: 1.7556 - accuracy: 0.4000 - 3ms/epoch - 3ms/step
Epoch 5/200
1/1 - 0s - loss: 1.7373 - accuracy: 0.2000 - 3ms/epoch - 3ms/step
Epoch 6/200
1/1 - 0s - loss: 1.7189 - accuracy: 0.4000 - 3ms/epoch - 3ms/step
Epoch 7/200
1/1 - 0s - loss: 1.7003 - accuracy: 0.4000 - 3ms/epoch - 3ms/step
Epoch 8/200
1/1 - 0s - loss: 1.6814 - accuracy: 0.4000 - 3ms/epoch - 3ms/step
Epoch 9/200
1/1 - 0s - loss: 1.6620 - accuracy: 0.4000 - 3ms/epoch - 3ms/step
Epoch 10/200
1/1 - 0s - loss: 1.6421 - accuracy: 0.4000 - 3ms/epoch - 3ms/step
Epoch 11/200
1/1 - 0s - loss: 1.6215 - accuracy: 0.6000 - 3ms/epoch - 3ms/step
Epoch 12/200
1/1 - 0s - loss: 1.6003 - accuracy: 0.6000 - 3ms/epoch - 3ms/step
Epoch 13/200
1/1 - 0s - loss: 1.5785 - accuracy: 0.6000 -

2022-05-21 19:45:40.713863: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Epoch 47/200
1/1 - 0s - loss: 0.7849 - accuracy: 0.8000 - 3ms/epoch - 3ms/step
Epoch 48/200
1/1 - 0s - loss: 0.7593 - accuracy: 0.8000 - 3ms/epoch - 3ms/step
Epoch 49/200
1/1 - 0s - loss: 0.7341 - accuracy: 0.8000 - 3ms/epoch - 3ms/step
Epoch 50/200
1/1 - 0s - loss: 0.7092 - accuracy: 0.8000 - 3ms/epoch - 3ms/step
Epoch 51/200
1/1 - 0s - loss: 0.6847 - accuracy: 0.8000 - 3ms/epoch - 3ms/step
Epoch 52/200
1/1 - 0s - loss: 0.6608 - accuracy: 0.8000 - 3ms/epoch - 3ms/step
Epoch 53/200
1/1 - 0s - loss: 0.6374 - accuracy: 0.8000 - 3ms/epoch - 3ms/step
Epoch 54/200
1/1 - 0s - loss: 0.6147 - accuracy: 0.8000 - 4ms/epoch - 4ms/step
Epoch 55/200
1/1 - 0s - loss: 0.5927 - accuracy: 0.8000 - 3ms/epoch - 3ms/step
Epoch 56/200
1/1 - 0s - loss: 0.5713 - accuracy: 0.8000 - 3ms/epoch - 3ms/step
Epoch 57/200
1/1 - 0s - loss: 0.5508 - accuracy: 0.8000 - 3ms/epoch - 3ms/step
Epoch 58/200
1/1 - 0s - loss: 0.5310 - accuracy: 0.8000 - 3ms/epoch - 3ms/step
Epoch 59/200
1/1 - 0s - loss: 0.5120 - accuracy: 0.8

<keras.callbacks.History at 0x7f9a74a3afd0>

In [18]:
def sentence_generation(model, tokenizer, current_word, n, input_len=5):
    """Generate a sentence by repeatedly predicting the next word.

    Starting from ``current_word``, the text generated so far is integer-encoded,
    left-padded to ``input_len`` and passed to ``model``. The index with the
    highest score is mapped back to a word, which is appended to the text. This
    is repeated up to ``n`` times.

    Args:
        model: Object whose ``predict(batch, verbose=0)`` returns one row of
            per-index scores for each input row.
        tokenizer: Object providing ``texts_to_sequences`` and ``word_index``
            (a word -> integer index mapping).
        current_word: Seed text the sentence starts with.
        n: Maximum number of words to append.
        input_len: Length the encoded input is padded to. Defaults to 5, the
            value that used to be hard-coded; it has to match the input length
            the model was trained on.

    Returns:
        The seed text followed by the predicted words, separated by single
        spaces. Generation stops early if the model predicts an index that has
        no vocabulary entry (such as the padding index 0).
    """
    init_word = current_word

    # Invert the vocabulary once instead of scanning word_index at every step.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    generated = []
    for _ in range(n):
        # Integer-encode and pad the text generated so far.
        encoded = tokenizer.texts_to_sequences([current_word])[0]
        encoded = pad_sequences([encoded], maxlen=input_len, padding='pre')

        # Predict the next word: take the arg-max index as a plain int.
        scores = model.predict(encoded, verbose=0)
        predicted_index = int(np.argmax(scores, axis=1)[0])

        word = index_to_word.get(predicted_index)
        if word is None:
            # No word for this index; stop instead of appending an
            # unrelated word left over from a failed lookup.
            break

        # The extended text becomes the input of the next step.
        current_word = current_word + ' ' + word
        generated.append(word)

    return ' '.join([init_word] + generated)

In [19]:
print(sentence_generation(model, tokenizer, '가는', 5))

가는 말이 고와야 오는 말이 곱다
