In [1]:
# Word-level RNN language model (RNNLM)

In [127]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN

# Preprocessing [ Tokenize ]
---

In [162]:
# Toy corpus: three Korean sentences. Each sentence ends with two newline
# characters (the last one with a single newline), so splitting on '\n'
# later yields the sentences interleaved with empty strings.
doc = (
    "겨울에 눈을 기다리다\n\n"
    "나의 눈은 상대의 눈을 바라본다\n\n"
    "눈이 내리는 길을 걷다\n"
)

# Fit the tokenizer on the whole corpus to build the word -> index mapping.
tkn = Tokenizer()
tkn.fit_on_texts([doc])

# Word indices start at 1 (see the printed word_index), so one extra slot is
# added to make index 0 a valid class / embedding row as well.
vc_size = 1 + len(tkn.word_index)
print('\nword_index size\n', vc_size)
print('\nword_index\n', tkn.word_index)


word_index size
 12

word_index
 {'눈을': 1, '겨울에': 2, '기다리다': 3, '나의': 4, '눈은': 5, '상대의': 6, '바라본다': 7, '눈이': 8, '내리는': 9, '길을': 10, '걷다': 11}


# Preprocessing [ Sequencing ]

In [164]:
# For every sentence, collect each prefix of its token ids that holds at
# least two tokens: the leading tokens will serve as context and the final
# token as the word to predict.
sequences = []

for sentence in doc.split('\n'):
    token_ids = tkn.texts_to_sequences([sentence])[0]
    # Empty lines encode to [] and therefore contribute no prefixes.
    sequences.extend(token_ids[:end] for end in range(2, len(token_ids) + 1))

print('\nsequences\n', sequences)


sequences
 [[2, 1], [2, 1, 3], [4, 5], [4, 5, 6], [4, 5, 6, 1], [4, 5, 6, 1, 7], [8, 9], [8, 9, 10], [8, 9, 10, 11]]


# Preprocessing [ Padding ]

In [165]:
# Length of the longest prefix; every sequence is padded up to this length.
max_len_sequence = len(max(sequences, key=len))
print('\nmax_len_sequence\n', max_len_sequence)

# Left-pad with zeros ('pre') so the last column always holds the final token
# of each prefix.
sequences = pad_sequences(
    sequences,
    maxlen=max_len_sequence,
    padding='pre',
)
print('\nsequence\n', sequences)


max_len_sequence
 5

sequence
 [[ 0  0  0  2  1]
 [ 0  0  2  1  3]
 [ 0  0  0  4  5]
 [ 0  0  4  5  6]
 [ 0  4  5  6  1]
 [ 4  5  6  1  7]
 [ 0  0  0  8  9]
 [ 0  0  8  9 10]
 [ 0  8  9 10 11]]


# Preprocessing [ Label ]

In [166]:
# Split every padded row into model input (all columns but the last) and
# target (the last column).
sequences = np.array(sequences)
train_x, train_y = sequences[:, :-1], sequences[:, -1]

for label, values in (('\ntrain_x\n', train_x), ('\ntrain_y\n', train_y)):
    print(label, values)

# One-hot encode the target indices over the full vocabulary size.
train_y = to_categorical(train_y, num_classes=vc_size)
print('\ntrain_y after one-hot endoded\n', train_y)


train_x
 [[ 0  0  0  2]
 [ 0  0  2  1]
 [ 0  0  0  4]
 [ 0  0  4  5]
 [ 0  4  5  6]
 [ 4  5  6  1]
 [ 0  0  0  8]
 [ 0  0  8  9]
 [ 0  8  9 10]]

train_y
 [ 1  3  5  6  1  7  9 10 11]

train_y after one-hot endoded
 [[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]


# Training

In [167]:
model = Sequential()

# Embedding(input_dim, output_dim, input_length) performs word embedding on
# the integer-encoded text.
#   input_dim    : size of the vocabulary (vc_size)
#   output_dim   : size of the dense vector produced for each word (10 here)
#   input_length : number of words in each input sample; train_x rows are one
#                  token shorter than the padded sequences because the last
#                  token was split off as the label
model.add(Embedding(vc_size, 10, input_length=max_len_sequence - 1))
# Recurrent layer with 32 units, followed by a softmax over the vocabulary so
# the output can be compared against the one-hot encoded train_y.
model.add(SimpleRNN(32))
model.add(Dense(vc_size, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_x, train_y, epochs=200, verbose=2)

Epoch 1/200
9/9 - 0s - loss: 2.4921 - accuracy: 0.2222
Epoch 2/200
9/9 - 0s - loss: 2.4806 - accuracy: 0.1111
Epoch 3/200
9/9 - 0s - loss: 2.4692 - accuracy: 0.2222
Epoch 4/200
9/9 - 0s - loss: 2.4581 - accuracy: 0.2222
Epoch 5/200
9/9 - 0s - loss: 2.4470 - accuracy: 0.2222
Epoch 6/200
9/9 - 0s - loss: 2.4359 - accuracy: 0.2222
Epoch 7/200
9/9 - 0s - loss: 2.4247 - accuracy: 0.2222
Epoch 8/200
9/9 - 0s - loss: 2.4134 - accuracy: 0.2222
Epoch 9/200
9/9 - 0s - loss: 2.4019 - accuracy: 0.2222
Epoch 10/200
9/9 - 0s - loss: 2.3902 - accuracy: 0.2222
Epoch 11/200
9/9 - 0s - loss: 2.3782 - accuracy: 0.3333
Epoch 12/200
9/9 - 0s - loss: 2.3659 - accuracy: 0.3333
Epoch 13/200
9/9 - 0s - loss: 2.3533 - accuracy: 0.2222
Epoch 14/200
9/9 - 0s - loss: 2.3402 - accuracy: 0.2222
Epoch 15/200
9/9 - 0s - loss: 2.3268 - accuracy: 0.2222
Epoch 16/200
9/9 - 0s - loss: 2.3129 - accuracy: 0.2222
Epoch 17/200
9/9 - 0s - loss: 2.2985 - accuracy: 0.2222
Epoch 18/200
9/9 - 0s - loss: 2.2836 - accuracy: 0.2222
E

Epoch 147/200
9/9 - 0s - loss: 0.3471 - accuracy: 1.0000
Epoch 148/200
9/9 - 0s - loss: 0.3413 - accuracy: 1.0000
Epoch 149/200
9/9 - 0s - loss: 0.3356 - accuracy: 1.0000
Epoch 150/200
9/9 - 0s - loss: 0.3300 - accuracy: 1.0000
Epoch 151/200
9/9 - 0s - loss: 0.3246 - accuracy: 1.0000
Epoch 152/200
9/9 - 0s - loss: 0.3193 - accuracy: 1.0000
Epoch 153/200
9/9 - 0s - loss: 0.3141 - accuracy: 1.0000
Epoch 154/200
9/9 - 0s - loss: 0.3090 - accuracy: 1.0000
Epoch 155/200
9/9 - 0s - loss: 0.3040 - accuracy: 1.0000
Epoch 156/200
9/9 - 0s - loss: 0.2991 - accuracy: 1.0000
Epoch 157/200
9/9 - 0s - loss: 0.2943 - accuracy: 1.0000
Epoch 158/200
9/9 - 0s - loss: 0.2896 - accuracy: 1.0000
Epoch 159/200
9/9 - 0s - loss: 0.2850 - accuracy: 1.0000
Epoch 160/200
9/9 - 0s - loss: 0.2805 - accuracy: 1.0000
Epoch 161/200
9/9 - 0s - loss: 0.2762 - accuracy: 1.0000
Epoch 162/200
9/9 - 0s - loss: 0.2718 - accuracy: 1.0000
Epoch 163/200
9/9 - 0s - loss: 0.2676 - accuracy: 1.0000
Epoch 164/200
9/9 - 0s - loss: 

<tensorflow.python.keras.callbacks.History at 0x7f69ca9c4a20>

# Predict 

In [168]:
def SentenceGeneration(model, tkn, curr_word, n, max_len=None):
    '''Generate up to ``n`` additional words following ``curr_word``.

    At each step the text produced so far is encoded with ``tkn``, left-padded
    (or truncated) to ``max_len`` tokens, and passed to ``model.predict``; the
    index with the highest score is mapped back to a word and appended.

    model     : trained model whose ``predict`` returns one score per
                vocabulary index for each input row
    tkn       : fitted Tokenizer (``texts_to_sequences`` and ``index_word``
                are used)
    curr_word : seed text to start generating from
    n         : maximum number of words to generate
    max_len   : number of tokens fed to the model; when omitted, the
                notebook-level ``max_len_sequence - 1`` is used

    Returns the seed text followed by the generated words, separated by single
    spaces. Generation stops early if the model predicts an index that has no
    word in the vocabulary (for example the padding index 0).
    '''
    if max_len is None:
        # Fall back to the input length used when train_x was built.
        max_len = max_len_sequence - 1

    init_word = curr_word
    generated = []

    for _ in range(n):
        encoded = tkn.texts_to_sequences([curr_word])[0]
        encoded = pad_sequences([encoded], maxlen=max_len, padding='pre')

        # `predict_classes` was removed from Keras models in TF 2.6; taking the
        # argmax over the predicted distribution is the documented replacement.
        scores = model.predict(encoded, verbose=0)
        predicted_idx = int(np.argmax(scores, axis=-1)[0])

        # Direct reverse lookup instead of scanning word_index. A missing
        # index means there is no word to append, so stop rather than
        # silently reusing an unrelated word.
        word = tkn.index_word.get(predicted_idx)
        if word is None:
            break

        curr_word = curr_word + ' ' + word
        print('\ncurr_word', curr_word)
        generated.append(word)

    return ' '.join([init_word] + generated)
            

In [172]:
# Show the vocabulary first, then generate three words after the seed '나의'.
# The function prints its intermediate text while running, so the call must
# stay after the vocabulary print to keep the output order.
print('\ncurr_sentense', tkn.word_index)
generated_sentence = SentenceGeneration(model, tkn, '나의', 3)
print(generated_sentence)


curr_sentense {'눈을': 1, '겨울에': 2, '기다리다': 3, '나의': 4, '눈은': 5, '상대의': 6, '바라본다': 7, '눈이': 8, '내리는': 9, '길을': 10, '걷다': 11}

curr_word 나의 눈은

curr_word 나의 눈은 상대의

curr_word 나의 눈은 상대의 눈을
나의 눈은 상대의 눈을
