In [1]:
import numpy as np
import re
from IPython.display import clear_output

from keras.layers import Dense, LSTM, Input, Embedding, Dropout
from keras.utils import np_utils
from keras.models import Model, load_model
from keras.optimizers import Adam, RMSprop
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.callbacks import LambdaCallback

In [2]:
load_saved_model = False
train_model = False

In [3]:
token_type = 'word'

In [4]:
seq_length = 20

filename = "../data/aesop.txt"

with open(filename, encoding='utf-8-sig') as f:
    text = f.read()
    
    
#removing text before and after the main stories
start = text.find("THE FOX AND THE GRAPES\n\n\n") #1
end = text.find("ILLUSTRATIONS\n\n\n") #-1
text = text[start:end]

In [5]:
start_story = '| ' * seq_length
    
text = start_story + text
text = text.lower()
text = text.replace('\n\n\n\n\n', start_story)
text = text.replace('\n', '')
text = re.sub('  +', '. ', text).strip()
text = text.replace('..', '.')

text = re.sub('([!"#$%&()*+,-./:;<=>?@[\]^_`{|}~])', r' \1 ', text)
text = re.sub('\s{2,}', ' ', text)

In [6]:
len(text)

210620

In [7]:
text

' | | | | | | | | | | | | | | | | | | | | the fox and the grapesa hungry fox saw some fine bunches of grapes hanging from a vine thatwas trained along a high trellis , and did his best to reach them byjumping as high as he could into the air . but it was all in vain , forthey were just out of reach : so he gave up trying , and walked awaywith an air of dignity and unconcern , remarking , " i thought thosegrapes were ripe , but i see now they are quite sour . " | | | | | | | | | | | | | | | | | | | | the goose that laid the golden eggsa man and his wife had the good fortune to possess a goose which laida golden egg every day . lucky though they were , they soon began tothink they were not getting rich fast enough , and , imagining the birdmust be made of gold inside , they decided to kill it in order tosecure the whole store of precious metal at once . but when they cut itopen they found it was just like any other goose . thus , they neithergot rich all at once , as they had hoped , nor

In [8]:
if token_type == 'word': #단어 토큰화
    tokenizer = Tokenizer(char_level = False, filters = '')
else:
    tokenizer = Tokenizer(char_level = True, filters = '', lower = False)

In [9]:
# fit_on_texts()안에 코퍼스를 입력으로 하면 빈도수를 기준으로 단어 집합을 생성
tokenizer.fit_on_texts([text]) 

In [10]:
print(tokenizer.word_index)

{'|': 1, ',': 2, 'the': 3, 'and': 4, '.': 5, '"': 6, 'to': 7, 'a': 8, 'he': 9, 'of': 10, 'his': 11, 'you': 12, 'in': 13, 'was': 14, 'him': 15, 'it': 16, 'but': 17, 'i': 18, 'for': 19, 'that': 20, 'said': 21, 'with': 22, 'they': 23, 'as': 24, 'at': 25, ':': 26, 'when': 27, ';': 28, 'on': 29, 'by': 30, 'so': 31, 'had': 32, 'one': 33, 'be': 34, 'them': 35, 'up': 36, 'all': 37, '-': 38, 'who': 39, 'is': 40, 'out': 41, 'my': 42, 'her': 43, 'me': 44, 'not': 45, 'were': 46, 'which': 47, 'no': 48, 'your': 49, 'if': 50, 'have': 51, '!': 52, 'their': 53, 'she': 54, '?': 55, 'what': 56, 'into': 57, 'are': 58, 'lion': 59, 'time': 60, 'fox': 61, 'from': 62, 'ass': 63, 'man': 64, 'do': 65, 'came': 66, 'an': 67, 'about': 68, 'upon': 69, 'wolf': 70, 'then': 71, 'day': 72, 'this': 73, 'very': 74, 'there': 75, 'would': 76, 'once': 77, 'himself': 78, 'well': 79, 'will': 80, 'some': 81, 'could': 82, 'got': 83, 'went': 84, 'down': 85, 'replied': 86, 'see': 87, 'away': 88, 'off': 89, 'before': 90, 'than': 9

In [11]:
total_words = len(tokenizer.word_index) + 1 

In [12]:
token_list = tokenizer.texts_to_sequences([text])[0]

In [13]:
print(token_list) #text의 단어들 길이
print(len(token_list)) #50436

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 61, 4, 3, 2205, 371, 61, 104, 81, 195, 1432, 10, 1433, 847, 62, 8, 697, 2206, 2207, 161, 8, 372, 2208, 2, 4, 157, 11, 277, 7, 373, 35, 2209, 24, 372, 24, 9, 82, 57, 3, 698, 5, 17, 16, 14, 37, 13, 699, 2, 1434, 46, 94, 41, 10, 373, 26, 31, 9, 172, 36, 417, 2, 4, 580, 2210, 67, 698, 10, 1435, 4, 2211, 2, 1436, 2, 6, 18, 138, 2212, 46, 1065, 2, 17, 18, 87, 118, 23, 58, 463, 2213, 5, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1066, 20, 700, 3, 848, 2214, 64, 4, 11, 511, 32, 3, 97, 418, 7, 2215, 8, 1066, 47, 2216, 848, 1067, 164, 72, 5, 849, 374, 23, 46, 2, 23, 184, 130, 2217, 23, 46, 45, 375, 419, 420, 179, 2, 4, 2, 2218, 3, 2219, 34, 109, 10, 701, 376, 2, 23, 850, 7, 254, 16, 13, 294, 2220, 3, 581, 512, 10, 2221, 2222, 25, 77, 5, 17, 27, 23, 337, 2223, 23, 120, 16, 14, 94, 98, 139, 93, 1066, 5, 255, 2, 23, 2224, 419, 37, 25, 77, 2, 24, 23, 32, 2225, 2, 702, 1068, 139, 278, 2226, 2227, 7, 53, 1069, 5, 95

In [14]:
total_words #unique한 단어의 갯수

5978

In [15]:
def generate_sequences(token_list, step):
    X = []
    y = []

    for i in range(0, len(token_list) - seq_length, step):
        X.append(token_list[i: i + seq_length])
        y.append(token_list[i + seq_length])
    

    y = np_utils.to_categorical(y, num_classes = total_words) #원핫 인코딩
    
    num_seq = len(X)
    print('Number of sequences:', num_seq, "\n")
    
    return X, y, num_seq

step = 1
seq_length = 20

X, y, num_seq = generate_sequences(token_list, step)

X = np.array(X)
y = np.array(y)

Number of sequences: 47625 



In [16]:
X.shape #X 20벡터의 값이 y의 4169벡터 값에 대응

(47625, 20)

In [17]:
y.shape

(47625, 5978)

In [18]:
#LSTM model
if load_saved_model: #이미 한번 돌려서 실행되어져있을 시
    # model = load_model('./saved_models/lstm_aesop_1.h5')
    model = load_model('./saved_models/aesop_dropout_100.h5')

else:

    n_units = 256
    embedding_size = 100

    text_in = Input(shape = (None,))
    embedding = Embedding(total_words, embedding_size)
    x = embedding(text_in)
    x = LSTM(n_units)(x)
    # x = Dropout(0.2)(x)
    text_out = Dense(total_words, activation = 'softmax')(x)

    model = Model(text_in, text_out)

    opti = RMSprop(lr = 0.001)
    model.compile(loss='categorical_crossentropy', optimizer=opti)

In [19]:
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 100)         597800    
_________________________________________________________________
lstm (LSTM)                  (None, 256)               365568    
_________________________________________________________________
dense (Dense)                (None, 5978)              1536346   
Total params: 2,499,714
Trainable params: 2,499,714
Non-trainable params: 0
_________________________________________________________________


이 함수는 softmax 함수 다시 적용하기 전에 temperature 스케일 매개 변수로 logit에 가중치 부여  
temperature값이 0에 가까울수록 샘플링을 더 결정적으로 만든다.(즉, 가장 높은 확률을 가진 단어가 선택될 가능성이 많음)  
temperature 값이 1에 가까우면 모델이 출력한 확률에 따라 단어가 선택

In [None]:
def sample_with_temp(preds, temperature=1.0):
    # 확률 배열에서 인덱스 하나를 샘플링하는 helper 함수
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [20]:
def generate_text(seed_text, next_words, model, max_sequence_len, temp):
    output_text = seed_text
    
    seed_text = start_story + seed_text
    
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0] #단어를 토큰의 리스트로 변환
        token_list = token_list[-max_sequence_len:] #max_sequence_len 개의 토큰만 유지(길이제한)
        token_list = np.reshape(token_list, (1, max_sequence_len))
        
        probs = model.predict(token_list, verbose=0)[0]
        y_class = sample_with_temp(probs, temperature = temp)
        
        if y_class == 0:
            output_word = ''
        else:
            output_word = tokenizer.index_word[y_class]
            
        if output_word == "|":
            break
            
        if token_type == 'word':
            output_text += output_word + ' '
            seed_text += output_word + ' '
        else:
            output_text += output_word + ' ' #그렇지 않으면 새로운 단어를 seed_text에 덧붙이고 다음 생성 과정을 반복할 준비
            seed_text += output_word + ' '
            
            
    return output_text

In [21]:
def on_epoch_end(epoch, logs):
    seed_text = ""
    gen_words = 500

    print('Temp 0.2')
    print (generate_text(seed_text, gen_words, model, seq_length, temp = 0.2))
    print('Temp 0.33')
    print (generate_text(seed_text, gen_words, model, seq_length, temp = 0.33))
    print('Temp 0.5')
    print (generate_text(seed_text, gen_words, model, seq_length, temp = 0.5))
    print('Temp 1.0')
    print (generate_text(seed_text, gen_words, model, seq_length, temp = 1))

    
    
if train_model:
    epochs = 1000
    batch_size = 32
    num_batches = int(len(X) / batch_size)
    callback = LambdaCallback(on_epoch_end=on_epoch_end)
    model.fit(X, y, epochs=epochs, batch_size=batch_size, callbacks = [callback], shuffle = True)

In [22]:
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 100)         597800    
_________________________________________________________________
lstm (LSTM)                  (None, 256)               365568    
_________________________________________________________________
dense (Dense)                (None, 5978)              1536346   
Total params: 2,499,714
Trainable params: 2,499,714
Non-trainable params: 0
_________________________________________________________________


In [23]:
seed_text = "the frog and the snake . "
gen_words = 500
temp = 0.1

print (generate_text(seed_text, gen_words, model, seq_length, temp))

the frog and the snake . daily avoided minutes hadas that talk separate defeat ravena fornothing tell drop warmly earlierthan replied loads nothingbut sacrificed juno thegift wemust newmoon inquiry captors ailing whitedress offin winged man : andcombing herd getyour faster dug destroyourselves sell feast itand firmly sowa beat flicks attentionupon feathers these hind himdo gown tips meeting harder cows flicks death's heels blow buried rooks small thatmeans treeinterrupted meansof hefelt steaming theyinvited shepherdone handle loses wasn't bottom andthe sodisgusted pet vacant noble snake'shole pursuers golden theweasel farewell frightened toshame festivals thanshe pest rejoined shepherds stagthere andfrom theni'll be incase hersong antidote invitedeach clearing dividebetween dogonce foxfor acemetery mounted verypopular recognisedhis fun himloose discontented pole guilty whatpowerful wringits sowa wateredwhen sofast mixing beingchosen ireally these thedoctor window contrastinghis dearly 

In [24]:
def generate_human_led_text(model, max_sequence_len):
    
    output_text = ''
    seed_text = start_story
    
    while 1:
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = token_list[-max_sequence_len:]
        token_list = np.reshape(token_list, (1, max_sequence_len))
        
        probs = model.predict(token_list, verbose=0)[0]

        top_10_idx = np.flip(np.argsort(probs)[-10:])
        top_10_probs = [probs[x] for x in top_10_idx]
        top_10_words = tokenizer.sequences_to_texts([[x] for x in top_10_idx])
        
        for prob, word in zip(top_10_probs, top_10_words):
            print('{:<6.1%} : {}'.format(prob, word))

        chosen_word = input()
                
        if chosen_word == '|':
            break
            
        
        seed_text += chosen_word + ' '
        output_text += chosen_word + ' '
        
        clear_output()

        print (output_text)

In [26]:
generate_human_led_text(model, 20)

the dog and the hare. a dog was lying 
0.0%   : enemy
0.0%   : mangera
0.0%   : staff
0.0%   : clipped
0.0%   : requested
0.0%   : by
0.0%   : minding
0.0%   : elect
0.0%   : kingtime
0.0%   : numberof
|


In [None]:
# model.save('./saved_models/aesop_no_dropout_100.h5')