In [53]:
import tensorflow as tf
from tensorflow import keras
import numpy as np

In [51]:
# Load the Arabian Nights text into a single string.
file = 'arabian-nights.txt'
# Decode explicitly instead of relying on the platform's default locale
# encoding, so the notebook reads the same text on every machine.
# NOTE(review): assumes the file is UTF-8 encoded — confirm against the source.
with open(file, 'r', encoding='utf-8') as f:
    corpus = f.read()

# Preview the first characters as a sanity check that the load worked.
corpus[:50]

'The Arabian Nights\nIn the chronicles of the ancien'

In [125]:
# Context window size (in tokens); reused when padding inputs, building the
# model and generating text.
seq_len = 10
# Tokenizer shared by the whole notebook; its vocabulary is populated inside
# generate_training_data via fit_on_texts.
tokenizer = keras.preprocessing.text.Tokenizer()

def generate_training_data(corpus, seq_len, text_tokenizer=None, max_lines=1000):
    """Build next-token prediction samples from ``corpus``.

    The tokenizer vocabulary is fitted on the *whole* corpus. Each non-blank
    line is then tokenized on its own and, for every token after the first,
    one sample is emitted: the (at most ``seq_len``) tokens preceding it as
    the input and the token itself as the target. Contexts never span lines.

    Args:
        corpus: Full text; lines are separated by newline characters.
        seq_len: Maximum number of context tokens in one input sequence.
        text_tokenizer: Object exposing ``fit_on_texts`` and
            ``texts_to_sequences``. When omitted, the notebook-level
            ``tokenizer`` is used.
        max_lines: Only the first ``max_lines`` lines of the corpus are
            turned into samples; pass ``None`` to use every line.

    Returns:
        A ``(dataX, dataY)`` tuple. ``dataX`` is a list of unpadded,
        variable-length lists of token ids; ``dataY`` is the list of target
        token ids, aligned index-by-index with ``dataX``.
    """
    tok = tokenizer if text_tokenizer is None else text_tokenizer

    # Populate the tokenizer vocabulary
    tok.fit_on_texts([corpus])

    dataX = []
    dataY = []

    for line in corpus.split('\n')[:max_lines]:
        if not line.strip():
            continue
        tokens = tok.texts_to_sequences([line])[0]
        # Lines with fewer than two tokens yield no samples (empty range).
        for i in range(1, len(tokens)):
            # Take a full window of up to seq_len preceding tokens. The former
            # slice started at i-seq_len+1 and therefore produced only
            # seq_len-1 tokens, which did not match the seq_len-token windows
            # fed to the model at generation time.
            dataX.append(tokens[max(0, i - seq_len):i])
            dataY.append(tokens[i])
    return dataX, dataY

# Build the (context, next-token) samples, then preview the first five of each.
dataX, dataY = generate_training_data(corpus=corpus, seq_len=seq_len)
(dataX[:5], dataY[:5])

([[1], [1, 4049], [10], [10, 1], [10, 1, 4050]], [4049, 1853, 1, 4050, 4])

In [126]:
# Bring the variable-length contexts to a common length of seq_len so they
# can be stacked into one 2-D input array.
paddedX = keras.preprocessing.sequence.pad_sequences(dataX, maxlen=seq_len)

# Targets become a column vector: one token id per sample.
Y = np.array(dataY).reshape(-1, 1)
X = np.array(paddedX)
Y.shape

(23366, 1)

In [127]:
# Create the model
def create_model(vocab_size, seq_len, embedding_dim=10, lstm_units=256, dropout_rate=0.1):
    """Build and compile the next-token prediction network.

    The stack is Embedding -> LSTM -> Dropout -> Dense(softmax over the
    vocabulary). It is compiled with sparse categorical cross-entropy, so
    targets are expected as integer token ids rather than one-hot vectors.

    Args:
        vocab_size: Number of distinct token ids; sizes both the embedding
            table and the softmax output.
        seq_len: Length of each (padded) input sequence.
        embedding_dim: Size of each token embedding vector.
        lstm_units: Number of units in the LSTM layer.
        dropout_rate: Fraction of LSTM outputs dropped during training.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    model = keras.Sequential([
        keras.layers.Embedding(vocab_size, embedding_dim, input_length=seq_len),
        # LSTM
        keras.layers.LSTM(lstm_units),
        # Regularization layer
        keras.layers.Dropout(dropout_rate),
        # Dense softmax: one probability per vocabulary entry
        keras.layers.Dense(vocab_size, activation="softmax"),
    ])

    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# One slot per known word plus one extra id.
# NOTE(review): the +1 presumably accounts for id 0, which word_index does not
# assign to any word — confirm against the Tokenizer documentation.
vocab_size = len(tokenizer.word_index) + 1
model = create_model(vocab_size, seq_len)
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 10, 10)            63530     
_________________________________________________________________
lstm_9 (LSTM)                (None, 256)               273408    
_________________________________________________________________
dropout_9 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 6353)              1632721   
Total params: 1,969,659
Trainable params: 1,969,659
Non-trainable params: 0
_________________________________________________________________


In [128]:
# Fit the network on the padded contexts (X) and their next-token targets (Y)
# for ten passes over the data.
model.fit(x=X, y=Y, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f8d76ecb5d0>

In [165]:
# Greedy generation: starting from the seed, repeatedly predict the most
# likely next token from the latest seq_len tokens and append it.
seed_text = 'a crow found a pot of gold with some water in it'
gen_tokens = tokenizer.texts_to_sequences([seed_text])
for i in range(100):
    if len(gen_tokens[0]) <= seq_len:
        # Not more than one window of tokens yet: pad up to seq_len.
        seed_tokens = keras.preprocessing.sequence.pad_sequences(gen_tokens, maxlen=seq_len)
    else:
        # Longer than the window: keep only the most recent seq_len tokens.
        seed_tokens = np.array([gen_tokens[0][-seq_len:]])
    preds = model.predict(seed_tokens)
    # Pick the token id with the highest predicted score.
    index = np.argmax(preds)
    gen_tokens[0].append(index)
# Map the seed plus generated token ids back to text.
tokenizer.sequences_to_texts(gen_tokens)

['a found a pot of gold with some water in it and the genius of the genius and was and to the sultan who was very pleased to the sultan who was and to the sultan who was very pleased to the sultan who was and to the sultan who was very pleased to the sultan who was and to the sultan who was very pleased to the sultan who was and to the sultan who was very pleased to the sultan who was and to the sultan who was very pleased to the sultan who was and to the sultan who was very pleased to the sultan who was and']