In [53]:
import tensorflow as tf
from tensorflow import keras
import numpy as np

In [51]:
# Read the whole Arabian Nights text file into one string
file = 'arabian-nights.txt'
corpus = None
with open(file, mode='r') as f:
    corpus = f.read()

# Show the first 50 characters as a quick sanity check of the load
corpus[:50]

'The Arabian Nights\nIn the chronicles of the ancien'

In [169]:
# Tokenization setup
# Context length: used when slicing training windows and as the padded input length
seq_len = 15
# Shared Keras tokenizer; its vocabulary is fitted inside generate_training_data
tokenizer = keras.preprocessing.text.Tokenizer()

def generate_training_data(corpus, seq_len, text_tokenizer=None, max_lines=1000):
    """Build next-token prediction samples from a newline-separated corpus.

    The tokenizer vocabulary is fitted on the whole corpus. Each non-blank
    line is then converted to token ids, and for every position ``i >= 1``
    in the line one sample is emitted: the input is the (at most ``seq_len``)
    tokens immediately before position ``i`` and the target is the token at
    position ``i``.

    Args:
        corpus: Full text as a single string; samples never span two lines.
        seq_len: Maximum number of context tokens kept per input sequence.
        text_tokenizer: Object exposing ``fit_on_texts`` and
            ``texts_to_sequences``. When omitted, the notebook-level
            ``tokenizer`` is used.
        max_lines: Only the first ``max_lines`` lines of the corpus produce
            samples; pass ``None`` to use every line.

    Returns:
        A ``(dataX, dataY)`` pair of equal-length lists. ``dataX`` holds the
        unpadded context sequences (lists of token ids, 1 to ``seq_len``
        long) and ``dataY`` holds the matching target token ids.
    """
    if text_tokenizer is None:
        # Fall back to the tokenizer shared by the rest of the notebook.
        text_tokenizer = tokenizer

    # Populate the tokenizer vocabulary from the complete corpus, even when
    # only a subset of lines is turned into samples.
    text_tokenizer.fit_on_texts([corpus])

    lines = corpus.split('\n')
    if max_lines is not None:
        lines = lines[:max_lines]

    dataX = []
    dataY = []

    for line in lines:
        if not line.strip():
            continue
        tokens = text_tokenizer.texts_to_sequences([line])[0]
        if not tokens:
            continue
        for i in range(1, len(tokens)):
            # Keep the full seq_len-token window before position i. The
            # earlier slice started one token late and so produced windows
            # of only seq_len - 1 tokens, shorter than the context fed to
            # the model at generation time.
            start = max(0, i - seq_len)
            dataX.append(tokens[start:i])
            dataY.append(tokens[i])
    return dataX, dataY

# Build the (context, next-token) training pairs, then preview the first five of each
dataX, dataY = generate_training_data(corpus=corpus, seq_len=seq_len)
(dataX[:5], dataY[:5])

([[1], [1, 4049], [10], [10, 1], [10, 1, 4050]], [4049, 1853, 1, 4050, 4])

In [170]:
# Bring every input sequence to the common length seq_len
paddedX = keras.preprocessing.sequence.pad_sequences(dataX, maxlen=seq_len)

# Model inputs as an array
X = np.array(paddedX)
# Targets as a 2-D array with one column (one target id per sample)
Y = np.array(dataY).reshape(len(dataY), 1)
Y.shape

(23366, 1)

In [171]:
# Create the model
def create_model(vocab_size, seq_len, embedding_dim=20, lstm_units=256, dropout_rate=0.2):
    """Build and compile the next-token prediction network.

    The model is a stack of Embedding -> LSTM -> Dropout -> Dense(softmax),
    compiled with sparse categorical cross-entropy loss, the Adam optimizer
    and an accuracy metric.

    Args:
        vocab_size: Number of distinct token ids; used both as the embedding
            input dimension and as the width of the softmax output.
        seq_len: Length of each input sequence given to the embedding layer.
        embedding_dim: Size of each token embedding vector.
        lstm_units: Number of units in the LSTM layer.
        dropout_rate: Dropout fraction applied to the LSTM output.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    model = keras.Sequential([
        keras.layers.Embedding(vocab_size, embedding_dim, input_length=seq_len),
        # Recurrent layer producing a single vector per input sequence
        keras.layers.LSTM(lstm_units),
        # Regularization layer
        keras.layers.Dropout(dropout_rate),
        # One softmax score per vocabulary entry
        keras.layers.Dense(vocab_size, activation="softmax"),
    ])

    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Vocabulary size is the tokenizer's word count plus one
# (presumably because token id 0 is reserved for padding -- confirm against the Keras Tokenizer docs)
model = create_model(vocab_size=len(tokenizer.word_index) + 1, seq_len=seq_len)
model.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 15, 20)            127060    
_________________________________________________________________
lstm_11 (LSTM)               (None, 256)               283648    
_________________________________________________________________
dropout_11 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 6353)              1632721   
Total params: 2,043,429
Trainable params: 2,043,429
Non-trainable params: 0
_________________________________________________________________


In [175]:
# Train the model (the model was trained for roughly 80 epochs in total,
# presumably by running this 20-epoch cell several times)
model.fit(x=X, y=Y, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f8d76ae9a50>

In [180]:
# Greedy generation: repeatedly feed the most recent context to the model and
# append the highest-scoring token id to the running sequence.
seed_text = 'the magic carpet bought a house'
gen_tokens = tokenizer.texts_to_sequences([seed_text])
for i in range(100):
    if len(gen_tokens[0]) <= seq_len:
        # Context fits within seq_len: pad it to the fixed model input length
        seed_tokens = keras.preprocessing.sequence.pad_sequences(gen_tokens, maxlen=seq_len)
    else:
        # Context is too long: keep only the trailing seq_len tokens
        seed_tokens = np.array([gen_tokens[0][-seq_len:]])
    preds = model.predict(seed_tokens)
    # Single-row batch, so the flat argmax is the predicted token id
    index = np.argmax(preds)
    gen_tokens[0].append(index)
# Convert the seed plus the 100 generated token ids back to text
tokenizer.sequences_to_texts(gen_tokens)

['the magic carpet bought a house and a great city which was filled by the young lady keeping a little came into the centre of my daughter and was bounded by the room of him that he was four overcome at a room of which she said to take them for your head he is one and what the merchant who is do to know your life to tell me but the vizir had a most beautiful jewels came who came here with the deliberate intention of causing his life to put his the genius life that he had all the stones found a well and']