In [1]:
import os
import re
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

In [2]:
from nltk.tokenize import sent_tokenize
# import nltk
# nltk.download('punkt')

In [16]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dropout, LSTM, GRU, Dense
from tensorflow.keras.optimizers import RMSprop

Data Source: Project Gutenberg  
http://www.gutenberg.org/ebooks/2600

Reference (Keras LSTM text-generation example):  
https://github.com/keras-team/keras/blob/master/examples/lstm_text_generation.py

# Loading and Preprocessing Data

Loading War and Peace:

In [4]:
# Read the whole novel into memory, then lower-case it so that case variants
# of a letter map to the same character.
filepath = './war_peace.txt'
with open(filepath, encoding='UTF-8') as f:
    raw_text = f.read()
war = raw_text.lower()

Tokenizing the text (character level):

In [5]:
# Character-level tokenizer: tokens are individual characters, not words.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
# NOTE(review): `war` is passed as a bare string rather than `[war]`, so Keras
# presumably iterates it character by character and counts each character as
# one "document" (which is what `document_count` is later read for) — confirm
# this is the intent.
tokenizer.fit_on_texts(war)

Encoding the whole text:

In [6]:
# Number of distinct characters known to the tokenizer (vocabulary size).
max_id = len(tokenizer.word_index)
# Number of "documents" the tokenizer was fitted on.
dataset_size = tokenizer.document_count
# Keras Tokenizer ids start at 1 (0 is reserved), so shift them down by one to
# obtain ids in [0, max_id). Without the shift the highest id falls outside
# both the one-hot depth (max_id) and the softmax output range (max_id units),
# which yields an all-zero input vector and an invalid class label.
[encoded] = np.array(tokenizer.texts_to_sequences([war])) - 1

Creating the train and validation datasets:

In [7]:
# Hold out the final 20% of the text for validation.
# shuffle=False is required: the data is one long character sequence, and the
# default shuffling would permute individual characters, destroying the order
# that the sliding windows built later rely on. (With shuffling disabled no
# random_state is needed.)
encoded_tr, encoded_vl = train_test_split(encoded, test_size=0.2,
                                          shuffle=False)

In [8]:
def create_dataset(data, maxlen, batch_size, depth=None):
    """Build a batched tf.data pipeline of (one-hot window, next id) pairs.

    Every run of ``maxlen + 1`` consecutive ids in ``data`` becomes one
    example: the first ``maxlen`` ids are one-hot encoded as the input and
    the last id is the target.

    Parameters
    ----------
    data : 1-D array-like of integer ids
        The encoded sequence to slide the window over.
    maxlen : int
        Number of ids in each input window.
    batch_size : int
        Number of windows per batch.
    depth : int, optional
        Size of the one-hot encoding. Defaults to the notebook-level
        ``max_id`` so existing calls keep working unchanged.

    Returns
    -------
    tf.data.Dataset
        Yields ``(inputs, targets)`` batches, where ``inputs`` is the one-hot
        encoding of the first ``maxlen`` ids of each window and ``targets``
        is the last id of each window.
    """
    if depth is None:
        # Fall back to the global vocabulary size used by the rest of the notebook.
        depth = max_id

    # +1 so each window also contains the target id.
    window_length = maxlen + 1

    # Slide a window of window_length over the sequence, one step at a time,
    # dropping the incomplete windows at the end.
    dataset = tf.data.Dataset.from_tensor_slices(data)
    dataset = dataset.window(window_length, shift=1, drop_remainder=True)
    # window() yields nested datasets; flatten each into a single tensor.
    dataset = dataset.flat_map(lambda window: window.batch(window_length))

    # Group windows into batches, then split each into features and target.
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(lambda windows: (tf.one_hot(windows[:, :-1], depth=depth),
                                           windows[:, -1]))

    # Prefetch one batch so input preparation overlaps with training.
    dataset = dataset.prefetch(1)

    return dataset

In [9]:
# Window length and batch size shared by the training and validation pipelines.
MAXLEN = 40
BATCH_SIZE = 128

data_tr = create_dataset(encoded_tr, maxlen=MAXLEN, batch_size=BATCH_SIZE)
# Validation windows must come from the held-out split; building them from
# encoded_tr would make the validation loss just a second training loss.
data_vl = create_dataset(encoded_vl, maxlen=MAXLEN, batch_size=BATCH_SIZE)

# Model

In [17]:
# Single LSTM layer over one-hot character windows of any length, followed by
# a softmax over the max_id possible next characters.
model = keras.models.Sequential()
model.add(LSTM(128, input_shape=[None, max_id]))
model.add(Dense(max_id, activation='softmax'))

In [18]:
# Targets are integer ids rather than one-hot vectors, hence the sparse loss.
optimizer = RMSprop(learning_rate=0.01)
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
)

In [19]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 128)               104448    
_________________________________________________________________
dense_2 (Dense)              (None, 75)                9675      
Total params: 114,123
Trainable params: 114,123
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Train for one epoch on the training windows, evaluating on the validation
# windows; the returned History object is shown as the cell output.
model.fit(data_tr, validation_data=data_vl, epochs=1)



<tensorflow.python.keras.callbacks.History at 0x23350ea8a90>

In [None]:
# Run the model over the validation dataset; the result is not assigned, so it
# is only displayed as the cell output.
model.predict(data_vl)