In [1]:
import os
import re
import pickle
import unidecode
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

In [2]:
from unidecode import unidecode
from nltk.tokenize import sent_tokenize
# import nltk
# nltk.download('punkt')

In [20]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import TimeDistributed, Embedding, LSTM, Dense

Defining some constants:

In [4]:
MAX_SEQUENCE_LENGTH = 100
MAX_VOCAB_SIZE = 10000
EMBEDDING_DIM = 100 # Glove embedding
VALIDATION_SPLIT = 0.2
BATCH_SIZE = 128

# Preparing Data

Loading and Splitting Text into Sentences first:

In [5]:
filepath = './war_peace.txt'
with open(filepath, encoding='UTF-8') as f:
    war = unidecode(f.read()) # This text has a lost of accents

In [6]:
# Remove double spaces
war = war.replace('\n\n', '\n')
# Remove CHAPTER
pattern = 'CHAPTER [I|V|X|L]+'
war = re.sub(pattern, '', war)
# Remove some symbols
pattern = '--|—'
war = re.sub(pattern, ' ', war)
# Split into words and remove punctuation
war = war.split()
punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~“”‘’'
war = [word.translate(str.maketrans('', '', punctuation)) for word in war]
# Lower case
war = [word.lower() for word in war]

Tokenizing the text:

In [10]:
tokenizer = keras.preprocessing.text.Tokenizer(num_words=MAX_VOCAB_SIZE,
                                               filters='')
tokenizer.fit_on_texts(war)

In [11]:
[encoded] = np.array(tokenizer.texts_to_sequences([war]))

Now let's create the dataset:

In [12]:
def create_dataset(data, maxlen, batch_size):
    
    # +1 to account for the target
    window_length = maxlen + 1
    # Create windows of size window_length
    dataset = tf.data.Dataset.from_tensor_slices(data)
    dataset = dataset.window(window_length, shift=1, drop_remainder=True)
    dataset = dataset.flat_map(lambda window: window.batch(window_length))
    # Get batch and separate features and target
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(lambda windows: (windows[:, -1],
                                           tf.one_hot(windows[:, 1:], depth=MAX_VOCAB_SIZE)))
    # Prefetch for efficiency
    dataset = dataset.prefetch(1)

    return dataset

In [13]:
dataset = create_dataset(encoded, maxlen=MAX_SEQUENCE_LENGTH, batch_size=BATCH_SIZE)

We will need the vocabulary so we can calculate the embedding matrix:

In [14]:
word2idx = tokenizer.word_index
idx2word = {idx:word for (word, idx) in word2idx.items() if idx<=MAX_VOCAB_SIZE-1}

Saving the dictionary:

In [16]:
with open('./idx2word', 'wb') as f:
    pickle.dump(idx2word, f)

# Model

Loading the embedding matrix:

In [21]:
embedding_matrix = np.load('./embedding_matrix_war.npy')

Model Architecture:

In [22]:
model = Sequential([
    
    Embedding(MAX_VOCAB_SIZE, EMBEDDING_DIM, weights=[embedding_matrix],
              trainable=False),
    LSTM(25, return_sequences=True, dropout=0.2),
    LSTM(25, return_sequences=True, dropout=0.2),
    TimeDistributed(Dense(MAX_VOCAB_SIZE, activation='softmax'))
    
])

In [23]:
model.compile(optimizer='adam', loss='categorical_crossentropy', 
              metrics=['acc'])

In [None]:
history = model.fit(dataset, epochs=1)

    572/Unknown - 104s 183ms/step - loss: 7.1051 - acc: 0.0555