# World Level Neural Language Model and Use to Generate Text
Language model can predict the probability of the next word in the sequence, based on the words already observed in the sequence

statistical language models, distributed representation where different words with similar meanings have similar representation.

In [1]:
from string import punctuation

# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text
 
# turn a doc into clean tokens
def clean_doc(doc):
	# replace '--' with a space ' '
    doc = doc.replace('--', ' ')
	# split into tokens by white space
    tokens = doc.split()
	# remove punctuation from each token
    table = str.maketrans('', '', punctuation)
    tokens = [w.translate(table) for w in tokens]
	# remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
	# make lower case
    tokens = [word.lower() for word in tokens]
    return tokens
 
# save tokens to file, one dialog per line
def save_doc(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()
 
# load document
in_filename = 'data/republic_clean.txt'
doc = load_doc(in_filename)
print(doc[:200])
 
# clean document
tokens = clean_doc(doc)
print(tokens[:200])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))
 
# organize into sequences of tokens
length = 50 + 1
sequences = list()
for i in range(length, len(tokens)):
    # select sequence of tokens
    seq = tokens[i-length:i]
    # convert into a line
    line = ' '.join(seq)
    # store
    sequences.append(line)
print('Total Sequences: %d' % len(sequences))
 
# save sequences to file
out_filename = 'republic_sequences.txt'
save_doc(sequences, out_filename)

The Project Gutenberg EBook of The Republic, by Plato

This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.  You may copy it, give it away or
re-use it u
['the', 'project', 'gutenberg', 'ebook', 'of', 'the', 'republic', 'by', 'plato', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', 'whatsoever', 'you', 'may', 'copy', 'it', 'give', 'it', 'away', 'or', 'reuse', 'it', 'under', 'the', 'terms', 'of', 'the', 'project', 'gutenberg', 'license', 'included', 'with', 'this', 'ebook', 'or', 'online', 'at', 'wwwgutenbergorg', 'title', 'the', 'republic', 'author', 'plato', 'translator', 'b', 'jowett', 'posting', 'date', 'august', 'ebook', 'release', 'date', 'october', 'last', 'updated', 'june', 'language', 'english', 'start', 'of', 'this', 'project', 'gutenberg', 'ebook', 'the', 'republic', 'produced', 'by', 'sue', 'asscher', 'the', 'republic', 'by', 'plato', 'tra

In [2]:
from numpy import array
from pickle import dump
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding

# load file
doc = load_doc('republic_sequences.txt')
lines = doc.split('\n')

In [3]:
# integer encode sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines) # convert texts to sequent numbers
vocab_size = len(tokenizer.word_index) + 1
display(type(sequences), vocab_size)

list

10455

In [4]:
display(sequences[:5])

[[1,
  6444,
  6443,
  3810,
  2,
  1,
  228,
  23,
  89,
  31,
  3810,
  5,
  26,
  1,
  174,
  2,
  6447,
  3207,
  34,
  44,
  4720,
  3,
  28,
  572,
  44,
  6446,
  10454,
  22,
  37,
  2452,
  73,
  241,
  73,
  249,
  13,
  10453,
  73,
  165,
  1,
  511,
  2,
  1,
  6444,
  6443,
  10452,
  1565,
  28,
  31,
  3810,
  13,
  6448],
 [6444,
  6443,
  3810,
  2,
  1,
  228,
  23,
  89,
  31,
  3810,
  5,
  26,
  1,
  174,
  2,
  6447,
  3207,
  34,
  44,
  4720,
  3,
  28,
  572,
  44,
  6446,
  10454,
  22,
  37,
  2452,
  73,
  241,
  73,
  249,
  13,
  10453,
  73,
  165,
  1,
  511,
  2,
  1,
  6444,
  6443,
  10452,
  1565,
  28,
  31,
  3810,
  13,
  6448,
  34],
 [6443,
  3810,
  2,
  1,
  228,
  23,
  89,
  31,
  3810,
  5,
  26,
  1,
  174,
  2,
  6447,
  3207,
  34,
  44,
  4720,
  3,
  28,
  572,
  44,
  6446,
  10454,
  22,
  37,
  2452,
  73,
  241,
  73,
  249,
  13,
  10453,
  73,
  165,
  1,
  511,
  2,
  1,
  6444,
  6443,
  10452,
  1565,
  28,
  31,
  3810,
  13

In [5]:
sequences = array(sequences)
# split into input and output
X, y = sequences[:, :-1], sequences[:, -1]
y = to_categorical(y, num_classes=vocab_size) # one-hot encode the output words for each input-output pairs
seq_length = X.shape[1]
display(len(X),len(y), seq_length)

216740

216740

50

In [6]:
# define model
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length)) # input is embedding
model.add(LSTM(100, return_sequences=True)) # hidden 1
model.add(LSTM(100))  # hidden 2
model.add(Dense(100, activation='relu')) # hidden 3, sort out sequences
model.add(Dense(vocab_size, activation='softmax')) # output is multiclass
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 50)            522750    
_________________________________________________________________
lstm (LSTM)                  (None, 50, 100)           60400     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 100)               10100     
_________________________________________________________________
dense_1 (Dense)              (None, 10455)             1055955   
Total params: 1,729,605
Trainable params: 1,729,605
Non-trainable params: 0
_________________________________________________________________


In [7]:
# the model learns a multiclass classification, the efficient Adam to mini-batch gradient descent, and uses
# accuracy to evaluate of the model.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X, y, batch_size=128, epochs=5)
model.save('pickled_data/lang_model_generation.h5')
# save tokenizer
dump(tokenizer, open('pickled_data/tokenizer.pickled', 'wb'))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Use the trained model to generate text

In [13]:
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
import numpy as np

def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    result = list()
    in_text = seed_text
    # generate fixed words 
    for _ in range(n_words):
        # encode the text
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncate the sequence to a fixed length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict probabilities for each word
        yhat = model.predict(encoded, verbose=0)
        yhat = np.argmax(yhat, axis=-1)
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
            
        # append to input
        in_text += ' ' + out_word
        result.append(out_word)
        
    return ' '.join(result)

# load cleaned text
lines = load_doc('republic_sequences.txt').split('\n')
seq_length = len(lines[0].split()) - 1 # get length = 50

#model = load_model('pickled_data/lang_model_generation.h5')
#tokenizer = load(open('pickled_data/tokenizer.pickled', 'rb'))
# select a seed text
seed_text = lines[randint(0, len(lines))]
print(seed_text + '\n')

# generate new text
gen = generate_seq(model, tokenizer, seq_length, seed_text, 50)
print(gen)

feeble person but sound in wind and limb and in perfect condition for the great gymnastic trial of the mind justice herself can find no fault with natures such as these and they will be the saviours of our state disciples of another sort would only make philosophy more ridiculous than

the other and of the state and the same and the same and the same and the same and the same and the same and the same and the same and the same and the same and the same and the same and the same and the same and the
