# Notebook Intro

In [1]:
import numpy as np

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.utils import to_categorical
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences

from pickle import dump
from pickle import load

Using TensorFlow backend.


In [2]:
sequenceLength = 10

sourceTextFileName = "poem.txt"
#sourceTextFileName = "anna.txt"
trainingTextFileName = sourceTextFileName.split(".")[0] + "-Train.txt"
mappingFileName = sourceTextFileName.split(".")[0] + "-SimpleNLPMapping" + ".pkl"
bestSavedModel = sourceTextFileName.split(".")[0] + "-SimpleNLPModel" + ".hdf5"
seedTextFileName = sourceTextFileName.split(".")[0] + "-SeedText.txt"

print(sourceTextFileName)
print(trainingTextFileName)
print(mappingFileName)
print(bestSavedModel)
print(seedTextFileName)

poem.txt
poem-Train.txt
poem-SimpleNLPMapping.pkl
poem-SimpleNLPModel.hdf5
poem-SeedText.txt


In [3]:
# reads and return the text of the training file. self-explanatory
def readTextFile(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    
    return text

In [4]:
def writeTextFile(lines, fileName):
    data = '\n'.join(lines)
    file = open(fileName, 'w')
    file.write(data)
    file.close()

In [5]:
def processRawText(rawText):
    # split the raw text using space (' ') 
    tokens = rawText.split()
    rawText = ' '.join(tokens)
    
    # basically we removed all the line/paragraph breaks
    # but kept the punctuations
    return rawText

In [6]:
def sequenceRawText(rawText):
    # organize into sequences of characters
    length = 10
    sequences = list()
    for i in range(length, len(rawText)):
        # picks a sequence of tokens
        seq = rawText[i - length:i+1]
        # add to tlist
        sequences.append(seq)
        
    return sequences

In [7]:
# load the training file
rawText = readTextFile(sourceTextFileName)
print(rawText[: 140])

Sing a song of sixpence,
A pocket full of rye.
Four and twenty blackbirds,
Baked in a pie.

When the pie was opened
The birds began to sing;


In [8]:
rawText = processRawText(rawText)
print(rawText[: 140])

Sing a song of sixpence, A pocket full of rye. Four and twenty blackbirds, Baked in a pie. When the pie was opened The birds began to sing; 


In [9]:
# turn the text into a sequence of character
# each sequence is sequenceLength long
sequences = sequenceRawText(rawText)

# save sequences to file
writeTextFile(sequences, trainingTextFileName)

print('Total number of sequences: %d' % len(sequences))
print(sequences[: 5])

Total number of sequences: 399
['Sing a song', 'ing a song ', 'ng a song o', 'g a song of', ' a song of ']


In [10]:
# now we read the text sequences from the file we saved
rawTrainingText = readTextFile(trainingTextFileName)
sequenceLines = rawTrainingText.split('\n')

print(sequenceLines[: 5])

['Sing a song', 'ing a song ', 'ng a song o', 'g a song of', ' a song of ']


In [11]:
# time to encode the text

# create a vocabulary (all used characters in the text)
vocab = sorted(list(set(rawTrainingText)))
vocabSize = len(vocab)

# map each character to an integer by creating a dictionary
vocabMap = dict((c, i) for i, c in enumerate(vocab))

print("Vocabulary size: %d" % vocabSize)
print(vocab)
print(vocabMap)

Vocabulary size: 38
['\n', ' ', ',', '.', ';', 'A', 'B', 'C', 'E', 'F', 'H', 'S', 'T', 'W', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'w', 'x', 'y', '’']
{'\n': 0, ' ': 1, ',': 2, '.': 3, ';': 4, 'A': 5, 'B': 6, 'C': 7, 'E': 8, 'F': 9, 'H': 10, 'S': 11, 'T': 12, 'W': 13, 'a': 14, 'b': 15, 'c': 16, 'd': 17, 'e': 18, 'f': 19, 'g': 20, 'h': 21, 'i': 22, 'k': 23, 'l': 24, 'm': 25, 'n': 26, 'o': 27, 'p': 28, 'q': 29, 'r': 30, 's': 31, 't': 32, 'u': 33, 'w': 34, 'x': 35, 'y': 36, '’': 37}


In [12]:
# let's turn those characters in the sequences to integers 
sequences = list()

for line in sequenceLines:
    # encode line
    encodedSequence = [vocabMap[char] for char in line]
    # add it to the list
    sequences.append(encodedSequence)
    
print(sequences[: 5])

[[11, 22, 26, 20, 1, 14, 1, 31, 27, 26, 20], [22, 26, 20, 1, 14, 1, 31, 27, 26, 20, 1], [26, 20, 1, 14, 1, 31, 27, 26, 20, 1, 27], [20, 1, 14, 1, 31, 27, 26, 20, 1, 27, 19], [1, 14, 1, 31, 27, 26, 20, 1, 27, 19, 1]]


In [13]:
# now we need to prepare the input and target matrices

# X = sequences[:,:-1] means we are grabbing all the rows from sequences but dropping the last column
# .... the last column will be used as the target

# y = sequences[:,-1] means we are grabbing all the rows from sequences but only retaining the last column 
# .... the last column being our target

sequences = np.array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]

print(X[0:5])
print(y[0:5])

[[11 22 26 20  1 14  1 31 27 26]
 [22 26 20  1 14  1 31 27 26 20]
 [26 20  1 14  1 31 27 26 20  1]
 [20  1 14  1 31 27 26 20  1 27]
 [ 1 14  1 31 27 26 20  1 27 19]]
[20  1 27 19  1]


In [14]:
# one last thing...
# we need to one-hot-encode each character. 
# That is, each input vector (of sequenceLength) becomes a vector as long as the vocabulary
# with a 1 marked for the specific character. 

# for this we use the to_categorical() function in the Keras API to one-hot-encode

sequences = [to_categorical(x, num_classes=vocabSize) for x in X]

X = np.array(sequences)
y = to_categorical(y, num_classes=vocabSize)

In [15]:
# essentially each input becomes a matrice with the following dimensions:
# (number_of_sequences, sequenceLength, vocabSize)

# and each output becomes a matrice with the following dimensions:
# (number_of_sequences, vocabSize)

In [16]:
print(X.shape)
print(y.shape)

print(X[0:2])
print(y[0:2])

(399, 10, 38)
(399, 38)
[[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.
    0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.]
  [ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.
    0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.]
  [ 

## Create the model and train it

In [17]:
epochs = 100
batchSize = 20
dropOutRate = 0.3
lstmCellNumber = 100

activationFunction = "softmax"

optimizerFunction = "adam"
# optimizerFunction = "rmsprop"

In [18]:
# time to model...

# define the model architecture
model = Sequential()
model.add(LSTM(lstmCellNumber, input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(dropOutRate))
model.add(Dense(vocabSize, activation=activationFunction))

print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 100)               55600     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 38)                3838      
Total params: 59,438
Trainable params: 59,438
Non-trainable params: 0
_________________________________________________________________
None


In [19]:
#checkpointer = ModelCheckpoint(filepath=bestSavedModel, verbose=1, save_best_only=True)
#model.fit(X, y, epochs=epochs, batch_size=batchSize, callbacks=[checkpointer], verbose=1)

# compile the model
model.compile(loss="categorical_crossentropy", optimizer=optimizerFunction, metrics=["accuracy"])

# fit the model
model.fit(X, y, epochs=100, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x111ee3a58>

In [20]:
# save the model to file
model.save(bestSavedModel)

# save the vocabulary map. We need it for character generation part later.
dump(vocabMap, open(mappingFileName, 'wb'))

## Generate some text

In [21]:
# length of the generated character sequence
generatedCharSeqLength = 50

In [22]:
# generate a sequence of characters with a language model
def generateSequence(model, reverseVocab, seedRawText, length):
    inputText = seedRawText
    
    # generate a fixed number of characters
    for _ in range(length):
        
        # the seed text needs to be processed just like the training text was.

        # encode the characters as integers based on the dictionary
        encodedSeedText = [vocabMap[c] for c in inputText]

        # truncate sequences to a fixed length using Keras' pad_sequence()
        encodedSeedText = pad_sequences([encodedSeedText], maxlen=sequenceLength, truncating='pre')

        # one-hot encode
        oneHotEncodedSeedText = to_categorical(encodedSeedText, num_classes=vocabSize)
        
        # use the model to predict character
        predCharInt = model.predict_classes(oneHotEncodedSeedText, verbose=0)
        predChar = reverseVocab[predCharInt[0]]

        # append to input
        inputText += predChar
        
    return inputText

In [23]:
# load the model and the text dictionary
model.load_weights(bestSavedModel)

# vocabMap is a char: int dictionary
vocabMap = load(open(mappingFileName, 'rb'))

# reverseVocab is a int: char dictionary used to convert the model prediction (int) to char
reverseVocab = dict(enumerate(vocabMap))

print(vocabMap)
print(reverseVocab)

{'\n': 0, ' ': 1, ',': 2, '.': 3, ';': 4, 'A': 5, 'B': 6, 'C': 7, 'E': 8, 'F': 9, 'H': 10, 'S': 11, 'T': 12, 'W': 13, 'a': 14, 'b': 15, 'c': 16, 'd': 17, 'e': 18, 'f': 19, 'g': 20, 'h': 21, 'i': 22, 'k': 23, 'l': 24, 'm': 25, 'n': 26, 'o': 27, 'p': 28, 'q': 29, 'r': 30, 's': 31, 't': 32, 'u': 33, 'w': 34, 'x': 35, 'y': 36, '’': 37}
{0: '\n', 1: ' ', 2: ',', 3: '.', 4: ';', 5: 'A', 6: 'B', 7: 'C', 8: 'E', 9: 'F', 10: 'H', 11: 'S', 12: 'T', 13: 'W', 14: 'a', 15: 'b', 16: 'c', 17: 'd', 18: 'e', 19: 'f', 20: 'g', 21: 'h', 22: 'i', 23: 'k', 24: 'l', 25: 'm', 26: 'n', 27: 'o', 28: 'p', 29: 'q', 30: 'r', 31: 's', 32: 't', 33: 'u', 34: 'w', 35: 'x', 36: 'y', 37: '’'}


In [25]:
# read the seed text file (input text)
seedRawText = readTextFile(seedTextFileName)

In [26]:
generatedText = generateSequence(model, reverseVocab, seedRawText, generatedCharSeqLength)

In [27]:
print(generatedText)

Wasn’t that a dainty dish queen was in the parl d in a pie.
 hen ing he  eieey. Th  piid . heeen ye  The boree
