# Notebook Intro

In [21]:
import numpy as np

from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint

from pickle import dump

In [23]:
sequenceLength = 10
sourceTextFileName = "poem.txt"
trainingTextFileName = "train_" + sourceTextFileName
mappingFileName = "SimpleNLP_mapping.pkl"
bestSavedModel = "bestSavedSimpleNLPModel.hdf5"

In [3]:
# reads and return the text of the training file. self-explanatory
def readTextFile(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text

In [4]:
def writeTextFile(lines, fileName):
    data = '\n'.join(lines)
    file = open(fileName, 'w')
    file.write(data)
    file.close()

In [5]:
def processRawText(rawText):
    # split the raw text using space (' ') 
    tokens = rawText.split()
    rawText = ' '.join(tokens)
    # basically we removed all the line/paragraph breaks
    # but kept the punctuations
    return rawText

In [6]:
def sequenceRawText(rawText):
    # organize into sequences of characters
    length = 10
    sequences = list()
    for i in range(length, len(rawText)):
        # picks a sequence of tokens
        seq = rawText[i - length:i+1]
        # add to tlist
        sequences.append(seq)
        
    return sequences

In [7]:
# load the training file
rawText = readTextFile(sourceTextFileName)
print(rawText[: 140])

Sing a song of sixpence,
A pocket full of rye.
Four and twenty blackbirds,
Baked in a pie.

When the pie was opened
The birds began to sing;


In [8]:
rawText = processRawText(rawText)
print(rawText)

Sing a song of sixpence, A pocket full of rye. Four and twenty blackbirds, Baked in a pie. When the pie was opened The birds began to sing; Wasn’t that a dainty dish, To set before the king. The king was in his counting house, Counting out his money; The queen was in the parlour, Eating bread and honey. The maid was in the garden, Hanging out the clothes, When down came a blackbird And pecked off her nose.


In [9]:
# turn the text into a sequence of character
# each sequence is sequenceLength long
sequences = sequenceRawText(rawText)

# save sequences to file
writeTextFile(sequences, trainingTextFileName)

print('Total number of sequences: %d' % len(sequences))
print(sequences[:5])

Total number of sequences: 399
['Sing a song', 'ing a song ', 'ng a song o', 'g a song of', ' a song of ']


In [10]:
# now we read the text sequences from the file we saved
rawTrainingText = readTextFile(trainingTextFileName)
sequenceLines = rawTrainingText.split('\n')

print(sequenceLines[: 5])

['Sing a song', 'ing a song ', 'ng a song o', 'g a song of', ' a song of ']


In [11]:
# time to encode the text

# create a vocabulary (all used characters in the text)
vocab = sorted(list(set(rawTrainingText)))
vocabSize = len(vocab)

# map each character to an integer by creating a dictionary
vocabMap = dict((c, i) for i, c in enumerate(vocab))

print("Vocabulary size: %d" % vocabSize)
print(vocab)
print(vocabMap)

Vocabulary size: 38
['\n', ' ', ',', '.', ';', 'A', 'B', 'C', 'E', 'F', 'H', 'S', 'T', 'W', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'w', 'x', 'y', '’']
{'\n': 0, ' ': 1, ',': 2, '.': 3, ';': 4, 'A': 5, 'B': 6, 'C': 7, 'E': 8, 'F': 9, 'H': 10, 'S': 11, 'T': 12, 'W': 13, 'a': 14, 'b': 15, 'c': 16, 'd': 17, 'e': 18, 'f': 19, 'g': 20, 'h': 21, 'i': 22, 'k': 23, 'l': 24, 'm': 25, 'n': 26, 'o': 27, 'p': 28, 'q': 29, 'r': 30, 's': 31, 't': 32, 'u': 33, 'w': 34, 'x': 35, 'y': 36, '’': 37}


In [12]:
# let's turn those characters in the sequences to integers 
sequences = list()

for line in sequenceLines:
    # encode line
    encodedSequence = [vocabMap[char] for char in line]
    # add it to the list
    sequences.append(encodedSequence)
    
print(sequences[0:5])

[[11, 22, 26, 20, 1, 14, 1, 31, 27, 26, 20], [22, 26, 20, 1, 14, 1, 31, 27, 26, 20, 1], [26, 20, 1, 14, 1, 31, 27, 26, 20, 1, 27], [20, 1, 14, 1, 31, 27, 26, 20, 1, 27, 19], [1, 14, 1, 31, 27, 26, 20, 1, 27, 19, 1]]


In [13]:
# now we need to prepare the input and target matrices

# X = sequences[:,:-1] means we are grabbing all the rows from sequences but dropping the last column
# .... the last column will be used as the target

# y = sequences[:,-1] means we are grabbing all the rows from sequences but only retaining the last column 
# .... the last column being our target

sequences = np.array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]

print(X[0:5])
print(y[0:5])

[[11 22 26 20  1 14  1 31 27 26]
 [22 26 20  1 14  1 31 27 26 20]
 [26 20  1 14  1 31 27 26 20  1]
 [20  1 14  1 31 27 26 20  1 27]
 [ 1 14  1 31 27 26 20  1 27 19]]
[20  1 27 19  1]


In [None]:
# one last thing...
# we need to one-hot-encode each character. 
# That is, each input vector (of sequenceLength) becomes a vector as long as the vocabulary
# with a 1 marked for the specific character. 

# for this we use the to_categorical() function in the Keras API to one-hot-encode

sequences = [to_categorical(x, num_classes=vocabSize) for x in X]

X = np.array(sequences)
y = to_categorical(y, num_classes=vocabSize)

In [None]:
# essentially each input becomes a matrice with the following dimensions:
# (number_of_sequences, sequenceLength, vocabSize)

# and each output becomes a matrice with the following dimensions:
# (number_of_sequences, vocabSize)

In [15]:
print(X.shape)
print(y.shape)

print(X[0:2])
print(y[0:2])

(399, 10, 38)
(399, 38)
[[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.
    0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.]
  [ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.
    0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.]
  [ 

In [18]:
# time to model...

# define model
model = Sequential()
model.add(LSTM(75, input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(0.3))
model.add(Dense(vocabSize, activation='softmax'))
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 75)                34200     
_________________________________________________________________
dense_2 (Dense)              (None, 38)                2888      
Total params: 37,088
Trainable params: 37,088
Non-trainable params: 0
_________________________________________________________________
None


In [19]:
epochs = 100
batchSize = 20
optimizerFunction = "adam"
# optimizerFunction = "rmsprop"

In [26]:
#checkpointer = ModelCheckpoint(filepath=bestSavedModel, verbose=1, save_best_only=True)
#model.fit(X, y, epochs=epochs, batch_size=batchSize, callbacks=[checkpointer], verbose=1)

# compile the model
model.compile(loss="categorical_crossentropy, optimizer=optimizerFunction, metrics=["accuracy"])
# fit the model
model.fit(X, y, epochs=100, verbose=2)


Epoch 1/100
 - 2s - loss: 1.4223 - acc: 0.6216
Epoch 2/100
 - 1s - loss: 1.3394 - acc: 0.6742
Epoch 3/100
 - 1s - loss: 1.3050 - acc: 0.6767
Epoch 4/100
 - 1s - loss: 1.2732 - acc: 0.6792
Epoch 5/100
 - 1s - loss: 1.2371 - acc: 0.6967
Epoch 6/100
 - 1s - loss: 1.2103 - acc: 0.7243
Epoch 7/100
 - 1s - loss: 1.1892 - acc: 0.6992
Epoch 8/100
 - 1s - loss: 1.1640 - acc: 0.7268
Epoch 9/100
 - 1s - loss: 1.1234 - acc: 0.7444
Epoch 10/100
 - 1s - loss: 1.0988 - acc: 0.7368
Epoch 11/100
 - 1s - loss: 1.0730 - acc: 0.7644
Epoch 12/100
 - 1s - loss: 1.0527 - acc: 0.7569
Epoch 13/100
 - 1s - loss: 1.0224 - acc: 0.7644
Epoch 14/100
 - 1s - loss: 0.9975 - acc: 0.7920
Epoch 15/100
 - 0s - loss: 0.9669 - acc: 0.7995
Epoch 16/100
 - 0s - loss: 0.9343 - acc: 0.8020
Epoch 17/100
 - 0s - loss: 0.9143 - acc: 0.8120
Epoch 18/100
 - 0s - loss: 0.8784 - acc: 0.8446
Epoch 19/100
 - 1s - loss: 0.8553 - acc: 0.8296
Epoch 20/100
 - 1s - loss: 0.8343 - acc: 0.8396
Epoch 21/100
 - 1s - loss: 0.8022 - acc: 0.8571
E

In [28]:
# save the model to file
model.save(bestSavedModel)

# save the vocabulary map. We need it for character generation part later.
dump(vocabMap, open(mappingFileName, 'wb'))

In [29]:
model.load_weights(bestSavedModel)