# Notebook Intro

In [1]:
import numpy as np

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.utils import to_categorical
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences

from pickle import dump
from pickle import load

Using TensorFlow backend.


In [2]:
sequenceLength = 10

#sourceTextFileName = "poem.txt"
sourceTextFileName = "anna-short.txt"
trainingTextFileName = sourceTextFileName.split(".")[0] + "-Train.txt"
mappingFileName = sourceTextFileName.split(".")[0] + "-SimpleNLPMapping" + ".pkl"
bestSavedModel = sourceTextFileName.split(".")[0] + "-SimpleNLPModel" + ".hdf5"
seedTextFileName = sourceTextFileName.split(".")[0] + "-SeedText.txt"

print(sourceTextFileName)
print(trainingTextFileName)
print(mappingFileName)
print(bestSavedModel)
print(seedTextFileName)

anna-short.txt
anna-short-Train.txt
anna-short-SimpleNLPMapping.pkl
anna-short-SimpleNLPModel.hdf5
anna-short-SeedText.txt


In [3]:
# reads and return the text of the training file. self-explanatory
def readTextFile(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    
    return text

In [4]:
def writeTextFile(lines, fileName):
    data = '\n'.join(lines)
    file = open(fileName, 'w')
    file.write(data)
    file.close()

In [5]:
def processRawText(rawText):
    # split the raw text using space (' ') 
    tokens = rawText.split()
    rawText = ' '.join(tokens)
    
    # basically we removed all the line/paragraph breaks
    # but kept the punctuations
    return rawText

In [6]:
def sequenceRawText(rawText):
    # organize into sequences of characters
    length = 10
    sequences = list()
    for i in range(length, len(rawText)):
        # picks a sequence of tokens
        seq = rawText[i - length:i+1]
        # add to tlist
        sequences.append(seq)
        
    return sequences

In [7]:
# load the training file
rawText = readTextFile(sourceTextFileName)
print(rawText[: 140])

Chapter 1


Happy families are all alike; every unhappy family is unhappy in its own
way.

Everything was in confusion in the Oblonskys' hou


In [8]:
rawText = processRawText(rawText)
print(rawText[: 140])

Chapter 1 Happy families are all alike; every unhappy family is unhappy in its own way. Everything was in confusion in the Oblonskys' house.


In [9]:
# turn the text into a sequence of character
# each sequence is sequenceLength long
sequences = sequenceRawText(rawText)

# save sequences to file
writeTextFile(sequences, trainingTextFileName)

print('Total number of sequences: %d' % len(sequences))
print(sequences[: 5])

Total number of sequences: 21865
['Chapter 1 H', 'hapter 1 Ha', 'apter 1 Hap', 'pter 1 Happ', 'ter 1 Happy']


In [10]:
# now we read the text sequences from the file we saved
rawTrainingText = readTextFile(trainingTextFileName)
sequenceLines = rawTrainingText.split('\n')

print(sequenceLines[: 5])

['Chapter 1 H', 'hapter 1 Ha', 'apter 1 Hap', 'pter 1 Happ', 'ter 1 Happy']


In [11]:
# time to encode the text

# create a vocabulary (all used characters in the text)
vocab = sorted(list(set(rawTrainingText)))
vocabSize = len(vocab)

# map each character to an integer by creating a dictionary
vocabMap = dict((c, i) for i, c in enumerate(vocab))

print("Vocabulary size: %d" % vocabSize)
print(vocab)
print(vocabMap)

Vocabulary size: 63
['\n', ' ', '!', '"', "'", '(', ')', ',', '-', '.', '1', '2', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'W', 'Y', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
{'\n': 0, ' ': 1, '!': 2, '"': 3, "'": 4, '(': 5, ')': 6, ',': 7, '-': 8, '.': 9, '1': 10, '2': 11, '3': 12, ':': 13, ';': 14, '?': 15, 'A': 16, 'B': 17, 'C': 18, 'D': 19, 'E': 20, 'F': 21, 'G': 22, 'H': 23, 'I': 24, 'K': 25, 'L': 26, 'M': 27, 'N': 28, 'O': 29, 'P': 30, 'R': 31, 'S': 32, 'T': 33, 'W': 34, 'Y': 35, '_': 36, 'a': 37, 'b': 38, 'c': 39, 'd': 40, 'e': 41, 'f': 42, 'g': 43, 'h': 44, 'i': 45, 'j': 46, 'k': 47, 'l': 48, 'm': 49, 'n': 50, 'o': 51, 'p': 52, 'q': 53, 'r': 54, 's': 55, 't': 56, 'u': 57, 'v': 58, 'w': 59, 'x': 60, 'y': 61, 'z': 62}


In [12]:
# let's turn those characters in the sequences to integers 
sequences = list()

for line in sequenceLines:
    # encode line
    encodedSequence = [vocabMap[char] for char in line]
    # add it to the list
    sequences.append(encodedSequence)
    
print(sequences[: 5])

[[18, 44, 37, 52, 56, 41, 54, 1, 10, 1, 23], [44, 37, 52, 56, 41, 54, 1, 10, 1, 23, 37], [37, 52, 56, 41, 54, 1, 10, 1, 23, 37, 52], [52, 56, 41, 54, 1, 10, 1, 23, 37, 52, 52], [56, 41, 54, 1, 10, 1, 23, 37, 52, 52, 61]]


In [13]:
# now we need to prepare the input and target matrices

# X = sequences[:,:-1] means we are grabbing all the rows from sequences but dropping the last column
# .... the last column will be used as the target

# y = sequences[:,-1] means we are grabbing all the rows from sequences but only retaining the last column 
# .... the last column being our target

sequences = np.array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]

print(X[0:5])
print(y[0:5])

[[18 44 37 52 56 41 54  1 10  1]
 [44 37 52 56 41 54  1 10  1 23]
 [37 52 56 41 54  1 10  1 23 37]
 [52 56 41 54  1 10  1 23 37 52]
 [56 41 54  1 10  1 23 37 52 52]]
[23 37 52 52 61]


In [14]:
# one last thing...
# we need to one-hot-encode each character. 
# That is, each input vector (of sequenceLength) becomes a vector as long as the vocabulary
# with a 1 marked for the specific character. 

# for this we use the to_categorical() function in the Keras API to one-hot-encode

sequences = [to_categorical(x, num_classes=vocabSize) for x in X]

X = np.array(sequences)
y = to_categorical(y, num_classes=vocabSize)

In [15]:
# essentially each input becomes a matrice with the following dimensions:
# (number_of_sequences, sequenceLength, vocabSize)

# and each output becomes a matrice with the following dimensions:
# (number_of_sequences, vocabSize)

In [16]:
print(X.shape)
print(y.shape)

print(X[0:2])
print(y[0:2])

(21865, 10, 63)
(21865, 63)
[[[ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  ..., 
  [ 0.  1.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  1.  0. ...,  0.  0.  0.]]

 [[ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  ..., 
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  1.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]]]
[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.]]


## Create the model and train it

In [17]:
epochs = 100
batchSize = 20
dropOutRate = 0.3
lstmCellsNumber = 100

activationFunction = "softmax"

optimizerFunction = "adam"
# optimizerFunction = "rmsprop"

In [18]:
# time to model...

# define the model architecture
model = Sequential()
model.add(LSTM(lstmCellsNumber, input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(dropOutRate))
model.add(Dense(vocabSize, activation=activationFunction))

print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 100)               65600     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 63)                6363      
Total params: 71,963
Trainable params: 71,963
Non-trainable params: 0
_________________________________________________________________
None


In [19]:
#checkpointer = ModelCheckpoint(filepath=bestSavedModel, verbose=1, save_best_only=True)
#model.fit(X, y, epochs=epochs, batch_size=batchSize, callbacks=[checkpointer], verbose=1)

# compile the model
model.compile(loss="categorical_crossentropy", optimizer=optimizerFunction, metrics=["accuracy"])

# fit the model
model.fit(X, y, epochs=epochs, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x118a83748>

In [20]:
# save the model to file
model.save(bestSavedModel)

# save the vocabulary map. We need it for character generation part later.
dump(vocabMap, open(mappingFileName, 'wb'))

## Generate some text

In [25]:
# length of the generated character sequence
generatedCharSeqLength = 2000

In [26]:
# generate a sequence of characters with a language model
def generateSequence(model, reverseVocab, seedRawText, length):
    inputText = seedRawText
    
    # generate a fixed number of characters
    for _ in range(length):
        
        # the seed text needs to be processed just like the training text was.

        # encode the characters as integers based on the dictionary
        encodedSeedText = [vocabMap[c] for c in inputText]

        # truncate sequences to a fixed length using Keras' pad_sequence()
        encodedSeedText = pad_sequences([encodedSeedText], maxlen=sequenceLength, truncating='pre')

        # one-hot encode
        oneHotEncodedSeedText = to_categorical(encodedSeedText, num_classes=vocabSize)
        
        # use the model to predict character
        predCharInt = model.predict_classes(oneHotEncodedSeedText, verbose=0)
        predChar = reverseVocab[predCharInt[0]]

        # append to input
        inputText += predChar
        
    return inputText

In [27]:
# load the model and the text dictionary
model.load_weights(bestSavedModel)

# vocabMap is a char: int dictionary
vocabMap = load(open(mappingFileName, 'rb'))

# reverseVocab is a int: char dictionary used to convert the model prediction (int) to char
reverseVocab = dict(enumerate(vocabMap))

print(vocabMap)
print(reverseVocab)

{'\n': 0, ' ': 1, '!': 2, '"': 3, "'": 4, '(': 5, ')': 6, ',': 7, '-': 8, '.': 9, '1': 10, '2': 11, '3': 12, ':': 13, ';': 14, '?': 15, 'A': 16, 'B': 17, 'C': 18, 'D': 19, 'E': 20, 'F': 21, 'G': 22, 'H': 23, 'I': 24, 'K': 25, 'L': 26, 'M': 27, 'N': 28, 'O': 29, 'P': 30, 'R': 31, 'S': 32, 'T': 33, 'W': 34, 'Y': 35, '_': 36, 'a': 37, 'b': 38, 'c': 39, 'd': 40, 'e': 41, 'f': 42, 'g': 43, 'h': 44, 'i': 45, 'j': 46, 'k': 47, 'l': 48, 'm': 49, 'n': 50, 'o': 51, 'p': 52, 'q': 53, 'r': 54, 's': 55, 't': 56, 'u': 57, 'v': 58, 'w': 59, 'x': 60, 'y': 61, 'z': 62}
{0: '\n', 1: ' ', 2: '!', 3: '"', 4: "'", 5: '(', 6: ')', 7: ',', 8: '-', 9: '.', 10: '1', 11: '2', 12: '3', 13: ':', 14: ';', 15: '?', 16: 'A', 17: 'B', 18: 'C', 19: 'D', 20: 'E', 21: 'F', 22: 'G', 23: 'H', 24: 'I', 25: 'K', 26: 'L', 27: 'M', 28: 'N', 29: 'O', 30: 'P', 31: 'R', 32: 'S', 33: 'T', 34: 'W', 35: 'Y', 36: '_', 37: 'a', 38: 'b', 39: 'c', 40: 'd', 41: 'e', 42: 'f', 43: 'g', 44: 'h', 45: 'i', 46: 'j', 47: 'k', 48: 'l', 49: 'm',

In [28]:
# read the seed text file (input text)
seedRawText = readTextFile(seedTextFileName)

In [29]:
generatedText = generateSequence(model, reverseVocab, seedRawText, generatedCharSeqLength)

In [30]:
print(generatedText)

that to amend, to set right their relations was impossible,
because it was impossible to make her attractive again and able to
inspire love, or to make him an old man, not susceptible to love. Except
deceit and lying nothing could come of it now; and deceit and lying were
opposed to his nature.
" she tiok, or tereselles with he wifl comsed the door, he called the mono insold too himainace of anre. "Well might be of use to her fair; hus selfiess, and that it was quite senseless in our day in whach he was prateed when him arding in alowar, and to he mabed to him at that instadith on his side. "Well, what alstay, paisend on ahd was alceady wat excing and tor that havingountrend on his wife's bedroom. And thereope, and his handsome face as untersing on excell newander to goine he tailed the herrelestion with his wife was not sleeping in his wife's bedroom. And thereope, and his handsome face as untersing on excell newander to goine he tailed the herrelestion with his wife was not sleeping 