# Text Generation with Neural Networks

## Functions for Processing Text

### Reading in files as a string text

In [1]:
def read_file(filepath, encoding=None):
    """Return the entire contents of a text file as a single string.

    Parameters
    ----------
    filepath : path of the text file to read.
    encoding : optional text encoding handed to ``open`` (e.g. ``'utf-8'``).
        The default ``None`` keeps the previous behaviour of using the
        platform's default encoding.

    Returns
    -------
    str : the full file contents.
    """
    # The with-block guarantees the file is closed even if read() fails.
    with open(filepath, encoding=encoding) as f:
        return f.read()

In [2]:
read_file('moby_dick_four_chapters.txt')

'Call me Ishmael.  Some years ago--never mind how long\nprecisely--having little or no money in my purse, and nothing\nparticular to interest me on shore, I thought I would sail about a\nlittle and see the watery part of the world.  It is a way I have of\ndriving off the spleen and regulating the circulation.  Whenever I\nfind myself growing grim about the mouth; whenever it is a damp,\ndrizzly November in my soul; whenever I find myself involuntarily\npausing before coffin warehouses, and bringing up the rear of every\nfuneral I meet; and especially whenever my hypos get such an upper\nhand of me, that it requires a strong moral principle to prevent me\nfrom deliberately stepping into the street, and methodically knocking\npeople\'s hats off--then, I account it high time to get to sea as soon\nas I can.  This is my substitute for pistol and ball.  With a\nphilosophical flourish Cato throws himself upon his sword; I quietly\ntake to the ship.  There is nothing surprising in this.  If t

### Tokenize and Clean Text

In [3]:
## We only need spaCy for tokenization, so the other pipeline components
## (parser, tagger, NER) are disabled when the model is loaded.
## Tokenizing = breaking the text down into individual units (tokens).
import spacy
nlp = spacy.load('en_core_web_sm',disable=['parser', 'tagger','ner'])



In [4]:
# So there is nothing but a tokenizer.
nlp.pipeline

[]

In [5]:
# spaCy refuses to process a text longer than nlp.max_length, which is
# counted in characters (not words) and defaults to 1,000,000.
# This new number should cover the entirety of the text file
# we're dealing with now.
nlp.max_length = 1198623

In [6]:
# Tokenize a string with spaCy, drop punctuation/whitespace tokens, and return the remaining token texts
def separate_punc(doc_text):
    """Return the lower-cased text of every spaCy token in ``doc_text`` that is kept.

    A token is discarded when its text occurs as a substring of the
    punctuation/whitespace string below (this is a substring test, not a
    per-character test).
    """
    unwanted = '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n '
    kept = []
    for tok in nlp(doc_text):
        if tok.text in unwanted:
            continue
        kept.append(tok.text.lower())
    return kept

In [7]:
d = read_file('melville-moby_dick.txt')
tokens = separate_punc(d)

In [8]:
tokens

['chapter',
 '1',
 'loomings',
 'call',
 'me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and',
 'nothing',
 'particular',
 'to',
 'interest',
 'me',
 'on',
 'shore',
 'i',
 'thought',
 'i',
 'would',
 'sail',
 'about',
 'a',
 'little',
 'and',
 'see',
 'the',
 'watery',
 'part',
 'of',
 'the',
 'world',
 'it',
 'is',
 'a',
 'way',
 'i',
 'have',
 'of',
 'driving',
 'off',
 'the',
 'spleen',
 'and',
 'regulating',
 'the',
 'circulation',
 'whenever',
 'i',
 'find',
 'myself',
 'growing',
 'grim',
 'about',
 'the',
 'mouth',
 'whenever',
 'it',
 'is',
 'a',
 'damp',
 'drizzly',
 'november',
 'in',
 'my',
 'soul',
 'whenever',
 'i',
 'find',
 'myself',
 'involuntarily',
 'pausing',
 'before',
 'coffin',
 'warehouses',
 'and',
 'bringing',
 'up',
 'the',
 'rear',
 'of',
 'every',
 'funeral',
 'i',
 'meet',
 'and',
 'especially',
 'whenever',
 'my',
 'hypos',
 'get',
 'such

In [9]:
len(tokens)

214712

## Create Sequences of Tokens

In [10]:
# Organize the tokens into overlapping, fixed-length sequences
train_len = 25+1 # 25 training words, then one target word

# Window k holds tokens[k : k + train_len]; consecutive windows are shifted
# by a single token.
# NOTE(review): the range ends one window short of the end of `tokens`, so
# the very last token never appears in any sequence.
text_sequences = [
    tokens[start:start + train_len]
    for start in range(len(tokens) - train_len)
]

In [11]:
' '.join(text_sequences[0])

'chapter 1 loomings call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to'

In [12]:
' '.join(text_sequences[1])

'1 loomings call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest'

In [13]:
' '.join(text_sequences[2])

'loomings call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me'

In [14]:
len(text_sequences)

214686

In [15]:
len(tokens)-len(text_sequences)

26

# Keras

### Keras Tokenization
assigns a numerical identifier to each token text. Same identifier for same token text (absolute `==` equality between strings). 

In [19]:
## tf-gpu==1.15
## keras==2.3.1
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [20]:
# Integer-encode the sequences of words:
# fit_on_texts builds the word <-> index vocabulary from text_sequences,
# texts_to_sequences then replaces every token by its integer index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)

In [21]:
sequences[0]

[158,
 9447,
 17527,
 402,
 42,
 1043,
 43,
 247,
 659,
 140,
 296,
 116,
 82,
 787,
 347,
 113,
 36,
 50,
 1788,
 6,
 49,
 3028,
 3,
 218,
 442,
 5]

In [22]:
tokenizer.index_word

{1: 'the',
 2: 'of',
 3: 'and',
 4: 'a',
 5: 'to',
 6: 'in',
 7: 'that',
 8: 'his',
 9: 'it',
 10: 'i',
 11: 'he',
 12: 'but',
 13: "'s",
 14: 'as',
 15: 'with',
 16: 'is',
 17: 'was',
 18: 'for',
 19: 'all',
 20: 'this',
 21: 'at',
 22: 'not',
 23: 'by',
 24: 'whale',
 25: 'from',
 26: 'so',
 27: 'him',
 28: 'on',
 29: 'be',
 30: 'one',
 31: 'you',
 32: 'there',
 33: 'now',
 34: 'had',
 35: 'have',
 36: 'or',
 37: 'were',
 38: 'they',
 39: 'like',
 40: 'which',
 41: 'then',
 42: 'me',
 43: 'some',
 44: 'their',
 45: 'what',
 46: 'when',
 47: 'an',
 48: 'are',
 49: 'my',
 50: 'no',
 51: 'upon',
 52: 'out',
 53: 'man',
 54: 'into',
 55: 'ship',
 56: 'up',
 57: 'more',
 58: 'ahab',
 59: 'if',
 60: 'them',
 61: 'old',
 62: 'we',
 63: 'sea',
 64: 'would',
 65: "'",
 66: 'ye',
 67: 'do',
 68: 'other',
 69: 'been',
 70: 'over',
 71: 'these',
 72: 'will',
 73: 'though',
 74: 'only',
 75: 'its',
 76: 'down',
 77: 'such',
 78: 'who',
 79: 'yet',
 80: 'head',
 81: 'time',
 82: 'long',
 83: 'boat

In [23]:
for i in sequences[0]:
    print(f'{i} : {tokenizer.index_word[i]}')

158 : chapter
9447 : 1
17527 : loomings
402 : call
42 : me
1043 : ishmael
43 : some
247 : years
659 : ago
140 : never
296 : mind
116 : how
82 : long
787 : precisely
347 : having
113 : little
36 : or
50 : no
1788 : money
6 : in
49 : my
3028 : purse
3 : and
218 : nothing
442 : particular
5 : to


In [24]:
tokenizer.word_counts

OrderedDict([('chapter', 4447),
             ('1', 28),
             ('loomings', 3),
             ('call', 1382),
             ('me', 16095),
             ('ishmael', 500),
             ('some', 15789),
             ('years', 2400),
             ('ago', 815),
             ('never', 5262),
             ('mind', 2039),
             ('how', 6330),
             ('long', 8567),
             ('precisely', 690),
             ('having', 1679),
             ('little', 6412),
             ('or', 17879),
             ('no', 14916),
             ('money', 305),
             ('in', 105799),
             ('my', 15231),
             ('purse', 178),
             ('and', 164029),
             ('nothing', 2936),
             ('particular', 1273),
             ('to', 117832),
             ('interest', 442),
             ('on', 26910),
             ('shore', 572),
             ('i', 53430),
             ('thought', 3874),
             ('would', 11232),
             ('sail', 2522),
             ('about', 

In [25]:
vocabulary_size = len(tokenizer.word_counts);vocabulary_size

17527

In [26]:
## Compare the spaCy tokens with the vocabulary learned by the Keras tokenizer.

## tokens already has tokens for spacy
keras_tokens = list(tokenizer.index_word.values())

## Membership tests against a list scan the whole list every time; with
## ~200k tokens and ~17k vocabulary entries that is billions of string
## comparisons. Building each set once makes every lookup O(1).
keras_vocab = set(keras_tokens)
spacy_vocab = set(tokens)

## Tokens that are present in the tokenization of spacy and not in the
## keras tokenization (order and duplicates follow `tokens`).
not_in_keras = [token for token in tokens if token not in keras_vocab]

## Tokens that are present in the tokenization of keras and not in the
## spacy tokenization.
not_in_spacy = [token for token in keras_tokens if token not in spacy_vocab]

In [27]:
len(not_in_keras),len(not_in_spacy)

(1, 0)

In [28]:
not_in_keras

['orphan']

In [29]:
len(tokenizer.index_word.values()),len(tokens)

(17527, 214712)

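This length difference arises mainly because `tokenizer.index_word` holds each unique word only once, whereas `tokens` holds every occurrence. The single missing word, 'orphan', is the final token of the text: the sequence-building loop stops one window short of the end, so the last token never reaches the sequences the Keras tokenizer was fitted on.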

### Convert to Numpy Matrix

In [30]:
import numpy as np

In [31]:
sequences = np.array(sequences)

In [32]:
sequences

array([[  158,  9447, 17527, ...,   218,   442,     5],
       [ 9447, 17527,   402, ...,   442,     5,  1165],
       [17527,   402,    42, ...,     5,  1165,    42],
       ...,
       [  240,   937,   351, ...,  1419,  1313,    74],
       [  937,   351,  1418, ...,  1313,    74,   219],
       [  351,  1418,     3, ...,    74,   219,   222]])

In [33]:
## Should exactly be same
np.array(text_sequences).shape , sequences.shape

((214686, 26), (214686, 26))

# Creating an LSTM based model

In [34]:
import keras
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding

```python
model.add(Embedding(1000, 64, input_length=10))
```
Turns positive integers (indexes) into dense vectors of fixed size.
eg. [[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]]

This layer can only be used as the first layer in a model.

  * the model will take as input an integer matrix of size (batch, input_length).
  * the largest integer (i.e. word index) in the input should be no larger than 999 (i.e. vocabulary size - 1).
  * now model.output_shape == (None, 10, 64), where None is the batch dimension.

In [37]:
### GPU-Support
from keras import backend as K
K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0',
 '/job:localhost/replica:0/task:0/device:GPU:1',
 '/job:localhost/replica:0/task:0/device:GPU:2',
 '/job:localhost/replica:0/task:0/device:GPU:3']

In [39]:
import tensorflow as tf
config = tf.ConfigProto( device_count = {'GPU': 4 , 'CPU': 16} ) 
sess = tf.Session(config=config) 
keras.backend.set_session(sess)

In [40]:
def create_model(vocabulary_size, seq_len, embedding_dim=25, lstm_units=150, dense_units=150):
    """Build, compile and summarize the stacked-LSTM next-word model.

    Parameters
    ----------
    vocabulary_size : number of distinct integer ids the model handles; used
        both as the Embedding input dimension and as the number of softmax
        output classes.
    seq_len : number of tokens in every input sequence.
    embedding_dim : size of each word embedding vector (default 25).
    lstm_units : units in each of the two LSTM layers (default 150).
    dense_units : units in the hidden Dense layer (default 150).

    Returns
    -------
    The compiled ``Sequential`` model. Its summary is printed as a side
    effect.
    """
    model = Sequential()
    model.add(Embedding(vocabulary_size, embedding_dim, input_length=seq_len))
    # The first LSTM returns the whole sequence so the second LSTM receives
    # one vector per time step; the second returns only its final state.
    model.add(LSTM(lstm_units, return_sequences=True))
    model.add(LSTM(lstm_units))
    model.add(Dense(dense_units, activation='relu'))

    # One probability per class id.
    model.add(Dense(vocabulary_size, activation='softmax'))

    # categorical_crossentropy expects one-hot encoded targets.
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.summary()

    return model

### Train / Test Split

In [41]:
from keras.utils import to_categorical

In [42]:
sequences

array([[  158,  9447, 17527, ...,   218,   442,     5],
       [ 9447, 17527,   402, ...,   442,     5,  1165],
       [17527,   402,    42, ...,     5,  1165,    42],
       ...,
       [  240,   937,   351, ...,  1419,  1313,    74],
       [  937,   351,  1418, ...,  1313,    74,   219],
       [  351,  1418,     3, ...,    74,   219,   222]])

In [43]:
# All columns but the last: the 25 input words of every sequence
sequences[:,:-1]

array([[  158,  9447, 17527, ...,     3,   218,   442],
       [ 9447, 17527,   402, ...,   218,   442,     5],
       [17527,   402,    42, ...,   442,     5,  1165],
       ...,
       [  240,   937,   351, ...,    84,  1419,  1313],
       [  937,   351,  1418, ...,  1419,  1313,    74],
       [  351,  1418,     3, ...,  1313,    74,   219]])

In [44]:
# last Word
sequences[:,-1]

array([   5, 1165,   42, ...,   74,  219,  222])

In [45]:
X = sequences[:,:-1]

In [46]:
y = sequences[:,-1]

In [47]:
## It's important to specify num_classes: not every token id occurs in y,
## and the word indices start at 1, so one extra class is needed for the
## otherwise unused index 0 (the value used for padding).
y = to_categorical(y, num_classes=vocabulary_size+1,dtype=np.int16)

In [48]:
## Taking around 7GB
## If you left it at default it was taking 14GB
y.nbytes / (1024*1024*1024)

7.00916388630867

In [49]:
import gc
del(y)
gc.collect()

## Since y has only 0 and 1, we're using bool type, thats 1/4th the default
y = sequences[:,-1]
y = to_categorical(y, num_classes=vocabulary_size+1,dtype=bool)
y.nbytes / (1024*1024*1024)

3.504581943154335

In [50]:
seq_len = X.shape[1]

In [51]:
seq_len

25

### Training the Model

In [52]:
## Parameters in embedding_1 are calculated as
## basically for each word in vocab it is giving us an
## embedding vector of size 25(i.e., output_dim)
(vocabulary_size+1)*25

438200

In [53]:
# define model
# We're passing vocabulary_size+1 so that it matches the size of y's vector
model = create_model(vocabulary_size+1, seq_len)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 25, 25)            438200    
_________________________________________________________________
lstm_1 (LSTM)                (None, 25, 150)           105600    
_________________________________________________________________
lstm_2 (LSTM)                (None, 150)               180600    
_________________________________________________________________
dense_1 (Dense)              (None, 150)               22650     
_________________________________________________________________
dense_2 (Dense)              (None, 17528)             2646728   
Total params: 3,393,778
Trainable params: 3,393,778
Non-trainable params: 0
_________________________________________________________________


---

----

In [56]:
from pickle import dump,load

In [None]:
# fit model
model.fit(X, y, batch_size=128, epochs=300,verbose=1)


Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
 43904/214686 [=====>........................] - ETA: 1:39 - loss: 4.1468 - accuracy: 0.2049

So, it's taking around 2 minutes per epoch on a single GPU.

In [57]:
# save the model to file
model.save('epochBIG-GPU.h5')
# save the tokenizer; the with-block closes the file handle, so the pickle
# is completely flushed to disk instead of relying on garbage collection
with open('epochBIGTokenizer', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

# Generating New Text

In [80]:
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

In [81]:
## Testing
input_text = 'This is the latest of the series'
tokenizer.texts_to_sequences([input_text])[0]
## we get the corresponding tokens for each word in input_text

[20, 16, 1, 5135, 2, 1, 4179]

In [92]:
## pad sequences
## A sequence shorter than maxlen is padded with 0s. Where the padding goes
## is controlled by the `padding` argument (default 'pre', i.e. at the
## front), not by `truncating`.
input_text = 'This is the latest of the series'
print(len(input_text.split()))
encoded_text = tokenizer.texts_to_sequences([input_text])[0]
pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
print(pad_encoded)
print('-'*60)
## A sequence longer than maxlen is cut down to maxlen entries. Which end is
## dropped is controlled by `truncating`: with 'pre' the leading entries are
## removed and only the last maxlen entries are kept.
input_text = 'This is the latest series ' + 'just '*24
print(len(input_text.split()))
encoded_text = tokenizer.texts_to_sequences([input_text])[0]
pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
print(pad_encoded)


7
[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0   20   16    1 5135    2    1 4179]]
------------------------------------------------------------
29
[[4179  212  212  212  212  212  212  212  212  212  212  212  212  212
   212  212  212  212  212  212  212  212  212  212  212]]


In [93]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    '''
    Generate num_gen_words words by repeatedly predicting the next word.

    INPUTS:
    model : model that was trained on text data
    tokenizer : tokenizer that was fit on text data
    seq_len : length of training sequence
    seed_text : raw string text to serve as the seed (ideally should be of same size as seq_len, otherwise pad_sequences pads it with 0s or keeps only its last seq_len tokens)
    num_gen_words : number of words to be generated by model

    RETURNS:
    the generated words joined by single spaces (the seed itself is not included)
    '''

    # Final Output
    output_text = []

    # Encode the seed once and keep the running context as token ids.
    # Re-tokenizing a growing string on every step repeats work and sends each
    # generated word back through the tokenizer's text filters, so a word may
    # not map back to the index that was just predicted.
    encoded_text = list(tokenizer.texts_to_sequences([seed_text])[0])

    # Create num_gen_words
    for _ in range(num_gen_words):

        # Fit the context to the length the model was trained on: shorter
        # contexts are padded with 0s, longer ones keep only the last seq_len ids
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')

        # Most probable class for the single sequence in the batch. Taking the
        # argmax of predict() is what predict_classes did, and it keeps
        # working on Keras versions where predict_classes no longer exists.
        pred_word_ind = int(model.predict(pad_encoded, verbose=0)[0].argmax())

        # Grab word by index
        # NOTE(review): presumably index_word has no entry for the padding
        # index 0, so a prediction of 0 raises KeyError here (as before).
        pred_word = tokenizer.index_word[pred_word_ind]

        # Update the context (shifting one over with the new word)
        encoded_text.append(pred_word_ind)

        output_text.append(pred_word)

    # Make it look like a sentence.
    return ' '.join(output_text)

### Grab a random seed sequence

In [94]:
text_sequences[0]

['chapter',
 '1',
 'loomings',
 'call',
 'me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and',
 'nothing',
 'particular',
 'to']

In [95]:
import random
random.seed(101)
# randint includes both endpoints, so randint(0, len(text_sequences)) could
# return an index one past the end of the list; randrange excludes the upper
# bound and therefore always yields a valid index.
random_pick = random.randrange(len(text_sequences))

In [96]:
random_seed_text = text_sequences[random_pick]

In [97]:
random_seed_text

['stranger',
 'that',
 'stubb',
 'vowed',
 'he',
 'recognised',
 'his',
 'cutting',
 'spade',
 'pole',
 'entangled',
 'in',
 'the',
 'lines',
 'that',
 'were',
 'knotted',
 'round',
 'the',
 'tail',
 'of',
 'one',
 'of',
 'these',
 'whales',
 'there']

In [98]:
seed_text = ' '.join(random_seed_text)

In [99]:
seed_text

'stranger that stubb vowed he recognised his cutting spade pole entangled in the lines that were knotted round the tail of one of these whales there'

In [100]:
seq_len

25

In [102]:
len(seed_text.split())

26

In [107]:
model =  keras.models.load_model('epochBIG.h5')



In [108]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=50)

"my epitaphs to the secondly of this crew whiteness monster wo suspended and well earthly veteran his last cloaked in the sea thumb was the hull spring over to it now wore to it terrific only caused what larger as a try 's yellow in some skull the turned spring"

### Exploring Generated Sequence

In [109]:
full_text = read_file('moby_dick_four_chapters.txt')

In [110]:
# Print up to 20 words of context on each side of every occurrence of 'inkling'.
words = full_text.split()  # split once instead of once per match
for i,word in enumerate(words):
    if word == 'inkling':
        # Clamp the start at 0: a negative start index would count from the
        # end of the list and give a wrong (typically empty) slice.
        print(' '.join(words[max(0, i-20):i+20]))
        print('\n')

were stains of some sort or other. At first I knew not what to make of this; but soon an inkling of the truth occurred to me. I remembered a story of a white man--a whaleman too--who, falling among the


