# Text Generation with Python and Keras

+ Part One

1. Read the Moby Dick .txt file into a Python string
2. Process Text
3. Clean Text
4. Tokenize the Text and create Sequences with Keras

In [1]:
# Create a function to read in a .txt file
# Once the definition is written, a .txt file with four chapters of Moby Dick is read in below

def read_file(filepath, encoding=None):
    """Read a text file and return its entire contents as one string.

    Parameters
    ----------
    filepath : str or path-like
        Location of the text file to read.
    encoding : str, optional
        Text encoding handed to ``open``. The default ``None`` keeps the
        platform's default encoding (the previous behaviour); pass
        ``'utf-8'`` explicitly to get the same result on every machine.

    Returns
    -------
    str
        The full contents of the file.
    """
    with open(filepath, encoding=encoding) as f:
        return f.read()

In [3]:
# 

# read_file('moby_dick_four_chapters.txt')

In [4]:
# importing spacy library to tokenize text
# we will also disable the pipeline components that tokenization does not need (parser, tagger, ner)

import spacy

nlp = spacy.load('en_core_web_md', disable = ['parser', 'tagger', 'ner'])

In [5]:
# Set the maximum input length that nlp will accept.
# NOTE(review): presumably chosen so the whole Moby Dick extract can be
# processed in a single nlp(...) call -- confirm where 1198623 comes from.
nlp.max_length = 1198623

In [6]:
def separate_punc(doc_text):
    """Tokenize ``doc_text`` with the module-level ``nlp`` pipeline.

    Returns a list holding the lower-cased text of every token whose
    original text is not found inside the punctuation/whitespace string
    defined below.
    """
    # NOTE: ``in`` on a string is a substring test, so any token that is a
    # substring of this string (multi-character ones included) is dropped.
    unwanted = '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n'

    kept = []
    for token in nlp(doc_text):
        raw = token.text
        if raw not in unwanted:
            kept.append(raw.lower())
    return kept

In [7]:
# Load the whole text file into one string
d = read_file('moby_dick_four_chapters.txt')

In [11]:
# Tokenize the raw text: lower-cased tokens with punctuation and whitespace tokens removed
tokens = separate_punc(d)

In [13]:
# tokens

In [14]:
len(tokens)

11338

In [15]:
# 25 words --> Neural Network to predict the next word # 26

## Create Sequence of Tokens

In [17]:
# organize the tokens into overlapping, fixed-length sequences
train_len = 25+1 # 25 training words, then one target word

# Slide a window of train_len tokens over the text, moving one token at a time;
# every window becomes one entry of text_sequences.
# NOTE(review): exactly as in the original loop, the window that ends on the
# very last token is not produced -- confirm whether that is intended.
text_sequences = [
    tokens[start:start + train_len]
    for start in range(len(tokens) - train_len)
]

In [46]:
' '.join(text_sequences[0])

'call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on'

In [19]:
' '.join(text_sequences[1]) # moves one word over to the right - as we can see it started with call me and now it starts with me

'me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore'

In [20]:
' '.join(text_sequences[2]) # on this sequence again it moved one word over to the right, starting with ishmael

'ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore i'

In [21]:
len(text_sequences)

11312

In [22]:
from keras.preprocessing.text import Tokenizer

In [48]:
# integer encode sequences of words
# every word is given an integer id, and that id is unique to the word

tokenizer = Tokenizer() # create the tokenizer object
tokenizer.fit_on_texts(text_sequences) # build the word <-> id vocabulary from the token sequences
sequences = tokenizer.texts_to_sequences(text_sequences) # replace every word in text_sequences with its integer id

In [26]:
# sequences[0]

In [28]:
# tokenizer.index_word

In [49]:
# show every id of the first sequence next to the word it stands for
for word_id in sequences[0]:
    word = tokenizer.index_word[word_id]
    print(f'{word_id} : {word}')

956 : call
14 : me
263 : ishmael
51 : some
261 : years
408 : ago
87 : never
219 : mind
129 : how
111 : long
954 : precisely
260 : having
50 : little
43 : or
38 : no
314 : money
7 : in
23 : my
546 : purse
3 : and
150 : nothing
259 : particular
6 : to
2713 : interest
14 : me
24 : on


In [52]:
# tokenizer.word_counts # counts how many times each word shows up, e.g. Ishmael shows up 133 times in this .txt file

In [56]:
# number of distinct words the tokenizer counted while fitting
vocabulary_size = len(tokenizer.word_counts)

In [38]:
import numpy as np

In [57]:
# NOTE: this rebinds the name `sequences` from a list of lists to a NumPy array
sequences = np.array(sequences) # transforming sequences the list into a numpy array

In [60]:
# last word on the right - ie. 24 in the first row is the target word or in other words the label
# the features would be the 25 numbers starting from 956 and ending at 14

sequences # formatted sequences into the numpy array

array([[ 956,   14,  263, ..., 2713,   14,   24],
       [  14,  263,   51, ...,   14,   24,  957],
       [ 263,   51,  261, ...,   24,  957,    5],
       ...,
       [ 952,   12,  166, ...,  262,   53,    2],
       [  12,  166, 2712, ...,   53,    2, 2718],
       [ 166, 2712,    3, ...,    2, 2718,   26]])

## Creating an LSTM based model

# Approach

+ Create the LSTM based model
+ Split the data into features and labels

  - X Features (First 25 words of Sequence)
  - Y Label (Next word after the sequence)


+ Fit the model

In [43]:
import keras
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding

In [45]:
def create_model(vocabulary_size, seq_len, embedding_dim = 25, lstm_units = 150, dense_units = 150):
    """Build, compile and summarize the LSTM next-word model.

    The network is Embedding -> LSTM -> LSTM -> Dense(relu) -> Dense(softmax).

    Parameters
    ----------
    vocabulary_size : int
        Used both as the input dimension of the embedding layer and as the
        number of units in the softmax output layer.
    seq_len : int
        Number of word ids in each input sequence (the embedding's
        ``input_length``).
    embedding_dim : int, optional
        Size of each embedding vector. Defaults to 25, the value that was
        previously hard-coded.
    lstm_units : int, optional
        Number of units in each of the two LSTM layers. Defaults to 150.
    dense_units : int, optional
        Number of units in the hidden relu Dense layer. Defaults to 150.

    Returns
    -------
    The compiled ``Sequential`` model. ``model.summary()`` is printed as a
    side effect.
    """
    model = Sequential()
    model.add(Embedding(vocabulary_size, embedding_dim, input_length = seq_len))
    # The first LSTM returns the full sequence so the second LSTM can consume it
    model.add(LSTM(lstm_units, return_sequences = True))
    model.add(LSTM(lstm_units))
    # relu = rectified linear activation: passes positive values through, outputs zero otherwise
    model.add(Dense(dense_units, activation = 'relu'))

    # One output unit per class; softmax turns the outputs into probabilities
    model.add(Dense(vocabulary_size, activation = 'softmax'))

    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

    model.summary()

    return model

# Split into Features and Labels

In [62]:
from tensorflow.keras.utils import to_categorical

In [63]:
sequences

array([[ 956,   14,  263, ..., 2713,   14,   24],
       [  14,  263,   51, ...,   14,   24,  957],
       [ 263,   51,  261, ...,   24,  957,    5],
       ...,
       [ 952,   12,  166, ...,  262,   53,    2],
       [  12,  166, 2712, ...,   53,    2, 2718],
       [ 166, 2712,    3, ...,    2, 2718,   26]])

In [66]:
# First 25 words of every sequence

X = sequences[:,:-1] # Features - every column except the last one, which is y or the label

In [68]:
y = sequences[:,-1] # Labels - last word and the word we are trying to predict

In [69]:
# One-hot encode the labels.
# NOTE: this overwrites y with its one-hot form, so re-running this cell alone
# would encode the already-encoded array a second time.
y = to_categorical(y, num_classes = vocabulary_size+1) # +1 because Keras keeps index 0 free for padding, so an extra class is needed to hold 0

In [70]:
seq_len = X.shape[1]

In [72]:
seq_len

25

# Training the Model

In [73]:
# defining the model

model = create_model(vocabulary_size+1, seq_len)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 25, 25)            67975     
                                                                 
 lstm (LSTM)                 (None, 25, 150)           105600    
                                                                 
 lstm_1 (LSTM)               (None, 150)               180600    
                                                                 
 dense (Dense)               (None, 150)               22650     
                                                                 
 dense_1 (Dense)             (None, 2719)              410569    
                                                                 
Total params: 787,394
Trainable params: 787,394
Non-trainable params: 0
_________________________________________________________________


In [74]:
from pickle import dump,load

In [75]:
# fitting the model

model.fit(X, y, batch_size = 128, epochs = 300, verbose = 1)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

<keras.callbacks.History at 0x1ffa8c7d4f0>

In [77]:
# saving the model to file

model.save('epoch300.h5')

# saving the tokenizer
# (the with-block closes the file handle as soon as the pickle has been written)

with open('epoch300', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

## Generating New Text

In [78]:
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

In [98]:
# going to take in a model, a tokenizer, seq_len, seed_text, num_gen_words

def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    '''
    Generate num_gen_words new words that continue seed_text.

    INPUTS:
    model : model that was trained on text data
    tokenizer : tokenizer that was fit on text data
    seq_len : length of training sequence
    seed_text : raw string text to serve as the seed
    num_gen_words : number of words to be generated by model

    RETURNS:
    the generated words joined by single spaces (the seed text is not included)
    '''

    # Words produced so far
    generated_words = []

    # Text that is fed back to the model; it starts out as the seed
    running_text = seed_text

    for _ in range(num_gen_words):

        # Encode the running text as a list of word ids
        token_ids = tokenizer.texts_to_sequences([running_text])[0]

        # Bring the ids to the length the model was trained on (seq_len);
        # truncating='pre' cuts from the front, so the most recent ids are kept
        model_input = pad_sequences([token_ids], maxlen=seq_len, truncating='pre')

        # Class probabilities for the next word, then the id with the highest one
        probabilities = model.predict(model_input, verbose=0)
        next_id = np.argmax(probabilities, axis = -1)[0]

        # Map the id back to its word
        next_word = tokenizer.index_word[next_id]

        # Append the new word so the next iteration sees it as context
        running_text = running_text + ' ' + next_word
        generated_words.append(next_word)

    # Make it look like a sentence.
    return ' '.join(generated_words)

In [81]:
# text_sequences[0]

In [109]:
import random
random.seed(107)
# randrange excludes its upper bound, so the result is always a valid index into
# text_sequences; randint includes both ends and could return len(text_sequences),
# which would raise an IndexError when used as an index.
random_pick = random.randrange(len(text_sequences))

In [110]:
random_seed_text = text_sequences[random_pick]

In [111]:
random_seed_text

['aggregated',
 'opinions',
 'of',
 'many',
 'aged',
 'persons',
 'with',
 'whom',
 'i',
 'conversed',
 'upon',
 'the',
 'subject',
 'the',
 'picture',
 'represents',
 'a',
 'cape',
 'horner',
 'in',
 'a',
 'great',
 'hurricane',
 'the',
 'half',
 'foundered']

In [112]:
seed_text = ' '.join(random_seed_text)

In [113]:
seed_text

'aggregated opinions of many aged persons with whom i conversed upon the subject the picture represents a cape horner in a great hurricane the half foundered'

In [114]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=50)

'ship weltering there with its three dismantled masts alone visible and an exasperated whale purposing to spring clean over the craft is in the enormous act of impaling himself upon the three mast heads the opposite wall of this entry was hung all over with a heathenish array of monstrous'