# Text Generation with Python and Keras

## Part One

1. Read the Moby Dick .txt file into a Python string
2. Process Text
3. Clean Text
4. Tokenize the Text and create Sequences with Keras

In [1]:
# Create a function to read in a .txt file
# Once the function is defined, the next cell uses it to display the text of the four Moby Dick chapters

def read_file(filepath, encoding=None):
    """Read a whole text file and return its contents as a single string.

    Parameters
    ----------
    filepath : str or path-like
        Location of the text file to read.
    encoding : str, optional
        Text encoding handed to ``open``. The default ``None`` keeps the
        platform's default encoding (the previous behaviour); pass e.g.
        ``'utf-8'`` so the file is decoded the same way on every machine.

    Returns
    -------
    str
        The full contents of the file.
    """
    # the with-block closes the file even if read() raises
    with open(filepath, encoding=encoding) as f:
        return f.read()

In [3]:
read_file('moby_dick_four_chapters.txt') # passing in the four chapters of moby dick

In [4]:
# importing spacy library to tokenize text
# we will also disable the pipeline components we do not need (parser, tagger, ner), since we only want the tokens

import spacy

# load the medium English pipeline with the parser, tagger and ner components switched off
nlp = spacy.load('en_core_web_md', disable = ['parser', 'tagger', 'ner'])
nlp.max_length = 1198623 # max_length is a limit on the number of characters (not words) in one text; set above one million so the four chapters can be processed in a single call

In [5]:
# creating a function to take in a string and grab the tokens if they are not a punctuation or a new line (\n\n) 

def separate_punc(doc_text):
    """Tokenize doc_text with the notebook-level nlp pipeline and return the lower-cased token strings.

    A token is left out when its text occurs as a substring of the `unwanted`
    string below, which lists punctuation marks and whitespace / newline runs.
    """
    unwanted = '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n'

    kept = []
    for tok in nlp(doc_text):
        # `in` on a string is a substring test against the whole filter string
        if tok.text in unwanted:
            continue
        kept.append(tok.text.lower())
    return kept

In [7]:
# storing the file into a variable

d = read_file('moby_dick_four_chapters.txt')

# putting the moby dick chapters through the separate_punc function
tokens = separate_punc(d)

In [9]:
tokens # a token is a string with a known meaning

In [10]:
len(tokens) # the length of tokens

11338

# Creating a Sequence of Tokens

In [11]:
# organize into sequences of tokens
train_len = 25 + 1 # 25 training words, then one target word

# Slide a window of train_len tokens over the text, moving one token at a time;
# each window tokens[i - train_len : i] becomes one training sequence.
# NOTE(review): i stops at len(tokens) - 1, so the window that ends on the very last token is never produced.
text_sequences = [tokens[i - train_len:i] for i in range(train_len, len(tokens))]

In [12]:
' '.join(text_sequences[0]) # joining the tokens together to form a sentence

'call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on'

In [14]:
' '.join(text_sequences[1]) # moves one word over to the right - as we can see it started with call me and now it starts with me

'me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore'

In [15]:
' '.join(text_sequences[2]) # on this sequence again it moved one word over to the right, starting with ishmael

'ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore i'

In [16]:
len(text_sequences)

11312

In [17]:
# importing keras and the tokenizer

from keras.preprocessing.text import Tokenizer

In [18]:
# integer encode sequences of words
# the number is an id for the word - as it is unique to each word

tokenizer = Tokenizer() # create the tokenizer object
tokenizer.fit_on_texts(text_sequences) # calling on tokenizers fit_on_texts and provide the text_sequences
sequences = tokenizer.texts_to_sequences(text_sequences) # calling on texts_to_sequences and replaces texts sequences to sequences of numbers

In [20]:
sequences[0] # the words are now represented by numbers or id

In [22]:
tokenizer.index_word

In [23]:
# for i in sequences, print out the id : followed by word
for i in sequences[0]:
    print(f'{i} : {tokenizer.index_word[i]}')

956 : call
14 : me
263 : ishmael
51 : some
261 : years
408 : ago
87 : never
219 : mind
129 : how
111 : long
954 : precisely
260 : having
50 : little
43 : or
38 : no
314 : money
7 : in
23 : my
546 : purse
3 : and
150 : nothing
259 : particular
6 : to
2713 : interest
14 : me
24 : on


In [25]:
# counts how many times each word was seen while fitting the tokenizer.
# The tokenizer was fit on text_sequences (overlapping windows), so one occurrence in the text
# is counted once for every window that contains it - these are not the raw counts in the .txt file.
tokenizer.word_counts

In [26]:
vocabulary_size = len(tokenizer.word_counts) # the length of the vocabulary

In [27]:
# importing numpy

import numpy as np

In [28]:
sequences = np.array(sequences) # transforming sequences the list into a numpy array

In [29]:
# last word on the right - ie. 24 in the first row is the target word or in other words the label
# the features would be the 25 numbers starting from 956 and ending at 14

sequences # formatted sequences into the numpy array

array([[ 956,   14,  263, ..., 2713,   14,   24],
       [  14,  263,   51, ...,   14,   24,  957],
       [ 263,   51,  261, ...,   24,  957,    5],
       ...,
       [ 952,   12,  166, ...,  262,   53,    2],
       [  12,  166, 2712, ...,   53,    2, 2718],
       [ 166, 2712,    3, ...,    2, 2718,   26]])

# Creating an LSTM based model

## Approach

+ Create the LSTM based model
+ Split the data into features and labels

  - X Features (First 25 words of Sequence)
  - Y Label (Next word after the sequence)


+ Fit the model

In [30]:
import keras
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding

In [31]:
def create_model(vocabulary_size, seq_len, embedding_dim = 25, lstm_units = 150, dense_units = 150):
    """Build and compile the next-word prediction network.

    Parameters
    ----------
    vocabulary_size : int
        Number of word ids the model has to handle; used as the Embedding
        input size and as the width of the softmax output layer.
    seq_len : int
        Number of word ids in each input sequence.
    embedding_dim : int, optional
        Size of the vector each word id is mapped to (default 25).
    lstm_units : int, optional
        Number of units in each of the two LSTM layers (default 150).
    dense_units : int, optional
        Number of units in the hidden Dense layer (default 150).

    Returns
    -------
    The compiled Sequential model. Its summary is printed as a side effect.
    """
    model = Sequential() # creating an instance of a sequential model

    # map every word id to a dense vector of length embedding_dim
    model.add(Embedding(vocabulary_size, embedding_dim, input_length = seq_len))

    # the first LSTM returns its full output sequence so that the second LSTM can read it
    model.add(LSTM(lstm_units, return_sequences = True))
    model.add(LSTM(lstm_units))

    # relu = rectified linear activation: outputs the input if it is positive, otherwise zero
    model.add(Dense(dense_units, activation = 'relu'))

    # one probability per word id
    model.add(Dense(vocabulary_size, activation = 'softmax'))

    # categorical_crossentropy expects one-hot encoded labels
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

    model.summary()

    return model

# Train / Test Split

In [32]:
from tensorflow.keras.utils import to_categorical # converts a class vector (integers) to binary class matrix 

In [33]:
sequences

array([[ 956,   14,  263, ..., 2713,   14,   24],
       [  14,  263,   51, ...,   14,   24,  957],
       [ 263,   51,  261, ...,   24,  957,    5],
       ...,
       [ 952,   12,  166, ...,  262,   53,    2],
       [  12,  166, 2712, ...,   53,    2, 2718],
       [ 166, 2712,    3, ...,    2, 2718,   26]])

In [34]:
# First 25 words of every sequence

X = sequences[:,:-1] # Features - every column except the last one (the first 25 words); the last column is y, the label

In [38]:
y = sequences[:,-1] # Labels - last word and the word we are trying to predict

In [39]:
# One-hot encode the labels. Keras word ids start at 1 and 0 is reserved (it is the padding value),
# so vocabulary_size + 1 classes are needed.
# The labels are taken straight from `sequences`, so re-running this cell does not
# one-hot encode an already encoded y.
y = to_categorical(sequences[:,-1], num_classes = vocabulary_size+1)

In [40]:
seq_len = X.shape[1]

In [41]:
seq_len

25

# Training the Model

In [42]:
# defining the model

model = create_model(vocabulary_size+1, seq_len)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 25, 25)            67975     
                                                                 
 lstm (LSTM)                 (None, 25, 150)           105600    
                                                                 
 lstm_1 (LSTM)               (None, 150)               180600    
                                                                 
 dense (Dense)               (None, 150)               22650     
                                                                 
 dense_1 (Dense)             (None, 2719)              410569    
                                                                 
Total params: 787,394
Trainable params: 787,394
Non-trainable params: 0
_________________________________________________________________


In [43]:
from pickle import dump,load

In [44]:
# fitting the model

model.fit(X, y, batch_size = 128, epochs = 350, verbose = 1)

Epoch 1/350
Epoch 2/350
Epoch 3/350
Epoch 4/350
Epoch 5/350
Epoch 6/350
Epoch 7/350
Epoch 8/350
Epoch 9/350
Epoch 10/350
Epoch 11/350
Epoch 12/350
Epoch 13/350
Epoch 14/350
Epoch 15/350
Epoch 16/350
Epoch 17/350
Epoch 18/350
Epoch 19/350
Epoch 20/350
Epoch 21/350
Epoch 22/350
Epoch 23/350
Epoch 24/350
Epoch 25/350
Epoch 26/350
Epoch 27/350
Epoch 28/350
Epoch 29/350
Epoch 30/350
Epoch 31/350
Epoch 32/350
Epoch 33/350
Epoch 34/350
Epoch 35/350
Epoch 36/350
Epoch 37/350
Epoch 38/350
Epoch 39/350
Epoch 40/350
Epoch 41/350
Epoch 42/350
Epoch 43/350
Epoch 44/350
Epoch 45/350
Epoch 46/350
Epoch 47/350
Epoch 48/350
Epoch 49/350
Epoch 50/350
Epoch 51/350
Epoch 52/350
Epoch 53/350
Epoch 54/350
Epoch 55/350
Epoch 56/350
Epoch 57/350
Epoch 58/350
Epoch 59/350
Epoch 60/350
Epoch 61/350
Epoch 62/350
Epoch 63/350
Epoch 64/350
Epoch 65/350
Epoch 66/350
Epoch 67/350
Epoch 68/350
Epoch 69/350
Epoch 70/350
Epoch 71/350
Epoch 72/350
Epoch 73/350
Epoch 74/350
Epoch 75/350
Epoch 76/350
Epoch 77/350
Epoch 78

<keras.callbacks.History at 0x15782274af0>

In [45]:
# saving the model to file
# NOTE(review): the file names say "300" while the fit cell runs epochs = 350 - confirm which is intended

model.save('epoch300.h5')

# saving the tokenizer; the with-block closes the file handle once the pickle has been written

with open('epoch300', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

## Generating New Text

In [46]:
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

In [47]:
# going to take in a model, a tokenizer, seq_len, seed_text, num_gen_words

def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    '''
    Generate new words by repeatedly predicting the most likely next word
    and appending it to the running text.

    INPUTS:
    model : model that was trained on text data
    tokenizer : tokenizer that was fit on text data
    seq_len : length of training sequence
    seed_text : raw string text to serve as the seed
    num_gen_words : number of words to be generated by model

    RETURNS:
    the generated words joined by single spaces (the seed text is not included);
    an empty string when num_gen_words is 0
    '''

    # Final Output
    output_text = []

    # Initial Seed Sequence
    input_text = seed_text

    # Create num_gen_words
    for _ in range(num_gen_words):

        # Take the input text string and encode it to a sequence
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]

        # Pad or truncate to seq_len, the input length the model was built with;
        # truncating='pre' drops the oldest words and keeps the most recent ones
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')

        # Predict class probabilities for the next word (one row, one entry per word id)
        word_probs = model.predict(pad_encoded, verbose=0)[0]

        # Index 0 is the padding id and has no entry in tokenizer.index_word,
        # so it is left out of the argmax to avoid a KeyError on the lookup below
        pred_word_ind = int(np.argmax(word_probs[1:])) + 1

        # Grab word
        pred_word = tokenizer.index_word[pred_word_ind]

        # Update the sequence of input text (shifting one over with the new word)
        input_text += ' ' + pred_word

        output_text.append(pred_word)

    # Make it look like a sentence.
    return ' '.join(output_text)

In [48]:
text_sequences[0]

['call',
 'me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and',
 'nothing',
 'particular',
 'to',
 'interest',
 'me',
 'on']

In [49]:
import random
random.seed(101)
# randrange excludes the upper bound, so the pick is always a valid index into text_sequences
# (randint includes both ends and could return len(text_sequences), which is out of range)
random_pick = random.randrange(len(text_sequences))

In [50]:
random_seed_text = text_sequences[random_pick]

In [51]:
random_seed_text

['thought',
 'i',
 'to',
 'myself',
 'the',
 'man',
 "'s",
 'a',
 'human',
 'being',
 'just',
 'as',
 'i',
 'am',
 'he',
 'has',
 'just',
 'as',
 'much',
 'reason',
 'to',
 'fear',
 'me',
 'as',
 'i',
 'have']

In [52]:
seed_text = ' '.join(random_seed_text) # joining the list of words to form a sentence

In [53]:
seed_text

"thought i to myself the man 's a human being just as i am he has just as much reason to fear me as i have"

In [54]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=50) # generating the next 50 words

"to be afraid of him better sleep with a sober cannibal than a drunken christian landlord said i tell him to stash his tomahawk there or pipe or whatever you call it tell him to stop smoking in short and i will turn in with him but i do n't"