# Text Generation with Python and Keras

+ Part One

1. Read Moby Dick .txt files into pandas
2. Process Text
3. Clean Text
4. Tokenize the Text and create Sequences with Keras

In [1]:
# Create a function to read in a .txt file
# Once definition is written, a .txt file with four chapter of moby dick will appear below

def read_file(filepath):
    with open(filepath) as f:
        str_text = f.read()
        
    return str_text

In [3]:
# 

# read_file('moby_dick_four_chapters.txt')

In [4]:
# importing spacy library to tokenize text
# we will also disable any parts of th

import spacy

nlp = spacy.load('en_core_web_md', disable = ['parser', 'tagger', 'ner'])

In [5]:
nlp.max_length = 1198623

In [6]:
def separate_punc(doc_text):
    return [token.text.lower() for token in nlp(doc_text) if token.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n']

In [7]:
d = read_file('moby_dick_four_chapters.txt')

In [11]:
tokens = separate_punc(d)

In [13]:
# tokens

In [14]:
len(tokens)

11338

In [15]:
# 25 words --> Neural Network to predict the next word # 26

## Create Sequence of Tokens

In [17]:
# organize into sequences of tokens
train_len = 25+1 #training words, then one target word

# Creating an empty list of sequences
text_sequences = []

for i in range(train_len, len(tokens)): # range is the training length, up to the length of all of the tokens 
    
    # grab the amount of characters in train_len
    seq = tokens[i-train_len:i] # i minus train_len up to i 
    
    # Add to text_sequences using append
    text_sequences.append(seq)

In [46]:
' '.join(text_sequences[0])

'call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on'

In [19]:
' '.join(text_sequences[1]) # moves one word over to the right - as we can see it started with call me and now it starts with me

'me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore'

In [20]:
' '.join(text_sequences[2]) # on this sequence again it moved one word over to the right, starting with ishmael

'ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore i'

In [21]:
len(text_sequences)

11312

In [22]:
from keras.preprocessing.text import Tokenizer

In [48]:
# integer encode sequences of words
# the number is an id for the word - as it is unique to each word

tokenizer = Tokenizer() # create the tokenizer object
tokenizer.fit_on_texts(text_sequences) # calling on tokenizers fit_on_texts and provide the text_sequences
sequences = tokenizer.texts_to_sequences(text_sequences) # calling on texts_to_sequences and replaces texts sequences to sequences of numbers

In [26]:
# sequences[0]

In [28]:
# tokenizer.index_word

In [49]:
# for i in sequences, print out the id : followed by word
for i in sequences[0]:
    print(f'{i} : {tokenizer.index_word[i]}')

956 : call
14 : me
263 : ishmael
51 : some
261 : years
408 : ago
87 : never
219 : mind
129 : how
111 : long
954 : precisely
260 : having
50 : little
43 : or
38 : no
314 : money
7 : in
23 : my
546 : purse
3 : and
150 : nothing
259 : particular
6 : to
2713 : interest
14 : me
24 : on


In [52]:
# tokenizer.word_counts # counts how many times each words shows up ie. Ishmael shows up 133 times in this .txt file

In [56]:
vocabulary_size = len(tokenizer.word_counts)

In [38]:
import numpy as np

In [57]:
sequences = np.array(sequences) # transforming sequences the list into a numpy array

In [60]:
# last word on the right - ie. 24 in the first row is the target word or in other words the label
# the features would be the 25 numbers starting from 956 and ending at 14

sequences # formatted sequences into the numpy array

array([[ 956,   14,  263, ..., 2713,   14,   24],
       [  14,  263,   51, ...,   14,   24,  957],
       [ 263,   51,  261, ...,   24,  957,    5],
       ...,
       [ 952,   12,  166, ...,  262,   53,    2],
       [  12,  166, 2712, ...,   53,    2, 2718],
       [ 166, 2712,    3, ...,    2, 2718,   26]])

## Creating an LSTM based model

# Approach

+ Create the LSTM based model
+ Split the data into features and labels

  - X Features (First 25 words of Sequence)
  - Y Label (Next word after the sequence)


+ Fit the model

In [43]:
import keras
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding

In [45]:
def create_model(vocabulary_size, seq_len):
    model = Sequential()
    model.add(Embedding(vocabulary_size, 25, input_length = seq_len))
    model.add(LSTM(150, return_sequences = True))
    model.add(LSTM(150))
    model.add(Dense(150, activation = 'relu')) # relu = rectified linear activation function. Output directly if it is positive or else it will output zero.
    
    model.add(Dense(vocabulary_size, activation = 'softmax'))
    
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    
    model.summary()
    
    return model

# Train / Test Split