In [1]:
def read_file(filepath, encoding=None):
    """Read a whole text file and return its contents as one string.

    Args:
        filepath: Path of the text file to read.
        encoding: Text encoding passed to ``open``. The default ``None``
            keeps the previous behavior (platform default encoding); pass
            e.g. ``"utf-8"`` to read the file the same way on every machine.

    Returns:
        The full contents of the file as a ``str``.
    """
    with open(filepath, encoding=encoding) as f:
        return f.read()

In [4]:
#read_file("moby_dick_four_chapters.txt")

In [5]:
import spacy

In [6]:
# Load the spaCy pipeline registered as "en" with the parser, tagger and NER
# components disabled; this notebook only reads token.text from its output.
nlp = spacy.load("en",disable=["parser","tagger","ner"])

In [7]:
# Set the pipeline's max_length attribute (spaCy's limit on input text length).
# NOTE(review): 1198623 is presumably sized to the input corpus — confirm.
nlp.max_length = 1198623

In [8]:
def separate_punc(doc_text):
    """Tokenize ``doc_text`` with the global ``nlp`` pipeline and drop punctuation.

    A token is dropped when its text occurs as a substring of the
    punctuation/whitespace string below; every kept token is lower-cased.

    Returns:
        list of str: the surviving lower-cased token texts, in document order.
    """
    unwanted = '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n '
    kept = []
    for token in nlp(doc_text):
        if token.text in unwanted:
            continue
        kept.append(token.text.lower())
    return kept

In [9]:
d = read_file("moby_dick_four_chapters.txt")

In [10]:
tokens = separate_punc(d)

In [12]:
len(tokens)

11394

In [13]:
# Feed 25 words of context ---> the network predicts word #26

In [14]:
# Each training example is 25 context words followed by the word to predict.
train_len = 25 + 1

# Slide a window of train_len tokens across the corpus, one token at a time.
# NOTE(review): the last start index is len(tokens) - train_len - 1, so the
# window that ends on the final token is not produced.
text_sequences = [
    tokens[start:start + train_len]
    for start in range(len(tokens) - train_len)
]

In [19]:
" ".join(text_sequences[0])

'call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on'

In [20]:
" ".join(text_sequences[1])

'me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore'

In [22]:
from keras.preprocessing.text import Tokenizer

In [24]:
# Fit a Keras Tokenizer on the token windows so every distinct word gets an
# integer id (used below by texts_to_sequences and index_word).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)

In [25]:
sequences = tokenizer.texts_to_sequences(text_sequences)

In [27]:
sequences[0]

[964,
 14,
 265,
 51,
 263,
 416,
 87,
 222,
 129,
 111,
 962,
 262,
 50,
 43,
 37,
 321,
 7,
 23,
 555,
 3,
 150,
 261,
 6,
 2704,
 14,
 24]

In [29]:
# Show which word each integer id in the first encoded sequence stands for.
for word_id in sequences[0]:
    word = tokenizer.index_word[word_id]
    print(f"{word_id} : {word}") #Unique ID for words

964 : call
14 : me
265 : ishmael
51 : some
263 : years
416 : ago
87 : never
222 : mind
129 : how
111 : long
962 : precisely
262 : having
50 : little
43 : or
37 : no
321 : money
7 : in
23 : my
555 : purse
3 : and
150 : nothing
261 : particular
6 : to
2704 : interest
14 : me
24 : on


In [30]:
# Number of distinct words the tokenizer has counted (one word_counts entry per word).
vocabulary_size = len(tokenizer.word_counts)

In [32]:
vocabulary_size #Unique words

2709

In [33]:
type(sequences)

list

In [34]:
import numpy as np

In [35]:
sequences = np.array(sequences)

In [36]:
sequences

array([[ 964,   14,  265, ..., 2704,   14,   24],
       [  14,  265,   51, ...,   14,   24,  965],
       [ 265,   51,  263, ...,   24,  965,    5],
       ...,
       [ 960,   12,  168, ...,  264,   53,    2],
       [  12,  168, 2703, ...,   53,    2, 2709],
       [ 168, 2703,    3, ...,    2, 2709,   26]])

In [37]:
from keras.utils import to_categorical

In [40]:
X = sequences[:,:-1] #all rows, grab everything but last column

In [41]:
y = sequences[:,-1]

In [42]:
# One-hot encode the target word ids. The width is vocabulary_size+1 because the
# ids run up to vocabulary_size itself, so indices 0..vocabulary_size must fit.
y = to_categorical(y,num_classes=vocabulary_size+1)

In [43]:
seq_len = X.shape[1]

In [44]:
X.shape

(11368, 25)

In [46]:
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding

In [47]:
def create_model(vocabulary_size,seq_len):
    """Build, compile and summarize the next-word LSTM network.

    Args:
        vocabulary_size: Size of the embedding's input vocabulary and of the
            final softmax layer.
        seq_len: Number of input tokens per example; also used as the
            embedding width and, times three, as the LSTM unit count.

    Returns:
        The compiled ``Sequential`` model (its summary is printed as a side
        effect).
    """
    lstm_units = seq_len*3

    layers = [
        Embedding(vocabulary_size,seq_len,input_length=seq_len),
        # The first LSTM returns the full sequence so the second can consume it.
        LSTM(lstm_units,return_sequences=True),
        LSTM(lstm_units),
        Dense(150,activation="relu"),
        # One output probability per vocabulary index.
        Dense(vocabulary_size,activation="softmax"),
    ]
    model = Sequential(layers)

    model.compile(loss="categorical_crossentropy",optimizer="adam",metrics=["accuracy"])
    model.summary()

    return model

In [52]:
# +1 so the model's vocabulary covers ids 0..vocabulary_size, matching the
# one-hot width used for y.
model = create_model(vocabulary_size+1,seq_len)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 25, 25)            67750     
_________________________________________________________________
lstm_3 (LSTM)                (None, 25, 75)            30300     
_________________________________________________________________
lstm_4 (LSTM)                (None, 75)                45300     
_________________________________________________________________
dense_3 (Dense)              (None, 150)               11400     
_________________________________________________________________
dense_4 (Dense)              (None, 2710)              409210    
Total params: 563,960
Trainable params: 563,960
Non-trainable params: 0
_________________________________________________________________


In [49]:
from pickle import dump,load

In [53]:
model.fit(X,y,batch_size=128,epochs=2,verbose=1)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fbc9b705fd0>

In [54]:
model.save("my_mobydick_model.h5")

In [55]:
# Pickle the fitted tokenizer next to the saved model; the with-block makes sure
# the data is flushed and the file handle is closed.
with open("my_simpletokenizer","wb") as tokenizer_file:
    dump(tokenizer,tokenizer_file)

In [56]:
from keras.preprocessing.sequence import pad_sequences

In [70]:
def generate_text(model,tokenizer,seq_len,seed_text,num_gen_words):
    """Generate ``num_gen_words`` words by repeatedly predicting the next word.

    Args:
        model: Trained Keras model whose ``predict`` output holds one score
            per word index.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``index_word``.
        seq_len: Number of tokens the model expects as input.
        seed_text: Space-separated words used as the starting context.
        num_gen_words: How many words to generate.

    Returns:
        The generated words joined by single spaces (the seed text itself is
        not included).

    Raises:
        KeyError: If the predicted index has no entry in
            ``tokenizer.index_word``.
    """
    output_text = []
    input_text = seed_text

    for _ in range(num_gen_words):
        # Re-encode the running text (seed plus everything generated so far).
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]

        # Keep only the most recent seq_len ids so the input matches the model.
        pad_encoded = pad_sequences([encoded_text],maxlen = seq_len,truncating="pre")

        # argmax over predict() gives the same class index as
        # Sequential.predict_classes, which recent Keras releases no longer have.
        pred_probs = model.predict(pad_encoded,verbose=0)
        pred_word_ind = int(np.argmax(pred_probs,axis=-1)[0])

        pred_word = tokenizer.index_word[pred_word_ind]

        # Feed the prediction back in as context for the next step.
        input_text += " " + pred_word
        output_text.append(pred_word)

    return " ".join(output_text)

In [59]:
import random
random.seed(101)
# randrange excludes its upper bound, so the pick is always a valid index into
# text_sequences (randint includes both endpoints and could return len(...)).
random_pick = random.randrange(len(text_sequences))

In [60]:
random_seed_text = text_sequences[random_pick]

In [61]:
random_seed_text

['and',
 'throwing',
 'the',
 'clothes',
 'to',
 'one',
 'side',
 'he',
 'really',
 'did',
 'this',
 'in',
 'not',
 'only',
 'a',
 'civil',
 'but',
 'a',
 'really',
 'kind',
 'and',
 'charitable',
 'way',
 'i',
 'stood',
 'looking']

In [62]:
seed_text = " ".join(random_seed_text)

In [63]:
seed_text

'and throwing the clothes to one side he really did this in not only a civil but a really kind and charitable way i stood looking'

In [71]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=25)

'the the the the the the the the the the the the the the the the the the the the the the the the the'

In [72]:
from keras.models import load_model

In [74]:
model = load_model("epochBIG.h5")

In [76]:
# NOTE(review): pickle can execute arbitrary code while loading — only load
# tokenizer files from a source you trust.
with open("epochBIG","rb") as tokenizer_file:
    tokenizer = load(tokenizer_file)

In [77]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=25)

"at that stubb ' my frame roman eyes of his own power for the whale 's grain to wrenched progeny for a fever drawn up"