In [1]:
# imports
import numpy as np
import spacy
import glob
import os
# TF Keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, GRU
from tensorflow.keras.utils import to_categorical
# pickle to save stuff
from pickle import dump

In [2]:
# grab the texts
# CHANGE THE DIRECTORY OF WHERE THE TEXTS ARE
texts = glob.glob(os.path.join('../shakecraft/stories/',
                              'hp*.txt'))

In [3]:
# combine all the texts into one
txt = []

for tmp_txt in texts:
    with open(tmp_txt,'r') as f:
        txt.append(f.read())
complete_txt = " ".join(txt)

# it's a bit long, shave the text down
# MAYBE TRY TO DIVIDE BY 2 OR 3
c_text = complete_txt[:len(complete_txt)//3]


In [4]:
print(len(c_text))

637966


In [5]:
# load spacy object and set max length the same as our text
nlp = spacy.load('en_core_web_lg')
nlp.max_length = len(c_text)

In [6]:
# filter out some garbage from the text
the_filter = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n—\n\n\n     ’”“   '

In [7]:
# lowercase everything and get rid of punctuation according to the_filter
def separate_punctuation(doc_text):
    return [token.text.lower() for token in nlp(doc_text) if token.text not in the_filter]

In [8]:
# get the tokens
tokens = separate_punctuation(c_text)

In [9]:
# set the sequence length and make a list of sequences
# seq_len = 50 + 1
seq_len = 200 + 1
txt_seqs = []

# the sequences will look something like this:
# sequence 1: [a b c d]
# sequence 2: [b c d e]
# sequence 3: [c d e f]
for i in range(seq_len,len(tokens)):
    seq = tokens[i-seq_len:i]
    txt_seqs.append(seq)

In [10]:
# make a tensorflow tokenizer and turn the texts into sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(txt_seqs)
seqs = tokenizer.texts_to_sequences(txt_seqs)

In [11]:
# vocabulary size
vocab_size = len(tokenizer.word_counts)

In [12]:
print(vocab_size)

12046


In [13]:
# turn the sequences into numpy arrays
seqs = np.array(seqs)

In [15]:
# function to create the model
# embedding -> gated recurrent unit (GRU) -> GRU -> dense -> dense
def create_model(voc_size,s_len):
    model = Sequential()
    model.add(Embedding(voc_size,64,input_length=s_len))
    model.add(GRU(128,return_sequences=True))
    model.add(GRU(128))
    model.add(Dense(128,activation='relu'))
    model.add(Dense(voc_size,activation='softmax'))
    
    model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
    
    model.summary()
    
    return model

In [16]:
# turn the sequences into X and y
# e.g. [a b c d e] will become [a b c d] and [e]
X = seqs[:,:-1]
y = seqs[:,-1]

# one-hot-encode y 
y = to_categorical(y,num_classes=vocab_size+1)

In [17]:
# set the sequence length
seq_len = X.shape[1]

In [18]:
# create the model
model = create_model(vocab_size+1,seq_len)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 64)           771008    
                                                                 
 gru (GRU)                   (None, 200, 128)          74496     
                                                                 
 gru_1 (GRU)                 (None, 128)               99072     
                                                                 
 dense (Dense)               (None, 128)               16512     
                                                                 
 dense_1 (Dense)             (None, 12047)             1554063   
                                                                 
Total params: 2,515,151
Trainable params: 2,515,151
Non-trainable params: 0
_________________________________________________________________


In [19]:
# fit the model
model.fit(X,y,batch_size=128,epochs=125,verbose=1)

Epoch 1/125
Epoch 2/125
Epoch 3/125
Epoch 4/125
Epoch 5/125
Epoch 6/125
Epoch 7/125
Epoch 8/125
Epoch 9/125
Epoch 10/125
Epoch 11/125
Epoch 12/125
Epoch 13/125
Epoch 14/125
Epoch 15/125
Epoch 16/125
Epoch 17/125
Epoch 18/125
Epoch 19/125
Epoch 20/125
Epoch 21/125
Epoch 22/125
Epoch 23/125
Epoch 24/125
Epoch 25/125
Epoch 26/125
Epoch 27/125
Epoch 28/125
Epoch 29/125
Epoch 30/125
Epoch 31/125
Epoch 32/125
Epoch 33/125
Epoch 34/125
Epoch 35/125
Epoch 36/125
Epoch 37/125
Epoch 38/125
Epoch 39/125
Epoch 40/125
Epoch 41/125
Epoch 42/125
Epoch 43/125
Epoch 44/125
Epoch 45/125
Epoch 46/125
Epoch 47/125
Epoch 48/125
Epoch 49/125
Epoch 50/125
Epoch 51/125
Epoch 52/125
Epoch 53/125
Epoch 54/125
Epoch 55/125
Epoch 56/125
Epoch 57/125
Epoch 58/125
Epoch 59/125
Epoch 60/125
Epoch 61/125
Epoch 62/125
Epoch 63/125
Epoch 64/125
Epoch 65/125
Epoch 66/125
Epoch 67/125
Epoch 68/125
Epoch 69/125
Epoch 70/125
Epoch 71/125
Epoch 72/125
Epoch 73/125
Epoch 74/125
Epoch 75/125
Epoch 76/125
Epoch 77/125
Epoch 78

<keras.callbacks.History at 0x7f695cc2ca60>

In [20]:
# save our model, tokenizer, and text sequences
model.save('hp_GRU_E125_200seq.h5')
# with open('hp_GRU_E15_tokenizer.pickle', 'wb') as f:
#     dump(tokenizer)
# with open('hp_GRU_E15_txt_seqs.pickle','wb') as f:
#     dump(txt_seqs)
dump(tokenizer,open('hp_GRU_E125_200seq_tokenizer.pickle','wb'))
dump(txt_seqs,open('hp_GRU_E125_200seq_txt_seqs.pickle','wb'))