In [1]:
import spacy
import numpy as np
import collections as cl
import tensorflow as tf
import tensorflow.keras as ks
import tensorflow.keras.callbacks

In [2]:
# Load the raw corpus. The file is read as bytes inside a context manager so
# the handle is always closed, then decoded explicitly as UTF-8; this yields a
# unicode string on both Python 2 and Python 3 (no reliance on the
# Python-2-only `unicode` builtin).
with open("../data/rowling.txt", "rb") as corpus_file:
    txt = corpus_file.read().decode("UTF-8")

In [3]:
# Load the medium English spaCy model.
nlp = spacy.load("en_core_web_md")
# Raise the pipeline's text-length limit to twice the corpus length so the
# whole corpus can be passed to `nlp` in a single call.
nlp.max_length = len(txt) * 2

In [4]:
# Process the full corpus in one call with the tagger, parser and NER
# components disabled.
doc = nlp(
    txt,
    disable=["tagger", "parser", "ner"],
)

## Building vocabularies

In [11]:
%%time
norms = set([token.norm_ for token in doc])
shapes = set([token.shape_ for token in doc])
whitespaces = set([token.whitespace_ for token in doc])

CPU times: user 24.2 s, sys: 1.09 s, total: 25.3 s
Wall time: 25.5 s


In [12]:
# Map every vocabulary entry to the column it occupies in the one-hot target.
# The sets are sorted first: iterating a bare set yields an order that depends
# on string hashing, so the column assigned to a given token could change
# between kernel sessions and make saved models/targets incomparable.
norm2row = {norm: row for row, norm in enumerate(sorted(norms))}
shape2row = {shape: row for row, shape in enumerate(sorted(shapes))}
whitespace2row = {ws: row for row, ws in enumerate(sorted(whitespaces))}

In [13]:
# Width of each one-hot target / output layer: vocabulary size plus one.
# NOTE(review): the *2row maps assign columns 0..len-1, so the extra last
# column is never set — presumably reserved for unknown tokens; confirm.
n_norms, n_shapes, n_whitespaces = (
    len(vocab) + 1 for vocab in (norms, shapes, whitespaces)
)

In [14]:
def data_gen(doc, length=128, batch_size=100):
    """Endlessly yield ``(inputs, targets)`` batches for next-token prediction.

    Each sample is a sliding window of ``length`` consecutive tokens from
    ``doc``; its target is the token immediately following the window,
    one-hot encoded by its ``norm_`` through the module-level ``norm2row``
    mapping into a row of width ``n_norms``.

    Parameters
    ----------
    doc : sequence of tokens
        Must support ``len()``, integer indexing and slicing; every token
        must expose ``.vector`` and ``.norm_``.
    length : int
        Number of tokens in each input window.
    batch_size : int
        Number of samples per yielded batch.

    Yields
    ------
    (numpy.ndarray, numpy.ndarray)
        ``inputs`` of shape ``(batch_size, length, vector_dim)`` and
        ``targets`` of shape ``(batch_size, n_norms)`` with exactly one 1 per
        row. Shape and whitespace targets are not produced.

    Raises
    ------
    ValueError
        On the first ``next()`` if ``length`` or ``batch_size`` is smaller
        than 1, or if ``doc`` is too short to contain one window plus its
        target token (the generator would otherwise loop forever without
        yielding).
    """
    if length < 1 or batch_size < 1:
        raise ValueError("length and batch_size must both be >= 1")

    # Window starting at `start` needs doc[start + length] as its target, so
    # valid starts are 0 .. len(doc) - length - 1 inclusive.
    n_windows = len(doc) - length
    if n_windows < 1:
        raise ValueError(
            "doc has %d tokens; need at least %d for one window and its target"
            % (len(doc), length + 1)
        )

    windows = []
    targets = np.zeros((batch_size, n_norms))

    while True:
        for start in range(n_windows):
            # The target row is the sample's position inside the current
            # batch. It must not be derived from `start`: a partially filled
            # batch is carried over when the corpus wraps around, and
            # batch_size is not necessarily 100.
            row = len(windows)

            windows.append(
                np.stack([token.vector for token in doc[start:start + length]])
            )
            targets[row, norm2row[doc[start + length].norm_]] = 1

            if len(windows) == batch_size:
                yield np.stack(windows), targets

                # Fresh buffers so the arrays already handed to the consumer
                # are never mutated by later iterations.
                windows = []
                targets = np.zeros((batch_size, n_norms))

In [15]:
# Build the endless batch generator over the corpus (default window/batch size).
d = data_gen(doc)
# Pull one batch as a smoke test. `next(d)` works on Python 2 and 3 (the
# `.next()` method exists only on Python 2). The targets get their own name so
# the global `norms` vocabulary set, which the vocabulary and layer-size cells
# read, is not overwritten by an array.
vec, norm_targets = next(d)

# First model

In [16]:
# Model input: one window of 128 tokens (data_gen's default `length`), each a
# 300-wide vector.
# NOTE(review): 300 is presumably the en_core_web_md vector width — confirm.
input_vector = ks.Input(shape=(128, 300))

In [17]:
# Two stacked 1-D convolutions (64 filters, kernel width 3, ReLU) followed by
# a 64-unit LSTM that reduces the sequence to a single vector.
x = ks.layers.Conv1D(filters=64, kernel_size=3, activation="relu")(input_vector)
x = ks.layers.Conv1D(filters=64, kernel_size=3, activation="relu")(x)
x = ks.layers.LSTM(units=64, activation="relu")(x)

In [18]:
# Output layer: one unit per norm-vocabulary column. The targets produced by
# data_gen are one-hot (exactly one class per sample) and the model is
# compiled with categorical cross-entropy, so the outputs must form a
# probability distribution over the classes — softmax, not independent
# per-class sigmoids.
prediction = ks.layers.Dense(n_norms, activation="softmax")(x)

In [19]:
# Tie the input tensor and the prediction head together into a trainable model.
model = ks.Model(inputs=input_vector, outputs=prediction)

In [20]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 128, 300)          0         
_________________________________________________________________
conv1d (Conv1D)              (None, 126, 64)           57664     
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 124, 64)           12352     
_________________________________________________________________
lstm (LSTM)                  (None, 64)                33024     
_________________________________________________________________
dense (Dense)                (None, 25616)             1665040   
Total params: 1,768,080
Trainable params: 1,768,080
Non-trainable params: 0
_________________________________________________________________


In [21]:
# Configure training: Adam optimizer, categorical cross-entropy against the
# one-hot norm targets.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
)

In [31]:
# TensorBoard logging callback writing to ../logs.
# NOTE(review): per the Keras docs `write_grads` only takes effect when
# `histogram_freq` is greater than 0 (it is left at its default here), so
# gradient histograms are probably not being written — confirm.
tensorboad_cb = ks.callbacks.TensorBoard(
    log_dir="../logs",
    batch_size=100,
    write_grads=True,
)

In [33]:
# Train from the endless generator `d`: 500 batches count as one epoch, for
# 50 epochs, logging through the TensorBoard callback.
history = model.fit_generator(
    d,
    steps_per_epoch=500,
    epochs=50,
    callbacks=[tensorboad_cb],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50

KeyboardInterrupt: 