In [1]:
import spacy
import numpy as np

import tensorflow as tf
import tensorflow.keras as ks
import tensorflow.keras.callbacks

import pydot
from tensorflow.keras.utils import plot_model

In [2]:
# Read the corpus as raw bytes and decode it explicitly: this works the same
# on Python 2 and Python 3, and the context manager closes the file handle
# as soon as the text has been read.
with open("../data/rowling.txt", "rb") as corpus_file:
    txt = corpus_file.read().decode("UTF-8")

In [3]:
# Load the spaCy pipeline whose token vectors feed the model, then raise its
# maximum accepted text length to twice the corpus size so the whole corpus
# can be processed in a single call.
nlp = spacy.load("en_core_web_md")
nlp.max_length = len(txt) * 2

In [4]:
# Process the whole corpus with the tagger, parser and NER components
# disabled; the cells below only read token norms, shapes, whitespace and
# vectors.
doc = nlp(txt, disable=['tagger', 'parser', 'ner'])

## Setting target vocabularies

In [5]:
# Frequency tables keyed by each token's norm_, shape_ and whitespace_ value.
norms = {}
shapes = {}
whitespaces = {}

for token in doc:
    # dict.get(..., 0) starts a previously unseen key at zero before counting it
    norms[token.norm_] = norms.get(token.norm_, 0) + 1
    shapes[token.shape_] = shapes.get(token.shape_, 0) + 1
    whitespaces[token.whitespace_] = whitespaces.get(token.whitespace_, 0) + 1

In [6]:
# Keep only values seen more than `min_count` times; anything rarer is left
# out of the target vocabularies. `.items()` is used instead of the
# Python-2-only `.iteritems()` so the cell also runs on Python 3.
min_count = 10
norms_reduced = {norm: count for norm, count in norms.items() if count > min_count}
shapes_reduced = {shape: count for shape, count in shapes.items() if count > min_count}
whitespaces_reduced = {ws: count for ws, count in whitespaces.items() if count > min_count}

In [7]:
# Sizes of the pruned vocabularies; n_norms is the width of the model's
# softmax output layer.
n_norms, n_shapes, n_whitespaces = (
    len(vocab) for vocab in (norms_reduced, shapes_reduced, whitespaces_reduced)
)

In [8]:
# Give every retained value a dense integer id equal to its position in the
# dict's iteration order. The norm2row ids are the labels produced by the
# data generators.
norm2row = dict(zip(norms_reduced, range(len(norms_reduced))))
shape2row = dict(zip(shapes_reduced, range(len(shapes_reduced))))
whitespace2row = dict(zip(whitespaces_reduced, range(len(whitespaces_reduced))))

## Data generator

In [None]:
def data_gen(doc, length=128, batch_size=100):
    """Yield (inputs, targets) batches by sliding a window over `doc`, forever.

    Each sample pairs the vectors of `length` consecutive tokens with the
    `norm2row` id of the token that immediately follows them. Windows whose
    following token is not in `norm2row` (rare words) are skipped. Samples
    left over at the end of a pass are carried into the next pass, so every
    yielded batch holds exactly `batch_size` samples.

    Parameters
    ----------
    doc : sequence of tokens
        Must support len(), slicing and integer indexing; items must expose
        `.vector` and `.norm_`.
    length : int
        Number of context tokens per sample.
    batch_size : int
        Number of samples per yielded batch.

    Yields
    ------
    (xs, norms)
        `xs` stacks `batch_size` windows of token vectors along a new leading
        axis; `norms` is a 1-D array with the `batch_size` target ids.

    Raises
    ------
    ValueError
        If `doc` is too short to hold one window plus its target, or if a
        complete pass over `doc` produces no usable sample. Either situation
        would otherwise loop forever without yielding anything.
    """
    # Window i covers doc[i:i+length] and predicts doc[i+length], so the last
    # valid start index is len(doc) - length - 1.
    n_windows = len(doc) - length
    if n_windows <= 0:
        raise ValueError(
            "doc has %d tokens; need at least %d for length=%d"
            % (len(doc), length + 1, length)
        )

    xs = []
    norms = []

    while True:
        found_sample = False
        for i in range(n_windows):
            # Resolve the target first so rare words are skipped before
            # paying for the vector stack.
            norm_id = norm2row.get(doc[i + length].norm_)
            if norm_id is None:  # rare words
                continue

            found_sample = True
            xs.append(np.stack([token.vector for token in doc[i:(i + length)]]))
            norms.append(norm_id)

            if len(xs) == batch_size:
                yield np.stack(xs), np.stack(norms)

                xs = []
                norms = []

        if not found_sample:
            raise ValueError("no window in doc has a target present in norm2row")

In [9]:
def data_gen_randomized(doc, length=128, batch_size=100):
    """Yield (inputs, targets) batches built from randomly placed windows, forever.

    Each sample pairs the vectors of `length` consecutive tokens, starting at
    a uniformly drawn position, with the `norm2row` id of the token that
    immediately follows them. Draws whose following token is not in
    `norm2row` (rare words) are discarded and redrawn.

    Parameters
    ----------
    doc : sequence of tokens
        Must support len(), slicing and integer indexing; items must expose
        `.vector` and `.norm_`.
    length : int
        Number of context tokens per sample.
    batch_size : int
        Number of samples per yielded batch.

    Yields
    ------
    (xs, norms)
        `xs` stacks `batch_size` windows of token vectors along a new leading
        axis; `norms` is a 1-D array with the `batch_size` target ids.

    Raises
    ------
    ValueError
        If `doc` is too short to hold one window plus its target.

    Note: if no window has a target present in `norm2row`, the generator
    keeps drawing without ever yielding.
    """
    # Window i covers doc[i:i+length] and predicts doc[i+length]; randint's
    # upper bound is exclusive, so this makes the last window reachable.
    n_windows = len(doc) - length
    if n_windows <= 0:
        raise ValueError(
            "doc has %d tokens; need at least %d for length=%d"
            % (len(doc), length + 1, length)
        )

    xs = []
    norms = []

    while True:
        i = np.random.randint(0, n_windows)

        # Resolve the target first so rare words are skipped before paying
        # for the vector stack.
        norm_id = norm2row.get(doc[i + length].norm_)
        if norm_id is None:  # rare words
            continue

        xs.append(np.stack([token.vector for token in doc[i:(i + length)]]))
        norms.append(norm_id)

        if len(xs) == batch_size:
            yield np.stack(xs), np.stack(norms)

            xs = []
            norms = []

In [10]:
# Smoke-test the generator: draw a few batches and show their shapes.
# next(d) and the formatted print call run on both Python 2 and Python 3
# and print the same "<x shape> <y shape>" line in each.
d = data_gen_randomized(doc)
for _ in range(5):
    x, y = next(d)
    print("%s %s" % (x.shape, y.shape))

(100, 128, 300) (100,)
(100, 128, 300) (100,)
(100, 128, 300) (100,)
(100, 128, 300) (100,)
(100, 128, 300) (100,)


## Another model

In [97]:
# Model hyper-parameters.
sequence_length = 128  # context tokens per sample (same as the generators' default `length`)
n_lstm = 3             # number of stacked LSTM layers
n_hidden_lstm = 64     # units in each LSTM layer and in the input projection

In [98]:
# One sample is a window of `sequence_length` token vectors with 300 values
# each (the width of the batches produced by the data generator).
input_sequence = ks.Input(shape=(sequence_length, 300))

# project initial word vectors onto same dimensions as LSTM
x = ks.layers.Dense(n_hidden_lstm)(input_sequence)

# add LSTMs with residual connections
for i in range(n_lstm - 1):
    resid = ks.layers.LSTM(n_hidden_lstm, return_sequences=True, name="lstm_%i" % i)(x)
    x = ks.layers.Add()([x, resid])

# The final LSTM returns only its last output, so the skip path is reduced to
# its last timestep before the two are added. Indexing with -1 (instead of a
# hard-coded 127) keeps this correct if sequence_length changes.
resid = ks.layers.LSTM(n_hidden_lstm, return_sequences=False, name="lstm_%i" % (n_lstm - 1))(x)
x = ks.layers.Lambda(lambda seq: seq[:, -1, :])(x)
x = ks.layers.Add()([x, resid])

# add dense connection for predictions
prediction = ks.layers.Dense(n_norms, activation="softmax")(x)

In [99]:
# Wrap the graph into a model: one sequence input, one softmax output.
m = ks.Model(inputs=[input_sequence], outputs=[prediction])

In [100]:
# The generators emit integer class ids (not one-hot rows), hence the sparse
# variant of the cross-entropy loss.
m.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy")

In [101]:
# Print the layer table: output shapes, parameter counts and connections.
m.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 128, 300)     0                                            
__________________________________________________________________________________________________
dense_7 (Dense)                 (None, 128, 64)      19264       input_5[0][0]                    
__________________________________________________________________________________________________
lstm_0 (LSTM)                   (None, 128, 64)      33024       dense_7[0][0]                    
__________________________________________________________________________________________________
add_10 (Add)                    (None, 128, 64)      0           dense_7[0][0]                    
                                                                 lstm_0[0][0]                     
__________

In [102]:
# Train on batches drawn from the generator `d`: 5 epochs of 10 batches each.
# NOTE(review): in the recorded run this call aborted during epoch 1 with the
# ValueError shown below; the cause is not visible in this notebook — TODO
# investigate.
m.fit_generator(d, steps_per_epoch=10, epochs=5)

Epoch 1/5


ValueError: Operation u'VarIsInitializedOp_247' has been marked as not fetchable.

## Rambler

In [86]:
def ramble(model, string, nlp, ramble_length = 128):
    """Unfinished text-generation helper: prepares its inputs, returns nothing.

    Tokenizes `string` with `nlp` (tagger, parser and NER disabled) and
    allocates a zero-filled array of shape
    (1, model.input_shape[1], model.input_shape[2]). Neither result is used
    yet, and `ramble_length` is accepted but ignored.
    """
    input_shape = model.input_shape
    seq_length = input_shape[1]

    string = nlp(string, disable=['tagger', 'parser', 'ner'])

    # initialize input matrix
    x = np.zeros((1, seq_length, input_shape[2]))
    

In [87]:
def ramble_once(model, string, nlp):
    """Encode `string` as a single model input and return the model's prediction.

    The string is tokenized with `nlp` (tagger, parser and NER disabled). The
    token vectors are written right-aligned into a zero-filled array of shape
    (1, model.input_shape[1], model.input_shape[2]): shorter inputs are
    left-padded with zero rows, longer inputs keep only their last
    `model.input_shape[1]` tokens.

    Parameters
    ----------
    model : object exposing `input_shape` and `predict`.
    string : text to encode.
    nlp : callable returning a sliceable sequence of tokens with `.vector`.

    Returns
    -------
    Whatever `model.predict` returns for the encoded input.
    """
    tokens = nlp(string, disable=['tagger', 'parser', 'ner'])
    n_tokens = len(tokens)
    seq_length = model.input_shape[1]

    if n_tokens > seq_length:
        # Too many tokens: keep only the most recent seq_length of them.
        token_start = n_tokens - seq_length
        seq_start = 0
    else:
        # Too few (or exactly enough): leave the leading rows as zeros.
        token_start = 0
        seq_start = seq_length - n_tokens

    x = np.zeros((1, seq_length, model.input_shape[2]))
    for i, token in enumerate(tokens[token_start:n_tokens]):
        x[0, seq_start + i, :] = token.vector

    return model.predict(x)

In [88]:
# `model` is a second name for the network `m`; both refer to the same object.
model = m

In [89]:
# Axis 1 of the model's input shape: the number of tokens per input sequence.
model.input_shape[1]

128

In [90]:
# Short prompt used to try out the input encoding in the following cells.
string = u"And then Harry saw"

In [91]:
# Encode the prompt as one model input: zero rows first, then the token
# vectors flush against the end of the sequence axis.
tokens = nlp(string, disable=['tagger', 'parser', 'ner'])
x = np.zeros((1, model.input_shape[1], model.input_shape[2]))
n_padding = x.shape[1] - len(tokens)

# enumerate starts counting at n_padding, i.e. at the first non-padding row
for row, token in enumerate(tokens, n_padding):
    x[0, row, :] = token.vector

In [92]:
# Encode `string` as one model input, right-aligned along the sequence axis:
# short inputs are left-padded with zero rows, long inputs keep only their
# last `seq_length` tokens. All offsets use `seq_length`, read from the model
# itself, rather than the separate global `sequence_length`.
tokens = nlp(string, disable=['tagger', 'parser', 'ner'])
n_tokens = len(tokens)
seq_length = model.input_shape[1]

if n_tokens > seq_length:
    token_start = n_tokens - seq_length
    seq_start = 0
else:
    token_start = 0
    seq_start = seq_length - n_tokens

x = np.zeros((1, seq_length, model.input_shape[2]))
for i, token in enumerate(tokens[token_start:n_tokens]):
    x[0, seq_start+i, :] = token.vector

In [93]:
# Shape of the tail of the encoded input, from row 123 to the end.
x[0, 123 :].shape

(5, 300)

In [94]:
# A much longer prompt.
# NOTE(review): judging by the execution counts, `x` is not rebuilt from this
# string before the `m.predict(x)` call that follows, so that call still uses
# the encoding of the previous prompt.
string = u"I am on a research month, which is really nice because the research month is good and everything is the best and hello I am still typing. So many words are being written on this computer. I went to the store to buy some cat food, but all they had was dog food. And I'm not going to feed my cat dog food because she's not a dog. I bought a Christmas tree last week and we decorated the tree with lights that looks really really good. I can't wait for christmas. I hope I get lots of good presents this year. I already bought many presents for my family including wrestling tikets, books and a subscription to a podcast streaming service. I hope my family likes the presents. Then we are going to stay in palm beach for a few days with my wife's family"

In [95]:
# NOTE(review): in the recorded run this call failed with the ValueError shown
# below (the same "not fetchable" error as the training call); the cause is
# not visible in this notebook.
m.predict(x)

ValueError: Operation u'VarIsInitializedOp_178' has been marked as not fetchable.

In [120]:
# Scratch check: the literal 1 is an int.
isinstance(1, int)

True

In [121]:
# NOTE(review): rebinding `d` here replaces the batch generator that the
# `m.fit_generator(d, ...)` cell consumes; re-running that cell after this one
# would pass this dict instead.
d = {"a": 1, "b": 2, "c": 3}

In [123]:
# The recorded output shows the keys come back as a list that is not in
# insertion order.
d.keys()

['a', 'c', 'b']

In [1]:
# Small list used by the scratch cells below.
a = [1, 2, 3]

In [4]:
# Double every element. A list comprehension displays a list on Python 2 and
# Python 3 alike, whereas map() returns a lazy iterator on Python 3.
[2 * value for value in a]

[2, 4, 6]

In [7]:
# Keep the elements equal to 2. A list comprehension displays a list on
# Python 2 and Python 3 alike, whereas filter() returns a lazy iterator on
# Python 3.
[value for value in a if value == 2]

[2]

In [8]:
# NOTE(review): `first` is not defined anywhere in this notebook and is not a
# Python builtin, so this cell raises the NameError shown below. `a[0]` or
# `next(iter(a))` returns the first element.
first(a)

NameError: name 'first' is not defined

In [2]:
# all() is True only when every element of the iterable is truthy.
all([True, True])

True

In [3]:
# Flag which entries are None. A list comprehension displays a list on
# Python 2 and Python 3 alike, whereas map() returns a lazy iterator on
# Python 3.
[value is None for value in [None, 1, 4]]

[True, False, False]