In [1]:
from theano.sandbox import cuda

Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)


In [2]:
%matplotlib inline
import utils; reload(utils)
from utils import *
from __future__ import division, print_function

Using Theano backend.


In [3]:
path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
text = open(path).read()
print('corpus length:', len(text))

corpus length: 600901


In [4]:
# Characters are sorted
chars = sorted(list(set(text)))
vocab_size = len(chars)+1
print('total chars:', vocab_size)

total chars: 86


In [5]:
print(chars[78])

z


In [6]:
# At index 0, insert a line break
chars.insert(0, "\0")

In [7]:
chars[79]

'z'

In [8]:
# At each character, join characters from index 1 to the 6th last index(79).
''.join(chars[1:-6])

'\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz'

In [9]:
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

In [10]:
# The index of each character is packed as a dictionary
# char_indices

In [11]:
# Indices are mapped to characters
# indices_char

In [12]:
# depending on the char_indices defined above, the entire text is parsed, and 
# the character is replaced with the index.
idx = [char_indices[c] for c in text]

In [14]:
# The indices of the first 10 characters are such
idx[:10]
# print(idx[600901-4])

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]

In [15]:
# As defined above, we have a mechanism to convert characters to indices and then back.
''.join(indices_char[i] for i in idx[:40])

'PREFACE\n\n\nSUPPOSING that Truth is a woma'

In [16]:
# 3 character model. Given 3 characters, predict the fourth
cs=3
# xrange has 3 params : the start index, the end index and the incrementation of each element position. 
c1_dat = [idx[i] for i in xrange(0, len(idx)-1-cs, cs)]
c2_dat = [idx[i+1] for i in xrange(0, len(idx)-1-cs, cs)]
c3_dat = [idx[i+2] for i in xrange(0, len(idx)-1-cs, cs)]
c4_dat = [idx[i+3] for i in xrange(0, len(idx)-1-cs, cs)]

In [17]:
print(len(c1_dat))
print(len(c2_dat))
print(len(c3_dat))
print(type(c1_dat))

200299
200299
200299
<type 'list'>


In [18]:
# c1_dat

In [19]:
# c2_dat

In [20]:
# c3_dat

In [21]:
# c4_dat

In [22]:
# Inputs are stacked
x1 = np.stack(c1_dat[:-2])
x2 = np.stack(c2_dat[:-2])
x3 = np.stack(c3_dat[:-2])

In [23]:
len(x3)

200297

In [24]:
# This is the output.
y = np.stack(c4_dat[:-2])

In [25]:
len(y)

200297

In [26]:
# First four values of each of the above defined input numpy arrays
x1[:4], x2[:4], x3[:4]

(array([40, 30, 29,  1]), array([42, 25,  1, 43]), array([29, 27,  1, 45]))

In [27]:
# The first four values of the output array
y[:4]

array([30, 29,  1, 40])

In [28]:
# The input shape and the output shapes are the same as expected. We have the same number of outputs as inputs!
x1.shape, y.shape

((200297,), (200297,))

In [29]:
# The number of latent factors(size of the embedding matrix.)
n_fac = 42

In [30]:
vocab_size

86

In [31]:
def embedding_input(name, n_in, n_out):
    inp = Input(shape=(1,), dtype='int64', name=name)
    # Embeddings turn indices into dense vectors.
    # Thus, here, we are passing in n_in = 86 and outputing a dense vector of 42 latent factors
    # and then flattening that
#     print(inp)
    emb = Embedding(n_in, n_out, input_length=1)(inp)
#     print(emb)
    return inp, Flatten()(emb)

In [32]:
# Creating embeddings for each of the inputs c1, c2 & c3
# 86 index values are given 42 latent factors
c1_in, c1 = embedding_input('c1', vocab_size, n_fac)
c2_in, c2 = embedding_input('c2', vocab_size, n_fac)
c3_in, c3 = embedding_input('c3', vocab_size, n_fac)

In [33]:
# The hidden state is given an arbitrary size of 256
n_hidden = 256

In [34]:
# This is the input from an input matrix to a hidden state
# dense_in will produce an output shape of (*, 256) and apply an activation of relu
dense_in = Dense(n_hidden, activation='relu')

In [35]:
# The activation of the first hidden layer is simply the dense_in function
# applied to the result of the emebedding of the first character.
c1_hidden = dense_in(c1)

In [36]:
# This is the input from the first hidden state to the next hidden state.
# The output of the first hidden layer is also 256 dimensions.
dense_hidden = Dense(n_hidden, activation='tanh')

In [37]:
c2_dense = dense_in(c2)
hidden_2 = dense_hidden(c1_hidden)
c2_hidden = merge([c2_dense, hidden_2])

In [38]:
c3_dense = dense_in(c3)
hidden_3 = dense_hidden(c2_hidden)
c3_hidden = merge([c3_dense, hidden_3])

In [39]:
# The hidden state is then transformed to an output(prediction of vocab_size dimension)
dense_out = Dense(vocab_size, activation='softmax')

In [40]:
c4_out = dense_out(c3_hidden)

In [41]:
# The model is defined by the functional API with an input of an list of 3 elements and and output of one
model = Model([c1_in, c2_in, c3_in], c4_out)

In [42]:
model.compile(loss='sparse_categorical_crossentropy', optimizer = Adam())

In [43]:
model.optimizer.lr = 0.000001

In [44]:
# We pass in the numpy arrays of inputs & outputs to fit the model to the data.
model.fit([x1,x2,x3], y, batch_size=64, nb_epoch=4)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f139677b790>

In [50]:
model.optimizer.lr = 0.01
model.fit([x1,x2,x3], y, batch_size=64, nb_epoch=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f138ce2bc10>

In [51]:
# def get_next(inp):
#     idxs = [char_indices[c] for c in inp]
#     arrs = [np.array(i)[np.newaxis] for i in idxs]
#     p = model.predict(arrs)
#     i = np.argmax(p)
#     return chars[i]

def get_next(inp):
    idxs = [np.array(char_indices[c])[np.newaxis] for c in inp]
    p = model.predict(idxs)
    return chars[np.argmax(p)]

In [63]:
get_next('rai')

't'

In [54]:
get_next(' th')

'e'

In [64]:
# FIRST RNN
cs = 8
# For ranges of indices 0-7 and outputs the 8th character idx 
c_in_dat = [[idx[i+n] for i in xrange(0, len(idx)-1-cs, cs)] for n in range(cs)]

In [73]:
len(c_in_dat[0])

75112

In [65]:
# Labels for the model
c_out_dat = [idx[i+cs] for i in xrange(0, len(idx)-1-cs, cs)]

In [66]:
xs = [np.stack(c[:-2]) for c in c_in_dat]

In [74]:
print(len(xs[0]))

75110


In [75]:
len(xs), xs[0].shape

(8, (75110,))

In [76]:
y = np.stack(c_out_dat[:-2])

In [77]:
[xs[n][:cs] for n in range(cs)]

[array([40,  1, 33,  2, 72, 67, 73,  2]),
 array([42,  1, 38, 44,  2,  9, 61, 73]),
 array([29, 43, 31, 71, 54,  9, 58, 61]),
 array([30, 45,  2, 74,  2, 76, 67, 58]),
 array([25, 40, 73, 73, 76, 61, 24, 71]),
 array([27, 40, 61, 61, 68, 54,  2, 58]),
 array([29, 39, 54,  2, 66, 73, 33,  2]),
 array([ 1, 43, 73, 62, 54,  2, 72, 67])]

In [78]:
y[:cs]

array([ 1, 33,  2, 72, 67, 73,  2, 68])

In [80]:
# Number of latent factors
n_fac = 42

In [81]:
def embedding_input(name, n_in, n_out):
    inp = Input(shape=(1,), dtype='int64', name=name+'_in')
    emb = Embedding(n_in, n_out, input_length=1, name=name+'_emb')(inp)
    return inp, Flatten()(emb)

In [82]:
# Creating embedding inputs for all the input characters in the range cs = 8
c_ins = [embedding_input('c'+str(n), vocab_size, n_fac) for n in range(cs)]

In [83]:
n_hidden = 256

In [87]:
dense_in = Dense(n_hidden, activation='relu')
dense_hidden = Dense(n_hidden, activation='relu', init='identity')
dense_out = Dense(vocab_size, activation='softmax')

In [88]:
# The first character in each sequence goes through the dense_in() to create the first hidden activations
hidden= dense_in(c_ins[0][1])

In [89]:
# For each successive layer, we combine the output of dense_in() on the next character with the output of dense_hidden() on the current hidden state.
for i in range(1, cs):
    c_dense = dense_in(c_ins[i][1])
    hidden = dense_hidden(hidden)
    hidden = merge([c_dense, hidden])

In [90]:
c_out = dense_out(hidden)

In [91]:
model = Model([c[0] for c in c_ins], c_out)
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [92]:
model.fit(xs, y, batch_size=64, nb_epoch=12)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.History at 0x7f138c7ed650>

In [93]:
# Testing the model
def get_next(inp):
    idxs = [np.array(char_indices[c])[np.newaxis] for c in inp]
    p = model.predict(idxs)
    return chars[np.argmax(p)]

In [94]:
get_next('for thos')

'e'

In [95]:
get_next('queens o')

'f'

In [96]:
get_next('part of ')

't'

In [107]:
# First Recurrent NN with Keras
# n_hidden, n_fac, cs, vocab_size = (256, 42, 8, 86)
n_hidden = 256
n_fac = 42
cs = 8
vocab_size = 86

In [108]:
model=Sequential([
        Embedding(vocab_size, n_fac, input_length=cs),
        SimpleRNN(n_hidden, activation='relu', inner_init='identity'),
        Dense(vocab_size, activation='softmax')
    ])

In [109]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_6 (Embedding)          (None, 8, 42)         3612        embedding_input_3[0][0]          
____________________________________________________________________________________________________
simplernn_3 (SimpleRNN)          (None, 256)           76544       embedding_6[0][0]                
____________________________________________________________________________________________________
dense_12 (Dense)                 (None, 86)            22102       simplernn_3[0][0]                
Total params: 102258
____________________________________________________________________________________________________


In [110]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [128]:
np.stack(xs,0).shape

(8, 75110, 1)

In [134]:
# print(model.shape)
model.fit(np.stack(xs,1), y, batch_size=64, nb_epoch=8)

Exception: Error when checking model input: expected embedding_input_3 to have 2 dimensions, but got array with shape (75110, 8, 1)