In [1]:
import numpy as np
import theano as theano
import theano.tensor as T
import time
import operator
from theano.gradient import grad_clip
from utils import *
from lstm_theano import *
from gru_theano import *

In [2]:
# Load the tokenised training data, keeping only the VOCABULARY_SIZE most
# frequent words (the loader reports vocabulary statistics on stdout).
VOCABULARY_SIZE = 2000
X_train, y_train, word_to_index, index_to_word = load_data(
    "data/reddit-comments-2015.csv",
    VOCABULARY_SIZE,
)

Reading CSV file...
Parsed 502183 sentences.
Found 193212 unique words tokens.
Using vocabulary size 2000.
The least frequent word in our vocabulary is 'tonight' and appeared 324 times.


In [37]:
# Gradient check on a deliberately tiny model (word_dim=100, hidden_dim=10)
# so that perturbing every parameter stays cheap. The seed is fixed so the
# randomly initialised weights are the same on every run.
np.random.seed(0)
model = GRUTheano(100, 10)
gradient_check_theano(
    model,
    [0, 1, 2, 3],  # input word indices
    [1, 2, 3, 4],  # target word indices
)

Performing gradient check for parameter E with size 1000.
Gradient check for parameter E passed.
Performing gradient check for parameter U with size 600.
Gradient check for parameter U passed.
Performing gradient check for parameter W with size 600.
Gradient check for parameter W passed.
Performing gradient check for parameter b with size 60.
Gradient check for parameter b passed.
Performing gradient check for parameter V with size 1000.
Gradient check for parameter V passed.
Performing gradient check for parameter c with size 100.
Gradient check for parameter c passed.


In [21]:
# Load parameters of pre-trained model
# model = load_model_parameters_theano("GRU-2015-10-25-10-37-2000-48-128.npz")
# model = load_model_parameters_theano('./data/pretrained.npz', GRUTheano)

Building model model from GRU-2015-10-25-10-37-2000-48-128.npz with hidden_dim=128 word_dim=2000


In [22]:
# Average per-word cross-entropy on the first 500 training sentences.
# NOTE(review): this relies on `model` having been created by an earlier cell
# with word_dim == VOCABULARY_SIZE (e.g. the pre-trained model); the tiny
# gradient-check model cannot index this vocabulary -- confirm cell order.
print(model.calculate_loss(X_train[:500], y_train[:500]))

43.3986080477


In [44]:
class GRUTheano:
    """Two-layer GRU language model expressed as a Theano graph.

    Trainable parameters (Theano shared variables):
      E    -- embedding matrix, shape (hidden_dim, word_dim); column ``x_t``
              is the embedding of word index ``x_t``.
      U    -- input weights, shape (6, hidden_dim, hidden_dim). Slices 0-2
              belong to GRU layer 1, slices 3-5 to GRU layer 2; within a
              layer the order is update gate, reset gate, candidate.
      W    -- recurrent weights, same layout as U.
      b    -- gate biases, shape (6, hidden_dim), same layout as U.
      V, c -- output projection (word_dim, hidden_dim) and bias (word_dim).
    ``mE``, ``mU``, ... are the matching rmsprop caches (running averages of
    squared gradients), initialised to zero.

    Constructing an instance compiles these Theano functions:
      predict(x)        -- per-step output distributions
      predict_class(x)  -- per-step argmax of those distributions
      ce_error(x, y)    -- summed categorical cross-entropy
      bptt(x, y)        -- gradients in the order [dE, dU, dW, db, dV, dc]
      sgd_step(x, y, learning_rate, decay=0.9) -- one rmsprop update
    """

    def __init__(self, word_dim, hidden_dim=128, bptt_truncate=-1):
        """Randomly initialise all parameters and compile the Theano graph.

        word_dim      -- vocabulary size (input and output dimension)
        hidden_dim    -- size of the embedding and of each GRU state
        bptt_truncate -- forwarded to ``theano.scan`` as ``truncate_gradient``
        """
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate

        # Uniform initialisation in [-1/sqrt(d), 1/sqrt(d)]. The draw order
        # (E, U, W, V) is kept fixed so seeded runs are reproducible.
        embed_bound = np.sqrt(1. / word_dim)
        hidden_bound = np.sqrt(1. / hidden_dim)
        E = np.random.uniform(-embed_bound, embed_bound, (hidden_dim, word_dim))
        U = np.random.uniform(-hidden_bound, hidden_bound, (6, hidden_dim, hidden_dim))
        W = np.random.uniform(-hidden_bound, hidden_bound, (6, hidden_dim, hidden_dim))
        V = np.random.uniform(-hidden_bound, hidden_bound, (word_dim, hidden_dim))
        b = np.zeros((6, hidden_dim))
        c = np.zeros(word_dim)

        # For every parameter create the shared variable itself plus its
        # zero-initialised rmsprop cache (prefixed with 'm').
        floatX = theano.config.floatX
        initial_values = (('E', E), ('U', U), ('W', W), ('V', V), ('b', b), ('c', c))
        for name, value in initial_values:
            setattr(self, name,
                    theano.shared(name=name, value=value.astype(floatX)))
            setattr(self, 'm' + name,
                    theano.shared(name='m' + name,
                                  value=np.zeros(value.shape).astype(floatX)))

        # We store the Theano graph here
        self.theano = {}
        self.__theano_build__()

    def __theano_build__(self):
        """Build the symbolic graph and compile the callable functions."""
        E, V, U, W, b, c = self.E, self.V, self.U, self.W, self.b, self.c
        hidden = self.hidden_dim

        def stack_three(param, first):
            # Concatenate three consecutive slices so that all three gate
            # pre-activations of a layer come out of a single product.
            return T.concatenate([param[first], param[first + 1], param[first + 2]])

        layer1_weights = (stack_three(U, 0), stack_three(W, 0), stack_three(b, 0))
        layer2_weights = (stack_three(U, 3), stack_three(W, 3), stack_three(b, 3))

        x = T.ivector('x')
        y = T.ivector('y')

        def gru_layer(layer_input, state_prev, weights):
            # One GRU step. Rows [0:n] are the update gate, [n:2n] the reset
            # gate and [2n:3n] the candidate state. Note that the reset gate
            # scales the recurrent product W.dot(state_prev) *after* the
            # matrix multiplication.
            U_stacked, W_stacked, b_stacked = weights
            from_input = U_stacked.dot(layer_input) + b_stacked
            from_state = W_stacked.dot(state_prev)
            update_gate = T.nnet.sigmoid(
                from_input[0:hidden] + from_state[0:hidden])
            reset_gate = T.nnet.sigmoid(
                from_input[hidden:2*hidden] + from_state[hidden:2*hidden])
            candidate = T.tanh(
                from_input[2*hidden:3*hidden]
                + from_state[2*hidden:3*hidden] * reset_gate)
            return (T.ones_like(update_gate) - update_gate) * state_prev \
                + update_gate * candidate

        def forward_prop_step(word_idx, state1_prev, state2_prev):
            # Word embedding layer
            embedded = E[:, word_idx]
            # Two stacked GRU layers; layer 2 consumes layer 1's new state.
            state1 = gru_layer(embedded, state1_prev, layer1_weights)
            state2 = gru_layer(state1, state2_prev, layer2_weights)
            # Theano's softmax returns a matrix with one row, we only need the row
            output = T.nnet.softmax(V.dot(state2) + c)[0]
            return [output, state1, state2]

        # Unroll over the input sequence; both hidden states start at zero.
        [o, s, s2], _scan_updates = theano.scan(
            forward_prop_step,
            sequences=x,
            truncate_gradient=self.bptt_truncate,
            outputs_info=[None,
                          dict(initial=T.zeros(hidden)),
                          dict(initial=T.zeros(hidden))])

        prediction = T.argmax(o, axis=1)

        # Total cost (could add regularization here)
        cost = T.sum(T.nnet.categorical_crossentropy(o, y))

        # Gradients of the cost with respect to every parameter
        dE, dU, dW, db, dV, dc = [T.grad(cost, param)
                                  for param in (E, U, W, b, V, c)]

        # Compiled entry points
        self.predict = theano.function([x], o)
        self.predict_class = theano.function([x], prediction)
        self.ce_error = theano.function([x, y], cost)
        self.bptt = theano.function([x, y], [dE, dU, dW, db, dV, dc])

        # SGD parameters
        learning_rate = T.scalar('learning_rate')
        decay = T.scalar('decay')

        # rmsprop: refresh each cache with the squared gradient, then scale
        # the gradient step by the root of the refreshed cache.
        params = [E, U, W, V, b, c]
        grads = [dE, dU, dW, dV, db, dc]
        caches = [self.mE, self.mU, self.mW, self.mV, self.mb, self.mc]
        new_caches = [decay * cache + (1 - decay) * T.sqr(grad)
                      for cache, grad in zip(caches, grads)]
        param_updates = [
            (param, param - learning_rate * grad / T.sqrt(new_cache + 1e-6))
            for param, grad, new_cache in zip(params, grads, new_caches)]
        cache_updates = list(zip(caches, new_caches))

        self.sgd_step = theano.function(
            [x, y, learning_rate, theano.Param(decay, default=0.9)],
            [],
            updates=param_updates + cache_updates)

    def calculate_total_loss(self, X, Y):
        """Return the cross-entropy summed over all (x, y) sequence pairs."""
        per_sentence = [self.ce_error(inputs, targets)
                        for inputs, targets in zip(X, Y)]
        return np.sum(per_sentence)

    def calculate_loss(self, X, Y):
        """Return the average cross-entropy per target word."""
        # Normalise the total loss by the number of words
        num_words = sum(len(targets) for targets in Y)
        return self.calculate_total_loss(X, Y) / float(num_words)



In [None]:
# Load parameters of pre-trained model
# model = load_model_parameters_theano('./data/2015-10-18/GRUTheano-80-8000-10.npz', GRUTheano)
# model = load_model_parameters_theano('./data/pretrained.npz', GRUTheano)

# Build model and train

LEARNING_RATE = 1e-3
NEPOCH = 20
HIDDEN_DIM = 128

model = GRUTheano(VOCABULARY_SIZE, hidden_dim=HIDDEN_DIM, bptt_truncate=-1)

# Time one SGD step on a single sentence to estimate the training cost.
# Note that this step already updates the model parameters.
t1 = time.time()
model.sgd_step(X_train[10], y_train[10], LEARNING_RATE)
t2 = time.time()
# Parenthesised form prints identically on Python 2 and also runs on Python 3.
print("SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.))

# Train on the first 500 sentences only, reporting the loss after every epoch.
train_with_sgd(model, X_train[:500], y_train[:500], LEARNING_RATE, NEPOCH, evaluate_loss_after=1, decay=0.9)

In [29]:
def generate_sentence(model, index_to_word, word_to_index):
    """Sample one sentence from the model, word by word.

    model         -- object exposing ``forward_propagation(sentence)`` or
                     ``predict(sentence)``; the last row of the result is used
                     as the probability distribution of the next word
    index_to_word -- unused here; kept for interface compatibility
    word_to_index -- mapping that contains the sentence start/end tokens

    Returns the list of sampled word indices including the start and end
    tokens, or an empty list if more than 100 tokens were produced without
    sampling the end token.
    """
    # The GRUTheano class defined in this notebook exposes `predict`, not
    # `forward_propagation`; accept either so the function works with both.
    forward = getattr(model, "forward_propagation", None) or model.predict
    end_index = word_to_index[SENTENCE_END_TOKEN]
    # We start the sentence with the start token
    new_sentence = [word_to_index[SENTENCE_START_TOKEN]]
    # Repeat until we get an end token
    while not new_sentence[-1] == end_index:
        next_word_probs = forward(new_sentence)[-1]
        # float32 softmax outputs can sum to slightly more than 1, which makes
        # np.random.multinomial raise; cast to float64 and renormalise first.
        next_word_probs = np.asarray(next_word_probs, dtype=np.float64)
        next_word_probs = next_word_probs / next_word_probs.sum()
        samples = np.random.multinomial(1, next_word_probs)
        sampled_word = np.argmax(samples)
        new_sentence.append(sampled_word)
        # Sometimes we get stuck in an infinite loop if the sentence becomes too long (e.g. .....) :(
        if len(new_sentence) > 100:
            return []
    return new_sentence

def print_sentence(s, index_to_word):
    """Print a sentence given as word indices, space separated.

    The first and last entries of ``s`` (the start and end tokens) are
    dropped before the remaining indices are mapped through ``index_to_word``.
    """
    # Imported locally: the notebook never imports sys explicitly and would
    # otherwise depend on it leaking in through a `from ... import *`.
    import sys

    sentence_str = [index_to_word[x] for x in s[1:-1]]
    print(" ".join(sentence_str))
    sys.stdout.flush()

def generate_sentences(model, n, index_to_word, word_to_index, min_length=5):
    """Generate and print ``n`` sentences of at least ``min_length`` tokens.

    Sentences are re-sampled until one reaches ``min_length`` tokens (the
    count includes the start and end tokens); that sentence is then printed.
    """
    for _ in range(n):
        candidate = []
        # Keep sampling until the sentence is long enough; an empty result
        # from generate_sentence is simply retried.
        while len(candidate) < min_length:
            candidate = generate_sentence(model, index_to_word, word_to_index)
        print_sentence(candidate, index_to_word)

# Sample and print ten sentences from the current model.
generate_sentences(
    model,
    n=10,
    index_to_word=index_to_word,
    word_to_index=word_to_index,
)

do . managed gt ; on was UNKNOWN_TOKEN that n't more park it you so too from so , and attempt ) , 's who you ... friends has it from type .
UNKNOWN_TOKEN keep right , guy that , & has she UNKNOWN_TOKEN to final .
i holding that request and any 's be .. the ready the i degree their make 25 his UNKNOWN_TOKEN the at for eat a over of UNKNOWN_TOKEN it .
she world a too account , i please : from still UNKNOWN_TOKEN lost `` various .
this building 's the '' of UNKNOWN_TOKEN .
gt just them is as the ^ done UNKNOWN_TOKEN .
if i & be can UNKNOWN_TOKEN upset show and been be and and UNKNOWN_TOKEN but you terms UNKNOWN_TOKEN to entire be show .
've were along religion , it teach be these .
UNKNOWN_TOKEN and they a trade career .
ways and have UNKNOWN_TOKEN would ) is my a really has year and i under
