In [5]:
import numpy as np
import theano as theano
import theano.tensor as T
import time
import operator
from theano.gradient import grad_clip
from utils import *
from lstm_theano import *
from gru_theano import *

In [6]:
# Load Stanford pre-trained glove vectors
gdic, gvec = load_stanford_glove("data/glove/glove.6B.100d.txt.gz")

In [7]:
# Load data
from utils import load_and_proprocess_data
VOCABULARY_SIZE = 2000
X_train, y_train, word_to_index, index_to_word = load_and_proprocess_data(VOCABULARY_SIZE)

Reading CSV file...
Parsed 76456 sentences.
Found 58100 unique words tokens.
Using vocabulary size 2000.
The least frequent word in our vocabulary is 'hardly' and appeared 54 times.


In [8]:
# Construct glove word vectors for vocabulary
wv_dim = gvec.shape[1]
wv = []
for i in range(VOCABULARY_SIZE):
    word = index_to_word[i]
    if word not in gdic:
        wv.append(np.zeros(wv_dim))
    else:
        wv.append(gvec[gdic[word]])
wv = np.array(wv)

In [51]:
class GRUTheano:
    
    def __init__(self, word_dim, hidden_dim=100, reg_lambda=0, wordvec=None, bptt_truncate=-1):
        # Assign instance variables
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.reg_lambda = reg_lambda
        self.bptt_truncate = bptt_truncate
        # Initialize the network parameters
        if wordvec != None:
            U = np.array([wordvec.T, wordvec.T, wordvec.T])
        else:
            U = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (3, hidden_dim, word_dim))
        W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (9, hidden_dim, hidden_dim))
        b = np.zeros((6, hidden_dim))
        V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))
        
        c = np.zeros(word_dim)
        # Theano: Created shared variables
        self.U = theano.shared(name='U', value=U.astype(theano.config.floatX))
        self.W = theano.shared(name='W', value=W.astype(theano.config.floatX))
        self.V = theano.shared(name='V', value=V.astype(theano.config.floatX))
        # Bias terms
        self.b = theano.shared(name='b_i', value=b.astype(theano.config.floatX))
        self.c = theano.shared(name='c', value=c.astype(theano.config.floatX))
        # SGD: Initialize parameters
        self.mU = theano.shared(name='mU', value=np.zeros(U.shape).astype(theano.config.floatX))
        self.mV = theano.shared(name='mV', value=np.zeros(V.shape).astype(theano.config.floatX))
        self.mW = theano.shared(name='mW', value=np.zeros(W.shape).astype(theano.config.floatX))
        self.mb = theano.shared(name='mb', value=np.zeros(b.shape).astype(theano.config.floatX))
        self.mc = theano.shared(name='mc', value=np.zeros(c.shape).astype(theano.config.floatX))
        # We store the Theano graph here
        self.theano = {}
        self.__theano_build__()
    
    def __theano_build__(self):
        V, U, W, b, c = self.V, self.U, self.W, self.b, self.c
        
        x = T.ivector('x')
        y = T.ivector('y')
        
        def forward_prop_step(x_t, s_t_prev, s_t2_prev):
            # This is how we calculated the hidden state in a simple RNN. No longer!
            # s_t = T.tanh(U[:,x_t] + W.dot(s_t_prev))
            
            # Clip the gradients
            W_clipped = grad_clip(W, -1, 1)
            U_clipped = grad_clip(U, -1, 1)
            V_clipped = grad_clip(V, -1, 1)
            b_clipped = grad_clip(b, -1, 1)
            c_clipped = grad_clip(c, -1, 1)
            
            # LRU hidden state calculation
            z_t = T.nnet.sigmoid(U_clipped[0][:,x_t] + W_clipped[0].dot(s_t_prev) + b_clipped[0])
            r_t = T.nnet.sigmoid(U_clipped[1][:,x_t] + W_clipped[1].dot(s_t_prev) + b_clipped[1])
            c_t = T.tanh(U_clipped[2][:,x_t] + W_clipped[2].dot(s_t_prev) * r_t + b_clipped[2])
            s_t = (1 - z_t) * c_t + z_t * s_t_prev
            
            # Layer 2
            z_t2 = T.nnet.sigmoid(W_clipped[3].dot(s_t) + W_clipped[6].dot(s_t2_prev) + b_clipped[3])
            r_t2 = T.nnet.sigmoid(W_clipped[4].dot(s_t) + W_clipped[7].dot(s_t2_prev) + b_clipped[4])
            c_t2 = T.tanh(W_clipped[5].dot(s_t) + W_clipped[8].dot(s_t2_prev) * r_t2 + b_clipped[5])
            s_t2 = (1 - z_t2) * c_t2 + z_t2 * s_t2_prev            
              
            # Final output calculation
            # Theano's softmax returns a matrix with one row, we only need the row
            o_t = T.nnet.softmax(V_clipped.dot(s_t2) + c_clipped)[0]

            return [o_t, s_t, s_t2]
        
        [o,s,s2], updates = theano.scan(
            forward_prop_step,
            sequences=x,
            truncate_gradient=self.bptt_truncate,
            outputs_info=[None, dict(initial=T.zeros(self.hidden_dim)), dict(initial=T.zeros(self.hidden_dim))])
        
        prediction = T.argmax(o, axis=1)
        o_error = T.sum(T.nnet.categorical_crossentropy(o, y))
        
        # Regularization cost
        reg_cost = self.reg_lambda/2. * \
            (T.sum(T.sqr(V)) + T.sum(T.sqr(U)) + T.sum(T.sqr(W)) + T.sum(T.sqr(b)) + T.sum(T.sqr(c)))
        # Total cost
        cost = o_error + reg_cost
        
        # Gradients
        dU = T.grad(cost, U)
        dW = T.grad(cost, W)
        db = T.grad(cost, b)
        dV = T.grad(cost, V)
        dc = T.grad(cost, c)
        
        # Assign functions
        self.forward_propagation = theano.function([x], o)
        self.predict = theano.function([x], prediction)
        self.ce_error = theano.function([x, y], cost)
        self.bptt = theano.function([x, y], [dU, dW, db, dV, dc])
        
        # SGD parameters
        learning_rate = T.scalar('learning_rate')
        decay = T.scalar('decay')
        
        # rmsprop cache updates
        mU = decay * self.mU + (1 - decay) * T.sqr(dU)
        mW = decay * self.mW + (1 - decay) * T.sqr(dW)
        mV = decay * self.mV + (1 - decay) * T.sqr(dV)
        mb = decay * self.mb + (1 - decay) * T.sqr(db)
        mc = decay * self.mc + (1 - decay) * T.sqr(dc)
        
        self.sgd_step = theano.function(
            [x, y, learning_rate, theano.Param(decay, default=0.9)],
            [], 
            updates=[(U, U - learning_rate * dU / T.sqrt(mU + 1e-8)),                     
                     (W, W - learning_rate * dW / T.sqrt(mW + 1e-8)),
                     (V, V - learning_rate * dV / T.sqrt(mV + 1e-8)),
                     (b, b - learning_rate * db / T.sqrt(mb + 1e-8)),
                     (c, c - learning_rate * dc / T.sqrt(mc + 1e-8)),
                     (self.mU, mU),
                     (self.mW, mW),
                     (self.mV, mV),
                     (self.mb, mb),
                     (self.mc, mc)
                    ])
    def calculate_total_loss(self, X, Y):
        return np.sum([self.ce_error(x,y) for x,y in zip(X,Y)])
    
    def calculate_loss(self, X, Y):
        # Divide calculate_loss by the number of words
        num_words = np.sum([len(y) for y in Y])
        return self.calculate_total_loss(X,Y)/float(num_words)



In [52]:
# Do a gradient check
np.random.seed(0)
model = GRUTheano(100, 10)
gradient_check_theano(model, [0,1,2,3], [1,2,3,4])

Performing gradient check for parameter U with size 3000.
Gradient check for parameter U passed.
Performing gradient check for parameter W with size 900.
Gradient check for parameter W passed.
Performing gradient check for parameter b with size 60.
Gradient check for parameter b passed.
Performing gradient check for parameter V with size 1000.
Gradient check for parameter V passed.


AttributeError: GRUTheano instance has no attribute 'b2'

In [None]:
# Load parameters of pre-trained model
# model = load_model_parameters_theano('./data/2015-10-18/GRUTheano-80-8000-10.npz', GRUTheano)
model = load_model_parameters_theano('./data/pretrained.npz', GRUTheano)

In [58]:
# Load parameters of pre-trained model
# model = load_model_parameters_theano('./data/2015-10-18/GRUTheano-80-8000-10.npz', GRUTheano)
# model = load_model_parameters_theano('./data/pretrained.npz', GRUTheano)

# Build model and train

REGULARIZATION = 0
LEARNING_RATE = 1e-4
NEPOCH = 50
HIDDEN_DIM = 100

model = GRUTheano(VOCABULARY_SIZE, hidden_dim=HIDDEN_DIM, reg_lambda=REGULARIZATION, wordvec=wv, bptt_truncate=4)

t1 = time.time()
model.sgd_step(X_train[10], y_train[10], LEARNING_RATE)
t2 = time.time()
print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)

train_with_sgd(model, X_train[:500], y_train[:500], LEARNING_RATE, NEPOCH, evaluate_loss_after=1, decay=0.99)

SGD Step time: 121.037960 milliseconds
2015-10-20 11:51:18: Loss after num_examples_seen=0 epoch=0: 7.614625
2015-10-20 11:52:16: Loss after num_examples_seen=500 epoch=1: 6.182702
2015-10-20 11:53:11: Loss after num_examples_seen=1000 epoch=2: 6.134361




KeyboardInterrupt: 

In [59]:
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"
unknown_token = "UNKNOWN_TOKEN"
    
def generate_sentence(model):
    # We start the sentence with the start token
    new_sentence = [word_to_index[sentence_start_token]]
    # Repeat until we get an end token
    while not new_sentence[-1] == word_to_index[sentence_end_token]:
        next_word_probs = model.forward_propagation(new_sentence)
        sampled_word = word_to_index[unknown_token]
        # We don't want to sample unknown words
        while sampled_word == word_to_index[unknown_token]:
            samples = np.random.multinomial(1, next_word_probs[-1])
            sampled_word = np.argmax(samples)
        new_sentence.append(sampled_word)
    sentence_str = [index_to_word[x] for x in new_sentence[1:-1]]
    return sentence_str
 
num_sentences = 50
senten_min_length = 5
 
for i in range(num_sentences):
    sent = []
    # We want long sentences, not sentences with one or two words
    while len(sent) < senten_min_length:
        sent = generate_sentence(model)
    print " ".join(sent)

best here environment op . to
challenge settings enjoyed road . logic . .
context apart count environment .
watched metal determine the about
de jump window somewhat your it . be
period lost life . ?
final europe goal well through option game it
led finding be taxes .
mad pretty charge did lol . )
seen going up . tip . flying . . and number ? .
nothing mostly adding ass into
abilities moving me abilities affect .
running wanting feels i going stuff
bar dick knowing events . understand .
looking wonder one at . op .
upset events upon it or . . skin your . . '' . . .
loved directly of into it ! .
road ) feedback alone seconds bad ca
have old '' , . flying
actual might list . allowed out ! all
're town term lower . )
making counter ! it release you
usually to road . ?
! single site . and
forces speech events the life it
tree any board little her ? as . imo knowing .
back though damage . you . . . is . ) .
hp 15 move 12 use beyond . like .
gt drug now areas . taxes
listen issue 6 . to
easy

In [21]:
def generate_sentences(model, beam_size=50, max_length=6):
    # We start the sentence with the start token and a random word
    sentences = [[word_to_index[sentence_start_token]]]
    sentence_probs = [0]
    for i in range(max_length):
        next_sentences = []
        next_sentences_probs = []
        for s, prob in zip(sentences, sentence_probs):
            next_word_probs = np.log(model.forward_propagation(s))[-1]
            next_words = np.argsort(next_word_probs)[-beam_size:][::-1]
            for word in next_words:
                if word != word_to_index[unknown_token] and word != word_to_index[sentence_end_token] and word != s[-1]:
                    next_sentences.append(np.concatenate((s, [word])))
                    next_sentences_probs.append(prob + next_word_probs[word])
        next_sentences = np.array(next_sentences).astype('int32')
        next_sentences_probs = np.array(next_sentences_probs).astype('int32')
        top_sentences_idx = np.argsort(next_sentences_probs)[-beam_size:][::-1]
        sentences = next_sentences[top_sentences_idx]
        sentence_probs = next_sentences_probs[top_sentences_idx]
    return sentences, sentence_probs
        
def print_senence(s):
    sentence_str = [index_to_word[x] for x in s[1:-1]]
    print " ".join(sent)

sentences, sentence_probs = generate_sentences(model)
for sent, prob in zip(sentences, sentence_probs):
    sentence_str = [index_to_word[x] for x in sent[1:-1]]
    print "%f %s" % (prob, " ".join(sentence_str))

-23.000000 be . the . the
-23.000000 to . for . the
-23.000000 to . ? . the
-23.000000 to . to . the
-23.000000 to . the . the
-23.000000 to . it . the
-23.000000 to . is . the
-23.000000 to . you . the
-23.000000 to . a . the
-23.000000 to . of . the
-23.000000 to . as . the
-24.000000 to . to . you
-24.000000 to . it . ?
-24.000000 to . it . a
-24.000000 to . you . ,
-24.000000 to . you . ?
-24.000000 to . it . it
-24.000000 to . is . !
-24.000000 to . a . !
-24.000000 to . to . ?
-24.000000 to . you . !
-24.000000 to . it . for
-24.000000 to . a . ?
-24.000000 to . to . to
-24.000000 to . to . ,
-24.000000 to . it . you
-24.000000 to . you . it
-24.000000 to . it . ,
-24.000000 to . to . a
-24.000000 to . to . it
-24.000000 to . a . ,
-24.000000 to . and . the
-24.000000 to . a . for
-24.000000 to . a . you
-24.000000 to . is . ,
-24.000000 to . to . of
-24.000000 to . it . to
-24.000000 to . it . of
-24.000000 to . it . !
-24.000000 to . to . for
-24.000000 to . you . for
-24.00000