In [1]:
import csv
import itertools
import operator
import numpy as np
import matplotlib.pyplot as plt
import theano
import theano.tensor as T
import nltk
from utils import softmax

%matplotlib inline

In [2]:
# Data Loading and Preparation
vocabulary_size = 1000
unknown_token = "<UNKNOWN/>"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

with open('data/reddit-comments-2015-08.csv', 'rb') as f:
    reader = csv.reader(f, skipinitialspace=True)
    reader.next()
    sentences = ["%s %s %s" % (sentence_start_token, x[0].decode('utf-8'), sentence_end_token) for x in reader]
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
vocab = word_freq.most_common(vocabulary_size-1)
index_to_word = dict(enumerate(x[0] for x in vocab))
index_to_word[vocabulary_size-1] = unknown_token
word_to_index = inv_map = {v: k for k, v in index_to_word.items()}

# Replace all words not in our vocabulary with [unknown_token]
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]

In [3]:
# Creates a one-hot vector
def make_onehot(x, vocabulary_size=vocabulary_size):
    result = np.zeros((len(x), vocabulary_size))
    result[np.arange(len(x)), x] = 1
    return result

X_train = np.array([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
X_train_onehot = np.array([make_onehot(x) for x in X_train])
y_train = np.array([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])

print X_train[123]
print " ".join([index_to_word[x] for x in X_train[123]])
print y_train[123]
print " ".join([index_to_word[x] for x in y_train[123]])

[13, 266, 1, 8, 72, 455, 77, 5, 94, 9, 16, 298, 7, 999, 892, 27, 26, 17, 168, 38, 31, 26, 999, 3, 999, 999, 999, 4, 999, 3, 999, 1, 28, 913, 12, 146, 166, 27, 5, 103, 327, 66, 26, 666, 88, 999, 3, 2, 999, 428, 0]
SENTENCE_START No , you 're fine - I know it 's kind of <UNKNOWN/> involved but was n't sure if this was <UNKNOWN/> to <UNKNOWN/> <UNKNOWN/> <UNKNOWN/> a <UNKNOWN/> to <UNKNOWN/> , not posted in here before but I could tell there was absolutely no <UNKNOWN/> to the <UNKNOWN/> number .
[266, 1, 8, 72, 455, 77, 5, 94, 9, 16, 298, 7, 999, 892, 27, 26, 17, 168, 38, 31, 26, 999, 3, 999, 999, 999, 4, 999, 3, 999, 1, 28, 913, 12, 146, 166, 27, 5, 103, 327, 66, 26, 666, 88, 999, 3, 2, 999, 428, 0, 14]
No , you 're fine - I know it 's kind of <UNKNOWN/> involved but was n't sure if this was <UNKNOWN/> to <UNKNOWN/> <UNKNOWN/> <UNKNOWN/> a <UNKNOWN/> to <UNKNOWN/> , not posted in here before but I could tell there was absolutely no <UNKNOWN/> to the <UNKNOWN/> number . SENTENCE_END


First, let's review the equations for a RNN:

$
\begin{align}
s_t &= \tanh(Ux_t + Ws_{t_1}) \\
o_t &= \mathrm{softmax}(Vs_t)
\end{align}
$

In [63]:
class RNNNumpy:
    
    def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):
        # Assign instance variables
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        # Randomly initialize the network parameters
        self.U = np.random.randn(hidden_dim, word_dim) * np.sqrt(2.0/word_dim)
        self.V = np.random.randn(word_dim, hidden_dim) * np.sqrt(2.0/hidden_dim)
        self.W = np.random.randn(hidden_dim, hidden_dim) * np.sqrt(2.0/hidden_dim)
        
    def forward_propagation(self, x):
        # The number of time steps
        T = len(x)
        # During forward propagation we save all hidden states in s. We need them later.
        # We add one additional element for the initial hidden, which we set to 0
        s = np.zeros((T + 1, self.hidden_dim))
        s[-1] = np.zeros(self.hidden_dim)
        # The outputs at each time step. Again, we save them for later.
        o = np.zeros((T, self.word_dim))
        # For each time step...
        for t in np.arange(T):
            s[t] = np.tanh(self.U.dot(x[t]) + self.W.dot(s[t-1]))
            o[t] = softmax(self.V.dot(s[t]))
        return [o, s]
        
    def predict(self, x):
        # Perform forward propagation and return index of the highest score
        o, s = self.forward_propagation(x)
        return np.argmax(o, axis=1)
    
    def calculate_loss(self, X, y):
        # We accumulate the total loss in L
        L = 0
        for i in np.arange(len(y)):
            x_i, y_i = X[i], y[i]
            o, s = self.forward_propagation(x_i)
            L += -1 * np.sum(np.log(o[np.arange(len(y_i)), y_i]))
        return L
    
    def calculate_mean_loss(self, X, y):
        # Divide calculate_loss by the number of words
        num_words = np.sum([len(y_i) for y_i in y])
        return self.calculate_loss(X,y)/float(num_words)   
            
    def bptt(self, x, y):
        T = len(y)
        # Perform forward propagation
        o, s = self.forward_propagation(x)
        # We accumulate the gradients in these variables
        dLdU = np.zeros(self.U.shape)
        dLdV = np.zeros(self.V.shape)
        dLdW = np.zeros(self.W.shape)
        delta_o = o
        delta_o[np.arange(len(y)), y] -= 1.
        # For each output backwards...
        for t in np.arange(T)[::-1]:
            dLdV += np.outer(delta_o[t], s[t].T)
            # Initial delta calculation
            delta_t = self.V.T.dot(delta_o[t]) * (1 - (s[t] ** 2))
            # Backpropagation through time (for at most self.bptt_truncate steps)
            for bptt_step in np.arange(max(0, t-self.bptt_truncate), t+1)[::-1]:
                # print "Backpropagation step t=%d bptt step=%d " % (t, bptt_step)
                dLdW += np.outer(delta_t, s[bptt_step-1])
                dLdU += np.outer(delta_t, x[bptt_step])
                # Update delta for next step
                delta_t = self.W.T.dot(delta_t) * (1 - s[bptt_step-1] ** 2)
        return [dLdU, dLdV, dLdW]
    

model = RNNNumpy(2, 10)
# model.bptt(np.random.randn(2,2), [0, 0])
None

In [64]:
def gradient_check(model, x, y, h=0.001, error_threshold=0.01):
    # Overwrite the bptt attribute. We need to backpropagate all the way to get the correct gradient
    model.bptt_truncate = 1000
    # Calculate the gradients using backprop
    bptt_gradients = model.bptt(x, y)
    # List of all parameters we want to chec.
    model_parameters = ['U', 'V', 'W']
    # Gradient check for each parameter
    for pidx, pname in enumerate(model_parameters):
        # Get the actual parameter value from the mode, e.g. model.W
        parameter = operator.attrgetter(pname)(model)
        print "Performing gradient check for parameter %s with size %d." % (pname, np.prod(parameter.shape))
        # Iterate over each element of the parameter matrix, e.g. (0,0), (0,1), ...
        it = np.nditer(parameter, flags=['multi_index'], op_flags=['readwrite'])
        while not it.finished:
            ix = it.multi_index
            # Save the original value so we can reset it later
            original_value = parameter[ix]
            # Estimate the gradient using (f(x+h) - f(x-h))/(2*h)
            parameter[ix] = original_value + h
            gradplus = model.calculate_loss([x],[y])
            parameter[ix] = original_value - h
            gradminus = model.calculate_loss([x],[y])
            estimated_gradient = (gradplus - gradminus)/(2*h)
            parameter[ix] = original_value
            # The gradient for this parameter calculated using backpropagation
            backprop_gradient = bptt_gradients[pidx][ix]
            # calculate The relative error: (|x - y|/(|x| + |y|))
            relative_error = np.abs(backprop_gradient - estimated_gradient)/(np.abs(backprop_gradient) + np.abs(estimated_gradient))
            # If the error is to large fail the gradient check
            if relative_error > error_threshold:
                print "Gradient Check ERROR: parameter=%s ix=%s" % (pname, ix)
                print "+h Loss: %f" % gradplus
                print "-h Loss: %f" % gradminus
                print "Estimated_gradient: %f" % estimated_gradient
                print "Backpropagation gradient: %f" % backprop_gradient
                print "Relative Error: %f" % relative_error
                return 
            it.iternext()
        print "Gradient check for parameter %s passed." % (pname)
            
            
np.random.seed(10)
# To avoid performing millions of expensive calculations we use a smaller vocabulary size for checking.
grad_check_vocab_size = 5
model = RNNNumpy(check_vocab_size, 10)
gradient_check(model, make_onehot([0,1,2,3], grad_check_vocab_size), [1,2,3,4])

Performing gradient check for parameter U with size 50.
Gradient check for parameter U passed.
Performing gradient check for parameter V with size 50.
Gradient check for parameter V passed.
Performing gradient check for parameter W with size 100.
Gradient check for parameter W passed.


In [43]:
def train_rnn_with_sgd(model, X_train, y_train, learning_rate=0.0005, print_loss_after=1000, nepoch=1, anneal_after=-1, anneal_factor=0.5):
    # We keep track of the losses so we can plot them later
    losses = []
    for epoch in range(nepoch):
        for i in np.arange(len(y_train)):
            x_i, y_i = X_train[i], y_train[i]
            dLdU, dLdV, dLdW = model.bptt(x_i, y_i)
            model.U -= learning_rate * dLdU
            model.V -= learning_rate * dLdV
            model.W -= learning_rate * dLdW
            if (i % print_loss_after == 0):
                loss = model.calculate_mean_loss(X_train, y_train)
                losses.append(loss)
                print "Loss after epoch=%d i=%d: %f" % (epoch, i,loss)
        # Adjust the learning rate
        if(epoch % anneal_after == 0):
            learning_rate = learning_rate * anneal_factor


In [50]:
np.random.seed(10)
model = RNNNumpy(vocabulary_size, 100, bptt_truncate=4)
losses = train_rnn_with_sgd(model, X_train_onehot[:100], y_train[:100], nepoch=100, learning_rate=0.005, anneal_after=5)

Loss after epoch=0 i=0: 7.271489
Loss after epoch=1 i=0: 5.074434
Loss after epoch=2 i=0: 4.864124


KeyboardInterrupt: 

In [103]:
# How long would it take to train a model
np.random.seed(10)
model = RNNNumpy(vocabulary_size, 100, bptt_truncate=4)
x = %timeit -o model.bptt(X_train_onehot[3], y_train[3])
print "Total training time: %d hours" % (x.best * len(y_train) * 40 / (3600))

1 loops, best of 3: 187 ms per loop
Total training time: 31 hours


In [16]:
class RNNTheano:
    
    def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):
        # Initialize the parameters
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        # TODO: What about the bias!?
        U = 0.1 * np.random.randn(hidden_dim, word_dim)
        W = 0.1 * np.random.randn(hidden_dim, hidden_dim)
        V = 0.1 * np.random.randn(word_dim, hidden_dim)
        self.params = {}
        self.params['U'] = theano.shared(name='U', value=U.astype(theano.config.floatX))
        self.params['V'] = theano.shared(name='V', value=V.astype(theano.config.floatX))
        self.params['W'] = theano.shared(name='W', value=W.astype(theano.config.floatX))
        self.comp = self.build_computation_graph()
    
    def build_computation_graph(self):
        # Define Variables
        U, V, W = self.params['U'], self.params['V'], self.params['W']
        x_t = T.vector('x_t', theano.config.floatX)
        s_t_prev = T.vector('s_t_prev', theano.config.floatX)
        s_initial = T.vector('s_initial', theano.config.floatX)
        words = T.matrix('x', theano.config.floatX)
        # All the hidden states and outputs
        s = T.matrix('s')
        o = T.matrix('o')
        # A function that returns the hidden states and outputs
        def forward_prop_step(x_t, s_t_prev, o_t_prev, U, V, W):
            s_t = T.tanh(U * x_t + W * s_t_prev)
            o_t = T.nnet.softmax(V * s_t)
            return [s_t, o_t]
        ([s_vals, o_vals], updates) = theano.scan(
            forward_prop_step,
            sequences=words,
            outputs_info=[s, o],
            non_sequences=[U, V, W],
            strict=True
        )
        forward_prop = theano.function([s, o, words], [s_vals, o_vals])
        return {
            'forward_prop': forward_prop
        }
    
    def forward_propagation(self, x):
        s_initial = np.zeros((len(x)+1, self.hidden_dim))
        o_initial = np.zeros((len(x)+1, self.word_dim))
        s, o = self.comp['forward_prop'](s_initial, o_initial, x)
        return [s, o]
    
    def calculate_loss(self, X, y):
        return 0
    
    def predict(self):
        # TODO
        return nil
    
    def backpropagation(self):
        # TODO
        return nil

model = RNNTheano(vocabulary_size, 50)

In [18]:
s, o = model.forward_propagation(X_train_onehot[3])
print o[0]

ValueError: Input dimension mis-match. (input[0].shape[0] = 50, input[2].shape[0] = 108)
Apply node that caused the error: Elemwise{Composite{tanh((i0 + (i1 * i2)))}}(<TensorType(float64, matrix)>, W_copy, s[t-1])
Inputs types: [TensorType(float64, matrix), TensorType(float64, matrix), TensorType(float64, matrix)]

HINT: Use another linker then the c linker to have the inputs shapes and strides printed.
HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.
Apply node that caused the error: forall_inplace,cpu,scan_fn}(Shape_i{0}.0, Elemwise{mul,no_inplace}.0, IncSubtensor{InplaceSet;:int64:}.0, IncSubtensor{InplaceSet;:int64:}.0, W, V)
Inputs types: [TensorType(int64, scalar), TensorType(float64, 3D), TensorType(float64, 3D), TensorType(float64, 3D), TensorType(float64, matrix), TensorType(float64, matrix)]
Inputs shapes: [(), (107, 50, 1000), (107, 108, 50), (107, 108, 1000), (50, 50), (1000, 50)]
Inputs strides: [(), (400000, 8000, 8), (43200, 400, 8), (864000, 8000, 8), (400, 8), (400, 8)]
Inputs values: [array(107), 'not shown', 'not shown', 'not shown', 'not shown', 'not shown']

HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.