From 5e13aa3644e3ac140ab27e1b5a8336d8cb50ec01 Mon Sep 17 00:00:00 2001
From: Denny Britz
Date: Wed, 30 Sep 2015 10:07:05 +0200
Subject: [PATCH] Added RNNTheano class in extra file

---
 rnn_theano.py | 115 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 115 insertions(+)
 create mode 100644 rnn_theano.py

diff --git a/rnn_theano.py b/rnn_theano.py
new file mode 100644
index 0000000..45a193f
--- /dev/null
+++ b/rnn_theano.py
@@ -0,0 +1,115 @@
+import numpy as np
+import theano as theano
+import theano.tensor as T
+from utils import *
+import operator
+
+class RNNTheano:
+
+    def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):
+        # Assign instance variables
+        self.word_dim = word_dim
+        self.hidden_dim = hidden_dim
+        self.bptt_truncate = bptt_truncate
+        # Randomly initialize the network parameters
+        U = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
+        V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))
+        W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, hidden_dim))
+        # Theano: Create shared variables
+        self.U = theano.shared(name='U', value=U.astype(theano.config.floatX))
+        self.V = theano.shared(name='V', value=V.astype(theano.config.floatX))
+        self.W = theano.shared(name='W', value=W.astype(theano.config.floatX))
+        # We store the Theano graph here
+        self.theano = {}
+        self.__theano_build__()
+
+    def __theano_build__(self):
+        U, V, W = self.U, self.V, self.W
+        x = T.ivector('x')
+        y = T.ivector('y')
+        def forward_prop_step(x_t, s_t_prev, U, V, W):
+            s_t = T.tanh(U[:,x_t] + W.dot(s_t_prev))
+            o_t = T.nnet.softmax(V.dot(s_t))
+            return [o_t[0], s_t]
+        [o, s], updates = theano.scan(
+            forward_prop_step,
+            sequences=x,
+            outputs_info=[None, dict(initial=T.zeros(self.hidden_dim))],
+            non_sequences=[U, V, W],
+            truncate_gradient=self.bptt_truncate,
+            strict=True)
+
+        prediction = T.argmax(o, axis=1)
+        o_error = T.sum(T.nnet.categorical_crossentropy(o, y))
+
+        # Gradients
+        dU = T.grad(o_error, U)
+        dV = T.grad(o_error, V)
+        dW = T.grad(o_error, W)
+
+        # Assign functions
+        self.forward_propagation = theano.function([x], o)
+        self.predict = theano.function([x], prediction)
+        self.ce_error = theano.function([x, y], o_error)
+        self.bptt = theano.function([x, y], [dU, dV, dW])
+
+        # SGD
+        learning_rate = T.scalar('learning_rate')
+        self.sgd_step = theano.function([x, y, learning_rate], [],
+            updates=[(self.U, self.U - learning_rate * dU),
+                     (self.V, self.V - learning_rate * dV),
+                     (self.W, self.W - learning_rate * dW)])
+
+    def calculate_total_loss(self, X, Y):
+        return np.sum([self.ce_error(x, y) for x, y in zip(X, Y)])
+
+    def calculate_loss(self, X, Y):
+        # Divide the total loss by the number of words
+        num_words = np.sum([len(y) for y in Y])
+        return self.calculate_total_loss(X, Y)/float(num_words)
+
+
+def gradient_check_theano(model, x, y, h=0.001, error_threshold=0.01):
+    # Overwrite the bptt attribute. We need to backpropagate all the way to get the correct gradient
+    model.bptt_truncate = 1000
+    # Calculate the gradients using backprop
+    bptt_gradients = model.bptt(x, y)
+    # List of all parameters we want to check.
+    model_parameters = ['U', 'V', 'W']
+    # Gradient check for each parameter
+    for pidx, pname in enumerate(model_parameters):
+        # Get the actual parameter value from the model, e.g. model.W
+        parameter_T = operator.attrgetter(pname)(model)
+        parameter = parameter_T.get_value()
+        print "Performing gradient check for parameter %s with size %d." % (pname, np.prod(parameter.shape))
+        # Iterate over each element of the parameter matrix, e.g. (0,0), (0,1), ...
+        it = np.nditer(parameter, flags=['multi_index'], op_flags=['readwrite'])
+        while not it.finished:
+            ix = it.multi_index
+            # Save the original value so we can reset it later
+            original_value = parameter[ix]
+            # Estimate the gradient using (f(x+h) - f(x-h))/(2*h)
+            parameter[ix] = original_value + h
+            parameter_T.set_value(parameter)
+            gradplus = model.calculate_total_loss([x], [y])
+            parameter[ix] = original_value - h
+            parameter_T.set_value(parameter)
+            gradminus = model.calculate_total_loss([x], [y])
+            estimated_gradient = (gradplus - gradminus)/(2*h)
+            parameter[ix] = original_value
+            parameter_T.set_value(parameter)
+            # The gradient for this parameter calculated using backpropagation
+            backprop_gradient = bptt_gradients[pidx][ix]
+            # Calculate the relative error: (|x - y|/(|x| + |y|))
+            relative_error = np.abs(backprop_gradient - estimated_gradient)/(np.abs(backprop_gradient) + np.abs(estimated_gradient))
+            # If the error is too large, fail the gradient check
+            if relative_error > error_threshold:
+                print "Gradient Check ERROR: parameter=%s ix=%s" % (pname, ix)
+                print "+h Loss: %f" % gradplus
+                print "-h Loss: %f" % gradminus
+                print "Estimated_gradient: %f" % estimated_gradient
+                print "Backpropagation gradient: %f" % backprop_gradient
+                print "Relative Error: %f" % relative_error
+                return
+            it.iternext()
+        print "Gradient check for parameter %s passed." % (pname)
\ No newline at end of file
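
For reference, a minimal usage sketch (not part of the patch, so no diff markers) showing how the new RNNTheano class and gradient_check_theano might be exercised. The toy vocabulary size, hidden dimension, and word-index sequences below are illustrative assumptions; the script assumes rnn_theano.py above and the utils module it imports are on the Python path.

    import numpy as np
    from rnn_theano import RNNTheano, gradient_check_theano

    # Small dimensions keep the numerical gradient check fast, since it loops
    # over every element of U, V and W.
    np.random.seed(10)
    toy_vocab_size = 100  # illustrative toy vocabulary size
    model = RNNTheano(toy_vocab_size, hidden_dim=10)

    # One toy example: x holds input word indices, y the same sequence shifted by one.
    x = np.asarray([0, 1, 2, 3], dtype='int32')
    y = np.asarray([1, 2, 3, 4], dtype='int32')

    # Compare the Theano BPTT gradients against centered finite differences.
    gradient_check_theano(model, x, y, h=0.001, error_threshold=0.01)

    # A single SGD step should typically reduce the per-word loss on this example.
    print "Loss before: %f" % model.calculate_loss([x], [y])
    model.sgd_step(x, y, 0.005)
    print "Loss after:  %f" % model.calculate_loss([x], [y])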