# Implement RNN code only with numpy

In [1]:
import numpy as np
import nltk

## Generate Training Data
1. x and y label
2. For word forecast model, 0 stands for SENTENCE_START and 1 stands for SENTENCE_END

In [2]:
## example 1:
# example sentense
# SENTENCE_START what do you think about language processing model, is it good? SENTENCE_END
# full sentence: [0, 51, 27, 16, 10, 856, 53, 25, 34, 69, 12, 13, 43, 41, 1]
# in this example, each number stands for a word, in real problem, normally a vector stands for a word, so the input is a series of vector 
#(also can view as a 2-D array)
x_train = [0, 51, 27, 16, 10, 856, 53, 25, 34, 69, 12, 13, 43, 41]
y_train = [51, 27, 16, 10, 856, 53, 25, 34, 69, 12, 13, 43, 41,1]

In [3]:
## example 2:
# word2vector


[0, 51, 27, 16, 10, 856, 53, 25, 34, 69, 12, 13, 43, 41]

## Process
1. Init stucture and parameters 
2. Forward 
    - Simple RNN
    - LSTM
3. Backpropogation
3. Update parameters
4. Predict


# My implement is a simple RNN with only one hiden layer


remember the forward propagation follow the function: \

$$ s_t =tanh(U^{(sx)}x_t + Ws_{t-1}^{(ss)})$$        
$$ \omicron = softmax(V_{S_t})$$       
lost function: $$J^{(t)}(\theta) = \sum_{i=1}^{|V|}(y_{t_i}^{'}logy_{t_i})$$   
## init function parameter here: \
size of input data (word dimension): x \
size of previous state (size of output dimension): s \
W(ss),U(sx),V(xs)

In this example, the default model is a three layers RNN model with a recurrent hiden layer, and an input, an output layer \
input dimension is **word_dim** \
output dimension is **d** \
number of hiden nodes is **hiden_dim** 

In [None]:
class RNN():
    
    def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):
        # Assign instance variables
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        # Randomly initialize the network parameters
        self.U = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
        self.V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))
        self.W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim,  hidden_dim))
        
## input x and output y (a full list through time s--> states   o--> output)
## input follow function 1,2
    def forward_propagation(self, x):
        # The total number of time steps
        if self.word_dim == 1:
            T = len(x)
        else:
            T = x.shape[0]
    #    print('Input data has %d time steps'%(T))
        # During forward propagation we save all hidden states in s because need them later.
        # We add one additional element for the initial hidden, which we set to 0
        s = np.zeros((T + 1, self.hidden_dim))
        s[-1] = np.zeros(self.hidden_dim)
        # The outputs at each time step. Again, we save them for later.
        o = np.zeros((T, self.word_dim))
        # For each time step...
        for t in np.arange(T):
            # Note that we are indxing U by x[t]. This is the same as multiplying U with a one-hot vector.
            s[t] = np.tanh(self.U[:,x[t]] + self.W.dot(s[t-1]))
            o[t] = softmax(self.V.dot(s[t]))
        return [o, s]
 

    def predict(self, x):
        # Perform forward propagation and return index of the highest score
        o, s = self.forward_propagation(x)
        return np.argmax(o, axis=1)
    
## lost function follow function 3    
    def calculate_total_loss(self, x, y):
        L = 0
        # For each sentence...
        for i in np.arange(len(y)):
            o, s = self.forward_propagation(x[i])
            # We only care about our prediction of the &quot;correct&quot; words
            correct_word_predictions = o[np.arange(len(y[i])), y[i]]
            # Add to the loss based on how off we were
            L += -1 * np.sum(np.log(correct_word_predictions))
        return L

    def calculate_loss(self, x, y):
        # Divide the total loss by the number of training examples
        N = np.sum((len(y_i) for y_i in y))
        return self.calculate_total_loss(x,y)/N
    

    def bptt(self, x, y):
        if self.word_dim == 1:
            T = len(x)
        else:
            T = x.shape[0]
        # Perform forward propagation
        o, s = self.forward_propagation(x)
        # We accumulate the gradients in these variables
        dLdU = np.zeros(self.U.shape)
        dLdV = np.zeros(self.V.shape)
        dLdW = np.zeros(self.W.shape)
        ## delta_o is the output error, o is the forwardprop value (0-1)
        delta_o = o
        delta_o[np.arange(len(y)), y] -= 1.
        # For each output backwards...
        for t in np.arange(T)[::-1]:
            dLdV += np.outer(delta_o[t], s[t].T)
            # Initial delta calculation
            delta_t = self.V.T.dot(delta_o[t]) * (1 - (s[t] ** 2))
            # Backpropagation through time (for at most self.bptt_truncate steps)
            for bptt_step in np.arange(max(0, t-self.bptt_truncate), t+1)[::-1]:
                # print &quot;Backpropagation step t=%d bptt step=%d &quot; % (t, bptt_step)
                dLdW += np.outer(delta_t, s[bptt_step-1])              
                dLdU[:,x[bptt_step]] += delta_t
                # Update delta for next step
                delta_t = self.W.T.dot(delta_t) * (1 - s[bptt_step-1] ** 2)
        return [dLdU, dLdV, dLdW]

## bptt
from the previous function, we know:\
$$ \frac{\partial E_3}{\partial V} = \frac{\partial E_3}{\partial \hat{y_3}} \frac{\partial \hat{y_3}}{\partial V} 
        = (\hat{y_3}-y_3) * s_3$$
$$\frac{\partial E_3}{\partial W} = \sum^3_{k=0}
\frac{\partial E_3}{\partial \hat{y_3}} \frac{\partial \hat{y_3}}{\partial s_3} \frac{\partial s_3}{\partial s_k} \frac{\partial s_k}{\partial W}$$
To calculate from bptt, we want to get: \
$$\frac{\partial L}{\partial U}$$
$$\frac{\partial L}{\partial V} = \sum_{t=0}^T \partial O* s_t^T $$ 
$$\frac{\partial L}{\partial W}$$
And update U,V,W by gradient descent:\
$$ U_{n} = U_{n-1} - \eta * \frac{\partial L}{\partial U_{n-1}}$$
$$ V_{n} = V_{n-1} - \eta * \frac{\partial L}{\partial V_{n-1}}$$
$$ W_{n} = W_{n-1} - \eta * \frac{\partial L}{\partial W_{n-1}}$$

In [25]:
def gradient_check(self, x, y, h=0.001, error_threshold=0.01):
    # Calculate the gradients using backpropagation. We want to checker if these are correct.
    bptt_gradients = self.bptt(x, y)
    # List of all parameters we want to check.
    model_parameters = ['U', 'V', 'W']
    # Gradient check for each parameter
    for pidx, pname in enumerate(model_parameters):
        # Get the actual parameter value from the mode, e.g. model.W
        parameter = operator.attrgetter(pname)(self)
        print &quot;Performing gradient check for parameter %s with size %d.&quot; % (pname, np.prod(parameter.shape))
        # Iterate over each element of the parameter matrix, e.g. (0,0), (0,1), ...
        it = np.nditer(parameter, flags=['multi_index'], op_flags=['readwrite'])
        while not it.finished:
            ix = it.multi_index
            # Save the original value so we can reset it later
            original_value = parameter[ix]
            # Estimate the gradient using (f(x+h) - f(x-h))/(2*h)
            parameter[ix] = original_value + h
            gradplus = self.calculate_total_loss([x],[y])
            parameter[ix] = original_value - h
            gradminus = self.calculate_total_loss([x],[y])
            estimated_gradient = (gradplus - gradminus)/(2*h)
            # Reset parameter to original value
            parameter[ix] = original_value
            # The gradient for this parameter calculated using backpropagation
            backprop_gradient = bptt_gradients[pidx][ix]
            # calculate The relative error: (|x - y|/(|x| + |y|))
            relative_error = np.abs(backprop_gradient - estimated_gradient)/(np.abs(backprop_gradient) + np.abs(estimated_gradient))
            # If the error is to large fail the gradient check
            if relative_error &amp;gt; error_threshold:
                print &quot;Gradient Check ERROR: parameter=%s ix=%s&quot; % (pname, ix)
                print &quot;+h Loss: %f&quot; % gradplus
                print &quot;-h Loss: %f&quot; % gradminus
                print &quot;Estimated_gradient: %f&quot; % estimated_gradient
                print &quot;Backpropagation gradient: %f&quot; % backprop_gradient
                print &quot;Relative Error: %f&quot; % relative_error
                return 
            it.iternext()
        print &quot;Gradient check for parameter %s passed.&quot; % (pname)


# To avoid performing millions of expensive calculations we use a smaller vocabulary size for checking.
grad_check_vocab_size = 100
np.random.seed(10)
model = RNNNumpy(grad_check_vocab_size, 10, bptt_truncate=1000)
model.gradient_check([0,1,2,3], [1,2,3,4])

TypeError: unsupported operand type(s) for -: 'list' and 'int'

In [None]:
# Performs one step of SGD.
def numpy_sgd_step(self, x, y, learning_rate):
    # Calculate the gradients
    dLdU, dLdV, dLdW = self.bptt(x, y)
    # Change parameters according to gradients and learning rate
    self.U -= learning_rate * dLdU
    self.V -= learning_rate * dLdV
    self.W -= learning_rate * dLdW



In [None]:
# Outer SGD Loop
# - model: The RNN model instance
# - X_train: The training data set
# - y_train: The training data labels
# - learning_rate: Initial learning rate for SGD
# - nepoch: Number of times to iterate through the complete dataset
# - evaluate_loss_after: Evaluate the loss after this many epochs
def train_with_sgd(model, X_train, y_train, learning_rate=0.005, nepoch=100, evaluate_loss_after=5):
    # We keep track of the losses so we can plot them later
    losses = []
    num_examples_seen = 0
    for epoch in range(nepoch):
        # Optionally evaluate the loss
        if (epoch % evaluate_loss_after == 0):
            loss = model.calculate_loss(X_train, y_train)
            losses.append((num_examples_seen, loss))
            time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print &quot;%s: Loss after num_examples_seen=%d epoch=%d: %f&quot; % (time, num_examples_seen, epoch, loss)
            # Adjust the learning rate if loss increases
            if (len(losses) &amp;gt; 1 and losses[-1][1] &amp;gt; losses[-2][1]):
                learning_rate = learning_rate * 0.5  
                print &quot;Setting learning rate to %f&quot; % learning_rate
            sys.stdout.flush()
        # For each training example...
        for i in range(len(y_train)):
            # One SGD step
            model.sgd_step(X_train[i], y_train[i], learning_rate)
            num_examples_seen += 1

In [None]:


# GRU Layer 1
z_t1 = T.nnet.hard_sigmoid(U[0].dot(x_e) + W[0].dot(s_t1_prev) + b[0])
r_t1 = T.nnet.hard_sigmoid(U[1].dot(x_e) + W[1].dot(s_t1_prev) + b[1])
c_t1 = T.tanh(U[2].dot(x_e) + W[2].dot(s_t1_prev * r_t1) + b[2])
s_t1 = (T.ones_like(z_t1) - z_t1) * c_t1 + z_t1 * s_t1_prev
 
# GRU Layer 2
z_t2 = T.nnet.hard_sigmoid(U[3].dot(s_t1) + W[3].dot(s_t2_prev) + b[3])
r_t2 = T.nnet.hard_sigmoid(U[4].dot(s_t1) + W[4].dot(s_t2_prev) + b[4])
c_t2 = T.tanh(U[5].dot(s_t1) + W[5].dot(s_t2_prev * r_t2) + b[5])
s_t2 = (T.ones_like(z_t2) - z_t2) * c_t2 + z_t2 * s_t2_prev