# Implementation from Scratch

<br />

I am going to implement an RNN from scratch.

### Initialization Method

In [1]:
class SimpleInitializer:
    """
    Draw initial parameters from a zero-mean Gaussian scaled by ``sigma``

    Parameters
    ----------
    sigma : float
        Standard deviation of the Gaussian distribution
    """

    def __init__(self, sigma):
        self.sigma = sigma

    def W(self, m, n):
        """
        Create an initial weight matrix

        Parameters
        ----------
        m : int
            Number of rows (features/nodes feeding the layer)

        n : int
            Number of columns (nodes of the layer)

        Returns
        ----------
        ndarray, shape (m, n), dtype float32
            Weight
        """
        return self._sample(m, n)

    def B(self, n):
        """
        Create an initial bias

        Parameters
        ----------
        n : int
            Number of bias values to draw

        Returns
        ----------
        ndarray, shape (n, 1), dtype float32
            Bias (returned as a column)
        """
        return self._sample(n, 1)

    def _sample(self, *shape):
        """Return ``sigma``-scaled standard-normal samples of ``shape`` as float32."""
        samples = np.random.randn(*shape)
        scaled = self.sigma * samples
        return scaled.astype(np.float32)

### Activation Function

In [2]:
class Tanh:
    """
    Hyperbolic tangent activation layer

    Attributes
    ----------
    Z : ndarray or None
        Output of the most recent ``forward`` call. It is kept so that
        ``backward`` can evaluate the derivative ``1 - tanh(A)**2`` without
        recomputing the activation. ``None`` until ``forward`` has run.
    """

    def __init__(self):
        self.Z = None

    def forward(self, A):
        """
        Forward propagation

        Parameters
        ----------
        A : ndarray
            Pre-activation input

        Returns
        -------
        ndarray, same shape as A
            State/output, ``tanh(A)``
        """
        # Cache the output: d tanh(A) / dA = 1 - tanh(A)**2 only needs Z.
        self.Z = np.tanh(A)

        return self.Z

    def backward(self, dA):
        """
        Back propagation

        Parameters
        ----------
        dA : ndarray, same shape as the last forward output
            Gradient given from the following layer

        Returns
        -------
        ndarray, same shape as dA
            Gradient with respect to the input of the last ``forward`` call

        Raises
        ------
        RuntimeError
            If ``forward`` has not been called yet
        """
        if self.Z is None:
            raise RuntimeError("forward() must be called before backward()")

        return (1 - self.Z ** 2) * dA

## [Task 1] Implement Forward Propagation of SimpleRNN

### Forward Propagation

<br />

$$
a_t = x_{t}\cdot W_{x} + h_{t-1}\cdot W_{h} + b\\
h_t = \tanh(a_t)
$$

<br />

$a_t$ : State before passing an activation function at time $t$

$h_t$ : State/output at time $t$

$x_t$ : Input at time $t$

$W_x$ : Weight for input

$h_{t-1}$ : State at time $t-1$ (Forward propagation from a previous time)

$W_h$ : Weight for state

$b$ : Bias term

## [Task 3] Implement Back Propagation

### Update Equations

<br />

$$
W_x^{\prime} = W_x - \alpha E(\frac{\partial L}{\partial W_x}) \\
W_h^{\prime} = W_h - \alpha E(\frac{\partial L}{\partial W_h}) \\
b^{\prime} = b - \alpha E(\frac{\partial L}{\partial b})
$$

<br />

$\alpha$ : Learning rate

$\frac{\partial L}{\partial W_x}$ : Gradient of the loss $L$ with respect to $W_x$

$\frac{\partial L}{\partial W_h}$ : Gradient of the loss $L$ with respect to $W_h$

$\frac{\partial L}{\partial b}$ : Gradient of the loss $L$ with respect to $b$

$E()$ : Mean taken over the mini-batch axis

### Back Propagation to Compute Gradients

<br />

$$
\frac{\partial h_t}{\partial a_t} = \frac{\partial L}{\partial h_t} \times (1-\tanh^2(a_t))
$$

$$
\frac{\partial L}{\partial b} = \frac{\partial h_t}{\partial a_t}
$$

$$
\frac{\partial L}{\partial W_x} = x_{t}^{T}\cdot \frac{\partial h_t}{\partial a_t}
$$

$$
\frac{\partial L}{\partial W_h} = h_{t-1}^{T}\cdot \frac{\partial h_t}{\partial a_t}
$$

<br />

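$\frac{\partial L}{\partial h_t}$ : Sum of the error flowing back through the state from the following time step and the error coming from the output at time $t$. Note that in these equations $\frac{\partial h_t}{\partial a_t}$ is used as shorthand for the gradient of the loss with respect to $a_t$, i.e. $\frac{\partial L}{\partial a_t}$.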

### Equations of Errors Given to a Previous Time/Layer

<br />

$$
\frac{\partial L}{\partial h_{t-1}} = \frac{\partial h_t}{\partial a_t}\cdot W_{h}^{T}
$$

$$
\frac{\partial L}{\partial x_{t}} = \frac{\partial h_t}{\partial a_t}\cdot W_{x}^{T}
$$

In [3]:
class SimpleRNN:
    """
    Single recurrent layer with a tanh activation

    Parameters
    ----------
    batch_size : int
        Size of a batch

    n_sequences : int
        Number of time steps in one sequence

    n_features : int
        Number of features

    n_nodes : int
        Number of nodes

    initializer : Instance
        Instance of initialization method providing ``W(m, n)`` and ``B(n)``

    lr : float, default 0.01
        Learning rate (alpha in the update equations) applied by ``backward``


    Attributes
    ----------
    W_input : ndarray, shape (n_features, n_nodes)
        Weight for input

    W_state : ndarray, shape (n_nodes, n_nodes)
        Weight for state

    B : ndarray
        Bias, created with ``initializer.B(1)``: one value shared by all nodes

    a : list of ndarray
        Pre-activation values, one entry appended per ``forward`` call

    h : ndarray, shape (batch_size, n_nodes)
        Current state; all zeros before the first ``forward`` call

    dW_input, dW_state, dB : ndarray
        Batch-averaged gradients from the most recent ``backward`` call
    """

    def __init__(self, batch_size, n_sequences, n_features, n_nodes, initializer, lr=0.01):
        self.batch_size = batch_size
        self.n_sequences = n_sequences
        self.n_features = n_features
        self.n_nodes = n_nodes
        self.initializer = initializer
        self.lr = lr

        # Initialize weights and biases by using initializer methods
        self.W_input = self.initializer.W(self.n_features, self.n_nodes)
        self.W_state = self.initializer.W(self.n_nodes, self.n_nodes)
        self.B = self.initializer.B(1)

        self.a = []
        self.h = np.zeros((batch_size, n_nodes))

    def forward(self, X):
        """
        Forward propagation for one time step

        Computes ``a_t = x_t . W_x + h_{t-1} . W_h + b`` and
        ``h_t = tanh(a_t)``, stores ``a_t`` in ``self.a`` and replaces
        ``self.h`` with ``h_t``.

        Parameters
        ----------
        X : ndarray, shape (batch_size, n_features) or (n_features,)
            Input at the current time step

        Returns
        ----------
        ndarray, shape (batch_size, n_nodes)
            State/Output
        """
        a = np.dot(X, self.W_input) + np.dot(self.h, self.W_state) + self.B
        self.a.append(a)

        self.h = np.tanh(a)

        return self.h

    def backward(self, X, a, state, previous_state, output):
        """
        Backward propagation for one time step

        Computes the gradients, applies the update
        ``param <- param - lr * E(gradient)`` (``E`` is the mean over the
        mini-batch) and returns the errors for the previous time step and
        the previous layer.

        Parameters
        ----------
        X : ndarray, shape (batch_size, n_features) or (n_features,)
            Input at time t

        a : ndarray, shape (batch_size, n_nodes)
            Pre-activation value at time t (as stored by ``forward``)

        state : ndarray, shape (batch_size, n_nodes)
            Error flowing back through the state

        previous_state : ndarray, shape (batch_size, n_nodes)
            State h_{t-1} that was fed into time t

        output : ndarray, shape (batch_size, n_nodes)
            Error coming from the output at time t

        Returns
        ----------
        d_previous_state : ndarray, shape (batch_size, n_nodes)
            Error given to the previous time step

        dX : ndarray, shape (batch_size, n_features)
            Error given to the previous layer
        """
        # Promote 1-D inputs to a batch of one so the transposes below work.
        X = np.atleast_2d(X)
        previous_state = np.atleast_2d(previous_state)

        # Gradient of the loss w.r.t. the pre-activation a_t
        delta = np.atleast_2d((state + output) * (1 - np.tanh(a) ** 2))
        n_samples = delta.shape[0]

        # Errors to hand back must use the weights from the forward pass,
        # so compute them before the parameters are updated.
        d_previous_state = np.dot(delta, self.W_state.T)
        dX = np.dot(delta, self.W_input.T)

        # np.dot sums over the batch axis; dividing gives the batch mean E().
        self.dW_input = np.dot(X.T, delta) / n_samples
        self.dW_state = np.dot(previous_state.T, delta) / n_samples
        dB = delta.mean(axis=0)
        if np.size(self.B) == 1:
            # One bias shared by every node: the per-node gradients add up.
            dB = dB.sum()
        self.dB = dB

        # Update
        self.W_input = self.W_input - self.lr * self.dW_input
        self.W_state = self.W_state - self.lr * self.dW_state
        self.B = self.B - self.lr * self.dB

        return d_previous_state, dX

## [Task 2] Validate Forward Propagation by Using Small Array

In [4]:
import numpy as np

In [5]:
# Toy problem used to validate forward propagation

# Inputs: one sample, three time steps, two features (all values scaled by 1/100)
x = np.array([[[1, 2], [2, 3], [3, 4]]]) / 100
w_x = np.array([
    [1, 3, 5, 7],
    [3, 5, 7, 8],
]) / 100
w_h = np.array([
    [1, 3, 5, 7],
    [2, 4, 6, 8],
    [3, 5, 7, 8],
    [4, 6, 8, 10],
]) / 100
batch_size, n_sequences, n_features = x.shape  # 1, 3, 2
n_nodes = w_x.shape[1]  # 4
h = np.zeros((batch_size, n_nodes))  # initial state
b = np.array([1])

# Expected state after the last time step
correct_h = np.array([[0.79494228, 0.81839002, 0.83939649, 0.85584174]])

In [6]:
# Definition of forward propagation

def forward(x, w_x, w_h, h, b):
    """
    Advance the recurrent state by one time step

    Parameters
    ----------
    x : ndarray
        Input at the current time step

    w_x : ndarray
        Weight applied to the input

    w_h : ndarray
        Weight applied to the previous state

    h : ndarray
        State from the previous time step

    b : ndarray
        Bias term

    Returns
    ----------
    ndarray
        New state/output, tanh of (x . w_x + h . w_h + b)
    """
    input_term = np.dot(x, w_x)
    state_term = np.dot(h, w_h)
    pre_activation = input_term + state_term + b

    return Tanh().forward(pre_activation)

In [7]:
# Forward propagation: feed the time steps of the first sample one at a time,
# carrying the state h from each step into the next

for x_t in x[0]:
    h = forward(x_t, w_x, w_h, h, b)
    print(h)

[[0.76188798 0.76213958 0.76239095 0.76255841]]
[[0.792209   0.8141834  0.83404912 0.84977719]]
[[0.79494228 0.81839002 0.83939649 0.85584174]]


In [8]:
# Check: print the computed final state next to the expected one so they can
# be compared by eye

print(h)
print(correct_h)

[[0.79494228 0.81839002 0.83939649 0.85584174]]
[[0.79494228 0.81839002 0.83939649 0.85584174]]
