# Implementation from Scratch

<br />

I am going to implement a recurrent neural network (RNN) from scratch.

### Fully Connected Layer

In [32]:
class FC:
    """
    Fully connected layer

    Parameters
    ----------
    n_nodes1 : int
        Number of nodes of the previous layer

    n_nodes2 : int
        Number of nodes of the following layer

    initializer : Instance
        Instance of initialization method (must provide ``W(m, n)`` and ``B(n)``)

    optimizer : Instance
        Instance of optimisation method (must provide ``update(layer)``)

    Attributes
    ----------
    W : ndarray, shape (n_nodes1, n_nodes2)
        Weight

    B : ndarray, shape (n_nodes2,)
        Bias

    Z : ndarray, shape (batch_size, n_nodes1)
        Copy of the most recent input given to ``forward``

    dW : ndarray, shape (n_nodes1, n_nodes2)
        Gradient of weight, averaged over the mini-batch

    dB : ndarray, shape (n_nodes2,)
        Gradient of bias, averaged over the mini-batch
    """

    def __init__(self, n_nodes1, n_nodes2, initializer, optimizer):
        self.n_nodes1 = n_nodes1
        self.n_nodes2 = n_nodes2
        self.initializer = initializer
        self.optimizer = optimizer

        # Initialize self.W and self.B by using initializer method
        self.W = self.initializer.W(self.n_nodes1, self.n_nodes2)
        # Flatten the bias to (n_nodes2,): a column-shaped bias would broadcast
        # against the (batch_size, n_nodes2) activations along the wrong axis.
        self.B = np.asarray(self.initializer.B(self.n_nodes2)).reshape(-1)

        self.Z = 0
        self.dW = 0
        self.dB = 0


    def forward(self, X):
        """
        Forwardpropagation

        Parameters
        ----------
        X : ndarray, shape (batch_size, n_nodes1)
            Input

        Returns
        ----------
        ndarray, shape (batch_size, n_nodes2)
            Output
        """

        # Keep a private copy so later in-place changes to X by the caller
        # cannot corrupt the gradient computed in backward().
        self.Z = np.array(X, copy=True)

        return np.dot(X, self.W) + self.B


    def backward(self, dA):
        """
        Backwardpropagation

        Computes the mini-batch averaged gradients, lets the optimizer update
        this layer, and returns the gradient with respect to the input.

        Parameters
        ----------
        dA : ndarray, shape (batch_size, n_nodes2)
            Gradient given from the following layer

        Returns
        ----------
        dZ : ndarray, shape (batch_size, n_nodes1)
            Gradient given to the previous layer
        """

        n_samples = dA.shape[0]

        # Average over the batch axis only, so every output node keeps its own
        # bias gradient (a global average would collapse it to one scalar).
        self.dB = np.sum(dA, axis=0) / n_samples
        self.dW = np.dot(self.Z.T, dA) / n_samples

        # Computed before the update so it uses the weights of the forward pass
        dZ = np.dot(dA, self.W.T)

        # Update
        self.optimizer.update(self)

        return dZ

### Initialization Method

In [33]:
class SimpleInitializer:
    """
    Simple initialization by Gaussian distribution

    Parameters
    ----------
    sigma : float
        Standard deviation of Gaussian distribution
    """

    def __init__(self, sigma):
        self.sigma = sigma


    def W(self, m, n):
        """
        Initialization of weights

        Parameters
        ----------
        m : int
            Number of features/nodes

        n : int
            Number of nodes

        Returns
        ----------
        W : ndarray, shape (m, n)
            Weight (float32), drawn from N(0, sigma^2)
        """

        return (self.sigma * np.random.randn(m, n)).astype("f")


    def B(self, n):
        """
        Initialization of biases

        Parameters
        ----------
        n : int
            Number of nodes

        Returns
        ----------
        B : ndarray, shape (n,)
            Bias (float32), drawn from N(0, sigma^2)
        """

        # 1-D on purpose: a (n, 1) column would broadcast against
        # (batch_size, n) activations along the wrong axis.
        return (self.sigma * np.random.randn(n)).astype("f")

### Optimization Method

In [34]:
class SGD_FC:
    """
    Plain stochastic gradient descent step for a fully connected layer.

    Parameters
    ----------
    lr : float
        Learning rate
    """

    def __init__(self, lr):
        self.lr = lr


    def update(self, layer):
        """
        Apply one gradient step to the weight and bias of ``layer``.

        Parameters
        ----------
        layer : Instance
            Layer exposing ``W``/``dW`` and ``B``/``dB``

        Returns
        ----------
        layer : Instance
            The same layer object, after the update
        """

        step = self.lr

        # Weight first, then bias; each parameter is paired with its gradient.
        for param_name, grad_name in (("W", "dW"), ("B", "dB")):
            value = getattr(layer, param_name)
            value -= step * getattr(layer, grad_name)
            setattr(layer, param_name, value)

        return layer

In [35]:
class SGD_RNN:
    """
    Stochastic Gradient Descent for RNN

    Parameters
    ----------
    lr : float
        Learning rate
    """

    def __init__(self, lr):
        self.lr = lr


    def update(self, layer):
        """
        Update the input weight, state weight and bias of an RNN layer.

        Each parameter is stepped with its own gradient. The attribute names
        follow the SimpleRNN layer: ``W_input``/``dW_input``,
        ``W_state``/``dW_state`` and ``B_RNN``/``dB_RNN``.

        Parameters
        ----------
        layer : Instance
            Instance of preupdated layer

        Returns
        ----------
        layer : Instance
            Instance of updated layer (the same object)
        """

        # W_input and W_state have different shapes, so they cannot share
        # a single gradient array.
        layer.W_input -= self.lr * layer.dW_input
        layer.W_state -= self.lr * layer.dW_state
        layer.B_RNN -= self.lr * layer.dB_RNN

        return layer

### Activation Function

In [36]:
class Tanh:
    """Hyperbolic tangent activation that caches its input for back propagation."""

    def forward(self, A):
        """
        Forward propagation

        Parameters
        ----------
        A : ndarray
            Pre-activation values; stored on the instance as ``self.A``

        Returns
        -------
        ndarray, same shape as ``A``
            ``tanh(A)`` applied element-wise
        """

        self.A = A
        return np.tanh(A)


    def backward(self, dA):
        """
        Back propagation

        Parameters
        ----------
        dA : ndarray, broadcastable against the cached input
            Gradient with respect to the output of this activation

        Returns
        -------
        ndarray
            ``dA * (1 - tanh(A)**2)`` for the input ``A`` cached by ``forward``
        """

        # Recompute the activation from the cached input rather than storing it
        activated = np.tanh(self.A)
        local_slope = 1 - activated**2

        return dA * local_slope

In [37]:
class Softmax:
    """
    Row-wise softmax activation.

    Attributes
    ----------
    Z : ndarray, shape (batch_size, n_nodes)
        Output of the most recent ``forward`` call (``None`` before the first call)
    """

    def __init__(self):
        self.Z = None


    def forward(self, A):
        """
        Forward propagation

        Parameters
        ----------
        A : ndarray, shape (batch_size, n_nodes)
            Pre-activation scores. The array is not modified.

        Returns
        -------
        ndarray, shape (batch_size, n_nodes)
            Probabilities; every row sums to 1
        """

        A = np.asarray(A)

        # Subtract each row's own maximum into a new array: this keeps exp()
        # from overflowing, leaves the caller's array untouched, and avoids
        # 0/0 for rows that lie far below the global maximum.
        shifted = A - np.max(A, axis=1, keepdims=True)
        exp_scores = np.exp(shifted)

        Z = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

        self.Z = Z

        return Z


    def backward(self, y):
        """
        Backward propagation

        Parameters
        ----------
        y : ndarray
            Correct values; must broadcast against the cached output ``Z``
            (assumes one-hot rows of shape (batch_size, n_nodes) - TODO confirm)

        Returns
        -------
        ndarray, shape (batch_size, n_nodes)
            ``Z - y``, the difference between predicted probabilities and targets
        """

        return self.Z - y

### Mini-Batch Processing

In [38]:
class GetMiniBatch:
    """
    Iterator to get a mini-batch

    The data is shuffled once at construction time. Note that this reseeds
    NumPy's global random generator with ``seed``.

    Parameters
    ----------
    X : ndarray, shape (n_sequences, n_features)
      Train dataset

    y : ndarray, shape (n_sequences, 1)
      Correct values

    batch_size : int
      Size of batch

    seed : int
      Seed of random numbers of Numpy
    """

    def __init__(self, X, y, batch_size=10, seed=0):
        self.batch_size = batch_size
        np.random.seed(seed)
        shuffle_index = np.random.permutation(np.arange(X.shape[0]))
        self.X = X[shuffle_index]
        self.y = y[shuffle_index]
        # Built-in int: the ``np.int`` alias was removed in NumPy 1.24
        self._stop = int(np.ceil(X.shape[0] / self.batch_size))
        # Defined up front so next() works even before iter() is called
        self._counter = 0


    def _batch(self, index):
        """Return the (X, y) slices of mini-batch number ``index``."""
        p0 = index * self.batch_size
        p1 = p0 + self.batch_size
        return self.X[p0:p1], self.y[p0:p1]


    def __len__(self):
        return self._stop


    def __getitem__(self, item):
        return self._batch(item)


    def __iter__(self):
        self._counter = 0
        return self


    def __next__(self):
        if self._counter >= self._stop:
            raise StopIteration()

        batch = self._batch(self._counter)
        self._counter += 1

        return batch

### Loss

In [39]:
class Loss:
    """Container for loss functions."""

    def cross_entropy_loss(self, y, y_pred):
        """
        Per-sample cross entropy error

        Parameters
        ----------
        y : ndarray, shape (n_samples, n_classes)
            Correct values, used as weights on the log-probabilities

        y_pred : ndarray, shape (n_samples, n_classes)
            Predicted values

        Returns
        -------
        ndarray, shape (n_samples,)
            Cross entropy error of each sample (summed along axis 1)
        """

        # The 1e-10 offset keeps log() finite when a prediction is exactly 0
        log_pred = np.log(y_pred + 1e-10)
        weighted = -1 * y * log_pred

        return np.sum(weighted, axis=1)

## [Task 1] Implement Forward Propagation of SimpleRNN

### Forward Propagation

<br />

$$
a_t = x_{t}\cdot W_{x} + h_{t-1}\cdot W_{h} + b
$$

$$
h_t = \tanh(a_t)
$$

<br />

$a_t$ : State before passing an activation function at time $t$

$h_t$ : State/output at time $t$

$x_t$ : Input at time $t$

$W_x$ : Weight for input

$h_{t-1}$ : State at time $t-1$ (Forward propagation from a previous time)

$W_h$ : Weight for state

$b$ : Bias term

## [Task 3] Implement Back Propagation

### Update Equations

<br />

$$
W_x^{\prime} = W_x - \alpha E(\frac{\partial L}{\partial W_x})
$$

$$
W_h^{\prime} = W_h - \alpha E(\frac{\partial L}{\partial W_h})
$$

$$
b^{\prime} = b - \alpha E(\frac{\partial L}{\partial b})
$$

<br />

$\alpha$ : Learning rate

$\frac{\partial L}{\partial W_x}$ : Gradient of the loss $L$ with respect to $W_x$

$\frac{\partial L}{\partial W_h}$ : Gradient of the loss $L$ with respect to $W_h$

$\frac{\partial L}{\partial b}$ : Gradient of the loss $L$ with respect to $b$

$E()$ : Mean taken over the mini-batch (i.e. along the batch axis)

### Back Propagation to Compute Gradients

<br />

$$
\frac{\partial h_t}{\partial a_t} = \frac{\partial L}{\partial h_t} \times (1-\tanh^2(a_t))
$$

$$
\frac{\partial L}{\partial b} = \frac{\partial h_t}{\partial a_t}
$$

$$
\frac{\partial L}{\partial W_x} = x_{t}^{T}\cdot \frac{\partial h_t}{\partial a_t}
$$

$$
\frac{\partial L}{\partial W_h} = h_{t-1}^{T}\cdot \frac{\partial h_t}{\partial a_t}
$$

<br />

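$\frac{\partial L}{\partial h_t}$ : Sum of the error of the state and the error of the output at a previous time

Note: in the equations above, the symbol $\frac{\partial h_t}{\partial a_t}$ stands for the error propagated back to $a_t$. By the chain rule, $\frac{\partial L}{\partial h_t} \times (1-\tanh^2(a_t))$ is the gradient of the loss with respect to $a_t$, i.e. $\frac{\partial L}{\partial a_t}$.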

### Equations of Errors Given to a Previous Time/Layer

<br />

$$
\frac{\partial L}{\partial h_{t-1}} = \frac{\partial h_t}{\partial a_t}\cdot W_{h}^{T}
$$

$$
\frac{\partial L}{\partial x_{t}} = \frac{\partial h_t}{\partial a_t}\cdot W_{x}^{T}
$$

In [40]:
class SimpleRNN:
    """
    Simple recurrent layer that processes one time step per ``forward`` call.

    The hidden state is kept on the instance between calls, so consecutive
    ``forward`` calls walk through a sequence.

    Parameters
    ----------
    batch_size : int
        Size of a batch

    n_sequences : int
        Number of time steps (stored only; not used by the computations here)

    n_features : int
        Number of features

    n_nodes : int
        Number of nodes

    initializer : Instance
        Instance of initialization method

    optimizer : Instance
        Instance of optimisation method

    Attributes
    ----------
    W_input : ndarray, shape (n_features, n_nodes)
        Weight for input

    W_state : ndarray, shape (n_nodes, n_nodes)
        Weight for state

    B_RNN : ndarray
        Bias, created with ``initializer.B(1)`` and shared by all nodes

    h : ndarray, shape (batch_size, n_nodes)
        Current state/output

    h_prev : ndarray, shape (batch_size, n_nodes)
        State that was fed into the most recent ``forward`` call

    a : list of ndarray
        Pre-activation values of every ``forward`` call so far

    dW_input, dW_state, dB_RNN : ndarray
        Gradients of the most recent ``backward`` call, averaged over the batch

    d_state : ndarray, shape (batch_size, n_nodes)
        Error to hand to the previous time step

    d_input : ndarray, shape (batch_size, n_features)
        Error to hand to the previous layer
    """

    def __init__(self, batch_size, n_sequences, n_features, n_nodes, initializer, optimizer):
        self.batch_size = batch_size
        self.n_sequences = n_sequences
        self.n_features = n_features
        self.n_nodes = n_nodes
        self.initializer = initializer
        self.optimizer = optimizer

        # Initialize weights and biases by using initializer methods
        self.W_input = self.initializer.W(self.n_features, self.n_nodes)
        self.W_state = self.initializer.W(self.n_nodes, self.n_nodes)
        self.B_RNN = self.initializer.B(1)

        self.X_save = 0
        self.a = []
        self.h = np.zeros((batch_size, n_nodes))
        self.h_prev = self.h
        self.dW_input = 0
        self.dW_state = 0
        self.dB_RNN = 0
        self.d_state = 0
        self.d_input = 0


    def forward(self, X):
        """
        Forward propagation of a single time step

        Computes ``a_t = x_t . W_x + h_{t-1} . W_h + b`` and ``h_t = tanh(a_t)``.

        Parameters
        ----------
        X : ndarray, shape (batch_size, n_features) or (n_features,)
            Input at the current time step

        Returns
        ----------
        ndarray, shape (batch_size, n_nodes)
            State/Output
        """

        # Promote a single 1-D sample to a one-row batch so the transposes in
        # backward() behave as matrix transposes.
        X = np.atleast_2d(X)
        self.X_save = X

        # Remember h_{t-1}: the gradient of W_state needs it
        self.h_prev = self.h

        A = np.dot(X, self.W_input) + np.dot(self.h_prev, self.W_state) + self.B_RNN
        self.a.append(A)

        self.h = np.tanh(A)

        return self.h


    def backward(self, dA):
        """
        Backward propagation of the most recent time step

        Parameters
        ----------
        dA : ndarray, shape (batch_size, n_nodes)
            Gradient with respect to the state/output ``h`` of the latest
            ``forward`` call (sum of the state error and the output error)

        Returns
        ----------
        ndarray, shape (batch_size, n_features)
            Gradient given to the previous layer (also stored as ``d_input``;
            the error for the previous time step is stored as ``d_state``)
        """

        # Activation function: d tanh(a) / da = 1 - tanh(a)^2 = 1 - h^2
        d_tanh = dA * (1 - self.h ** 2)
        n_samples = d_tanh.shape[0]

        # Gradients are averaged over the mini-batch, as in the FC layer.
        # The bias is one value shared by all nodes, so its gradient sums over
        # the nodes and is shaped like the bias itself.
        self.dB_RNN = np.full(np.shape(self.B_RNN), np.sum(d_tanh) / n_samples)
        self.dW_input = np.dot(self.X_save.T, d_tanh) / n_samples
        self.dW_state = np.dot(self.h_prev.T, d_tanh) / n_samples

        # Errors are computed before the update so they use the forward-pass weights
        self.d_state = np.dot(d_tanh, self.W_state.T)
        self.d_input = np.dot(d_tanh, self.W_input.T)

        # Update
        self.optimizer.update(self)

        return self.d_input

## [Task 2] Validate Forward Propagation by Using Small Array

In [49]:
import numpy as np
import pandas as pd

In [42]:
# Settings

# Input: a single sample made of three time steps with two features each
x = np.array([[[1, 2], [2, 3], [3, 4]]]) / 100
w_x = np.array([[1, 3, 5, 7], [3, 5, 7, 8]]) / 100
w_h = np.array([[1, 3, 5, 7], [2, 4, 6, 8], [3, 5, 7, 8], [4, 6, 8, 10]]) / 100

# Dimensions are read off the arrays: x is (1, 3, 2) and w_x has 4 columns
batch_size, n_sequences, n_features = x.shape
n_nodes = w_x.shape[1]

# Initial state and bias
h = np.zeros((batch_size, n_nodes))
b = np.array([1])

# Output: expected state after the last time step
correct_h = np.array([[0.79494228, 0.81839002, 0.83939649, 0.85584174]])

In [43]:
# Definition of forward propagation

def forward(x, w_x, w_h, h, b):
    """
    Forward propagation of one time step

    Parameters
    ----------
    x : ndarray
        Input at the current time step

    w_x : ndarray
        Weight for input

    w_h : ndarray
        Weight for state

    h : ndarray
        State from the previous time step

    b : ndarray
        Bias term

    Returns
    ----------
    ndarray
        New state/output, ``tanh(x . w_x + h . w_h + b)``
    """

    pre_activation = np.dot(x, w_x) + np.dot(h, w_h) + b

    return Tanh().forward(pre_activation)

In [44]:
# Forward propagation: feed the time steps of the first sample one by one,
# carrying the state h from step to step

for i in range(n_sequences):
    h = forward(x[0, i], w_x, w_h, h, b)
    print(h)

[[0.76188798 0.76213958 0.76239095 0.76255841]]
[[0.792209   0.8141834  0.83404912 0.84977719]]
[[0.79494228 0.81839002 0.83939649 0.85584174]]


In [45]:
# Check: the final state should match the expected values line for line

print(h, correct_h, sep="\n")

[[0.79494228 0.81839002 0.83939649 0.85584174]]
[[0.79494228 0.81839002 0.83939649 0.85584174]]


## [Task 4] Fit and Predict Dataset

<br />

"IMDB Review Dataset" on Kaggle


https://www.kaggle.com/utathya/imdb-review-dataset

In [51]:
import sys
import warnings

warnings.filterwarnings('ignore')

# Bind `reload` from the module that provides it on the running interpreter:
# `imp` no longer exists on Python 3.12+, and `importlib.reload` does not
# exist on Python 2.
if sys.version_info[0] == 2:
    from imp import reload
    # Python 2 only: re-expose and set the default string encoding
    reload(sys)
    sys.setdefaultencoding("utf-8")
else:
    from importlib import reload

In [52]:
# Read dataset

import pandas as pd

# NOTE(review): the ParserError recorded under this cell ("Expected 1 fields in
# line 1933, saw 2") suggests the file is probably not tab-separated - confirm
# the real delimiter (and encoding) of imdb_master.csv before relying on this.
df1 = pd.read_csv("imdb_master.csv", delimiter="\t")
# NOTE(review): assumes the file has an 'id' column - TODO confirm against the CSV header
df1 = df1.drop(['id'], axis=1)
df1.head()

ParserError: Error tokenizing data. C error: Expected 1 fields in line 1933, saw 2


In [46]:
class NeuralNetwork:
    """
    Sequential container that trains a stack of layers with mini-batches.

    Parameters
    ----------
    num_epoch : int
        Number of epochs

    batch_size : int
        Size of batch

    verbose : bool
        True if outputting learning process

    Attributes
    ----------
    loss : list
        Mean per-sample loss on the train dataset, one entry per epoch

    val_loss : list
        Mean per-sample loss on the validation dataset, one entry per epoch
        (filled only when validation data is given to ``fit``)

    layers : list
        List of layers, in forward order
    """

    def __init__(self, num_epoch, batch_size, verbose=True):
        self.epoch = num_epoch
        self.batch_size = batch_size
        self.verbose = verbose

        self.loss = []
        self.val_loss = []
        self.layers = []


    def add(self, layer):
        """Append ``layer`` to the end of the stack."""
        self.layers.append(layer)


    def forward_layer(self, X):
        """Pass ``X`` through every layer in order and return the final output."""
        for layer in self.layers:
            X = layer.forward(X)
        return X


    def backward_layer(self, y):
        """
        Pass ``y`` backwards through the layers (last layer first), feeding
        each layer's return value to the layer before it.
        """
        for layer in reversed(self.layers):
            y = layer.backward(y)
        return y


    @staticmethod
    def _mean_loss(batch_losses):
        """Return the mean per-sample loss over a list of per-batch loss arrays."""
        if not batch_losses:
            return float("nan")
        flat = np.concatenate([np.atleast_1d(batch) for batch in batch_losses])
        return float(np.mean(flat))


    def fit(self, X, y, X_val=None, y_val=None):
        """
        Train the layers and record the loss of every epoch.

        Parameters
        ----------
        X : ndarray, shape (n_sequences, n_features)
            Features of train dataset

        y : ndarray, shape (n_sequences, )
            Correct values of train dataset

        X_val : ndarray, shape (n_sequences, n_features)
            Features of validation dataset

        y_val : ndarray, shape (n_sequences, )
            Correct values of validation dataset
        """

        has_validation = (X_val is not None) and (y_val is not None)
        loss_fn = Loss()

        for epoch_index in range(self.epoch):
            # Training pass: forward, loss, backward for every mini-batch.
            # The backward call follows its own forward call directly because
            # the layers cache the activations of their latest forward pass.
            train_losses = []
            for mini_X_train, mini_y_train in GetMiniBatch(X, y, batch_size=self.batch_size):
                Z = self.forward_layer(mini_X_train)
                train_losses.append(loss_fn.cross_entropy_loss(mini_y_train, Z))
                self.backward_layer(mini_y_train)
            self.loss.append(self._mean_loss(train_losses))

            # Validation pass: forward only, so validation data never
            # updates the weights.
            # NOTE(review): layers that keep state between forward calls are
            # still affected by this pass - confirm whether that matters.
            if has_validation:
                val_losses = []
                for mini_X_val, mini_y_val in GetMiniBatch(X_val, y_val, batch_size=self.batch_size):
                    Z_val = self.forward_layer(mini_X_val)
                    val_losses.append(loss_fn.cross_entropy_loss(mini_y_val, Z_val))
                self.val_loss.append(self._mean_loss(val_losses))

            # Output learning process
            if self.verbose:
                if has_validation:
                    print("{0}ep loss: {1}, val_loss: {2}".format(epoch_index + 1, self.loss[-1], self.val_loss[-1]))
                else:
                    print(self.loss[-1])


    def predict(self, X):
        """
        Parameters
        ----------
        X : ndarray, shape (n_sequences, n_features)
            Samples

        Returns
        -------
        ndarray, shape (n_sequences,)
            Index of the largest output along axis 1 for every sample
        """

        Z = self.forward_layer(X)

        return np.argmax(Z, axis=1)


    def plot_learning_record(self):
        """
        Plot the recorded train and validation losses against the epoch number.

        NOTE(review): relies on a module-level ``plt`` (presumably
        ``matplotlib.pyplot``) that is not imported in the visible cells - confirm.
        """
        plt.figure(facecolor="azure", edgecolor="coral")
        plt.plot(self.loss, label="loss")
        plt.plot(self.val_loss, label="val_loss")
        plt.title("Learning Records")
        plt.xlabel("Number of Epochs")
        plt.ylabel("Loss")
        plt.grid(True)
        plt.legend()
        plt.show()


    def compute_index_values(self, y, y_pred):
        """
        Print the accuracy of ``y_pred`` against ``y``.

        NOTE(review): relies on a module-level ``accuracy_score`` (presumably
        ``sklearn.metrics.accuracy_score``) that is not imported in the visible
        cells - confirm.

        Parameters
        ----------
        y : ndarray, shape (n_sequences,)
            Correct values

        y_pred : ndarray, shape (n_sequences,)
            Predicted values
        """

        print("accuracy score:", accuracy_score(y, y_pred))

In [47]:
# Construct

nn = NeuralNetwork(100, 10)
nn.add(SimpleRNN(batch_size, n_sequences, n_features, n_nodes, SimpleInitializer(sigma=0.01), SGD_RNN(lr=0.001)))
# The FC input width must equal the RNN output width (n_nodes); otherwise the
# matrix product in FC.forward fails on mismatched shapes.
# NOTE(review): 400 output nodes are kept from the original - confirm this
# matches the number of target classes.
nn.add(FC(n_nodes, 400, SimpleInitializer(sigma=0.01), SGD_FC(lr=0.001)))
nn.add(Softmax())

In [None]:
# Fit

# NOTE(review): `dnn` is not defined in the visible cells (the network built
# above is bound to `nn`), and NeuralNetwork.fit requires X and y arguments -
# confirm the intended object and training data before running this cell.
dnn.fit()