An implementation of RNN
forked from https://github.com/pangolulu/rnn-from-scratch

In [31]:
from preprocessing import getSentenceData
import numpy as np

Activation function

In [32]:
class Tanh:
    def forward(self, x):
        return np.tanh(x)
    def backward(self, x, diff):
        output = np.tanh(x)
        return (1.0 - np.square(output)) * diff
    
class sigmoid:
    def forward(self, x):
        return 1.0/(1.0+np.exp(-x))
    def backward(self, x, diff):
        output = self.forward(x)
        return (1.0-output)*output*diff
    
class softmax:
    def predict(self, x):
        return np.exp(x)/np.sum(np.exp(x))
    
    def loss(self, x, y):
        probs = self.predict(x)
        return -np.log(probs[0,y])
    
    def diff(self, x, y):
        probs = self.predict(x)
        probs[:,y] -= 1.0
        return probs

Gate for calculating derivative

In [3]:
class MultiplyGate:
    def forward(self, x, w):
        return np.dot(x, w)
    
    def backward(self, x, w, dz):
        dw = np.dot(x.T, dz)
        dx = np.dot(dz, w.T)
        return dw, dx
    
class AddGate:
    def forward(self, x1, x2):
        return x1 + x2
    
    def backward(self, x1, x2, dz):
        dx1 = dz
        dx2 = dz
        return dx1, dx2

One layer of RNN

In [4]:
mulGate = MultiplyGate()
addGate = AddGate()
activation = Tanh()

class RNNLayer:
    def forward(self, x, prev_s, U, W, V):
        self.mulu = mulGate.forward(x, U)
        self.mulw = mulGate.forward(prev_s, W)
        self.adduw = addGate.forward(self.mulu, self.mulw)
        self.state = activation.forward(self.adduw)
        self.mulv = mulGate.forward(self.state, V)
    
    def backward(self, x, prev_s, U, W, V, dmulv):       
        self.forward(x, prev_s, U, W, V)
        dV, dVx = mulGate.backward(self.state, V, dmulv)
        dadd = activation.backward(self.adduw, dVx)
        dmulu, dmulw = addGate.backward(self.mulu, self.mulw, dadd)
        dU, dUx = mulGate.backward(x, U, dmulu)
        dW, dWx = mulGate.backward(prev_s, W, dmulw)
        return dU, dW, dV
        

A full RNN

In [5]:
output = softmax()

class RNN:
    def __init__(self, input_dim, hidden_nodes, output_dim, lr = 0.001, bptt_truncate = 4):
        self.input_dim = input_dim
        self.hidden_nodes = hidden_nodes
        self.output_dim = output_dim
        self.U = np.random.random([input_dim, hidden_nodes])*0.01
        self.W = np.random.random([hidden_nodes, hidden_nodes])*0.01
        self.V = np.random.random([hidden_nodes, output_dim])*0.01
        self.lr = lr
        self.bptt_truncate = bptt_truncate

    def forward(self, x):  
        # total number of time steps
        # each steps input a word, a word is a vector of length 8000 
        self.time_steps = len(x)
        layers = []
        prev_s = np.zeros([1, self.hidden_nodes])
        for t in range(self.time_steps):
            layer = RNNLayer()
            input_vec = np.zeros([1, self.input_dim])
            input_vec[0,x[t]] = 1
            layer.forward(input_vec, prev_s, self.U, self.W, self.V)
            prev_s = layer.state
            layers.append(layer)
        return layers
    
    def backward(self, x, y):
        dU = np.zeros_like(self.U)
        dW = np.zeros_like(self.W)
        dV = np.zeros_like(self.V)
        layers = self.forward(x)
        for t in range(self.time_steps):
            dmulv = output.diff(layers[t].mulv, y[t])
            input_vec = np.zeros([1,self.input_dim])
            prev_s = np.zeros([1,self.hidden_nodes])
            input_vec[0,x[t]] = 1
            dU_t, dW_t, dV_t = layers[t].backward(input_vec, prev_s, self.U, self.W, self.V, dmulv)
            for i in range(t-1,max(-1, t-self.bptt_truncate-1),-1):
                input_vec = np.zeros([1,self.input_dim])
                input_vec[0,x[i]] = 1
                prev_s_i = np.zeros([1,self.hidden_nodes]) if i == 0 else layers[i-1].state
                dU_i, dW_i, dV_i = layers[i].backward(input_vec, prev_s_i, self.U, self.W, self.V, dmulv)
                dU_t += dU_i
                dW_t += dW_i
                dV_t += dV_i
            dU += dU_t
            dW += dW_t
            dV += dV_t
        return dU, dW, dV
    
    def sgd_optimizer(self, x, y, lr):
        dU, dW, dV = self.backward(x,y)
        self.U -= lr*dU
        self.W -= lr*dW
        self.V -= lr*dV
    
    def caculate_loss(self, x, y):
        loss = 0.0
        for example in range(len(y)):
            single_loss = 0.0
            layers = self.forward(x[example])
            for j,layer in enumerate(layers):                
                single_loss += output.loss(layer.mulv, y[example][j])
            loss += (single_loss/len(layers))
        return loss/len(y)
                
    def train(self, x, y, lr=0.005, nepoch=100, evaluate_loss_after=5):     
        for epoch in range(nepoch):
            if epoch % evaluate_loss_after == 0:
                loss = self.caculate_loss(x,y)
                print("Epoch=%d   Loss=%f" % (epoch, loss))
            for i in range(len(y)):
                self.sgd_optimizer(x[i], y[i], lr) # x[i], y[i] is a list
                
    def predict(self, x):
        output = softmax()
        layers = self.forward(x)
        predict_y = [np.argmax(output.predict(layer.mulv)) for layer in layers]
        return predict_y

An example

train: input a sentence for each RNN layer, no matter how long it is, each RNN layer outputs one word's next word

test: input a word

In [6]:
input_dim = 8000
hidden_dim = 100
output_dim = 8000

In [7]:
X_train, y_train, index_to_word = getSentenceData('data/reddit-comments-2015-08.csv', input_dim)

Reading CSV file...
Parsed 79171 sentences.
Found 65720 unique words tokens.
Using vocabulary size 8000.
The least frequent word in our vocabulary is 'bethesda' and appeared 10 times.

Example sentence: 'SENTENCE_START i joined a new league this year and they have different scoring rules than i'm used to. SENTENCE_END'

Example sentence after Pre-processing: '['SENTENCE_START', 'i', 'joined', 'a', 'new', 'league', 'this', 'year', 'and', 'they', 'have', 'different', 'scoring', 'rules', 'than', 'i', "'m", 'used', 'to', '.', 'SENTENCE_END']'

X_train shape: (78483,)
y_train shape: (78483,)
x:
SENTENCE_START what are n't you understanding about this ? !
[1, 51, 27, 16, 10, 853, 53, 25, 34, 69]

y:
what are n't you understanding about this ? ! SENTENCE_END
[51, 27, 16, 10, 853, 53, 25, 34, 69, 0]


In [8]:
np.random.seed(10)
model = RNN(input_dim, hidden_dim, output_dim)

losses = model.train(X_train[:100], y_train[:100], lr=0.001, nepoch=10, evaluate_loss_after=1)

Epoch=0   Loss=8.987161
Epoch=1   Loss=8.985943
Epoch=2   Loss=8.984481
Epoch=3   Loss=8.982446
Epoch=4   Loss=8.979229
Epoch=5   Loss=8.973298
Epoch=6   Loss=8.958033
Epoch=7   Loss=8.018889
Epoch=8   Loss=7.010689
Epoch=9   Loss=6.628749


In [33]:
losses = model.train(X_train[:100], y_train[:100], lr=0.001, nepoch=10, evaluate_loss_after=1)

Epoch=0   Loss=6.421941
Epoch=1   Loss=6.282418
Epoch=2   Loss=6.179307
Epoch=3   Loss=6.098684
Epoch=4   Loss=6.033502
Epoch=5   Loss=5.979838
Epoch=6   Loss=5.935124
Epoch=7   Loss=5.897456
Epoch=8   Loss=5.865345
Epoch=9   Loss=5.837620


In [34]:
predict_y = model.predict(X_train[0])

In [35]:
predict_y

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]