In [None]:
import numpy as np
import matplotlib.pyplot as plt
from numpy import genfromtxt
import pandas as pd
import io

The vanishing gradient problem occurs when the gradient becomes so small that it is very close to zero, which prevents the network from converging properly. It typically arises when several derivatives that are already small are multiplied together through the chain rule, and it is very common in RNNs. Common ways to mitigate it are to use ReLU/leaky ReLU activation functions, to use gradient clipping (which mainly guards against the related exploding-gradient problem), or to use an LSTM. An LSTM is probably the best choice, since it uses a unique additive gradient structure controlled by the forget gate, and the activation function applied along the cell-state path acts as an identity function, which keeps the gradient from vanishing during backpropagation. For example, since an RNN is trained using backpropagation through the layers and through time, at each time step we need to sum up all the previous contributions up to the current one. When the derivatives involved are smaller than one and are multiplied together, the gradient approaches zero and we have the vanishing gradient problem.

Typically the best data for RNN/LSTM is sequence data like time-series or text data.

In [None]:
# Load the Reddit comments CSV into a DataFrame so its contents can be inspected.
reddit = pd.read_csv("reddit.csv")

In [None]:
# Preview the first rows of the loaded comments.
reddit.head()

Unnamed: 0,body
0,I joined a new league this year and they have ...
1,"In your scenario, a person could just not run ..."
2,They don't get paid for how much time you spen...
3,"I dunno, back before the August update in an A..."
4,"No, but Toriyama sometimes would draw himself ..."


In [None]:
import csv
import numpy as np
import itertools
import nltk
# Download NLTK's 'punkt' resource; presumably required by the nltk.sent_tokenize /
# nltk.word_tokenize calls in getSentenceData below -- confirm against the NLTK docs.
nltk.download('punkt')

def getSentenceData(path, vocabulary_size=8000):
    """Build next-word-prediction training data from a CSV file of text.

    The first column of every CSV row is lower-cased, split into sentences,
    wrapped in SENTENCE_START / SENTENCE_END markers and tokenized into words.
    Sentences with three tokens or fewer (markers included) are dropped. The
    ``vocabulary_size - 1`` most frequent tokens are kept; every other token is
    replaced by UNKNOWN_TOKEN.

    Args:
        path: Path of the CSV file to read (opened as UTF-8).
        vocabulary_size: Number of entries in the vocabulary, including the
            UNKNOWN_TOKEN entry.

    Returns:
        ``(X_train, y_train)``: two 1-D NumPy object arrays of equal length.
        ``X_train[k]`` is the list of word indices of sentence ``k`` without its
        last token; ``y_train[k]`` is the same sentence shifted by one position
        (without its first token).

    Raises:
        OSError: If ``path`` cannot be opened.
    """
    unknown_token = "UNKNOWN_TOKEN"
    sentence_start_token = "SENTENCE_START"
    sentence_end_token = "SENTENCE_END"

    # Read the data and append SENTENCE_START and SENTENCE_END tokens
    print("Reading CSV file...")
    # newline='' lets the csv module handle line endings itself, as its documentation requires.
    with open(path, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f, skipinitialspace=True)
        # Split full comments into sentences. Blank lines are yielded by csv.reader
        # as empty rows, so they are skipped instead of failing on row[0].
        # NOTE(review): no header row is skipped here; a one-word header such as
        # "body" would become a 3-token sentence and be removed by the length
        # filter below -- confirm this is the intended handling.
        raw_sentences = itertools.chain.from_iterable(
            nltk.sent_tokenize(row[0].lower()) for row in reader if row
        )
        # Append SENTENCE_START and SENTENCE_END
        sentences = [
            "%s %s %s" % (sentence_start_token, sentence, sentence_end_token)
            for sentence in raw_sentences
        ]
    print("Parsed %d sentences." % (len(sentences)))

    # Tokenize the sentences into words
    tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]
    # Filter the sentences having few words (including SENTENCE_START and SENTENCE_END)
    tokenized_sentences = [tokens for tokens in tokenized_sentences if len(tokens) > 3]

    # Count the word frequencies
    word_freq = nltk.FreqDist(itertools.chain.from_iterable(tokenized_sentences))
    print("Found %d unique words tokens." % len(word_freq))

    # Get the most common words and build index_to_word and word_to_index vectors
    vocab = word_freq.most_common(vocabulary_size-1)
    index_to_word = [word for word, _ in vocab]
    index_to_word.append(unknown_token)
    word_to_index = {word: index for index, word in enumerate(index_to_word)}

    print("Using vocabulary size %d." % vocabulary_size)
    if vocab:
        # vocab is empty when no sentence survived the filter; nothing to report then.
        print("The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1]))

    # Replace all words not in our vocabulary with the unknown token
    tokenized_sentences = [
        [w if w in word_to_index else unknown_token for w in sent]
        for sent in tokenized_sentences
    ]

    # The example indices are kept as in the original output, but only used when they exist.
    if len(sentences) > 1:
        print("\nExample sentence: '%s'" % sentences[1])
    if tokenized_sentences:
        print("\nExample sentence after Pre-processing: '%s'\n" % tokenized_sentences[0])

    # Create the training data. Sentences have different lengths, so the rows are
    # stored in explicitly allocated 1-D object arrays: calling np.asarray on a
    # ragged list of lists raises ValueError on recent NumPy releases.
    n_sentences = len(tokenized_sentences)
    X_train = np.empty(n_sentences, dtype=object)
    y_train = np.empty(n_sentences, dtype=object)
    for row, sent in enumerate(tokenized_sentences):
        indices = [word_to_index[w] for w in sent]
        X_train[row] = indices[:-1]
        y_train[row] = indices[1:]

    print("X_train shape: " + str(X_train.shape))
    print("y_train shape: " + str(y_train.shape))

    # Print an training data example (sentence 17, or the last one if there are fewer)
    if n_sentences:
        example_row = min(17, n_sentences - 1)
        x_example, y_example = X_train[example_row], y_train[example_row]
        print("x:\n%s\n%s" % (" ".join([index_to_word[x] for x in x_example]), x_example))
        print("\ny:\n%s\n%s" % (" ".join([index_to_word[x] for x in y_example]), y_example))

    return X_train, y_train

# Build the training data with the default vocabulary size when this code runs as
# the main module (the recorded output below shows this branch executing).
if __name__ == '__main__':
    X_train, y_train = getSentenceData('reddit.csv')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Reading CSV file...
Parsed 79062 sentences.
Found 63006 unique words tokens.
Using vocabulary size 8000.
The least frequent word in our vocabulary is 'appointments' and appeared 10 times.

Example sentence: 'SENTENCE_START i joined a new league this year and they have different scoring rules than i'm used to. SENTENCE_END'

Example sentence after Pre-processing: '['SENTENCE_START', 'i', 'joined', 'a', 'new', 'league', 'this', 'year', 'and', 'they', 'have', 'different', 'scoring', 'rules', 'than', 'i', "'m", 'used', 'to', '.', 'SENTENCE_END']'





X_train shape: (78489,)
y_train shape: (78489,)
x:
SENTENCE_START what are n't you understanding about this ? !
[0, 52, 28, 17, 10, 858, 55, 26, 35, 70]

y:
what are n't you understanding about this ? ! SENTENCE_END
[52, 28, 17, 10, 858, 55, 26, 35, 70, 1]




In [None]:


class RNNLayer:
    """A single time step of the recurrent network, assembled from gate objects.

    The layer relies on three module-level objects being defined before any
    method is called: ``mulGate``, ``addGate`` and ``activation``.

    After ``forward`` the following attributes are set:
        mulu: result of ``mulGate.forward(U, x)``.
        mulw: result of ``mulGate.forward(W, prev_s)``.
        add:  ``addGate.forward(mulw, mulu)``, the pre-activation value.
        s:    ``activation.forward(add)``, the state of this step.
        mulv: ``mulGate.forward(V, s)``, the unnormalized output of this step.
    """

    def forward(self, x, prev_s, U, W, V):
        """Compute and store the state ``s`` and output ``mulv`` for one step."""
        input_term = mulGate.forward(U, x)
        recurrent_term = mulGate.forward(W, prev_s)
        self.mulu = input_term
        self.mulw = recurrent_term
        self.add = addGate.forward(recurrent_term, input_term)
        # State of this time step
        self.s = activation.forward(self.add)
        self.mulv = mulGate.forward(V, self.s)

    def backward(self, x, prev_s, U, W, V, diff_s, dmulv):
        """Back-propagate through this step.

        Args:
            x, prev_s, U, W, V: The same values that are passed to ``forward``.
            diff_s: Gradient flowing into the state ``s`` from a later step.
            dmulv: Gradient with respect to this step's output ``mulv``.

        Returns:
            Tuple ``(dprev_s, dU, dW, dV)``.
        """
        # Re-run the forward pass so the cached intermediates match the inputs given here.
        self.forward(x, prev_s, U, W, V)
        grad_V, grad_s_from_output = mulGate.backward(V, self.s, dmulv)
        # The state receives gradient both from its own output and from the later step.
        grad_s = grad_s_from_output + diff_s
        grad_add = activation.backward(self.add, grad_s)
        grad_mulw, grad_mulu = addGate.backward(self.mulw, self.mulu, grad_add)
        grad_W, grad_prev_s = mulGate.backward(W, prev_s, grad_mulw)
        # The gradient with respect to the input x is computed by the gate but not returned.
        grad_U = mulGate.backward(U, x, grad_mulu)[0]
        return (grad_prev_s, grad_U, grad_W, grad_V)



In [None]:
import numpy as np

class MultiplyGate:
    def forward(self,W, x):
        return np.dot(W, x)

    def backward(self, W, x, dz):
        dW = np.asarray(np.dot(np.transpose(np.asmatrix(dz)), np.asmatrix(x)))
        dx = np.dot(np.transpose(W), dz)
        return dW, dx

class AddGate:
    """Addition gate ``z = x1 + x2`` together with its backward pass."""

    def forward(self, x1, x2):
        """Return the sum of the two operands."""
        total = x1 + x2
        return total

    def backward(self, x1, x2, dz):
        """Return ``(dx1, dx2)``: ``dz`` multiplied by ones shaped like each operand."""
        # Addition passes the incoming gradient through unchanged to both inputs.
        return tuple(dz * np.ones_like(operand) for operand in (x1, x2))

In [None]:
import numpy as np

class Sigmoid:
    """Logistic sigmoid activation with its backward pass."""

    def forward(self, x):
        """Return ``1 / (1 + exp(-x))``."""
        denominator = 1.0 + np.exp(-x)
        return 1.0 / denominator

    def backward(self, x, top_diff):
        """Return ``top_diff`` scaled by the sigmoid derivative evaluated at ``x``."""
        activated = self.forward(x)
        # d(sigmoid)/dx = (1 - sigmoid) * sigmoid
        local_slope = (1.0 - activated) * activated
        return local_slope * top_diff

class Tanh:
    """Hyperbolic tangent activation with its backward pass."""

    def forward(self, x):
        """Return ``np.tanh(x)``."""
        activated = np.tanh(x)
        return activated

    def backward(self, x, top_diff):
        """Return ``top_diff`` scaled by the tanh derivative evaluated at ``x``."""
        activated = self.forward(x)
        # d(tanh)/dx = 1 - tanh^2
        local_slope = 1 - np.square(activated)
        return local_slope * top_diff

In [None]:
import numpy as np

class Softmax:
    """Softmax output layer: probabilities, cross-entropy loss and its gradient.

    All computations subtract the largest score first. This leaves the result
    mathematically unchanged but prevents ``np.exp`` from overflowing (which
    would produce NaN probabilities) when the scores are large.
    """

    @staticmethod
    def _shifted(x):
        """Return ``x`` minus its maximum (``x`` unchanged when it is empty)."""
        x = np.asarray(x)
        return x - np.max(x) if x.size else x

    def predict(self, x):
        """Return the softmax probabilities of the score vector ``x``."""
        exp_scores = np.exp(self._shifted(x))
        return exp_scores / np.sum(exp_scores)

    def loss(self, x, y):
        """Return the cross-entropy loss ``-log(softmax(x)[y])`` for target index ``y``.

        Evaluated as ``log(sum(exp(x - max))) - (x[y] - max)`` so the result stays
        finite even when the probability of ``y`` underflows to zero.
        """
        shifted = self._shifted(x)
        log_normalizer = np.log(np.sum(np.exp(shifted)))
        return log_normalizer - shifted[y]

    def diff(self, x, y):
        """Return the gradient of the loss with respect to ``x``: ``softmax(x) - onehot(y)``."""
        probs = self.predict(x)
        probs[y] -= 1.0
        return probs

In [None]:
from datetime import datetime
import numpy as np
import sys



class Model:
    """Recurrent language model trained with truncated BPTT and per-example SGD.

    The model is a chain of ``RNNLayer`` steps sharing three weight matrices and
    scored with ``Softmax``; both names must be defined before the methods are
    called.

    Attributes:
        word_dim: Size of the one-hot input/output vectors (vocabulary size).
        hidden_dim: Size of the hidden state.
        bptt_truncate: Maximum number of earlier time steps each output error is
            propagated back through.
        U: Input weights, shape ``(hidden_dim, word_dim)``.
        W: Recurrent weights, shape ``(hidden_dim, hidden_dim)``.
        V: Output weights, shape ``(word_dim, hidden_dim)``.
    """

    def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        # Initialize U, W, V uniformly in [-1/sqrt(n), 1/sqrt(n)], where n is the
        # number of columns (inputs) of the matrix. The draw order U, W, V is kept
        # so a given np.random seed yields the same weights as before.
        input_bound = 1. / np.sqrt(word_dim)
        hidden_bound = 1. / np.sqrt(hidden_dim)
        self.U = np.random.uniform(-input_bound, input_bound, (hidden_dim, word_dim))
        self.W = np.random.uniform(-hidden_bound, hidden_bound, (hidden_dim, hidden_dim))
        self.V = np.random.uniform(-hidden_bound, hidden_bound, (word_dim, hidden_dim))

    def _one_hot(self, index):
        """Return a ``word_dim``-long vector that is 1 at ``index`` and 0 elsewhere."""
        vector = np.zeros(self.word_dim)
        vector[index] = 1
        return vector

    def forward_propagation(self, x):
        """Run the network over one sequence of word indices.

        ``x`` is a single training example, not a batch. For example, for
        ``x = [0, 179, 341, 416]`` the matching target is ``y = [179, 341, 416, 1]``.

        Returns:
            A list with one ``RNNLayer`` per time step, each holding its state
            ``s`` and unnormalized output ``mulv``.
        """
        layers = []
        # The state before the first time step is all zeros.
        prev_s = np.zeros(self.hidden_dim)
        for word_index in x:
            rnn_layer = RNNLayer()
            rnn_layer.forward(self._one_hot(word_index), prev_s, self.U, self.W, self.V)
            prev_s = rnn_layer.s
            layers.append(rnn_layer)
        return layers

    def predict(self, x):
        """Return, for every time step of ``x``, the index of the most probable next word."""
        output = Softmax()
        layers = self.forward_propagation(x)
        # Get the index of the biggest element in each layer output vector
        return [np.argmax(output.predict(layer.mulv)) for layer in layers]

    def calculate_loss(self, x, y):
        """Return the cross-entropy loss of one sequence, averaged over ``len(y)``.

        An empty sequence (``x`` and ``y`` both empty) has loss 0.0.
        """
        output = Softmax()
        layers = self.forward_propagation(x)
        loss = 0.0
        for i, layer in enumerate(layers):
            loss += output.loss(layer.mulv, y[i])
        n_targets = len(y)
        # Guard the average: dividing by zero targets used to raise ZeroDivisionError.
        return loss / float(n_targets) if n_targets else 0.0

    def calculate_total_loss(self, X, Y):
        """Return the mean of ``calculate_loss`` over all sequences (0.0 for an empty set)."""
        n_examples = len(Y)
        if n_examples == 0:
            return 0.0
        loss = 0.0
        for i in range(n_examples):
            loss += self.calculate_loss(X[i], Y[i])
        return loss / float(n_examples)

    def bptt(self, x, y):
        """Compute the gradients for one sequence with truncated BPTT.

        For every time step the output error is back-propagated through that
        step and then through at most ``bptt_truncate`` earlier steps; the
        contributions of all time steps are summed.

        Returns:
            Tuple ``(der_U, der_W, der_V)`` with the shapes of ``U``, ``W``, ``V``.
        """
        output = Softmax()
        layers = self.forward_propagation(x)
        # Gradient accumulators, one per weight matrix
        der_U = np.zeros(self.U.shape)
        der_V = np.zeros(self.V.shape)
        der_W = np.zeros(self.W.shape)

        prev_status_i = np.zeros(self.hidden_dim)
        # No gradient reaches a step's state from later steps: each output error
        # is propagated backwards separately.
        diff_status = np.zeros(self.hidden_dim)
        # Earlier steps receive no direct output error while walking back in time.
        no_output_error = np.zeros(self.word_dim)
        for i, layer in enumerate(layers):
            # Derivative of the loss with respect to this step's output (mulv gate)
            der_mulv = output.diff(layer.mulv, y[i])
            # Backward pass through step i: gradient for the previous state, U, W, V
            der_prev_status, der_U_i, der_W_i, der_V_i = layer.backward(
                self._one_hot(x[i]), prev_status_i, self.U, self.W, self.V, diff_status, der_mulv)
            # The state of step i is the "previous state" of step i + 1
            prev_status_i = layer.s
            # Backpropagation through time, walking backwards from step i - 1
            for j in range(i - 1, max(-1, i - self.bptt_truncate - 1), -1):
                prev_status_j = np.zeros(self.hidden_dim) if j == 0 else layers[j - 1].s
                # V only affects the output of its own step, so its gradient is discarded here.
                der_prev_status, der_U_j, der_W_j, _ = layers[j].backward(
                    self._one_hot(x[j]), prev_status_j, self.U, self.W, self.V,
                    der_prev_status, no_output_error)
                der_U_i += der_U_j
                der_W_i += der_W_j
            der_V += der_V_i
            der_U += der_U_i
            der_W += der_W_i
        return (der_U, der_W, der_V)

    def sgd_step(self, x, y, learning_rate):
        """Apply one gradient-descent update of U, V and W for a single sequence."""
        dU, dW, dV = self.bptt(x, y)
        self.U -= learning_rate * dU
        self.V -= learning_rate * dV
        self.W -= learning_rate * dW

    def train(self, X, Y, learning_rate=0.005, nepoch=100, evaluate_loss_after=5):
        """Train with one SGD step per sequence for ``nepoch`` passes over the data.

        Every ``evaluate_loss_after`` epochs the total loss is computed and
        printed before that epoch's updates; if it increased since the previous
        evaluation the learning rate is halved.

        Returns:
            List of ``(num_examples_seen, loss)`` tuples, one per evaluation.
        """
        num_examples_seen = 0
        losses = []
        for epoch in range(nepoch):
            if (epoch % evaluate_loss_after == 0):
                loss = self.calculate_total_loss(X, Y)
                losses.append((num_examples_seen, loss))
                timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                print("%s: Loss after num_examples_seen=%d epoch=%d: %f" % (timestamp, num_examples_seen, epoch, loss))
                # Adjust the learning rate if loss increases
                if len(losses) > 1 and losses[-1][1] > losses[-2][1]:
                    learning_rate = learning_rate * 0.5
                    print("Setting learning rate to %f" % learning_rate)
                sys.stdout.flush()
            # For each training example...
            for i in range(len(Y)):
                self.sgd_step(X[i], Y[i], learning_rate)
                num_examples_seen += 1
        return losses

In [None]:
# Module-level gate and activation objects. RNNLayer.forward and RNNLayer.backward
# look up mulGate, addGate and activation as globals, so this cell has to run
# before any Model method that builds RNNLayer steps is called.
mulGate = MultiplyGate()
addGate = AddGate()
activation = Tanh()

In [None]:
import numpy as np




# word_dim is used both as the vocabulary size for getSentenceData and as the
# one-hot input/output dimension of the model; hidden_dim is the state size.
word_dim = 8000
hidden_dim = 100
X_train, y_train = getSentenceData('reddit.csv', word_dim)

# Seed NumPy's global RNG before constructing the model, whose weights are
# drawn with np.random.uniform in Model.__init__.
np.random.seed(10)
rnn = Model(word_dim, hidden_dim)

# Train on the first 100 sentences only, for 10 epochs, reporting the loss before every epoch.
losses = rnn.train(X_train[:100], y_train[:100], learning_rate=0.005, nepoch=10, evaluate_loss_after=1)

Reading CSV file...
Parsed 79062 sentences.
Found 63006 unique words tokens.
Using vocabulary size 8000.
The least frequent word in our vocabulary is 'appointments' and appeared 10 times.

Example sentence: 'SENTENCE_START i joined a new league this year and they have different scoring rules than i'm used to. SENTENCE_END'

Example sentence after Pre-processing: '['SENTENCE_START', 'i', 'joined', 'a', 'new', 'league', 'this', 'year', 'and', 'they', 'have', 'different', 'scoring', 'rules', 'than', 'i', "'m", 'used', 'to', '.', 'SENTENCE_END']'





X_train shape: (78489,)
y_train shape: (78489,)
x:
SENTENCE_START what are n't you understanding about this ? !
[0, 52, 28, 17, 10, 858, 55, 26, 35, 70]

y:
what are n't you understanding about this ? ! SENTENCE_END
[52, 28, 17, 10, 858, 55, 26, 35, 70, 1]
2022-11-09 22:54:51: Loss after num_examples_seen=0 epoch=0: 8.987482
2022-11-09 22:55:54: Loss after num_examples_seen=100 epoch=1: 8.973095
2022-11-09 22:56:57: Loss after num_examples_seen=200 epoch=2: 8.951278
2022-11-09 22:58:06: Loss after num_examples_seen=300 epoch=3: 8.908663
2022-11-09 22:59:21: Loss after num_examples_seen=400 epoch=4: 8.810082
2022-11-09 23:00:37: Loss after num_examples_seen=500 epoch=5: 6.947560
2022-11-09 23:01:41: Loss after num_examples_seen=600 epoch=6: 6.303379
2022-11-09 23:02:57: Loss after num_examples_seen=700 epoch=7: 5.993970
2022-11-09 23:04:15: Loss after num_examples_seen=800 epoch=8: 5.795992
2022-11-09 23:05:37: Loss after num_examples_seen=900 epoch=9: 5.664663
