In [1]:
# import pandas as pd
#  import matplotlib.pyplot as plt
from activation_functions import softmax
from tensorflow.keras import activations
tf_sigmoid = activations.sigmoid
def sigmoid(X):
    return tf_sigmoid(X).numpy()

def sigmoid_gradient(x):
    sigmoid_x = sigmoid(x)
    return sigmoid_x * (1 - sigmoid_x)

def tanh_gradient(x):
    return 1 - np.tanh(x) ** 2

import numpy as np
import pandas as pd
import json
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.losses import BinaryCrossentropy
import tensorflow as tf
import re
from sklearn.metrics import mean_squared_error
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()

# from tensorflow.keras.losses import CategoricalCrossentropy
bce = tf.keras.losses.BinaryCrossentropy(from_logits=True)


In [35]:
def cross_entropy_loss(y_pred, y_true, epsilon=1e-10):
    y_pred = np.clip(y_pred, epsilon, 1.0 - epsilon)
    num_samples = y_pred.shape[0]
    loss = -np.sum(y_true * np.log(y_pred + epsilon)) / num_samples
    return loss

class MY_LSTM:
    def __init__(self, units, recurrent_activation=sigmoid, activation=np.tanh, use_bias=True):
        self.units = units
        self.output_size = 1
        self.batch_size = 1
        self.USE_OPTIMIZER = True
        self.recurrent_activation = recurrent_activation
        self.activation = activation
        self.use_bias = use_bias
        self.global_step = 0
        

                
    def build(self, input_shape):
#         input_dim = input_shape[-1]
        input_dim = input_shape
        # self.kernel is used for a new information passed to lstm, thus it has shape of (input.dim, self.units * 4)
        # multiplication by 4 because we have 4 gates (forget, input, candidate, output)
        self.kernel = self.__init_orthogonal(np.empty(shape=(input_dim, self.units * 4)))
        
        # self.recurrent_kernel is used for previous state passed to lstm, thus it has shape of (self.units, self.units * 4)
        # multiplication by 4 because we have 4 gates (forget, input, candidate, output)
        self.recurrent_kernel = self.__init_orthogonal(np.empty(shape=(self.units, self.units * 4)))
        
          # Classifier weights and biases.
        self.classifier_kernel = self.__init_orthogonal(np.empty(shape=(self.units, input_dim)))

        
        if self.use_bias:
            # Bias initialization which are self.units*4 in the end of all concatination
            self.bias = np.random.uniform(low=-0.1, high=0.1, size=(self.units * 4,))
            self.classifier_bias = np.random.uniform(low=-0.1, high=0.1,size=(input_dim,))
        else:
            self.bias = None
            
    def get_weights(self):
        return (self.kernel, self.recurrent_kernel, self.classifier_kernel)
    
    def set_weights(self, weights):
        self.kernel, self.recurrent_kernel, self.classification_kernel = weights

    @staticmethod
    def __init_orthogonal(param):
        """
        Initializes weight parameters orthogonally.
        This is a common initiailization for recurrent neural networks.

        Refer to this paper for an explanation of this initialization:
            https://arxiv.org/abs/1312.6120
        """
        if param.ndim < 2:
            raise ValueError("Only parameters with 2 or more dimensions are supported.")

        rows, cols = param.shape

        new_param = np.random.randn(rows, cols)

        if rows < cols:
            new_param = new_param.T

        # Compute QR factorization
        q, r = np.linalg.qr(new_param)

        d = np.diag(r, 0)
        ph = np.sign(d)
        q *= ph

        if rows < cols:
            q = q.T

        new_param = q

        return new_param

    def forward(self, sentence):
        h_tm1 = np.zeros((self.batch_size, self.units))
        c_tm1 = np.zeros((self.batch_size, self.units))
        states = (h_tm1, c_tm1)
        forward_cache = {
            "i": [],
            "f": [],
            "c": [],
            "o": [],
            "ct": [c_tm1],
            "ht": [h_tm1],
            "x": []
        }
        for word in sentence:
            word = np.array(word).reshape(1,1)
#             print("word", word)
            forward_pass, states = self._step(word_input=word, states=states)
            for key, output in forward_pass.items():
                forward_cache[key].append(output)
        prediction = self._apply_output_layer(states[0])
        return forward_cache, prediction

    def _step(self, word_input, states):
        """
        word_input: [[9]] shape: (1, input_dim)
        self.kernel:  shape: (input_dim, self.units*4)
        z:            shape: (input_dim, self.units*4) 
        """
        h_tm1, c_tm1 = states  # stm, ltm
        
        # apply weights for inputs
        concat = np.dot(word_input, self.kernel)
        # apply weights for stm
        concat += np.dot(h_tm1, self.recurrent_kernel)
        
        # apply bias
        if self.use_bias:
            concat += self.bias
        concat = np.split(concat, 4, axis=1)
        i, f, c, o = self._apply_activations(concat, c_tm1) # candidate (new ltm), output gate 
        
        ct = f * c_tm1 + i * c     # new ltm value
        h = o * self.activation(c) # new stm value
                
        forward_pass = {
            "i": i,
            "f": f,
            "c": c,
            "o": o,
            "ct": ct,
            "ht": h,
            "x": word_input
        }
        # 1st return is when CURRENT lstm is final layer, 2nd return when it is feeded to next layer
        return forward_pass, (h, c) 
    
    def _apply_activations(self, concat, c_tm1):
        concat0, concat1, concat2, concat3 = concat
        i = self.recurrent_activation(concat0)
        f = self.recurrent_activation(concat1)
        c = self.activation(concat2)
        o = self.recurrent_activation(concat3)
        return i, f, c, o
    
    def _apply_output_layer(self, h):
        prediction = sigmoid(np.dot(h, self.classifier_kernel))
        if self.use_bias:
            prediction += self.classifier_bias
        return prediction
# EVERYTHING UPPER IS FINE

    

    def compute_gradient(self, loss, prediction, forward_cache):
        """
        dh:              is a gradient of classifier kernel
        forward_cache:   is results from forward_pass
        kernels:         are weights of LSTM
        
        returns          tuple(grad_kernel, grad_recurrent_kernel, grad_classification_kernel)
        """
        grad_kernel, grad_recurrent_kernel, grad_classifier_kernel = self.__default_grads()
        ltm = forward_cache["ct"]
        forget_gate, output_gate = forward_cache["f"], forward_cache["o"]
        candidate_gate, input_gate = forward_cache["c"], forward_cache["i"]
        
        dh = loss * tanh_gradient(prediction)
        for t in reversed(range(len(output_gate)-1)):

            # dh is gradient of classifier kernel
            dC = loss * output_gate[t] * tanh_gradient(ltm[t])                      

            d_output_gate = loss * np.tanh(ltm[t]) * sigmoid_gradient(output_gate[t]) 

            d_candidate_gate = dC * input_gate[t] * tanh_gradient(candidate_gate[t]) 

            d_input_gate = dC * candidate_gate[t] * sigmoid_gradient(input_gate[t]) 

            d_forget_gate = dC * ltm[t-1] * sigmoid_gradient(forget_gate[t])    

            d_gates = np.array([d_input_gate, d_forget_gate, d_candidate_gate, d_output_gate]) #Looks reasonable
            d_kernel = (d_gates * forward_cache["x"][t]).reshape(self.units*4,1)
#             for i, gate in enumerate(d_gates):
#                 print(f"GATE{i}", gate)
            d_recurrent_kernel = (d_gates * forward_cache["ht"][t-1]).reshape(self.units*4,1)
#             print("BACKWARD PROPAGATION")




            grad_kernel.append(np.dot(self.kernel, d_kernel))
            grad_recurrent_kernel.append(np.dot(self.recurrent_kernel, d_recurrent_kernel))
            grad_classifier_kernel.append(np.dot(self.classifier_kernel, dh))
        
        grad_kernel = self.__clip_gradient_norm(sum(grad_kernel))
        grad_recurrent_kernel = self.__clip_gradient_norm(sum(grad_recurrent_kernel))
        grad_classifier_kernel = self.__clip_gradient_norm(sum(grad_classifier_kernel))
        return (grad_kernel, grad_recurrent_kernel, grad_classifier_kernel)
    
    def __default_grads(self):
        grad_kernel = np.empty_like(self.kernel)
        grad_recurrent_kernel = np.empty_like(self.recurrent_kernel)
        grad_classifier_kernel = np.empty_like(self.classifier_kernel)
        return [np.array(grad_kernel)],[np.array(grad_recurrent_kernel)], [np.array(grad_classifier_kernel)]
    
    def __clip_gradient_norm(self, gradient, max_norm=0.1):
        """
        Clips gradients to have a maximum norm of `max_norm`.
        This is to prevent the exploding gradients problem.
        """
        # Set the maximum of the norm to be of type float
        # Calculate the L2 norm squared for each gradient and add them to the total norm
        grad_norm = np.sum(np.power(gradient, 2))
        total_norm = grad_norm
        total_norm = np.sqrt(total_norm)
        # Calculate clipping coeficient
        clip_coef = max_norm / (total_norm + 1e-6)
        # If the total norm is larger than the maximum allowable norm, then clip the gradient
        if clip_coef < 1:
            gradient *= clip_coef
        return gradient
  
    def apply_gradients(self, gradients):
        """
        gradients    gradients after execution of self.compute_gradients()
        weights      current weights of LSTM
        prev_stm     previous hidden_state to which recurrent_kernel weights are a
        stm          current hidden_state  to which classification_kernel weight is applied
        """
        # applying optimizer
        kernels = self.get_weights()
        kernels = self._Adam(kernels=kernels, gradients=gradients)
#         new_kernels = []
#         for kernel, gradient in zip(kernels, gradients):
#             new_kernels.append(kernel * gradient)
#         self.set_weights(new_kernels)
    
    
    def _Adam(self, kernels, gradients, learning_rate=0.01, beta1=0.9, beta2=0.999, epsilon=1e-8):
        """
        Adam optimization algorithm implementation.
        """

        # Update timestep
        self.global_step += 1
        
        for kernel, gradient in zip(kernels, gradients):
            first_moment = np.zeros_like(kernel)
            second_moment = np.zeros_like(kernel)

            # Perform Adam update for each parameter
            for i in range(len(kernel)):
                # Update biased first moment estimate
#                 print(f"{i} hello gradient[i]", gradient[i])

                first_moment[i] = beta1 * first_moment[i] + (1 - beta1) * gradient[i]

                # Update biased second moment estimate

                second_moment[i] = beta2 * second_moment[i] + (1 - beta2) * (gradient[i] ** 2)

                # Correct bias in first and second moment estimates
                first_moment_corrected = first_moment[i] / (1 - np.power(beta1, self.global_step))
                second_moment_corrected = second_moment[i] / (1 - np.power(beta2, self.global_step))

                # Update parameter
                kernel -= learning_rate * first_moment_corrected / (np.sqrt(second_moment_corrected) + epsilon)

        return kernels

# x,y,z = MY_LSTM(2).__default_grads([1,2,3], [3,2,1], [1])
            
            

In [454]:
### """
# TESTING OF FORWARD PASS WITH DUMMY VALUES
# """



# def get_

np.random.seed(1337)
# # Example data dimensions
input_size = 1  # Number of features in the input
units = 3  # Number of units in the hidden state/memory cell
x = np.array([[0.002], [0.005], [0.2]])
y = np.array([[1.0]])

model = build_model(units=units, input_dim=1)
print("BEFORE_ADAM", model.get_weights())

for i in range(3):
    print("\n=========PERFORM FORWARD PASS==============\n")
    forward_cache, prediction = model.forward(sentence=x)
    print("===========OUTPUT============================")
    # print("prediction: ",prediction)
    loss = bce(y, prediction).numpy() 
    # print("loss: ", loss)
    dh = loss * sigmoid_gradient(prediction)
    gradients = model.compute_gradient(loss=loss, prediction=prediction, forward_cache=forward_cache)
    # print("gradients", gradients)
    for grad in gradients:
        print("\n",grad)
    model.apply_gradients(gradients)
    print("AFTER_ADAM", model.get_weights())



# print("\n\n======START OF BACKWARDPROPAGATION=======\n")
# dh = loss * sigmoid_gradient(prediction)
# print("Loss: ", loss)
# kernels = model.get_weights()
# gradients = model.optimizer.compute_gradient(dh, forward_cache, kernels)
# weights = apply_gradients(gradients, kernels, x, forward_cache["h_tm1"], forward_cache["ht"], lr=0.01)
# model.set_weights(weights)

# # print(f"softmax(Output)\n {output_softmax}")
# print("\n\n======FINISH OF BACKWARDPROPAGATION======\n")

BEFORE_ADAM (array([[-0.18267242, -0.12736445, -0.0836002 , -0.45593041,  0.05368683,
        -0.5224818 , -0.14476134,  0.08760148,  0.40235313, -0.35608687,
         0.370259  , -0.07259851]]), array([[-1.47609841e-01,  3.12925531e-01,  4.48008582e-01,
        -4.46083433e-01, -1.84509125e-01,  1.53764763e-01,
         2.58020168e-01, -3.21099252e-01, -3.50647007e-01,
        -3.83709017e-04, -3.46758392e-01, -1.00128011e-01],
       [ 3.91333858e-01, -7.48095589e-02, -1.13785948e-01,
        -5.88603360e-01, -3.28411740e-02, -5.06489371e-02,
         3.09617785e-01, -1.47774671e-01,  4.78137583e-01,
        -2.65354521e-01,  2.45895404e-01,  3.19793869e-02],
       [ 8.62175912e-02,  5.79504784e-01,  2.10521230e-01,
        -6.44122732e-02,  3.94100669e-01, -2.73767529e-01,
        -2.55924082e-01,  5.57228457e-02,  9.64775585e-02,
         8.80289949e-02,  3.00185513e-01, -4.49708872e-01]]), array([[ 0.08034021],
       [-0.99394046],
       [-0.07501873]]))


BACKWARD PROPAGATION


In [19]:
def build_model(X):
    model = Sequential()
    print(X.shape[1])
    model.add(Embedding(num_words, embed_dim,input_length = X.shape[1]))
    model.add(LSTM(lstm_out))
    model.add(Dropout(0.2))
    model.add(Dense(2,activation='softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])

    return model


In [36]:
def train_split(X, Y, test_size=0.8):
    length = int(len(X) * test_size)
    X_train = X[1:length]
    Y_train = Y[1:length]

    X_valid = X[length:]
    Y_valid = Y[length:]
    return X_train, X_valid, Y_train, Y_valid


def my_train(model, X, Y, hidden_size, input_size, num_epochs=2):
    X_train, X_valid, Y_train, Y_valid = train_split(X = X, Y = Y)
    training_loss, validation_loss = [], []

    for i in range(num_epochs):

        epoch_training_loss = 0
        epoch_validation_loss = 0
        
        sentence_size = len(X_train)
        it = 0
        for sentence, targets in zip(X_train, Y_train):
#             print(f"{it} sentence", sentence)
            forward_cache, prediction = model.forward(sentence=sentence)
            y = np.array([targets]).reshape(1,1)
            loss = bce(y, prediction).numpy() 
            print(f"{it} loss: ", loss)
            dh = loss * sigmoid_gradient(prediction)
            print(f"{it} train prediction: ", dh)
            print(f"{it} train target: ", targets)

            gradients = model.compute_gradient(loss=loss, prediction=prediction, forward_cache=forward_cache)

            model.apply_gradients(gradients)

            epoch_training_loss += loss
            it += 1

        sentence_size = len(X_valid) 
        it = 0
        for sentence, targets in zip(X_valid, Y_valid):            
            forward_cache, prediction = model.forward(sentence=sentence)
        
            loss = bce(y, prediction).numpy() 
            # print("loss: ", loss)
            dh = loss * sigmoid_gradient(prediction)
#             print(f"{it} valid prediction: ", dh)
            epoch_training_loss += loss
            it += 1
#             print(f"Epoch {i}, {it} out of {sentence_size} loss: ", loss)



        # Save loss for plot
        training_loss.append(epoch_training_loss / len(X_train))
        validation_loss.append(epoch_validation_loss / len(X_valid))

        # Print loss every 2 epochs
        import statistics
        print(f'Epoch {i+1}, training loss: {statistics.mean(training_loss)}, validation loss: {statistics.mean(validation_loss)}')
        print(f'sentence sentence {i}:')
        print(sentence)

        print(f'\nTarget sequence {i}:')
        print(targets)

        print('\nPredicted sequence:')
        output_softmax = np.dot(forward_pass["next_stm"], model.parameters["weights"]["OutputSoftmax"]) + model.parameters["bias"]["OutputSoftmax"]
        output_softmax = softmax(output_softmax.reshape(1,1))

        print(output_softmax)
    return training_loss, validation_loss

In [5]:
"""
====================FINAL=====================
Preparing the dataset
"""

def convert(x):
    """
    Coverting JSON to pandas dataframe

    """    
    ob = json.loads(x)
    for k, v in ob.items():
        if isinstance(v, list):
            ob[k] = ','.join(v)
        elif isinstance(v, dict):
            for kk, vv in v.items():
                ob['%s_%s' % (k, kk)] = vv
            del ob[k]
    return ob



def filter_data(data):
    """
    Converting into pandas dataframe and filtering only text and ratings given by the users
    """

    df = pd.DataFrame([convert(line) for line in data])
    df.drop(columns=df.columns.difference(['text','stars']),inplace=True)
    df.loc[:, ("sentiment")] = 0
    

#I have considered a rating above 3 as positive and less than or equal to 3 as negative.
    df.loc[:,'sentiment']=['pos' if (x>3) else 'neg' for x in df.loc[:, 'stars']]
    df.loc[:,'text'] = df.loc[:,'text'].apply(lambda x: x.lower())
    df.loc[:,'text'] = df.loc[:,'text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))
    for idx,row in df.iterrows():
        df.loc[:,'text']= [x for x in df.loc[:,'text']]
    return df

def min_max_normalize(tokens):
    min_val = min(tokens)
    max_val = max(tokens)
    normalized_tokens = [(token - min_val) / (max_val - min_val) for token in tokens]
    return normalized_tokens

def read_data():
    json_filename = 'review_mockup_500.json'
    with open(json_filename,'rb') as f:
        data = f.readlines()
    data = filter_data(data)
    tokenizer = Tokenizer(num_words = 2500, split=' ')
    tokenizer.fit_on_texts(data.loc[:,'text'].values)

    X = tokenizer.texts_to_sequences(data.loc[:,'text'].values)
    X = [min_max_normalize(i) for i in X]
    test = pad_sequences(X)

    Y = pd.get_dummies(data['sentiment'], dtype=int).values[:, 0]   
    return X, Y
# print(X)
X, Y = read_data()


In [None]:
def build_model(units, input_dim):
    lstm = MY_LSTM(units=units)
    lstm.build(input_dim)
    return lstm

np.random.seed(1337)
batch_size = 1  # Number of training examples
input_dim = 1  # Number of features in the input
units = 4  # Number of units in the hidden state/memory cell
model = build_model(units=units, input_dim=input_dim)

my_train(model=model, X=X, Y=Y, hidden_size=units, input_size=input_dim, num_epochs=5)

0 loss:  1.0102429751623614
0 train prediction:  [[0.23391113]]
0 train target:  0
1 loss:  0.46711230737876025
1 train prediction:  [[0.10926659]]
1 train target:  1
2 loss:  0.9848288702654138
2 train prediction:  [[0.23044852]]
2 train target:  0
3 loss:  1.0012682491746698
3 train prediction:  [[0.2327155]]
3 train target:  0
4 loss:  0.4695390800713692
4 train prediction:  [[0.11001409]]
4 train target:  1
5 loss:  0.9880663183669564
5 train prediction:  [[0.23090286]]
5 train target:  0
6 loss:  1.0015816217800504
6 train prediction:  [[0.23275775]]
6 train target:  0
7 loss:  0.46861976982103964
7 train prediction:  [[0.10973101]]
7 train target:  1
8 loss:  0.46899717425002957
8 train prediction:  [[0.10984724]]
8 train target:  1
9 loss:  0.9841107087284695
9 train prediction:  [[0.23034721]]
9 train target:  0
10 loss:  0.9819662366900095
10 train prediction:  [[0.23004355]]
10 train target:  0
11 loss:  0.9861604002683514
11 train prediction:  [[0.23063586]]
11 train target: