In [70]:
# import pandas as pd
#  import matplotlib.pyplot as plt
from activation_functions import softmax
from tensorflow.keras import activations
tf_sigmoid = activations.sigmoid
def sigmoid(X):
    return tf_sigmoid(X).numpy()

def sigmoid_gradient(x):
    sigmoid_x = sigmoid(x)
    return sigmoid_x * (1 - sigmoid_x)

def tanh_gradient(x):
    return 1 - np.tanh(x) ** 2

import numpy as np
import pandas as pd
import json
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import re
from sklearn.metrics import mean_squared_error
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()

# from tensorflow.keras.losses import CategoricalCrossentropy

In [73]:
def cross_entropy_loss(y_pred, y_true, epsilon=1e-10):
    y_pred = np.clip(y_pred, epsilon, 1.0 - epsilon)
    num_samples = y_pred.shape[0]
    loss = -np.sum(y_true * np.log(y_pred + epsilon)) / num_samples
    return loss

class MY_LSTM:
    def __init__(self, units, optimizer=0, recurrent_activation=sigmoid, activation=np.tanh, use_bias=True):
        self.units = units
        self.output_size = 1
#         self._optimizer = self.Adam
        self.USE_OPTIMIZER = True
        self.recurrent_activation = recurrent_activation
        self.activation = activation
        self.use_bias = use_bias
        
    @property
    def optimizer(self):
        return self._optimizer

    @optimizer.setter
    def optimizer(self, optimizer):
        self._optimizer = optimizer

                
    def build(self, input_shape):
        input_dim = input_shape[-1]
        # self.kernel is used for a new information passed to lstm, thus it has shape of (input.dim, self.units * 4)
        # multiplication by 4 because we have 4 gates (forget, input, candidate, output)
        self.kernel = self.__init_orthogonal(np.empty(shape=(input_dim, self.units * 4)))
        
        # self.recurrent_kernel is used for previous state passed to lstm, thus it has shape of (self.units, self.units * 4)
        # multiplication by 4 because we have 4 gates (forget, input, candidate, output)
        self.recurrent_kernel = self.__init_orthogonal(np.empty(shape=(self.units, self.units * 4)))
        
          # Classifier weights and biases.
        self.classifier_kernel = self.__init_orthogonal(np.empty(shape=(self.units, input_dim)))

        
        if self.use_bias:
            # Bias initialization which are self.units*4 in the end of all concatination
            self.bias = np.random.uniform(low=-0.1, high=0.1, size=(self.units * 4,))
            self.classifier_bias = np.random.uniform(low=-0.1, high=0.1,size=(input_dim,))
        else:
            self.bias = None
            
    def get_weights():
        return (self.kernel, self.recurrent_kernel, self.classifier_kernel)

    @staticmethod
    def __init_orthogonal(param):
        """
        Initializes weight parameters orthogonally.
        This is a common initiailization for recurrent neural networks.

        Refer to this paper for an explanation of this initialization:
            https://arxiv.org/abs/1312.6120
        """
        if param.ndim < 2:
            raise ValueError("Only parameters with 2 or more dimensions are supported.")

        rows, cols = param.shape

        new_param = np.random.randn(rows, cols)

        if rows < cols:
            new_param = new_param.T

        # Compute QR factorization
        q, r = np.linalg.qr(new_param)

        d = np.diag(r, 0)
        ph = np.sign(d)
        q *= ph

        if rows < cols:
            q = q.T

        new_param = q

        return new_param

    

    def forward(self, inputs, states):
        """
        inputs: [[9]] shape: (1, input_dim)
        self.kernel:  shape: (input_dim, self.units*4)
        z:            shape: (input_dim, self.units*4) 
        """
        h_tm1, c_tm1 = states  # stm, ltm
        
        # apply weights for inputs
        concat = np.dot(inputs, self.kernel)
        
        # apply weights for stm
        concat += np.dot(h_tm1, self.recurrent_kernel)
        
        # apply bias
        if self.use_bias:
            concat += self.bias
        concat = np.split(concat, 4, axis=1)
        c, o = self._apply_activations(concat, c_tm1) # candidate (new ltm), output gate 
        
        h = o * self.activation(c) # new stm value
                
        forward_cache = {
            "i": i,
            "f": f,
            "ct": c,
            "o": o,
            "ht": h,
            "h_tm1": h_tm1,
            "c_tm1": c_tm1,
            "inputs": inputs
        }
        # 1st return is when CURRENT lstm is final layer, 2nd return when it is feeded to next layer
        return forward_cache, (h, c) 
    
    def _apply_activations(self, concat, c_tm1):
        concat0, concat1, concat2, concat3 = concat
        i = self.recurrent_activation(concat0)
        f = self.recurrent_activation(concat1)
        c = f * c_tm1 + i * self.activation(concat2)
        o = self.recurrent_activation(concat3)
        return c, o


    def __clip_gradient_norm(self, grads, max_norm=0.25):
        """
        Clips gradients to have a maximum norm of `max_norm`.
        This is to prevent the exploding gradients problem.
        """
        # Set the maximum of the norm to be of type float
        max_norm = float(max_norm)
        total_norm = 0
        # Calculate the L2 norm squared for each gradient and add them to the total norm
        for gate, grad in grads["weights"].items():
            grad_norm = np.sum(np.power(grad, 2))
            total_norm += grad_norm
        total_norm = np.sqrt(total_norm)
        # Calculate clipping coeficient
        clip_coef = max_norm / (total_norm + 1e-6)
        # If the total norm is larger than the maximum allowable norm, then clip the gradient
        if clip_coef < 1:
            for gate, grad in grads["weights"].items():
                grad *= clip_coef
        return grads
    
    def __default_grads(self, kernel, recurrent_kernel, classification_kernel):
        grad_kernel = np.empty_like(kernel)
        grad_recurrent_kernel = np.empty_like(recurrent_kernel)
        grad_classification_kernel = np.empty_like(classification_kernel)
        return np.array(grad_kernel), np.array(grad_recurrent_kernel), np.array(grad_classification_kernel)
    
    
    def compute_gradients(self, dh, forward_cache, parameters):
        kernel, recurrent_kernel, classification_kernel = parameters
        grad_kernel, grad_recurrent_kernel, grad_classification_kernel = self.__default_grads(kernel, recurrent_kernel, classification_kernel)
#         k_i, k_f, k_c, k_o = np.split(kernel, 4, axis=1)
#         r_i, r_f, r_c, r_o = np.split(recurrent_kernel, 4, axis=1)
        
        # Backward propagation algorithm
        # https://www.geeksforgeeks.org/lstm-derivation-of-back-propagation-through-time/
        # https://medium.com/@aidangomez/let-s-do-this-f9b699de31d9
        # https://chat.openai.com/?model=text-davinci-002-render-sha
        # Set the next cell and hidden state equal to zero
        stm, ltm = forward_cache["ht"], forward_cache["ct"]
        prev_stm, prev_ltm = forward_cache["h_tm1"], forward_cache["c_tm1"]

        forget_gate, output_gate = forward_cache["f"], forward_cache["o"]
        candidate_gate, input_gate = forward_cache["c"], forward_cache["i"]
        inputs = forward_cache["inputs"]
        dh_next = np.zeros_like(h)
        dC_next = np.zeros_like(c)
        
        # dh is gradient of softmax
        # d_output_gate 
        dC = dh * output_gate * tanh_gradient(ltm)                        # Correct
        
        d_output_gate = dh * np.tanh(ltm) * sigmoid_gradient(output_gate) # Correct
        
        d_candidate_gate = dC * input_gate * tanh_gradient(candidate_gate) # Correct
        
        d_input_gate = dC * candidate_gate * sigmoid_gradient(input_gate) # Correct
        
        d_forget_gate = dC * prev_ltm * sigmoid_gradient(forget_gate)     # Correct
        
        d_gates = np.array(d_input_gate, d_forget_gate, d_candidate_gate, d_output_gate)
        
        
        
        grad_kernel = np.dot(kernel.T, d_gates)
        grad_recurrent_kernel = np.dot(recurrent_kernel.T, d_gates)
        grad_classification_kernel = np.dot(classification_kernel.T, dh)
        
        return (grad_kernel, grad_recurrent_kernel, grad_classification_kernel)

        
        
        # check with if statement whether to add [t+1]
#             d_ltm[t] = loss * output_gate[t] * (1 - tanh(ltm[t])*tanh(ltm[t]))
#             if t == self.units:
#                 d_ltm[t] += ltm[t+1] * forget_gate[t+1]
            
#             d_candidate[t] = d_ltm[t] * input_gate[t] * (1-candidate_gate[t]*candidate_gate[t])
#             d_input[t] = d_ltm[t] * candidate_gate[t] * input_gate[t] * (1-input_gate[t])
#             d_forget[t] = d_ltm[t] * prev_ltm * forget_gate[t] * (1 - forget_gate[t])
#             d_output[t] = loss * tanh(ltm[t]) * output_gate[t] * (1-output_gate[t])
            
#             d_gates = np.array(d_candidate[t], d_input[t], d_forget[t], d_output[t])            
            
        

class Optimizer:
    def __init__(self, lr=0.01):
        self.lr = lr
        
    def _Adam(self, parameters, gradients, learning_rate, global_step, beta1=0.9, beta2=0.999, epsilon=1e-8):
        """
        Adam optimization algorithm implementation.

        Arguments:
        parameters -- dictionary containing model parameters
        gradients -- dictionary containing gradients of model parameters
        learning_rate -- learning rate for the algorithm
        beta1 -- exponential decay rate for the first moment estimates (default: 0.9)
        beta2 -- exponential decay rate for the second moment estimates (default: 0.999)
        epsilon -- small constant to prevent division by zero (default: 1e-8)

        Returns:
        parameters -- updated model parameters
        """
        # Initialize the first and second moment estimates to zero
        first_moment = {}
        second_moment = {}

        # Initialize the parameters with zeros
        for param_name, param in parameters.items():
            first_moment[param_name] = np.zeros_like(param)
            second_moment[param_name] = np.zeros_like(param)

        # Perform Adam update for each parameter
        for param_name, param in parameters.items():
            # Update first moment estimate
            first_moment[param_name] = beta1 * first_moment[param_name] + (1 - beta1) * gradients[param_name]

            # Update second moment estimate
            second_moment[param_name] = beta2 * second_moment[param_name] + (1 - beta2) * np.square(gradients[param_name])

            # Bias correction
            first_moment_corrected = first_moment[param_name] / (1 - np.power(beta1, global_step))
            second_moment_corrected = second_moment[param_name] / (1 - np.power(beta2, global_step))

            # Update parameters
            parameters[param_name] -= learning_rate * first_moment_corrected / (np.sqrt(second_moment_corrected) + epsilon)

        return parameters
        
    
def apply_gradients(gradients, weights, inputs, prev_stm, stm, lr=0.01):
    grad_kernel, grad_recurrent_kernel, grad_classification_kernel = gradients
    kernel, recurrent_kernel, classification_kernel = weights

    grad_kernel *= inputs
    grad_recurrent_kernel *= prev_stm
    grad_classification_kernel *= stm

    new_kernel = kernel - grad_kernel * self.lr
    new_recurrent_kernel = recurrent_kernel - grad_recurrent_kernel * self.lr
    classification_kernel = classification_kernel - grad_classification_kernel * self.lr

    return new_kernel, new_recurrent_kernel, classification_kernel
        

# x,y,z = MY_LSTM(2).__default_grads([1,2,3], [3,2,1], [1])
            
            

In [44]:
### """
# TESTING OF FORWARD PASS WITH DUMMY VALUES
# """
def build_model(units, input_dim):
    lstm = MY_LSTM(units=units, optimizer=0)
    lstm.build(input_dim)
    return lstm



# def get_

np.random.seed(228)
# # Example data dimensions
input_size = 1  # Number of features in the input
units = 6  # Number of units in the hidden state/memory cell
x = np.array([[9]])
y = np.array([[1.0]])

model = build_model(units=units, input_dim=x.shape)

prev_stm = np.zeros((1, units))
prev_ltm = np.zeros((1, units))
states = (prev_stm, prev_ltm)
print("\n=========Printing for MY_LSTM===============\n")
print("INPUTS\n")
print("(x) Input:\n", x)
print("(prev_stm) {h} Previous hidden state:\n", prev_stm)
print("(prev_ltm) {c} Previous memory cell:\n", prev_ltm)
print("\n=========PERFORM FORWARD PASS==============\n")
forward_pass = model.forward(inputs=x, states=states)
h, states = forward_pass

classifier_output = sigmoid(np.dot(h, model.classifier_kernel))
if model.use_bias:
    classifier_output += model.classifier_bias
print("===========OUTPUT============================")
print("states: \n", states)
print("classifier_output: ",classifier_output)
loss = cross_entropy_loss(classifier_output, y)
print("loss: ", loss)


# print("\n\n======START OF BACKWARDPROPAGATION=======\n")
# print(f"softmax(Output)\n {output_softmax}")
# model.backward(forward_pass=forward_pass, prediction=output_softmax, targets=[[1.0],[0.0]])
# print("\n\n======FINISH OF BACKWARDPROPAGATION======\n")



INPUTS

(x) Input:
 [[9]]
(prev_stm) {h} Previous hidden state:
 [[0. 0. 0. 0. 0. 0.]]
(prev_ltm) {c} Previous memory cell:
 [[0. 0. 0. 0. 0. 0.]]


states: 
 [array([[ 0.09366264,  0.00137708, -0.07950929, -0.11217219,  0.35705341,
        -0.02163748]]), array([[ 0.77131783,  0.00863679, -0.12254149, -0.66960698,  0.9640742 ,
        -0.05081409]])]
classifier_output:  [[0.54247906]]
loss:  0.6116058007133529


In [None]:
def build_model(X):
    model = Sequential()
    print(X.shape[1])
    model.add(Embedding(num_words, embed_dim,input_length = X.shape[1]))
    model.add(LSTM(lstm_out))
    model.add(Dropout(0.2))
    model.add(Dense(2,activation='softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])

    return model


In [None]:
def train_split(X, Y, test_size=0.8):
    length = int(len(X) * test_size)
    X_train = X[1:length]
    Y_train = Y[1:length]

    X_valid = X[length:]
    Y_valid = Y[length:]
    return X_train, X_valid, Y_train, Y_valid


def my_train(model, X, Y, hidden_size, input_size, num_epochs=2):
    X_train, X_valid, Y_train, Y_valid = train_split(X = X, Y = Y)
    training_loss, validation_loss = [], []

    for i in range(num_epochs):

        epoch_training_loss = 0
        epoch_validation_loss = 0
        
        sentence_size = len(X_train)
        it = 0
        for sentence, targets in zip(X_train, Y_train):

            prev_stm = np.zeros((1, hidden_size))
            prev_ltm = np.zeros((1, hidden_size))
            forward_cache = []

       
            for word in sentence:
                word = np.array([[word]])
                forward_pass, prev_stm, prev_ltm = model.forward(word, prev_stm, prev_ltm)
                forward_cache.append(forward_pass)
            
            prediction = sigmoid(prev_stm) * self.classifier_kernel + bias.classifier_kernel
            loss = # calculate loss (prediction, targets)
            dh = loss * prediction
            gradients = backward(dh, forward_cache) # start backward with 
            weights = model.get_weights()
            apply_gradients(gradients, weights, inputs, prev_stm, stm, lr=0.01):
            
            epoch_training_loss += loss
            it += 1

        sentence_size = len(X_valid) 
        it = 0
        for sentence, targets in zip(X_valid, Y_valid):            
            prev_stm = np.zeros((1, hidden_size))
            prev_ltm = np.zeros((1, hidden_size))
    
            for word in sentence:
                word = np.array([[word]])
                forward_pass = model.forward(word, prev_stm, prev_ltm)
                prev_stm = forward_pass["next_stm"]
                prev_ltm = forward_pass["next_ltm"]
            
            print("forward_pass['next_stm']", forward_pass["next_stm"])
            output_softmax = np.dot(forward_pass["next_stm"], model.parameters["weights"]["OutputSoftmax"]) + model.parameters["bias"]["OutputSoftmax"]
            print("=========BEFORE SOFTMAX=========\n", output_softmax)
            output_softmax = sigmoid(output_softmax.reshape(1,1))
            print("=========AFTER SOFTMAX=========\n", output_softmax)

            
            loss = model.calculate_loss(output_softmax, targets)

            # Update loss
            epoch_validation_loss += loss
            it += 1
#             print(f"Epoch {i}, {it} out of {sentence_size} loss: ", loss)



        # Save loss for plot
        training_loss.append(epoch_training_loss / len(X_train))
        validation_loss.append(epoch_validation_loss / len(X_valid))

        # Print loss every 2 epochs
        import statistics
        print(f'Epoch {i+1}, training loss: {statistics.mean(training_loss)}, validation loss: {statistics.mean(validation_loss)}')
        print(f'sentence sentence {i}:')
        print(sentence)

        print(f'\nTarget sequence {i}:')
        print(targets)

        print('\nPredicted sequence:')
        output_softmax = np.dot(forward_pass["next_stm"], model.parameters["weights"]["OutputSoftmax"]) + model.parameters["bias"]["OutputSoftmax"]
        output_softmax = softmax(output_softmax.reshape(1,1))

        print(output_softmax)
    return training_loss, validation_loss

In [None]:
"""
====================FINAL=====================
Preparing the dataset
"""

def convert(x):
    """
    Coverting JSON to pandas dataframe

    """    
    ob = json.loads(x)
    for k, v in ob.items():
        if isinstance(v, list):
            ob[k] = ','.join(v)
        elif isinstance(v, dict):
            for kk, vv in v.items():
                ob['%s_%s' % (k, kk)] = vv
            del ob[k]
    return ob



def filter_data(data):
    """
    Converting into pandas dataframe and filtering only text and ratings given by the users
    """

    df = pd.DataFrame([convert(line) for line in data])
    df.drop(columns=df.columns.difference(['text','stars']),inplace=True)
    df.loc[:, ("sentiment")] = 0
    

#I have considered a rating above 3 as positive and less than or equal to 3 as negative.
    df.loc[:,'sentiment']=['pos' if (x>3) else 'neg' for x in df.loc[:, 'stars']]
    df.loc[:,'text'] = df.loc[:,'text'].apply(lambda x: x.lower())
    df.loc[:,'text'] = df.loc[:,'text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))
    for idx,row in df.iterrows():
        df.loc[:,'text']= [x for x in df.loc[:,'text']]
    return df

def read_data():
    json_filename = 'review_mockup_500.json'
    with open(json_filename,'rb') as f:
        data = f.readlines()
    data = filter_data(data)
    tokenizer = Tokenizer(num_words = num_words, split=' ')
    tokenizer.fit_on_texts(data.loc[:,'text'].values)
    X = tokenizer.texts_to_sequences(data.loc[:,'text'].values)
    Y = pd.get_dummies(data['sentiment'], dtype=int).values[:, 0]   
    return X, Y

X, Y = read_data()
print(Y)

In [None]:
np.random.seed(1337)
batch_size = 1  # Number of training examples
input_size = 1  # Number of features in the input
hidden_size = 4  # Number of units in the hidden state/memory cell
model = MY_LSTM.my_build_model(hidden_size=hidden_size, input_size=input_size)
prev_stm = np.zeros((batch_size, hidden_size))
prev_ltm = np.zeros((batch_size, hidden_size))

my_train(model=model, X=X, Y=Y, hidden_size=hidden_size, input_size=input_size, num_epochs=5)