In [151]:
# import pandas as pd
#  import matplotlib.pyplot as plt
from activation_functions import softmax
from tensorflow.keras import activations
tf_sigmoid = activations.sigmoid
def sigmoid(X):
    return tf_sigmoid(X).numpy()

def sigmoid_gradient(x):
    sigmoid_x = sigmoid(x)
    return sigmoid_x * (1 - sigmoid_x)

def tanh_gradient(x):
    return 1 - np.tanh(x) ** 2

import numpy as np
import pandas as pd
import json
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.losses import BinaryCrossentropy
import tensorflow as tf
import re
from sklearn.metrics import mean_squared_error
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()

# from tensorflow.keras.losses import CategoricalCrossentropy
bce = tf.keras.losses.BinaryCrossentropy(from_logits=True)


In [187]:
def cross_entropy_loss(y_pred, y_true, epsilon=1e-10):
    y_pred = np.clip(y_pred, epsilon, 1.0 - epsilon)
    num_samples = y_pred.shape[0]
    loss = -np.sum(y_true * np.log(y_pred + epsilon)) / num_samples
    return loss

class MY_LSTM:
    def __init__(self, units, optimizer=Optimizer(), recurrent_activation=sigmoid, activation=np.tanh, use_bias=True):
        self.units = units
        self.output_size = 1
        self.optimizer = optimizer
        self.USE_OPTIMIZER = True
        self.recurrent_activation = recurrent_activation
        self.activation = activation
        self.use_bias = use_bias
        

                
    def build(self, input_shape):
#         input_dim = input_shape[-1]
        input_dim = input_shape
        # self.kernel is used for a new information passed to lstm, thus it has shape of (input.dim, self.units * 4)
        # multiplication by 4 because we have 4 gates (forget, input, candidate, output)
        self.kernel = self.__init_orthogonal(np.empty(shape=(input_dim, self.units * 4)))
        
        # self.recurrent_kernel is used for previous state passed to lstm, thus it has shape of (self.units, self.units * 4)
        # multiplication by 4 because we have 4 gates (forget, input, candidate, output)
        self.recurrent_kernel = self.__init_orthogonal(np.empty(shape=(self.units, self.units * 4)))
        
          # Classifier weights and biases.
        self.classifier_kernel = self.__init_orthogonal(np.empty(shape=(self.units, input_dim)))

        
        if self.use_bias:
            # Bias initialization which are self.units*4 in the end of all concatination
            self.bias = np.random.uniform(low=-0.1, high=0.1, size=(self.units * 4,))
            self.classifier_bias = np.random.uniform(low=-0.1, high=0.1,size=(input_dim,))
        else:
            self.bias = None
            
    def get_weights(self):
        return (self.kernel, self.recurrent_kernel, self.classifier_kernel)
    
    def set_weights(self, weights):
        self.kernel, self.recurrent_kernel, self.classification_kernel = weights

    @staticmethod
    def __init_orthogonal(param):
        """
        Initializes weight parameters orthogonally.
        This is a common initiailization for recurrent neural networks.

        Refer to this paper for an explanation of this initialization:
            https://arxiv.org/abs/1312.6120
        """
        if param.ndim < 2:
            raise ValueError("Only parameters with 2 or more dimensions are supported.")

        rows, cols = param.shape

        new_param = np.random.randn(rows, cols)

        if rows < cols:
            new_param = new_param.T

        # Compute QR factorization
        q, r = np.linalg.qr(new_param)

        d = np.diag(r, 0)
        ph = np.sign(d)
        q *= ph

        if rows < cols:
            q = q.T

        new_param = q

        return new_param

    

    def forward(self, inputs, states):
        """
        inputs: [[9]] shape: (1, input_dim)
        self.kernel:  shape: (input_dim, self.units*4)
        z:            shape: (input_dim, self.units*4) 
        """
        h_tm1, c_tm1 = states  # stm, ltm
        
        # apply weights for inputs
        concat = np.dot(inputs, self.kernel)
        
        # apply weights for stm
        concat += np.dot(h_tm1, self.recurrent_kernel)
        
        # apply bias
        if self.use_bias:
            concat += self.bias
        concat = np.split(concat, 4, axis=1)
        i, f, c, o = self._apply_activations(concat, c_tm1) # candidate (new ltm), output gate 
        
        h = o * self.activation(c) # new stm value
                
        forward_cache = {
            "i": i,
            "f": f,
            "ct": c,
            "o": o,
            "ht": h,
            "h_tm1": h_tm1,
            "c_tm1": c_tm1,
            "inputs": inputs
        }
        # 1st return is when CURRENT lstm is final layer, 2nd return when it is feeded to next layer
        return forward_cache, (h, c) 
    
    def _apply_activations(self, concat, c_tm1):
        concat0, concat1, concat2, concat3 = concat
        i = self.recurrent_activation(concat0)
        f = self.recurrent_activation(concat1)
        c = f * c_tm1 + i * self.activation(concat2)
        o = self.recurrent_activation(concat3)
        return i,f,c, o


    
            
        

class Optimizer:
    def __init__(self, lr=0.01):
        self.lr = lr
        self.global_step = 1
        
    def _Adam(self, kernels, gradients, learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8):
        """
        Adam optimization algorithm implementation.

        Arguments:
        """
        # Initialize the first and second moment estimates to zero
       
        

        # Perform Adam update for each parameter
        for kernel, gradient in zip(kernels, gradients):
            # Update first moment estimate
            first_moment = beta1 * kernel + (1 - beta1) * gradient

            # Update second moment estimate
            second_moment = beta2 * kernel + (1 - beta2) * np.square(gradient)

            # Bias correction
            first_moment_corrected = first_moment / (1 - np.power(beta1, self.global_step))
            second_moment_corrected = second_moment / (1 - np.power(beta2, self.global_step))

            # Update parameters
            kernel -= learning_rate * first_moment_corrected / (np.sqrt(second_moment_corrected) + epsilon)
        self.global_step += 1
        
        return kernels
    def __clip_gradient_norm(self, gradient, max_norm=0.25):
        """
        Clips gradients to have a maximum norm of `max_norm`.
        This is to prevent the exploding gradients problem.
        """
        # Set the maximum of the norm to be of type float
        max_norm = float(max_norm)
        total_norm = 0
        # Calculate the L2 norm squared for each gradient and add them to the total norm
        grad_norm = np.sum(np.power(gradient, 2))
        total_norm += grad_norm
        total_norm = np.sqrt(total_norm)
        # Calculate clipping coeficient
        clip_coef = max_norm / (total_norm + 1e-6)
        # If the total norm is larger than the maximum allowable norm, then clip the gradient
        if clip_coef < 1:
            gradient *= clip_coef
        return gradient
    
    def __default_grads(self, kernels):
        kernel, recurrent_kernel, classification_kernel = kernels

        grad_kernel = np.empty_like(kernel)
        grad_recurrent_kernel = np.empty_like(recurrent_kernel)
        grad_classification_kernel = np.empty_like(classification_kernel)
        return np.array(grad_kernel), np.array(grad_recurrent_kernel), np.array(grad_classification_kernel)
    
    
    def compute_gradient(self, dh, forward_cache, kernels):
        """
        dh:              is a gradient of classifier kernel
        forward_cache:   is results from forward_pass
        kernels:         are weights of LSTM
        
        returns          tuple(grad_kernel, grad_recurrent_kernel, grad_classification_kernel)
        """
        kernel, recurrent_kernel, classification_kernel = kernels
        grad_kernel, grad_recurrent_kernel, grad_classification_kernel = self.__default_grads(kernels)

        shape = grad_classification_kernel.shape[0] * 4
        # Backward propagation algorithm
        # https://www.geeksforgeeks.org/lstm-derivation-of-back-propagation-through-time/
        # https://medium.com/@aidangomez/let-s-do-this-f9b699de31d9
        # https://chat.openai.com/?model=text-davinci-002-render-sha
        # Set the next cell and hidden state equal to zero
        stm, ltm, prev_ltm = forward_cache["ht"], forward_cache["ct"], forward_cache["c_tm1"]
        forget_gate, output_gate = forward_cache["f"], forward_cache["o"]
        candidate_gate, input_gate = forward_cache["ct"], forward_cache["i"]
        
        # dh is gradient of classifier kernel
        dC = dh * output_gate * tanh_gradient(ltm)                        # Correct
        
        d_output_gate = dh * np.tanh(ltm) * sigmoid_gradient(output_gate) # Correct
        
        d_candidate_gate = dC * input_gate * tanh_gradient(candidate_gate) # Correct
        
        d_input_gate = dC * candidate_gate * sigmoid_gradient(input_gate) # Correct
        
        d_forget_gate = dC * prev_ltm * sigmoid_gradient(forget_gate)     # Correct
        
        d_gates = np.array([d_input_gate, d_forget_gate, d_candidate_gate, d_output_gate]).reshape(shape,1)
        
        
        grad_kernel = self.__clip_gradient_norm(np.dot(kernel, d_gates))
        grad_recurrent_kernel = self.__clip_gradient_norm(np.dot(recurrent_kernel, d_gates))
        grad_classification_kernel = self.__clip_gradient_norm(np.dot(classification_kernel, dh))
        
        return (grad_kernel, grad_recurrent_kernel, grad_classification_kernel)
    
    def apply_gradients(self, gradients, weights, inputs, prev_stm, stm, lr=0.1):
        """
        gradients    gradients after execution of self.compute_gradients()
        weights      current weights of LSTM
        inputs       last input to lstm model (!!!!!!!!!!!THIS PART IS INCORRECT!!!!!!!!!!!!)
        prev_stm     previous hidden_state to which recurrent_kernel weights are a
        stm          current hidden_state  to which classification_kernel weight is applied
        """
        grad_kernel, grad_recurrent_kernel, grad_classification_kernel = gradients

        # applying optimizer
        kernels = self._Adam(weights)

        return kernels
        

# x,y,z = MY_LSTM(2).__default_grads([1,2,3], [3,2,1], [1])
            
            

In [179]:
### """
# TESTING OF FORWARD PASS WITH DUMMY VALUES
# """



# def get_

np.random.seed(228)
# # Example data dimensions
input_size = 1  # Number of features in the input
units = 6  # Number of units in the hidden state/memory cell
x = np.array([[12]])
y = np.array([[1.0]])

model = build_model(units=units, input_dim=1)

prev_stm = np.zeros((1, units))
prev_ltm = np.zeros((1, units))
states = (prev_stm, prev_ltm)
print("\n=========Printing for MY_LSTM===============\n")
print("INPUTS\n")
print("(x) Input:\n", x)
print("(prev_stm) {h} Previous hidden state:\n", prev_stm)
print("(prev_ltm) {c} Previous memory cell:\n", prev_ltm)
print("\n=========PERFORM FORWARD PASS==============\n")
forward_pass = model.forward(inputs=x, states=states)
forward_cache, (h, c) = forward_pass
print("h", h)
print("c", c)

classifier_output = sigmoid(np.dot(h, model.classifier_kernel))
if model.use_bias:
    classifier_output += model.classifier_bias
print("===========OUTPUT============================")
print("states: \n", states)
print("classifier_output: ",classifier_output)
loss = cross_entropy_loss(classifier_output, y)
print("loss: ", loss)

dh = loss * sigmoid_gradient(classifier_output)

print("\n\n======START OF BACKWARDPROPAGATION=======\n")
# print(f"softmax(Output)\n {output_softmax}")
model.backward(dh=dh, forward_cache=forward_cache, parameters=model.get_weights())
print("\n\n======FINISH OF BACKWARDPROPAGATION======\n")



INPUTS

(x) Input:
 [[12]]
(prev_stm) {h} Previous hidden state:
 [[0. 0. 0. 0. 0. 0.]]
(prev_ltm) {c} Previous memory cell:
 [[0. 0. 0. 0. 0. 0.]]


h [[ 6.10012290e-02  2.74413000e-04 -5.37516347e-02 -8.40681687e-02
   3.54052311e-01 -1.21627718e-02]]
c [[ 0.88698052  0.00281794 -0.07738788 -0.81545595  0.98984162 -0.03009637]]
states: 
 (array([[0., 0., 0., 0., 0., 0.]]), array([[0., 0., 0., 0., 0., 0.]]))
classifier_output:  [[0.54180175]]
loss:  0.6128551247310724





AttributeError: 'MY_LSTM' object has no attribute 'backward'

In [19]:
def build_model(X):
    model = Sequential()
    print(X.shape[1])
    model.add(Embedding(num_words, embed_dim,input_length = X.shape[1]))
    model.add(LSTM(lstm_out))
    model.add(Dropout(0.2))
    model.add(Dense(2,activation='softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])

    return model


In [197]:
def train_split(X, Y, test_size=0.8):
    length = int(len(X) * test_size)
    X_train = X[1:length]
    Y_train = Y[1:length]

    X_valid = X[length:]
    Y_valid = Y[length:]
    return X_train, X_valid, Y_train, Y_valid


def my_train(model, X, Y, hidden_size, input_size, num_epochs=2):
    X_train, X_valid, Y_train, Y_valid = train_split(X = X, Y = Y)
    training_loss, validation_loss = [], []

    for i in range(num_epochs):

        epoch_training_loss = 0
        epoch_validation_loss = 0
        
        sentence_size = len(X_train)
        it = 0
        for sentence, targets in zip(X_train, Y_train):

            prev_stm = np.zeros((1, hidden_size))
            prev_ltm = np.zeros((1, hidden_size))
            states = (prev_stm, prev_ltm)
            forward_cache = []

       
            for word in sentence:
                word = np.array([[word]])
                forward_pass, states = model.forward(word, states)
                forward_cache.append(forward_pass)
            
            prediction = np.dot(sigmoid(prev_stm), model.classifier_kernel) + model.classifier_bias
            forward_cache = forward_cache[-1]
            print("prediction", prediction)
            print("targets", targets)

            loss = bce([[targets]],prediction).numpy() # calculate loss (prediction, targets)
            print("Loss: ", loss)
            dh = loss * sigmoid_gradient(prediction)
            kernels = model.get_weights()
            gradients = model.optimizer.compute_gradient(dh, forward_cache, kernels)

#             print("Gradients: ", gradients)
            weights = apply_gradients(gradients, kernels, word, forward_cache["h_tm1"], forward_cache["ht"], lr=0.01)
#             print("WEIGHTS BEFORE UPDATE: ", model.get_weights())
            model.set_weights(weights)
#             print("WEIGHTS AFTER UPDATE: ", model.get_weights())

            epoch_training_loss += loss
            it += 1

        sentence_size = len(X_valid) 
        it = 0
        for sentence, targets in zip(X_valid, Y_valid):            
            prev_stm = np.zeros((1, hidden_size))
            prev_ltm = np.zeros((1, hidden_size))
    
            for word in sentence:
                word = np.array([[word]])
                forward_pass = model.forward(word, prev_stm, prev_ltm)
                prev_stm = forward_pass["next_stm"]
                prev_ltm = forward_pass["next_ltm"]
            
#             print("forward_pass['next_stm']", forward_pass["next_stm"])
            output_softmax = np.dot(forward_pass["next_stm"], model.parameters["weights"]["OutputSoftmax"]) + model.parameters["bias"]["OutputSoftmax"]
#             print("=========BEFORE SOFTMAX=========\n", output_softmax)
            output_softmax = sigmoid(output_softmax.reshape(1,1))
#             print("=========AFTER SOFTMAX=========\n", output_softmax)

            
            loss = model.calculate_loss(output_softmax, targets)

            # Update loss
            epoch_validation_loss += loss
            it += 1
#             print(f"Epoch {i}, {it} out of {sentence_size} loss: ", loss)



        # Save loss for plot
        training_loss.append(epoch_training_loss / len(X_train))
        validation_loss.append(epoch_validation_loss / len(X_valid))

        # Print loss every 2 epochs
        import statistics
        print(f'Epoch {i+1}, training loss: {statistics.mean(training_loss)}, validation loss: {statistics.mean(validation_loss)}')
        print(f'sentence sentence {i}:')
        print(sentence)

        print(f'\nTarget sequence {i}:')
        print(targets)

        print('\nPredicted sequence:')
        output_softmax = np.dot(forward_pass["next_stm"], model.parameters["weights"]["OutputSoftmax"]) + model.parameters["bias"]["OutputSoftmax"]
        output_softmax = softmax(output_softmax.reshape(1,1))

        print(output_softmax)
    return training_loss, validation_loss

In [198]:
"""
====================FINAL=====================
Preparing the dataset
"""

def convert(x):
    """
    Coverting JSON to pandas dataframe

    """    
    ob = json.loads(x)
    for k, v in ob.items():
        if isinstance(v, list):
            ob[k] = ','.join(v)
        elif isinstance(v, dict):
            for kk, vv in v.items():
                ob['%s_%s' % (k, kk)] = vv
            del ob[k]
    return ob



def filter_data(data):
    """
    Converting into pandas dataframe and filtering only text and ratings given by the users
    """

    df = pd.DataFrame([convert(line) for line in data])
    df.drop(columns=df.columns.difference(['text','stars']),inplace=True)
    df.loc[:, ("sentiment")] = 0
    

#I have considered a rating above 3 as positive and less than or equal to 3 as negative.
    df.loc[:,'sentiment']=['pos' if (x>3) else 'neg' for x in df.loc[:, 'stars']]
    df.loc[:,'text'] = df.loc[:,'text'].apply(lambda x: x.lower())
    df.loc[:,'text'] = df.loc[:,'text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))
    for idx,row in df.iterrows():
        df.loc[:,'text']= [x for x in df.loc[:,'text']]
    return df

def min_max_normalize(tokens):
    min_val = min(tokens)
    max_val = max(tokens)
    normalized_tokens = [(token - min_val) / (max_val - min_val) for token in tokens]
    return normalized_tokens

def read_data():
    json_filename = 'review_mockup_500.json'
    with open(json_filename,'rb') as f:
        data = f.readlines()
    data = filter_data(data)
    tokenizer = Tokenizer(num_words = 2500, split=' ')
    tokenizer.fit_on_texts(data.loc[:,'text'].values)
    X = tokenizer.texts_to_sequences(data.loc[:,'text'].values)
#     X = [min_max_normalize(i) for i in X]

    Y = pd.get_dummies(data['sentiment'], dtype=int).values[:, 0]   
    return X, Y

X, Y = read_data()
print(X)

[[38, 20, 1038, 5, 202, 45, 43, 31, 1477, 8, 9, 133, 5, 170, 52, 203, 259, 46, 1478, 5, 507, 13, 21, 140, 8, 907, 211, 97, 4, 174, 5, 47, 8, 4, 21, 61, 5, 44, 76, 1039, 11, 1216, 2, 95, 22, 3, 245, 116, 1, 25, 9, 28, 17, 8, 1040, 3, 30, 164, 41, 5, 107, 39, 1, 1041, 9, 30, 722, 17, 400, 448, 13, 21, 43, 22, 69, 143, 1217, 190, 13, 908, 108, 69, 164, 351, 13, 400, 10, 154, 651, 58, 82, 18, 1, 801, 11, 126, 5, 31, 251], [99, 723, 3, 227, 7, 1479, 109, 1, 237, 2, 252, 1926, 5, 1, 1479, 27, 1927, 1928, 46, 1, 77, 195, 426, 2, 127, 1929, 5, 1, 724, 2, 191, 1480, 9, 3, 281, 909, 212, 39, 10, 569, 175, 5, 1042, 11, 1, 652, 1481, 427, 8, 428, 5, 1482, 1043, 2, 470, 62, 253, 5, 238, 60, 108, 11, 47, 143, 137, 20, 89, 34, 9, 62, 108, 4, 70, 910, 15, 294, 449, 605, 1930, 1, 309, 7, 1927, 1928, 3, 1218, 39, 1044, 10, 1931, 2, 9, 24, 35, 9, 134, 1932, 10, 37, 7, 134, 5, 64, 9, 80, 1933, 11, 5, 1479, 5, 320, 2, 9, 471, 5, 1934, 2, 1483, 46, 569, 1930, 80, 3, 911, 18, 134, 541, 72, 59, 802, 84, 1935,

In [199]:
def build_model(units, input_dim):
    lstm = MY_LSTM(units=units, optimizer=Optimizer())
    lstm.build(input_dim)
    return lstm

np.random.seed(1337)
batch_size = 1  # Number of training examples
input_dim = 1  # Number of features in the input
units = 4  # Number of units in the hidden state/memory cell
model = build_model(units=units, input_dim=input_dim)
prev_stm = np.zeros((batch_size, hidden_size))
prev_ltm = np.zeros((batch_size, hidden_size))

my_train(model=model, X=X, Y=Y, hidden_size=hidden_size, input_size=input_size, num_epochs=5)

prediction [[0.37256323]]
targets 0
Loss:  0.8966797882853172
prediction [[0.37256323]]
targets 1
Loss:  0.5241165591433968
prediction [[0.37256323]]
targets 0
Loss:  0.8966797882853172
prediction [[0.37256323]]
targets 0
Loss:  0.8966797882853172
prediction [[0.37256323]]
targets 1
Loss:  0.5241165591433968
prediction [[0.37256323]]
targets 0
Loss:  0.8966797882853172
prediction [[0.37256323]]
targets 0
Loss:  0.8966797882853172
prediction [[0.37256323]]
targets 1
Loss:  0.5241165591433968
prediction [[0.37256323]]
targets 1
Loss:  0.5241165591433968
prediction [[0.37256323]]
targets 0
Loss:  0.8966797882853172
prediction [[0.37256323]]
targets 0
Loss:  0.8966797882853172
prediction [[0.37256323]]
targets 0
Loss:  0.8966797882853172
prediction [[0.37256323]]
targets 0
Loss:  0.8966797882853172
prediction [[0.37256323]]
targets 0
Loss:  0.8966797882853172
prediction [[0.37256323]]
targets 0
Loss:  0.8966797882853172
prediction [[0.37256323]]
targets 0
Loss:  0.8966797882853172
predicti

KeyboardInterrupt: 