# Sentiment Analysis through Metric Learning

In this project, we are utilizing the 

"@inproceedings{sazzed2019sentiment, title={A Sentiment Classification in Bengali and Machine Translated English Corpus}, author={Sazzed, Salim and Jayarathna, Sampath}, booktitle={2019 IEEE 20th International Conference on Information Reuse and Integration for Data Science (IRI)}, pages={107--114}, year={2019}, organization={IEEE} }"

data to implement sentiment analysis through siamese network. 

There are in total 3307 negative comments and 8500 positive comments present

We will implement Metric Learning Techniques to proceed with the analysis. The Siamese Neural Network architecture would help us to achieve the implementation. The network would have two parts. One for distance learning, the other for sentiment classification.

## Files needed to import

In [1]:
import glob
import os
import numpy as np
import pandas as pd
import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU') 
tf.config.experimental.set_memory_growth(physical_devices[0], True)
from sklearn.utils.class_weight import compute_class_weight
from itertools import combinations
import random
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None 
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import re
import pickle
from sklearn.metrics import classification_report

## Making Pairs

In [2]:
def pair_generator(X, y):
    #equal number of similar and dissimilar pair geneartor
    x1 = np.zeros((int(np.ceil(X.shape[0]/2))*2, X.shape[1],))
    x2 = np.zeros((int(np.ceil(X.shape[0]/2))*2, X.shape[1],))
    y1_label =  np.zeros((int(np.ceil(X.shape[0]/2))*2, 1,))
    y2_label =  np.zeros((int(np.ceil(X.shape[0]/2))*2, 1,))
    y_simil =  np.zeros((int(np.ceil(X.shape[0]/2))*2, 1,))
    
    marker_comb = list(combinations(list(range(X.shape[0])),2))
    random.shuffle(marker_comb)
    
    simil_count = 0
    disimil_count = 0
    count = 0
    fill_up_count = 0
    
    
    while count<len(marker_comb):
        
        ids = marker_comb[count]
        
        one_val, one_label = X[ids[0]], y[ids[0]]
        two_val, two_label = X[ids[1]], y[ids[1]]
        
        if one_label == two_label:
            if simil_count<int(np.ceil(X.shape[0]/2)):
                x1[fill_up_count] = one_val
                x2[fill_up_count] = two_val
                y1_label[fill_up_count] = one_label
                y2_label[fill_up_count] = two_label
                
                simil_count +=1
                y_simil[fill_up_count] = 1 #similar labels
                fill_up_count += 1
                
                
        else:
            
            if disimil_count<int(np.ceil(X.shape[0]/2)):
                x1[fill_up_count] = one_val
                x2[fill_up_count] = two_val
                y1_label[fill_up_count] = one_label
                y2_label[fill_up_count] = two_label
                
                disimil_count += 1
                y_simil[fill_up_count] = -1 #dissimilar labels
                fill_up_count += 1
        count +=1
    x1 = x1[~np.all(y1_label == 0, axis=1)]
    x2 = x2[~np.all(y1_label == 0, axis=1)]
    y1_label = y1_label[~np.all(y1_label == 0, axis=1)]
    y2_label = y2_label[~np.all(y1_label == 0, axis=1)]
    y_simil = y_simil[~np.all(y1_label == 0, axis=1)]
    y_simil = np.array(y_simil, dtype = np.float32)
    
    print(count, simil_count, disimil_count)
    return x1, x2, y1_label, y2_label, y_simil

    

    
    

## Siamese network

In [3]:
def siamese_model(max_len, max_features,embedding_dim):
    #Here the siamese network part is defined
    input = tf.keras.Input(shape = (max_len,), name = 'Input')
    x = input
    x = tf.keras.layers.Embedding(
            input_dim=max_features,
            output_dim=embedding_dim,
            # Use masking to handle the variable sequence lengths
            mask_zero=True)(x)
    x = tf.keras.layers.LSTM(64)(x)
    x = tf.keras.layers.Dense(64, activation='relu')(x)
  
    return tf.keras.Model(inputs = input, outputs = x)
    

def classification_model():
    #Here the classification network part is defined
    input = tf.keras.Input(shape = (64,))
    x = input
    x = tf.keras.layers.Dense(units = 1,
                             name = 'Classification_output')(x) 
    return tf.keras.Model(inputs = input, outputs = x)

def contrastive_loss(y_true, y_pred):
    #contrastive loss
    print(y_true.shape, y_pred.shape)
    margin = 1
    square_pred = tf.math.square(y_pred)
    margin_square = tf.math.square(tf.math.maximum(margin - y_pred, 0))
    return tf.keras.backend.mean(y_true* square_pred + (1 - y_true) * margin_square)

def cross_entropy_loss_defined(y_true, y_pred):
    return tf.keras.losses.BinaryCrossentropy(from_logits=True)(y_true, y_pred)

def euclidean_distance(vects):
    x, y = vects
    sum_square = tf.math.reduce_sum(tf.math.square(x - y), axis = 1, keepdims=True)
    return tf.math.sqrt(tf.math.maximum(sum_square, tf.keras.backend.epsilon()))

def eucl_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0], 1)

def similarity_accuracy(y_true, y_pred):
    return tf.keras.backend.mean(tf.math.equal(y_true, tf.cast(y_pred< 0.5, y_true.dtype))) #it is less than .5 as the similar paris are 1

def non_nan_average(x):
    # Computes the average of all elements that are not NaN in a rank 1 tensor
    nan_mask = tf.math.is_nan(x)
    x = tf.boolean_mask(x, tf.logical_not(nan_mask))
    return tf.keras.backend.mean(x)
    

def class_accuracy(y_true, y_pred): 
    
    y_pred = tf.keras.activations.sigmoid(y_pred)
    y_pred = tf.where(y_pred>0.5, 1, -1)
    confusion_matrix = tf.math.confusion_matrix(y_true, y_pred)
    total_instance = tf.reduce_sum(confusion_matrix, axis = 1)
    correct_instances = tf.linalg.tensor_diag_part(confusion_matrix)
    ratio = tf.divide(correct_instances, tf.maximum(1,total_instance))
    uar = non_nan_average(ratio)   
    
    return uar
    


def total_model(max_len, max_features,embedding_dim):
    siamese_network = siamese_model(max_len, max_features,embedding_dim)
    
    input_a = tf.keras.Input(shape = max_len)
    input_b = tf.keras.Input(shape = max_len)
    
    processed_a = siamese_network(input_a)
    processed_b = siamese_network(input_b)
    
    distance = tf.keras.layers.Lambda(euclidean_distance, 
                                      output_shape = eucl_dist_output_shape, 
                                      name = 'Distance' )([processed_a, processed_b])
    
    classification_network = classification_model()
    
    accuracy_a = classification_network(processed_a)
    accuracyoutputa = tf.keras.layers.Lambda(lambda x: x, name = 'accuracyoutput_a')([accuracy_a])
    
    accuracy_b = classification_network(processed_b)
    accuracyoutputb = tf.keras.layers.Lambda(lambda x: x, name = 'accuracyoutput_b')([accuracy_b])
    
    return tf.keras.Model(inputs = [input_a, input_b],outputs = [distance, accuracyoutputa, accuracyoutputb])

## Reading Data

In [4]:
pos_file = open('data/all_positive_8500.txt', encoding="utf8")
pos_lines = pos_file.readlines()
pos_values = [1]*len(pos_lines)

neg_file = open('data/all_negative_3307.txt', encoding="utf8")
neg_lines = neg_file.readlines()
neg_values = [-1]*len(neg_lines)

X = np.array(pos_lines + neg_lines)
y = np.array(pos_values + neg_values)

#removing whitespaces, punctuations, digits and english words from text
converted_X =[]
punctuation_list = ['[',',','-','_','=',':','+','$','@',
                    '~','!',';','/','^',']','{','}','(',')','<','>','.']
whitespace = re.compile(u"[\s\u0020\u00a0\u1680\u180e\u202f\u205f\u3000\u2000-\u200a]+", re.UNICODE)
bangla_digits = u"[\u09E6\u09E7\u09E8\u09E9\u09EA\u09EB\u09EC\u09ED\u09EE\u09EF]+"
english_chars = u"[a-zA-Z0-9]"
punc = u"[(),$%^&*+={}\[\]:\"|\'\~`<>/,¦!?½£¶¼©⅐⅑⅒⅓⅔⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞⅟↉¤¿º;-]+"
bangla_fullstop = u"\u0964"     #bangla fullstop(dari)
punctSeq   = u"['\"“”‘’]+|[.?!,…]+|[:;]+"


for x in X:
#     x = re.sub(bangla_digits, " ", x)
#     x = re.sub(punc, " ", x)
#     x = re.sub(english_chars, " ", x)
#     x = re.sub(bangla_fullstop, " ", x)
#     x = re.sub(punctSeq, " ", x)
#     x = whitespace.sub(" ", x).strip()
    
#     x = re.sub(r'https?:\/\/.*[\r\n]*', '', x, flags=re.MULTILINE)
#     x = re.sub(r'\<a href', ' ', x)
#     x = re.sub(r'&amp;‘:‘ ’', '', x) 
#     x = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]। ,', ' ', x)
#     x = re.sub(r'<br />', ' ', x)
#     x = re.sub(r'\'', ' ', x)
#     x = re.sub(r"[\@$#%~+-\.\'।\"]"," ",x)
#     x = re.sub(r"(?m)^\s+", "", x)
#     x = re.sub("[()]","",x)
#     x = re.sub("[‘’]","",x)
#     x = re.sub("[!]","",x)
#     x = re.sub("[/]","",x)
#     x = re.sub("[:]","",x)
#     x = re.sub('\ |\?|\.|\!|\/|\;|\:', ' ',x)
#     x = x.strip("/")
    converted_X.append(x)
converted_X = np.array(converted_X)

## Splitting Sets and Training the Network

In [5]:
skf = StratifiedKFold(n_splits=10, random_state = 42)
fold_count = 0
max_features = 2500
embedding_dim = 64
batch_size = 256
macro_avg = {'precision':[],'recall':[],'f1-score':[]}
weighted_avg =  {'precision':[],'recall':[],'f1-score':[]}
metrics = ['precision', 'recall', 'f1-score']
epochs = 500
read_from_store_data = 0
read_from_stored_model = 0
maxlen = 250


if read_from_store_data:
    
    if read_from_stored_model:
        
        for fc in range(10):
            with open('train_eval_test_data_'+str(fold_count)+'.pkl','rb') as f:
                X1_train, X2_train, y1_label_train, y2_label_train, y_simil_train, X1_eval, X2_eval, y1_label_eval, y2_label_eval, y_simil_eval, X_test_tokenized_padded, y_test = pickle.load(f)
            final_model = total_model(maxlen, max_features,embedding_dim)
            checkpoint_path = "saved_model/SNN/"+str(fold_count)+"/cp.ckpt"
            checkpoint_dir = os.path.dirname(checkpoint_path)
            fold_count += 1
            latest = tf.train.latest_checkpoint(checkpoint_dir)
            final_model.load_weights(latest)
            predictions =  tf.keras.activations.sigmoid(final_model.predict([X_test_tokenized_padded, X_test_tokenized_padded])[-1])

            predictions = np.where(predictions>0.5, 1, -1)
            print(classification_report(y_test, predictions))
            cl = classification_report(y_test, predictions, output_dict =True)
            
    else:
        for fc in range(10):
            with open('train_eval_test_data_'+str(fold_count)+'.pkl','rb') as f:
                X1_train, X2_train, y1_label_train, y2_label_train, y_simil_train, X1_eval, X2_eval, y1_label_eval, y2_label_eval, y_simil_eval, X_test_tokenized_padded, y_test = pickle.load(f)
            final_model = total_model(maxlen, max_features,embedding_dim)
            losses = {'Distance': contrastive_loss, 'accuracyoutput_a': cross_entropy_loss_defined, 'accuracyoutput_b': cross_entropy_loss_defined}
        #     weights = {'Distance': distance_weight, 'accuracyoutput':class_weight}
            metrices = {'Distance':similarity_accuracy, 'accuracyoutput_a': class_accuracy, 'accuracyoutput_b': class_accuracy}
            final_model.compile(optimizer = tf.keras.optimizers.Adam(1e-4), loss = losses)#, metrics = metrices)
    #         print(final_model.summary())

            checkpoint_path = "saved_model/SNN/"+str(fold_count)+"/cp.ckpt"
            checkpoint_dir = os.path.dirname(checkpoint_path)

            # Create a callback that saves the model's weights
            cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                             save_weights_only=True,
                                                             verbose=1)

            es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=3)
            final_model.fit(x = [X1_train, X2_train], y = [y_simil_train, y1_label_train, y2_label_train], batch_size=batch_size, epochs= epochs,
                           validation_data = ([X1_eval, X2_eval], [y_simil_eval, y1_label_eval, y2_label_eval]), verbose = 0, callbacks=[es, cp_callback])
            predictions =  tf.keras.activations.sigmoid(final_model.predict([X_test_tokenized_padded, X_test_tokenized_padded])[-1])

            predictions = np.where(predictions>0.5, 1, -1)
            print(classification_report(y_test, predictions))
            cl = classification_report(y_test, predictions, output_dict =True)
            for t in metrics:
                macro_avg[t].append(cl['macro avg'][t])
                weighted_avg[t].append(cl['weighted avg'][t])


            tf.keras.backend.clear_session()
            fold_count += 1
else:

    for train_index, test_index in skf.split(converted_X, y):

        X_train, X_test = converted_X[train_index], converted_X[test_index]
        y_train, y_test = y[train_index], y[test_index]


        X_train, X_eval, y_train, y_eval = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42) #approximate train, test, eval size (8500,) (1181,) (2126,)

        #Data processing: Tokenization
        tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=max_features)
        tokenizer.fit_on_texts(X_train) #fitting the tokenizer on the train data
        X_train_tokenized_padded = tf.keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen = maxlen)
        X_test_tokenized_padded = tf.keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen = maxlen)
        X_eval_tokenized_padded = tf.keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(X_eval), maxlen = maxlen)

        #pair making
        X1_train, X2_train, y1_label_train, y2_label_train, y_simil_train = pair_generator(X_train_tokenized_padded, y_train)
        X1_eval, X2_eval, y1_label_eval, y2_label_eval, y_simil_eval = pair_generator(X_eval_tokenized_padded, y_eval)

        #make similar labels 1 and dissimilar labels 0
        y_simil_train = np.where(y_simil_train == -1, 0, y_simil_train)
        y_simil_eval = np.where(y_simil_eval == -1, 0, y_simil_eval)

        with open('train_eval_test_data_'+str(fold_count)+'.pkl','wb') as f:
            pickle.dump([X1_train, X2_train, y1_label_train, y2_label_train, y_simil_train, X1_eval, X2_eval, y1_label_eval, y2_label_eval, y_simil_eval, X_test_tokenized_padded, y_test],f)

        final_model = total_model(maxlen, max_features,embedding_dim)
        losses = {'Distance': contrastive_loss, 'accuracyoutput_a': cross_entropy_loss_defined, 'accuracyoutput_b': cross_entropy_loss_defined}
    #     weights = {'Distance': distance_weight, 'accuracyoutput':class_weight}
        metrices = {'Distance':similarity_accuracy, 'accuracyoutput_a': class_accuracy, 'accuracyoutput_b': class_accuracy}
        final_model.compile(optimizer = tf.keras.optimizers.Adam(1e-4), loss = losses)#, metrics = metrices)
        print(final_model.summary())

        checkpoint_path = "saved_model/SNN/"+str(fold_count)+"/cp.ckpt"
        checkpoint_dir = os.path.dirname(checkpoint_path)

        # Create a callback that saves the model's weights
        cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                         save_weights_only=True,
                                                         verbose=1)

        es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=3)
        final_model.fit(x = [X1_train, X2_train], y = [y_simil_train, y1_label_train, y2_label_train], batch_size=batch_size, epochs= epochs,
                       validation_data = ([X1_eval, X2_eval], [y_simil_eval, y1_label_eval, y2_label_eval]), verbose = 1, callbacks=[es, cp_callback])
        predictions =  tf.keras.activations.sigmoid(final_model.predict([X_test_tokenized_padded, X_test_tokenized_padded])[-1])
        
        predictions = np.where(predictions>0.5, 1, -1)
        print('For fold: ', fold_count+1)
        print(classification_report(y_test, predictions))
        cl = classification_report(y_test, predictions, output_dict =True)
        for t in metrics:
            macro_avg[t].append(cl['macro avg'][t])
            weighted_avg[t].append(cl['weighted avg'][t])



        tf.keras.backend.clear_session()
        fold_count += 1
    
print(macro_avg, weighted_avg)
with open('evaluation_metrics.pkl','wb') as f:
            pickle.dump([macro_avg, weighted_avg],f)    


   
    
    
    

(None, None) (None, 1)
(None, 1) (None, 1)
(None, 1) (None, 1)
(None, 1) (None, 1)

Epoch 00001: saving model to saved_model/SNN/0/cp.ckpt

Epoch 00002: saving model to saved_model/SNN/0/cp.ckpt

Epoch 00003: saving model to saved_model/SNN/0/cp.ckpt

Epoch 00004: saving model to saved_model/SNN/0/cp.ckpt

Epoch 00005: saving model to saved_model/SNN/0/cp.ckpt

Epoch 00006: saving model to saved_model/SNN/0/cp.ckpt

Epoch 00007: saving model to saved_model/SNN/0/cp.ckpt

Epoch 00008: saving model to saved_model/SNN/0/cp.ckpt

Epoch 00009: saving model to saved_model/SNN/0/cp.ckpt

Epoch 00010: saving model to saved_model/SNN/0/cp.ckpt

Epoch 00011: saving model to saved_model/SNN/0/cp.ckpt

Epoch 00012: saving model to saved_model/SNN/0/cp.ckpt

Epoch 00013: saving model to saved_model/SNN/0/cp.ckpt

Epoch 00014: saving model to saved_model/SNN/0/cp.ckpt

Epoch 00015: saving model to saved_model/SNN/0/cp.ckpt

Epoch 00016: saving model to saved_model/SNN/0/cp.ckpt

Epoch 00017: saving 

with cleaning: {'precision': [0.8737148064353946, 0.8672166472679228, 0.8801921836142796, 0.8715060122215652, 0.8675550968368648, 0.8433567506735327, 0.8253182651850254, 0.8826926635283934, 0.8652818769380207, 0.7486556242473364], 'recall': [0.9227154789408211, 0.9149084769859606, 0.9314448196196907, 0.9019157632841657, 0.887318286831349, 0.8578549848942598, 0.8858308157099697, 0.9350267379679145, 0.9182887700534759, 0.8084313725490195], 'f1-score': [0.8913374861675805, 0.884401627160035, 0.8984086021505377, 0.8843006114011795, 0.8764491616498806, 0.8500299199120493, 0.84018010356836, 0.901229779577819, 0.883228436844397, 0.7394433901345427]} {'precision': [0.9217439407426706, 0.9158163791100448, 0.9282289641448055, 0.9098687194062373, 0.9016130318285529, 0.8797409667898461, 0.8922219540538695, 0.9309129570633766, 0.9176357429746849, 0.8433397494701], 'recall': [0.9060118543607113, 0.9000846740050804, 0.9119390347163421, 0.9026248941574937, 0.8975444538526672, 0.8763759525825572, 0.8569009314140559, 0.9144067796610169, 0.8983050847457628, 0.7533898305084745], 'f1-score': [0.9088858665644349, 0.9031131244303306, 0.9147016288365064, 0.9045352981044286, 0.8988845236442377, 0.8776533359847637, 0.8628976681021718, 0.9171277843185681, 0.9017186654026758, 0.7660080384658414]}

without cleaning: {'precision': [0.8704078988766124, 0.8617863358221524, 0.8860144499679383, 0.8690471464019851, 0.8583848544487075, 0.838525855838097, 0.8043628237180999, 0.8708490753628952, 0.8566143829535365, 0.7508474576271187], 'recall': [0.9268189088324151, 0.9123014039452639, 0.932207215212369, 0.8964617025057757, 0.8844570819264261, 0.8662360049760085, 0.8669273147325396, 0.9255258467023173, 0.9124064171122994, 0.811301247771836], 'f1-score': [0.8888315919145784, 0.8792146693587763, 0.9034319222502569, 0.8808025755771784, 0.8695848688349876, 0.8500938356415282, 0.8154850581388009, 0.8892215864733726, 0.8743656859436988, 0.7381340579710144]} {'precision': [0.9238284118183931, 0.9132492880571815, 0.9298607932993456, 0.9064218026108168, 0.8972027930161791, 0.882055133632093, 0.878839712895249, 0.9230749786829608, 0.912693555911136, 0.8479459925308819], 'recall': [0.9026248941574937, 0.8950042337002541, 0.9170194750211685, 0.9000846740050804, 0.890770533446232, 0.8738357324301439, 0.8323454699407282, 0.9033898305084745, 0.8898305084745762, 0.7508474576271187], 'f1-score': [0.906040062741677, 0.8984062007246472, 0.9193505224491353, 0.9018708052278757, 0.892684364493046, 0.8763109514570422, 0.8399964131113506, 0.906680086232266, 0.8937901007167816, 0.7635608572832226]}