# Sentiment Analysis through Metric Learning

In this project, we are utilizing the 

"@inproceedings{sazzed2019sentiment, title={A Sentiment Classification in Bengali and Machine Translated English Corpus}, author={Sazzed, Salim and Jayarathna, Sampath}, booktitle={2019 IEEE 20th International Conference on Information Reuse and Integration for Data Science (IRI)}, pages={107--114}, year={2019}, organization={IEEE} }"

data to implement sentiment analysis through siamese network. 

There are in total 3307 negative comments and 8500 positive comments present

We will implement Metric Learning Techniques to proceed with the analysis. The Siamese Neural Network architecture would help us to achieve the implementation. The network would have two parts. One for distance learning, the other for sentiment classification.

## Files needed to import

In [1]:
import glob
import os
import numpy as np
import pandas as pd
import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU') 
tf.config.experimental.set_memory_growth(physical_devices[0], True)
from sklearn.utils.class_weight import compute_class_weight
from itertools import combinations
import random
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None 
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import re
from sklearn.metrics import classification_report
import pickle

## Reading Data

In [2]:
pos_file = open('data/all_positive_8500.txt', encoding="utf8")
pos_lines = pos_file.readlines()
pos_values = [1]*len(pos_lines)

neg_file = open('data/all_negative_3307.txt', encoding="utf8")
neg_lines = neg_file.readlines()
neg_values = [-1]*len(neg_lines)

X = np.array(pos_lines + neg_lines)
y = np.array(pos_values + neg_values)

#removing whitespaces, punctuations, digits and english words from text
converted_X =[]
punctuation_list = ['[',',','-','_','=',':','+','$','@',
                    '~','!',';','/','^',']','{','}','(',')','<','>','.']
whitespace = re.compile(u"[\s\u0020\u00a0\u1680\u180e\u202f\u205f\u3000\u2000-\u200a]+", re.UNICODE)
bangla_digits = u"[\u09E6\u09E7\u09E8\u09E9\u09EA\u09EB\u09EC\u09ED\u09EE\u09EF]+"
english_chars = u"[a-zA-Z0-9]"
punc = u"[(),$%^&*+={}\[\]:\"|\'\~`<>/,¦!?½£¶¼©⅐⅑⅒⅓⅔⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞⅟↉¤¿º;-]+"
bangla_fullstop = u"\u0964"     #bangla fullstop(dari)
punctSeq   = u"['\"“”‘’]+|[.?!,…]+|[:;]+"


for x in X:
#     x = re.sub(bangla_digits, " ", x)
#     x = re.sub(punc, " ", x)
#     x = re.sub(english_chars, " ", x)
#     x = re.sub(bangla_fullstop, " ", x)
#     x = re.sub(punctSeq, " ", x)
#     x = whitespace.sub(" ", x).strip()
    
#     x = re.sub(r'https?:\/\/.*[\r\n]*', '', x, flags=re.MULTILINE)
#     x = re.sub(r'\<a href', ' ', x)
#     x = re.sub(r'&amp;‘:‘ ’', '', x) 
#     x = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]। ,', ' ', x)
#     x = re.sub(r'<br />', ' ', x)
#     x = re.sub(r'\'', ' ', x)
#     x = re.sub(r"[\@$#%~+-\.\'।\"]"," ",x)
#     x = re.sub(r"(?m)^\s+", "", x)
#     x = re.sub("[()]","",x)
#     x = re.sub("[‘’]","",x)
#     x = re.sub("[!]","",x)
#     x = re.sub("[/]","",x)
#     x = re.sub("[:]","",x)
#     x = re.sub('\ |\?|\.|\!|\/|\;|\:', ' ',x)
#     x = x.strip("/")
    converted_X.append(x)
converted_X = np.array(converted_X)

## Splitting Sets and Training the Network

In [3]:
skf = StratifiedKFold(n_splits=10, random_state = 42)
fold_count = 0
max_features = 2500
embedding_dim = 64
batch_size = 256
macro_avg = {'precision':[],'recall':[],'f1-score':[]}
weighted_avg =  {'precision':[],'recall':[],'f1-score':[]}
metrics = ['precision', 'recall', 'f1-score']
epochs = 500

for train_index, test_index in skf.split(converted_X, y):
    
    X_train, X_test = converted_X[train_index], converted_X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    
    X_train, X_eval, y_train, y_eval = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42) #approximate train, test, eval size (8500,) (1181,) (2126,)
    
    #Data processing: Tokenization
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(X_train) #fitting the tokenizer on the train data
    X_train_tokenized_padded = tf.keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(X_train))
    X_test_tokenized_padded = tf.keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(X_test))
    X_eval_tokenized_padded = tf.keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(X_eval))
    
    #sample weights for training
    y_orig_combined = np.array(y_train).reshape(-1,)
    class_weights_combined = compute_class_weight('balanced',[-1,1], y_orig_combined)

    #making sample weight array
    y_orig_combined = np.where(y_orig_combined==-1, class_weights_combined[0], y_orig_combined)
    y_orig_combined = np.where(y_orig_combined==1, class_weights_combined[1], y_orig_combined)
    

    
    
    #the RNN model
    
    rnn_model = tf.keras.Sequential([
        tf.keras.layers.Embedding(
            input_dim=max_features,
            output_dim=embedding_dim,
            # Use masking to handle the variable sequence lengths
            mask_zero=True),
        tf.keras.layers.LSTM(64),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1)
    ])
    
    rnn_model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])
     
    es = tf.keras.callbacks.EarlyStopping(patience=3)
    
    checkpoint_path = "saved_model/RNN/"+str(fold_count)+"/cp.ckpt"
    checkpoint_dir = os.path.dirname(checkpoint_path)

    # Create a callback that saves the model's weights
    cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                     save_weights_only=True,
                                                     verbose=1)

    hist = rnn_model.fit(x=[X_train_tokenized_padded], y=[y_train], 
                        validation_data = ([X_eval_tokenized_padded], [y_eval]) ,
                        sample_weight = y_orig_combined, batch_size=batch_size, epochs=epochs, verbose=0, callbacks=[es,cp_callback])
    predictions = tf.keras.activations.sigmoid(rnn_model.predict(X_test_tokenized_padded))
    predictions = np.where(predictions>0.5, 1, -1)
    print('For fold: ', fold_count+1)
    print(classification_report(y_test, predictions))
    cl = classification_report(y_test, predictions, output_dict =True)
    for t in metrics:
        macro_avg[t].append(cl['macro avg'][t])
        weighted_avg[t].append(cl['weighted avg'][t])
    

    tf.keras.backend.clear_session()
    fold_count += 1
    
print(macro_avg, weighted_avg)

with open('evaluation_metrics.pkl','wb') as f:
            pickle.dump([macro_avg, weighted_avg],f)


    


   
    
    
    

  ...
    to  
  ['...']

Epoch 00001: saving model to saved_model/RNN/0/cp.ckpt

Epoch 00002: saving model to saved_model/RNN/0/cp.ckpt

Epoch 00003: saving model to saved_model/RNN/0/cp.ckpt

Epoch 00004: saving model to saved_model/RNN/0/cp.ckpt

Epoch 00005: saving model to saved_model/RNN/0/cp.ckpt

Epoch 00006: saving model to saved_model/RNN/0/cp.ckpt
              precision    recall  f1-score   support

          -1       0.37      1.00      0.54       331
           1       1.00      0.34      0.50       850

    accuracy                           0.52      1181
   macro avg       0.68      0.67      0.52      1181
weighted avg       0.82      0.52      0.51      1181

  ...
    to  
  ['...']

Epoch 00001: saving model to saved_model/RNN/1/cp.ckpt

Epoch 00002: saving model to saved_model/RNN/1/cp.ckpt

Epoch 00003: saving model to saved_model/RNN/1/cp.ckpt

Epoch 00004: saving model to saved_model/RNN/1/cp.ckpt

Epoch 00005: saving model to saved_model/RNN/1/cp.ckpt

Epoch 

In [4]:
with open('evaluation_metrics_RNN.pkl','wb') as f:
            pickle.dump([macro_avg, weighted_avg],f)

With clearning: {'precision': [0.6511415525114155, 0.8284932317032692, 0.6756900212314225, 0.8334188353076517, 0.7893557814880452, 0.7825139798451003, 0.8148084815321477, 0.8360684357541899, 0.7879617678864199, 0.6942630630630631], 'recall': [0.5505882352941176, 0.8869273147325396, 0.6405882352941177, 0.8811533676914874, 0.8307819441976186, 0.8357934956459925, 0.8795201706060067, 0.8980392156862745, 0.8531550802139037, 0.7402317290552585], 'f1-score': [0.323998153942053, 0.8440396817727961, 0.4794831122046719, 0.8493355885052079, 0.8026782223648852, 0.7942179806893024, 0.8266677818879644, 0.8519547950625863, 0.7928576754822032, 0.6684540999795048]} {'precision': [0.8044502182578807, 0.8929072491243175, 0.8182106638909412, 0.8891623644516291, 0.8504936473698235, 0.8539507230764023, 0.8885274252268301, 0.9015392718492568, 0.8712950519495339, 0.7964400061078026], 'recall': [0.35309060118543606, 0.8611346316680779, 0.4826418289585097, 0.8687552921253175, 0.8281117696867062, 0.8154106689246401, 0.842506350550381, 0.8677966101694915, 0.8084745762711865, 0.6805084745762712], 'f1-score': [0.2623696636591312, 0.8667309324010603, 0.4616638296476952, 0.873106404964729, 0.8338103239215481, 0.8232391025735774, 0.8496936001955112, 0.8732961003879509, 0.8179218372422998, 0.6963130990475872]}

Without cleaning: {'precision': [0.6826092901511895, 0.8133391417348416, 0.7110969387755102, 0.8438818765619562, 0.7902655167521082, 0.785650723025584, 0.8052900812085535, 0.8248587570621468, 0.7896372891494545, 0.673795366246565], 'recall': [0.6661364848054026, 0.8770872578638707, 0.7335294117647059, 0.8863675137728808, 0.8344712990936556, 0.8194597476452816, 0.8671814465967655, 0.8870231729055258, 0.8555971479500891, 0.7092335115864528], 'f1-score': [0.5200483077912827, 0.825439608081501, 0.6152250603241501, 0.8594001969760923, 0.803763592683995, 0.797845736353199, 0.8169339786783083, 0.8394578380749026, 0.7940837268751371, 0.623763013406112]} {'precision': [0.8205526373966149, 0.8864651282467695, 0.8380577252069329, 0.8942497428051538, 0.8530201661867929, 0.8429904711133549, 0.8787908837934482, 0.8933256726994159, 0.8735394798822602, 0.7841345018767256], 'recall': [0.5207451312447079, 0.8416596104995766, 0.6164267569856055, 0.8789161727349704, 0.8281117696867062, 0.8264182895850973, 0.834038950042337, 0.8559322033898306, 0.809322033898305, 0.6305084745762712], 'f1-score': [0.5120116106284465, 0.8488234449008932, 0.6246747658892301, 0.8824201774734961, 0.8341402846657429, 0.8312447343877829, 0.8415253364011087, 0.8621209861271845, 0.8187688348254403, 0.6459632653585351]}