# Sentiment Analysis through Metric Learning

In this project, we are utilizing the 

"@inproceedings{sazzed2019sentiment, title={A Sentiment Classification in Bengali and Machine Translated English Corpus}, author={Sazzed, Salim and Jayarathna, Sampath}, booktitle={2019 IEEE 20th International Conference on Information Reuse and Integration for Data Science (IRI)}, pages={107--114}, year={2019}, organization={IEEE} }"

dataset to implement sentiment analysis with a Siamese network.

In total, the dataset contains 3307 negative comments and 8500 positive comments.

We will apply metric learning techniques to carry out the analysis, using a Siamese neural network architecture. The network has two parts: one for distance learning and the other for sentiment classification.

## Files needed to import

In [None]:
import glob
import os
import numpy as np
import pandas as pd
import tensorflow as tf
# Turn on memory growth for the first visible GPU. The call is skipped when
# TensorFlow reports no GPU, so the notebook also runs on CPU-only machines
# instead of failing with an IndexError on the empty device list.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
from sklearn.utils.class_weight import compute_class_weight
from itertools import combinations
import random
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None 
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
from bnlp import NLTKTokenizer
from bnlp.corpus import stopwords, punctuations
from bnlp.corpus.util import remove_stopwords
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import re
from sklearn.metrics import classification_report

## Reading Data

In [None]:
# Load the corpus. Every line of a file is one comment; positive comments are
# labelled 1 and negative comments -1. The `with` blocks guarantee the file
# handles are closed once the lines have been read.
with open('data/all_positive_8500.txt', encoding="utf8") as pos_file:
    pos_lines = pos_file.readlines()
pos_values = [1]*len(pos_lines)

with open('data/all_negative_3307.txt', encoding="utf8") as neg_file:
    neg_lines = neg_file.readlines()
neg_values = [-1]*len(neg_lines)

X = np.array(pos_lines + neg_lines)
y = np.array(pos_values + neg_values)

#removing whitespaces, punctuations, digits and english words from text
punctuation_list = ['[',',','-','_','=',':','+','$','@',
                    '~','!',';','/','^',']','{','}','(',')','<','>','.']

# Regex sources are written as raw strings so that backslash escapes reach the
# `re` module untouched (no invalid-escape warnings from the Python parser).
whitespace = re.compile(r"[\s\u0020\u00a0\u1680\u180e\u202f\u205f\u3000\u2000-\u200a]+", re.UNICODE)
bangla_digits = u"[\u09E6\u09E7\u09E8\u09E9\u09EA\u09EB\u09EC\u09ED\u09EE\u09EF]+"
english_chars = u"[a-zA-Z0-9]"
punc = r'''[(),$%^&*+={}\[\]:"|'\~`<>/,¦!?½£¶¼©⅐⅑⅒⅓⅔⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞⅟↉¤¿º;-]+'''
bangla_fullstop = u"\u0964"     #bangla fullstop(dari)
punctSeq   = r'''['"“”‘’]+|[.?!,…]+|[:;]+'''

# Substitutions applied BEFORE whitespace is collapsed; each match becomes a
# single space. Order matters and mirrors the original cleaning sequence.
_PRE_WHITESPACE_RULES = tuple(
    (re.compile(source), " ")
    for source in (bangla_digits, punc, english_chars, bangla_fullstop, punctSeq)
)

# Substitutions applied AFTER whitespace is collapsed, as (pattern, replacement).
# NOTE(review): the URL / HTML / entity rules run after Latin letters, digits
# and most punctuation have already been replaced by spaces, so they cannot
# match any more. They are kept, in their original order, so the cleaned text
# stays identical; moving them ahead of the stripping rules would change output.
_POST_WHITESPACE_RULES = (
    (re.compile(r'https?:\/\/.*[\r\n]*', flags=re.MULTILINE), ''),
    (re.compile(r'\<a href'), ' '),
    (re.compile(r'&amp;‘:‘ ’'), ''),
    (re.compile(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]। ,'), ' '),
    (re.compile(r'<br />'), ' '),
    (re.compile(r'\''), ' '),
    (re.compile(r"[\@$#%~+-\.\'।\"]"), " "),
    (re.compile(r"(?m)^\s+"), ""),
    (re.compile("[()]"), ""),
    (re.compile("[‘’]"), ""),
    (re.compile("[!]"), ""),
    (re.compile("[/]"), ""),
    (re.compile("[:]"), ""),
    (re.compile(r'\ |\?|\.|\!|\/|\;|\:'), ' '),
)


def clean_text(text):
    """Return *text* with digits, Latin characters and punctuation removed.

    The rules run in a fixed order: the stripping rules, then whitespace
    collapsing and trimming, then the remaining clean-up rules, and finally
    any leading or trailing "/" is stripped.
    """
    for pattern, replacement in _PRE_WHITESPACE_RULES:
        text = pattern.sub(replacement, text)
    text = whitespace.sub(" ", text).strip()
    for pattern, replacement in _POST_WHITESPACE_RULES:
        text = pattern.sub(replacement, text)
    return text.strip("/")


converted_X = np.array([clean_text(x) for x in X])

## Splitting Sets and Training the Network

In [None]:
# 10-fold stratified cross-validation of a bidirectional-LSTM classifier.
# Per-fold macro and weighted precision/recall/F1 on the held-out fold are
# collected in `macro_avg` and `weighted_avg`.
#
# shuffle=True is needed for random_state to have any effect; recent
# scikit-learn releases raise ValueError when a seed is given without it.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
fold_count = 0
max_features = 2500
embedding_dim = 64
batch_size = 32
macro_avg = {'precision':[],'recall':[],'f1-score':[]}
weighted_avg =  {'precision':[],'recall':[],'f1-score':[]}
metrics = ['precision', 'recall', 'f1-score']
epochs = 500

for train_index, test_index in skf.split(converted_X, y):

    X_train, X_test = converted_X[train_index], converted_X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Carve a validation split (used for early stopping) out of the training fold.
    X_train, X_eval, y_train, y_eval = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42) #approximate train, test, eval size (8500,) (1181,) (2126,)

    #Data processing: Tokenization
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(X_train) #fitting the tokenizer on the train data
    X_train_tokenized_padded = tf.keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(X_train))
    X_test_tokenized_padded = tf.keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(X_test))
    X_eval_tokenized_padded = tf.keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(X_eval))

    #sample weights for training
    # Keyword arguments are used because scikit-learn made `classes` and `y`
    # keyword-only in compute_class_weight.
    y_orig_combined = np.array(y_train).reshape(-1,)
    class_weights_combined = compute_class_weight(
        class_weight='balanced', classes=np.array([-1, 1]), y=y_orig_combined)

    #making sample weight array
    # One np.where over the untouched labels: chaining two replacements could
    # overwrite a freshly written weight that happens to equal 1.
    y_orig_combined = np.where(y_orig_combined == -1,
                               class_weights_combined[0],
                               class_weights_combined[1])

    # BinaryCrossentropy expects targets in {0, 1}; the corpus labels are
    # -1/+1, so map them for training (-1 -> 0, +1 -> 1). Evaluation below is
    # still reported in the original -1/+1 label space.
    y_train_binary = (y_train == 1).astype('float32')
    y_eval_binary = (y_eval == 1).astype('float32')

    #the RNN model
    rnn_model = tf.keras.Sequential([
        tf.keras.layers.Embedding(
            input_dim=max_features,
            output_dim=embedding_dim,
            # Use masking to handle the variable sequence lengths
            mask_zero=True),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1)
    ])

    # The final Dense layer has no activation, hence from_logits=True.
    rnn_model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

    es = tf.keras.callbacks.EarlyStopping(patience=3)

    checkpoint_path = "saved_model/RNN/"+str(fold_count)+"/cp.ckpt"
    checkpoint_dir = os.path.dirname(checkpoint_path)

    # Create a callback that saves the model's weights
    cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                     save_weights_only=True,
                                                     verbose=1)

    hist = rnn_model.fit(x=X_train_tokenized_padded, y=y_train_binary,
                         validation_data=(X_eval_tokenized_padded, y_eval_binary),
                         sample_weight=y_orig_combined, batch_size=batch_size,
                         epochs=epochs, verbose=0, callbacks=[es, cp_callback])

    # Convert logits to probabilities, flatten the (n, 1) output to (n,), and
    # map back to the -1/+1 labels used by y_test.
    logits = rnn_model.predict(X_test_tokenized_padded)
    probabilities = np.asarray(tf.keras.activations.sigmoid(logits)).reshape(-1)
    predictions = np.where(probabilities > 0.5, 1, -1)
    print(classification_report(y_test, predictions))
    cl = classification_report(y_test, predictions, output_dict =True)
    for t in metrics:
        macro_avg[t].append(cl['macro avg'][t])
        weighted_avg[t].append(cl['weighted avg'][t])

    # Reset Keras global state before the next fold's model is built.
    tf.keras.backend.clear_session()
    fold_count += 1

print(macro_avg, weighted_avg)
    


   
    
    
    