# Importing libraries

In [1]:
import torch
import torch.nn as nn 
import pandas as pd 
import spacy
from torch.utils.data import DataLoader
from Encoder import Encoder_Sentiment_Analysis
from utils import POS_VOCAB
from typing import List, Tuple
from utils import *

# HYPER-Parameters

In [2]:
# Every sentence is padded or truncated to exactly this many tokens.
MAX_LEN_SENT = 64
# Filler appended to the lemma and POS lists of sentences shorter than MAX_LEN_SENT.
PADDING_TOKEN = "<PAD>"
# Passed to get_positional_encodings and to the model as input_dim.
# NOTE(review): the saved cell outputs in this notebook show feature sizes of 50 and 100,
# not 200 — confirm this value matches the checkpoint being loaded.
INPUT_DIM = 200
# Number of samples per DataLoader batch.
BATCH_SIZE = 32
# Passed to the model as num_heads.
NUM_HEADS = 8

# GLOBAL VARS 

In [3]:
# spaCy pipeline used by pre_process_single_sent to tokenise, lemmatise and POS-tag sentences.
nlp = spacy.load('en_core_web_sm')

# Loading the dev set

In [4]:
# NOTE: despite its name, test_df holds the dev split ('dev.csv').
test_df = pd.read_csv('dev.csv')
print(test_df.shape)

(4855, 2)


In [5]:
# Preview the first rows of the dev split.
test_df.head()

Unnamed: 0,text_id,sentence
0,r2-0017684,They were really quiet during lunch hour on a ...
1,r1-0056793,"They were, however, delicious and because my h..."
2,r1-0005378,We opted for a desert beverages as opposed to ...
3,r1-0065594,"Eat, sleep, repeat."
4,r1-0031164,I watched a number of those people who were wa...


# Pre-processing 

## Required functions

In [6]:
def pre_process_single_sent(sent: str) -> Tuple[List[str], List[str], List[int]]:
    '''
    Lemmatise and POS-tag a single sentence, then pad or truncate it to MAX_LEN_SENT.

    The sentence is lower-cased and parsed with the module-level spaCy pipeline
    ``nlp``; only alphabetic tokens (``token.is_alpha``) are kept.

    Returns a tuple ``(my_lemmas, my_pos, my_mask)`` of three lists, each of
    length MAX_LEN_SENT:
      * my_lemmas -- ``token.lemma_`` of every kept token, padded with PADDING_TOKEN
      * my_pos    -- ``token.pos_`` of every kept token, padded with PADDING_TOKEN
      * my_mask   -- 1 for a real token, 0 for a padding slot
    '''
    # Single pass over the parse; the slice performs the truncation.
    kept_tokens = [token for token in nlp(sent.lower()) if token.is_alpha][:MAX_LEN_SENT]

    # Zero when the sentence was truncated, so the padding lists below are empty.
    pad_count = MAX_LEN_SENT - len(kept_tokens)
    padding = [PADDING_TOKEN] * pad_count

    my_lemmas = [token.lemma_ for token in kept_tokens] + padding
    my_pos = [token.pos_ for token in kept_tokens] + padding
    my_mask = [1] * len(kept_tokens) + [0] * pad_count

    return my_lemmas, my_pos, my_mask

In [7]:
def pre_process_list_of_sentences(list_of_sent: List[str]) -> Tuple[List[List[str]], List[List[str]], List[List[int]]]:
    '''
    Run pre_process_single_sent over every sentence and collect the results.

    Returns three parallel lists (lemmas, POS tags, masks); each holds one
    entry per input sentence, in input order.
    '''
    per_sentence = [pre_process_single_sent(sentence) for sentence in list_of_sent]

    # Split the (lemmas, pos, mask) triples into three parallel lists.
    all_lemmas = [triple[0] for triple in per_sentence]
    all_pos_tags = [triple[1] for triple in per_sentence]
    all_masks = [triple[2] for triple in per_sentence]

    return all_lemmas, all_pos_tags, all_masks

## doing the pre-processing

In [8]:
# Pull the 'sentence' column out of the frame as a plain Python list.
list_of_all_sentences = test_df['sentence'].tolist()
print(type(list_of_all_sentences))
print(len(list_of_all_sentences))

<class 'list'>
4855


In [9]:
# Lemmas, POS tags and padding masks for every sentence, each padded/truncated to MAX_LEN_SENT.
lemmas_list_all, pos_tags_list_all, mask_list_all = pre_process_list_of_sentences(list_of_all_sentences)

In [None]:
# The three helpers below are not defined in this notebook; presumably they come from
# `from utils import *` — confirm there for their exact return types.
lemma_tensors_list = convert_lemmas_list_into_tensor_embeddings(lemmas_list_all)
pos_indices_list = convert_pos_tags_list_into_list_of_indices_given_vocab(pos_tags_list_all)
positional_tensors_list = get_positional_encodings(mask_list_all, MAX_LEN_SENT, INPUT_DIM)

In [15]:
# Sanity check: the three lists should each have one entry per sentence.
print(len(lemma_tensors_list), len(pos_indices_list), len(positional_tensors_list))

4855 4855 4855


In [16]:
# Add each sentence's positional encoding to its lemma embedding.
lemma_add_positional_tensors_list = [
    lemma_tensor + positional_tensor
    for lemma_tensor, positional_tensor in zip(lemma_tensors_list, positional_tensors_list)
]

print(len(lemma_add_positional_tensors_list))
print(lemma_add_positional_tensors_list[0].shape)

4855
torch.Size([256, 50])


In [17]:
# Pair every combined embedding with the POS indices of the same sentence;
# this list is what the DataLoader iterates over.
data = [
    (lemma_add_positional_tensors_list[i], pos_indices_list[i])
    for i in range(len(lemma_add_positional_tensors_list))
]

print(len(data))

4855


In [18]:
# shuffle=False keeps the batches (and therefore the predictions) in input-row order.
test_loader = DataLoader(data, batch_size=BATCH_SIZE, shuffle=False)

# LOADING THE MODEL

In [19]:
model = Encoder_Sentiment_Analysis(input_dim=INPUT_DIM, num_heads=NUM_HEADS)
# map_location='cpu': nothing in this notebook moves the model to a GPU, so force the
# checkpoint tensors onto the CPU; without it a GPU-saved file fails to load on a
# CPU-only machine.
# NOTE(security): torch.load unpickles the file — only load checkpoints you trust.
model.load_state_dict(torch.load('sentiment.params', map_location='cpu'))
# Switch to evaluation mode; as the last expression this also displays the architecture.
model.eval()

Encoder_Sentiment_Analysis(
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=50, out_features=50, bias=True)
    )
    (linear1): Linear(in_features=50, out_features=2048, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=2048, out_features=50, bias=True)
    (norm1): LayerNorm((50,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((50,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=50, out_features=50, bias=True)
        )
        (linear1): Linear(in_features=50, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (li

# MAKING THE PREDICTIONS

In [20]:
output_list = []  # one predicted sentiment per input sentence, in loader order

# Inference only, so the whole loop runs with gradient tracking switched off.
with torch.no_grad():
    for batch in test_loader:
        lemma_batch, pos_parts = batch[0], batch[1]
        # pos_parts arrives as a sequence of tensors; stack them along dim=1
        # (presumably giving (batch, seq_len) — TODO confirm against the model's forward).
        pos_batch = torch.stack(pos_parts, dim=1)

        scores = model([lemma_batch, pos_batch])
        # Index of the highest score in each row.
        predicted = torch.max(scores, dim=1)[1]

        # Shift class indices down by one before storing
        # (presumably maps {0, 1, 2} to {-1, 0, 1} — confirm against the training labels).
        output_list.extend((predicted - 1).tolist())

# Writing the output_list to answer.txt

In [21]:
# Append mode: existing content of answer.txt is kept, so re-running this cell
# adds the same lines again.
with open('answer.txt', 'a') as out_file:
    out_file.writelines(f"{ans}\n" for ans in output_list)

# Loading the test set

In [4]:
# NOTE: despite its name, dev_set holds the test split ('test.csv').
dev_set = pd.read_csv('test.csv')
print(dev_set.shape)

(5110, 2)


In [5]:
# Preview the first rows of the test split.
dev_set.head()

Unnamed: 0,text_id,sentence
0,r1-0086521,A helpful valet at the Bellagio said it was a ...
1,r1-0044715,"People often ask ""what happened to the human c..."
2,r1-0060690,He explained there would be a diagnostic fee o...
3,r1-0016852,I had initially purchased a massage on Groupon.
4,r2-0006040,Primarily do high-end cars as they get referra...


In [8]:
# Pull the 'sentence' column out of the frame as a plain Python list.
list_of_all_sentences = dev_set['sentence'].tolist()
print(type(list_of_all_sentences))
print(len(list_of_all_sentences))

<class 'list'>
5110


In [9]:
# Lemmas, POS tags and padding masks for every sentence, each padded/truncated to MAX_LEN_SENT.
lemmas_list_all, pos_tags_list_all, mask_list_all = pre_process_list_of_sentences(list_of_all_sentences)

# The three helpers below are not defined in this notebook; presumably they come from
# `from utils import *` — confirm there for their exact return types.
lemma_tensors_list = convert_lemmas_list_into_tensor_embeddings(lemmas_list_all)
pos_indices_list = convert_pos_tags_list_into_list_of_indices_given_vocab(pos_tags_list_all)
positional_tensors_list = get_positional_encodings(mask_list_all, MAX_LEN_SENT, INPUT_DIM)

Started converting the lemmas into tensor embeddings
Converted all the lemmas into embeddings
Started to map pos_tags into indices using vocab
Done mapping pos_tags to index 
5110
****************************************************************************************************



my_index = 0
Start generating positional encodings
Got the positional encoding tensors
got the positional_encoding
****************************************************************************************************



my_index = 1
Start generating positional encodings
Got the positional encoding tensors
got the positional_encoding
****************************************************************************************************



my_index = 2
Start generating positional encodings
Got the positional encoding tensors
got the positional_encoding
****************************************************************************************************



my_index = 3
Start generating positional encodings
Got the 

In [10]:
# Sanity check: the three lists should each have one entry per sentence.
print(len(lemma_tensors_list), len(pos_indices_list), len(positional_tensors_list))

5110 5110 5110


In [11]:
# Add each sentence's positional encoding to its lemma embedding.
lemma_add_positional_tensors_list = [
    lemma_tensor + positional_tensor
    for lemma_tensor, positional_tensor in zip(lemma_tensors_list, positional_tensors_list)
]

print(len(lemma_add_positional_tensors_list))
print(lemma_add_positional_tensors_list[0].shape)

5110
torch.Size([64, 100])


In [12]:
# Pair every combined embedding with the POS indices of the same sentence;
# this list is what the DataLoader iterates over.
data = [
    (lemma_add_positional_tensors_list[i], pos_indices_list[i])
    for i in range(len(lemma_add_positional_tensors_list))
]

print(len(data))

5110


In [13]:
# shuffle=False keeps the batches (and therefore the predictions) in input-row order.
test_loader = DataLoader(data, batch_size=BATCH_SIZE, shuffle=False)

In [14]:
model = Encoder_Sentiment_Analysis(input_dim=INPUT_DIM, num_heads=NUM_HEADS)
# map_location='cpu': nothing in this notebook moves the model to a GPU, so force the
# checkpoint tensors onto the CPU; without it a GPU-saved file fails to load on a
# CPU-only machine.
# NOTE(security): torch.load unpickles the file — only load checkpoints you trust.
model.load_state_dict(torch.load('sentiment.params', map_location='cpu'))
# Switch to evaluation mode; as the last expression this also displays the architecture.
model.eval()

Encoder_Sentiment_Analysis(
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=100, out_features=100, bias=True)
    )
    (linear1): Linear(in_features=100, out_features=2048, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=2048, out_features=100, bias=True)
    (norm1): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=100, out_features=100, bias=True)
        )
        (linear1): Linear(in_features=100, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
  

In [15]:
output_list = []  # one predicted sentiment per input sentence, in loader order

# Inference only, so the whole loop runs with gradient tracking switched off.
with torch.no_grad():
    for batch in test_loader:
        lemma_batch, pos_parts = batch[0], batch[1]
        # pos_parts arrives as a sequence of tensors; stack them along dim=1
        # (presumably giving (batch, seq_len) — TODO confirm against the model's forward).
        pos_batch = torch.stack(pos_parts, dim=1)

        scores = model([lemma_batch, pos_batch])
        # Index of the highest score in each row.
        predicted = torch.max(scores, dim=1)[1]

        # Shift class indices down by one before storing
        # (presumably maps {0, 1, 2} to {-1, 0, 1} — confirm against the training labels).
        output_list.extend((predicted - 1).tolist())

In [16]:
# Append mode: existing content of answer.txt is kept, so re-running this cell
# adds the same lines again.
with open('answer.txt', 'a') as out_file:
    out_file.writelines(f"{ans}\n" for ans in output_list)