In [1]:
import pandas as pd
import nltk
import re
import string
import os
from tqdm import tqdm
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import IterableDataset, DataLoader, Dataset
import torch.optim as optim

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score   
import gc

### Loading the data

In [2]:
file_names = ['mapping', 'test_labels', 'test_text', 'train_labels', 'train_text', 'val_labels', 'val_text']
f1 = lambda file_name: f'https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/emotion/{file_name}.txt'

Below, we create all the variables containing the data. Also a variable called 'all_inputs' is created (after the preprocessing), which stores all the input data together. We will use this for the tokenization.

In [3]:
mapping = pd.read_csv(f1('mapping'), sep='\t', names=['mapping', 'sentiment'])
test_labels = pd.read_csv(f1('test_labels'), sep='\t', names=['sentiment'])
train_labels = pd.read_csv(f1('train_labels'), sep='\t', names=['sentiment'])
val_labels = pd.read_csv(f1('val_labels'), sep='\t', names=['sentiment'])
test_text = pd.read_csv(f1('test_text'), sep='\t', names = ['input'])
train_text = pd.read_csv(f1('train_text'), sep='\t', names = ['input'])
val_text = pd.read_csv(f1('val_text'), sep='\t', names = ['input'])

all_input = pd.concat([train_text, val_text, test_text])

In [4]:
train_text

Unnamed: 0,input
0,“Worry is a down payment on a problem you may ...
1,My roommate: it's okay that we can't spell bec...
2,No but that's so cute. Atsu was probably shy a...
3,Rooneys fucking untouchable isn't he? Been fuc...
4,it's pretty depressing when u hit pan on ur fa...
...,...
3252,I get discouraged because I try for 5 fucking ...
3253,The @user are in contention and hosting @user ...
3254,@user @user @user @user @user as a fellow UP g...
3255,You have a #problem? Yes! Can you do #somethin...


In [5]:
mapping

Unnamed: 0,mapping,sentiment
0,0,anger
1,1,joy
2,2,optimism
3,3,sadness


As a first step, we will use only the 2 sentiments anger and joy (0 and 1). And then, as asked in the task description, we will exchange one of the sentiments. Hence, we will use joy and sadness in a second step (1 and 3). We will need this in later tasks.


In [6]:
anger_joy_test_idx = test_labels['sentiment'].isin([0, 1])
anger_joy_train_idx = train_labels['sentiment'].isin([0, 1])
anger_joy_val_idx = val_labels['sentiment'].isin([0, 1])

sadness_joy_test_idx = test_labels['sentiment'].isin([3, 1])
sadness_joy_train_idx = train_labels['sentiment'].isin([3, 1])
sadness_joy_val_idx = val_labels['sentiment'].isin([3, 1])

### Preprocessing

Before we actually split the datasets, we will apply our preprocessing pipeline as it equally affects all the inputs, independently of the sentiment.

Here, we apply following preprocessing steps:

In [7]:
def custom_tokenize(text):
    if not text:
        print('The text to be tokenized is a None type. Defaulting to blank string.')
        text = ''
    return nltk.word_tokenize(text)

In [8]:
test_text

Unnamed: 0,input
0,#Deppression is real. Partners w/ #depressed p...
1,@user Interesting choice of words... Are you c...
2,My visit to hospital for care triggered #traum...
3,@user Welcome to #MPSVT! We are delighted to h...
4,What makes you feel #joyful?
...,...
1416,I need a sparkling bodysuit . No occasion. Jus...
1417,@user I've finished reading it; simply mind-bl...
1418,shaft abrasions from panties merely shifted to...
1419,All this fake outrage. Y'all need to stop 🤣


The preprocessing steps are the same that we did in previous tasks. Hence, I will not go into detail about the preprocessing.

In [9]:
class Preprocessing():

    def __init__(self, full_dataset):
        self.full_dataset = full_dataset
        self.word_to_ix = {}
        self.ix_to_word = {}
        self.context_dataset = []
        self.vocab_size = None
        
    def convert_lowercase(self, x):
        x = x.lower()
        return x
        
    def remove_emoji(self, x):
        emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
        return emoji_pattern.sub(r'', x)
        
    exclude = string.punctuation
    
    def remove_punc(self, x):
        exclude = string.punctuation
        return x.translate(str.maketrans('', '', exclude))
    
    def remove_special_chars(self, x):
        x = re.sub('[^A-Za-z0-9]+', ' ', x)
        return x

    def remove_one_letter_words(self, x):
        x = re.sub(r'(?:^| )\w(?:$| )', ' ', x).strip()
        return x
    
    # default is to apply all these preprocessing steps
    def apply_preprocessing(self,
                            lowercase=True,
                            remove_emoji=True,
                            remove_punc=True,
                            remove_special_chars=True,
                            remove_one_letter_words=True):
        if lowercase:
            self.full_dataset['input'] = self.full_dataset['input'].apply(self.convert_lowercase)
        if remove_emoji:
            self.full_dataset['input'] = self.full_dataset['input'].apply(self.remove_emoji)
        if remove_punc:
            self.full_dataset['input'] = self.full_dataset['input'].apply(self.remove_punc)
        if remove_special_chars:
            self.full_dataset['input'] = self.full_dataset['input'].apply(self.remove_special_chars)
        if remove_one_letter_words:
            self.full_dataset['input'] = self.full_dataset['input'].apply(self.remove_one_letter_words)    

In [10]:
cl1 = Preprocessing(test_text)
cl1.apply_preprocessing()
test_text_preprocessed = cl1.full_dataset

cl2 = Preprocessing(val_text)
cl2.apply_preprocessing()
val_text_preprocessed = cl2.full_dataset

cl3 = Preprocessing(train_text)
cl3.apply_preprocessing()
train_text_preprocessed = cl3.full_dataset

In [11]:
test_text_preprocessed = pd.DataFrame(test_text['input'].apply(custom_tokenize))
train_text_preprocessed = pd.DataFrame(train_text['input'].apply(custom_tokenize))
val_text_preprocessed = pd.DataFrame(val_text['input'].apply(custom_tokenize))
all_input_preprocessed = pd.concat([train_text_preprocessed, val_text_preprocessed, test_text_preprocessed])

In [12]:
all_input_preprocessed

Unnamed: 0,input
0,"[worry, is, down, payment, on, problem, you, m..."
1,"[my, roommate, its, okay, that, we, cant, spel..."
2,"[no, but, thats, so, cute, atsu, was, probably..."
3,"[rooneys, fucking, untouchable, isnt, he, been..."
4,"[its, pretty, depressing, when, hit, pan, on, ..."
...,...
1416,"[need, sparkling, bodysuit, no, occasion, just..."
1417,"[user, ive, finished, reading, it, simply, min..."
1418,"[shaft, abrasions, from, panties, merely, shif..."
1419,"[all, this, fake, outrage, yall, need, to, stop]"


Get the the length of the largest tweet, so we can create a padding for all tweets that contain fewer words. Also, let us define the needed funtion to create the padding. This is done as pytorch needs the inputs to be of the same length.

In [13]:
max_len_tweet = max(all_input_preprocessed['input'].str.len())
max_len_tweet

30

In [14]:
def create_padding(dataset, max_len):
    """
    Creates a padding on the whole dataset such that each datapoint is of same length.
    The length is given as input by max_len.
    """
    for idx, row in enumerate(dataset['input']):
        if len(row) < max_len:
            tmp = len(row)
            pad1 = (max_len - tmp) // 2
            row = ['PADDING'] * pad1 + row
            row = row + ['PADDING'] * (30 - tmp - pad1)
            dataset['input'].iloc[idx] = row

In [15]:
create_padding(test_text_preprocessed, max_len_tweet)
create_padding(train_text_preprocessed, max_len_tweet)
create_padding(val_text_preprocessed, max_len_tweet)

Let's see an example of how a padded input datapoint looks like

In [16]:
test_text_preprocessed.loc[5, 'input']

['PADDING',
 'PADDING',
 'PADDING',
 'PADDING',
 'PADDING',
 'PADDING',
 'PADDING',
 'PADDING',
 'PADDING',
 'PADDING',
 'PADDING',
 'PADDING',
 'PADDING',
 'PADDING',
 'am',
 'revolting',
 'PADDING',
 'PADDING',
 'PADDING',
 'PADDING',
 'PADDING',
 'PADDING',
 'PADDING',
 'PADDING',
 'PADDING',
 'PADDING',
 'PADDING',
 'PADDING',
 'PADDING',
 'PADDING']

In [17]:
all_input_preprocessed = pd.concat([train_text_preprocessed, val_text_preprocessed, test_text_preprocessed])

Now, our data is in a somewhat nice format to work with. Every word is separated by a comma. As a next step, we want to create the word_to_ix and ix_to_word dictionaries.

In [18]:
word_to_ix = {}
cur_idx = 0
for l in all_input_preprocessed['input'].tolist():
    for el in l:
        if el not in word_to_ix:
            word_to_ix[el] = cur_idx
            cur_idx += 1

In [19]:
ix_to_word = dict([(v, k) for k, v in word_to_ix.items()])

The following function creates a numeric vector out of every tweet.

In [20]:
def create_num_vec(x, word_to_ix):
    """
    Takes a tokenized tweet as input and returns a numeric vector.
    """
    res = []
    for el in x:
        res.append(word_to_ix[el])
    return torch.tensor(res)

In [21]:
create_num_vec(test_text_preprocessed['input'].tolist()[2], word_to_ix)

tensor([   0,    0,    0,   15, 1482,   73, 4488,  170, 2262, 9364, 9365,   68,
        9366, 9367, 1262,   84, 9080,  147,   15, 2179,  778,  182,  229, 1438,
        9368,  147,  590,    0,    0,    0])

In [22]:
class Model_Dataset(Dataset):
    """
    The data is passed as lists
    """
    def __init__(self, x, y, word_to_ix):
        self.x = x['input'].tolist()
        self.y = y['sentiment'].tolist()
        self.word_to_ix = word_to_ix
        
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
        x = create_num_vec(self.x[idx], self.word_to_ix)
        y = torch.tensor(self.y[idx])
        return x, y

Let's see what shapes our data has, for example, so we can better design our model pipeline.
As expected, the input data is of shape (batch_size, max_len_tweet), and the labels are of shape (batch_size).

In [23]:
class my_model(nn.Module):

    def __init__(self, vocab_size, embedding_size, batch_size):
        
        super().__init__()
        self.batch_size = batch_size
        self.embedding_size = embedding_size
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                     embedding_dim=embedding_size)
        self.conv1 = nn.Conv1d(embedding_size, 128, kernel_size=7, padding="same")
        self.conv2 = nn.Conv1d(128, 64, kernel_size=5, padding="same")
        self.conv3 = nn.Conv1d(64, 16, kernel_size=3, padding="same")

        self.linear = nn.Linear(16, 2) # only 2 classes as output

    def forward(self, inputs):
        
        x = self.embedding(inputs)
        
        #30 = max num of tokens
        x = x.reshape(len(x), self.embedding_size, 30) ## Embedding Length needs to be treated as channel dimension
        
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))

        # pool the 16 dimension to 1
        x, _ = x.max(dim=-1)

        y_out = self.linear(x)

        return y_out

Before coming to the actual tasks, we will write some more functions which may be helpful. One of these functions is a function that calculates the loss and the accuracy (i.e. on the validation set):

In [24]:
def calc_loss_acc(model, loss_fn, data_loader):
    with torch.no_grad():
        Y_shuffled, Y_preds, losses = [],[],[]
        for X, Y in data_loader:
            preds = model(X)
            loss = loss_fn(preds, Y)
            losses.append(loss.item())

            Y_shuffled.append(Y)
            Y_preds.append(preds.argmax(dim=-1))

        Y_shuffled = torch.cat(Y_shuffled)
        Y_preds = torch.cat(Y_preds)

        print("Loss : {:.3f}".format(torch.tensor(losses).mean()))
        print("Acccuracy  : {:.3f}".format(accuracy_score(Y_shuffled.detach().numpy(), Y_preds.detach().numpy())))
        print("F1 Score: {:.3f}".format(f1_score(Y_shuffled.detach().numpy(), Y_preds.detach().numpy())))

# Task 1 -- Simple training on both datasets
### Let's train and train the data! As a first step on the anger - joy dataset
As a first step we need to create the datasets. Also, a small step we undertake is to change the 'sadness' sentiment, which is encoded as 3 to 0, as the cross entropy loss only accepts 0 and 1 as labels.

In [25]:
test_labels_V1 = test_labels[anger_joy_test_idx]
test_text_V1 = test_text_preprocessed[anger_joy_test_idx]

val_labels_V1 = val_labels[anger_joy_val_idx]
val_text_V1 = val_text_preprocessed[anger_joy_val_idx]

train_labels_V1 = train_labels[anger_joy_train_idx]
train_text_V1 = train_text_preprocessed[anger_joy_train_idx]


# the same for V2
test_labels_V2 = test_labels[sadness_joy_test_idx].replace(3, 0)
test_text_V2 = test_text_preprocessed[sadness_joy_test_idx]

val_labels_V2 = val_labels[sadness_joy_val_idx].replace(3, 0)
val_text_V2 = val_text_preprocessed[sadness_joy_val_idx]

train_labels_V2 = train_labels[sadness_joy_train_idx].replace(3, 0)
train_text_V2 = train_text_preprocessed[sadness_joy_train_idx]

Let's define our datasets. We call the datasets containing the labels anger and joy 'V1' and the other datasets 'V2'.

In [26]:
dataloader_train_V1 = Model_Dataset(train_text_V1, train_labels_V1, word_to_ix)
dataloader_train_V1 = DataLoader(dataloader_train_V1, batch_size=32, shuffle=True)

dataloader_train_V2 = Model_Dataset(train_text_V2, train_labels_V2, word_to_ix)
dataloader_train_V2 = DataLoader(dataloader_train_V2, batch_size=32, shuffle=True)

dataloader_val_V1 = Model_Dataset(val_text_V1, val_labels_V1, word_to_ix)
dataloader_val_V1 = DataLoader(dataloader_val_V1)

dataloader_val_V2 = Model_Dataset(val_text_V2, val_labels_V2, word_to_ix)
dataloader_val_V2 = DataLoader(dataloader_val_V2)

dataloader_test_V1 = Model_Dataset(test_text_V1, test_labels_V1, word_to_ix)
dataloader_test_V1 = DataLoader(dataloader_test_V1)

dataloader_test_V2 = Model_Dataset(test_text_V2, test_labels_V2, word_to_ix)
dataloader_test_V2 = DataLoader(dataloader_test_V2)

### Training the model on the two datasets

In [27]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [28]:
EMBED_SIZE = 300
BATCH_SIZE = 32

model_v1 = my_model(len(word_to_ix), EMBED_SIZE, BATCH_SIZE).to(device)
optimizer = optim.Adam(model_v1.parameters(), lr=0.001)
loss_func = torch.nn.CrossEntropyLoss()

In [29]:
num_epochs = 10
num_batches = len(dataloader_train_V1)

for epoch in range(num_epochs):
    for i, d in enumerate(dataloader_train_V1):
        optimizer.zero_grad()
        x, y = d
        x, y = x.to(device), y.to(device)
        y_pred = model_v1(x)
        loss = loss_func(y_pred, y)
        loss_batch = loss.item()
        loss.backward()
        optimizer.step()
        print(f'Epoch [{epoch}/{num_epochs}], batch: [{i}/{num_batches}, loss: {loss_batch:.4f}]')


Epoch [0/10], batch: [0/66, loss: 0.6892]
Epoch [0/10], batch: [1/66, loss: 0.6826]
Epoch [0/10], batch: [2/66, loss: 0.7462]
Epoch [0/10], batch: [3/66, loss: 0.6098]
Epoch [0/10], batch: [4/66, loss: 0.6487]
Epoch [0/10], batch: [5/66, loss: 0.6572]
Epoch [0/10], batch: [6/66, loss: 0.7036]
Epoch [0/10], batch: [7/66, loss: 0.6608]
Epoch [0/10], batch: [8/66, loss: 0.6110]
Epoch [0/10], batch: [9/66, loss: 0.6323]
Epoch [0/10], batch: [10/66, loss: 0.6268]
Epoch [0/10], batch: [11/66, loss: 0.6831]
Epoch [0/10], batch: [12/66, loss: 0.7512]
Epoch [0/10], batch: [13/66, loss: 0.6423]
Epoch [0/10], batch: [14/66, loss: 0.6350]
Epoch [0/10], batch: [15/66, loss: 0.6515]
Epoch [0/10], batch: [16/66, loss: 0.6742]
Epoch [0/10], batch: [17/66, loss: 0.5609]
Epoch [0/10], batch: [18/66, loss: 0.4598]
Epoch [0/10], batch: [19/66, loss: 0.6759]
Epoch [0/10], batch: [20/66, loss: 0.7048]
Epoch [0/10], batch: [21/66, loss: 0.7785]
Epoch [0/10], batch: [22/66, loss: 0.7087]
Epoch [0/10], batch: 

Epoch [2/10], batch: [61/66, loss: 0.4181]
Epoch [2/10], batch: [62/66, loss: 0.2660]
Epoch [2/10], batch: [63/66, loss: 0.4394]
Epoch [2/10], batch: [64/66, loss: 0.3892]
Epoch [2/10], batch: [65/66, loss: 0.3522]
Epoch [3/10], batch: [0/66, loss: 0.2013]
Epoch [3/10], batch: [1/66, loss: 0.2680]
Epoch [3/10], batch: [2/66, loss: 0.1953]
Epoch [3/10], batch: [3/66, loss: 0.2866]
Epoch [3/10], batch: [4/66, loss: 0.2342]
Epoch [3/10], batch: [5/66, loss: 0.2198]
Epoch [3/10], batch: [6/66, loss: 0.1902]
Epoch [3/10], batch: [7/66, loss: 0.2189]
Epoch [3/10], batch: [8/66, loss: 0.2260]
Epoch [3/10], batch: [9/66, loss: 0.2288]
Epoch [3/10], batch: [10/66, loss: 0.2139]
Epoch [3/10], batch: [11/66, loss: 0.2109]
Epoch [3/10], batch: [12/66, loss: 0.1826]
Epoch [3/10], batch: [13/66, loss: 0.3720]
Epoch [3/10], batch: [14/66, loss: 0.1964]
Epoch [3/10], batch: [15/66, loss: 0.1971]
Epoch [3/10], batch: [16/66, loss: 0.0882]
Epoch [3/10], batch: [17/66, loss: 0.4183]
Epoch [3/10], batch: 

Epoch [5/10], batch: [55/66, loss: 0.0132]
Epoch [5/10], batch: [56/66, loss: 0.0942]
Epoch [5/10], batch: [57/66, loss: 0.0253]
Epoch [5/10], batch: [58/66, loss: 0.0777]
Epoch [5/10], batch: [59/66, loss: 0.0472]
Epoch [5/10], batch: [60/66, loss: 0.2643]
Epoch [5/10], batch: [61/66, loss: 0.0196]
Epoch [5/10], batch: [62/66, loss: 0.0506]
Epoch [5/10], batch: [63/66, loss: 0.2215]
Epoch [5/10], batch: [64/66, loss: 0.0460]
Epoch [5/10], batch: [65/66, loss: 0.0293]
Epoch [6/10], batch: [0/66, loss: 0.0253]
Epoch [6/10], batch: [1/66, loss: 0.1855]
Epoch [6/10], batch: [2/66, loss: 0.1013]
Epoch [6/10], batch: [3/66, loss: 0.0146]
Epoch [6/10], batch: [4/66, loss: 0.0651]
Epoch [6/10], batch: [5/66, loss: 0.1044]
Epoch [6/10], batch: [6/66, loss: 0.0179]
Epoch [6/10], batch: [7/66, loss: 0.1927]
Epoch [6/10], batch: [8/66, loss: 0.0467]
Epoch [6/10], batch: [9/66, loss: 0.0870]
Epoch [6/10], batch: [10/66, loss: 0.0469]
Epoch [6/10], batch: [11/66, loss: 0.0092]
Epoch [6/10], batch: 

Epoch [8/10], batch: [50/66, loss: 0.0010]
Epoch [8/10], batch: [51/66, loss: 0.0043]
Epoch [8/10], batch: [52/66, loss: 0.0071]
Epoch [8/10], batch: [53/66, loss: 0.0155]
Epoch [8/10], batch: [54/66, loss: 0.0329]
Epoch [8/10], batch: [55/66, loss: 0.0039]
Epoch [8/10], batch: [56/66, loss: 0.0050]
Epoch [8/10], batch: [57/66, loss: 0.0083]
Epoch [8/10], batch: [58/66, loss: 0.0635]
Epoch [8/10], batch: [59/66, loss: 0.0037]
Epoch [8/10], batch: [60/66, loss: 0.0010]
Epoch [8/10], batch: [61/66, loss: 0.0735]
Epoch [8/10], batch: [62/66, loss: 0.0068]
Epoch [8/10], batch: [63/66, loss: 0.0019]
Epoch [8/10], batch: [64/66, loss: 0.0031]
Epoch [8/10], batch: [65/66, loss: 0.0033]
Epoch [9/10], batch: [0/66, loss: 0.0049]
Epoch [9/10], batch: [1/66, loss: 0.0074]
Epoch [9/10], batch: [2/66, loss: 0.0029]
Epoch [9/10], batch: [3/66, loss: 0.0020]
Epoch [9/10], batch: [4/66, loss: 0.0123]
Epoch [9/10], batch: [5/66, loss: 0.0337]
Epoch [9/10], batch: [6/66, loss: 0.0027]
Epoch [9/10], batc

In [30]:
calc_loss_acc(model_v1, loss_func, dataloader_val_V1)

Loss : 1.260
Acccuracy  : 0.619
F1 Score: 0.395


Before going to task 2, let's do the same for the other dataset!

In [31]:
EMBED_SIZE = 300
BATCH_SIZE = 32

model_v2 = my_model(len(word_to_ix), EMBED_SIZE, BATCH_SIZE).to(device)
optimizer = optim.Adam(model_v2.parameters(), lr=0.001)
loss_func = torch.nn.CrossEntropyLoss()

In [32]:
num_epochs = 10
num_batches = len(dataloader_train_V2)

for epoch in range(num_epochs):
    for i, d in enumerate(dataloader_train_V2):
        optimizer.zero_grad()
        x, y = d
        x, y = x.to(device), y.to(device)
        y_pred = model_v2(x)
        loss = loss_func(y_pred, y)
        loss_batch = loss.item()
        loss.backward()
        optimizer.step()
        print(f'Epoch [{epoch}/{num_epochs}], batch: [{i}/{num_batches}, loss: {loss_batch:.4f}]')


Epoch [0/10], batch: [0/49, loss: 0.6960]
Epoch [0/10], batch: [1/49, loss: 0.6824]
Epoch [0/10], batch: [2/49, loss: 0.6708]
Epoch [0/10], batch: [3/49, loss: 0.7575]
Epoch [0/10], batch: [4/49, loss: 0.6610]
Epoch [0/10], batch: [5/49, loss: 0.6911]
Epoch [0/10], batch: [6/49, loss: 0.6920]
Epoch [0/10], batch: [7/49, loss: 0.7035]
Epoch [0/10], batch: [8/49, loss: 0.7175]
Epoch [0/10], batch: [9/49, loss: 0.6835]
Epoch [0/10], batch: [10/49, loss: 0.6329]
Epoch [0/10], batch: [11/49, loss: 0.7672]
Epoch [0/10], batch: [12/49, loss: 0.6350]
Epoch [0/10], batch: [13/49, loss: 0.6699]
Epoch [0/10], batch: [14/49, loss: 0.7420]
Epoch [0/10], batch: [15/49, loss: 0.6911]
Epoch [0/10], batch: [16/49, loss: 0.7797]
Epoch [0/10], batch: [17/49, loss: 0.7311]
Epoch [0/10], batch: [18/49, loss: 0.7098]
Epoch [0/10], batch: [19/49, loss: 0.6794]
Epoch [0/10], batch: [20/49, loss: 0.7351]
Epoch [0/10], batch: [21/49, loss: 0.6988]
Epoch [0/10], batch: [22/49, loss: 0.6931]
Epoch [0/10], batch: 

Epoch [3/10], batch: [45/49, loss: 0.1587]
Epoch [3/10], batch: [46/49, loss: 0.3590]
Epoch [3/10], batch: [47/49, loss: 0.1778]
Epoch [3/10], batch: [48/49, loss: 0.1767]
Epoch [4/10], batch: [0/49, loss: 0.3426]
Epoch [4/10], batch: [1/49, loss: 0.0753]
Epoch [4/10], batch: [2/49, loss: 0.0924]
Epoch [4/10], batch: [3/49, loss: 0.0638]
Epoch [4/10], batch: [4/49, loss: 0.0924]
Epoch [4/10], batch: [5/49, loss: 0.2562]
Epoch [4/10], batch: [6/49, loss: 0.0592]
Epoch [4/10], batch: [7/49, loss: 0.1063]
Epoch [4/10], batch: [8/49, loss: 0.1093]
Epoch [4/10], batch: [9/49, loss: 0.1094]
Epoch [4/10], batch: [10/49, loss: 0.0323]
Epoch [4/10], batch: [11/49, loss: 0.0541]
Epoch [4/10], batch: [12/49, loss: 0.0842]
Epoch [4/10], batch: [13/49, loss: 0.0614]
Epoch [4/10], batch: [14/49, loss: 0.1037]
Epoch [4/10], batch: [15/49, loss: 0.0667]
Epoch [4/10], batch: [16/49, loss: 0.1489]
Epoch [4/10], batch: [17/49, loss: 0.0379]
Epoch [4/10], batch: [18/49, loss: 0.1034]
Epoch [4/10], batch: 

Epoch [7/10], batch: [42/49, loss: 0.0148]
Epoch [7/10], batch: [43/49, loss: 0.0037]
Epoch [7/10], batch: [44/49, loss: 0.0081]
Epoch [7/10], batch: [45/49, loss: 0.0797]
Epoch [7/10], batch: [46/49, loss: 0.2199]
Epoch [7/10], batch: [47/49, loss: 0.0100]
Epoch [7/10], batch: [48/49, loss: 0.0074]
Epoch [8/10], batch: [0/49, loss: 0.0592]
Epoch [8/10], batch: [1/49, loss: 0.0339]
Epoch [8/10], batch: [2/49, loss: 0.0030]
Epoch [8/10], batch: [3/49, loss: 0.0231]
Epoch [8/10], batch: [4/49, loss: 0.0059]
Epoch [8/10], batch: [5/49, loss: 0.0072]
Epoch [8/10], batch: [6/49, loss: 0.0033]
Epoch [8/10], batch: [7/49, loss: 0.0116]
Epoch [8/10], batch: [8/49, loss: 0.0214]
Epoch [8/10], batch: [9/49, loss: 0.0081]
Epoch [8/10], batch: [10/49, loss: 0.0042]
Epoch [8/10], batch: [11/49, loss: 0.0053]
Epoch [8/10], batch: [12/49, loss: 0.0074]
Epoch [8/10], batch: [13/49, loss: 0.0091]
Epoch [8/10], batch: [14/49, loss: 0.0030]
Epoch [8/10], batch: [15/49, loss: 0.0790]
Epoch [8/10], batch: 

In [33]:
calc_loss_acc(model_v2, loss_func, dataloader_val_V2)

Loss : 1.752
Acccuracy  : 0.597
F1 Score: 0.522


# Task 2
#### Now that we managed to get the models running, we try to improve the models using the train and validation sets. One thing we will do is to remove some layers, as I have a feeling that we have to many convolutional layers for a problem that doesn't need such a deep structure. As mentioned in the task description, we only need to do so with one of the two datasets. In my case this is the V1 dataset (anger and joy).
#### The first model in the race is the same as above. We will train with 15 epochs though and report the validation error at end of every epoch.

### Training and evaluating Model 1

In [34]:
class my_model_1(nn.Module):

    def __init__(self, vocab_size, embedding_size, batch_size):
        
        super().__init__()
        self.batch_size = batch_size
        self.embedding_size = embedding_size
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                     embedding_dim=embedding_size)
        self.conv1 = nn.Conv1d(embedding_size, 128, kernel_size=7, padding="same")
        self.conv2 = nn.Conv1d(128, 64, kernel_size=5, padding="same")
        self.conv3 = nn.Conv1d(64, 16, kernel_size=3, padding="same")

        self.dropout = nn.Dropout(0.4)

        self.linear = nn.Linear(16, 2) # only 2 classes as output

    def forward(self, inputs):
        
        x = self.embedding(inputs)
        
        #30 = max num of tokens
        x = x.reshape(len(x), self.embedding_size, 30) ## Embedding Length needs to be treated as channel dimension
        
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = self.dropout(x)

        # pool the 16 dimension to 1
        x, _ = x.max(dim=-1)

        y_out = self.linear(x)

        return y_out

In [35]:
EMBED_SIZE = 300
BATCH_SIZE = 32

model_1 = my_model_1(len(word_to_ix), EMBED_SIZE, BATCH_SIZE).to(device)
optimizer = optim.Adam(model_1.parameters(), lr=0.001)
loss_func = torch.nn.CrossEntropyLoss()

In [36]:
num_epochs = 6
num_batches = len(dataloader_train_V1)

for epoch in range(1, num_epochs+1):
    for i, d in enumerate(dataloader_train_V1):
        optimizer.zero_grad()
        x, y = d
        x, y = x.to(device), y.to(device)
        y_pred = model_1(x)
        loss = loss_func(y_pred, y)
        loss_batch = loss.item()
        loss.backward()
        optimizer.step()
        if i % 20 == 0: 
            #print(f'Epoch [{epoch}/{num_epochs}], batch: [{i}/{num_batches}, loss: {loss_batch:.4f}]')
            pass
    
    # at end of epoch calculate the validation loss
    print(f'--------EPOCH: {epoch} --------')
    print('Metrics on Validation Set:')
    calc_loss_acc(model_1, loss_func, dataloader_val_V1)
    print("       -----      ")
    print('Metrics on Training Set:')
    calc_loss_acc(model_1, loss_func, dataloader_train_V1)
    print('################################')


--------EPOCH: 1 --------
Metrics on Validation Set:
Loss : 0.673
Acccuracy  : 0.623
F1 Score: 0.126
       -----      
Metrics on Training Set:
Loss : 0.608
Acccuracy  : 0.670
F1 Score: 0.157
################################
--------EPOCH: 2 --------
Metrics on Validation Set:
Loss : 0.677
Acccuracy  : 0.607
F1 Score: 0.019
       -----      
Metrics on Training Set:
Loss : 0.547
Acccuracy  : 0.675
F1 Score: 0.149
################################
--------EPOCH: 3 --------
Metrics on Validation Set:
Loss : 0.679
Acccuracy  : 0.580
F1 Score: 0.270
       -----      
Metrics on Training Set:
Loss : 0.407
Acccuracy  : 0.835
F1 Score: 0.723
################################
--------EPOCH: 4 --------
Metrics on Validation Set:
Loss : 0.822
Acccuracy  : 0.568
F1 Score: 0.419
       -----      
Metrics on Training Set:
Loss : 0.268
Acccuracy  : 0.919
F1 Score: 0.877
################################
--------EPOCH: 5 --------
Metrics on Validation Set:
Loss : 0.963
Acccuracy  : 0.580
F1 Score: 0

### Training and evalating Model 2
#### Let's make some changes to the model and the parameters and see how it performs

In [37]:
class my_model_2(nn.Module):

    def __init__(self, vocab_size, embedding_size, batch_size):
        
        super().__init__()
        self.batch_size = batch_size
        self.embedding_size = embedding_size
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                     embedding_dim=embedding_size)
        self.conv1 = nn.Conv1d(embedding_size, 256, kernel_size=10, padding="same")
        self.conv2 = nn.Conv1d(256, 128, kernel_size=8, padding="same")
        self.conv3 = nn.Conv1d(128, 64, kernel_size=6, padding="same")
        self.conv4 = nn.Conv1d(64, 32, kernel_size=3, padding="same")


        self.linear2 = nn.Linear(32, 2) # only 2 classes as output

    def forward(self, inputs):
        
        x = self.embedding(inputs)
        
        #30 = max num of tokens
        x = x.reshape(len(x), self.embedding_size, 30) ## Embedding Length needs to be treated as channel dimension
        
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))

        x, _ = x.max(dim=-1)
        #rint(x.shape)
        
        
       # print(x.shape)


        y_out = self.linear2(x)

        return y_out

In [38]:
EMBED_SIZE = 512
BATCH_SIZE = 64

model_2 = my_model_2(len(word_to_ix), EMBED_SIZE, BATCH_SIZE).to(device)
optimizer = optim.Adam(model_2.parameters(), lr=0.001)
loss_func = torch.nn.CrossEntropyLoss()

In [39]:
num_epochs = 5
num_batches = len(dataloader_train_V1)

for epoch in range(1, num_epochs+1):
    for i, d in enumerate(dataloader_train_V1):
        optimizer.zero_grad()
        x, y = d
        x, y = x.to(device), y.to(device)
        y_pred = model_2(x)
        loss = loss_func(y_pred, y)
        loss_batch = loss.item()
        loss.backward()
        optimizer.step()
        if i % 20 == 0: 
            #print(f'Epoch [{epoch}/{num_epochs}], batch: [{i}/{num_batches}, loss: {loss_batch:.4f}]')
            pass
    
    # at end of epoch calculate the validation loss
    print(f'--------EPOCH: {epoch} --------')
    print('Metrics on Validation Set:')
    calc_loss_acc(model_2, loss_func, dataloader_val_V1)
    print("       -----      ")
    print('Metrics on Training Set:')
    calc_loss_acc(model_2, loss_func, dataloader_train_V1)
    print('################################')


  return F.conv1d(input, weight, bias, self.stride,


--------EPOCH: 1 --------
Metrics on Validation Set:
Loss : 0.672
Acccuracy  : 0.623
F1 Score: 0.000
       -----      
Metrics on Training Set:
Loss : 0.635
Acccuracy  : 0.664
F1 Score: 0.000
################################
--------EPOCH: 2 --------
Metrics on Validation Set:
Loss : 0.696
Acccuracy  : 0.623
F1 Score: 0.000
       -----      
Metrics on Training Set:
Loss : 0.516
Acccuracy  : 0.664
F1 Score: 0.000
################################
--------EPOCH: 3 --------
Metrics on Validation Set:
Loss : 0.781
Acccuracy  : 0.553
F1 Score: 0.470
       -----      
Metrics on Training Set:
Loss : 0.247
Acccuracy  : 0.912
F1 Score: 0.880
################################
--------EPOCH: 4 --------
Metrics on Validation Set:
Loss : 1.027
Acccuracy  : 0.572
F1 Score: 0.439
       -----      
Metrics on Training Set:
Loss : 0.100
Acccuracy  : 0.963
F1 Score: 0.947
################################
--------EPOCH: 5 --------
Metrics on Validation Set:
Loss : 1.337
Acccuracy  : 0.595
F1 Score: 0

### Training and evalating Model 3
#### Let's see what additional changes we can make to make the model perform better. It is really difficult as I do not have a intuition what makes the model perform well and what not

In [40]:
class my_model_3(nn.Module):

    def __init__(self, vocab_size, embedding_size, batch_size):
        
        super().__init__()
        self.batch_size = batch_size
        self.embedding_size = embedding_size
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                     embedding_dim=embedding_size)
        self.conv1 = nn.Conv1d(embedding_size, 64, kernel_size=10, padding="same")


        
        self.linear2 = nn.Linear(64, 2) # only 2 classes as output

    def forward(self, inputs):
        
        x = self.embedding(inputs)
        
        #30 = max num of tokens
        x = x.reshape(len(x), self.embedding_size, 30) ## Embedding Length needs to be treated as channel dimension
        
        x = F.relu(self.conv1(x))
        
        x, _ = x.max(dim=-1)

        y_out = self.linear2(x)

        return y_out

In [41]:
EMBED_SIZE = 300
BATCH_SIZE = 32

model_3 = my_model_3(len(word_to_ix), EMBED_SIZE, BATCH_SIZE).to(device)
optimizer = optim.Adam(model_3.parameters(), lr=0.001)
loss_func = torch.nn.CrossEntropyLoss()

In [42]:
num_epochs = 5
num_batches = len(dataloader_train_V1)

for epoch in range(1, num_epochs+1):
    for i, d in enumerate(dataloader_train_V1):
        optimizer.zero_grad()
        x, y = d
        x, y = x.to(device), y.to(device)
        y_pred = model_3(x)
        loss = loss_func(y_pred, y)
        loss_batch = loss.item()
        loss.backward()
        optimizer.step()
        if i % 20 == 0: 
            #print(f'Epoch [{epoch}/{num_epochs}], batch: [{i}/{num_batches}, loss: {loss_batch:.4f}]')
            pass
    
    # at end of epoch calculate the validation loss
    print(f'--------EPOCH: {epoch} --------')
    print('Metrics on Validation Set:')
    calc_loss_acc(model_3, loss_func, dataloader_val_V1)
    print("       -----      ")
    print('Metrics on Training Set:')
    calc_loss_acc(model_3, loss_func, dataloader_train_V1)
    print('################################')


--------EPOCH: 1 --------
Metrics on Validation Set:
Loss : 0.687
Acccuracy  : 0.576
F1 Score: 0.384
       -----      
Metrics on Training Set:
Loss : 0.402
Acccuracy  : 0.828
F1 Score: 0.743
################################
--------EPOCH: 2 --------
Metrics on Validation Set:
Loss : 0.703
Acccuracy  : 0.599
F1 Score: 0.309
       -----      
Metrics on Training Set:
Loss : 0.248
Acccuracy  : 0.941
F1 Score: 0.907
################################
--------EPOCH: 3 --------
Metrics on Validation Set:
Loss : 0.800
Acccuracy  : 0.611
F1 Score: 0.231
       -----      
Metrics on Training Set:
Loss : 0.161
Acccuracy  : 0.954
F1 Score: 0.927
################################
--------EPOCH: 4 --------
Metrics on Validation Set:
Loss : 0.777
Acccuracy  : 0.615
F1 Score: 0.377
       -----      
Metrics on Training Set:
Loss : 0.076
Acccuracy  : 0.991
F1 Score: 0.987
################################
--------EPOCH: 5 --------
Metrics on Validation Set:
Loss : 0.858
Acccuracy  : 0.654
F1 Score: 0

## Testing all 3 models on the validation sets, to determine which model is the best.
### Now, I will use all the trained models and evaluate them on the validation sets.

In [43]:
print("#### EVALUATING MODEL 1 ####")
print('Metrics on Validation Set:')
calc_loss_acc(model_1, loss_func, dataloader_val_V1)
print("----------------------------------")

print("#### EVALUATING MODEL 2 ####")
print('Metrics on Validation Set:')
calc_loss_acc(model_2, loss_func, dataloader_val_V1)
print("----------------------------------")

print("#### EVALUATING MODEL 3 ####")
print('Metrics on Validation Set:')
calc_loss_acc(model_3, loss_func, dataloader_val_V1)
print("----------------------------------")

#### EVALUATING MODEL 1 ####
Metrics on Validation Set:
Loss : 1.195
Acccuracy  : 0.521
F1 Score: 0.428
----------------------------------
#### EVALUATING MODEL 2 ####
Metrics on Validation Set:
Loss : 1.337
Acccuracy  : 0.595
F1 Score: 0.422
----------------------------------
#### EVALUATING MODEL 3 ####
Metrics on Validation Set:
Loss : 0.858
Acccuracy  : 0.654
F1 Score: 0.440
----------------------------------


### Task 3
#### Let's look at the performance of our best model, which seems to be model 3 on our other dataset. For this, we of course need to train the model again on the other dataset and evaluate it. Note that we should evaluate it on the test set at the end

First, we will initialize the model and then train it

In [44]:
EMBED_SIZE = 512
BATCH_SIZE = 64

model_2_v2 = my_model_2(len(word_to_ix), EMBED_SIZE, BATCH_SIZE).to(device)
optimizer = optim.Adam(model_2.parameters(), lr=0.001)
loss_func = torch.nn.CrossEntropyLoss()

In [45]:
num_epochs = 5
num_batches = len(dataloader_train_V2)

for epoch in range(1, num_epochs+1):
    for i, d in enumerate(dataloader_train_V2):
        optimizer.zero_grad()
        x, y = d
        x, y = x.to(device), y.to(device)
        y_pred = model_2_v2(x)
        loss = loss_func(y_pred, y)
        loss_batch = loss.item()
        loss.backward()
        optimizer.step()
        if i % 20 == 0: 
            #print(f'Epoch [{epoch}/{num_epochs}], batch: [{i}/{num_batches}, loss: {loss_batch:.4f}]')
            pass
    
    # at end of epoch calculate the validation loss
    print(f'--------EPOCH: {epoch} --------')
    print('Metrics on Validation Set:')
    calc_loss_acc(model_2_v2, loss_func, dataloader_val_V2)
    print("       -----      ")
    print('Metrics on Training Set:')
    calc_loss_acc(model_2_v2, loss_func, dataloader_train_V2)
    print('################################')


--------EPOCH: 1 --------
Metrics on Validation Set:
Loss : 0.710
Acccuracy  : 0.478
F1 Score: 0.000
       -----      
Metrics on Training Set:
Loss : 0.690
Acccuracy  : 0.547
F1 Score: 0.000
################################
--------EPOCH: 2 --------
Metrics on Validation Set:
Loss : 0.710
Acccuracy  : 0.478
F1 Score: 0.000
       -----      
Metrics on Training Set:
Loss : 0.690
Acccuracy  : 0.547
F1 Score: 0.000
################################
--------EPOCH: 3 --------
Metrics on Validation Set:
Loss : 0.710
Acccuracy  : 0.478
F1 Score: 0.000
       -----      
Metrics on Training Set:
Loss : 0.690
Acccuracy  : 0.547
F1 Score: 0.000
################################
--------EPOCH: 4 --------
Metrics on Validation Set:
Loss : 0.710
Acccuracy  : 0.478
F1 Score: 0.000
       -----      
Metrics on Training Set:
Loss : 0.690
Acccuracy  : 0.547
F1 Score: 0.000
################################
--------EPOCH: 5 --------
Metrics on Validation Set:
Loss : 0.710
Acccuracy  : 0.478
F1 Score: 0

In [46]:
print("#### EVALUATING MODEL 2 on TEST SET of dataset 1 ####")
print('Metrics on Test Set:')
calc_loss_acc(model_2, loss_func, dataloader_test_V1)
print("----------------------------------")




print("#### EVALUATING MODEL 2 on TEST SET of dataset 2 ####")
print('Metrics on Test Set:')
calc_loss_acc(model_2_v2, loss_func, dataloader_test_V2)
print("----------------------------------")



#### EVALUATING MODEL 2 on TEST SET of dataset 1 ####
Metrics on Test Set:
Loss : 1.262
Acccuracy  : 0.655
F1 Score: 0.503
----------------------------------
#### EVALUATING MODEL 2 on TEST SET of dataset 2 ####
Metrics on Test Set:
Loss : 0.699
Acccuracy  : 0.516
F1 Score: 0.000
----------------------------------
