In [26]:
from datasets import load_dataset
import random
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from tokenizers import Tokenizer, models, trainers
from tokenizers.pre_tokenizers import Whitespace
import torch.nn as nn
import torch
import numpy as np
from tqdm import tqdm

In [18]:
emotions = load_dataset("dair-ai/emotion")

In [19]:
labels = ["sadness", "joy", "love", "anger", "fear", "surprise"]

In [20]:
train_data = emotions["train"]
validation_data = emotions["validation"]
test_data = emotions["test"]

In [36]:
train_preprocess = np.array(train_data["text"]).reshape(-1,1)
train_preprocess

array([['i didnt feel humiliated'],
       ['i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake'],
       ['im grabbing a minute to post i feel greedy wrong'],
       ...,
       ['i feel strong and good overall'],
       ['i feel like this was such a rude comment and im glad that t'],
       ['i know a lot but i feel so stupid because i can not portray it']],
      dtype='<U300')

In [37]:
oversample = SMOTE()
train_text_oversample, train_data_oversample = oversample.fit_resample(train_preprocess, train_data["label"])
train_text_oversample

ValueError: dtype='numeric' is not compatible with arrays of bytes/strings.Convert your data to numeric values explicitly instead.

In [10]:
# Tokenization
vocab_n = 5000
sequence_len = 64

# Initialize a tokenizer using BPE (Byte Pair Encoding)
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = Whitespace()
tokenizer.enable_padding(length=sequence_len)
tokenizer.enable_truncation(max_length=sequence_len)
tokenizer_trainer = trainers.BpeTrainer(vocab_size=vocab_n)
tokenizer.train_from_iterator(train_data["text"], trainer=tokenizer_trainer)

In [8]:
def preprocess_text(text: str, tokenizer: Tokenizer):
    """ 
    Helper function to tokenize text and return corresponding token IDs as tensors.

    Args:
        text, str: Text instance from training data.
        tokenizer, Tokenizer: The respective tokenizer to be used for tokenization.
    Returns:
        Tensor: One-dimensional PyTorch tensor with token IDs.
    """
    return torch.tensor(tokenizer.encode(text).ids)


def preprocess_label(label: int):
    """ 
    Helper function to return label as tensor.

    Args:
        label, int: Label from instance.
    Returns:
        Tensor: One-dimensional PyTorch tensor containing the label index.
    """
    return torch.tensor(label)


def preprocess(data: dict, tokenizer: Tokenizer):
    """ 
    Transforms input dataset to tokenized vector representations.

    Args:
        data, dict: Dictionary with text instances and labels.
        tokenizer, Tokenizer: The respective tokenizer to be used for tokenization.
    Returns:
        list: List with tensors for the input texts and labels.
    """
    instances = []

    for text, label in zip(data["text"], data["label"]):
        input = preprocess_text(text, tokenizer)
        label = preprocess_label(label)
        
        instances.append((input, label))

    return instances

In [11]:
train_instances = preprocess(train_data, tokenizer)
val_instances = preprocess(validation_data, tokenizer)
test_instances = preprocess(test_data, tokenizer)

In [12]:
val_instances

[(tensor([  66,   55,  464,  599,   42,  962,   78,  193,   92,  171,   18, 3424,
            77,   53,   44, 1263,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0]),
  tensor(0)),
 (tensor([   8,   31,   79,    8,   70,  248,  756,   35,    0, 1490, 4678, 1490,
          2912,   53, 1923,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0]),
  tensor(0)),
 (tensor([   8,   31,   79,    0, 1471, 1844,  104,    0,    0,    0,    0,    0

In [9]:
# Batching

def batching(instances: list, batch_size: int, shuffle: bool):
    """ 
    Batches input instances along the given size and returns list of batches.

    Args:
        instances, list: List of instances, containing a tuple of two tensors 
            for each text as well as corresponding label.
        batch_size, int: Size for batches.
        shuffle, bool: If true, the instances will be shuffled before batching.
    Returns:
        list: List containing tuples that correspond to single batches.
    """
    if shuffle:
        random.shuffle(instances)

    batches = []

    # We iterate through the instances with batch_size steps
    for i in range(0, len(instances), batch_size):

        # Stacking the instances with dim=0 (default value)
        batch_texts = torch.stack(
            [instance[0] for instance in instances[i : i + batch_size]]
        )
        batch_labels = torch.stack(
            [instance[1] for instance in instances[i : i + batch_size]]
        )

        batches.append((batch_texts, batch_labels))
    
    return batches

In [15]:
# CNN Network

class CNN_Classifier(nn.Module):
    """ 
    CNN for sentiment classification with 6 classes, consisting of an embedding 
    layer, two convolutional layers with different filter sizes, different 
    pooling sizes, as well as one linear output layer.
    """
    def __init__(self):
        super().__init__()
        # We can implement embeddings as a simple lookup-table for given word 
        # indices
        self.embedding = nn.Embedding(tokenizer.get_vocab_size(), 300)

        # One-dimensional convolution-layer with 300 input channels, and 100  
        # output channels as well as kernel size of 3; note that the
        # one-dimensional convolutional layer has 3 dimensions
        self.conv_1 = nn.Conv1d(300, 100, 3, padding="same")

        # Pooling with with a one-dimensional sliding window of length 3, 
        # reducing in this fashion the sequence length 
        self.pool_1 = nn.MaxPool1d(3)

        # The input will be the reduced number of maximum picks from the
        # previous operation; the dimension of those picks is the same as the
        # output channel size from self.conv_1. We apply a different filter of 
        # size 5.
        self.conv_2 = nn.Conv1d(100, 50, 5, padding="same")

        # Pooling with window size of 5
        self.pool_2 = nn.MaxPool1d(5)

        # Final fully connected linear layer from the 50 output channels to the
        # 6 sentiment categories 
        self.linear_layer = nn.Linear(50, 6)

    def forward(self, x):
        """ 
        Defining the forward pass of an input batch x.

        Args:
            x, tensor: The input is a batch of tweets from the data.
        Returns:
            y, float: The output are the logits from the final layer.
        """
        # x will correspond here to a batch; therefore, the input dimensions of 
        # the embedding will be by PyTorch convention as follows:
        # [batch_size, seq_len, emb_dim]
        x = self.embedding(x)

        # Unfortunately the embedding tensor does not correspond to the shape 
        # that is needed for nn.Conv1d(); for this reason, we must switch its 
        # order to [batch_size, emb_dim, seq_len] for PyTorch
        x = x.permute(0, 2, 1)

        # We can wrap the ReLu activation function around our convolution layer
        # The output tensor will have the following shape: 
        # [batch_size, 100, seq_len]
        x = nn.functional.relu(self.conv_1(x))

        # Applying max pooling of size 3 means that the output length of the 
        # sequence is shrunk to seq_len//3
        x = self.pool_1(x)

        # Output of the following layer: [batch_size, 50, seq_len//3]
        x = nn.functional.relu(self.conv_2(x))

        # Shrinking the sequence length by 5
        x = self.pool_2(x)
        # print(x.shape)

        # At this point we have a tensor with 3 dimensions; however, the final layer 
        # requires an input of size [batch_size x 50]. To get this value we can 
        # aggregate the values and continue only with their mean
        x = x.mean(dim=-1)

        # In this fasion, the linear layer can be used to make predictions
        y = self.linear_layer(x)

        return y
    
    def fit(self, train_instances, val_instances, epochs, batch_size):
        """ 
        Gradient based fitting method with Adam optimization and automatic 
        evaluation (F1 score) for each epoch.

        Args:
            train_instances, list: List of instance tuples.
            val_instances, list: List of instance tuples.
            epochs, int: Number of training epochs.
            batch_size, int: Number of batch size.
        """
        self.train()
        optimizer = torch.optim.Adam(self.parameters())

        for epoch in range(epochs):
            train_batches = batching(
                train_instances,
                batch_size=batch_size,
                shuffle=True)
            
            for inputs, labels in tqdm(train_batches):
                optimizer.zero_grad()
                outputs = self(inputs)
                loss = nn.functional.cross_entropy(outputs, labels)
                loss.backward()
                optimizer.step()
            
            train_f1 = self.evaluate(train_instances, batch_size=batch_size)
            val_f1 = self.evaluate(val_instances, batch_size=batch_size)

            print(f"Epoch {epoch + 1} train F1 score: {train_f1}, validation F1 score: {val_f1}")

    def predict(self, input):
        """ 
        To make inferences from the model.

        Args:
            input, tensor: Single instance.
        Returns:
            int: Integer for most probable class.
        """
        self.eval()
        outputs = self(input)

        return torch.argmax(outputs, dim=-1)

    def evaluate(self, instances, batch_size):
        """ 
        To evaluate model's performance by various processes/standard.

        Args:
            instances, list: List of instance tuples.
            batch_size, int: Batch size.
        Returns:
            float: Macro F1 score for given instances.
        """
        batches = batching(instances, batch_size=batch_size, shuffle=False)
        y_test = []
        y_pred = []

        for inputs, labels in batches:
            y_test.extend(labels)
            y_pred.extend(self.predict(inputs))

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='macro')
        recall = recall_score(y_test, y_pred, average='macro')
        f1 = f1_score(y_test, y_pred, average='macro')
        cm = confusion_matrix(y_test, y_pred)
        report = classification_report(y_test, y_pred)
        print("CNN Classifier:")
        print(f"Accuracy: {accuracy}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1 Score: {f1}")
        print(f"Confusion Matrix:\n{cm}")
        print(f"Classification Report:\n{report}")

In [16]:
classifier = CNN_Classifier()
classifier.fit(train_instances, val_instances, epochs=5, batch_size=16)

100%|██████████| 1000/1000 [00:03<00:00, 283.22it/s]


CNN Classifier:
Accuracy: 0.8441875
Precision: 0.7841844527850768
Recall: 0.6722731568191591
F1 Score: 0.6703932542614649
Confusion Matrix:
[[4379   86   28  126   47    0]
 [  60 5139   93   32   38    0]
 [  31  493  603  161   16    0]
 [ 137  126  125 1691   78    2]
 [  47  116   18   71 1682    3]
 [  14   72   82  129  262   13]]
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.94      0.94      4666
           1       0.85      0.96      0.90      5362
           2       0.64      0.46      0.54      1304
           3       0.77      0.78      0.77      2159
           4       0.79      0.87      0.83      1937
           5       0.72      0.02      0.04       572

    accuracy                           0.84     16000
   macro avg       0.78      0.67      0.67     16000
weighted avg       0.84      0.84      0.83     16000

CNN Classifier:
Accuracy: 0.7945
Precision: 0.6992531012715045
Recall: 0.6217595877320382
F1 Sc

100%|██████████| 1000/1000 [00:03<00:00, 289.76it/s]


CNN Classifier:
Accuracy: 0.9520625
Precision: 0.9378671105240146
Recall: 0.9148365697550123
F1 Score: 0.9256561471110906
Confusion Matrix:
[[4633    7    2   11   13    0]
 [  52 5201   79   11   12    7]
 [  13  120 1142   27    1    1]
 [ 120   14    4 2010   11    0]
 [  44   11    1   41 1793   47]
 [   5    6    2   19   86  454]]
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97      4666
           1       0.97      0.97      0.97      5362
           2       0.93      0.88      0.90      1304
           3       0.95      0.93      0.94      2159
           4       0.94      0.93      0.93      1937
           5       0.89      0.79      0.84       572

    accuracy                           0.95     16000
   macro avg       0.94      0.91      0.93     16000
weighted avg       0.95      0.95      0.95     16000

CNN Classifier:
Accuracy: 0.8825
Precision: 0.865280914344853
Recall: 0.8256063274390747
F1 Sco

100%|██████████| 1000/1000 [00:03<00:00, 281.99it/s]


CNN Classifier:
Accuracy: 0.9784375
Precision: 0.9606572599694783
Recall: 0.9752671190911865
F1 Score: 0.967426288564884
Confusion Matrix:
[[4575   27    1   42   20    1]
 [   2 5296   52    1    1   10]
 [   0   37 1261    5    0    1]
 [  15    8    0 2115   21    0]
 [   8    7    0   11 1845   66]
 [   0    0    0    2    7  563]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      4666
           1       0.99      0.99      0.99      5362
           2       0.96      0.97      0.96      1304
           3       0.97      0.98      0.98      2159
           4       0.97      0.95      0.96      1937
           5       0.88      0.98      0.93       572

    accuracy                           0.98     16000
   macro avg       0.96      0.98      0.97     16000
weighted avg       0.98      0.98      0.98     16000

CNN Classifier:
Accuracy: 0.895
Precision: 0.8652014880696334
Recall: 0.8735947913947775
F1 Scor

100%|██████████| 1000/1000 [00:03<00:00, 273.80it/s]


CNN Classifier:
Accuracy: 0.983875
Precision: 0.9794597475800645
Recall: 0.9782706716313972
F1 Score: 0.9786800064823923
Confusion Matrix:
[[4652    2    1    3    8    0]
 [  10 5324   21    0    4    3]
 [   3   46 1255    0    0    0]
 [  67    3    1 2051   37    0]
 [  19    0    0    0 1894   24]
 [   1    0    0    0    5  566]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      4666
           1       0.99      0.99      0.99      5362
           2       0.98      0.96      0.97      1304
           3       1.00      0.95      0.97      2159
           4       0.97      0.98      0.98      1937
           5       0.95      0.99      0.97       572

    accuracy                           0.98     16000
   macro avg       0.98      0.98      0.98     16000
weighted avg       0.98      0.98      0.98     16000

CNN Classifier:
Accuracy: 0.894
Precision: 0.8754740048891875
Recall: 0.8584905183765908
F1 Scor

100%|██████████| 1000/1000 [00:03<00:00, 273.44it/s]


CNN Classifier:
Accuracy: 0.9879375
Precision: 0.9800485966066724
Recall: 0.9850867882605495
F1 Score: 0.9824329897222192
Confusion Matrix:
[[4658    0    1    6    0    1]
 [   9 5300   28   13    4    8]
 [   0   18 1279    6    0    1]
 [  13    1    0 2141    4    0]
 [  28    0    0   24 1863   22]
 [   0    0    0    0    6  566]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      4666
           1       1.00      0.99      0.99      5362
           2       0.98      0.98      0.98      1304
           3       0.98      0.99      0.98      2159
           4       0.99      0.96      0.98      1937
           5       0.95      0.99      0.97       572

    accuracy                           0.99     16000
   macro avg       0.98      0.99      0.98     16000
weighted avg       0.99      0.99      0.99     16000

CNN Classifier:
Accuracy: 0.895
Precision: 0.8618082884772917
Recall: 0.8689309923675944
F1 Sco

In [17]:
f1_test = classifier.evaluate(test_instances, batch_size=16)

CNN Classifier:
Accuracy: 0.8965
Precision: 0.845107185966377
Recall: 0.8559161416862501
F1 Score: 0.8490755186618048
Confusion Matrix:
[[553  10   0  15   2   1]
 [  4 634  40   6   5   6]
 [  5  25 120   8   0   1]
 [ 13   6   0 252   1   3]
 [ 11   1   0  17 182  13]
 [  1   4   0   1   8  52]]
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.95      0.95       581
           1       0.93      0.91      0.92       695
           2       0.75      0.75      0.75       159
           3       0.84      0.92      0.88       275
           4       0.92      0.81      0.86       224
           5       0.68      0.79      0.73        66

    accuracy                           0.90      2000
   macro avg       0.85      0.86      0.85      2000
weighted avg       0.90      0.90      0.90      2000

