# Importing the necessary libraries

In [1]:
import pandas as pd
import numpy as np
import plotly as px
import warnings
warnings.filterwarnings("ignore")

## Processing Training Data

In [9]:
# Set the file path for the training dataset
file_path = "/kaggle/input/emotions-dataset-for-nlp/train.txt"

# Initialize an empty list to hold the data
data = []

# Open the file in read mode and populate the data list
with open(file_path, 'r') as file:
    # Loop through each line in the file
    for line in file:
        # Strip leading/trailing whitespaces and split by ';'
        values = line.strip().split(';')
        
        # Append the parsed values to the data list
        data.append(values)

# Convert the list of data into a Pandas DataFrame
# Columns are named 'text' and 'emotion'
train_df = pd.DataFrame(data, columns=['text', 'emotion'])

# Display the DataFrame
train_df

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger
...,...,...
15995,i just had a very brief time in the beanbag an...,sadness
15996,i am now turning and i feel pathetic that i am...,sadness
15997,i feel strong and good overall,joy
15998,i feel like this was such a rude comment and i...,anger


In [4]:
# Drop duplicate rows from the training DataFrame
train_df = train_df.drop_duplicates()

# Drop rows with NaN (missing) values from the training DataFrame
train_df = train_df.dropna()

# Reset the index after dropping rows
train_df = train_df.reset_index(drop=True)
# Count the occurrences of different emotions in the training data
train_df['emotion'].value_counts()

joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: emotion, dtype: int64

## Processing Valid Data

In [10]:
# Define the path for the validation dataset located in Kaggle's directory
file_path = "/kaggle/input/emotions-dataset-for-nlp/val.txt"

# Initialize an empty list to temporarily hold our parsed data
data = []

# Open the validation file for reading
with open(file_path, 'r') as file:
    # Loop through each line in the validation dataset file
    for line in file:
        # Clean up the line and split it using ';' as a delimiter
        values = line.strip().split(';')
        
        # Add the parsed line (text and emotion) to our data list
        data.append(values)

# Convert the list into a DataFrame for easier manipulation later
# I'm setting the column names to 'text' and 'emotion' for clarity
valid_df = pd.DataFrame(data, columns=['text', 'emotion'])

# Display the DataFrame
valid_df

Unnamed: 0,text,emotion
0,im feeling quite sad and sorry for myself but ...,sadness
1,i feel like i am still looking at a blank canv...,sadness
2,i feel like a faithful servant,love
3,i am just feeling cranky and blue,anger
4,i can have for a treat or if i am feeling festive,joy
...,...,...
1995,im having ssa examination tomorrow in the morn...,sadness
1996,i constantly worry about their fight against n...,joy
1997,i feel its important to share this info for th...,joy
1998,i truly feel that if you are passionate enough...,joy


In [12]:
# Drop duplicate rows from the training DataFrame
valid_df = valid_df.drop_duplicates()

# Drop rows with NaN (missing) values from the training DataFrame
valid_df = valid_df.dropna()

# Reset the index after dropping rows
valid_df = valid_df.reset_index(drop=True)
# Count the occurrences of different emotions in the training data
valid_df['emotion'].value_counts()

joy         704
sadness     550
anger       275
fear        212
love        178
surprise     81
Name: emotion, dtype: int64

## Processing Test Data

In [11]:
# Specify the file path for the test dataset from Kaggle's directory
file_path = "/kaggle/input/emotions-dataset-for-nlp/test.txt"

# Create an empty list to collect the parsed lines from the test dataset
data = []

# Open the test dataset file in read mode
with open(file_path, 'r') as file:
    # Loop through each line in the file
    for line in file:
        # Remove any leading/trailing whitespaces and split by ';'
        values = line.strip().split(';')
        
        # Add the cleaned-up line to the data list
        data.append(values)

# Turn the list into a DataFrame for easier data manipulation
# Setting column names to 'text' and 'emotion' for clarity
test_df = pd.DataFrame(data, columns=['text', 'emotion'])

# Output the DataFrame
test_df

Unnamed: 0,text,emotion
0,im feeling rather rotten so im not very ambiti...,sadness
1,im updating my blog because i feel shitty,sadness
2,i never make her separate from me because i do...,sadness
3,i left with my bouquet of red and yellow tulip...,joy
4,i was feeling a little vain when i did this one,sadness
...,...,...
1995,i just keep feeling like someone is being unki...,anger
1996,im feeling a little cranky negative after this...,anger
1997,i feel that i am useful to my people and that ...,joy
1998,im feeling more comfortable with derby i feel ...,joy


In [13]:
# Drop duplicate rows from the training DataFrame
test_df = test_df.drop_duplicates()

# Drop rows with NaN (missing) values from the training DataFrame
test_df = test_df.dropna()

# Reset the index after dropping rows
test_df = test_df.reset_index(drop=True)
# Count the occurrences of different emotions in the training data
test_df['emotion'].value_counts()

joy         695
sadness     581
anger       275
fear        224
love        159
surprise     66
Name: emotion, dtype: int64

# Encoding the Labels

In [14]:
from sklearn.preprocessing import LabelEncoder

# Initialize the label encoder
label_encoder = LabelEncoder()

# Fit and transform the labels
encoded_labels_train = label_encoder.fit_transform(train_df['emotion'])

# Save the mapping between original labels and encoded labels
label_mapping = {original_label: int_label for original_label, int_label in zip(train_df['emotion'], encoded_labels_train)}
label_mapping

{'sadness': 4, 'anger': 0, 'love': 3, 'surprise': 5, 'fear': 1, 'joy': 2}

In [15]:
from sklearn.preprocessing import LabelEncoder

# Initialize the label encoder
label_encoder = LabelEncoder()

# Fit and transform the labels
encoded_labels_valid = label_encoder.fit_transform(valid_df['emotion'])

# Save the mapping between original labels and encoded labels
label_mapping = {original_label: int_label for original_label, int_label in zip(valid_df['emotion'], encoded_labels_valid)}
label_mapping

{'sadness': 4, 'love': 3, 'anger': 0, 'joy': 2, 'fear': 1, 'surprise': 5}

# Importing the Pre-trained Model (Roberta)

In [4]:
# Import the necessary classes from the transformers library
from transformers import RobertaTokenizer, RobertaForSequenceClassification

# Initialize a tokenizer using the 'roberta-base' pre-trained model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Initialize a model for sequence classification using the 'roberta-base' pre-trained weights
# Set the number of labels to 6 (according to our dataset)
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=6)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should pr

In [6]:
# Import necessary classes from PyTorch and Hugging Face transformers library
from torch.utils.data import Dataset, DataLoader

# Define a custom Dataset class for the emotion classification task
class EmotionDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Get text and label for the given index
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        # Tokenize and encode the text using the provided tokenizer
        encoding = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        # Return a dictionary containing tokenized data and label
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Initializing variables using training and validation data
train_texts, train_labels = train_df['text'], encoded_labels_train
val_texts, val_labels = valid_df['text'], encoded_labels_valid

# Create EmotionDataset instances for training and validation data
train_data = EmotionDataset(train_texts, train_labels, tokenizer, max_len=128)
val_data = EmotionDataset(val_texts, val_labels, tokenizer, max_len=128)

# Create DataLoader instances for training and validation data
# DataLoader helps manage batches and shuffling of data
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32)

In [18]:
# Import the AdamW optimizer class from the transformers library
from transformers import AdamW

# Initialize the AdamW optimizer with the parameters of the model
# Set the learning rate (lr) to 1e-5
optimizer = AdamW(model.parameters(), lr=1e-5)

## Fine Tuning the Model

In [19]:
import torch
from sklearn.metrics import accuracy_score

# Check if CUDA (GPU) is available, and move the model to the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop
for epoch in range(5):
    model.train()  # Set the model to training mode
    for batch in train_loader:
        optimizer.zero_grad()  # Clear gradients
        input_ids = batch['input_ids'].to(device)  # Move input to the device
        attention_mask = batch['attention_mask'].to(device)  # Move attention mask to the device
        labels = batch['labels'].to(device)  # Move labels to the device
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)  # Forward pass
        loss = outputs[0]  # Get the loss
        loss.backward()  # Backpropagation
        optimizer.step()  # Update model parameters using gradients

    # Validation loop
    model.eval()  # Set the model to evaluation mode
    val_loss = 0
    val_accuracy = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs[0]
            val_loss += loss.item()

            # Calculate accuracy
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=1)
            accuracy = accuracy_score(labels.cpu(), predictions.cpu())
            val_accuracy += accuracy

    val_loss /= len(val_loader)  # Calculate average validation loss
    val_accuracy /= len(val_loader)  # Calculate average validation accuracy
    print(f'Validation Loss: {val_loss}')
    print(f'Validation Accuracy: {val_accuracy}')

Validation Loss: 0.24647469664849933
Validation Accuracy: 0.9057539682539683
Validation Loss: 0.16709393162339453
Validation Accuracy: 0.9290674603174603
Validation Loss: 0.15009885878553467
Validation Accuracy: 0.9325396825396826
Validation Loss: 0.15384577797164048
Validation Accuracy: 0.935515873015873
Validation Loss: 0.13322170908075004
Validation Accuracy: 0.939484126984127


# Testing the Model

In [20]:
from torch.utils.data import DataLoader

# Transform the labels of the test data using the label encoder
test_enc_labels = label_encoder.transform(test_df['emotion'])

# Create a test dataset using the EmotionDataset class
test_dataset = EmotionDataset(test_df['text'], test_enc_labels, tokenizer, max_len=128)

# Create a DataLoader for the test dataset
# DataLoader helps manage batches and shuffling of data during testing
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [21]:
model.eval()  # Set the model to evaluation mode

# Lists to store predicted and true labels
all_predictions = []
all_true_labels = []

# Disable gradient calculation for evaluation
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Perform forward pass
        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1)

        # Extend the lists with predicted and true labels
        all_predictions.extend(predictions.cpu().numpy())
        all_true_labels.extend(labels.cpu().numpy())

# Convert encoded predictions back to original labels using the label encoder
predicted_labels = label_encoder.inverse_transform(all_predictions)
true_labels = label_encoder.inverse_transform(all_true_labels)

In [22]:
from sklearn.metrics import accuracy_score, classification_report

# Calculate accuracy using true_labels and predicted_labels
accuracy = accuracy_score(true_labels, predicted_labels)
print(f'Accuracy: {accuracy}')

# Generate a classification report using true_labels and predicted_labels
report = classification_report(true_labels, predicted_labels)
print(report)

Accuracy: 0.928
              precision    recall  f1-score   support

       anger       0.92      0.95      0.93       275
        fear       0.86      0.89      0.88       224
         joy       0.95      0.95      0.95       695
        love       0.84      0.88      0.86       159
     sadness       0.97      0.95      0.96       581
    surprise       0.77      0.61      0.68        66

    accuracy                           0.93      2000
   macro avg       0.88      0.87      0.88      2000
weighted avg       0.93      0.93      0.93      2000



In [23]:
# Loop through the first 10 samples in the test data along with their true and predicted labels
for text, true_label, predicted_label in zip(test_df['text'][:10], true_labels[:10], predicted_labels[:10]):
    print(f'Text: {text}')
    print(f'True Label: {true_label}')
    print(f'Predicted Label: {predicted_label}\n')

Text: im feeling rather rotten so im not very ambitious right now
True Label: sadness
Predicted Label: sadness

Text: im updating my blog because i feel shitty
True Label: sadness
Predicted Label: sadness

Text: i never make her separate from me because i don t ever want her to feel like i m ashamed with her
True Label: sadness
Predicted Label: sadness

Text: i left with my bouquet of red and yellow tulips under my arm feeling slightly more optimistic than when i arrived
True Label: joy
Predicted Label: joy

Text: i was feeling a little vain when i did this one
True Label: sadness
Predicted Label: sadness

Text: i cant walk into a shop anywhere where i do not feel uncomfortable
True Label: fear
Predicted Label: fear

Text: i felt anger when at the end of a telephone call
True Label: anger
Predicted Label: anger

Text: i explain why i clung to a relationship with a boy who was in many ways immature and uncommitted despite the excitement i should have been feeling for getting accepted in

In [27]:
torch.save(model, 'emotion_model.pth')