<a href="https://colab.research.google.com/github/eshikaalam/XLMRoBERTaBiGRU/blob/main/XLMRoBERTaBiGRU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# Mount Google Drive at /content/drive; the dataset and checkpoint paths used
# later in this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers -q
!pip install datasets -q

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
import torch.nn.functional as F
import random
import os


In [None]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report which device was picked (GPU name is looked up for device index 0).
if device.type == "cuda":
    print('Using GPU:', torch.cuda.get_device_name(0))
else:
    print('Using CPU')

Using GPU: Tesla T4


In [None]:
# Emotion label columns that are read from the CSV and cast to float below.
emotion_columns = ['anger', 'contempt', 'disgust', 'enjoyment', 'fear', 'sadness', 'surprise']

# Load the CSV, then in one chain: drop the leftover index column (if present),
# drop rows with any missing value, rename the text column, and cast the label
# columns to float.
df = (
    pd.read_csv("/content/drive/MyDrive/dataset/MONOVAB.csv")
    .drop(columns=['Unnamed: 0'], errors='ignore')
    .dropna()
    .rename(columns={"comment": "text"})
    .astype({label: float for label in emotion_columns})
)


In [None]:
def preprocess(df, threshold=0.35, columns=None):
    """Turn raw per-emotion scores into binary (0/1) labels.

    Each selected column is divided by its own maximum and then set to 1
    where the scaled value is >= ``threshold`` and 0 otherwise.

    Args:
        df: DataFrame containing the label columns.
        threshold: Cut-off applied to the max-scaled scores.
        columns: Label columns to binarize. Defaults to the notebook-level
            ``emotion_columns`` list.

    Returns:
        A new DataFrame with the selected columns replaced by integer 0/1
        labels. The input frame is left untouched, so re-running the calling
        cell always starts from the same data.
    """
    if columns is None:
        columns = emotion_columns

    result = df.copy()
    for col in columns:
        scaled = result[col] / result[col].max()
        # Comparisons with NaN (e.g. 0/0 for an all-zero column) are False,
        # so such entries become 0 — same outcome as an element-wise check.
        result[col] = (scaled >= threshold).astype(int)
    return result

# Replace the raw emotion scores with 0/1 labels; `df` is rebound to the frame
# returned by preprocess().
df = preprocess(df)


In [None]:
# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Texts as plain Python lists, labels as lists of per-emotion rows.
X_train = train['text'].tolist()
X_test = test['text'].tolist()
y_train = train[emotion_columns].values.tolist()
y_test = test[emotion_columns].values.tolist()


In [None]:
# Tokenizer for the same "xlm-roberta-base" checkpoint that the classifier's
# encoder is loaded from.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [None]:
class TextDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its label vector.

    Args:
        tokenizer: Callable tokenizer (such as the object returned by
            ``AutoTokenizer.from_pretrained``) whose result exposes
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        texts: Indexable collection of input texts.
        labels: Indexable collection of per-sample label vectors, aligned
            with ``texts`` by position.
        max_length: Every item is truncated / padded to this many tokens.
    """

    def __init__(self, tokenizer, texts, labels, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of samples (taken from the label collection)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and float32 ``labels`` for one sample."""
        text = self.texts[idx]
        label = self.labels[idx]

        # Call the tokenizer directly (same style as the inference-time
        # dataset later in this notebook) instead of the legacy encode_plus.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )

        # return_tensors='pt' yields a leading batch dimension of size 1.
        # Remove only that dimension: a bare squeeze() would also collapse the
        # token dimension when max_length == 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.float32)
        }


In [None]:
batch_size = 32

# Wrap each split in a TextDataset (default max_length).
train_dataset = TextDataset(tokenizer=tokenizer, texts=X_train, labels=y_train)
test_dataset = TextDataset(tokenizer=tokenizer, texts=X_test, labels=y_test)

# Only the training batches are shuffled; the test loader keeps dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size)


In [None]:
# Model Definition (XLM-RoBERTa + BiGRU)
class XLMRobertaGRUClassifier(nn.Module):
    """Frozen XLM-RoBERTa encoder followed by a bidirectional GRU and a linear head.

    Args:
        num_classes: Number of output logits produced per input sequence.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.xlmroberta = AutoModel.from_pretrained("xlm-roberta-base")
        # forward() runs the encoder under torch.no_grad(), so its weights can
        # never receive gradients. Mark them as frozen explicitly (as the
        # inference-time definition of this class does) so that only the GRU
        # and the linear head count as trainable parameters.
        for param in self.xlmroberta.parameters():
            param.requires_grad = False

        hidden_size = self.xlmroberta.config.hidden_size
        self.gru = nn.GRU(input_size=hidden_size,
                          hidden_size=hidden_size,
                          num_layers=1,
                          batch_first=True,
                          bidirectional=True)
        self.dropout = nn.Dropout(0.2)
        # The bidirectional GRU concatenates both directions: 2 * hidden_size features.
        self.linear = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return raw logits of shape (batch, num_classes); no sigmoid is applied here."""
        with torch.no_grad():  # Freeze XLM-R for stability
            outputs = self.xlmroberta(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        hidden_states = outputs.last_hidden_state
        gru_output, _ = self.gru(hidden_states)
        # NOTE(review): index -1 is the last position of the *padded* sequence.
        # The dataset pads every text to max_length and the GRU is not given
        # attention_mask, so for short inputs this position is padding. Changing
        # the pooling would alter what a saved checkpoint expects at inference
        # time, so it is flagged here rather than changed.
        output = self.linear(self.dropout(gru_output[:, -1, :]))
        return output


In [None]:
# One output logit per emotion label.
num_classes = len(emotion_columns)

# Build the classifier and place it on the GPU when available, else the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = XLMRobertaGRUClassifier(num_classes)
model = model.to(device)

# Multi-label setup: the loss takes raw logits and applies the sigmoid itself.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

In [None]:
from sklearn.metrics import f1_score, precision_score, accuracy_score, hamming_loss
import numpy as np
import torch
import torch.nn.functional as F

# Evaluation Function
def compute_metrics(y_true, y_pred):
    """Compute the multi-label evaluation metrics reported by this notebook.

    Args:
        y_true: Ground-truth binary label matrix.
        y_pred: Predicted binary label matrix of the same shape.

    Returns:
        Tuple ``(micro_f1, macro_f1, micro_precision, macro_precision,
        subset_accuracy, hamming_accuracy)``. F1 and precision use
        ``zero_division=0``; subset accuracy is the exact-match rate and
        Hamming accuracy is ``1 - hamming_loss``.
    """
    averages = ('micro', 'macro')
    f1_by_avg = {
        avg: f1_score(y_true, y_pred, average=avg, zero_division=0)
        for avg in averages
    }
    precision_by_avg = {
        avg: precision_score(y_true, y_pred, average=avg, zero_division=0)
        for avg in averages
    }
    exact_match = accuracy_score(y_true, y_pred)       # all labels of a row must match
    per_label_acc = 1 - hamming_loss(y_true, y_pred)   # fraction of individual labels correct
    return (
        f1_by_avg['micro'],
        f1_by_avg['macro'],
        precision_by_avg['micro'],
        precision_by_avg['macro'],
        exact_match,
        per_label_acc,
    )

# Training Function with integrated validation
def train_model(model, train_loader, val_loader, optimizer, criterion, device, num_epochs=5):
    """Train ``model`` and evaluate it on ``val_loader`` after every epoch.

    Every batch must be a dict with ``'input_ids'``, ``'attention_mask'`` and
    ``'labels'`` entries. Validation predictions are obtained by applying a
    sigmoid to the model outputs and thresholding at 0.5. Losses and metrics
    are printed once per epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        train_loader: Iterable of training batches (must support ``len``).
        val_loader: Iterable of validation batches (must support ``len``).
        optimizer: Optimizer stepping the model's parameters.
        criterion: Loss called as ``criterion(outputs, labels)``.
        device: Device the model and batches are moved to.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        list[dict]: One entry per epoch with the keys ``epoch``,
        ``train_loss``, ``val_loss``, ``subset_accuracy``,
        ``hamming_accuracy``, ``micro_f1``, ``macro_f1``,
        ``micro_precision`` and ``macro_precision``.

    Raises:
        ValueError: If either loader contains no batches (the per-epoch
            averages would otherwise divide by zero).
    """
    if len(train_loader) == 0 or len(val_loader) == 0:
        raise ValueError("train_loader and val_loader must each contain at least one batch")

    model.to(device)
    history = []

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0

        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device).float()

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)

        # Validation phase
        model.eval()
        val_loss = 0.0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device).float()

                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs, labels)
                val_loss += loss.item()

                # Independent per-label decision: sigmoid, then 0.5 cut-off.
                probs = torch.sigmoid(outputs)
                preds = (probs >= 0.5).float()

                all_preds.append(preds.cpu())
                all_labels.append(labels.cpu())

        avg_val_loss = val_loss / len(val_loader)

        # Convert to numpy arrays
        all_preds = torch.cat(all_preds).numpy()
        all_labels = torch.cat(all_labels).numpy()

        # Compute metrics
        micro_f1, macro_f1, micro_precision, macro_precision, subset_acc, hamming_acc = compute_metrics(all_labels, all_preds)

        print(f"\nEpoch {epoch+1}/{num_epochs}")
        print(f"Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")
        print(f"Subset Accuracy: {subset_acc:.4f}")
        print(f"Hamming Accuracy: {hamming_acc:.4f}")
        print(f"Micro F1: {micro_f1:.4f} | Macro F1: {macro_f1:.4f}")
        print(f"Micro Precision: {micro_precision:.4f} | Macro Precision: {macro_precision:.4f}")

        # Keep the numbers as well as printing them, so callers can plot or
        # compare epochs afterwards.
        history.append({
            'epoch': epoch + 1,
            'train_loss': avg_train_loss,
            'val_loss': avg_val_loss,
            'subset_accuracy': subset_acc,
            'hamming_accuracy': hamming_acc,
            'micro_f1': micro_f1,
            'macro_f1': macro_f1,
            'micro_precision': micro_precision,
            'macro_precision': macro_precision,
        })

    return history


In [None]:
epochs = 5

# Run training through train_model(), which performs the train/validate loop
# and prints the per-epoch losses and metrics itself. No `train_epoch` or
# `eval_epoch` helper is defined in this notebook, so the loop must not
# depend on them.
# NOTE: the held-out test split is used as the validation set here.
# The return value is bound to a name so the cell does not echo it as output.
history = train_model(
    model,
    train_loader,
    test_loader,
    optimizer,
    criterion,
    device,
    num_epochs=epochs,
)



Epoch 1/5
Train Loss: 0.3513 | Val Loss: 0.3561
Subset Accuracy: 0.1144
Hamming Accuracy: 0.8398
Micro F1: 0.2198 | Macro F1: 0.1230
Micro Precision: 0.7115 | Macro Precision: 0.2071

Epoch 2/5
Train Loss: 0.3505 | Val Loss: 0.3595
Subset Accuracy: 0.1037
Hamming Accuracy: 0.8375
Micro F1: 0.1856 | Macro F1: 0.1126
Micro Precision: 0.7143 | Macro Precision: 0.2065

Epoch 3/5
Train Loss: 0.3484 | Val Loss: 0.3516
Subset Accuracy: 0.1374
Hamming Accuracy: 0.8412
Micro F1: 0.2565 | Macro F1: 0.1371
Micro Precision: 0.6853 | Macro Precision: 0.2003

Epoch 4/5
Train Loss: 0.3465 | Val Loss: 0.3507
Subset Accuracy: 0.1550
Hamming Accuracy: 0.8422
Micro F1: 0.2817 | Macro F1: 0.1440
Micro Precision: 0.6712 | Macro Precision: 0.2016

Epoch 5/5
Train Loss: 0.3453 | Val Loss: 0.3538
Subset Accuracy: 0.1487
Hamming Accuracy: 0.8414
Micro F1: 0.2629 | Macro F1: 0.1417
Micro Precision: 0.6795 | Macro Precision: 0.1951


In [9]:
# Persist only the weights: the state_dict is stored in a dict under the
# 'model_state_dict' key, which is the key the loading cell reads back.
torch.save(
    obj={'model_state_dict': model.state_dict()},
    f="/content/drive/MyDrive/dataset/model.pt",
)
print("Model saved as model.pt")


Model saved as model.pt


In [17]:
# Testing: reload the saved model and generate predictions for the test split
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from transformers import AutoTokenizer, AutoModel



In [18]:
# Define emotion labels. The list length sets the number of model outputs, and
# its order is reused as the column order of the saved predictions.
emotion_columns = ['anger', 'contempt', 'disgust', 'enjoyment', 'fear', 'sadness', 'surprise']
num_classes = len(emotion_columns)

# Load tokenizer (same "xlm-roberta-base" checkpoint as the model's encoder)
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")


In [19]:
class TextDataset(Dataset):
    """Tokenize-on-access dataset; labels are optional.

    Args:
        tokenizer: Callable tokenizer returning a mapping of tensors that
            each carry a leading batch dimension of size 1.
        texts: Indexable collection of input texts; defines the length.
        labels: Optional indexable collection of label vectors. When given,
            each item also carries a float ``'labels'`` tensor.
        max_length: Every item is truncated / padded to this many tokens.
    """

    def __init__(self, tokenizer, texts, labels=None, max_length=128):
        self.tokenizer, self.texts = tokenizer, texts
        self.labels, self.max_length = labels, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        # Strip the size-1 batch dimension from every tensor the tokenizer returned.
        sample = {}
        for field, tensor in encoded.items():
            sample[field] = tensor.squeeze(0)

        # Unlabelled data: nothing more to attach.
        if self.labels is None:
            return sample

        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample


In [20]:
class XLMRobertaGRUClassifier(nn.Module):
    """Frozen XLM-RoBERTa encoder, bidirectional GRU, dropout and a linear head.

    Args:
        num_classes: Number of logits produced per input sequence.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.xlmroberta = AutoModel.from_pretrained("xlm-roberta-base")
        # Freeze RoBERTa: no encoder parameter requires gradients.
        self.xlmroberta.requires_grad_(False)

        width = self.xlmroberta.config.hidden_size
        self.gru = nn.GRU(
            width,
            width,
            num_layers=1,
            batch_first=True,
            bidirectional=True
        )
        self.dropout = nn.Dropout(0.3)
        # Forward and backward GRU features are concatenated, hence 2 * width.
        self.linear = nn.Linear(2 * width, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return logits of shape (batch, num_classes)."""
        with torch.no_grad():
            encoded = self.xlmroberta(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        sequence_features, _ = self.gru(encoded.last_hidden_state)
        # Last timestep of the GRU output is used as the sequence summary.
        last_step = sequence_features[:, -1, :]
        return self.linear(self.dropout(last_step))


In [21]:
# Load model from Drive
device = torch.device("cpu")

# Read the checkpoint onto the CPU, then build the network and restore the
# weights stored under 'model_state_dict'.
checkpoint = torch.load("/content/drive/MyDrive/dataset/model.pt", map_location=device)
model = XLMRobertaGRUClassifier(num_classes)
model.load_state_dict(checkpoint['model_state_dict'])
model = model.to(device)

# Kept as the cell's last expression: switches to inference mode and echoes
# the module structure as the cell output.
model.eval()


XLMRobertaGRUClassifier(
  (xlmroberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load full dataset
df = pd.read_csv("/content/drive/MyDrive/dataset/MONOVAB.csv")

# Clean and preprocess (same as training)
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.dropna(inplace=True)
df.rename(columns={"comment": "text"}, inplace=True)

# Define emotion columns
emotion_columns = ['anger', 'contempt', 'disgust', 'enjoyment', 'fear', 'sadness', 'surprise']
df[emotion_columns] = df[emotion_columns].astype(float)

# Binarize the labels the same way the training section does: scale every
# emotion column by its maximum over the full dataset, then threshold at 0.35.
# Without this step y_train / y_test would hold raw scores rather than the 0/1
# targets the model was trained against.
label_threshold = 0.35
scaled_scores = df[emotion_columns] / df[emotion_columns].max()
df[emotion_columns] = (scaled_scores >= label_threshold).astype(int)

# Split into train and test (same test_size and random_state as training, so
# the same rows end up in the test split)
texts = df['text'].tolist()
labels = df[emotion_columns].values
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Create test dataset and loader
test_dataset = TextDataset(tokenizer, X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=32)


In [24]:
import torch

# Use the GPU when available; put the model there and in inference mode.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
model.eval()

# Collect one probability vector (sigmoid of the logits) per test sample.
all_predictions = []

with torch.no_grad():
    for batch in test_loader:
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )
        # Move to CPU / NumPy; extend() appends the batch row by row.
        predictions = torch.sigmoid(outputs).cpu().numpy()
        all_predictions.extend(predictions)


  return forward_call(*args, **kwargs)


In [25]:
# One row per test sample, one sigmoid-probability column per emotion.
pred_df = pd.DataFrame(data=all_predictions, columns=emotion_columns)

# Write the table to Drive without the index column.
pred_df.to_csv(
    path_or_buf="/content/drive/MyDrive/dataset/emotion_predictions.csv",
    index=False,
)
print("Predictions saved to: /content/drive/MyDrive/dataset/emotion_predictions.csv")


Predictions saved to: /content/drive/MyDrive/dataset/emotion_predictions.csv
