In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset
from sklearn.model_selection import train_test_split
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
import random
import os

df_main = pd.read_csv("/content/eng.csv")  # just as example

# train-test split (for simplicity we won't do stratify here for multi-label)
# NOTE: train_df and val_df are reassigned from dedicated CSV files in the next cells,
# so this random split only serves as an example.
train_df, val_df = train_test_split(df_main, test_size=0.1, random_state=42)

In [2]:
# Load the training CSV; this rebinds the train_df created by the earlier random split.
train_df = pd.read_csv("/kaggle/input/datasetsemi/eng.csv")
train_df.head()

Unnamed: 0,id,text,anger,fear,joy,sadness,surprise
0,eng_train_track_a_00001,"Colorado, middle of nowhere.",0,1,0,0,1
1,eng_train_track_a_00002,This involved swimming a pretty large lake tha...,0,1,0,0,0
2,eng_train_track_a_00003,It was one of my most shameful experiences.,0,1,0,1,0
3,eng_train_track_a_00004,"After all, I had vegetables coming out my ears...",0,0,0,0,0
4,eng_train_track_a_00005,Then the screaming started.,0,1,0,1,1


In [3]:
# Load the dev CSV; this rebinds the val_df created by the earlier random split.
val_df = pd.read_csv("/kaggle/input/datasetsemi/engdev.csv")
val_df.head()

Unnamed: 0,id,text,anger,fear,joy,sadness,surprise
0,eng_dev_track_a_00001,Older sister (23 at the time) is a Scumbag Stacy.,1,0,0,0,0
1,eng_dev_track_a_00002,"And I laughed like this: garhahagar, because m...",0,1,0,0,0
2,eng_dev_track_a_00003,It overflowed and brown shitty diarrhea water ...,1,1,0,1,1
3,eng_dev_track_a_00004,Its very dark and foggy.,0,1,0,0,0
4,eng_dev_track_a_00005,"Then she tried to, like, have sex with/strangl...",1,1,0,0,1


In [4]:
# Load the test CSV; the preview below shows the label columns without values.
test_df = pd.read_csv("/kaggle/input/datasetsemi/engtest.csv")
test_df.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id,text,anger,fear,joy,sadness,surprise
0,eng_test_track_a_00001,/ o \ So today I went in for a new exam with D...,,,,,
1,eng_test_track_a_00002,The image I have in my mind is this: a group o...,,,,,
2,eng_test_track_a_00003,"I slammed my fist against the door and yelled,...",,,,,
3,eng_test_track_a_00004,I could not unbend my knees.,,,,,
4,eng_test_track_a_00005,"I spent the night at the hotel, mostly hanging...",,,,,


# NOTE(review): this snippet relies on extract_text_embeddings, prepare_text_embeddings,
# text_model, text_tokenizer, max_length and batch_size, which are only defined in later
# cells; the same statements are repeated further down once those names exist.
test_text_embeddings = extract_text_embeddings(
    df=test_df,
    save_path="test_text_embeddings.pt",  # or another path
    model=text_model,
    tokenizer=text_tokenizer,
    max_length=max_length
)

# We do NOT have labels in the test set, so pass has_labels=False
X_test = prepare_text_embeddings(
    text_embeddings=test_text_embeddings,
    df=test_df,
    has_labels=False
)

# Create a TensorDataset from X_test
test_dataset = TensorDataset(X_test)

# Create a DataLoader from that dataset
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [5]:
#df_main.head()

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
seed = 42
# Seed every RNG the notebook draws from, not just torch: Python's `random` picks the
# learning rate further down, and NumPy is seeded as well so that any NumPy-based
# randomness is covered. Without this the logged `seed` would not reproduce a run.
random.seed(seed)
np.random.seed(seed)
# Kept as the last expression so the cell still displays the torch Generator.
torch.manual_seed(seed)

<torch._C.Generator at 0x7fdf5d5594b0>

In [7]:
# Encoder checkpoint plus the tokenization length and DataLoader batch size used below.
model_name = "j-hartmann/emotion-english-distilroberta-base"  # or any model of your choice
batch_size = 8
max_length = 256  # or 512, etc.

In [8]:
# One output per label column used below: anger, fear, joy, sadness, surprise.
num_labels = 5

In [9]:
# Load the tokenizer and the bare encoder (AutoModel); only its last_hidden_state is
# consumed later, when extract_text_embeddings takes the first-token vector.
text_tokenizer = AutoTokenizer.from_pretrained(model_name)
text_model = AutoModel.from_pretrained(model_name).to(device)

tokenizer_config.json:   0%|          | 0.00/294 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at j-hartmann/emotion-english-distilroberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
class TextDataset(Dataset):
    """
    Tokenizes one DataFrame row per item.

    Each item is either ``(inputs, labels)`` (``is_train=True``) or just ``inputs``:
      1) ``inputs``: the tokenizer output for the row's "text" column, with the
         leading batch dimension of every tensor removed via ``squeeze(0)``
      2) ``labels``: float tensor of shape (len(label_cols),); by default
         shape (5,) for [anger, fear, joy, sadness, surprise]

    Args:
        df: DataFrame with a "text" column and, when ``is_train`` is True,
            one column per entry of ``label_cols``.
        tokenizer: callable invoked as
            ``tokenizer(text, padding=..., truncation=..., max_length=..., return_tensors="pt")``.
        max_length: padding/truncation length forwarded to the tokenizer.
        is_train: when True (train or validation data) items also carry labels.
        label_cols: label column names, in output order. Defaults to
            ``DEFAULT_LABEL_COLS``.
    """

    DEFAULT_LABEL_COLS = ("anger", "fear", "joy", "sadness", "surprise")

    def __init__(self, df, tokenizer, max_length, is_train=True, label_cols=None):
        # Positional access via iloc in __getitem__ needs a clean 0..n-1 index.
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_train = is_train
        self.label_cols = list(label_cols) if label_cols is not None else list(self.DEFAULT_LABEL_COLS)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        # Missing / non-string text (e.g. NaN) is tokenized as an empty string.
        text = row["text"] if isinstance(row["text"], str) else ""

        inputs = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        # The tokenizer returns tensors with a leading batch dimension of 1; drop it
        # so that batching can add the batch dimension back.
        for key in list(inputs.keys()):
            inputs[key] = inputs[key].squeeze(0)

        if not self.is_train:
            return inputs

        # float dtype because BCEWithLogitsLoss expects float targets
        labels = torch.tensor([row[col] for col in self.label_cols], dtype=torch.float)
        return inputs, labels


In [11]:
# 5. Extract embeddings function
# ------------------
def extract_text_embeddings(df, save_path, model, tokenizer, max_length=128):
    """
    Convert text in df into first-token ([CLS]) embeddings from a transformer.
    This is optional if you want an MLP on top.
    Or you can do end-to-end fine-tuning.

    Args:
        df: DataFrame with a "text" column; non-string entries are embedded as "".
        save_path: file the embeddings are cached in (written with torch.save).
        model: transformer whose output exposes ``last_hidden_state``.
        tokenizer: tokenizer matching ``model``.
        max_length: padding/truncation length passed to the tokenizer.

    Returns:
        dict mapping each df index label to a CPU tensor of shape (1, hidden_size).
    """

    if os.path.exists(save_path):
        print(f"Embeddings already exist at {save_path}")
        # NOTE: the cache is keyed on the path only; delete the file if df, the model
        # or max_length changed, otherwise stale embeddings are returned.
        # weights_only=True restricts unpickling to tensors and plain containers, which
        # is all this cache holds; it also avoids the FutureWarning of a bare torch.load.
        return torch.load(save_path, weights_only=True)

    model.eval()
    # Run on whatever device the model already lives on instead of relying on a
    # notebook-global `device` that may not match.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")
    embeddings = {}

    with torch.no_grad():
        for idx, row in tqdm(df.iterrows(), desc="Extracting embeddings", total=len(df)):
            text = row["text"] if isinstance(row["text"], str) else ""
            inputs = tokenizer(
                text,
                padding="max_length",
                truncation=True,
                max_length=max_length,
                return_tensors="pt"
            )
            inputs = {k: v.to(model_device) for k, v in inputs.items()}
            outputs = model(**inputs)
            cls_embedding = outputs.last_hidden_state[:, 0, :]  # first-token ([CLS]) embedding
            # Stored on CPU so the cache can be loaded without a GPU.
            embeddings[idx] = cls_embedding.cpu()

    torch.save(embeddings, save_path)
    return embeddings


In [12]:
# (Optional) Actually do it for train, val
# Embed the train and dev texts; each call caches its result in the given .pt file.
train_text_embeddings = extract_text_embeddings(
    train_df, "train_text_embeddings.pt", text_model, text_tokenizer, max_length
)
val_text_embeddings = extract_text_embeddings(
    val_df, "val_text_embeddings.pt", text_model, text_tokenizer, max_length
)

Extracting embeddings: 100%|██████████| 2768/2768 [00:22<00:00, 121.29it/s]
Extracting embeddings: 100%|██████████| 116/116 [00:00<00:00, 123.33it/s]


In [13]:
# Embed the test texts with the same encoder, tokenizer and max_length as train/dev.
test_text_embeddings = extract_text_embeddings(
    df=test_df,
    save_path="test_text_embeddings.pt",  # or another path
    model=text_model,
    tokenizer=text_tokenizer,
    max_length=max_length
)

Extracting embeddings: 100%|██████████| 2767/2767 [00:23<00:00, 118.22it/s]


In [14]:
# 6. Convert embeddings + labels into Tensors
# ------------------
def prepare_text_embeddings(text_embeddings, df, has_labels=True, label_cols=None):
    """
    Stack per-row embeddings into one tensor and, optionally, the label matrix.

    Args:
        text_embeddings: mapping from df index label to a tensor whose leading
            dimension of size 1 is squeezed away (as produced by extract_text_embeddings).
        df: DataFrame whose index selects and orders the embeddings. Rows without
            an entry in ``text_embeddings`` are skipped, so the result can have
            fewer rows than ``df``.
        has_labels: when True, also return the labels read from ``label_cols``.
        label_cols: label column names, in output order. Defaults to
            [anger, fear, joy, sadness, surprise] (5 labels).

    Returns:
        ``(X, y)`` if ``has_labels`` else ``X``; X has one row per kept df row and
        y is a float tensor of shape (rows, len(label_cols)).

    Raises:
        ValueError: if no row of ``df`` has a matching embedding.
    """
    if label_cols is None:
        label_cols = ["anger", "fear", "joy", "sadness", "surprise"]

    combined_embeddings = []
    combined_labels = []

    if has_labels:
        for idx, row in df.iterrows():
            if idx not in text_embeddings:
                continue
            combined_embeddings.append(text_embeddings[idx].squeeze(0))
            combined_labels.append([row[col] for col in label_cols])
    else:
        # Without labels only the index is needed; skip building a Series per row.
        for idx in df.index:
            if idx not in text_embeddings:
                continue
            combined_embeddings.append(text_embeddings[idx].squeeze(0))

    if not combined_embeddings:
        # torch.stack would fail on an empty list with a much less helpful message.
        raise ValueError("No row of df has a matching entry in text_embeddings")

    X = torch.stack(combined_embeddings)
    if has_labels:
        y = torch.tensor(combined_labels, dtype=torch.float)
        return X, y
    else:
        return X

# Build the (embedding, label) tensors for train and dev, then print their shapes as a check.
X_train, y_train = prepare_text_embeddings(train_text_embeddings, train_df, has_labels=True)
X_val, y_val = prepare_text_embeddings(val_text_embeddings, val_df, has_labels=True)

print("Shapes:", X_train.shape, y_train.shape, "|", X_val.shape, y_val.shape)


Shapes: torch.Size([2768, 768]) torch.Size([2768, 5]) | torch.Size([116, 768]) torch.Size([116, 5])


In [15]:
# 7. Define a Multi-label MLP model
# ------------------
class MLPModel(nn.Module):
    """
    Feed-forward classifier with two hidden layers (ReLU + dropout after each).

    ``hidden_dim`` is indexed at positions 0 and 1 for the two hidden widths.
    The forward pass returns raw logits: no sigmoid is applied here, because the
    notebook pairs the model with BCEWithLogitsLoss, which applies it internally.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout_p=0.5):
        super().__init__()
        first_width = hidden_dim[0]
        second_width = hidden_dim[1]

        # Attribute names are part of the saved state_dict keys; keep them stable.
        self.fc1 = nn.Linear(input_dim, first_width)
        self.relu = nn.ReLU()
        self.dropout1 = nn.Dropout(p=dropout_p)
        self.fc2 = nn.Linear(first_width, second_width)
        self.dropout2 = nn.Dropout(p=dropout_p)
        self.fc3 = nn.Linear(second_width, output_dim)

    def forward(self, x):
        hidden = self.dropout1(self.relu(self.fc1(x)))
        hidden = self.dropout2(self.relu(self.fc2(hidden)))
        return self.fc3(hidden)


In [16]:
# 8. Prepare Data Loaders
# ------------------
# Wrap the precomputed embeddings and labels; only the training loader shuffles.
train_dataset = TensorDataset(X_train, y_train)
val_dataset   = TensorDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_dataset,   batch_size=batch_size, shuffle=False)


In [17]:
# Same call as the earlier test-embedding cell; the recorded output below shows it
# returning the cached file instead of recomputing.
test_text_embeddings = extract_text_embeddings(
    df=test_df,
    save_path="test_text_embeddings.pt",  # or another path
    model=text_model,
    tokenizer=text_tokenizer,
    max_length=max_length
)

Embeddings already exist at test_text_embeddings.pt


  return torch.load(save_path)


In [18]:
# We do NOT have labels in the test set, so pass has_labels=False
X_test = prepare_text_embeddings(
    text_embeddings=test_text_embeddings,
    df=test_df,
    has_labels=False
)

# Create a TensorDataset from X_test (each item is a 1-tuple holding one embedding)
test_dataset = TensorDataset(X_test)

# Create a DataLoader from that dataset; shuffle=False keeps predictions in test_df row order
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [19]:
# 9. Training Setup
# ------------------
input_dim   = X_train.shape[1]  # e.g. 768 if your transformer is a base model
hidden_dim  = [1024, 512]       # can adjust
output_dim  = num_labels        # 5 for [anger, fear, joy, sadness, surprise]
dropout_p   = 0.3
num_epochs  = 50
learning_rate_options = [0.0005, 0.001, 0.002]
# Drawn with Python's `random` module, so the value depends on that module's RNG state.
learning_rate = random.choice(learning_rate_options)


In [20]:
# Classifier head trained on the precomputed embeddings.
model = MLPModel(input_dim, hidden_dim, output_dim, dropout_p).to(device)
# For multi-label classification => BCEWithLogitsLoss
criterion = nn.BCEWithLogitsLoss()
# Only the MLP's parameters are handed to the optimizer.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [21]:
# 10. Training Loop
# ------------------
def train_one_epoch(model, loader, optimizer, criterion):
    """
    Run one optimization pass over ``loader``.

    Args:
        model: module mapping a feature batch to logits of shape (batch, num_labels).
        loader: sized iterable of ``(X_batch, y_batch)`` pairs.
        optimizer: optimizer over ``model``'s parameters.
        criterion: loss taking ``(logits, targets)``, e.g. BCEWithLogitsLoss.

    Returns:
        ``(avg_loss, all_preds, all_labels)``: the mean of the per-batch losses and
        the 0/1 predictions (sigmoid >= 0.5) and targets of every sample, on CPU.

    Raises:
        ValueError: if ``loader`` yields no batches.
    """
    if len(loader) == 0:
        raise ValueError("loader is empty; nothing to train on")

    model.train()
    # Follow the model's own device rather than a notebook-global `device`.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    total_loss = 0
    all_preds = []
    all_labels = []

    for X_batch, y_batch in loader:
        X_batch = X_batch.to(run_device)
        y_batch = y_batch.to(run_device)  # shape (batch, num_labels)

        optimizer.zero_grad()
        logits = model(X_batch)  # shape (batch, num_labels)
        loss = criterion(logits, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # logits -> probabilities -> hard 0/1 decisions at a 0.5 threshold
        preds = (torch.sigmoid(logits) >= 0.5).float()
        all_preds.append(preds.detach().cpu())
        all_labels.append(y_batch.detach().cpu())

    # Mean over batches (a smaller final batch weighs the same as a full one).
    avg_loss = total_loss / len(loader)
    all_preds  = torch.cat(all_preds, dim=0)
    all_labels = torch.cat(all_labels, dim=0)
    return avg_loss, all_preds, all_labels

In [22]:
def evaluate(model, loader, criterion):
    """
    Score ``model`` on ``loader`` without updating its weights.

    Args:
        model: module mapping a feature batch to logits of shape (batch, num_labels).
        loader: sized iterable of ``(X_batch, y_batch)`` pairs.
        criterion: loss taking ``(logits, targets)``, e.g. BCEWithLogitsLoss.

    Returns:
        ``(avg_loss, all_preds, all_labels)``: the mean of the per-batch losses and
        the 0/1 predictions (sigmoid >= 0.5) and targets of every sample, on CPU.

    Raises:
        ValueError: if ``loader`` yields no batches.
    """
    if len(loader) == 0:
        raise ValueError("loader is empty; nothing to evaluate")

    model.eval()
    # Follow the model's own device rather than a notebook-global `device`.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for X_batch, y_batch in loader:
            X_batch = X_batch.to(run_device)
            y_batch = y_batch.to(run_device)

            logits = model(X_batch)
            loss = criterion(logits, y_batch)
            total_loss += loss.item()

            # logits -> probabilities -> hard 0/1 decisions at a 0.5 threshold
            preds = (torch.sigmoid(logits) >= 0.5).float()
            all_preds.append(preds.detach().cpu())
            all_labels.append(y_batch.detach().cpu())

    # Mean over batches (a smaller final batch weighs the same as a full one).
    avg_loss = total_loss / len(loader)
    all_preds  = torch.cat(all_preds, dim=0)
    all_labels = torch.cat(all_labels, dim=0)
    return avg_loss, all_preds, all_labels


In [23]:

from sklearn.metrics import precision_recall_fscore_support

def calculate_metrics(preds, labels):
    """
    Multi-label metrics for hard 0/1 predictions.

    Args:
        preds, labels: tensors of shape (num_samples, num_labels) holding 0/1 values.

    Returns:
        ``(subset_accuracy, precision, recall, f1)`` where
          * subset_accuracy is the fraction of samples whose whole label row matches
          * precision / recall / f1 are macro-averaged over the label columns,
            with undefined ratios counted as 0 (``zero_division=0``)
    """
    # scikit-learn works on NumPy arrays, so bring both tensors to host memory first.
    y_pred = preds.cpu().numpy()
    y_true = labels.cpu().numpy()

    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true,
        y_pred,
        average="macro",
        zero_division=0
    )

    # A sample only counts as correct when every one of its labels is right.
    subset_accuracy = (y_pred == y_true).all(axis=1).mean()
    return subset_accuracy, precision, recall, f1

In [24]:
def train_and_save_best_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, save_dir):
    """
    Train for ``num_epochs`` epochs and checkpoint every new best validation macro-F1.

    Each epoch runs ``train_one_epoch`` and ``evaluate`` (defined in earlier cells),
    prints the metrics of both phases, and writes ``model.state_dict()`` into
    ``save_dir`` whenever the validation F1 beats the best seen so far.
    ``save_dir`` must already exist.

    Returns:
        ``(best_model_path, train_acc, train_prec, train_rec, train_f1,
        val_acc, val_prec, val_rec, val_f1)``. The eight metrics are rounded to four
        decimals and belong to the best epoch (not the last one). All nine values
        are None when no epoch improved on the initial best, e.g. ``num_epochs == 0``.
    """
    best_f1 = -float('inf')
    best_model_path = None
    # Pre-filled so the return statement is valid even if the loop never improves.
    best_metrics = (None,) * 8

    for epoch in range(num_epochs):
        ######################
        # 1) Training Phase
        ######################
        # The helper feeds logits to the loss unsqueezed, so a final batch holding a
        # single sample keeps its (1, num_labels) shape and still matches the labels.
        avg_train_loss, all_train_preds, all_train_labels = train_one_epoch(
            model, train_loader, optimizer, criterion
        )
        train_accuracy, train_precision, train_recall, train_f1 = calculate_metrics(
            all_train_preds, all_train_labels
        )

        ######################
        # 2) Validation Phase
        ######################
        avg_val_loss, all_val_preds, all_val_labels = evaluate(model, val_loader, criterion)
        val_accuracy, val_precision, val_recall, val_f1 = calculate_metrics(
            all_val_preds, all_val_labels
        )

        print(f"Epoch {epoch+1}/{num_epochs}: "
              f"Train Loss: {avg_train_loss:.4f}, "
              f"Train Acc: {train_accuracy:.4f}, Prec: {train_precision:.4f}, Rec: {train_recall:.4f}, F1: {train_f1:.4f} | "
              f"Val Loss: {avg_val_loss:.4f}, Val Acc: {val_accuracy:.4f}, "
              f"Prec: {val_precision:.4f}, Rec: {val_recall:.4f}, F1: {val_f1:.4f}")

        # Save the model if it has the best F1 score on validation
        if val_f1 > best_f1:
            best_f1 = val_f1
            best_model_path = f"{save_dir}/best_model_epoch_{epoch + 1}_f1_{val_f1:.4f}.pth"
            torch.save(model.state_dict(), best_model_path)
            print(f"Best model saved with F1: {val_f1:.4f} at epoch {epoch + 1}")

            # Snapshot of this epoch's metrics for the Excel log.
            best_metrics = tuple(
                round(value, 4)
                for value in (
                    train_accuracy, train_precision, train_recall, train_f1,
                    val_accuracy, val_precision, val_recall, val_f1,
                )
            )

    # Best checkpoint path plus the metrics recorded at that best epoch
    return (best_model_path, *best_metrics)

In [25]:
#  RUN and SAVE
# ------------------
save_dir = "./models"
# The training function writes checkpoints into save_dir, so create it up front.
os.makedirs(save_dir, exist_ok=True)

# a..h receive the rounded train/val metrics that accompany the best checkpoint.
best_model_path, a, b, c, d, e, f_, g, h = train_and_save_best_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=num_epochs,
    save_dir=save_dir
)

print(f"Best model saved at: {best_model_path}")

Epoch 1/50: Train Loss: 0.4686, Train Acc: 0.3118, Prec: 0.6541, Rec: 0.4812, F1: 0.5405 | Val Loss: 0.4657, Val Acc: 0.3103, Prec: 0.7151, Rec: 0.5023, F1: 0.5601
Best model saved with F1: 0.5601 at epoch 1
Epoch 2/50: Train Loss: 0.4373, Train Acc: 0.3436, Prec: 0.6890, Rec: 0.5352, F1: 0.5935 | Val Loss: 0.4276, Val Acc: 0.3017, Prec: 0.7698, Rec: 0.4586, F1: 0.5575
Epoch 3/50: Train Loss: 0.4144, Train Acc: 0.3587, Prec: 0.6930, Rec: 0.5516, F1: 0.6079 | Val Loss: 0.4322, Val Acc: 0.3621, Prec: 0.7213, Rec: 0.5631, F1: 0.6129
Best model saved with F1: 0.6129 at epoch 3
Epoch 4/50: Train Loss: 0.3968, Train Acc: 0.3855, Prec: 0.7065, Rec: 0.5802, F1: 0.6303 | Val Loss: 0.4459, Val Acc: 0.3621, Prec: 0.7315, Rec: 0.5285, F1: 0.5984
Epoch 5/50: Train Loss: 0.3808, Train Acc: 0.4090, Prec: 0.7179, Rec: 0.6038, F1: 0.6507 | Val Loss: 0.4539, Val Acc: 0.3362, Prec: 0.7476, Rec: 0.5414, F1: 0.5922
Epoch 6/50: Train Loss: 0.3636, Train Acc: 0.4277, Prec: 0.7287, Rec: 0.6237, F1: 0.6658 | V

# EXCEL LOGGING (SAME IDEA AS BEFORE)
#####################################
# Appends one row with this run's hyperparameters and the metrics a..h returned by
# train_and_save_best_model to an existing workbook, then prints the sheet.

# NOTE(review): `os` and `pandas` are already imported at the top of the notebook.
import os
from openpyxl import Workbook, load_workbook

# Path to your Excel file
# NOTE(review): this is a Google Drive style path while the data cells read from
# /kaggle/input — confirm it exists in the runtime that executes this cell.
excel_file = "/content/drive/MyDrive/ML winter Bootcamp/SemiEval11/Book1.xlsx"

# Load the workbook to append data
wb = load_workbook(excel_file)
ws = wb.active

# Identify the next empty row: scan column D (index 3 of each row tuple) from the top;
# the first row whose D cell is falsy becomes the target. If every scanned row has a
# value there, next_row ends up one past the last scanned row.
next_row = 1
for row in ws.iter_rows(min_row=1, max_col=4):
    if not row[3].value:
        next_row = row[0].row
        break
    next_row += 1

# Write data starting from column C
ws.cell(row=next_row, column=3,  value=seed)
ws.cell(row=next_row, column=4,  value=max_length)
ws.cell(row=next_row, column=5,  value=batch_size)
ws.cell(row=next_row, column=6,  value=num_epochs)
ws.cell(row=next_row, column=7,  value=str(hidden_dim))
ws.cell(row=next_row, column=8,  value=learning_rate)
ws.cell(row=next_row, column=9,  value=dropout_p)
ws.cell(row=next_row, column=10, value=a)   # Train Acc
ws.cell(row=next_row, column=11, value=b)   # Train Prec
ws.cell(row=next_row, column=12, value=c)   # Train Rec
ws.cell(row=next_row, column=13, value=d)   # Train F1
ws.cell(row=next_row, column=14, value=e)   # Val Acc
ws.cell(row=next_row, column=15, value=f_)  # Val Prec
ws.cell(row=next_row, column=16, value=g)   # Val Rec
ws.cell(row=next_row, column=17, value=h)   # Val F1

wb.save(excel_file)

# Read the saved workbook back to confirm the row was written.
import pandas as pd
df = pd.read_excel(excel_file)
print(df)


In [26]:
def predict_and_generate_submission(
    test_loader,
    best_model_path,
    submission_file_path,
    text_df,  # The original test_df so we can reference "id" / "text"
    input_dim,
    hidden_dim,
    output_dim,
    dropout_p
):
    """
    Multi-label inference on the test set, written out as a submission CSV.

    Rebuilds an MLPModel, loads the checkpoint at ``best_model_path``, runs every
    batch of ``test_loader`` through it (sigmoid, then a 0.5 threshold) and pairs
    the 0/1 predictions with the "id" and "text" columns of ``text_df``.

    Args:
        test_loader: iterable of 1-tuples ``(inputs,)`` of embeddings, in the same
            row order as ``text_df``.
        best_model_path: path to a state_dict saved from an MLPModel.
        submission_file_path: where the CSV is written.
        text_df: DataFrame providing the "id" and "text" columns.
        input_dim, hidden_dim, output_dim, dropout_p: MLPModel constructor
            arguments; they must match the checkpoint. The first five outputs are
            written as anger, fear, joy, sadness, surprise.

    Returns:
        The submission DataFrame that was written to disk.

    Raises:
        ValueError: if the number of predictions differs from ``len(text_df)``.
    """
    # 1. Load best model
    model = MLPModel(input_dim, hidden_dim, output_dim, dropout_p).to(device)
    # map_location lets a checkpoint saved on GPU be restored on a CPU-only machine.
    state_dict = torch.load(best_model_path, map_location=device, weights_only=True)
    model.load_state_dict(state_dict)
    model.eval()

    # 2. Iterate over test_loader, collecting one (batch_size, output_dim) 0/1 tensor per batch
    batch_preds = []
    with torch.no_grad():
        for (inputs,) in test_loader:
            inputs = inputs.to(device)
            probs = torch.sigmoid(model(inputs))
            batch_preds.append((probs >= 0.5).long().cpu())

    if batch_preds:
        all_test_preds = torch.cat(batch_preds, dim=0).numpy()
    else:
        all_test_preds = torch.zeros((0, output_dim), dtype=torch.long).numpy()

    # Predictions are matched to text_df purely by position, so the counts must agree.
    if len(all_test_preds) != len(text_df):
        raise ValueError(
            f"Got {len(all_test_preds)} predictions for {len(text_df)} rows of text_df"
        )

    # 3. Build the submission DataFrame
    submission_df = pd.DataFrame({
        "id":   text_df["id"].values,
        "text": text_df["text"].values,
        "anger":    all_test_preds[:, 0],
        "fear":     all_test_preds[:, 1],
        "joy":      all_test_preds[:, 2],
        "sadness":  all_test_preds[:, 3],
        "surprise": all_test_preds[:, 4],
    })

    # 4. Save CSV
    submission_df.to_csv(submission_file_path, index=False, encoding="utf-8")
    print(f"Submission saved to {submission_file_path}")

    return submission_df

# Example usage:
# The model hyperparameters passed here must be the ones the checkpoint was trained with.
submission_file_path = "submission.csv"
submission_df = predict_and_generate_submission(
    test_loader=test_loader,
    best_model_path=best_model_path,
    submission_file_path=submission_file_path,
    text_df=test_df,   # the original DataFrame for the test set
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    dropout_p=dropout_p
)

submission_df.head()


Submission saved to submission.csv


Unnamed: 0,id,text,anger,fear,joy,sadness,surprise
0,eng_test_track_a_00001,/ o \ So today I went in for a new exam with D...,0,0,0,0,0
1,eng_test_track_a_00002,The image I have in my mind is this: a group o...,0,1,0,1,0
2,eng_test_track_a_00003,"I slammed my fist against the door and yelled,...",0,0,1,0,0
3,eng_test_track_a_00004,I could not unbend my knees.,0,0,0,0,0
4,eng_test_track_a_00005,"I spent the night at the hotel, mostly hanging...",0,0,1,0,0
