In [None]:
!pip install transformers datasets scikit-learn numpy pandas tqdm matplotlib

In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, XLMRobertaForSequenceClassification, AdamW, get_scheduler
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import random
from sklearn.model_selection import train_test_split

In [None]:
# File path to the ESCI Dataset
data_path = "path/to/esci_dataset.csv"  # Replace with your actual dataset path

# Single seed shared by both splits so the partition is reproducible.
split_seed = 42

# Load dataset
df = pd.read_csv(data_path)

# Example columns in ESCI dataset: query, product_title, label (E=3, S=2, C=1, I=0)
print(df.head())

# Split into train (70%), validation (15%), and test (15%) sets.
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=split_seed)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=split_seed)

# Later cells add columns to these frames. Resetting the index yields
# independent frames with a clean 0..n-1 index, so those assignments do not
# write into (or warn about) a selection taken from `df`.
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

print(f"Training samples: {len(train_df)}, Validation samples: {len(val_df)}, Test samples: {len(test_df)}")

In [None]:
def smooth_labels(labels, epsilon=0.1, num_classes=4):
    """
    Soften integer relevance labels for use as regression targets.

    Each label ``y`` is mapped to ``(1 - epsilon) * y + epsilon / num_classes``.
    The result is a scalar per label (same shape as the input), NOT a vector
    of class probabilities.

    Args:
        labels: Integer labels (E=3, S=2, C=1, I=0). May be a scalar, a list or
            tuple, a NumPy array, or a pandas Series. Series and arrays keep
            their type; lists and tuples are converted to a float NumPy array.
        epsilon: Smoothing factor in [0, 1] (default: 0.1).
        num_classes: Number of distinct label values (default: 4).
    Returns:
        smoothed_labels: Smoothed label values, element-wise.
    Raises:
        ValueError: If ``epsilon`` is outside [0, 1] or ``num_classes`` < 1.
    """
    if not 0.0 <= epsilon <= 1.0:
        raise ValueError(f"epsilon must be in [0, 1], got {epsilon}")
    if num_classes < 1:
        raise ValueError(f"num_classes must be >= 1, got {num_classes}")

    # Plain Python sequences do not support float multiplication; promote them.
    if isinstance(labels, (list, tuple)):
        labels = np.asarray(labels, dtype=float)

    smoothed_labels = (1 - epsilon) * labels + epsilon / num_classes
    return smoothed_labels

# Add a "smoothed_label" column to each split, derived from its "label" column.
for split_frame in (train_df, val_df, test_df):
    split_frame["smoothed_label"] = smooth_labels(split_frame["label"])

In [None]:
class AugmentedESCDataset(Dataset):
    """
    Query/product pair dataset that yields both a hard label and a pseudo label.

    Each item is the tokenized (query, product) pair, padded or truncated to
    ``max_len`` tokens, together with the two targets as float tensors.

    Args:
        queries: Sequence of query strings.
        products: Sequence of product strings (same length as ``queries``).
        labels: Sequence of numeric labels.
        pseudo_labels: Sequence of numeric soft/pseudo targets.
        tokenizer: Callable tokenizer accepting ``(text, text_pair, **kwargs)``
            and returning a mapping with "input_ids" and "attention_mask".
        max_len: Fixed token length of every encoded pair (default: 128).
    Raises:
        ValueError: If the four input sequences differ in length.
    """

    def __init__(self, queries, products, labels, pseudo_labels, tokenizer, max_len=128):
        # Convert to arrays so that __getitem__ is always positional. A pandas
        # Series with a shuffled index (e.g. after a train/test split) would
        # otherwise be looked up by index label, not by position.
        self.queries = np.asarray(queries)
        self.products = np.asarray(products)
        self.labels = np.asarray(labels)
        self.pseudo_labels = np.asarray(pseudo_labels)  # Include pseudo labels derived from the model's outputs
        self.tokenizer = tokenizer
        self.max_len = max_len

        lengths = {
            len(self.queries),
            len(self.products),
            len(self.labels),
            len(self.pseudo_labels),
        }
        if len(lengths) != 1:
            raise ValueError(
                "queries, products, labels and pseudo_labels must have the same length"
            )

    def __len__(self):
        """Return the number of (query, product) pairs."""
        return len(self.queries)

    def __getitem__(self, item):
        """Return the encoded pair and targets at position ``item`` as a dict of tensors."""
        query = str(self.queries[item])
        product = str(self.products[item])
        label = self.labels[item]
        pseudo_label = self.pseudo_labels[item]

        # Calling the tokenizer directly replaces the deprecated encode_plus().
        encoding = self.tokenizer(
            query,
            product,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt"
        )
        return {
            # return_tensors="pt" adds a leading batch dimension; drop it.
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "label": torch.tensor(label, dtype=torch.float),
            "pseudo_label": torch.tensor(pseudo_label, dtype=torch.float)
        }

In [None]:
def apply_adversarial_weight_perturbation(model, embeddings, epsilon=0.01, perturb_steps=1):
    """
    Return a copy of the embedding weights with random noise added.

    At every step a Gaussian noise tensor is drawn, each row of the noise is
    L2-normalised and scaled to length ``epsilon``, and the result is added to
    the running copy. Only the *noise* is normalised: the original embedding
    magnitudes are preserved. After ``perturb_steps`` steps each row differs
    from the input by at most ``perturb_steps * epsilon`` in L2 norm.

    Args:
        model: Pretrained transformer model (XLM-RoBERTa). Not used by this
            function; kept for interface compatibility.
        embeddings: Embedding weights (the input tensor is not modified).
        epsilon: Per-row L2 magnitude of each perturbation step (default: 0.01).
        perturb_steps: Number of steps for perturbation (default: 1).
    Returns:
        perturbed_embeddings: Perturbed embedding weights, same shape as
            ``embeddings``.
    """
    perturbed_embeddings = embeddings.clone()
    for _ in range(perturb_steps):
        noise = torch.randn_like(embeddings)
        # Normalise the perturbation itself (not the embeddings) so that
        # `epsilon` really is the size of the change applied to each row.
        perturbation = epsilon * nn.functional.normalize(noise, dim=-1)
        perturbed_embeddings = perturbed_embeddings + perturbation

    return perturbed_embeddings

In [None]:
# Training loop with self-distillation and adversarial weight perturbation.
# NOTE: this cell reads `model`, `train_loader`, `optimizer`, `lr_scheduler`,
# `loss_fn` and `device`; they must already exist in the kernel when it runs.
epochs = 3
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    epoch_loss = 0
    model.train()

    for batch in tqdm(train_loader):
        optimizer.zero_grad()

        # Load data to device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].unsqueeze(1).to(device)
        pseudo_labels = batch["pseudo_label"].unsqueeze(1).to(device)

        # Forward/backward pass with the regular (clean) embeddings.
        # The three loss terms are averaged, hence the division by 3 here and below.
        outputs_regular = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions_regular = outputs_regular.logits
        loss_regular = loss_fn(predictions_regular, labels)
        loss_pseudo = loss_fn(predictions_regular, pseudo_labels)
        clean_term = (loss_regular + loss_pseudo) / 3
        clean_term.backward()

        # Perturb the embedding weights IN PLACE. Assigning a fresh nn.Parameter
        # would detach the layer from the optimizer (which keeps updating the old
        # tensor) and would make the noise permanent and cumulative.
        embedding_weight = model.base_model.embeddings.word_embeddings.weight
        with torch.no_grad():
            clean_embeddings = embedding_weight.detach().clone()
            embedding_weight.copy_(
                apply_adversarial_weight_perturbation(model, embedding_weight)
            )

        try:
            # Forward/backward pass with adversarial embeddings; gradients
            # accumulate on top of those from the clean pass.
            outputs_perturbed = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions_perturbed = outputs_perturbed.logits
            loss_perturbed = loss_fn(predictions_perturbed, labels)
            perturbed_term = loss_perturbed / 3
            perturbed_term.backward()
        finally:
            # Always restore the clean weights before the optimizer step.
            with torch.no_grad():
                embedding_weight.copy_(clean_embeddings)

        # Total loss: Regular loss + Pseudo-label loss + Perturbed loss, averaged
        loss = clean_term.detach() + perturbed_term.detach()

        # Update model weights
        optimizer.step()
        lr_scheduler.step()

        epoch_loss += loss.item()

    print(f"Epoch Loss: {epoch_loss / len(train_loader)}")

In [None]:
# Load the XLM-RoBERTa tokenizer
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Create DataLoader objects for training and validation.
# `ESCDataset` is not defined anywhere in this notebook; AugmentedESCDataset is
# the dataset class that exists, and it also provides the "pseudo_label" entry
# that the self-distillation training loop reads from each batch.
# NOTE(review): the "smoothed_label" column is used as the pseudo-label target
# because no teacher-model outputs are computed in this notebook — confirm that
# this is the intended soft target.
train_dataset = AugmentedESCDataset(
    train_df["query"].values,
    train_df["product_title"].values,
    train_df["label"].values,
    train_df["smoothed_label"].values,
    tokenizer,
)
val_dataset = AugmentedESCDataset(
    val_df["query"].values,
    val_df["product_title"].values,
    val_df["label"].values,
    val_df["smoothed_label"].values,
    tokenizer,
)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

In [None]:
# Pick the compute device once; every later cell moves tensors to `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load XLM-RoBERTa for sequence classification
model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=1)  # Regression task
model = model.to(device)

# Define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=1e-5)

# The linear schedule decays to zero after `num_training_steps` scheduler steps,
# so it is sized from the same epoch count the training loop uses.
epochs = 3
num_training_steps = len(train_loader) * epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Define loss function
loss_fn = torch.nn.MSELoss()  # For regression tasks

# Plain fine-tuning loop: MSE regression of the single logit onto the label.
epochs = 3
model.train()
batches_per_epoch = len(train_loader)
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    epoch_loss = 0
    for batch in tqdm(train_loader):
        optimizer.zero_grad()

        # Transfer the batch to the compute device; targets become (batch, 1)
        # to line up with the single-logit output.
        token_ids = batch["input_ids"].to(device)
        token_mask = batch["attention_mask"].to(device)
        targets = batch["label"].unsqueeze(1).to(device)

        # Forward pass and loss
        predictions = model(input_ids=token_ids, attention_mask=token_mask).logits
        loss = loss_fn(predictions, targets)

        # Backward pass, parameter update, then learning-rate update
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        epoch_loss += loss.item()

    print(f"Epoch Loss: {epoch_loss / batches_per_epoch}")

In [None]:
def evaluate_model(model, data_loader, device):
    """
    Run the model over a data loader and collect its raw predictions.

    The raw logits are returned without any activation: passing them through a
    sigmoid would squash every score into (0, 1) and make them incomparable
    with labels on a wider scale (e.g. a regression head fitted with MSE).

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits``.
        data_loader: Iterable of batches with "input_ids", "attention_mask"
            and "label" tensors.
        device: Device the batch tensors are moved to.
    Returns:
        relevance_scores: List with one entry per example, each entry being the
            example's logits row as a list.
        true_labels: List with one label per example.
    """
    model.eval()
    relevance_scores = []
    true_labels = []
    with torch.no_grad():
        for batch in tqdm(data_loader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            true_labels.extend(labels.tolist())
            relevance_scores.extend(outputs.logits.tolist())

    return relevance_scores, true_labels

# Evaluate on validation and test sets
val_scores, val_labels = evaluate_model(model, val_loader, device)

# No loader is created for the test split anywhere earlier in the notebook,
# so build it here. It is not shuffled, so scores stay in test_df row order.
# The pseudo-label argument is required by the dataset class; evaluate_model
# does not read it.
test_dataset = AugmentedESCDataset(
    test_df["query"].values,
    test_df["product_title"].values,
    test_df["label"].values,
    test_df["smoothed_label"].values,
    tokenizer,
)
test_loader = DataLoader(test_dataset, batch_size=32)
test_scores, test_labels = evaluate_model(model, test_loader, device)

In [None]:
from sklearn.metrics import mean_squared_error

def compute_metrics(true_labels, relevance_scores):
    """
    Print and return the mean squared error between labels and scores.

    Args:
        true_labels: Ground-truth label values.
        relevance_scores: Predicted scores, one per label.
    Returns:
        The mean squared error as computed by ``mean_squared_error``.
    """
    mse = mean_squared_error(true_labels, relevance_scores)
    print(f"Mean Squared Error: {mse}")
    # Returned as well as printed so later cells can reuse or compare the value.
    return mse

# Report the validation-set error (labels first, model scores second).
compute_metrics(true_labels=val_labels, relevance_scores=val_scores)

In [None]:
def ndcg_at_k(ranked_relevances, k):
    """
    NDCG@k for one query.

    Uses linear gain and a log2 position discount:
    ``DCG = sum(rel_i / log2(i + 1))`` over the first ``k`` positions (1-based),
    divided by the DCG of the same relevances sorted in descending order.

    Args:
        ranked_relevances: True relevance values, already ordered by the
            ranking being evaluated (best-ranked item first).
        k: Cut-off rank.
    Returns:
        NDCG@k in [0, 1] for non-negative relevances; 0.0 when the list is
        empty or no item has positive relevance (the ideal DCG is zero).
    """
    gains = np.asarray(ranked_relevances, dtype=float)
    cutoff = max(0, min(k, gains.size))
    if cutoff == 0:
        return 0.0
    discounts = 1.0 / np.log2(np.arange(2, cutoff + 2))
    dcg = float(np.sum(gains[:cutoff] * discounts))
    ideal_gains = np.sort(gains)[::-1][:cutoff]
    idcg = float(np.sum(ideal_gains * discounts))
    return dcg / idcg if idcg > 0 else 0.0


# NDCG computation on the test set
k = 10

# Use an existing "model_score" column if there is one; otherwise attach the
# test-set predictions (one row of logits per example, in test_df row order).
if "model_score" in test_df.columns:
    scored_test_df = test_df
else:
    scored_test_df = test_df.assign(
        model_score=np.asarray(test_scores, dtype=float).reshape(-1)
    )

query_ids = scored_test_df["query_id"].unique()

# Compute NDCG@k for test queries
ndcg_results = []
for query_id, query_products in scored_test_df.groupby("query_id", sort=False):
    relevance_scores = query_products["model_score"].values
    true_relevance = query_products["label"].values
    # Rank products by model score (highest first), then evaluate the TRUE
    # labels in that order. Feeding the sorted scores themselves to NDCG would
    # always describe an ideal ranking.
    relevance_ranking = np.argsort(-relevance_scores, kind="stable")
    relevance_sorted_labels = true_relevance[relevance_ranking]

    ndcg_results.append(ndcg_at_k(relevance_sorted_labels, k))

# Guard against an empty test set (no queries to average over).
average_ndcg = sum(ndcg_results) / len(ndcg_results) if ndcg_results else float("nan")
print(f"Average NDCG@{k}: {average_ndcg}")