In [1]:
# Install torch, torchvision and torchaudio from the PyTorch wheel index for CUDA 12.6 (cu126)
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126

Looking in indexes: https://download.pytorch.org/whl/cu126
INFO: pip is looking at multiple versions of torchvision to determine which version is compatible with other requirements. This could take a while.
Collecting torchvision
  Using cached https://download.pytorch.org/whl/cu126/torchvision-0.21.0%2Bcu126-cp312-cp312-win_amd64.whl.metadata (6.3 kB)
Collecting torch
  Using cached https://download.pytorch.org/whl/cu126/torch-2.6.0%2Bcu126-cp312-cp312-win_amd64.whl.metadata (28 kB)
INFO: pip is looking at multiple versions of torchaudio to determine which version is compatible with other requirements. This could take a while.
Collecting torchaudio
  Using cached https://download.pytorch.org/whl/cu126/torchaudio-2.6.0%2Bcu126-cp312-cp312-win_amd64.whl.metadata (6.8 kB)
Using cached https://download.pytorch.org/whl/cu126/torchvision-0.21.0%2Bcu126-cp312-cp312-win_amd64.whl (6.1 MB)
Using cached https://download.pytorch.org/whl/cu126/torch-2.6.0%2Bcu126-cp312-cp312-win_amd64.whl (2496.1



In [2]:
%pip install seaborn matplotlib huggingface_hub transformers peft tqdm scikit-learn

Note: you may need to restart the kernel to use updated packages.




In [3]:
import os
import torch
import pickle
import logging
from transformers import AutoTokenizer, AutoModel, BitsAndBytesConfig
from peft import PeftModel
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, matthews_corrcoef, classification_report, precision_recall_curve

from huggingface_hub import hf_hub_download

import torch
import torch.nn as nn


# Set up logging
# basicConfig configures the root logger, so INFO records from imported
# libraries are shown alongside this notebook's own messages.
logging.basicConfig(level=logging.INFO)
# Logger used by the helper functions defined in the cells below
logger = logging.getLogger(__name__)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
class DualEmbeddingDataset(Dataset):
    """
    Map-style dataset over pre-computed dual embedding features, kept on CPU.

    Each item is a dictionary holding one row of every feature tensor, plus the
    matching label when labels were supplied.
    """
    def __init__(self, features):
        """
        Initialize the dataset with preprocessed features.

        Args:
            features: Mapping of feature tensors with the keys "input_ids",
                      "attention_mask" and "sbert_embeddings", and optionally
                      "labels".

        Raises:
            KeyError: If one of the required keys is missing.
            AssertionError: If the required tensors differ in their first dimension.
            ValueError: If labels are supplied with a different number of rows
                        than the feature tensors.
        """
        # Tensor.cpu() hands back the same tensor when it already lives on CPU,
        # so it can be called unconditionally.
        self.input_ids = features["input_ids"].cpu()
        self.attention_mask = features["attention_mask"].cpu()
        self.sbert_embeddings = features["sbert_embeddings"].cpu()
        self.labels = features["labels"].cpu() if "labels" in features else None

        # Validate tensor shapes (kept as an assert so the exception type is unchanged)
        assert len(self.input_ids) == len(self.attention_mask) == len(self.sbert_embeddings), \
            "All feature tensors must have the same first dimension"

        # Labels must line up with the features; otherwise __len__ and
        # __getitem__ would disagree about how many examples exist.
        if self.labels is not None and len(self.labels) != len(self.input_ids):
            raise ValueError(
                f"labels has {len(self.labels)} rows but the feature tensors have "
                f"{len(self.input_ids)}"
            )

    def __len__(self):
        """Return the number of examples (rows of the feature tensors)."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """
        Get a single example from the dataset.

        Args:
            idx: Index to retrieve

        Returns:
            Dictionary of tensors for the given index; includes "labels" only
            when the dataset was built with labels
        """
        item = {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "sbert_embeddings": self.sbert_embeddings[idx]
        }
        if self.labels is not None:
            item["labels"] = self.labels[idx]
        return item
    

In [5]:
def prepare_dual_embedding_features(df, modernbert_tokenizer, sbert_model, max_length=512, sbert_batch_size=64):
    """
    Prepare features for the dual embedding model, ensuring all tensors remain on CPU.

    Claim/evidence pairs are tokenized together for ModernBERT. Each pair also
    gets one SBERT vector: the element-wise mean of the claim embedding and the
    evidence embedding.

    Args:
        df: DataFrame with 'Claim' and 'Evidence' columns. No label column is read.
        modernbert_tokenizer: ModernBERT tokenizer, called on the claim and
            evidence lists with padding, "only_second" truncation and
            return_tensors="pt"
        sbert_model: Sentence-BERT model. It is moved to the computation device
            for encoding and moved back to its original device afterwards,
            even if encoding fails.
        max_length: Maximum sequence length for tokenization
        sbert_batch_size: Batch size for SBERT encoding

    Returns:
        Dictionary of feature tensors with input_ids, attention_mask and
        sbert_embeddings. No "labels" entry is produced.
    """
    # Keep track of original SBERT device
    original_device = next(sbert_model.parameters()).device
    logger.info(f"Original SBERT device: {original_device}")

    # Prepare inputs
    texts_claim = df["Claim"].tolist()
    texts_evidence = df["Evidence"].tolist()

    # ModernBERT tokenization - keep on CPU
    logger.info("Tokenizing inputs for ModernBERT...")
    modernbert_features = modernbert_tokenizer(
        texts_claim,
        texts_evidence,
        padding=True,
        truncation="only_second",
        max_length=max_length,
        return_tensors="pt"
    )

    # Compute SBERT embeddings on GPU when available, then move back to CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info(f"Computing SBERT embeddings on: {device}")

    # Temporarily move SBERT to computation device
    sbert_model = sbert_model.to(device)
    try:
        logger.info("Computing SBERT embeddings for claims (batched)...")
        # Move each result to CPU immediately so only one set sits on the GPU at a time
        claim_embeddings = sbert_model.encode(
            texts_claim,
            convert_to_tensor=True,
            batch_size=sbert_batch_size,
            show_progress_bar=True,
            device=device
        ).cpu()

        logger.info("Computing SBERT embeddings for evidence (batched)...")
        evidence_embeddings = sbert_model.encode(
            texts_evidence,
            convert_to_tensor=True,
            batch_size=sbert_batch_size,
            show_progress_bar=True,
            device=device
        ).cpu()
    finally:
        # Return SBERT to its original device even when encoding raised, so the
        # caller's model is never left on the temporary device.
        sbert_model = sbert_model.to(original_device)

    # Combine claim and evidence embeddings on CPU: one vectorized element-wise
    # mean over the whole batch instead of a per-row Python loop.
    logger.info("Combining embeddings...")
    sbert_embeddings = (claim_embeddings + evidence_embeddings) / 2

    # Final verification that all tensors are on CPU
    logger.info("Verifying all tensors are on CPU...")
    for key, tensor in list(modernbert_features.items()):
        if tensor.is_cuda:
            logger.warning(f"{key} is on CUDA, moving to CPU")
            modernbert_features[key] = tensor.cpu()

    if sbert_embeddings.is_cuda:
        logger.warning("sbert_embeddings is on CUDA, moving to CPU")
        sbert_embeddings = sbert_embeddings.cpu()

    return {
        "input_ids": modernbert_features["input_ids"],
        "attention_mask": modernbert_features["attention_mask"],
        "sbert_embeddings": sbert_embeddings,
    }

In [6]:
# Define the DualEmbeddingModel class again for loading
class DualEmbeddingModel(nn.Module):
    """
    Binary classifier over a ModernBERT first-token embedding concatenated with
    a pre-computed SBERT embedding.

    The encoder passed in is stored as `modernbert`; the classification head is
    stored as `classifier` and produces one logit per example.
    """

    def __init__(self, modernbert_model, sbert_dim=384, hidden_size=768, dropout_rate=0.1):
        """
        Args:
            modernbert_model: Encoder module exposing `config.hidden_size` and
                returning an object with `last_hidden_state` when called
            sbert_dim: Width of the SBERT embedding fed to the head
            hidden_size: Width of the head's hidden layer
            dropout_rate: Dropout probability inside the head
        """
        super().__init__()
        self.modernbert = modernbert_model

        # Widths of the two embeddings that get concatenated
        self.modernbert_dim = modernbert_model.config.hidden_size
        self.sbert_dim = sbert_dim

        # Head: linear -> layer norm -> ReLU -> dropout -> single-logit linear
        fused_dim = self.modernbert_dim + self.sbert_dim
        head_layers = [
            nn.Linear(fused_dim, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size, 1),
        ]
        self.classifier = nn.Sequential(*head_layers)

    @property
    def device(self):
        """Device of the first parameter yielded by `self.parameters()`."""
        first_param = next(self.parameters())
        return first_param.device

    def forward(self, input_ids, attention_mask, sbert_embeddings, labels=None):
        """
        Compute one logit per example.

        Args:
            input_ids: Token ids passed to the encoder
            attention_mask: Attention mask passed to the encoder
            sbert_embeddings: Pre-computed SBERT embeddings, one row per example
            labels: Accepted but not used by this method

        Returns:
            Tensor of logits with the trailing size-1 dimension squeezed away
        """
        # Bring every input onto the device reported by `self.device`
        target = self.device
        input_ids = input_ids.to(target)
        attention_mask = attention_mask.to(target)
        sbert_embeddings = sbert_embeddings.to(target)

        # Encoder output at sequence position 0 (the [CLS] token)
        encoder_out = self.modernbert(input_ids=input_ids, attention_mask=attention_mask)
        cls_vector = encoder_out.last_hidden_state[:, 0, :]

        # Fuse both embeddings along the feature axis and classify
        fused = torch.cat([cls_vector, sbert_embeddings], dim=1)
        return self.classifier(fused).squeeze(-1)

In [7]:
def load_dual_embedding_model_from_hub(repo_id, device=None):
    """
    Load a DualEmbeddingModel from Hugging Face Hub.

    Args:
        repo_id: Hugging Face repository ID (e.g., "username/model-name"). The
            tokenizer, the PEFT adapters, "classifier_weights.pt" and
            "optimal_threshold.txt" are all fetched from this repository.
        device: Device to load the model to. Defaults to "cuda" when available,
            otherwise "cpu".

    Returns:
        Tuple of (model, tokenizer, sbert_model, threshold): the assembled
        DualEmbeddingModel in eval mode, the tokenizer, the SentenceTransformer
        model, and the decision threshold as a float.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load tokenizer from Hub
    tokenizer = AutoTokenizer.from_pretrained(repo_id)

    # Load SBERT model
    sbert_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

    # Set up quantization config
    # NOTE(review): 4-bit loading goes through bitsandbytes; confirm it works on
    # the target device when falling back to CPU.
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_storage=torch.bfloat16
    )

    # Load base ModernBERT model
    base_model = AutoModel.from_pretrained(
        "answerdotai/ModernBERT-base",
        quantization_config=quant_config,
        device_map=device
    )

    # Load the PEFT adapters
    peft_model = PeftModel.from_pretrained(base_model, repo_id, inference_mode=True)

    # Create DualEmbeddingModel
    model = DualEmbeddingModel(peft_model)

    # Download classifier weights file. weights_only=True restricts unpickling to
    # tensors and plain containers, which is all a state dict needs, so a
    # downloaded file cannot run arbitrary code on load.
    classifier_path = hf_hub_download(repo_id=repo_id, filename="classifier_weights.pt")
    classifier_weights = torch.load(classifier_path, map_location=device, weights_only=True)
    model.classifier.load_state_dict(classifier_weights)

    # The head is created on CPU while the encoder was placed on `device`; move
    # the head too so the returned model runs without a further .to() call.
    model.classifier.to(device)

    # Load optimal threshold
    threshold_path = hf_hub_download(repo_id=repo_id, filename="optimal_threshold.txt")
    with open(threshold_path, "r") as f:
        threshold = float(f.read().strip())

    model.eval()
    return model, tokenizer, sbert_model, threshold



In [8]:
def predict_model(model_dir, test_df, batch_size=64, device=None):
    """
    Use the saved DualEmbeddingModel to make predictions on a batch of claim-evidence pairs.

    Args:
        model_dir (str): Hugging Face Hub repository ID of the saved model; it is
            passed to load_dual_embedding_model_from_hub as the repo id.
        test_df (pd.DataFrame): Test dataframe with 'Claim' and 'Evidence' columns.
        batch_size (int): Batch size for prediction.
        device (str, optional): Device to run inference on. Defaults to "cuda"
            when available, otherwise "cpu".

    Returns:
        pd.DataFrame: Copy of the input dataframe with 'prediction' (0/1, using
        the threshold loaded with the model) and 'probability' (sigmoid of the
        logit) columns added. An empty input yields an empty result without
        loading the model.
    """
    result_df = test_df.copy()

    # Nothing to score: skip the model download and return empty result columns.
    if len(test_df) == 0:
        result_df["prediction"] = np.array([], dtype=int)
        result_df["probability"] = np.array([], dtype=float)
        return result_df

    # Resolve the default here as well, so the model and the batches below are
    # sent to a concrete device rather than to None.
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load the model
    model, tokenizer, sbert_model, threshold = load_dual_embedding_model_from_hub(model_dir, device=device)

    # Explicitly move the entire model (encoder and classifier head) to the device
    model = model.to(device)

    # Make sure model is in evaluation mode
    model.eval()

    # Prepare the test dataset
    test_features = prepare_dual_embedding_features(test_df, tokenizer, sbert_model)
    test_dataset = DualEmbeddingDataset(test_features)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Perform predictions
    all_logits = []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Inference", disable=False):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            sbert_embeddings = batch["sbert_embeddings"].to(device)

            logits = model(input_ids, attention_mask, sbert_embeddings)
            # Cast to float32 so the later .numpy() works whatever dtype the model emitted
            all_logits.append(logits.float().cpu())

    # Concatenate all batches and convert logits to probabilities with sigmoid
    probabilities = torch.sigmoid(torch.cat(all_logits, dim=0)).numpy()

    # Make predictions using the optimal threshold
    predictions = (probabilities > threshold).astype(int)

    # Add predictions and probabilities to the dataframe
    result_df["prediction"] = predictions
    result_df["probability"] = probabilities

    return result_df

In [None]:
# Folder that holds the evaluation split (relative path)
DATASET_DIR = "test_data/ED"

# Read the claim/evidence pairs to classify
test_csv_path = os.path.join(DATASET_DIR, "test.csv")
test_df = pd.read_csv(test_csv_path)

In [10]:
# Identifier handed to predict_model as the location of the saved model
model_dir = "ddosdub/DualEncoderModernBERT"

# Run on the GPU when one is available, otherwise on the CPU
inference_device = "cuda" if torch.cuda.is_available() else "cpu"
predict_df = predict_model(model_dir, test_df, batch_size=64, device=inference_device)

# Preview the first rows with the added prediction columns
predict_df.head(10)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cuda:0
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
INFO:__main__:Original SBERT device: cuda:0
INFO:__main__:Tokenizing inputs for ModernBERT...
INFO:__main__:Computing SBERT embeddings on: cuda
INFO:__main__:Computing SBERT embeddings for claims (batched)...
Batches: 100%|██████████| 74/74 [00:01<00:00, 67.90it/s] 
INFO:__main__:Computing SBERT embeddings for evidence (batched)...
Batches: 100%|██████████| 74/74 [00:02<00:00, 27.06it/s]
INFO:__main__:Combining embeddings...
Combining embeddings: 100%|██████████| 4688/4688 [00:00<00:00, 75153.35it/s]
INFO:__main__:Verifying all tensors are on CPU...
Inference: 100%|██████████| 74/74 [01:40<00:00,  1.36s/it]


Unnamed: 0,Claim,Evidence,prediction,probability
0,We should further exploit geothermal energy,Taxpayer funding of research and development o...,1,0.876566
1,We should prohibit corporal punishment,"Regarding discipline, Sukhmani writes that cor...",0,0.221318
2,We should ban male infant circumcision,"Benatar and Benatar (2003) argue that ""it is f...",0,0.106695
3,We should ban trans fats usage in food,"Each KIND bar is gluten free, dairy free, non ...",0,0.191823
4,We should ban boxing,About Feng Keshan and Meihuaquan: In the 1800s...,0,0.001519
5,We should adopt libertarianism,The North American Confederacy is much more ad...,0,0.510214
6,We should legalize organ trade,"In November and December 2010, Israelis and a ...",0,0.005191
7,We should introduce universal health care,"In the UK, the National Health Service (NHS) p...",0,0.120508
8,We should ban private education,"In 2008, due to the devaluing of the A-Levels ...",0,0.016501
9,We should introduce universal health care,Uruguay is the only country in Latin America t...,0,0.020215


In [12]:
# Count how many rows were assigned to each predicted class
predict_df["prediction"].value_counts()

prediction
0    2895
1    1793
Name: count, dtype: int64

In [11]:
# Keep only the predicted labels for the submission file
predictions = predict_df["prediction"]

# Write them to CSV without the row index
predictions.to_csv("Group_7_C.csv", index=False)

# def create_file_for_submission(predictions: pd.DataFrame, ext: str = "predict"):
#     predictions.to_csv(f"predictions.csv.{ext}", index=False)

#     # zip the file
#     import zipfile
#     with zipfile.ZipFile("predictions.zip", "w") as zf:
#         zf.write(f"predictions.csv.{ext}")

# create_file_for_submission(pd.read_csv("predictions.csv"))