# Inference Notebook

This notebook runs inference with the trained dual embedding model. It loads the model and tokenizer from a Hugging Face Hub repository, predicts for each claim-evidence pair in the input CSV file whether the evidence supports the claim, and writes the predictions to a CSV file.

In [1]:
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126

Looking in indexes: https://download.pytorch.org/whl/cu126
Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install seaborn matplotlib tqdm scikit-learn unidecode

Collecting unidecode
  Downloading Unidecode-1.3.8-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.8
Note: you may need to restart the kernel to use updated packages.


In [3]:
%pip install -U huggingface_hub transformers bitsandbytes peft sentence-transformers pandas

Collecting huggingface_hub
  Downloading huggingface_hub-0.30.1-py3-none-any.whl.metadata (13 kB)
Collecting transformers
  Downloading transformers-4.50.3-py3-none-any.whl.metadata (39 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting peft
  Downloading peft-0.15.1-py3-none-any.whl.metadata (13 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-4.0.2-py3-none-any.whl.metadata (13 kB)
Downloading huggingface_hub-0.30.1-py3-none-any.whl (481 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m481.2/481.2 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading transformers-4.50.3-py3-none-any.whl (10.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m102.9 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hDownloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl (76.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
import os
import torch
import pickle
import logging
from transformers import AutoTokenizer, AutoModel, BitsAndBytesConfig
from peft import PeftModel
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, matthews_corrcoef, classification_report, precision_recall_curve
import string
import unidecode
import re

from huggingface_hub import hf_hub_download

import torch
import torch.nn as nn


# Set up logging: INFO level on the root logger so the progress messages
# written through `logger` by the functions in this notebook are displayed
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

## Dataset and Model Classes

These classes define the data handling and model architecture for our dual embedding approach.

In [5]:
class DualEmbeddingDataset(Dataset):
    """
    Map-style dataset over pre-computed dual embedding features, held on CPU.

    Each item is a dictionary with "input_ids", "attention_mask" and
    "sbert_embeddings" for one example, plus "labels" when labels were given.
    """
    def __init__(self, features):
        """
        Initialize the dataset with preprocessed features.

        Args:
            features: Dictionary of feature tensors. Must contain
                      "input_ids", "attention_mask" and "sbert_embeddings";
                      "labels" is optional (omit it for inference).

        Raises:
            KeyError: If one of the required features is missing.
            AssertionError: If the tensors disagree on the number of examples.
        """
        # Tensor.cpu() returns the tensor itself when it is already on the CPU,
        # so no is_cuda branching is needed.
        self.input_ids = features["input_ids"].cpu()
        self.attention_mask = features["attention_mask"].cpu()
        self.sbert_embeddings = features["sbert_embeddings"].cpu()

        labels = features["labels"] if "labels" in features else None
        self.labels = labels.cpu() if labels is not None else None

        # Validate tensor shapes
        assert len(self.input_ids) == len(self.attention_mask) == len(self.sbert_embeddings), \
            "All feature tensors must have the same first dimension"
        # Labels drive nothing but the per-item "labels" entry, yet a length
        # mismatch would otherwise only surface later as an IndexError (or as
        # silently dropped examples) while iterating a DataLoader.
        if self.labels is not None:
            assert len(self.labels) == len(self.input_ids), \
                "labels must have the same first dimension as the feature tensors"

    def __len__(self):
        """Return the number of examples (all tensors share this length)."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """
        Get a single example from the dataset.

        Args:
            idx: Index to retrieve

        Returns:
            Dictionary of tensors for the given index; contains "labels" only
            when the dataset was built with labels.
        """
        item = {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "sbert_embeddings": self.sbert_embeddings[idx]
        }
        if self.labels is not None:
            item["labels"] = self.labels[idx]
        return item
    

## Model Architecture

The model class is an `nn.Module` that combines embeddings from ModernBERT (for contextual understanding) and SBERT (for semantic similarity) to make predictions about evidence-claim relationships.

In [6]:
# Define the DualEmbeddingModel class again for loading
class DualEmbeddingModel(nn.Module):
    """
    Classifier head on top of concatenated ModernBERT and SBERT embeddings.

    The first-token hidden state of the wrapped encoder is concatenated with a
    pre-computed SBERT vector and passed through a small MLP that emits one
    logit per example.
    """

    def __init__(self, modernbert_model, sbert_dim=384, hidden_size=768, dropout_rate=0.1):
        """
        Args:
            modernbert_model: Encoder exposing `config.hidden_size` and returning
                an object with `last_hidden_state` when called.
            sbert_dim: Width of the SBERT embedding fed to `forward`.
            hidden_size: Width of the classifier's hidden layer.
            dropout_rate: Dropout probability inside the classifier.
        """
        super().__init__()
        self.modernbert = modernbert_model

        # Embedding widths (768 for ModernBERT-base)
        self.modernbert_dim = modernbert_model.config.hidden_size
        self.sbert_dim = sbert_dim

        # MLP head: fused embedding -> hidden layer -> single logit
        fused_dim = self.modernbert_dim + self.sbert_dim
        head_layers = [
            nn.Linear(fused_dim, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size, 1),
        ]
        self.classifier = nn.Sequential(*head_layers)

    @property
    def device(self):
        """Device of the model's first parameter; inputs are moved here in `forward`."""
        first_param = next(self.parameters())
        return first_param.device

    def forward(self, input_ids, attention_mask, sbert_embeddings, labels=None):
        """
        Compute one logit per example.

        Args:
            input_ids: Token ids for the encoder.
            attention_mask: Attention mask matching `input_ids`.
            sbert_embeddings: Pre-computed SBERT vectors, one per example.
            labels: Accepted for interface compatibility; not used.

        Returns:
            Tensor of logits with the trailing singleton dimension squeezed out.
        """
        # Bring every input onto the device the parameters live on
        target = self.device
        input_ids, attention_mask, sbert_embeddings = (
            tensor.to(target) for tensor in (input_ids, attention_mask, sbert_embeddings)
        )

        # Hidden state at position 0 is used as the sequence representation ([CLS] token)
        encoder_out = self.modernbert(input_ids=input_ids, attention_mask=attention_mask)
        cls_vector = encoder_out.last_hidden_state[:, 0, :]

        # Fuse both representations and classify
        fused = torch.cat((cls_vector, sbert_embeddings), dim=1)
        return self.classifier(fused).squeeze(-1)

## Data Preprocessing and Feature Extraction

These functions clean text inputs and prepare the features needed for model inference.

In [7]:
def clean_text(text):
    """
    Strip reference markers, normalize characters and tidy up spacing.

    Args:
        text (str): The input text to clean.

    Returns:
        str: The cleaned text.
    """
    # Character-class body matching any ASCII punctuation mark (escaped so that
    # characters such as [ ] \ ^ - are taken literally)
    escaped_punct = re.escape(string.punctuation)
    spacing_patterns = (
        # whitespace between a letter and the punctuation that follows it
        r"([a-zA-Z])\s+([{}])".format(escaped_punct),
        # whitespace between two punctuation marks
        r"([{}])\s+([{}])".format(escaped_punct, escaped_punct),
    )

    # Drop "[REF]" markers, including half-open "[REF" / "REF]" remnants
    result = re.sub(r"\[REF\]|\[REF|REF\]", "", text).strip()

    # Normalize the text with unidecode
    result = unidecode.unidecode(result)

    # Close the gaps described above, in the same order as listed
    for pattern in spacing_patterns:
        result = re.sub(pattern, r"\1\2", result)

    # Collapse any remaining whitespace runs into single spaces
    return re.sub(r"\s+", " ", result).strip()

In [8]:
def prepare_dual_embedding_features(df, modernbert_tokenizer, sbert_model, max_length=8192, sbert_batch_size=64):
    """
    Build the CPU feature tensors consumed by the dual embedding model.

    Claim/evidence pairs are tokenized together for ModernBERT, and every pair
    additionally gets one SBERT vector: the element-wise mean of the claim
    embedding and the evidence embedding.

    Args:
        df: DataFrame with 'Claim' and 'Evidence' columns. Any label column is
            ignored; this function builds inference features only.
        modernbert_tokenizer: ModernBERT tokenizer
        sbert_model: Sentence-BERT model
        max_length: Maximum sequence length for tokenization
        sbert_batch_size: Batch size for SBERT encoding

    Returns:
        Dictionary with "input_ids", "attention_mask" and "sbert_embeddings"
        tensors, all on CPU. No "labels" entry is produced.
    """
    # Same logger object as the notebook-level `logger` (looked up by module name)
    log = logging.getLogger(__name__)

    # Keep track of original SBERT device so it can be restored afterwards
    original_device = next(sbert_model.parameters()).device
    log.info(f"Original SBERT device: {original_device}")

    # Prepare inputs
    texts_claim = df["Claim"].tolist()
    texts_evidence = df["Evidence"].tolist()

    # ModernBERT tokenization - keep on CPU.
    # "only_second" truncates the evidence, never the claim.
    log.info("Tokenizing inputs for ModernBERT...")
    modernbert_features = modernbert_tokenizer(
        texts_claim,
        texts_evidence,
        padding=True,
        truncation="only_second",
        max_length=max_length,
        return_tensors="pt"
    )

    # Compute SBERT embeddings on GPU when available, then move back to CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    log.info(f"Computing SBERT embeddings on: {device}")

    def _encode_to_cpu(texts):
        """Encode `texts` with SBERT on `device` and return the result on CPU."""
        return sbert_model.encode(
            texts,
            convert_to_tensor=True,
            batch_size=sbert_batch_size,
            show_progress_bar=True,
            device=device
        ).cpu()

    # Temporarily move SBERT to the computation device; the finally clause
    # restores the original placement even if encoding raises.
    sbert_model = sbert_model.to(device)
    try:
        log.info("Computing SBERT embeddings for claims (batched)...")
        claim_embeddings = _encode_to_cpu(texts_claim)

        log.info("Computing SBERT embeddings for evidence (batched)...")
        evidence_embeddings = _encode_to_cpu(texts_evidence)
    finally:
        sbert_model = sbert_model.to(original_device)

    # Average claim and evidence embeddings in a single vectorized operation
    # (element-wise identical to averaging row by row)
    log.info("Combining embeddings...")
    sbert_embeddings = (claim_embeddings + evidence_embeddings) / 2

    # Final verification that the tokenizer outputs are on CPU
    # (the SBERT embeddings were moved to CPU right after encoding)
    log.info("Verifying all tensors are on CPU...")
    for key, tensor in modernbert_features.items():
        if tensor.is_cuda:
            log.warning(f"{key} is on CUDA, moving to CPU")
            modernbert_features[key] = tensor.cpu()

    return {
        "input_ids": modernbert_features["input_ids"],
        "attention_mask": modernbert_features["attention_mask"],
        "sbert_embeddings": sbert_embeddings,
    }

## Model Loading and Inference

These functions handle loading the pretrained model from Hugging Face Hub and running inference on new data.

In [9]:
def load_dual_embedding_model_from_hub(repo_id, device=None):
    """
    Load a DualEmbeddingModel from Hugging Face Hub.

    The repository is expected to provide the tokenizer files, the PEFT
    adapters, "classifier_weights.pt" and "optimal_threshold.txt". The base
    encoder ("answerdotai/ModernBERT-base", loaded 4-bit quantized) and the
    SBERT model ("sentence-transformers/all-MiniLM-L6-v2") are fetched from
    their own repositories.

    Args:
        repo_id: Hugging Face repository ID (e.g., "username/model-name")
        device: Device to load the model to. Defaults to "cuda" when
                available, otherwise "cpu".

    Returns:
        Tuple (model, tokenizer, sbert_model, threshold): the DualEmbeddingModel
        in eval mode, the tokenizer, the SBERT model, and the decision
        threshold read from the repository as a float.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load tokenizer from Hub
    tokenizer = AutoTokenizer.from_pretrained(repo_id)

    # Load SBERT model
    sbert_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

    # Set up quantization config
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_storage=torch.bfloat16
    )

    # Load base ModernBERT model
    base_model = AutoModel.from_pretrained(
        "answerdotai/ModernBERT-base",
        quantization_config=quant_config,
        device_map=device
    )

    # Load the PEFT adapters
    peft_model = PeftModel.from_pretrained(base_model, repo_id, inference_mode=True)

    # Create DualEmbeddingModel
    model = DualEmbeddingModel(peft_model)

    # Download and load the classifier head weights
    classifier_path = hf_hub_download(repo_id=repo_id, filename="classifier_weights.pt")
    classifier_weights = torch.load(classifier_path, map_location=device, weights_only=True)
    model.classifier.load_state_dict(classifier_weights)

    # The head is constructed on CPU and load_state_dict only copies values
    # into the existing parameters, so it has to be moved explicitly. Only the
    # head is moved: the quantized backbone was already placed via device_map.
    model.classifier.to(device)

    # Load optimal threshold
    threshold_path = hf_hub_download(repo_id=repo_id, filename="optimal_threshold.txt")
    with open(threshold_path, "r") as f:
        threshold = float(f.read().strip())

    model.eval()
    return model, tokenizer, sbert_model, threshold



## Running Inference on Test Data

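The `predict_model` function defined below runs the full inference pipeline (model loading, text cleaning, feature preparation and batched prediction); the cells that follow load the test data and call it to generate predictions.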

In [10]:
def predict_model(model_dir, test_df, batch_size=64, device=None):
    """
    Use the saved DualEmbeddingModel to make predictions on a batch of claim-evidence pairs.

    Args:
        model_dir (str): Hugging Face Hub repository ID holding the saved model
            and tokenizer; passed to `load_dual_embedding_model_from_hub`.
        test_df (pd.DataFrame): Test dataframe with 'Claim' and 'Evidence' columns.
            It is not modified; cleaning is applied to a copy.
        batch_size (int): Batch size for prediction.
        device (str, optional): Device to load the model to. Defaults to
            "cuda" when available, otherwise "cpu".

    Returns:
        pd.DataFrame: Copy of the input with cleaned 'Claim'/'Evidence' text and
        'prediction' (0/1) and 'probability' columns added.
    """
    # Resolve the default here: `.to(None)` is a no-op, so passing None through
    # would leave the classifier head and the input batches on CPU while the
    # loader places the backbone on CUDA.
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load the model
    model, tokenizer, sbert_model, threshold = load_dual_embedding_model_from_hub(model_dir, device=device)

    # Explicitly move the entire model to the specified device and make sure
    # it is in evaluation mode
    model = model.to(device)
    model.eval()

    # Preprocess on a copy so the caller's dataframe keeps its raw text and
    # re-running the notebook cell does not clean already-cleaned data
    result_df = test_df.copy()
    result_df["Claim"] = result_df["Claim"].apply(clean_text)
    result_df["Evidence"] = result_df["Evidence"].apply(clean_text)

    # Prepare the test dataset; shuffle=False keeps predictions aligned with rows
    test_features = prepare_dual_embedding_features(result_df, tokenizer, sbert_model)
    test_dataset = DualEmbeddingDataset(test_features)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Perform predictions
    all_logits = []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Inference", disable=False):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            sbert_embeddings = batch["sbert_embeddings"].to(device)

            logits = model(input_ids, attention_mask, sbert_embeddings)
            # float() is a no-op for float32 logits and keeps the later
            # .numpy() call valid should the logits ever be bfloat16
            all_logits.append(logits.float().cpu())

    # Concatenate all batches and convert logits to probabilities with sigmoid
    probabilities = torch.sigmoid(torch.cat(all_logits, dim=0)).numpy()

    # Make predictions using the optimal threshold
    predictions = (probabilities > threshold).astype(int)

    # Add predictions and probabilities to the dataframe
    result_df["prediction"] = predictions
    result_df["probability"] = probabilities

    return result_df

In [11]:
# Directory that contains test.csv
DATASET_DIR = "/kaggle/input/test-data"

test_csv_path = os.path.join(DATASET_DIR, "test.csv")
test_df = pd.read_csv(test_csv_path)

In [12]:
# Hugging Face Hub repository ID of the trained model (passed as `model_dir`)
model_dir = "ddosdub/DualEncoderModernBERT"

# Prefer the GPU when one is visible to PyTorch
inference_device = "cuda" if torch.cuda.is_available() else "cpu"
predict_df = predict_model(model_dir, test_df, batch_size=64, device=inference_device)

predict_df.head(10)

tokenizer_config.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.58M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/824 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/47.3M [00:00<?, ?B/s]

classifier_weights.pt:   0%|          | 0.00/3.55M [00:00<?, ?B/s]

optimal_threshold.txt:   0%|          | 0.00/9.00 [00:00<?, ?B/s]

Batches:   0%|          | 0/74 [00:00<?, ?it/s]

Batches:   0%|          | 0/74 [00:00<?, ?it/s]

Combining embeddings: 100%|██████████| 4688/4688 [00:00<00:00, 98443.45it/s]
Inference: 100%|██████████| 74/74 [01:23<00:00,  1.13s/it]


Unnamed: 0,Claim,Evidence,prediction,probability
0,We should further exploit geothermal energy,Taxpayer funding of research and development o...,1,0.61269
1,We should prohibit corporal punishment,"Regarding discipline, Sukhmani writes that cor...",0,0.008638
2,We should ban male infant circumcision,"Benatar and Benatar(2003) argue that""it is far...",1,0.639438
3,We should ban trans fats usage in food,"Each KIND bar is gluten free, dairy free, non ...",0,0.020886
4,We should ban boxing,About Feng Keshan and Meihuaquan: In the 1800s...,0,0.000925
5,We should adopt libertarianism,The North American Confederacy is much more ad...,1,0.86872
6,We should legalize organ trade,"In November and December 2010, Israelis and a ...",0,0.002305
7,We should introduce universal health care,"In the UK, the National Health Service(NHS) pr...",0,0.285292
8,We should ban private education,"In 2008, due to the devaluing of the A-Levels ...",0,0.067804
9,We should introduce universal health care,Uruguay is the only country in Latin America t...,0,0.48145


In [13]:
# Distribution of the predicted classes (0 vs 1) across the test set
predict_df["prediction"].value_counts()

prediction
0    2422
1    2266
Name: count, dtype: int64

In [14]:
# Total number of rows that received a prediction
len(predict_df)

4688

In [15]:
# Keep only the predicted labels for the output file
predictions = predict_df.loc[:, "prediction"]

# Write them to CSV as a single "prediction" column, without the row index
predictions.to_csv("Group_7_C.csv", index=False)