# Training Notebook
### This notebook is used to train the model on the dataset.

Install the required libraries + ensure system is set up for GPU usage.


In [None]:
%pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu124

In [2]:
%pip install -U nltk tqdm sentence-transformers transformers bitsandbytes scikit-learn peft optuna unidecode triton flash-attn

Collecting sentence-transformers
  Downloading sentence_transformers-4.0.2-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting peft
  Downloading peft-0.15.1-py3-none-any.whl.metadata (13 kB)
Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting unidecode
  Downloading Unidecode-1.3.8-py3-none-any.whl.metadata (13 kB)
Collecting flash-attn
  Downloading flash_attn-2.7.4.post1.tar.gz (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m70.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers

## Data Preparation

### Imports
Importing the required libraries and modules for the project.

In [3]:
# imports for data processing
import pandas as pd
import nltk
from nltk.corpus import wordnet
import random
import re
import os
import pickle
import logging
from tqdm.auto import tqdm
from datetime import datetime

# imports for model implementation and training
import torch
import torch.nn as nn
from transformers import (
    AutoModel, AutoTokenizer,
    BitsAndBytesConfig, TrainingArguments,
    Trainer, EarlyStoppingCallback
)
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
from peft.optimizers import create_loraplus_optimizer
import bitsandbytes as bnb
from sentence_transformers import SentenceTransformer
from torch.utils.data import Dataset
import gc
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, matthews_corrcoef, precision_recall_curve
import optuna


from functools import partial
import unidecode
import string

nltk.download('punkt_tab', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)

True

In [4]:
torch.cuda.is_available()

True

In [5]:
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[
        logging.FileHandler(f"training_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

In [6]:
def download_nltk_resources():
    """Download required NLTK resources if not already present."""
    try:
        nltk.data.find('corpora/wordnet')
        nltk.data.find('taggers/averaged_perceptron_tagger')
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('wordnet', quiet=True)
        nltk.download('punkt', quiet=True)
        nltk.download('averaged_perceptron_tagger', quiet=True)

In [7]:
def get_synonyms(word, pos=None):
    """
    Get synonyms for a word with optional part-of-speech filter.

    Args:
        word (str): The word to find synonyms for.
        pos (str, optional): Part of speech (N, V, J, R). Defaults to None.

    Returns:
        list: A list of synonyms.
    """
    synonyms = []

    wordnet_pos = {
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'J': wordnet.ADJ,
        'R': wordnet.ADV
    }

    if pos and pos[0] in wordnet_pos:
        synsets = wordnet.synsets(word, pos=wordnet_pos[pos[0]])
    else:
        synsets = wordnet.synsets(word)

    for synset in synsets:
        for lemma in synset.lemmas():
            synonym = lemma.name().replace('_', ' ')
            if synonym != word and synonym not in synonyms:
                synonyms.append(synonym)

    return synonyms

def get_wordnet_pos(tag):
    """
    Map POS tag from NLTK tagger to WordNet format.

    Args:
        tag (str): The POS tag from NLTK.

    Returns:
        str: The corresponding WordNet POS tag.
    """
    tag = tag[0].upper()
    tag_dict = {"J": "J", "N": "N", "V": "V", "R": "R"}
    return tag_dict.get(tag, None)

In [8]:
def augment_text_with_synonyms(text, replace_prob=0.3, max_replacements=5):
    """
    Augment text by replacing words with their synonyms.

    Args:
        text (str): The input text to augment.
        replace_prob (float): Probability of replacing a word with a synonym.
        max_replacements (int): Maximum number of replacements to make.

    Returns:
        str: The augmented text.
    """
    words = nltk.word_tokenize(text)
    tagged_words = nltk.pos_tag(words)

    num_replacements = 0

    for i, (word, tag) in enumerate(tagged_words):
        # Skip short words, stopwords, punctuation
        if len(word) <= 3 or not word.isalpha():
            continue

        if random.random() > replace_prob:
            continue

        pos = get_wordnet_pos(tag)
        if not pos:
            continue

        synonyms = get_synonyms(word, pos)
        if not synonyms:
            continue

        words[i] = random.choice(synonyms)
        num_replacements += 1

        if num_replacements >= max_replacements:
            break

    augmented_text = ' '.join(words)
    augmented_text = re.sub(r'\s+([,.!?:;])', r'\1', augmented_text)

    return augmented_text

In [9]:
def create_balanced_augmented_dataset(train_df, cache_dir="./cached_data", force_rebuild=False):
    """
    Create a balanced dataset by augmenting minority class examples.


    Args:
        train_df (pd.DataFrame): The training dataset with 'Claim', 'Evidence', and 'label' columns.
        cache_dir (str): Directory to cache the augmented dataset.
        force_rebuild (bool): Force rebuild of the dataset even if cached.

    Returns:
        pd.DataFrame: The balanced and augmented dataset.

    Raises:
        ValueError: If the dataset does not contain the required columns.
    """
    # Download NLTK resources
    download_nltk_resources()

    # Set up cache directory
    os.makedirs(cache_dir, exist_ok=True)
    cache_file = os.path.join(cache_dir, "augmented_training_data.pkl")

    # Use cached data if available
    if os.path.exists(cache_file) and not force_rebuild:
        logger.info("Loading cached augmented dataset...")
        with open(cache_file, 'rb') as f:
            return pickle.load(f)

    logger.info("Creating augmented dataset...")

    # Calculate class distribution
    class_counts = train_df["label"].value_counts()
    positive_samples = train_df[train_df["label"] == 1].copy()
    negative_samples = train_df[train_df["label"] == 0].copy()

    # Calculate augmentation factor to balance classes
    class_ratio = len(negative_samples) / len(positive_samples)
    n_augmentations = max(1, int(class_ratio - 1))

    logger.info(f"Creating {n_augmentations} augmentations for each positive sample")

    # Mark original samples
    positive_samples['is_augmented'] = False
    negative_samples['is_augmented'] = False

    # Create augmentations
    augmented_rows = []

    for _, row in tqdm(positive_samples.iterrows(), total=len(positive_samples), desc="Augmenting"):
        for i in range(n_augmentations):
            new_row = row.to_dict()

            # Augment claim and evidence
            new_row["Claim"] = augment_text_with_synonyms(
                new_row["Claim"],
                replace_prob=0.3,
                max_replacements=5
            )

            new_row["Evidence"] = augment_text_with_synonyms(
                new_row["Evidence"],
                replace_prob=0.3,
                max_replacements=5
            )

            new_row['is_augmented'] = True
            augmented_rows.append(new_row)

    # Create augmented dataframe
    augmented_df = pd.DataFrame(augmented_rows)

    # Combine with original data
    balanced_df = pd.concat([negative_samples, positive_samples, augmented_df])

    # Shuffle the dataset
    balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Cache the result
    logger.info(f"Saving augmented dataset to {cache_file}")
    with open(cache_file, 'wb') as f:
        pickle.dump(balanced_df, f)

    # Log statistics
    logger.info(f"Original class distribution: {class_counts.to_dict()}")
    logger.info(f"New class distribution: {balanced_df['label'].value_counts().to_dict()}")
    logger.info(f"Original dataset: {len(train_df)} rows")
    logger.info(f"Augmented dataset: {len(balanced_df)} rows")

    return balanced_df

## Model Architecture

In [10]:
class DualEmbeddingDataset(Dataset):
    """
    Dataset for dual embedding model that ensures all tensors are on CPU.
    """
    def __init__(self, features):
        """
        Initialize the dataset with preprocessed features.

        Args:
            features: Dictionary of feature tensors including input_ids,
                      attention_mask, sbert_embeddings, and labels
        """
        self.input_ids = features["input_ids"]
        self.attention_mask = features["attention_mask"]
        self.sbert_embeddings = features["sbert_embeddings"]
        self.labels = features["labels"]

        # Ensure all tensors are on CPU
        if self.input_ids.is_cuda:
            self.input_ids = self.input_ids.cpu()
        if self.attention_mask.is_cuda:
            self.attention_mask = self.attention_mask.cpu()
        if self.sbert_embeddings.is_cuda:
            self.sbert_embeddings = self.sbert_embeddings.cpu()
        if self.labels.is_cuda:
            self.labels = self.labels.cpu()

        # Validate tensor shapes
        assert len(self.input_ids) == len(self.attention_mask) == len(self.sbert_embeddings) == len(self.labels), \
            "All feature tensors must have the same first dimension"

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """
        Get a single example from the dataset.

        Args:
            idx: Index to retrieve

        Returns:
            Dictionary of tensors for the given index
        """
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "sbert_embeddings": self.sbert_embeddings[idx],
            "labels": self.labels[idx]
        }

In [11]:
class DualEmbeddingModel(nn.Module):
    """
    Dual embedding model combining ModernBERT and SBERT embeddings.

    Args:
        modernbert_model (nn.Module): Pre-trained ModernBERT model.
        sbert_dim (int): Dimension of SBERT embeddings.
        hidden_size (int): Hidden size for the classifier.
        dropout_rate (float): Dropout rate for the classifier.
    """

    def __init__(self, modernbert_model, sbert_dim=384, hidden_size=512, dropout_rate=0.1):
        super(DualEmbeddingModel, self).__init__()
        self.modernbert = modernbert_model

        # Get embedding dimensions
        self.modernbert_dim = modernbert_model.config.hidden_size  # 768 for ModernBERT-base
        self.sbert_dim = sbert_dim

        # Classifier with variable hidden size
        self.classifier = nn.Sequential(
            nn.Linear(self.modernbert_dim + self.sbert_dim, hidden_size),
            nn.LayerNorm(hidden_size),  # Normalization helps training stability
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size, 1)
        )

    @property
    def device(self):
        """
        Property that returns the device where the model parameters are located.
        """
        return next(self.parameters()).device

    def forward(self, input_ids, attention_mask, sbert_embeddings, labels=None):
        # Ensure inputs are on the same device as the model parameters
        device = self.device
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        sbert_embeddings = sbert_embeddings.to(device)

        # Get ModernBERT embedding for [CLS] token
        modernbert_outputs = self.modernbert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        modernbert_embedding = modernbert_outputs.last_hidden_state[:, 0, :]  # [CLS] token

        # Concatenate embeddings
        combined_embedding = torch.cat([modernbert_embedding, sbert_embeddings], dim=1)

        # Classify
        logits = self.classifier(combined_embedding).squeeze(-1)

        # Return logits only (loss will be calculated in the trainer)
        return logits

In [12]:
# Helper functions
def find_optimal_threshold(y_true, y_pred):
    """
    Find the optimal threshold for classification based on F1 score.

    Args:
        y_true (np.ndarray): True labels.
        y_pred (np.ndarray): Predicted probabilities.

    Returns:
        float: Optimal threshold.
    """
    precisions, recalls, thresholds = precision_recall_curve(y_true, y_pred)

    f1_scores = np.divide(
        2 * (precisions * recalls),
        (precisions + recalls),
        out=np.zeros_like(precisions),
        where=(precisions + recalls) > 0
    )

    best_idx = np.argmax(f1_scores[:-1])
    best_threshold = thresholds[best_idx]
    best_f1 = f1_scores[best_idx]

    logger.info(f"Best threshold: {best_threshold:.4f} with F1: {best_f1:.4f}")

    return best_threshold

def compute_metrics(eval_pred, threshold=0.5):
    """
    Compute evaluation metrics for model predictions.

    Args:
        eval_pred (tuple): Tuple containing logits and labels.
        threshold (float): Threshold for binary classification.

    Returns:
        dict: Dictionary containing accuracy, precision, recall, F1 score, and Matthews correlation coefficient.
    """
    logits, labels = eval_pred

    if logits.ndim == 2 and logits.shape[1] == 1:
        logits = logits.squeeze(1)

    probs = torch.sigmoid(torch.tensor(logits)).numpy()
    predictions = (probs > threshold).astype(int)

    accuracy = accuracy_score(labels, predictions)

    macro_p, macro_r, macro_f1, _ = precision_recall_fscore_support(
        labels, predictions, average="macro"
    )

    w_macro_p, w_macro_r, w_macro_f1, _ = precision_recall_fscore_support(
        labels, predictions, average="weighted"
    )

    mcc = matthews_corrcoef(labels, predictions)

    return {
        'accuracy': accuracy,
        'macro_p': macro_p,
        'macro_r': macro_r,
        'macro_f1': macro_f1,
        'w_macro_p': w_macro_p,
        'w_macro_r': w_macro_r,
        'w_macro_f1': w_macro_f1,
        'mcc': mcc,
        'threshold': threshold
    }

In [13]:
# Custom trainer with pruning support
class PruningTrainer(Trainer):
    """
    Custom trainer with Optuna pruning support.

    Args:
        trial (optuna.Trial): Optuna trial object for pruning.
        pos_weight (torch.Tensor): Positive class weight for loss calculation.
        threshold (float): Threshold for binary classification.
        **kwargs: Additional arguments for Trainer.

    Attributes:
        trial (optuna.Trial): Optuna trial object.
        pos_weight (torch.Tensor): Positive class weight.
        threshold (float): Threshold for binary classification.
        compute_loss (callable): Loss computation function.
        evaluate (callable): Evaluation function.
    """

    def __init__(self, trial=None, pos_weight=None, threshold=0.5, **kwargs):
        super().__init__(**kwargs)
        self.trial = trial
        self.pos_weight = pos_weight
        self.threshold = threshold

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        # Move tensors to the model's device
        device = model.device
        labels = inputs.pop("labels").float().to(device)

        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)
        sbert_embeddings = inputs["sbert_embeddings"].to(device)

        # Forward pass
        logits = model(input_ids, attention_mask, sbert_embeddings)

        # Loss calculation with positive class weighting
        loss_fn = torch.nn.BCEWithLogitsLoss(pos_weight=self.pos_weight.to(device))
        loss = loss_fn(logits.view(-1), labels.view(-1))

        if return_outputs:
            return loss, {"loss": loss, "logits": logits}
        return loss

    def evaluate(self, **kwargs):
        metrics = super().evaluate(**kwargs)

        # Get predictions on evaluation set
        eval_pred = self.predict(self.eval_dataset)
        logits = eval_pred.predictions
        labels = eval_pred.label_ids

        if logits.ndim == 2 and logits.shape[1] == 1:
            logits = logits.squeeze(1)

        probs = torch.sigmoid(torch.tensor(logits)).numpy().flatten()

        # Find optimal threshold
        optimal_threshold = find_optimal_threshold(labels, probs)
        self.threshold = optimal_threshold

        # Calculate metrics with optimal threshold
        threshold_metrics = compute_metrics((logits, labels), threshold=optimal_threshold)

        for key, value in threshold_metrics.items():
            metrics[f"eval_optimal_{key}"] = value

        # Report to Optuna for pruning if trial is provided
        if self.trial:
            current_step = self.state.global_step
            w_macro_f1 = metrics.get("eval_optimal_w_macro_f1", 0.0)
            self.trial.report(w_macro_f1, current_step)

            # Check if the trial should be pruned
            if self.trial.should_prune():
                message = f"Trial {self.trial.number} pruned at step {current_step} with value {w_macro_f1:.4f}"
                logger.info(message)
                raise optuna.exceptions.TrialPruned(message)

        return metrics

In [14]:
def prepare_dual_embedding_features(df, modernbert_tokenizer, sbert_model, max_length=512, sbert_batch_size=64):
    """
    Prepare features for the dual embedding model, ensuring all tensors remain on CPU.

    Args:
        df: DataFrame with 'Claim', 'Evidence', and label columns
        modernbert_tokenizer: ModernBERT tokenizer
        sbert_model: Sentence-BERT model
        max_length: Maximum sequence length for tokenization
        sbert_batch_size: Batch size for SBERT encoding

    Returns:
        Dictionary of feature tensors with input_ids, attention_mask, sbert_embeddings, and labels
    """
    # Keep track of original SBERT device
    original_device = next(sbert_model.parameters()).device
    logger.info(f"Original SBERT device: {original_device}")

    # Prepare inputs
    texts_claim = df["Claim"].tolist()
    texts_evidence = df["Evidence"].tolist()

    # ModernBERT tokenization - keep on CPU
    logger.info("Tokenizing inputs for ModernBERT...")
    modernbert_features = modernbert_tokenizer(
        texts_claim,
        texts_evidence,
        padding=True,
        truncation="only_second",
        max_length=max_length,
        return_tensors="pt"
    )

    # Compute SBERT embeddings on GPU, then move back to CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info(f"Computing SBERT embeddings on: {device}")

    # Temporarily move SBERT to computation device
    sbert_model = sbert_model.to(device)

    # Compute claim embeddings
    logger.info("Computing SBERT embeddings for claims (batched)...")
    claim_embeddings = sbert_model.encode(
        texts_claim,
        convert_to_tensor=True,
        batch_size=sbert_batch_size,
        show_progress_bar=True,
        device=device
    )

    # Move claim embeddings to CPU immediately
    claim_embeddings = claim_embeddings.cpu()

    # Compute evidence embeddings
    logger.info("Computing SBERT embeddings for evidence (batched)...")
    evidence_embeddings = sbert_model.encode(
        texts_evidence,
        convert_to_tensor=True,
        batch_size=sbert_batch_size,
        show_progress_bar=True,
        device=device
    )

    # Move evidence embeddings to CPU immediately
    evidence_embeddings = evidence_embeddings.cpu()

    # Return SBERT to original device
    sbert_model = sbert_model.to(original_device)

    # Combine claim and evidence embeddings on CPU
    logger.info("Combining embeddings...")
    combined_embeddings = []
    for claim_emb, evid_emb in tqdm(zip(claim_embeddings, evidence_embeddings),
                                  total=len(claim_embeddings),
                                  desc="Combining embeddings"):
        # Use average of the claim and evidence embeddings
        combined_emb = (claim_emb + evid_emb) / 2
        combined_embeddings.append(combined_emb)

    sbert_embeddings = torch.stack(combined_embeddings)

    # Prepare labels
    if "label" in df.columns:
        label_col = "label"
    elif "labels" in df.columns:
        label_col = "labels"
    else:
        raise ValueError("DataFrame must contain 'label' or 'labels' column")

    # Keep labels on CPU
    labels = torch.tensor(df[label_col].values, dtype=torch.float)

    # Final verification that all tensors are on CPU
    logger.info("Verifying all tensors are on CPU...")
    for key, tensor in modernbert_features.items():
        if tensor.is_cuda:
            logger.warning(f"{key} is on CUDA, moving to CPU")
            modernbert_features[key] = tensor.cpu()

    if sbert_embeddings.is_cuda:
        logger.warning("sbert_embeddings is on CUDA, moving to CPU")
        sbert_embeddings = sbert_embeddings.cpu()

    if labels.is_cuda:
        logger.warning("labels is on CUDA, moving to CPU")
        labels = labels.cpu()

    return {
        "input_ids": modernbert_features["input_ids"],
        "attention_mask": modernbert_features["attention_mask"],
        "sbert_embeddings": sbert_embeddings,
        "labels": labels
    }

In [None]:
# Main training function
def train_dual_embedding_model(train_df, val_df, params, trial=None, output_dir="./results",
                              use_saved_data=True, data_dir="./cached_data", save_model=True):
    """
    Train the dual embedding model with ModernBERT and SBERT.

    Args:
        train_df (pd.DataFrame): Training dataset with 'Claim', 'Evidence', and 'label' columns.
        val_df (pd.DataFrame): Validation dataset with 'Claim', 'Evidence', and 'label' columns.
        params (dict): Hyperparameters for training.
        trial (optuna.Trial, optional): Optuna trial object for pruning.
        output_dir (str): Directory to save model and results.
        use_saved_data (bool): Use cached data if available.
        data_dir (str): Directory to cache data.
        save_model (bool): Save the trained model.

    Returns:
        dict: Evaluation results including accuracy, F1 scores, and Matthews correlation coefficient.

    Raises:
        ValueError: If the dataset does not contain the required columns.
    """
    os.makedirs(data_dir, exist_ok=True)
    os.makedirs(output_dir, exist_ok=True)

    train_cache_file = os.path.join(data_dir, "train_dual_features.pkl")
    val_cache_file = os.path.join(data_dir, "val_dual_features.pkl")

    # Extract hyperparameters
    learning_rate = params.get('learning_rate', 1e-4)
    batch_size = params.get('batch_size', 32)
    num_epochs = params.get('num_epochs', 5)
    weight_decay = params.get('weight_decay', 0.05)
    warmup_ratio = params.get('warmup_ratio', 0.1)
    gradient_accumulation_steps = params.get('gradient_accumulation_steps', 2)
    lora_r = params.get('lora_r', 32)
    lora_alpha = params.get('lora_alpha', 24)
    lora_dropout = params.get('lora_dropout', 0.1)
    classifier_dropout = params.get('classifier_dropout', 0.1)
    classifier_hidden_size = params.get('classifier_hidden_size', 512)

    # Load models
    logger.info("Loading models and tokenizers...")
    modernbert_name = "answerdotai/ModernBERT-base"
    sbert_name = "sentence-transformers/all-MiniLM-L6-v2"

    modernbert_tokenizer = AutoTokenizer.from_pretrained(modernbert_name, strip_accents=True)
    sbert_model = SentenceTransformer(sbert_name)

    # Prepare datasets (with caching)
    if use_saved_data and os.path.exists(train_cache_file) and os.path.exists(val_cache_file):
        logger.info("Loading cached features...")
        with open(train_cache_file, 'rb') as f:
            train_features = pickle.load(f)
        with open(val_cache_file, 'rb') as f:
            val_features = pickle.load(f)
    else:
        logger.info("Generating features...")
        train_features = prepare_dual_embedding_features(
            train_df, modernbert_tokenizer, sbert_model
        )
        val_features = prepare_dual_embedding_features(
            val_df, modernbert_tokenizer, sbert_model
        )

        # Cache the features
        logger.info("Saving features to cache...")
        with open(train_cache_file, 'wb') as f:
            pickle.dump(train_features, f)
        with open(val_cache_file, 'wb') as f:
            pickle.dump(val_features, f)

    # Create datasets
    train_dataset = DualEmbeddingDataset(train_features)
    val_dataset = DualEmbeddingDataset(val_features)

    # Load ModernBERT with QLoRA
    logger.info("Setting up ModernBERT with QLoRA...")
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_storage=torch.bfloat16
    )

    modernbert = AutoModel.from_pretrained(
        modernbert_name,
        quantization_config=quant_config,
        device_map="auto"
    )

    modernbert.gradient_checkpointing_enable()
    logger.info("Gradient checkpointing enabled")

    modernbert = prepare_model_for_kbit_training(modernbert)

    # Configure LoRA
    logger.info("Configuring LoRA adapters...")
    peft_config = LoraConfig(
        task_type=TaskType.FEATURE_EXTRACTION,
        inference_mode=False,
        r=lora_r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        target_modules=['Wqkv', 'Wi', 'Wo', 'dense']
    )

    modernbert = get_peft_model(modernbert, peft_config)

    # Log trainable parameters
    trainable_params = sum(p.numel() for p in modernbert.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in modernbert.parameters())
    logger.info(f"Trainable parameters: {trainable_params:,} ({trainable_params / total_params:.2%} of total)")

    # Create the combined model
    logger.info("Creating dual embedding model...")
    model = DualEmbeddingModel(
        modernbert,
        hidden_size=classifier_hidden_size,
        dropout_rate=classifier_dropout
    )

    # Class weights for imbalanced data
    if "label" in train_df.columns:
        label_col = "label"
    else:
        label_col = "labels"

    class_counts = train_df[label_col].value_counts()
    pos_weight = torch.tensor([class_counts[0] / class_counts[1]], dtype=torch.float16)
    logger.info(f"Positive class weight: {pos_weight.item():.4f}")

    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size * 2,
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_ratio=warmup_ratio,
        weight_decay=weight_decay,
        learning_rate=learning_rate,
        eval_strategy="steps",
        eval_steps=300,
        logging_steps=100,
        save_strategy="steps",
        save_steps=300,
        load_best_model_at_end=True,
        metric_for_best_model="w_macro_f1",  # Using weighted macro F1 score
        greater_is_better=True,
        fp16=True,
        bf16=False,
        save_total_limit=1,
        report_to="none",
        dataloader_pin_memory=False,
    )

    # Create optimizer with LoRA+
    logger.info("Setting up LoRA+ optimizer...")
    optimizer = create_loraplus_optimizer(
        model=model,
        optimizer_cls=bnb.optim.AdamW8bit,
        lr=learning_rate,
        loraplus_lr_ratio=16.0
    )

    # Create trainer
    logger.info("Creating trainer with pruning support...")
    trainer = PruningTrainer(
        trial=trial,  # Pass trial for pruning
        pos_weight=pos_weight,
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
        optimizers=(optimizer, None),
    )

    # Train model
    logger.info("Starting training...")
    trainer.train()

    # Evaluate
    logger.info("Evaluating model...")
    eval_results = trainer.evaluate()

    # Save model and tokenizer
    if save_model:
        logger.info("Saving model...")
        model_dir = os.path.join(output_dir, "best_model")
        os.makedirs(model_dir, exist_ok=True)
        
        # Save the tokenizer
        modernbert_tokenizer.save_pretrained(model_dir)
        
        # Save PEFT adapters separately
        model.modernbert.save_pretrained(model_dir)
        
        # Save the classifier weights separately
        torch.save(model.classifier.state_dict(), os.path.join(model_dir, "classifier_weights.pt"))
        
        # Save optimal threshold
        with open(os.path.join(model_dir, "optimal_threshold.txt"), "w") as f:
            f.write(str(trainer.threshold))

    # Clean up
    del model, modernbert, sbert_model, optimizer
    torch.cuda.empty_cache()
    gc.collect()

    return eval_results

# Hyperparameters Optimization

In [16]:
def set_seed(seed=42):
    """
    Set random seeds for reproducibility.

    Args:
        seed (int): The seed value to set.
    """
    import random
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.backends.cudnn.deterministic = True

In [17]:
def clean_text(text):
    """
    Clean text by removing reference tags and normalizing whitespace.

    Args:
        text (str): The input text to clean.

    Returns:
        str: The cleaned text.
    """
    # Remove reference tags
    cleaned_text = re.sub(r"\[REF\]|\[REF|REF\]", "", text).strip()

    # Normalize text
    cleaned_text = unidecode.unidecode(cleaned_text)

    punctuations = re.escape(string.punctuation)  # escape special characters like [ ] ( ) etc.

    # Remove spaces between letter and punctuation
    cleaned_text = re.sub(r"([a-zA-Z])\s+([{}])".format(punctuations), r"\1\2", cleaned_text)
    # Remove spaces between punctuation and another punctuation
    cleaned_text = re.sub(r"([{}])\s+([{}])".format(punctuations, punctuations), r"\1\2", cleaned_text)

    # Remove extra whitespaces
    cleaned_text = re.sub(r"\s+", " ", cleaned_text).strip()

    return cleaned_text

In [25]:
def objective(trial, train_df, val_df):
    """
    Objective function for Optuna hyperparameter optimization.

    Args:
        trial (optuna.Trial): The Optuna trial object.
        train_df (pd.DataFrame): The training dataset.
        val_df (pd.DataFrame): The validation dataset.

    Returns:
        float: The evaluation metric (weighted macro F1 score).

    Raises:
        optuna.exceptions.TrialPruned: If the trial is pruned.
        Exception: If any other error occurs during training.
    """
    # Sample hyperparameters
    params = {
        'learning_rate': trial.suggest_float("learning_rate", 5e-6, 3e-4, log=True),
        'batch_size': trial.suggest_categorical("batch_size", [64, 128, 256]),
        'num_epochs': trial.suggest_categorical("num_epochs", [3, 4, 5]),
        'weight_decay': trial.suggest_float("weight_decay", 0.0, 0.1),
        'warmup_ratio': trial.suggest_float("warmup_ratio", 0.0, 0.2),
        'gradient_accumulation_steps': trial.suggest_categorical("gradient_accumulation_steps", [1, 2, 4]),
        'lora_r': trial.suggest_int("lora_r", 16, 64, step=8),
        'lora_alpha': trial.suggest_int("lora_alpha", 16, 64, step=8),
        'lora_dropout': trial.suggest_float("lora_dropout", 0.0, 0.2),
        'classifier_dropout': trial.suggest_float("classifier_dropout", 0.1, 0.5),
        'classifier_hidden_size': trial.suggest_categorical("classifier_hidden_size", [384, 512, 768])
    }

    # Create trial directory
    trial_dir = f"./results/trial_{trial.number}"
    os.makedirs(trial_dir, exist_ok=True)

    try:
        # Train model with these parameters, passing trial for pruning
        results = train_dual_embedding_model(
            train_df,
            val_df,
            params,
            trial=trial,  # Pass trial for pruning
            output_dir=trial_dir,
            use_saved_data=True,
            data_dir=f"./cached_data",
            save_model=False  # Only save the final model
        )

        # Explicitly return the weighted macro F1 score
        metric_value = results.get("eval_optimal_w_macro_f1", 0.0)
        logger.info(f"Trial {trial.number} - weighted macro F1: {metric_value:.4f}")

        # Clean up CUDA memory after each trial
        torch.cuda.empty_cache()
        gc.collect()

        return metric_value

    except optuna.exceptions.TrialPruned as e:
        # Re-raise pruning exception to properly handle it in Optuna
        raise e
    except Exception as e:
        logger.error(f"Trial {trial.number} failed with error: {str(e)}")
        # Return a very low score so this trial is considered a failure
        return 0.0

In [26]:
def optimize_hyperparameters(train_df, val_df, n_trials=20):
    """
    Run hyperparameter optimization using Optuna with pruning.

    Args:
        train_df (pd.DataFrame): The training dataset.
        val_df (pd.DataFrame): The validation dataset.
        n_trials (int): Number of trials for hyperparameter optimization.

    Returns:
        dict: Best hyperparameters found during optimization.

    Raises:
        ValueError: If the dataset does not contain the required columns.

    Raises:
        RuntimeError: If the model training fails.
    """
    logger.info(f"Starting hyperparameter optimization with {n_trials} trials")

    # Create output directory
    os.makedirs("./results", exist_ok=True)

    # Get device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info(f"Using device: {device}")

    # Create study object with pruning
    study = optuna.create_study(
        direction="maximize",  # Maximize weighted macro F1
        study_name="dual_embedding_hyperparameter_optimization",
        load_if_exists=True,
        pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=2)
    )

    # Create objective function with fixed parameters
    objective_func = partial(
        objective,
        train_df=train_df,
        val_df=val_df
    )

    # Run optimization
    study.optimize(objective_func, n_trials=n_trials)

    # Log best parameters
    logger.info("Hyperparameter optimization complete!")
    logger.info(f"Best trial: {study.best_trial.number}")
    logger.info(f"Best weighted macro F1: {study.best_trial.value:.4f}")
    logger.info("Best hyperparameters:")
    for key, value in study.best_params.items():
        logger.info(f"  {key}: {value}")

    # Save best parameters
    best_params_file = "./results/best_params.txt"
    with open(best_params_file, "w") as f:
        for key, value in study.best_params.items():
            f.write(f"{key}: {value}\n")

    return study.best_params

In [27]:
def main(dataset_path: str):
    """Main function for the evidence detection pipeline."""
    logger.info("Starting evidence detection pipeline")

    # Set random seed
    set_seed(42)

    # 1. Load and preprocess data
    logger.info("Loading datasets...")
    train_df = pd.read_csv(f"{dataset_path}/train.csv")
    test_df = pd.read_csv(f"{dataset_path}/dev.csv")

    logger.info("Preprocessing data...")
    train_df["Evidence"] = train_df["Evidence"].apply(clean_text)
    train_df["Claim"] = train_df["Claim"].apply(clean_text)
    test_df["Evidence"] = test_df["Evidence"].apply(clean_text)
    test_df["Claim"] = test_df["Claim"].apply(clean_text)

    # Convert labels to the right format
    train_df["label"] = train_df["label"].astype(np.float16)
    test_df["label"] = test_df["label"].astype(np.float16)

    # 2. Data augmentation (training data only)
    logger.info("Augmenting training data...")

    train_df_augmented = create_balanced_augmented_dataset(
        train_df,
        cache_dir="./cached_data",
        force_rebuild=False
    )

    # 3. Run hyperparameter optimization
    logger.info("Starting hyperparameter optimization...")
    best_params = optimize_hyperparameters(
        train_df_augmented,
        test_df,
        n_trials=10
    )

    # 4. Train final model with best parameters
    logger.info("Training final model with best hyperparameters...")

    final_results = train_dual_embedding_model(
        train_df_augmented,
        test_df,
        best_params,
        output_dir="./final_model",
        use_saved_data=True,
        data_dir="./cached_data/final_model",
        save_model=True
    )

    # 5. Log final results
    logger.info("Training complete! Final evaluation results:")
    for key, value in final_results.items():
        if key.startswith("eval_optimal_"):
            logger.info(f"  {key}: {value:.4f}")

    logger.info("Evidence detection pipeline completed successfully!")

In [None]:
DATASET_PATH = "training_data/ED"

main(DATASET_PATH)

[I 2025-04-05 01:39:17,624] A new study created in memory with name: dual_embedding_hyperparameter_optimization
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
W0405 01:39:25.730000 285 torch/_dynamo/convert_frame.py:906] [34/8] torch._dynamo hit config.cache_size_limit (8)
W0405 01:39:25.730000 285 torch/_dynamo/convert_frame.py:906] [34/8]    function: 'torch_dynamo_resume_in_forward_at_494' (/usr/local/lib/python3.11/dist-packages/peft/tuners/lora/bnb.py:494)
W0405 01:39:25.730000 285 torch/_dynamo/convert_frame.py:906] [34/8]    last reason: 34/4: GLOBAL_STATE changed: grad_mode 
W0405 01:39:25.730000 285 torch/_dynamo/convert_frame.py:906] [34/8] To log all recompilation reasons, use TORCH_LOGS="recompiles".
W0405 01:39:25.730000 285 torch/_dynamo/convert_frame.py:906] [34/8] To diagnose recompilation issues, see https://pytorch.org/docs/main/torch.compiler_troubleshooting.html.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss,Accuracy,Macro P,Macro R,Macro F1,W Macro P,W Macro R,W Macro F1,Mcc,Threshold
300,0.1763,0.35286,0.874958,0.839891,0.857655,0.847912,0.879341,0.874958,0.876549,0.69732,0.5


  return fn(*args, **kwargs)
There were unexpected keys in the checkpoint model loaded: ['modernbert.base_model.model.layers.0.attn.Wqkv.base_layer.weight.absmax', 'modernbert.base_model.model.layers.0.attn.Wqkv.base_layer.weight.nested_absmax', 'modernbert.base_model.model.layers.0.attn.Wqkv.base_layer.weight.nested_quant_map', 'modernbert.base_model.model.layers.0.attn.Wqkv.base_layer.weight.quant_map', 'modernbert.base_model.model.layers.0.attn.Wqkv.base_layer.weight.quant_state.bitsandbytes__nf4', 'modernbert.base_model.model.layers.0.attn.Wo.base_layer.weight.absmax', 'modernbert.base_model.model.layers.0.attn.Wo.base_layer.weight.nested_absmax', 'modernbert.base_model.model.layers.0.attn.Wo.base_layer.weight.nested_quant_map', 'modernbert.base_model.model.layers.0.attn.Wo.base_layer.weight.quant_map', 'modernbert.base_model.model.layers.0.attn.Wo.base_layer.weight.quant_state.bitsandbytes__nf4', 'modernbert.base_model.model.layers.0.mlp.Wi.base_layer.weight.absmax', 'modernbert.b

[I 2025-04-05 01:46:27,182] Trial 0 finished with value: 0.8795850266988647 and parameters: {'learning_rate': 6.621764186962187e-05, 'batch_size': 128, 'num_epochs': 5, 'weight_decay': 0.06904263816649328, 'warmup_ratio': 0.05045906450115829, 'gradient_accumulation_steps': 2, 'lora_r': 64, 'lora_alpha': 56, 'lora_dropout': 0.13374058436691866, 'classifier_dropout': 0.26738450165760114, 'classifier_hidden_size': 512}. Best is trial 0 with value: 0.8795850266988647.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss,Accuracy,Macro P,Macro R,Macro F1,W Macro P,W Macro R,W Macro F1,Mcc,Threshold
300,0.3588,0.364883,0.859433,0.82347,0.827912,0.825636,0.860468,0.859433,0.859912,0.651366,0.5
600,0.2515,0.35707,0.870233,0.837886,0.838013,0.83795,0.870257,0.870233,0.870245,0.675899,0.5
900,0.1792,0.513563,0.865677,0.834304,0.826582,0.83029,0.864316,0.865677,0.864892,0.66084,0.5
1200,0.1246,0.430747,0.871752,0.834943,0.861838,0.846287,0.879661,0.871752,0.874222,0.696261,0.5
1500,0.0326,0.67601,0.872764,0.836509,0.859714,0.846584,0.879145,0.872764,0.874882,0.695837,0.5


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
There were unexpected keys in the checkpoint model loaded: ['modernbert.base_model.model.layers.0.attn.Wqkv.base_layer.weight.absmax', 'modernbert.base_model.model.layers.0.attn.Wqkv.base_layer.weight.nested_absmax', 'modernbert.base_model.model.layers.0.attn.Wqkv.base_layer.weight.nested_quant_map', 'modernbert.base_model.model.layers.0.attn.Wqkv.base_layer.weight.quant_map', 'modernbert.base_model.model.layers.0.attn.Wqkv.base_layer.weight.quant_state.bitsandbytes__nf4', 'modernbert.base_model.model.layers.0.attn.Wo.base_layer.weight.absmax', 'modernbert.base_model.model.layers.0.attn.Wo.base_layer.weight.nested_absmax', 'modernbert.base_model.model.layers.0.attn.Wo.base_layer.weight.nested_quant_map', 'modernbert.base_model.model.layers.0.attn.Wo.base_layer.weight.quant_map', 'modernbert.base_model.model.layers.0.attn.Wo.base_layer.weight.q

[I 2025-04-05 01:58:54,181] Trial 1 finished with value: 0.8778958089512661 and parameters: {'learning_rate': 0.000244072640622032, 'batch_size': 64, 'num_epochs': 4, 'weight_decay': 0.02841726112769487, 'warmup_ratio': 0.07826637472454721, 'gradient_accumulation_steps': 1, 'lora_r': 32, 'lora_alpha': 56, 'lora_dropout': 0.08735198376670755, 'classifier_dropout': 0.10128407262244715, 'classifier_hidden_size': 384}. Best is trial 0 with value: 0.8795850266988647.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss,Accuracy,Macro P,Macro R,Macro F1,W Macro P,W Macro R,W Macro F1,Mcc,Threshold
300,0.2629,0.371448,0.852514,0.81408,0.858702,0.829186,0.872729,0.852514,0.857372,0.671301,0.5


  return fn(*args, **kwargs)
There were unexpected keys in the checkpoint model loaded: ['modernbert.base_model.model.layers.0.attn.Wqkv.base_layer.weight.absmax', 'modernbert.base_model.model.layers.0.attn.Wqkv.base_layer.weight.nested_absmax', 'modernbert.base_model.model.layers.0.attn.Wqkv.base_layer.weight.nested_quant_map', 'modernbert.base_model.model.layers.0.attn.Wqkv.base_layer.weight.quant_map', 'modernbert.base_model.model.layers.0.attn.Wqkv.base_layer.weight.quant_state.bitsandbytes__nf4', 'modernbert.base_model.model.layers.0.attn.Wo.base_layer.weight.absmax', 'modernbert.base_model.model.layers.0.attn.Wo.base_layer.weight.nested_absmax', 'modernbert.base_model.model.layers.0.attn.Wo.base_layer.weight.nested_quant_map', 'modernbert.base_model.model.layers.0.attn.Wo.base_layer.weight.quant_map', 'modernbert.base_model.model.layers.0.attn.Wo.base_layer.weight.quant_state.bitsandbytes__nf4', 'modernbert.base_model.model.layers.0.mlp.Wi.base_layer.weight.absmax', 'modernbert.b

[I 2025-04-05 02:06:00,358] Trial 2 finished with value: 0.8645022567854153 and parameters: {'learning_rate': 5.953293710073359e-05, 'batch_size': 128, 'num_epochs': 5, 'weight_decay': 0.029290957589980363, 'warmup_ratio': 0.0832623670752482, 'gradient_accumulation_steps': 2, 'lora_r': 40, 'lora_alpha': 24, 'lora_dropout': 0.16599483657158368, 'classifier_dropout': 0.27487376907900907, 'classifier_hidden_size': 512}. Best is trial 0 with value: 0.8795850266988647.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss,Accuracy,Macro P,Macro R,Macro F1,W Macro P,W Macro R,W Macro F1,Mcc,Threshold
300,0.1459,0.35418,0.877658,0.842213,0.865921,0.852509,0.883933,0.877658,0.879703,0.707737,0.5


  return fn(*args, **kwargs)
There were unexpected keys in the checkpoint model loaded: ['modernbert.base_model.model.layers.0.attn.Wqkv.base_layer.weight.absmax', 'modernbert.base_model.model.layers.0.attn.Wqkv.base_layer.weight.nested_absmax', 'modernbert.base_model.model.layers.0.attn.Wqkv.base_layer.weight.nested_quant_map', 'modernbert.base_model.model.layers.0.attn.Wqkv.base_layer.weight.quant_map', 'modernbert.base_model.model.layers.0.attn.Wqkv.base_layer.weight.quant_state.bitsandbytes__nf4', 'modernbert.base_model.model.layers.0.attn.Wo.base_layer.weight.absmax', 'modernbert.base_model.model.layers.0.attn.Wo.base_layer.weight.nested_absmax', 'modernbert.base_model.model.layers.0.attn.Wo.base_layer.weight.nested_quant_map', 'modernbert.base_model.model.layers.0.attn.Wo.base_layer.weight.quant_map', 'modernbert.base_model.model.layers.0.attn.Wo.base_layer.weight.quant_state.bitsandbytes__nf4', 'modernbert.base_model.model.layers.0.mlp.Wi.base_layer.weight.absmax', 'modernbert.b

[I 2025-04-05 02:17:12,064] Trial 3 finished with value: 0.8824155917115593 and parameters: {'learning_rate': 0.0002643238333834569, 'batch_size': 64, 'num_epochs': 4, 'weight_decay': 0.048207625326781293, 'warmup_ratio': 0.19552784843595056, 'gradient_accumulation_steps': 4, 'lora_r': 56, 'lora_alpha': 40, 'lora_dropout': 0.07644825534662132, 'classifier_dropout': 0.2659719581055393, 'classifier_hidden_size': 768}. Best is trial 3 with value: 0.8824155917115593.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss


[I 2025-04-05 02:20:46,212] Trial 4 finished with value: 0.8649020715295556 and parameters: {'learning_rate': 6.53173615511963e-05, 'batch_size': 256, 'num_epochs': 5, 'weight_decay': 0.07788719789649168, 'warmup_ratio': 0.03264532688269624, 'gradient_accumulation_steps': 2, 'lora_r': 24, 'lora_alpha': 64, 'lora_dropout': 0.1913698458545149, 'classifier_dropout': 0.2266272223624668, 'classifier_hidden_size': 384}. Best is trial 3 with value: 0.8824155917115593.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss,Accuracy,Macro P,Macro R,Macro F1,W Macro P,W Macro R,W Macro F1,Mcc,Threshold
300,0.2776,0.340067,0.870908,0.841282,0.832457,0.836676,0.86948,0.870908,0.870062,0.673681,0.5


[I 2025-04-05 02:24:53,446] Trial 5 pruned. Trial 5 pruned at step 300 with value 0.8711
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss,Accuracy,Macro P,Macro R,Macro F1,W Macro P,W Macro R,W Macro F1,Mcc,Threshold
300,0.1177,0.413166,0.88002,0.850106,0.850238,0.850172,0.880043,0.88002,0.880032,0.700344,0.5


  return fn(*args, **kwargs)
There were unexpected keys in the checkpoint model loaded: ['modernbert.base_model.model.layers.0.attn.Wqkv.base_layer.weight.absmax', 'modernbert.base_model.model.layers.0.attn.Wqkv.base_layer.weight.nested_absmax', 'modernbert.base_model.model.layers.0.attn.Wqkv.base_layer.weight.nested_quant_map', 'modernbert.base_model.model.layers.0.attn.Wqkv.base_layer.weight.quant_map', 'modernbert.base_model.model.layers.0.attn.Wqkv.base_layer.weight.quant_state.bitsandbytes__nf4', 'modernbert.base_model.model.layers.0.attn.Wo.base_layer.weight.absmax', 'modernbert.base_model.model.layers.0.attn.Wo.base_layer.weight.nested_absmax', 'modernbert.base_model.model.layers.0.attn.Wo.base_layer.weight.nested_quant_map', 'modernbert.base_model.model.layers.0.attn.Wo.base_layer.weight.quant_map', 'modernbert.base_model.model.layers.0.attn.Wo.base_layer.weight.quant_state.bitsandbytes__nf4', 'modernbert.base_model.model.layers.0.mlp.Wi.base_layer.weight.absmax', 'modernbert.b

[I 2025-04-05 02:33:26,572] Trial 6 finished with value: 0.8789242661339393 and parameters: {'learning_rate': 0.0002763460158022115, 'batch_size': 64, 'num_epochs': 3, 'weight_decay': 0.011025578341646426, 'warmup_ratio': 0.02317613699618484, 'gradient_accumulation_steps': 4, 'lora_r': 24, 'lora_alpha': 40, 'lora_dropout': 0.12170294270794065, 'classifier_dropout': 0.10861295667874003, 'classifier_hidden_size': 768}. Best is trial 3 with value: 0.8824155917115593.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss,Accuracy,Macro P,Macro R,Macro F1,W Macro P,W Macro R,W Macro F1,Mcc,Threshold
300,0.3011,0.372196,0.858589,0.824133,0.820929,0.822503,0.857925,0.858589,0.858238,0.645054,0.5


[I 2025-04-05 02:41:11,323] Trial 7 pruned. Trial 7 pruned at step 300 with value 0.8602
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss,Accuracy,Macro P,Macro R,Macro F1,W Macro P,W Macro R,W Macro F1,Mcc,Threshold
300,0.3366,0.363229,0.860952,0.831001,0.814281,0.821933,0.85831,0.860952,0.859151,0.645065,0.5


[I 2025-04-05 02:43:25,829] Trial 8 pruned. Trial 8 pruned at step 300 with value 0.8586
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss,Accuracy,Macro P,Macro R,Macro F1,W Macro P,W Macro R,W Macro F1,Mcc,Threshold
300,0.5016,0.527698,0.767128,0.721118,0.753183,0.730759,0.792312,0.767128,0.774943,0.473215,0.5


[I 2025-04-05 02:45:34,317] Trial 9 pruned. Trial 9 pruned at step 300 with value 0.7758


Batches:   0%|          | 0/428 [00:00<?, ?it/s]

Batches:   0%|          | 0/428 [00:00<?, ?it/s]

Combining embeddings:   0%|          | 0/27362 [00:00<?, ?it/s]

Batches:   0%|          | 0/93 [00:00<?, ?it/s]

Batches:   0%|          | 0/93 [00:00<?, ?it/s]

Combining embeddings:   0%|          | 0/5926 [00:00<?, ?it/s]

  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss,Accuracy,Macro P,Macro R,Macro F1,W Macro P,W Macro R,W Macro F1,Mcc,Threshold
300,0.1565,0.419928,0.854033,0.81467,0.850906,0.82833,0.868358,0.854033,0.85799,0.664589,0.5


  return fn(*args, **kwargs)
There were unexpected keys in the checkpoint model loaded: ['modernbert.base_model.model.layers.0.attn.Wqkv.base_layer.weight.absmax', 'modernbert.base_model.model.layers.0.attn.Wqkv.base_layer.weight.nested_absmax', 'modernbert.base_model.model.layers.0.attn.Wqkv.base_layer.weight.nested_quant_map', 'modernbert.base_model.model.layers.0.attn.Wqkv.base_layer.weight.quant_map', 'modernbert.base_model.model.layers.0.attn.Wqkv.base_layer.weight.quant_state.bitsandbytes__nf4', 'modernbert.base_model.model.layers.0.attn.Wo.base_layer.weight.absmax', 'modernbert.base_model.model.layers.0.attn.Wo.base_layer.weight.nested_absmax', 'modernbert.base_model.model.layers.0.attn.Wo.base_layer.weight.nested_quant_map', 'modernbert.base_model.model.layers.0.attn.Wo.base_layer.weight.quant_map', 'modernbert.base_model.model.layers.0.attn.Wo.base_layer.weight.quant_state.bitsandbytes__nf4', 'modernbert.base_model.model.layers.0.mlp.Wi.base_layer.weight.absmax', 'modernbert.b

In [None]:
import zipfile

with zipfile.ZipFile("cached_data.zip", "w") as zipf:
    # Add the directory to the zip file
    for root, dirs, files in os.walk("cached_data"):
        for file in files:
            file_path = os.path.join(root, file)
            zipf.write(file_path, os.path.relpath(file_path, "cached_data"))

with zipfile.ZipFile("final_model.zip", "w") as zipf:
    # Add the directory to the zip file
    for root, dirs, files in os.walk("final_model"):
        for file in files:
            file_path = os.path.join(root, file)
            zipf.write(file_path, os.path.relpath(file_path, "final_model"))
