# BART Hyperparameter Optimization for Topic Classification

Systematic hyperparameter optimization for BART-MNLI models on tweet-note classification.  
Tests 264 configurations to identify optimal training parameters.

**Approach**: Intelligent parameter sampling + systematic evaluation + performance analysis  
**Output**: Optimal configuration for production model training

## Dependencies and Configuration
Setting up imports, device configuration, and random seeds for reproducible experiments.

In [None]:
import os
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from transformers import (
    BartForSequenceClassification, 
    BartTokenizer, 
    Trainer, 
    TrainingArguments,
    EarlyStoppingCallback,
    AdamW
)
from datasets import Dataset
import random
import gc
import json
import pickle
from datetime import datetime
import itertools
from typing import Dict, List, Any
import warnings
warnings.filterwarnings('ignore')

# --- Experiment constants -------------------------------------------------
# Input spreadsheets, the seed shared by every RNG, the pretrained checkpoint
# to fine-tune, and the labels that are dropped before training/evaluation.
TRAIN_FILE = "Training set labeled.xlsx"
TEST_FILE = "Test set labeled.xlsx"
RANDOM_STATE = 42
MODEL_NAME = "facebook/bart-large-mnli"
EXCLUDED_CLASSES = ['other']  # 'other' is left out, per the thesis findings

# --- Compute device --------------------------------------------------------
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print(f"Using device: {DEVICE}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# --- Reproducibility -------------------------------------------------------
# Seed Python, NumPy and PyTorch (CPU, plus every CUDA device when present).
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_STATE)

# --- Output location -------------------------------------------------------
# Logs, checkpoints and result tables are all written below this directory.
os.makedirs('hyperparameter_results', exist_ok=True)

## Hyperparameter Search Space
Defines the hyperparameter grid that the search samples from.  
Up to 264 configurations are drawn from this grid, covering learning rate, batch size, epochs, regularization, scheduler, and other training settings.

In [None]:
# Candidate values for every tunable setting. Each key maps to the list of
# options the search may pick from; key order is kept stable because it
# determines the order in which random draws are made.
HYPERPARAMETER_GRID = dict(
    # Core optimisation settings (highest impact)
    learning_rate=[1e-5, 2e-5, 3e-5, 5e-5],
    batch_size=[2, 4, 8, 16],
    num_epochs=[3, 5, 7, 10],
    weight_decay=[0.0, 0.01, 0.1],

    # Model-side settings: input length and dropout
    max_token_length=[512, 1024],
    dropout_rate=[0.1, 0.2, 0.3],

    # Training-loop settings
    warmup_steps=[0, 100, 500],
    max_grad_norm=[0.5, 1.0, 2.0],
    gradient_accumulation_steps=[1, 2, 4],

    # Regularisation and learning-rate schedule
    label_smoothing=[0.0, 0.1, 0.2],
    lr_scheduler_type=['linear', 'cosine', 'polynomial'],

    # Technical settings
    fp16=[False, True],
    optimizer_type=['adamw'],  # single option: AdamW only, for consistency
    dataloader_num_workers=[0, 2],
)

## Utility Functions
Helper functions for experiment logging, intelligent parameter combination generation, and progress tracking.

In [None]:
def log_experiment(message, log_file='hyperparameter_results/optimization_log.txt'):
    """Print a timestamped message and append it to the log file.

    Args:
        message: Text to log. It is formatted into the line with an f-string.
        log_file: Path of the log file to append to. Its parent directory is
            created when it does not exist yet.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log_message = f"[{timestamp}] {message}"
    print(log_message)

    # Make the function usable with any log path, not only the directory
    # that the notebook happens to create at start-up.
    log_dir = os.path.dirname(log_file)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    # Messages in this notebook contain emoji; an explicit UTF-8 encoding
    # keeps the write from failing on platforms with a narrower default.
    with open(log_file, 'a', encoding='utf-8') as f:
        f.write(log_message + '\n')

def generate_intelligent_combinations(hyperparameter_grid: Dict, max_combinations: int = 264):
    """
    Build the list of hyperparameter configurations to evaluate.

    The list is assembled in three stages:
    1. A Cartesian product over a narrowed set of high-impact parameters
       (only the learning rates are taken from ``hyperparameter_grid``).
    2. The first ``max_combinations // 2`` products are completed with
       randomly drawn values whose options depend on batch size and
       learning rate.
    3. The rest of the budget is filled with independent random draws from
       every entry of ``hyperparameter_grid``.

    Exact duplicates are dropped (first occurrence wins), so the result may
    hold fewer than ``max_combinations`` entries. The final list is written
    to ``hyperparameter_results/all_combinations.json`` and returned.
    """

    log_experiment("Generating intelligent hyperparameter combinations...")

    # Stage 1: exhaustive product over the narrowed, high-impact space
    focus_space = {
        'learning_rate': hyperparameter_grid['learning_rate'],
        'batch_size': [4, 8],  # moderate batch sizes only
        'num_epochs': [5, 7],  # epoch counts considered effective
        'weight_decay': [0.0, 0.01],
        'max_token_length': [1024],  # fixed, per the thesis results
        'lr_scheduler_type': ['linear', 'cosine'],
        'optimizer_type': ['adamw']
    }
    focus_names = list(focus_space)
    priority_combinations = [
        dict(zip(focus_names, values))
        for values in itertools.product(*focus_space.values())
    ]

    log_experiment(f"Generated {len(priority_combinations)} priority combinations")

    # Stage 2: complete each priority product with constrained random picks.
    # The order of the random.choice calls is part of the behaviour, since
    # it determines what a seeded run produces.
    guided_configs = []
    for seed_config in priority_combinations[:max_combinations // 2]:
        candidate = dict(seed_config)
        small_batch = candidate['batch_size'] <= 4
        high_lr = candidate['learning_rate'] >= 3e-5

        # Small batches: more accumulation, always mixed precision
        accumulation_options = [2, 4] if small_batch else [1, 2]
        candidate['gradient_accumulation_steps'] = random.choice(accumulation_options)
        candidate['fp16'] = True if small_batch else random.choice([True, False])

        # Higher learning rates: some warmup and tighter clipping
        if high_lr:
            warmup_options, clip_options = [100, 500], [0.5, 1.0]
        else:
            warmup_options, clip_options = [0, 100], [1.0, 2.0]
        candidate['warmup_steps'] = random.choice(warmup_options)
        candidate['max_grad_norm'] = random.choice(clip_options)

        # Remaining settings
        candidate['dropout_rate'] = random.choice([0.1, 0.2])
        candidate['label_smoothing'] = random.choice([0.0, 0.1])
        candidate['dataloader_num_workers'] = 0  # safe default for compatibility

        guided_configs.append(candidate)

    # Stage 3: unconstrained random draws for the remaining budget
    explored_configs = []
    for _ in range(max_combinations - len(guided_configs)):
        draw = {
            name: random.choice(options)
            for name, options in hyperparameter_grid.items()
        }

        # Large batches at full sequence length are capped to prevent OOM
        if draw['batch_size'] >= 16 and draw['max_token_length'] == 1024:
            draw['batch_size'] = 8

        explored_configs.append(draw)

    # Merge both stages and keep the first occurrence of every configuration
    unique_combinations = []
    fingerprints = set()
    for candidate in guided_configs + explored_configs:
        fingerprint = json.dumps(candidate, sort_keys=True)
        if fingerprint in fingerprints:
            continue
        fingerprints.add(fingerprint)
        unique_combinations.append(candidate)

    log_experiment(f"Generated {len(unique_combinations)} unique configurations")

    # Keep a copy of the evaluated search space next to the results
    with open('hyperparameter_results/all_combinations.json', 'w') as out_file:
        json.dump(unique_combinations, out_file, indent=2)

    return unique_combinations

## Data Loading and Preprocessing
Functions for loading, cleaning, and preparing balanced datasets.   
Handles text normalization, label encoding, and minority class oversampling.

In [None]:
def load_and_prepare_data():
    """Load both spreadsheets and return a balanced train set and a test set.

    Reads ``TRAIN_FILE`` and ``TEST_FILE``, cleans them, removes the labels
    listed in ``EXCLUDED_CLASSES``, oversamples every training class (with
    replacement) up to the size of the largest one, and encodes labels as
    integer ids in a new ``label_id`` column.

    Returns:
        tuple: ``(df_train_balanced, df_test_clean, label_encoder, num_classes)``.

    Raises:
        ValueError: If no training rows remain after cleaning and filtering.
            ``LabelEncoder.transform`` is also expected to raise for test
            labels that never occur in the training data.
    """

    def clean_text(text):
        """Return ``text`` stripped, with control characters removed."""
        if pd.isna(text):
            return ""
        text = str(text).strip()
        # Keep printable characters plus tab / newline / carriage return
        text = ''.join(char for char in text if ord(char) >= 32 or char in '\t\n\r')
        return text

    def clean_dataframe(df):
        """Drop incomplete rows and normalise the label and text columns."""
        # Drop missing values BEFORE casting the label to str: astype(str)
        # turns NaN into the literal string 'nan', which would otherwise
        # survive the dropna and show up as a bogus class.
        df = df.dropna(subset=['label', 'tweet_text', 'note_text']).copy()
        df['label'] = df['label'].astype(str).str.lower().str.strip()
        # .copy() so the column assignments below act on an independent
        # frame rather than on a filtered view.
        df = df[df['label'] != ''].copy()

        df['tweet_text'] = df['tweet_text'].apply(clean_text)
        df['note_text'] = df['note_text'].apply(clean_text)
        df = df[(df['tweet_text'] != "") & (df['note_text'] != "")]

        return df.reset_index(drop=True)

    # Load and clean data
    df_train = clean_dataframe(pd.read_excel(TRAIN_FILE))
    df_test = clean_dataframe(pd.read_excel(TEST_FILE))

    # Filter out excluded classes from training
    df_train = df_train[~df_train['label'].isin(EXCLUDED_CLASSES)]
    if df_train.empty:
        raise ValueError("No training samples remain after cleaning and class filtering")

    # Oversample every class up to the size of the largest one
    class_counts = df_train['label'].value_counts()
    max_count = class_counts.max()

    oversampled_dfs = [
        df_train[df_train['label'] == label].sample(
            max_count, replace=True, random_state=RANDOM_STATE
        )
        for label in class_counts.index
    ]

    # Shuffle so the classes are interleaved
    df_train_balanced = (
        pd.concat(oversampled_dfs)
        .sample(frac=1, random_state=RANDOM_STATE)
        .reset_index(drop=True)
    )

    # Encode labels using the classes present in the training data
    label_encoder = LabelEncoder()
    label_encoder.fit(df_train_balanced['label'].unique())

    df_train_balanced['label_id'] = label_encoder.transform(df_train_balanced['label'])

    # Apply the same class exclusion to the test data before encoding
    df_test_clean = df_test[~df_test['label'].isin(EXCLUDED_CLASSES)].copy()
    df_test_clean['label_id'] = label_encoder.transform(df_test_clean['label'])

    num_classes = len(label_encoder.classes_)

    log_experiment(f"Data prepared: {len(df_train_balanced)} train, {len(df_test_clean)} test samples")
    log_experiment(f"Classes: {list(label_encoder.classes_)}")

    return df_train_balanced, df_test_clean, label_encoder, num_classes

def create_datasets_with_config(df_train, df_test, config, model_name="facebook/bart-large-mnli"):
    """Tokenize the train and test frames for one configuration.

    Args:
        df_train: Frame with ``tweet_text``, ``note_text`` and ``label_id`` columns.
        df_test: Frame with the same three columns.
        config: Mapping; only ``config['max_token_length']`` is read here.
        model_name: Checkpoint name passed to ``BartTokenizer.from_pretrained``.

    Returns:
        tuple: ``(train_dataset, test_dataset, tokenizer)``. Both datasets
        are formatted as torch tensors exposing ``input_ids``,
        ``attention_mask`` and ``labels``.
    """

    tokenizer = BartTokenizer.from_pretrained(model_name)
    max_length = config['max_token_length']

    def tokenize_function(batch):
        # Combined mode (tweet + note as a sequence pair), per the thesis results
        return tokenizer(
            batch['tweet_text'],
            batch['note_text'],
            truncation="longest_first",
            padding="max_length",
            max_length=max_length
        )

    def build_split(df):
        """Convert one frame into a tokenized, torch-formatted dataset."""
        split = Dataset.from_pandas(
            df[['tweet_text', 'note_text', 'label_id']].rename(columns={'label_id': 'labels'})
        )
        split = split.map(tokenize_function, batched=True, batch_size=16)

        # Decide per split which helper columns exist. Only one of the two
        # frames may carry the '__index_level_0__' column, so a list built
        # from the train split must not be reused for the test split.
        removable = [
            column
            for column in ('tweet_text', 'note_text', '__index_level_0__')
            if column in split.column_names
        ]
        split = split.remove_columns(removable)

        split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
        return split

    train_dataset = build_split(df_train)
    test_dataset = build_split(df_test)

    return train_dataset, test_dataset, tokenizer

## Model Training
Core training function that evaluates each hyperparameter configuration.   
Implements BART-MNLI training with early stopping and comprehensive error handling.

**Note**: ~39% success rate expected due to incompatible parameter combinations (typical in comprehensive search).

In [None]:
def train_with_hyperparameters(train_dataset, test_dataset, config, num_classes, config_id):
    """Fine-tune and evaluate the model for one hyperparameter configuration.

    Args:
        train_dataset: Tokenized training split handed to ``Trainer``.
        test_dataset: Tokenized split used as ``eval_dataset``.
        config: Mapping holding the hyperparameters read below
            (``dropout_rate``, ``batch_size``, ``gradient_accumulation_steps``,
            ``num_epochs``, ``warmup_steps``, ``learning_rate``,
            ``weight_decay``, ``max_grad_norm``, ``dataloader_num_workers``,
            ``fp16``, ``label_smoothing``, ``lr_scheduler_type``).
        num_classes: Number of labels for the classification head.
        config_id: Identifier used for the output directory and log lines.

    Returns:
        dict: ``config`` merged with ``config_id``, ``eval_accuracy``,
        ``eval_loss``, ``train_loss``, ``training_time`` and ``success``.
        When anything fails (model setup included), ``success`` is False,
        the metrics are placeholders (0.0 / inf / 0) and ``error`` holds the
        exception text; no exception is raised to the caller.
    """

    # Release whatever the previous configuration left behind. Collect
    # garbage first so that objects kept alive only by reference cycles are
    # gone before the CUDA cache is emptied.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    def compute_metrics(eval_pred):
        """Compute accuracy metric"""
        predictions, labels = eval_pred
        if isinstance(predictions, tuple):
            predictions = predictions[0]
        predictions = np.argmax(predictions, axis=1)
        accuracy = (predictions == labels).astype(np.float32).mean().item()
        return {"accuracy": accuracy}

    # Declared up front so the cleanup in ``finally`` can always drop them
    model = None
    optimizer = None
    trainer = None

    # Setup lives inside the try block as well: a failure while loading the
    # model or building the arguments is reported as a failed configuration
    # instead of escaping without a result record.
    try:
        # Initialize model with dropout configuration
        model = BartForSequenceClassification.from_pretrained(
            MODEL_NAME,
            num_labels=num_classes,
            ignore_mismatched_sizes=True,
            dropout=config['dropout_rate'],
            attention_dropout=config['dropout_rate'],
            activation_dropout=config['dropout_rate']
        )

        model.config.num_labels = num_classes
        model = model.to(DEVICE)

        # Calculate training parameters
        effective_batch_size = config['batch_size'] * config['gradient_accumulation_steps']
        total_steps = (len(train_dataset) // effective_batch_size) * config['num_epochs']
        warmup_steps = min(config['warmup_steps'], total_steps // 10)  # Cap warmup at 10%

        output_dir = f"hyperparameter_results/config_{config_id}"

        # NOTE(review): eval_dataset is the test split, so early stopping and
        # best-checkpoint selection are driven by the same data the accuracy
        # is reported on - confirm this is intended.
        training_args = TrainingArguments(
            output_dir=output_dir,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            num_train_epochs=config['num_epochs'],
            per_device_train_batch_size=config['batch_size'],
            per_device_eval_batch_size=config['batch_size'] * 2,
            gradient_accumulation_steps=config['gradient_accumulation_steps'],
            learning_rate=config['learning_rate'],
            weight_decay=config['weight_decay'],
            warmup_steps=warmup_steps,
            max_grad_norm=config['max_grad_norm'],
            seed=RANDOM_STATE,
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            greater_is_better=True,
            save_total_limit=1,
            report_to="none",
            logging_strategy="steps",
            logging_steps=100,
            dataloader_num_workers=config['dataloader_num_workers'],
            remove_unused_columns=False,
            fp16=config['fp16'],
            label_smoothing_factor=config['label_smoothing'],
            lr_scheduler_type=config['lr_scheduler_type'],
            dataloader_pin_memory=False,
            skip_memory_metrics=True,
            disable_tqdm=True
        )

        # Create optimizer
        optimizer = AdamW(
            model.parameters(),
            lr=config['learning_rate'],
            weight_decay=config['weight_decay']
        )

        # Initialize trainer; the scheduler slot of ``optimizers`` stays None
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=test_dataset,
            compute_metrics=compute_metrics,
            optimizers=(optimizer, None),
            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
        )

        train_result = trainer.train()
        eval_result = trainer.evaluate()

        # Extract metrics
        final_train_loss = train_result.training_loss
        final_eval_accuracy = eval_result['eval_accuracy']
        final_eval_loss = eval_result['eval_loss']
        training_time = train_result.metrics.get('train_runtime', 0)

        log_experiment(f"✅ Config {config_id}: Accuracy={final_eval_accuracy:.4f}, Loss={final_eval_loss:.4f}")

        return {
            'config_id': config_id,
            'eval_accuracy': final_eval_accuracy,
            'eval_loss': final_eval_loss,
            'train_loss': final_train_loss,
            'training_time': training_time,
            'success': True,
            **config
        }

    except Exception as e:
        log_experiment(f"❌ Config {config_id} failed: {str(e)}")
        return {
            'config_id': config_id,
            'eval_accuracy': 0.0,
            'eval_loss': float('inf'),
            'train_loss': float('inf'),
            'training_time': 0,
            'success': False,
            'error': str(e),
            **config
        }

    finally:
        # Drop the references first; emptying the cache while the model and
        # trainer are still referenced cannot return their memory.
        model = None
        optimizer = None
        trainer = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

## Hyperparameter Optimization
Main optimization pipeline that systematically tests all 264 configurations.   
Tracks progress, handles failures, and identifies optimal parameters.

In [None]:
def run_hyperparameter_optimization():
    """Evaluate every generated configuration and persist the results.

    Loads the data, generates the configurations, trains each one, tracks
    the most accurate successful run, writes a CSV checkpoint every 10
    configurations, and finally writes the complete results (CSV and Excel)
    plus the best configuration (JSON) under ``hyperparameter_results/``.

    Returns:
        tuple: ``(results_df, best_config)``. ``results_df`` has one row per
        configuration, failed ones included. ``best_config`` is the result
        record of the most accurate successful run, or ``None`` when no run
        succeeded.
    """

    log_experiment("🚀 Starting hyperparameter optimization...")

    # Load data
    df_train, df_test, label_encoder, num_classes = load_and_prepare_data()

    # Generate hyperparameter combinations
    combinations = generate_intelligent_combinations(HYPERPARAMETER_GRID, max_combinations=264)

    log_experiment(f"🧪 Testing {len(combinations)} hyperparameter configurations...")

    all_results = []
    best_accuracy = 0.0
    best_config = None

    # Tokenization reads only config['max_token_length'], so the tokenized
    # splits are built once per distinct length and then reused.
    dataset_cache = {}

    for i, config in enumerate(combinations):
        config_id = i + 1
        log_experiment(f"\n🔬 Testing configuration {i+1}/{len(combinations)}")
        log_experiment(f"Config: LR={config['learning_rate']}, BS={config['batch_size']}, "
                      f"Epochs={config['num_epochs']}, WD={config['weight_decay']}")

        try:
            # Create (or reuse) datasets
            max_length = config['max_token_length']
            if max_length not in dataset_cache:
                train_dataset, test_dataset, _ = create_datasets_with_config(
                    df_train, df_test, config
                )
                dataset_cache[max_length] = (train_dataset, test_dataset)
            train_dataset, test_dataset = dataset_cache[max_length]

            # Train model
            result = train_with_hyperparameters(
                train_dataset, test_dataset, config, num_classes, config_id
            )

        except Exception as e:
            log_experiment(f"❌ Fatal error with config {i+1}: {str(e)}")
            # Record the failure so the configuration still appears in the
            # results, using the same fields as a failed training run.
            result = {
                'config_id': config_id,
                'eval_accuracy': 0.0,
                'eval_loss': float('inf'),
                'train_loss': float('inf'),
                'training_time': 0,
                'success': False,
                'error': str(e),
                **config
            }

        all_results.append(result)

        # Track best configuration
        if result['success'] and result['eval_accuracy'] > best_accuracy:
            best_accuracy = result['eval_accuracy']
            best_config = result.copy()
            log_experiment(f"🏆 New best accuracy: {best_accuracy:.4f}")

        # Save checkpoint every 10 configurations
        if config_id % 10 == 0:
            try:
                checkpoint_df = pd.DataFrame(all_results)
                checkpoint_df.to_csv(f'hyperparameter_results/results_checkpoint_{i+1}.csv', index=False)
                log_experiment(f"💾 Saved checkpoint at {i+1} configurations")
            except OSError as e:
                # A failed checkpoint write must not end the search
                log_experiment(f"❌ Could not save checkpoint at {i+1} configurations: {str(e)}")

    # Save final results
    log_experiment("\n💾 Saving final results...")

    results_df = pd.DataFrame(all_results)
    results_df.to_csv('hyperparameter_results/hyperparameter_optimization_results.csv', index=False)
    try:
        results_df.to_excel('hyperparameter_results/hyperparameter_optimization_results.xlsx', index=False)
    except (ImportError, OSError, ValueError) as e:
        # The CSV above already holds the same data; a missing Excel writer
        # or a write error should not discard the outcome of the whole run.
        log_experiment(f"❌ Could not write Excel results: {str(e)}")

    # Save best configuration
    if best_config:
        with open('hyperparameter_results/best_configuration.json', 'w') as f:
            json.dump(best_config, f, indent=2)

        log_experiment(f"\n🏆 BEST CONFIGURATION FOUND:")
        log_experiment(f"Accuracy: {best_config['eval_accuracy']:.4f}")
        log_experiment(f"Learning Rate: {best_config['learning_rate']}")
        log_experiment(f"Batch Size: {best_config['batch_size']}")
        log_experiment(f"Epochs: {best_config['num_epochs']}")
        log_experiment(f"Weight Decay: {best_config['weight_decay']}")

    return results_df, best_config


## Results Analysis
Analysis and visualization functions for processing optimization results.  
Creates performance plots, parameter importance analysis, and top configuration rankings.

In [None]:
def analyze_hyperparameter_results(results_df):
    """Summarise and plot the successful runs of the optimization.

    Logs accuracy statistics, draws a 2x3 grid of plots (saved to
    ``hyperparameter_results/hyperparameter_analysis.png``), logs a simple
    per-parameter impact figure (spread between the best and worst mean
    accuracy across that parameter's values) and writes the 20 most
    accurate runs to ``hyperparameter_results/top_20_configurations.csv``.

    Args:
        results_df: Frame with one row per configuration, including the
            ``success`` and ``eval_accuracy`` columns.

    Returns:
        The rows of ``results_df`` whose ``success`` is True, or ``None``
        when there are none.
    """

    log_experiment("📊 Analyzing hyperparameter results...")

    # Filter successful runs
    successful_results = results_df[results_df['success'] == True].copy()

    if len(successful_results) == 0:
        log_experiment("❌ No successful runs to analyze")
        return

    log_experiment(f"📈 Analyzing {len(successful_results)} successful configurations")

    # Statistical summary
    accuracy_stats = successful_results['eval_accuracy'].describe()
    log_experiment(f"Accuracy Statistics:\n{accuracy_stats}")

    def boxplot_by(ax, column, xlabel, title):
        """Draw one accuracy box per distinct value of ``column``."""
        levels = sorted(successful_results[column].unique())
        samples = [
            successful_results[successful_results[column] == level]['eval_accuracy'].values
            for level in levels
        ]
        ax.boxplot(samples)
        # Label the ticks after plotting rather than through the boxplot
        # keyword for labels, which has been renamed in newer Matplotlib.
        ax.set_xticklabels([str(level) for level in levels])
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Accuracy')
        ax.set_title(title)

    # Create analysis plots
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))

    # 1. Learning Rate vs Accuracy
    axes[0, 0].scatter(successful_results['learning_rate'], successful_results['eval_accuracy'], alpha=0.7)
    axes[0, 0].set_xlabel('Learning Rate')
    axes[0, 0].set_ylabel('Accuracy')
    axes[0, 0].set_title('Learning Rate vs Accuracy')
    axes[0, 0].set_xscale('log')

    # 2-4. Accuracy distribution per batch size, epoch count and weight decay
    boxplot_by(axes[0, 1], 'batch_size', 'Batch Size', 'Batch Size vs Accuracy')
    boxplot_by(axes[0, 2], 'num_epochs', 'Number of Epochs', 'Epochs vs Accuracy')
    boxplot_by(axes[1, 0], 'weight_decay', 'Weight Decay', 'Weight Decay vs Accuracy')

    # 5. Training Time vs Accuracy
    axes[1, 1].scatter(successful_results['training_time'], successful_results['eval_accuracy'], alpha=0.7)
    axes[1, 1].set_xlabel('Training Time (seconds)')
    axes[1, 1].set_ylabel('Accuracy')
    axes[1, 1].set_title('Training Time vs Accuracy')

    # 6. Top 15 Configurations. Fewer than 15 runs may have succeeded, so
    # the bar positions follow the rows actually returned.
    top_15 = successful_results.nlargest(15, 'eval_accuracy')
    bar_positions = list(range(len(top_15)))
    axes[1, 2].barh(bar_positions, top_15['eval_accuracy'].values)
    axes[1, 2].set_yticks(bar_positions)
    axes[1, 2].set_yticklabels([f"Config {int(config_id)}" for config_id in top_15['config_id'].values])
    axes[1, 2].set_xlabel('Accuracy')
    axes[1, 2].set_title('Top 15 Configurations')

    plt.tight_layout()
    plt.savefig('hyperparameter_results/hyperparameter_analysis.png', dpi=300, bbox_inches='tight')
    plt.show()
    plt.close(fig)  # release the figure once it has been saved and shown

    # Parameter importance analysis
    log_experiment("\n📊 PARAMETER IMPORTANCE ANALYSIS:")
    log_experiment("="*50)

    for param in ['learning_rate', 'batch_size', 'num_epochs', 'weight_decay', 'dropout_rate']:
        if param in successful_results.columns:
            grouped = successful_results.groupby(param)['eval_accuracy'].agg(['mean', 'std', 'count'])
            best_value = grouped['mean'].idxmax()
            best_mean = grouped['mean'].max()
            worst_mean = grouped['mean'].min()
            impact = best_mean - worst_mean

            log_experiment(f"{param:15} | Impact: {impact:.4f} | Best: {best_value} ({best_mean:.4f})")

    # Save top configurations
    top_configs = successful_results.nlargest(20, 'eval_accuracy')
    top_configs.to_csv('hyperparameter_results/top_20_configurations.csv', index=False)

    return successful_results

def save_optimal_configuration(best_config, label_encoder):
    """Write the winning configuration to disk and log its key settings.

    No model is trained here. ``label_encoder`` is accepted but not used by
    this function. Returns ``best_config`` unchanged.
    """

    log_experiment("💾 Saving optimal configuration for future use...")

    # Same file name as the one written at the end of the optimization run
    target_path = 'hyperparameter_results/best_configuration.json'
    with open(target_path, 'w') as out_file:
        json.dump(best_config, out_file, indent=2)

    log_experiment(f"✅ Optimal configuration saved:")

    # Headline settings, logged one per line in a fixed order
    headline_fields = (
        ('Learning Rate', 'learning_rate'),
        ('Batch Size', 'batch_size'),
        ('Epochs', 'num_epochs'),
        ('Weight Decay', 'weight_decay'),
    )
    for title, field in headline_fields:
        log_experiment(f"   {title}: {best_config[field]}")
    log_experiment(f"   Validation Accuracy: {best_config['eval_accuracy']:.4f}")

    return best_config

## Main Execution
Complete optimization pipeline execution: data preparation → configuration generation  
→ systematic evaluation → analysis → optimal configuration selection.

In [None]:
def main():
    """Run the full pipeline: optimize, analyze, save, and print a summary."""

    log_experiment("🚀 Starting comprehensive hyperparameter optimization for BART topic classification...")

    # Needed here only for the label encoder handed to
    # save_optimal_configuration; run_hyperparameter_optimization performs
    # its own data load.
    df_train, df_test, label_encoder, num_classes = load_and_prepare_data()

    # Run hyperparameter optimization
    results_df, best_config = run_hyperparameter_optimization()

    total_runs = len(results_df)

    # Analyze results if we have any
    if total_runs > 0:
        analyze_hyperparameter_results(results_df)

        # Save optimal configuration for future use
        if best_config:
            save_optimal_configuration(best_config, label_encoder)

    # Print final summary
    log_experiment("🎉 Hyperparameter optimization completed!")

    print("\n" + "="*80)
    print("HYPERPARAMETER OPTIMIZATION SUMMARY")
    print("="*80)

    if best_config:
        print(f"🏆 BEST CONFIGURATION:")
        print(f"   Accuracy: {best_config['eval_accuracy']:.4f}")
        print(f"   Learning Rate: {best_config['learning_rate']}")
        print(f"   Batch Size: {best_config['batch_size']}")
        print(f"   Epochs: {best_config['num_epochs']}")
        print(f"   Weight Decay: {best_config['weight_decay']}")
        print(f"   Dropout Rate: {best_config['dropout_rate']}")

    # An empty results frame has no 'success' column and would also make the
    # success rate a division by zero, so that case is handled separately.
    if total_runs > 0:
        successful_runs = len(results_df[results_df['success'] == True])
    else:
        successful_runs = 0

    print(f"\n📊 STATISTICS:")
    print(f"   Total configurations tested: {total_runs}")
    print(f"   Successful runs: {successful_runs}")
    if total_runs > 0:
        print(f"   Success rate: {successful_runs/total_runs*100:.1f}%")
    else:
        print("   Success rate: n/a")

    print(f"\n📁 RESULTS SAVED TO:")
    print(f"   hyperparameter_results/hyperparameter_optimization_results.xlsx")
    print(f"   hyperparameter_results/best_configuration.json")
    print("="*80)

if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm


CUDA available: True
GPU: NVIDIA A100-PCIE-40GB
GPU Memory: 39 GB
[2025-05-27 23:47:25] 🚀 Starting comprehensive hyperparameter optimization...
[2025-05-27 23:47:25] 🧠 Generating intelligent hyperparameter combinations...
[2025-05-27 23:47:25] Generated 64 priority combinations
[2025-05-27 23:47:25] Generated 264 unique hyperparameter combinations
[2025-05-27 23:47:25] Target: 264 configurations


[2025-05-27 23:47:25] Data prepared: 924 train, 125 test samples
[2025-05-27 23:47:25] Classes: ['economics', 'health', 'lifestyle', 'politics', 'science', 'sports']
[2025-05-27 23:47:25] 🚀 Starting hyperparameter optimization...
[2025-05-27 23:47:25] Data prepared: 924 train, 125 test samples
[2025-05-27 23:47:25] Classes: ['economics', 'health', 'lifestyle', 'politics', 'science', 'sports']
[2025-05-27 23:47:25] 🧠 Generating intelligent hyperparameter combinations...
[2025-05-27 23:47:25] Generated 64 priority combinations
[2025-05-27 23:47:25] Generated 264 unique hyperparameter combinations
[2025-05-27 23:47:25] 🧪 Testing 264 hyperparameter configurations...
[2025-05-27 23:47:25] 
🔬 Testing configuration 1/264
[2025-05-27 23:47:25] Config: LR=1e-05, BS=4, Epochs=5, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 650.73 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 665.27 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 744.45 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 781.95 examples/s]


Map:  52%|█████▏    | 480/924 [00:00<00:00, 845.28 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 887.28 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 960.33 examples/s]


Map:  90%|█████████ | 832/924 [00:00<00:00, 954.43 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 883.19 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 601.08 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 602.84 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.8314554691314697, 'eval_accuracy': 0.14399999380111694, 'eval_runtime': 2.1238, 'eval_samples_per_second': 58.856, 'eval_steps_per_second': 7.534, 'epoch': 0.987012987012987}


{'loss': 1.7577, 'grad_norm': 17.740947723388672, 'learning_rate': 7.354085603112842e-06, 'epoch': 1.7316017316017316}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.5304714441299438, 'eval_accuracy': 0.4320000112056732, 'eval_runtime': 2.0837, 'eval_samples_per_second': 59.988, 'eval_steps_per_second': 7.679, 'epoch': 1.9913419913419914}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.060055136680603, 'eval_accuracy': 0.7120000123977661, 'eval_runtime': 2.0702, 'eval_samples_per_second': 60.381, 'eval_steps_per_second': 7.729, 'epoch': 2.995670995670996}


{'loss': 1.0424, 'grad_norm': 7.2326812744140625, 'learning_rate': 3.5408560311284052e-06, 'epoch': 3.463203463203463}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8328447341918945, 'eval_accuracy': 0.8240000009536743, 'eval_runtime': 2.0795, 'eval_samples_per_second': 60.112, 'eval_steps_per_second': 7.694, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8474644422531128, 'eval_accuracy': 0.8080000281333923, 'eval_runtime': 2.079, 'eval_samples_per_second': 60.126, 'eval_steps_per_second': 7.696, 'epoch': 4.935064935064935}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 471.0445, 'train_samples_per_second': 9.808, 'train_steps_per_second': 0.605, 'train_loss': 1.1824532625967996, 'epoch': 4.935064935064935}


{'eval_loss': 0.8328447341918945, 'eval_accuracy': 0.8240000009536743, 'eval_runtime': 2.0785, 'eval_samples_per_second': 60.141, 'eval_steps_per_second': 7.698, 'epoch': 4.935064935064935}
[2025-05-27 23:55:32] ✅ Config 1: Accuracy=0.8240, Loss=0.8328


[2025-05-27 23:55:32] 🏆 New best accuracy: 0.8240
[2025-05-27 23:55:32] 
🔬 Testing configuration 2/264
[2025-05-27 23:55:32] Config: LR=1e-05, BS=4, Epochs=5, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 668.10 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 680.38 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 758.04 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 796.53 examples/s]


Map:  52%|█████▏    | 480/924 [00:00<00:00, 857.05 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 896.33 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 970.53 examples/s]


Map:  90%|█████████ | 832/924 [00:00<00:00, 975.13 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 898.92 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 643.07 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 637.09 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.8182, 'grad_norm': 77.6769790649414, 'learning_rate': 9.881297650503641e-06, 'epoch': 0.8658008658008658}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.8448078632354736, 'eval_accuracy': 0.08799999952316284, 'eval_runtime': 2.085, 'eval_samples_per_second': 59.953, 'eval_steps_per_second': 7.674, 'epoch': 0.9956709956709957}


{'loss': 1.5415, 'grad_norm': 16.5361385345459, 'learning_rate': 8.393446994107876e-06, 'epoch': 1.7316017316017316}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.9943680167198181, 'eval_accuracy': 0.7360000014305115, 'eval_runtime': 2.0782, 'eval_samples_per_second': 60.148, 'eval_steps_per_second': 7.699, 'epoch': 2.0}


{'loss': 0.7572, 'grad_norm': 51.521026611328125, 'learning_rate': 5.695198184263259e-06, 'epoch': 2.5974025974025974}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7663177847862244, 'eval_accuracy': 0.8320000171661377, 'eval_runtime': 2.0795, 'eval_samples_per_second': 60.111, 'eval_steps_per_second': 7.694, 'epoch': 2.995670995670996}


{'loss': 0.6384, 'grad_norm': 4.368455410003662, 'learning_rate': 2.7760998187429067e-06, 'epoch': 3.463203463203463}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7342912554740906, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 2.3019, 'eval_samples_per_second': 54.303, 'eval_steps_per_second': 6.951, 'epoch': 4.0}


{'loss': 0.5458, 'grad_norm': 14.160987854003906, 'learning_rate': 6.202208184617065e-07, 'epoch': 4.329004329004329}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7276022434234619, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 2.2852, 'eval_samples_per_second': 54.7, 'eval_steps_per_second': 7.002, 'epoch': 4.978354978354979}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 516.4818, 'train_samples_per_second': 8.945, 'train_steps_per_second': 1.113, 'train_loss': 0.9936763464886209, 'epoch': 4.978354978354979}


{'eval_loss': 0.7342912554740906, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 2.3615, 'eval_samples_per_second': 52.932, 'eval_steps_per_second': 6.775, 'epoch': 4.978354978354979}
[2025-05-28 00:04:19] ✅ Config 2: Accuracy=0.8560, Loss=0.7343


[2025-05-28 00:04:20] 🏆 New best accuracy: 0.8560
[2025-05-28 00:04:20] 
🔬 Testing configuration 3/264
[2025-05-28 00:04:20] Config: LR=1e-05, BS=4, Epochs=5, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 561.91 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 542.65 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 598.65 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:01, 616.50 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 661.89 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 718.41 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 732.86 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 782.79 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 804.31 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 816.31 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 731.12 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 535.91 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 535.83 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 529.68 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.8547500371932983, 'eval_accuracy': 0.14399999380111694, 'eval_runtime': 2.2662, 'eval_samples_per_second': 55.159, 'eval_steps_per_second': 7.06, 'epoch': 0.987012987012987}


{'loss': 1.7941, 'grad_norm': 63.28396987915039, 'learning_rate': 6.666666666666667e-06, 'epoch': 1.7316017316017316}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.7882658243179321, 'eval_accuracy': 0.24799999594688416, 'eval_runtime': 2.4788, 'eval_samples_per_second': 50.427, 'eval_steps_per_second': 6.455, 'epoch': 1.9913419913419914}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.833949327468872, 'eval_accuracy': 0.17599999904632568, 'eval_runtime': 2.2313, 'eval_samples_per_second': 56.02, 'eval_steps_per_second': 7.171, 'epoch': 2.995670995670996}


{'loss': 1.679, 'grad_norm': 27.62133026123047, 'learning_rate': 3.192982456140351e-06, 'epoch': 3.463203463203463}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.4764528274536133, 'eval_accuracy': 0.42399999499320984, 'eval_runtime': 2.4703, 'eval_samples_per_second': 50.602, 'eval_steps_per_second': 6.477, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.360704779624939, 'eval_accuracy': 0.5040000081062317, 'eval_runtime': 2.2327, 'eval_samples_per_second': 55.985, 'eval_steps_per_second': 7.166, 'epoch': 4.935064935064935}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 508.6584, 'train_samples_per_second': 9.083, 'train_steps_per_second': 0.56, 'train_loss': 1.5551301922714502, 'epoch': 4.935064935064935}


{'eval_loss': 1.360704779624939, 'eval_accuracy': 0.5040000081062317, 'eval_runtime': 2.3138, 'eval_samples_per_second': 54.023, 'eval_steps_per_second': 6.915, 'epoch': 4.935064935064935}
[2025-05-28 00:13:00] ✅ Config 3: Accuracy=0.5040, Loss=1.3607


[2025-05-28 00:13:01] 
🔬 Testing configuration 4/264
[2025-05-28 00:13:01] Config: LR=1e-05, BS=4, Epochs=5, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 558.44 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 537.18 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 592.01 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:01, 610.77 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 654.61 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 712.71 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 727.74 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 777.17 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 798.07 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 807.20 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 724.64 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 526.07 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 532.31 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 525.33 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.1491, 'grad_norm': 10.771510124206543, 'learning_rate': 9.28622728502766e-06, 'epoch': 0.8658008658008658}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.5907846689224243, 'eval_accuracy': 0.7599999904632568, 'eval_runtime': 2.2331, 'eval_samples_per_second': 55.977, 'eval_steps_per_second': 7.165, 'epoch': 0.9956709956709957}


{'loss': 0.2201, 'grad_norm': 1.5891590118408203, 'learning_rate': 7.34869773556623e-06, 'epoch': 1.7316017316017316}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.42187440395355225, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 2.2329, 'eval_samples_per_second': 55.981, 'eval_steps_per_second': 7.166, 'epoch': 2.0}


{'loss': 0.0986, 'grad_norm': 0.10462594032287598, 'learning_rate': 4.713316243417499e-06, 'epoch': 2.5974025974025974}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.5498477816581726, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 2.2282, 'eval_samples_per_second': 56.098, 'eval_steps_per_second': 7.181, 'epoch': 2.995670995670996}


{'loss': 0.0266, 'grad_norm': 0.08466280996799469, 'learning_rate': 2.161405935249029e-06, 'epoch': 3.463203463203463}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.44864723086357117, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 2.4647, 'eval_samples_per_second': 50.716, 'eval_steps_per_second': 6.492, 'epoch': 4.0}


{'loss': 0.0093, 'grad_norm': 0.15611359477043152, 'learning_rate': 4.472088650652956e-07, 'epoch': 4.329004329004329}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.4597924053668976, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 2.2169, 'eval_samples_per_second': 56.386, 'eval_steps_per_second': 7.217, 'epoch': 4.978354978354979}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 540.3267, 'train_samples_per_second': 8.55, 'train_steps_per_second': 1.064, 'train_loss': 0.26225914167321246, 'epoch': 4.978354978354979}


{'eval_loss': 0.44864723086357117, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 2.2844, 'eval_samples_per_second': 54.718, 'eval_steps_per_second': 7.004, 'epoch': 4.978354978354979}
[2025-05-28 00:22:13] ✅ Config 4: Accuracy=0.8800, Loss=0.4486


[2025-05-28 00:22:14] 🏆 New best accuracy: 0.8800
[2025-05-28 00:22:14] 
🔬 Testing configuration 5/264


[2025-05-28 00:22:14] Config: LR=1e-05, BS=4, Epochs=7, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 570.96 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 548.75 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 606.85 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 625.52 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 670.92 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 730.78 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 744.61 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 792.86 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 814.37 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 825.09 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 740.43 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 548.43 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 548.07 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 541.17 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.3614, 'grad_norm': 13.265100479125977, 'learning_rate': 8.782608695652174e-06, 'epoch': 0.8658008658008658}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6392760276794434, 'eval_accuracy': 0.800000011920929, 'eval_runtime': 2.2327, 'eval_samples_per_second': 55.987, 'eval_steps_per_second': 7.166, 'epoch': 0.9956709956709957}


{'loss': 0.3083, 'grad_norm': 27.891925811767578, 'learning_rate': 7.552795031055902e-06, 'epoch': 1.7316017316017316}


{'eval_loss': 0.42425963282585144, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 2.2203, 'eval_samples_per_second': 56.298, 'eval_steps_per_second': 7.206, 'epoch': 2.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'loss': 0.1292, 'grad_norm': 0.3109087347984314, 'learning_rate': 6.310559006211181e-06, 'epoch': 2.5974025974025974}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.4352712035179138, 'eval_accuracy': 0.8880000114440918, 'eval_runtime': 2.2196, 'eval_samples_per_second': 56.317, 'eval_steps_per_second': 7.209, 'epoch': 2.995670995670996}


{'loss': 0.0366, 'grad_norm': 0.09337692707777023, 'learning_rate': 5.06832298136646e-06, 'epoch': 3.463203463203463}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.4374842643737793, 'eval_accuracy': 0.8880000114440918, 'eval_runtime': 2.2071, 'eval_samples_per_second': 56.636, 'eval_steps_per_second': 7.249, 'epoch': 4.0}


{'loss': 0.0097, 'grad_norm': 0.09243191033601761, 'learning_rate': 3.8260869565217395e-06, 'epoch': 4.329004329004329}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.49689385294914246, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 2.4697, 'eval_samples_per_second': 50.614, 'eval_steps_per_second': 6.479, 'epoch': 4.995670995670996}


{'loss': 0.0036, 'grad_norm': 0.02563823014497757, 'learning_rate': 2.583850931677019e-06, 'epoch': 5.194805194805195}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.4398375451564789, 'eval_accuracy': 0.8960000276565552, 'eval_runtime': 2.2163, 'eval_samples_per_second': 56.4, 'eval_steps_per_second': 7.219, 'epoch': 6.0}


{'loss': 0.0015, 'grad_norm': 0.05944264680147171, 'learning_rate': 1.341614906832298e-06, 'epoch': 6.0606060606060606}


{'loss': 0.0015, 'grad_norm': 0.028588885441422462, 'learning_rate': 9.937888198757765e-08, 'epoch': 6.926406926406926}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.4389466345310211, 'eval_accuracy': 0.8960000276565552, 'eval_runtime': 2.2249, 'eval_samples_per_second': 56.182, 'eval_steps_per_second': 7.191, 'epoch': 6.96969696969697}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 750.0118, 'train_samples_per_second': 8.624, 'train_steps_per_second': 1.073, 'train_loss': 0.23003781171318907, 'epoch': 6.96969696969697}


{'eval_loss': 0.4398375451564789, 'eval_accuracy': 0.8960000276565552, 'eval_runtime': 2.278, 'eval_samples_per_second': 54.874, 'eval_steps_per_second': 7.024, 'epoch': 6.96969696969697}
[2025-05-28 00:34:56] ✅ Config 5: Accuracy=0.8960, Loss=0.4398


[2025-05-28 00:34:56] 🏆 New best accuracy: 0.8960
[2025-05-28 00:34:56] 
🔬 Testing configuration 6/264
[2025-05-28 00:34:56] Config: LR=1e-05, BS=4, Epochs=7, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 560.38 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 537.87 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 594.83 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:01, 615.62 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 660.82 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 717.51 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 731.78 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 783.45 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 806.23 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 817.92 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 731.22 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 538.68 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 540.11 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 533.70 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.8235219717025757, 'eval_accuracy': 0.19200000166893005, 'eval_runtime': 2.3054, 'eval_samples_per_second': 54.221, 'eval_steps_per_second': 6.94, 'epoch': 0.987012987012987}


{'loss': 1.7685, 'grad_norm': 15.91362190246582, 'learning_rate': 9.414737964294636e-06, 'epoch': 1.7316017316017316}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.7357968091964722, 'eval_accuracy': 0.3199999928474426, 'eval_runtime': 2.2664, 'eval_samples_per_second': 55.153, 'eval_steps_per_second': 7.06, 'epoch': 1.9913419913419914}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.0351643562316895, 'eval_accuracy': 0.6880000233650208, 'eval_runtime': 2.2655, 'eval_samples_per_second': 55.175, 'eval_steps_per_second': 7.062, 'epoch': 2.995670995670996}


{'loss': 1.0943, 'grad_norm': 6.375221252441406, 'learning_rate': 6.0821980696905145e-06, 'epoch': 3.463203463203463}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8160340785980225, 'eval_accuracy': 0.8240000009536743, 'eval_runtime': 2.2767, 'eval_samples_per_second': 54.904, 'eval_steps_per_second': 7.028, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.879168689250946, 'eval_accuracy': 0.7360000014305115, 'eval_runtime': 2.2705, 'eval_samples_per_second': 55.054, 'eval_steps_per_second': 7.047, 'epoch': 4.987012987012987}


{'loss': 0.6134, 'grad_norm': 4.320562362670898, 'learning_rate': 1.956192854956397e-06, 'epoch': 5.194805194805195}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8459477424621582, 'eval_accuracy': 0.7519999742507935, 'eval_runtime': 2.5317, 'eval_samples_per_second': 49.373, 'eval_steps_per_second': 6.32, 'epoch': 5.991341991341991}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8298094868659973, 'eval_accuracy': 0.7760000228881836, 'eval_runtime': 2.2685, 'eval_samples_per_second': 55.102, 'eval_steps_per_second': 7.053, 'epoch': 6.909090909090909}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 709.2269, 'train_samples_per_second': 9.12, 'train_steps_per_second': 0.563, 'train_loss': 1.009231720353129, 'epoch': 6.909090909090909}


{'eval_loss': 0.8160340785980225, 'eval_accuracy': 0.8240000009536743, 'eval_runtime': 2.3247, 'eval_samples_per_second': 53.771, 'eval_steps_per_second': 6.883, 'epoch': 6.909090909090909}
[2025-05-28 00:46:58] ✅ Config 6: Accuracy=0.8240, Loss=0.8160


[2025-05-28 00:46:59] 
🔬 Testing configuration 7/264
[2025-05-28 00:46:59] Config: LR=1e-05, BS=4, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 572.67 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 549.98 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 608.47 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 626.37 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 672.52 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 730.62 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 743.65 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 794.40 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 815.37 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 826.06 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 741.50 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 541.06 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 545.12 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 537.55 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.803375005722046, 'eval_accuracy': 0.20000000298023224, 'eval_runtime': 2.2119, 'eval_samples_per_second': 56.513, 'eval_steps_per_second': 7.234, 'epoch': 0.987012987012987}


{'loss': 1.7728, 'grad_norm': 39.25027847290039, 'learning_rate': 7.644110275689223e-06, 'epoch': 1.7316017316017316}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.7050119638442993, 'eval_accuracy': 0.23199999332427979, 'eval_runtime': 2.2099, 'eval_samples_per_second': 56.563, 'eval_steps_per_second': 7.24, 'epoch': 1.9913419913419914}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.6683943271636963, 'eval_accuracy': 0.29600000381469727, 'eval_runtime': 2.2096, 'eval_samples_per_second': 56.572, 'eval_steps_per_second': 7.241, 'epoch': 2.995670995670996}


{'loss': 1.6596, 'grad_norm': 27.86602210998535, 'learning_rate': 5.1378446115288225e-06, 'epoch': 3.463203463203463}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.5173181295394897, 'eval_accuracy': 0.4480000138282776, 'eval_runtime': 2.2183, 'eval_samples_per_second': 56.349, 'eval_steps_per_second': 7.213, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.9418807625770569, 'eval_accuracy': 0.7120000123977661, 'eval_runtime': 2.2066, 'eval_samples_per_second': 56.647, 'eval_steps_per_second': 7.251, 'epoch': 4.987012987012987}


{'loss': 0.9001, 'grad_norm': 13.42526626586914, 'learning_rate': 2.6566416040100253e-06, 'epoch': 5.194805194805195}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7665919065475464, 'eval_accuracy': 0.7519999742507935, 'eval_runtime': 2.2147, 'eval_samples_per_second': 56.44, 'eval_steps_per_second': 7.224, 'epoch': 5.991341991341991}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7415932416915894, 'eval_accuracy': 0.7599999904632568, 'eval_runtime': 2.2116, 'eval_samples_per_second': 56.521, 'eval_steps_per_second': 7.235, 'epoch': 6.909090909090909}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 696.4385, 'train_samples_per_second': 9.287, 'train_steps_per_second': 0.573, 'train_loss': 1.182487478232324, 'epoch': 6.909090909090909}


{'eval_loss': 0.7415932416915894, 'eval_accuracy': 0.7599999904632568, 'eval_runtime': 2.2686, 'eval_samples_per_second': 55.1, 'eval_steps_per_second': 7.053, 'epoch': 6.909090909090909}
[2025-05-28 00:58:47] ✅ Config 7: Accuracy=0.7600, Loss=0.7416


[2025-05-28 00:58:47] 
🔬 Testing configuration 8/264
[2025-05-28 00:58:47] Config: LR=1e-05, BS=4, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 570.54 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 551.53 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 611.96 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 631.47 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 680.28 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 739.72 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 750.61 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 798.57 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 817.48 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 826.54 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 745.10 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 542.10 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 546.81 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 539.89 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.4898, 'grad_norm': 15.156777381896973, 'learning_rate': 9.638756592879923e-06, 'epoch': 0.8658008658008658}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8740707039833069, 'eval_accuracy': 0.671999990940094, 'eval_runtime': 2.2216, 'eval_samples_per_second': 56.267, 'eval_steps_per_second': 7.202, 'epoch': 0.9956709956709957}


{'loss': 0.3489, 'grad_norm': 28.151718139648438, 'learning_rate': 8.60722509119478e-06, 'epoch': 1.7316017316017316}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.5221884846687317, 'eval_accuracy': 0.8240000009536743, 'eval_runtime': 2.45, 'eval_samples_per_second': 51.02, 'eval_steps_per_second': 6.531, 'epoch': 2.0}


{'loss': 0.1363, 'grad_norm': 1.1532015800476074, 'learning_rate': 7.018817478070793e-06, 'epoch': 2.5974025974025974}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7082356810569763, 'eval_accuracy': 0.8240000009536743, 'eval_runtime': 2.228, 'eval_samples_per_second': 56.103, 'eval_steps_per_second': 7.181, 'epoch': 2.995670995670996}


{'loss': 0.0247, 'grad_norm': 0.03820135071873665, 'learning_rate': 5.126820883887419e-06, 'epoch': 3.463203463203463}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6873800158500671, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 2.4719, 'eval_samples_per_second': 50.569, 'eval_steps_per_second': 6.473, 'epoch': 4.0}


{'loss': 0.0047, 'grad_norm': 0.022660767659544945, 'learning_rate': 3.2157530148935946e-06, 'epoch': 4.329004329004329}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.822327196598053, 'eval_accuracy': 0.8320000171661377, 'eval_runtime': 2.2084, 'eval_samples_per_second': 56.603, 'eval_steps_per_second': 7.245, 'epoch': 4.995670995670996}


{'loss': 0.0012, 'grad_norm': 0.011906172148883343, 'learning_rate': 1.5729995081690963e-06, 'epoch': 5.194805194805195}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7741087079048157, 'eval_accuracy': 0.8240000009536743, 'eval_runtime': 2.2118, 'eval_samples_per_second': 56.515, 'eval_steps_per_second': 7.234, 'epoch': 6.0}


{'loss': 0.0013, 'grad_norm': 0.02471020072698593, 'learning_rate': 4.455969921637698e-07, 'epoch': 6.0606060606060606}


{'loss': 0.0006, 'grad_norm': 0.01959841325879097, 'learning_rate': 3.0838153393697443e-09, 'epoch': 6.926406926406926}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7740093469619751, 'eval_accuracy': 0.8240000009536743, 'eval_runtime': 2.2175, 'eval_samples_per_second': 56.371, 'eval_steps_per_second': 7.215, 'epoch': 6.96969696969697}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 752.6105, 'train_samples_per_second': 8.594, 'train_steps_per_second': 1.07, 'train_loss': 0.24939020192771225, 'epoch': 6.96969696969697}


{'eval_loss': 0.6873800158500671, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 2.2125, 'eval_samples_per_second': 56.498, 'eval_steps_per_second': 7.232, 'epoch': 6.96969696969697}
[2025-05-28 01:11:31] ✅ Config 8: Accuracy=0.8560, Loss=0.6874


[2025-05-28 01:11:32] 
🔬 Testing configuration 9/264
[2025-05-28 01:11:32] Config: LR=1e-05, BS=8, Epochs=5, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 576.10 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 556.65 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 611.09 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 628.97 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 673.65 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 733.70 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 744.49 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 793.79 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 813.26 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 824.20 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 742.26 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 527.74 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 532.48 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 525.57 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.7967, 'grad_norm': 15.089225769042969, 'learning_rate': 8.362068965517242e-06, 'epoch': 0.8620689655172413}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.6680938005447388, 'eval_accuracy': 0.35199999809265137, 'eval_runtime': 1.7817, 'eval_samples_per_second': 70.157, 'eval_steps_per_second': 4.49, 'epoch': 1.0}


{'loss': 0.9144, 'grad_norm': 16.215105056762695, 'learning_rate': 6.63793103448276e-06, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.624183714389801, 'eval_accuracy': 0.7919999957084656, 'eval_runtime': 1.7775, 'eval_samples_per_second': 70.324, 'eval_steps_per_second': 4.501, 'epoch': 2.0}


{'loss': 0.3622, 'grad_norm': 2.1861324310302734, 'learning_rate': 4.931034482758621e-06, 'epoch': 2.586206896551724}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.5804300308227539, 'eval_accuracy': 0.7919999957084656, 'eval_runtime': 1.7766, 'eval_samples_per_second': 70.361, 'eval_steps_per_second': 4.503, 'epoch': 3.0}


{'loss': 0.3068, 'grad_norm': 13.645232200622559, 'learning_rate': 3.206896551724138e-06, 'epoch': 3.4482758620689653}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.42031267285346985, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 2.0308, 'eval_samples_per_second': 61.553, 'eval_steps_per_second': 3.939, 'epoch': 4.0}


{'loss': 0.1388, 'grad_norm': 53.43833923339844, 'learning_rate': 1.4827586206896551e-06, 'epoch': 4.310344827586207}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.4419074058532715, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 1.7712, 'eval_samples_per_second': 70.572, 'eval_steps_per_second': 4.517, 'epoch': 5.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 436.8476, 'train_samples_per_second': 10.576, 'train_steps_per_second': 1.328, 'train_loss': 0.6257170266118542, 'epoch': 5.0}


{'eval_loss': 0.42031267285346985, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 1.8496, 'eval_samples_per_second': 67.581, 'eval_steps_per_second': 4.325, 'epoch': 5.0}
[2025-05-28 01:19:00] ✅ Config 9: Accuracy=0.8480, Loss=0.4203


[2025-05-28 01:19:00] 
🔬 Testing configuration 10/264
[2025-05-28 01:19:00] Config: LR=1e-05, BS=8, Epochs=5, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 576.77 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 555.33 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 607.21 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 621.37 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 667.43 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 726.19 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 741.86 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 792.08 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 809.98 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 821.85 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 738.92 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 539.27 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 541.37 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 534.15 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.6297059059143066, 'eval_accuracy': 0.31200000643730164, 'eval_runtime': 1.8438, 'eval_samples_per_second': 67.796, 'eval_steps_per_second': 4.339, 'epoch': 1.0}


{'loss': 1.3719, 'grad_norm': 10.089082717895508, 'learning_rate': 8.339708152022586e-06, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8973668813705444, 'eval_accuracy': 0.7440000176429749, 'eval_runtime': 2.0584, 'eval_samples_per_second': 60.726, 'eval_steps_per_second': 3.886, 'epoch': 2.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7589791417121887, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 1.8254, 'eval_samples_per_second': 68.48, 'eval_steps_per_second': 4.383, 'epoch': 3.0}


{'loss': 0.5602, 'grad_norm': 5.679825782775879, 'learning_rate': 2.7460879906210485e-06, 'epoch': 3.4482758620689653}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7382192611694336, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 1.8231, 'eval_samples_per_second': 68.565, 'eval_steps_per_second': 4.388, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7168979048728943, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 1.8262, 'eval_samples_per_second': 68.449, 'eval_steps_per_second': 4.381, 'epoch': 5.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 434.4781, 'train_samples_per_second': 10.633, 'train_steps_per_second': 0.667, 'train_loss': 0.8068800432928678, 'epoch': 5.0}


{'eval_loss': 0.7168979048728943, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 1.8815, 'eval_samples_per_second': 66.438, 'eval_steps_per_second': 4.252, 'epoch': 5.0}
[2025-05-28 01:26:26] ✅ Config 10: Accuracy=0.8720, Loss=0.7169


[2025-05-28 01:26:26] 💾 Saved checkpoint at 10 configurations
[2025-05-28 01:26:26] 
🔬 Testing configuration 11/264
[2025-05-28 01:26:26] Config: LR=1e-05, BS=8, Epochs=5, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 575.56 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 547.70 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 601.09 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:01, 617.92 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 664.88 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 718.52 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 730.29 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 782.32 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 810.02 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 826.09 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 736.09 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 530.39 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 537.59 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 530.30 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.7841169834136963, 'eval_accuracy': 0.17599999904632568, 'eval_runtime': 1.7801, 'eval_samples_per_second': 70.22, 'eval_steps_per_second': 4.494, 'epoch': 1.0}


{'loss': 1.7533, 'grad_norm': 91.0900650024414, 'learning_rate': 6.758620689655173e-06, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.6856873035430908, 'eval_accuracy': 0.2800000011920929, 'eval_runtime': 2.0197, 'eval_samples_per_second': 61.889, 'eval_steps_per_second': 3.961, 'epoch': 2.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.9751405715942383, 'eval_accuracy': 0.6480000019073486, 'eval_runtime': 1.7835, 'eval_samples_per_second': 70.088, 'eval_steps_per_second': 4.486, 'epoch': 3.0}


{'loss': 1.0406, 'grad_norm': 16.002017974853516, 'learning_rate': 3.3448275862068967e-06, 'epoch': 3.4482758620689653}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6832671165466309, 'eval_accuracy': 0.7760000228881836, 'eval_runtime': 1.7784, 'eval_samples_per_second': 70.288, 'eval_steps_per_second': 4.498, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6800140142440796, 'eval_accuracy': 0.7519999742507935, 'eval_runtime': 1.7847, 'eval_samples_per_second': 70.039, 'eval_steps_per_second': 4.482, 'epoch': 5.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 436.1733, 'train_samples_per_second': 10.592, 'train_steps_per_second': 0.665, 'train_loss': 1.0929892178239493, 'epoch': 5.0}


{'eval_loss': 0.6832671165466309, 'eval_accuracy': 0.7760000228881836, 'eval_runtime': 1.8295, 'eval_samples_per_second': 68.324, 'eval_steps_per_second': 4.373, 'epoch': 5.0}
[2025-05-28 01:33:54] ✅ Config 11: Accuracy=0.7760, Loss=0.6833


[2025-05-28 01:33:55] 
🔬 Testing configuration 12/264
[2025-05-28 01:33:55] Config: LR=1e-05, BS=8, Epochs=5, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 570.23 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 548.38 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 606.12 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 626.18 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 674.20 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 733.80 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 746.78 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 796.70 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 816.75 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 827.82 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 742.34 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 525.18 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 528.28 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 521.15 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.8112, 'grad_norm': 21.524330139160156, 'learning_rate': 9.365784329704114e-06, 'epoch': 0.8620689655172413}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.5702303647994995, 'eval_accuracy': 0.3919999897480011, 'eval_runtime': 2.0152, 'eval_samples_per_second': 62.029, 'eval_steps_per_second': 3.97, 'epoch': 1.0}


{'loss': 0.9676, 'grad_norm': 13.63489055633545, 'learning_rate': 7.4843475331969614e-06, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6570587158203125, 'eval_accuracy': 0.7680000066757202, 'eval_runtime': 2.0113, 'eval_samples_per_second': 62.15, 'eval_steps_per_second': 3.978, 'epoch': 2.0}


{'loss': 0.3732, 'grad_norm': 5.078655242919922, 'learning_rate': 4.891677694124013e-06, 'epoch': 2.586206896551724}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.48418477177619934, 'eval_accuracy': 0.8080000281333923, 'eval_runtime': 1.7643, 'eval_samples_per_second': 70.851, 'eval_steps_per_second': 4.534, 'epoch': 3.0}


{'loss': 0.2742, 'grad_norm': 14.305235862731934, 'learning_rate': 2.3529560949975184e-06, 'epoch': 3.4482758620689653}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.4073072373867035, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 1.6856, 'eval_samples_per_second': 74.156, 'eval_steps_per_second': 4.746, 'epoch': 4.0}


{'loss': 0.1414, 'grad_norm': 38.605892181396484, 'learning_rate': 5.449673790581611e-07, 'epoch': 4.310344827586207}


{'eval_loss': 0.419299840927124, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 1.7122, 'eval_samples_per_second': 73.005, 'eval_steps_per_second': 4.672, 'epoch': 5.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 432.9167, 'train_samples_per_second': 10.672, 'train_steps_per_second': 1.34, 'train_loss': 0.6347389566487279, 'epoch': 5.0}


{'eval_loss': 0.4073072373867035, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 1.732, 'eval_samples_per_second': 72.171, 'eval_steps_per_second': 4.619, 'epoch': 5.0}
[2025-05-28 01:41:19] ✅ Config 12: Accuracy=0.8640, Loss=0.4073


[2025-05-28 01:41:19] 
🔬 Testing configuration 13/264
[2025-05-28 01:41:19] Config: LR=1e-05, BS=8, Epochs=7, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  10%|█         | 96/924 [00:00<00:01, 656.60 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  21%|██        | 192/924 [00:00<00:01, 729.77 examples/s]


Map:  31%|███       | 288/924 [00:00<00:00, 777.15 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  43%|████▎     | 400/924 [00:00<00:00, 820.30 examples/s]


Map:  57%|█████▋    | 528/924 [00:00<00:00, 915.30 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  69%|██████▉   | 640/924 [00:00<00:00, 942.68 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  83%|████████▎ | 768/924 [00:00<00:00, 987.75 examples/s]


Map:  97%|█████████▋| 896/924 [00:00<00:00, 1012.69 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 917.62 examples/s] 





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 647.80 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 647.28 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.6758, 'grad_norm': 19.5002384185791, 'learning_rate': 9.740437158469946e-06, 'epoch': 0.8620689655172413}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.2756211757659912, 'eval_accuracy': 0.47999998927116394, 'eval_runtime': 1.6754, 'eval_samples_per_second': 74.611, 'eval_steps_per_second': 4.775, 'epoch': 1.0}


{'loss': 0.5854, 'grad_norm': 40.560298919677734, 'learning_rate': 8.387978142076504e-06, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.5656331777572632, 'eval_accuracy': 0.8159999847412109, 'eval_runtime': 1.8925, 'eval_samples_per_second': 66.049, 'eval_steps_per_second': 4.227, 'epoch': 2.0}


{'loss': 0.1726, 'grad_norm': nan, 'learning_rate': 7.0355191256830605e-06, 'epoch': 2.586206896551724}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6196777820587158, 'eval_accuracy': 0.8320000171661377, 'eval_runtime': 1.6728, 'eval_samples_per_second': 74.727, 'eval_steps_per_second': 4.783, 'epoch': 3.0}


{'loss': 0.061, 'grad_norm': 16.295589447021484, 'learning_rate': 5.669398907103826e-06, 'epoch': 3.4482758620689653}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7296677231788635, 'eval_accuracy': 0.8320000171661377, 'eval_runtime': 1.6704, 'eval_samples_per_second': 74.833, 'eval_steps_per_second': 4.789, 'epoch': 4.0}


{'loss': 0.0103, 'grad_norm': 0.027810407802462578, 'learning_rate': 4.30327868852459e-06, 'epoch': 4.310344827586207}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7873321771621704, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 1.8965, 'eval_samples_per_second': 65.911, 'eval_steps_per_second': 4.218, 'epoch': 5.0}


{'loss': 0.0048, 'grad_norm': 0.02127613127231598, 'learning_rate': 2.9371584699453553e-06, 'epoch': 5.172413793103448}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8150122165679932, 'eval_accuracy': 0.8320000171661377, 'eval_runtime': 1.664, 'eval_samples_per_second': 75.121, 'eval_steps_per_second': 4.808, 'epoch': 6.0}


{'loss': 0.001, 'grad_norm': 0.026979723945260048, 'learning_rate': 1.5710382513661205e-06, 'epoch': 6.0344827586206895}


{'loss': 0.0012, 'grad_norm': 0.014118172228336334, 'learning_rate': 2.0491803278688524e-07, 'epoch': 6.896551724137931}


{'eval_loss': 0.8122596144676208, 'eval_accuracy': 0.8320000171661377, 'eval_runtime': 1.8873, 'eval_samples_per_second': 66.233, 'eval_steps_per_second': 4.239, 'epoch': 7.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 568.5352, 'train_samples_per_second': 11.377, 'train_steps_per_second': 1.428, 'train_loss': 0.30940161509613684, 'epoch': 7.0}


{'eval_loss': 0.7873321771621704, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 1.7112, 'eval_samples_per_second': 73.048, 'eval_steps_per_second': 4.675, 'epoch': 7.0}
[2025-05-28 01:50:58] ✅ Config 13: Accuracy=0.8400, Loss=0.7873


[2025-05-28 01:50:58] 
🔬 Testing configuration 14/264
[2025-05-28 01:50:58] Config: LR=1e-05, BS=8, Epochs=7, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  10%|█         | 96/924 [00:00<00:01, 653.51 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  21%|██        | 192/924 [00:00<00:01, 727.60 examples/s]


Map:  31%|███       | 288/924 [00:00<00:00, 767.10 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  43%|████▎     | 400/924 [00:00<00:00, 808.41 examples/s]


Map:  57%|█████▋    | 528/924 [00:00<00:00, 904.22 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  69%|██████▉   | 640/924 [00:00<00:00, 931.34 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  83%|████████▎ | 768/924 [00:00<00:00, 982.17 examples/s]


Map:  97%|█████████▋| 896/924 [00:00<00:00, 1009.85 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 911.72 examples/s] 





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 658.40 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 654.25 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.2741, 'grad_norm': 9.191941261291504, 'learning_rate': 9.644883599083959e-06, 'epoch': 0.8620689655172413}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7688692212104797, 'eval_accuracy': 0.8320000171661377, 'eval_runtime': 1.7126, 'eval_samples_per_second': 72.989, 'eval_steps_per_second': 4.671, 'epoch': 1.0}


{'loss': 0.5852, 'grad_norm': 20.582365036010742, 'learning_rate': 8.616646928238207e-06, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6984568238258362, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 1.7056, 'eval_samples_per_second': 73.289, 'eval_steps_per_second': 4.69, 'epoch': 2.0}


{'loss': 0.4939, 'grad_norm': 37.07728576660156, 'learning_rate': 7.046634727088898e-06, 'epoch': 2.586206896551724}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7121644616127014, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 1.7111, 'eval_samples_per_second': 73.054, 'eval_steps_per_second': 4.675, 'epoch': 3.0}


{'loss': 0.4529, 'grad_norm': 2.8463099002838135, 'learning_rate': 5.1740678606896255e-06, 'epoch': 3.4482758620689653}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6862800717353821, 'eval_accuracy': 0.8880000114440918, 'eval_runtime': 1.932, 'eval_samples_per_second': 64.7, 'eval_steps_per_second': 4.141, 'epoch': 4.0}


{'loss': 0.4283, 'grad_norm': 0.33538585901260376, 'learning_rate': 3.275768486860149e-06, 'epoch': 4.310344827586207}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6905235052108765, 'eval_accuracy': 0.8880000114440918, 'eval_runtime': 1.7107, 'eval_samples_per_second': 73.069, 'eval_steps_per_second': 4.676, 'epoch': 5.0}


{'loss': 0.4264, 'grad_norm': 0.18090951442718506, 'learning_rate': 1.6323628082107052e-06, 'epoch': 5.172413793103448}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.685494601726532, 'eval_accuracy': 0.8880000114440918, 'eval_runtime': 1.9361, 'eval_samples_per_second': 64.564, 'eval_steps_per_second': 4.132, 'epoch': 6.0}


{'loss': 0.4261, 'grad_norm': 0.24448181688785553, 'learning_rate': 4.86796010998794e-07, 'epoch': 6.0344827586206895}


{'loss': 0.4241, 'grad_norm': 0.2746351957321167, 'learning_rate': 8.417600665923676e-09, 'epoch': 6.896551724137931}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6882301568984985, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 1.7118, 'eval_samples_per_second': 73.023, 'eval_steps_per_second': 4.673, 'epoch': 7.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 601.8356, 'train_samples_per_second': 10.747, 'train_steps_per_second': 1.349, 'train_loss': 0.5617956093379429, 'epoch': 7.0}


{'eval_loss': 0.6862800717353821, 'eval_accuracy': 0.8880000114440918, 'eval_runtime': 1.7648, 'eval_samples_per_second': 70.829, 'eval_steps_per_second': 4.533, 'epoch': 7.0}
[2025-05-28 02:01:10] ✅ Config 14: Accuracy=0.8880, Loss=0.6863


[2025-05-28 02:01:10] 
🔬 Testing configuration 15/264
[2025-05-28 02:01:10] Config: LR=1e-05, BS=8, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 675.66 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 685.08 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 762.02 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 803.47 examples/s]


Map:  52%|█████▏    | 480/924 [00:00<00:00, 867.82 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 907.80 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 989.96 examples/s]


Map:  92%|█████████▏| 848/924 [00:00<00:00, 1003.41 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 912.73 examples/s] 





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 643.84 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 643.25 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.1866, 'grad_norm': 10.954188346862793, 'learning_rate': 8.805418719211823e-06, 'epoch': 0.8620689655172413}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6035654544830322, 'eval_accuracy': 0.7839999794960022, 'eval_runtime': 1.6685, 'eval_samples_per_second': 74.916, 'eval_steps_per_second': 4.795, 'epoch': 1.0}


{'loss': 0.2445, 'grad_norm': 35.382606506347656, 'learning_rate': 7.586206896551724e-06, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.5884000658988953, 'eval_accuracy': 0.7839999794960022, 'eval_runtime': 1.667, 'eval_samples_per_second': 74.983, 'eval_steps_per_second': 4.799, 'epoch': 2.0}


{'loss': 0.1256, 'grad_norm': 16.446346282958984, 'learning_rate': 6.3546798029556655e-06, 'epoch': 2.586206896551724}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.5310825109481812, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 1.8958, 'eval_samples_per_second': 65.934, 'eval_steps_per_second': 4.22, 'epoch': 3.0}


{'loss': 0.0543, 'grad_norm': 10.242847442626953, 'learning_rate': 5.123152709359607e-06, 'epoch': 3.4482758620689653}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.4660699665546417, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 1.6663, 'eval_samples_per_second': 75.015, 'eval_steps_per_second': 4.801, 'epoch': 4.0}


{'loss': 0.0071, 'grad_norm': 0.07982144504785538, 'learning_rate': 3.891625615763547e-06, 'epoch': 4.310344827586207}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.5122695565223694, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 1.6684, 'eval_samples_per_second': 74.92, 'eval_steps_per_second': 4.795, 'epoch': 5.0}


{'loss': 0.004, 'grad_norm': 0.03393414616584778, 'learning_rate': 2.660098522167488e-06, 'epoch': 5.172413793103448}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.5701315402984619, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 1.9024, 'eval_samples_per_second': 65.705, 'eval_steps_per_second': 4.205, 'epoch': 6.0}


{'loss': 0.0067, 'grad_norm': 0.06654118746519089, 'learning_rate': 1.4285714285714286e-06, 'epoch': 6.0344827586206895}


{'loss': 0.0015, 'grad_norm': 0.015622176229953766, 'learning_rate': 1.9704433497536947e-07, 'epoch': 6.896551724137931}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.5678538680076599, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 1.6712, 'eval_samples_per_second': 74.794, 'eval_steps_per_second': 4.787, 'epoch': 7.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 605.112, 'train_samples_per_second': 10.689, 'train_steps_per_second': 1.342, 'train_loss': 0.20077309463852144, 'epoch': 7.0}


{'eval_loss': 0.4660699665546417, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 1.7346, 'eval_samples_per_second': 72.061, 'eval_steps_per_second': 4.612, 'epoch': 7.0}
[2025-05-28 02:11:26] ✅ Config 15: Accuracy=0.8720, Loss=0.4661


[2025-05-28 02:11:26] 
🔬 Testing configuration 16/264
[2025-05-28 02:11:26] Config: LR=1e-05, BS=8, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  10%|█         | 96/924 [00:00<00:01, 649.10 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  21%|██        | 192/924 [00:00<00:01, 722.19 examples/s]


Map:  31%|███       | 288/924 [00:00<00:00, 764.89 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  42%|████▏     | 384/924 [00:00<00:00, 793.99 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 869.96 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 889.92 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 980.92 examples/s]


Map:  92%|█████████▏| 848/924 [00:00<00:00, 989.26 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 898.68 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 658.23 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 653.19 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.8463200330734253, 'eval_accuracy': 0.19200000166893005, 'eval_runtime': 1.6676, 'eval_samples_per_second': 74.959, 'eval_steps_per_second': 4.797, 'epoch': 1.0}


{'loss': 1.6825, 'grad_norm': 115.46879577636719, 'learning_rate': 9.416522805888072e-06, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.2408537864685059, 'eval_accuracy': 0.5920000076293945, 'eval_runtime': 2.0161, 'eval_samples_per_second': 62.0, 'eval_steps_per_second': 3.968, 'epoch': 2.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6626816391944885, 'eval_accuracy': 0.7599999904632568, 'eval_runtime': 1.7835, 'eval_samples_per_second': 70.088, 'eval_steps_per_second': 4.486, 'epoch': 3.0}


{'loss': 0.5454, 'grad_norm': 10.765331268310547, 'learning_rate': 6.166186332682203e-06, 'epoch': 3.4482758620689653}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.3552091121673584, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 1.7893, 'eval_samples_per_second': 69.858, 'eval_steps_per_second': 4.471, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.4031667709350586, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 1.7796, 'eval_samples_per_second': 70.242, 'eval_steps_per_second': 4.496, 'epoch': 5.0}


{'loss': 0.193, 'grad_norm': 28.557376861572266, 'learning_rate': 2.092324704199938e-06, 'epoch': 5.172413793103448}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.39300039410591125, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 2.029, 'eval_samples_per_second': 61.607, 'eval_steps_per_second': 3.943, 'epoch': 6.0}


{'loss': 0.12, 'grad_norm': 3.3580074310302734, 'learning_rate': 2.2149915932981327e-08, 'epoch': 6.896551724137931}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.40784457325935364, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 1.7818, 'eval_samples_per_second': 70.152, 'eval_steps_per_second': 4.49, 'epoch': 7.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 562.5048, 'train_samples_per_second': 11.499, 'train_steps_per_second': 0.722, 'train_loss': 0.6268361319462067, 'epoch': 7.0}


{'eval_loss': 0.3552091121673584, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 1.8338, 'eval_samples_per_second': 68.166, 'eval_steps_per_second': 4.363, 'epoch': 7.0}
[2025-05-28 02:20:59] ✅ Config 16: Accuracy=0.8800, Loss=0.3552


[2025-05-28 02:20:59] 
🔬 Testing configuration 17/264
[2025-05-28 02:20:59] Config: LR=2e-05, BS=4, Epochs=5, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 578.61 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 554.62 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 606.96 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 622.88 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 667.65 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 725.73 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 740.78 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 789.74 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 811.76 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 824.65 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 738.86 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 546.02 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 547.17 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 540.69 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8531683087348938, 'eval_accuracy': 0.7760000228881836, 'eval_runtime': 2.3193, 'eval_samples_per_second': 53.896, 'eval_steps_per_second': 6.899, 'epoch': 0.987012987012987}


{'loss': 1.044, 'grad_norm': 6.094135761260986, 'learning_rate': 1.4552529182879378e-05, 'epoch': 1.7316017316017316}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8740082383155823, 'eval_accuracy': 0.7680000066757202, 'eval_runtime': 2.5263, 'eval_samples_per_second': 49.479, 'eval_steps_per_second': 6.333, 'epoch': 1.9913419913419914}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.781688928604126, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 2.5339, 'eval_samples_per_second': 49.331, 'eval_steps_per_second': 6.314, 'epoch': 2.995670995670996}


{'loss': 0.4819, 'grad_norm': 0.49597811698913574, 'learning_rate': 6.770428015564204e-06, 'epoch': 3.463203463203463}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7150380611419678, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 2.2745, 'eval_samples_per_second': 54.958, 'eval_steps_per_second': 7.035, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7257539629936218, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 2.5195, 'eval_samples_per_second': 49.613, 'eval_steps_per_second': 6.35, 'epoch': 4.935064935064935}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 501.9589, 'train_samples_per_second': 9.204, 'train_steps_per_second': 0.568, 'train_loss': 0.6635827583179139, 'epoch': 4.935064935064935}


{'eval_loss': 0.781688928604126, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 2.3171, 'eval_samples_per_second': 53.948, 'eval_steps_per_second': 6.905, 'epoch': 4.935064935064935}
[2025-05-28 02:29:33] ✅ Config 17: Accuracy=0.8560, Loss=0.7817


[2025-05-28 02:29:34] 
🔬 Testing configuration 18/264
[2025-05-28 02:29:34] Config: LR=2e-05, BS=4, Epochs=5, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 561.14 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 549.65 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 604.68 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 624.39 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 670.23 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 731.86 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 745.00 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 796.69 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 818.43 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 829.07 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 742.34 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 538.65 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 543.60 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 535.82 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.2516, 'grad_norm': 12.451430320739746, 'learning_rate': 1.9692433491602732e-05, 'epoch': 0.8658008658008658}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.5578044652938843, 'eval_accuracy': 0.8159999847412109, 'eval_runtime': 2.2563, 'eval_samples_per_second': 55.4, 'eval_steps_per_second': 7.091, 'epoch': 0.9956709956709957}


{'loss': 0.2494, 'grad_norm': 1.7666897773742676, 'learning_rate': 1.656109885288794e-05, 'epoch': 1.7316017316017316}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.5049977898597717, 'eval_accuracy': 0.8880000114440918, 'eval_runtime': 2.4617, 'eval_samples_per_second': 50.778, 'eval_steps_per_second': 6.5, 'epoch': 2.0}


{'loss': 0.0779, 'grad_norm': 0.04140656068921089, 'learning_rate': 1.1089506078442709e-05, 'epoch': 2.5974025974025974}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.5011855363845825, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 2.458, 'eval_samples_per_second': 50.854, 'eval_steps_per_second': 6.509, 'epoch': 2.995670995670996}


{'loss': 0.0166, 'grad_norm': 0.06628559529781342, 'learning_rate': 5.229300737539801e-06, 'epoch': 3.463203463203463}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.5298710465431213, 'eval_accuracy': 0.9120000004768372, 'eval_runtime': 2.4771, 'eval_samples_per_second': 50.463, 'eval_steps_per_second': 6.459, 'epoch': 4.0}


{'loss': 0.0011, 'grad_norm': 0.08821761608123779, 'learning_rate': 1.0707414185043163e-06, 'epoch': 4.329004329004329}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.5266036987304688, 'eval_accuracy': 0.9120000004768372, 'eval_runtime': 2.4459, 'eval_samples_per_second': 51.107, 'eval_steps_per_second': 6.542, 'epoch': 4.978354978354979}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 538.6138, 'train_samples_per_second': 8.578, 'train_steps_per_second': 1.068, 'train_loss': 0.27780218098474585, 'epoch': 4.978354978354979}


{'eval_loss': 0.5298710465431213, 'eval_accuracy': 0.9120000004768372, 'eval_runtime': 2.2669, 'eval_samples_per_second': 55.141, 'eval_steps_per_second': 7.058, 'epoch': 4.978354978354979}
[2025-05-28 02:38:44] ✅ Config 18: Accuracy=0.9120, Loss=0.5299


[2025-05-28 02:38:44] 🏆 New best accuracy: 0.9120
[2025-05-28 02:38:44] 
🔬 Testing configuration 19/264
[2025-05-28 02:38:44] Config: LR=2e-05, BS=4, Epochs=5, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 566.02 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 544.78 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 603.54 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 622.42 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 669.62 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 728.58 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 742.83 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 790.57 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 811.77 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 822.46 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 738.35 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 546.79 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 547.78 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 540.47 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.0183411836624146, 'eval_accuracy': 0.656000018119812, 'eval_runtime': 2.2606, 'eval_samples_per_second': 55.294, 'eval_steps_per_second': 7.078, 'epoch': 0.987012987012987}


{'loss': 1.0248, 'grad_norm': 15.132699966430664, 'learning_rate': 1.4552529182879378e-05, 'epoch': 1.7316017316017316}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.3970477283000946, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 2.46, 'eval_samples_per_second': 50.813, 'eval_steps_per_second': 6.504, 'epoch': 1.9913419913419914}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.4900348484516144, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 2.3796, 'eval_samples_per_second': 52.529, 'eval_steps_per_second': 6.724, 'epoch': 2.995670995670996}


{'loss': 0.0968, 'grad_norm': 6.27913236618042, 'learning_rate': 6.848249027237355e-06, 'epoch': 3.463203463203463}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.45412561297416687, 'eval_accuracy': 0.8320000171661377, 'eval_runtime': 2.4615, 'eval_samples_per_second': 50.782, 'eval_steps_per_second': 6.5, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.42848312854766846, 'eval_accuracy': 0.8880000114440918, 'eval_runtime': 2.4625, 'eval_samples_per_second': 50.762, 'eval_steps_per_second': 6.498, 'epoch': 4.935064935064935}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 505.4131, 'train_samples_per_second': 9.141, 'train_steps_per_second': 0.564, 'train_loss': 0.3978572694878829, 'epoch': 4.935064935064935}


{'eval_loss': 0.42848312854766846, 'eval_accuracy': 0.8880000114440918, 'eval_runtime': 2.5108, 'eval_samples_per_second': 49.785, 'eval_steps_per_second': 6.372, 'epoch': 4.935064935064935}
[2025-05-28 02:47:22] ✅ Config 19: Accuracy=0.8880, Loss=0.4285


[2025-05-28 02:47:22] 
🔬 Testing configuration 20/264
[2025-05-28 02:47:22] Config: LR=2e-05, BS=4, Epochs=5, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 563.93 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 550.85 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 604.66 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 627.35 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 674.56 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 733.52 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 745.51 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 796.38 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 817.07 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 828.70 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 743.17 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 541.64 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 545.33 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 538.71 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.7713, 'grad_norm': 18.564064025878906, 'learning_rate': 1.873661485346977e-05, 'epoch': 0.8658008658008658}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.0892592668533325, 'eval_accuracy': 0.6399999856948853, 'eval_runtime': 2.2292, 'eval_samples_per_second': 56.074, 'eval_steps_per_second': 7.178, 'epoch': 0.9956709956709957}


{'loss': 0.5743, 'grad_norm': 5.440195083618164, 'learning_rate': 1.4936779212365317e-05, 'epoch': 1.7316017316017316}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.4663859009742737, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 2.5066, 'eval_samples_per_second': 49.869, 'eval_steps_per_second': 6.383, 'epoch': 2.0}


{'loss': 0.2427, 'grad_norm': 0.39885425567626953, 'learning_rate': 9.699545056543546e-06, 'epoch': 2.5974025974025974}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.623407244682312, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 2.4756, 'eval_samples_per_second': 50.493, 'eval_steps_per_second': 6.463, 'epoch': 2.995670995670996}


{'loss': 0.127, 'grad_norm': 1.9487202167510986, 'learning_rate': 4.549791719872458e-06, 'epoch': 3.463203463203463}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6554528474807739, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 2.469, 'eval_samples_per_second': 50.628, 'eval_steps_per_second': 6.48, 'epoch': 4.0}


{'loss': 0.0606, 'grad_norm': 0.03330887109041214, 'learning_rate': 9.869275167346237e-07, 'epoch': 4.329004329004329}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6211623549461365, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 2.4691, 'eval_samples_per_second': 50.625, 'eval_steps_per_second': 6.48, 'epoch': 4.978354978354979}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 525.9571, 'train_samples_per_second': 8.784, 'train_steps_per_second': 1.093, 'train_loss': 0.48993962826936144, 'epoch': 4.978354978354979}


{'eval_loss': 0.4663859009742737, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 2.2863, 'eval_samples_per_second': 54.673, 'eval_steps_per_second': 6.998, 'epoch': 4.978354978354979}
[2025-05-28 02:56:22] ✅ Config 20: Accuracy=0.8640, Loss=0.4664


[2025-05-28 02:56:22] 💾 Saved checkpoint at 20 configurations
[2025-05-28 02:56:22] 
🔬 Testing configuration 21/264
[2025-05-28 02:56:22] Config: LR=2e-05, BS=4, Epochs=7, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 568.50 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 553.07 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 604.73 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 625.36 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 674.45 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 732.72 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 744.65 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 790.90 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 812.83 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 823.89 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 740.34 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 548.09 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 548.15 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 541.84 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.2723379135131836, 'eval_accuracy': 0.527999997138977, 'eval_runtime': 2.2166, 'eval_samples_per_second': 56.393, 'eval_steps_per_second': 7.218, 'epoch': 0.987012987012987}


{'loss': 1.0931, 'grad_norm': 8.147783279418945, 'learning_rate': 1.6722222222222225e-05, 'epoch': 1.7316017316017316}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.37090322375297546, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 2.4712, 'eval_samples_per_second': 50.583, 'eval_steps_per_second': 6.475, 'epoch': 1.9913419913419914}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.39861705899238586, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 2.4806, 'eval_samples_per_second': 50.391, 'eval_steps_per_second': 6.45, 'epoch': 2.995670995670996}


{'loss': 0.1058, 'grad_norm': 5.736083507537842, 'learning_rate': 1.1222222222222224e-05, 'epoch': 3.463203463203463}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.36396464705467224, 'eval_accuracy': 0.9039999842643738, 'eval_runtime': 2.4739, 'eval_samples_per_second': 50.527, 'eval_steps_per_second': 6.467, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.46285951137542725, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 2.4851, 'eval_samples_per_second': 50.299, 'eval_steps_per_second': 6.438, 'epoch': 4.987012987012987}


{'loss': 0.0126, 'grad_norm': 0.02305622026324272, 'learning_rate': 5.722222222222222e-06, 'epoch': 5.194805194805195}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.43859127163887024, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 2.4813, 'eval_samples_per_second': 50.376, 'eval_steps_per_second': 6.448, 'epoch': 5.991341991341991}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.43274492025375366, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 2.4723, 'eval_samples_per_second': 50.56, 'eval_steps_per_second': 6.472, 'epoch': 6.909090909090909}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 701.4838, 'train_samples_per_second': 9.22, 'train_steps_per_second': 0.569, 'train_loss': 0.3039520405288926, 'epoch': 6.909090909090909}


{'eval_loss': 0.36396464705467224, 'eval_accuracy': 0.9039999842643738, 'eval_runtime': 2.2767, 'eval_samples_per_second': 54.905, 'eval_steps_per_second': 7.028, 'epoch': 6.909090909090909}
[2025-05-28 03:08:17] ✅ Config 21: Accuracy=0.9040, Loss=0.3640


[2025-05-28 03:08:17] 
🔬 Testing configuration 22/264
[2025-05-28 03:08:17] Config: LR=2e-05, BS=4, Epochs=7, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 554.96 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 539.38 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 599.04 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 622.08 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 671.18 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 731.50 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 746.04 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 796.24 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 817.65 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 828.03 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 740.19 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 544.35 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 546.21 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 538.97 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.8324042558670044, 'eval_accuracy': 0.2240000069141388, 'eval_runtime': 2.2929, 'eval_samples_per_second': 54.517, 'eval_steps_per_second': 6.978, 'epoch': 0.987012987012987}


{'loss': 1.668, 'grad_norm': 10.328585624694824, 'learning_rate': 1.8788171126619653e-05, 'epoch': 1.7316017316017316}


{'eval_loss': 1.0910381078720093, 'eval_accuracy': 0.6639999747276306, 'eval_runtime': 2.5204, 'eval_samples_per_second': 49.595, 'eval_steps_per_second': 6.348, 'epoch': 1.9913419913419914}


Non-default generation parameters: {'forced_eos_token_id': 2}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7168577313423157, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 2.5359, 'eval_samples_per_second': 49.292, 'eval_steps_per_second': 6.309, 'epoch': 2.995670995670996}


{'loss': 0.6813, 'grad_norm': 0.8647657036781311, 'learning_rate': 1.1993679344171973e-05, 'epoch': 3.463203463203463}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7034302949905396, 'eval_accuracy': 0.9039999842643738, 'eval_runtime': 2.5666, 'eval_samples_per_second': 48.703, 'eval_steps_per_second': 6.234, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.763731062412262, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 2.5258, 'eval_samples_per_second': 49.488, 'eval_steps_per_second': 6.335, 'epoch': 4.987012987012987}


{'loss': 0.4979, 'grad_norm': 1.6456273794174194, 'learning_rate': 3.774853633623806e-06, 'epoch': 5.194805194805195}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7470278143882751, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 2.5313, 'eval_samples_per_second': 49.382, 'eval_steps_per_second': 6.321, 'epoch': 5.991341991341991}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7297943234443665, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 2.5406, 'eval_samples_per_second': 49.201, 'eval_steps_per_second': 6.298, 'epoch': 6.909090909090909}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 691.7408, 'train_samples_per_second': 9.35, 'train_steps_per_second': 0.577, 'train_loss': 0.8278950186899132, 'epoch': 6.909090909090909}


{'eval_loss': 0.7034302949905396, 'eval_accuracy': 0.9039999842643738, 'eval_runtime': 2.264, 'eval_samples_per_second': 55.212, 'eval_steps_per_second': 7.067, 'epoch': 6.909090909090909}
[2025-05-28 03:20:01] ✅ Config 22: Accuracy=0.9040, Loss=0.7034


[2025-05-28 03:20:02] 
🔬 Testing configuration 23/264
[2025-05-28 03:20:02] Config: LR=2e-05, BS=4, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 563.45 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 549.43 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 608.67 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 627.95 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 675.43 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 736.24 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 752.94 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 803.72 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 824.63 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 834.37 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 747.31 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 544.82 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 546.27 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 540.05 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.8547754287719727, 'eval_accuracy': 0.2800000011920929, 'eval_runtime': 2.279, 'eval_samples_per_second': 54.85, 'eval_steps_per_second': 7.021, 'epoch': 0.987012987012987}


{'loss': 1.7105, 'grad_norm': 15.6574068069458, 'learning_rate': 1.6833333333333334e-05, 'epoch': 1.7316017316017316}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.9873563647270203, 'eval_accuracy': 0.7360000014305115, 'eval_runtime': 2.5306, 'eval_samples_per_second': 49.395, 'eval_steps_per_second': 6.323, 'epoch': 1.9913419913419914}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7645434737205505, 'eval_accuracy': 0.8240000009536743, 'eval_runtime': 2.544, 'eval_samples_per_second': 49.135, 'eval_steps_per_second': 6.289, 'epoch': 2.995670995670996}


{'loss': 0.7623, 'grad_norm': 11.430150985717773, 'learning_rate': 1.138888888888889e-05, 'epoch': 3.463203463203463}


{'eval_loss': 0.715513288974762, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 2.5299, 'eval_samples_per_second': 49.409, 'eval_steps_per_second': 6.324, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7197935581207275, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 2.5434, 'eval_samples_per_second': 49.147, 'eval_steps_per_second': 6.291, 'epoch': 4.987012987012987}


{'loss': 0.546, 'grad_norm': 1.6248133182525635, 'learning_rate': 5.833333333333334e-06, 'epoch': 5.194805194805195}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7110660076141357, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 2.5287, 'eval_samples_per_second': 49.432, 'eval_steps_per_second': 6.327, 'epoch': 5.991341991341991}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6981639266014099, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 2.5422, 'eval_samples_per_second': 49.17, 'eval_steps_per_second': 6.294, 'epoch': 6.909090909090909}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 706.5651, 'train_samples_per_second': 9.154, 'train_steps_per_second': 0.565, 'train_loss': 0.8753072121985873, 'epoch': 6.909090909090909}


{'eval_loss': 0.715513288974762, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 2.2657, 'eval_samples_per_second': 55.17, 'eval_steps_per_second': 7.062, 'epoch': 6.909090909090909}
[2025-05-28 03:32:01] ✅ Config 23: Accuracy=0.8800, Loss=0.7155


[2025-05-28 03:32:01] 
🔬 Testing configuration 24/264
[2025-05-28 03:32:01] Config: LR=2e-05, BS=4, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 570.56 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 551.16 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 609.41 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 628.66 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 676.59 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 731.13 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 746.95 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 798.51 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 819.96 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 831.17 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 744.91 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 535.64 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 540.86 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 533.26 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.8132, 'grad_norm': 10.589536666870117, 'learning_rate': 1.9978883431348845e-05, 'epoch': 0.8658008658008658}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.80047607421875, 'eval_accuracy': 0.30399999022483826, 'eval_runtime': 2.2785, 'eval_samples_per_second': 54.86, 'eval_steps_per_second': 7.022, 'epoch': 0.9956709956709957}


{'loss': 1.2573, 'grad_norm': 9.459699630737305, 'learning_rate': 1.8804488195001394e-05, 'epoch': 1.7316017316017316}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8300450444221497, 'eval_accuracy': 0.8240000009536743, 'eval_runtime': 2.5315, 'eval_samples_per_second': 49.378, 'eval_steps_per_second': 6.32, 'epoch': 2.0}


{'loss': 0.6491, 'grad_norm': 9.626267433166504, 'learning_rate': 1.599986984950065e-05, 'epoch': 2.5974025974025974}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7767783403396606, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 2.5283, 'eval_samples_per_second': 49.44, 'eval_steps_per_second': 6.328, 'epoch': 2.995670995670996}


{'loss': 0.5445, 'grad_norm': 12.370176315307617, 'learning_rate': 1.2086180598261956e-05, 'epoch': 3.463203463203463}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7656075954437256, 'eval_accuracy': 0.8320000171661377, 'eval_runtime': 2.521, 'eval_samples_per_second': 49.583, 'eval_steps_per_second': 6.347, 'epoch': 4.0}


{'loss': 0.4671, 'grad_norm': 0.33927518129348755, 'learning_rate': 7.786862614453356e-06, 'epoch': 4.329004329004329}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7799916863441467, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 2.5526, 'eval_samples_per_second': 48.97, 'eval_steps_per_second': 6.268, 'epoch': 4.995670995670996}


{'loss': 0.4492, 'grad_norm': 3.124143362045288, 'learning_rate': 3.896641218784081e-06, 'epoch': 5.194805194805195}


{'eval_loss': 0.7439541816711426, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 2.5323, 'eval_samples_per_second': 49.361, 'eval_steps_per_second': 6.318, 'epoch': 6.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'loss': 0.4394, 'grad_norm': 1.251820683479309, 'learning_rate': 1.1346205710177304e-06, 'epoch': 6.0606060606060606}


{'loss': 0.4387, 'grad_norm': 0.302724689245224, 'learning_rate': 1.135786961421248e-08, 'epoch': 6.926406926406926}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.737642228603363, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 2.5309, 'eval_samples_per_second': 49.39, 'eval_steps_per_second': 6.322, 'epoch': 6.96969696969697}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 751.5933, 'train_samples_per_second': 8.606, 'train_steps_per_second': 1.071, 'train_loss': 0.7552432400839669, 'epoch': 6.96969696969697}


{'eval_loss': 0.7439541816711426, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 2.3188, 'eval_samples_per_second': 53.906, 'eval_steps_per_second': 6.9, 'epoch': 6.96969696969697}
[2025-05-28 03:44:46] ✅ Config 24: Accuracy=0.8640, Loss=0.7440


[2025-05-28 03:44:47] 
🔬 Testing configuration 25/264
[2025-05-28 03:44:47] Config: LR=2e-05, BS=8, Epochs=5, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 574.78 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 552.99 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 613.01 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 633.74 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 680.77 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 740.74 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 752.10 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 802.31 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 824.05 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 837.03 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 749.56 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 543.99 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 545.00 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 537.75 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8322283029556274, 'eval_accuracy': 0.8159999847412109, 'eval_runtime': 1.8364, 'eval_samples_per_second': 68.068, 'eval_steps_per_second': 4.356, 'epoch': 1.0}


{'loss': 0.9963, 'grad_norm': 8.573625564575195, 'learning_rate': 1.324137931034483e-05, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7690874338150024, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 2.0038, 'eval_samples_per_second': 62.382, 'eval_steps_per_second': 3.992, 'epoch': 2.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7372393012046814, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 2.0353, 'eval_samples_per_second': 61.416, 'eval_steps_per_second': 3.931, 'epoch': 3.0}


{'loss': 0.493, 'grad_norm': 1.5507564544677734, 'learning_rate': 6.413793103448276e-06, 'epoch': 3.4482758620689653}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7140746712684631, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 2.001, 'eval_samples_per_second': 62.467, 'eval_steps_per_second': 3.998, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6995121836662292, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 1.957, 'eval_samples_per_second': 63.872, 'eval_steps_per_second': 4.088, 'epoch': 5.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 426.465, 'train_samples_per_second': 10.833, 'train_steps_per_second': 0.68, 'train_loss': 0.6462728303054284, 'epoch': 5.0}


{'eval_loss': 0.7372393012046814, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 1.7936, 'eval_samples_per_second': 69.693, 'eval_steps_per_second': 4.46, 'epoch': 5.0}
[2025-05-28 03:52:06] ✅ Config 25: Accuracy=0.8800, Loss=0.7372


[2025-05-28 03:52:06] 
🔬 Testing configuration 26/264
[2025-05-28 03:52:06] Config: LR=2e-05, BS=8, Epochs=5, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 669.21 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 664.96 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 750.66 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 793.77 examples/s]


Map:  52%|█████▏    | 480/924 [00:00<00:00, 862.82 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 906.96 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 989.94 examples/s]


Map:  92%|█████████▏| 848/924 [00:00<00:00, 1003.70 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 909.86 examples/s] 





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 651.44 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 650.50 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.5683, 'grad_norm': 64.1813735961914, 'learning_rate': 1.873156865940823e-05, 'epoch': 0.8620689655172413}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6034848093986511, 'eval_accuracy': 0.7919999957084656, 'eval_runtime': 1.6685, 'eval_samples_per_second': 74.917, 'eval_steps_per_second': 4.795, 'epoch': 1.0}


{'loss': 0.4694, 'grad_norm': 20.206953048706055, 'learning_rate': 1.4968695066393923e-05, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.4937771260738373, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 1.9063, 'eval_samples_per_second': 65.574, 'eval_steps_per_second': 4.197, 'epoch': 2.0}


{'loss': 0.2003, 'grad_norm': 2.975252866744995, 'learning_rate': 9.783355388248026e-06, 'epoch': 2.586206896551724}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6323623061180115, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 1.8968, 'eval_samples_per_second': 65.901, 'eval_steps_per_second': 4.218, 'epoch': 3.0}


{'loss': 0.1523, 'grad_norm': 23.142133712768555, 'learning_rate': 4.6600379530906395e-06, 'epoch': 3.4482758620689653}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.5908001661300659, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 1.9071, 'eval_samples_per_second': 65.545, 'eval_steps_per_second': 4.195, 'epoch': 4.0}


{'loss': 0.0407, 'grad_norm': 0.1263047754764557, 'learning_rate': 1.0654750110382627e-06, 'epoch': 4.310344827586207}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.5678092241287231, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 1.9143, 'eval_samples_per_second': 65.299, 'eval_steps_per_second': 4.179, 'epoch': 5.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 403.9569, 'train_samples_per_second': 11.437, 'train_steps_per_second': 1.436, 'train_loss': 0.4295798942960542, 'epoch': 5.0}


{'eval_loss': 0.5678092241287231, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 1.887, 'eval_samples_per_second': 66.243, 'eval_steps_per_second': 4.24, 'epoch': 5.0}
[2025-05-28 03:59:01] ✅ Config 26: Accuracy=0.8720, Loss=0.5678


[2025-05-28 03:59:01] 
🔬 Testing configuration 27/264
[2025-05-28 03:59:01] Config: LR=2e-05, BS=8, Epochs=5, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  10%|█         | 96/924 [00:00<00:01, 650.80 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  21%|██        | 192/924 [00:00<00:01, 729.82 examples/s]


Map:  31%|███       | 288/924 [00:00<00:00, 777.07 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  43%|████▎     | 400/924 [00:00<00:00, 820.18 examples/s]


Map:  57%|█████▋    | 528/924 [00:00<00:00, 915.62 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  69%|██████▉   | 640/924 [00:00<00:00, 940.51 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  83%|████████▎ | 768/924 [00:00<00:00, 989.75 examples/s]


Map:  97%|█████████▋| 896/924 [00:00<00:00, 1009.53 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 916.72 examples/s] 





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 649.79 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 649.36 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.3365, 'grad_norm': 7.254728317260742, 'learning_rate': 1.6620689655172414e-05, 'epoch': 0.8620689655172413}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.845637321472168, 'eval_accuracy': 0.7839999794960022, 'eval_runtime': 1.7158, 'eval_samples_per_second': 72.852, 'eval_steps_per_second': 4.663, 'epoch': 1.0}


{'loss': 0.59, 'grad_norm': 6.2082414627075195, 'learning_rate': 1.3172413793103449e-05, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7673019766807556, 'eval_accuracy': 0.8240000009536743, 'eval_runtime': 1.9591, 'eval_samples_per_second': 63.805, 'eval_steps_per_second': 4.083, 'epoch': 2.0}


{'loss': 0.4713, 'grad_norm': 2.822028398513794, 'learning_rate': 9.724137931034484e-06, 'epoch': 2.586206896551724}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7138860821723938, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 1.9447, 'eval_samples_per_second': 64.277, 'eval_steps_per_second': 4.114, 'epoch': 3.0}


{'loss': 0.4432, 'grad_norm': 2.180914878845215, 'learning_rate': 6.310344827586207e-06, 'epoch': 3.4482758620689653}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7244721055030823, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 1.9496, 'eval_samples_per_second': 64.117, 'eval_steps_per_second': 4.104, 'epoch': 4.0}


{'loss': 0.4244, 'grad_norm': 0.2131538838148117, 'learning_rate': 2.8620689655172416e-06, 'epoch': 4.310344827586207}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.710768461227417, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 1.947, 'eval_samples_per_second': 64.2, 'eval_steps_per_second': 4.109, 'epoch': 5.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 435.124, 'train_samples_per_second': 10.618, 'train_steps_per_second': 1.333, 'train_loss': 0.6213612720884126, 'epoch': 5.0}


{'eval_loss': 0.710768461227417, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 2.0095, 'eval_samples_per_second': 62.204, 'eval_steps_per_second': 3.981, 'epoch': 5.0}
[2025-05-28 04:06:29] ✅ Config 27: Accuracy=0.8800, Loss=0.7108


[2025-05-28 04:06:29] 
🔬 Testing configuration 28/264
[2025-05-28 04:06:29] Config: LR=2e-05, BS=8, Epochs=5, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  10%|█         | 96/924 [00:00<00:01, 653.51 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  21%|██        | 192/924 [00:00<00:01, 723.73 examples/s]


Map:  31%|███       | 288/924 [00:00<00:00, 773.40 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  42%|████▏     | 384/924 [00:00<00:00, 809.86 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 879.79 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  68%|██████▊   | 624/924 [00:00<00:00, 929.92 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  81%|████████▏ | 752/924 [00:00<00:00, 994.39 examples/s]


Map:  94%|█████████▎| 864/924 [00:00<00:00, 1001.07 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 911.53 examples/s] 





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 653.10 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 647.55 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.9260183572769165, 'eval_accuracy': 0.7440000176429749, 'eval_runtime': 1.7208, 'eval_samples_per_second': 72.642, 'eval_steps_per_second': 4.649, 'epoch': 1.0}


{'loss': 1.0633, 'grad_norm': 7.574601650238037, 'learning_rate': 1.4779519235862365e-05, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8082476258277893, 'eval_accuracy': 0.7919999957084656, 'eval_runtime': 1.9535, 'eval_samples_per_second': 63.987, 'eval_steps_per_second': 4.095, 'epoch': 2.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6902778148651123, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 1.9489, 'eval_samples_per_second': 64.14, 'eval_steps_per_second': 4.105, 'epoch': 3.0}


{'loss': 0.4808, 'grad_norm': 1.6907426118850708, 'learning_rate': 4.568760825195671e-06, 'epoch': 3.4482758620689653}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6711874604225159, 'eval_accuracy': 0.8960000276565552, 'eval_runtime': 1.9556, 'eval_samples_per_second': 63.918, 'eval_steps_per_second': 4.091, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6705294251441956, 'eval_accuracy': 0.8880000114440918, 'eval_runtime': 1.9474, 'eval_samples_per_second': 64.187, 'eval_steps_per_second': 4.108, 'epoch': 5.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 409.4544, 'train_samples_per_second': 11.283, 'train_steps_per_second': 0.708, 'train_loss': 0.6660842500883958, 'epoch': 5.0}


{'eval_loss': 0.6711874604225159, 'eval_accuracy': 0.8960000276565552, 'eval_runtime': 1.7875, 'eval_samples_per_second': 69.929, 'eval_steps_per_second': 4.475, 'epoch': 5.0}
[2025-05-28 04:13:30] ✅ Config 28: Accuracy=0.8960, Loss=0.6712


[2025-05-28 04:13:30] 
🔬 Testing configuration 29/264
[2025-05-28 04:13:30] Config: LR=2e-05, BS=8, Epochs=7, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 670.92 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 681.96 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 761.00 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 806.65 examples/s]


Map:  52%|█████▏    | 480/924 [00:00<00:00, 872.73 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 915.06 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 994.47 examples/s]


Map:  90%|█████████ | 832/924 [00:00<00:00, 996.47 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 915.22 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 655.14 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 653.81 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.8084, 'grad_norm': 25.269489288330078, 'learning_rate': 1.9617486338797815e-05, 'epoch': 0.8620689655172413}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.6556251049041748, 'eval_accuracy': 0.328000009059906, 'eval_runtime': 2.1709, 'eval_samples_per_second': 57.579, 'eval_steps_per_second': 3.685, 'epoch': 1.0}


{'loss': 0.9274, 'grad_norm': 22.788057327270508, 'learning_rate': 1.6885245901639347e-05, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.548067569732666, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 1.908, 'eval_samples_per_second': 65.515, 'eval_steps_per_second': 4.193, 'epoch': 2.0}


{'loss': 0.3443, 'grad_norm': 1.338545322418213, 'learning_rate': 1.4207650273224044e-05, 'epoch': 2.586206896551724}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.5291213989257812, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 1.8935, 'eval_samples_per_second': 66.017, 'eval_steps_per_second': 4.225, 'epoch': 3.0}


{'loss': 0.2541, 'grad_norm': 18.131675720214844, 'learning_rate': 1.1475409836065575e-05, 'epoch': 3.4482758620689653}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.562415361404419, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 1.909, 'eval_samples_per_second': 65.478, 'eval_steps_per_second': 4.191, 'epoch': 4.0}


{'loss': 0.1025, 'grad_norm': 9.375459671020508, 'learning_rate': 8.743169398907103e-06, 'epoch': 4.310344827586207}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6563127040863037, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 1.8983, 'eval_samples_per_second': 65.849, 'eval_steps_per_second': 4.214, 'epoch': 5.0}


{'loss': 0.0834, 'grad_norm': 0.6058176755905151, 'learning_rate': 6.010928961748635e-06, 'epoch': 5.172413793103448}


{'eval_loss': 0.6699298620223999, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 1.9067, 'eval_samples_per_second': 65.559, 'eval_steps_per_second': 4.196, 'epoch': 6.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'loss': 0.0566, 'grad_norm': 51.107723236083984, 'learning_rate': 3.2786885245901638e-06, 'epoch': 6.0344827586206895}


{'loss': 0.0234, 'grad_norm': 0.045464545488357544, 'learning_rate': 5.46448087431694e-07, 'epoch': 6.896551724137931}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6317179799079895, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 1.8955, 'eval_samples_per_second': 65.946, 'eval_steps_per_second': 4.221, 'epoch': 7.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 569.4516, 'train_samples_per_second': 11.358, 'train_steps_per_second': 1.426, 'train_loss': 0.44341498833571746, 'epoch': 7.0}


{'eval_loss': 0.562415361404419, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 1.7301, 'eval_samples_per_second': 72.25, 'eval_steps_per_second': 4.624, 'epoch': 7.0}
[2025-05-28 04:23:11] ✅ Config 29: Accuracy=0.8640, Loss=0.5624


[2025-05-28 04:23:11] 
🔬 Testing configuration 30/264
[2025-05-28 04:23:11] Config: LR=2e-05, BS=8, Epochs=7, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  10%|█         | 96/924 [00:00<00:01, 651.38 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  21%|██        | 192/924 [00:00<00:01, 728.75 examples/s]


Map:  31%|███       | 288/924 [00:00<00:00, 773.42 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  43%|████▎     | 400/924 [00:00<00:00, 817.21 examples/s]


Map:  57%|█████▋    | 528/924 [00:00<00:00, 906.65 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  69%|██████▉   | 640/924 [00:00<00:00, 936.13 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  83%|████████▎ | 768/924 [00:00<00:00, 984.86 examples/s]


Map:  97%|█████████▋| 896/924 [00:00<00:00, 1011.92 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 915.06 examples/s] 





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 647.78 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 648.15 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.3631, 'grad_norm': 11.400151252746582, 'learning_rate': 1.9966771240757915e-05, 'epoch': 0.8620689655172413}


{'eval_loss': 0.8949151635169983, 'eval_accuracy': 0.7680000066757202, 'eval_runtime': 1.7113, 'eval_samples_per_second': 73.045, 'eval_steps_per_second': 4.675, 'epoch': 1.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'loss': 0.6279, 'grad_norm': 13.325326919555664, 'learning_rate': 1.8765546742763972e-05, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6983413696289062, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 1.9486, 'eval_samples_per_second': 64.147, 'eval_steps_per_second': 4.105, 'epoch': 2.0}


{'loss': 0.4929, 'grad_norm': 0.44403398036956787, 'learning_rate': 1.5967760186090896e-05, 'epoch': 2.586206896551724}


{'eval_loss': 0.7613195180892944, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 2.09, 'eval_samples_per_second': 59.809, 'eval_steps_per_second': 3.828, 'epoch': 3.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'loss': 0.4317, 'grad_norm': 0.5013845562934875, 'learning_rate': 1.2087512155747418e-05, 'epoch': 3.4482758620689653}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8062395453453064, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 2.1369, 'eval_samples_per_second': 58.497, 'eval_steps_per_second': 3.744, 'epoch': 4.0}


{'loss': 0.4265, 'grad_norm': 0.14194080233573914, 'learning_rate': 7.828620980580217e-06, 'epoch': 4.310344827586207}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7547003626823425, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 2.1144, 'eval_samples_per_second': 59.12, 'eval_steps_per_second': 3.784, 'epoch': 5.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 459.9553, 'train_samples_per_second': 14.062, 'train_steps_per_second': 1.765, 'train_loss': 0.6345377691860856, 'epoch': 5.0}


{'eval_loss': 0.6983413696289062, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 1.8306, 'eval_samples_per_second': 68.284, 'eval_steps_per_second': 4.37, 'epoch': 5.0}
[2025-05-28 04:31:02] ✅ Config 30: Accuracy=0.8800, Loss=0.6983


[2025-05-28 04:31:03] 💾 Saved checkpoint at 30 configurations
[2025-05-28 04:31:03] 
🔬 Testing configuration 31/264
[2025-05-28 04:31:03] Config: LR=2e-05, BS=8, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 556.35 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 540.46 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 596.88 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:01, 615.18 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 660.67 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 718.35 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 730.56 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 779.59 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 800.28 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 810.51 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 727.87 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 532.81 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 534.36 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 527.43 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.3201, 'grad_norm': 6.4170379638671875, 'learning_rate': 1.7561576354679803e-05, 'epoch': 0.8620689655172413}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8557131886482239, 'eval_accuracy': 0.7680000066757202, 'eval_runtime': 1.8488, 'eval_samples_per_second': 67.612, 'eval_steps_per_second': 4.327, 'epoch': 1.0}


{'loss': 0.5852, 'grad_norm': 4.790968894958496, 'learning_rate': 1.5123152709359607e-05, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.734782874584198, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 2.0971, 'eval_samples_per_second': 59.606, 'eval_steps_per_second': 3.815, 'epoch': 2.0}


{'loss': 0.4752, 'grad_norm': 9.072504997253418, 'learning_rate': 1.268472906403941e-05, 'epoch': 2.586206896551724}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6822165250778198, 'eval_accuracy': 0.8960000276565552, 'eval_runtime': 2.0925, 'eval_samples_per_second': 59.738, 'eval_steps_per_second': 3.823, 'epoch': 3.0}


{'loss': 0.44, 'grad_norm': 0.3461359441280365, 'learning_rate': 1.0221674876847292e-05, 'epoch': 3.4482758620689653}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6802042722702026, 'eval_accuracy': 0.9039999842643738, 'eval_runtime': 2.0974, 'eval_samples_per_second': 59.596, 'eval_steps_per_second': 3.814, 'epoch': 4.0}


{'loss': 0.4269, 'grad_norm': 0.1985812783241272, 'learning_rate': 7.758620689655173e-06, 'epoch': 4.310344827586207}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.662970781326294, 'eval_accuracy': 0.9120000004768372, 'eval_runtime': 2.0986, 'eval_samples_per_second': 59.564, 'eval_steps_per_second': 3.812, 'epoch': 5.0}


{'loss': 0.4229, 'grad_norm': 0.17171791195869446, 'learning_rate': 5.295566502463054e-06, 'epoch': 5.172413793103448}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6618773341178894, 'eval_accuracy': 0.9039999842643738, 'eval_runtime': 2.1001, 'eval_samples_per_second': 59.522, 'eval_steps_per_second': 3.809, 'epoch': 6.0}


{'loss': 0.4226, 'grad_norm': 0.17950865626335144, 'learning_rate': 2.832512315270936e-06, 'epoch': 6.0344827586206895}


{'loss': 0.4224, 'grad_norm': 0.11514165997505188, 'learning_rate': 3.694581280788178e-07, 'epoch': 6.896551724137931}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6643818616867065, 'eval_accuracy': 0.8960000276565552, 'eval_runtime': 2.0867, 'eval_samples_per_second': 59.905, 'eval_steps_per_second': 3.834, 'epoch': 7.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 646.5197, 'train_samples_per_second': 10.004, 'train_steps_per_second': 1.256, 'train_loss': 0.5623122829521818, 'epoch': 7.0}


{'eval_loss': 0.662970781326294, 'eval_accuracy': 0.9120000004768372, 'eval_runtime': 1.8761, 'eval_samples_per_second': 66.629, 'eval_steps_per_second': 4.264, 'epoch': 7.0}
[2025-05-28 04:42:10] ✅ Config 31: Accuracy=0.9120, Loss=0.6630


[2025-05-28 04:42:11] 
🔬 Testing configuration 32/264
[2025-05-28 04:42:11] Config: LR=2e-05, BS=8, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 564.94 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 544.31 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 599.53 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 620.85 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 666.07 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 723.06 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 737.65 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 784.75 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 806.68 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 815.85 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 733.14 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 535.60 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 534.80 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 527.95 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.4505, 'grad_norm': 9.387453079223633, 'learning_rate': 1.9318129768176033e-05, 'epoch': 0.8620689655172413}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8083540201187134, 'eval_accuracy': 0.8240000009536743, 'eval_runtime': 1.8284, 'eval_samples_per_second': 68.367, 'eval_steps_per_second': 4.375, 'epoch': 1.0}


{'loss': 0.7185, 'grad_norm': 12.927547454833984, 'learning_rate': 1.728650730913312e-05, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8333216905593872, 'eval_accuracy': 0.8320000171661377, 'eval_runtime': 2.0959, 'eval_samples_per_second': 59.641, 'eval_steps_per_second': 3.817, 'epoch': 2.0}


{'loss': 0.5731, 'grad_norm': 4.4928436279296875, 'learning_rate': 1.4163745979093997e-05, 'epoch': 2.586206896551724}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7994915843009949, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 2.1017, 'eval_samples_per_second': 59.475, 'eval_steps_per_second': 3.806, 'epoch': 3.0}


{'loss': 0.534, 'grad_norm': 10.037590980529785, 'learning_rate': 1.0464108002827881e-05, 'epoch': 3.4482758620689653}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7505635619163513, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 2.1047, 'eval_samples_per_second': 59.391, 'eval_steps_per_second': 3.801, 'epoch': 4.0}


{'loss': 0.45, 'grad_norm': 0.236580029129982, 'learning_rate': 6.660715755793154e-06, 'epoch': 4.310344827586207}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7576956152915955, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 2.1148, 'eval_samples_per_second': 59.106, 'eval_steps_per_second': 3.783, 'epoch': 5.0}


{'loss': 0.4592, 'grad_norm': 0.2680852711200714, 'learning_rate': 3.350970964900998e-06, 'epoch': 5.172413793103448}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7475230097770691, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 2.0983, 'eval_samples_per_second': 59.571, 'eval_steps_per_second': 3.813, 'epoch': 6.0}


{'loss': 0.4411, 'grad_norm': 0.5211888551712036, 'learning_rate': 1.0241542594777576e-06, 'epoch': 6.0344827586206895}


{'loss': 0.443, 'grad_norm': 0.3267986476421356, 'learning_rate': 2.4239696100050168e-08, 'epoch': 6.896551724137931}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7525643110275269, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 2.0969, 'eval_samples_per_second': 59.612, 'eval_steps_per_second': 3.815, 'epoch': 7.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 640.7217, 'train_samples_per_second': 10.095, 'train_steps_per_second': 1.267, 'train_loss': 0.6306038677986032, 'epoch': 7.0}


{'eval_loss': 0.7576956152915955, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 1.9023, 'eval_samples_per_second': 65.711, 'eval_steps_per_second': 4.206, 'epoch': 7.0}
[2025-05-28 04:53:04] ✅ Config 32: Accuracy=0.8720, Loss=0.7577


[2025-05-28 04:53:04] 
🔬 Testing configuration 33/264
[2025-05-28 04:53:04] Config: LR=3e-05, BS=4, Epochs=5, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 558.84 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 543.60 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 600.35 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:01, 618.26 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 664.82 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 723.23 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 736.93 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 787.09 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 807.19 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 816.36 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 733.43 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 532.82 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 537.14 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 529.77 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.5336432456970215, 'eval_accuracy': 0.7599999904632568, 'eval_runtime': 2.2414, 'eval_samples_per_second': 55.768, 'eval_steps_per_second': 7.138, 'epoch': 0.987012987012987}


{'loss': 0.7284, 'grad_norm': 0.2926487624645233, 'learning_rate': 2.171206225680934e-05, 'epoch': 1.7316017316017316}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.36056622862815857, 'eval_accuracy': 0.9039999842643738, 'eval_runtime': 2.5049, 'eval_samples_per_second': 49.903, 'eval_steps_per_second': 6.388, 'epoch': 1.9913419913419914}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.44053053855895996, 'eval_accuracy': 0.8880000114440918, 'eval_runtime': 2.4976, 'eval_samples_per_second': 50.048, 'eval_steps_per_second': 6.406, 'epoch': 2.995670995670996}


{'loss': 0.062, 'grad_norm': 0.09439169615507126, 'learning_rate': 1.0155642023346305e-05, 'epoch': 3.463203463203463}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.545962929725647, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 2.495, 'eval_samples_per_second': 50.1, 'eval_steps_per_second': 6.413, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.5025396347045898, 'eval_accuracy': 0.8880000114440918, 'eval_runtime': 2.4921, 'eval_samples_per_second': 50.159, 'eval_steps_per_second': 6.42, 'epoch': 4.935064935064935}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 524.009, 'train_samples_per_second': 8.817, 'train_steps_per_second': 0.544, 'train_loss': 0.27845900560680187, 'epoch': 4.935064935064935}


{'eval_loss': 0.36056622862815857, 'eval_accuracy': 0.9039999842643738, 'eval_runtime': 2.2399, 'eval_samples_per_second': 55.807, 'eval_steps_per_second': 7.143, 'epoch': 4.935064935064935}
[2025-05-28 05:02:02] ✅ Config 33: Accuracy=0.9040, Loss=0.3606


[2025-05-28 05:02:02] 
🔬 Testing configuration 34/264
[2025-05-28 05:02:02] Config: LR=3e-05, BS=4, Epochs=5, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 556.43 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 540.49 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 596.03 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:01, 615.65 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 661.46 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 718.93 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 734.52 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 784.20 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 805.39 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 815.63 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 731.04 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 537.05 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 538.28 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 532.06 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.7408, 'grad_norm': 154.85887145996094, 'learning_rate': 2.9623918682727355e-05, 'epoch': 0.8658008658008658}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.0874687433242798, 'eval_accuracy': 0.6320000290870667, 'eval_runtime': 2.3084, 'eval_samples_per_second': 54.151, 'eval_steps_per_second': 6.931, 'epoch': 0.9956709956709957}


{'loss': 0.8903, 'grad_norm': 19.6109619140625, 'learning_rate': 2.5180340982323628e-05, 'epoch': 1.7316017316017316}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8581228852272034, 'eval_accuracy': 0.8159999847412109, 'eval_runtime': 2.571, 'eval_samples_per_second': 48.619, 'eval_steps_per_second': 6.223, 'epoch': 2.0}


{'loss': 0.6188, 'grad_norm': 4.358255863189697, 'learning_rate': 1.7085594552789776e-05, 'epoch': 2.5974025974025974}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8220235705375671, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 2.5526, 'eval_samples_per_second': 48.971, 'eval_steps_per_second': 6.268, 'epoch': 2.995670995670996}


{'loss': 0.5136, 'grad_norm': 0.45063188672065735, 'learning_rate': 8.246943827734899e-06, 'epoch': 3.463203463203463}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.742453396320343, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 2.5604, 'eval_samples_per_second': 48.82, 'eval_steps_per_second': 6.249, 'epoch': 4.0}


{'loss': 0.4509, 'grad_norm': 0.5995374917984009, 'learning_rate': 1.8170199032573138e-06, 'epoch': 4.329004329004329}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7286107540130615, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 2.5563, 'eval_samples_per_second': 48.898, 'eval_steps_per_second': 6.259, 'epoch': 4.978354978354979}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 566.2879, 'train_samples_per_second': 8.158, 'train_steps_per_second': 1.015, 'train_loss': 0.7926371101711107, 'epoch': 4.978354978354979}


{'eval_loss': 0.742453396320343, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 2.3618, 'eval_samples_per_second': 52.925, 'eval_steps_per_second': 6.774, 'epoch': 4.978354978354979}
[2025-05-28 05:11:42] ✅ Config 34: Accuracy=0.8640, Loss=0.7425


[2025-05-28 05:11:42] 
🔬 Testing configuration 35/264
[2025-05-28 05:11:42] Config: LR=3e-05, BS=4, Epochs=5, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 553.65 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 539.28 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 598.32 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:01, 617.56 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 662.94 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 721.37 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 734.64 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 786.46 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 807.03 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 807.35 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 729.44 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 531.74 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 537.35 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 530.18 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.87856125831604, 'eval_accuracy': 0.14399999380111694, 'eval_runtime': 2.3252, 'eval_samples_per_second': 53.759, 'eval_steps_per_second': 6.881, 'epoch': 0.987012987012987}


{'loss': 1.8263, 'grad_norm': 5.013401031494141, 'learning_rate': 2.2178988326848252e-05, 'epoch': 1.7316017316017316}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.815914273262024, 'eval_accuracy': 0.2879999876022339, 'eval_runtime': 2.5609, 'eval_samples_per_second': 48.81, 'eval_steps_per_second': 6.248, 'epoch': 1.9913419913419914}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.9093501567840576, 'eval_accuracy': 0.7839999794960022, 'eval_runtime': 2.5507, 'eval_samples_per_second': 49.007, 'eval_steps_per_second': 6.273, 'epoch': 2.995670995670996}


{'loss': 1.2645, 'grad_norm': 4.906634330749512, 'learning_rate': 1.0505836575875485e-05, 'epoch': 3.463203463203463}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8591588735580444, 'eval_accuracy': 0.8320000171661377, 'eval_runtime': 2.562, 'eval_samples_per_second': 48.789, 'eval_steps_per_second': 6.245, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7867433428764343, 'eval_accuracy': 0.8320000171661377, 'eval_runtime': 2.5558, 'eval_samples_per_second': 48.908, 'eval_steps_per_second': 6.26, 'epoch': 4.935064935064935}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 572.1296, 'train_samples_per_second': 8.075, 'train_steps_per_second': 0.498, 'train_loss': 1.2668320772940653, 'epoch': 4.935064935064935}


{'eval_loss': 0.8591588735580444, 'eval_accuracy': 0.8320000171661377, 'eval_runtime': 2.3648, 'eval_samples_per_second': 52.858, 'eval_steps_per_second': 6.766, 'epoch': 4.935064935064935}
[2025-05-28 05:21:27] ✅ Config 35: Accuracy=0.8320, Loss=0.8592


[2025-05-28 05:21:27] 
🔬 Testing configuration 36/264
[2025-05-28 05:21:27] Config: LR=3e-05, BS=4, Epochs=5, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 555.42 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 539.64 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 594.12 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:01, 614.64 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 659.88 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 716.97 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 731.45 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 780.61 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 799.79 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 811.74 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 728.27 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 528.99 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 532.27 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 524.01 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.5256, 'grad_norm': 32.56514358520508, 'learning_rate': 2.9582357195506187e-05, 'epoch': 0.8658008658008658}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.5851114988327026, 'eval_accuracy': 0.8320000171661377, 'eval_runtime': 2.2391, 'eval_samples_per_second': 55.827, 'eval_steps_per_second': 7.146, 'epoch': 0.9956709956709957}


{'loss': 0.4568, 'grad_norm': 77.23438262939453, 'learning_rate': 2.5045969937572945e-05, 'epoch': 1.7316017316017316}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.5745824575424194, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 2.5068, 'eval_samples_per_second': 49.864, 'eval_steps_per_second': 6.383, 'epoch': 2.0}


{'loss': 0.264, 'grad_norm': 0.6589715480804443, 'learning_rate': 1.6905267296203182e-05, 'epoch': 2.5974025974025974}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7359622716903687, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 2.4907, 'eval_samples_per_second': 50.187, 'eval_steps_per_second': 6.424, 'epoch': 2.995670995670996}


{'loss': 0.1577, 'grad_norm': 5.016371250152588, 'learning_rate': 8.084980732893569e-06, 'epoch': 3.463203463203463}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.786484956741333, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 2.488, 'eval_samples_per_second': 50.242, 'eval_steps_per_second': 6.431, 'epoch': 4.0}


{'loss': 0.0331, 'grad_norm': 0.013462643139064312, 'learning_rate': 1.731191087467685e-06, 'epoch': 4.329004329004329}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7165157198905945, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 2.4983, 'eval_samples_per_second': 50.034, 'eval_steps_per_second': 6.404, 'epoch': 4.978354978354979}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 613.9873, 'train_samples_per_second': 7.525, 'train_steps_per_second': 0.937, 'train_loss': 0.43011265505915103, 'epoch': 4.978354978354979}


{'eval_loss': 0.7359622716903687, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 2.308, 'eval_samples_per_second': 54.16, 'eval_steps_per_second': 6.933, 'epoch': 4.978354978354979}
[2025-05-28 05:31:55] ✅ Config 36: Accuracy=0.8720, Loss=0.7360


[2025-05-28 05:31:55] 
🔬 Testing configuration 37/264
[2025-05-28 05:31:55] Config: LR=3e-05, BS=4, Epochs=7, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 558.19 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 541.18 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 595.73 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:01, 613.37 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 658.46 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 714.77 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 728.28 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 774.33 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 791.54 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 803.45 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 723.77 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 529.85 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 530.52 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 524.47 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.4616, 'grad_norm': 10.51040267944336, 'learning_rate': 2.9255172413793105e-05, 'epoch': 0.8658008658008658}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.938766360282898, 'eval_accuracy': 0.7360000014305115, 'eval_runtime': 2.303, 'eval_samples_per_second': 54.276, 'eval_steps_per_second': 6.947, 'epoch': 0.9956709956709957}


{'loss': 0.6619, 'grad_norm': 22.508838653564453, 'learning_rate': 2.5117241379310345e-05, 'epoch': 1.7316017316017316}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8874325156211853, 'eval_accuracy': 0.7919999957084656, 'eval_runtime': 2.5471, 'eval_samples_per_second': 49.075, 'eval_steps_per_second': 6.282, 'epoch': 2.0}


{'loss': 0.4988, 'grad_norm': 0.2006961554288864, 'learning_rate': 2.097931034482759e-05, 'epoch': 2.5974025974025974}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7608394026756287, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 2.5812, 'eval_samples_per_second': 48.428, 'eval_steps_per_second': 6.199, 'epoch': 2.995670995670996}


{'loss': 0.4496, 'grad_norm': 0.16155043244361877, 'learning_rate': 1.684137931034483e-05, 'epoch': 3.463203463203463}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8018116354942322, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 2.5594, 'eval_samples_per_second': 48.839, 'eval_steps_per_second': 6.251, 'epoch': 4.0}


{'loss': 0.4333, 'grad_norm': 0.20286010205745697, 'learning_rate': 1.2744827586206896e-05, 'epoch': 4.329004329004329}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7851248979568481, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 2.5523, 'eval_samples_per_second': 48.975, 'eval_steps_per_second': 6.269, 'epoch': 4.995670995670996}


{'loss': 0.43, 'grad_norm': 0.13100214302539825, 'learning_rate': 8.606896551724137e-06, 'epoch': 5.194805194805195}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7947480082511902, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 2.5789, 'eval_samples_per_second': 48.47, 'eval_steps_per_second': 6.204, 'epoch': 6.0}


{'loss': 0.4225, 'grad_norm': 0.15433219075202942, 'learning_rate': 4.468965517241379e-06, 'epoch': 6.0606060606060606}


{'loss': 0.422, 'grad_norm': 0.11608200520277023, 'learning_rate': 3.310344827586207e-07, 'epoch': 6.926406926406926}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7951569557189941, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 2.5551, 'eval_samples_per_second': 48.922, 'eval_steps_per_second': 6.262, 'epoch': 6.96969696969697}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 781.2794, 'train_samples_per_second': 8.279, 'train_steps_per_second': 1.03, 'train_loss': 0.5963799962345858, 'epoch': 6.96969696969697}


{'eval_loss': 0.7851248979568481, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 2.351, 'eval_samples_per_second': 53.169, 'eval_steps_per_second': 6.806, 'epoch': 6.96969696969697}
[2025-05-28 05:45:10] ✅ Config 37: Accuracy=0.8720, Loss=0.7851


[2025-05-28 05:45:10] 
🔬 Testing configuration 38/264
[2025-05-28 05:45:10] Config: LR=3e-05, BS=4, Epochs=7, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 552.89 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 537.36 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 595.96 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:01, 615.90 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 661.54 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 718.09 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 733.39 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 782.30 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 802.17 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 812.66 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 729.11 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 531.23 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 532.53 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 526.58 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.9003713130950928, 'eval_accuracy': 0.14399999380111694, 'eval_runtime': 2.2235, 'eval_samples_per_second': 56.217, 'eval_steps_per_second': 7.196, 'epoch': 0.987012987012987}


{'loss': 1.5156, 'grad_norm': 12.938348770141602, 'learning_rate': 2.8305162497673325e-05, 'epoch': 1.7316017316017316}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7899384498596191, 'eval_accuracy': 0.7599999904632568, 'eval_runtime': 2.4949, 'eval_samples_per_second': 50.103, 'eval_steps_per_second': 6.413, 'epoch': 1.9913419913419914}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.57225501537323, 'eval_accuracy': 0.8080000281333923, 'eval_runtime': 2.4869, 'eval_samples_per_second': 50.264, 'eval_steps_per_second': 6.434, 'epoch': 2.995670995670996}


{'loss': 0.306, 'grad_norm': 0.39056551456451416, 'learning_rate': 1.8246594209071543e-05, 'epoch': 3.463203463203463}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6386882662773132, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 2.515, 'eval_samples_per_second': 49.701, 'eval_steps_per_second': 6.362, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.667971670627594, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 2.4838, 'eval_samples_per_second': 50.325, 'eval_steps_per_second': 6.442, 'epoch': 4.987012987012987}


{'loss': 0.1017, 'grad_norm': 0.3698495328426361, 'learning_rate': 5.86857856486919e-06, 'epoch': 5.194805194805195}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6813083291053772, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 2.4956, 'eval_samples_per_second': 50.088, 'eval_steps_per_second': 6.411, 'epoch': 5.991341991341991}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6619899868965149, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 2.5522, 'eval_samples_per_second': 48.977, 'eval_steps_per_second': 6.269, 'epoch': 6.909090909090909}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 752.2336, 'train_samples_per_second': 8.598, 'train_steps_per_second': 0.53, 'train_loss': 0.4916783377042689, 'epoch': 6.909090909090909}


{'eval_loss': 0.667971670627594, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 2.2947, 'eval_samples_per_second': 54.473, 'eval_steps_per_second': 6.973, 'epoch': 6.909090909090909}
[2025-05-28 05:57:55] ✅ Config 38: Accuracy=0.8720, Loss=0.6680


[2025-05-28 05:57:56] 
🔬 Testing configuration 39/264
[2025-05-28 05:57:56] Config: LR=3e-05, BS=4, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 560.48 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 542.95 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 597.83 examples/s]


Map:  31%|███       | 288/924 [00:00<00:01, 616.54 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 646.24 examples/s]


Map:  50%|█████     | 464/924 [00:00<00:00, 695.34 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  61%|██████    | 560/924 [00:00<00:00, 726.83 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 761.58 examples/s]


Map:  81%|████████▏ | 752/924 [00:01<00:00, 804.20 examples/s]


Map:  92%|█████████▏| 848/924 [00:01<00:00, 800.86 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 725.63 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 536.21 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 537.87 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 531.73 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.1184744834899902, 'eval_accuracy': 0.6480000019073486, 'eval_runtime': 2.3091, 'eval_samples_per_second': 54.133, 'eval_steps_per_second': 6.929, 'epoch': 0.987012987012987}


{'loss': 1.158, 'grad_norm': 6.323307037353516, 'learning_rate': 2.5083333333333334e-05, 'epoch': 1.7316017316017316}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7274782061576843, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 2.5571, 'eval_samples_per_second': 48.884, 'eval_steps_per_second': 6.257, 'epoch': 1.9913419913419914}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7518962621688843, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 2.3557, 'eval_samples_per_second': 53.063, 'eval_steps_per_second': 6.792, 'epoch': 2.995670995670996}


{'loss': 0.478, 'grad_norm': 0.44322437047958374, 'learning_rate': 1.675e-05, 'epoch': 3.463203463203463}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7265714406967163, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 2.3389, 'eval_samples_per_second': 53.443, 'eval_steps_per_second': 6.841, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7149012088775635, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 2.3346, 'eval_samples_per_second': 53.543, 'eval_steps_per_second': 6.853, 'epoch': 4.987012987012987}


{'loss': 0.4265, 'grad_norm': 0.20299415290355682, 'learning_rate': 8.416666666666667e-06, 'epoch': 5.194805194805195}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7165083289146423, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 2.3091, 'eval_samples_per_second': 54.134, 'eval_steps_per_second': 6.929, 'epoch': 5.991341991341991}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7358128428459167, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 2.3253, 'eval_samples_per_second': 53.756, 'eval_steps_per_second': 6.881, 'epoch': 6.909090909090909}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 774.0592, 'train_samples_per_second': 8.356, 'train_steps_per_second': 0.515, 'train_loss': 0.6219290252915002, 'epoch': 6.909090909090909}


{'eval_loss': 0.7265714406967163, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 2.1394, 'eval_samples_per_second': 58.427, 'eval_steps_per_second': 7.479, 'epoch': 6.909090909090909}
[2025-05-28 06:11:03] ✅ Config 39: Accuracy=0.8640, Loss=0.7266


[2025-05-28 06:11:04] 
🔬 Testing configuration 40/264
[2025-05-28 06:11:04] Config: LR=3e-05, BS=4, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 640.81 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 655.41 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 732.38 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 772.04 examples/s]


Map:  52%|█████▏    | 480/924 [00:00<00:00, 831.25 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 866.33 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 944.59 examples/s]


Map:  90%|█████████ | 832/924 [00:00<00:00, 952.23 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 873.61 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 622.77 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 619.71 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.7263, 'grad_norm': 16.795671463012695, 'learning_rate': 2.9963962804397973e-05, 'epoch': 0.8658008658008658}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8348139524459839, 'eval_accuracy': 0.6480000019073486, 'eval_runtime': 2.0276, 'eval_samples_per_second': 61.648, 'eval_steps_per_second': 7.891, 'epoch': 0.9956709956709957}


{'loss': 0.6072, 'grad_norm': 6.146136283874512, 'learning_rate': 2.8175789946585697e-05, 'epoch': 1.7316017316017316}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.839909553527832, 'eval_accuracy': 0.7680000066757202, 'eval_runtime': 2.2631, 'eval_samples_per_second': 55.235, 'eval_steps_per_second': 7.07, 'epoch': 2.0}


{'loss': 0.3745, 'grad_norm': 0.10931675136089325, 'learning_rate': 2.405171952688028e-05, 'epoch': 2.5974025974025974}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7659634947776794, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 2.2644, 'eval_samples_per_second': 55.202, 'eval_steps_per_second': 7.066, 'epoch': 2.995670995670996}


{'loss': 0.2261, 'grad_norm': 0.9885361790657043, 'learning_rate': 1.8192809639297413e-05, 'epoch': 3.463203463203463}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.978492796421051, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 2.2529, 'eval_samples_per_second': 55.485, 'eval_steps_per_second': 7.102, 'epoch': 4.0}


{'loss': 0.0923, 'grad_norm': 0.018515052273869514, 'learning_rate': 1.1743711569749372e-05, 'epoch': 4.329004329004329}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.9507120847702026, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 2.2525, 'eval_samples_per_second': 55.493, 'eval_steps_per_second': 7.103, 'epoch': 4.995670995670996}


{'loss': 0.0676, 'grad_norm': 0.005246713291853666, 'learning_rate': 5.8965356833938e-06, 'epoch': 5.194805194805195}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.9213389754295349, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 2.2512, 'eval_samples_per_second': 55.525, 'eval_steps_per_second': 7.107, 'epoch': 6.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 658.0177, 'train_samples_per_second': 9.83, 'train_steps_per_second': 1.223, 'train_loss': 0.45124787720078857, 'epoch': 6.0}


{'eval_loss': 0.7659634947776794, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 2.095, 'eval_samples_per_second': 59.666, 'eval_steps_per_second': 7.637, 'epoch': 6.0}
[2025-05-28 06:22:14] ✅ Config 40: Accuracy=0.8800, Loss=0.7660


[2025-05-28 06:22:14] 💾 Saved checkpoint at 40 configurations
[2025-05-28 06:22:14] 
🔬 Testing configuration 41/264
[2025-05-28 06:22:14] Config: LR=3e-05, BS=8, Epochs=5, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 644.15 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 656.13 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 737.66 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 777.96 examples/s]


Map:  52%|█████▏    | 480/924 [00:00<00:00, 841.39 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 880.14 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 958.94 examples/s]


Map:  90%|█████████ | 832/924 [00:00<00:00, 961.48 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 882.49 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 630.78 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 627.86 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.3903, 'grad_norm': 7.742833137512207, 'learning_rate': 2.7648183556405356e-05, 'epoch': 0.8620689655172413}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.811866044998169, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 1.7087, 'eval_samples_per_second': 73.155, 'eval_steps_per_second': 4.682, 'epoch': 1.0}


{'loss': 0.6163, 'grad_norm': 19.139205932617188, 'learning_rate': 2.1912045889101336e-05, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7952613234519958, 'eval_accuracy': 0.8240000009536743, 'eval_runtime': 1.9435, 'eval_samples_per_second': 64.318, 'eval_steps_per_second': 4.116, 'epoch': 2.0}


{'loss': 0.4756, 'grad_norm': 0.7989801168441772, 'learning_rate': 1.6175908221797323e-05, 'epoch': 2.586206896551724}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7572978734970093, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 1.9464, 'eval_samples_per_second': 64.222, 'eval_steps_per_second': 4.11, 'epoch': 3.0}


{'loss': 0.4437, 'grad_norm': 0.15350425243377686, 'learning_rate': 1.0439770554493308e-05, 'epoch': 3.4482758620689653}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8031310439109802, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 1.9382, 'eval_samples_per_second': 64.492, 'eval_steps_per_second': 4.127, 'epoch': 4.0}


{'loss': 0.4247, 'grad_norm': 0.13228937983512878, 'learning_rate': 4.703632887189293e-06, 'epoch': 4.310344827586207}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7726678252220154, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 1.9352, 'eval_samples_per_second': 64.593, 'eval_steps_per_second': 4.134, 'epoch': 5.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 468.2224, 'train_samples_per_second': 9.867, 'train_steps_per_second': 1.239, 'train_loss': 0.6360855891786773, 'epoch': 5.0}


{'eval_loss': 0.7572978734970093, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 1.7539, 'eval_samples_per_second': 71.272, 'eval_steps_per_second': 4.561, 'epoch': 5.0}
[2025-05-28 06:30:16] ✅ Config 41: Accuracy=0.8720, Loss=0.7573


[2025-05-28 06:30:16] 
🔬 Testing configuration 42/264
[2025-05-28 06:30:16] Config: LR=3e-05, BS=8, Epochs=5, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 639.35 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 640.50 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 718.15 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 755.01 examples/s]


Map:  50%|█████     | 464/924 [00:00<00:00, 814.66 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  62%|██████▏   | 576/924 [00:00<00:00, 855.51 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  76%|███████▌  | 704/924 [00:00<00:00, 928.45 examples/s]


Map:  90%|█████████ | 832/924 [00:00<00:00, 942.45 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 862.36 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 622.29 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 616.84 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.6888, 'grad_norm': 20.496429443359375, 'learning_rate': 2.9610920516394893e-05, 'epoch': 0.8620689655172413}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.1639245748519897, 'eval_accuracy': 0.5839999914169312, 'eval_runtime': 1.6631, 'eval_samples_per_second': 75.161, 'eval_steps_per_second': 4.81, 'epoch': 1.0}


{'loss': 0.5136, 'grad_norm': 29.245059967041016, 'learning_rate': 2.5201018338366207e-05, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7632468342781067, 'eval_accuracy': 0.8080000281333923, 'eval_runtime': 1.8935, 'eval_samples_per_second': 66.017, 'eval_steps_per_second': 4.225, 'epoch': 2.0}


{'loss': 0.2257, 'grad_norm': 0.11997392028570175, 'learning_rate': 1.719956448510025e-05, 'epoch': 2.586206896551724}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8978660702705383, 'eval_accuracy': 0.8240000009536743, 'eval_runtime': 1.9122, 'eval_samples_per_second': 65.368, 'eval_steps_per_second': 4.184, 'epoch': 3.0}


{'loss': 0.1983, 'grad_norm': 34.34331512451172, 'learning_rate': 8.428032324708266e-06, 'epoch': 3.4482758620689653}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8689004778862, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 1.9033, 'eval_samples_per_second': 65.675, 'eval_steps_per_second': 4.203, 'epoch': 4.0}


{'loss': 0.0568, 'grad_norm': 25.994401931762695, 'learning_rate': 1.9573782591201005e-06, 'epoch': 4.310344827586207}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8611354231834412, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 1.8945, 'eval_samples_per_second': 65.981, 'eval_steps_per_second': 4.223, 'epoch': 5.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 458.2951, 'train_samples_per_second': 10.081, 'train_steps_per_second': 1.266, 'train_loss': 0.4717747211456299, 'epoch': 5.0}


{'eval_loss': 0.8611354231834412, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 1.9551, 'eval_samples_per_second': 63.935, 'eval_steps_per_second': 4.092, 'epoch': 5.0}
[2025-05-28 06:38:06] ✅ Config 42: Accuracy=0.8640, Loss=0.8611


[2025-05-28 06:38:06] 
🔬 Testing configuration 43/264
[2025-05-28 06:38:06] Config: LR=3e-05, BS=8, Epochs=5, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 641.03 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 651.22 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 726.76 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 763.47 examples/s]


Map:  50%|█████     | 464/924 [00:00<00:00, 820.49 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  62%|██████▏   | 576/924 [00:00<00:00, 862.34 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  76%|███████▌  | 704/924 [00:00<00:00, 934.00 examples/s]


Map:  90%|█████████ | 832/924 [00:00<00:00, 948.75 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 869.55 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 623.09 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 619.45 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.4261, 'grad_norm': 7.9390997886657715, 'learning_rate': 2.7648183556405356e-05, 'epoch': 0.8620689655172413}


{'eval_loss': 0.9634638428688049, 'eval_accuracy': 0.6800000071525574, 'eval_runtime': 1.8212, 'eval_samples_per_second': 68.636, 'eval_steps_per_second': 4.393, 'epoch': 1.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'loss': 0.6125, 'grad_norm': 1.2842744588851929, 'learning_rate': 2.196940726577438e-05, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.851003885269165, 'eval_accuracy': 0.8159999847412109, 'eval_runtime': 2.0875, 'eval_samples_per_second': 59.879, 'eval_steps_per_second': 3.832, 'epoch': 2.0}


{'loss': 0.4786, 'grad_norm': 1.075541377067566, 'learning_rate': 1.6233269598470364e-05, 'epoch': 2.586206896551724}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8508366346359253, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 2.0842, 'eval_samples_per_second': 59.976, 'eval_steps_per_second': 3.838, 'epoch': 3.0}


{'loss': 0.4409, 'grad_norm': 0.3534978926181793, 'learning_rate': 1.0497131931166348e-05, 'epoch': 3.4482758620689653}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.757805585861206, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 2.0963, 'eval_samples_per_second': 59.629, 'eval_steps_per_second': 3.816, 'epoch': 4.0}


{'loss': 0.4233, 'grad_norm': 0.17606285214424133, 'learning_rate': 4.760994263862333e-06, 'epoch': 4.310344827586207}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7780548334121704, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 2.0869, 'eval_samples_per_second': 59.896, 'eval_steps_per_second': 3.833, 'epoch': 5.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 488.4883, 'train_samples_per_second': 9.458, 'train_steps_per_second': 1.187, 'train_loss': 0.6414925542371026, 'epoch': 5.0}


{'eval_loss': 0.7780548334121704, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 2.1368, 'eval_samples_per_second': 58.497, 'eval_steps_per_second': 3.744, 'epoch': 5.0}
[2025-05-28 06:46:26] ✅ Config 43: Accuracy=0.8720, Loss=0.7781


[2025-05-28 06:46:26] 
🔬 Testing configuration 44/264
[2025-05-28 06:46:26] Config: LR=3e-05, BS=8, Epochs=5, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 550.34 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 534.49 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 591.36 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:01, 610.04 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 655.20 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 716.94 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 735.05 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 787.61 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 810.29 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 821.55 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 731.40 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 540.32 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 539.55 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 533.58 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.3419, 'grad_norm': 8.884405136108398, 'learning_rate': 2.9547384868399506e-05, 'epoch': 0.8620689655172413}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6963686943054199, 'eval_accuracy': 0.7440000176429749, 'eval_runtime': 1.7937, 'eval_samples_per_second': 69.689, 'eval_steps_per_second': 4.46, 'epoch': 1.0}


{'loss': 0.3201, 'grad_norm': 32.95115661621094, 'learning_rate': 2.4933863414195144e-05, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6362930536270142, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 2.0473, 'eval_samples_per_second': 61.057, 'eval_steps_per_second': 3.908, 'epoch': 2.0}


{'loss': 0.0767, 'grad_norm': 0.009693569503724575, 'learning_rate': 1.684244768908203e-05, 'epoch': 2.586206896551724}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.0768710374832153, 'eval_accuracy': 0.8320000171661377, 'eval_runtime': 2.0442, 'eval_samples_per_second': 61.15, 'eval_steps_per_second': 3.914, 'epoch': 3.0}


{'loss': 0.0161, 'grad_norm': 3.5375776290893555, 'learning_rate': 8.186128691298253e-06, 'epoch': 3.4482758620689653}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7139023542404175, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 2.0413, 'eval_samples_per_second': 61.236, 'eval_steps_per_second': 3.919, 'epoch': 4.0}


{'loss': 0.0036, 'grad_norm': 0.009801487438380718, 'learning_rate': 1.8259924571487236e-06, 'epoch': 4.310344827586207}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7193288207054138, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 2.045, 'eval_samples_per_second': 61.125, 'eval_steps_per_second': 3.912, 'epoch': 5.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 459.5685, 'train_samples_per_second': 10.053, 'train_steps_per_second': 1.262, 'train_loss': 0.30319255713758797, 'epoch': 5.0}


{'eval_loss': 0.7139023542404175, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 1.8257, 'eval_samples_per_second': 68.466, 'eval_steps_per_second': 4.382, 'epoch': 5.0}
[2025-05-28 06:54:20] ✅ Config 44: Accuracy=0.8800, Loss=0.7139


[2025-05-28 06:54:20] 
🔬 Testing configuration 45/264
[2025-05-28 06:54:20] Config: LR=3e-05, BS=8, Epochs=7, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 558.31 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 539.74 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 597.52 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:01, 616.38 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 661.70 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 718.51 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 731.39 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 780.25 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 800.71 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 811.49 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 728.58 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 527.84 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 531.64 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 524.09 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.9837353229522705, 'eval_accuracy': 0.7200000286102295, 'eval_runtime': 1.8367, 'eval_samples_per_second': 68.057, 'eval_steps_per_second': 4.356, 'epoch': 1.0}


{'loss': 1.1488, 'grad_norm': 2.247361183166504, 'learning_rate': 2.5095367847411446e-05, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7843910455703735, 'eval_accuracy': 0.8320000171661377, 'eval_runtime': 2.0964, 'eval_samples_per_second': 59.626, 'eval_steps_per_second': 3.816, 'epoch': 2.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7879986763000488, 'eval_accuracy': 0.8320000171661377, 'eval_runtime': 2.0922, 'eval_samples_per_second': 59.746, 'eval_steps_per_second': 3.824, 'epoch': 3.0}


{'loss': 0.4917, 'grad_norm': 0.24034875631332397, 'learning_rate': 1.700272479564033e-05, 'epoch': 3.4482758620689653}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7253468632698059, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 2.0922, 'eval_samples_per_second': 59.746, 'eval_steps_per_second': 3.824, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7496052980422974, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 2.0894, 'eval_samples_per_second': 59.827, 'eval_steps_per_second': 3.829, 'epoch': 5.0}


{'loss': 0.4275, 'grad_norm': 0.10273463279008865, 'learning_rate': 8.8283378746594e-06, 'epoch': 5.172413793103448}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7394992709159851, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 2.0797, 'eval_samples_per_second': 60.105, 'eval_steps_per_second': 3.847, 'epoch': 6.0}


{'loss': 0.4223, 'grad_norm': 0.09018304198980331, 'learning_rate': 6.539509536784741e-07, 'epoch': 6.896551724137931}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.735745370388031, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 2.0865, 'eval_samples_per_second': 59.91, 'eval_steps_per_second': 3.834, 'epoch': 7.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 574.0399, 'train_samples_per_second': 11.268, 'train_steps_per_second': 0.707, 'train_loss': 0.6196222769215777, 'epoch': 7.0}


{'eval_loss': 0.7253468632698059, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 1.9165, 'eval_samples_per_second': 65.222, 'eval_steps_per_second': 4.174, 'epoch': 7.0}
[2025-05-28 07:04:07] ✅ Config 45: Accuracy=0.8640, Loss=0.7253


[2025-05-28 07:04:07] 
🔬 Testing configuration 46/264
[2025-05-28 07:04:07] Config: LR=3e-05, BS=8, Epochs=7, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 554.45 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 533.98 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 590.38 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:01, 611.76 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 658.26 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 716.56 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 729.94 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 778.99 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 798.85 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 807.51 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 725.07 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 530.54 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 533.24 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 526.80 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.794, 'grad_norm': 492.94891357421875, 'learning_rate': 2.997293149589571e-05, 'epoch': 0.8620689655172413}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.6983087062835693, 'eval_accuracy': 0.30399999022483826, 'eval_runtime': 1.784, 'eval_samples_per_second': 70.069, 'eval_steps_per_second': 4.484, 'epoch': 1.0}


{'loss': 0.8509, 'grad_norm': 17.771133422851562, 'learning_rate': 2.8240181974883212e-05, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6649796366691589, 'eval_accuracy': 0.8080000281333923, 'eval_runtime': 2.0408, 'eval_samples_per_second': 61.251, 'eval_steps_per_second': 3.92, 'epoch': 2.0}


{'loss': 0.3488, 'grad_norm': 5.70311164855957, 'learning_rate': 2.410586361643651e-05, 'epoch': 2.586206896551724}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.651974618434906, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 2.0344, 'eval_samples_per_second': 61.443, 'eval_steps_per_second': 3.932, 'epoch': 3.0}


{'loss': 0.2115, 'grad_norm': 25.691133499145508, 'learning_rate': 1.831987926573649e-05, 'epoch': 3.4482758620689653}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8324995040893555, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 2.0273, 'eval_samples_per_second': 61.66, 'eval_steps_per_second': 3.946, 'epoch': 4.0}


{'loss': 0.0825, 'grad_norm': 0.005988455843180418, 'learning_rate': 1.1931719008106991e-05, 'epoch': 4.310344827586207}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8689802885055542, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 2.0292, 'eval_samples_per_second': 61.6, 'eval_steps_per_second': 3.942, 'epoch': 5.0}


{'loss': 0.063, 'grad_norm': 16.489648818969727, 'learning_rate': 6.100098534341522e-06, 'epoch': 5.172413793103448}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.740732729434967, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 2.0322, 'eval_samples_per_second': 61.51, 'eval_steps_per_second': 3.937, 'epoch': 6.0}


{'loss': 0.0441, 'grad_norm': 0.007541182916611433, 'learning_rate': 1.8827856325344056e-06, 'epoch': 6.0344827586206895}


{'loss': 0.0194, 'grad_norm': 0.003576015355065465, 'learning_rate': 4.4737103272099456e-08, 'epoch': 6.896551724137931}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7598245739936829, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 2.0534, 'eval_samples_per_second': 60.875, 'eval_steps_per_second': 3.896, 'epoch': 7.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 551.4514, 'train_samples_per_second': 11.729, 'train_steps_per_second': 1.472, 'train_loss': 0.4220308118265838, 'epoch': 7.0}


{'eval_loss': 0.8689802885055542, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 1.7784, 'eval_samples_per_second': 70.287, 'eval_steps_per_second': 4.498, 'epoch': 7.0}
[2025-05-28 07:13:31] ✅ Config 46: Accuracy=0.8640, Loss=0.8690


[2025-05-28 07:13:32] 
🔬 Testing configuration 47/264
[2025-05-28 07:13:32] Config: LR=3e-05, BS=8, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 557.02 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 538.07 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 596.06 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:01, 614.05 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 659.64 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 717.36 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 730.38 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 778.70 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 797.45 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 807.61 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 726.30 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 538.79 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 534.82 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 528.66 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.6299, 'grad_norm': 9.442163467407227, 'learning_rate': 2.9385245901639346e-05, 'epoch': 0.8620689655172413}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8061227202415466, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 1.8454, 'eval_samples_per_second': 67.735, 'eval_steps_per_second': 4.335, 'epoch': 1.0}


{'loss': 0.7463, 'grad_norm': 2.7484540939331055, 'learning_rate': 2.5327868852459018e-05, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8028919696807861, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 2.1127, 'eval_samples_per_second': 59.167, 'eval_steps_per_second': 3.787, 'epoch': 2.0}


{'loss': 0.5713, 'grad_norm': 6.190673828125, 'learning_rate': 2.122950819672131e-05, 'epoch': 2.586206896551724}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7318884134292603, 'eval_accuracy': 0.8880000114440918, 'eval_runtime': 2.1237, 'eval_samples_per_second': 58.859, 'eval_steps_per_second': 3.767, 'epoch': 3.0}


{'loss': 0.5129, 'grad_norm': 12.734496116638184, 'learning_rate': 1.7131147540983607e-05, 'epoch': 3.4482758620689653}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8286756873130798, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 2.0979, 'eval_samples_per_second': 59.582, 'eval_steps_per_second': 3.813, 'epoch': 4.0}


{'loss': 0.4533, 'grad_norm': 0.34054702520370483, 'learning_rate': 1.3032786885245902e-05, 'epoch': 4.310344827586207}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7715206742286682, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 2.1108, 'eval_samples_per_second': 59.219, 'eval_steps_per_second': 3.79, 'epoch': 5.0}


{'loss': 0.463, 'grad_norm': 0.28339287638664246, 'learning_rate': 8.934426229508197e-06, 'epoch': 5.172413793103448}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8422009944915771, 'eval_accuracy': 0.8320000171661377, 'eval_runtime': 2.0962, 'eval_samples_per_second': 59.631, 'eval_steps_per_second': 3.816, 'epoch': 6.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 482.532, 'train_samples_per_second': 13.404, 'train_steps_per_second': 1.683, 'train_loss': 0.6906053334817119, 'epoch': 6.0}


{'eval_loss': 0.7318884134292603, 'eval_accuracy': 0.8880000114440918, 'eval_runtime': 1.8809, 'eval_samples_per_second': 66.456, 'eval_steps_per_second': 4.253, 'epoch': 6.0}
[2025-05-28 07:21:47] ✅ Config 47: Accuracy=0.8880, Loss=0.7319


[2025-05-28 07:21:47] 
🔬 Testing configuration 48/264
[2025-05-28 07:21:47] Config: LR=3e-05, BS=8, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 533.68 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 531.59 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 592.48 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:01, 613.63 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 660.24 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 719.16 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 734.20 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 784.29 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 804.10 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 813.79 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 728.41 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 532.44 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 534.72 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 527.66 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6348138451576233, 'eval_accuracy': 0.7440000176429749, 'eval_runtime': 1.7991, 'eval_samples_per_second': 69.48, 'eval_steps_per_second': 4.447, 'epoch': 1.0}


{'loss': 0.9152, 'grad_norm': 22.23342514038086, 'learning_rate': 2.818888845254941e-05, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.4085772931575775, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 2.0585, 'eval_samples_per_second': 60.724, 'eval_steps_per_second': 3.886, 'epoch': 2.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.3766869604587555, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 2.0646, 'eval_samples_per_second': 60.543, 'eval_steps_per_second': 3.875, 'epoch': 3.0}


{'loss': 0.0982, 'grad_norm': 0.31902649998664856, 'learning_rate': 1.824833528759981e-05, 'epoch': 3.4482758620689653}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6723892092704773, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 2.0581, 'eval_samples_per_second': 60.737, 'eval_steps_per_second': 3.887, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7249289751052856, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 2.0592, 'eval_samples_per_second': 60.702, 'eval_steps_per_second': 3.885, 'epoch': 5.0}


{'loss': 0.0123, 'grad_norm': 0.006892307661473751, 'learning_rate': 6.069345568963473e-06, 'epoch': 5.172413793103448}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7372268438339233, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 2.0619, 'eval_samples_per_second': 60.622, 'eval_steps_per_second': 3.88, 'epoch': 6.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 455.2836, 'train_samples_per_second': 14.207, 'train_steps_per_second': 0.892, 'train_loss': 0.29479857840329066, 'epoch': 6.0}


{'eval_loss': 0.3766869604587555, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 1.8535, 'eval_samples_per_second': 67.44, 'eval_steps_per_second': 4.316, 'epoch': 6.0}
[2025-05-28 07:29:35] ✅ Config 48: Accuracy=0.8800, Loss=0.3767


[2025-05-28 07:29:35] 
🔬 Testing configuration 49/264
[2025-05-28 07:29:35] Config: LR=5e-05, BS=4, Epochs=5, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   5%|▌         | 48/924 [00:00<00:02, 321.34 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  10%|█         | 96/924 [00:00<00:02, 360.81 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 455.64 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:01, 557.60 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 606.68 examples/s]


Map:  48%|████▊     | 448/924 [00:00<00:00, 656.31 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 700.67 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  69%|██████▉   | 640/924 [00:01<00:00, 738.99 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  80%|███████▉  | 736/924 [00:01<00:00, 785.76 examples/s]


Map:  90%|█████████ | 832/924 [00:01<00:00, 787.34 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 801.42 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 679.53 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 534.01 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 536.16 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 529.99 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.2807, 'grad_norm': 15.025253295898438, 'learning_rate': 4.6042471042471044e-05, 'epoch': 0.8658008658008658}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.013310432434082, 'eval_accuracy': 0.7599999904632568, 'eval_runtime': 2.2985, 'eval_samples_per_second': 54.382, 'eval_steps_per_second': 6.961, 'epoch': 0.9956709956709957}


{'loss': 0.6204, 'grad_norm': 0.6952672600746155, 'learning_rate': 3.6389961389961394e-05, 'epoch': 1.7316017316017316}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8086780905723572, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 2.551, 'eval_samples_per_second': 49.0, 'eval_steps_per_second': 6.272, 'epoch': 2.0}


{'loss': 0.4932, 'grad_norm': 0.2449999302625656, 'learning_rate': 2.673745173745174e-05, 'epoch': 2.5974025974025974}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7184796333312988, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 2.55, 'eval_samples_per_second': 49.02, 'eval_steps_per_second': 6.275, 'epoch': 2.995670995670996}


{'loss': 0.4475, 'grad_norm': 0.18388870358467102, 'learning_rate': 1.7084942084942085e-05, 'epoch': 3.463203463203463}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6867115497589111, 'eval_accuracy': 0.8880000114440918, 'eval_runtime': 2.55, 'eval_samples_per_second': 49.02, 'eval_steps_per_second': 6.275, 'epoch': 4.0}


{'loss': 0.4301, 'grad_norm': 0.11935856193304062, 'learning_rate': 7.432432432432433e-06, 'epoch': 4.329004329004329}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6848827600479126, 'eval_accuracy': 0.9039999842643738, 'eval_runtime': 2.5552, 'eval_samples_per_second': 48.919, 'eval_steps_per_second': 6.262, 'epoch': 4.978354978354979}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 502.2842, 'train_samples_per_second': 9.198, 'train_steps_per_second': 1.145, 'train_loss': 0.6257215217922045, 'epoch': 4.978354978354979}


{'eval_loss': 0.6848827600479126, 'eval_accuracy': 0.9039999842643738, 'eval_runtime': 2.6146, 'eval_samples_per_second': 47.809, 'eval_steps_per_second': 6.12, 'epoch': 4.978354978354979}
[2025-05-28 07:38:12] ✅ Config 49: Accuracy=0.9040, Loss=0.6849


[2025-05-28 07:38:12] 
🔬 Testing configuration 50/264
[2025-05-28 07:38:12] Config: LR=5e-05, BS=4, Epochs=5, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 556.60 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 541.18 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 599.18 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:01, 618.56 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 663.54 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 721.96 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 737.09 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 787.23 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 806.12 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 817.21 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 732.85 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 534.95 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 535.27 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 529.24 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8168204426765442, 'eval_accuracy': 0.8320000171661377, 'eval_runtime': 2.3111, 'eval_samples_per_second': 54.087, 'eval_steps_per_second': 6.923, 'epoch': 0.987012987012987}


{'loss': 1.0256, 'grad_norm': 0.7636844515800476, 'learning_rate': 4.116054916278848e-05, 'epoch': 1.7316017316017316}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7972927093505859, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 2.5678, 'eval_samples_per_second': 48.68, 'eval_steps_per_second': 6.231, 'epoch': 1.9913419913419914}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7502566576004028, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 2.543, 'eval_samples_per_second': 49.154, 'eval_steps_per_second': 6.292, 'epoch': 2.995670995670996}


{'loss': 0.4771, 'grad_norm': 0.2007584273815155, 'learning_rate': 1.258832333897717e-05, 'epoch': 3.463203463203463}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7352133393287659, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 2.5678, 'eval_samples_per_second': 48.68, 'eval_steps_per_second': 6.231, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7317910194396973, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 2.5629, 'eval_samples_per_second': 48.772, 'eval_steps_per_second': 6.243, 'epoch': 4.935064935064935}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 483.7628, 'train_samples_per_second': 9.55, 'train_steps_per_second': 0.589, 'train_loss': 0.6540493747644257, 'epoch': 4.935064935064935}


{'eval_loss': 0.7317910194396973, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 2.6697, 'eval_samples_per_second': 46.821, 'eval_steps_per_second': 5.993, 'epoch': 4.935064935064935}
[2025-05-28 07:46:30] ✅ Config 50: Accuracy=0.8800, Loss=0.7318


[2025-05-28 07:46:30] 💾 Saved checkpoint at 50 configurations
[2025-05-28 07:46:30] 
🔬 Testing configuration 51/264
[2025-05-28 07:46:30] Config: LR=5e-05, BS=4, Epochs=5, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 550.60 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 539.54 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 598.69 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 620.03 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 667.36 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 726.71 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 738.63 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 788.36 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 808.72 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 819.51 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 735.00 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 537.35 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 538.50 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 532.54 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7513856291770935, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 2.3109, 'eval_samples_per_second': 54.091, 'eval_steps_per_second': 6.924, 'epoch': 0.987012987012987}


{'loss': 0.93, 'grad_norm': 1.6922590732574463, 'learning_rate': 3.618677042801556e-05, 'epoch': 1.7316017316017316}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7689641714096069, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 2.5511, 'eval_samples_per_second': 48.999, 'eval_steps_per_second': 6.272, 'epoch': 1.9913419913419914}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7732729315757751, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 2.549, 'eval_samples_per_second': 49.039, 'eval_steps_per_second': 6.277, 'epoch': 2.995670995670996}


{'loss': 0.4629, 'grad_norm': 0.18160133063793182, 'learning_rate': 1.6731517509727626e-05, 'epoch': 3.463203463203463}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7046571373939514, 'eval_accuracy': 0.9039999842643738, 'eval_runtime': 2.5559, 'eval_samples_per_second': 48.907, 'eval_steps_per_second': 6.26, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6901560425758362, 'eval_accuracy': 0.8880000114440918, 'eval_runtime': 2.5592, 'eval_samples_per_second': 48.844, 'eval_steps_per_second': 6.252, 'epoch': 4.935064935064935}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 479.366, 'train_samples_per_second': 9.638, 'train_steps_per_second': 0.595, 'train_loss': 0.6153353774756716, 'epoch': 4.935064935064935}


{'eval_loss': 0.7046571373939514, 'eval_accuracy': 0.9039999842643738, 'eval_runtime': 2.3689, 'eval_samples_per_second': 52.766, 'eval_steps_per_second': 6.754, 'epoch': 4.935064935064935}
[2025-05-28 07:54:44] ✅ Config 51: Accuracy=0.9040, Loss=0.7047


[2025-05-28 07:54:44] 
🔬 Testing configuration 52/264
[2025-05-28 07:54:44] Config: LR=5e-05, BS=4, Epochs=5, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 539.62 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 528.00 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 588.91 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:01, 610.45 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 657.68 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 716.20 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 730.41 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 779.22 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 799.68 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 811.27 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 725.07 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 530.60 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 532.96 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 525.79 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.6919, 'grad_norm': 24.63825225830078, 'learning_rate': 4.937319780454559e-05, 'epoch': 0.8658008658008658}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.001340627670288, 'eval_accuracy': 0.5920000076293945, 'eval_runtime': 2.236, 'eval_samples_per_second': 55.904, 'eval_steps_per_second': 7.156, 'epoch': 0.9956709956709957}


{'loss': 0.5527, 'grad_norm': 18.15418243408203, 'learning_rate': 4.1855569093222245e-05, 'epoch': 1.7316017316017316}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7066012024879456, 'eval_accuracy': 0.8320000171661377, 'eval_runtime': 2.516, 'eval_samples_per_second': 49.683, 'eval_steps_per_second': 6.359, 'epoch': 2.0}


{'loss': 0.2587, 'grad_norm': 0.10831231623888016, 'learning_rate': 2.8325779372403194e-05, 'epoch': 2.5974025974025974}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7800858616828918, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 2.4965, 'eval_samples_per_second': 50.069, 'eval_steps_per_second': 6.409, 'epoch': 2.995670995670996}


{'loss': 0.1637, 'grad_norm': 0.015996236354112625, 'learning_rate': 1.3609727653643779e-05, 'epoch': 3.463203463203463}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7058290243148804, 'eval_accuracy': 0.8960000276565552, 'eval_runtime': 2.4888, 'eval_samples_per_second': 50.225, 'eval_steps_per_second': 6.429, 'epoch': 4.0}


{'loss': 0.0458, 'grad_norm': 0.021954422816634178, 'learning_rate': 2.9564370857870947e-06, 'epoch': 4.329004329004329}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7205241918563843, 'eval_accuracy': 0.8960000276565552, 'eval_runtime': 2.5041, 'eval_samples_per_second': 49.919, 'eval_steps_per_second': 6.39, 'epoch': 4.978354978354979}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 502.0709, 'train_samples_per_second': 9.202, 'train_steps_per_second': 1.145, 'train_loss': 0.47892136698183807, 'epoch': 4.978354978354979}


{'eval_loss': 0.7058290243148804, 'eval_accuracy': 0.8960000276565552, 'eval_runtime': 2.305, 'eval_samples_per_second': 54.229, 'eval_steps_per_second': 6.941, 'epoch': 4.978354978354979}
[2025-05-28 08:03:20] ✅ Config 52: Accuracy=0.8960, Loss=0.7058


[2025-05-28 08:03:21] 
🔬 Testing configuration 53/264
[2025-05-28 08:03:21] Config: LR=5e-05, BS=4, Epochs=7, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 529.95 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 523.77 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 585.37 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:01, 605.90 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 652.22 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 710.63 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 724.58 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 775.52 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 798.30 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 809.85 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 721.75 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 531.27 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 533.20 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 526.93 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 2.0400137901306152, 'eval_accuracy': 0.1599999964237213, 'eval_runtime': 2.3231, 'eval_samples_per_second': 53.807, 'eval_steps_per_second': 6.887, 'epoch': 0.987012987012987}


{'loss': 1.7903, 'grad_norm': 18.076486587524414, 'learning_rate': 4.222222222222222e-05, 'epoch': 1.7316017316017316}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.0634119510650635, 'eval_accuracy': 0.6079999804496765, 'eval_runtime': 2.5617, 'eval_samples_per_second': 48.796, 'eval_steps_per_second': 6.246, 'epoch': 1.9913419913419914}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7780194282531738, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 2.5537, 'eval_samples_per_second': 48.949, 'eval_steps_per_second': 6.265, 'epoch': 2.995670995670996}


{'loss': 0.7386, 'grad_norm': 3.163311719894409, 'learning_rate': 2.8333333333333335e-05, 'epoch': 3.463203463203463}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8193657398223877, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 2.5528, 'eval_samples_per_second': 48.966, 'eval_steps_per_second': 6.268, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.786720335483551, 'eval_accuracy': 0.8960000276565552, 'eval_runtime': 2.5579, 'eval_samples_per_second': 48.868, 'eval_steps_per_second': 6.255, 'epoch': 4.987012987012987}


{'loss': 0.4988, 'grad_norm': 1.0711543560028076, 'learning_rate': 1.4444444444444444e-05, 'epoch': 5.194805194805195}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7753318548202515, 'eval_accuracy': 0.8880000114440918, 'eval_runtime': 2.5583, 'eval_samples_per_second': 48.86, 'eval_steps_per_second': 6.254, 'epoch': 5.991341991341991}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7533779740333557, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 2.4826, 'eval_samples_per_second': 50.351, 'eval_steps_per_second': 6.445, 'epoch': 6.909090909090909}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 673.582, 'train_samples_per_second': 9.602, 'train_steps_per_second': 0.592, 'train_loss': 0.8717261292880639, 'epoch': 6.909090909090909}


{'eval_loss': 0.786720335483551, 'eval_accuracy': 0.8960000276565552, 'eval_runtime': 2.408, 'eval_samples_per_second': 51.911, 'eval_steps_per_second': 6.645, 'epoch': 6.909090909090909}
[2025-05-28 08:14:49] ✅ Config 53: Accuracy=0.8960, Loss=0.7867


[2025-05-28 08:14:49] 
🔬 Testing configuration 54/264
[2025-05-28 08:14:49] Config: LR=5e-05, BS=4, Epochs=7, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 553.70 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 539.77 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 598.60 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 620.72 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 667.15 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 727.19 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 742.87 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 794.04 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 813.11 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 827.03 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 738.72 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 537.75 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 539.99 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 533.68 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.8120782375335693, 'eval_accuracy': 0.11999999731779099, 'eval_runtime': 2.1405, 'eval_samples_per_second': 58.399, 'eval_steps_per_second': 7.475, 'epoch': 0.987012987012987}


{'loss': 1.8293, 'grad_norm': 7.1784772872924805, 'learning_rate': 4.707368982147318e-05, 'epoch': 1.7316017316017316}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.9459452629089355, 'eval_accuracy': 0.06400000303983688, 'eval_runtime': 2.2492, 'eval_samples_per_second': 55.576, 'eval_steps_per_second': 7.114, 'epoch': 1.9913419913419914}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.5637060403823853, 'eval_accuracy': 0.29600000381469727, 'eval_runtime': 2.256, 'eval_samples_per_second': 55.407, 'eval_steps_per_second': 7.092, 'epoch': 2.995670995670996}


{'loss': 1.694, 'grad_norm': 6.301456451416016, 'learning_rate': 3.0197792270443982e-05, 'epoch': 3.463203463203463}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.4800437688827515, 'eval_accuracy': 0.3840000033378601, 'eval_runtime': 2.2479, 'eval_samples_per_second': 55.608, 'eval_steps_per_second': 7.118, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.2668355703353882, 'eval_accuracy': 0.5839999914169312, 'eval_runtime': 2.2541, 'eval_samples_per_second': 55.454, 'eval_steps_per_second': 7.098, 'epoch': 4.987012987012987}


{'loss': 1.1016, 'grad_norm': 5.533609867095947, 'learning_rate': 9.608463116858542e-06, 'epoch': 5.194805194805195}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.172456979751587, 'eval_accuracy': 0.6880000233650208, 'eval_runtime': 2.2578, 'eval_samples_per_second': 55.364, 'eval_steps_per_second': 7.087, 'epoch': 5.991341991341991}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.1428041458129883, 'eval_accuracy': 0.671999990940094, 'eval_runtime': 2.2549, 'eval_samples_per_second': 55.434, 'eval_steps_per_second': 7.096, 'epoch': 6.909090909090909}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 638.8139, 'train_samples_per_second': 10.125, 'train_steps_per_second': 0.625, 'train_loss': 1.2795267702642839, 'epoch': 6.909090909090909}


{'eval_loss': 1.172456979751587, 'eval_accuracy': 0.6880000233650208, 'eval_runtime': 2.093, 'eval_samples_per_second': 59.723, 'eval_steps_per_second': 7.645, 'epoch': 6.909090909090909}
[2025-05-28 08:25:41] ✅ Config 54: Accuracy=0.6880, Loss=1.1725


[2025-05-28 08:25:41] 
🔬 Testing configuration 55/264
[2025-05-28 08:25:41] Config: LR=5e-05, BS=4, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 639.78 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 655.51 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 735.17 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 768.27 examples/s]


Map:  50%|█████     | 464/924 [00:00<00:00, 827.20 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  62%|██████▏   | 576/924 [00:00<00:00, 869.42 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  76%|███████▌  | 704/924 [00:00<00:00, 941.59 examples/s]


Map:  90%|█████████ | 832/924 [00:00<00:00, 953.44 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 874.89 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 625.23 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 623.15 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.1816513538360596, 'eval_accuracy': 0.6800000071525574, 'eval_runtime': 2.0816, 'eval_samples_per_second': 60.05, 'eval_steps_per_second': 7.686, 'epoch': 0.987012987012987}


{'loss': 1.3777, 'grad_norm': 8.28002643585205, 'learning_rate': 4.222222222222222e-05, 'epoch': 1.7316017316017316}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8149691224098206, 'eval_accuracy': 0.8320000171661377, 'eval_runtime': 2.0747, 'eval_samples_per_second': 60.25, 'eval_steps_per_second': 7.712, 'epoch': 1.9913419913419914}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7957333922386169, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 2.0786, 'eval_samples_per_second': 60.137, 'eval_steps_per_second': 7.698, 'epoch': 2.995670995670996}


{'loss': 0.5821, 'grad_norm': 0.8616101145744324, 'learning_rate': 2.8333333333333335e-05, 'epoch': 3.463203463203463}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7105785608291626, 'eval_accuracy': 0.8880000114440918, 'eval_runtime': 2.079, 'eval_samples_per_second': 60.125, 'eval_steps_per_second': 7.696, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.683097779750824, 'eval_accuracy': 0.8960000276565552, 'eval_runtime': 2.0812, 'eval_samples_per_second': 60.061, 'eval_steps_per_second': 7.688, 'epoch': 4.987012987012987}


{'loss': 0.4648, 'grad_norm': 0.15624889731407166, 'learning_rate': 1.4444444444444444e-05, 'epoch': 5.194805194805195}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6953566670417786, 'eval_accuracy': 0.9039999842643738, 'eval_runtime': 2.0801, 'eval_samples_per_second': 60.092, 'eval_steps_per_second': 7.692, 'epoch': 5.991341991341991}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6914576292037964, 'eval_accuracy': 0.9039999842643738, 'eval_runtime': 2.0811, 'eval_samples_per_second': 60.064, 'eval_steps_per_second': 7.688, 'epoch': 6.909090909090909}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 633.3702, 'train_samples_per_second': 10.212, 'train_steps_per_second': 0.63, 'train_loss': 0.7172460245309318, 'epoch': 6.909090909090909}


{'eval_loss': 0.6953566670417786, 'eval_accuracy': 0.9039999842643738, 'eval_runtime': 2.1472, 'eval_samples_per_second': 58.214, 'eval_steps_per_second': 7.451, 'epoch': 6.909090909090909}
[2025-05-28 08:36:27] ✅ Config 55: Accuracy=0.9040, Loss=0.6954


[2025-05-28 08:36:28] 
🔬 Testing configuration 56/264
[2025-05-28 08:36:28] Config: LR=5e-05, BS=4, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 500.40 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  16%|█▌        | 144/924 [00:00<00:01, 570.64 examples/s]


Map:  26%|██▌       | 240/924 [00:00<00:01, 677.79 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  36%|███▋      | 336/924 [00:00<00:00, 723.76 examples/s]


Map:  47%|████▋     | 432/924 [00:00<00:00, 785.00 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 852.40 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 901.06 examples/s]


Map:  83%|████████▎ | 768/924 [00:00<00:00, 948.06 examples/s]


Map:  97%|█████████▋| 896/924 [00:01<00:00, 975.96 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 849.34 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 638.82 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 636.35 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.5809505581855774, 'eval_accuracy': 0.800000011920929, 'eval_runtime': 2.0319, 'eval_samples_per_second': 61.518, 'eval_steps_per_second': 7.874, 'epoch': 0.987012987012987}


{'loss': 0.8607, 'grad_norm': 2.801311731338501, 'learning_rate': 4.6758892398497494e-05, 'epoch': 1.7316017316017316}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6424252986907959, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 2.0298, 'eval_samples_per_second': 61.583, 'eval_steps_per_second': 7.883, 'epoch': 1.9913419913419914}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.9915028810501099, 'eval_accuracy': 0.8159999847412109, 'eval_runtime': 2.0354, 'eval_samples_per_second': 61.414, 'eval_steps_per_second': 7.861, 'epoch': 2.995670995670996}


{'loss': 0.1044, 'grad_norm': 0.01317436434328556, 'learning_rate': 2.9770224884413623e-05, 'epoch': 3.463203463203463}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8302239179611206, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 2.2569, 'eval_samples_per_second': 55.386, 'eval_steps_per_second': 7.089, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6921902298927307, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 2.0314, 'eval_samples_per_second': 61.535, 'eval_steps_per_second': 7.877, 'epoch': 4.987012987012987}


{'loss': 0.0142, 'grad_norm': 0.0030851084738969803, 'learning_rate': 9.266990223754069e-06, 'epoch': 5.194805194805195}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7250526547431946, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 2.0291, 'eval_samples_per_second': 61.603, 'eval_steps_per_second': 7.885, 'epoch': 5.991341991341991}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7249715328216553, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 2.0282, 'eval_samples_per_second': 61.632, 'eval_steps_per_second': 7.889, 'epoch': 6.909090909090909}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 635.4475, 'train_samples_per_second': 10.179, 'train_steps_per_second': 0.628, 'train_loss': 0.24547888349769706, 'epoch': 6.909090909090909}


{'eval_loss': 0.6921902298927307, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 2.115, 'eval_samples_per_second': 59.102, 'eval_steps_per_second': 7.565, 'epoch': 6.909090909090909}
[2025-05-28 08:47:16] ✅ Config 56: Accuracy=0.8720, Loss=0.6922


[2025-05-28 08:47:16] 
🔬 Testing configuration 57/264
[2025-05-28 08:47:16] Config: LR=5e-05, BS=8, Epochs=5, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 662.25 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 660.18 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 740.03 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 779.01 examples/s]


Map:  52%|█████▏    | 480/924 [00:00<00:00, 841.06 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 879.66 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 956.93 examples/s]


Map:  90%|█████████ | 832/924 [00:00<00:00, 961.98 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 883.74 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 631.36 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 628.41 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.727081835269928, 'eval_accuracy': 0.7519999742507935, 'eval_runtime': 1.6646, 'eval_samples_per_second': 75.095, 'eval_steps_per_second': 4.806, 'epoch': 1.0}


{'loss': 0.8647, 'grad_norm': 10.368999481201172, 'learning_rate': 3.66412213740458e-05, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.5053772330284119, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 1.6627, 'eval_samples_per_second': 75.177, 'eval_steps_per_second': 4.811, 'epoch': 2.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8211243152618408, 'eval_accuracy': 0.8240000009536743, 'eval_runtime': 1.6781, 'eval_samples_per_second': 74.49, 'eval_steps_per_second': 4.767, 'epoch': 3.0}


{'loss': 0.0791, 'grad_norm': 0.023278338834643364, 'learning_rate': 1.7748091603053434e-05, 'epoch': 3.4482758620689653}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6363891959190369, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 1.8957, 'eval_samples_per_second': 65.94, 'eval_steps_per_second': 4.22, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7028023600578308, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 1.6656, 'eval_samples_per_second': 75.049, 'eval_steps_per_second': 4.803, 'epoch': 5.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 366.2717, 'train_samples_per_second': 12.614, 'train_steps_per_second': 0.792, 'train_loss': 0.32828294778692313, 'epoch': 5.0}


{'eval_loss': 0.7028023600578308, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 1.7389, 'eval_samples_per_second': 71.886, 'eval_steps_per_second': 4.601, 'epoch': 5.0}
[2025-05-28 08:53:34] ✅ Config 57: Accuracy=0.8720, Loss=0.7028


[2025-05-28 08:53:34] 
🔬 Testing configuration 58/264
[2025-05-28 08:53:34] Config: LR=5e-05, BS=8, Epochs=5, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 652.82 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 658.17 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 733.47 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 766.12 examples/s]


Map:  50%|█████     | 464/924 [00:00<00:00, 825.80 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  62%|██████▏   | 576/924 [00:00<00:00, 868.43 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  76%|███████▌  | 704/924 [00:00<00:00, 940.72 examples/s]


Map:  90%|█████████ | 832/924 [00:00<00:00, 951.89 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 874.31 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 627.97 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 624.67 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.8595, 'grad_norm': 25.892492294311523, 'learning_rate': 4.9385076120574026e-05, 'epoch': 0.8620689655172413}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.8874667882919312, 'eval_accuracy': 0.1599999964237213, 'eval_runtime': 1.7096, 'eval_samples_per_second': 73.118, 'eval_steps_per_second': 4.68, 'epoch': 1.0}


{'loss': 1.8765, 'grad_norm': 7.232186317443848, 'learning_rate': 4.200169723061035e-05, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.8882277011871338, 'eval_accuracy': 0.2720000147819519, 'eval_runtime': 1.7042, 'eval_samples_per_second': 73.349, 'eval_steps_per_second': 4.694, 'epoch': 2.0}


{'loss': 1.8138, 'grad_norm': 5.82151460647583, 'learning_rate': 2.8665940808500418e-05, 'epoch': 2.586206896551724}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.8055402040481567, 'eval_accuracy': 0.2720000147819519, 'eval_runtime': 1.7024, 'eval_samples_per_second': 73.426, 'eval_steps_per_second': 4.699, 'epoch': 3.0}


{'loss': 1.8039, 'grad_norm': 4.740558624267578, 'learning_rate': 1.4046720541180445e-05, 'epoch': 3.4482758620689653}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.8216224908828735, 'eval_accuracy': 0.06400000303983688, 'eval_runtime': 1.9361, 'eval_samples_per_second': 64.562, 'eval_steps_per_second': 4.132, 'epoch': 4.0}


{'loss': 1.7993, 'grad_norm': 6.023561954498291, 'learning_rate': 3.262297098533501e-06, 'epoch': 4.310344827586207}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.8142919540405273, 'eval_accuracy': 0.06400000303983688, 'eval_runtime': 1.702, 'eval_samples_per_second': 73.445, 'eval_steps_per_second': 4.7, 'epoch': 5.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 383.1368, 'train_samples_per_second': 12.058, 'train_steps_per_second': 1.514, 'train_loss': 1.8259468078613281, 'epoch': 5.0}


{'eval_loss': 1.8882277011871338, 'eval_accuracy': 0.2720000147819519, 'eval_runtime': 1.7534, 'eval_samples_per_second': 71.292, 'eval_steps_per_second': 4.563, 'epoch': 5.0}
[2025-05-28 09:00:09] ✅ Config 58: Accuracy=0.2720, Loss=1.8882


[2025-05-28 09:00:09] 
🔬 Testing configuration 59/264
[2025-05-28 09:00:09] Config: LR=5e-05, BS=8, Epochs=5, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 652.49 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 661.64 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 738.42 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 774.24 examples/s]


Map:  52%|█████▏    | 480/924 [00:00<00:00, 832.69 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 871.86 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 949.23 examples/s]


Map:  90%|█████████ | 832/924 [00:00<00:00, 951.73 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 876.41 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 622.44 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 619.12 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.4614, 'grad_norm': 8.774864196777344, 'learning_rate': 4.608030592734226e-05, 'epoch': 0.8620689655172413}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8078129887580872, 'eval_accuracy': 0.671999990940094, 'eval_runtime': 1.6695, 'eval_samples_per_second': 74.871, 'eval_steps_per_second': 4.792, 'epoch': 1.0}


{'loss': 0.4578, 'grad_norm': 21.972728729248047, 'learning_rate': 3.67112810707457e-05, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6584768891334534, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 1.6589, 'eval_samples_per_second': 75.35, 'eval_steps_per_second': 4.822, 'epoch': 2.0}


{'loss': 0.2155, 'grad_norm': 0.08895846456289291, 'learning_rate': 2.7151051625239004e-05, 'epoch': 2.586206896551724}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.027620553970337, 'eval_accuracy': 0.8240000009536743, 'eval_runtime': 1.6683, 'eval_samples_per_second': 74.929, 'eval_steps_per_second': 4.795, 'epoch': 3.0}


{'loss': 0.1243, 'grad_norm': 42.969154357910156, 'learning_rate': 1.7590822179732312e-05, 'epoch': 3.4482758620689653}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7580826282501221, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 1.8877, 'eval_samples_per_second': 66.217, 'eval_steps_per_second': 4.238, 'epoch': 4.0}


{'loss': 0.0411, 'grad_norm': 0.008745801635086536, 'learning_rate': 8.126195028680688e-06, 'epoch': 4.310344827586207}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8047255873680115, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 1.6869, 'eval_samples_per_second': 74.101, 'eval_steps_per_second': 4.742, 'epoch': 5.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 393.6386, 'train_samples_per_second': 11.737, 'train_steps_per_second': 1.473, 'train_loss': 0.4016359049698402, 'epoch': 5.0}


{'eval_loss': 0.8047255873680115, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 1.7567, 'eval_samples_per_second': 71.154, 'eval_steps_per_second': 4.554, 'epoch': 5.0}
[2025-05-28 09:06:52] ✅ Config 59: Accuracy=0.8480, Loss=0.8047


[2025-05-28 09:06:52] 
🔬 Testing configuration 60/264
[2025-05-28 09:06:52] Config: LR=5e-05, BS=8, Epochs=5, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 634.21 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 632.57 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 711.05 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 738.35 examples/s]


Map:  50%|█████     | 464/924 [00:00<00:00, 790.40 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  62%|██████▏   | 576/924 [00:00<00:00, 830.90 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  76%|███████▌  | 704/924 [00:00<00:00, 901.38 examples/s]


Map:  88%|████████▊ | 816/924 [00:00<00:00, 911.73 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 932.12 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 840.80 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 601.88 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 594.40 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.0786380767822266, 'eval_accuracy': 0.6880000233650208, 'eval_runtime': 1.7435, 'eval_samples_per_second': 71.694, 'eval_steps_per_second': 4.588, 'epoch': 1.0}


{'loss': 1.0656, 'grad_norm': 6.641078472137451, 'learning_rate': 4.147425251900119e-05, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8229538202285767, 'eval_accuracy': 0.8080000281333923, 'eval_runtime': 1.9099, 'eval_samples_per_second': 65.449, 'eval_steps_per_second': 4.189, 'epoch': 2.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.9073670506477356, 'eval_accuracy': 0.8240000009536743, 'eval_runtime': 1.747, 'eval_samples_per_second': 71.553, 'eval_steps_per_second': 4.579, 'epoch': 3.0}


{'loss': 0.4717, 'grad_norm': 0.14263851940631866, 'learning_rate': 1.3463671406432058e-05, 'epoch': 3.4482758620689653}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7963966131210327, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 1.7523, 'eval_samples_per_second': 71.336, 'eval_steps_per_second': 4.566, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7855377793312073, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 1.7535, 'eval_samples_per_second': 71.286, 'eval_steps_per_second': 4.562, 'epoch': 5.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 368.9786, 'train_samples_per_second': 12.521, 'train_steps_per_second': 0.786, 'train_loss': 0.6617126070219895, 'epoch': 5.0}


{'eval_loss': 0.7855377793312073, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 1.8008, 'eval_samples_per_second': 69.413, 'eval_steps_per_second': 4.442, 'epoch': 5.0}
[2025-05-28 09:13:12] ✅ Config 60: Accuracy=0.8560, Loss=0.7855


[2025-05-28 09:13:12] 💾 Saved checkpoint at 60 configurations
[2025-05-28 09:13:12] 
🔬 Testing configuration 61/264
[2025-05-28 09:13:12] Config: LR=5e-05, BS=8, Epochs=7, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 625.84 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 633.37 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 706.64 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 735.87 examples/s]


Map:  50%|█████     | 464/924 [00:00<00:00, 791.13 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  62%|██████▏   | 576/924 [00:00<00:00, 829.18 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  76%|███████▌  | 704/924 [00:00<00:00, 901.41 examples/s]


Map:  88%|████████▊ | 816/924 [00:00<00:00, 910.89 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 928.07 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 838.23 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 593.45 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 591.14 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.5972200632095337, 'eval_accuracy': 0.29600000381469727, 'eval_runtime': 1.7558, 'eval_samples_per_second': 71.191, 'eval_steps_per_second': 4.556, 'epoch': 1.0}


{'loss': 1.455, 'grad_norm': 118.01993560791016, 'learning_rate': 4.250681198910082e-05, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.9782117605209351, 'eval_accuracy': 0.7599999904632568, 'eval_runtime': 2.0125, 'eval_samples_per_second': 62.111, 'eval_steps_per_second': 3.975, 'epoch': 2.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8025847673416138, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 1.744, 'eval_samples_per_second': 71.674, 'eval_steps_per_second': 4.587, 'epoch': 3.0}


{'loss': 0.6559, 'grad_norm': 29.85760498046875, 'learning_rate': 2.888283378746594e-05, 'epoch': 3.4482758620689653}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8177194595336914, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 1.741, 'eval_samples_per_second': 71.797, 'eval_steps_per_second': 4.595, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8052894473075867, 'eval_accuracy': 0.8159999847412109, 'eval_runtime': 1.7529, 'eval_samples_per_second': 71.311, 'eval_steps_per_second': 4.564, 'epoch': 5.0}


{'loss': 0.4851, 'grad_norm': 0.24820204079151154, 'learning_rate': 1.5258855585831064e-05, 'epoch': 5.172413793103448}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7990570068359375, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 2.0058, 'eval_samples_per_second': 62.318, 'eval_steps_per_second': 3.988, 'epoch': 6.0}


{'loss': 0.4409, 'grad_norm': 1.7383105754852295, 'learning_rate': 1.6348773841961852e-06, 'epoch': 6.896551724137931}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7818887233734131, 'eval_accuracy': 0.8320000171661377, 'eval_runtime': 1.7459, 'eval_samples_per_second': 71.595, 'eval_steps_per_second': 4.582, 'epoch': 7.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 528.7913, 'train_samples_per_second': 12.232, 'train_steps_per_second': 0.768, 'train_loss': 0.7542809964400794, 'epoch': 7.0}


{'eval_loss': 0.8177194595336914, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 1.7816, 'eval_samples_per_second': 70.162, 'eval_steps_per_second': 4.49, 'epoch': 7.0}
[2025-05-28 09:22:12] ✅ Config 61: Accuracy=0.8480, Loss=0.8177


[2025-05-28 09:22:13] 
🔬 Testing configuration 62/264
[2025-05-28 09:22:13] Config: LR=5e-05, BS=8, Epochs=7, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 630.92 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 637.05 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 709.53 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 743.02 examples/s]


Map:  50%|█████     | 464/924 [00:00<00:00, 798.67 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  62%|██████▏   | 576/924 [00:00<00:00, 837.89 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  76%|███████▌  | 704/924 [00:00<00:00, 907.23 examples/s]


Map:  88%|████████▊ | 816/924 [00:00<00:00, 911.40 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 930.47 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 842.89 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 604.72 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 601.10 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.2110859155654907, 'eval_accuracy': 0.6639999747276306, 'eval_runtime': 1.7112, 'eval_samples_per_second': 73.048, 'eval_steps_per_second': 4.675, 'epoch': 1.0}


{'loss': 1.2911, 'grad_norm': 57.14178466796875, 'learning_rate': 4.737627598008486e-05, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7205619215965271, 'eval_accuracy': 0.7760000228881836, 'eval_runtime': 1.7009, 'eval_samples_per_second': 73.492, 'eval_steps_per_second': 4.704, 'epoch': 2.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.721549391746521, 'eval_accuracy': 0.8240000009536743, 'eval_runtime': 1.6988, 'eval_samples_per_second': 73.581, 'eval_steps_per_second': 4.709, 'epoch': 3.0}


{'loss': 0.3099, 'grad_norm': 12.513203620910645, 'learning_rate': 3.1246262130101085e-05, 'epoch': 3.4482758620689653}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.9443456530570984, 'eval_accuracy': 0.8320000171661377, 'eval_runtime': 1.6983, 'eval_samples_per_second': 73.602, 'eval_steps_per_second': 4.711, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.9264799952507019, 'eval_accuracy': 0.8240000009536743, 'eval_runtime': 1.7233, 'eval_samples_per_second': 72.533, 'eval_steps_per_second': 4.642, 'epoch': 5.0}


{'loss': 0.0726, 'grad_norm': 0.08377205580472946, 'learning_rate': 1.081193230346865e-05, 'epoch': 5.172413793103448}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.03777277469635, 'eval_accuracy': 0.800000011920929, 'eval_runtime': 1.9666, 'eval_samples_per_second': 63.562, 'eval_steps_per_second': 4.068, 'epoch': 6.0}


{'loss': 0.0666, 'grad_norm': 2.470369815826416, 'learning_rate': 1.7931385625296337e-07, 'epoch': 6.896551724137931}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.084086537361145, 'eval_accuracy': 0.800000011920929, 'eval_runtime': 1.7027, 'eval_samples_per_second': 73.412, 'eval_steps_per_second': 4.698, 'epoch': 7.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 523.639, 'train_samples_per_second': 12.352, 'train_steps_per_second': 0.775, 'train_loss': 0.4286405696657491, 'epoch': 7.0}


{'eval_loss': 0.9443456530570984, 'eval_accuracy': 0.8320000171661377, 'eval_runtime': 1.7531, 'eval_samples_per_second': 71.304, 'eval_steps_per_second': 4.563, 'epoch': 7.0}
[2025-05-28 09:31:07] ✅ Config 62: Accuracy=0.8320, Loss=0.9443


[2025-05-28 09:31:08] 
🔬 Testing configuration 63/264
[2025-05-28 09:31:08] Config: LR=5e-05, BS=8, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 632.78 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 633.77 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 708.69 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 743.21 examples/s]


Map:  50%|█████     | 464/924 [00:00<00:00, 800.92 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  61%|██████    | 560/924 [00:00<00:00, 832.55 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 895.09 examples/s]


Map:  87%|████████▋ | 800/924 [00:00<00:00, 923.73 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  99%|█████████▊| 912/924 [00:01<00:00, 937.34 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 844.57 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 603.58 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 600.83 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8313043713569641, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 1.7395, 'eval_samples_per_second': 71.861, 'eval_steps_per_second': 4.599, 'epoch': 1.0}


{'loss': 1.2328, 'grad_norm': 4.738430976867676, 'learning_rate': 4.237057220708447e-05, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8840992450714111, 'eval_accuracy': 0.8159999847412109, 'eval_runtime': 1.7402, 'eval_samples_per_second': 71.829, 'eval_steps_per_second': 4.597, 'epoch': 2.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8001850843429565, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 2.0133, 'eval_samples_per_second': 62.088, 'eval_steps_per_second': 3.974, 'epoch': 3.0}


{'loss': 0.5747, 'grad_norm': 204.89059448242188, 'learning_rate': 2.8746594005449594e-05, 'epoch': 3.4482758620689653}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7565230131149292, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 1.753, 'eval_samples_per_second': 71.306, 'eval_steps_per_second': 4.564, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7243742942810059, 'eval_accuracy': 0.8880000114440918, 'eval_runtime': 1.7379, 'eval_samples_per_second': 71.928, 'eval_steps_per_second': 4.603, 'epoch': 5.0}


{'loss': 0.4598, 'grad_norm': 0.5476320385932922, 'learning_rate': 1.5122615803814717e-05, 'epoch': 5.172413793103448}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7228861451148987, 'eval_accuracy': 0.9039999842643738, 'eval_runtime': 1.736, 'eval_samples_per_second': 72.005, 'eval_steps_per_second': 4.608, 'epoch': 6.0}


{'loss': 0.4329, 'grad_norm': 0.09131735563278198, 'learning_rate': 1.4986376021798365e-06, 'epoch': 6.896551724137931}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7288817763328552, 'eval_accuracy': 0.8880000114440918, 'eval_runtime': 1.7434, 'eval_samples_per_second': 71.698, 'eval_steps_per_second': 4.589, 'epoch': 7.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 528.9264, 'train_samples_per_second': 12.229, 'train_steps_per_second': 0.768, 'train_loss': 0.6713219258585587, 'epoch': 7.0}


{'eval_loss': 0.7228861451148987, 'eval_accuracy': 0.9039999842643738, 'eval_runtime': 1.8071, 'eval_samples_per_second': 69.17, 'eval_steps_per_second': 4.427, 'epoch': 7.0}
[2025-05-28 09:40:08] ✅ Config 63: Accuracy=0.9040, Loss=0.7229


[2025-05-28 09:40:08] 
🔬 Testing configuration 64/264
[2025-05-28 09:40:08] Config: LR=5e-05, BS=8, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 632.92 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 640.50 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 713.99 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 743.11 examples/s]


Map:  50%|█████     | 464/924 [00:00<00:00, 799.42 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  62%|██████▏   | 576/924 [00:00<00:00, 833.39 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  76%|███████▌  | 704/924 [00:00<00:00, 904.41 examples/s]


Map:  88%|████████▊ | 816/924 [00:00<00:00, 914.19 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 934.91 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 844.60 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 606.06 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 602.26 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.6319835186004639, 'eval_accuracy': 0.4000000059604645, 'eval_runtime': 1.7474, 'eval_samples_per_second': 71.536, 'eval_steps_per_second': 4.578, 'epoch': 1.0}


{'loss': 1.4293, 'grad_norm': 11.838634490966797, 'learning_rate': 4.708261402944035e-05, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8806513547897339, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 2.0072, 'eval_samples_per_second': 62.275, 'eval_steps_per_second': 3.986, 'epoch': 2.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7924494743347168, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 1.7391, 'eval_samples_per_second': 71.878, 'eval_steps_per_second': 4.6, 'epoch': 3.0}


{'loss': 0.5945, 'grad_norm': 4.136491775512695, 'learning_rate': 3.062261790770331e-05, 'epoch': 3.4482758620689653}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8180862665176392, 'eval_accuracy': 0.8320000171661377, 'eval_runtime': 1.7803, 'eval_samples_per_second': 70.211, 'eval_steps_per_second': 4.494, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8014416098594666, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 2.0175, 'eval_samples_per_second': 61.958, 'eval_steps_per_second': 3.965, 'epoch': 5.0}


{'loss': 0.4668, 'grad_norm': 1.37209951877594, 'learning_rate': 1.0288060714619357e-05, 'epoch': 5.172413793103448}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7507480382919312, 'eval_accuracy': 0.8880000114440918, 'eval_runtime': 1.7437, 'eval_samples_per_second': 71.686, 'eval_steps_per_second': 4.588, 'epoch': 6.0}


{'loss': 0.4467, 'grad_norm': 0.2786197364330292, 'learning_rate': 1.1074957966490663e-07, 'epoch': 6.896551724137931}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7529003024101257, 'eval_accuracy': 0.8880000114440918, 'eval_runtime': 1.7431, 'eval_samples_per_second': 71.71, 'eval_steps_per_second': 4.589, 'epoch': 7.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 531.7446, 'train_samples_per_second': 12.164, 'train_steps_per_second': 0.764, 'train_loss': 0.7297708929465909, 'epoch': 7.0}


{'eval_loss': 0.7507480382919312, 'eval_accuracy': 0.8880000114440918, 'eval_runtime': 1.791, 'eval_samples_per_second': 69.793, 'eval_steps_per_second': 4.467, 'epoch': 7.0}
[2025-05-28 09:49:15] ✅ Config 64: Accuracy=0.8880, Loss=0.7507


[2025-05-28 09:49:16] 
🔬 Testing configuration 65/264
[2025-05-28 09:49:16] Config: LR=2e-05, BS=4, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 612.01 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  16%|█▌        | 144/924 [00:00<00:01, 603.85 examples/s]


Map:  26%|██▌       | 240/924 [00:00<00:00, 694.95 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  36%|███▋      | 336/924 [00:00<00:00, 721.45 examples/s]


Map:  47%|████▋     | 432/924 [00:00<00:00, 753.19 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 806.72 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 858.88 examples/s]


Map:  83%|████████▎ | 768/924 [00:00<00:00, 904.22 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 926.61 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 828.80 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 598.53 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 596.66 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 09:49:24] ❌ Fatal error with config 65: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 09:49:24] 
🔬 Testing configuration 66/264
[2025-05-28 09:49:24] Config: LR=5e-05, BS=4, Epochs=5, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 681.09 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 686.38 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 735.87 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 788.82 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 859.52 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  66%|██████▌   | 608/924 [00:00<00:00, 927.76 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  81%|████████▏ | 752/924 [00:00<00:00, 1018.28 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  94%|█████████▎| 864/924 [00:00<00:00, 1018.07 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 916.33 examples/s] 





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 646.99 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 645.26 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 09:49:32] ❌ Fatal error with config 66: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 09:49:32] 
🔬 Testing configuration 67/264
[2025-05-28 09:49:32] Config: LR=1e-05, BS=2, Epochs=7, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 675.78 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 681.87 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 769.31 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 799.28 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 867.90 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 908.63 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 991.44 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 989.43 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 913.18 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 648.44 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 644.81 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.9636294841766357, 'eval_accuracy': 0.07999999821186066, 'eval_runtime': 3.0294, 'eval_samples_per_second': 41.262, 'eval_steps_per_second': 10.563, 'epoch': 0.9956709956709957}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.9336578845977783, 'eval_accuracy': 0.08799999952316284, 'eval_runtime': 3.1534, 'eval_samples_per_second': 39.64, 'eval_steps_per_second': 10.148, 'epoch': 2.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.8144162893295288, 'eval_accuracy': 0.19200000166893005, 'eval_runtime': 3.0393, 'eval_samples_per_second': 41.128, 'eval_steps_per_second': 10.529, 'epoch': 2.995670995670996}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.8327054977416992, 'eval_accuracy': 0.19200000166893005, 'eval_runtime': 3.0008, 'eval_samples_per_second': 41.656, 'eval_steps_per_second': 10.664, 'epoch': 4.0}


{'loss': 1.7793, 'grad_norm': 28.73289680480957, 'learning_rate': 3.307228401232945e-06, 'epoch': 4.329004329004329}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.8244906663894653, 'eval_accuracy': 0.20000000298023224, 'eval_runtime': 3.0703, 'eval_samples_per_second': 40.713, 'eval_steps_per_second': 10.423, 'epoch': 4.995670995670996}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.7942339181900024, 'eval_accuracy': 0.20800000429153442, 'eval_runtime': 2.9703, 'eval_samples_per_second': 42.084, 'eval_steps_per_second': 10.774, 'epoch': 6.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.795328974723816, 'eval_accuracy': 0.2160000056028366, 'eval_runtime': 2.9707, 'eval_samples_per_second': 42.078, 'eval_steps_per_second': 10.772, 'epoch': 6.96969696969697}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 875.0086, 'train_samples_per_second': 7.392, 'train_steps_per_second': 0.92, 'train_loss': 1.6446151069972825, 'epoch': 6.96969696969697}


{'eval_loss': 1.795328974723816, 'eval_accuracy': 0.2160000056028366, 'eval_runtime': 3.0468, 'eval_samples_per_second': 41.026, 'eval_steps_per_second': 10.503, 'epoch': 6.96969696969697}
[2025-05-28 10:04:18] ✅ Config 67: Accuracy=0.2160, Loss=1.7953


[2025-05-28 10:04:19] 
🔬 Testing configuration 68/264
[2025-05-28 10:04:19] Config: LR=3e-05, BS=8, Epochs=10, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 593.09 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 576.81 examples/s]


Map:  24%|██▍       | 224/924 [00:00<00:01, 654.26 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 672.63 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 725.45 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 792.55 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 813.68 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 889.56 examples/s]


Map:  88%|████████▊ | 816/924 [00:01<00:00, 889.93 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 908.77 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 807.52 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 564.95 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 557.36 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.8339, 'grad_norm': 45.973960876464844, 'learning_rate': 2.2696183206106872e-05, 'epoch': 3.4482758620689653}


{'eval_loss': 1.869866967201233, 'eval_accuracy': 0.09600000083446503, 'eval_runtime': 2.4817, 'eval_samples_per_second': 50.368, 'eval_steps_per_second': 3.224, 'epoch': 3.4482758620689653}


{'loss': 1.811, 'grad_norm': 2748.795166015625, 'learning_rate': 1.1283969465648856e-05, 'epoch': 6.896551724137931}


{'eval_loss': 1.8925470113754272, 'eval_accuracy': 0.07199999690055847, 'eval_runtime': 2.4891, 'eval_samples_per_second': 50.22, 'eval_steps_per_second': 3.214, 'epoch': 6.896551724137931}


{'train_runtime': 458.8246, 'train_samples_per_second': 20.138, 'train_steps_per_second': 0.632, 'train_loss': 1.8153465534078663, 'epoch': 10.0}


{'eval_loss': 1.8777575492858887, 'eval_accuracy': 0.07999999821186066, 'eval_runtime': 2.4504, 'eval_samples_per_second': 51.012, 'eval_steps_per_second': 3.265, 'epoch': 10.0}
[2025-05-28 10:12:09] ✅ Config 68: Accuracy=0.0800, Loss=1.8778


[2025-05-28 10:12:09] 
🔬 Testing configuration 69/264
[2025-05-28 10:12:09] Config: LR=5e-05, BS=8, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 641.01 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 652.14 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 736.34 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 771.37 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 835.78 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 877.64 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 971.86 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 976.60 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 888.93 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 627.18 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 624.36 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.6721, 'grad_norm': 18.907724380493164, 'learning_rate': 4.9941080792150166e-05, 'epoch': 0.8620689655172413}


{'eval_loss': 0.7992759943008423, 'eval_accuracy': 0.7680000066757202, 'eval_runtime': 1.7054, 'eval_samples_per_second': 73.295, 'eval_steps_per_second': 4.691, 'epoch': 0.8620689655172413}


{'loss': 0.5597, 'grad_norm': 24.229755401611328, 'learning_rate': 4.696530612642871e-05, 'epoch': 1.7241379310344827}


{'eval_loss': 0.6220511794090271, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 1.7349, 'eval_samples_per_second': 72.051, 'eval_steps_per_second': 4.611, 'epoch': 1.7241379310344827}


{'loss': 0.3412, 'grad_norm': 0.044879283756017685, 'learning_rate': 4.000535701370921e-05, 'epoch': 2.586206896551724}


{'eval_loss': 0.940473735332489, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 1.9798, 'eval_samples_per_second': 63.138, 'eval_steps_per_second': 4.041, 'epoch': 2.586206896551724}


{'loss': 0.223, 'grad_norm': 0.1790693998336792, 'learning_rate': 3.0323662998460393e-05, 'epoch': 3.4482758620689653}


{'eval_loss': 0.9838299751281738, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 1.7414, 'eval_samples_per_second': 71.783, 'eval_steps_per_second': 4.594, 'epoch': 3.4482758620689653}


{'loss': 0.0534, 'grad_norm': 17.011823654174805, 'learning_rate': 1.9676337001539612e-05, 'epoch': 4.310344827586207}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.1676546335220337, 'eval_accuracy': 0.8320000171661377, 'eval_runtime': 1.7699, 'eval_samples_per_second': 70.626, 'eval_steps_per_second': 4.52, 'epoch': 4.310344827586207}


{'loss': 0.0956, 'grad_norm': 0.016226286068558693, 'learning_rate': 1.0080599534772766e-05, 'epoch': 5.172413793103448}


{'eval_loss': 1.0995360612869263, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 1.8193, 'eval_samples_per_second': 68.708, 'eval_steps_per_second': 4.397, 'epoch': 5.172413793103448}


{'loss': 0.0192, 'grad_norm': 0.00578147592023015, 'learning_rate': 3.0861331430900807e-06, 'epoch': 6.0344827586206895}


{'eval_loss': 1.0856099128723145, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 1.7377, 'eval_samples_per_second': 71.932, 'eval_steps_per_second': 4.604, 'epoch': 6.0344827586206895}


{'loss': 0.0362, 'grad_norm': 0.0022364843171089888, 'learning_rate': 6.651089260368004e-08, 'epoch': 6.896551724137931}


{'eval_loss': 1.0909340381622314, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 1.8112, 'eval_samples_per_second': 69.014, 'eval_steps_per_second': 4.417, 'epoch': 6.896551724137931}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 311.5898, 'train_samples_per_second': 20.758, 'train_steps_per_second': 2.606, 'train_loss': 0.36950888323226, 'epoch': 7.0}


{'eval_loss': 1.1676546335220337, 'eval_accuracy': 0.8320000171661377, 'eval_runtime': 1.7783, 'eval_samples_per_second': 70.29, 'eval_steps_per_second': 4.499, 'epoch': 7.0}
[2025-05-28 10:17:31] ✅ Config 69: Accuracy=0.8320, Loss=1.1677


[2025-05-28 10:17:31] 
🔬 Testing configuration 70/264
[2025-05-28 10:17:31] Config: LR=2e-05, BS=2, Epochs=7, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 606.29 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 585.53 examples/s]


Map:  24%|██▍       | 224/924 [00:00<00:01, 662.38 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 680.36 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 732.64 examples/s]


Map:  55%|█████▌    | 512/924 [00:00<00:00, 811.64 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  66%|██████▌   | 608/924 [00:00<00:00, 831.10 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 889.54 examples/s]


Map:  88%|████████▊ | 816/924 [00:01<00:00, 883.82 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 894.47 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 808.63 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 587.51 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 585.01 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:17:40] ❌ Fatal error with config 70: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 10:17:40] 
🔬 Testing configuration 71/264
[2025-05-28 10:17:40] Config: LR=1e-05, BS=4, Epochs=5, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 610.31 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 619.53 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 688.41 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 717.00 examples/s]


Map:  48%|████▊     | 448/924 [00:00<00:00, 761.59 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 810.17 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 845.58 examples/s]


Map:  83%|████████▎ | 768/924 [00:00<00:00, 877.63 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 895.30 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 810.88 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 581.07 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 577.44 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:17:48] ❌ Fatal error with config 71: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 10:17:48] 
🔬 Testing configuration 72/264
[2025-05-28 10:17:48] Config: LR=5e-05, BS=2, Epochs=5, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 653.45 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 660.94 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 739.41 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 777.67 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 844.88 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 886.08 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 981.33 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 984.37 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 896.62 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 636.08 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 632.67 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:17:56] ❌ Fatal error with config 72: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 10:17:56] 
🔬 Testing configuration 73/264
[2025-05-28 10:17:56] Config: LR=5e-05, BS=2, Epochs=10, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 612.84 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 617.63 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 682.76 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 707.84 examples/s]


Map:  48%|████▊     | 448/924 [00:00<00:00, 749.15 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 799.34 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 847.52 examples/s]


Map:  83%|████████▎ | 768/924 [00:00<00:00, 885.30 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 897.41 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 808.89 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 579.46 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 576.34 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:18:05] ❌ Fatal error with config 73: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 10:18:05] 
🔬 Testing configuration 74/264
[2025-05-28 10:18:05] Config: LR=5e-05, BS=8, Epochs=7, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 649.89 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 658.07 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 733.30 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 775.30 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 843.10 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 887.64 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 983.05 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 985.06 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 894.44 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 632.09 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 629.18 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:18:13] ❌ Fatal error with config 74: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 10:18:13] 
🔬 Testing configuration 75/264
[2025-05-28 10:18:13] Config: LR=5e-05, BS=8, Epochs=3, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 641.55 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 646.02 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 722.60 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 757.95 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 822.56 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 862.01 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 955.15 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 960.66 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 875.01 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 621.50 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 615.35 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.8279, 'grad_norm': 254.84201049804688, 'learning_rate': 4.205414012738854e-05, 'epoch': 0.8620689655172413}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.7298283576965332, 'eval_accuracy': 0.23199999332427979, 'eval_runtime': 1.0036, 'eval_samples_per_second': 124.556, 'eval_steps_per_second': 7.972, 'epoch': 1.0}


{'loss': 1.8241, 'grad_norm': 168.505615234375, 'learning_rate': 2.6162420382165603e-05, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.7920705080032349, 'eval_accuracy': 0.17599999904632568, 'eval_runtime': 1.1131, 'eval_samples_per_second': 112.296, 'eval_steps_per_second': 7.187, 'epoch': 2.0}


{'loss': 1.8392, 'grad_norm': 832.5113525390625, 'learning_rate': 1.090636942675159e-05, 'epoch': 2.586206896551724}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.8198124170303345, 'eval_accuracy': 0.15199999511241913, 'eval_runtime': 1.119, 'eval_samples_per_second': 111.712, 'eval_steps_per_second': 7.15, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 176.8439, 'train_samples_per_second': 15.675, 'train_steps_per_second': 0.984, 'train_loss': 1.8301191658809268, 'epoch': 3.0}


{'eval_loss': 1.7298283576965332, 'eval_accuracy': 0.23199999332427979, 'eval_runtime': 1.135, 'eval_samples_per_second': 110.136, 'eval_steps_per_second': 7.049, 'epoch': 3.0}
[2025-05-28 10:21:20] ✅ Config 75: Accuracy=0.2320, Loss=1.7298


[2025-05-28 10:21:20] 
🔬 Testing configuration 76/264
[2025-05-28 10:21:20] Config: LR=1e-05, BS=2, Epochs=7, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 650.73 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 652.78 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 736.66 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 772.18 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 828.44 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 870.78 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 967.32 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 965.13 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 883.09 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 626.43 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 619.36 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:21:29] ❌ Fatal error with config 76: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 10:21:29] 
🔬 Testing configuration 77/264
[2025-05-28 10:21:29] Config: LR=5e-05, BS=8, Epochs=7, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 605.28 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  16%|█▌        | 144/924 [00:00<00:01, 591.59 examples/s]


Map:  26%|██▌       | 240/924 [00:00<00:01, 670.79 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  36%|███▋      | 336/924 [00:00<00:00, 694.85 examples/s]


Map:  47%|████▋     | 432/924 [00:00<00:00, 747.96 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 799.42 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 841.11 examples/s]


Map:  83%|████████▎ | 768/924 [00:00<00:00, 883.34 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 906.51 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 810.76 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 573.96 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 565.56 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:21:37] ❌ Fatal error with config 77: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 10:21:37] 
🔬 Testing configuration 78/264
[2025-05-28 10:21:37] Config: LR=3e-05, BS=8, Epochs=10, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 592.54 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  16%|█▌        | 144/924 [00:00<00:01, 588.67 examples/s]


Map:  26%|██▌       | 240/924 [00:00<00:01, 674.93 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  36%|███▋      | 336/924 [00:00<00:00, 704.03 examples/s]


Map:  47%|████▋     | 432/924 [00:00<00:00, 759.23 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 819.44 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 854.52 examples/s]


Map:  83%|████████▎ | 768/924 [00:00<00:00, 892.05 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 905.03 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 816.10 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 591.33 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 586.82 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:21:46] ❌ Fatal error with config 78: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 10:21:46] 
🔬 Testing configuration 79/264
[2025-05-28 10:21:46] Config: LR=3e-05, BS=16, Epochs=7, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 659.70 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 657.41 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 737.22 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 776.93 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 835.74 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 880.02 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 977.10 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 978.41 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 890.75 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 619.85 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 538.85 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:21:54] ❌ Fatal error with config 79: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 10:21:54] 
🔬 Testing configuration 80/264
[2025-05-28 10:21:54] Config: LR=3e-05, BS=4, Epochs=3, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 612.50 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 622.47 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 690.01 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 721.72 examples/s]


Map:  48%|████▊     | 448/924 [00:00<00:00, 766.91 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  61%|██████    | 560/924 [00:00<00:00, 816.34 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  73%|███████▎  | 672/924 [00:00<00:00, 859.86 examples/s]


Map:  85%|████████▍ | 784/924 [00:00<00:00, 900.56 examples/s]


Map:  97%|█████████▋| 896/924 [00:01<00:00, 917.36 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 822.31 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 586.70 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 581.92 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:22:02] ❌ Fatal error with config 80: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 10:22:02] 
🔬 Testing configuration 81/264
[2025-05-28 10:22:02] Config: LR=1e-05, BS=4, Epochs=3, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 655.16 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 661.59 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 742.71 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 780.56 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 847.71 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 888.01 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 983.98 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 988.01 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 898.47 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 627.00 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 621.37 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:22:10] ❌ Fatal error with config 81: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 10:22:10] 
🔬 Testing configuration 82/264
[2025-05-28 10:22:10] Config: LR=1e-05, BS=4, Epochs=5, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 576.12 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 567.81 examples/s]


Map:  24%|██▍       | 224/924 [00:00<00:01, 646.52 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 672.62 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 727.09 examples/s]


Map:  55%|█████▌    | 512/924 [00:00<00:00, 802.33 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  66%|██████▌   | 608/924 [00:00<00:00, 828.37 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  80%|███████▉  | 736/924 [00:00<00:00, 902.65 examples/s]


Map:  90%|█████████ | 832/924 [00:01<00:00, 896.79 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 809.91 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 562.72 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 579.33 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 571.21 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:22:19] ❌ Fatal error with config 82: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 10:22:19] 
🔬 Testing configuration 83/264
[2025-05-28 10:22:19] Config: LR=5e-05, BS=8, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 607.88 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 583.65 examples/s]


Map:  24%|██▍       | 224/924 [00:00<00:01, 656.37 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 676.81 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 732.76 examples/s]


Map:  55%|█████▌    | 512/924 [00:00<00:00, 810.60 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  68%|██████▊   | 624/924 [00:00<00:00, 843.73 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  80%|███████▉  | 736/924 [00:00<00:00, 900.60 examples/s]


Map:  90%|█████████ | 832/924 [00:01<00:00, 899.18 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 816.79 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 588.56 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 582.99 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:22:27] ❌ Fatal error with config 83: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 10:22:27] 
🔬 Testing configuration 84/264
[2025-05-28 10:22:27] Config: LR=2e-05, BS=2, Epochs=3, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 650.38 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 650.76 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 717.70 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 740.29 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  50%|█████     | 464/924 [00:00<00:00, 812.41 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  62%|██████▏   | 576/924 [00:00<00:00, 862.11 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  76%|███████▌  | 704/924 [00:00<00:00, 953.23 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 971.54 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 877.96 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 628.44 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 627.25 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:22:35] ❌ Fatal error with config 84: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 10:22:35] 
🔬 Testing configuration 85/264
[2025-05-28 10:22:35] Config: LR=2e-05, BS=8, Epochs=3, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 609.92 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  16%|█▌        | 144/924 [00:00<00:01, 602.81 examples/s]


Map:  26%|██▌       | 240/924 [00:00<00:01, 681.76 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  36%|███▋      | 336/924 [00:00<00:00, 707.46 examples/s]


Map:  47%|████▋     | 432/924 [00:00<00:00, 758.06 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 816.10 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 856.22 examples/s]


Map:  83%|████████▎ | 768/924 [00:00<00:00, 893.74 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 907.60 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 819.68 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 573.98 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 571.84 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:22:44] ❌ Fatal error with config 85: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 10:22:44] 
🔬 Testing configuration 86/264
[2025-05-28 10:22:44] Config: LR=3e-05, BS=2, Epochs=7, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 641.79 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 643.86 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 725.32 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 763.91 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 832.09 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 872.79 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 967.70 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 971.80 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 882.02 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 624.82 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 620.66 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:22:52] ❌ Fatal error with config 86: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 10:22:52] 
🔬 Testing configuration 87/264
[2025-05-28 10:22:52] Config: LR=2e-05, BS=4, Epochs=7, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 654.42 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 656.92 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 741.52 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 779.67 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 844.05 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 886.14 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 980.73 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 981.28 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 894.96 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 634.14 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 628.47 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:23:00] ❌ Fatal error with config 87: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 10:23:00] 
🔬 Testing configuration 88/264
[2025-05-28 10:23:00] Config: LR=2e-05, BS=4, Epochs=5, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 605.25 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  16%|█▌        | 144/924 [00:00<00:01, 590.93 examples/s]


Map:  26%|██▌       | 240/924 [00:00<00:01, 669.59 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  36%|███▋      | 336/924 [00:00<00:00, 696.34 examples/s]


Map:  47%|████▋     | 432/924 [00:00<00:00, 744.21 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 800.71 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 838.61 examples/s]


Map:  83%|████████▎ | 768/924 [00:00<00:00, 878.12 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 896.97 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 806.77 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 579.85 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 574.51 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:23:08] ❌ Fatal error with config 88: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 10:23:08] 
🔬 Testing configuration 89/264
[2025-05-28 10:23:08] Config: LR=2e-05, BS=8, Epochs=10, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 600.39 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  16%|█▌        | 144/924 [00:00<00:01, 593.31 examples/s]


Map:  26%|██▌       | 240/924 [00:00<00:01, 674.45 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  36%|███▋      | 336/924 [00:00<00:00, 699.41 examples/s]


Map:  47%|████▋     | 432/924 [00:00<00:00, 754.32 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 810.09 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 850.36 examples/s]


Map:  83%|████████▎ | 768/924 [00:00<00:00, 890.22 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 908.44 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 815.19 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 585.85 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 581.76 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:23:17] ❌ Fatal error with config 89: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 10:23:17] 
🔬 Testing configuration 90/264
[2025-05-28 10:23:17] Config: LR=3e-05, BS=2, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 641.38 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 643.23 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 723.32 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 755.96 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  50%|█████     | 464/924 [00:00<00:00, 818.02 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  62%|██████▏   | 576/924 [00:00<00:00, 863.43 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  76%|███████▌  | 704/924 [00:00<00:00, 949.14 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 971.13 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 879.97 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 616.98 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 612.97 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:23:25] ❌ Fatal error with config 90: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 10:23:25] 
🔬 Testing configuration 91/264
[2025-05-28 10:23:25] Config: LR=3e-05, BS=4, Epochs=3, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 603.07 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  16%|█▌        | 144/924 [00:00<00:01, 592.28 examples/s]


Map:  26%|██▌       | 240/924 [00:00<00:01, 670.92 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  36%|███▋      | 336/924 [00:00<00:00, 698.44 examples/s]


Map:  47%|████▋     | 432/924 [00:00<00:00, 749.80 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 807.42 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 847.87 examples/s]


Map:  83%|████████▎ | 768/924 [00:00<00:00, 889.84 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 910.00 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 814.67 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 585.05 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 580.36 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:23:34] ❌ Fatal error with config 91: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 10:23:34] 
🔬 Testing configuration 92/264
[2025-05-28 10:23:34] Config: LR=5e-05, BS=2, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 652.82 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 658.91 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 739.34 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 779.32 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 841.65 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 873.27 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 967.56 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 963.22 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 884.94 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 627.81 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 621.07 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:23:42] ❌ Fatal error with config 92: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 10:23:42] 
🔬 Testing configuration 93/264
[2025-05-28 10:23:42] Config: LR=3e-05, BS=8, Epochs=7, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 589.61 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  16%|█▌        | 144/924 [00:00<00:01, 586.00 examples/s]


Map:  26%|██▌       | 240/924 [00:00<00:01, 664.78 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  36%|███▋      | 336/924 [00:00<00:00, 696.82 examples/s]


Map:  47%|████▋     | 432/924 [00:00<00:00, 754.26 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 808.54 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 846.50 examples/s]


Map:  83%|████████▎ | 768/924 [00:00<00:00, 887.83 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 905.86 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 811.79 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 566.22 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 573.19 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 566.69 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:23:50] ❌ Fatal error with config 93: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 10:23:50] 
🔬 Testing configuration 94/264
[2025-05-28 10:23:50] Config: LR=2e-05, BS=8, Epochs=5, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 604.87 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  16%|█▌        | 144/924 [00:00<00:01, 593.13 examples/s]


Map:  26%|██▌       | 240/924 [00:00<00:01, 676.24 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  36%|███▋      | 336/924 [00:00<00:00, 699.82 examples/s]


Map:  47%|████▋     | 432/924 [00:00<00:00, 756.20 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 814.27 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 857.85 examples/s]


Map:  83%|████████▎ | 768/924 [00:00<00:00, 895.66 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 913.67 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 819.23 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 578.82 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 575.46 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:23:59] ❌ Fatal error with config 94: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 10:23:59] 
🔬 Testing configuration 95/264
[2025-05-28 10:23:59] Config: LR=2e-05, BS=8, Epochs=3, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 646.62 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 650.43 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 732.10 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 771.46 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 839.37 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 881.05 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 974.80 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 977.66 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 889.49 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 628.92 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 624.38 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:24:07] ❌ Fatal error with config 95: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 10:24:07] 
🔬 Testing configuration 96/264
[2025-05-28 10:24:07] Config: LR=1e-05, BS=8, Epochs=5, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 662.22 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 667.09 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 744.80 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 779.37 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 843.38 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 885.41 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 977.28 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 977.95 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 895.37 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 623.95 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 625.47 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:24:16] ❌ Fatal error with config 96: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 10:24:16] 
🔬 Testing configuration 97/264
[2025-05-28 10:24:16] Config: LR=2e-05, BS=4, Epochs=5, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 647.81 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 652.12 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 730.05 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 765.19 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 821.70 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 859.63 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 952.98 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 956.95 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 874.71 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 621.77 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 619.18 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 0.8947, 'grad_norm': 30.1794376373291, 'learning_rate': 1.4474708171206227e-05, 'epoch': 1.7316017316017316}


{'eval_loss': 0.7529435157775879, 'eval_accuracy': 0.7440000176429749, 'eval_runtime': 1.9384, 'eval_samples_per_second': 64.488, 'eval_steps_per_second': 8.254, 'epoch': 1.7316017316017316}


{'loss': 0.0707, 'grad_norm': 0.04981311038136482, 'learning_rate': 6.692607003891051e-06, 'epoch': 3.463203463203463}


{'eval_loss': 0.5243954062461853, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 1.9418, 'eval_samples_per_second': 64.372, 'eval_steps_per_second': 8.24, 'epoch': 3.463203463203463}


{'train_runtime': 242.7996, 'train_samples_per_second': 19.028, 'train_steps_per_second': 1.174, 'train_loss': 0.34030109863532215, 'epoch': 4.935064935064935}


{'eval_loss': 0.46992313861846924, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 1.9701, 'eval_samples_per_second': 63.449, 'eval_steps_per_second': 8.121, 'epoch': 4.935064935064935}
[2025-05-28 10:28:29] ✅ Config 97: Accuracy=0.8640, Loss=0.4699


[2025-05-28 10:28:29] 
🔬 Testing configuration 98/264
[2025-05-28 10:28:29] Config: LR=1e-05, BS=16, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 600.40 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 605.91 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 680.56 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 704.62 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  48%|████▊     | 448/924 [00:00<00:00, 746.58 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  61%|██████    | 560/924 [00:00<00:00, 792.19 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  73%|███████▎  | 672/924 [00:00<00:00, 836.74 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  85%|████████▍ | 784/924 [00:00<00:00, 880.66 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  97%|█████████▋| 896/924 [00:01<00:00, 896.42 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 803.29 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 588.75 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 580.16 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:28:38] ❌ Fatal error with config 98: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 10:28:38] 
🔬 Testing configuration 99/264
[2025-05-28 10:28:38] Config: LR=5e-05, BS=16, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 646.75 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 652.77 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 736.38 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 775.97 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 840.38 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 880.12 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 965.43 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 969.48 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 886.10 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 616.30 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 614.52 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.9152299165725708, 'eval_accuracy': 0.23199999332427979, 'eval_runtime': 1.2773, 'eval_samples_per_second': 97.862, 'eval_steps_per_second': 3.132, 'epoch': 1.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.9452508687973022, 'eval_accuracy': 0.2720000147819519, 'eval_runtime': 1.3744, 'eval_samples_per_second': 90.95, 'eval_steps_per_second': 2.91, 'epoch': 2.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.7745269536972046, 'eval_accuracy': 0.2720000147819519, 'eval_runtime': 1.2736, 'eval_samples_per_second': 98.149, 'eval_steps_per_second': 3.141, 'epoch': 3.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.8044010400772095, 'eval_accuracy': 0.29600000381469727, 'eval_runtime': 1.3575, 'eval_samples_per_second': 92.084, 'eval_steps_per_second': 2.947, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.817431926727295, 'eval_accuracy': 0.06400000303983688, 'eval_runtime': 1.2696, 'eval_samples_per_second': 98.457, 'eval_steps_per_second': 3.151, 'epoch': 5.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.7980468273162842, 'eval_accuracy': 0.2720000147819519, 'eval_runtime': 1.3345, 'eval_samples_per_second': 93.67, 'eval_steps_per_second': 2.997, 'epoch': 6.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.8031091690063477, 'eval_accuracy': 0.2720000147819519, 'eval_runtime': 1.2732, 'eval_samples_per_second': 98.174, 'eval_steps_per_second': 3.142, 'epoch': 7.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 396.5339, 'train_samples_per_second': 16.311, 'train_steps_per_second': 1.024, 'train_loss': 1.816009521484375, 'epoch': 7.0}


{'eval_loss': 1.8044010400772095, 'eval_accuracy': 0.29600000381469727, 'eval_runtime': 1.2862, 'eval_samples_per_second': 97.186, 'eval_steps_per_second': 3.11, 'epoch': 7.0}
[2025-05-28 10:35:25] ✅ Config 99: Accuracy=0.2960, Loss=1.8044


[2025-05-28 10:35:25] 
🔬 Testing configuration 100/264
[2025-05-28 10:35:25] Config: LR=3e-05, BS=16, Epochs=3, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 651.86 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 653.42 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 737.44 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 769.61 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 832.62 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 871.21 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 963.96 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 964.28 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 877.70 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 625.65 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 620.27 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:35:34] ❌ Fatal error with config 100: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 10:35:34] 
🔬 Testing configuration 101/264
[2025-05-28 10:35:34] Config: LR=2e-05, BS=8, Epochs=10, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 604.09 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 589.62 examples/s]


Map:  24%|██▍       | 224/924 [00:00<00:01, 657.93 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 677.10 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 729.83 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 792.50 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 810.90 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 890.43 examples/s]


Map:  88%|████████▊ | 816/924 [00:01<00:00, 890.79 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 906.89 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 809.27 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 582.70 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 576.25 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 0.1945, 'grad_norm': 0.03813252970576286, 'learning_rate': 2.982068965517241e-06, 'epoch': 8.620689655172415}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.1069207191467285, 'eval_accuracy': 0.800000011920929, 'eval_runtime': 1.7254, 'eval_samples_per_second': 72.445, 'eval_steps_per_second': 4.636, 'epoch': 8.620689655172415}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 499.2766, 'train_samples_per_second': 18.507, 'train_steps_per_second': 1.162, 'train_loss': 0.16766855260678407, 'epoch': 10.0}


{'eval_loss': 1.1069207191467285, 'eval_accuracy': 0.800000011920929, 'eval_runtime': 2.0267, 'eval_samples_per_second': 61.678, 'eval_steps_per_second': 3.947, 'epoch': 10.0}
[2025-05-28 10:44:04] ✅ Config 101: Accuracy=0.8000, Loss=1.1069


[2025-05-28 10:44:04] 
🔬 Testing configuration 102/264
[2025-05-28 10:44:04] Config: LR=1e-05, BS=4, Epochs=5, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 605.05 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  16%|█▌        | 144/924 [00:00<00:01, 594.11 examples/s]


Map:  26%|██▌       | 240/924 [00:00<00:01, 669.63 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  36%|███▋      | 336/924 [00:00<00:00, 695.18 examples/s]


Map:  47%|████▋     | 432/924 [00:00<00:00, 748.90 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 806.10 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 843.77 examples/s]


Map:  83%|████████▎ | 768/924 [00:00<00:00, 882.60 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 900.87 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 810.12 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 565.18 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 570.05 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 563.90 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:44:13] ❌ Fatal error with config 102: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 10:44:13] 
🔬 Testing configuration 103/264
[2025-05-28 10:44:13] Config: LR=2e-05, BS=8, Epochs=5, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 643.90 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 658.92 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 740.13 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 774.83 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 834.96 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 874.25 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 971.17 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 979.59 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 888.28 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 633.20 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 625.98 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.2576, 'grad_norm': 6.804218292236328, 'learning_rate': 1.3275172413793104e-05, 'epoch': 1.7241379310344827}


{'eval_loss': 0.8483648896217346, 'eval_accuracy': 0.8159999847412109, 'eval_runtime': 1.8239, 'eval_samples_per_second': 68.536, 'eval_steps_per_second': 4.386, 'epoch': 1.7241379310344827}


{'loss': 0.5339, 'grad_norm': 2.4490039348602295, 'learning_rate': 6.413103448275863e-06, 'epoch': 3.4482758620689653}


{'eval_loss': 0.7642329931259155, 'eval_accuracy': 0.8320000171661377, 'eval_runtime': 1.8369, 'eval_samples_per_second': 68.051, 'eval_steps_per_second': 4.355, 'epoch': 3.4482758620689653}


{'train_runtime': 157.5833, 'train_samples_per_second': 29.318, 'train_steps_per_second': 0.92, 'train_loss': 0.7555032664331897, 'epoch': 5.0}


{'eval_loss': 0.7070320248603821, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 1.8259, 'eval_samples_per_second': 68.461, 'eval_steps_per_second': 4.382, 'epoch': 5.0}
[2025-05-28 10:47:01] ✅ Config 103: Accuracy=0.8560, Loss=0.7070


[2025-05-28 10:47:01] 
🔬 Testing configuration 104/264
[2025-05-28 10:47:01] Config: LR=2e-05, BS=4, Epochs=7, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 648.41 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 650.68 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 736.27 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 773.36 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 842.80 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 876.37 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 969.63 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 977.82 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 884.47 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 611.43 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 615.40 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:47:10] ❌ Fatal error with config 104: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 10:47:10] 
🔬 Testing configuration 105/264
[2025-05-28 10:47:10] Config: LR=2e-05, BS=2, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 613.55 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  16%|█▌        | 144/924 [00:00<00:01, 587.97 examples/s]


Map:  26%|██▌       | 240/924 [00:00<00:01, 650.70 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  36%|███▋      | 336/924 [00:00<00:00, 686.21 examples/s]


Map:  47%|████▋     | 432/924 [00:00<00:00, 745.48 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 806.09 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 843.50 examples/s]


Map:  83%|████████▎ | 768/924 [00:00<00:00, 879.38 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 898.33 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 805.66 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 576.84 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 575.16 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:47:18] ❌ Fatal error with config 105: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 10:47:18] 
🔬 Testing configuration 106/264
[2025-05-28 10:47:18] Config: LR=3e-05, BS=8, Epochs=5, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 603.28 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  16%|█▌        | 144/924 [00:00<00:01, 594.16 examples/s]


Map:  26%|██▌       | 240/924 [00:00<00:01, 668.58 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  36%|███▋      | 336/924 [00:00<00:00, 694.42 examples/s]


Map:  47%|████▋     | 432/924 [00:00<00:00, 749.43 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 803.72 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 842.81 examples/s]


Map:  83%|████████▎ | 768/924 [00:00<00:00, 879.20 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 899.63 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 808.90 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 584.46 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 582.49 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:47:26] ❌ Fatal error with config 106: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 10:47:26] 
🔬 Testing configuration 107/264
[2025-05-28 10:47:26] Config: LR=1e-05, BS=4, Epochs=10, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 612.11 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  16%|█▌        | 144/924 [00:00<00:01, 598.35 examples/s]


Map:  26%|██▌       | 240/924 [00:00<00:01, 675.14 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  36%|███▋      | 336/924 [00:00<00:00, 704.12 examples/s]


Map:  47%|████▋     | 432/924 [00:00<00:00, 756.72 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 815.84 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 847.80 examples/s]


Map:  83%|████████▎ | 768/924 [00:00<00:00, 873.96 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 897.92 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 812.05 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 580.21 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 574.48 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:47:35] ❌ Fatal error with config 107: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 10:47:35] 
🔬 Testing configuration 108/264
[2025-05-28 10:47:35] Config: LR=1e-05, BS=8, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 651.68 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 657.43 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 740.09 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 772.95 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 838.35 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 879.10 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 973.32 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 970.97 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 887.02 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 633.05 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 628.98 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:47:43] ❌ Fatal error with config 108: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 10:47:43] 
🔬 Testing configuration 109/264
[2025-05-28 10:47:43] Config: LR=5e-05, BS=2, Epochs=7, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 603.41 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  16%|█▌        | 144/924 [00:00<00:01, 592.67 examples/s]


Map:  26%|██▌       | 240/924 [00:00<00:01, 674.63 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  36%|███▋      | 336/924 [00:00<00:00, 700.74 examples/s]


Map:  47%|████▋     | 432/924 [00:00<00:00, 755.02 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 810.10 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 847.37 examples/s]


Map:  83%|████████▎ | 768/924 [00:00<00:00, 881.51 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 900.70 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 812.08 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 564.24 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 576.80 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 569.45 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:47:52] ❌ Fatal error with config 109: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 10:47:52] 
🔬 Testing configuration 110/264
[2025-05-28 10:47:52] Config: LR=1e-05, BS=8, Epochs=7, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 606.38 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  16%|█▌        | 144/924 [00:00<00:01, 598.23 examples/s]


Map:  26%|██▌       | 240/924 [00:00<00:01, 671.00 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  36%|███▋      | 336/924 [00:00<00:00, 702.25 examples/s]


Map:  47%|████▋     | 432/924 [00:00<00:00, 757.53 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 815.55 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 857.82 examples/s]


Map:  83%|████████▎ | 768/924 [00:00<00:00, 890.91 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 906.13 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 816.85 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 591.59 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 587.42 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:48:00] ❌ Fatal error with config 110: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 10:48:00] 
🔬 Testing configuration 111/264
[2025-05-28 10:48:00] Config: LR=1e-05, BS=16, Epochs=3, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 657.84 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 662.45 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 744.75 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 783.48 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 851.38 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 893.55 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 987.32 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 983.36 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 899.05 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 627.04 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 627.21 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:48:09] ❌ Fatal error with config 111: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 10:48:09] 
🔬 Testing configuration 112/264
[2025-05-28 10:48:09] Config: LR=1e-05, BS=2, Epochs=7, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 651.19 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 656.55 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 741.32 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 779.50 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 829.58 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 873.46 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 969.59 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 971.48 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 887.93 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 624.50 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 623.51 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:48:17] ❌ Fatal error with config 112: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 10:48:17] 
🔬 Testing configuration 113/264
[2025-05-28 10:48:17] Config: LR=3e-05, BS=16, Epochs=3, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 651.27 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 660.90 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 742.85 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 781.54 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 845.25 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 881.72 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 977.38 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 980.08 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 894.06 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 631.27 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 629.97 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:48:25] ❌ Fatal error with config 113: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 10:48:25] 
🔬 Testing configuration 114/264
[2025-05-28 10:48:25] Config: LR=1e-05, BS=8, Epochs=7, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 645.07 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 645.91 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 729.74 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 760.63 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  50%|█████     | 464/924 [00:00<00:00, 821.69 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  62%|██████▏   | 576/924 [00:00<00:00, 873.26 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  76%|███████▌  | 704/924 [00:00<00:00, 963.61 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 979.44 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 886.63 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 637.33 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 634.74 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:48:34] ❌ Fatal error with config 114: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 10:48:34] 
🔬 Testing configuration 115/264
[2025-05-28 10:48:34] Config: LR=3e-05, BS=4, Epochs=3, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 649.39 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 658.18 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 740.77 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 774.43 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 840.21 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 883.81 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 977.78 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 979.99 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 892.74 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 625.85 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 624.61 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.8376, 'grad_norm': 8.728951454162598, 'learning_rate': 2.475779220779221e-05, 'epoch': 0.8658008658008658}


{'eval_loss': 1.7628381252288818, 'eval_accuracy': 0.24799999594688416, 'eval_runtime': 2.2703, 'eval_samples_per_second': 55.058, 'eval_steps_per_second': 7.047, 'epoch': 0.8658008658008658}


{'loss': 1.5838, 'grad_norm': 8.604828834533691, 'learning_rate': 1.5050000000000002e-05, 'epoch': 1.7316017316017316}


{'eval_loss': 1.1551718711853027, 'eval_accuracy': 0.7279999852180481, 'eval_runtime': 2.3375, 'eval_samples_per_second': 53.476, 'eval_steps_per_second': 6.845, 'epoch': 1.7316017316017316}


{'loss': 0.7836, 'grad_norm': 5.557013034820557, 'learning_rate': 5.342207792207792e-06, 'epoch': 2.5974025974025974}


{'eval_loss': 0.7741317749023438, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 2.2837, 'eval_samples_per_second': 54.736, 'eval_steps_per_second': 7.006, 'epoch': 2.5974025974025974}


{'train_runtime': 153.3324, 'train_samples_per_second': 18.078, 'train_steps_per_second': 1.115, 'train_loss': 1.3103427775422034, 'epoch': 2.961038961038961}


{'eval_loss': 0.7687239050865173, 'eval_accuracy': 0.8320000171661377, 'eval_runtime': 2.274, 'eval_samples_per_second': 54.968, 'eval_steps_per_second': 7.036, 'epoch': 2.961038961038961}
[2025-05-28 10:51:18] ✅ Config 115: Accuracy=0.8320, Loss=0.7687


[2025-05-28 10:51:18] 
🔬 Testing configuration 116/264
[2025-05-28 10:51:18] Config: LR=3e-05, BS=8, Epochs=5, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 606.40 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  16%|█▌        | 144/924 [00:00<00:01, 596.95 examples/s]


Map:  26%|██▌       | 240/924 [00:00<00:01, 672.98 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  36%|███▋      | 336/924 [00:00<00:00, 700.95 examples/s]


Map:  47%|████▋     | 432/924 [00:00<00:00, 742.03 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 797.22 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 836.05 examples/s]


Map:  83%|████████▎ | 768/924 [00:00<00:00, 876.03 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 893.77 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 805.32 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 568.94 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 567.89 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:51:26] ❌ Fatal error with config 116: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 10:51:26] 
🔬 Testing configuration 117/264
[2025-05-28 10:51:26] Config: LR=5e-05, BS=2, Epochs=7, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 638.31 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 646.56 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 726.23 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 756.37 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  50%|█████     | 464/924 [00:00<00:00, 820.91 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  62%|██████▏   | 576/924 [00:00<00:00, 867.20 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  76%|███████▌  | 704/924 [00:00<00:00, 952.32 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  88%|████████▊ | 816/924 [00:00<00:00, 957.51 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 984.51 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 877.21 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 632.34 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 629.65 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:51:35] ❌ Fatal error with config 117: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 10:51:35] 
🔬 Testing configuration 118/264
[2025-05-28 10:51:35] Config: LR=5e-05, BS=16, Epochs=7, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 648.95 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 651.53 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 734.20 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 763.86 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  50%|█████     | 464/924 [00:00<00:00, 824.53 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  62%|██████▏   | 576/924 [00:00<00:00, 872.00 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  76%|███████▌  | 704/924 [00:00<00:00, 959.53 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 967.78 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 880.80 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 630.99 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 623.14 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 10:51:43] ❌ Fatal error with config 118: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 10:51:43] 
🔬 Testing configuration 119/264
[2025-05-28 10:51:43] Config: LR=1e-05, BS=2, Epochs=10, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 637.88 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 644.72 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 720.67 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 752.31 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  50%|█████     | 464/924 [00:00<00:00, 818.03 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  62%|██████▏   | 576/924 [00:00<00:00, 870.50 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  76%|███████▌  | 704/924 [00:00<00:00, 959.02 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 977.41 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 882.71 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 632.80 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 629.76 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.8058, 'grad_norm': 388.2376708984375, 'learning_rate': 6.295238095238095e-06, 'epoch': 4.329004329004329}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.7344844341278076, 'eval_accuracy': 0.2240000069141388, 'eval_runtime': 3.034, 'eval_samples_per_second': 41.2, 'eval_steps_per_second': 10.547, 'epoch': 4.329004329004329}


{'loss': 1.2275, 'grad_norm': 76.19937896728516, 'learning_rate': 1.5523809523809525e-06, 'epoch': 8.658008658008658}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6972571015357971, 'eval_accuracy': 0.7599999904632568, 'eval_runtime': 2.9841, 'eval_samples_per_second': 41.888, 'eval_steps_per_second': 10.723, 'epoch': 8.658008658008658}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 1002.2747, 'train_samples_per_second': 9.219, 'train_steps_per_second': 1.147, 'train_loss': 1.4058577031674593, 'epoch': 9.956709956709958}


{'eval_loss': 0.6972571015357971, 'eval_accuracy': 0.7599999904632568, 'eval_runtime': 3.0483, 'eval_samples_per_second': 41.006, 'eval_steps_per_second': 10.498, 'epoch': 9.956709956709958}
[2025-05-28 11:08:37] ✅ Config 119: Accuracy=0.7600, Loss=0.6973


[2025-05-28 11:08:37] 
🔬 Testing configuration 120/264
[2025-05-28 11:08:37] Config: LR=3e-05, BS=8, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 610.58 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  16%|█▌        | 144/924 [00:00<00:01, 598.68 examples/s]


Map:  26%|██▌       | 240/924 [00:00<00:01, 680.30 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  36%|███▋      | 336/924 [00:00<00:00, 705.23 examples/s]


Map:  47%|████▋     | 432/924 [00:00<00:00, 758.44 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 815.30 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 857.51 examples/s]


Map:  83%|████████▎ | 768/924 [00:00<00:00, 896.43 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 913.69 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 821.10 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 592.23 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 587.19 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 11:08:46] ❌ Fatal error with config 120: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 11:08:46] 
🔬 Testing configuration 121/264
[2025-05-28 11:08:46] Config: LR=1e-05, BS=4, Epochs=10, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 604.26 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  16%|█▌        | 144/924 [00:00<00:01, 596.06 examples/s]


Map:  26%|██▌       | 240/924 [00:00<00:01, 676.78 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  36%|███▋      | 336/924 [00:00<00:00, 702.10 examples/s]


Map:  47%|████▋     | 432/924 [00:00<00:00, 755.72 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 811.90 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 852.12 examples/s]


Map:  83%|████████▎ | 768/924 [00:00<00:00, 889.86 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 904.61 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 815.27 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 577.25 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 573.16 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 11:08:54] ❌ Fatal error with config 121: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 11:08:54] 
🔬 Testing configuration 122/264
[2025-05-28 11:08:54] Config: LR=5e-05, BS=4, Epochs=7, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 646.17 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 647.83 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 715.42 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 759.97 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 829.96 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 873.33 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 968.10 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 972.20 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 882.18 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 634.08 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 630.27 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.911, 'grad_norm': 32.42012405395508, 'learning_rate': 4.9632331891719824e-05, 'epoch': 0.4329004329004329}


{'loss': 1.8629, 'grad_norm': 7.768592834472656, 'learning_rate': 4.833659574779124e-05, 'epoch': 0.8658008658008658}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.945509910583496, 'eval_accuracy': 0.06400000303983688, 'eval_runtime': 2.8086, 'eval_samples_per_second': 44.505, 'eval_steps_per_second': 5.697, 'epoch': 0.9956709956709957}


{'loss': 1.8484, 'grad_norm': 5.554926872253418, 'learning_rate': 4.6155117981928854e-05, 'epoch': 1.2987012987012987}


{'loss': 1.8317, 'grad_norm': 9.186683654785156, 'learning_rate': 4.3170696688397546e-05, 'epoch': 1.7316017316017316}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.9022423028945923, 'eval_accuracy': 0.06400000303983688, 'eval_runtime': 2.301, 'eval_samples_per_second': 54.323, 'eval_steps_per_second': 6.953, 'epoch': 2.0}


{'loss': 1.7999, 'grad_norm': 4.00697135925293, 'learning_rate': 3.949660572194612e-05, 'epoch': 2.1645021645021645}


{'loss': 1.8164, 'grad_norm': 3.7769596576690674, 'learning_rate': 3.527229538316371e-05, 'epoch': 2.5974025974025974}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.8305621147155762, 'eval_accuracy': 0.2720000147819519, 'eval_runtime': 2.3172, 'eval_samples_per_second': 53.945, 'eval_steps_per_second': 6.905, 'epoch': 2.995670995670996}


{'loss': 1.8086, 'grad_norm': 3.608640193939209, 'learning_rate': 3.06580995755666e-05, 'epoch': 3.0303030303030303}


{'loss': 1.8037, 'grad_norm': 4.1544084548950195, 'learning_rate': 2.5829150324524353e-05, 'epoch': 3.463203463203463}


{'loss': 1.8091, 'grad_norm': 4.494022846221924, 'learning_rate': 2.096873063275424e-05, 'epoch': 3.896103896103896}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.9126170873641968, 'eval_accuracy': 0.06400000303983688, 'eval_runtime': 2.2374, 'eval_samples_per_second': 55.867, 'eval_steps_per_second': 7.151, 'epoch': 4.0}


{'loss': 1.801, 'grad_norm': 3.548490285873413, 'learning_rate': 1.6261317965075645e-05, 'epoch': 4.329004329004329}


{'loss': 1.8065, 'grad_norm': 3.5547780990600586, 'learning_rate': 1.1885582397297549e-05, 'epoch': 4.761904761904762}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.8963793516159058, 'eval_accuracy': 0.06400000303983688, 'eval_runtime': 2.3257, 'eval_samples_per_second': 53.748, 'eval_steps_per_second': 6.88, 'epoch': 4.995670995670996}


{'loss': 1.8118, 'grad_norm': 3.6008265018463135, 'learning_rate': 8.007605184837166e-06, 'epoch': 5.194805194805195}


{'loss': 1.798, 'grad_norm': 3.9813270568847656, 'learning_rate': 4.7745751406263165e-06, 'epoch': 5.627705627705628}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.8378446102142334, 'eval_accuracy': 0.06400000303983688, 'eval_runtime': 2.2384, 'eval_samples_per_second': 55.844, 'eval_steps_per_second': 7.148, 'epoch': 6.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 526.4507, 'train_samples_per_second': 12.286, 'train_steps_per_second': 1.529, 'train_loss': 1.8220674699175066, 'epoch': 6.0}


{'eval_loss': 1.8305621147155762, 'eval_accuracy': 0.2720000147819519, 'eval_runtime': 2.3369, 'eval_samples_per_second': 53.489, 'eval_steps_per_second': 6.847, 'epoch': 6.0}
[2025-05-28 11:17:52] ✅ Config 122: Accuracy=0.2720, Loss=1.8306


[2025-05-28 11:17:52] 
🔬 Testing configuration 123/264
[2025-05-28 11:17:52] Config: LR=2e-05, BS=8, Epochs=10, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 610.23 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  16%|█▌        | 144/924 [00:00<00:01, 600.06 examples/s]


Map:  26%|██▌       | 240/924 [00:00<00:01, 679.41 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  36%|███▋      | 336/924 [00:00<00:00, 705.86 examples/s]


Map:  47%|████▋     | 432/924 [00:00<00:00, 759.09 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 815.17 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 851.00 examples/s]


Map:  83%|████████▎ | 768/924 [00:00<00:00, 887.59 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 906.51 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 817.68 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 564.61 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 581.22 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 573.32 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 11:18:01] ❌ Fatal error with config 123: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 11:18:01] 
🔬 Testing configuration 124/264
[2025-05-28 11:18:01] Config: LR=5e-05, BS=8, Epochs=3, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 648.85 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 657.57 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 741.47 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 780.63 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 845.66 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 887.72 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 980.76 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 982.30 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 895.34 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 629.22 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 626.87 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 11:18:09] ❌ Fatal error with config 124: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 11:18:09] 
🔬 Testing configuration 125/264
[2025-05-28 11:18:09] Config: LR=3e-05, BS=2, Epochs=10, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 615.78 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 593.83 examples/s]


Map:  24%|██▍       | 224/924 [00:00<00:01, 662.45 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 681.93 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 736.09 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 797.22 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 816.25 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 891.82 examples/s]


Map:  88%|████████▊ | 816/924 [00:01<00:00, 889.13 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 909.11 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 812.80 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 589.35 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 583.25 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 11:18:17] ❌ Fatal error with config 125: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 11:18:17] 
🔬 Testing configuration 126/264
[2025-05-28 11:18:17] Config: LR=3e-05, BS=2, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 650.50 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 649.52 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 726.38 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 766.48 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 835.87 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 878.87 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 971.49 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 974.19 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 886.57 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 618.42 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 614.76 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 11:18:25] ❌ Fatal error with config 126: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 11:18:25] 
🔬 Testing configuration 127/264
[2025-05-28 11:18:25] Config: LR=3e-05, BS=4, Epochs=7, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 599.94 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  16%|█▌        | 144/924 [00:00<00:01, 588.64 examples/s]


Map:  26%|██▌       | 240/924 [00:00<00:01, 673.30 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  36%|███▋      | 336/924 [00:00<00:00, 699.84 examples/s]


Map:  47%|████▋     | 432/924 [00:00<00:00, 754.03 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 810.93 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 846.34 examples/s]


Map:  83%|████████▎ | 768/924 [00:00<00:00, 883.17 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 904.09 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 812.39 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 590.04 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 585.89 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 11:18:34] ❌ Fatal error with config 127: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 11:18:34] 
🔬 Testing configuration 128/264
[2025-05-28 11:18:34] Config: LR=3e-05, BS=2, Epochs=7, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 660.17 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 663.43 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 744.67 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 768.12 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  50%|█████     | 464/924 [00:00<00:00, 830.74 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  62%|██████▏   | 576/924 [00:00<00:00, 877.39 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  76%|███████▌  | 704/924 [00:00<00:00, 961.40 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 974.94 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 888.87 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 625.54 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 622.15 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 11:18:42] ❌ Fatal error with config 128: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 11:18:42] 
🔬 Testing configuration 129/264
[2025-05-28 11:18:42] Config: LR=3e-05, BS=16, Epochs=10, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 653.16 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 660.90 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 735.65 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 770.91 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 834.32 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 877.84 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 973.23 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 974.18 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 887.66 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 618.20 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 613.14 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 11:18:50] ❌ Fatal error with config 129: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 11:18:50] 
🔬 Testing configuration 130/264
[2025-05-28 11:18:50] Config: LR=2e-05, BS=4, Epochs=7, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 655.20 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 661.63 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 742.91 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 778.46 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 845.89 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 884.13 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 975.30 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 978.45 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 892.93 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 628.09 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 626.15 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 11:18:59] ❌ Fatal error with config 130: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 11:18:59] 
🔬 Testing configuration 131/264
[2025-05-28 11:18:59] Config: LR=2e-05, BS=2, Epochs=3, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 645.29 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 649.84 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 730.15 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 769.96 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 838.82 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 876.35 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 973.55 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 975.02 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 887.17 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 623.24 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 617.42 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8340567946434021, 'eval_accuracy': 0.7760000228881836, 'eval_runtime': 2.4452, 'eval_samples_per_second': 51.12, 'eval_steps_per_second': 13.087, 'epoch': 1.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7093632817268372, 'eval_accuracy': 0.8320000171661377, 'eval_runtime': 2.5796, 'eval_samples_per_second': 48.456, 'eval_steps_per_second': 12.405, 'epoch': 2.0}


{'loss': 0.4972, 'grad_norm': 0.02027253806591034, 'learning_rate': 5.7569985569985565e-06, 'epoch': 2.1645021645021645}


{'eval_loss': 0.7530741691589355, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 2.5044, 'eval_samples_per_second': 49.913, 'eval_steps_per_second': 12.778, 'epoch': 3.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 410.2353, 'train_samples_per_second': 6.757, 'train_steps_per_second': 1.689, 'train_loss': 0.36367063226465884, 'epoch': 3.0}


{'eval_loss': 0.7530741691589355, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 2.4268, 'eval_samples_per_second': 51.508, 'eval_steps_per_second': 13.186, 'epoch': 3.0}
[2025-05-28 11:26:00] ✅ Config 131: Accuracy=0.8720, Loss=0.7531


[2025-05-28 11:26:01] 
🔬 Testing configuration 132/264
[2025-05-28 11:26:01] Config: LR=5e-05, BS=2, Epochs=10, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 610.13 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  16%|█▌        | 144/924 [00:00<00:01, 602.64 examples/s]


Map:  26%|██▌       | 240/924 [00:00<00:01, 682.93 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  36%|███▋      | 336/924 [00:00<00:00, 702.20 examples/s]


Map:  47%|████▋     | 432/924 [00:00<00:00, 748.73 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 803.27 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 844.02 examples/s]


Map:  83%|████████▎ | 768/924 [00:00<00:00, 882.99 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 903.78 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 813.59 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 587.53 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 585.42 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 11:26:15] ❌ Fatal error with config 132: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 11:26:15] 
🔬 Testing configuration 133/264
[2025-05-28 11:26:15] Config: LR=2e-05, BS=16, Epochs=5, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 630.79 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 641.09 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 727.79 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 766.14 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 828.50 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 872.43 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 968.66 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 975.89 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 882.93 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 631.35 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 627.64 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.8421, 'grad_norm': 34.65287780761719, 'learning_rate': 1.8632824427480918e-05, 'epoch': 0.8620689655172413}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.7348073720932007, 'eval_accuracy': 0.3199999928474426, 'eval_runtime': 1.3106, 'eval_samples_per_second': 95.373, 'eval_steps_per_second': 3.052, 'epoch': 1.0}


{'loss': 1.7488, 'grad_norm': 10.624924659729004, 'learning_rate': 1.4987022900763362e-05, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.632911205291748, 'eval_accuracy': 0.3840000033378601, 'eval_runtime': 1.3682, 'eval_samples_per_second': 91.364, 'eval_steps_per_second': 2.924, 'epoch': 2.0}


{'loss': 1.3662, 'grad_norm': 9.660399436950684, 'learning_rate': 1.1189312977099238e-05, 'epoch': 2.586206896551724}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.9690183997154236, 'eval_accuracy': 0.7120000123977661, 'eval_runtime': 1.2916, 'eval_samples_per_second': 96.78, 'eval_steps_per_second': 3.097, 'epoch': 3.0}


{'loss': 0.8026, 'grad_norm': 26.200807571411133, 'learning_rate': 7.467557251908398e-06, 'epoch': 3.4482758620689653}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8039767742156982, 'eval_accuracy': 0.8159999847412109, 'eval_runtime': 1.2624, 'eval_samples_per_second': 99.014, 'eval_steps_per_second': 3.168, 'epoch': 4.0}


{'loss': 0.6485, 'grad_norm': 12.512643814086914, 'learning_rate': 3.6698473282442756e-06, 'epoch': 4.310344827586207}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7919473648071289, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 1.2958, 'eval_samples_per_second': 96.464, 'eval_steps_per_second': 3.087, 'epoch': 5.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 290.7306, 'train_samples_per_second': 15.891, 'train_steps_per_second': 0.997, 'train_loss': 1.1881836003270643, 'epoch': 5.0}


{'eval_loss': 0.7919473648071289, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 1.3504, 'eval_samples_per_second': 92.567, 'eval_steps_per_second': 2.962, 'epoch': 5.0}
[2025-05-28 11:31:16] ✅ Config 133: Accuracy=0.8400, Loss=0.7919


[2025-05-28 11:31:16] 
🔬 Testing configuration 134/264
[2025-05-28 11:31:16] Config: LR=5e-05, BS=4, Epochs=3, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 605.79 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  16%|█▌        | 144/924 [00:00<00:01, 588.94 examples/s]


Map:  26%|██▌       | 240/924 [00:00<00:01, 661.98 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  36%|███▋      | 336/924 [00:00<00:00, 689.26 examples/s]


Map:  47%|████▋     | 432/924 [00:00<00:00, 746.90 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 806.81 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 849.87 examples/s]


Map:  83%|████████▎ | 768/924 [00:00<00:00, 891.37 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 909.17 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 812.26 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 576.93 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 575.98 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 11:31:25] ❌ Fatal error with config 134: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 11:31:25] 
🔬 Testing configuration 135/264
[2025-05-28 11:31:25] Config: LR=1e-05, BS=8, Epochs=10, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 608.35 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  16%|█▌        | 144/924 [00:00<00:01, 593.73 examples/s]


Map:  26%|██▌       | 240/924 [00:00<00:01, 674.85 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  36%|███▋      | 336/924 [00:00<00:00, 700.18 examples/s]


Map:  47%|████▋     | 432/924 [00:00<00:00, 754.61 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 814.32 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 855.65 examples/s]


Map:  83%|████████▎ | 768/924 [00:00<00:00, 887.65 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 907.90 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 815.72 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 576.37 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 570.99 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 11:31:34] ❌ Fatal error with config 135: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 11:31:34] 
🔬 Testing configuration 136/264
[2025-05-28 11:31:34] Config: LR=1e-05, BS=8, Epochs=5, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 600.48 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 582.77 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 641.55 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 667.99 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 720.83 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 786.70 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 804.04 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 879.20 examples/s]


Map:  88%|████████▊ | 816/924 [00:01<00:00, 877.25 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 898.29 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 800.62 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 562.09 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 573.68 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 566.24 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 11:31:42] ❌ Fatal error with config 136: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 11:31:42] 
🔬 Testing configuration 137/264
[2025-05-28 11:31:42] Config: LR=5e-05, BS=8, Epochs=3, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 615.52 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  16%|█▌        | 144/924 [00:00<00:01, 595.59 examples/s]


Map:  26%|██▌       | 240/924 [00:00<00:01, 673.11 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  36%|███▋      | 336/924 [00:00<00:00, 698.22 examples/s]


Map:  47%|████▋     | 432/924 [00:00<00:00, 752.11 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 807.95 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 847.32 examples/s]


Map:  83%|████████▎ | 768/924 [00:00<00:00, 884.05 examples/s]


Map:  94%|█████████▎| 864/924 [00:01<00:00, 891.39 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 807.82 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 590.03 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 586.46 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 11:31:51] ❌ Fatal error with config 137: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 11:31:51] 
🔬 Testing configuration 138/264
[2025-05-28 11:31:51] Config: LR=2e-05, BS=2, Epochs=3, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 606.27 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  16%|█▌        | 144/924 [00:00<00:01, 592.45 examples/s]


Map:  26%|██▌       | 240/924 [00:00<00:01, 667.36 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  36%|███▋      | 336/924 [00:00<00:00, 692.30 examples/s]


Map:  47%|████▋     | 432/924 [00:00<00:00, 746.43 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 803.52 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 842.29 examples/s]


Map:  83%|████████▎ | 768/924 [00:00<00:00, 879.90 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 897.37 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 807.26 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 575.33 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 569.48 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.8943, 'grad_norm': 23.9207706451416, 'learning_rate': 1.8693434343434346e-05, 'epoch': 0.21645021645021645}


{'loss': 1.9145, 'grad_norm': 108.72705841064453, 'learning_rate': 1.725764790764791e-05, 'epoch': 0.4329004329004329}


{'loss': 1.8681, 'grad_norm': 26.958953857421875, 'learning_rate': 1.5821861471861474e-05, 'epoch': 0.6493506493506493}


{'loss': 1.8753, 'grad_norm': 27.479684829711914, 'learning_rate': 1.438607503607504e-05, 'epoch': 0.8658008658008658}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 2.046093463897705, 'eval_accuracy': 0.2720000147819519, 'eval_runtime': 3.1009, 'eval_samples_per_second': 40.31, 'eval_steps_per_second': 10.319, 'epoch': 1.0}


{'loss': 1.8533, 'grad_norm': 23.14568328857422, 'learning_rate': 1.2950288600288603e-05, 'epoch': 1.0822510822510822}


{'loss': 1.8751, 'grad_norm': 15.57221794128418, 'learning_rate': 1.1514502164502169e-05, 'epoch': 1.2987012987012987}


{'loss': 1.8657, 'grad_norm': 14.240145683288574, 'learning_rate': 1.0078715728715731e-05, 'epoch': 1.5151515151515151}


{'loss': 1.8337, 'grad_norm': 17.91193962097168, 'learning_rate': 8.642929292929295e-06, 'epoch': 1.7316017316017316}


{'loss': 1.7937, 'grad_norm': 10.454656600952148, 'learning_rate': 7.207142857142857e-06, 'epoch': 1.948051948051948}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.7827138900756836, 'eval_accuracy': 0.2720000147819519, 'eval_runtime': 3.3319, 'eval_samples_per_second': 37.516, 'eval_steps_per_second': 9.604, 'epoch': 2.0}


{'loss': 1.7899, 'grad_norm': 16.6182804107666, 'learning_rate': 5.771356421356422e-06, 'epoch': 2.1645021645021645}


{'loss': 1.8153, 'grad_norm': 18.864032745361328, 'learning_rate': 4.335569985569986e-06, 'epoch': 2.380952380952381}


{'loss': 1.7937, 'grad_norm': 17.678178787231445, 'learning_rate': 2.899783549783549e-06, 'epoch': 2.5974025974025974}


{'loss': 1.7947, 'grad_norm': 13.814842224121094, 'learning_rate': 1.4639971139971148e-06, 'epoch': 2.813852813852814}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.7217507362365723, 'eval_accuracy': 0.2720000147819519, 'eval_runtime': 3.2949, 'eval_samples_per_second': 37.937, 'eval_steps_per_second': 9.712, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 497.7481, 'train_samples_per_second': 5.569, 'train_steps_per_second': 2.785, 'train_loss': 1.8421282087053572, 'epoch': 3.0}


{'eval_loss': 2.046093463897705, 'eval_accuracy': 0.2720000147819519, 'eval_runtime': 3.102, 'eval_samples_per_second': 40.297, 'eval_steps_per_second': 10.316, 'epoch': 3.0}
[2025-05-28 11:40:21] ✅ Config 138: Accuracy=0.2720, Loss=2.0461


[2025-05-28 11:40:21] 
🔬 Testing configuration 139/264
[2025-05-28 11:40:21] Config: LR=1e-05, BS=2, Epochs=3, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 647.56 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 652.30 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 736.02 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 773.80 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 838.74 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 878.56 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 961.48 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 969.67 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 885.60 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 622.42 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 623.34 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 11:40:30] ❌ Fatal error with config 139: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 11:40:30] 
🔬 Testing configuration 140/264
[2025-05-28 11:40:30] Config: LR=1e-05, BS=2, Epochs=7, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 646.18 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 651.16 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 731.88 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 770.32 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 833.71 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 876.26 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 969.48 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  95%|█████████▌| 880/924 [00:01<00:00, 974.68 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 880.36 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 622.34 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 616.61 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 11:40:38] ❌ Fatal error with config 140: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 11:40:38] 
🔬 Testing configuration 141/264
[2025-05-28 11:40:38] Config: LR=1e-05, BS=2, Epochs=5, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 601.61 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  16%|█▌        | 144/924 [00:00<00:01, 593.04 examples/s]


Map:  26%|██▌       | 240/924 [00:00<00:01, 673.17 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  36%|███▋      | 336/924 [00:00<00:00, 695.89 examples/s]


Map:  47%|████▋     | 432/924 [00:00<00:00, 748.98 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 804.60 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 844.95 examples/s]


Map:  83%|████████▎ | 768/924 [00:00<00:00, 884.01 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 903.05 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 810.82 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 587.20 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 582.15 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.7908, 'grad_norm': 12.402758598327637, 'learning_rate': 8.596491228070176e-06, 'epoch': 0.4329004329004329}


{'eval_loss': 1.807339072227478, 'eval_accuracy': 0.17599999904632568, 'eval_runtime': 3.6002, 'eval_samples_per_second': 34.72, 'eval_steps_per_second': 8.888, 'epoch': 0.4329004329004329}


{'loss': 1.4854, 'grad_norm': 15.11390209197998, 'learning_rate': 9.197297297297298e-06, 'epoch': 0.8658008658008658}


{'eval_loss': 1.4581719636917114, 'eval_accuracy': 0.4399999976158142, 'eval_runtime': 3.6733, 'eval_samples_per_second': 34.03, 'eval_steps_per_second': 8.712, 'epoch': 0.8658008658008658}


{'loss': 0.8759, 'grad_norm': 13.25808048248291, 'learning_rate': 8.260810810810811e-06, 'epoch': 1.2987012987012987}


{'eval_loss': 0.8408477902412415, 'eval_accuracy': 0.8240000009536743, 'eval_runtime': 3.6118, 'eval_samples_per_second': 34.609, 'eval_steps_per_second': 8.86, 'epoch': 1.2987012987012987}


{'loss': 0.6013, 'grad_norm': 29.685558319091797, 'learning_rate': 7.305212355212355e-06, 'epoch': 1.7316017316017316}


{'eval_loss': 0.8405997157096863, 'eval_accuracy': 0.800000011920929, 'eval_runtime': 3.6089, 'eval_samples_per_second': 34.637, 'eval_steps_per_second': 8.867, 'epoch': 1.7316017316017316}


{'loss': 0.5635, 'grad_norm': 0.8281629085540771, 'learning_rate': 6.349613899613899e-06, 'epoch': 2.1645021645021645}


{'eval_loss': 0.6868783235549927, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 3.6233, 'eval_samples_per_second': 34.499, 'eval_steps_per_second': 8.832, 'epoch': 2.1645021645021645}


{'loss': 0.4785, 'grad_norm': 1.8393968343734741, 'learning_rate': 5.413127413127413e-06, 'epoch': 2.5974025974025974}


{'eval_loss': 0.6967641711235046, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 3.6027, 'eval_samples_per_second': 34.696, 'eval_steps_per_second': 8.882, 'epoch': 2.5974025974025974}


{'loss': 0.4687, 'grad_norm': 1.1855756044387817, 'learning_rate': 4.457528957528958e-06, 'epoch': 3.0303030303030303}


{'eval_loss': 0.7003533244132996, 'eval_accuracy': 0.8960000276565552, 'eval_runtime': 3.6074, 'eval_samples_per_second': 34.651, 'eval_steps_per_second': 8.871, 'epoch': 3.0303030303030303}


{'loss': 0.4424, 'grad_norm': 0.4052824378013611, 'learning_rate': 3.5019305019305016e-06, 'epoch': 3.463203463203463}


{'eval_loss': 0.6766861081123352, 'eval_accuracy': 0.8880000114440918, 'eval_runtime': 3.6098, 'eval_samples_per_second': 34.628, 'eval_steps_per_second': 8.865, 'epoch': 3.463203463203463}


{'loss': 0.4397, 'grad_norm': 0.4870437979698181, 'learning_rate': 2.5463320463320455e-06, 'epoch': 3.896103896103896}


{'eval_loss': 0.6961532235145569, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 3.6035, 'eval_samples_per_second': 34.688, 'eval_steps_per_second': 8.88, 'epoch': 3.896103896103896}


{'loss': 0.4284, 'grad_norm': 0.288696825504303, 'learning_rate': 1.590733590733591e-06, 'epoch': 4.329004329004329}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6697181463241577, 'eval_accuracy': 0.9039999842643738, 'eval_runtime': 3.5834, 'eval_samples_per_second': 34.883, 'eval_steps_per_second': 8.93, 'epoch': 4.329004329004329}


{'loss': 0.4298, 'grad_norm': 0.7291847467422485, 'learning_rate': 6.35135135135135e-07, 'epoch': 4.761904761904762}


{'eval_loss': 0.6767755746841431, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 3.8375, 'eval_samples_per_second': 32.573, 'eval_steps_per_second': 8.339, 'epoch': 4.761904761904762}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 571.8956, 'train_samples_per_second': 8.078, 'train_steps_per_second': 1.005, 'train_loss': 0.7148323689336362, 'epoch': 4.978354978354979}


{'eval_loss': 0.6697181463241577, 'eval_accuracy': 0.9039999842643738, 'eval_runtime': 3.6657, 'eval_samples_per_second': 34.1, 'eval_steps_per_second': 8.73, 'epoch': 4.978354978354979}
[2025-05-28 11:50:22] ✅ Config 141: Accuracy=0.9040, Loss=0.6697


[2025-05-28 11:50:23] 
🔬 Testing configuration 142/264
[2025-05-28 11:50:23] Config: LR=5e-05, BS=2, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 653.28 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 658.19 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 739.85 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 774.10 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 836.59 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 872.90 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 965.28 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 969.16 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 885.94 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 606.33 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 609.95 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 11:50:31] ❌ Fatal error with config 142: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 11:50:31] 
🔬 Testing configuration 143/264
[2025-05-28 11:50:31] Config: LR=2e-05, BS=2, Epochs=5, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 608.90 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  16%|█▌        | 144/924 [00:00<00:01, 595.92 examples/s]


Map:  26%|██▌       | 240/924 [00:00<00:01, 676.51 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  36%|███▋      | 336/924 [00:00<00:00, 700.35 examples/s]


Map:  47%|████▋     | 432/924 [00:00<00:00, 754.05 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 811.15 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 851.55 examples/s]


Map:  83%|████████▎ | 768/924 [00:00<00:00, 889.19 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 909.05 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 816.29 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 586.87 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 583.04 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.8608, 'grad_norm': 109.65992736816406, 'learning_rate': 1.8e-05, 'epoch': 0.21645021645021645}


{'eval_loss': 1.8216692209243774, 'eval_accuracy': 0.08799999952316284, 'eval_runtime': 3.5905, 'eval_samples_per_second': 34.814, 'eval_steps_per_second': 8.912, 'epoch': 0.21645021645021645}


{'loss': 1.9097, 'grad_norm': 19719.728515625, 'learning_rate': 1.9920074411012158e-05, 'epoch': 0.4329004329004329}


{'eval_loss': 2.5997984409332275, 'eval_accuracy': 0.11999999731779099, 'eval_runtime': 3.6283, 'eval_samples_per_second': 34.452, 'eval_steps_per_second': 8.82, 'epoch': 0.4329004329004329}


{'loss': 1.919, 'grad_norm': 18.33827018737793, 'learning_rate': 1.9641247592464757e-05, 'epoch': 0.6493506493506493}


{'eval_loss': 1.763559103012085, 'eval_accuracy': 0.2720000147819519, 'eval_runtime': 3.6026, 'eval_samples_per_second': 34.697, 'eval_steps_per_second': 8.882, 'epoch': 0.6493506493506493}


{'loss': 1.8515, 'grad_norm': 11.898788452148438, 'learning_rate': 1.9167921953165827e-05, 'epoch': 0.8658008658008658}


{'eval_loss': 1.8295891284942627, 'eval_accuracy': 0.20000000298023224, 'eval_runtime': 3.5893, 'eval_samples_per_second': 34.826, 'eval_steps_per_second': 8.916, 'epoch': 0.8658008658008658}


{'loss': 1.9015, 'grad_norm': 117.77918243408203, 'learning_rate': 1.8509646182513922e-05, 'epoch': 1.0822510822510822}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.830143690109253, 'eval_accuracy': 0.29600000381469727, 'eval_runtime': 3.5978, 'eval_samples_per_second': 34.744, 'eval_steps_per_second': 8.894, 'epoch': 1.0822510822510822}


{'loss': 1.8771, 'grad_norm': 9.224651336669922, 'learning_rate': 1.7679700082722738e-05, 'epoch': 1.2987012987012987}


{'eval_loss': 1.8408962488174438, 'eval_accuracy': 0.14399999380111694, 'eval_runtime': 3.8044, 'eval_samples_per_second': 32.857, 'eval_steps_per_second': 8.411, 'epoch': 1.2987012987012987}


{'loss': 1.8298, 'grad_norm': 8.447275161743164, 'learning_rate': 1.669482666719884e-05, 'epoch': 1.5151515151515151}


{'eval_loss': 1.7960224151611328, 'eval_accuracy': 0.29600000381469727, 'eval_runtime': 3.7029, 'eval_samples_per_second': 33.758, 'eval_steps_per_second': 8.642, 'epoch': 1.5151515151515151}


{'loss': 1.8202, 'grad_norm': 11.828084945678711, 'learning_rate': 1.5574894393428856e-05, 'epoch': 1.7316017316017316}


{'eval_loss': 1.824683666229248, 'eval_accuracy': 0.18400000035762787, 'eval_runtime': 3.6539, 'eval_samples_per_second': 34.21, 'eval_steps_per_second': 8.758, 'epoch': 1.7316017316017316}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 295.0602, 'train_samples_per_second': 15.658, 'train_steps_per_second': 7.829, 'train_loss': 1.8711891174316406, 'epoch': 1.7316017316017316}


{'eval_loss': 1.830143690109253, 'eval_accuracy': 0.29600000381469727, 'eval_runtime': 3.6745, 'eval_samples_per_second': 34.019, 'eval_steps_per_second': 8.709, 'epoch': 1.7316017316017316}
[2025-05-28 11:55:39] ✅ Config 143: Accuracy=0.2960, Loss=1.8301


[2025-05-28 11:55:39] 
🔬 Testing configuration 144/264
[2025-05-28 11:55:39] Config: LR=1e-05, BS=4, Epochs=5, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 654.99 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 664.34 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 742.80 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 775.55 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 845.14 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 884.44 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 979.28 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 980.23 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 893.54 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 641.25 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 635.94 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 11:55:48] ❌ Fatal error with config 144: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 11:55:48] 
🔬 Testing configuration 145/264
[2025-05-28 11:55:48] Config: LR=2e-05, BS=16, Epochs=5, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 657.20 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 665.58 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 743.24 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 783.62 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 851.35 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 893.82 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 987.91 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 990.71 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 902.24 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 634.21 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 631.39 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 11:55:56] ❌ Fatal error with config 145: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 11:55:56] 
🔬 Testing configuration 146/264
[2025-05-28 11:55:56] Config: LR=3e-05, BS=8, Epochs=5, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 655.96 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 663.59 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 746.24 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 786.28 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 853.14 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 886.70 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 969.79 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 968.85 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 891.92 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 633.72 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 630.96 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 11:56:04] ❌ Fatal error with config 146: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 11:56:04] 
🔬 Testing configuration 147/264
[2025-05-28 11:56:04] Config: LR=2e-05, BS=2, Epochs=10, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 656.25 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 660.57 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 742.75 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 781.15 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 847.02 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 889.01 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 985.66 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 989.21 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 899.22 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 634.30 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 631.23 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 11:56:12] ❌ Fatal error with config 147: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 11:56:12] 
🔬 Testing configuration 148/264
[2025-05-28 11:56:12] Config: LR=5e-05, BS=2, Epochs=3, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  10%|█         | 96/924 [00:00<00:01, 691.61 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  21%|██        | 192/924 [00:00<00:00, 766.16 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  31%|███       | 288/924 [00:00<00:00, 815.64 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  43%|████▎     | 400/924 [00:00<00:00, 861.46 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  57%|█████▋    | 528/924 [00:00<00:00, 971.32 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  69%|██████▉   | 640/924 [00:00<00:00, 996.12 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  85%|████████▍ | 784/924 [00:00<00:00, 1059.23 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  99%|█████████▊| 912/924 [00:00<00:00, 1083.11 examples/s]


Map: 100%|██████████| 924/924 [00:00<00:00, 975.50 examples/s] 





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 683.01 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 685.40 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 11:56:20] ❌ Fatal error with config 148: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 11:56:20] 
🔬 Testing configuration 149/264
[2025-05-28 11:56:20] Config: LR=2e-05, BS=2, Epochs=3, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 663.42 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 676.82 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 756.01 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 795.10 examples/s]


Map:  52%|█████▏    | 480/924 [00:00<00:00, 856.64 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 894.18 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 973.86 examples/s]


Map:  90%|█████████ | 832/924 [00:00<00:00, 980.25 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 899.64 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 632.46 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 630.31 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 11:56:28] ❌ Fatal error with config 149: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 11:56:28] 
🔬 Testing configuration 150/264
[2025-05-28 11:56:28] Config: LR=5e-05, BS=8, Epochs=3, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 665.62 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 677.60 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 757.63 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 792.05 examples/s]


Map:  52%|█████▏    | 480/924 [00:00<00:00, 856.77 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 891.86 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 974.70 examples/s]


Map:  92%|█████████▏| 848/924 [00:00<00:00, 987.42 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 900.59 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 642.21 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 639.16 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 11:56:35] ❌ Fatal error with config 150: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 11:56:35] 
🔬 Testing configuration 151/264
[2025-05-28 11:56:35] Config: LR=1e-05, BS=8, Epochs=7, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  10%|█         | 96/924 [00:00<00:01, 671.83 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  21%|██        | 192/924 [00:00<00:00, 753.57 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  31%|███       | 288/924 [00:00<00:00, 807.23 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  43%|████▎     | 400/924 [00:00<00:00, 852.60 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  57%|█████▋    | 528/924 [00:00<00:00, 958.05 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  69%|██████▉   | 640/924 [00:00<00:00, 988.51 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  83%|████████▎ | 768/924 [00:00<00:00, 1047.26 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  97%|█████████▋| 896/924 [00:00<00:00, 1077.51 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:00<00:00, 965.50 examples/s] 





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 684.80 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 683.70 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 11:56:43] ❌ Fatal error with config 151: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 11:56:43] 
🔬 Testing configuration 152/264
[2025-05-28 11:56:43] Config: LR=3e-05, BS=8, Epochs=7, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 661.64 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 675.77 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 758.21 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 797.54 examples/s]


Map:  52%|█████▏    | 480/924 [00:00<00:00, 860.24 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 898.59 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 980.34 examples/s]


Map:  90%|█████████ | 832/924 [00:00<00:00, 980.68 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 902.39 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 643.50 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 638.13 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'train_runtime': 307.5724, 'train_samples_per_second': 21.029, 'train_steps_per_second': 0.66, 'train_loss': 1.8228020127770936, 'epoch': 7.0}


{'eval_loss': 1.7930573225021362, 'eval_accuracy': 0.17599999904632568, 'eval_runtime': 2.5412, 'eval_samples_per_second': 49.19, 'eval_steps_per_second': 3.148, 'epoch': 7.0}
[2025-05-28 12:02:02] ✅ Config 152: Accuracy=0.1760, Loss=1.7931


[2025-05-28 12:02:02] 
🔬 Testing configuration 153/264
[2025-05-28 12:02:02] Config: LR=5e-05, BS=2, Epochs=3, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 663.99 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 677.05 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 756.85 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 796.44 examples/s]


Map:  52%|█████▏    | 480/924 [00:00<00:00, 858.88 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 899.03 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 978.20 examples/s]


Map:  90%|█████████ | 832/924 [00:00<00:00, 983.82 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 902.37 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 637.37 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 637.03 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 12:02:10] ❌ Fatal error with config 153: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 12:02:10] 
🔬 Testing configuration 154/264
[2025-05-28 12:02:10] Config: LR=3e-05, BS=16, Epochs=10, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  10%|█         | 96/924 [00:00<00:01, 689.15 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  21%|██        | 192/924 [00:00<00:00, 762.79 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  31%|███       | 288/924 [00:00<00:00, 812.52 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  43%|████▎     | 400/924 [00:00<00:00, 858.71 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  57%|█████▋    | 528/924 [00:00<00:00, 967.96 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  69%|██████▉   | 640/924 [00:00<00:00, 1000.22 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  85%|████████▍ | 784/924 [00:00<00:00, 1065.03 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  99%|█████████▊| 912/924 [00:00<00:00, 1081.58 examples/s]


Map: 100%|██████████| 924/924 [00:00<00:00, 974.55 examples/s] 





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 684.85 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 682.80 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.8486, 'grad_norm': 2674.65869140625, 'learning_rate': 2.281030534351145e-05, 'epoch': 3.4482758620689653}


{'eval_loss': 1.7984061241149902, 'eval_accuracy': 0.2720000147819519, 'eval_runtime': 1.2723, 'eval_samples_per_second': 98.248, 'eval_steps_per_second': 3.144, 'epoch': 3.4482758620689653}


{'loss': 1.8133, 'grad_norm': 38.80866622924805, 'learning_rate': 1.1398091603053437e-05, 'epoch': 6.896551724137931}


{'eval_loss': 1.7797186374664307, 'eval_accuracy': 0.2720000147819519, 'eval_runtime': 1.2612, 'eval_samples_per_second': 99.111, 'eval_steps_per_second': 3.172, 'epoch': 6.896551724137931}


{'train_runtime': 211.2614, 'train_samples_per_second': 43.737, 'train_steps_per_second': 1.373, 'train_loss': 1.8227496969288792, 'epoch': 10.0}


{'eval_loss': 1.7675467729568481, 'eval_accuracy': 0.23199999332427979, 'eval_runtime': 1.2499, 'eval_samples_per_second': 100.004, 'eval_steps_per_second': 3.2, 'epoch': 10.0}
[2025-05-28 12:05:50] ✅ Config 154: Accuracy=0.2320, Loss=1.7675


[2025-05-28 12:05:50] 
🔬 Testing configuration 155/264
[2025-05-28 12:05:50] Config: LR=2e-05, BS=8, Epochs=5, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 662.00 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 671.44 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 751.27 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 789.91 examples/s]


Map:  52%|█████▏    | 480/924 [00:00<00:00, 852.42 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 890.66 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 970.82 examples/s]


Map:  90%|█████████ | 832/924 [00:00<00:00, 976.36 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 896.06 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 637.93 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 635.33 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 12:05:59] ❌ Fatal error with config 155: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 12:05:59] 
🔬 Testing configuration 156/264
[2025-05-28 12:05:59] Config: LR=2e-05, BS=2, Epochs=3, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  10%|█         | 96/924 [00:00<00:01, 684.06 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  21%|██        | 192/924 [00:00<00:00, 757.20 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  31%|███       | 288/924 [00:00<00:00, 806.61 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  43%|████▎     | 400/924 [00:00<00:00, 852.65 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  57%|█████▋    | 528/924 [00:00<00:00, 964.76 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  69%|██████▉   | 640/924 [00:00<00:00, 999.44 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  85%|████████▍ | 784/924 [00:00<00:00, 1065.88 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  99%|█████████▊| 912/924 [00:00<00:00, 1084.32 examples/s]


Map: 100%|██████████| 924/924 [00:00<00:00, 973.22 examples/s] 





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 689.75 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 687.25 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.8485, 'grad_norm': 88.79170227050781, 'learning_rate': 1.955209003215434e-05, 'epoch': 0.4329004329004329}


{'eval_loss': 1.8163752555847168, 'eval_accuracy': 0.17599999904632568, 'eval_runtime': 2.2609, 'eval_samples_per_second': 55.287, 'eval_steps_per_second': 14.154, 'epoch': 0.4329004329004329}


{'loss': 1.8353, 'grad_norm': 121.31490325927734, 'learning_rate': 1.6480707395498394e-05, 'epoch': 0.8658008658008658}


{'eval_loss': 1.8479801416397095, 'eval_accuracy': 0.1679999977350235, 'eval_runtime': 2.2518, 'eval_samples_per_second': 55.51, 'eval_steps_per_second': 14.211, 'epoch': 0.8658008658008658}


{'loss': 1.8269, 'grad_norm': 35.348690032958984, 'learning_rate': 1.3281350482315113e-05, 'epoch': 1.2987012987012987}


{'eval_loss': 1.810360312461853, 'eval_accuracy': 0.2160000056028366, 'eval_runtime': 2.1768, 'eval_samples_per_second': 57.423, 'eval_steps_per_second': 14.7, 'epoch': 1.2987012987012987}


{'loss': 1.8083, 'grad_norm': 66.94461059570312, 'learning_rate': 1.0081993569131834e-05, 'epoch': 1.7316017316017316}


{'eval_loss': 1.898189663887024, 'eval_accuracy': 0.1599999964237213, 'eval_runtime': 2.2628, 'eval_samples_per_second': 55.241, 'eval_steps_per_second': 14.142, 'epoch': 1.7316017316017316}


{'loss': 1.7884, 'grad_norm': 31.55914878845215, 'learning_rate': 6.8826366559485546e-06, 'epoch': 2.1645021645021645}


{'eval_loss': 1.9237511157989502, 'eval_accuracy': 0.12800000607967377, 'eval_runtime': 2.2567, 'eval_samples_per_second': 55.391, 'eval_steps_per_second': 14.18, 'epoch': 2.1645021645021645}


{'loss': 1.8032, 'grad_norm': 44.76283264160156, 'learning_rate': 3.6832797427652726e-06, 'epoch': 2.5974025974025974}


{'eval_loss': 1.91139817237854, 'eval_accuracy': 0.15199999511241913, 'eval_runtime': 2.2574, 'eval_samples_per_second': 55.372, 'eval_steps_per_second': 14.175, 'epoch': 2.5974025974025974}


{'train_runtime': 264.777, 'train_samples_per_second': 10.469, 'train_steps_per_second': 1.303, 'train_loss': 1.817008640455163, 'epoch': 2.987012987012987}


{'eval_loss': 1.905574083328247, 'eval_accuracy': 0.13600000739097595, 'eval_runtime': 2.2419, 'eval_samples_per_second': 55.757, 'eval_steps_per_second': 14.274, 'epoch': 2.987012987012987}
[2025-05-28 12:10:33] ✅ Config 156: Accuracy=0.1360, Loss=1.9056


[2025-05-28 12:10:34] 
🔬 Testing configuration 157/264
[2025-05-28 12:10:34] Config: LR=3e-05, BS=2, Epochs=3, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  10%|█         | 96/924 [00:00<00:01, 693.34 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  21%|██        | 192/924 [00:00<00:00, 769.79 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  31%|███       | 288/924 [00:00<00:00, 820.25 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  43%|████▎     | 400/924 [00:00<00:00, 866.06 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  57%|█████▋    | 528/924 [00:00<00:00, 981.09 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  69%|██████▉   | 640/924 [00:00<00:00, 1006.24 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  83%|████████▎ | 768/924 [00:00<00:00, 1049.01 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  97%|█████████▋| 896/924 [00:00<00:00, 1083.17 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:00<00:00, 977.40 examples/s] 





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 692.77 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 691.28 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 12:10:42] ❌ Fatal error with config 157: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 12:10:42] 
🔬 Testing configuration 158/264
[2025-05-28 12:10:42] Config: LR=5e-05, BS=2, Epochs=5, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  10%|█         | 96/924 [00:00<00:01, 684.80 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  21%|██        | 192/924 [00:00<00:00, 760.71 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  31%|███       | 288/924 [00:00<00:00, 809.95 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  43%|████▎     | 400/924 [00:00<00:00, 858.13 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  57%|█████▋    | 528/924 [00:00<00:00, 972.88 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  69%|██████▉   | 640/924 [00:00<00:00, 1007.49 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  85%|████████▍ | 784/924 [00:00<00:00, 1074.56 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  99%|█████████▊| 912/924 [00:00<00:00, 1093.04 examples/s]


Map: 100%|██████████| 924/924 [00:00<00:00, 980.05 examples/s] 





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 688.81 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 686.47 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.8686, 'grad_norm': 22.639009475708008, 'learning_rate': 2.0346320346320347e-05, 'epoch': 0.21645021645021645}


{'loss': 1.8156, 'grad_norm': 15.414412498474121, 'learning_rate': 4.1991341991341996e-05, 'epoch': 0.4329004329004329}


{'loss': 1.085, 'grad_norm': 1.490830421447754, 'learning_rate': 4.848484848484849e-05, 'epoch': 0.6493506493506493}


{'loss': 0.8379, 'grad_norm': 1.618943214416504, 'learning_rate': 4.607984607984609e-05, 'epoch': 0.8658008658008658}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.9862029552459717, 'eval_accuracy': 0.7919999957084656, 'eval_runtime': 2.7661, 'eval_samples_per_second': 45.19, 'eval_steps_per_second': 11.569, 'epoch': 1.0}


{'loss': 0.8179, 'grad_norm': 0.30423372983932495, 'learning_rate': 4.367484367484368e-05, 'epoch': 1.0822510822510822}


{'loss': 0.8635, 'grad_norm': 0.20103596150875092, 'learning_rate': 4.126984126984127e-05, 'epoch': 1.2987012987012987}


{'loss': 0.6206, 'grad_norm': 0.853080153465271, 'learning_rate': 3.8864838864838866e-05, 'epoch': 1.5151515151515151}


{'loss': 0.6759, 'grad_norm': 6.758462905883789, 'learning_rate': 3.645983645983646e-05, 'epoch': 1.7316017316017316}


{'loss': 0.8416, 'grad_norm': 2.5327467918395996, 'learning_rate': 3.405483405483406e-05, 'epoch': 1.948051948051948}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.9527114033699036, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 2.8783, 'eval_samples_per_second': 43.429, 'eval_steps_per_second': 11.118, 'epoch': 2.0}


{'loss': 0.522, 'grad_norm': 0.5344439744949341, 'learning_rate': 3.164983164983165e-05, 'epoch': 2.1645021645021645}


{'loss': 0.5485, 'grad_norm': 0.9721916913986206, 'learning_rate': 2.9244829244829247e-05, 'epoch': 2.380952380952381}


{'loss': 0.542, 'grad_norm': 0.4067946672439575, 'learning_rate': 2.6839826839826843e-05, 'epoch': 2.5974025974025974}


{'loss': 0.5788, 'grad_norm': 20.8909854888916, 'learning_rate': 2.4434824434824436e-05, 'epoch': 2.813852813852814}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8974341154098511, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 2.7466, 'eval_samples_per_second': 45.511, 'eval_steps_per_second': 11.651, 'epoch': 3.0}


{'loss': 0.5301, 'grad_norm': 0.13580121099948883, 'learning_rate': 2.202982202982203e-05, 'epoch': 3.0303030303030303}


{'loss': 0.5161, 'grad_norm': 0.37642520666122437, 'learning_rate': 1.9624819624819628e-05, 'epoch': 3.2467532467532467}


{'loss': 0.5377, 'grad_norm': 0.46798011660575867, 'learning_rate': 1.721981721981722e-05, 'epoch': 3.463203463203463}


{'loss': 0.4242, 'grad_norm': 0.08500449359416962, 'learning_rate': 1.4814814814814815e-05, 'epoch': 3.6796536796536796}


{'loss': 0.4779, 'grad_norm': 0.04574514552950859, 'learning_rate': 1.240981240981241e-05, 'epoch': 3.896103896103896}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.9659640789031982, 'eval_accuracy': 0.8240000009536743, 'eval_runtime': 2.8285, 'eval_samples_per_second': 44.193, 'eval_steps_per_second': 11.313, 'epoch': 4.0}


{'loss': 0.4646, 'grad_norm': 0.14394240081310272, 'learning_rate': 1.0004810004810006e-05, 'epoch': 4.112554112554113}


{'loss': 0.4402, 'grad_norm': 0.11600901931524277, 'learning_rate': 7.5998075998076e-06, 'epoch': 4.329004329004329}


{'loss': 0.5113, 'grad_norm': 0.06562599539756775, 'learning_rate': 5.194805194805195e-06, 'epoch': 4.545454545454545}


{'loss': 0.4533, 'grad_norm': 0.11019141227006912, 'learning_rate': 2.7898027898027897e-06, 'epoch': 4.761904761904762}


{'loss': 0.476, 'grad_norm': 0.23361966013908386, 'learning_rate': 3.8480038480038485e-07, 'epoch': 4.978354978354979}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.9598264098167419, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 2.7874, 'eval_samples_per_second': 44.844, 'eval_steps_per_second': 11.48, 'epoch': 5.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 745.6411, 'train_samples_per_second': 6.196, 'train_steps_per_second': 3.098, 'train_loss': 0.7139138374493752, 'epoch': 5.0}


{'eval_loss': 0.9527114033699036, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 2.8231, 'eval_samples_per_second': 44.277, 'eval_steps_per_second': 11.335, 'epoch': 5.0}
[2025-05-28 12:23:18] ✅ Config 158: Accuracy=0.8400, Loss=0.9527


[2025-05-28 12:23:19] 
🔬 Testing configuration 159/264
[2025-05-28 12:23:19] Config: LR=5e-05, BS=2, Epochs=10, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 663.45 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 674.25 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 754.03 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 793.40 examples/s]


Map:  52%|█████▏    | 480/924 [00:00<00:00, 854.00 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 894.50 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 972.58 examples/s]


Map:  90%|█████████ | 832/924 [00:00<00:00, 976.46 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 898.06 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 636.21 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 634.79 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.8326, 'grad_norm': 16.995418548583984, 'learning_rate': 4.75e-05, 'epoch': 0.4329004329004329}


{'eval_loss': 1.7968745231628418, 'eval_accuracy': 0.13600000739097595, 'eval_runtime': 3.745, 'eval_samples_per_second': 33.378, 'eval_steps_per_second': 8.545, 'epoch': 0.4329004329004329}


{'loss': 1.5474, 'grad_norm': 23.197784423828125, 'learning_rate': 4.7850678733031676e-05, 'epoch': 0.8658008658008658}


{'eval_loss': 1.319976806640625, 'eval_accuracy': 0.7440000176429749, 'eval_runtime': 3.7425, 'eval_samples_per_second': 33.4, 'eval_steps_per_second': 8.55, 'epoch': 0.8658008658008658}


{'loss': 1.0849, 'grad_norm': 17.0902042388916, 'learning_rate': 4.561085972850679e-05, 'epoch': 1.2987012987012987}


{'eval_loss': 1.16102135181427, 'eval_accuracy': 0.800000011920929, 'eval_runtime': 3.7651, 'eval_samples_per_second': 33.2, 'eval_steps_per_second': 8.499, 'epoch': 1.2987012987012987}


{'loss': 0.9909, 'grad_norm': 21.22766876220703, 'learning_rate': 4.334841628959276e-05, 'epoch': 1.7316017316017316}


{'eval_loss': 1.0169614553451538, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 3.7907, 'eval_samples_per_second': 32.976, 'eval_steps_per_second': 8.442, 'epoch': 1.7316017316017316}


{'loss': 0.9138, 'grad_norm': 1.1474257707595825, 'learning_rate': 4.1085972850678736e-05, 'epoch': 2.1645021645021645}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.0365760326385498, 'eval_accuracy': 0.8240000009536743, 'eval_runtime': 3.7741, 'eval_samples_per_second': 33.12, 'eval_steps_per_second': 8.479, 'epoch': 2.1645021645021645}


{'loss': 0.8831, 'grad_norm': 1.075554609298706, 'learning_rate': 3.884615384615385e-05, 'epoch': 2.5974025974025974}


{'eval_loss': 1.0873013734817505, 'eval_accuracy': 0.8240000009536743, 'eval_runtime': 4.1874, 'eval_samples_per_second': 29.852, 'eval_steps_per_second': 7.642, 'epoch': 2.5974025974025974}


{'loss': 0.8264, 'grad_norm': 0.9599034190177917, 'learning_rate': 3.658371040723982e-05, 'epoch': 3.0303030303030303}


{'eval_loss': 1.0171929597854614, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 3.9206, 'eval_samples_per_second': 31.883, 'eval_steps_per_second': 8.162, 'epoch': 3.0303030303030303}


{'loss': 0.8692, 'grad_norm': 1.5839773416519165, 'learning_rate': 3.4321266968325795e-05, 'epoch': 3.463203463203463}


{'eval_loss': 1.0293400287628174, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 3.9712, 'eval_samples_per_second': 31.477, 'eval_steps_per_second': 8.058, 'epoch': 3.463203463203463}


{'loss': 0.7922, 'grad_norm': 0.38816577196121216, 'learning_rate': 3.205882352941177e-05, 'epoch': 3.896103896103896}


{'eval_loss': 1.0359134674072266, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 3.9397, 'eval_samples_per_second': 31.728, 'eval_steps_per_second': 8.122, 'epoch': 3.896103896103896}


{'loss': 0.7822, 'grad_norm': 0.6527617573738098, 'learning_rate': 2.979638009049774e-05, 'epoch': 4.329004329004329}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.071903944015503, 'eval_accuracy': 0.8159999847412109, 'eval_runtime': 3.9542, 'eval_samples_per_second': 31.612, 'eval_steps_per_second': 8.093, 'epoch': 4.329004329004329}


{'loss': 0.789, 'grad_norm': 0.24055251479148865, 'learning_rate': 2.7533936651583712e-05, 'epoch': 4.761904761904762}


{'eval_loss': 1.1637307405471802, 'eval_accuracy': 0.7839999794960022, 'eval_runtime': 3.9484, 'eval_samples_per_second': 31.658, 'eval_steps_per_second': 8.105, 'epoch': 4.761904761904762}


{'loss': 0.7913, 'grad_norm': 0.18829020857810974, 'learning_rate': 2.5271493212669683e-05, 'epoch': 5.194805194805195}


{'eval_loss': 1.0835555791854858, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 3.6815, 'eval_samples_per_second': 33.954, 'eval_steps_per_second': 8.692, 'epoch': 5.194805194805195}


{'loss': 0.7395, 'grad_norm': 0.26645761728286743, 'learning_rate': 2.3009049773755656e-05, 'epoch': 5.627705627705628}


{'eval_loss': 1.0858296155929565, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 3.9488, 'eval_samples_per_second': 31.655, 'eval_steps_per_second': 8.104, 'epoch': 5.627705627705628}


{'loss': 0.758, 'grad_norm': 0.22396911680698395, 'learning_rate': 2.074660633484163e-05, 'epoch': 6.0606060606060606}


{'eval_loss': 1.0590717792510986, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 3.8677, 'eval_samples_per_second': 32.319, 'eval_steps_per_second': 8.274, 'epoch': 6.0606060606060606}


{'loss': 0.7393, 'grad_norm': 0.36653000116348267, 'learning_rate': 1.8484162895927603e-05, 'epoch': 6.4935064935064934}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.0050954818725586, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 3.9604, 'eval_samples_per_second': 31.563, 'eval_steps_per_second': 8.08, 'epoch': 6.4935064935064934}


{'loss': 0.7578, 'grad_norm': 0.12297947704792023, 'learning_rate': 1.6221719457013577e-05, 'epoch': 6.926406926406926}


{'eval_loss': 1.077862024307251, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 4.152, 'eval_samples_per_second': 30.106, 'eval_steps_per_second': 7.707, 'epoch': 6.926406926406926}


{'loss': 0.7336, 'grad_norm': 0.3951629400253296, 'learning_rate': 1.3959276018099549e-05, 'epoch': 7.359307359307359}


{'eval_loss': 1.1010046005249023, 'eval_accuracy': 0.8320000171661377, 'eval_runtime': 3.988, 'eval_samples_per_second': 31.344, 'eval_steps_per_second': 8.024, 'epoch': 7.359307359307359}


{'loss': 0.7258, 'grad_norm': 0.1948169320821762, 'learning_rate': 1.169683257918552e-05, 'epoch': 7.792207792207792}


{'eval_loss': 1.0678733587265015, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 3.9773, 'eval_samples_per_second': 31.428, 'eval_steps_per_second': 8.046, 'epoch': 7.792207792207792}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 1002.5987, 'train_samples_per_second': 9.216, 'train_steps_per_second': 2.304, 'train_loss': 0.9198318142361112, 'epoch': 7.792207792207792}


{'eval_loss': 1.0050954818725586, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 4.0833, 'eval_samples_per_second': 30.612, 'eval_steps_per_second': 7.837, 'epoch': 7.792207792207792}
[2025-05-28 12:40:14] ✅ Config 159: Accuracy=0.8720, Loss=1.0051


[2025-05-28 12:40:14] 
🔬 Testing configuration 160/264
[2025-05-28 12:40:14] Config: LR=3e-05, BS=8, Epochs=10, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 660.15 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 663.25 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 745.24 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 770.93 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  50%|█████     | 464/924 [00:00<00:00, 831.84 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  62%|██████▏   | 576/924 [00:00<00:00, 878.30 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  76%|███████▌  | 704/924 [00:00<00:00, 963.45 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 978.70 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 893.32 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 644.94 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 642.23 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 12:40:23] ❌ Fatal error with config 160: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 12:40:23] 
🔬 Testing configuration 161/264
[2025-05-28 12:40:23] Config: LR=3e-05, BS=8, Epochs=7, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 627.41 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 632.71 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 706.56 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 738.01 examples/s]


Map:  50%|█████     | 464/924 [00:00<00:00, 791.45 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  62%|██████▏   | 576/924 [00:00<00:00, 830.18 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  76%|███████▌  | 704/924 [00:00<00:00, 899.83 examples/s]


Map:  88%|████████▊ | 816/924 [00:00<00:00, 908.09 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 927.58 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 838.07 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 595.12 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 560.60 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 12:40:32] ❌ Fatal error with config 161: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 12:40:32] 
🔬 Testing configuration 162/264
[2025-05-28 12:40:32] Config: LR=1e-05, BS=8, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 619.03 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 619.83 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 690.85 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 720.64 examples/s]


Map:  48%|████▊     | 448/924 [00:00<00:00, 766.28 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 814.34 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 856.13 examples/s]


Map:  83%|████████▎ | 768/924 [00:00<00:00, 893.27 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 909.39 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 820.50 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 590.63 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 585.95 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 12:40:41] ❌ Fatal error with config 162: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 12:40:41] 
🔬 Testing configuration 163/264
[2025-05-28 12:40:41] Config: LR=1e-05, BS=2, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 614.62 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 621.52 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 691.72 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 719.33 examples/s]


Map:  48%|████▊     | 448/924 [00:00<00:00, 762.62 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 807.45 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 850.31 examples/s]


Map:  83%|████████▎ | 768/924 [00:00<00:00, 892.98 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 913.77 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 821.04 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 555.07 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 556.77 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 550.55 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 12:40:49] ❌ Fatal error with config 163: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 12:40:49] 
🔬 Testing configuration 164/264
[2025-05-28 12:40:49] Config: LR=1e-05, BS=4, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 665.82 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 668.10 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 738.08 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 761.75 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  50%|█████     | 464/924 [00:00<00:00, 814.00 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  62%|██████▏   | 576/924 [00:00<00:00, 845.78 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  76%|███████▌  | 704/924 [00:00<00:00, 921.63 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  88%|████████▊ | 816/924 [00:00<00:00, 921.68 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 941.30 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 858.82 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 640.37 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 632.88 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 12:40:57] ❌ Fatal error with config 164: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 12:40:57] 
🔬 Testing configuration 165/264
[2025-05-28 12:40:57] Config: LR=1e-05, BS=4, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 616.73 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  16%|█▌        | 144/924 [00:00<00:01, 608.72 examples/s]


Map:  26%|██▌       | 240/924 [00:00<00:00, 691.58 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  36%|███▋      | 336/924 [00:00<00:00, 715.08 examples/s]


Map:  47%|████▋     | 432/924 [00:00<00:00, 773.49 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 835.79 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 876.31 examples/s]


Map:  83%|████████▎ | 768/924 [00:00<00:00, 917.89 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 937.83 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 839.30 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 596.42 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 591.12 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 12:41:05] ❌ Fatal error with config 165: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 12:41:05] 
🔬 Testing configuration 166/264
[2025-05-28 12:41:05] Config: LR=5e-05, BS=8, Epochs=5, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 665.61 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 674.64 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 760.71 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 797.83 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 865.23 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 857.93 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 920.90 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  88%|████████▊ | 816/924 [00:00<00:00, 910.45 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 929.42 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 864.28 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 607.69 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 599.73 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 12:41:13] ❌ Fatal error with config 166: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 12:41:13] 
🔬 Testing configuration 167/264
[2025-05-28 12:41:13] Config: LR=5e-05, BS=8, Epochs=5, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 624.41 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 617.71 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 690.02 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 719.47 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  50%|█████     | 464/924 [00:00<00:00, 778.47 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  62%|██████▏   | 576/924 [00:00<00:00, 822.99 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  76%|███████▌  | 704/924 [00:00<00:00, 903.07 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  88%|████████▊ | 816/924 [00:00<00:00, 909.59 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 934.74 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 834.03 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 644.86 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 638.62 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.8536, 'grad_norm': 300.17877197265625, 'learning_rate': 3.508771929824561e-05, 'epoch': 0.43103448275862066}


{'loss': 1.8719, 'grad_norm': 107.96426391601562, 'learning_rate': 4.684512428298279e-05, 'epoch': 0.8620689655172413}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 2.0626327991485596, 'eval_accuracy': 0.07999999821186066, 'eval_runtime': 1.4034, 'eval_samples_per_second': 89.071, 'eval_steps_per_second': 5.701, 'epoch': 1.0}


{'loss': 1.8643, 'grad_norm': 25.554616928100586, 'learning_rate': 4.2065009560229444e-05, 'epoch': 1.293103448275862}


{'loss': 1.9083, 'grad_norm': 8.579724311828613, 'learning_rate': 3.72848948374761e-05, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.7860857248306274, 'eval_accuracy': 0.2720000147819519, 'eval_runtime': 1.5014, 'eval_samples_per_second': 83.256, 'eval_steps_per_second': 5.328, 'epoch': 2.0}


{'loss': 1.8296, 'grad_norm': 9.134930610656738, 'learning_rate': 3.250478011472275e-05, 'epoch': 2.1551724137931036}


{'loss': 1.8301, 'grad_norm': 15.620805740356445, 'learning_rate': 2.7724665391969406e-05, 'epoch': 2.586206896551724}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.7691326141357422, 'eval_accuracy': 0.2720000147819519, 'eval_runtime': 1.4149, 'eval_samples_per_second': 88.345, 'eval_steps_per_second': 5.654, 'epoch': 3.0}


{'loss': 1.8196, 'grad_norm': 6.399909496307373, 'learning_rate': 2.294455066921606e-05, 'epoch': 3.0172413793103448}


{'loss': 1.8226, 'grad_norm': 8.15587043762207, 'learning_rate': 1.8164435946462717e-05, 'epoch': 3.4482758620689653}


{'loss': 1.8313, 'grad_norm': 7.516485214233398, 'learning_rate': 1.3384321223709371e-05, 'epoch': 3.8793103448275863}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.8287266492843628, 'eval_accuracy': 0.1120000034570694, 'eval_runtime': 1.4947, 'eval_samples_per_second': 83.63, 'eval_steps_per_second': 5.352, 'epoch': 4.0}


{'loss': 1.7972, 'grad_norm': 8.125077247619629, 'learning_rate': 8.604206500956023e-06, 'epoch': 4.310344827586207}


{'loss': 1.809, 'grad_norm': 5.227617263793945, 'learning_rate': 3.824091778202677e-06, 'epoch': 4.741379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.8354060649871826, 'eval_accuracy': 0.14399999380111694, 'eval_runtime': 1.4083, 'eval_samples_per_second': 88.762, 'eval_steps_per_second': 5.681, 'epoch': 5.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 360.993, 'train_samples_per_second': 12.798, 'train_steps_per_second': 1.607, 'train_loss': 1.8376978380926725, 'epoch': 5.0}


{'eval_loss': 1.7860857248306274, 'eval_accuracy': 0.2720000147819519, 'eval_runtime': 1.5158, 'eval_samples_per_second': 82.465, 'eval_steps_per_second': 5.278, 'epoch': 5.0}
[2025-05-28 12:47:25] ✅ Config 167: Accuracy=0.2720, Loss=1.7861


[2025-05-28 12:47:25] 
🔬 Testing configuration 168/264
[2025-05-28 12:47:25] Config: LR=1e-05, BS=16, Epochs=10, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 658.69 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 667.57 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 747.70 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 782.40 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 841.60 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 885.80 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 983.28 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  90%|█████████ | 832/924 [00:00<00:00, 986.12 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 898.79 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 606.53 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 599.83 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 12:47:33] ❌ Fatal error with config 168: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 12:47:33] 
🔬 Testing configuration 169/264
[2025-05-28 12:47:33] Config: LR=2e-05, BS=8, Epochs=3, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 590.22 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 573.01 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 631.43 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 652.32 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 701.59 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 763.44 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 778.12 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 847.43 examples/s]


Map:  88%|████████▊ | 816/924 [00:01<00:00, 845.00 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  99%|█████████▊| 912/924 [00:01<00:00, 862.50 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 774.30 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 603.09 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 596.91 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.870685338973999, 'eval_accuracy': 0.13600000739097595, 'eval_runtime': 1.7615, 'eval_samples_per_second': 70.964, 'eval_steps_per_second': 4.542, 'epoch': 1.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.8555101156234741, 'eval_accuracy': 0.14399999380111694, 'eval_runtime': 1.9746, 'eval_samples_per_second': 63.303, 'eval_steps_per_second': 4.051, 'epoch': 2.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.7935529947280884, 'eval_accuracy': 0.2160000056028366, 'eval_runtime': 1.7974, 'eval_samples_per_second': 69.546, 'eval_steps_per_second': 4.451, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 254.7311, 'train_samples_per_second': 10.882, 'train_steps_per_second': 1.366, 'train_loss': 1.8328682033494972, 'epoch': 3.0}


{'eval_loss': 1.7935529947280884, 'eval_accuracy': 0.2160000056028366, 'eval_runtime': 1.8705, 'eval_samples_per_second': 66.826, 'eval_steps_per_second': 4.277, 'epoch': 3.0}
[2025-05-28 12:51:59] ✅ Config 169: Accuracy=0.2160, Loss=1.7936


[2025-05-28 12:51:59] 
🔬 Testing configuration 170/264
[2025-05-28 12:51:59] Config: LR=1e-05, BS=2, Epochs=5, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 672.93 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 681.41 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 760.02 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 799.34 examples/s]


Map:  52%|█████▏    | 480/924 [00:00<00:00, 861.81 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 902.29 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 984.29 examples/s]


Map:  92%|█████████▏| 848/924 [00:00<00:00, 996.05 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 907.59 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 642.87 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 641.98 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.8399, 'grad_norm': 139.11099243164062, 'learning_rate': 3.565217391304348e-06, 'epoch': 0.21645021645021645}


{'eval_loss': 1.8677984476089478, 'eval_accuracy': 0.11999999731779099, 'eval_runtime': 2.8875, 'eval_samples_per_second': 43.29, 'eval_steps_per_second': 11.082, 'epoch': 0.21645021645021645}


{'loss': 1.8488, 'grad_norm': 86.17288970947266, 'learning_rate': 7.82608695652174e-06, 'epoch': 0.4329004329004329}


{'eval_loss': 1.8918449878692627, 'eval_accuracy': 0.1120000034570694, 'eval_runtime': 2.8557, 'eval_samples_per_second': 43.772, 'eval_steps_per_second': 11.206, 'epoch': 0.4329004329004329}


{'loss': 1.8631, 'grad_norm': 134.5806427001953, 'learning_rate': 9.759615384615386e-06, 'epoch': 0.6493506493506493}


{'eval_loss': 1.8893600702285767, 'eval_accuracy': 0.11999999731779099, 'eval_runtime': 2.8927, 'eval_samples_per_second': 43.211, 'eval_steps_per_second': 11.062, 'epoch': 0.6493506493506493}


{'loss': 1.8261, 'grad_norm': 143.5178985595703, 'learning_rate': 9.278846153846155e-06, 'epoch': 0.8658008658008658}


{'eval_loss': 1.8263940811157227, 'eval_accuracy': 0.1679999977350235, 'eval_runtime': 2.8641, 'eval_samples_per_second': 43.644, 'eval_steps_per_second': 11.173, 'epoch': 0.8658008658008658}


{'loss': 1.8539, 'grad_norm': 30.11525535583496, 'learning_rate': 8.798076923076923e-06, 'epoch': 1.0822510822510822}


{'eval_loss': 1.83169686794281, 'eval_accuracy': 0.12800000607967377, 'eval_runtime': 2.8866, 'eval_samples_per_second': 43.303, 'eval_steps_per_second': 11.086, 'epoch': 1.0822510822510822}


{'loss': 1.835, 'grad_norm': 24.582517623901367, 'learning_rate': 8.317307692307694e-06, 'epoch': 1.2987012987012987}


{'eval_loss': 1.7785722017288208, 'eval_accuracy': 0.2160000056028366, 'eval_runtime': 2.8797, 'eval_samples_per_second': 43.408, 'eval_steps_per_second': 11.112, 'epoch': 1.2987012987012987}


{'loss': 1.8232, 'grad_norm': 16.414304733276367, 'learning_rate': 7.836538461538462e-06, 'epoch': 1.5151515151515151}


{'eval_loss': 1.8551868200302124, 'eval_accuracy': 0.1679999977350235, 'eval_runtime': 2.8718, 'eval_samples_per_second': 43.527, 'eval_steps_per_second': 11.143, 'epoch': 1.5151515151515151}


{'loss': 1.8084, 'grad_norm': 13.405107498168945, 'learning_rate': 7.355769230769232e-06, 'epoch': 1.7316017316017316}


{'eval_loss': 1.8358073234558105, 'eval_accuracy': 0.23999999463558197, 'eval_runtime': 3.0202, 'eval_samples_per_second': 41.388, 'eval_steps_per_second': 10.595, 'epoch': 1.7316017316017316}


{'loss': 1.7734, 'grad_norm': 18.260948181152344, 'learning_rate': 6.875e-06, 'epoch': 1.948051948051948}


{'eval_loss': 1.8010438680648804, 'eval_accuracy': 0.2720000147819519, 'eval_runtime': 2.998, 'eval_samples_per_second': 41.695, 'eval_steps_per_second': 10.674, 'epoch': 1.948051948051948}


{'loss': 1.7835, 'grad_norm': 237.71932983398438, 'learning_rate': 6.394230769230769e-06, 'epoch': 2.1645021645021645}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.7855478525161743, 'eval_accuracy': 0.2720000147819519, 'eval_runtime': 3.0666, 'eval_samples_per_second': 40.762, 'eval_steps_per_second': 10.435, 'epoch': 2.1645021645021645}


{'loss': 1.7691, 'grad_norm': 23.759395599365234, 'learning_rate': 5.913461538461539e-06, 'epoch': 2.380952380952381}


{'eval_loss': 1.7541412115097046, 'eval_accuracy': 0.30399999022483826, 'eval_runtime': 3.2977, 'eval_samples_per_second': 37.905, 'eval_steps_per_second': 9.704, 'epoch': 2.380952380952381}


{'loss': 1.7631, 'grad_norm': 21.290700912475586, 'learning_rate': 5.432692307692308e-06, 'epoch': 2.5974025974025974}


{'eval_loss': 1.7937426567077637, 'eval_accuracy': 0.2800000011920929, 'eval_runtime': 3.02, 'eval_samples_per_second': 41.391, 'eval_steps_per_second': 10.596, 'epoch': 2.5974025974025974}


{'loss': 1.7726, 'grad_norm': 18.624948501586914, 'learning_rate': 4.951923076923077e-06, 'epoch': 2.813852813852814}


{'eval_loss': 1.7869222164154053, 'eval_accuracy': 0.2639999985694885, 'eval_runtime': 3.0675, 'eval_samples_per_second': 40.75, 'eval_steps_per_second': 10.432, 'epoch': 2.813852813852814}


{'loss': 1.7608, 'grad_norm': 62.207664489746094, 'learning_rate': 4.471153846153847e-06, 'epoch': 3.0303030303030303}


{'eval_loss': 1.7589349746704102, 'eval_accuracy': 0.2720000147819519, 'eval_runtime': 3.0699, 'eval_samples_per_second': 40.717, 'eval_steps_per_second': 10.424, 'epoch': 3.0303030303030303}


{'loss': 1.7433, 'grad_norm': 18.473264694213867, 'learning_rate': 3.990384615384616e-06, 'epoch': 3.2467532467532467}


{'eval_loss': 1.7578586339950562, 'eval_accuracy': 0.2639999985694885, 'eval_runtime': 3.0591, 'eval_samples_per_second': 40.861, 'eval_steps_per_second': 10.461, 'epoch': 3.2467532467532467}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 431.3131, 'train_samples_per_second': 10.711, 'train_steps_per_second': 2.678, 'train_loss': 1.8042757771809896, 'epoch': 3.2467532467532467}


{'eval_loss': 1.7855478525161743, 'eval_accuracy': 0.2720000147819519, 'eval_runtime': 3.17, 'eval_samples_per_second': 39.432, 'eval_steps_per_second': 10.095, 'epoch': 3.2467532467532467}
[2025-05-28 12:59:22] ✅ Config 170: Accuracy=0.2720, Loss=1.7855


[2025-05-28 12:59:22] 💾 Saved checkpoint at 170 configurations
[2025-05-28 12:59:22] 
🔬 Testing configuration 171/264
[2025-05-28 12:59:22] Config: LR=3e-05, BS=2, Epochs=10, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 676.80 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 681.84 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 766.97 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 803.62 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 869.01 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 910.71 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 1008.58 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  92%|█████████▏| 848/924 [00:00<00:00, 1019.95 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 921.87 examples/s] 





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 654.29 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 650.76 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 12:59:30] ❌ Fatal error with config 171: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 12:59:30] 
🔬 Testing configuration 172/264
[2025-05-28 12:59:30] Config: LR=5e-05, BS=8, Epochs=10, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 633.80 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 643.98 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 713.62 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 746.85 examples/s]


Map:  50%|█████     | 464/924 [00:00<00:00, 803.61 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  62%|██████▏   | 576/924 [00:00<00:00, 845.30 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  76%|███████▌  | 704/924 [00:00<00:00, 911.51 examples/s]


Map:  88%|████████▊ | 816/924 [00:00<00:00, 917.58 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 937.69 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 848.73 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 610.95 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 604.44 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.8513, 'grad_norm': 37.618709564208984, 'learning_rate': 4.6206896551724135e-05, 'epoch': 0.8620689655172413}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.9888904094696045, 'eval_accuracy': 0.09600000083446503, 'eval_runtime': 2.1452, 'eval_samples_per_second': 58.27, 'eval_steps_per_second': 3.729, 'epoch': 1.0}


{'loss': 1.8347, 'grad_norm': 7.834895133972168, 'learning_rate': 4.1896551724137934e-05, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 2.25290584564209, 'eval_accuracy': 0.07999999821186066, 'eval_runtime': 2.3918, 'eval_samples_per_second': 52.263, 'eval_steps_per_second': 3.345, 'epoch': 2.0}


{'loss': 1.8203, 'grad_norm': 8.794310569763184, 'learning_rate': 3.7586206896551726e-05, 'epoch': 2.586206896551724}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 2.0496246814727783, 'eval_accuracy': 0.2720000147819519, 'eval_runtime': 2.4228, 'eval_samples_per_second': 51.594, 'eval_steps_per_second': 3.302, 'epoch': 3.0}


{'loss': 1.8069, 'grad_norm': 6.098857402801514, 'learning_rate': 3.327586206896552e-05, 'epoch': 3.4482758620689653}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 2.0173046588897705, 'eval_accuracy': 0.06400000303983688, 'eval_runtime': 2.2365, 'eval_samples_per_second': 55.892, 'eval_steps_per_second': 3.577, 'epoch': 4.0}


{'loss': 1.8229, 'grad_norm': 6.722133159637451, 'learning_rate': 2.8965517241379313e-05, 'epoch': 4.310344827586207}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 2.1674997806549072, 'eval_accuracy': 0.06400000303983688, 'eval_runtime': 2.3104, 'eval_samples_per_second': 54.103, 'eval_steps_per_second': 3.463, 'epoch': 5.0}


{'loss': 1.8216, 'grad_norm': 7.161037921905518, 'learning_rate': 2.4655172413793105e-05, 'epoch': 5.172413793103448}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.959246039390564, 'eval_accuracy': 0.06400000303983688, 'eval_runtime': 2.1574, 'eval_samples_per_second': 57.941, 'eval_steps_per_second': 3.708, 'epoch': 6.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 545.7858, 'train_samples_per_second': 16.93, 'train_steps_per_second': 1.063, 'train_loss': 1.8220612098430764, 'epoch': 6.0}


{'eval_loss': 2.0496246814727783, 'eval_accuracy': 0.2720000147819519, 'eval_runtime': 2.1458, 'eval_samples_per_second': 58.252, 'eval_steps_per_second': 3.728, 'epoch': 6.0}
[2025-05-28 13:08:47] ✅ Config 172: Accuracy=0.2720, Loss=2.0496


[2025-05-28 13:08:47] 
🔬 Testing configuration 173/264
[2025-05-28 13:08:47] Config: LR=2e-05, BS=2, Epochs=7, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  10%|█         | 96/924 [00:00<00:01, 686.23 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  21%|██        | 192/924 [00:00<00:00, 762.69 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  31%|███       | 288/924 [00:00<00:00, 814.55 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  43%|████▎     | 400/924 [00:00<00:00, 857.73 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  57%|█████▋    | 528/924 [00:00<00:00, 966.46 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  69%|██████▉   | 640/924 [00:00<00:00, 997.40 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  83%|████████▎ | 768/924 [00:00<00:00, 1055.58 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  97%|█████████▋| 896/924 [00:00<00:00, 1081.19 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:00<00:00, 972.11 examples/s] 





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 688.32 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 683.83 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.8332, 'grad_norm': 1769.033935546875, 'learning_rate': 1.999061302243977e-05, 'epoch': 0.8658008658008658}


{'eval_loss': 1.874582052230835, 'eval_accuracy': 0.2800000011920929, 'eval_runtime': 2.7836, 'eval_samples_per_second': 44.906, 'eval_steps_per_second': 11.496, 'epoch': 0.8658008658008658}


{'loss': 1.8646, 'grad_norm': 81.95328521728516, 'learning_rate': 1.8885344083972912e-05, 'epoch': 1.7316017316017316}


{'eval_loss': 1.7587811946868896, 'eval_accuracy': 0.24799999594688416, 'eval_runtime': 2.7801, 'eval_samples_per_second': 44.963, 'eval_steps_per_second': 11.511, 'epoch': 1.7316017316017316}


{'loss': 1.8085, 'grad_norm': 2048.906005859375, 'learning_rate': 1.6137626749425377e-05, 'epoch': 2.5974025974025974}


{'eval_loss': 1.7651640176773071, 'eval_accuracy': 0.25600001215934753, 'eval_runtime': 2.7468, 'eval_samples_per_second': 45.507, 'eval_steps_per_second': 11.65, 'epoch': 2.5974025974025974}


{'loss': 1.8193, 'grad_norm': 22.178768157958984, 'learning_rate': 1.2255374261813944e-05, 'epoch': 3.463203463203463}


{'eval_loss': 1.8115781545639038, 'eval_accuracy': 0.19200000166893005, 'eval_runtime': 2.7933, 'eval_samples_per_second': 44.75, 'eval_steps_per_second': 11.456, 'epoch': 3.463203463203463}


{'loss': 1.8129, 'grad_norm': 12.604766845703125, 'learning_rate': 7.956217734936353e-06, 'epoch': 4.329004329004329}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.7765312194824219, 'eval_accuracy': 0.2720000147819519, 'eval_runtime': 2.7646, 'eval_samples_per_second': 45.215, 'eval_steps_per_second': 11.575, 'epoch': 4.329004329004329}


{'loss': 1.812, 'grad_norm': 18.731088638305664, 'learning_rate': 4.034852644323661e-06, 'epoch': 5.194805194805195}


{'eval_loss': 1.801007866859436, 'eval_accuracy': 0.12800000607967377, 'eval_runtime': 2.7794, 'eval_samples_per_second': 44.973, 'eval_steps_per_second': 11.513, 'epoch': 5.194805194805195}


{'loss': 1.8029, 'grad_norm': 20.808496475219727, 'learning_rate': 1.2161400356095376e-06, 'epoch': 6.0606060606060606}


{'eval_loss': 1.8098750114440918, 'eval_accuracy': 0.09600000083446503, 'eval_runtime': 2.8004, 'eval_samples_per_second': 44.637, 'eval_steps_per_second': 11.427, 'epoch': 6.0606060606060606}


{'loss': 1.7911, 'grad_norm': 16.56639289855957, 'learning_rate': 2.1116568651156076e-08, 'epoch': 6.926406926406926}


{'eval_loss': 1.814773440361023, 'eval_accuracy': 0.07999999821186066, 'eval_runtime': 2.7555, 'eval_samples_per_second': 45.363, 'eval_steps_per_second': 11.613, 'epoch': 6.926406926406926}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 649.044, 'train_samples_per_second': 9.965, 'train_steps_per_second': 1.24, 'train_loss': 1.8180529022216796, 'epoch': 6.926406926406926}


{'eval_loss': 1.7765312194824219, 'eval_accuracy': 0.2720000147819519, 'eval_runtime': 2.7799, 'eval_samples_per_second': 44.966, 'eval_steps_per_second': 11.511, 'epoch': 6.926406926406926}
[2025-05-28 13:19:47] ✅ Config 173: Accuracy=0.2720, Loss=1.7765


[2025-05-28 13:19:48] 
🔬 Testing configuration 174/264
[2025-05-28 13:19:48] Config: LR=3e-05, BS=4, Epochs=10, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 662.68 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 674.10 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 755.97 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 794.38 examples/s]


Map:  52%|█████▏    | 480/924 [00:00<00:00, 855.22 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 891.94 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 972.74 examples/s]


Map:  90%|█████████ | 832/924 [00:00<00:00, 978.95 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 898.69 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 639.45 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 636.44 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 13:19:56] ❌ Fatal error with config 174: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 13:19:56] 
🔬 Testing configuration 175/264
[2025-05-28 13:19:56] Config: LR=3e-05, BS=2, Epochs=3, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 663.80 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 668.25 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 731.99 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 764.53 examples/s]


Map:  52%|█████▏    | 480/924 [00:00<00:00, 832.21 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 875.62 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 959.36 examples/s]


Map:  90%|█████████ | 832/924 [00:00<00:00, 968.41 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 884.37 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 638.86 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 633.08 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 13:20:04] ❌ Fatal error with config 175: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 13:20:04] 
🔬 Testing configuration 176/264
[2025-05-28 13:20:04] Config: LR=3e-05, BS=16, Epochs=10, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  10%|█         | 96/924 [00:00<00:01, 684.05 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  21%|██        | 192/924 [00:00<00:00, 758.10 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  31%|███       | 288/924 [00:00<00:00, 811.80 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  43%|████▎     | 400/924 [00:00<00:00, 855.05 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  57%|█████▋    | 528/924 [00:00<00:00, 948.94 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  69%|██████▉   | 640/924 [00:00<00:00, 981.86 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  83%|████████▎ | 768/924 [00:00<00:00, 1043.42 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  97%|█████████▋| 896/924 [00:00<00:00, 1076.04 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:00<00:00, 964.81 examples/s] 





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 692.85 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 687.52 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 13:20:12] ❌ Fatal error with config 176: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 13:20:12] 
🔬 Testing configuration 177/264
[2025-05-28 13:20:12] Config: LR=2e-05, BS=2, Epochs=10, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 662.69 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 670.48 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 751.12 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 788.01 examples/s]


Map:  52%|█████▏    | 480/924 [00:00<00:00, 849.48 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 888.04 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 965.26 examples/s]


Map:  90%|█████████ | 832/924 [00:00<00:00, 971.71 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 893.13 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 638.16 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 637.11 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 13:20:20] ❌ Fatal error with config 177: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 13:20:20] 
🔬 Testing configuration 178/264
[2025-05-28 13:20:20] Config: LR=1e-05, BS=8, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 666.23 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 677.02 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 756.96 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 795.38 examples/s]


Map:  52%|█████▏    | 480/924 [00:00<00:00, 858.71 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 898.18 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 979.26 examples/s]


Map:  90%|█████████ | 832/924 [00:00<00:00, 980.24 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 899.60 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 642.58 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 638.90 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 13:20:27] ❌ Fatal error with config 178: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 13:20:27] 
🔬 Testing configuration 179/264
[2025-05-28 13:20:27] Config: LR=3e-05, BS=2, Epochs=10, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 663.20 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 674.28 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 752.41 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 780.82 examples/s]


Map:  50%|█████     | 464/924 [00:00<00:00, 832.39 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  62%|██████▏   | 576/924 [00:00<00:00, 880.10 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  76%|███████▌  | 704/924 [00:00<00:00, 959.73 examples/s]


Map:  90%|█████████ | 832/924 [00:00<00:00, 976.88 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 893.70 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 640.92 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 635.91 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 13:20:35] ❌ Fatal error with config 179: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 13:20:35] 
🔬 Testing configuration 180/264
[2025-05-28 13:20:35] Config: LR=5e-05, BS=8, Epochs=5, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 665.80 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 678.07 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 755.97 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 792.41 examples/s]


Map:  52%|█████▏    | 480/924 [00:00<00:00, 851.88 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 892.74 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 974.39 examples/s]


Map:  90%|█████████ | 832/924 [00:00<00:00, 980.08 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 899.39 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 641.03 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 637.79 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 13:20:43] ❌ Fatal error with config 180: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 13:20:43] 
🔬 Testing configuration 181/264
[2025-05-28 13:20:43] Config: LR=3e-05, BS=2, Epochs=10, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 656.42 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 669.67 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 749.47 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 785.21 examples/s]


Map:  52%|█████▏    | 480/924 [00:00<00:00, 849.04 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 886.63 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 966.32 examples/s]


Map:  90%|█████████ | 832/924 [00:00<00:00, 967.92 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 890.88 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 638.25 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 635.27 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 13:20:50] ❌ Fatal error with config 181: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 13:20:50] 
🔬 Testing configuration 182/264
[2025-05-28 13:20:50] Config: LR=2e-05, BS=8, Epochs=5, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 664.26 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 669.20 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 751.80 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 793.10 examples/s]


Map:  52%|█████▏    | 480/924 [00:00<00:00, 857.28 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 899.07 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 979.52 examples/s]


Map:  90%|█████████ | 832/924 [00:00<00:00, 984.40 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 902.15 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 636.20 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 625.70 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 13:20:58] ❌ Fatal error with config 182: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 13:20:58] 
🔬 Testing configuration 183/264
[2025-05-28 13:20:58] Config: LR=3e-05, BS=2, Epochs=5, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 653.78 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 667.32 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 746.83 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 785.61 examples/s]


Map:  52%|█████▏    | 480/924 [00:00<00:00, 849.08 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 888.36 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 969.97 examples/s]


Map:  90%|█████████ | 832/924 [00:00<00:00, 976.19 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 893.93 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 636.10 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 631.60 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.8242, 'grad_norm': 14.813455581665039, 'learning_rate': 2.94e-05, 'epoch': 0.21645021645021645}


{'loss': 1.1585, 'grad_norm': 3.6562840938568115, 'learning_rate': 2.8683257918552038e-05, 'epoch': 0.4329004329004329}


{'loss': 0.698, 'grad_norm': 1.0455626249313354, 'learning_rate': 2.7339366515837103e-05, 'epoch': 0.6493506493506493}


{'loss': 0.7529, 'grad_norm': 40.52213668823242, 'learning_rate': 2.6009049773755658e-05, 'epoch': 0.8658008658008658}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8149423003196716, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 2.8857, 'eval_samples_per_second': 43.316, 'eval_steps_per_second': 11.089, 'epoch': 1.0}


{'loss': 0.6918, 'grad_norm': 0.681647002696991, 'learning_rate': 2.465158371040724e-05, 'epoch': 1.0822510822510822}


{'loss': 0.6364, 'grad_norm': 0.25567129254341125, 'learning_rate': 2.3294117647058824e-05, 'epoch': 1.2987012987012987}


{'loss': 0.5321, 'grad_norm': 0.31192901730537415, 'learning_rate': 2.1936651583710406e-05, 'epoch': 1.5151515151515151}


{'loss': 0.5329, 'grad_norm': 0.26068350672721863, 'learning_rate': 2.0579185520361992e-05, 'epoch': 1.7316017316017316}


{'loss': 0.6337, 'grad_norm': 0.31327465176582336, 'learning_rate': 1.9221719457013575e-05, 'epoch': 1.948051948051948}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8830212354660034, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 3.1066, 'eval_samples_per_second': 40.237, 'eval_steps_per_second': 10.301, 'epoch': 2.0}


{'loss': 0.4629, 'grad_norm': 0.08282216638326645, 'learning_rate': 1.7864253393665158e-05, 'epoch': 2.1645021645021645}


{'loss': 0.4725, 'grad_norm': 0.3145867884159088, 'learning_rate': 1.6506787330316744e-05, 'epoch': 2.380952380952381}


{'loss': 0.5004, 'grad_norm': 0.16653811931610107, 'learning_rate': 1.5149321266968325e-05, 'epoch': 2.5974025974025974}


{'loss': 0.553, 'grad_norm': 0.23245863616466522, 'learning_rate': 1.379185520361991e-05, 'epoch': 2.813852813852814}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.9624282121658325, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 3.1087, 'eval_samples_per_second': 40.21, 'eval_steps_per_second': 10.294, 'epoch': 3.0}


{'loss': 0.4757, 'grad_norm': 0.19118501245975494, 'learning_rate': 1.2434389140271494e-05, 'epoch': 3.0303030303030303}


{'loss': 0.4588, 'grad_norm': 0.18364864587783813, 'learning_rate': 1.1076923076923077e-05, 'epoch': 3.2467532467532467}


{'loss': 0.473, 'grad_norm': 0.224903404712677, 'learning_rate': 9.719457013574662e-06, 'epoch': 3.463203463203463}


{'loss': 0.4402, 'grad_norm': 0.10538917034864426, 'learning_rate': 8.361990950226244e-06, 'epoch': 3.6796536796536796}


{'loss': 0.5017, 'grad_norm': 0.21673108637332916, 'learning_rate': 7.004524886877828e-06, 'epoch': 3.896103896103896}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8912041187286377, 'eval_accuracy': 0.8399999737739563, 'eval_runtime': 2.9802, 'eval_samples_per_second': 41.943, 'eval_steps_per_second': 10.737, 'epoch': 4.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 655.6236, 'train_samples_per_second': 7.047, 'train_steps_per_second': 3.523, 'train_loss': 0.6502929704013841, 'epoch': 4.0}


{'eval_loss': 0.8149423003196716, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 2.9432, 'eval_samples_per_second': 42.47, 'eval_steps_per_second': 10.872, 'epoch': 4.0}
[2025-05-28 13:32:05] ✅ Config 183: Accuracy=0.8480, Loss=0.8149


[2025-05-28 13:32:05] 
🔬 Testing configuration 184/264
[2025-05-28 13:32:05] Config: LR=5e-05, BS=2, Epochs=5, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  10%|█         | 96/924 [00:00<00:01, 689.65 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  21%|██        | 192/924 [00:00<00:00, 765.43 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  31%|███       | 288/924 [00:00<00:00, 818.56 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  43%|████▎     | 400/924 [00:00<00:00, 859.35 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  57%|█████▋    | 528/924 [00:00<00:00, 963.45 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  69%|██████▉   | 640/924 [00:00<00:00, 994.87 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  83%|████████▎ | 768/924 [00:00<00:00, 1054.29 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  97%|█████████▋| 896/924 [00:00<00:00, 1083.42 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:00<00:00, 973.26 examples/s] 





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 687.83 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 683.76 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 13:32:13] ❌ Fatal error with config 184: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 13:32:13] 
🔬 Testing configuration 185/264
[2025-05-28 13:32:13] Config: LR=3e-05, BS=2, Epochs=10, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  10%|█         | 96/924 [00:00<00:01, 675.58 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  21%|██        | 192/924 [00:00<00:00, 752.12 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  31%|███       | 288/924 [00:00<00:00, 800.32 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  43%|████▎     | 400/924 [00:00<00:00, 845.20 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  57%|█████▋    | 528/924 [00:00<00:00, 961.64 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  69%|██████▉   | 640/924 [00:00<00:00, 998.23 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  85%|████████▍ | 784/924 [00:00<00:00, 1065.79 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  99%|█████████▊| 912/924 [00:00<00:00, 1089.04 examples/s]


Map: 100%|██████████| 924/924 [00:00<00:00, 972.06 examples/s] 





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 689.03 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 687.19 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 13:32:21] ❌ Fatal error with config 185: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 13:32:21] 
🔬 Testing configuration 186/264
[2025-05-28 13:32:21] Config: LR=2e-05, BS=4, Epochs=3, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 665.17 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 672.02 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 751.24 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 790.23 examples/s]


Map:  52%|█████▏    | 480/924 [00:00<00:00, 853.17 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 892.39 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 972.78 examples/s]


Map:  90%|█████████ | 832/924 [00:00<00:00, 977.97 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 897.68 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 641.65 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 636.45 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.8771, 'grad_norm': 36.47258377075195, 'learning_rate': 1.9179165852537596e-05, 'epoch': 0.4329004329004329}


{'eval_loss': 1.8945775032043457, 'eval_accuracy': 0.12800000607967377, 'eval_runtime': 2.8504, 'eval_samples_per_second': 43.854, 'eval_steps_per_second': 5.613, 'epoch': 0.4329004329004329}


{'loss': 1.8972, 'grad_norm': 163.46282958984375, 'learning_rate': 1.6514279599397774e-05, 'epoch': 0.8658008658008658}


{'eval_loss': 1.774911642074585, 'eval_accuracy': 0.29600000381469727, 'eval_runtime': 2.9787, 'eval_samples_per_second': 41.964, 'eval_steps_per_second': 5.371, 'epoch': 0.8658008658008658}


{'loss': 1.8664, 'grad_norm': 15.242826461791992, 'learning_rate': 1.2533413520603281e-05, 'epoch': 1.2987012987012987}


{'eval_loss': 1.9388182163238525, 'eval_accuracy': 0.14399999380111694, 'eval_runtime': 2.9316, 'eval_samples_per_second': 42.639, 'eval_steps_per_second': 5.458, 'epoch': 1.2987012987012987}


{'loss': 1.8244, 'grad_norm': 21.147674560546875, 'learning_rate': 8.040760830214334e-06, 'epoch': 1.7316017316017316}


{'eval_loss': 1.8305097818374634, 'eval_accuracy': 0.2639999985694885, 'eval_runtime': 2.9486, 'eval_samples_per_second': 42.392, 'eval_steps_per_second': 5.426, 'epoch': 1.7316017316017316}


{'loss': 1.8071, 'grad_norm': 56.2880973815918, 'learning_rate': 3.943903128623336e-06, 'epoch': 2.1645021645021645}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.8893970251083374, 'eval_accuracy': 0.2639999985694885, 'eval_runtime': 2.8996, 'eval_samples_per_second': 43.11, 'eval_steps_per_second': 5.518, 'epoch': 2.1645021645021645}


{'loss': 1.8356, 'grad_norm': 46.204322814941406, 'learning_rate': 1.0909640297225067e-06, 'epoch': 2.5974025974025974}


{'eval_loss': 1.8124425411224365, 'eval_accuracy': 0.2639999985694885, 'eval_runtime': 3.0836, 'eval_samples_per_second': 40.538, 'eval_steps_per_second': 5.189, 'epoch': 2.5974025974025974}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 276.1321, 'train_samples_per_second': 10.039, 'train_steps_per_second': 2.51, 'train_loss': 1.8446044701690454, 'epoch': 3.0}


{'eval_loss': 1.8893970251083374, 'eval_accuracy': 0.2639999985694885, 'eval_runtime': 3.1269, 'eval_samples_per_second': 39.975, 'eval_steps_per_second': 5.117, 'epoch': 3.0}
[2025-05-28 13:37:09] ✅ Config 186: Accuracy=0.2640, Loss=1.8894


[2025-05-28 13:37:09] 
🔬 Testing configuration 187/264
[2025-05-28 13:37:09] Config: LR=3e-05, BS=16, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  10%|█         | 96/924 [00:00<00:01, 680.92 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  21%|██        | 192/924 [00:00<00:00, 760.03 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  31%|███       | 288/924 [00:00<00:00, 813.14 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  43%|████▎     | 400/924 [00:00<00:00, 855.69 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  57%|█████▋    | 528/924 [00:00<00:00, 968.01 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  69%|██████▉   | 640/924 [00:00<00:00, 996.81 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  83%|████████▎ | 768/924 [00:00<00:00, 1051.52 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  97%|█████████▋| 896/924 [00:00<00:00, 1078.52 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:00<00:00, 969.94 examples/s] 





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 679.84 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 679.66 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 13:37:17] ❌ Fatal error with config 187: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 13:37:17] 
🔬 Testing configuration 188/264
[2025-05-28 13:37:17] Config: LR=1e-05, BS=16, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  10%|█         | 96/924 [00:00<00:01, 692.29 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  21%|██        | 192/924 [00:00<00:00, 763.06 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  31%|███       | 288/924 [00:00<00:00, 816.49 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  43%|████▎     | 400/924 [00:00<00:00, 858.99 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  57%|█████▋    | 528/924 [00:00<00:00, 963.63 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  69%|██████▉   | 640/924 [00:00<00:00, 991.03 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  83%|████████▎ | 768/924 [00:00<00:00, 1051.04 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  97%|█████████▋| 896/924 [00:00<00:00, 1083.30 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:00<00:00, 972.37 examples/s] 





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 689.07 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 686.31 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 13:37:25] ❌ Fatal error with config 188: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 13:37:25] 
🔬 Testing configuration 189/264
[2025-05-28 13:37:25] Config: LR=1e-05, BS=4, Epochs=7, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 654.16 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 670.39 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 752.54 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 792.43 examples/s]


Map:  52%|█████▏    | 480/924 [00:00<00:00, 856.09 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 894.38 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 974.62 examples/s]


Map:  90%|█████████ | 832/924 [00:00<00:00, 980.08 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 898.23 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 638.71 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 635.11 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 13:37:32] ❌ Fatal error with config 189: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 13:37:32] 
🔬 Testing configuration 190/264
[2025-05-28 13:37:32] Config: LR=2e-05, BS=8, Epochs=7, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  10%|█         | 96/924 [00:00<00:01, 679.27 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  21%|██        | 192/924 [00:00<00:00, 755.49 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  31%|███       | 288/924 [00:00<00:00, 810.16 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  43%|████▎     | 400/924 [00:00<00:00, 854.23 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  57%|█████▋    | 528/924 [00:00<00:00, 965.77 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  69%|██████▉   | 640/924 [00:00<00:00, 999.20 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  85%|████████▍ | 784/924 [00:00<00:00, 1065.13 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  99%|█████████▊| 912/924 [00:00<00:00, 1082.51 examples/s]


Map: 100%|██████████| 924/924 [00:00<00:00, 972.63 examples/s] 





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 672.64 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 673.76 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 13:37:40] ❌ Fatal error with config 190: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 13:37:40] 
🔬 Testing configuration 191/264
[2025-05-28 13:37:40] Config: LR=1e-05, BS=8, Epochs=7, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 658.55 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 667.26 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 748.08 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 786.09 examples/s]


Map:  52%|█████▏    | 480/924 [00:00<00:00, 848.49 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 884.77 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 963.80 examples/s]


Map:  90%|█████████ | 832/924 [00:00<00:00, 969.45 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 890.73 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 635.90 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 632.66 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.5655937194824219, 'eval_accuracy': 0.3919999897480011, 'eval_runtime': 1.6851, 'eval_samples_per_second': 74.179, 'eval_steps_per_second': 4.747, 'epoch': 1.0}


{'loss': 1.3416, 'grad_norm': 9.028512954711914, 'learning_rate': 9.375747350433044e-06, 'epoch': 1.7241379310344827}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7808289527893066, 'eval_accuracy': 0.7039999961853027, 'eval_runtime': 1.9019, 'eval_samples_per_second': 65.725, 'eval_steps_per_second': 4.206, 'epoch': 2.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.5980282425880432, 'eval_accuracy': 0.7839999794960022, 'eval_runtime': 1.777, 'eval_samples_per_second': 70.345, 'eval_steps_per_second': 4.502, 'epoch': 3.0}


{'loss': 0.2246, 'grad_norm': 3.9483842849731445, 'learning_rate': 6.040953934600425e-06, 'epoch': 3.4482758620689653}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.5568836331367493, 'eval_accuracy': 0.800000011920929, 'eval_runtime': 1.7827, 'eval_samples_per_second': 70.117, 'eval_steps_per_second': 4.487, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.5517748594284058, 'eval_accuracy': 0.800000011920929, 'eval_runtime': 1.7676, 'eval_samples_per_second': 70.718, 'eval_steps_per_second': 4.526, 'epoch': 5.0}


{'loss': 0.019, 'grad_norm': 0.24878761172294617, 'learning_rate': 1.9888363722092376e-06, 'epoch': 5.172413793103448}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.5651555061340332, 'eval_accuracy': 0.8080000281333923, 'eval_runtime': 1.9094, 'eval_samples_per_second': 65.466, 'eval_steps_per_second': 4.19, 'epoch': 6.0}


{'loss': 0.0053, 'grad_norm': 0.12372548878192902, 'learning_rate': 1.171973700349216e-08, 'epoch': 6.896551724137931}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.5861716270446777, 'eval_accuracy': 0.8159999847412109, 'eval_runtime': 1.7474, 'eval_samples_per_second': 71.534, 'eval_steps_per_second': 4.578, 'epoch': 7.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 595.9679, 'train_samples_per_second': 10.853, 'train_steps_per_second': 0.681, 'train_loss': 0.3917392127839803, 'epoch': 7.0}


{'eval_loss': 0.5861716270446777, 'eval_accuracy': 0.8159999847412109, 'eval_runtime': 1.804, 'eval_samples_per_second': 69.292, 'eval_steps_per_second': 4.435, 'epoch': 7.0}
[2025-05-28 13:47:46] ✅ Config 191: Accuracy=0.8160, Loss=0.5862


[2025-05-28 13:47:46] 
🔬 Testing configuration 192/264
[2025-05-28 13:47:46] Config: LR=1e-05, BS=8, Epochs=5, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  10%|█         | 96/924 [00:00<00:01, 688.97 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  21%|██        | 192/924 [00:00<00:00, 766.09 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  31%|███       | 288/924 [00:00<00:00, 820.72 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  43%|████▎     | 400/924 [00:00<00:00, 856.32 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  57%|█████▋    | 528/924 [00:00<00:00, 967.80 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  69%|██████▉   | 640/924 [00:00<00:00, 1002.06 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  85%|████████▍ | 784/924 [00:00<00:00, 1071.55 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  99%|█████████▊| 912/924 [00:00<00:00, 1092.92 examples/s]


Map: 100%|██████████| 924/924 [00:00<00:00, 980.52 examples/s] 





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 690.12 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 688.79 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 13:47:59] ❌ Fatal error with config 192: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 13:47:59] 
🔬 Testing configuration 193/264
[2025-05-28 13:47:59] Config: LR=5e-05, BS=4, Epochs=5, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  10%|█         | 96/924 [00:00<00:01, 687.95 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  21%|██        | 192/924 [00:00<00:00, 767.39 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  31%|███       | 288/924 [00:00<00:00, 814.65 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  43%|████▎     | 400/924 [00:00<00:00, 861.98 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  57%|█████▋    | 528/924 [00:00<00:00, 976.98 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  69%|██████▉   | 640/924 [00:00<00:00, 1002.74 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  83%|████████▎ | 768/924 [00:00<00:00, 1058.85 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  97%|█████████▋| 896/924 [00:00<00:00, 1089.79 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:00<00:00, 978.14 examples/s] 





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 687.02 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 681.35 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 13:48:07] ❌ Fatal error with config 193: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 13:48:07] 
🔬 Testing configuration 194/264
[2025-05-28 13:48:07] Config: LR=5e-05, BS=4, Epochs=7, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  10%|█         | 96/924 [00:00<00:01, 695.55 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  21%|██        | 192/924 [00:00<00:00, 767.85 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  31%|███       | 288/924 [00:00<00:00, 816.67 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  43%|████▎     | 400/924 [00:00<00:00, 862.46 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  57%|█████▋    | 528/924 [00:00<00:00, 975.18 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  69%|██████▉   | 640/924 [00:00<00:00, 1002.03 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  83%|████████▎ | 768/924 [00:00<00:00, 1060.75 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  97%|█████████▋| 896/924 [00:00<00:00, 1090.17 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:00<00:00, 979.08 examples/s] 





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 682.02 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 683.60 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 13:48:15] ❌ Fatal error with config 194: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 13:48:15] 
🔬 Testing configuration 195/264
[2025-05-28 13:48:15] Config: LR=1e-05, BS=8, Epochs=10, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 669.96 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 681.13 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 753.83 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 789.54 examples/s]


Map:  52%|█████▏    | 480/924 [00:00<00:00, 855.18 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 893.85 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 976.87 examples/s]


Map:  90%|█████████ | 832/924 [00:00<00:00, 977.46 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 899.20 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 641.12 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 638.37 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 13:48:22] ❌ Fatal error with config 195: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 13:48:22] 
🔬 Testing configuration 196/264
[2025-05-28 13:48:22] Config: LR=3e-05, BS=2, Epochs=7, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 667.99 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 677.62 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 754.76 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 795.95 examples/s]


Map:  52%|█████▏    | 480/924 [00:00<00:00, 859.69 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 900.11 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 980.42 examples/s]


Map:  92%|█████████▏| 848/924 [00:00<00:00, 990.01 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 903.28 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 625.49 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 628.35 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 13:48:30] ❌ Fatal error with config 196: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 13:48:30] 
🔬 Testing configuration 197/264
[2025-05-28 13:48:30] Config: LR=5e-05, BS=2, Epochs=5, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  10%|█         | 96/924 [00:00<00:01, 689.96 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  21%|██        | 192/924 [00:00<00:00, 768.45 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  31%|███       | 288/924 [00:00<00:00, 816.25 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  43%|████▎     | 400/924 [00:00<00:00, 861.46 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  57%|█████▋    | 528/924 [00:00<00:00, 968.79 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  69%|██████▉   | 640/924 [00:00<00:00, 997.94 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  85%|████████▍ | 784/924 [00:00<00:00, 1064.49 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  99%|█████████▊| 912/924 [00:00<00:00, 1086.10 examples/s]


Map: 100%|██████████| 924/924 [00:00<00:00, 975.41 examples/s] 





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 691.46 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 689.14 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 13:48:38] ❌ Fatal error with config 197: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 13:48:38] 
🔬 Testing configuration 198/264
[2025-05-28 13:48:38] Config: LR=5e-05, BS=2, Epochs=5, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  10%|█         | 96/924 [00:00<00:01, 688.90 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  21%|██        | 192/924 [00:00<00:00, 762.06 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  31%|███       | 288/924 [00:00<00:00, 815.06 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  43%|████▎     | 400/924 [00:00<00:00, 860.38 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  57%|█████▋    | 528/924 [00:00<00:00, 975.06 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  69%|██████▉   | 640/924 [00:00<00:00, 1007.15 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  83%|████████▎ | 768/924 [00:00<00:00, 1063.24 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  97%|█████████▋| 896/924 [00:00<00:00, 1091.11 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:00<00:00, 979.37 examples/s] 





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 688.97 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 683.22 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.8957, 'grad_norm': 2734.859619140625, 'learning_rate': 4.35e-05, 'epoch': 0.21645021645021645}


{'loss': 1.9756, 'grad_norm': 11.159906387329102, 'learning_rate': 4.803561085972851e-05, 'epoch': 0.4329004329004329}


{'loss': 2.0101, 'grad_norm': 11.469075202941895, 'learning_rate': 4.5777692307692306e-05, 'epoch': 0.6493506493506493}


{'loss': 1.8864, 'grad_norm': 9.998397827148438, 'learning_rate': 4.351977375565611e-05, 'epoch': 0.8658008658008658}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.9729140996932983, 'eval_accuracy': 0.2720000147819519, 'eval_runtime': 3.1138, 'eval_samples_per_second': 40.143, 'eval_steps_per_second': 10.277, 'epoch': 1.0}


{'loss': 1.9322, 'grad_norm': 9.699413299560547, 'learning_rate': 4.126185520361991e-05, 'epoch': 1.0822510822510822}


{'loss': 1.9016, 'grad_norm': 8.205329895019531, 'learning_rate': 3.900393665158371e-05, 'epoch': 1.2987012987012987}


{'loss': 1.8492, 'grad_norm': 24.668561935424805, 'learning_rate': 3.674601809954752e-05, 'epoch': 1.5151515151515151}


{'loss': 1.8832, 'grad_norm': 9.412075996398926, 'learning_rate': 3.448809954751131e-05, 'epoch': 1.7316017316017316}


{'loss': 1.8148, 'grad_norm': 6.877192974090576, 'learning_rate': 3.223018099547511e-05, 'epoch': 1.948051948051948}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.8195078372955322, 'eval_accuracy': 0.1599999964237213, 'eval_runtime': 3.4957, 'eval_samples_per_second': 35.758, 'eval_steps_per_second': 9.154, 'epoch': 2.0}


{'loss': 1.8408, 'grad_norm': 12.416749000549316, 'learning_rate': 2.9972262443438916e-05, 'epoch': 2.1645021645021645}


{'loss': 1.8427, 'grad_norm': 9.439713478088379, 'learning_rate': 2.7714343891402716e-05, 'epoch': 2.380952380952381}


{'loss': 1.821, 'grad_norm': 8.156265258789062, 'learning_rate': 2.5456425339366512e-05, 'epoch': 2.5974025974025974}


{'loss': 1.8323, 'grad_norm': 7.816908359527588, 'learning_rate': 2.3198506787330316e-05, 'epoch': 2.813852813852814}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.7662265300750732, 'eval_accuracy': 0.2800000011920929, 'eval_runtime': 3.4186, 'eval_samples_per_second': 36.565, 'eval_steps_per_second': 9.361, 'epoch': 3.0}


{'loss': 1.8139, 'grad_norm': 7.043128967285156, 'learning_rate': 2.0940588235294116e-05, 'epoch': 3.0303030303030303}


{'loss': 1.8253, 'grad_norm': 8.11402416229248, 'learning_rate': 1.868266968325792e-05, 'epoch': 3.2467532467532467}


{'loss': 1.8083, 'grad_norm': 7.080801963806152, 'learning_rate': 1.6424751131221716e-05, 'epoch': 3.463203463203463}


{'loss': 1.8197, 'grad_norm': 9.174485206604004, 'learning_rate': 1.416683257918552e-05, 'epoch': 3.6796536796536796}


{'loss': 1.8218, 'grad_norm': 8.922087669372559, 'learning_rate': 1.1908914027149324e-05, 'epoch': 3.896103896103896}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.8401875495910645, 'eval_accuracy': 0.06400000303983688, 'eval_runtime': 3.4282, 'eval_samples_per_second': 36.462, 'eval_steps_per_second': 9.334, 'epoch': 4.0}


{'loss': 1.8144, 'grad_norm': 7.036344051361084, 'learning_rate': 9.650995475113122e-06, 'epoch': 4.112554112554113}


{'loss': 1.8142, 'grad_norm': 8.731677055358887, 'learning_rate': 7.393076923076925e-06, 'epoch': 4.329004329004329}


{'loss': 1.8188, 'grad_norm': 7.31684684753418, 'learning_rate': 5.135158371040722e-06, 'epoch': 4.545454545454545}


{'loss': 1.7943, 'grad_norm': 8.168353080749512, 'learning_rate': 2.877239819004526e-06, 'epoch': 4.761904761904762}


{'loss': 1.791, 'grad_norm': 6.674577236175537, 'learning_rate': 6.193212669683239e-07, 'epoch': 4.978354978354979}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.8050858974456787, 'eval_accuracy': 0.06400000303983688, 'eval_runtime': 3.4689, 'eval_samples_per_second': 36.034, 'eval_steps_per_second': 9.225, 'epoch': 5.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 805.4387, 'train_samples_per_second': 5.736, 'train_steps_per_second': 2.868, 'train_loss': 1.8524436257102272, 'epoch': 5.0}


{'eval_loss': 1.7662265300750732, 'eval_accuracy': 0.2800000011920929, 'eval_runtime': 3.4511, 'eval_samples_per_second': 36.221, 'eval_steps_per_second': 9.272, 'epoch': 5.0}
[2025-05-28 14:02:15] ✅ Config 198: Accuracy=0.2800, Loss=1.7662


[2025-05-28 14:02:15] 
🔬 Testing configuration 199/264
[2025-05-28 14:02:15] Config: LR=1e-05, BS=8, Epochs=3, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 617.42 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 609.67 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 684.89 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 710.70 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  48%|████▊     | 448/924 [00:00<00:00, 752.72 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  61%|██████    | 560/924 [00:00<00:00, 804.95 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  73%|███████▎  | 672/924 [00:00<00:00, 856.61 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  85%|████████▍ | 784/924 [00:00<00:00, 900.02 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  97%|█████████▋| 896/924 [00:01<00:00, 914.92 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 816.30 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 604.58 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 593.12 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'train_runtime': 99.9841, 'train_samples_per_second': 27.724, 'train_steps_per_second': 1.74, 'train_loss': 1.2377352659729706, 'epoch': 3.0}


{'eval_loss': 1.1382648944854736, 'eval_accuracy': 0.7279999852180481, 'eval_runtime': 1.0638, 'eval_samples_per_second': 117.505, 'eval_steps_per_second': 7.52, 'epoch': 3.0}
[2025-05-28 14:04:05] ✅ Config 199: Accuracy=0.7280, Loss=1.1383


[2025-05-28 14:04:06] 
🔬 Testing configuration 200/264
[2025-05-28 14:04:06] Config: LR=1e-05, BS=16, Epochs=7, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 616.75 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 617.16 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 689.39 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 717.90 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  50%|█████     | 464/924 [00:00<00:00, 771.42 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  62%|██████▏   | 576/924 [00:00<00:00, 811.68 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  76%|███████▌  | 704/924 [00:00<00:00, 889.77 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  88%|████████▊ | 816/924 [00:01<00:00, 901.61 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 919.85 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 824.80 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 596.45 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 580.85 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 14:04:14] ❌ Fatal error with config 200: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 14:04:14] 
🔬 Testing configuration 201/264
[2025-05-28 14:04:14] Config: LR=5e-05, BS=2, Epochs=3, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 591.43 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 572.66 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 628.52 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 642.43 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 693.89 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 758.70 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 777.01 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 843.93 examples/s]


Map:  88%|████████▊ | 816/924 [00:01<00:00, 836.12 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  99%|█████████▊| 912/924 [00:01<00:00, 850.88 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 767.02 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 550.29 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 554.41 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 547.13 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 14:04:23] ❌ Fatal error with config 201: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 14:04:23] 
🔬 Testing configuration 202/264
[2025-05-28 14:04:23] Config: LR=5e-05, BS=8, Epochs=10, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 577.76 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 560.95 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 615.88 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 632.79 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 678.71 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 738.98 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 752.40 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 803.53 examples/s]


Map:  85%|████████▍ | 784/924 [00:01<00:00, 825.94 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 837.44 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 750.81 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 541.17 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 538.45 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 532.68 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.8578, 'grad_norm': 5.740675926208496, 'learning_rate': 3.4942295573599245e-05, 'epoch': 4.310344827586207}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.763218879699707, 'eval_accuracy': 0.2879999876022339, 'eval_runtime': 1.7977, 'eval_samples_per_second': 69.534, 'eval_steps_per_second': 4.45, 'epoch': 4.310344827586207}


{'loss': 1.8079, 'grad_norm': 4.492899417877197, 'learning_rate': 3.035467310337095e-06, 'epoch': 8.620689655172415}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.8063703775405884, 'eval_accuracy': 0.07199999690055847, 'eval_runtime': 2.0097, 'eval_samples_per_second': 62.199, 'eval_steps_per_second': 3.981, 'epoch': 8.620689655172415}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 617.4204, 'train_samples_per_second': 14.965, 'train_steps_per_second': 1.879, 'train_loss': 1.8274621634647763, 'epoch': 10.0}


{'eval_loss': 1.763218879699707, 'eval_accuracy': 0.2879999876022339, 'eval_runtime': 1.8217, 'eval_samples_per_second': 68.616, 'eval_steps_per_second': 4.391, 'epoch': 10.0}
[2025-05-28 14:14:51] ✅ Config 202: Accuracy=0.2880, Loss=1.7632


[2025-05-28 14:14:52] 
🔬 Testing configuration 203/264
[2025-05-28 14:14:52] Config: LR=1e-05, BS=8, Epochs=10, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 585.28 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 564.11 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 622.39 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 642.47 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 696.23 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 761.10 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 771.93 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 842.12 examples/s]


Map:  88%|████████▊ | 816/924 [00:01<00:00, 843.81 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  99%|█████████▊| 912/924 [00:01<00:00, 862.59 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 770.18 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 560.69 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 561.08 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 555.29 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 14:15:00] ❌ Fatal error with config 203: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 14:15:00] 
🔬 Testing configuration 204/264
[2025-05-28 14:15:00] Config: LR=1e-05, BS=16, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 612.61 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 625.13 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 713.92 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 738.22 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  50%|█████     | 464/924 [00:00<00:00, 802.38 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  62%|██████▏   | 576/924 [00:00<00:00, 851.15 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  76%|███████▌  | 704/924 [00:00<00:00, 935.05 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  88%|████████▊ | 816/924 [00:00<00:00, 929.65 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 938.45 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 849.00 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 598.02 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 590.72 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 14:15:09] ❌ Fatal error with config 204: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 14:15:09] 
🔬 Testing configuration 205/264
[2025-05-28 14:15:09] Config: LR=3e-05, BS=4, Epochs=5, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 587.69 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 568.75 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 629.02 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 643.74 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 692.35 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 753.96 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 767.57 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 841.27 examples/s]


Map:  88%|████████▊ | 816/924 [00:01<00:00, 838.94 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  99%|█████████▊| 912/924 [00:01<00:00, 855.04 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 766.86 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 549.18 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 553.61 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 547.13 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.5907, 'grad_norm': 60.19130325317383, 'learning_rate': 1.7194805194805198e-05, 'epoch': 2.1645021645021645}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8457562327384949, 'eval_accuracy': 0.8240000009536743, 'eval_runtime': 2.1235, 'eval_samples_per_second': 58.866, 'eval_steps_per_second': 7.535, 'epoch': 2.1645021645021645}


{'loss': 0.59, 'grad_norm': 0.2030244916677475, 'learning_rate': 4.207792207792208e-06, 'epoch': 4.329004329004329}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.87284916639328, 'eval_accuracy': 0.8240000009536743, 'eval_runtime': 2.4849, 'eval_samples_per_second': 50.304, 'eval_steps_per_second': 6.439, 'epoch': 4.329004329004329}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 479.7443, 'train_samples_per_second': 9.63, 'train_steps_per_second': 2.408, 'train_loss': 1.0137300631700656, 'epoch': 5.0}


{'eval_loss': 0.8457562327384949, 'eval_accuracy': 0.8240000009536743, 'eval_runtime': 2.2401, 'eval_samples_per_second': 55.802, 'eval_steps_per_second': 7.143, 'epoch': 5.0}
[2025-05-28 14:23:20] ✅ Config 205: Accuracy=0.8240, Loss=0.8458


[2025-05-28 14:23:21] 
🔬 Testing configuration 206/264
[2025-05-28 14:23:21] Config: LR=3e-05, BS=16, Epochs=7, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 625.79 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 620.20 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 690.24 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 715.23 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  48%|████▊     | 448/924 [00:00<00:00, 759.70 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  61%|██████    | 560/924 [00:00<00:00, 807.72 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  73%|███████▎  | 672/924 [00:00<00:00, 861.09 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  85%|████████▍ | 784/924 [00:00<00:00, 906.84 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  97%|█████████▋| 896/924 [00:01<00:00, 922.95 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 822.48 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 599.00 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 592.17 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.7606252431869507, 'eval_accuracy': 0.2720000147819519, 'eval_runtime': 1.2347, 'eval_samples_per_second': 101.237, 'eval_steps_per_second': 3.24, 'epoch': 0.9655172413793104}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.4158273935317993, 'eval_accuracy': 0.46399998664855957, 'eval_runtime': 1.3393, 'eval_samples_per_second': 93.329, 'eval_steps_per_second': 2.987, 'epoch': 2.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.8594018816947937, 'eval_accuracy': 0.6320000290870667, 'eval_runtime': 1.3209, 'eval_samples_per_second': 94.631, 'eval_steps_per_second': 3.028, 'epoch': 2.9655172413793105}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.7129248380661011, 'eval_accuracy': 0.7599999904632568, 'eval_runtime': 1.3227, 'eval_samples_per_second': 94.503, 'eval_steps_per_second': 3.024, 'epoch': 4.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6890955567359924, 'eval_accuracy': 0.7839999794960022, 'eval_runtime': 1.3547, 'eval_samples_per_second': 92.274, 'eval_steps_per_second': 2.953, 'epoch': 4.9655172413793105}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.6822507381439209, 'eval_accuracy': 0.7599999904632568, 'eval_runtime': 1.3226, 'eval_samples_per_second': 94.513, 'eval_steps_per_second': 3.024, 'epoch': 6.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.659885048866272, 'eval_accuracy': 0.7919999957084656, 'eval_runtime': 1.3338, 'eval_samples_per_second': 93.714, 'eval_steps_per_second': 2.999, 'epoch': 6.758620689655173}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 441.8698, 'train_samples_per_second': 14.638, 'train_steps_per_second': 0.222, 'train_loss': 0.5559639054901746, 'epoch': 6.758620689655173}


{'eval_loss': 0.659885048866272, 'eval_accuracy': 0.7919999957084656, 'eval_runtime': 1.433, 'eval_samples_per_second': 87.23, 'eval_steps_per_second': 2.791, 'epoch': 6.758620689655173}
[2025-05-28 14:30:53] ✅ Config 206: Accuracy=0.7920, Loss=0.6599


[2025-05-28 14:30:54] 
🔬 Testing configuration 207/264
[2025-05-28 14:30:54] Config: LR=5e-05, BS=4, Epochs=10, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 615.43 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 611.74 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 682.58 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 711.60 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  48%|████▊     | 448/924 [00:00<00:00, 754.81 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  61%|██████    | 560/924 [00:00<00:00, 806.44 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  73%|███████▎  | 672/924 [00:00<00:00, 861.19 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  85%|████████▍ | 784/924 [00:00<00:00, 906.02 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  97%|█████████▋| 896/924 [00:01<00:00, 920.25 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 819.37 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 594.32 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 587.07 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 14:31:03] ❌ Fatal error with config 207: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 14:31:03] 
🔬 Testing configuration 208/264
[2025-05-28 14:31:03] Config: LR=1e-05, BS=2, Epochs=10, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 586.21 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 565.13 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 623.93 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 644.56 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 694.17 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 754.73 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 769.20 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 842.67 examples/s]


Map:  88%|████████▊ | 816/924 [00:01<00:00, 842.69 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  99%|█████████▊| 912/924 [00:01<00:00, 864.70 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 770.26 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 551.83 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 552.54 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 545.94 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 14:31:11] ❌ Fatal error with config 208: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 14:31:11] 
🔬 Testing configuration 209/264
[2025-05-28 14:31:11] Config: LR=5e-05, BS=8, Epochs=3, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 582.20 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 545.73 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 606.78 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 630.98 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 679.41 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 739.31 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 754.62 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 823.48 examples/s]


Map:  88%|████████▊ | 816/924 [00:01<00:00, 824.22 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  99%|█████████▊| 912/924 [00:01<00:00, 843.84 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 752.34 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 599.30 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 596.19 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.8683, 'grad_norm': 11.954146385192871, 'learning_rate': 4.1418471337579625e-05, 'epoch': 0.8620689655172413}


{'eval_loss': 1.9224579334259033, 'eval_accuracy': 0.14399999380111694, 'eval_runtime': 1.8077, 'eval_samples_per_second': 69.147, 'eval_steps_per_second': 4.425, 'epoch': 0.8620689655172413}


{'loss': 1.7216, 'grad_norm': 8.520588874816895, 'learning_rate': 2.5526751592356686e-05, 'epoch': 1.7241379310344827}


{'eval_loss': 1.3998788595199585, 'eval_accuracy': 0.6079999804496765, 'eval_runtime': 1.7886, 'eval_samples_per_second': 69.886, 'eval_steps_per_second': 4.473, 'epoch': 1.7241379310344827}


{'loss': 0.8411, 'grad_norm': 5.530345439910889, 'learning_rate': 9.952866242038216e-06, 'epoch': 2.586206896551724}


{'eval_loss': 0.8309648633003235, 'eval_accuracy': 0.8080000281333923, 'eval_runtime': 1.7895, 'eval_samples_per_second': 69.852, 'eval_steps_per_second': 4.471, 'epoch': 2.586206896551724}


{'train_runtime': 149.1384, 'train_samples_per_second': 18.587, 'train_steps_per_second': 1.167, 'train_loss': 1.3687058920147774, 'epoch': 3.0}


{'eval_loss': 0.7761801481246948, 'eval_accuracy': 0.8320000171661377, 'eval_runtime': 1.7814, 'eval_samples_per_second': 70.17, 'eval_steps_per_second': 4.491, 'epoch': 3.0}
[2025-05-28 14:33:51] ✅ Config 209: Accuracy=0.8320, Loss=0.7762


[2025-05-28 14:33:51] 
🔬 Testing configuration 210/264
[2025-05-28 14:33:51] Config: LR=2e-05, BS=8, Epochs=10, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 587.71 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 570.71 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 629.03 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 648.15 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 696.31 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 757.64 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 772.84 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 838.78 examples/s]


Map:  88%|████████▊ | 816/924 [00:01<00:00, 833.87 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  99%|█████████▊| 912/924 [00:01<00:00, 851.53 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 766.84 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 553.30 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 553.76 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 547.43 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 14:34:00] ❌ Fatal error with config 210: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 14:34:00] 
🔬 Testing configuration 211/264
[2025-05-28 14:34:00] Config: LR=1e-05, BS=16, Epochs=10, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 601.34 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  16%|█▌        | 144/924 [00:00<00:01, 582.56 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  26%|██▌       | 240/924 [00:00<00:01, 662.39 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  36%|███▋      | 336/924 [00:00<00:00, 689.23 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  47%|████▋     | 432/924 [00:00<00:00, 746.63 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  59%|█████▉    | 544/924 [00:00<00:00, 807.15 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 856.43 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  83%|████████▎ | 768/924 [00:00<00:00, 896.76 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  95%|█████████▌| 880/924 [00:01<00:00, 913.19 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 811.79 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 588.61 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 580.33 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 14:34:09] ❌ Fatal error with config 211: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 14:34:09] 
🔬 Testing configuration 212/264
[2025-05-28 14:34:09] Config: LR=1e-05, BS=4, Epochs=10, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 584.72 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 565.62 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 620.44 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 640.79 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 687.56 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 747.15 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 763.71 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 835.54 examples/s]


Map:  88%|████████▊ | 816/924 [00:01<00:00, 835.12 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  99%|█████████▊| 912/924 [00:01<00:00, 853.56 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 763.13 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 553.69 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 560.23 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 553.72 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.817362904548645, 'eval_accuracy': 0.20800000429153442, 'eval_runtime': 2.6549, 'eval_samples_per_second': 47.084, 'eval_steps_per_second': 6.027, 'epoch': 0.9956709956709957}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.8203758001327515, 'eval_accuracy': 0.24799999594688416, 'eval_runtime': 2.8964, 'eval_samples_per_second': 43.156, 'eval_steps_per_second': 5.524, 'epoch': 2.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.7791701555252075, 'eval_accuracy': 0.2160000056028366, 'eval_runtime': 2.8057, 'eval_samples_per_second': 44.553, 'eval_steps_per_second': 5.703, 'epoch': 2.995670995670996}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.7526100873947144, 'eval_accuracy': 0.24799999594688416, 'eval_runtime': 2.9044, 'eval_samples_per_second': 43.039, 'eval_steps_per_second': 5.509, 'epoch': 4.0}


{'loss': 1.798, 'grad_norm': 137.7613067626953, 'learning_rate': 6.304761904761905e-06, 'epoch': 4.329004329004329}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.7838438749313354, 'eval_accuracy': 0.2720000147819519, 'eval_runtime': 2.8707, 'eval_samples_per_second': 43.543, 'eval_steps_per_second': 5.573, 'epoch': 4.995670995670996}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.7145005464553833, 'eval_accuracy': 0.2879999876022339, 'eval_runtime': 2.8749, 'eval_samples_per_second': 43.48, 'eval_steps_per_second': 5.565, 'epoch': 6.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.696589708328247, 'eval_accuracy': 0.30399999022483826, 'eval_runtime': 2.9059, 'eval_samples_per_second': 43.015, 'eval_steps_per_second': 5.506, 'epoch': 6.995670995670996}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.7001781463623047, 'eval_accuracy': 0.2800000011920929, 'eval_runtime': 2.8875, 'eval_samples_per_second': 43.29, 'eval_steps_per_second': 5.541, 'epoch': 8.0}


{'loss': 1.7624, 'grad_norm': 127.44972229003906, 'learning_rate': 1.5523809523809525e-06, 'epoch': 8.658008658008658}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.7198535203933716, 'eval_accuracy': 0.3199999928474426, 'eval_runtime': 2.8969, 'eval_samples_per_second': 43.149, 'eval_steps_per_second': 5.523, 'epoch': 8.995670995670995}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.717447280883789, 'eval_accuracy': 0.3440000116825104, 'eval_runtime': 2.9502, 'eval_samples_per_second': 42.37, 'eval_steps_per_second': 5.423, 'epoch': 9.956709956709958}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 1093.4813, 'train_samples_per_second': 8.45, 'train_steps_per_second': 1.052, 'train_loss': 1.7767961319633152, 'epoch': 9.956709956709958}


{'eval_loss': 1.717447280883789, 'eval_accuracy': 0.3440000116825104, 'eval_runtime': 3.0628, 'eval_samples_per_second': 40.813, 'eval_steps_per_second': 5.224, 'epoch': 9.956709956709958}
[2025-05-28 14:52:34] ✅ Config 212: Accuracy=0.3440, Loss=1.7174


[2025-05-28 14:52:34] 
🔬 Testing configuration 213/264
[2025-05-28 14:52:34] Config: LR=2e-05, BS=2, Epochs=5, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 578.20 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 557.73 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 613.77 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 633.00 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 681.94 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 741.70 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 759.36 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 830.53 examples/s]


Map:  88%|████████▊ | 816/924 [00:01<00:00, 829.28 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  99%|█████████▊| 912/924 [00:01<00:00, 848.88 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 757.09 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 550.03 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 551.63 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 545.60 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 14:52:44] ❌ Fatal error with config 213: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 14:52:44] 
🔬 Testing configuration 214/264
[2025-05-28 14:52:44] Config: LR=2e-05, BS=16, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 640.43 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 636.97 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 705.27 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 731.87 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  48%|████▊     | 448/924 [00:00<00:00, 764.49 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  59%|█████▉    | 544/924 [00:00<00:00, 802.72 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 840.42 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  83%|████████▎ | 768/924 [00:00<00:00, 873.86 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  95%|█████████▌| 880/924 [00:01<00:00, 889.06 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 813.80 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 598.81 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 591.77 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 14:52:52] ❌ Fatal error with config 214: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 14:52:52] 
🔬 Testing configuration 215/264
[2025-05-28 14:52:52] Config: LR=5e-05, BS=8, Epochs=3, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 627.86 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 625.65 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 702.66 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 733.98 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  50%|█████     | 464/924 [00:00<00:00, 796.51 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  62%|██████▏   | 576/924 [00:00<00:00, 833.11 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  76%|███████▌  | 704/924 [00:00<00:00, 911.25 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  88%|████████▊ | 816/924 [00:00<00:00, 915.88 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 945.46 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 844.76 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 562.34 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 565.44 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 558.61 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 14:53:01] ❌ Fatal error with config 215: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 14:53:01] 
🔬 Testing configuration 216/264
[2025-05-28 14:53:01] Config: LR=5e-05, BS=8, Epochs=7, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   5%|▌         | 48/924 [00:00<00:02, 334.00 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  10%|█         | 96/924 [00:00<00:02, 331.62 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  19%|█▉        | 176/924 [00:00<00:01, 470.24 examples/s]


Map:  29%|██▉       | 272/924 [00:00<00:01, 589.16 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 602.10 examples/s]


Map:  48%|████▊     | 448/924 [00:00<00:00, 654.43 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 697.29 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  69%|██████▉   | 640/924 [00:01<00:00, 728.04 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  80%|███████▉  | 736/924 [00:01<00:00, 773.24 examples/s]


Map:  90%|█████████ | 832/924 [00:01<00:00, 788.90 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 812.58 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 679.39 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 545.65 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 549.97 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 542.90 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'train_runtime': 322.0923, 'train_samples_per_second': 20.081, 'train_steps_per_second': 0.63, 'train_loss': 1.0397881568946274, 'epoch': 7.0}


{'eval_loss': 0.7315139770507812, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 1.8012, 'eval_samples_per_second': 69.397, 'eval_steps_per_second': 4.441, 'epoch': 7.0}
[2025-05-28 14:58:34] ✅ Config 216: Accuracy=0.8640, Loss=0.7315


[2025-05-28 14:58:34] 
🔬 Testing configuration 217/264
[2025-05-28 14:58:34] Config: LR=1e-05, BS=2, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 614.57 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 609.72 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 678.24 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 705.86 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  48%|████▊     | 448/924 [00:00<00:00, 751.41 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  61%|██████    | 560/924 [00:00<00:00, 805.15 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 876.44 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  87%|████████▋ | 800/924 [00:00<00:00, 905.54 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  99%|█████████▊| 912/924 [00:01<00:00, 920.64 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 819.38 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 606.00 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 598.56 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 14:58:43] ❌ Fatal error with config 217: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 14:58:43] 
🔬 Testing configuration 218/264
[2025-05-28 14:58:43] Config: LR=3e-05, BS=8, Epochs=3, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 588.04 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 567.88 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 629.09 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 649.69 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 697.61 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 758.94 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 772.40 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 841.68 examples/s]


Map:  88%|████████▊ | 816/924 [00:01<00:00, 838.45 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  99%|█████████▊| 912/924 [00:01<00:00, 855.22 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 768.83 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 557.09 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 556.96 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 550.37 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 14:58:52] ❌ Fatal error with config 218: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 14:58:52] 
🔬 Testing configuration 219/264
[2025-05-28 14:58:52] Config: LR=3e-05, BS=8, Epochs=7, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 611.95 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 607.58 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 679.25 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 710.50 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  48%|████▊     | 448/924 [00:00<00:00, 754.30 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  61%|██████    | 560/924 [00:00<00:00, 810.47 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  73%|███████▎  | 672/924 [00:00<00:00, 866.62 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  85%|████████▍ | 784/924 [00:00<00:00, 911.60 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  97%|█████████▋| 896/924 [00:01<00:00, 930.55 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 822.76 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 604.10 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 598.16 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.8257, 'grad_norm': 10.770914077758789, 'learning_rate': 1.725e-05, 'epoch': 0.43103448275862066}


{'eval_loss': 1.856515884399414, 'eval_accuracy': 0.13600000739097595, 'eval_runtime': 1.8988, 'eval_samples_per_second': 65.831, 'eval_steps_per_second': 4.213, 'epoch': 0.43103448275862066}


{'loss': 1.8164, 'grad_norm': 22.14456558227539, 'learning_rate': 2.9387295081967214e-05, 'epoch': 0.8620689655172413}


{'eval_loss': 1.852940559387207, 'eval_accuracy': 0.13600000739097595, 'eval_runtime': 1.8355, 'eval_samples_per_second': 68.101, 'eval_steps_per_second': 4.358, 'epoch': 0.8620689655172413}


{'loss': 1.6671, 'grad_norm': 17.043882369995117, 'learning_rate': 2.7344945355191255e-05, 'epoch': 1.293103448275862}


{'eval_loss': 1.2972277402877808, 'eval_accuracy': 0.7360000014305115, 'eval_runtime': 1.8934, 'eval_samples_per_second': 66.017, 'eval_steps_per_second': 4.225, 'epoch': 1.293103448275862}


{'loss': 1.0764, 'grad_norm': 10.703402519226074, 'learning_rate': 2.5302595628415302e-05, 'epoch': 1.7241379310344827}


{'eval_loss': 1.0224069356918335, 'eval_accuracy': 0.8240000009536743, 'eval_runtime': 1.8604, 'eval_samples_per_second': 67.191, 'eval_steps_per_second': 4.3, 'epoch': 1.7241379310344827}


{'loss': 1.0119, 'grad_norm': 5.4948296546936035, 'learning_rate': 2.3260245901639346e-05, 'epoch': 2.1551724137931036}


{'eval_loss': 1.0537091493606567, 'eval_accuracy': 0.8560000061988831, 'eval_runtime': 1.9783, 'eval_samples_per_second': 63.187, 'eval_steps_per_second': 4.044, 'epoch': 2.1551724137931036}


{'loss': 0.8672, 'grad_norm': 18.086570739746094, 'learning_rate': 2.1217896174863387e-05, 'epoch': 2.586206896551724}


{'eval_loss': 1.0031590461730957, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 1.7924, 'eval_samples_per_second': 69.74, 'eval_steps_per_second': 4.463, 'epoch': 2.586206896551724}


{'loss': 0.851, 'grad_norm': 1.3416547775268555, 'learning_rate': 1.917554644808743e-05, 'epoch': 3.0172413793103448}


{'eval_loss': 1.0049943923950195, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 1.8596, 'eval_samples_per_second': 67.218, 'eval_steps_per_second': 4.302, 'epoch': 3.0172413793103448}


{'loss': 0.8152, 'grad_norm': 24.112558364868164, 'learning_rate': 1.7133196721311475e-05, 'epoch': 3.4482758620689653}


{'eval_loss': 1.0099049806594849, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 1.7444, 'eval_samples_per_second': 71.657, 'eval_steps_per_second': 4.586, 'epoch': 3.4482758620689653}


{'loss': 0.7748, 'grad_norm': 36.005592346191406, 'learning_rate': 1.5090846994535523e-05, 'epoch': 3.8793103448275863}


{'eval_loss': 1.0008714199066162, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 1.8257, 'eval_samples_per_second': 68.466, 'eval_steps_per_second': 4.382, 'epoch': 3.8793103448275863}


{'loss': 0.7799, 'grad_norm': 23.19527816772461, 'learning_rate': 1.3048497267759563e-05, 'epoch': 4.310344827586207}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 0.9688442945480347, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 1.8227, 'eval_samples_per_second': 68.581, 'eval_steps_per_second': 4.389, 'epoch': 4.310344827586207}


{'loss': 0.7625, 'grad_norm': 0.48543643951416016, 'learning_rate': 1.1006147540983609e-05, 'epoch': 4.741379310344827}


{'eval_loss': 1.0003842115402222, 'eval_accuracy': 0.8799999952316284, 'eval_runtime': 1.9903, 'eval_samples_per_second': 62.805, 'eval_steps_per_second': 4.02, 'epoch': 4.741379310344827}


{'loss': 0.7543, 'grad_norm': 0.2365136444568634, 'learning_rate': 8.963797814207653e-06, 'epoch': 5.172413793103448}


{'eval_loss': 0.9795288443565369, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 1.8928, 'eval_samples_per_second': 66.04, 'eval_steps_per_second': 4.227, 'epoch': 5.172413793103448}


{'loss': 0.7321, 'grad_norm': 0.3424621820449829, 'learning_rate': 6.921448087431694e-06, 'epoch': 5.603448275862069}


{'eval_loss': 0.981340229511261, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 1.8566, 'eval_samples_per_second': 67.326, 'eval_steps_per_second': 4.309, 'epoch': 5.603448275862069}


{'loss': 0.7659, 'grad_norm': 19.20128059387207, 'learning_rate': 4.879098360655739e-06, 'epoch': 6.0344827586206895}


{'eval_loss': 1.0062072277069092, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 1.8508, 'eval_samples_per_second': 67.538, 'eval_steps_per_second': 4.322, 'epoch': 6.0344827586206895}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 297.2664, 'train_samples_per_second': 21.758, 'train_steps_per_second': 2.732, 'train_loss': 1.0357425962175641, 'epoch': 6.0344827586206895}


{'eval_loss': 0.9688442945480347, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 1.9404, 'eval_samples_per_second': 64.42, 'eval_steps_per_second': 4.123, 'epoch': 6.0344827586206895}
[2025-05-28 15:04:00] ✅ Config 219: Accuracy=0.8640, Loss=0.9688


[2025-05-28 15:04:00] 
🔬 Testing configuration 220/264
[2025-05-28 15:04:00] Config: LR=1e-05, BS=2, Epochs=3, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 581.73 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 566.23 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 624.61 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 644.44 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 694.14 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 755.34 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 771.28 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 842.76 examples/s]


Map:  88%|████████▊ | 816/924 [00:01<00:00, 839.18 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  99%|█████████▊| 912/924 [00:01<00:00, 854.81 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 766.97 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 556.52 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 558.68 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 551.68 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 15:04:09] ❌ Fatal error with config 220: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 15:04:09] 
🔬 Testing configuration 221/264
[2025-05-28 15:04:09] Config: LR=5e-05, BS=4, Epochs=5, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 585.45 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 569.84 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 631.44 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 650.54 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 699.87 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 760.33 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 773.11 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 841.76 examples/s]


Map:  88%|████████▊ | 816/924 [00:01<00:00, 837.33 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  99%|█████████▊| 912/924 [00:01<00:00, 853.88 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 768.95 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 548.61 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 550.09 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 544.22 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 15:04:18] ❌ Fatal error with config 221: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 15:04:18] 
🔬 Testing configuration 222/264
[2025-05-28 15:04:18] Config: LR=3e-05, BS=8, Epochs=3, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 620.65 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 616.77 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 688.46 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 719.16 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  50%|█████     | 464/924 [00:00<00:00, 775.22 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  62%|██████▏   | 576/924 [00:00<00:00, 816.19 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  76%|███████▌  | 704/924 [00:00<00:00, 895.14 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  88%|████████▊ | 816/924 [00:01<00:00, 900.04 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 919.36 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 825.91 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 597.28 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 590.67 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 15:04:27] ❌ Fatal error with config 222: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 15:04:27] 
🔬 Testing configuration 223/264
[2025-05-28 15:04:27] Config: LR=5e-05, BS=2, Epochs=3, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 619.65 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 611.78 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 681.44 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 713.17 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  50%|█████     | 464/924 [00:00<00:00, 772.97 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  62%|██████▏   | 576/924 [00:00<00:00, 817.95 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  76%|███████▌  | 704/924 [00:00<00:00, 897.00 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  88%|████████▊ | 816/924 [00:01<00:00, 903.06 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 923.94 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 826.05 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 607.46 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 599.14 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 15:04:35] ❌ Fatal error with config 223: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 15:04:35] 
🔬 Testing configuration 224/264
[2025-05-28 15:04:35] Config: LR=2e-05, BS=8, Epochs=3, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 573.09 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 555.86 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 613.85 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 635.06 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 680.82 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 740.87 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 757.42 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 828.08 examples/s]


Map:  88%|████████▊ | 816/924 [00:01<00:00, 825.30 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  99%|█████████▊| 912/924 [00:01<00:00, 841.77 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 754.23 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 553.13 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 554.39 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 548.42 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 15:04:44] ❌ Fatal error with config 224: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 15:04:44] 
🔬 Testing configuration 225/264
[2025-05-28 15:04:44] Config: LR=1e-05, BS=4, Epochs=3, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 625.13 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 620.30 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 692.11 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 718.75 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  48%|████▊     | 448/924 [00:00<00:00, 751.09 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  61%|██████    | 560/924 [00:00<00:00, 801.28 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  73%|███████▎  | 672/924 [00:00<00:00, 851.62 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  85%|████████▍ | 784/924 [00:00<00:00, 895.11 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  97%|█████████▋| 896/924 [00:01<00:00, 909.87 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 815.67 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 598.99 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 593.66 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 15:04:52] ❌ Fatal error with config 225: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 15:04:52] 
🔬 Testing configuration 226/264
[2025-05-28 15:04:52] Config: LR=5e-05, BS=2, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 585.79 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 564.86 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 624.91 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 644.63 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 693.33 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 753.69 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 770.01 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 839.50 examples/s]


Map:  88%|████████▊ | 816/924 [00:01<00:00, 837.70 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  99%|█████████▊| 912/924 [00:01<00:00, 856.71 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 767.08 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 554.95 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 557.82 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 551.33 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 15:05:01] ❌ Fatal error with config 226: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 15:05:01] 
🔬 Testing configuration 227/264
[2025-05-28 15:05:01] Config: LR=2e-05, BS=8, Epochs=10, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 615.45 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 613.16 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 686.32 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 714.92 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  48%|████▊     | 448/924 [00:00<00:00, 761.70 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  61%|██████    | 560/924 [00:00<00:00, 814.34 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  73%|███████▎  | 672/924 [00:00<00:00, 866.52 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  85%|████████▍ | 784/924 [00:00<00:00, 907.63 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  97%|█████████▋| 896/924 [00:01<00:00, 923.06 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 822.85 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 597.21 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 588.55 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 15:05:10] ❌ Fatal error with config 227: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 15:05:10] 
🔬 Testing configuration 228/264
[2025-05-28 15:05:10] Config: LR=1e-05, BS=4, Epochs=5, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 621.82 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 616.27 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 687.25 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 714.12 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  48%|████▊     | 448/924 [00:00<00:00, 753.38 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  61%|██████    | 560/924 [00:00<00:00, 803.55 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  73%|███████▎  | 672/924 [00:00<00:00, 856.20 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  85%|████████▍ | 784/924 [00:00<00:00, 900.15 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  97%|█████████▋| 896/924 [00:01<00:00, 921.80 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 819.71 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 601.08 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 590.28 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 15:05:19] ❌ Fatal error with config 228: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 15:05:19] 
🔬 Testing configuration 229/264
[2025-05-28 15:05:19] Config: LR=1e-05, BS=8, Epochs=3, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 614.58 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 615.04 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 686.24 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 711.66 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  48%|████▊     | 448/924 [00:00<00:00, 757.40 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  61%|██████    | 560/924 [00:00<00:00, 807.40 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  73%|███████▎  | 672/924 [00:00<00:00, 853.28 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  85%|████████▍ | 784/924 [00:00<00:00, 896.89 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  97%|█████████▋| 896/924 [00:01<00:00, 910.09 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 814.59 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 604.59 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 597.85 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 15:05:27] ❌ Fatal error with config 229: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 15:05:27] 
🔬 Testing configuration 230/264
[2025-05-28 15:05:27] Config: LR=1e-05, BS=16, Epochs=7, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 619.60 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 610.22 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 683.81 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 707.40 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  48%|████▊     | 448/924 [00:00<00:00, 748.73 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  61%|██████    | 560/924 [00:00<00:00, 804.95 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  73%|███████▎  | 672/924 [00:00<00:00, 854.20 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  85%|████████▍ | 784/924 [00:00<00:00, 898.51 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  97%|█████████▋| 896/924 [00:01<00:00, 916.82 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 815.85 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 625.55 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 624.36 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 15:05:37] ❌ Fatal error with config 230: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 15:05:37] 
🔬 Testing configuration 231/264
[2025-05-28 15:05:37] Config: LR=5e-05, BS=2, Epochs=5, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 604.44 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  16%|█▌        | 144/924 [00:00<00:01, 594.26 examples/s]


Map:  26%|██▌       | 240/924 [00:00<00:01, 682.57 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  36%|███▋      | 336/924 [00:00<00:00, 712.21 examples/s]


Map:  47%|████▋     | 432/924 [00:00<00:00, 769.25 examples/s]


Map:  59%|█████▉    | 544/924 [00:00<00:00, 829.49 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 868.38 examples/s]


Map:  83%|████████▎ | 768/924 [00:00<00:00, 904.25 examples/s]


Map:  95%|█████████▌| 880/924 [00:01<00:00, 924.02 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 828.62 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 600.82 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 595.69 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 15:05:46] ❌ Fatal error with config 231: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 15:05:46] 
🔬 Testing configuration 232/264
[2025-05-28 15:05:46] Config: LR=5e-05, BS=8, Epochs=3, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 687.58 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 692.49 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 778.36 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  40%|███▉      | 368/924 [00:00<00:00, 812.62 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  52%|█████▏    | 480/924 [00:00<00:00, 878.43 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 920.91 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  80%|███████▉  | 736/924 [00:00<00:00, 1032.46 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  94%|█████████▎| 864/924 [00:00<00:00, 1037.16 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:00<00:00, 934.09 examples/s] 





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 647.93 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 641.81 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'train_runtime': 90.7762, 'train_samples_per_second': 30.537, 'train_steps_per_second': 0.958, 'train_loss': 1.0217423713070222, 'epoch': 3.0}


{'eval_loss': 0.3826301097869873, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 1.8519, 'eval_samples_per_second': 67.497, 'eval_steps_per_second': 4.32, 'epoch': 3.0}
[2025-05-28 15:07:26] ✅ Config 232: Accuracy=0.8640, Loss=0.3826


[2025-05-28 15:07:27] 
🔬 Testing configuration 233/264
[2025-05-28 15:07:27] Config: LR=5e-05, BS=2, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   9%|▊         | 80/924 [00:00<00:01, 626.81 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 649.31 examples/s]


Map:  28%|██▊       | 256/924 [00:00<00:00, 738.54 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 771.16 examples/s]


Map:  48%|████▊     | 448/924 [00:00<00:00, 803.94 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  61%|██████    | 560/924 [00:00<00:00, 860.48 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 928.07 examples/s]


Map:  87%|████████▋ | 800/924 [00:00<00:00, 960.22 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  99%|█████████▊| 912/924 [00:01<00:00, 968.94 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 870.00 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 605.61 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 599.40 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.3099130392074585, 'eval_accuracy': 0.5120000243186951, 'eval_runtime': 2.8909, 'eval_samples_per_second': 43.24, 'eval_steps_per_second': 11.069, 'epoch': 1.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.1990653276443481, 'eval_accuracy': 0.7839999794960022, 'eval_runtime': 3.3821, 'eval_samples_per_second': 36.96, 'eval_steps_per_second': 9.462, 'epoch': 2.0}


{'loss': 1.3126, 'grad_norm': 1.7562267780303955, 'learning_rate': 4.385515110807893e-05, 'epoch': 2.1645021645021645}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.1247453689575195, 'eval_accuracy': 0.8080000281333923, 'eval_runtime': 3.3731, 'eval_samples_per_second': 37.058, 'eval_steps_per_second': 9.487, 'epoch': 3.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.0852482318878174, 'eval_accuracy': 0.8320000171661377, 'eval_runtime': 3.387, 'eval_samples_per_second': 36.905, 'eval_steps_per_second': 9.448, 'epoch': 4.0}


{'loss': 0.8069, 'grad_norm': 0.267135351896286, 'learning_rate': 1.9436976651092144e-05, 'epoch': 4.329004329004329}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.0354889631271362, 'eval_accuracy': 0.8640000224113464, 'eval_runtime': 3.3721, 'eval_samples_per_second': 37.069, 'eval_steps_per_second': 9.49, 'epoch': 5.0}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.065545678138733, 'eval_accuracy': 0.8479999899864197, 'eval_runtime': 3.3764, 'eval_samples_per_second': 37.022, 'eval_steps_per_second': 9.478, 'epoch': 6.0}


{'loss': 0.7447, 'grad_norm': 0.1555972397327423, 'learning_rate': 8.894839047518727e-07, 'epoch': 6.4935064935064934}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.0374516248703003, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 3.3876, 'eval_samples_per_second': 36.899, 'eval_steps_per_second': 9.446, 'epoch': 7.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 1120.4617, 'train_samples_per_second': 5.773, 'train_steps_per_second': 1.443, 'train_loss': 0.9380921910556659, 'epoch': 7.0}


{'eval_loss': 1.0374516248703003, 'eval_accuracy': 0.871999979019165, 'eval_runtime': 3.4707, 'eval_samples_per_second': 36.016, 'eval_steps_per_second': 9.22, 'epoch': 7.0}
[2025-05-28 15:26:19] ✅ Config 233: Accuracy=0.8720, Loss=1.0375


[2025-05-28 15:26:20] 
🔬 Testing configuration 234/264
[2025-05-28 15:26:20] Config: LR=2e-05, BS=2, Epochs=7, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 623.81 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 618.10 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 688.69 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 717.29 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  50%|█████     | 464/924 [00:00<00:00, 774.65 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  62%|██████▏   | 576/924 [00:00<00:00, 819.45 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  76%|███████▌  | 704/924 [00:00<00:00, 896.30 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  88%|████████▊ | 816/924 [00:01<00:00, 899.97 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 918.09 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 826.23 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 592.80 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 586.28 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 15:26:29] ❌ Fatal error with config 234: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 15:26:29] 
🔬 Testing configuration 235/264
[2025-05-28 15:26:29] Config: LR=5e-05, BS=4, Epochs=5, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 610.38 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  16%|█▌        | 144/924 [00:00<00:01, 585.37 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  26%|██▌       | 240/924 [00:00<00:01, 661.11 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  36%|███▋      | 336/924 [00:00<00:00, 683.15 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  47%|████▋     | 432/924 [00:00<00:00, 742.28 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  59%|█████▉    | 544/924 [00:00<00:00, 802.69 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  71%|███████   | 656/924 [00:00<00:00, 848.71 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  83%|████████▎ | 768/924 [00:00<00:00, 890.76 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  95%|█████████▌| 880/924 [00:01<00:00, 910.49 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 809.99 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 597.42 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 590.26 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 15:26:38] ❌ Fatal error with config 235: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 15:26:38] 
🔬 Testing configuration 236/264
[2025-05-28 15:26:38] Config: LR=1e-05, BS=8, Epochs=3, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 589.46 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 566.63 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 625.89 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 646.44 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 693.41 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 755.29 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 769.92 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 840.49 examples/s]


Map:  88%|████████▊ | 816/924 [00:01<00:00, 838.36 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  99%|█████████▊| 912/924 [00:01<00:00, 854.52 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 766.89 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 553.54 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 557.79 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 550.54 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 15:26:46] ❌ Fatal error with config 236: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'
[2025-05-28 15:26:46] 
🔬 Testing configuration 237/264
[2025-05-28 15:26:46] Config: LR=1e-05, BS=16, Epochs=5, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 624.10 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 619.54 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 691.53 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 719.57 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  50%|█████     | 464/924 [00:00<00:00, 773.69 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  62%|██████▏   | 576/924 [00:00<00:00, 813.58 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  76%|███████▌  | 704/924 [00:00<00:00, 891.62 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  88%|████████▊ | 816/924 [00:01<00:00, 895.59 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 915.51 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 824.29 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 600.80 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 593.60 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 15:26:55] ❌ Fatal error with config 237: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 15:26:55] 
🔬 Testing configuration 238/264
[2025-05-28 15:26:55] Config: LR=5e-05, BS=2, Epochs=7, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 622.21 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 619.93 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 689.10 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 715.46 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  48%|████▊     | 448/924 [00:00<00:00, 762.78 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  61%|██████    | 560/924 [00:00<00:00, 818.38 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  73%|███████▎  | 672/924 [00:00<00:00, 872.51 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  85%|████████▍ | 784/924 [00:00<00:00, 918.27 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  97%|█████████▋| 896/924 [00:01<00:00, 934.26 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 829.41 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 604.52 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 596.74 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 15:27:03] ❌ Fatal error with config 238: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps
[2025-05-28 15:27:03] 
🔬 Testing configuration 239/264
[2025-05-28 15:27:03] Config: LR=1e-05, BS=2, Epochs=10, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 616.72 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 614.61 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 687.30 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 718.54 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  50%|█████     | 464/924 [00:00<00:00, 777.29 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  62%|██████▏   | 576/924 [00:00<00:00, 818.67 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  76%|███████▌  | 704/924 [00:00<00:00, 900.51 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  88%|████████▊ | 816/924 [00:01<00:00, 907.30 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 927.27 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 829.50 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  64%|██████▍   | 80/125 [00:00<00:00, 598.99 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 590.60 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 2.0720791816711426, 'eval_accuracy': 0.2160000056028366, 'eval_runtime': 3.1224, 'eval_samples_per_second': 40.033, 'eval_steps_per_second': 10.248, 'epoch': 1.0}


{'loss': 1.8807, 'grad_norm': 45.3801383972168, 'learning_rate': 9.935714285714286e-06, 'epoch': 1.0822510822510822}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.8309025764465332, 'eval_accuracy': 0.18400000035762787, 'eval_runtime': 3.3063, 'eval_samples_per_second': 37.807, 'eval_steps_per_second': 9.678, 'epoch': 2.0}


{'loss': 1.8295, 'grad_norm': 27.43683433532715, 'learning_rate': 8.745238095238097e-06, 'epoch': 2.1645021645021645}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.7999576330184937, 'eval_accuracy': 0.23999999463558197, 'eval_runtime': 3.2314, 'eval_samples_per_second': 38.683, 'eval_steps_per_second': 9.903, 'epoch': 3.0}


{'loss': 1.8265, 'grad_norm': 27.3193416595459, 'learning_rate': 7.554761904761904e-06, 'epoch': 3.2467532467532467}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.9164353609085083, 'eval_accuracy': 0.12800000607967377, 'eval_runtime': 2.9833, 'eval_samples_per_second': 41.9, 'eval_steps_per_second': 10.726, 'epoch': 4.0}


{'loss': 1.8231, 'grad_norm': 16.89163589477539, 'learning_rate': 6.364285714285714e-06, 'epoch': 4.329004329004329}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.8352689743041992, 'eval_accuracy': 0.07199999690055847, 'eval_runtime': 3.2596, 'eval_samples_per_second': 38.348, 'eval_steps_per_second': 9.817, 'epoch': 5.0}


{'loss': 1.8129, 'grad_norm': 43.94933319091797, 'learning_rate': 5.173809523809524e-06, 'epoch': 5.411255411255412}


Non-default generation parameters: {'forced_eos_token_id': 2}


{'eval_loss': 1.797178864479065, 'eval_accuracy': 0.13600000739097595, 'eval_runtime': 3.2601, 'eval_samples_per_second': 38.343, 'eval_steps_per_second': 9.816, 'epoch': 6.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight'].


{'train_runtime': 1004.9407, 'train_samples_per_second': 9.195, 'train_steps_per_second': 4.597, 'train_loss': 1.831580135901425, 'epoch': 6.0}


{'eval_loss': 1.7999576330184937, 'eval_accuracy': 0.23999999463558197, 'eval_runtime': 3.1434, 'eval_samples_per_second': 39.766, 'eval_steps_per_second': 10.18, 'epoch': 6.0}
[2025-05-28 15:44:00] ✅ Config 239: Accuracy=0.2400, Loss=1.8000


[2025-05-28 15:44:01] 
🔬 Testing configuration 240/264
[2025-05-28 15:44:01] Config: LR=5e-05, BS=2, Epochs=10, WD=0.0



Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:   9%|▊         | 80/924 [00:00<00:01, 620.21 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  17%|█▋        | 160/924 [00:00<00:01, 611.62 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  28%|██▊       | 256/924 [00:00<00:00, 685.82 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  38%|███▊      | 352/924 [00:00<00:00, 717.28 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  48%|████▊     | 448/924 [00:00<00:00, 760.18 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  61%|██████    | 560/924 [00:00<00:00, 811.36 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  73%|███████▎  | 672/924 [00:00<00:00, 865.71 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  85%|████████▍ | 784/924 [00:00<00:00, 911.61 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  97%|█████████▋| 896/924 [00:01<00:00, 927.81 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 824.09 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 580.48 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 589.57 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 581.11 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 15:44:10] ❌ Fatal error with config 240: Adafactor.__init__() got an unexpected keyword argument 'relative_step_size'


[2025-05-28 15:44:10] 
🔬 Testing configuration 241/264
[2025-05-28 15:44:10] Config: LR=1e-05, BS=2, Epochs=5, WD=0.1



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 582.42 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 561.87 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 619.79 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 639.83 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 687.64 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 747.97 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 761.03 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  74%|███████▍  | 688/924 [00:00<00:00, 802.87 examples/s]


Map:  87%|████████▋ | 800/924 [00:01<00:00, 827.79 examples/s]


Map:  97%|█████████▋| 896/924 [00:01<00:00, 840.87 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map: 100%|██████████| 924/924 [00:01<00:00, 754.66 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 557.90 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 557.62 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 552.19 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-05-28 15:44:19] ❌ Fatal error with config 241: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: steps
- Save strategy: epoch
[2025-05-28 15:44:19] 
🔬 Testing configuration 242/264
[2025-05-28 15:44:19] Config: LR=2e-05, BS=4, Epochs=10, WD=0.01



Map:   0%|          | 0/924 [00:00<?, ? examples/s]


Map:   7%|▋         | 64/924 [00:00<00:01, 585.43 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  14%|█▍        | 128/924 [00:00<00:01, 567.32 examples/s]


Map:  23%|██▎       | 208/924 [00:00<00:01, 622.98 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  33%|███▎      | 304/924 [00:00<00:00, 646.30 examples/s]


Map:  43%|████▎     | 400/924 [00:00<00:00, 693.95 examples/s]


Map:  54%|█████▎    | 496/924 [00:00<00:00, 754.75 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  64%|██████▍   | 592/924 [00:00<00:00, 773.48 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  78%|███████▊  | 720/924 [00:00<00:00, 837.75 examples/s]


Map:  88%|████████▊ | 816/924 [00:01<00:00, 830.80 examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Map:  99%|█████████▊| 912/924 [00:01<00:00, 844.96 examples/s]


Map: 100%|██████████| 924/924 [00:01<00:00, 763.04 examples/s]





Map:   0%|          | 0/125 [00:00<?, ? examples/s]


Map:  51%|█████     | 64/125 [00:00<00:00, 548.88 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 548.87 examples/s]


Map: 100%|██████████| 125/125 [00:00<00:00, 542.24 examples/s]




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([6, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 1.8348, 'grad_norm': 25.519290924072266, 'learning_rate': 4.155844155844157e-06, 'epoch': 0.21645021645021645}


{'eval_loss': 1.7721874713897705, 'eval_accuracy': 0.25600001215934753, 'eval_runtime': 2.9845, 'eval_samples_per_second': 41.884, 'eval_steps_per_second': 5.361, 'epoch': 0.21645021645021645}


{'loss': 1.7468, 'grad_norm': 22.11333656311035, 'learning_rate': 8.398268398268398e-06, 'epoch': 0.4329004329004329}


{'eval_loss': 1.7947615385055542, 'eval_accuracy': 0.15199999511241913, 'eval_runtime': 2.9315, 'eval_samples_per_second': 42.64, 'eval_steps_per_second': 5.458, 'epoch': 0.4329004329004329}


{'loss': 1.4796, 'grad_norm': 27.66858673095703, 'learning_rate': 1.2727272727272728e-05, 'epoch': 0.6493506493506493}


{'eval_loss': 1.4088497161865234, 'eval_accuracy': 0.4480000138282776, 'eval_runtime': 3.0708, 'eval_samples_per_second': 40.706, 'eval_steps_per_second': 5.21, 'epoch': 0.6493506493506493}


{'loss': 0.7291, 'grad_norm': 29.484514236450195, 'learning_rate': 1.7056277056277057e-05, 'epoch': 0.8658008658008658}


{'eval_loss': 0.6320651769638062, 'eval_accuracy': 0.800000011920929, 'eval_runtime': 2.9463, 'eval_samples_per_second': 42.427, 'eval_steps_per_second': 5.431, 'epoch': 0.8658008658008658}


{'loss': 0.5888, 'grad_norm': 93.11883544921875, 'learning_rate': 1.9846849446849448e-05, 'epoch': 1.0822510822510822}


{'eval_loss': 1.2835172414779663, 'eval_accuracy': 0.7120000123977661, 'eval_runtime': 3.0682, 'eval_samples_per_second': 40.741, 'eval_steps_per_second': 5.215, 'epoch': 1.0822510822510822}


{'loss': 0.437, 'grad_norm': 1.4862374067306519, 'learning_rate': 1.937782587782588e-05, 'epoch': 1.2987012987012987}


{'eval_loss': 0.5898113250732422, 'eval_accuracy': 0.8320000171661377, 'eval_runtime': 2.9608, 'eval_samples_per_second': 42.218, 'eval_steps_per_second': 5.404, 'epoch': 1.2987012987012987}


{'loss': 0.2985, 'grad_norm': 0.29659485816955566, 'learning_rate': 1.88992303992304e-05, 'epoch': 1.5151515151515151}
