# Model Comparison
This notebook compares 3 models: BERT, RoBERTa, and DeepSeek. The model with the best metrics will be selected as the baseline for the application. I'll use the Stanford Sentiment Treebank (SST-2) as the testing dataset. Since it's a binary dataset, the first iteration will be a 2-class classification: positive sentiments and negative sentiments.

In [None]:
import os

from datasets import load_dataset
from huggingface_hub import login

# Authenticate against the Hugging Face Hub.
# The token is read from the HF_TOKEN environment variable instead of an
# undefined notebook variable, so this cell runs on a fresh kernel and no
# secret is stored in the notebook. If the variable is unset, `token` is None.
login(token=os.environ.get("HF_TOKEN"))

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [16]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import numpy as np

def compute_metrics(eval_pred):
    """
    Turns raw model outputs into classification metrics.

    The predicted class of each example is the index of its largest score.
    Precision, recall, and F1 are weighted averages over the classes.

    Parameter:
    - eval_pred: object with 'predictions' and 'label_ids' attributes

    Returns:
    - dict with 'accuracy', 'f1', 'precision', and 'recall'
    """
    true_labels = eval_pred.label_ids
    predicted_labels = np.argmax(eval_pred.predictions, axis=1)

    # The fourth element returned by sklearn (support) is not needed here
    weighted_scores = precision_recall_fscore_support(
        true_labels, predicted_labels, average='weighted'
    )
    weighted_precision, weighted_recall, weighted_f1 = weighted_scores[:3]

    return dict(
        accuracy=accuracy_score(true_labels, predicted_labels),
        f1=weighted_f1,
        precision=weighted_precision,
        recall=weighted_recall,
    )

In [17]:
import pandas as pd
def create_comparison_table(results):
    """
    Prints a comparison report of model performance.

    Models are sorted by F1 score (best first). The report shows:
    - Metrics (accuracy, F1, size, training time)
    - Best overall model (highest F1)
    - Most efficient model (highest F1 / (size * time))

    If `results` is empty (for example because every model evaluation
    failed), a short notice is printed instead of the report.

    Parameter:
    - results: list of dicts with keys 'model_name', 'accuracy', 'f1_score',
      'model_size_M', and 'training_time' (other keys are not used)

    Returns:
    - None; the report is written to stdout
    """
    df = pd.DataFrame(results)

    # An empty frame has no 'f1_score' column, so the sort below would raise
    # a KeyError. Report the situation and stop instead.
    if df.empty:
        print("\nNo model results available to compare.")
        return

    # Sort by F1 score, best model first
    df = df.sort_values('f1_score', ascending=False)

    print("\n" + "="*80)
    print("MULTILINGUAL SENTIMENT ANALYSIS COMPARISON")
    print("="*80)

    print("\nPerformance Summary:")
    print(df[['model_name', 'accuracy', 'f1_score', 'model_size_M', 'training_time']].to_string(index=False))

    print("\n" + "="*80)

    # Best overall: first row after sorting by F1
    best_overall = df.iloc[0]
    print("\n1. Best Overall Performance:")
    print(f"   - Model: {best_overall['model_name']}")
    print(f"   - F1 Score: {best_overall['f1_score']:.4f}")
    print(f"   - Size: {best_overall['model_size_M']:.1f}M parameters")

    # Most efficient: F1 per (million parameters * seconds of training time).
    # `assign` returns a new frame, so the sorted frame itself is not modified.
    scored = df.assign(
        efficiency_score=df['f1_score'] / (df['model_size_M'] * df['training_time'])
    )
    most_efficient = scored.nlargest(1, 'efficiency_score').iloc[0]
    print("\n2. Most Efficient (Performance/Size/Time):")
    print(f"   - Model: {most_efficient['model_name']}")
    print(f"   - F1 Score: {most_efficient['f1_score']:.4f}")
    print(f"   - Size: {most_efficient['model_size_M']:.1f}M parameters")

In [18]:
import time
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

def evaluate_model(model_name, model_info):
    """
    Fine-tunes and evaluates one model on SST-2.

    Loads the model and tokenizer, tokenizes the GLUE SST-2 dataset, trains on the
    first 5000 training examples for 2 epochs, evaluates on the validation split,
    and returns performance metrics and basic model info.

    Parameters:
    - model_name: str, HuggingFace model identifier
    - model_info: dict with model details; 'type' is required, 'languages' and
      'num_labels' (default 2) are optional

    Returns:
    - dict with:
        'model_name': str
        'model_type': str
        'languages': str or int
        'accuracy': float
        'f1_score': float
        'model_size_M': float (number of parameters in millions)
        'training_time': float (seconds spent in `trainer.train()` only; model
            loading, dataset preparation and evaluation are not included)
    """

    print(f"\n{'='*60}")
    print(f"Evaluating: {model_name}")
    print(f"Type: {model_info['type']}")
    print(f"Languages supported: {model_info.get('languages', 'N/A')}")
    print(f"{'='*60}")

    # NOTE(review): checkpoints that ship only a SentencePiece vocabulary appear
    # to need the `sentencepiece` package for the slow->fast tokenizer
    # conversion - confirm it is installed in the environment.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    num_labels = model_info.get('num_labels', 2)
    # ignore_mismatched_sizes lets a checkpoint be loaded even when its
    # classification head was trained with a different number of labels.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        ignore_mismatched_sizes=True
    )

    # Tokenize without padding; DataCollatorWithPadding pads each batch later
    def tokenize_function(examples):
        return tokenizer(
            examples["sentence"],
            truncation=True,
            padding=False,
            max_length=128
        )

    # Data preparation
    # Reference: https://huggingface.co/datasets/nyu-mll/glue (SST-2 subset)
    dataset = load_dataset("glue", "sst2")
    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=["sentence", "idx"]  # Remove unnecessary data
    )

    # Train on a 5000-example subset and evaluate on the validation split.
    # The test split is not used in this function.
    train_dataset = tokenized_dataset["train"].select(range(5000))
    val_dataset = tokenized_dataset["validation"]

    training_args = TrainingArguments(
        output_dir=f"./results/{model_name.split('/')[-1]}",
        num_train_epochs=2,  # Good results with few epochs
        per_device_train_batch_size=32,
        per_device_eval_batch_size=64,
        warmup_steps=200,
        learning_rate=2e-5,

        # Evaluation
        eval_strategy="epoch",
        save_strategy="no",

        gradient_checkpointing=False,  # Removed for efficiency
        # Minimal logging
        logging_steps=500,
        report_to="none",
        disable_tqdm=False
    )

    # NOTE(review): the `tokenizer=` argument looks deprecated in recent
    # transformers releases in favour of `processing_class=` - confirm the
    # installed version before switching.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=DataCollatorWithPadding(tokenizer),
        tokenizer=tokenizer,
        compute_metrics=compute_metrics  # Accuracy, f1, recall, precision
    )

    # Time only the training call so that downloads, tokenization and the final
    # evaluation do not distort the per-model comparison.
    print("Training...")
    train_start = time.perf_counter()
    trainer.train()
    training_time = time.perf_counter() - train_start

    print("Evaluating...")
    eval_results = trainer.evaluate()
    model_size = sum(p.numel() for p in model.parameters()) / 1e6  # In millions

    # Save results
    result = {
        'model_name': model_name,
        'model_type': model_info['type'],
        'languages': model_info.get('languages', 1),
        'accuracy': eval_results['eval_accuracy'],
        'f1_score': eval_results['eval_f1'],
        'model_size_M': model_size,
        'training_time': training_time,
    }

    # Clean up: drop the references before asking torch to release cached GPU memory
    del model
    del trainer
    torch.cuda.empty_cache()

    return result

In [19]:
# Candidate checkpoints, keyed by Hugging Face model identifier.
# Each value holds the metadata for that model; 'num_labels' is optional.
models_to_compare = {
    # Multilingual base model
    "xlm-roberta-base": dict(
        type="multilingual_base",
        languages=100,
        size="base",
        map_to_binary=False,
    ),
    # Multilingual model already fine-tuned for sentiment
    "cardiffnlp/twitter-xlm-roberta-base-sentiment": dict(
        type="multilingual_sentiment",
        languages=100,
        num_labels=3,
        map_to_binary=True,
    ),
    # English-only baseline
    "distilbert-base-uncased-finetuned-sst-2-english": dict(
        type="english_baseline",
        languages=1,
        size="distilled",
        map_to_binary=False,
    ),
    # Small multilingual model
    "microsoft/Multilingual-MiniLM-L12-H384": dict(
        type="multilingual_efficient",
        languages=100,
        size="mini",
        map_to_binary=False,
    ),
}

def run_comparison(models):
    """
    Runs the evaluation and comparison for all models.

    For each model:
    - Trains and evaluates it using `evaluate_model`
    - Collects the results
    - Skips models that raise an error (the error message is printed)

    Finally, displays a comparison table with the results. If no model was
    evaluated successfully, a notice is printed and no table is built.

    Parameters:
    - models: dict mapping a model identifier to a dict with model details like
      'type', 'languages', and 'num_labels' (optional)

    Returns:
    - None; all output is written to stdout
    """
    print(f"Models to compare: {len(models)}")
    results = []

    for model_name, model_info in models.items():
        try:
            # Save the results for each model
            results.append(evaluate_model(model_name, model_info))
        except Exception as e:
            # Deliberate best-effort: one failing model must not stop the
            # others from being evaluated.
            print(f"Error with {model_name}: {str(e)}")
            continue

    # Without at least one successful evaluation there is nothing to rank
    if not results:
        print("No model was evaluated successfully; nothing to compare.")
        return

    create_comparison_table(results)

In [20]:
import numpy as np
import torch, random

# Seed every random number generator used here so runs are repeatable
seed = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

# Evaluate every configured model and print the comparison report
run_comparison(models_to_compare)

Models to compare: 4

Evaluating: xlm-roberta-base
Type: multilingual_base
Languages supported: 100


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

  trainer = Trainer(


Training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.453053,0.81422,0.814044,0.814727,0.81422
2,No log,0.352218,0.858945,0.858729,0.860169,0.858945


Evaluating...



Evaluating: cardiffnlp/twitter-xlm-roberta-base-sentiment
Type: multilingual_sentiment
Languages supported: 100
Error with cardiffnlp/twitter-xlm-roberta-base-sentiment: Converting from SentencePiece and Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast converters: ['AlbertTokenizer', 'BartTokenizer', 'BarthezTokenizer', 'BertTokenizer', 'BigBirdTokenizer', 'BlenderbotTokenizer', 'CamembertTokenizer', 'CLIPTokenizer', 'CodeGenTokenizer', 'ConvBertTokenizer', 'DebertaTokenizer', 'DebertaV2Tokenizer', 'DistilBertTokenizer', 'DPRReaderTokenizer', 'DPRQuestionEncoderTokenizer', 'DPRContextEncoderTokenizer', 'ElectraTokenizer', 'FNetTokenizer', 'FunnelTokenizer', 'GPT2Tokenizer', 'HerbertTokenizer', 'LayoutLMTokenizer', 'LayoutLMv2Tokenizer', 'LayoutLMv3Tokenizer', 'LayoutXLMTokenizer', 'LongformerTokenizer', 'LEDTokenizer', 'LxmertTokenizer', 'MarkupLMTokenizer', 'MBartTokenizer', '

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

  trainer = Trainer(


Training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.417077,0.902523,0.902451,0.903067,0.902523
2,No log,0.499975,0.902523,0.902438,0.903212,0.902523


Evaluating...



Evaluating: microsoft/Multilingual-MiniLM-L12-H384
Type: multilingual_efficient
Languages supported: 100
Error with microsoft/Multilingual-MiniLM-L12-H384: Converting from SentencePiece and Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast converters: ['AlbertTokenizer', 'BartTokenizer', 'BarthezTokenizer', 'BertTokenizer', 'BigBirdTokenizer', 'BlenderbotTokenizer', 'CamembertTokenizer', 'CLIPTokenizer', 'CodeGenTokenizer', 'ConvBertTokenizer', 'DebertaTokenizer', 'DebertaV2Tokenizer', 'DistilBertTokenizer', 'DPRReaderTokenizer', 'DPRQuestionEncoderTokenizer', 'DPRContextEncoderTokenizer', 'ElectraTokenizer', 'FNetTokenizer', 'FunnelTokenizer', 'GPT2Tokenizer', 'HerbertTokenizer', 'LayoutLMTokenizer', 'LayoutLMv2Tokenizer', 'LayoutLMv3Tokenizer', 'LayoutXLMTokenizer', 'LongformerTokenizer', 'LEDTokenizer', 'LxmertTokenizer', 'MarkupLMTokenizer', 'MBartTokenizer', 'MBart50Tokeniz