In [1]:
from transformers import AutoTokenizer
import numpy as np
from collections import Counter

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

def compare_tokenizers(
    text_samples,
    bge_tokenizer=None,
    nomic_tokenizer=None,
    bge_model_name="BAAI/bge-base-en-v1.5",
    nomic_model_name="nomic-ai/nomic-embed-text-v1.5",
):
    """
    Compare tokenization results between BGE and Nomic tokenizers

    Args:
        text_samples: Iterable of text strings to compare tokenization.
            Any iterable is accepted (it is materialised once, so generators
            work); a single bare string is treated as one sample rather than
            being iterated character by character.
        bge_tokenizer: Optional already-loaded tokenizer to use for the "bge"
            side. It must provide ``tokenize(text)`` and ``get_vocab()``.
            When None, it is loaded with ``AutoTokenizer.from_pretrained``.
        nomic_tokenizer: Same as ``bge_tokenizer`` but for the "nomic" side.
        bge_model_name: Model id loaded when ``bge_tokenizer`` is None.
        nomic_model_name: Model id loaded when ``nomic_tokenizer`` is None.

    Returns:
        dict: Comparison statistics and analysis results, with keys
            "vocabulary_sizes" (per-tokenizer vocabulary length),
            "samples" (one dict per input text holding the text, both token
            lists, token counts, unique-token counts and an
            "identical_tokenization" flag) and
            "overall_stats" (total samples, number and percentage of samples
            whose token lists are identical; the percentage is 0 when there
            are no samples).
    """
    # Normalise the input up front: len() is needed for the statistics below,
    # and a lone string would otherwise be split into single characters.
    if isinstance(text_samples, str):
        text_samples = [text_samples]
    else:
        text_samples = list(text_samples)

    # Load only what the caller did not supply, so tokenizers that are
    # already in memory are not fetched a second time.
    if bge_tokenizer is None:
        bge_tokenizer = AutoTokenizer.from_pretrained(bge_model_name)
    if nomic_tokenizer is None:
        nomic_tokenizer = AutoTokenizer.from_pretrained(nomic_model_name)

    results = {
        "vocabulary_sizes": {
            # get_vocab() is the documented accessor for the token-to-id map.
            "bge": len(bge_tokenizer.get_vocab()),
            "nomic": len(nomic_tokenizer.get_vocab()),
        },
        "samples": [],
    }

    # Compare tokenization for each sample
    for text in text_samples:
        bge_tokens = bge_tokenizer.tokenize(text)
        nomic_tokens = nomic_tokenizer.tokenize(text)

        results["samples"].append(
            {
                "text": text,
                "bge_tokens": bge_tokens,
                "nomic_tokens": nomic_tokens,
                "token_counts": {
                    "bge": len(bge_tokens),
                    "nomic": len(nomic_tokens),
                },
                # Only the number of distinct tokens is reported, so a set
                # is enough; per-token frequencies are never used.
                "unique_tokens": {
                    "bge": len(set(bge_tokens)),
                    "nomic": len(set(nomic_tokens)),
                },
                "identical_tokenization": bge_tokens == nomic_tokens,
            }
        )

    # Calculate overall statistics
    total_samples = len(text_samples)
    identical_count = sum(
        1 for sample in results["samples"] if sample["identical_tokenization"]
    )
    results["overall_stats"] = {
        "total_samples": total_samples,
        "identical_tokenizations": identical_count,
        # Guard against division by zero when no samples were given.
        "identical_percentage": (identical_count / total_samples) * 100 if total_samples else 0,
    }

    return results

In [3]:

def print_comparison_report(results):
    """Write a plain-text summary of a tokenizer comparison to stdout.

    Args:
        results: Mapping that provides
            ``results['vocabulary_sizes']`` with 'bge' and 'nomic' entries,
            ``results['overall_stats']`` with 'total_samples',
            'identical_tokenizations' and 'identical_percentage', and
            ``results['samples']``, an iterable of per-sample mappings with
            'text', 'bge_tokens', 'nomic_tokens', 'token_counts' and
            'identical_tokenization'.

    Returns:
        None. The report is only printed.
    """
    # Title block
    print("Tokenizer Comparison Report")
    print("==========================")

    # Vocabulary sizes, formatted with thousands separators
    print("\nVocabulary Sizes:")
    vocab_sizes = results['vocabulary_sizes']
    print(f"BGE: {vocab_sizes['bge']:,} tokens")
    print(f"Nomic: {vocab_sizes['nomic']:,} tokens")

    # Aggregate figures across all samples
    print("\nOverall Statistics:")
    overall = results['overall_stats']
    print(f"Total samples analyzed: {overall['total_samples']}")
    print(f"Identical tokenizations: {overall['identical_tokenizations']}")
    print(f"Percentage identical: {overall['identical_percentage']:.1f}%")

    # One section per sample, numbered from 1
    print("\nDetailed Sample Analysis:")
    for position, entry in enumerate(results['samples'], start=1):
        print(f"\nSample {position}:")
        print(f"Text: {entry['text']}")
        lengths = entry['token_counts']
        print(f"BGE tokens ({lengths['bge']}): {entry['bge_tokens']}")
        print(f"Nomic tokens ({lengths['nomic']}): {entry['nomic_tokens']}")
        print(f"Identical: {entry['identical_tokenization']}")

In [4]:

# Example usage: three probe sentences -- plain English, a sentence containing
# a longer technical word, and text mixing Latin, Cyrillic and CJK characters.
sample_texts = [
    "This is a test sentence.",
    "Machine learning models use different tokenization approaches.",
    "Some текст with mixed 字符 and специальные characters!",
]

# Run the comparison, then print the human-readable report.
results = compare_tokenizers(text_samples=sample_texts)
print_comparison_report(results=results)

Tokenizer Comparison Report
==========================

Vocabulary Sizes:
BGE: 30,522 tokens
Nomic: 30,522 tokens

Overall Statistics:
Total samples analyzed: 3
Identical tokenizations: 3
Percentage identical: 100.0%

Detailed Sample Analysis:

Sample 1:
Text: This is a test sentence.
BGE tokens (6): ['this', 'is', 'a', 'test', 'sentence', '.']
Nomic tokens (6): ['this', 'is', 'a', 'test', 'sentence', '.']
Identical: True

Sample 2:
Text: Machine learning models use different tokenization approaches.
BGE tokens (9): ['machine', 'learning', 'models', 'use', 'different', 'token', '##ization', 'approaches', '.']
Nomic tokens (9): ['machine', 'learning', 'models', 'use', 'different', 'token', '##ization', 'approaches', '.']
Identical: True

Sample 3:
Text: Some текст with mixed 字符 and специальные characters!
BGE tokens (24): ['some', 'т', '##е', '##к', '##с', '##т', 'with', 'mixed', '[UNK]', '[UNK]', 'and', 'с', '##п', '##е', '##ц', '##и', '##а', '##л', '##ь', '##н', '##ы', '##е', 'characters', '!']
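Nomic tokens (24): ['some', 'т', '##е', '##к', '##с', '##т', 'with', 'mixed', '[UNK]', '[UNK]', 'and', 'с', '##п', '##е', '##ц', '##и', '##а', '##л', '##ь', '##н', '##ы', '##е', 'characters', '!']
Identical: True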

In [5]:
# Load both tokenizers at notebook scope so the cells below can inspect them.
# These are the same two model ids that compare_tokenizers() loads internally.
bge_tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-base-en-v1.5")
nomic_tokenizer = AutoTokenizer.from_pretrained("nomic-ai/nomic-embed-text-v1.5")


In [6]:
# Show the BGE tokenizer's repr (class, vocab size, max length, special tokens).
bge_tokenizer

BertTokenizerFast(name_or_path='BAAI/bge-base-en-v1.5', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [7]:
# Show the Nomic tokenizer's repr for side-by-side comparison with the BGE one.
nomic_tokenizer

BertTokenizerFast(name_or_path='nomic-ai/nomic-embed-text-v1.5', vocab_size=30522, model_max_length=8192, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}