# Evaluation vs Optix SELFIE WordLevel Tokenizer

In [None]:
from FastChemTokenizer import FastChemTokenizer
import time

# Micro-benchmark: wall-clock time for 1000 repeated encodes of one long input
# using the FastChemTokenizer stored in the local "../chemtok" directory.
tokenizer = FastChemTokenizer.from_pretrained("../chemtok")

# 500 back-to-back copies of "CCO" form a single 1500-character test string.
test_text = 500 * "CCO"

# One untimed call first, so one-off setup work is excluded from the timing.
# NOTE(review): every timed iteration encodes the very same string; if
# encode() memoizes its input, the loop below measures cache hits rather
# than real tokenization work -- confirm before comparing against other
# tokenizers.
tokenizer.encode(test_text)

# Timed section: only the encode calls sit between the two clock reads.
start = time.perf_counter()
for _run in range(1000):
    tokenizer.encode(test_text)
end = time.perf_counter()

elapsed = end - start
print(f"⏱️  1000 encodes in {elapsed:.4f} seconds")

⏱️  1000 encodes in 0.0021 seconds


In [None]:
import pandas as pd
import time
from tqdm.notebook import tqdm
from transformers import AutoTokenizer
import psutil
import os

# Benchmark setup: load the two tokenizers under comparison, zero the
# per-tokenizer counters, and work out how many chunks the CSV will be read in.

# Load tokenizers with proper naming
#tok1 = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
# NOTE(review): torch_dtype is normally a model-loading argument; it probably
# has no effect on a tokenizer -- confirm and drop if so.
tok1 = AutoTokenizer.from_pretrained("smostafanejad/gen-mlm-cismi-bert-wordpiece", torch_dtype="auto")

# Reconstruct tokenizer
tok2 = FastChemTokenizer.from_pretrained("../chemtok")

# Get UNK token IDs for counting
unk_id1 = tok1.unk_token_id
unk_id2 = tok2.unk_token_id

# Initialize metrics: one independent counter dict per tokenizer.
#   total_time   -- accumulated seconds spent inside batch encoding
#   total_tokens -- accumulated number of token ids produced
#   unk_count    -- accumulated occurrences of the tokenizer's UNK id
#   max_memory   -- largest RSS growth (MB) seen across a single batch call
metrics = {
    name: {"total_time": 0, "total_tokens": 0, "unk_count": 0, "max_memory": 0}
    for name in ("tok1", "tok2")
}

# Read CSV in chunks
chunksize = 50_000
data_path = "../comb_smi.csv"
col_name = "SMILES"

# Count physical lines to size the progress bar. The file is opened in a
# `with` block so the handle is closed as soon as counting finishes. This is a
# line count, not a parsed-row count, so it is only an estimate if any quoted
# field contains a newline.
with open(data_path) as csv_file:
    total_rows = sum(1 for _ in csv_file) - 1  # -1 for header
total_rows = max(total_rows, 0)  # an empty file has no header to subtract
total_chunks = (total_rows + chunksize - 1) // chunksize  # ceiling division

print(f"Processing {data_path} in chunks of {chunksize:,} rows...")

# Run both tokenizers over the CSV chunk by chunk and accumulate, per
# tokenizer, the encode time, token count, UNK count and memory growth.


def _benchmark_chunk(tokenizer, texts, unk_id, stats, process):
    """Encode *texts* with *tokenizer* and fold the measurements into *stats*.

    Args:
        tokenizer: object exposing ``batch_encode_plus``.
        texts: list of strings to encode in one batch.
        unk_id: token id counted as "unknown" in the produced ids.
        stats: per-tokenizer counter dict (one entry of ``metrics``);
            updated in place.
        process: ``psutil.Process`` used to read the resident set size.

    Returns:
        The encoding returned by ``batch_encode_plus``.
    """
    mem_before = process.memory_info().rss / 1024**2  # MB

    start = time.perf_counter()
    encoded = tokenizer.batch_encode_plus(
        texts,
        padding=False,
        truncation=False,
        return_attention_mask=False,
        return_token_type_ids=False,
    )
    elapsed = time.perf_counter() - start

    mem_after = process.memory_info().rss / 1024**2

    input_ids = encoded["input_ids"]
    # RSS delta around the call: growth attributable to this batch, not an
    # absolute process peak.
    stats["max_memory"] = max(stats["max_memory"], mem_after - mem_before)
    stats["total_time"] += elapsed
    stats["total_tokens"] += sum(len(ids) for ids in input_ids)
    stats["unk_count"] += sum(ids.count(unk_id) for ids in input_ids)
    return encoded


# The process handle does not change between chunks, so create it once.
process = psutil.Process(os.getpid())

# Exact number of texts seen. Summing real chunk lengths (instead of
# multiplying the chunk count by chunksize) stays correct when the final
# chunk is shorter than chunksize.
total_texts = 0

# tqdm progress bar over the chunks
with tqdm(pd.read_csv(data_path, chunksize=chunksize, usecols=[col_name]), desc="Processing chunks", total=total_chunks) as pbar:
    for chunk_idx, chunk in enumerate(pbar):
        selfies = chunk[col_name].tolist()
        total_texts += len(selfies)

        # Both tokenizers see the same texts with the same encode options.
        encoded1 = _benchmark_chunk(tok1, selfies, unk_id1, metrics["tok1"], process)
        encoded2 = _benchmark_chunk(tok2, selfies, unk_id2, metrics["tok2"], process)

        pbar.set_postfix(chunk=chunk_idx + 1)

# Without any rows every per-text average below would divide by zero.
if total_texts == 0:
    raise ValueError(f"No rows were read from {data_path}; nothing to benchmark.")

# Final metrics
avg_tokens_tok1 = metrics["tok1"]["total_tokens"] / total_texts
avg_tokens_tok2 = metrics["tok2"]["total_tokens"] / total_texts

# UNK rate as a percentage of all produced tokens (0 when no tokens at all).
unk_rate_tok1 = (metrics["tok1"]["unk_count"] / metrics["tok1"]["total_tokens"]) * 100 if metrics["tok1"]["total_tokens"] > 0 else 0
unk_rate_tok2 = (metrics["tok2"]["unk_count"] / metrics["tok2"]["total_tokens"]) * 100 if metrics["tok2"]["total_tokens"] > 0 else 0

time_per_text_tok1 = (metrics["tok1"]["total_time"] / total_texts) * 1000  # ms
time_per_text_tok2 = (metrics["tok2"]["total_time"] / total_texts) * 1000  # ms

# Print results: one section per tokenizer followed by a head-to-head summary.
print("\n" + "="*60)
print("TOKENIZER PERFORMANCE COMPARISON RESULTS")
print("="*60)
print(f"Total texts processed: {total_texts:,}")
print(f"Chunk size: {chunksize:,} | Columns: ['{col_name}']")

# NOTE(review): this label does not match the Hub id tok1 is loaded from --
# confirm which name is intended.
print("\n[Tokenizer 1: chemfie-experiment-1]")
print(f"  Avg time per text: {time_per_text_tok1:.4f} ms")
print(f"  Avg tokens per text: {avg_tokens_tok1:.2f}")
print(f"  UNK token rate: {unk_rate_tok1:.4f}%")
# "Peak" here is the largest RSS growth measured around one batch call.
print(f"  Peak memory usage: {metrics['tok1']['max_memory']:.2f} MB")

print("\n[Tokenizer 2: new prototype tokenizer]")
print(f"  Avg time per text: {time_per_text_tok2:.4f} ms")
print(f"  Avg tokens per text: {avg_tokens_tok2:.2f}")
print(f"  UNK token rate: {unk_rate_tok2:.4f}%")
print(f"  Peak memory usage: {metrics['tok2']['max_memory']:.2f} MB")

print("\n[Comparison Summary]")
# Report the direction honestly: a ratio at or below 1 means tokenizer 2 took
# at least as long per text, so phrase it as a slowdown instead of printing a
# "faster" factor smaller than 1.
speed_ratio = time_per_text_tok1 / time_per_text_tok2
if speed_ratio > 1:
    print(f"Tokenizer 2 is {speed_ratio:.2f}x faster")
else:
    print(f"Tokenizer 2 is {1/speed_ratio:.2f}x slower")

print(f"Tokenizer 1 produces {avg_tokens_tok1 / avg_tokens_tok2:.2f}x more tokens")
# Difference of the two percentages (tokenizer 1 minus tokenizer 2).
print(f"Tokenizer 2 has {unk_rate_tok1 - unk_rate_tok2:.4f}% lower UNK rate")

if unk_rate_tok1 > 0 or unk_rate_tok2 > 0:
    print("\n⚠️ WARNING: UNK tokens detected! Check if your tokenizers were trained on this data.")
print("="*60)


Processing ../comb_smi.csv in chunks of 50,000 rows...


Processing chunks:   0%|          | 0/54 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (551 > 512). Running this sequence through the model will result in indexing errors



TOKENIZER PERFORMANCE COMPARISON RESULTS
Total texts processed: 2,700,000
Chunk size: 50,000 | Columns: ['SMILES']

[Tokenizer 1: chemfie-experiment-1]
  Avg time per text: 0.0938 ms
  Avg tokens per text: 50.57
  UNK token rate: 0.0000%
  Peak memory usage: 387.43 MB

[Tokenizer 2: new prototype tokenizer]
  Avg time per text: 0.0625 ms
  Avg tokens per text: 21.49
  UNK token rate: 0.0000%
  Peak memory usage: 15.34 MB

[Comparison Summary]
Tokenizer 2 is 1.50x faster
Tokenizer 1 produces 2.35x more tokens
Tokenizer 2 has 0.0000% lower UNK rate



In [8]:
# Re-print the comparison report from the metrics computed earlier, then add
# throughput (texts per second) for each tokenizer.
rule = "=" * 60

print(f"\n{rule}")
print("TOKENIZER PERFORMANCE COMPARISON RESULTS")
print(rule)
print(f"Total texts processed: {total_texts:,}")
print(f"Chunk size: {chunksize:,} | Columns: ['{col_name}']")

# One row per tokenizer: heading, key into `metrics`, ms per text,
# average tokens per text, UNK percentage.
report_rows = (
    ("\n[Tokenizer 1: chemfie-experiment-1]", "tok1", time_per_text_tok1, avg_tokens_tok1, unk_rate_tok1),
    ("\n[Tokenizer 2: new prototype tokenizer]", "tok2", time_per_text_tok2, avg_tokens_tok2, unk_rate_tok2),
)
for heading, metrics_key, ms_per_text, avg_tokens, unk_rate in report_rows:
    print(heading)
    print(f"  Avg time per text: {ms_per_text:.4f} ms")
    print(f"  Avg sequence length: {avg_tokens:.2f} tokens")
    print(f"  UNK token rate: {unk_rate:.4f}%")
    print(f"  Peak memory usage: {metrics[metrics_key]['max_memory']:.2f} MB")

print("\n[Comparison Summary]")
# Ratio above 1 means tokenizer 1 needs more time per text than tokenizer 2.
speed_ratio = time_per_text_tok1 / time_per_text_tok2
if speed_ratio > 1:
    speed_line = f"Tokenizer 2 is {speed_ratio:.2f}x faster"
else:
    speed_line = f"Tokenizer 2 is {1/speed_ratio:.2f}x slower"
print(speed_line)

token_ratio = avg_tokens_tok1 / avg_tokens_tok2
print(f"Tokenizer 1 produces {token_ratio:.2f}x more tokens")
unk_rate_gap = unk_rate_tok1 - unk_rate_tok2
print(f"Tokenizer 2 has {unk_rate_gap:.4f}% lower UNK rate")

# Warn as soon as either tokenizer emitted at least one UNK token.
if any(rate > 0 for rate in (unk_rate_tok1, unk_rate_tok2)):
    print("\n⚠️ WARNING: UNK tokens detected! Check if your tokenizers were trained on this data.")
print(rule)

# Throughput: texts encoded per second of accumulated encode time.
throughput_tok1 = total_texts / metrics["tok1"]["total_time"]  # texts/sec
throughput_tok2 = total_texts / metrics["tok2"]["total_time"]

for label, throughput in (("Tok1", throughput_tok1), ("Tok2", throughput_tok2)):
    print(f"  Throughput {label}: {throughput:,.0f} texts/sec")


TOKENIZER PERFORMANCE COMPARISON RESULTS
Total texts processed: 2,700,000
Chunk size: 50,000 | Columns: ['SMILES']

[Tokenizer 1: chemfie-experiment-1]
  Avg time per text: 0.0938 ms
  Avg sequence length: 50.57 tokens
  UNK token rate: 0.0000%
  Peak memory usage: 387.43 MB

[Tokenizer 2: new prototype tokenizer]
  Avg time per text: 0.0625 ms
  Avg sequence length: 21.49 tokens
  UNK token rate: 0.0000%
  Peak memory usage: 15.34 MB

[Comparison Summary]
Tokenizer 2 is 1.50x faster
Tokenizer 1 produces 2.35x more tokens
Tokenizer 2 has 0.0000% lower UNK rate

  Throughput Tok1: 10,658 texts/sec
  Throughput Tok2: 15,995 texts/sec


In [3]:
from transformers import AutoTokenizer

import time

# Micro-benchmark: wall-clock time for 1000 repeated encodes of one long input
# using the Hugging Face tokenizer loaded below. Despite its name, `model`
# holds a tokenizer (it is created via AutoTokenizer).
model = AutoTokenizer.from_pretrained("smostafanejad/gen-mlm-cismi-bert-wordpiece", torch_dtype="auto")

# 500 back-to-back copies of "CCO" form a single 1500-character test string.
test_text = 500 * "CCO"

# One untimed call first, so one-off setup work is excluded from the timing.
model.encode(test_text)

# Timed section: only the encode calls sit between the two clock reads.
start = time.perf_counter()
for _run in range(1000):
    model.encode(test_text)
end = time.perf_counter()

elapsed = end - start
print(f"⏱️  1000 encodes in {elapsed:.4f} seconds")

⏱️  1000 encodes in 0.5965 seconds
