In [1]:
import os
from transformers import PreTrainedTokenizerFast
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace, Sequence, Split, PreTokenizer
from tokenizers.trainers import BpeTrainer
from tokenizers.processors import TemplateProcessing
import datasets

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ds = datasets.load_dataset("jablonkagroup/pubchem-smiles-molecular-formula", split="train")

Generating train split: 100%|██████████| 82335768/82335768 [00:31<00:00, 2655407.02 examples/s]


In [3]:
ds

Dataset({
    features: ['smiles', 'molecular_formula'],
    num_rows: 82335768
})

In [5]:
smiles_tokenizer = PreTrainedTokenizerFast.from_pretrained("../data/pubchem_large_tokenizer_2048")

In [8]:
encoded = smiles_tokenizer.encode(text=ds["molecular_formula"][0], text_pair=ds["smiles"][0])

In [11]:
smiles_tokenizer.convert_tokens_to_string(smiles_tokenizer.convert_ids_to_tokens(encoded))

'[CLS] C14H19 NO2 [SEP] OC1 [C@H]2 CO C[C@H]1 CN(C c1ccccc1) C2 [SEP]'

In [5]:
def create_smiles_pre_tokenizer():
    """Create a pre-tokenizer suitable for SMILES strings."""
    # SMILES use specific characters - we'll split on common atom/bond boundaries
    # This helps the tokenizer learn meaningful chemical substructures
    return Sequence([
        Split(pattern=r'(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])', behavior="isolated"),
    ])

In [6]:
# Define special tokens
special_tokens = [
    "[PAD]",    # Padding token
    "[UNK]",    # Unknown token  
    "[CLS]",    # Classification token (start of sequence)
    "[SEP]",    # Separator token (end of sequence)
]

# Initialize tokenizer with BPE model
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

tokenizer.pre_tokenizer = create_smiles_pre_tokenizer()

trainer = BpeTrainer(
    # vocab_size=1024,
    special_tokens=special_tokens,
    min_frequency=10000,
    show_progress=True
)

In [None]:
# Train the tokenizer
tokenizer.train_from_iterator(ds["smiles"], trainer=trainer)

In [20]:
# Add post-processor to add special tokens
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ],
)
    
# Create transformers tokenizer
fast_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
)

output_dir = "/mnt/workspace/md4/data"
tokenizer_name = "pubchem_large_tokenizer"
# Save tokenizer
os.makedirs(output_dir, exist_ok=True)
tokenizer_path = os.path.join(output_dir, tokenizer_name)
fast_tokenizer.save_pretrained(tokenizer_path)

print(f"Tokenizer saved to {tokenizer_path}")
print(f"Vocabulary size: {fast_tokenizer.vocab_size}")

Tokenizer saved to /mnt/workspace/md4/data/pubchem_large_tokenizer
Vocabulary size: 1024


In [29]:
fast_tokenizer.convert_ids_to_tokens(fast_tokenizer.encode(ds["smiles"][0]))

['[CLS]', 'CCCC', 'CCN', 'O', 'P', '(C)', '(=O)', 'OC', '[SEP]']