In [14]:
import os
from transformers import PreTrainedTokenizerFast
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace, Sequence, Split, PreTokenizer
from tokenizers.trainers import BpeTrainer
from tokenizers.processors import TemplateProcessing
import datasets

In [2]:
ds = datasets.load_dataset("jablonkagroup/pubchem-smiles-molecular-formula", split="train")

Resolving data files:   0%|          | 0/21 [00:00<?, ?it/s]

Using the latest cached version of the dataset since jablonkagroup/pubchem-smiles-molecular-formula couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/wuhao/.cache/huggingface/datasets/jablonkagroup___pubchem-smiles-molecular-formula/default/0.0.0/e0869fce5418e8672cd3c4ce2f8a60e776bf1372 (last modified on Mon Jul 21 13:04:12 2025).


Loading dataset shards:   0%|          | 0/21 [00:00<?, ?it/s]

In [16]:
ds

Dataset({
    features: ['CID', 'smiles', 'molecular_formula'],
    num_rows: 118536656
})

In [10]:
smiles_tokenizer = PreTrainedTokenizerFast.from_pretrained("DeepChem/SmilesTokenizer_PubChem_1M")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RobertaTokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [17]:
def create_smiles_pre_tokenizer():
    """Create a pre-tokenizer suitable for SMILES strings."""
    # SMILES use specific characters - we'll split on common atom/bond boundaries
    # This helps the tokenizer learn meaningful chemical substructures
    return Sequence([
        Split(pattern=r'(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])', behavior="isolated"),
    ])

In [18]:
# Define special tokens
special_tokens = [
    "[PAD]",    # Padding token
    "[UNK]",    # Unknown token  
    "[CLS]",    # Classification token (start of sequence)
    "[SEP]",    # Separator token (end of sequence)
]

# Initialize tokenizer with BPE model
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

tokenizer.pre_tokenizer = create_smiles_pre_tokenizer()

trainer = BpeTrainer(
    vocab_size=1024,
    special_tokens=special_tokens,
    min_frequency=2000,
    show_progress=True
)

In [None]:
# Train the tokenizer
tokenizer.train_from_iterator(ds["smiles"], trainer=trainer)






In [None]:
# Add post-processor to add special tokens
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ],
)
    
# Create transformers tokenizer
fast_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
)

output_dir = "/mnt/workspace/md4/data"
tokenizer_name = "pubchem_large_tokenizer"
# Save tokenizer
os.makedirs(output_dir, exist_ok=True)
tokenizer_path = os.path.join(output_dir, tokenizer_name)
fast_tokenizer.save_pretrained(tokenizer_path)

print(f"Tokenizer saved to {tokenizer_path}")
print(f"Vocabulary size: {fast_tokenizer.vocab_size}")

Tokenizer saved to data/pubchem_large_tokenizer
Vocabulary size: 30000


In [13]:
fast_tokenizer.vocab

{'COC12C': 19445,
 'Cc1nc2cccnc2': 25176,
 'c5sc6ccccc6c5': 21174,
 'c7cc8ccccc8': 5759,
 'Cn1cccc1C1': 27230,
 'c3c2OC': 11473,
 'CNCc3cc': 22254,
 'NCCCN1CC': 18258,
 'CCC2OCCCO2': 25164,
 'c3c4ccc5c6ccc7c8c': 25095,
 'Sc8ccccc87': 27711,
 'CCn2c3c': 22081,
 'n1nc': 726,
 'c2c3ccc4ccccc4c3nc3c2ccc2ccccc23': 28905,
 'Oc3nncc4ccccc34': 18941,
 '13cc': 8112,
 'C1Cc1csc': 28943,
 'c2nc3o': 26232,
 'CN1CCCC1': 1085,
 'C1CCNC1': 2650,
 'Oc3nccc4cc': 21733,
 '2COc2ccccc2': 28908,
 'CCNCCCN1C': 19668,
 'CCn2nc3c': 9526,
 'CCNc1ncccc1S': 8323,
 'COCCCc1ccncc1': 24776,
 'cc4C6': 24931,
 'Oc1nnc': 15357,
 '32CCC': 27104,
 'ccc4c3c12': 24352,
 'oc1ccccc13': 9764,
 'Oc3n': 8059,
 'CCCNc1snnc1CN': 16108,
 'n1cccc1C': 7304,
 'c2ccc2': 2023,
 'CCCNCc1cc2c': 24727,
 'Nc2nc3cccc': 16096,
 'NCC45CC6CC': 13879,
 'Oc1ccccc1I': 13202,
 'cc5cc34': 23644,
 'OC1n1c': 23276,
 'NCC2CC3CCC': 19112,
 'OCCC1CC': 20881,
 'CCCCCCCS': 10559,
 'Cc1cccc2cc3ccccc3cc12': 25004,
 'c3cc2c': 13179,
 'n5c6ccccc6c6cccc': 181