## PSMILES 标准化

In [9]:
from canonicalize_psmiles import canonicalize
from rdkit import Chem
from typing import Optional
import tqdm as notebook_tqdm

In [10]:
def has_two_stars(ps):
    """Return True when ``ps`` contains exactly two ``[*]`` connection points."""
    star_total = ps.count('[*]')
    return star_total == 2

def canonicalize_or_skip(psmiles: str) -> Optional[str]:
    """Canonicalize a PSMILES string, or return None when it should be skipped.

    The string is passed through ``canonicalize``; the result is kept only if
    it has exactly two ``[*]`` connection points and RDKit can parse it once
    each ``[*]`` has been swapped for a ``[Xe]`` placeholder atom.  Any
    exception raised along the way is treated as "skip" and yields None.
    """
    try:
        normalized = canonicalize(psmiles)
        if has_two_stars(normalized):
            # Basic RDKit syntax check: stand in a placeholder atom for the
            # connection points before parsing.
            placeholder_smiles = normalized.replace('[*]', '[Xe]')
            parsed = Chem.MolFromSmiles(placeholder_smiles)
            if parsed is not None:
                return normalized
    except Exception:
        return None
    return None



In [11]:
# Load the polyBERT SentencePiece tokenizer
from transformers import AutoTokenizer

TOK_NAME = "kuelumbus/polyBERT"  # open-source model and vocabulary corresponding to the paper
tok = AutoTokenizer.from_pretrained(TOK_NAME, use_fast=False)  # SentencePiece; the slow (non-fast) tokenizer is requested explicitly
print(tok.mask_token, tok.cls_token, tok.sep_token)  # confirm the special tokens

[MASK] [CLS] [SEP]


In [None]:
import random
# Seed the module-level RNG with a fixed value so later calls into the
# `random` module produce the same sequence on every run.
random.seed(42)

def random_spans_to_mask(ids, p=0.20, min_span=1, max_span=5, mask_id=None,
                         protected_ids=None):
    """Return a copy of a PSMILES token-id list with random spans masked.

    Spans of ``min_span``..``max_span`` consecutive positions are drawn at
    random and their tokens replaced by ``mask_id`` until roughly a fraction
    ``p`` of the sequence is masked.  The first and last positions (where the
    [CLS]/[SEP] tokens sit) are never touched, and neither is any token whose
    id is protected.

    Args:
        ids: Sequence of token ids, including the leading and trailing
            special tokens.  It is not modified.
        p: Target fraction of ``len(ids)`` to mask.  At least one token is
            masked whenever a maskable position exists.
        min_span: Smallest span length to draw (must be >= 1).
        max_span: Largest span length to draw (must be >= ``min_span``).
        mask_id: Id written into masked positions.  Defaults to
            ``tok.mask_token_id`` of the module-level tokenizer.
        protected_ids: Iterable of token ids that must never be masked.  When
            omitted, it is derived from the module-level tokenizer: the
            [CLS] and [SEP] ids plus every id in ``ids`` that decodes to
            ``'[*]'`` (star tokens are left alone to reduce the risk of
            producing syntactically broken strings).

    Returns:
        A new list of the same length as ``ids``.

    Raises:
        ValueError: If the span bounds are not ``1 <= min_span <= max_span``.
    """
    if min_span < 1 or max_span < min_span:
        raise ValueError("expected 1 <= min_span <= max_span")

    masked = list(ids)
    n = len(masked)
    if n < 3:
        # Nothing lies between the two boundary tokens.
        return masked

    # Resolve tokenizer-dependent defaults once, outside the sampling loop.
    if mask_id is None:
        mask_id = tok.mask_token_id
    if protected_ids is None:
        protected = {tok.cls_token_id, tok.sep_token_id}
        protected.update(t for t in set(masked) if tok.decode([t]) == '[*]')
    else:
        protected = set(protected_ids)

    # Interior positions that may still be masked.
    maskable = {i for i in range(1, n - 1) if masked[i] not in protected}
    if not maskable:
        return masked

    # Never ask for more masked tokens than there are maskable positions,
    # otherwise the loop below could not terminate.
    target = min(len(maskable), max(1, int(n * p)))
    covered = 0
    while covered < target:
        span = random.randint(min_span, max_span)
        # Any interior position may start a span; the span is clipped at the
        # last interior index so short sequences never produce an empty range.
        start = random.randint(1, n - 2)
        for idx in range(start, min(start + span, n - 1)):
            if idx not in maskable:
                continue  # protected, or already masked by an earlier span
            masked[idx] = mask_id
            maskable.discard(idx)
            covered += 1
            if covered >= target:
                break
    return masked