In [1]:
import sys, torch
from pathlib import Path
repo = Path("/Users/tangren/Documents/PolymersGenerator")
sys.path.append(str(repo / "src"))  # 允许导入 src 包

In [2]:
# 导入模块与设备
from src.tokenizer import PolyBertTokenizer
from src.dataset import make_loader
from src.model import VAESmiles
from src.train import train_one_epoch, val_loss, set_seed
from transformers import AutoModel
import torch.optim as optim
import tqdm as notebook_tqdm

set_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "mps"
                      if torch.backends.mps.is_available() else "cpu")


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# 加载数据与tokenizer
csv_path = "data/PSMILES_Tg_only.csv"
tokenizer = PolyBertTokenizer("kuelumbus/polyBERT")
train_loader = make_loader(
    csv_path,
    tokenizer,
    batch_size=128,
    shuffle=True,
    col="PSMILES",
    max_len=256,
)
val_loader = make_loader(
    csv_path,
    tokenizer,
    batch_size=128,
    shuffle=False,
    col="PSMILES",
    max_len=256,
)


In [4]:
# 构建带polyBERT编码器的VAE模型
polybert = AutoModel.from_pretrained("kuelumbus/polyBERT")
model = VAESmiles(
    vocab_size=tokenizer.vocab_size,
    emb_dim=256,
    encoder_hid_dim=polybert.config.hidden_size,
    decoder_hid_dim=512,
    z_dim=128,
    n_layers=1,
    pad_id=tokenizer.pad_id,
    bos_id=tokenizer.bos_id,
    eos_id=tokenizer.eos_id,
    drop=0.1,
    use_polybert=True,
    polybert=polybert,
    freeze_polybert=True,
    polybert_pooling="cls",
).to(device)

optimizer = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=0.01)



In [5]:
# 训练循环
epochs, best = 10, float("inf")
for epoch in range(epochs):
    kl_w = min(1.0, (epoch + 1) / 10.0)
    train_loss = train_one_epoch(model, train_loader, optimizer,
                                 kl_w, tokenizer.pad_id, device)
    val_loss_value = val_loss(model, val_loader, kl_w,
                              tokenizer.pad_id, device)
    print(f"[{epoch+1}/{epochs}] train={train_loss:.4f} "
          f"val={val_loss_value:.4f} kl_w={kl_w:.2f}")

    if val_loss_value + 1e-3 < best:
        best = val_loss_value
        (repo / "checkpoints").mkdir(exist_ok=True)
        torch.save(
            {
                "model": model.state_dict(),
                "tokenizer_name": "kuelumbus/polyBERT",
                "tokenizer": tokenizer.get_vocab(),
                "pad_token_id": tokenizer.pad_id,
                "bos_token_id": tokenizer.bos_id,
                "eos_token_id": tokenizer.eos_id,
                "use_polybert": True,
            },
            repo / "checkpoints/notebook.pt",
        )



                                               

[1/10] train=2.4584 val=1.4561 kl_w=0.10


                                               

[2/10] train=1.2960 val=1.1144 kl_w=0.20


                                               

[3/10] train=1.0283 val=0.9243 kl_w=0.30


                                               

[4/10] train=0.8725 val=0.8074 kl_w=0.40


                                               

[5/10] train=0.7737 val=0.7248 kl_w=0.50


                                               

[6/10] train=0.7033 val=0.6666 kl_w=0.60


                                               

[7/10] train=0.6513 val=0.6218 kl_w=0.70


                                               

[8/10] train=0.6125 val=0.5903 kl_w=0.80


                                               

[9/10] train=0.5821 val=0.5620 kl_w=0.90


                                               

[10/10] train=0.5562 val=0.5391 kl_w=1.00


In [6]:
# 生成与重构
@torch.no_grad()
def sample_smiles(model, tokenizer, num=16, max_len=256):
    z = torch.randn(num, model.mu.out_features, device=device)
    token_ids = model.sample(z, max_len=max_len)
    return [tokenizer.decode(row.tolist()) for row in token_ids.cpu()]

@torch.no_grad()
def reconstruct(model, tokenizer, smiles):
    ids = tokenizer.encode(smiles)
    enc = torch.tensor(ids, device=device).unsqueeze(0)
    mask = (enc != tokenizer.pad_id).long()
    mu, logvar = model.encode(enc, mask)
    z = model.reparameterize(mu, logvar)
    out = model.sample(z, max_len=enc.size(1))
    return tokenizer.decode(out.squeeze(0).tolist())

model.eval()
generated = sample_smiles(model, tokenizer, num=10)
recon = reconstruct(model, tokenizer, "[*]#C[SiH2]C#Cc1cccc(C#[*])c1")


In [5]:
# 推理用保存的模型
ckpt = torch.load(repo / "checkpoints/notebook.pt", map_location=device)
tokenizer = PolyBertTokenizer(ckpt["tokenizer_name"])
polybert = AutoModel.from_pretrained(ckpt["tokenizer_name"])
model.load_state_dict(ckpt["model"])
model.to(device).eval()
# 之后可复用 sample_smiles / reconstruct


  ckpt = torch.load(repo / "checkpoints/notebook.pt", map_location=device)


VAESmiles(
  (drop): Dropout(p=0.1, inplace=False)
  (emb): Embedding(270, 256, padding_idx=267)
  (polybert): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(269, 600, padding_idx=3)
      (position_embeddings): Embedding(512, 600)
      (LayerNorm): LayerNorm((600,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=600, out_features=600, bias=True)
              (key_proj): Linear(in_features=600, out_features=600, bias=True)
              (value_proj): Linear(in_features=600, out_features=600, bias=True)
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=600, out_features=600, bias=True)
           

In [6]:
# 加载模型
import torch
from src.tokenizer import PolyBertTokenizer
from src.model import VAESmiles
from transformers import AutoModel

ckpt_path = repo / "checkpoints/notebook.pt"   # 调整为你保存的路径
ckpt = torch.load(ckpt_path, map_location=device)

tokenizer = PolyBertTokenizer(ckpt["tokenizer_name"])
polybert = AutoModel.from_pretrained(ckpt["tokenizer_name"])

model = VAESmiles(
    vocab_size=tokenizer.vocab_size,
    emb_dim=256,
    encoder_hid_dim=polybert.config.hidden_size,
    decoder_hid_dim=512,
    z_dim=128,
    n_layers=1,
    pad_id=tokenizer.pad_id,
    bos_id=tokenizer.bos_id,
    eos_id=tokenizer.eos_id,
    drop=0.1,
    use_polybert=True,
    polybert=polybert,
    freeze_polybert=True,
).to(device).eval()
model.load_state_dict(ckpt["model"])



  ckpt = torch.load(ckpt_path, map_location=device)


<All keys matched successfully>

In [7]:
# 重构示例
import random, pandas as pd

df = pd.read_csv("data/PSMILES_Tg_only.csv")
subset = random.sample(df["PSMILES"].tolist(), 4)

def reconstruct(smiles):
    ids = tokenizer.encode(smiles)
    inp = torch.tensor(ids, device=device).unsqueeze(0)
    mask = (inp != tokenizer.pad_id).long()
    mu, logvar = model.encode(inp, mask)
    z = model.reparameterize(mu, logvar)
    out = model.sample(z, max_len=inp.size(1))
    return tokenizer.decode(out.squeeze(0).tolist())

for s in subset:
    rec = reconstruct(s)
    print(f"orig: {s}")
    print(f"reco: {rec}\n")


orig: [*]Oc1ccc2ccc(Oc3ccc4c(c3)C(=O)N(c3cccc(N5C(=O)c6ccc([*])cc6C5=O)c3)C4=O)cc2c1
reco: [*]CC([*])(C)C(=O)OCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC

orig: [*]C=CC([*])(C)c1ccccc1
reco: [*]CC([*])(C)C(=O)OCCCCC

orig: [*]C(=O)NCCCCCCCNC(=O)C(OC)C([*])OC
reco: [*]CC([*])(C)C(=O)OCCCCCCCCCCCCCCCCC

orig: [*]c1ccc(OC(=O)Oc2ccc(C([*])(C)C)cc2CC)c(CC)c1
reco: [*]CC([*])(C)C(=O)OCCCCCCCCCCCCCCCCCCCCCCCCCCCC



In [8]:
# 随机生成指标
from rdkit import Chem

@torch.no_grad() # 推理阶段不记录梯度，省显存、提速
def sample_smiles(num=256, max_len=256):
    z = torch.randn(num, model.mu.out_features, device=device) # 从标准正态分布采样潜变量 z
    token_ids = model.sample(z, max_len=max_len) # 让模型在潜变量条件下生成 token 序列
    return [tokenizer.decode(row.tolist()) for row in token_ids.cpu()]

gen = sample_smiles(num=512) # 一次性生成 512 个 SMILES 字符串（可能包含无效或重复）
def to_rdkit(smiles):
    return Chem.MolFromSmiles(smiles.replace("[*]", "[Xe]")) # 把 [*] 替换为 [Xe] 再交给 RDKit 解析

valid = [s for s in gen if to_rdkit(s)]
validity = len(valid) / len(gen) # 对每个生成的 SMILES 调 to_rdkit，能解析就当作有效
uniqueness = len(set(gen)) / len(gen) # 计算生成集合中不重复 SMILES 的比例
train_set = set(df["PSMILES"].astype(str)) # 训练集中所有 SMILES 的集合
novelty = len([s for s in set(gen) if s not in train_set]) / max(len(set(gen)), 1) # 计算生成集合中不在训练集的比例

print(f"Validity: {validity:.3f}")
print(f"Uniqueness: {uniqueness:.3f}")
print(f"Novelty: {novelty:.3f}")


[20:15:55] SMILES Parse Error: unclosed ring for input: '[Xe]c1ccc(Oc2ccc(C(=O)c3ccc(Oc4ccc(C([Xe])(C)C)cc4)cc3)cc2)cc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1cccc'
[20:15:55] SMILES Parse Error: unclosed ring for input: '[Xe]c1ccc(Oc2ccc(C(=O)c3ccc(Oc4ccc(C([Xe])(C)C)cc4)cc3)cc2)cc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1cccc'
[20:15:55] SMILES Parse Error: unclosed ring for input: '[Xe]c1ccc(Oc2ccc(C(=O)c3ccc(Oc4ccc(C([Xe])(C)C)cc4)cc3)cc2)cc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1cccc'
[20:15:55] SMILES Parse Error: unclosed ring for input: '[Xe]c1ccc(Oc2ccc(C(=O)c3ccc(Oc4ccc(C([Xe])(C)C)cc4)cc3)cc2)cc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1cc

Validity: 0.771
Uniqueness: 0.051
Novelty: 1.000


[20:15:56] SMILES Parse Error: unclosed ring for input: '[Xe]c1ccc(Oc2ccc(C(=O)c3ccc(Oc4ccc(C([Xe])(C)C)cc4)cc3)cc2)cc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1cccc'
[20:15:56] SMILES Parse Error: unclosed ring for input: '[Xe]c1ccc(Oc2ccc(C(=O)c3ccc(Oc4ccc(C([Xe])(C)C)cc4)cc3)cc2)cc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1cccc'
[20:15:56] SMILES Parse Error: unclosed ring for input: '[Xe]c1ccc(Oc2ccc(C(=O)c3ccc(Oc4ccc(C([Xe])(C)C)cc4)cc3)cc2)cc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1cccc'
[20:15:56] SMILES Parse Error: unclosed ring for input: '[Xe]c1ccc(Oc2ccc(C(=O)c3ccc(Oc4ccc(C([Xe])(C)C)cc4)cc3)cc2)cc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1cc

In [9]:
print("length of valid:",len(valid))
print("\nSome valid generated SMILES:", valid[1])
unique_valid = set(valid)
print("Number of unique valid SMILES:", len(unique_valid))
print("\nSome valid generated SMILES:", unique_valid)

length of valid: 395

Some valid generated SMILES: [*]CC([*])(C)C(=O)OCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
Number of unique valid SMILES: 5

Some valid generated SMILES: {'[*]CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC', '[*]c1ccc(Oc2ccc(Oc3ccc(Oc4ccc(C([*])(C)C)cc4)cc3)cc2)cc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1', '[*]Nc1ccc(Oc2ccc(NC(=O)c3ccc(Oc4ccc(C(=O)c5ccc(Oc6ccc(C([*])=O)cc6)cc5)cc4)cc3)cc2)cc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1ccccc1c1c

In [14]:
# 使用生成的 SMILES 进行3D建模
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
smi = valid[0]
mol = to_rdkit(smi)
mol_h = Chem.AddHs(mol)  # 添加氢原子
AllChem.EmbedMolecule(mol_h, AllChem.ETKDG())  # 3D构型嵌入
AllChem.UFFOptimizeMolecule(mol_h)  # UFF力场优化
Draw.MolToFile(mol_h, "generated_molecule.png", size=(300, 300))

[20:17:42] UFFTYPER: Unrecognized atom type: Xe3+4 (0)
[20:17:42] UFFTYPER: Unrecognized atom type: Xe3+4 (3)
[20:18:00] UFFTYPER: Unrecognized atom type: Xe3+4 (0)
[20:18:00] UFFTYPER: Unrecognized atom type: Xe3+4 (3)


ValueError: Bad Conformer Id

In [14]:
# 插值示例
def encode_to_z(smiles):
    ids = tokenizer.encode(smiles)
    inp = torch.tensor(ids, device=device).unsqueeze(0)
    mask = (inp != tokenizer.pad_id).long()
    mu, logvar = model.encode(inp, mask)
    return mu.squeeze(0), logvar.squeeze(0)

s1, s2 = subset[:2]
z1, _ = encode_to_z(s1)
z2, _ = encode_to_z(s2)

alphas = torch.linspace(0, 1, steps=6, device=device)
interpolations = []
for a in alphas:
    z = (1 - a) * z1 + a * z2
    ids = model.sample(z.unsqueeze(0), max_len=128)
    interpolations.append(tokenizer.decode(ids.squeeze(0).tolist()))

for a, seq in zip(alphas.tolist(), interpolations):
    print(f"α={a:.2f}: {seq}")


α=0.00: [*]c1ccc(Oc2ccc(C(=O)c3ccc(Oc4ccc(C([*])(C)C)cc4)cc3)cc2)cc1
α=0.20: [*]c1ccc(Oc2ccc(C(=O)c3ccc(Oc4ccc(C([*])(C)C)cc4)cc3)cc2)cc1
α=0.40: [*]c1ccc(Oc2ccc(C(=O)c3ccc(Oc4ccc(C([*])(C)C)cc4)cc3)cc2)cc1
α=0.60: [*]c1ccc(Oc2ccc(C(=O)c3ccc(Oc4ccc(C([*])(C)C)cc4)cc3)cc2)cc1
α=0.80: [*]c1ccc(Oc2ccc(C(=O)c3ccc(Oc4ccc(C([*])(C)C)cc4)cc3)cc2)cc1
α=1.00: [*]CC([*])(C)C(=O)OCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
