In [1]:
from transformers import AutoTokenizer
model = "openai-community/gpt2"  # base checkpoint that will be fine-tuned
tok = AutoTokenizer.from_pretrained(model)
tok.pad_token = tok.eos_token  # give the causal LM a pad token by reusing EOS

# Separator placed between the conditioning text and the target sequence.
SEP = '|cond|'

# Symbols that should exist as vocabulary entries.
# Every entry needs its own trailing comma: adjacent string literals are
# concatenated by Python, which previously turned '9' '.' into a bogus '9.'
# entry and dropped both '9' and '.' from the list.
protected = [
    '[*]', 'Cl', 'Br', 'Si', 'Na', 'Li', 'Mg', 'Ca', 'Se',
    'c', 'n', 'o', 's', 'p', 'C', 'N', 'O', 'S', 'P', 'F', 'I', 'H',
    '=', '#', '(', ')', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    '.', '/', '\\', '-', '@', '@@', '0', SEP,
]

# Only add entries the base vocabulary does not already contain.
# NOTE(review): entries that already exist are skipped, so they remain ordinary
# BPE symbols and may still be merged with neighbours - confirm this is intended.
existing_vocab = tok.get_vocab()  # build the vocabulary dict once, not per entry
to_add = [t for t in protected if t not in existing_vocab]
tok.add_tokens(to_add, special_tokens=False)  # registered as regular added tokens, not special tokens
tok.save_pretrained("psmiles_tokenizer_added")


('psmiles_tokenizer_added/tokenizer_config.json',
 'psmiles_tokenizer_added/special_tokens_map.json',
 'psmiles_tokenizer_added/vocab.json',
 'psmiles_tokenizer_added/merges.txt',
 'psmiles_tokenizer_added/added_tokens.json',
 'psmiles_tokenizer_added/tokenizer.json')

In [2]:
def show_tokens(s):
    """Print the list of token strings that the global tokenizer ``tok`` yields for ``s``."""
    pieces = tok.tokenize(s)
    print(pieces)

# Sanity checks for the extended tokenizer; each call prints the token list for one input.
# NOTE(review): the recorded output of this cell shows 'CH' and 'cc' as merged tokens and
# '<', '|cond|', '>' as three tokens, so not every expectation below holds. SEP is defined
# as '|cond|' (no angle brackets) - confirm which separator spelling is intended.
# The attachment point must be a standalone token:
show_tokens("[*]CH2CH2[*]")           # expected ['[*]', 'C', 'H', '2', 'C', 'H', '2', '[*]'] or similar (byte-level BPE still keeps digits separate)
# Aromatic ring atoms and ring-closure digits
show_tokens("c1ccccc1")               # expected 'c','1','c','c','c','c','c','1'
# Halogens must not be split
show_tokens("[*]CH2CH(Cl)CH2[*]")     # expected to contain 'Cl' rather than 'C','l'
# Condition separator
show_tokens("target_Tg=150<|cond|>")  # expected '<|cond|>' as a standalone token


['[*]', 'CH', '2', 'CH', '2', '[*]']
['c', '1', 'cc', 'cc', 'c', '1']
['[*]', 'CH', '2', 'CH', '(', 'Cl', ')', 'CH', '2', '[*]']
['target', '_', 'T', 'g', '=', '150', '<', '|cond|', '>']


In [None]:
# pip install transformers accelerate datasets tokenizers peft rdkit-pypi
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset
import torch, re

# Checkpoint to fine-tune; the tokenizer is read back from the directory that
# holds the extended vocabulary.
base_checkpoint = "openai-community/gpt2"
tok = AutoTokenizer.from_pretrained("psmiles_tokenizer_added")
model = AutoModelForCausalLM.from_pretrained(base_checkpoint)
# Resize the embedding table to the size of the extended tokenizer.
model.resize_token_embeddings(len(tok))

def build_example(ex):
    """Convert one ``{"instruction", "output"}`` record into causal-LM features.

    The prompt is ``instruction + SEP``. Its positions are labelled ``-100`` so
    that only the output tokens contribute to the loss. An EOS token is appended
    to the output so the model is also trained on where the sequence ends;
    without it nothing in the labels ever marks the end of a target.

    Args:
        ex: mapping with string fields ``"instruction"`` and ``"output"``.

    Returns:
        The same mapping with ``"input_ids"`` and ``"labels"`` (equal-length
        lists of ints) added.
    """
    prompt_ids = tok(ex["instruction"] + SEP, add_special_tokens=False).input_ids
    target_ids = tok(ex["output"], add_special_tokens=False).input_ids

    # add_special_tokens=False means the tokenizer adds no terminator itself,
    # so close the target explicitly when the tokenizer defines an EOS id.
    if tok.eos_token_id is not None:
        target_ids = target_ids + [tok.eos_token_id]

    ex["input_ids"] = prompt_ids + target_ids
    ex["labels"] = [-100] * len(prompt_ids) + target_ids
    return ex

# Read the JSON splits, then replace the original columns with the tokenised
# features produced by build_example.
raw_ds = load_dataset(
    "json",
    data_files={"train":"train.json","val":"val.json"},
)
ds = raw_ds.map(
    build_example,
    remove_columns=raw_ds["train"].column_names,
)

def collate(batch):
    """Pad a list of feature dicts into one batch of tensors.

    Args:
        batch: list of dicts, each with ``"input_ids"`` and ``"labels"`` lists
            of equal length.

    Returns:
        Dict with ``"input_ids"`` (right-padded with ``tok.pad_token_id``),
        ``"labels"`` (right-padded with ``-100``) and a boolean
        ``"attention_mask"`` that is True on real tokens.
    """
    # Explicit long dtype keeps an empty sequence from becoming a float tensor.
    ids = [torch.tensor(x["input_ids"], dtype=torch.long) for x in batch]
    labs = [torch.tensor(x["labels"], dtype=torch.long) for x in batch]
    lengths = torch.tensor([t.size(0) for t in ids], dtype=torch.long)

    ids = torch.nn.utils.rnn.pad_sequence(ids, batch_first=True, padding_value=tok.pad_token_id)
    labs = torch.nn.utils.rnn.pad_sequence(labs, batch_first=True, padding_value=-100)

    # Build the mask from the true lengths instead of comparing against the pad
    # id: the pad token is the EOS token, so a value comparison would also mask
    # genuine EOS tokens inside a sequence.
    positions = torch.arange(ids.size(1)).unsqueeze(0)
    attn = positions < lengths.unsqueeze(1)
    return {"input_ids":ids, "labels":labs, "attention_mask":attn}

# ---- Key-token weighted loss ----
# Ids of the tokens whose loss is up-weighted during training.
KT = set(tok.convert_tokens_to_ids(['[*]','=', '#','(',')','1','2','3']))
class WeightedTrainer(Trainer):
    """``Trainer`` whose language-modelling loss up-weights the tokens in ``KT``."""

    # Multiplier applied to the loss of key tokens (1.5~2.0 is the suggested tuning range).
    KEY_TOKEN_WEIGHT = 1.8

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the next-token cross-entropy with extra weight on key tokens.

        Args:
            model: the model being trained; called with ``input_ids``,
                ``attention_mask`` and ``labels``.
            inputs: batch dict containing ``"input_ids"``, ``"attention_mask"``
                and ``"labels"`` (``-100`` marks positions to ignore).
            return_outputs: when True, return ``(loss, model_output)``.
            **kwargs: accepted and ignored, so extra keyword arguments supplied
                by the caller do not raise.

        Returns:
            Scalar loss tensor, or ``(loss, model_output)`` if ``return_outputs``.
        """
        labels = inputs["labels"]
        out = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], labels=labels)

        # Shift: the logits at position t are scored against the label at t + 1.
        sl = labels[:, 1:].contiguous().view(-1)
        lg = out.logits[:, :-1].contiguous()

        loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction='none')
        per_token = loss_fct(lg.view(-1, lg.size(-1)), sl)  # 0 at ignored positions

        with torch.no_grad():
            valid = sl.ne(-100)
            key_mask = torch.isin(sl, torch.tensor(list(KT), device=sl.device))
        weighted = torch.where(key_mask, per_token * self.KEY_TOKEN_WEIGHT, per_token)

        # Average over supervised positions only. A plain .mean() would also
        # divide by the ignored (-100) positions, making the loss scale depend
        # on how much of the batch is prompt or padding.
        loss = weighted.sum() / valid.sum().clamp(min=1)
        return (loss, out) if return_outputs else loss

# Training hyper-parameters, collected in one mapping and expanded into
# TrainingArguments. fp16 is deliberately left unset (mixed precision disabled).
# NOTE(review): no evaluation strategy is configured here, so eval_dataset may
# never actually be evaluated - confirm.
training_config = dict(
    output_dir="psmiles-gpt",
    learning_rate=3e-5,
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    warmup_ratio=0.05,
    weight_decay=0.01,
    logging_steps=50,
    save_steps=1000,
    save_total_limit=3,
)
args = TrainingArguments(**training_config)

trainer = WeightedTrainer(
    model=model,
    args=args,
    train_dataset=ds["train"],
    eval_dataset=ds["val"],
    data_collator=collate,
)
trainer.train()

# Persist the fine-tuned weights and the matching tokenizer side by side.
trainer.save_model("psmiles-gpt")
tok.save_pretrained("psmiles-gpt")
