In [None]:
import json
import tqdm
import pandas as pd
import torch
from datasets import Dataset
from modelscope import snapshot_download, AutoTokenizer
from swanlab.integration.huggingface import SwanLabCallback
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import os
import swanlab

In [None]:
# Data-processing function: turns one raw example into tokenized training features
def process_func(example):
    """Tokenize one example into ``input_ids`` / ``attention_mask`` / ``labels``.

    The prompt (system instruction + user SMILES + assistant header) and the
    target answer are tokenized separately so that the prompt positions can be
    masked out of the loss with ``-100``. An end-of-turn token is appended to
    the answer so the model is also trained to *stop* after emitting it;
    without it the labels never contain a terminator.

    Args:
        example: Mapping with an ``"input"`` entry (interpolated into the user
            turn) and an ``"output"`` entry (the target answer text).

    Returns:
        dict with equal-length lists ``input_ids``, ``attention_mask`` and
        ``labels``, each truncated to at most ``MAX_LENGTH`` (384) items.

    Note:
        Relies on the module-level ``tokenizer``. If truncation kicks in, the
        appended terminator (and possibly part of the answer) is cut off.
    """
    MAX_LENGTH = 384
    instruction = tokenizer(
        f"<|im_start|>system\n"
        f"你是一名分子性质预测专家。请根据输入分子的smiles表示，"
        f"预测该分子的最高占据分子轨道能级、最低未占分子轨道能级、带隙这三个性质的数值。"
        f"并严格按照如下例子格式输出，最高占据分子轨道能级:数值;最低未占分子轨道能级:数值;带隙:数值"
        f"（数值保留到小数点后三位即可,不要输出任何文字解释、单位或换行符）。"
        f"<|im_end|>\n"
        f"<|im_start|>user\n{example['input']}<|im_end|>\n"
        f"<|im_start|>assistant\n",
        add_special_tokens=False,
    )
    response = tokenizer(f"{example['output']}", add_special_tokens=False)

    # Terminator that closes the assistant turn. Prefer the tokenizer's EOS
    # token; fall back to the pad token, and to nothing if neither is defined.
    end_id = tokenizer.eos_token_id
    if end_id is None:
        end_id = tokenizer.pad_token_id
    terminator = [] if end_id is None else [end_id]

    prompt_ids = instruction["input_ids"]
    answer_ids = response["input_ids"] + terminator

    input_ids = prompt_ids + answer_ids
    attention_mask = (
        instruction["attention_mask"]
        + response["attention_mask"]
        + [1] * len(terminator)  # the terminator is a real token, so attend to it
    )
    # -100 on the prompt positions: only the answer (and terminator) is supervised.
    labels = [-100] * len(prompt_ids) + answer_ids

    if len(input_ids) > MAX_LENGTH:
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

In [None]:
def predict(example, model, tokenizer):
    """Greedy-decode the model's answer for a single example.

    Builds the same chat-formatted prompt that ``process_func`` uses for
    training, generates up to 64 new tokens without sampling, and returns only
    the newly generated text.

    Args:
        example: Mapping with an ``"input"`` entry that is interpolated into
            the user turn of the prompt.
        model: Object exposing ``generate(**kwargs)``; its ``device``
            attribute (if present) decides where the inputs are moved.
        tokenizer: Callable tokenizer that also provides ``decode`` and
            ``pad_token_id``.

    Returns:
        The decoded continuation with special tokens removed and surrounding
        whitespace stripped.

    Note:
        The model's train/eval mode is not changed here; callers that want
        dropout disabled should switch the model to eval mode first.
    """
    # Follow the model's own placement instead of assuming "cuda"; fall back to
    # "cuda" only when the model does not expose a device.
    device = getattr(model, "device", "cuda")
    prompt = (
        f"<|im_start|>system\n"
        f"你是一名分子性质预测专家。请根据输入分子的smiles表示，"
        f"预测该分子的最高占据分子轨道能级、最低未占分子轨道能级、带隙这三个性质的数值。"
        f"并严格按照如下例子格式输出，最高占据分子轨道能级:数值;最低未占分子轨道能级:数值;带隙:数值"
        f"（数值保留到小数点后三位即可,不要输出任何文字解释、单位或换行符）。"
        f"<|im_end|>\n"
        f"<|im_start|>user\n{example['input']}<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )
    # add_special_tokens=False mirrors how the prompt is tokenized for training.
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to(device)
    # Pass the whole encoding so the attention mask reaches generate().
    # No temperature argument: it is meaningless when do_sample=False.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=64,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
    )
    # generate() returns prompt + continuation; drop the prompt part.
    prompt_len = len(inputs["input_ids"][0])
    output_ids = generated_ids[0][prompt_len:]
    response = tokenizer.decode(output_ids, skip_special_tokens=True).strip()
    return response

In [None]:
# Local directory holding the pretrained checkpoint and its tokenizer files.
model_name = "../modle/Qwen2.5-7B-Instruct/Qwen/Qwen2.5-7B-Instruct"

# Tokenizer and weights are loaded from the same checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",   # let the loader decide device placement
    torch_dtype="auto",  # use the dtype the loader selects for this checkpoint
)

# Make the embedding outputs require grad; training below is configured with
# gradient_checkpointing=True, which is the usual reason for this call.
model.enable_input_require_grads()

In [None]:
# Training data: a JSON-lines file read one record per line.
# process_func reads the "input" and "output" fields of each record.
train_dataset_path = "./train_smi_homo_lumo_gap.jsonl"
train_df = pd.read_json(train_dataset_path, lines=True)
train_ds = Dataset.from_pandas(train_df)
# Tokenize every example and drop the raw columns so that only the features
# returned by process_func (input_ids / attention_mask / labels) remain.
train_dataset = train_ds.map(process_func, remove_columns=train_ds.column_names)

In [None]:
# LoRA configuration
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,  # adapters are being trained, not just loaded
    # Adapter hyper-parameters.
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    # Projection layers that receive adapters.
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

In [None]:
# Wrap the loaded model with the LoRA configuration defined above.
# NOTE(review): this rebinds `model` to the wrapped object, so re-running the
# cell would pass an already wrapped model back in; reload the base model
# before re-executing.
model = get_peft_model(model, config)

In [None]:
# Experiment-tracking callback handed to the Trainer.
# The `config` values below are logged as run metadata, so they are kept in
# sync with what this notebook actually does: the Qwen2.5-7B-Instruct
# checkpoint, learning_rate=5e-5, num_train_epochs=3, and a batch of
# 2 per device x 16 gradient-accumulation steps = 32 per optimizer step.
# NOTE(review): the project / experiment names still refer to fluorescence
# (lambda-em) while the data file is HOMO/LUMO/gap; they are left unchanged
# because they determine where runs are filed. Rename if that is unintended.
swanlab_callback = SwanLabCallback(
    project="fluorescence-prediction-qwen2",
    experiment_name="smiles-lambda-em-regression",
    config={
        "model": "Qwen2.5-7B-Instruct",
        "method": "LoRA",
        "lr": 5e-5,
        "batch_size": 32,
        "epochs": 3,
        "task": "Regression from SMILES"
    }
)

In [None]:
# Training hyper-parameters, grouped by concern.
args = TrainingArguments(
    # Output and checkpointing.
    output_dir="./output/01",
    save_steps=1000,
    save_on_each_node=True,
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=5e-5,
    optim="adamw_torch",
    # Batching and memory: 2 samples per device, accumulated over 16 steps.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=16,
    gradient_checkpointing=True,
    # Logging: built-in reporters are off; a callback is passed to the Trainer instead.
    logging_steps=100,
    report_to="none",
    disable_tqdm=False,
)

In [None]:
# Assemble the Trainer from the LoRA-wrapped model, the training arguments and
# the tokenized dataset. Examples have different lengths, so the collator pads
# each batch (padding=True); the SwanLab callback is attached explicitly.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
    callbacks=[swanlab_callback],  
)

In [None]:
# Run fine-tuning. Checkpoints are written under args.output_dir every
# args.save_steps steps.
# NOTE(review): nothing visible here saves the final adapter explicitly —
# confirm the last checkpoint is the one you want to keep.
trainer.train()