In [None]:
%load_ext autoreload

%autoreload 2

%env CUDA_VISIBLE_DEVICES=0

from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from trl import (SFTTrainer, SFTConfig)

from transformers import (AutoTokenizer, AutoModelForCausalLM)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset, load_from_disk

from pathlib import Path

# Run on the GPU when one is visible to this process, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Half precision; passed to the model loader and to autocast during generation.
dtype = torch.float16

In [None]:
# Directory of the filtered dataset that is read back with load_from_disk.
datasets_path = Path("./datasets/tulu-math")

# Number of examples for training, held-out testing and the small eval subset.
train_size = 10000
test_size = 100
eval_size = 10

# One seeded split; the held-out part is sized to cover both test and eval.
raw_ds = load_from_disk(datasets_path).train_test_split(
    train_size=train_size,
    test_size=test_size + eval_size,
    seed=42,
)

train_ds = raw_ds["train"]
# The first `eval_size` held-out rows become eval_ds, the next `test_size`
# rows become test_ds, so the two subsets never overlap.
eval_ds = raw_ds["test"].select(range(eval_size))
test_ds = raw_ds["test"].select(range(eval_size, eval_size + test_size))

In [None]:
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast

# Checkpoint to load; the Llama alternative is kept for quick switching.
# model_name = "huggingface/meta-llama/Llama-3.2-3B-Instruct"
model_name = "huggingface/Qwen/Qwen3-4B-Instruct-2507"

tokenizer: PreTrainedTokenizerFast = AutoTokenizer.from_pretrained(model_name)

# Load the weights with `dtype` on `device`, switch to eval mode, and make
# sure the model sits on `device`.
base_model = (
    AutoModelForCausalLM
    .from_pretrained(model_name, dtype=dtype, device_map=device)
    .eval()
    .to(device)
)

In [None]:
# Generate an answer for every eval example with the base model and collect
# both the generated and the ground-truth conversations, aligned by position.
model = base_model

batch_size = 10

# Decoder-only generation needs left padding so that each prompt ends exactly
# where generation starts. Set once instead of on every iteration.
tokenizer.padding_side = "left"
tokenizer.truncation_side = "left"

gt_msgs = []   # full reference conversations, one entry per eval example
gen_msgs = []  # prompt messages + generated assistant turn, same order

for start in tqdm(range(0, len(eval_ds), batch_size)):
    end = min(start + batch_size, len(eval_ds))
    batch_gt = [eval_ds[i]["messages"] for i in range(start, end)]
    # Drop the final message of each conversation so the model has to produce it.
    batch_msgs = [msgs[:-1] for msgs in batch_gt]

    inputs = tokenizer.apply_chat_template(batch_msgs, tokenize=True,
                                           return_dict=True,
                                           add_generation_prompt=True,
                                           padding=True,
                                           return_tensors="pt")
    inputs = inputs.to(device)

    with torch.no_grad():
        with torch.amp.autocast(device_type=device.type, dtype=dtype):
            outputs = model.generate(
                **inputs,
                max_new_tokens=4096,
            )

    # Keep only the newly generated tokens (everything after the padded prompt).
    gen_ids = outputs[:, inputs.input_ids.shape[1]:]
    gen_strs = tokenizer.batch_decode(gen_ids, skip_special_tokens=True)

    # Accumulate across batches. `gen_strs` is indexed per batch, so pair each
    # prompt with its answer by position rather than by dataset index.
    gt_msgs.extend(batch_gt)
    gen_msgs.extend(
        prompt + [{"role": "assistant", "content": answer}]
        for prompt, answer in zip(batch_msgs, gen_strs)
    )

In [None]:
from evaluate import load

# Load the three text-similarity metrics used to score the generations.
metric_bleu, metric_rouge, metric_bertscore = (
    load(metric_name) for metric_name in ("bleu", "rouge", "bertscore")
)


In [None]:
# Score the first generated answer against its ground-truth answer.
# gen_msgs[0][-1] is the generated assistant turn; the message before it is
# the last prompt message, not the reference. The reference answer is the
# final message of the matching ground-truth conversation in gt_msgs.
predictions = [gen_msgs[0][-1]["content"]]
references = [gt_msgs[0][-1]["content"]]

bleu_result = metric_bleu.compute(
    predictions=predictions,
    references=references
)

print(bleu_result)

rouge_result = metric_rouge.compute(
    predictions=predictions,
    references=references,
    use_stemmer=True
)

print(rouge_result)

bertscore_result = metric_bertscore.compute(
    predictions=predictions,
    references=references,
    lang="en",                   # language of the texts
    model_type="roberta-large",  # model used to compute the embeddings
    device=device
)

print(bertscore_result)


In [None]:
# Sample one completion per eval prompt and write each decoded sequence
# (prompt + completion) to its own Markdown file under `output_dir`.
output_dir = Path("./output/qwen3_4b_instruct/tulu_math")
output_dir.mkdir(parents=True, exist_ok=True)

batch_size = 10
# Use the notebook-wide `device` instead of a hard-coded "cuda" string.
model = base_model.eval().to(device)

# Set the padding/truncation side explicitly so this cell does not depend on
# another cell having mutated the tokenizer; decoder-only generation needs
# left padding.
tokenizer.padding_side = "left"
tokenizer.truncation_side = "left"

for start in tqdm(range(0, len(eval_ds), batch_size)):
    end = min(start + batch_size, len(eval_ds))
    batch_msgs = [eval_ds[i]["messages"][:-1] for i in range(start, end)]

    # Render the prompts of this batch as text
    batch_texts = [
        tokenizer.apply_chat_template(
            msg, tokenize=False, add_generation_prompt=True)
        for msg in batch_msgs
    ]

    # Tokenize the batch. The chat template already emitted the special
    # tokens, so they must not be added a second time here.
    inputs = tokenizer(
        batch_texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=4096,
        add_special_tokens=False,
    ).to(device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=5000,
            temperature=0.7,
            top_p=0.9,
            do_sample=True
        )

    # Decode the full sequences (prompt included)
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # Save every generation under its dataset index; an explicit encoding keeps
    # non-ASCII text intact regardless of the platform default.
    for idx, text in enumerate(decoded, start=start):
        (output_dir / f"{idx:04d}.md").write_text(text, encoding="utf-8")

In [None]:
# Debug check: shape of the input_ids tensor of the most recently tokenized batch.
print(inputs.input_ids.shape)

In [None]:
# Wrap the base model with LoRA adapters for causal-LM fine-tuning.
lora_rank = 8

lora_config = LoraConfig(
    r=lora_rank,
    lora_alpha=16,
    lora_dropout=0.1,
    inference_mode=False,
    task_type=TaskType.CAUSAL_LM,
    # Names of the linear layers that receive an adapter
    # (q/k/v/o projections and gate/up/down projections).
    target_modules=[
        "q_proj", "v_proj", "k_proj", "o_proj",
        "gate_proj", "down_proj", "up_proj",
    ],
)

model = get_peft_model(base_model, lora_config)

# Show the wrapped architecture and how many parameters are trainable.
print(model)
model.print_trainable_parameters()

# Turn the generation cache off on the model config before training.
model.config.use_cache = False

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

In [None]:
# Dataset identifier handed to load_dataset; only the "train" split is loaded.
datasets_id = "huggingface/allenai/tulu-3-sft-mixture"

raw_ds = load_dataset(datasets_id, split="train")


def filter_func(x, allowed_src=("math", "science", "history", "literature")):
    """Decide whether a dataset record should be kept.

    A record is kept when it has at least one assistant message whose content
    is not blank AND its ``source`` string contains one of the ``allowed_src``
    substrings (case-sensitive substring match).

    Args:
        x: Record with a ``"messages"`` list of dicts carrying ``"role"`` and
            ``"content"``, and a ``"source"`` entry.
        allowed_src: Substrings that mark an accepted source. Defaults to the
            previously hard-coded list, so single-argument calls are unchanged.

    Returns:
        bool: True to keep the record, False to drop it.
    """
    # A None content counts as blank instead of raising on .strip().
    has_answer = any(
        m["role"] == "assistant" and (m["content"] or "").strip() != ""
        for m in x["messages"]
    )
    if not has_answer:
        return False
    # A None source cannot match any substring; treat it as an empty string.
    src = x["source"] or ""
    return any(allowed in src for allowed in allowed_src)


# Apply filter_func to every record, then call flatten_indices() on the result.
raw_ds = raw_ds.filter(filter_func).flatten_indices()

# NOTE(review): the directory is named "tulu-math" although filter_func also
# accepts science/history/literature sources — confirm the name is intended.
raw_ds.save_to_disk("./datasets/tulu-math")

# raw_ds = raw_ds.train_test_split(test_size=0.005, seed=42)

# train_ds = raw_ds["train"]
# eval_ds = raw_ds["test"]

# train_ds = train_ds.shuffle(seed=42)

# mini_ds = train_ds.select(range(10))

# print("size of train dataset: ", len(train_ds))
# print("size of eval dataset: ", len(eval_ds))

In [None]:
# Replace the tokenizer's chat template with the one stored in a local Jinja file.
# NOTE(review): the file name refers to Llama 3.2 while model_name selects a
# Qwen3 checkpoint — confirm the template matches the loaded model.
my_template = Path("llama-3.2.jinja2").read_text(encoding="utf-8")
tokenizer.chat_template = my_template

In [None]:
# Configuration of the supervised fine-tuning run.
sft_args = SFTConfig(
    output_dir="./output/test",
    # Batching and memory.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    bf16=True,
    # Optimisation schedule.
    num_train_epochs=1,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    # Logging, evaluation and checkpoint cadence.
    logging_steps=20,
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=2,
    # Data handling.
    assistant_only_loss=True,
    packing=False,
    # dataset_kwargs={"skip_prepare_dataset": True},
)

# Build the trainer from the LoRA-wrapped model, the config and the splits.
trainer = SFTTrainer(
    model=model,
    args=sft_args,
    processing_class=tokenizer,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
)

In [None]:
# Run the training loop of the SFTTrainer instance `trainer`.
trainer.train()

In [None]:
from peft import PeftModel
# Save into the "adapter" sub-directory when the model is PEFT-wrapped;
# otherwise (unexpected, e.g. PEFT was never applied) save the whole model
# into "full".
model.save_pretrained(
    "./output/test/adapter" if isinstance(model, PeftModel) else "./output/test/full"
)
tokenizer.save_pretrained("./output/test")
print("✅ Done. Saved to ./output/test")