In [None]:
torchrun --nproc_per_node 4 --nnodes 1 --node_rank 0 --master_addr localhost --master_port 6601 /root/autodl-tmp/finetune.py \
    --model_name_or_path "/root/autodl-fs/DeepSeek-R1-Distill-Qwen-32B" \
    --data_path "/root/autodl-tmp/data/datasets/output.json" \
    --eval_data_path "/root/autodl-tmp/data/datasets/dev.json" \
    --bf16 True \
    --output_dir "/root/autodl-fs/trained_models/deepseek_ri_32b_sop" \
    --num_train_epochs 50 \
    --per_device_train_batch_size 4 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 1 \
    --eval_strategy "steps" \
    --eval_steps  100 \
    --metric_for_best_model "eval_loss" \
    --greater_is_better False \
    --save_strategy "steps" \
    --save_steps 500 \
    --load_best_model_at_end True \
    --save_total_limit 3 \
    --learning_rate 0.001 \
    --weight_decay 0.1 \
    --adam_beta2 0.95 \
    --warmup_ratio 0.01 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --report_to "none" \
    --model_max_length 512 \
    --gradient_checkpointing True \
    --lazy_preprocess True \
    --deepspeed "/root/autodl-tmp/deepspeed/ds_config_zero2.json" \
    --use_lora

## 权重融合

In [1]:
from transformers import AutoModelForCausalLM
from peft import PeftModel
import torch

model = AutoModelForCausalLM.from_pretrained("/root/autodl-fs/DeepSeek-R1-Distill-Qwen-32B", torch_dtype=torch.float16, device_map="auto", trust_remote_code=True)
model = PeftModel.from_pretrained(model, "/root/autodl-fs/trained_models/deepseek_ri_32b_sop/checkpoint-1500")
merged_model = model.merge_and_unload()
merged_model.save_pretrained("/root/autodl-fs/trained_models/deepseek_ri_32b_merged", max_shard_size="2048MB", safe_serialization=True)



Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

[2025-02-27 11:37:45,452] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/root/miniconda3/compiler_compat/ld: cannot find -laio
collect2: error: ld returned 1 exit status
/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlopen'
/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlclose'
/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlerror'
/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlsym'
collect2: error: ld returned 1 exit status


## 分词器

In [2]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "/root/autodl-fs/DeepSeek-R1-Distill-Qwen-32B",
    trust_remote_code=True
)

tokenizer.save_pretrained("/root/autodl-fs/trained_models/deepseek_ri_32b_merged")

('/root/autodl-fs/trained_models/deepseek_ri_32b_merged/tokenizer_config.json',
 '/root/autodl-fs/trained_models/deepseek_ri_32b_merged/special_tokens_map.json',
 '/root/autodl-fs/trained_models/deepseek_ri_32b_merged/tokenizer.json')

## 模型测试

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
import torchvision

torchvision.disable_beta_transforms_warning()

tokenizer = AutoTokenizer.from_pretrained("/root/autodl-fs/trained_models/deepseek_ri_32b_merged", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "/root/autodl-fs/trained_models/deepseek_ri_32b_merged",
    device_map="auto",
    trust_remote_code=True
).eval()

prompt = "你现在是一个很厉害的阅读理解器，严格按照人类指令进行回答。\nInput: 下面描述中包含了哪些SPO？\n返回json回答:\n\n花样男子花样男子《花样男子》，日文原名《花より男子》（Hana-yori Danshi），是日本漫画家神尾叶子的一部长篇爱情校园漫画，原作自1992年至2004年在集英社的漫画杂志《Margaret》上连载，并发行了37册的单行本，销量超过5900万册，霸占了日本第一畅销少女漫画的宝座。"
inputs = tokenizer(prompt, return_tensors="pt")

response = tokenizer.decode(model.generate(**inputs, max_new_tokens=128000)[0], skip_special_tokens=True)
print(response)

Loading checkpoint shards:   0%|          | 0/34 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.90 GiB (GPU 0; 47.50 GiB total capacity; 42.54 GiB already allocated; 1.74 GiB free; 45.42 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

### 选一条样本对训练出来的模型进行人工比对

In [None]:
{
    "conversations": 
        [
            {
                "from": "user", 
                "value": "你现在是一个很厉害的阅读理解器，严格按照人类指令进行回答。\nInput: 下面描述中包含了哪些SPO？\n返回json回答:\n\n花样男子花样男子《花样男子》，日文原名《花より男子》（Hana-yori Danshi），是日本漫画家神尾叶子的一部长篇爱情校园漫画，原作自1992年至2004年在集英社的漫画杂志《Margaret》上连载，并发行了37册的单行本，销量超过5900万册，霸占了日本第一畅销少女漫画的宝座。"}, 
            {
                "from": "assistant", 
                "value": "[{\"predicate\": \"编剧\", \"object_type\": \"人物\", \"subject_type\": \"影视作品\", \"object\": \"神尾叶子\", \"subject\": \"花样男子\"}, {\"predicate\": \"出版社\", \"object_type\": \"出版社\", \"subject_type\": \"书籍\", \"object\": \"集英社\", \"subject\": \"花样男子\"}]"
            }
        ], 
    "id": "identity_0"
}