## install

In [1]:
# conda activate sft

# 最新版有 bug，先用旧版本
# https://github.com/unslothai/unsloth/issues/1925
!pip install unsloth==2025.2.14 unsloth_zoo==2025.2.7 python-dotenv

Looking in indexes: http://mirrors.aliyun.com/pypi/simple
[0m

## load model

In [2]:
from unsloth import FastLanguageModel
import torch
from dotenv import load_dotenv

# Read environment variables from .env (original note: HF_TOKEN).
load_dotenv(".env")

max_seq_length = 2048
# None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
dtype = None
# Set to True to use 4bit quantization and reduce memory usage. Can be False.
load_in_4bit = False

# Troubleshooting log for loading the model.
#
# Downloading `model_name = "unsloth/Meta-Llama-3.1-8B"` failed every time:
#   1. Without the AutoDL "academic acceleration" proxy: timeout.
#   2. With the proxy enabled: "Failed too many failures in parallel (3):
#      Request: error sending request for url (https://cdn-lfs-us-1.hf.co/...".
#      The domain is hf.co, i.e. the official site, so the AutoDL proxy is
#      suspected of not working.
#   3. With HF_HF_ENDPOINT set: timeout; requests still went to the official
#      domain instead of the mirror. This is a bug, see
#      https://github.com/unslothai/unsloth/issues/1353
#      NOTE(review): the original note spells the variable "HF_HF_ENDPOINT";
#      confirm whether HF_ENDPOINT was the variable actually set.
#
# Workaround for the download errors: load local files only
# (`local_files_only = True`), see
# https://github.com/unslothai/unsloth/issues/495#issuecomment-2134831390
#
# Pointing at the cache root of the model,
#   "/root/autodl-tmp/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct",
# failed with
#   OSError: Error no file named pytorch_model.bin, model.safetensors,
#   tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory
#   /root/autodl-tmp/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct.
# It may be related to the transformers version, but that was already the
# latest one; the cause is not known.
#
# Pointing at the concrete snapshot directory works.
local_snapshot_dir = "/root/autodl-tmp/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = local_snapshot_dir,
    local_files_only = True,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.2.14: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.65 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

/root/autodl-tmp/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659 does not have a padding token! Will use pad_token = <|finetune_right_pad_id|>.


In [3]:
# Names of the modules passed as `target_modules` to get_peft_model.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Rebinds `model` to the value returned by get_peft_model.
# NOTE(review): because the cell overwrites its own input, it is presumably
# meant to run once per freshly loaded `model` - confirm before re-running.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = lora_target_modules,
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2025.2.14 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


## prepare data

In [4]:
# Prompt template with three positional `{}` slots, filled in this order:
# instruction, input, response. Written line by line; the resulting string is
# the same as the triple-quoted form.
alpaca_prompt = (
    "Below is an instruction that describes a task, paired with an input that provides further context. "
    "Write a response that appropriately completes the request.\n"
    "\n"
    "### Instruction:\n"
    "{}\n"
    "\n"
    "### Input:\n"
    "{}\n"
    "\n"
    "### Response:\n"
    "{}"
)

In [5]:
def formatting_prompts_func(examples):
    """Render a batch of examples into prompt strings.

    Reads the "instruction", "input" and "output" entries of `examples`,
    walks them in parallel, fills `alpaca_prompt` with each triple and
    appends `tokenizer.eos_token` to the result.

    Returns:
        A dict with the single key "text" holding the list of rendered strings.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    # The EOS token must be appended, otherwise generation will go on forever!
    rendered = [
        alpaca_prompt.format(instr, context, response) + tokenizer.eos_token
        for instr, context, response in rows
    ]
    return {"text": rendered}

In [6]:
from datasets import load_dataset

# Load the "train" split of yahma/alpaca-cleaned and keep the unmodified split
# under its own name.
alpaca_train = load_dataset("yahma/alpaca-cleaned", split = "train")
# Apply formatting_prompts_func in batched mode; it returns a "text" entry for
# every batch it is given.
dataset = alpaca_train.map(formatting_prompts_func, batched = True)

## train

In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Query bf16 support once: bf16 is used when it is reported as supported,
# fp16 otherwise.
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    # Alternative to max_steps: num_train_epochs = 1 for 1 full training run.
    max_steps = 60,
    learning_rate = 2e-4,
    fp16 = not use_bf16,
    bf16 = use_bf16,
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "outputs",
    report_to = "none", # Use this for WandB etc
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 6, # previously 2
    packing = False, # Can make training 5x faster for short sequences.
    args = training_args,
)

Converting train dataset to ChatML (num_proc=6):   0%|          | 0/51760 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=6):   0%|          | 0/51760 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=6):   0%|          | 0/51760 [00:00<?, ? examples/s]

Truncating train dataset (num_proc=6):   0%|          | 0/51760 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [8]:
# Run training with the trainer built in the previous cell and keep whatever
# train() returns in `trainer_stats`.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 51,760 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,1.5675
2,1.9598
3,1.6698
4,1.883
5,1.7047
6,1.4868
7,1.0843
8,1.2497
9,1.157
10,1.1


## inference

In [9]:
# Enable native 2x faster inference
FastLanguageModel.for_inference(model)

# Fill the template; the response slot is left empty so the model generates it.
fib_prompt = alpaca_prompt.format(
    "Continue the fibonnaci sequence.", # instruction
    "1, 1, 2, 3, 5, 8", # input
    "", # output - leave this blank for generation!
)

# Tokenize a batch of one prompt as PyTorch tensors and move it to the GPU.
inputs = tokenizer([fib_prompt], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
# Last expression of the cell: the decoded batch is shown as the cell output.
tokenizer.batch_decode(outputs)

['<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nContinue the fibonnaci sequence.\n\n### Input:\n1, 1, 2, 3, 5, 8\n\n### Response:\nThe next number in the Fibonacci sequence is 13. The sequence now looks like this: 1, 1, 2, 3, 5, 8, 13.<|eot_id|>']

## saving model

In [10]:
# save only lora
# Model and tokenizer are written to the same directory.
lora_dir = "/root/autodl-tmp/saved_model/lora_model"
model.save_pretrained(lora_dir)
# Kept as the last expression so its return value is shown as the cell output.
tokenizer.save_pretrained(lora_dir)

('/root/autodl-tmp/saved_model/lora_model/tokenizer_config.json',
 '/root/autodl-tmp/saved_model/lora_model/special_tokens_map.json',
 '/root/autodl-tmp/saved_model/lora_model/tokenizer.json')

In [11]:
# merge and save 16bit
import shutil
import warnings
from pathlib import Path

merged_dir = "/root/autodl-tmp/saved_model/model"

# Pre-flight free-space check. The recorded run of this cell stopped at 30/32
# layers with
#   RuntimeError: [enforce fail at inline_container.cc:626] . unexpected pos 576 vs 470
# NOTE(review): this PyTorch file-writer error is commonly reported when the
# target disk fills up mid-write - TODO confirm with `df -h` on the machine.
# The check only warns, so it never blocks a save that would have succeeded.

# Rough lower bound for the merged checkpoint: 2 bytes per parameter (16-bit).
needed_bytes = 2 * sum(p.numel() for p in model.parameters())

# disk_usage needs an existing path, so walk up to the nearest existing ancestor.
probe = Path(merged_dir)
while not probe.exists():
    probe = probe.parent
free_bytes = shutil.disk_usage(probe).free

if free_bytes < needed_bytes:
    warnings.warn(
        f"Only {free_bytes / 2**30:.1f} GiB free under {probe}, but the merged "
        f"16-bit model needs at least ~{needed_bytes / 2**30:.1f} GiB; "
        "the save below is likely to fail part-way."
    )

model.save_pretrained_merged(merged_dir, tokenizer, save_method = "merged_16bit")

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 699.6 out of 1007.51 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


  3%|████▌                                                                                                                                           | 1/32 [00:00<00:03,  9.39it/s]
We will save to Disk and not RAM now.
 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████         | 30/32 [00:19<00:01,  1.52it/s]


RuntimeError: [enforce fail at inline_container.cc:626] . unexpected pos 576 vs 470