安装依赖：Kaggle 上已经预装了其他常用包，本节微调只需额外安装 peft；opencompass 仅用于评测，对应的安装命令在下方以注释形式保留。如果不在 Kaggle 上运行，则还需要安装 torch、transformers、datasets 等其他必要包。

In [1]:
# Optional: install opencompass (used for evaluation) on Kaggle, where the other common packages are preinstalled.
# !pip install "opencompass[full]" transformers
# Outside Kaggle, install the full dependency set instead (the PyTorch distribution on PyPI is named `torch`).
# !pip install torch transformers datasets "opencompass[full]"
# peft provides the LoRA utilities imported by the fine-tuning cells below.
!pip install peft

Collecting peft
  Downloading peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.14.0-py3-none-any.whl (374 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 374.8/374.8 kB 8.9 MB/s eta 0:00:00
Installing collected packages: peft
Successfully installed peft-0.14.0


# 指令微调

In [2]:
"""
The main program for finetuning LLMs with Huggingface Transformers Library.

ALL SECTIONS WHERE CODE POSSIBLY NEEDS TO BE FILLED IN ARE MARKED AS TODO.
"""

import argparse
from dataclasses import dataclass, field
from typing import Optional, List, Dict
import sys
import torch
from transformers import TrainingArguments, HfArgumentParser, Trainer, AutoTokenizer, AutoModelForCausalLM
import datasets
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

In [3]:
# Define the arguments required for the main program.
# NOTE: You can customize any arguments you need to pass in.
@dataclass
class ModelArguments:
    """Options that select the model to fine-tune and its load-time dtype.

    Every field defaults to ``None``. The ``metadata`` entries carry the
    help text (and, for ``torch_dtype``, the allowed choices) attached to
    each field.
    """

    # Local directory of the checkpoint, or its identifier on the Hugging Face Hub.
    model_name_or_path: Optional[str] = field(
        default=None,
        metadata=dict(
            help="The path to the LLM to fine-tune or its name on the Hugging Face Hub.",
        ),
    )

    # Name of the torch dtype to load the weights in; restricted to the listed choices.
    torch_dtype: Optional[str] = field(
        default=None,
        metadata=dict(
            help="Override the default `torch.dtype` and load the model under this dtype.",
            choices=["bfloat16", "float16", "float32"],
        ),
    )

    # TODO: add your model arguments here


@dataclass
class DataArguments:
    """Options that describe the fine-tuning data and its tokenization limit."""

    # Location of the training data: a local path or a Hugging Face Hub dataset name.
    dataset_path: Optional[str] = field(
        default=None,
        metadata=dict(
            help="The path to the fine-tuning dataset or its name on the Hugging Face Hub.",
        ),
    )

    # TODO: add your data arguments here

    # Passed as the tokenizer's `max_length` in finetune(), so it caps the
    # whole tokenized sample (prompt + answer), not only the answer part.
    max_output_length: Optional[int] = field(
        default=1024,
        metadata=dict(help="Maximum output length for the tokenizer."),
    )

In [4]:
# The main function
# NOTE You can customize some logs to monitor your program.
def finetune():
    """Fine-tune a causal language model with LoRA on an instruction dataset.

    Command-line options are read from ``sys.argv`` and split into
    ``ModelArguments``, ``DataArguments`` and ``TrainingArguments``. The
    function then loads the tokenizer and model, attaches LoRA adapters,
    loads the CSV dataset, trains with ``Trainer`` and finally saves the
    result to ``/kaggle/working/sft_model``.
    """
    # Step 1: Define an arguments parser and parse the arguments
    # (three parts: model arguments, data arguments, and training arguments).
    #   * https://huggingface.co/docs/transformers/v4.46.3/en/internal/trainer_utils#transformers.HfArgumentParser
    #   * https://huggingface.co/docs/transformers/v4.46.3/en/main_classes/trainer#transformers.TrainingArguments
    parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Step 2: Load tokenizer and model
    #   * https://huggingface.co/docs/transformers/v4.46.3/en/main_classes/tokenizer#tokenizer
    #   * https://huggingface.co/docs/transformers/v4.46.3/en/model_doc/qwen2
    #   * https://huggingface.co/docs/transformers/perf_train_gpu_one
    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
    if tokenizer.pad_token is None:
        # The collator pads every batch, which requires a pad token.
        tokenizer.pad_token = tokenizer.eos_token

    # Honour --torch_dtype when it is given; otherwise keep the previous
    # behaviour of letting the checkpoint decide ("auto").
    if model_args.torch_dtype is None:
        torch_dtype = "auto"
    else:
        torch_dtype = getattr(torch, model_args.torch_dtype)

    model = AutoModelForCausalLM.from_pretrained(
        model_args.model_name_or_path,
        device_map="auto",
        torch_dtype=torch_dtype,
        # Original author's note: with a maximum length of 1024, batch_size=1
        # just fits into the memory of a single P100.
        max_position_embeddings=1024,
    )
    # NOTE(review): the model is not loaded in 8-bit/4-bit here. Per the PEFT
    # documentation this helper freezes the base weights and appears to upcast
    # half-precision parameters to float32, which would undo the memory saving
    # of a half-precision load -- confirm this is intended before removing it.
    model = prepare_model_for_kbit_training(model)
    lora_config = LoraConfig(
        r=32,
        lora_alpha=16,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_config)
    # No explicit model.to("cuda"): device_map="auto" has already placed the
    # weights, and moving a model dispatched across devices can fail.

    print(model.device)    # confirm where the model was loaded
    print(model.config)    # confirm the model's current configuration

    # Step 3: Load dataset
    #   * https://huggingface.co/docs/datasets/v3.1.0/en/package_reference/main_classes#datasets.Dataset
    dataset = datasets.load_dataset('csv', data_files=data_args.dataset_path)

    # Step 4: Define the data collator function. It tokenizes each batch,
    # pads it to a common length and prepares the labels.
    #   * https://huggingface.co/docs/transformers/model_doc/qwen2#transformers.Qwen2ForCausalLM
    data_collator = _build_sft_collator(tokenizer, data_args.max_output_length)

    # Step 5: Define the Trainer
    #   * https://huggingface.co/docs/transformers/main_classes/trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset['train'],
        data_collator=data_collator,
        # `tokenizer=` is deprecated in transformers 4.46 in favour of this name.
        processing_class=tokenizer,
    )

    # Step 6: Train!
    trainer.train()
    trainer.save_model("/kaggle/working/sft_model")


def _build_sft_collator(tokenizer, max_length: int):
    """Return a collator that builds padded inputs with prompt-masked labels.

    Args:
        tokenizer: Tokenizer used to encode the text; must have a pad token.
        max_length: Truncation limit applied to every tokenized sequence.

    Returns:
        A function mapping a list of dataset rows (dicts with the keys
        ``instruction``, ``input`` and ``output``) to the tokenizer output
        extended with a ``labels`` tensor.
    """

    def data_collator(batch: List[Dict]):
        # Missing (None) cells are treated as empty strings.
        prompts = [
            (example['instruction'] or "") + (example['input'] or "")
            for example in batch
        ]
        texts = [
            prompt + (example['output'] or "") + "<|endoftext|>"
            for prompt, example in zip(prompts, batch)
        ]

        model_inputs = tokenizer(
            texts,
            max_length=max_length,
            truncation=True,
            padding=True,
            return_tensors="pt",
        )
        # Tokenize all prompts in one unpadded call to learn how many leading
        # tokens of each sample belong to the prompt.
        # NOTE(review): assumes the prompt's tokens are a prefix of the full
        # sample's tokens; a merge across the prompt/answer boundary could
        # shift this by a token -- confirm for the tokenizer in use.
        prompt_ids = tokenizer(prompts, max_length=max_length, truncation=True)['input_ids']

        attention_mask = model_inputs['attention_mask']
        labels = model_inputs['input_ids'].clone()
        # Padding must not contribute to the loss.
        labels[attention_mask == 0] = -100
        for row, ids in enumerate(prompt_ids):
            # Index of the first real token: 0 with right padding, the number
            # of pad tokens with left padding.
            first_token = int(attention_mask[row].argmax())
            # Only the answer part is trained on; hide the prompt tokens.
            labels[row, first_token:first_token + len(ids)] = -100
        model_inputs['labels'] = labels

        return model_inputs

    return data_collator

In [5]:
# Pass your training arguments.
# NOTE [IMPORTANT!!!] DO NOT FORGET TO PASS PROPER ARGUMENTS TO SAVE YOUR CHECKPOINTS!!!
# Build the command line that finetune() parses; the first entry stands in
# for the program name.
sys.argv = ["notebook"]

# Where the model and data come from, and where checkpoints go.
sys.argv.extend(["--model_name_or_path", "/kaggle/input/qwen2.5/transformers/1.5b/1"])
sys.argv.extend(["--dataset_path", "/kaggle/input/alpaca-language-instruction-training/train.csv"])
sys.argv.extend(["--output_dir", "./output"])

# Training length and per-device batch size (one sample per step; the larger
# effective batch comes from gradient accumulation below).
sys.argv.extend(["--num_train_epochs", "2"])
sys.argv.extend(["--per_device_train_batch_size", "1"])

# Logging: write logs to ./logs and report every 10 steps.
sys.argv.extend(["--logging_dir", "./logs"])
sys.argv.extend(["--logging_steps", "10"])

# Checkpointing: save every 500 steps and keep only the most recent checkpoint.
sys.argv.extend(["--save_steps", "500"])
sys.argv.extend(["--save_total_limit", "1"])

# Optimisation: learning rate, 16-step gradient accumulation to emulate a
# larger batch, a small Adam epsilon for stability, linear LR schedule.
sys.argv.extend(["--learning_rate", "5e-5"])
sys.argv.extend(["--gradient_accumulation_steps", "16"])
sys.argv.extend(["--adam_epsilon", "1e-8"])
sys.argv.extend(["--lr_scheduler_type", "linear"])

# Disable the W&B integration, keep the raw dataset columns (the collator
# reads 'instruction', 'input' and 'output'), and run without evaluation.
sys.argv.extend(["--report_to", "none"])
sys.argv.extend(["--remove_unused_columns", "False"])
sys.argv.extend(["--eval_strategy", "no"])

finetune()

cuda:0
Qwen2Config {
  "_attn_implementation_autoset": true,
  "_name_or_path": "/kaggle/input/qwen2.5/transformers/1.5b/1",
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151643,
  "hidden_act": "silu",
  "hidden_size": 1536,
  "initializer_range": 0.02,
  "intermediate_size": 8960,
  "max_position_embeddings": 1024,
  "max_window_layers": 28,
  "model_type": "qwen2",
  "num_attention_heads": 12,
  "num_hidden_layers": 28,
  "num_key_value_heads": 2,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.46.3",
  "use_cache": true,
  "use_mrope": false,
  "use_sliding_window": false,
  "vocab_size": 151936
}



Generating train split: 0 examples [00:00, ? examples/s]

  trainer = Trainer(


Step,Training Loss
10,1.2271
20,1.0926
30,1.1421
40,1.1182
50,1.1766
60,1.1083
70,1.1369
80,1.1312
90,1.0542
100,1.1231
