In [1]:
from peft import LoraConfig, TaskType, get_peft_model
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, GenerationConfig

# Checkpoint locations: the tokenizer is read from the Llama-2 directory,
# the causal-LM weights from the OpenELM directory.
tokenizer_dir = '/root/autodl-tmp/Llama-2-7b-hf'
model_dir = '/root/autodl-tmp/OpenELM-3B-Instruct'

tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
def process_func(example, max_length=384):
    """Tokenize one instruction/response record into causal-LM training features.

    The prompt is ``en_instruction`` immediately followed by ``en_input`` and
    the literal text ``<sep>``; the target is ``en_output`` followed by the
    tokenizer's end-of-sequence id. Prompt positions are set to -100 in
    ``labels`` so that only the response and its terminator carry a label.

    Args:
        example: Mapping with the keys ``en_instruction``, ``en_input`` and
            ``en_output``. A ``None`` or empty ``en_input`` is treated as "".
        max_length: Maximum number of tokens kept per example. Longer
            sequences are cut at the end, so a truncated example loses its
            end-of-sequence token.

    Returns:
        dict with three equally long lists: ``input_ids``, ``attention_mask``
        and ``labels``.
    """
    # Null inputs would otherwise make the string concatenation below raise.
    extra_input = example['en_input'] or ""

    prompt = tokenizer(f"{example['en_instruction'] + extra_input}<sep>", add_special_tokens=True)
    answer = tokenizer(f"{example['en_output']}", add_special_tokens=False)

    # Terminate with the EOS id explicitly instead of relying on the pad token
    # having been aliased to EOS elsewhere in the notebook.
    eos_id = tokenizer.eos_token_id

    input_ids = prompt["input_ids"] + answer["input_ids"] + [eos_id]
    # The terminator is a real token, hence its attention-mask entry is 1.
    attention_mask = prompt["attention_mask"] + answer["attention_mask"] + [1]
    labels = [-100] * len(prompt["input_ids"]) + answer["input_ids"] + [eos_id]

    # Slicing is a no-op for sequences that already fit into max_length.
    return {
        "input_ids": input_ids[:max_length],
        "attention_mask": attention_mask[:max_length],
        "labels": labels[:max_length],
    }

In [3]:
from datasets import Dataset
import pandas as pd

import os
import pandas as pd
from datasets import Dataset

# Directory that holds the dataset's JSON shards.
folder_path = '/root/autodl-tmp/alpaca-chinese-dataset/data/'

# Collect the JSON file names; sorted so the row order does not depend on the
# arbitrary order in which os.listdir returns entries.
json_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.json'))
if not json_files:
    raise FileNotFoundError(f"No .json files found in {folder_path!r}")

# Read every shard first and concatenate once; calling pd.concat inside the
# loop re-copies all previously accumulated rows on every iteration.
combined_df = pd.concat(
    [pd.read_json(os.path.join(folder_path, name)) for name in json_files],
    ignore_index=True,
)

# Convert the merged frame to a Dataset and tokenize each row, dropping the
# original columns so only the features returned by process_func remain.
ds = Dataset.from_pandas(combined_df)
tokenized_id = ds.map(process_func, remove_columns=ds.column_names)
tokenized_id

Map:   0%|          | 0/52002 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 52002
})

In [4]:
# LoRA adapter settings for causal-LM fine-tuning.
config = LoraConfig(
    r=32,               # LoRA rank
    lora_alpha=32,      # LoRA alpha; see the LoRA method description for its exact role
    lora_dropout=0.1,   # dropout ratio
    inference_mode=False,
    target_modules=[
        'token_embeddings',
        "qkv_proj",
        "out_proj",
        "proj_1",
        "proj_2",
    ],
    task_type=TaskType.CAUSAL_LM,
)

In [5]:
# Called on the base model before it is wrapped with the LoRA adapters.
model.enable_input_require_grads()
peft_model = get_peft_model(model=model, peft_config=config)
peft_model.train(mode=True)
# Last expression of the cell: display the LoRA configuration.
config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='./OpenELM-3B-Instruct', revision=None, task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, inference_mode=False, r=32, target_modules={'token_embeddings', 'proj_1', 'proj_2', 'out_proj', 'qkv_proj'}, lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None)

In [6]:
# Report the trainable vs. total parameter counts of the PEFT-wrapped model.
peft_model.print_trainable_parameters()

trainable params: 45,883,392 || all params: 3,082,530,816 || trainable%: 1.4885


In [10]:
# Training hyperparameters for the LoRA run.
args = TrainingArguments(
    output_dir="/root/autodl-tmp/output/openelm_3B_lora",
    # Partial epoch for a quick demonstration (about 1200 iterations according
    # to the original note); roughly 10 epochs are suggested for a real run.
    num_train_epochs=0.87,
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    logging_steps=100,
    save_steps=600,
    save_on_each_node=True,
)

In [11]:
# Collator built from the tokenizer with padding enabled.
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True)

trainer = Trainer(
    model=peft_model,
    args=args,
    train_dataset=tokenized_id,
    data_collator=collator,
)

In [None]:
# Start the fine-tuning run with the trainer configured above.
trainer.train()