In [1]:
from datasets import Dataset
from ReadLoad import read_json
from prompt_template import get_input_template, get_sys_prompt
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, GenerationConfig

  from .autonotebook import tqdm as notebook_tqdm
2024-07-14 20:03:32.619612: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-14 20:03:32.660300: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
#模型下载
from modelscope import snapshot_download
model_dir = snapshot_download('qwen/Qwen2-7B-Instruct')

Downloading: 100%|██████████| 663/663 [00:00<00:00, 1.37kB/s]
Downloading: 100%|██████████| 48.0/48.0 [00:00<00:00, 93.1B/s]
Downloading: 100%|██████████| 243/243 [00:00<00:00, 604B/s]
Downloading: 100%|██████████| 11.1k/11.1k [00:00<00:00, 29.2kB/s]
Downloading: 100%|██████████| 1.59M/1.59M [00:00<00:00, 3.65MB/s]
Downloading: 100%|██████████| 3.67G/3.67G [00:10<00:00, 393MB/s] 
Downloading: 100%|██████████| 3.60G/3.60G [00:10<00:00, 373MB/s] 
Downloading: 100%|██████████| 3.60G/3.60G [00:11<00:00, 330MB/s] 
Downloading: 100%|██████████| 3.31G/3.31G [00:09<00:00, 360MB/s] 
Downloading: 100%|██████████| 27.1k/27.1k [00:00<00:00, 64.9kB/s]
Downloading: 100%|██████████| 6.41k/6.41k [00:00<00:00, 16.9kB/s]
Downloading: 100%|██████████| 6.70M/6.70M [00:00<00:00, 11.2MB/s]
Downloading: 100%|██████████| 1.26k/1.26k [00:00<00:00, 2.38kB/s]
Downloading: 100%|██████████| 2.65M/2.65M [00:00<00:00, 6.04MB/s]


In [3]:
def processing(data_path):
    data = read_json(data_path)
    train_data = [{
        'instruction': '对用户提出的有关保险条款的问题给予准确、清晰的回答。',
        'input': get_input_template(d['产品名'], d['条款'], d['问题']),
        'output': d['答案']
    } for d in data]
    df = pd.DataFrame(train_data)
    ds = Dataset.from_pandas(df)
    return ds 

In [4]:
train_data = processing('dataset/train.json')
dev_data = processing('dataset/dev.json')

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=False, trust_remote_code=True, use_cache=False)
tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Qwen2Tokenizer(name_or_path='/mnt/workspace/.cache/modelscope/hub/qwen/Qwen2-7B-Instruct', vocab_size=151643, model_max_length=131072, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [6]:
def process_func(example):
    MAX_LENGTH = 384    # Llama分词器会将一个中文字切分为多个token，因此需要放开一些最大长度，保证数据的完整性
    input_ids, attention_mask, labels = [], [], []
    instruction = tokenizer(f"<|im_start|>system\n你是一个基于保险条款的问答系统<|im_end|>\n<|im_start|>user\n{example['instruction'] + example['input']}<|im_end|>\n<|im_start|>assistant\n", add_special_tokens=False)  # add_special_tokens 不在开头加 special_tokens
    response = tokenizer(f"{example['output']}", add_special_tokens=False)
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]  # 因为eos token咱们也是要关注的所以 补充为1
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]  
    if len(input_ids) > MAX_LENGTH:  # 做一个截断
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

tokenized_id = train_data.map(process_func, remove_columns=train_data.column_names)
tokenized_id

                                                                

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 5000
})

In [7]:
dev_data_id = dev_data.map(process_func, remove_columns=dev_data.column_names)
dev_data_id

                                                               

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})

In [8]:
import torch

model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto",torch_dtype=torch.bfloat16)
model.enable_input_require_grads() # 开启梯度检查点时，要执行该方法

Loading checkpoint shards: 100%|██████████| 4/4 [00:47<00:00, 11.82s/it]


In [9]:
from peft import LoraConfig, TaskType, get_peft_model

config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, 
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=False, # 训练模式
    r=8, # Lora 秩
    lora_alpha=32, # Lora alaph，具体作用参见 Lora 原理
    lora_dropout=0.1# Dropout 比例
)
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 20,185,088 || all params: 7,635,801,600 || trainable%: 0.26434798934534914


In [29]:
args = TrainingArguments(
    output_dir="./output/Qwen2_instruct_lora",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    logging_steps=50,
    eval_steps=200,
    eval_strategy="steps",
    num_train_epochs=3,
    save_steps=200, # 为了快速演示，这里设置10，建议你设置成100
    learning_rate=1e-4,
    save_on_each_node=True,
    load_best_model_at_end=True,
    #gradient_checkpointing=True
)

In [30]:
torch.cuda.empty_cache()

In [31]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_id,
    eval_dataset=dev_data_id, 
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)
trainer.train()

Detected kernel version 4.19.91, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss
200,0.2493,0.251654
400,0.1516,0.249524
600,0.1452,0.244188
800,0.0668,0.296206




TrainOutput(global_step=936, training_loss=0.1578593526640509, metrics={'train_runtime': 5619.9022, 'train_samples_per_second': 2.669, 'train_steps_per_second': 0.167, 'total_flos': 2.1527664606133862e+17, 'train_loss': 0.1578593526640509, 'epoch': 2.9952})

In [32]:
#模型下载
from modelscope import snapshot_download
from modelscope import AutoModelForCausalLM, AutoTokenizer
from prompt_template import get_sys_prompt
from peft import PeftModel
model_dir = snapshot_download('qwen/Qwen2-7B-Instruct')
lora_path = './output/Qwen2_instruct_lora/checkpoint-600'

device = "cuda" 
# the device to load the model onto

model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, model_id=lora_path)
tokenizer = AutoTokenizer.from_pretrained(model_dir)


def qwen_response(prompt):

    messages = [
        {"role": "system", "content": get_sys_prompt()},
        {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(device)

    generated_ids = model.generate(
        model_inputs.input_ids,
        max_new_tokens=512
    )
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response


Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  2.94it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [34]:
qwen_response("Hi")

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.02 GiB. GPU 