In [1]:
from datasets import Dataset
from ReadLoad import read_json
from prompt_template import get_input_template, get_sys_prompt
from eval_data import key_word_score
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, GenerationConfig
from transformers import EarlyStoppingCallback

2024-07-20 14:45:34.229458: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-20 14:45:34.241982: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-20 14:45:34.257052: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-20 14:45:34.261491: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-20 14:45:34.272303: I tensorflow/core/platform/cpu_feature_guar

In [2]:
#模型下载
from modelscope import snapshot_download
model_dir = snapshot_download('qwen/Qwen2-7B-Instruct')

In [3]:
def processing(data):
    data = [{
        'instruction': '对用户提出的有关保险条款的问题给予准确、清晰的回答。',
        'input': get_input_template(d['产品名'], d['条款'], d['问题']),
        'output': d['答案']
    } for d in data]
    df = pd.DataFrame(data)
    ds = Dataset.from_pandas(df)
    return ds

### data pre-processing

In [4]:
train_data = read_json('dataset/train.json')
train_eval = [{
    '产品名': data['产品名'],
    '条款': data['条款'],
    '问题': data['问题'],
    '答案': data['答案'],
    'score': key_word_score(data['产品名'] + data['条款'] + data['问题'], data['答案'])
} for data in train_data]
train_data = [{key: value for key, value in data.items() if key != 'score'} 
              for data in train_eval if data['score'] >= 0.8]
train_data = processing(train_data)

dev_data = read_json('dataset/dev.json')
dev_data = processing(dev_data)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.693 seconds.
Prefix dict has been built successfully.


In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=False, trust_remote_code=True, use_cache=False)
tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Qwen2Tokenizer(name_or_path='/mnt/workspace/.cache/modelscope/hub/qwen/Qwen2-7B-Instruct', vocab_size=151643, model_max_length=131072, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [6]:
def process_func(example):
    # MAX_LENGTH = 512    # Llama分词器会将一个中文字切分为多个token，因此需要放开一些最大长度，保证数据的完整性
    input_ids, attention_mask, labels = [], [], []
    instruction = tokenizer(f"<|im_start|>system\n你是一个基于保险条款的问答系统<|im_end|>\n<|im_start|>user\n{example['instruction'] + example['input']}<|im_end|>\n<|im_start|>assistant\n", add_special_tokens=False)  # add_special_tokens 不在开头加 special_tokens
    response = tokenizer(f"{example['output']}", add_special_tokens=False)
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]  # 因为eos token咱们也是要关注的所以 补充为1
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]  
    # if len(input_ids) > MAX_LENGTH:  # 做一个截断
    #     # input_ids = input_ids[:MAX_LENGTH]
    #     # attention_mask = attention_mask[:MAX_LENGTH]
    #     # labels = labels[:MAX_LENGTH]
    #     return {}
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

MAX_LENGTH = 512
tokenized_id = train_data.map(process_func, remove_columns=train_data.column_names)
tokenized_id = tokenized_id.filter(lambda x: len(x["input_ids"]) < MAX_LENGTH)
tokenized_id

Map:   0%|          | 0/4449 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4449 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 3931
})

In [7]:
dev_data_id = dev_data.map(process_func, remove_columns=dev_data.column_names)
dev_data_id = dev_data_id.filter(lambda x: len(x["input_ids"]) < MAX_LENGTH)
dev_data_id

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 886
})

### 加载模型

In [8]:
import torch
import logging
logger = logging.getLogger(__name__)

model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", torch_dtype=torch.bfloat16, use_cache=False)
model.enable_input_require_grads() # 开启梯度检查点时，要执行该方法

# device_count = torch.cuda.device_count()
# if device_count > 0:
#     logger.debug("Select GPU device")
#     device = torch.device("cuda")
# else:
#     logger.debug("Select CPU device")
#     device = torch.device("cpu")


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [9]:
from peft import LoraConfig, TaskType, get_peft_model

config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, 
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=False, # 训练模式
    r=8, # Lora 秩
    lora_alpha=32, # Lora alaph，具体作用参见 Lora 原理
    lora_dropout=0.1# Dropout 比例
)
model = get_peft_model(model, config)
model.print_trainable_parameters()
#model.to(device)

trainable params: 20,185,088 || all params: 7,635,801,600 || trainable%: 0.26434798934534914


In [11]:
args = TrainingArguments(
    output_dir="./output/Qwen2_instruct_lora/20240720",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    logging_steps=50,
    eval_steps=100,
    eval_strategy="steps",
    num_train_epochs=5,
    save_steps=100, # 为了快速演示，这里设置10，建议你设置成100
    learning_rate=5e-5,
    save_on_each_node=True,
    #bf16=True,
    load_best_model_at_end=True,
    gradient_checkpointing=True
)

In [12]:
import torch

def print_memory_usage():
    allocated = torch.cuda.memory_allocated() / (1024 * 1024 * 1024)
    cached = torch.cuda.memory_reserved() / (1024 * 1024 * 1024)
    total_used = allocated + cached
    print(f'Allocated: {allocated:.2f} GB, Cached: {cached:.2f} GB, Total Used: {total_used:.2f} GB')

torch.cuda.empty_cache()
print_memory_usage()


Allocated: 14.70 GB, Cached: 14.92 GB, Total Used: 29.62 GB


In [13]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_id,
    eval_dataset=dev_data_id,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
trainer.train()

Detected kernel version 4.19.91, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


[2024-07-20 14:48:21,846] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)




Step,Training Loss,Validation Loss
100,0.1931,0.293287
200,0.1767,0.279972
300,0.1091,0.288233
400,0.0986,0.295482
500,0.0963,0.289023




TrainOutput(global_step=500, training_loss=0.1533053436279297, metrics={'train_runtime': 3178.3569, 'train_samples_per_second': 6.184, 'train_steps_per_second': 0.385, 'total_flos': 1.148309086585006e+17, 'train_loss': 0.1533053436279297, 'epoch': 2.034587995930824})

In [None]:
# #模型下载
# from modelscope import snapshot_download
# from modelscope import AutoModelForCausalLM, AutoTokenizer
# from prompt_template import get_sys_prompt
# from peft import PeftModel
# model_dir = snapshot_download('qwen/Qwen2-7B-Instruct')
# lora_path = './output/Qwen2_instruct_lora/checkpoint-600'

# device = "cuda"
# # the device to load the model onto

# model = AutoModelForCausalLM.from_pretrained(
#     model_dir,
#     torch_dtype="auto",
#     device_map="auto"
# )
# model = PeftModel.from_pretrained(model, model_id=lora_path)
# tokenizer = AutoTokenizer.from_pretrained(model_dir)


# def qwen_response(prompt):

#     messages = [
#         {"role": "system", "content": get_sys_prompt()},
#         {"role": "user", "content": prompt}
#     ]
#     text = tokenizer.apply_chat_template(
#         messages,
#         tokenize=False,
#         add_generation_prompt=True
#     )
#     model_inputs = tokenizer([text], return_tensors="pt").to(device)

#     generated_ids = model.generate(
#         model_inputs.input_ids,
#         max_new_tokens=512
#     )
#     generated_ids = [
#         output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
#     ]

#     response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
#     return response


##### qwen_response("你是谁")