# 手撕LLM实操脚本-全流程+RLHF

本实操由"小冬瓜AIGC"创建
微信：xiaodongguaAIGC

该版本涵盖：
- 医疗数据处理
- Pretrained + LoRA
- SFT + LoRA
- DPO
- Reward Model + LoRA
- RLHF PPO + LoRA
- 配备测试程序

可以在消费级笔记本电脑/Colab运行的LLaMA微调Demo

# 配置

In [30]:
!pip3 install torch numpy evaluate tqdm
!pip3 install -q -U transformers accelerate datasets trl git+https://github.com/huggingface/peft.git
!pip3 install -q bitsandbytes sentencepiece



In [31]:
!pip3 install wandb



In [32]:
!nvidia-smi

Sat Apr 20 14:09:44 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   55C    P0              25W /  70W |  13401MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  Tesla T4                       Off | 00000000:00:05.0 Off |  

In [33]:
# 通用库

import torch
import torch.nn as nn
# import evaluate
import numpy as np
import tqdm
import sys
from typing import Dict, Optional, Any, Dict, List, Optional, Union
from dataclasses import dataclass, field

# Huggingface Transformers系列库

from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, TrainerCallback
from transformers import AutoTokenizer, AutoConfig, DataCollatorForLanguageModeling
from transformers import GPT2Config, GPT2ForSequenceClassification, AutoModelForSequenceClassification
from transformers import PreTrainedTokenizerBase
from transformers import Adafactor, pipeline
from transformers import BitsAndBytesConfig
from transformers.utils import PaddingStrategy

from accelerate import Accelerator

from datasets import load_dataset, load_from_disk, concatenate_datasets, Dataset, DatasetDict

from peft import PeftModel, PeftConfig, LoraConfig
from peft import TaskType, get_peft_model, get_peft_config

from trl import SFTTrainer, DPOTrainer
from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer, set_seed
from trl.core import LengthSampler
from trl.trainer import ConstantLengthDataset



In [34]:
# Global hyper-parameters and switches shared by the cells below.
batch_size = 8  # NOTE(review): not referenced by the training cells visible in this notebook
max_length = 256  # character cap used by prepare_data_pretrained; reassigned in the RM data cell
max_steps = 1000  # only referenced from commented-out `max_steps=max_steps` lines in this notebook
device = 'cuda:0'  # device that tokenized prompts are moved to before generate()/scoring
# device = 'cuda'
lora_r = 8
debug_mode = False  # True: build and train a tiny LLaMA from scratch instead of the 7B checkpoint
use_pretrained_text_data = True  # True: build the pre-training set from local .txt files
block_size = 256  # chunk length (in tokens) produced by group_texts

In [35]:
import os

# Root directory for everything this notebook saves: the current directory
# when running locally, or a folder on Google Drive when Drive is mounted.
if not os.path.exists('/content/drive/MyDrive'):
    temp_path = './'
    print("使用本地路径")
else:
    temp_path = '/content/drive/MyDrive/llama2-medical/'
    # Create the Drive folder the first time the notebook runs.
    if not os.path.exists('/content/drive/MyDrive/llama2-medical'):
        os.makedirs(temp_path)
        print("文件夹已创建")

使用本地路径


In [36]:
# Dataset id and output locations for every training stage.
# "<name>" holds the LoRA adapter of a stage, "<name>-full" the merged model.
datasets_name = 'shibing624/medical'
model_pretrained_name = temp_path + 'llama2-medical-pretrained'
model_pretrained_name_full = model_pretrained_name + '-full'

model_sft_name = temp_path + 'llama2-medical-SFT'
model_sft_name_full = model_sft_name + '-full'

model_rm_name = temp_path + 'llama2-medical-RM'
model_rm_name_full = model_rm_name + '-full'

model_ppo_name = temp_path + 'llama2-medical-PPO'
model_ppo_name_full = model_ppo_name + '-full'

model_dpo_name = temp_path + 'llama2-medical-DPO'

# LLaMA 7B runs out of memory on Colab; use the following with a local GPU.
model_name = 'hfl/chinese-alpaca-2-7b'
model_base_name = 'hfl/chinese-alpaca-2-7b'
tokenizer_name = 'hfl/chinese-alpaca-2-7b'

# # LLaMA 1B: the native LLaMA tokenizer handles Chinese poorly and produces
# # many extra tokens.
# model_base_name = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
# tokenizer_name = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
# model_name = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'

# # Alternative checkpoint (presumably a tiny random-weight LLaMA for smoke
# # tests - confirm); same native-tokenizer caveat for Chinese as above.
# model_base_name = 'HuggingFaceM4/tiny-random-LlamaForCausalLM'
# tokenizer_name = 'HuggingFaceM4/tiny-random-LlamaForCausalLM'
# model_name = 'HuggingFaceM4/tiny-random-LlamaForCausalLM'

# if debug_mode:
#     model_name = temp_path + './LLaMA_base_baby'

# The plain-text corpus has to be uploaded to Google Drive beforehand.
if os.path.exists('/content/drive/MyDrive'):
    dataset_dir = '/content/drive/MyDrive/med_qa_textbook'  # contains 33 Chinese medical .txt corpus files
else :
    dataset_dir = './med_qa_textbook'  # path used when running locally
data_cache_dir = 'temp_data_cache_dir'

In [37]:
# QLoRA: load the base weights in 4-bit NF4 with bfloat16 as the compute dtype.
# NOTE(review): the training cells set fp16=True and the nvidia-smi dump shows
# Tesla T4 GPUs - confirm bfloat16 is the intended compute dtype here.
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [38]:

# Module-name lists that LoRA adapters can be attached to.
lora_full = ['embed_tokens', 'lm_head', 'q_proj', 'k_proj', 'v_proj', 'o_proj',
             'gate_proj','up_proj','down_proj']
lora_pretrained = ['embed_tokens', 'lm_head', 'q_proj', 'k_proj', 'v_proj', 'o_proj', 'down_proj']
lora_finetune = [ 'q_proj', 'k_proj', 'v_proj', 'o_proj', 'down_proj']

# All three configs take their rank from the global `lora_r` knob (previously
# each one hard-coded r=8, so changing `lora_r` had no effect).

# Continued pre-training: also adapts the embedding and output layers.
pretrained_lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=lora_r,
    target_modules = lora_pretrained,
    modules_to_save = None,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Causal-LM fine-tuning (used by the SFT and PPO cells).
lm_lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=lora_r,
    target_modules = lora_finetune,
    modules_to_save = None,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Reward model: sequence classification on the same projection layers.
rm_lora_config = LoraConfig(
    task_type="SEQ_CLS",
    r=lora_r,
    target_modules = lora_finetune,
    modules_to_save = None,
    lora_alpha=32,
    lora_dropout=0.05,
    inference_mode=False,
    bias="none",
)


# 中文tokenizer

In [39]:
# Load the tokenizer.
# NOTE(review): the five constants below are not referenced by the cells
# visible in this notebook.
IGNORE_INDEX = -100
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"

print(tokenizer_name)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_fast = False)
tokenizer.pad_token = tokenizer.eos_token # the original LLaMA tokenizer has no pad token, so EOS is used for padding everywhere
print(tokenizer)


hfl/chinese-alpaca-2-7b
LlamaTokenizer(name_or_path='hfl/chinese-alpaca-2-7b', vocab_size=55296, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '</s>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	32000: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}


In [40]:
input_string = '我是小冬瓜，爱学习计算机科学'
input_ids = tokenizer(input_string)
print(input_ids['input_ids'])
output_string = tokenizer.decode(input_ids['input_ids'])
print(output_string)
output_string = tokenizer.decode(input_ids['input_ids'][3])
print(output_string)

[1, 32553, 30392, 30446, 33900, 34818, 30214, 32045, 32087, 33893, 32661]
<s>我是小冬瓜，爱学习计算机科学
小


# 数据集

In [41]:
# 该数据集已经包含Pretrained、fintune、Reward数据集代码， 仅加载Reward，用于教程
# Pretrained采用加载txt的方式，通用性更好
datasets = load_dataset(datasets_name, 'reward')

pretrain

train_encyclopedia.json: 共36万条，来自医疗百科数据FreedomIntelligence/huatuo_encyclopedia_qa , 拼接 questions 和 answers，形成 text 文本字段，语句通顺，用于预训练注入医疗知识。 medical_book_zh.json: 共8475条，来自医疗教材的文本数据，来源：https://github.com/jind11/MedQA， 原始数据集：google drive ，只对长段落切分为2048字的小段落了。

finetune

train_zh_0.json: 共195万条，来自1）中文医疗对话数据集Toyhom/Chinese-medical-dialogue-data的六个科室医疗问诊数据， 有79万条；2）在线医疗百科 huatuo_encyclopedia_qa ，有36万条；3）医疗知识图谱 huatuo_knowledge_graph_qa，有79万条。三部分合并，共195万条。 train_en_1.json：共11万条，来自英文医疗问诊对话数据Kent0n-Li/ChatDoctor，合并了HealthCareMagic-100k、GenMedGPT-5k 数据集，共11万条。

reward

train.json 共4000条，问题来自中文医疗对话数据集Toyhom/Chinese-medical-dialogue-data的随机4000条提问，response_chosen来自该数据集的医生答复， response_rejected来自本草模型SCIR-HI/Huatuo-Llama-Med-Chinese的答复。

In [42]:
print(datasets)

DatasetDict({
    train: Dataset({
        features: ['question', 'response_chosen', 'response_rejected'],
        num_rows: 3800
    })
    validation: Dataset({
        features: ['question', 'response_chosen', 'response_rejected'],
        num_rows: 100
    })
    test: Dataset({
        features: ['question', 'response_chosen', 'response_rejected'],
        num_rows: 100
    })
})


In [43]:
# The human (doctor) answer is "chosen"; the answer produced by another LLM
# is "rejected". Fetch the row once and print its three fields.
sample_index = 7
sample = datasets['train'][sample_index]
print("Question: ", sample['question'])
print("response_chosen: ", sample['response_chosen'])
print("response_rejected: ", sample['response_rejected'])

Question:  病毒性疱疹怎样治疗
response_chosen:  尖锐湿疣治疗方法较多，以去处局部增生性疣为主，在选择时仍以有效，简便，安全，不引起疤痕为基本原则。治疗注意事项有：1。要消除尖锐湿疣病人的恐惧心理。2。对尖锐疣病人要进行有关性病的检查，特别是要进行梅毒，淋病，非淋菌性尿道生殖道炎，生殖器疱疹，软下疳，艾滋病等性病的检查，若发现有相关疾病应进行治疗。生殖器疱疹是由单纯疱疹病毒引起的性传播疾病，主要是HSV-2型，少数为HSV-1型。是常见的性病之一。生殖器疱疹可反复发作，对病人的健康和心理影响较大；还可通过胎盘及产道感染新生儿，导致新生儿先天性感染。因此该病也是较为严重的公共卫生问题之一，应对其有效的防治引起重视。人是单纯疱疹病毒的惟一自然宿主。发作期、恢复期病人，以及无明显症状的感染病毒者为该病的传染源。主要通过病损处的水疱疱液、局部渗出液、病损皮肤黏膜表面等存在的病毒进行传播。该病主要通过性行为传染，通过被污染物品的间接传染较少。此外，患生殖器疱疹的母亲，在分娩过程中，经过产道可将病毒直接传染给新生儿，或怀孕过程中患病，病毒可通过胎盘传给胎儿。感染者主要通过性接触而传染给其性伴侣。男性同性性行为者传染的危险性也很大。有时在口唇及其周围患有疱疹的人，可通过口－生殖器性交，使对方感染生殖器疱疹。因此，不同方式的异性或同性性行为，都可以传播生殖器疱疹。由于有感染性的病毒能在潮湿的环境中存活数小时，因而也有在少数症状下通过污染物而间接传播。生殖器疱疹的治疗方法很多，包括抗病毒、一般治疗等。不同治疗方法具各自的优缺点及适应症。而生殖器疱疹病人症状，病因等存在差异，所以要注意选择合适的方法。日常生活中要注意戒酒，不能吃辛辣、海鲜和油腻性食物。
response_rejected:  目前尚无特效药物可以治愈单纯疱疹。对于轻度的感染者可采用局部外用药物进行处理；而对重症患者则需住院观察和使用抗生素等药物控制病情。


# 创建一个Baby-LLaMA(optional)

In [44]:
# Skip this step when running on Colab or on a GPU with enough memory.
# Without GPU resources, build a "baby LLaMA" with very few parameters; it
# has to be trained from scratch because it is created from a config only.
if debug_mode:
  config = AutoConfig.from_pretrained(model_base_name)
  print(config)
  # Shrink the architecture: 1 layer, 4 heads, hidden size 256.
  config.num_attention_heads = 4
  config.num_key_value_heads = 4
  config.num_hidden_layers = 1
  config.hidden_size = 256
  config.intermediate_size = 768
  model = AutoModelForCausalLM.from_config(config)
  print(model)

In [45]:
# 保存成base mode，从头训练
if debug_mode:
    model.save_pretrained(model_name)
    tokenizer.save_pretrained(model_name)

# Pretrained训练

## 创建Pretrained数据集

In [46]:
def prepare_data_pretrained(example):
    """Tokenize one reward-dataset row as plain pre-training text.

    The question and the rejected response are concatenated, capped so that
    the text plus the EOS marker is at most ``max_length`` characters, and
    then tokenized.

    Args:
        example: a row providing ``question`` and ``response_rejected``.

    Returns:
        The tokenizer output for the assembled text.
    """
    eos = tokenizer.eos_token
    # Truncate the body *before* appending EOS: slicing the finished string
    # cut the EOS marker off every sample longer than max_length characters.
    body_budget = max(max_length - len(eos), 0)
    body = f"{example['question']}{example['response_rejected']}"[:body_budget]
    # Build the text in a local variable instead of overwriting
    # example['question'] on the caller's row.
    return tokenizer(body + eos)

# Tokenize every split row by row, and create the causal-LM collator
# (mlm=False) that the Trainer cells below reuse.
datasets_pretrained = datasets.map(prepare_data_pretrained)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
print(datasets_pretrained)

Map:   0%|          | 0/3800 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'response_chosen', 'response_rejected', 'input_ids', 'attention_mask'],
        num_rows: 3800
    })
    validation: Dataset({
        features: ['question', 'response_chosen', 'response_rejected', 'input_ids', 'attention_mask'],
        num_rows: 100
    })
    test: Dataset({
        features: ['question', 'response_chosen', 'response_rejected', 'input_ids', 'attention_mask'],
        num_rows: 100
    })
})


In [47]:
# 新增'input_ids', 'token_type_ids', 'attention_mask'
print(datasets['train'])

Dataset({
    features: ['question', 'response_chosen', 'response_rejected'],
    num_rows: 3800
})


## 基于医疗文本创建预训练数据集

In [48]:
from pathlib import Path
from itertools import chain

def tokenize_function(examples):
    """Tokenize the ``text`` column of a batch with the notebook tokenizer."""
    return tokenizer(examples["text"])

def group_texts(examples):
    """Pack a batch of tokenized rows into fixed-size training chunks.

    Every column is flattened into one long list and cut into slices of
    ``block_size`` items. When the flattened length is at least
    ``block_size`` the incomplete tail is dropped; a shorter batch is kept
    as a single short chunk. ``labels`` is a shallow copy of ``input_ids``.

    Args:
        examples: mapping from column name to a list of per-row lists.

    Returns:
        A dict with the chunked columns plus ``labels``.
    """
    merged = {}
    for column in examples.keys():
        merged[column] = list(chain.from_iterable(examples[column]))

    # The first column decides how many items can be used.
    first_column = tuple(examples.keys())[0]
    usable = len(merged[first_column])
    if usable >= block_size:
        usable -= usable % block_size

    starts = range(0, usable, block_size)
    chunked = {}
    for column, tokens in merged.items():
        chunked[column] = [tokens[start:start + block_size] for start in starts]
    chunked["labels"] = chunked["input_ids"].copy()
    return chunked

# Build the pre-training set from plain-text medical books: each .txt file
# is tokenized, packed into block_size chunks, and the results are
# concatenated and split into train/test.
use_pretrained_text_data = True
if use_pretrained_text_data:
    # Kaggle layout first (unchanged behaviour there); otherwise fall back to
    # the dataset_dir / data_cache_dir configured earlier in the notebook.
    path = Path("/kaggle/input") / "medqatext"
    if path.is_dir():
        cache_dir = Path("/kaggle/working/")
    else:
        path = Path(dataset_dir)
        cache_dir = Path(data_cache_dir)
    data_files = str(path)
    os.makedirs(cache_dir, exist_ok=True)

    files = sorted(file.name for file in path.glob("*.txt"))
    if not files:
        # Fail early with a clear message instead of an AttributeError later.
        raise FileNotFoundError(f"No .txt corpus files found in {data_files}")

    per_file_datasets = []
    for file in files:
        data_file = os.path.join(data_files, file)
        filename = ''.join(file.split(".")[:-1])
        cache_path = os.path.join(cache_dir, filename)
        os.makedirs(cache_path, exist_ok=True)
        cache_dir_text = os.path.join(data_cache_dir, filename + "_text")
        os.makedirs(cache_dir_text, exist_ok=True)

        raw_dataset = load_dataset("text", data_files=data_file, cache_dir=cache_dir_text, keep_in_memory=False)
        print(f"{file} has been loaded")
        # The cache files live in the per-file directory. They used to be
        # written to the shared cache_dir, so with load_from_cache_file=True
        # every file after the first picked up the first file's cache.
        tokenized_dataset = raw_dataset.map(
            tokenize_function,
            batched=True,
            num_proc=8,
            remove_columns="text",
            load_from_cache_file=True,
            keep_in_memory=False,
            cache_file_names={k: os.path.join(cache_dir_text, 'tokenized.arrow') for k in raw_dataset},
            desc="Running tokenizer on dataset",
        )
        grouped_datasets = tokenized_dataset.map(
            group_texts,
            batched=True,
            num_proc=8,
            load_from_cache_file=True,
            keep_in_memory=False,
            cache_file_names={k: os.path.join(cache_dir_text, 'grouped.arrow') for k in tokenized_dataset},
            desc=f"Grouping texts in chunks of {block_size}",
        )
        processed_dataset = grouped_datasets
        processed_dataset.save_to_disk(cache_path)
        per_file_datasets.append(processed_dataset["train"])

    # All per-file datasets must share one schema before concatenation.
    reference_type = per_file_datasets[0].features.type
    for part in per_file_datasets[1:]:
        if part.features.type != reference_type:
            raise ValueError("Per-file datasets have mismatching features")
    datasets_pretrained = concatenate_datasets(per_file_datasets)

    datasets_pretrained = datasets_pretrained.train_test_split(test_size = 0.05)

    # Preview one decoded chunk per split (row 10, or the last row of a
    # smaller split).
    for split_name in ('train', 'test'):
        split = datasets_pretrained[split_name]
        print(tokenizer.decode(split[min(10, len(split) - 1)]['input_ids']))

.txt has been loaded


Grouping texts in chunks of 256 (num_proc=8):   0%|          | 0/4141 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1467 [00:00<?, ? examples/s]

 组织及个人因素相互作用而影响到自身的健康和幸福。本章将选择医院这样特殊的场所，并以安全为主题，讨论医院安全管理。<s> 医院作为与人类生命安全健康联系最紧密的社会机构，其自身质量安全、风险管理水平直接关系到患者、医务人员的生命安全和身心健康。医院安全管理(hospital safety management)是指通过对医院进行有效和科学的管理，保证医务入员在提供医疗卫生服务、患者及其家属在接受医疗卫生服务的过程中，不受医院内在不良因素的影响和伤害。医院安全管理要根据系统论原理，运用现代安全管理策略、方法和手段，识别、控制医院内各种潜在的不安全因素，从技术上、组织上和管理上，建立有效的医院安全管理体系，预防和减少患者及医务人员在诊疗过程中的安全风险事件。其内涵一是保障医务人员和患者在医院活动过程中不受伤害；二是保障医院在经营管理过程中良性运转，降低医院安全风险，减少医院纠纷。医院安全管理是医院管理的核心内容，是全面提升医疗质量的关键，是实现优质医疗服务的基础。<s> 第一节医院安全<s> 一、医院安全概述<s>
 相关样本。对可能被污染的物品、场所、环境、动植物等进行消毒、杀虫、灭鼠等卫生学处理。疫区内重点部位要开展经常性消毒。<s> 6.疫区内家禽、家畜应实行圈养。如有必要，报经当地政府同意后，对可能染疫的野生动物、家禽家畜进行控制或捕杀。7开展健康教育，提高居民自我保护意识，做到群防群治。8现场处理结束时要对疫源地进行终末消毒，妥善处理医疗废物和临时隔离点的物品。根据对控制措施效果评价，以及疾病原因的进一步调查结果，及时改进、补充和完善各项控制措施。<s> 七、临床救治原则<s> （一）疑似传染病的救治在群体性不明原因疾病处置中，鉴于传染病对人群和社会危害较大，因此，在感染性疾病尚未明心已确是否具有传染性之前，应按传染病进行救治。<s> 第二十章突发公共卫生事件及其应急策略359<s>救治原则：隔离患者，病原治疗，一般治疗与病情观察，对症治疗。（二）疑似非传染性疾病的救治1.疑似食物


In [49]:
print(datasets_pretrained)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1393
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 74
    })
})


## 加载base训练模型

In [54]:
# Drop the previous model, if any, before loading the base model.
# `model` only exists when an earlier cell created it (e.g. the debug-mode
# baby LLaMA), so an unconditional `del model` raises NameError otherwise.
if 'model' in globals():
    del model
#if not debug_mode:
#    torch.cuda.empty_cache()

In [69]:
!pip install -i https://pypi.org/simple/ bitsandbytes
!pip install accelerate



Looking in indexes: https://pypi.org/simple/


In [70]:

# Load the base model for continued pre-training.
print(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # Quantize only outside debug mode, like the later loading cells do: the
    # debug baby model is trained without adapters and must stay unquantized.
    quantization_config=bnb_config if not debug_mode else None,
    device_map='auto'
)
model.config.use_cache = False
model.config.pad_token_id = model.config.eos_token_id

if not debug_mode:
    # NOTE(review): `pretrained_lora_config` is defined for this stage but
    # `lm_lora_config` is what is applied here - confirm which is intended.
    model = get_peft_model(model, lm_lora_config)
    model.print_trainable_parameters()
# No `model.to(device)`: device_map='auto' has already placed the weights,
# and moving a bitsandbytes-quantized / accelerate-dispatched model with
# `.to()` is rejected by many transformers versions.

hfl/chinese-alpaca-2-7b


ImportError: Using `bitsandbytes` 8-bit quantization requires Accelerate: `pip install accelerate` and the latest version of bitsandbytes: `pip install -i https://pypi.org/simple/ bitsandbytes`

In [None]:
# Smoke test: generate an answer with the model before any training.
# Question: what are the clinical signs of mild cataract?
prompt = '轻度白内障的临床表现有些什么？'
input_ids = tokenizer(prompt, return_tensors="pt").to(device)
# output = model.generate(**input_ids, max_new_tokens=50, top_k=200, penalty_alpha=1.6, do_sample=True)
output = model.generate(**input_ids, max_new_tokens=100)
# One prompt gives a batch of size 1, so the only valid row is index 0
# (`output[1]` raised IndexError).
response = tokenizer.decode(output[0], skip_special_tokens=True)
print(response)

## 设置训练参数

In [None]:
# Training arguments for the continued pre-training stage.
# max_steps = 10
eval_freq = 500
save_freq = 500
log_freq = 10
num_train_epochs = 1

# NOTE(review): with max_steps=10 below, eval_steps/save_steps (500) and
# warmup_steps (100) are all larger than the run - confirm this is only
# meant as a debugging configuration.
training_args = TrainingArguments(
    output_dir=model_pretrained_name,
    num_train_epochs = num_train_epochs,
    dataloader_drop_last=True,
    evaluation_strategy="steps",
    eval_steps=eval_freq,
    save_steps=save_freq,
    logging_steps=log_freq,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=16,
    # max_steps=max_steps,
    warmup_steps=100,
    gradient_accumulation_steps=8,
    learning_rate=1e-5,
    lr_scheduler_type="cosine",
    weight_decay=0.05,
    fp16=True,
    logging_first_step=True,
    # report_to="wandb",
    max_steps=10, # limited to 10 steps to make debugging quick
)

## Pretrained模型训练

In [None]:
trainer = Trainer(model=model,
                  args=training_args,
                  train_dataset=datasets_pretrained['train'],
                  eval_dataset=datasets_pretrained['test'],
                  data_collator=data_collator)

In [None]:
trainer.evaluate()

In [None]:
trainer.train()

In [None]:
# 保存预训练好的模型，这里保存的是adapter
model.save_pretrained(model_pretrained_name)
tokenizer.save_pretrained(model_pretrained_name)

## 模型合并

https://huggingface.co/docs/peft/conceptual_guides/lora

使用 `merge_and_unload()` 函数可以将 LoRA adapter 的权重合并进 base model，合并后的完整模型可作为后续阶段的基线模型。

In [None]:
# model = model.merge_and_unload()
# model.save_pretrained(model_pretrained_name_full)
# tokenizer.save_pretrained(model_pretrained_name_full)

In [None]:
print(model)

## Pretrained模型测试

In [None]:
# Question:  轻度白内障的临床表现有些什么？
# test 程序,
prompt = '轻度白内障的临床表现有些什么？'
input_ids = tokenizer(prompt, return_tensors="pt").to(device)
output = model.generate(**input_ids, max_new_tokens=50)
response = tokenizer.decode(output[0], skip_special_tokens=True)
print(response)

## 合并 LoRA 权重并保存完整模型（merge LoRA）

In [None]:
del model
# del tensor
# del optimizer
if not debug_mode:
    torch.cuda.empty_cache()

In [None]:
# Reload the original base model in fp16 on the CPU, attach the adapter
# saved by the pre-training stage, and fold the LoRA weights into the base.
model = AutoModelForCausalLM.from_pretrained(
    model_name, # base checkpoint (model_name)
    device_map = 'cpu',
    torch_dtype=torch.float16
)

model = PeftModel.from_pretrained(
    model,
    model_pretrained_name, # adapter directory written by the pre-training stage
    device_map='cpu',
)

# After merging, `model` is a plain model without PEFT wrappers.
model = model.merge_and_unload()

In [None]:
print(model)

In [None]:
model.save_pretrained(model_pretrained_name_full)
tokenizer.save_pretrained(model_pretrained_name_full)

In [None]:
# Question:  轻度白内障的临床表现有些什么？
# test 程序
model = AutoModelForCausalLM.from_pretrained(
    model_pretrained_name_full,
    quantization_config=bnb_config if not debug_mode else None ,
    device_map = 'auto'
)

prompt = '轻度白内障的临床表现有些什么？'
input_ids = tokenizer(prompt, return_tensors="pt").to(device)
output = model.generate(**input_ids, max_new_tokens=100)
response = tokenizer.decode(output[0], skip_special_tokens=True)
print(response)

# SFT训练

In [None]:
del model
# del tensor
# del optimizer
if not debug_mode:
    torch.cuda.empty_cache()

## SFT数据处理

In [None]:
datasets = load_dataset(datasets_name, 'reward')
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
def prepare_sample_text(example):
    """Format one row as an SFT sample: question, rejected answer, then EOS."""
    question = example['question']
    answer = example['response_rejected']
    eos = tokenizer.eos_token
    return f"Question: {question}\n\nAnswer: {answer}{eos}"


def prepare_sample_text_pertrained(example):
    """Concatenate question and rejected answer with no template and no EOS."""
    question = example['question']
    answer = example['response_rejected']
    return f"{question}{answer}"


def chars_token_ratio(dataset, tokenizer, nb_examples=400):
    """Estimate the average number of characters per token.

    Formats up to ``nb_examples`` rows with ``prepare_sample_text`` and
    divides the total character count by the total token count.

    Args:
        dataset: iterable of rows accepted by ``prepare_sample_text``.
        tokenizer: tokenizer used to count tokens (fast or slow).
        nb_examples: maximum number of rows to sample.

    Returns:
        Characters divided by tokens over the sampled rows.
    """
    char_count = 0
    token_count = 0
    for _, sample in zip(range(nb_examples), iter(dataset)):
        formatted = prepare_sample_text(sample)
        char_count += len(formatted)
        # Fast and slow tokenizers expose the token list differently.
        if tokenizer.is_fast:
            pieces = tokenizer(formatted).tokens()
        else:
            pieces = tokenizer.tokenize(formatted)
        token_count += len(pieces)
    return char_count / token_count


def create_sft_datasets(datasets, tokenizer, seq_length=128):
    """Wrap the train and test splits as constant-length SFT datasets.

    Args:
        datasets: mapping with ``train`` and ``test`` splits.
        tokenizer: tokenizer handed to ``ConstantLengthDataset``.
        seq_length: sequence length handed to ``ConstantLengthDataset``.

    Returns:
        ``(train_dataset, valid_dataset)``; the training dataset is built
        with ``infinite=True``, the validation one with ``infinite=False``.
    """
    splits = {"train": datasets["train"], "valid": datasets["test"]}

    chars_per_token = chars_token_ratio(splits["train"], tokenizer)
    print(
        f"The character to token ratio of the dataset is: {chars_per_token:.2f}"
    )

    def packed(split, infinite):
        # Both datasets share every setting except the split and `infinite`.
        return ConstantLengthDataset(
            tokenizer,
            split,
            formatting_func=prepare_sample_text,
            infinite=infinite,
            seq_length=seq_length,
            chars_per_token=chars_per_token,
        )

    train_dataset = packed(splits["train"], True)
    valid_dataset = packed(splits["valid"], False)
    return train_dataset, valid_dataset

In [None]:
train_data, val_data = create_sft_datasets(datasets, tokenizer)

In [None]:
print(train_data)

## SFT模型加载

In [None]:
model = AutoModelForCausalLM.from_pretrained(
    model_pretrained_name_full,
    quantization_config=bnb_config if not debug_mode else None ,
    device_map = 'auto',
)
model.config.use_cache = False
model = get_peft_model(model, lm_lora_config)
model.print_trainable_parameters()
model.config.pad_token_id = model.config.eos_token_id

In [None]:
print(model)

In [None]:
# # 查看模型参数中的数据类型
for name, param in model.named_parameters():
    print(name, param.dtype)

## 设置SFT训练参数

In [None]:
# Training arguments for the SFT stage.
# max_steps = 10
eval_freq = 100
save_freq = 500
log_freq = 1
num_train_epochs = 1

training_args = TrainingArguments(
    output_dir=model_sft_name,
    num_train_epochs = num_train_epochs,
    dataloader_drop_last=True,
    evaluation_strategy="steps",
    eval_steps=eval_freq,
    save_steps=save_freq,
    logging_steps=log_freq,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=16,
    # The SFT train set is an infinite ConstantLengthDataset without a
    # length, so Trainer needs an explicit step budget; without max_steps it
    # refuses to train ("max_steps has to be specified").
    max_steps=max_steps,
    warmup_steps=50,
    gradient_accumulation_steps=8,
    learning_rate=1e-5,
    lr_scheduler_type="cosine",
    weight_decay=0.05,
    fp16=True,
    logging_first_step=True,
    # report_to="wandb"
)

In [None]:
trainer = Trainer(model=model,
                  args=training_args,
                  train_dataset=train_data,
                  eval_dataset=val_data,
                  data_collator=data_collator)


In [None]:
trainer.train()

In [None]:
# 保存预训练好的模型
model.save_pretrained(model_sft_name)
tokenizer.save_pretrained(model_sft_name)

## SFT 模型测试

In [None]:
# Smoke test of the SFT model.
# The prompt follows the template used to build the SFT samples
# ("Question: <q>\n\nAnswer: <a>"), so the model sees the format it was
# trained on.
prompt = 'Question: 轻度白内障的临床表现有些什么?\n\nAnswer: '
input_ids = tokenizer(prompt, return_tensors="pt").to(device)
output = model.generate(**input_ids, max_new_tokens=100)
# output = model.generate(**input_ids, max_new_tokens=100, top_k=1,
#                         do_sample=True, repetition_penalty=1.2)
response = tokenizer.decode(output[0], skip_special_tokens=True)
print(response)

In [None]:
del model
# del tensor
# del optimizer
if not debug_mode:
    torch.cuda.empty_cache()

In [None]:
# Reload the merged pre-training model in fp16 on the CPU, attach the SFT
# adapter, and fold the LoRA weights into it.
model = AutoModelForCausalLM.from_pretrained(
    model_pretrained_name_full, # merged pre-training model (base of the SFT adapter)
    device_map = 'cpu',
    torch_dtype=torch.float16
)

model = PeftModel.from_pretrained(
    model,
    model_sft_name, # adapter directory written by the SFT stage
    device_map='cpu',
)

# After merging, `model` is a plain model without PEFT wrappers.
model = model.merge_and_unload()

In [None]:
# 保存预训练好的模型
model.save_pretrained(model_sft_name_full)
tokenizer.save_pretrained(model_sft_name_full)

## 上传模型到Huggingface hub

In [None]:
# 登陆Huggingface， 这里的Acesse Token需要Write权限
from huggingface_hub import notebook_login
from huggingface_hub import create_repo
notebook_login()

In [None]:
# 创建仓库
create_repo("zyj_TinyLLaMA_medical_sft")

In [None]:
# 上传Model和Tokenizer
model.push_to_hub("zyj_TinyLLaMA_medical_sft")
tokenizer.push_to_hub("zyj_TinyLLaMA_medical_sft")

# RM模型训练

In [None]:
del model
# del optimizer
if not debug_mode:
    torch.cuda.empty_cache()

## 分类模型加载

In [None]:
# Reward model: a sequence-classification head with a single output
# (num_labels=1) on top of the merged pre-training model, trained via LoRA.
# NOTE(review): the PPO cell loads this adapter as `reward_adapter` on top of
# model_sft_name_full, while it is trained here on model_pretrained_name_full
# - confirm the two stages are meant to use different base models.
rm_model = AutoModelForSequenceClassification.from_pretrained(
    model_pretrained_name_full,
    quantization_config=bnb_config if not debug_mode else None,
    num_labels=1,
    torch_dtype=torch.float32)

# The tokenizer pads with EOS, so the model config must agree.
rm_model.config.pad_token_id = rm_model.config.eos_token_id
rm_model = get_peft_model(rm_model, rm_lora_config)
rm_model.print_trainable_parameters()

In [None]:
print(rm_model.score.original_module.weight.dtype)
print(rm_model.score.modules_to_save)

## RM 数据处理

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of preference pairs for reward-model training.

    Each row yields two prompts built from the same template: suffix ``j``
    holds the chosen response and suffix ``k`` the rejected one.

    Args:
        examples: batch with ``question``, ``response_chosen`` and
            ``response_rejected`` columns.

    Returns:
        Dict with ``input_ids_j``, ``attention_mask_j``, ``input_ids_k``
        and ``attention_mask_k`` lists.
    """

    def encode(question, response):
        return tokenizer("Question: " + question + "\n\nAnswer: " + response,
                         truncation=True)

    new_examples = {
        "input_ids_j": [],
        "attention_mask_j": [],
        "input_ids_k": [],
        "attention_mask_k": [],
    }
    rows = zip(examples["question"],
               examples["response_chosen"],
               examples["response_rejected"])
    for question, chosen, rejected in rows:
        encoded_pair = (("j", encode(question, chosen)),
                        ("k", encode(question, rejected)))
        for suffix, encoded in encoded_pair:
            new_examples["input_ids_" + suffix].append(encoded["input_ids"])
            new_examples["attention_mask_" + suffix].append(encoded["attention_mask"])

    return new_examples


# Load the preference pairs, tokenize them, and drop pairs that are too long.
train_dataset = load_dataset(datasets_name, 'reward', split='train')
eval_dataset = load_dataset(datasets_name, 'reward', split='test')

original_columns = train_dataset.column_names

rm_max_length = 128
# This rebinds the global `max_length`; the filters below and the reward
# collator construction read the new value.
max_length = rm_max_length

train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=original_columns
)
# Keep a pair only when both the chosen and the rejected sequence fit.
train_dataset = train_dataset.filter(lambda x: len(x[
    "input_ids_j"]) <= max_length and len(x["input_ids_k"]) <= max_length)

eval_dataset = eval_dataset.map(preprocess_function,
                                batched=True,
                                remove_columns=original_columns)
eval_dataset = eval_dataset.filter(lambda x: len(x[
    "input_ids_j"]) <= max_length and len(x["input_ids_k"]) <= max_length)


In [None]:
print(train_dataset)

In [None]:
@dataclass
class RewardDataCollatorWithPadding:
    """Pad chosen (``_j``) and rejected (``_k``) sequences into one batch.

    Each side of the pair is padded independently with ``tokenizer.pad``.

    Attributes:
        tokenizer: tokenizer whose ``pad`` method is used.
        padding: padding strategy forwarded to ``tokenizer.pad``.
        max_length: maximum length forwarded to ``tokenizer.pad``.
        pad_to_multiple_of: forwarded to ``tokenizer.pad``.
        return_tensors: tensor type forwarded to ``tokenizer.pad``.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    # Declared once. The field used to be annotated with a default of None
    # and then silently overridden by a second, un-annotated
    # `max_length = 128`; the effective default (128) is unchanged.
    max_length: Optional[int] = 128
    pad_to_multiple_of: Optional[int] = None
    return_tensors: str = "pt"

    def _pad_side(self, features: List[Dict[str, Any]], suffix: str) -> Dict[str, Any]:
        """Pad the ``input_ids``/``attention_mask`` columns of one pair side."""
        side = [
            {
                "input_ids": feature["input_ids_" + suffix],
                "attention_mask": feature["attention_mask_" + suffix],
            }
            for feature in features
        ]
        return self.tokenizer.pad(
            side,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors=self.return_tensors,
        )

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Collate tokenized preference pairs into a padded batch.

        Args:
            features: rows with ``input_ids_j``, ``attention_mask_j``,
                ``input_ids_k`` and ``attention_mask_k``.

        Returns:
            Dict with the four padded columns plus ``return_loss=True``.
        """
        batch_j = self._pad_side(features, "j")
        batch_k = self._pad_side(features, "k")
        batch = {
            "input_ids_j": batch_j["input_ids"],
            "attention_mask_j": batch_j["attention_mask"],
            "input_ids_k": batch_k["input_ids"],
            "attention_mask_k": batch_k["attention_mask"],
            "return_loss": True,
        }
        return batch


In [None]:
# Debug the collator: collate the whole train_dataset into one padded batch.
# Note that this rebinds the global `data_collator`.
data_collator = RewardDataCollatorWithPadding(tokenizer=tokenizer, max_length=max_length)
data_dc = data_collator(train_dataset)
print(data_dc['input_ids_j'].dtype)
# `data_dc` is the plain dict returned by the collator, so this loop walks
# over its keys (column names), not over mini-batches.
for i, batch in enumerate(data_dc):
    print(batch)
    print('iter:', i)
    # break


In [None]:
trainiter = iter(data_dc)
for batch in trainiter:
    print(batch)
# print(trainiter[])

In [None]:
import evaluate
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Report how often the chosen answer receives the higher reward.

    ``eval_pred`` unpacks into the stacked rewards (``rewards_j`` first,
    ``rewards_k`` second) and an unused second element. The arg-max over the
    first axis is 0 whenever ``rewards_j`` wins, so the references are all
    zeros.
    """
    reward_pairs, _unused = eval_pred
    winners = np.argmax(reward_pairs, axis=0)
    expected = np.zeros(winners.shape)
    return accuracy.compute(predictions=winners, references=expected)

In [None]:
class RewardTrainer(Trainer):
    """Trainer that optimises the InstructGPT pairwise ranking loss.

    Reference: https://arxiv.org/abs/2203.02155
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute ``-log(sigmoid(r_j - r_k))`` averaged over the batch.

        Args:
            model: reward model; element 0 of its output is used as reward.
            inputs: batch with ``input_ids_j``/``attention_mask_j`` (chosen)
                and ``input_ids_k``/``attention_mask_k`` (rejected).
            return_outputs: when True, also return both reward tensors.
            **kwargs: extra arguments some Trainer versions pass (for
                example ``num_items_in_batch``); ignored.

        Returns:
            The loss, or ``(loss, {"rewards_j": ..., "rewards_k": ...})``.
        """
        rewards_j = model(input_ids=inputs["input_ids_j"],
                          attention_mask=inputs["attention_mask_j"])[0]
        rewards_k = model(input_ids=inputs["input_ids_k"],
                          attention_mask=inputs["attention_mask_k"])[0]
        # logsigmoid is the numerically stable form of sigmoid(x).log(): it
        # does not produce -inf when the reward gap is very negative.
        loss = -nn.functional.logsigmoid(rewards_j - rewards_k).mean()
        if return_outputs:
            return loss, {"rewards_j": rewards_j, "rewards_k": rewards_k}
        return loss



# Training arguments and trainer for the reward model.
# max_steps = 10
eval_freq = 50
save_freq = 500
log_freq = 1
num_train_epochs = 2

training_args = TrainingArguments(
    # Checkpoints go to the reward-model directory; this used to point at
    # model_sft_name, mixing RM checkpoints into the SFT output folder.
    output_dir=model_rm_name,
    num_train_epochs = num_train_epochs,
    dataloader_drop_last=True,
    logging_strategy='steps',
    eval_steps=eval_freq,
    save_steps=save_freq,
    logging_steps=log_freq,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=16,
    # max_steps=max_steps,
    warmup_steps=50,
    gradient_accumulation_steps=4,
    learning_rate=1e-5,
    lr_scheduler_type="cosine",
    weight_decay=0.05,
    fp16=False,
    logging_first_step=True,
    # Keep the *_j / *_k columns: the custom collator and loss read them.
    remove_unused_columns=False,
    # logging_steps=1,
    evaluation_strategy="no",
    # report_to="wandb",
    # max_steps=10
)

# Train the model, woohoo.
trainer = RewardTrainer(
    model=rm_model,
    args=training_args,
    train_dataset=train_dataset,
    # eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    data_collator=RewardDataCollatorWithPadding(tokenizer=tokenizer,
                                                max_length=max_length),
)

In [None]:
print(train_dataset)

In [None]:
trainer.train()

In [None]:
rm_model.save_pretrained(model_rm_name)
tokenizer.save_pretrained(model_rm_name)

In [None]:
print(rm_model.config)

In [None]:
# Score two candidate answers with the trained reward model.
# The prompts follow the template the reward model was trained on
# ("Question: <q>\n\nAnswer: <a>"); scoring runs in eval mode without
# autograd so LoRA dropout does not add noise to the scores.
rm_model.eval()

prompt_chosen = 'Question: 轻度白内障的临床表现有些什么？\n\nAnswer: 轻度白内障伴玻璃体混浊'
input_chosen = tokenizer(prompt_chosen, return_tensors="pt").to(device)

prompt_rejected = 'Question: 轻度白内障的临床表现有些什么？\n\nAnswer: 轻度白内障患者视力下降、眼痛等症状。'
input_rejected = tokenizer(prompt_rejected, return_tensors="pt").to(device)

with torch.no_grad():
    score_chosen = rm_model(**input_chosen)[0]
    score_rejected = rm_model(**input_rejected)[0]

print(score_chosen)
print(score_rejected)

# RLHF训练

In [None]:
# del model
# del rm_model
if not debug_mode:
    torch.cuda.empty_cache()

## 加载模型

In [None]:
# Policy model for PPO: the merged SFT model with a value head and a fresh
# LoRA adapter; the trained reward-model adapter is loaded via `reward_adapter`.
rm_adapter_id = model_rm_name
ppo_model = AutoModelForCausalLMWithValueHead.from_pretrained(
    model_sft_name_full,
    peft_config=lm_lora_config,
    reward_adapter=rm_adapter_id,
    quantization_config=bnb_config if not debug_mode else None,
    device_map = 'auto'
)

# Continue PPO training from a previously saved PPO model:
# ppo_model = AutoModelForCausalLMWithValueHead.from_pretrained(
#     model_ppo_name,
#     peft_config=lm_lora_config,
#     quantization_config=bnb_config if not debug_mode else None,
# )
# print(ppo_model)

# Pad with EOS on both the model config and the tokenizer.
ppo_model.config.pad_token_id = ppo_model.config.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

print(ppo_model)

In [None]:
# Sampling settings for PPO rollouts.
generation_kwargs = {
    # "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.pad_token_id,
    # NOTE(review): 100_000 is larger than the tokenizer's vocab_size (55296
    # in the printed tokenizer), presumably so that generation never stops on
    # EOS - confirm.
    "eos_token_id": 100_000,
}
# Response lengths are drawn from this range by the sampler.
output_min_length = 16
output_max_length = 128
output_length_sampler = LengthSampler(output_min_length, output_max_length)

## 加载数据

In [None]:
def build_dataset(
    tokenizer,
    dataset_name=None,
):
    """Build the tokenized prompt dataset that is fed to the PPO trainer.

    Loads the ``'reward'`` configuration (``train`` split) of the dataset,
    turns every ``question`` into a ``"Question: ...Answer: "`` prompt,
    tokenizes it, and keeps only prompts with fewer than 32 input ids.

    Args:
        tokenizer: Callable such that ``tokenizer(text, truncation=True)``
            returns a mapping with an ``"input_ids"`` entry.
        dataset_name: Name or path passed to ``load_dataset``. When ``None``
            (the default) the notebook-level ``datasets_name`` is used.

    Returns:
        The mapped and filtered dataset, with the added ``query`` and
        ``input_ids`` columns and its format set to ``"torch"``.
    """
    if dataset_name is None:
        # Fall back to the dataset configured at notebook level.
        dataset_name = datasets_name
    raw = load_dataset(dataset_name, 'reward', split='train')

    def preprocess_function(examples):
        # NOTE(review): there is no separator between the question text and
        # "Answer: "; confirm this matches the prompt template used in the
        # earlier training stages.
        queries = [
            "Question: " + question + "Answer: "
            for question in examples["question"]
        ]
        return {
            "query": queries,
            "input_ids": [
                tokenizer(query, truncation=True)["input_ids"]
                for query in queries
            ],
        }

    ds = raw.map(
        preprocess_function,
        batched=True,
        num_proc=1,
    )
    # Keep short prompts only.
    ds = ds.filter(lambda x: len(x["input_ids"]) < 32, batched=False)

    ds.set_format(type="torch")
    return ds


# Build the prompt dataset with the shared tokenizer; it is handed to the PPO trainer below.
dataset = build_dataset(tokenizer)

## 加载训练参数

In [None]:

# PPO hyper-parameters, grouped by topic.
config = PPOConfig(
    # --- run length and batching ---
    steps=1000,
    batch_size=2,
    mini_batch_size=2,
    gradient_accumulation_steps=1,
    ppo_epochs=2,
    # --- optimisation ---
    learning_rate=1e-6,
    max_grad_norm=0.01,  # original note: fixes NaNs produced during generation
    optimize_cuda_cache=True,
    # --- KL control and early stopping ---
    init_kl_coef=0.2,
    adap_kl_ctrl=True,
    target_kl=0.1,
    early_stopping=True,
    # --- reproducibility ---
    seed=0,
    # model_name=model_sft_name_full,
)


def collator(data):
    """Turn a list of per-example dicts into one dict of per-key lists.

    The keys are taken from the first example, so every example in ``data``
    must provide them. Values are collected in example order; no padding or
    stacking is performed.
    """
    first = data[0]
    return {key: [example[key] for example in data] for key in first}


# No custom optimizer is supplied; `None` is passed through to the trainer.
optimizer = None
# Disabled alternative: build an Adafactor optimizer over the trainable parameters.
# if script_args.adafactor:
#     optimizer = Adafactor(
#         filter(lambda p: p.requires_grad, model.parameters()),
#         scale_parameter=False,
#         relative_step=False,
#         warmup_init=False,
#         lr=config.learning_rate,
#     )

# Assemble the PPO trainer from the config, the value-head model, the prompt
# dataset and the list-based collator. No separate reference model is passed.
ppo_trainer = PPOTrainer(
    config,
    ppo_model,
    ref_model=None,
    tokenizer=tokenizer,
    dataset=dataset,
    data_collator=collator,
    optimizer=optimizer,
)

# device = ppo_trainer.accelerator.device
# if ppo_trainer.accelerator.num_processes == 1:
#     device = 0 if torch.cuda.is_available() else "cpu"  # to avoid a ` pipeline` bug

## RLHF PPO迭代训练

In [None]:
# PPO loop: generate a response for every prompt, score prompt+response with
# the reward adapter, and run one PPO optimisation step per batch.

# `reward_baseline` and `sent_kwargs` are only referenced by the disabled
# "separate reward model" variant kept in comments inside the loop.
reward_baseline = 0.0
save_freq = 100  # save the PPO model every `save_freq` steps
sent_kwargs = {
    "return_all_scores": True,
    "function_to_apply": "none",
    "batch_size": 2,
    "truncation": True,
}

# for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
# NOTE: `epoch` counts batches yielded by the dataloader (one PPO step each).
for epoch, batch in enumerate(ppo_trainer.dataloader):
    if epoch >= config.total_ppo_epochs:
        break

    question_tensors = batch["input_ids"]

    # Sample one response per prompt; only the newly generated tokens are returned.
    response_tensors = ppo_trainer.generate(
        question_tensors,
        return_prompt=False,
        length_sampler=output_length_sampler,
        **generation_kwargs,
    )
    batch["response"] = tokenizer.batch_decode(response_tensors,
                                               skip_special_tokens=True)

    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    print(texts)

    # original separate reward model
    # pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
    # rewards = [
    #     torch.tensor(output[0]["score"] - reward_baseline)
    #     for output in pipe_outputs
    # ]

    # calculate Rewards with MARL
    # https://huggingface.co/docs/trl/multi_adapter_rl
    # trl/examples/scripts/ppo_multi_adapter.py
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(ppo_trainer.accelerator.device)
    raw_rewards = ppo_trainer.accelerator.unwrap_model(ppo_trainer.model).compute_reward_score(**inputs)
    # raw_rewards = ppo_trainer.model.compute_reward_score(**inputs)

    # One scalar per sequence: the score at the last position, scaled down by 100.
    # NOTE(review): with padding=True the last position may be a pad token if
    # the tokenizer pads on the right -- confirm the padding side.
    rewards = [raw_rewards[i, -1, 0] / 100.0 for i in range(len(raw_rewards))]
    # Replace NaN rewards with a small constant. The rewards are tensors, so the
    # check has to use torch.isnan (an isinstance(x, float) test never matches),
    # and the replacement is built as a tensor of the same dtype and device.
    rewards = [
        torch.where(torch.isnan(r), torch.full_like(r, 0.001), r)
        for r in rewards
    ]
    # print(rewards)

    stats = ppo_trainer.step(question_tensors, response_tensors, rewards)

    # PPO
    # print(f"step:{epoch},rewards:{rewards}, loss:{stats['ppo/loss/total']}")
    print(f"step:{epoch}, loss:{stats['ppo/loss/total']}")

    # Periodic save (step 0 is skipped).
    if save_freq and epoch and epoch % save_freq == 0:
        ppo_trainer.save_pretrained(model_ppo_name)

    # break

In [None]:
# Debug the multi-adapter (MARL) reward output left over from the last loop step.
print(raw_rewards.shape)
print(inputs['input_ids'].shape)
# One scaled score per sequence, read at the final position of each row.
rewards = [per_sequence[-1, 0] / 100.0 for per_sequence in raw_rewards]
print(rewards)

In [None]:
# Final save: the PPO-trained model (via the trainer) and the tokenizer go to
# the same `model_ppo_name` location.
ppo_trainer.save_pretrained(model_ppo_name)
tokenizer.save_pretrained(model_ppo_name)

In [None]:
# Smoke test of the PPO model.
# Question: What are the clinical manifestations of mild cataract?
# NOTE(review): this prompt ends with a lowercase "answer:" and no separator,
# while other cells use "Answer:" -- confirm which template the model expects.

prompt = 'Question:轻度白内障的临床表现有些什么？answer:'
# Despite its name, `input_ids` holds the full tokenizer output; it is unpacked
# as keyword arguments for `generate`.
input_ids = tokenizer(prompt, return_tensors="pt").to(device)
# Generate at most 50 new tokens and decode the first returned sequence.
output = ppo_model.generate(**input_ids, max_new_tokens=50)
response = tokenizer.decode(output[0], skip_special_tokens=True)
print(response)

In [None]:
# Bare expression: the notebook displays the trainer's `is_peft_model` attribute as the cell output.
ppo_trainer.is_peft_model