In [1]:
import json
import torch
from peft import LoraConfig, TaskType, get_peft_model
from collections import defaultdict
from transformers import AutoModelForCausalLM, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the "CMeEE" configuration of the qgyd2021/chinese_ner_sft dataset.
ds = load_dataset(
    "qgyd2021/chinese_ner_sft",
    "CMeEE",
    trust_remote_code=True,
)
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'entities', 'data_source'],
        num_rows: 20000
    })
})

In [3]:
# Hold out 20% of the rows as a test split, using a fixed seed for the shuffle.
# NOTE(review): this rebinds `ds`, so re-running the cell would split the
# already-reduced train split again; re-run the load cell first.
ds = ds["train"].train_test_split(test_size=0.2, seed=42)
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'entities', 'data_source'],
        num_rows: 16000
    })
    test: Dataset({
        features: ['text', 'entities', 'data_source'],
        num_rows: 4000
    })
})

In [4]:
# Take the first three training rows as a small sample for inspection.
# NOTE(review): despite its name, `test_data` is drawn from the train split.
test_data = ds["train"].select(range(3))
test_data

Dataset({
    features: ['text', 'entities', 'data_source'],
    num_rows: 3
})

In [5]:
# Show one raw record to see the fields the preprocessing has to handle.
test_data[0]

{'text': '5.肾活体组织检查肾活体组织检查是确定HBV-GN的最终手段，是诊断HBV-GN的必备条件。',
 'entities': {'start_idx': [2, 9, 19, 34],
  'end_idx': [8, 15, 24, 39],
  'entity_text': ['肾活体组织检查', '肾活体组织检查', 'HBV-GN', 'HBV-GN'],
  'entity_label': ['pro', 'pro', 'dis', 'dis'],
  'entity_names': [['医疗程序', '医疗过程'],
   ['医疗程序', '医疗过程'],
   ['疾病', '病名', '病症'],
   ['疾病', '病名', '病症']]},
 'data_source': 'CMeEE'}

In [6]:
# Base chat model to fine-tune; the tokenizer comes from the same checkpoint id.
checkpoint = "Qwen/Qwen2.5-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Load the model weights in bfloat16.
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    torch_dtype=torch.bfloat16,
)


In [7]:
def process_dataset(example, max_length=1000, return_labels=False):
    """Convert a batch of NER rows into chat-formatted token ids for causal-LM fine-tuning.

    Intended for ``Dataset.map(..., batched=True)``. Each row becomes the
    chat-template prompt (instruction + text) followed by the JSON answer
    ``{label: [entity, ...]}`` and the tokenizer's EOS token. Rows whose full
    sequence is longer than ``max_length`` tokens are dropped, so the output
    may contain fewer rows than the input batch.

    Relies on the notebook-level ``tokenizer``.

    Args:
        example: Batch mapping with ``"text"`` and ``"entities"`` sequences of
            equal length; each ``entities`` item is indexed by
            ``"entity_text"`` and ``"entity_label"``.
        max_length: Largest number of tokens (prompt + answer + EOS) a row may
            have and still be kept.
        return_labels: When True, also return ``labels`` in which every prompt
            position is -100 (the conventional ignore index) and the answer
            positions repeat the answer token ids.
            NOTE(review): these label lists are ragged, so the collator has to
            pad them. The DataCollatorForLanguageModeling used in this notebook
            raised "Unable to create tensor" on a batch of two when ``labels``
            was present; confirm the collator before enabling this.

    Returns:
        ``{"input_ids": [...]}``, plus an aligned ``"labels"`` list when
        ``return_labels`` is True.
    """
    input_ids_list = []
    labels_list = []
    for text, entities in zip(example["text"], example["entities"]):
        # Group the entity strings under their label, preserving their order.
        grouped = defaultdict(list)
        for entity_text, entity_label in zip(entities["entity_text"], entities["entity_label"]):
            grouped[entity_label].append(entity_text)
        # ensure_ascii=False keeps the Chinese characters readable in the target text.
        answer = json.dumps(grouped, ensure_ascii=False)

        instruction = [
            {
                "role": "user",
                "content": "请你帮我提取里面的医学实体" + "\n" + text
            },
        ]
        prompt = tokenizer.apply_chat_template(instruction, tokenize=False, add_generation_prompt=True)
        prompt_ids = tokenizer(prompt)["input_ids"]
        # Terminate the answer with EOS so the model learns where to stop.
        answer_ids = tokenizer(answer)["input_ids"] + [tokenizer.eos_token_id]
        input_ids = prompt_ids + answer_ids

        # Compute is limited, so overly long samples are skipped entirely.
        if len(input_ids) > max_length:
            continue

        input_ids_list.append(input_ids)
        if return_labels:
            labels_list.append([-100] * len(prompt_ids) + answer_ids)

    batch = {"input_ids": input_ids_list}
    if return_labels:
        batch["labels"] = labels_list
    return batch

In [8]:
# Tokenize both splits in batches and drop the raw columns, so that only the
# fields returned by process_dataset remain.
tokenized_ds = ds.map(process_dataset, batched=True, remove_columns=ds["train"].column_names)
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 15991
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 3999
    })
})

In [9]:
from torch.utils.data import DataLoader

In [18]:
# Batch the tokenized training rows two at a time, collating them with the
# language-modeling collator in causal (mlm=False) mode.
dataloader = DataLoader(
    tokenized_ds["train"],
    batch_size=2,
    collate_fn=DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
        return_tensors="pt",
    ),
)
dataloader

<torch.utils.data.dataloader.DataLoader at 0x15666b69550>

In [19]:
# Pull a single collated batch and show the shape of every tensor in it.
# The batch is a mapping of tensors rather than a tensor, so it has no
# `.size()` of its own, and a bare expression inside a `for` body would not
# be displayed by the notebook anyway.
first_batch = next(iter(dataloader))
{name: tensor.shape for name, tensor in first_batch.items()}

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [13]:
# Re-display the tokenized training split (columns and row count).
tokenized_ds["train"]

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 15991
})

In [17]:
# Check the Python type of one tokenized example. Index the row first so only
# that single example is read instead of the whole `input_ids` column.
type(tokenized_ds["train"][0]["input_ids"])

list

In [9]:
# Inspect the first tokenized training example.
train_data = tokenized_ds["train"][0]
print(train_data["input_ids"])
# `labels` exists only when process_dataset emits it; guard the lookup so the
# cell does not raise KeyError when the dataset carries `input_ids` alone.
if "labels" in train_data:
    print(train_data["labels"])

[151644, 8948, 198, 2610, 525, 1207, 16948, 11, 3465, 553, 54364, 14817, 13, 1446, 525, 264, 10950, 17847, 13, 151645, 198, 151644, 872, 198, 112720, 108965, 107439, 107172, 104316, 101565, 198, 20, 13, 102512, 75606, 31914, 99877, 101071, 102512, 75606, 31914, 99877, 101071, 20412, 60610, 30725, 53, 12010, 45, 9370, 103941, 104085, 3837, 20412, 105262, 30725, 53, 12010, 45, 9370, 109457, 76095, 1773, 151645, 198, 151644, 77091, 198, 4913, 776, 788, 4383, 102512, 75606, 31914, 99877, 101071, 497, 330, 102512, 75606, 31914, 99877, 101071, 7914, 330, 4243, 788, 4383, 30725, 53, 12010, 45, 497, 330, 30725, 53, 12010, 45, 92446, 151645]
[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100

In [10]:
# Decode the ids back to text to check the chat template and the appended answer.
tokenizer.decode(train_data["input_ids"])

'<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n请你帮我提取里面的医学实体\n5.肾活体组织检查肾活体组织检查是确定HBV-GN的最终手段，是诊断HBV-GN的必备条件。<|im_end|>\n<|im_start|>assistant\n{"pro": ["肾活体组织检查", "肾活体组织检查"], "dis": ["HBV-GN", "HBV-GN"]}<|im_end|>'

In [11]:
# Wrap the base model with LoRA adapters (only the task type is set; every other
# LoRA setting is left at the library default), then report the trainable share.
# NOTE(review): `model` is rebound to the wrapped model, so re-running this cell
# would wrap it again; reload the base model before re-running.
peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 1,089,536 || all params: 1,544,803,840 || trainable%: 0.0705


In [12]:
# Training hyper-parameters: per-device batch of 1 accumulated over 4 steps,
# bf16, cosine learning-rate schedule, evaluation at the end of every epoch.
train_args = Seq2SeqTrainingArguments(
    output_dir="output/llm_result",
    learning_rate=4e-5,
    lr_scheduler_type="cosine",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    bf16=True,
    use_cpu=False,
    eval_strategy="epoch",
    logging_steps=20,
)

# The collator runs in causal-LM mode (mlm=False).
# NOTE(review): in this mode the collator appears to rebuild `labels` from
# `input_ids`, which would put the prompt tokens into the loss as well;
# confirm whether the prompt should be masked out instead.
trainer = Seq2SeqTrainer(
    model=model,
    args=train_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    processing_class=tokenizer,
)

In [13]:
# Run fine-tuning with the trainer configured in the previous cell.
trainer.train()

Epoch,Training Loss,Validation Loss
1,3.8862,0.999726
2,3.7967,0.983651




TrainOutput(global_step=11991, training_loss=4.0564709191683805, metrics={'train_runtime': 3464.4969, 'train_samples_per_second': 13.847, 'train_steps_per_second': 3.461, 'total_flos': 4.272920317488845e+16, 'train_loss': 4.0564709191683805, 'epoch': 2.9993121130635982})

In [51]:
import random
from peft import PeftModel, PeftConfig
from transformers import pipeline
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset

In [9]:
# Load from a saved checkpoint. Commented out so the notebook can be run from top to bottom.

# checkpoint = "Qwen/Qwen2.5-1.5B-Instruct"
# save_dir = "output/llm_result/checkpoint-11967"

# # peft_config = PeftConfig.from_pretrained(save_dir)
# model = AutoModelForCausalLM.from_pretrained(checkpoint)
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# peft_model = PeftModel.from_pretrained(model, save_dir)

In [16]:
# Build a text-generation pipeline around the in-memory model and tokenizer.
# The "PeftModelForCausalLM is not supported" message in the output comes from
# this call; the generation cells below still produce results.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)
pipe

Device set to use cuda:0
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'Jam

<transformers.pipelines.text_generation.TextGenerationPipeline at 0x261758e5af0>

In [54]:
# Sanity-check the fine-tuned model on a passage taken from a web search
# (i.e. not part of the dataset).
text = "艾滋病疫苗，即艾滋病病毒（HIV）疫苗，注射了它便可以在一段时间内防止艾滋病（类似乙肝疫苗的原理）。HIV疫苗被认为是预防艾滋病的最有效工具"

messages = [
    {
        "role": "user",
        "content": "请你帮我提取里面的医学实体" + "\n" + text
    }
]

# The chat messages go straight to the pipeline, so no separately rendered
# prompt string is needed. Greedy decoding keeps the output repeatable.
result = pipe(messages, max_new_tokens=1000, do_sample=False)
# The last message of the returned conversation is the assistant's reply.
result[0]["generated_text"][-1]["content"]

'{"dru": ["艾滋病疫苗", "HIV疫苗"], "dis": ["艾滋病", "乙肝"], "pro": ["注射"]}'

In [52]:
# Compare the fine-tuned model's output with the gold entities of one random
# test example.
# randrange excludes the upper bound, so the index is always valid;
# randint(0, n) can return n itself and raise IndexError.
index = random.randrange(len(ds["test"]))
sample = ds["test"][index]
text = sample["text"]
answer = sample["entities"]["entity_text"]
messages = [
    {
        "role": "user",
        "content": "请你帮我提取里面的医学实体" + "\n" + text
    }
]

# The chat messages go straight to the pipeline; greedy decoding keeps the
# output repeatable.
result = pipe(messages, max_new_tokens=1000, do_sample=False)
print(result[0]["generated_text"][-1]["content"])
print(answer)



{"dis": ["流感"]}
['流感']


In [53]:
# Compare the fine-tuned model's output with the gold entities of one random
# training example.
# randrange excludes the upper bound, so the index is always valid;
# randint(0, n) can return n itself and raise IndexError.
index = random.randrange(len(ds["train"]))
sample = ds["train"][index]
text = sample["text"]
answer = sample["entities"]["entity_text"]
messages = [
    {
        "role": "user",
        "content": "请你帮我提取里面的医学实体" + "\n" + text
    }
]

# The chat messages go straight to the pipeline; greedy decoding keeps the
# output repeatable.
result = pipe(messages, max_new_tokens=1000, do_sample=False)
print(result[0]["generated_text"][-1]["content"])
print(answer)

{"bod": ["细胞外区", "血浆"], "sym": ["蛋白水解片段", "循环于血浆中"]}
['细胞外区的蛋白水解片段', '血浆', 'GH结合蛋白']


In [56]:
# Baseline: run the same web-search passage with the LoRA adapter disabled,
# to see how the base model answers without fine-tuning.
with model.disable_adapter():
    # Passage taken from a web search (not part of the dataset).
    text = "艾滋病疫苗，即艾滋病病毒（HIV）疫苗，注射了它便可以在一段时间内防止艾滋病（类似乙肝疫苗的原理）。HIV疫苗被认为是预防艾滋病的最有效工具"

    messages = [
        {
            "role": "user",
            "content": "请你帮我提取里面的医学实体" + "\n" + text
        }
    ]

    # The chat messages go straight to the pipeline, so no separately rendered
    # prompt string is needed.
    result = pipe(messages, max_new_tokens=1000, do_sample=False)
    print(result[0]["generated_text"][-1]["content"])

以下是文档中的医学实体：

1. 艾滋病（HIV）
2. 艾滋病病毒（HIV）
3. 艾滋病疫苗
4. 艾滋病（HIV）
5. 乙肝疫苗


In [57]:
# Baseline: one random test example with the LoRA adapter disabled.
with model.disable_adapter():
    # randrange excludes the upper bound, so the index is always valid;
    # randint(0, n) can return n itself and raise IndexError.
    index = random.randrange(len(ds["test"]))
    sample = ds["test"][index]
    text = sample["text"]
    answer = sample["entities"]["entity_text"]
    messages = [
        {
            "role": "user",
            "content": "请你帮我提取里面的医学实体" + "\n" + text
        }
    ]

    # The chat messages go straight to the pipeline; greedy decoding keeps the
    # output repeatable.
    result = pipe(messages, max_new_tokens=1000, do_sample=False)
    print(result[0]["generated_text"][-1]["content"])
    print(answer)

在这个句子中，没有明确的医学实体。
['TSS', '早期积极的液体供应', '静脉', '免疫球蛋白', '糖皮质激素']


In [58]:
# Baseline: one random training example with the LoRA adapter disabled.
with model.disable_adapter():
    # randrange excludes the upper bound, so the index is always valid;
    # randint(0, n) can return n itself and raise IndexError.
    index = random.randrange(len(ds["train"]))
    sample = ds["train"][index]
    text = sample["text"]
    answer = sample["entities"]["entity_text"]
    messages = [
        {
            "role": "user",
            "content": "请你帮我提取里面的医学实体" + "\n" + text
        }
    ]

    # The chat messages go straight to the pipeline; greedy decoding keeps the
    # output repeatable.
    result = pipe(messages, max_new_tokens=1000, do_sample=False)
    print(result[0]["generated_text"][-1]["content"])
    print(answer)

在这个文本中，没有明确提到具体的医学实体。不过，可以提取出一些与疾病和治疗方法相关的词汇：

- 疟疾：这是一种传染病。
- 发作：指疾病的发作或出现。

这些词汇可能涉及到医疗领域中的概念，但它们并不是具体的医学实体。在实际应用中，需要更详细的信息来准确识别和提取医学实体。
['基因工程', '重组的蛋白质或多肽作为抗原诊断疟疾', '卧床休息', '进食营养丰富、富含维生素、易消化的食物']
