In [7]:
import copy
import logging
import numpy as np
import pandas as pd
from datetime import datetime,timedelta,date
import time
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModel, get_linear_schedule_with_warmup, \
    default_data_collator, TrainingArguments, Trainer
from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer
from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances
from scipy.stats import pearsonr, spearmanr
from transformers import AutoTokenizer
from transformers import BertConfig, BertTokenizer,BertTokenizerFast
from datasets import Dataset
from typing import List,Union,Dict
import tqdm
import re
import json
from dataclasses import dataclass
import os,time
from pprint import pprint
os.chdir('/home/stops/Work_space/NLP_work/Med_assit_chatglm')
#os.environ['CUDA_VISIBLE_DEVICES']='0'

from db_config_taiyi import DB ## load data from pgsql

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO,format='%(asctime)s - %(levelname)s - %(message)s')
logger.info('Starting')

"""
Qwen tokenizer注意事项：
1.特殊token用以给模型传递特殊信号，如到达文本末尾。 理论上，输入文本中不包含特殊token，它们仅在tokenization后由开发者手动加入。
特殊token的字面表达，如表示文本结束的<|endoftext|>，仅便于指代特殊token，不意味着它们在输入文本空间中。
2.在训练过程中，我们仅使用<|endoftext|>这一token作为sample/document之间的分隔符及padding位置占位符，
你可以将bos_id, eos_id, pad_id均指向tokenizer.eod_id。
3.Qwen模型结构类似于Llama.
4.预测从表明在【assistant+\n】之后开始
5.训练label:
   system:不预测
   assistant:预测[im_start]+[assistant_text_token]+[im_end]+nl_tokens,不预测[assistant+\n]
   注意：全程预测[im_start]+[im_end]+nl_tokens
"""

2023-12-18 13:40:29,854 - INFO : Starting


'\nQwen tokenizer注意事项：\n1.特殊token用以给模型传递特殊信号，如到达文本末尾。 理论上，输入文本中不包含特殊token，它们仅在tokenization后由开发者手动加入。\n特殊token的字面表达，如表示文本结束的<|endoftext|>，仅便于指代特殊token，不意味着它们在输入文本空间中。\n2.在训练过程中，我们仅使用<|endoftext|>这一token作为sample/document之间的分隔符及padding位置占位符，\n你可以将bos_id, eos_id, pad_id均指向tokenizer.eod_id。\n3.Qwen模型结构类似于Llama.\n4.预测从表明在【assistant+\n】之后开始\n5.训练label:\n   system:不预测\n   assistant:预测[im_start]+[assistant_text_token]+[im_end]+nl_tokens,不预测[assistant+\n]\n   注意：全程预测[im_start]+[im_end]+nl_tokens\n'

In [8]:
data_file="data/BC2_train_Official_data_format_1010.json"

with open(data_file,"r") as f:
    conversation_data_list=json.load(f)

logger.info(f"load data from : {data_file} ")
logger.info(f"data nums : {len(conversation_data_list)} ")
logger.info(f"examples : {conversation_data_list[:5]} ")

2023-12-18 13:40:30,537 - INFO : load data from : data/BC2_train_Official_data_format_1010.json 
2023-12-18 13:40:30,538 - INFO : data nums : 203 
2023-12-18 13:40:30,538 - INFO : examples : [{'id': 0, 'conversations': [{'from': 'human', 'value': '模拟问诊,你的角色是小太医医生，有丰富的医疗问诊知识和经验，如果患者有买药意图，医生回复应该追加“稍后会有医生为您开药。[开药]'}, {'from': 'gpt', 'value': '您好，我是小太医，有什么可以帮到您的？'}, {'from': 'human', 'value': '我发烧了'}, {'from': 'gpt', 'value': '您好，请告诉我一下您发热有多久了？体温最高是多少呢，目前的体温是多少？'}, {'from': 'human', 'value': '得有2天了。目前37.8度'}, {'from': 'gpt', 'value': '这段时间内除了发热之外还有其他伴随症状吗？例如鼻塞流涕、咽痛、咳嗽、咳痰等症状，\n有没有呕吐、腹泻、尿频、尿急、尿痛呢？'}, {'from': 'human', 'value': '没有，就嘴巴干'}, {'from': 'gpt', 'value': '了解了，出现这些情况之前是否有受凉、淋雨、不洁饮食等情况呢？'}, {'from': 'human', 'value': '没哦'}, {'from': 'gpt', 'value': '好的，之前是否有出现过类似问题，有无其他什么基础疾病呢？比如风湿、肿瘤、甲亢、结缔组织病等，有没有出血、大手术、烧伤等情况呢？'}, {'from': 'human', 'value': '没'}, {'from': 'gpt', 'value': '您的发热可能是由于感染引起的。根据您的症状，暂时没有其他明显的伴随症状，所以可能是一种轻微的感冒或者病毒性感染。建议您注意以下几点：\n1. 饮食方面，多喝水，保持充足的水分摄入，可以喝一些温开水、绿茶、红枣汤等，有助于缓解嘴

In [11]:
model_path="/home/stops/Work_space/NLP_models/Qwen-14B-Chat"
tokenizer=AutoTokenizer.from_pretrained(model_path,trust_remote_code=True)
logger.info(f"load model : {model_path}")

## 注意设置pad
print("Pad id: ",print(tokenizer.eod_id))
tokenizer.pad_token_id = tokenizer.eod_id
print(tokenizer.pad_token_id)
print(tokenizer.convert_ids_to_tokens([tokenizer.pad_token_id]))

2023-12-18 14:03:01,846 - INFO : load model : /home/stops/Work_space/NLP_models/Qwen-14B-Chat


151643
Pad id:  None
151643
['<|endoftext|>']


In [24]:
im_start = tokenizer.im_start_id
im_end = tokenizer.im_end_id
nl_tokens = tokenizer('\n').input_ids
_system = tokenizer('system').input_ids + nl_tokens
_user = tokenizer('user').input_ids + nl_tokens
_assistant = tokenizer('assistant').input_ids + nl_tokens

print("im_start: ",im_start)
print("im_end: ",im_end)
print("nl_tokens: ",nl_tokens)
print("_system\n: ",_system)
print("_user\n: ",_user)
print("_assistant\n: ",_assistant)

im_start:  151644
im_end:  151645
nl_tokens:  [198]
_system:  [8948, 198]
_user:  [872, 198]
_assistant:  [77091, 198]


In [12]:
print(tokenizer('<|im_start|>').input_ids)## tokenizer.im_start_id,
print(tokenizer('<|im_end|>').input_ids) ## tokenizer.im_end_id
print(tokenizer("<|endoftext|>").input_ids)
print(tokenizer.pad_token_id)

[151644]
[151645]
[151643]
151643


In [26]:
print(tokenizer('<|im_start|>user').input_ids)
print(tokenizer('<|im_start|>assistant').input_ids)


[151644, 872]
[151644, 77091]


In [27]:
"""
1.由于都是手动添加：im_start，im_end，nl_tokens，故这三个special-token不预测，但是要显性出现。
2.全部对话从[assistant+\n]开始，只预测[assistant_text_token],但不预测[assistant+\n]
"""

IGNORE_TOKEN_ID=-100

def preprocess(sources,tokenizer,max_len,system_message= "You are a helpful assistant.") -> Dict:
    roles = {"user": "<|im_start|>user", "assistant": "<|im_start|>assistant"}

    im_start = tokenizer.im_start_id
    im_end = tokenizer.im_end_id
    nl_tokens = tokenizer('\n').input_ids
    _system = tokenizer('system').input_ids + nl_tokens
    _user = tokenizer('user').input_ids + nl_tokens
    _assistant = tokenizer('assistant').input_ids + nl_tokens

    # Apply prompt templates
    input_ids, targets = [], []
    for i, source in enumerate(sources):
        ## 要求首个角色必须为User
        if roles[source[0]["from"]] != roles["user"]:
            source = source[1:]

        input_id, target = [], []
        system = [im_start] + _system + tokenizer(system_message).input_ids + [im_end] + nl_tokens
        input_id += system
        #由于都是手动添加：im_start，im_end，nl_tokens，故这三个special-token不预测。
        target += [im_start] + [IGNORE_TOKEN_ID] * (len(system)-3) + [im_end] + nl_tokens
        print("system len: ",len(system),"target len: ",len(target))
        assert len(input_id) == len(target)
        for j, sentence in enumerate(source):
            role = roles[sentence["from"]]
            ## 统一处理文本，核心是信息文本，其他token[im_start，im_end，nl_tokens]手动添加，所以模型不应该预测。
            _input_id = tokenizer(role).input_ids + nl_tokens + tokenizer(sentence["value"]).input_ids + [im_end] + nl_tokens
            input_id += _input_id

            ## 按不同角色处理不同信息文本，用户角色文本不预测，重点预测助手的文本内容。
            if role == '<|im_start|>user':
                #由于都是手动添加：im_start，im_end，nl_tokens，故这三个special-token不预测。
                _target = [im_start] + [IGNORE_TOKEN_ID] * (len(_input_id)-3) + [im_end] + nl_tokens
            elif role == '<|im_start|>assistant':
                #由于都是手动添加：im_start，im_end，nl_tokens，故这三个special-token不预测。
                #只预测[assistant_text_token],但不预测[assistant+\n]
                _target = [im_start] + [IGNORE_TOKEN_ID] * len(tokenizer(role).input_ids) + \
                    _input_id[len(tokenizer(role).input_ids)+1:-2] + [im_end] + nl_tokens
            else:
                raise NotImplementedError
            target += _target
            print("_input_id len: ",len(system),"_target len: ",len(target))
        assert len(input_id) == len(target)
        input_id += [tokenizer.pad_token_id] * (max_len - len(input_id))
        target += [IGNORE_TOKEN_ID] * (max_len - len(target))
        input_ids.append(input_id[:max_len])
        targets.append(target[:max_len])
    input_ids = torch.tensor(input_ids, dtype=torch.int)
    targets = torch.tensor(targets, dtype=torch.int)

    return dict(
        input_ids=input_ids,
        labels=targets,
        attention_mask=input_ids.ne(tokenizer.pad_token_id))


In [28]:
test_data=[[{'from': 'user', 'value': '在吗'},
            {'from': 'assistant', 'value': '在的'},
            {'from': 'user', 'value': '头痛'},
            {'from': 'assistant', 'value': '几天了'},]]
print(test_data)


[[{'from': 'user', 'value': '在吗'}, {'from': 'assistant', 'value': '在的'}, {'from': 'user', 'value': '头痛'}, {'from': 'assistant', 'value': '几天了'}]]


In [29]:
test_result_ids=preprocess(test_data,tokenizer,max_len=40)
pprint(test_result_ids)

system len:  11 target len:  11
{'attention_mask': tensor([[ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True, False, False]]),
 'input_ids': tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,
         151645,    198, 151644,    872,    198,  18493, 101037, 151645,    198,
         151644,  77091,    198,  18493,   9370, 151645,    198, 151644,    872,
            198, 109180, 151645,    198, 151644,  77091,    198, 101437,  34187,
         151645,    198, 151643, 151643]], dtype=torch.int32),
 'labels': tensor([[151644,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
         151645,    198, 151644,   -100,   -100,   -100,   -100, 151645,    198,
         151644,   -100,   -100,  18493,   9370, 151645,

In [30]:
print("nums: ",len(test_result_ids["input_ids"].numpy()[0]))
for x,y in zip(test_result_ids["input_ids"].numpy()[0],test_result_ids["labels"].numpy()[0]):
    print(x,"====>",y)


nums:  40
151644 ====> 151644
8948 ====> -100
198 ====> -100
2610 ====> -100
525 ====> -100
264 ====> -100
10950 ====> -100
17847 ====> -100
13 ====> -100
151645 ====> 151645
198 ====> 198
151644 ====> 151644
872 ====> -100
198 ====> -100
18493 ====> -100
101037 ====> -100
151645 ====> 151645
198 ====> 198
151644 ====> 151644
77091 ====> -100
198 ====> -100
18493 ====> 18493
9370 ====> 9370
151645 ====> 151645
198 ====> 198
151644 ====> 151644
872 ====> -100
198 ====> -100
109180 ====> -100
151645 ====> 151645
198 ====> 198
151644 ====> 151644
77091 ====> -100
198 ====> -100
101437 ====> 101437
34187 ====> 34187
151645 ====> 151645
198 ====> 198
151643 ====> -100
151643 ====> -100


In [31]:
for x,y in zip(test_result_ids["input_ids"].numpy()[0][:-1],test_result_ids["labels"].numpy()[0][1:]):
    x_token=tokenizer.convert_ids_to_tokens([x])[0]
    y_token=tokenizer.convert_ids_to_tokens([y])[0] if y not in [-100,198] else ""
    #print(type(x_token),x_token,type(y_token),y_token)
    if isinstance(x_token,bytes):
        x_token=x_token.decode("utf8")
    if isinstance(y_token,bytes):
        y_token=y_token.decode("utf8")
    print(r"{:<6s} : {:>6d} ----> {:<6d} : {:>6s}".format(x_token,x,y,y_token))


<|im_start|> : 151644 ----> -100   :       
system :   8948 ----> -100   :       

      :    198 ----> -100   :       
You    :   2610 ----> -100   :       
 are   :    525 ----> -100   :       
 a     :    264 ----> -100   :       
 helpful :  10950 ----> -100   :       
 assistant :  17847 ----> -100   :       
.      :     13 ----> 151645 : <|im_end|>
<|im_end|> : 151645 ----> 198    :       

      :    198 ----> 151644 : <|im_start|>
<|im_start|> : 151644 ----> -100   :       
user   :    872 ----> -100   :       

      :    198 ----> -100   :       
在      :  18493 ----> -100   :       
吗      : 101037 ----> 151645 : <|im_end|>
<|im_end|> : 151645 ----> 198    :       

      :    198 ----> 151644 : <|im_start|>
<|im_start|> : 151644 ----> -100   :       
assistant :  77091 ----> -100   :       

      :    198 ----> 18493  :      在
在      :  18493 ----> 9370   :      的
的      :   9370 ----> 151645 : <|im_end|>
<|im_end|> : 151645 ----> 198    :       

      :    198 ----> 151

In [None]:
#########################
## 推理的表达式
#########################
from typing import Tuple

## history:history.append((query, response))

def make_context(tokenizer,query: str,history: List[Tuple[str, str]] = None,system: str = "",
    max_window_size: int = 6144,chat_format: str = "chatml",):
    if history is None:
        history = []

    if chat_format == "chatml":
        im_start, im_end = "<|im_start|>", "<|im_end|>"
        im_start_tokens = [tokenizer.im_start_id]
        im_end_tokens = [tokenizer.im_end_id]
        nl_tokens = tokenizer.encode("\n")

        def _tokenize_str(role, content):
            return f"{role}\n{content}", tokenizer.encode(
                role, allowed_special=set()
            ) + nl_tokens + tokenizer.encode(content, allowed_special=set())

        system_text, system_tokens_part = _tokenize_str("system", system)
        system_tokens = im_start_tokens + system_tokens_part + im_end_tokens

        raw_text = ""
        context_tokens = []

        for turn_query, turn_response in reversed(history):
            query_text, query_tokens_part = _tokenize_str("user", turn_query)
            query_tokens = im_start_tokens + query_tokens_part + im_end_tokens
            response_text, response_tokens_part = _tokenize_str("assistant", turn_response)
            response_tokens = im_start_tokens + response_tokens_part + im_end_tokens

            next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens
            prev_chat = (f"\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}")

            current_context_size = (len(system_tokens) + len(next_context_tokens) + len(context_tokens))
            if current_context_size < max_window_size:
                context_tokens = next_context_tokens + context_tokens
                raw_text = prev_chat + raw_text
            else:
                break

        context_tokens = system_tokens + context_tokens
        raw_text = f"{im_start}{system_text}{im_end}" + raw_text
        context_tokens += (
            nl_tokens
            + im_start_tokens
            + _tokenize_str("user", query)[1]
            + im_end_tokens
            + nl_tokens
            + im_start_tokens
            + tokenizer.encode("assistant")
            + nl_tokens
        )## 表明在assistant+\n之后开始预测
        raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n"

    elif chat_format == "raw":
        raw_text = query
        context_tokens = tokenizer.encode(raw_text)
    else:
        raise NotImplementedError(f"Unknown chat format {chat_format!r}")

    return raw_text, context_tokens


In [None]:
"""注意换行符号
system
system_text
user
user_text
assistant
assistant_text
"""

"""
system_tokens   : im_start_tokens + system_token      + nl_tokens + system_text_token      +  im_end_tokens
query_tokens    : im_start_tokens + user_token        + nl_tokens + user_text_token        +  im_end_tokens
response_tokens : im_start_tokens + assistant_token   + nl_tokens + assistant_text_token   +  im_end_tokens

##loop:
next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens

context_tokens =system_tokens+next_context_tokens

"""

