In [1]:
import json
import os
import re
from difflib import get_close_matches

import jieba
import numpy as np
import openai
import pandas as pd
from openai import OpenAI
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Load every sheet of the example-dialogue workbook into a dict keyed by sheet name.
# The `with` block closes the workbook handle as soon as the sheets are parsed instead
# of leaving it open for the life of the kernel. `xls` stays bound afterwards, but it
# refers to a closed handle; downstream cells only use `sheets`.
file_path = 'D:/example.xlsx'
with pd.ExcelFile(file_path) as xls:
    sheets = {sheet_name: xls.parse(sheet_name) for sheet_name in xls.sheet_names}

# Same for the standard department list: one DataFrame per sheet, keyed by sheet name.
standard_dept_file_path = 'D:/标准科室列表.xlsx'
with pd.ExcelFile(standard_dept_file_path) as std_xls:
    std_sheets = {sheet_name: std_xls.parse(sheet_name) for sheet_name in std_xls.sheet_names}

In [3]:
# Chat client pointed at the Moonshot endpoint through the OpenAI SDK.
# The key is read from the MOONSHOT_API_KEY environment variable so it never has to be
# pasted into the notebook; when the variable is unset this falls back to the original
# empty string.
client = OpenAI(
    api_key = os.environ.get("MOONSHOT_API_KEY", ""),
    base_url = "https://api.moonshot.cn/v1",
)


In [4]:
def determine_source(new_dialog, sheets):
    """Return the name of the sheet holding the dialogue most similar to ``new_dialog``.

    Args:
        new_dialog: The query dialogue text.
        sheets: Mapping of sheet name -> DataFrame; each frame must have a
            'dialogue' column.

    Returns:
        The sheet name of the single best match by TF-IDF cosine similarity.
    """
    # Flatten all sheets into (sheet_name, dialogue) records, preserving sheet order.
    records = [
        (name, text)
        for name, frame in sheets.items()
        for text in frame['dialogue']
    ]
    corpus = pd.DataFrame(records, columns=['sheet_name', 'dialogue'])

    # Fit TF-IDF on the pooled dialogues, then project the query into the same space.
    tfidf = TfidfVectorizer()
    corpus_vectors = tfidf.fit_transform(corpus['dialogue'].astype(str))
    query_vector = tfidf.transform([new_dialog])

    # One similarity score per pooled dialogue; the row label of the maximum
    # identifies the winning sheet.
    scores = cosine_similarity(query_vector, corpus_vectors).flatten()
    return corpus.loc[scores.argmax(), 'sheet_name']


In [5]:
def find_similar_examples(new_dialog, df):
    """Return the second most similar dialogue row from ``df`` as a one-row DataFrame.

    Similarity is TF-IDF cosine similarity between ``new_dialog`` and the
    'dialogue' column. The top-ranked row is deliberately skipped and the
    runner-up returned (presumably so that a query already present in the
    sheet is not used as its own example -- TODO confirm).

    Args:
        new_dialog: The query dialogue text.
        df: DataFrame with a 'dialogue' column.

    Returns:
        ``df.iloc[[i]]`` for the runner-up row. When ``df`` has only one row
        there is no runner-up, so that single row is returned instead of
        raising IndexError.

    Raises:
        ValueError: If ``df`` has no rows.
    """
    if len(df) == 0:
        raise ValueError("find_similar_examples: df contains no dialogues to compare against")

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df['dialogue'].astype(str))
    new_dialog_tfidf = vectorizer.transform([new_dialog])
    scores = cosine_similarity(new_dialog_tfidf, tfidf_matrix)[0]

    # Positions ordered from most to least similar.
    ranked = scores.argsort()[::-1]
    chosen = ranked[1] if len(ranked) > 1 else ranked[0]
    return df.iloc[[chosen]]


In [6]:
# Generate the chain-of-thought (CoT) text for one labelled example
def generate_cot(dialogue, answer):
    """Ask the chat model for a reasoning chain that justifies a known answer.

    Args:
        dialogue: The example dialogue text; interpolated into the prompt twice.
        answer: The department(s) recorded for that dialogue; the prompt tells
            the model to recommend exactly this.

    Returns:
        The reply with blank lines dropped, every remaining line stripped, and
        the lines joined by single spaces. An empty string when the reply has
        no text content.
    """
    # The leading spaces on the continuation lines are part of the prompt text.
    prompt = f"""对话: {dialogue}
               问题: 患者应该挂什么科？
               请根据{dialogue}和{answer}来生成详细的推理过程和最终的建议，建议去的专业是{answer},输出不要包含对话，只包含推理过程和建议"""
    response = client.chat.completions.create(
        model="moonshot-v1-8k",
        messages=[
            {"role": "system", "content": "你是一名医生助手。"},
            {"role": "user", "content": prompt},
        ],
        max_tokens=600,
        temperature=0.7,
        top_p=0.9,
        n=1,
        stop=None
    )
    # The SDK types `content` as optional; treat a missing body as empty text
    # rather than failing on `.split`.
    cot = response.choices[0].message.content or ""
    lines = [line.strip() for line in cot.split('\n') if line.strip()]
    return ' '.join(lines)


In [7]:
# Build the few-shot section of the prompt
def generate_few_shot(similar_examples):
    """Render every example row as a dialogue / question / chain-of-thought block.

    Args:
        similar_examples: DataFrame with 'dialogue' and 'answer' columns.

    Returns:
        The rendered blocks concatenated directly, with no separator between
        consecutive examples. An empty string when there are no rows.
        ``generate_cot`` is called once per row, in row order.
    """
    rendered_blocks = []
    for _, example in similar_examples.iterrows():
        dialogue = example['dialogue']
        reasoning = generate_cot(dialogue, example['answer'])
        # The eight leading spaces on the 2nd and 3rd lines are part of the emitted text.
        rendered_blocks.append(
            f"对话: {dialogue}\n"
            f"        问题: 患者应该挂什么科？\n"
            f"        思维链: {reasoning}"
        )
    return "".join(rendered_blocks)


In [8]:
# Look up the candidate departments for a source sheet
def get_candidate_departments(department, std_sheets):
    """Return the first-column values of the sheet named ``department``.

    Args:
        department: Sheet name to look up.
        std_sheets: Mapping of sheet name -> DataFrame.

    Returns:
        A list of every value in the first column of the matching sheet, or
        an empty list when ``department`` is not a key of ``std_sheets``.
    """
    if department not in std_sheets:
        return []
    first_column = std_sheets[department].iloc[:, 0]
    return first_column.tolist()
    


In [9]:
# Assemble the final prompt
def generate_final_prompt(new_dialog, few_shot,departments):
    """Concatenate the few-shot text, the new dialogue, the question and the instruction.

    Args:
        new_dialog: The dialogue to be answered.
        few_shot: Pre-rendered few-shot example text; placed first.
        departments: Candidate departments; formatted into the instruction
            with their default string representation.

    Returns:
        A single string; the four parts are joined with no separator or
        newline between them.
    """
    examples_part = f"{few_shot}"
    dialog_part = f"新的对话: {new_dialog}"
    question_part = "问题: 患者应该挂什么科？"
    instruction_part = f"请根据以上示例生成详细的推理过程和最终的建议，不要包含对话，只包含推理过程和建议，最终的建议挂的科室在{departments}寻找。"
    return "".join((examples_part, dialog_part, question_part, instruction_part))


In [10]:
# Send the final prompt to the chat API
def get_api_output(prompt):
    """Send ``prompt`` to the chat model and return its reply as a single line.

    Args:
        prompt: The full user prompt text.

    Returns:
        The reply with blank lines dropped, every remaining line stripped, and
        the lines joined by single spaces. An empty string when the reply has
        no text content.
    """
    response = client.chat.completions.create(
        model="moonshot-v1-8k",
        messages=[
            {"role": "system", "content": "你是一名医生助手。"},
            {"role": "user", "content": prompt},
        ],
        max_tokens=600,
        temperature=0.7,
        top_p=0.9,
        n=1,
        stop=None
    )
    # The SDK types `content` as optional; treat a missing body as empty text
    # rather than failing on `.split`.
    api_out = response.choices[0].message.content or ""

    lines = [line.strip() for line in api_out.split('\n') if line.strip()]
    return ' '.join(lines)


In [97]:
def find_closest_department(api_out_content, candidate_departments):
    """Map the model's free-text recommendation onto one candidate department.

    The text from the first occurrence of '建议' onward (with every '建议'
    removed) is tokenised with jieba and compared against each tokenised
    candidate by TF-IDF cosine similarity.

    Args:
        api_out_content: Model output; a tuple of strings is joined into one.
        candidate_departments: Department names to choose from.

    Returns:
        "无建议内容" when the text contains no '建议'; otherwise the candidate
        with the highest similarity, or "其它" when no candidate scores above
        zero or the candidate list is empty.
    """
    # Make sure api_out_content is a single string.
    if isinstance(api_out_content, tuple):
        api_out_content = ''.join(api_out_content)

    # Locate and extract the recommendation text.
    suggestion_start = api_out_content.find('建议')
    if suggestion_start == -1:
        return "无建议内容"
    suggestion = api_out_content[suggestion_start:].replace('建议', '').strip()

    # With no candidates there is nothing to rank; np.argmax would raise on the
    # empty score array, so return the "no match" label directly.
    if len(candidate_departments) == 0:
        return "其它"

    # Word segmentation; tokens are space-joined so the vectorizer can split them.
    suggestion_tokens = ' '.join(jieba.lcut(suggestion))
    department_tokens = [' '.join(jieba.lcut(dept)) for dept in candidate_departments]

    # Row 0 is the suggestion; rows 1.. are the candidates, in input order.
    tfidf_matrix = TfidfVectorizer().fit_transform([suggestion_tokens] + department_tokens)
    vectors = tfidf_matrix.toarray()

    cosine_matrix = cosine_similarity(vectors)
    similarity_scores = cosine_matrix[0][1:]  # suggestion vs. each candidate

    # Print the scores to make debugging easier.
    for dept, score in zip(candidate_departments, similarity_scores):
        print(f"科室: {dept}, 相似度得分: {score}")

    # Pick the best-scoring candidate.
    # NOTE(review): the fallback label is "其它", while the candidate list shown in
    # this notebook spells its catch-all entry "其他" -- confirm which one is intended.
    best_match_index = np.argmax(similarity_scores)
    return candidate_departments[best_match_index] if similarity_scores[best_match_index] > 0 else "其它"


In [78]:
# def find_closest_department(api_out_content, candidate_departments):
#     # 确保 api_out_content 是一个字符串
#     if isinstance(api_out_content, tuple):
#         api_out_content = ''.join(api_out_content)
    
#     # 找到并提取建议内容
#     suggestion_start = api_out_content.find('建议')
#     if suggestion_start != -1:
#         suggestion = api_out_content[suggestion_start:].replace('建议', '').strip()
#     else:
#         return "无建议内容"
#     #print(suggestion)
#     # 分词
#     #suggestion_tokens = ' '.join(jieba.lcut(suggestion))
#     #department_tokens = [' '.join(jieba.lcut(dept)) for dept in candidate_departments]
#     matched_departments = []
#     for token in candidate_departments:
#         if token in suggestion:
#             matched_departments.append(token)
#     return matched_departments

In [12]:
# Save the result as a JSON file
def save_to_json(new_dialog, departments, few_shot, api_output, answer_department, source, file_path,
                 sample_id="sample_12", unescape_newlines=True):
    """Write one sample record to ``file_path`` as UTF-8, human-readable JSON.

    Layout: one top-level key per line (4-space indent); list values are
    written on a single line with no space after the commas.

    Args:
        new_dialog: Stored under "dialogue".
        departments: Stored under "department".
        few_shot: Stored under "few_shot".
        api_output: Stored under "api_output".
        answer_department: Stored under "answer_department".
        source: Stored under "source".
        file_path: Destination path; overwritten if it exists.
        sample_id: Stored under "sample_id". Defaults to the previously
            hard-coded "sample_12".
        unescape_newlines: When True (default, the original behaviour) every
            escaped ``\\n`` in the serialised text is replaced by a real line
            break. This is easier to read, but a string containing a raw line
            break is not strict JSON, so ``json.load`` will reject such a file
            unless called with ``strict=False``. Pass False to write strict,
            round-trippable JSON.
    """
    data = {
        "dialogue": new_dialog,
        "department": departments,
        "few_shot": few_shot,
        "question": "患者应该挂什么科？",
        "api_output": api_output,
        "answer_department": answer_department,
        "sample_id": sample_id,
        "source": source
    }

    # Render each value on its own. Lists are compacted by how they are serialised
    # rather than by a regex over the whole document, so square brackets or double
    # spaces that occur inside string values are left untouched.
    entries = []
    for key, value in data.items():
        if isinstance(value, (list, tuple)):
            rendered = json.dumps(value, ensure_ascii=False, separators=(',', ': '))
        else:
            # Re-indent nested output one level; serialised strings contain no raw
            # newlines at this point, so this only touches structural line breaks.
            rendered = json.dumps(value, ensure_ascii=False, indent=4).replace('\n', '\n    ')
        entries.append(f'    {json.dumps(key, ensure_ascii=False)}: {rendered}')
    formatted_json = '{\n' + ',\n'.join(entries) + '\n}'

    if unescape_newlines:
        formatted_json = formatted_json.replace('\\n', '\n')

    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(formatted_json)



In [81]:
# Example query: a single patient utterance to be routed to a department.
new_dialog = "患者：2岁小孩嘴里长了很多小泡，应该去哪个科？"

In [82]:
# Pick the sheet whose dialogue is most similar to the query (TF-IDF cosine similarity).
source = determine_source(new_dialog, sheets)

In [83]:
# Display the selected sheet name.
source

'专科-口腔'

In [84]:
# Retrieve one example row (the second-ranked match) from the selected sheet.
similar_examples = find_similar_examples(new_dialog, sheets[source])

In [85]:
# Display the retrieved example row.
similar_examples

Unnamed: 0,dialogue,answer
5,患者：男，41岁，口气比较严重，应该去哪个科？\n医生：了解了，请问您这种口气问题大概持续了...,牙周病专业|牙体牙髓病专业


In [86]:
# Render the few-shot text; this calls the chat API once per example row.
few_shot = generate_few_shot(similar_examples)


In [87]:
# Candidate departments: first column of the same-named sheet in the standard
# department workbook (an empty list if that workbook has no such sheet).
departments = get_candidate_departments(source, std_sheets)


In [88]:
# Display the candidate department list.
departments

['牙体牙髓病专业',
 '牙周病专业',
 '口腔粘膜病专业',
 '儿童口腔专业',
 '口腔颌面外科专业',
 '口腔外科',
 '口腔颅颌面科',
 '口腔头颈颌面肿瘤科',
 '口腔修复专业',
 '口腔正畸专业',
 '口腔种植专业',
 '口腔麻醉专业',
 '口腔颌面医学影像专业',
 '口腔病理专业',
 '预防口腔专业',
 '其他']

In [89]:
# Combine the few-shot text, the query and the candidate list into the final prompt.
final_prompt = generate_final_prompt(new_dialog, few_shot,departments )

In [90]:
# Send the final prompt to the chat API; the reply comes back flattened to one line.
api_out = get_api_output(final_prompt)


In [91]:
# Display the raw model reply.
api_out


'推理过程： 1. 患者为2岁小孩，主诉为嘴里长了很多小泡，这可能是口腔黏膜的病变。 2. 口腔内小泡可能是由多种原因引起的，包括病毒感染（如手足口病）、细菌感染、过敏反应、营养不良等。 3. 需要专业的口腔医生进行详细检查，以确定小泡的性质和原因。 4. 考虑到患者是儿童，儿童口腔专业的医生更了解儿童口腔的特点和常见问题，因此更可能给出准确的诊断和治疗建议。 最终建议： 患者应该挂儿童口腔专业进行进一步的检查和治疗。医生可能会进行口腔检查，评估小泡的性质，必要时进行相应的治疗，并提供预防措施和口腔护理指导，以避免类似情况的再次发生。如果儿童口腔专业无法解决，可能需要转诊至口腔粘膜病专业进一步诊治。'

In [98]:
# Map the free-text recommendation onto one candidate department; the per-candidate
# similarity scores are printed as a side effect.
answer_department = find_closest_department(api_out, departments)


科室: 牙体牙髓病专业, 相似度得分: 0.04275800127276682
科室: 牙周病专业, 相似度得分: 0.05842644875703232
科室: 口腔粘膜病专业, 相似度得分: 0.1743620336043013
科室: 儿童口腔专业, 相似度得分: 0.3429991525678351
科室: 口腔颌面外科专业, 相似度得分: 0.1165506201993513
科室: 口腔外科, 相似度得分: 0.0874254200060733
科室: 口腔颅颌面科, 相似度得分: 0.10280740046861556
科室: 口腔头颈颌面肿瘤科, 相似度得分: 0.05127210952313436
科室: 口腔修复专业, 相似度得分: 0.1280867406207698
科室: 口腔正畸专业, 相似度得分: 0.1280867406207698
科室: 口腔种植专业, 相似度得分: 0.1280867406207698
科室: 口腔麻醉专业, 相似度得分: 0.1280867406207698
科室: 口腔颌面医学影像专业, 相似度得分: 0.108657359377587
科室: 口腔病理专业, 相似度得分: 0.1280867406207698
科室: 预防口腔专业, 相似度得分: 0.1280867406207698
科室: 其他, 相似度得分: 0.0


In [99]:
# Display the matched department.
answer_department

'儿童口腔专业'

In [50]:
# Output path for the JSON record.
# NOTE(review): this rebinds `file_path`, which the data-loading cell used for the
# input workbook path -- a separate name would avoid the reuse.
file_path = 'D:/dialog-T.json'

In [51]:
# Write the query, candidates, few-shot text, model reply and matched department to disk.
save_to_json(new_dialog, departments, few_shot, api_out, answer_department, source, file_path)