In [1]:
from pymilvus import connections, Collection, FieldSchema, CollectionSchema, DataType, utility
import pandas as pd

  from pandas.core import (


In [2]:
from pymilvus import connections, Collection, utility

# Configuration: the single place to edit when the Milvus deployment changes
MILVUS_HOST = "localhost"
MILVUS_PORT = 19530
COLLECTION_NAME = "snomed"
DIM = 1024  # embedding vector dimension
USING = "testing"  # connection alias shared by connect() and Collection()

# 1. Open the connection.
#    Host and port are taken from the constants above; previously they were
#    hard-coded a second time here, so changing the constants had no effect.
connections.connect(
    alias=USING,
    host=MILVUS_HOST,
    port=MILVUS_PORT,
)

# 2. Get a handle to the configured collection through the same alias
collection = Collection(COLLECTION_NAME, using=USING)

# 3. Load the collection into memory (required before querying)
collection.load()


In [3]:
# Inspect the collection: how many rows and which columns it has

# Row count (number of entities)
num_rows = collection.num_entities
print(f"行数（记录数）: {num_rows}")

# Column count (number of schema fields); the field list is read once and reused below
schema_fields = collection.schema.fields
num_cols = len(schema_fields)
print(f"列数（字段数）: {num_cols}")

# List every field as "<position>. <name> (<dtype>)", positions starting at 1
print("\n所有字段:")
for i, field in enumerate(schema_fields, start=1):
    print(f"{i}. {field.name} ({field.dtype})")

行数（记录数）: 406993
列数（字段数）: 11

所有字段:
1. id (5)
2. vector (101)
3. concept_id (21)
4. concept_name (21)
5. domain_id (21)
6. vocabulary_id (21)
7. concept_class_id (21)
8. standard_concept (21)
9. concept_code (21)
10. valid_start_date (21)
11. valid_end_date (21)


加载embedding模型

In [4]:
import torch
from sentence_transformers import SentenceTransformer

In [5]:
# Use the first CUDA device when torch reports one, otherwise run on CPU
embedding_device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Active embedding model is BAAI/bge-m3.
# Alternative kept for reference: 'Alibaba-NLP/gte-Qwen2-1.5B-instruct'
em_model = SentenceTransformer(
    model_name_or_path='BAAI/bge-m3',
    device=embedding_device,
    trust_remote_code=True,
    model_kwargs={'dtype': torch.bfloat16},  # use bfloat16
)

查询

In [None]:
# 1. Search by vector similarity

# Input text. Another query tried here earlier: "postdiphtheria".
# Note: the embedding model has some cross-lingual retrieval ability, but it is
# less precise than querying in the concepts' native language.
query = "糖尿病"

# Embed the query text and convert the single resulting vector to a plain list
q_em = em_model.encode([query], show_progress_bar=False)
q_vector = q_em[0].tolist()

# Run the similarity search against the "vector" field
results = collection.search(
    data=[q_vector],
    anns_field="vector",
    param={
        "metric_type": "COSINE",
        "params": {"ef": 64}
    },
    limit=5,
    output_fields=["concept_id", "concept_name", "domain_id"]
)


# Flatten the hits into a DataFrame: one row per hit, ranked from 1 within each
# query's hit list, with every returned entity field unpacked into its own column.
df = pd.DataFrame([
    {'rank': i, 'score': hit.distance, 'id': hit.id,
     **{k: hit.entity.get(k) for k in hit.entity.fields}}
    for hits in results for i, hit in enumerate(hits, 1)
]).round({'score': 4})

df.head()

Unnamed: 0,rank,score,id,concept_id,concept_name,domain_id
0,1,0.8353,462511009688506132,201820,Diabetes mellitus,Condition
1,2,0.7591,462511009688462306,201254,Type 1 diabetes mellitus,Condition
2,3,0.7524,462511009688624601,4202383,Drug-induced diabetes mellitus,Condition
3,4,0.7523,462511009688643321,37311673,Hyperglycemia due to diabetes mellitus,Condition
4,5,0.7523,462511009688113340,4052041,Diabetic diet,Observation


In [6]:
# 2. Exact-match keyword query: filter on an exact concept_name value
exact_name_filter = "concept_name == 'Periungual fibroma'"
returned_fields = ["concept_name", "concept_class_id", "domain_id"]

query = collection.query(
    expr=exact_name_filter,
    limit=5,
    output_fields=returned_fields,
)

# The result is a list of dicts, so it converts to a DataFrame directly
df = pd.DataFrame(query)
df.head()

Unnamed: 0,concept_name,concept_class_id,domain_id,id
0,Periungual fibroma,Disorder,Condition,462511009688590665


In [7]:
# Conditional query: first rows matching a scalar filter, with all fields returned
query = collection.query(
    output_fields=["*"],
    expr="id >= 0",
    limit=10,
)

# The result is a list of dicts, so it converts to a DataFrame directly
df = pd.DataFrame(query)
df.head()

Unnamed: 0,vector,id,concept_class_id,concept_id,concept_name,domain_id,vocabulary_id,standard_concept,concept_code,valid_start_date,valid_end_date
0,"[-0.048095703125, -0.00555419921875, -0.023925...",462511009688045249,Clinical Finding,42538812,Somatic hallucination,Condition,SNOMED,S,762620006,20180131,20991231
1,"[0.006256103515625, 0.0184326171875, -0.032714...",462511009688045250,Disorder,4084170,Non-allergic anaphylaxis caused by whole blood,Condition,SNOMED,S,241944009,20020131,20991231
2,"[-0.048828125, 0.01458740234375, -0.0161132812...",462511009688045251,Clinical Finding,4085530,Unformed visual hallucinations,Condition,SNOMED,S,247733004,20020131,20991231
3,"[-0.02001953125, -0.0067138671875, -0.01000976...",462511009688045252,Clinical Finding,4085038,Formed visual hallucinations,Condition,SNOMED,S,247734005,20020131,20991231
4,"[-0.0537109375, -0.0034027099609375, -0.029907...",462511009688045253,Clinical Finding,4085531,Scenic visual hallucinations,Condition,SNOMED,S,247735006,20020131,20991231


名词解释场景

In [None]:
import os

import requests
from dotenv import load_dotenv

# Let python-dotenv populate the environment before the values are read below
load_dotenv()

# LLM endpoint settings, read from environment variables (None when unset)
api_url = os.environ.get("LLM_URL")
api_key = os.environ.get("LLM_API_KEY")
model = os.environ.get("LLM_MODEL")

In [28]:
# ===== Chat helper
def chat(messages, timeout=60):
    """Send a chat request to the configured LLM endpoint and return the reply text.

    The request is a JSON POST to ``api_url`` carrying the module-level ``model``
    name and the given messages, authenticated with ``api_key`` as a Bearer token.

    Args:
        messages: List of message dicts (e.g. ``{"role": ..., "content": ...}``)
            sent unchanged as the ``"messages"`` field of the request body.
        timeout: Seconds passed to ``requests.post`` as its timeout. Without it
            a stalled endpoint would block the notebook kernel indefinitely.

    Returns:
        The ``content`` of the first choice's message in the JSON response.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        requests.Timeout: If the request does not complete within ``timeout``.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    payload = {
        "model": model,
        "messages": messages
    }

    response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)

    # Fail with the real HTTP error instead of a later KeyError on "choices"
    # when the endpoint returns an error body.
    response.raise_for_status()

    return response.json()["choices"][0]["message"]["content"]

# ===== Search
# Input text. Example queries tried in this cell earlier:
#   "乙醇", "氨基酸", "剪刀", "格拉斯哥抗精神病药物副作用量表测量"
# Note: the embedding model has some cross-lingual retrieval ability, but it is
# less precise than querying in the concepts' native language.
query = "从头部或颈部皮肤上取出玻璃碎片"

# Embed the query text and convert the single resulting vector to a plain list
q_em = em_model.encode([query], show_progress_bar=False)
q_vector = q_em[0].tolist()

# Fetch only the single best match
results = collection.search(
    data=[q_vector],
    anns_field="vector",
    param={
        "metric_type": "COSINE",
        "params": {"ef": 64}
    },
    limit=1,
    output_fields=["concept_name", "concept_class_id", "domain_id"]
)

# Stop with a clear message when nothing matched, rather than an IndexError
# from results[0][0] below.
if len(results) == 0 or len(results[0]) == 0:
    raise ValueError(f"No concept found for query: {query!r}")


# ===== LLM polish
top_hit = results[0][0]
concept_name = top_hit.entity.concept_name
concept_class_id = top_hit.entity.concept_class_id
domain_id = top_hit.entity.domain_id
print(concept_name)
print(concept_class_id)
print(domain_id)

# The system prompt asks the model to translate the concept description into Chinese
messages = [
    {"role": "system", "content": "翻译成中文"},
]
user_input = f"概念名称：{concept_name}，概念类型：{concept_class_id}，领域：{domain_id}"
messages.append({"role": "user", "content": user_input})  # add to the message history
answer = chat(messages)
print(answer)

Removal of glass from skin of head or neck
Procedure
Procedure
概念名称：从头部或颈部皮肤上移除玻璃，概念类型：操作，领域：操作
