## 连接 MilvusDB

In [3]:
from pymilvus import MilvusClient

# Open the Milvus client against the server listening on localhost:19530.
client = MilvusClient(uri="http://localhost:19530")


## 查询PMID

In [8]:
import os
import time
from typing import List
from Bio import Entrez
import json

SEARCH_BATCH_SIZE = 100  # number of results requested per esearch call
SLEEP_INTERVAL = 0.2     # seconds passed to time.sleep() between requests

# NOTE(review): the contact e-mail and the NCBI API key were hard-coded here,
# which exposes a personal credential to anyone who can read this file. Both
# can now be supplied through the NCBI_EMAIL / NCBI_API_KEY environment
# variables; the literals remain only as fallbacks so existing runs behave
# the same. Rotate the key and delete the fallbacks once the environment is
# configured.
Entrez.email = os.environ.get("NCBI_EMAIL", 'q2977991823@gmail.com')
Entrez.api_key = os.environ.get("NCBI_API_KEY", 'a0d7058fac69711cdbb0c646742e89d56608')

def fetch_pubmed_ids_recent_days(query: str, days: int = 7, retmax: int = 100) -> List[str]:
    """Collect the PMIDs that PubMed returns for ``query`` over the last ``days`` days.

    A first ``esearch`` call (``retmax=0``, ``usehistory="y"``) only reads the
    total hit count and the history handles (``WebEnv`` / ``QueryKey``). The
    IDs are then paged in with ``retstart`` advancing in steps of ``retmax``,
    sleeping ``SLEEP_INTERVAL`` seconds after every page.

    Args:
        query: Search term passed to ``esearch`` as ``term``.
        days: Passed as ``reldate`` together with ``datetype="pdat"``.
        retmax: Page size, i.e. the maximum number of IDs requested per call.

    Returns:
        The PMIDs of every page, in the order the pages returned them.

    Raises:
        ValueError: If ``retmax`` is zero or negative.
    """
    if retmax <= 0:
        # range() below rejects a zero step and silently yields nothing for a
        # negative one, so fail loudly before any request is sent.
        raise ValueError(f"retmax must be a positive integer, got {retmax!r}")

    initial = Entrez.esearch(
        db="pubmed",
        term=query,
        reldate=days,
        datetype="pdat",
        retmax=0,
        usehistory="y",
        retmode="xml"
    )
    try:
        record = Entrez.read(initial)
    finally:
        # Release the response handle even when parsing fails.
        initial.close()

    total_count = int(record["Count"])
    webenv = record["WebEnv"]
    query_key = record["QueryKey"]
    print(f"📊 PubMed中最近 {days} 天共找到 {total_count} 篇文献")

    # NOTE(review): NCBI documents that PubMed esearch only serves the first
    # 10,000 hits of a query; larger result sets would need a narrower query
    # or date window — confirm before using this with broad searches.
    all_pmids = []
    for start in range(0, total_count, retmax):
        print(f"🔍 拉取第 {start} - {min(start + retmax, total_count)} 条PMID")
        handle = Entrez.esearch(
            db="pubmed",
            term=query,
            reldate=days,
            datetype="pdat",
            retstart=start,
            retmax=retmax,
            usehistory="y",
            webenv=webenv,
            query_key=query_key,
            retmode="xml"
        )
        try:
            batch = Entrez.read(handle)
        finally:
            handle.close()
        all_pmids.extend(batch["IdList"])
        # Throttle between pages to stay within the request rate limit.
        time.sleep(SLEEP_INTERVAL)

    print(f"✅ 共成功获取 {len(all_pmids)} 条PMID")
    return all_pmids

# Run the search over the last five days and show what came back.
pmid_list = fetch_pubmed_ids_recent_days("rare disease", days=5, retmax=SEARCH_BATCH_SIZE)
print("The latest 5 days pmid:", pmid_list)
print(f"[INFO] Fetched {len(pmid_list)} PMIDs from Entrez.")

📊 PubMed中最近 5 天共找到 227 篇文献
🔍 拉取第 0 - 100 条PMID
🔍 拉取第 100 - 200 条PMID
🔍 拉取第 200 - 227 条PMID
✅ 共成功获取 227 条PMID
The latest 5 days pmid: ['40640950', '40640934', '40640933', '40640932', '40640928', '40640921', '40640912', '40640857', '40640823', '40640769', '40640744', '40640699', '40640386', '40640314', '40640191', '40640065', '40639993', '40639874', '40639871', '40639783', '40639776', '40639759', '40639691', '40639476', '40639355', '40638920', '40638881', '40638880', '40638827', '40638739', '40638719', '40638015', '40638000', '40637999', '40637848', '40637790', '40637762', '40637760', '40637723', '40637719', '40637713', '40637700', '40637595', '40637405', '40637387', '40637257', '40636397', '40636342', '40636332', '40636326', '40636131', '40635565', '40635387', '40635359', '40635327', '40635258', '40635224', '40635111', '40635103', '40635052', '40635034', '40635014', '40634996', '40634972', '40634964', '40634917', '40634891', '40634889', '40634824', '40634659', '40634618', '40634490', '4

## 获取详细文献信息

In [10]:
import time
from typing import List
from Bio import Entrez
from typing import List, Dict

SEARCH_BATCH_SIZE = 100  # number of results requested per esearch call
SLEEP_INTERVAL = 0.2     # seconds passed to time.sleep() between requests
FETCH_BATCH_SIZE = 100   # number of PMIDs sent in one efetch call

# NOTE(review): the contact e-mail and the NCBI API key were hard-coded here,
# which exposes a personal credential to anyone who can read this file. Both
# can now be supplied through the NCBI_EMAIL / NCBI_API_KEY environment
# variables; the literals remain only as fallbacks so existing runs behave
# the same. Rotate the key and delete the fallbacks once the environment is
# configured.
Entrez.email = os.environ.get("NCBI_EMAIL", 'q2977991823@gmail.com')
Entrez.api_key = os.environ.get("NCBI_API_KEY", 'a0d7058fac69711cdbb0c646742e89d56608')

def fetch_pubmed_ids_recent_days(query: str, days: int = 7, retmax: int = 100) -> List[str]:
    """Collect the PMIDs that PubMed returns for ``query`` over the last ``days`` days.

    A first ``esearch`` call (``retmax=0``, ``usehistory="y"``) only reads the
    total hit count and the history handles (``WebEnv`` / ``QueryKey``). The
    IDs are then paged in with ``retstart`` advancing in steps of ``retmax``,
    sleeping ``SLEEP_INTERVAL`` seconds after every page.

    Args:
        query: Search term passed to ``esearch`` as ``term``.
        days: Passed as ``reldate`` together with ``datetype="pdat"``.
        retmax: Page size, i.e. the maximum number of IDs requested per call.

    Returns:
        The PMIDs of every page, in the order the pages returned them.

    Raises:
        ValueError: If ``retmax`` is zero or negative.
    """
    if retmax <= 0:
        # range() below rejects a zero step and silently yields nothing for a
        # negative one, so fail loudly before any request is sent.
        raise ValueError(f"retmax must be a positive integer, got {retmax!r}")

    initial = Entrez.esearch(
        db="pubmed",
        term=query,
        reldate=days,
        datetype="pdat",
        retmax=0,
        usehistory="y",
        retmode="xml"
    )
    try:
        record = Entrez.read(initial)
    finally:
        # Release the response handle even when parsing fails.
        initial.close()

    total_count = int(record["Count"])
    webenv = record["WebEnv"]
    query_key = record["QueryKey"]
    print(f"📊 PubMed中最近 {days} 天共找到 {total_count} 篇文献")

    # NOTE(review): NCBI documents that PubMed esearch only serves the first
    # 10,000 hits of a query; larger result sets would need a narrower query
    # or date window — confirm before using this with broad searches.
    all_pmids = []
    for start in range(0, total_count, retmax):
        print(f"🔍 拉取第 {start} - {min(start + retmax, total_count)} 条PMID")
        handle = Entrez.esearch(
            db="pubmed",
            term=query,
            reldate=days,
            datetype="pdat",
            retstart=start,
            retmax=retmax,
            usehistory="y",
            webenv=webenv,
            query_key=query_key,
            retmode="xml"
        )
        try:
            batch = Entrez.read(handle)
        finally:
            handle.close()
        all_pmids.extend(batch["IdList"])
        # Throttle between pages to stay within the request rate limit.
        time.sleep(SLEEP_INTERVAL)

    print(f"✅ 共成功获取 {len(all_pmids)} 条PMID")
    return all_pmids

# Run the search over the last five days and show what came back.
pmid_list = fetch_pubmed_ids_recent_days("rare disease", days=5, retmax=SEARCH_BATCH_SIZE)
print("The latest 5 days pmid:", pmid_list)
print(f"[INFO] Fetched {len(pmid_list)} PMIDs from Entrez.")

def fetch_pubmed_details(pmids: List[str], batch_size: int = 100, sleep_time: float = 0.5) -> List[Dict]:
    """Download and flatten PubMed metadata for ``pmids``.

    The IDs are sent to ``efetch`` in slices of ``batch_size``. A batch whose
    request or parsing fails is reported and skipped, and an article that
    cannot be flattened is reported and skipped, so one bad record does not
    abort the whole run.

    Args:
        pmids: PMIDs to look up.
        batch_size: Maximum number of IDs sent in one ``efetch`` call.
        sleep_time: Seconds to sleep after every batch, failed or not.

    Returns:
        One dict per successfully parsed article with the keys ``pmid``,
        ``title``, ``abstract``, ``authors``, ``doi``, ``journal``, ``year``
        and ``source``.

    Raises:
        ValueError: If ``batch_size`` is zero or negative.
    """
    if batch_size <= 0:
        # A negative step would make range() empty and silently drop every
        # PMID; a zero step raises a far less helpful error.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    results = []
    for i in range(0, len(pmids), batch_size):
        batch_pmids = pmids[i:i + batch_size]
        print(f"📥 获取文献详情: {i} - {i + len(batch_pmids)}")
        parsed = None
        try:
            handle = Entrez.efetch(
                db="pubmed",
                id=",".join(batch_pmids),
                rettype="medline",
                retmode="xml"
            )
            try:
                parsed = Entrez.read(handle)
            finally:
                # Release the response handle even when parsing fails.
                handle.close()
        except Exception as e:
            # Best effort: report the failed batch and move on to the next.
            print(f"❌ 批次失败: {e}")

        if parsed is not None:
            for article in parsed.get("PubmedArticle", []):
                try:
                    results.append(_parse_pubmed_article(article))
                except Exception as e:
                    # Best effort: a malformed record must not sink the batch.
                    print(f"⚠️ 单篇解析失败: {e}")

        # Throttle after failed batches too, so an error (for example a rate
        # limit response) is not followed by an immediate retry burst.
        time.sleep(sleep_time)
    print(f"✅ 获取完毕，总共 {len(results)} 篇")
    return results


def _parse_pubmed_article(article) -> Dict:
    """Flatten one ``PubmedArticle`` record into the dict kept in the results list."""
    medline = article["MedlineCitation"]
    article_data = medline["Article"]

    abstract_parts = article_data.get("Abstract", {}).get("AbstractText", [])
    # Authors lacking either name part are left out.
    authors = [
        f"{a['ForeName']} {a['LastName']}"
        for a in article_data.get("AuthorList", [])
        if "ForeName" in a and "LastName" in a
    ]

    # The first ELocationID tagged as a DOI wins.
    doi = ""
    for id_ in article_data.get("ELocationID", []):
        if id_.attributes.get("EIdType") == "doi":
            doi = str(id_)
            break

    journal_info = article_data.get("Journal", {})
    pub_date = journal_info.get("JournalIssue", {}).get("PubDate", {})
    pub_year = pub_date.get("Year", "")
    if not pub_year:
        # Some records carry a free-text MedlineDate (e.g. "2015 Nov-Dec")
        # instead of a Year element; recover its leading four-digit year.
        medline_date = str(pub_date.get("MedlineDate", ""))
        if medline_date[:4].isdigit():
            pub_year = medline_date[:4]

    return {
        "pmid": str(medline["PMID"]),
        "title": article_data.get("ArticleTitle", ""),
        "abstract": " ".join(str(p) for p in abstract_parts),
        "authors": ", ".join(authors),
        "doi": doi,
        "journal": journal_info.get("Title", ""),
        "year": pub_year,
        "source": "PubMed"
    }

# Fetch metadata for every PMID collected above and preview the first five entries.
records = fetch_pubmed_details(pmid_list, FETCH_BATCH_SIZE, SLEEP_INTERVAL)
print("The first 5 pubmed_details:", records[:5])

📊 PubMed中最近 5 天共找到 227 篇文献
🔍 拉取第 0 - 100 条PMID
🔍 拉取第 100 - 200 条PMID
🔍 拉取第 200 - 227 条PMID
✅ 共成功获取 227 条PMID
The latest 5 days pmid: ['40640950', '40640934', '40640933', '40640932', '40640928', '40640921', '40640912', '40640857', '40640823', '40640769', '40640744', '40640699', '40640386', '40640314', '40640191', '40640065', '40639993', '40639874', '40639871', '40639783', '40639776', '40639759', '40639691', '40639476', '40639355', '40638920', '40638881', '40638880', '40638827', '40638739', '40638719', '40638015', '40638000', '40637999', '40637848', '40637790', '40637762', '40637760', '40637723', '40637719', '40637713', '40637700', '40637595', '40637405', '40637387', '40637257', '40636397', '40636342', '40636332', '40636326', '40636131', '40635565', '40635387', '40635359', '40635327', '40635258', '40635224', '40635111', '40635103', '40635052', '40635034', '40635014', '40634996', '40634972', '40634964', '40634917', '40634891', '40634889', '40634824', '40634659', '40634618', '40634490', '4

## 嵌入模型调用

In [12]:
import requests

# Vector length the embedding service is expected to return; get_embedding()
# raises ValueError when a response has a different length.
EMBEDDING_DIM = 1024
# HTTP endpoint that get_embedding() POSTs its payload to.
EMBEDDING_API_URL = "http://192.168.10.199:7997/v1/embeddings"
# Model name sent in the "model" field of the request payload.
EMBEDDING_MODEL = "qwen3-0.6b-embedding"

def get_embedding(text: str, timeout: float = 60.0) -> List[float]:
    """Request the embedding vector for ``text`` from the embedding service.

    Args:
        text: The string to embed; sent as the single element of ``input``.
        timeout: Seconds to wait for the service before giving up.

    Returns:
        The ``embedding`` list of the first item in the response's ``data``.

    Raises:
        requests.RequestException: On connection problems, a timeout, or a
            non-success HTTP status (via ``raise_for_status``).
        ValueError: If the returned vector length differs from
            ``EMBEDDING_DIM``.
    """
    payload = {
        "input": [text],
        "model": EMBEDDING_MODEL
    }
    # Without an explicit timeout requests waits indefinitely, so an
    # unreachable service would hang the caller.
    response = requests.post(EMBEDDING_API_URL, json=payload, timeout=timeout)
    response.raise_for_status()
    embedding = response.json()["data"][0]["embedding"]
    if len(embedding) != EMBEDDING_DIM:
        raise ValueError(f"Embedding dim mismatch: {len(embedding)} != {EMBEDDING_DIM}")
    return embedding

# Smoke test: embed one sample sentence and print the resulting vector.
text = "Cobalamin C deficiency: a rare but treatable genetic cause of pulmonary hypertension."
embedding_result = get_embedding(text)
print(embedding_result)

[-0.0096435546875, 0.0257568359375, -0.01202392578125, 0.0130615234375, 0.025634765625, 0.037353515625, -0.0025787353515625, -0.0654296875, -0.017822265625, 0.03955078125, -0.043212890625, -0.05078125, 0.040771484375, -0.01202392578125, -0.059814453125, 0.06787109375, 0.0038909912109375, 0.01397705078125, -0.06884765625, -0.01519775390625, -0.044677734375, -0.004150390625, 0.00921630859375, 0.1025390625, 0.109375, 0.021240234375, -0.01068115234375, 0.05224609375, 0.03173828125, -0.051513671875, 0.016845703125, -0.0262451171875, -0.01483154296875, -0.06396484375, 0.0250244140625, -0.014404296875, -0.0038909912109375, -0.011474609375, -0.0057373046875, 0.033203125, -0.01251220703125, 0.0458984375, 0.0810546875, 0.05126953125, 0.037353515625, -0.044189453125, 0.01165771484375, -0.0498046875, 0.056396484375, 0.0228271484375, 0.0038909912109375, -0.047119140625, -0.0033111572265625, 0.03076171875, 0.04150390625, 0.0264892578125, 0.03564453125, 0.04638671875, 0.0245361328125, -0.014770507812

## 插入 Milvus