In [21]:
# embedding model 

from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

embedding = HuggingFaceEmbedding(
    model_name="Qwen/Qwen3-Embedding-0.6B",
    device="cpu",                 # 建议放顶层
    cache_folder=r"E:\local_models\huggingface\cache\hub",
    trust_remote_code=True,       # 建议放顶层
    model_kwargs={"local_files_only": False},   # 允许联网 False
)
Settings.embed_model = embedding

In [22]:
# load data  metadata 

from pathlib import Path 
import json 
from typing import List, Dict, Any, Tuple

from llama_index.core import Document, SimpleDirectoryReader, VectorStoreIndex, StorageContext, load_index_from_storage 
from llama_index.core.schema import TextNode, NodeRelationship, RelatedNodeInfo
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.vector_stores.types import VectorStoreQueryResult
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.extractors import TitleExtractor     # 需要接入llm

# text-info: full_split_struct.md (可能还不是很干净，有一些零散的干扰信息语句、对整体的而影响不大)  --后续需要优化
# figs-info: figs.json 就是结构化的图像信息
data_root = Path.cwd().parent / ".log/SimplePDF"
assert Path(data_root).is_dir()
mdfs: Path = next(Path(data_root).rglob('full_split_struct.md'), None)
assert  Path(mdfs).exists()
figs: Path = Path(mdfs).with_name("figs.json")
assert figs.is_file()

"""  figs.json
{
  "im_abs": [
    "path/to/im_abs",
    ""
    ],
  "ims_desc": {
    "1": "本实用新型整体结构示意图",
    "2": "本实用新型剖视图",
    "3": "本实用新型爆炸视图",
    "4": "本实用新型局部爆炸视图",
    "5": "本实用新型部分结构示意图（不包括保护罩总成）",
    "6": "本实用新型设置发光件的结构示意图（不包括保护罩总成）"
  },
  "ims_absp": {
    "1": "path/to/im_1",
    "2": "path/to/im_2",
    "3": "path/to/im_3",
    "4": "path/to/im_4",
    "5": "path/to/im_5",
    "6": "path/to/im_6",
  },
  "ims_bs64": {
      ""
  },
  "ims_annos": "附图标记说明：1：保护罩总成，2：足基座总成，3：足垫，..."
}

"""

# load text_  figs_
def load_md_and_figs(md_path: Path, figs_name: str = "figs.json") -> Tuple[str, Dict[str, Any]]:
    """读取单篇专利的 md 文本+figs.json（若缺失则空字典）"""
    text = md_path.read_text(encoding="utf-8", errors="ignore")
    fj = md_path.with_name(figs_name)
    figs = {}
    if fj.exists():
        with open(fj, "r", encoding="utf-8") as f:
            figs = json.load(f)
    return text, figs
text_, figs_ = load_md_and_figs(md_path=mdfs)

print("纯文本长度：", len(text_))
print("figs.json 键：", list(figs_.keys()))
print("ims_desc 示例：", list((figs_.get("ims_desc") or {}).items())[:2])

纯文本长度： 5831
figs.json 键： ['im_abs', 'ims_desc', 'ims_absp', 'ims_bs64', 'ims_annos']
ims_desc 示例： [('1', '本实用新型整体结构示意图'), ('2', '本实用新型剖视图')]


In [23]:
# node

import uuid

# Vector stores
from llama_index.vector_stores.faiss import FaissVectorStore  # faiss
from llama_index.core.vector_stores import SimpleVectorStore  # 内存

# Optional BM25
from llama_index.retrievers.bm25 import BM25Retriever  # use or not


_DIGIT_TRANS = str.maketrans("０１２３４５６７８９", "0123456789")

def _to_ascii_digits(s: str) -> str:
    return (s or "").translate(_DIGIT_TRANS)


# nodes
# chunk_size=700, chunk_overlap=128
def build_text_nodes_from_markdown(text: str, doc_id: str) -> List[TextNode]:
    splitter = SentenceSplitter(chunk_size=700, chunk_overlap=128)
    chunks = splitter.split_text(text)
    nodes: List[TextNode] = []
    for i, ch in enumerate(chunks, 1):
        nodes.append(TextNode(
            text=ch,
            id_=f"{doc_id}::text::{i}",
            metadata={"doc_id": doc_id, "node_type": "text", "chunk_idx": i},
        ))
    return nodes

def build_figure_nodes_from_figs(figs: Dict[str, Any], doc_id: str) -> List[TextNode]:
    nodes: List[TextNode] = []
    ims_desc: Dict[str, str] = figs.get("ims_desc", {}) or {}
    ims_absp: Dict[str, str] = figs.get("ims_absp", {}) or {}
    ims_bs64: Dict[str, str] = figs.get("ims_bs64", {}) or {}
    annos: str = (figs.get("ims_annos") or "").strip()

    # 摘要图
    im_abs = figs.get("im_abs") or []
    if isinstance(im_abs, list) and len(im_abs) >= 1:
        abs_path = im_abs[0] if isinstance(im_abs[0], str) else ""
        text_for_embed = "摘要图" + (f"；附图标记：{annos}" if annos else "")
        nodes.append(TextNode(
            text=text_for_embed,
            id_=f"{doc_id}::fig::abs",
            metadata={
                "doc_id": doc_id, "node_type": "figure",
                "fig_no": "abs", "fig_desc": "摘要图",
                "fig_path": abs_path, "fig_b64": "B64(omitted)" if im_abs[1:] else "",
                "fig_annos": annos,
            },
        ))

    # 普通图
    def _key_sorter(k: str) -> int:
        try: return int(_to_ascii_digits(k))
        except: return 10**9

    for k in sorted(ims_desc.keys(), key=_key_sorter):
        desc = (ims_desc.get(k) or "").strip()
        pth  = (ims_absp.get(k) or "").strip()
        text_for_embed = f"图{k}：{desc}" + (f"；附图标记：{annos}" if annos else "")
        nodes.append(TextNode(
            text=text_for_embed,
            id_=f"{doc_id}::fig::{k}",
            metadata={
                "doc_id": doc_id, "node_type": "figure",
                "fig_no": k, "fig_desc": desc,
                "fig_path": pth, "fig_b64": "B64(omitted)" if (ims_bs64.get(k) or "") else "",
                "fig_annos": annos,
            },
        ))
    return nodes


# 小测试：构建一份示例的节点数量
doc_id_demo = str(uuid.uuid5(uuid.NAMESPACE_URL, str(mdfs.resolve())))
test_nodes = build_text_nodes_from_markdown(text_, doc_id_demo) + build_figure_nodes_from_figs(figs_, doc_id_demo)
len(test_nodes), sum(1 for n in test_nodes if n.metadata["node_type"]=="figure")


(19, 7)

In [33]:
# index

import faiss

# Settings.embed_model 
Settings.llm = None

persist_dir = Path.cwd().parent / ".log/faiss_db"
persist_dir.mkdir(parents=True, exist_ok=True)
faiss_index_path = persist_dir / "faiss.index"

def _storage_triplet_ok(pdir: Path) -> bool:
    return all([
        (pdir / "docstore.json").exists(),
        (pdir / "index_store.json").exists(),
        (pdir / "faiss.index").exists(),
    ])


def build_or_load_index(md_files: List[Path]) -> Tuple[VectorStoreIndex, List[TextNode]]:
    
    # init 
    vector_store = FaissVectorStore(faiss_index=faiss.IndexFlatL2(1024))
    # vector_store = SimpleVectorStore()  # 内存

    # 若已存在存储，直接加载（加快重跑）
    if _storage_triplet_ok(persist_dir):
        storage_context = StorageContext.from_defaults(
            persist_dir=persist_dir,
            vector_store=vector_store
            )
        index = load_index_from_storage(storage_context)
        
        # 为了 BM25 或其他用途，再次扫描构建 nodes（不影响向量库）
        nodes_cache: List[TextNode] = []
        for md in md_files:
            text, figs = load_md_and_figs(md)
            doc_id = str(uuid.uuid5(uuid.NAMESPACE_URL, str(md.resolve())))
            nodes_cache += build_text_nodes_from_markdown(text, doc_id)
            nodes_cache += build_figure_nodes_from_figs(figs, doc_id)
        print(f"[load] 载入索引，缓存 nodes={len(nodes_cache)}")
        return index, nodes_cache

    # 首建：扫描数据 -> nodes
    all_nodes: List[TextNode] = []
    
    for md in md_files:
        text, figs = load_md_and_figs(md)
        doc_id = str(uuid.uuid5(uuid.NAMESPACE_URL, str(md.resolve())))
        all_nodes += build_text_nodes_from_markdown(text, doc_id)
        all_nodes += build_figure_nodes_from_figs(figs, doc_id)

    # 构建 faiss_db   
    # ## 参数persist_dir只有在加载的时候用，这里是创建 
    storage_context = StorageContext.from_defaults(vector_store=vector_store) 

    index = VectorStoreIndex(all_nodes, storage_context=storage_context, show_progress=True)
    storage_context.persist(persist_dir=str(persist_dir))
    print(f"[build] 新建索引：nodes={len(all_nodes)}")
    return index, all_nodes

index, nodes_cache = build_or_load_index(md_files=[mdfs])


LLM is explicitly disabled. Using MockLLM.


Generating embeddings:   0%|          | 0/19 [00:00<?, ?it/s]

[build] 新建索引：nodes=19


In [35]:
# retriever 
# ==== ① 向量检索参数 & 展示工具 ====
import re
from typing import List, Dict, Any
from llama_index.core import QueryBundle

# 超参数（随手改）
TOP_K = 8                       # 向量检索返回条数
VECTOR_MODE = "default"         # 可选: "default" | "mmr"
MMR_ALPHA = 0.5                 # 仅当 VECTOR_MODE="mmr" 生效，0~1，越大越多样化

# 简单预处理：看看用户是否在问“图N”
FIG_PAT = re.compile(r"图\s*([0-9０-９]+)")

def _has_fig_mention(q: str) -> List[int]:
    """检测 query 里有没有“图N”，返回命中的图号列表（半角化）"""
    nums = []
    for m in FIG_PAT.finditer(q or ""):
        s = m.group(1).translate(str.maketrans("０１２３４５６７８９","0123456789"))
        try:
            nums.append(int(s))
        except:
            pass
    return nums

def preview_hit(hit, max_chars: int = 120) -> str:
    """把 NodeWithScore 变成一行可读文本"""
    n = hit.node
    kind = n.metadata.get("node_type", "text")
    if kind == "figure":
        fig_no = n.metadata.get("fig_no","?")
        fig_desc = n.metadata.get("fig_desc","")
        fig_path = n.metadata.get("fig_path","")
        return f"[figure] 图{fig_no} - {fig_desc}  | path={fig_path}  | score={hit.score:.4f}"
    else:
        text = (n.get_content() or "").replace("\n"," ").strip()
        return f"[text] {text[:max_chars]}{'…' if len(text)>max_chars else ''}  | score={hit.score:.4f}"


In [36]:
# retriever 


# ==== ② 基础向量检索 ====
from llama_index.core.retrievers import VectorIndexRetriever

def vector_search(query: str, top_k: int = TOP_K) -> List[Any]:
    retriever = index.as_retriever(
        similarity_top_k=top_k,
        vector_store_query_mode=VECTOR_MODE,
        alpha=MMR_ALPHA if VECTOR_MODE == "mmr" else None,
    )
    hits = retriever.retrieve(QueryBundle(query))
    return hits

# --- 小测试 ---
q = "图1的结构示意图讲了什么？"
hits = vector_search(q, top_k=5)
print("Q:", q)
for i, h in enumerate(hits, 1):
    print(f"{i:>2}.", preview_hit(h))


Q: 图1的结构示意图讲了什么？
 1. [figure] 图abs - 摘要图  | path=D:\codespace\fhfeishi\raga\.log\SimplePDF\CN201721328994.5-一种机器人足端结构.pdf-e856a75f-7fe7-4813-a860-05ca62419ac7\images\88fe5b0e5b16afc42f78352f8df13f234eeb58b135cdcfdfea80134da7580363.jpg  | score=1.0469
 2. [figure] 图2 - 本实用新型剖视图  | path=D:\codespace\fhfeishi\raga\.log\SimplePDF\CN201721328994.5-一种机器人足端结构.pdf-e856a75f-7fe7-4813-a860-05ca62419ac7\images\f6f4294118dd28c8f795d7c633a1e3bd99fad6f9422678f4dc7439d64982e4a8.jpg  | score=1.1815
 3. [figure] 图4 - 本实用新型局部爆炸视图  | path=D:\codespace\fhfeishi\raga\.log\SimplePDF\CN201721328994.5-一种机器人足端结构.pdf-e856a75f-7fe7-4813-a860-05ca62419ac7\images\906093d2fd363840626568d48a84b01cc75bf18892b5b8197140496dc648bfe0.jpg  | score=1.1947
 4. [figure] 图3 - 本实用新型爆炸视图  | path=D:\codespace\fhfeishi\raga\.log\SimplePDF\CN201721328994.5-一种机器人足端结构.pdf-e856a75f-7fe7-4813-a860-05ca62419ac7\images\2649e5073d9e380fe64058ca443c4bcb2bf8d6c0183067d83a025b530a406f7e.jpg  | score=1.2036
 5. [figure] 图1 - 本实用新型整体结构示意图  | 

In [37]:
# retriever 

# 纯工程做法：用 BM25 对 nodes_cache（你前面 build 的文本/图片节点汇总）做关键词匹配，然后和向量检索融合。
# ==== ③ BM25 + 混合检索（可选） ====
try:
    from llama_index.retrievers.bm25 import BM25Retriever
    HAS_BM25 = True
except Exception:
    HAS_BM25 = False

# 融合权重（可调）
W_VEC = 0.7
W_BM25 = 0.3

def _merge_scores(vec_hits, bm_hits, top_k=TOP_K) -> List[Any]:
    """把两路结果按加权分数融合，返回前 top_k 的 NodeWithScore 列表"""
    # 归一化
    def _norm_scores(hits):
        if not hits:
            return []
        scores = [h.score for h in hits if h.score is not None]
        if not scores:
            return hits
        mx, mn = max(scores), min(scores)
        rng = (mx - mn) or 1.0
        for h in hits:
            h._norm = (h.score - mn) / rng if h.score is not None else 0.0
        return hits

    vec_hits = _norm_scores(list(vec_hits))
    bm_hits  = _norm_scores(list(bm_hits))

    pool: Dict[str, Dict[str, Any]] = {}
    def _accumulate(hits, w):
        for h in hits:
            nid = h.node.node_id
            if nid not in pool:
                pool[nid] = {"node": h.node, "score": 0.0, "v": 0.0, "b": 0.0}
            pool[nid]["score"] += w * getattr(h, "_norm", 0.0)
            pool[nid]["v"] += (w * getattr(h, "_norm", 0.0) if w==W_VEC else 0.0)
            pool[nid]["b"] += (w * getattr(h, "_norm", 0.0) if w==W_BM25 else 0.0)

    _accumulate(vec_hits, W_VEC)
    _accumulate(bm_hits,  W_BM25)

    # 排序&构造 NodeWithScore
    merged = sorted(pool.values(), key=lambda x: x["score"], reverse=True)[:top_k]
    out = []
    from llama_index.core.schema import NodeWithScore
    for it in merged:
        out.append(NodeWithScore(node=it["node"], score=it["score"]))
    return out

def hybrid_search(query: str, top_k: int = TOP_K) -> List[Any]:
    # 向量路
    vec_hits = vector_search(query, top_k=top_k)

    # 词匹配路（可选）
    if HAS_BM25 and nodes_cache:
        bm25 = BM25Retriever.from_defaults(nodes=nodes_cache, similarity_top_k=top_k)
        bm_hits = bm25.retrieve(query)
    else:
        bm_hits = []

    # 融合
    merged = _merge_scores(vec_hits, bm_hits, top_k=top_k)
    return merged

# --- 小测试 ---
q = "介绍一下图2"
hits = hybrid_search(q, top_k=8)
print("Q:", q)
for i, h in enumerate(hits, 1):
    print(f"{i:>2}.", preview_hit(h))


Q: 介绍一下图2
 1. [text] 微控制器MCU, 控制测控单元2- 4上的电信号放大芯片、三轴加速度传感器芯片、三轴陀螺仪传感器芯片和发光件2- 3, 并且微控制器通过通讯线与机器人的主控制器进行通讯。  [0036] 所述足基座2- 5的端面开设一凹槽, 所述凹槽通过粘…  | score=0.7000
 2. [figure] 图6 - 本实用新型设置发光件的结构示意图（不包括保护罩总成）  | path=D:\codespace\fhfeishi\raga\.log\SimplePDF\CN201721328994.5-一种机器人足端结构.pdf-e856a75f-7fe7-4813-a860-05ca62419ac7\images\730d3218997db726e8df83cfee8bab58bd2a80709dd3b13590f9d5dc91bc494d.jpg  | score=0.4803
 3. [figure] 图4 - 本实用新型局部爆炸视图  | path=D:\codespace\fhfeishi\raga\.log\SimplePDF\CN201721328994.5-一种机器人足端结构.pdf-e856a75f-7fe7-4813-a860-05ca62419ac7\images\906093d2fd363840626568d48a84b01cc75bf18892b5b8197140496dc648bfe0.jpg  | score=0.4416
 4. [figure] 图1 - 本实用新型整体结构示意图  | path=D:\codespace\fhfeishi\raga\.log\SimplePDF\CN201721328994.5-一种机器人足端结构.pdf-e856a75f-7fe7-4813-a860-05ca62419ac7\images\0b110668bdedeef913bee95d62f88cc6c1765ec83d8fd0cf83cbfe316729c0cf.jpg  | score=0.4342
 5. [figure] 图5 - 本实用新型部分结构示意图（不包括保护罩总成）  | path=D:\codespace\fhfeishi\raga\.log\SimplePDF\CN201721328994

In [38]:
# retriever 


# ==== ④ 高级检索：支持“图N”偏好 + 统一打印 ====
from llama_index.core.schema import NodeWithScore

PREFER_FIG_BOOST = 0.15   # 出现“图N”时，对齐编号的 figure 节点加一个小偏置

def search(query: str, top_k: int = TOP_K, use_hybrid: bool = True, prefer_fig: bool = True) -> List[NodeWithScore]:
    # 1) 基础检索
    hits = hybrid_search(query, top_k=top_k) if use_hybrid else vector_search(query, top_k=top_k)

    # 2) 如果 query 里写了“图N”，对命中同号的 figure 轻度加权（不改变原有分数体系）
    if prefer_fig:
        fig_ids = set(_has_fig_mention(query))
        if fig_ids:
            boosted = []
            for h in hits:
                n = h.node
                kind = n.metadata.get("node_type")
                if kind == "figure":
                    try:
                        no = int(str(n.metadata.get("fig_no","")).strip())
                    except:
                        no = None
                    if no in fig_ids:
                        # 构造一个新的 NodeWithScore，增加一点偏置
                        boosted.append(NodeWithScore(node=n, score=(h.score or 0.0) + PREFER_FIG_BOOST))
                        continue
                boosted.append(h)
            # 重新排序
            hits = sorted(boosted, key=lambda x: x.score or 0.0, reverse=True)[:top_k]

    return hits

def print_results(query: str, hits: List[NodeWithScore], show=10):
    print(f"Q: {query}")
    for i, h in enumerate(hits[:show], 1):
        print(f"{i:>2}.", preview_hit(h))
        # 如是配图，还可以补充打印 annos（附图标记说明）
        if h.node.metadata.get("node_type") == "figure":
            ann = h.node.metadata.get("fig_annos","")
            if ann:
                print("    └─ annos:", ann[:140] + ("…" if len(ann)>140 else ""))

# --- 小测试：包含“图1”的偏好 ---
q = "请给出图1，并说明该图的结构示意内容"
hits = search(q, top_k=6, use_hybrid=True, prefer_fig=True)
print_results(q, hits)


Q: 请给出图1，并说明该图的结构示意内容
 1. [figure] 图1 - 本实用新型整体结构示意图  | path=D:\codespace\fhfeishi\raga\.log\SimplePDF\CN201721328994.5-一种机器人足端结构.pdf-e856a75f-7fe7-4813-a860-05ca62419ac7\images\0b110668bdedeef913bee95d62f88cc6c1765ec83d8fd0cf83cbfe316729c0cf.jpg  | score=0.8091
    └─ annos: 附图标记说明：1：保护罩总成，2：足基座总成，3：足垫，2-1：传感部件，2-2：应变片，2-3：发光件，2-4：测控单元，2-5：足基座，2-1- 1：敏感部，2-1- 1-1：平行平面区域
 2. [figure] 图5 - 本实用新型部分结构示意图（不包括保护罩总成）  | path=D:\codespace\fhfeishi\raga\.log\SimplePDF\CN201721328994.5-一种机器人足端结构.pdf-e856a75f-7fe7-4813-a860-05ca62419ac7\images\c13063d537ddc01148e49c7f83d4127ab3acbea6482574365a083c7f1efcb34e.jpg  | score=0.7000
    └─ annos: 附图标记说明：1：保护罩总成，2：足基座总成，3：足垫，2-1：传感部件，2-2：应变片，2-3：发光件，2-4：测控单元，2-5：足基座，2-1- 1：敏感部，2-1- 1-1：平行平面区域
 3. [figure] 图3 - 本实用新型爆炸视图  | path=D:\codespace\fhfeishi\raga\.log\SimplePDF\CN201721328994.5-一种机器人足端结构.pdf-e856a75f-7fe7-4813-a860-05ca62419ac7\images\2649e5073d9e380fe64058ca443c4bcb2bf8d6c0183067d83a025b530a406f7e.jpg  | score=0.6394
    └─ annos: 附图标记说明：1：保护罩

In [None]:
# retriever 

from typing import Iterable, Optional

def _coerce_hits(hits: Iterable) -> List[Tuple[TextNode, float]]:
    out = []
    for h in hits:
        if hasattr(h, "node"):  # NodeWithScore
            out.append((h.node, float(getattr(h, "score", 0.0))))
        elif isinstance(h, TextNode):  # Node
            out.append((h, 0.0))
        else:
            try:
                n, s = h
                out.append((n, float(s)))
            except Exception:
                pass
    return out

def retrieve_nodes(
    index: VectorStoreIndex,
    query: str,
    top_k: int = 3, # 
    *,
    use_hybrid: bool = False, #  
    bm25_top_k: int = 5,
    bm25_nodes: Optional[List[TextNode]] = None,
    doc_id_filter: Optional[str] = None,
) -> List[Tuple[TextNode, float]]:
    if not (use_hybrid and HAS_BM25):
        retriever = index.as_retriever(similarity_top_k=top_k)
        pairs = _coerce_hits(retriever.retrieve(query))
        if doc_id_filter:
            pairs = [(n, s) for n, s in pairs if n.metadata.get("doc_id") == doc_id_filter]
        return pairs[:top_k]

    # Hybrid：BM25 + Vector 简单融合
    if bm25_nodes is None:
        bm25_nodes = nodes_cache
    bm25 = BM25Retriever.from_text_nodes(bm25_nodes, similarity_top_k=bm25_top_k)
    bm25_hits = _coerce_hits(bm25.retrieve(query))
    vec_hits  = _coerce_hits(index.as_retriever(similarity_top_k=top_k).retrieve(query))

    pool: Dict[str, Tuple[TextNode, float]] = {}
    if bm25_hits:
        max_b = max(s for _, s in bm25_hits) or 1.0
        for n, s in bm25_hits:
            pool[n.node_id] = (n, 0.6 * (s / max_b))
    if vec_hits:
        max_v = max(s for _, s in vec_hits) or 1.0
        for n, s in vec_hits:
            w = 0.7 * (s / max_v)
            if n.node_id in pool:
                old_n, old_s = pool[n.node_id]
                pool[n.node_id] = (old_n, old_s + w)
            else:
                pool[n.node_id] = (n, w)

    merged = sorted(pool.values(), key=lambda x: x[1], reverse=True)
    pairs = merged[:top_k]
    if doc_id_filter:
        pairs = [(n, s) for n, s in pairs if n.metadata.get("doc_id") == doc_id_filter]
    return pairs[:top_k]

def pretty_print_hits(pairs: List[Tuple[TextNode, float]], preview_chars: int = 260):
    for i, (n, s) in enumerate(pairs, 1):
        mt = n.metadata or {}
        ntype = mt.get("node_type", "text")
        print(f"\n[{i}] score={s:.4f}  id={n.node_id}  type={ntype}")
        if ntype == "figure":
            print(f"  图号   : {mt.get('fig_no')}")
            print(f"  描述   : {mt.get('fig_desc')}")
            print(f"  路径   : {mt.get('fig_path')}")
            ann = (mt.get('fig_annos') or "")
            if ann:
                print(f"  标记   : {(ann[:160] + '…') if len(ann)>160 else ann}")
            print("  ——向量文本——")
        print(n.get_content()[:preview_chars] + ("…" if len(n.get_content()) > preview_chars else ""))



In [None]:
# 交互式检索示例（你可反复改 query）

# 可选：只查某一份文档（按 doc_id 过滤）