In [1]:
# embedding model 

from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

embedding = HuggingFaceEmbedding(
    model_name="Qwen/Qwen3-Embedding-0.6B",
    device="cpu",                 # 建议放顶层
    cache_folder=r"E:\local_models\huggingface\cache\hub",
    trust_remote_code=True,       # 建议放顶层
    model_kwargs={"local_files_only": True},   # 允许联网 False
)
Settings.embed_model = embedding

In [None]:
# llm

# local huggingface llm 
model_name = "Qwen/Qwen3-1.7B"
from llama_index.llms.huggingface import HuggingFaceLLM 
from llama_index.core import Settings 

local_llm = HuggingFaceLLM(
    model_name=model_name,
    tokenizer_name=model_name,
    context_window=1400,
    max_new_tokens=300,
    generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95},
    device_map='cpu'
)

# chat model  api-key 
# openai  pypi    

from openai import OpenAI 
import os 
from dotenv import load_dotenv
load_dotenv()

DEEPSEEK_API_KEY = os.getenv("GLM_API_KEY")   # https://api.deepseek.com
QWEN_API_KEY = os.getenv("GLM_API_KEY")       # https://dashscope.aliyuncs.com/compatible-mode/v1

# client
client = OpenAI(
    api_key=QWEN_API_KEY,
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)

In [None]:
# load data  metadata 

from pathlib import Path 
import json 
from typing import List, Dict, Any, Tuple

from llama_index.core import Document, SimpleDirectoryReader, VectorStoreIndex, StorageContext, load_index_from_storage 
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.vector_stores.types import VectorStoreQueryResult
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.extractors import TitleExtractor     # 需要接入llm

# text-info: full_split_struct.md (可能还不是很干净，有一些零散的干扰信息语句，影响不大)  --后续需要优化
# figs-info: figs.json 就是结构化的图像信息
data_root = Path.cwd().parent / ".log/SimplePDF"
assert Path(data_root).is_dir()
mdfs: Path = next(Path(data_root).rglob('full_split_struct.md'), None)
assert  Path(mdfs).exists()
figs: Path = Path(mdfs).with_name("figs.json")
assert figs.is_file()

"""  figs.json
{
  "im_abs": [
    "path/to/im_abs",
    ""
    ],
  "ims_desc": {
    "1": "本实用新型整体结构示意图",
    ...
  },
  "ims_absp": {
    "1": "path/to/im_1",
    ...
  },
  "ims_bs64": {
      "1": "bs64xx",
      ...
  },
  "ims_annos": "附图标记说明：1：保护罩总成，2：足基座总成，3：足垫，..."
}
"""

# load text_  figs_
def load_md_and_figs(md_path: Path, figs_name: str = "figs.json") -> Tuple[str, Dict[str, Any]]:
    """读取单篇专利的 md 文本+figs.json（若缺失则空字典）"""
    text = md_path.read_text(encoding="utf-8", errors="ignore")
    fj = md_path.with_name(figs_name)
    figs: Dict = {}
    if fj.exists():
        with open(fj, "r", encoding="utf-8") as f:
            figs = json.load(f)
    return text, figs
text_, figs_ = load_md_and_figs(md_path=mdfs)

print("纯文本长度：", len(text_))
print("figs.json 键：", list(figs_.keys()))
print("ims_desc 示例：", list((figs_.get("ims_desc") or {}).items())[:3])
print("ims_absp 示例：", list((figs_.get("ims_desc") or {}).items())[:3])
print("ims_annos 示例：", [(figs_.get("ims_annos") or "")])


纯文本长度： 5831
figs.json 键： ['im_abs', 'ims_desc', 'ims_absp', 'ims_bs64', 'ims_annos']
ims_desc 示例： [('1', '本实用新型整体结构示意图'), ('2', '本实用新型剖视图'), ('3', '本实用新型爆炸视图')]
ims_absp 示例： [('1', '本实用新型整体结构示意图'), ('2', '本实用新型剖视图'), ('3', '本实用新型爆炸视图')]
ims_annos 示例： ['附图标记说明：1：保护罩总成，2：足基座总成，3：足垫，2-1：传感部件，2-2：应变片，2-3：发光件，2-4：测控单元，2-5：足基座，2-1- 1：敏感部，2-1- 1-1：平行平面区域']


In [None]:
# node

import uuid

# Vector stores
from llama_index.vector_stores.faiss import FaissVectorStore  # faiss
from llama_index.core.vector_stores import SimpleVectorStore  # 内存

# Optional BM25
from llama_index.retrievers.bm25 import BM25Retriever  # use or not

from llama_index.core.schema import TextNode, NodeRelationship, RelatedNodeInfo
"""  TextNode

class MetadataMode(str, Enum):
    ALL = "all"
    EMBED = "embed"
    LLM = "llm"
    NONE = "none"

class TextNode(BaseNode):
    text:   for embedding
    metadata: for embedding 
    
    get_content(metadata_mode: MetadataMode)
        - 先根据 mode 通过 get_metadata_str(mode) 生成元数据串
        - 如果 mode != NONE 且有元数据，就用 text_template 把 content（= self.text）和 metadata_str 拼起来返回；否则直接返回 self.text。

    get_text() == get_content(metadata_mode=MetadataMode.NONE)
    get_metadata_str(mode) : ALL, NONE, LLM, EMBED

    text_template: 控制当需要把元数据拼到正文里时的格式（默认是“content + metadata_str”）。
    hash: 基于 text + metadata 计算；用于去重等（不是节点 ID）。
"""


def _to_ascii_digits(s: str) -> str:
    _DIGIT_TRANS = str.maketrans("０１２３４５６７８９", "0123456789")
    return (s or "").translate(_DIGIT_TRANS)

# nodes
# chunk_size=700, chunk_overlap=128  # 著录信息是最重要的一段  对应于pdf的第一页， markdown的第一部分
def build_text_nodes_from_markdown(text: str, doc_id: str, 
                                   chunk_size: int=700, 
                                   chunk_overlap: int=128) -> List[TextNode]:
    splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = splitter.split_text(text)
    nodes: List[TextNode] = []
    for i, ch in enumerate(chunks, 1):
        nodes.append(TextNode(
            text=ch,
            id_=f"{doc_id}::text::{i}",
            metadata={"doc_id": doc_id, "node_type": "text", "chunk_idx": i},
        ))
    return nodes

def build_figure_nodes_from_figs(figs: Dict[str, Any], doc_id: str) -> List[TextNode]:
    nodes: List[TextNode] = []
    # key: value
    ims_desc: Dict[str, str] = figs.get("ims_desc", {}) or {}
    ims_absp: Dict[str, str] = figs.get("ims_absp", {}) or {}
    ims_bs64: Dict[str, str] = figs.get("ims_bs64", {}) or {}
    annos: str = (figs.get("ims_annos") or "").strip()

    # 摘要图
    im_abs = figs.get("im_abs") or []
    if isinstance(im_abs, list) and len(im_abs) >= 1:
        abs_path = im_abs[0] if isinstance(im_abs[0], str) else ""
        # text_for_embed = "摘要图" + (f"；附图标记：{annos}" if annos else "")    # TextNode(text=...) 的 text 字段就是喂给 embedding 模型用于建库/检索的内容。
        text_for_embed = "摘要" 
        nodes.append(TextNode(
            text=text_for_embed,
            id_=f"{doc_id}::fig::abs",
            metadata = {"doc_id": doc_id, 
                        "node_type": "figure",
                        "fig_no": "abs",
                        "fig_desc": "摘要图",
                        "fig_path": abs_path,
                        "fig_b64": "B64(omitted)" if im_abs[1:] else "",
                        "fig_annos": annos,
                        "display_text": "【摘要图】",   # 前端用
                        },
            excluded_embed_metadata_keys = ["fig_path", "fig_b64", "fig_annos", "display_text"],     # 不参与embedding的metadata字段
            excluded_llm_metadata_keys = ["fig_b64"],       # 不参与llm生成的metadata字段
        ))

    # 普通图
    def _key_sorter(k: str) -> int:
        try: return int(_to_ascii_digits(k))
        except: return 10**9

    for k in sorted(ims_desc.keys(), key=_key_sorter):
        desc = (ims_desc.get(k) or "").strip()
        pth  = (ims_absp.get(k) or "").strip()
        # text_for_embed = f"图{k}：{desc}" + (f"；附图标记：{annos}" if annos else "")
        text_for_embed = f"图{k}为{desc}" 
        nodes.append(TextNode(
            text=text_for_embed,       #  text + metadata -> embed
            id_=f"{doc_id}::fig::{k}",
            metadata={
                "doc_id": doc_id, 
                "node_type": "figure",
                "fig_no": k, 
                "fig_desc": desc,
                "fig_path": pth, 
                "fig_b64": "B64(omitted)" if (ims_bs64.get(k) or "") else "",
                # "fig_annos": annos,  
                "display_text":f"【图{k} {desc}】",   # 前端用  
            },
            excluded_embed_metadata_keys = ["fig_path", "fig_b64", "fig_annos", "display_text"],     # 不参与embedding的metadata字段
            excluded_llm_metadata_keys = ["fig_b64"],       # 不参与llm生成的metadata字段
        ))
    return nodes


# 小测试：构建一份示例的节点数量
doc_id_demo = str(uuid.uuid5(uuid.NAMESPACE_URL, str(mdfs.resolve())))
test_nodes = build_text_nodes_from_markdown(text_, doc_id_demo) + build_figure_nodes_from_figs(figs_, doc_id_demo)
len(test_nodes), sum(1 for n in test_nodes if n.metadata["node_type"]=="figure")


def _exclude_metadata_for_embedding(node: TextNode, extra_keys: List[str] = None):
    """确保这些 metadata 不会被拼进 EMBED 文本。"""
    extra_keys = extra_keys or []
    # 复制成 set，再并集（有些实现里是 tuple）
    exc = set(getattr(node, "excluded_embed_metadata_keys", []))
    exc |= {"fig_annos", "fig_b64", "fig_path"} | set(extra_keys)
    node.excluded_embed_metadata_keys = list(exc)

def _exclude_metadata_for_llm(node: TextNode, extra_keys: List[str] = None):
    """如果你也不想把这些 metadata 在 get_content(LLM) 里拼给 LLM，可一并排除。"""
    extra_keys = extra_keys or []
    exc = set(getattr(node, "excluded_llm_metadata_keys", []))
    exc |= {"fig_b64"} | set(extra_keys)
    node.excluded_llm_metadata_keys = list(exc)



(19, 7)

In [None]:
# index
from typing import Optional
import faiss

# Settings.embed_model 
Settings.llm = None

persist_dir = Path.cwd().parent / ".log/faiss_db"
persist_dir.mkdir(parents=True, exist_ok=True)
faiss_index_path = persist_dir / "faiss.index"
meta_path = persist_dir / "vector_meta.json"

def _storage_triplet_ok(pdir: Path) -> bool:
    return all([
        (pdir / "docstore.json").exists(),
        (pdir / "index_store.json").exists(),
        (pdir / "faiss.index").exists(),
    ])
    
# ---------- helper: 读/写 meta ----------
def _read_meta(p: Path) -> Dict[str, Any]:
    if not p.exists():
        return {}
    with open(p, "r", encoding="utf-8") as f:
        return json.load(f)

def _write_meta(p: Path, meta: Dict[str, Any]) -> None:
    with open(p, "w", encoding="utf-8") as f:
        json.dump(meta, f, ensure_ascii=False, indent=2)
    

# ---------- helper: 推断 embedding 维度（如果你没手动填） ----------
def infer_embed_dim(default_dim: Optional[int] = None) -> int:
    if default_dim and default_dim > 0:
        return default_dim
    # 尝试从 Settings.embed_model 推断
    em = Settings.embed_model
    if em is None:
        raise ValueError("Settings.embed_model 未设置，且未显式指定 embedding 维度。")
    try:
        # 大多数本地 embedding 都支持 get_text_embedding
        vec = em.get_text_embedding("probe-dim")
        return len(vec)
    except Exception as e:
        raise ValueError(f"无法从 Settings.embed_model 推断维度，请手动传入 embedding_vector_length。错误：{e}")

# ---------- helper: 校验 FAISS 维度 ----------
def _faiss_dim(path: Path) -> Optional[int]:
    if not path.exists():
        return None
    try:
        idx = faiss.read_index(str(path))
        return idx.d
    except Exception:
        return None

# ---------- helper: 需要重建吗？（根据 meta + 文件 + 维度） ----------
def need_rebuild(
    pdir: Path,
    current_cfg: Dict[str, Any],
    faiss_path: Path,
) -> bool:
    # 文件不全 -> 必须重建
    if not _storage_triplet_ok(pdir):
        return True

    # 维度不一致 -> 必须重建
    cur_faiss_d = _faiss_dim(faiss_path)
    if not cur_faiss_d:
        return True
    if cur_faiss_d != int(current_cfg.get("embed_dim", -1)):
        return True

    # meta 不一致 -> 必须重建
    old = _read_meta(meta_path)
    # 只比较关键键，避免时间戳等非关键项影响
    keys_to_compare = ["embed_model", "embed_dim", "chunk_size", "chunk_overlap"]
    for k in keys_to_compare:
        if old.get(k) != current_cfg.get(k):
            return True

    return False

# ---------- helper: 清理旧持久化 ----------
def nuke_persist_dir(pdir: Path) -> None:
    # 只删我们知道的文件，避免误删你目录里其他东西
    for fn in ["docstore.json", "index_store.json", "vector_store.json", "faiss.index", "vector_meta.json"]:
        f = pdir / fn
        if f.exists():
            f.unlink()


def build_or_load_index(md_files: List[Path], embedding_vector_length: int=1024) -> Tuple[VectorStoreIndex, List[TextNode]]:
    
    # init 
    vector_store = FaissVectorStore(faiss_index=faiss.IndexFlatL2(embedding_vector_length))
    # vector_store = SimpleVectorStore()  # 内存

    # -- 重建
    # # 若已存在存储，直接加载（加快重跑）
    if _storage_triplet_ok(persist_dir):
        storage_context = StorageContext.from_defaults(
            persist_dir=persist_dir,
            vector_store=vector_store
            )
        index = load_index_from_storage(storage_context)
        
        # 为了 BM25 或其他用途，再次扫描构建 nodes（不影响向量库）
        nodes_cache: List[TextNode] = []
        for md in md_files:
            text, figs = load_md_and_figs(md)
            doc_id = str(uuid.uuid5(uuid.NAMESPACE_URL, str(md.resolve())))
            nodes_cache += build_text_nodes_from_markdown(text, doc_id)
            nodes_cache += build_figure_nodes_from_figs(figs, doc_id)
        print(f"[load] 载入索引，缓存 nodes={len(nodes_cache)}")
        return index, nodes_cache

    # 首建：扫描数据 -> nodes
    all_nodes: List[TextNode] = []
    
    for md in md_files:
        text, figs = load_md_and_figs(md)
        doc_id = str(uuid.uuid5(uuid.NAMESPACE_URL, str(md.resolve())))
        all_nodes += build_text_nodes_from_markdown(text, doc_id)
        all_nodes += build_figure_nodes_from_figs(figs, doc_id)

    # 构建 faiss_db   
    # ## 参数persist_dir只有在加载的时候用，这里是创建 
    storage_context = StorageContext.from_defaults(vector_store=vector_store) 
    index = VectorStoreIndex(all_nodes, storage_context=storage_context, show_progress=True)
    
    storage_context.persist(persist_dir=str(persist_dir))
    print(f"[build] 新建索引：nodes={len(all_nodes)}")
    return index, all_nodes

index, nodes_cache = build_or_load_index(md_files=[mdfs])


LLM is explicitly disabled. Using MockLLM.


Generating embeddings:   0%|          | 0/19 [00:00<?, ?it/s]

[build] 新建索引：nodes=19


In [8]:
# retriever 

# ==== ① 向量检索参数 & 展示工具 ====
import re
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Iterable

from llama_index.core import QueryBundle, VectorStoreIndex
from llama_index.core.schema import TextNode, NodeWithScore

# ------- 全局/默认参数（可自由改） -------
TOP_K = 8
VECTOR_MODE = "mmr"   # "default" | "mmr"
MMR_ALPHA = 0.5       # 仅当 VECTOR_MODE="mmr" 生效，越大越多样化
USE_BM25 = True       # 你也可在调用时覆盖
HYBRID_W_VEC = 0.70   # 混合检索加权：向量
HYBRID_W_BM25 = 0.30  # 混合检索加权：BM25
PREFER_FIG_BOOST = 0.02  # 检索命中“图N”时，小幅提升对应 figure 的分数
TEXT_PREVIEW_CHARS = 200 # 打印预览字数

FIG_PAT = re.compile(r"图\s*([0-9０-９]+)")
_DIGIT_TRANS = str.maketrans("０１２３４５６７８９", "0123456789")

def _to_ascii_int(s: str) -> Optional[int]:
    try:
        return int(s.translate(_DIGIT_TRANS))
    except Exception:
        return None

def _fig_nums_in_text(s: str) -> List[int]:
    out = []
    for m in FIG_PAT.finditer(s or ""):
        n = _to_ascii_int(m.group(1))
        if n is not None:
            out.append(n)
    return out

def _has_fig_mention(q: str) -> List[int]:
    return _fig_nums_in_text(q or "")

# ------- 统一结果结构 -------
@dataclass
class HitRow:
    node_id: str
    node_type: str
    score: float
    text_preview: str
    # figure extra (如果是配图)
    fig_no: Optional[str] = None
    fig_desc: Optional[str] = None
    fig_path: Optional[str] = None
    fig_annos: Optional[str] = None

def _coerce_hits(hits: Iterable[Any]) -> List[Tuple[TextNode, float]]:
    """把多种 hits 统一成 List[(node, score)]"""
    out = []
    for h in hits:
        if hasattr(h, "node"):  # NodeWithScore
            out.append((h.node, float(getattr(h, "score", 0.0))))
        elif isinstance(h, TextNode):  # 直接 node
            out.append((h, 0.0))
        else:
            try:
                n, s = h
                out.append((n, float(s)))
            except Exception:
                pass
    return out

def _build_hit_row(n: TextNode, score: float, preview_chars: int = TEXT_PREVIEW_CHARS) -> HitRow:
    ntype = (n.metadata or {}).get("node_type", "text")
    if ntype == "figure":
        pr = (n.get_content() or "")
        pr = pr.replace("\n", " ").strip()
        pr = pr[:preview_chars] + ("…" if len(pr) > preview_chars else "")
        return HitRow(
            node_id=n.node_id, node_type="figure", score=score,
            text_preview=pr,
            fig_no=str((n.metadata or {}).get("fig_no")),
            fig_desc=(n.metadata or {}).get("fig_desc") or "",
            fig_path=(n.metadata or {}).get("fig_path") or "",
            fig_annos=(n.metadata or {}).get("fig_annos") or "",
        )
    else:
        pr = (n.get_content() or "")
        pr = pr.replace("\n", " ").strip()
        pr = pr[:preview_chars] + ("…" if len(pr) > preview_chars else "")
        return HitRow(
            node_id=n.node_id, node_type="text", score=score,
            text_preview=pr,
        )

def print_rows(query: str, rows: List[HitRow], show: int = 10, show_annos_once: bool = True):
    """统一打印；附图标记说明（annos）只打印一次"""
    print(f"Q: {query}\n")
    annos_printed = False
    for i, r in enumerate(rows[:show], 1):
        if r.node_type == "figure":
            print(f"{i:>2}. [FIG  ] score={r.score:.4f} | 图{r.fig_no} {r.fig_desc} | path={r.fig_path}")
            if show_annos_once and (not annos_printed) and r.fig_annos:
                print(f"    └─ 附图标记说明：{r.fig_annos[:200]}{'…' if len(r.fig_annos)>200 else ''}")
                annos_printed = True
        else:
            print(f"{i:>2}. [TEXT ] score={r.score:.4f} | {r.text_preview}")


In [10]:
# ========= ② Optional: BM25 资源 & “图N”快速映射 =========

from llama_index.retrievers.bm25 import BM25Retriever
HAS_BM25 = True


# 需要：你已在之前单元拿到了 index, nodes_cache
assert "index" in globals(), "请先运行你构建索引的单元，获得 `index`。"
assert "nodes_cache" in globals(), "请先运行你构建节点的单元，获得 `nodes_cache`。"

BM25_TOP_K_DEFAULT = 8  # 候选越多，融合时越有余地
BM25_RET = None
if USE_BM25 and HAS_BM25:
    BM25_RET = BM25Retriever.from_defaults(nodes=nodes_cache, similarity_top_k=BM25_TOP_K_DEFAULT)

# “图N”联动索引（text 命中里提到了图N时，补充相应 figure node）
FIG_NODE_INDEX: Dict[int, TextNode] = {}
for n in nodes_cache:
    if (n.metadata or {}).get("node_type") == "figure":
        no_raw = (n.metadata or {}).get("fig_no")
        try:
            k = int(str(no_raw).strip())
            FIG_NODE_INDEX[k] = n
        except Exception:
            pass

In [11]:
# ========= ③ 检索模式：vector / bm25 / hybrid =========

def vector_search(
    index: VectorStoreIndex,
    query: str,
    *,
    top_k: int = TOP_K,
    vector_mode: str = VECTOR_MODE,
    mmr_alpha: float = MMR_ALPHA,
) -> List[Tuple[TextNode, float]]:
    retriever = index.as_retriever(
        similarity_top_k=top_k,
        vector_store_query_mode=vector_mode,
        alpha=mmr_alpha if vector_mode == "mmr" else None,
    )
    hits = retriever.retrieve(QueryBundle(query))
    return _coerce_hits(hits)

def bm25_search(
    query: str,
    *,
    top_k: int = TOP_K,
) -> List[Tuple[TextNode, float]]:
    if not (USE_BM25 and HAS_BM25 and BM25_RET is not None):
        return []
    return _coerce_hits(BM25_RET.retrieve(query)[:top_k])

def _normalize_scores(pairs: List[Tuple[TextNode, float]]) -> List[Tuple[TextNode, float, float]]:
    """min-max 归一化到 [0,1]，返回 (node, raw, norm)"""
    if not pairs:
        return []
    vals = [s for _, s in pairs]
    mx, mn = max(vals), min(vals)
    rng = (mx - mn) or 1.0
    out = []
    for n, s in pairs:
        out.append((n, s, (s - mn) / rng))
    return out

def hybrid_search(
    index: VectorStoreIndex,
    query: str,
    *,
    top_k: int = TOP_K,
    w_vec: float = HYBRID_W_VEC,
    w_bm25: float = HYBRID_W_BM25,
    vector_mode: str = VECTOR_MODE,
    mmr_alpha: float = MMR_ALPHA,
) -> List[Tuple[TextNode, float]]:
    vec_pairs = vector_search(index, query, top_k=top_k, vector_mode=vector_mode, mmr_alpha=mmr_alpha)
    bm_pairs  = bm25_search(query, top_k=top_k)

    vec_norm = _normalize_scores(vec_pairs)
    bm_norm  = _normalize_scores(bm_pairs)

    pool: Dict[str, Tuple[TextNode, float]] = {}
    for n, _, nv in vec_norm:
        pool[n.node_id] = (n, w_vec * nv)
    for n, _, nb in bm_norm:
        if n.node_id in pool:
            old_n, old_s = pool[n.node_id]
            pool[n.node_id] = (old_n, old_s + w_bm25 * nb)
        else:
            pool[n.node_id] = (n, w_bm25 * nb)

    merged = sorted(pool.values(), key=lambda x: x[1], reverse=True)[:top_k]
    return merged

# ---- 简单试跑 ----
for mode_name, fn in [
    ("vector", lambda q: vector_search(index, q, top_k=5)),
    ("bm25",   lambda q: bm25_search(q, top_k=5)),
    ("hybrid", lambda q: hybrid_search(index, q, top_k=5)),
]:
    q = "该专利发明了什么？"
    pairs = fn(q)
    rows = [_build_hit_row(n, s) for n, s in pairs]
    print(f"\n=== {mode_name.upper()} ===")
    print_rows(q, rows, show=5)



=== VECTOR ===
Q: 该专利发明了什么？

 1. [TEXT ] score=1.1748 | # 著录信息 (19)中华人民共和国国家知识产权局  (12)实用新型专利  (10)授权公告号CN207225508U(45)授权公告日2018.04.13  (21)申请号201721328994.5  (22)申请日2017.10.16  (73)专利权人杭州宇树科技有限公司地址310051浙江省杭州市滨江区聚业路26号金绣国际科技中心B座106室  (72)发明人王兴兴 杨知雨  (74)…
 2. [FIG  ] score=1.1866 | 图1 本实用新型整体结构示意图 | path=D:\codespace\fhfeishi\raga\.log\SimplePDF\CN201721328994.5-一种机器人足端结构.pdf-e856a75f-7fe7-4813-a860-05ca62419ac7\images\0b110668bdedeef913bee95d62f88cc6c1765ec83d8fd0cf83cbfe316729c0cf.jpg
    └─ 附图标记说明：附图标记说明：1：保护罩总成，2：足基座总成，3：足垫，2-1：传感部件，2-2：应变片，2-3：发光件，2-4：测控单元，2-5：足基座，2-1- 1：敏感部，2-1- 1-1：平行平面区域
 3. [TEXT ] score=1.2152 | [0013] 作为优选技术措施，所述足基座总成或保护罩总成外设有一凹槽，凹槽内安装一用于观测足端运动轨迹的发光件。所述发光件为LED灯，方便观测足端的运动轨迹，并且提高机器人的观赏性。  [0014] 与现有技术相比，本实用新型具有以下有益效果：  [0015] 本实用新型设置由高屈服强度材料制造而成的和具有特殊结构的敏感部，相比现有技术的机器人足端结构，所需的零部件少，结构简单，重量轻，便于生产…
 4. [FIG  ] score=1.2187 | 图3 本实用新型爆炸视图 | path=D:\codespace\fhfeishi\raga\.log\SimplePDF\CN201721328994.5-一种机器人足端结构.pdf-e856a75f-7fe7-4813-a860-05ca62419ac7\images\2

In [12]:

# retriever 
# ==== ① 向量检索参数 & 展示工具 ====
import re
from typing import List, Dict, Any
from llama_index.core import QueryBundle

# 超参数（随手改）
TOP_K = 8                       # 向量检索返回条数
VECTOR_MODE = "mmr"         # 可选: "default" | "mmr"
MMR_ALPHA = 0.5                 # 仅当 VECTOR_MODE="mmr" 生效，0~1，越大越多样化

# 简单预处理：看看用户是否在问“图N”
FIG_PAT = re.compile(r"图\s*([0-9０-９]+)")

def _has_fig_mention(q: str) -> List[int]:
    """检测 query 里有没有“图N”，返回命中的图号列表（半角化）"""
    nums = []
    for m in FIG_PAT.finditer(q or ""):
        s = m.group(1).translate(str.maketrans("０１２３４５６７８９","0123456789"))
        try:
            nums.append(int(s))
        except:
            pass
    return nums

def preview_hit(hit, max_chars: int = 120) -> str:
    """把 NodeWithScore 变成一行可读文本"""
    n = hit.node
    kind = n.metadata.get("node_type", "text")
    if kind == "figure":
        fig_no = n.metadata.get("fig_no","?")
        fig_desc = n.metadata.get("fig_desc","")
        fig_path = n.metadata.get("fig_path","")
        return f"[figure] 图{fig_no} - {fig_desc}  | path={fig_path}  | score={hit.score:.4f}"
    else:
        text = (n.get_content() or "").replace("\n"," ").strip()
        return f"[text] {text[:max_chars]}{'…' if len(text)>max_chars else ''}  | score={hit.score:.4f}"

# retriever 


# ==== ② 基础向量检索 ====
from llama_index.core.retrievers import VectorIndexRetriever

def vector_search(query: str, top_k: int = TOP_K) -> List[Any]:
    retriever = index.as_retriever(
        similarity_top_k=top_k,
        vector_store_query_mode=VECTOR_MODE,
        alpha=MMR_ALPHA if VECTOR_MODE == "mmr" else None,
    )
    hits = retriever.retrieve(QueryBundle(query))
    return hits

# --- 小测试 ---
q = "该专利发明了什么东西？"
hits = vector_search(q, top_k=5)
print("Q:", q)
for i, h in enumerate(hits, 1):
    print(f"============={i}==============")
    print(h)
    print(f"{i:>2}.", preview_hit(h))
    print("\n")


Q: 该专利发明了什么东西？
Node ID: ff8b8347-cde3-5220-be5e-4e0182b7c773::fig::1
Text: 图1为本实用新型整体结构示意图
Score:  1.160

 1. [figure] 图1 - 本实用新型整体结构示意图  | path=D:\codespace\fhfeishi\raga\.log\SimplePDF\CN201721328994.5-一种机器人足端结构.pdf-e856a75f-7fe7-4813-a860-05ca62419ac7\images\0b110668bdedeef913bee95d62f88cc6c1765ec83d8fd0cf83cbfe316729c0cf.jpg  | score=1.1596


Node ID: ff8b8347-cde3-5220-be5e-4e0182b7c773::text::1
Text: # 著录信息 (19)中华人民共和国国家知识产权局  (12)实用新型专利
(10)授权公告号CN207225508U(45)授权公告日2018.04.13  (21)申请号201721328994.5
(22)申请日2017.10.16
(73)专利权人杭州宇树科技有限公司地址310051浙江省杭州市滨江区聚业路26号金绣国际科技中心B座106室  (72)发明人王兴兴
杨知雨  (74)专利代理机构浙江翔隆专利事务所（普通合伙）33206  代理人许守金  (51)Int.Cl.
B62D57/032(2006.01) G01L1/18(2006.01) G01C21/10(2006.01)  权利要求书1页
说明书4页 附图4页  (54)实用新型名称  一种机器人足端结构...
Score:  1.160

 2. [text] # 著录信息 (19)中华人民共和国国家知识产权局  (12)实用新型专利  (10)授权公告号CN207225508U(45)授权公告日2018.04.13  (21)申请号201721328994.5  (22)申请日2017.10.1…  | score=1.1600


Node ID: ff8b8347-cde3-5220-be5e-4e0182b7c773::text::8
Text: [0013] 作为优选技术措施

In [None]:
# retriever 
# 纯工程做法：用 BM25 对 nodes_cache（你前面 build 的文本/图片节点汇总）做关键词匹配，然后和向量检索融合。
# ==== ③ BM25 + 混合检索（可选） ====
try:
    from llama_index.retrievers.bm25 import BM25Retriever
    HAS_BM25 = True
except Exception:
    HAS_BM25 = False

# 融合权重（可调）
W_VEC = 0.7
W_BM25 = 0.3

def _merge_scores(vec_hits, bm_hits, top_k=TOP_K) -> List[Any]:
    """把两路结果按加权分数融合，返回前 top_k 的 NodeWithScore 列表"""
    # 归一化
    def _norm_scores(hits):
        if not hits:
            return []
        scores = [h.score for h in hits if h.score is not None]
        if not scores:
            return hits
        mx, mn = max(scores), min(scores)
        rng = (mx - mn) or 1.0
        for h in hits:
            h._norm = (h.score - mn) / rng if h.score is not None else 0.0
        return hits

    vec_hits = _norm_scores(list(vec_hits))
    bm_hits  = _norm_scores(list(bm_hits))

    pool: Dict[str, Dict[str, Any]] = {}
    def _accumulate(hits, w):
        for h in hits:
            nid = h.node.node_id
            if nid not in pool:
                pool[nid] = {"node": h.node, "score": 0.0, "v": 0.0, "b": 0.0}
            pool[nid]["score"] += w * getattr(h, "_norm", 0.0)
            pool[nid]["v"] += (w * getattr(h, "_norm", 0.0) if w==W_VEC else 0.0)
            pool[nid]["b"] += (w * getattr(h, "_norm", 0.0) if w==W_BM25 else 0.0)

    _accumulate(vec_hits, W_VEC)
    _accumulate(bm_hits,  W_BM25)

    # 排序&构造 NodeWithScore
    merged = sorted(pool.values(), key=lambda x: x["score"], reverse=True)[:top_k]
    out = []
    from llama_index.core.schema import NodeWithScore
    for it in merged:
        out.append(NodeWithScore(node=it["node"], score=it["score"]))
    return out

def hybrid_search(query: str, top_k: int = TOP_K) -> List[Any]:
    # 向量路
    vec_hits = vector_search(query, top_k=top_k)

    # 词匹配路（可选）
    if HAS_BM25 and nodes_cache:
        bm25 = BM25Retriever.from_defaults(nodes=nodes_cache, similarity_top_k=top_k)
        bm_hits = bm25.retrieve(query)
    else:
        bm_hits = []

    # 融合
    merged = _merge_scores(vec_hits, bm_hits, top_k=top_k)
    return merged

# --- 小测试 ---
q = "介绍一下这个专利"
hits = hybrid_search(q, top_k=8)
print("Q:", q)
for i, h in enumerate(hits, 1):
    print(f"{i:>2}.", preview_hit(h))


Q: 介绍一下这个专利
 1. [figure] 图6 - 本实用新型设置发光件的结构示意图（不包括保护罩总成）  | path=D:\codespace\fhfeishi\raga\.log\SimplePDF\CN201721328994.5-一种机器人足端结构.pdf-e856a75f-7fe7-4813-a860-05ca62419ac7\images\730d3218997db726e8df83cfee8bab58bd2a80709dd3b13590f9d5dc91bc494d.jpg  | score=0.7000
 2. [figure] 图abs - 摘要图  | path=D:\codespace\fhfeishi\raga\.log\SimplePDF\CN201721328994.5-一种机器人足端结构.pdf-e856a75f-7fe7-4813-a860-05ca62419ac7\images\88fe5b0e5b16afc42f78352f8df13f234eeb58b135cdcfdfea80134da7580363.jpg  | score=0.4828
 3. [figure] 图5 - 本实用新型部分结构示意图（不包括保护罩总成）  | path=D:\codespace\fhfeishi\raga\.log\SimplePDF\CN201721328994.5-一种机器人足端结构.pdf-e856a75f-7fe7-4813-a860-05ca62419ac7\images\c13063d537ddc01148e49c7f83d4127ab3acbea6482574365a083c7f1efcb34e.jpg  | score=0.4643
 4. [text] # 著录信息 (19)中华人民共和国国家知识产权局  (12)实用新型专利  (10)授权公告号CN207225508U(45)授权公告日2018.04.13  (21)申请号201721328994.5  (22)申请日2017.10.1…  | score=0.4368
 5. [figure] 图4 - 本实用新型局部爆炸视图  | path=D:\codespace\fhfeishi\raga\.log\SimplePDF\CN201721328994.5-一种

In [14]:
# retriever 
# ==== ④ 高级检索：支持“图N”偏好 + 统一打印 ====
from llama_index.core.schema import NodeWithScore

PREFER_FIG_BOOST = 0.02   # 出现“图N”时，对齐编号的 figure 节点加一个小偏置

def search(query: str, top_k: int = TOP_K, use_hybrid: bool = True, prefer_fig: bool = True) -> List[NodeWithScore]:
    # 1) 基础检索
    hits = hybrid_search(query, top_k=top_k) if use_hybrid else vector_search(query, top_k=top_k)

    # 2) 如果 query 里写了“图N”，对命中同号的 figure 轻度加权（不改变原有分数体系）
    if prefer_fig:
        fig_ids = set(_has_fig_mention(query))
        if fig_ids:
            boosted = []
            for h in hits:
                n = h.node
                kind = n.metadata.get("node_type")
                if kind == "figure":
                    try:
                        no = int(str(n.metadata.get("fig_no","")).strip())
                    except:
                        no = None
                    if no in fig_ids:
                        # 构造一个新的 NodeWithScore，增加一点偏置
                        boosted.append(NodeWithScore(node=n, score=(h.score or 0.0) + PREFER_FIG_BOOST))
                        continue
                boosted.append(h)
            # 重新排序
            hits = sorted(boosted, key=lambda x: x.score or 0.0, reverse=True)[:top_k]

    return hits

def print_results(query: str, hits: List[NodeWithScore], show=10):
    print(f"Q: {query}")
    for i, h in enumerate(hits[:show], 1):
        print(f"{i:>2}.", preview_hit(h))
        # 如是配图，还可以补充打印 annos（附图标记说明）
        if h.node.metadata.get("node_type") == "figure":
            ann = h.node.metadata.get("fig_annos","")
            if ann:
                print("    └─ annos:", ann[:140] + ("…" if len(ann)>140 else ""))

# --- 小测试：包含“图1”的偏好 ---
q = "介绍一下这个专利的标题"
hits = search(q, top_k=6, use_hybrid=True, prefer_fig=True)
print_results(q, hits)



Q: 介绍一下这个专利的标题
 1. [figure] 图abs - 摘要图  | path=D:\codespace\fhfeishi\raga\.log\SimplePDF\CN201721328994.5-一种机器人足端结构.pdf-e856a75f-7fe7-4813-a860-05ca62419ac7\images\88fe5b0e5b16afc42f78352f8df13f234eeb58b135cdcfdfea80134da7580363.jpg  | score=0.7000
    └─ annos: 附图标记说明：1：保护罩总成，2：足基座总成，3：足垫，2-1：传感部件，2-2：应变片，2-3：发光件，2-4：测控单元，2-5：足基座，2-1- 1：敏感部，2-1- 1-1：平行平面区域
 2. [text] # 著录信息 (19)中华人民共和国国家知识产权局  (12)实用新型专利  (10)授权公告号CN207225508U(45)授权公告日2018.04.13  (21)申请号201721328994.5  (22)申请日2017.10.1…  | score=0.5774
 3. [figure] 图4 - 本实用新型局部爆炸视图  | path=D:\codespace\fhfeishi\raga\.log\SimplePDF\CN201721328994.5-一种机器人足端结构.pdf-e856a75f-7fe7-4813-a860-05ca62419ac7\images\906093d2fd363840626568d48a84b01cc75bf18892b5b8197140496dc648bfe0.jpg  | score=0.5573
    └─ annos: 附图标记说明：1：保护罩总成，2：足基座总成，3：足垫，2-1：传感部件，2-2：应变片，2-3：发光件，2-4：测控单元，2-5：足基座，2-1- 1：敏感部，2-1- 1-1：平行平面区域
 4. [figure] 图2 - 本实用新型剖视图  | path=D:\codespace\fhfeishi\raga\.log\SimplePDF\CN201721328994.5-一种机器人足端结构.pdf-e856a75f-7fe7-4813-a860-05ca6241

In [15]:
# ==== ② 检索 + “图N”联动 + annos 一次性输出 ====

import re
from typing import Dict
from llama_index.core.schema import NodeWithScore

FIG_PAT = re.compile(r"图\s*([0-9０-９]+)")
_DIGIT_TRANS = str.maketrans("０１２３４５６７８９", "0123456789")

def _fig_nums_in_text(s: str) -> List[int]:
    out = []
    for m in FIG_PAT.finditer(s or ""):
        try:
            out.append(int(m.group(1).translate(_DIGIT_TRANS)))
        except:
            pass
    return out

# 为“图N联动”做个缓存索引：fig_no -> figure_node（从 nodes_cache 构建一次即可）
def build_fig_index(nodes_cache: List[TextNode]) -> Dict[int, TextNode]:
    idx: Dict[int, TextNode] = {}
    for n in nodes_cache:
        if n.metadata.get("node_type") == "figure":
            no = n.metadata.get("fig_no")
            try:
                k = int(str(no).strip())
                idx[k] = n
            except Exception:
                pass
    return idx

FIG_NODE_INDEX = build_fig_index(nodes_cache)

def retrieve(query: str, k: int = 8, use_hybrid: bool = True) -> List[NodeWithScore]:
    hits = hybrid_search(query, top_k=k) if use_hybrid else vector_search(query, top_k=k)

    # 文本命中里提到“图N”，联动补上对应 figure 节点（不重复）
    wanted_figs = set()
    for h in hits:
        if h.node.metadata.get("node_type") == "text":
            wanted_figs.update(_fig_nums_in_text(h.node.get_content()))

    # 把缺失的 figure node 加入（score 给个轻微 boost）
    node_ids = set(h.node.node_id for h in hits)
    extra: List[NodeWithScore] = []
    for n in sorted(wanted_figs):
        if n in FIG_NODE_INDEX:
            fn = FIG_NODE_INDEX[n]
            if fn.node_id not in node_ids:
                extra.append(NodeWithScore(node=fn, score=(hits[-1].score if hits else 0.0) + 1e-6))

    all_hits = hits + extra
    # 重新排序后截断
    all_hits = sorted(all_hits, key=lambda x: x.score or 0.0, reverse=True)[:k]
    return all_hits

def render_answer(query: str, hits: List[NodeWithScore], show: int = 8) -> None:
    print("Q:", query)
    annos_printed = False
    for i, h in enumerate(hits[:show], 1):
        n = h.node
        kind = n.metadata.get("node_type")
        if kind == "figure":
            no   = n.metadata.get("fig_no")
            desc = n.metadata.get("fig_desc","")
            path = n.metadata.get("fig_path","")
            print(f"{i:>2}. [FIG {no}] {desc} | path={path} | score={h.score:.4f}")
            # 附图标记说明只打印一次
            if not annos_printed and (n.metadata.get("fig_annos") or "").strip():
                ann = n.metadata["fig_annos"]
                print("    └─ 附图标记说明：", ann[:200] + ("…" if len(ann) > 200 else ""))
                annos_printed = True
        else:
            txt = (n.get_content() or "").replace("\n"," ").strip()
            print(f"{i:>2}. [TEXT] {txt[:160]}{'…' if len(txt)>160 else ''} | score={h.score:.4f}")


q = "请解释图2的含义，并引用对应段落"
hits = retrieve(q, k=8, use_hybrid=True)
render_answer(q, hits, show=8)



Q: 请解释图2的含义，并引用对应段落
 1. [TEXT] 对本领域技术人员来说没有这些细节部分的描述也可以完全理解本实用新型。  [0027] 需要说明的是，当元件被称为“固定于”另一个元件，它可以直接在另一个元件上或者也可以存在居中的元件。当一个元件被认为是“连接”另一个元件，它可以是直接连接到另一个元件或者可能同时存在居中元件。相反，当元件被称作“直接在”另一元件“上”时… | score=0.7000
 2. [FIG 6] 本实用新型设置发光件的结构示意图（不包括保护罩总成） | path=D:\codespace\fhfeishi\raga\.log\SimplePDF\CN201721328994.5-一种机器人足端结构.pdf-e856a75f-7fe7-4813-a860-05ca62419ac7\images\730d3218997db726e8df83cfee8bab58bd2a80709dd3b13590f9d5dc91bc494d.jpg | score=0.6600
    └─ 附图标记说明： 附图标记说明：1：保护罩总成，2：足基座总成，3：足垫，2-1：传感部件，2-2：应变片，2-3：发光件，2-4：测控单元，2-5：足基座，2-1- 1：敏感部，2-1- 1-1：平行平面区域
 3. [FIG 5] 本实用新型部分结构示意图（不包括保护罩总成） | path=D:\codespace\fhfeishi\raga\.log\SimplePDF\CN201721328994.5-一种机器人足端结构.pdf-e856a75f-7fe7-4813-a860-05ca62419ac7\images\c13063d537ddc01148e49c7f83d4127ab3acbea6482574365a083c7f1efcb34e.jpg | score=0.5860
 4. [FIG 4] 本实用新型局部爆炸视图 | path=D:\codespace\fhfeishi\raga\.log\SimplePDF\CN201721328994.5-一种机器人足端结构.pdf-e856a75f-7fe7-4813-a860-05ca62419ac7\images\906093d2fd363840626568d48a84b01cc75bf18892

In [None]:
# retriever 

from typing import Iterable, Optional

def _coerce_hits(hits: Iterable) -> List[Tuple[TextNode, float]]:
    out = []
    for h in hits:
        if hasattr(h, "node"):  # NodeWithScore
            out.append((h.node, float(getattr(h, "score", 0.0))))
        elif isinstance(h, TextNode):  # Node
            out.append((h, 0.0))
        else:
            try:
                n, s = h
                out.append((n, float(s)))
            except Exception:
                pass
    return out

def retrieve_nodes(
    index: VectorStoreIndex,
    query: str,
    top_k: int = 3, # 
    *,
    use_hybrid: bool = False, #  
    bm25_top_k: int = 5,
    bm25_nodes: Optional[List[TextNode]] = None,
    doc_id_filter: Optional[str] = None,
) -> List[Tuple[TextNode, float]]:
    if not (use_hybrid and HAS_BM25):
        retriever = index.as_retriever(similarity_top_k=top_k)
        pairs = _coerce_hits(retriever.retrieve(query))
        if doc_id_filter:
            pairs = [(n, s) for n, s in pairs if n.metadata.get("doc_id") == doc_id_filter]
        return pairs[:top_k]

    # Hybrid：BM25 + Vector 简单融合
    if bm25_nodes is None:
        bm25_nodes = nodes_cache
    bm25 = BM25Retriever.from_text_nodes(bm25_nodes, similarity_top_k=bm25_top_k)
    bm25_hits = _coerce_hits(bm25.retrieve(query))
    vec_hits  = _coerce_hits(index.as_retriever(similarity_top_k=top_k).retrieve(query))

    pool: Dict[str, Tuple[TextNode, float]] = {}
    if bm25_hits:
        max_b = max(s for _, s in bm25_hits) or 1.0
        for n, s in bm25_hits:
            pool[n.node_id] = (n, 0.6 * (s / max_b))
    if vec_hits:
        max_v = max(s for _, s in vec_hits) or 1.0
        for n, s in vec_hits:
            w = 0.7 * (s / max_v)
            if n.node_id in pool:
                old_n, old_s = pool[n.node_id]
                pool[n.node_id] = (old_n, old_s + w)
            else:
                pool[n.node_id] = (n, w)

    merged = sorted(pool.values(), key=lambda x: x[1], reverse=True)
    pairs = merged[:top_k]
    if doc_id_filter:
        pairs = [(n, s) for n, s in pairs if n.metadata.get("doc_id") == doc_id_filter]
    return pairs[:top_k]

def pretty_print_hits(pairs: List[Tuple[TextNode, float]], preview_chars: int = 260):
    for i, (n, s) in enumerate(pairs, 1):
        mt = n.metadata or {}
        ntype = mt.get("node_type", "text")
        print(f"\n[{i}] score={s:.4f}  id={n.node_id}  type={ntype}")
        if ntype == "figure":
            print(f"  图号   : {mt.get('fig_no')}")
            print(f"  描述   : {mt.get('fig_desc')}")
            print(f"  路径   : {mt.get('fig_path')}")
            ann = (mt.get('fig_annos') or "")
            if ann:
                print(f"  标记   : {(ann[:160] + '…') if len(ann)>160 else ann}")
            print("  ——向量文本——")
        print(n.get_content()[:preview_chars] + ("…" if len(n.get_content()) > preview_chars else ""))

In [None]:
# 交互式检索示例（你可反复改 query）

# 可选：只查某一份文档（按 doc_id 过滤）