### RAG  llama-index

In [None]:
# file-parser
from typing import List, Dict, Any   
from pathlib import Path
import json
from tqdm import tqdm 

from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Settings 
from llama_index.core.node_parser import SentenceSplitter, TokenTextSplitter, MarkdownNodeParser 
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.extractors import TitleExtractor   # 需要接入llm


from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# Embedding model: build a HuggingFace embedding wrapper for
# Qwen3-Embedding-0.6B and assign it to ``Settings.embed_model``.
embedding = HuggingFaceEmbedding(
    model_name="Qwen/Qwen3-Embedding-0.6B",
    device="cpu",                 # recommended to pass as a top-level argument
    cache_folder=r"E:\local_models\huggingface\cache\hub",
    trust_remote_code=True,       # recommended to pass as a top-level argument
    model_kwargs={"local_files_only": False},   # False: network access is allowed
)
Settings.embed_model = embedding

# Locate the parsed markdown of the document below ``data_root``.
# Files expected side by side: full_split.md and figs_MetaDict.json.
data_root = r"../.log/SimplePDF"
# Keep the raw ``next()`` result: wrapping it in ``str()`` first would turn a
# missing file into the literal text "None" and hide the real problem.
_md_path = next(Path(data_root).rglob('full_split.md'), None)
if _md_path is None:
    # Explicit error instead of ``assert`` (asserts are stripped under ``-O``).
    raise FileNotFoundError(f"no full_split.md found under {data_root!r}")
mdfs: str = str(_md_path)

# 元数据 ++
# 保存图片信息字段的json文件在mdfs的同级目录下 文件名 figs_MetaDict.json
def load_jsf(mdp: Path) -> Dict[str, str | List]:
    jsp = str(next(Path(mdp).parent.glob("figs_MetaDict.json"),None))
    if not jsp:
        raise TypeError("jsp is NoneType")
    # assert jsp
    
    with open(jsp, "r", encoding='utf-8') as jf:
        ims_metadata = json.load(jf)
    return ims_metadata

""" figs_metadict.json  
{
    "im_abs": [
        "摘要图",
        "path/to/im_abs"
    ],
    "lines_ims": [                    
        "图1为本实用新型的爆炸图",
        "图2为本实用新型的侧面示图",
        ...
    ],
    "annos_ims": "图中：1、驱动板；2、转子编码器芯片；...",
    "im_1": [
        "图1为本实用新型的爆炸图",
        "path/to/im_1"
    ],
    "im_2": [
        "图2为本实用新型的侧面示图",
        "path/to/im_2"
    ],
    ...
}
"""

# Load the markdown file as LlamaIndex documents.
documents = SimpleDirectoryReader(input_files=[mdfs]).load_data()
# Figure metadata dict read from the JSON next to the markdown file.
figs_dict = load_jsf(mdfs)   

# Splitter: sentence-based chunks of 700 with an overlap of 100.
text_spliter = SentenceSplitter(chunk_size=700, chunk_overlap=100)

# Node postprocessor
# Response synthesizer    # todo.
from llama_index.core.postprocessor import KeywordNodePostprocessor, LongContextReorder #  
# Ingestion pipeline with the splitter as its only transformation.
pipeline = IngestionPipeline(transformations=[text_spliter])

# Run the pipeline to turn the documents into nodes.
nodes = pipeline.run(documents=documents)

# Record doc_id / figs_path on every node so figures can be looked up later.
# The figure captions need to be extracted along with them, and the reference
# marks inside the figures may need to be shown/output as well.
doc_id = Path(mdfs).parent.name
for n in nodes:
    n.metadata["doc_id"] = doc_id
    n.metadata["figs_path"] = str(Path(mdfs).with_name("figs_MetaDict.json"))


# Build the vector index over the nodes.
nodes_idx = VectorStoreIndex(nodes=nodes)

# 3) Pure retrieval: no LLM involved, only the source nodes are fetched.
retriever = nodes_idx.as_retriever(similarity_top_k=3)
query = "介绍一下这是什么专利"
source_nodes = retriever.retrieve(query)

# 4) 关联图片（最简单规则：chunk 文本里找“图N”）
import re
def pick_figs_for_text(text: str, figs: Dict[str, Any], top_k:int=2):
    """Collect the figures that a text chunk refers to.

    Every ``图N`` mention in ``text`` is mapped to the ``im_N`` entry of
    ``figs`` (unpacked as a caption/path pair). Figures are returned in order
    of first mention without duplicates, and collection stops as soon as
    ``top_k`` entries have been gathered. When nothing matched and ``figs``
    has an ``im_abs`` entry, that entry is returned with number 0.

    Returns:
        A list of dicts with the keys ``no``, ``caption`` and ``url``.
    """
    fig_numbers = [int(hit.group(1)) for hit in re.finditer(r'图\s*(\d+)', text)]
    picked = []
    used = set()
    for fig_no in fig_numbers:
        entry_key = f"im_{fig_no}"
        if fig_no in used or entry_key not in figs:
            continue
        caption, location = figs[entry_key]
        picked.append({"no": fig_no, "caption": caption, "url": location})
        used.add(fig_no)
        if len(picked) >= top_k:
            break
    # Nothing referenced: fall back to the abstract figure when available.
    if not picked and "im_abs" in figs:
        caption, location = figs["im_abs"]
        picked.append({"no": 0, "caption": caption, "url": location})
    return picked

# Build one result record per retrieved node: score, chunk text, the figures
# referenced by the chunk and the figure annotation string.
payload = []
for sn in source_nodes:
    figs = figs_dict  # single document: use it directly; with several documents, select by sn.node.metadata["doc_id"]
    figs_pick = pick_figs_for_text(sn.node.get_text(), figs, top_k=2)
    payload.append({
        "score": sn.score,
        "text": sn.node.get_text(),
        "figures": figs_pick,
        "annos": figs.get("annos_ims", "")
    })

# Print the records (or hand them to a front end).
for i, item in enumerate(payload, 1):
    print(f"\n=== 命中 {i} (score={item['score']:.3f}) ===")
    print(item["text"], "...")
    if item["figures"]:
        print("相关图：", [f"图{f['no']}" for f in item["figures"]])
    if item["annos"]:
        # Only the first 120 characters of the annotation text are shown.
        print("附图标记：", item["annos"][:120], "...")

In [None]:
## ims-info-dict   txts-str 

# ims-info-dict.keys(): im_abs lines_ims annos_ims im_n
















In [None]:

# agents.ipynb
"""   
#### AgentWorkflow的多智能体系统
# AgentWorkflow 单个智能体、多智能体系统。 多智能体系统中多个智能体协作完成任务，并在需要时将控制权互相移交
# - ResearchAgent  : 它将搜索网络以查找给定主题的信息
# - WriteAgent     : 将用ResearchAgent检索到的信息来撰写报告
# - ReviewAgent    : 将审查报告并提供反馈
## 需要用到的工具
# web_search工具，  Tavily
# record_notes工具，将网络搜索到的研究保存到状态中（AgentWorkflow使用一个名为state的Context变量），然后其他工具就可以使用它
# write_report工具，使用ResearchAgent检索到的信息撰写报告
# review_report工具，审查报告和提供反馈
"""


"""   
# HuggingFaceLLM 不支持函数调用/工具选择   许多本地小模型都不支持函数调用、工具支持，

想用本地/离线模型，但又要多智能体与工具调用，有两条路：
  1. 在本地模型上再套一层包装器（       -- 简单方法， "智能体"  模型自己决定是否使用工具/函数、然后本地机器调用工具/函数
- 跑一个 OpenAI-兼容网关（如 vLLM + OpenAI 接口、LiteLLM 等），把本地模型“挂成” OpenAI-style API，再用 OpenAI 封装；
  2. 模型只负责生成文本，不负责选工具   --复杂方法
- 放弃“由模型自己决定何时调工具”，改为工作流显式编排：由你在 Python 中决定先搜→再记笔记→再写→再审（模型只负责生成文本，不负责选工具）。
"""

from dotenv import load_dotenv
load_dotenv()
from llama_index.core.agent.workflow import AgentWorkflow 
from llama_index.core.workflow import Context 
from llama_index.core.agent.workflow import (
    AgentOutput,
    ToolCall,
    ToolCallResult,
)
from llama_index.tools.tavily_research import TavilyToolSpec
from llama_index.core.agent.workflow import FunctionAgent 
import os 

from llama_index.llms.huggingface import HuggingFaceLLM 
# Local Qwen3-1.7B model loaded through the HuggingFace wrapper, on CPU.
llm = HuggingFaceLLM(
    model_name     = r"Qwen/Qwen3-1.7B",
    tokenizer_name = r"Qwen/Qwen3-1.7B",
    context_window = 3900,  
    max_new_tokens = 640,
    generate_kwargs={"temperature": 0.7, "top_k": 30, "top_p": 0.95},
    device_map     ='cpu' 
)
# HuggingFaceLLM does not support function calling / tool selection.

# Tavily tool spec; the API key is read from the TAVILY_API_KEY environment
# variable and the first tool of the spec is used as the web-search tool.
tavily_tool = TavilyToolSpec(api_key=os.getenv("TAVILY_API_KEY"))
search_web = tavily_tool.to_tool_list()[0]

# ---- 工具函数   
async def record_notes(ctx: Context, notes: str, notes_title: str) -> str:
    """Useful for recording notes on a given topic."""
    # NOTE(review): the docstring is kept as one plain sentence because it is
    # presumably what the agent shows the model as the tool description -
    # confirm against the LlamaIndex FunctionTool docs.
    current_state = await ctx.store.get("state")
    # Create the notes mapping on first use, then file the note under its
    # title (a repeated title overwrites the earlier note).
    current_state.setdefault("research_notes", {})[notes_title] = notes
    await ctx.store.set("state", current_state)
    return "Notes recorded."

async def write_report(ctx:Context, report_content:str) -> str:
    """Useful for writing a report on a given topic."""
    # ``ctx.store.get`` has to be awaited (as record_notes does); without the
    # ``await`` the item assignment below would be applied to a coroutine
    # object and fail.
    current_state = await ctx.store.get("state")
    current_state["report_content"] = report_content
    await ctx.store.set("state", current_state)
    return "Report written"
    
async def review_report(ctx:Context, review:str) -> str:
    """Useful for reviewing a report and providing feedback."""
    # The state has to be awaited before it can be updated.
    current_state = await ctx.store.get("state")
    current_state["review"] = review
    # Write back under the same plain string key that was read ("state"),
    # not under a list wrapping it.
    await ctx.store.set("state", current_state)
    return "Report reviewed"

# ------------- agents
# Three agents share the same ``llm``; each gets its own tools and the list of
# agents it may hand control to.
# NOTE(review): the notes in this file state that HuggingFaceLLM does not
# support function calling / tool selection, which these agents rely on -
# confirm that this cell can run with the local model.
research_agent = FunctionAgent(
    name="ResearchAgent",
    description="Useful for searching the web for information on a given topic and recording notes on the topic.",
    system_prompt=(
        "You are the ResearchAgent that can search the web for information on a given topic and record notes on the topic. "
        "Once notes are recorded and you are satisfied, you should hand off control to the WriteAgent to write a report on the topic."
    ),
    llm=llm,
    tools=[search_web, record_notes],
    can_handoff_to=["WriteAgent"],
)

write_agent = FunctionAgent(
    name="WriteAgent",
    description="Useful for writing a report on a given topic.",
    system_prompt=(
        "You are the WriteAgent that can write a report on a given topic. "
        "Your report should be in a markdown format. The content should be grounded in the research notes. "
        "Once the report is written, you should get feedback at least once from the ReviewAgent."
    ),
    llm=llm,
    tools=[write_report],
    can_handoff_to=["ReviewAgent", "ResearchAgent"],
)

review_agent = FunctionAgent(
    name="ReviewAgent",
    description="Useful for reviewing a report and providing feedback.",
    system_prompt=(
        "You are the ReviewAgent that can review a report and provide feedback. "
        "Your feedback should either approve the current report or request changes for the WriteAgent to implement."
    ),
    llm=llm,
    tools=[review_report],
    can_handoff_to=["WriteAgent"],
)

# The workflow starts at the research agent; ``initial_state`` seeds the
# "state" entries that the tool functions read and update.
agent_workflow = AgentWorkflow(
    agents=[research_agent, write_agent, review_agent],
    root_agent=research_agent.name,
    initial_state={
        "research_notes": {},
        "report_content": "Not written yet.",
        "review": "Review required.",
    },
)

async def main():
    """Run the multi-agent workflow on a fixed request and log its events.

    A banner is printed whenever an event names a different agent than the
    previous one; otherwise agent outputs, planned tool calls, tool
    invocations and tool results are printed as they arrive.
    """
    handler = agent_workflow.run(user_msg="""
        Write me a report on the history of the web. Briefly describe the history 
        of the world wide web, including the development of the internet and the 
        development of the web, including 21st century developments.
    """)

    separator = "=" * 50
    active_agent = None
    async for event in handler.stream_events():
        # An event naming a new agent marks a hand-off; only the banner is
        # printed for that event.
        if (
            hasattr(event, "current_agent_name")
            and event.current_agent_name != active_agent
        ):
            active_agent = event.current_agent_name
            print(f"\n{separator}")
            print(f"🤖 Agent: {active_agent}")
            print(f"{separator}\n")
            continue

        if isinstance(event, AgentOutput):
            if event.response.content:
                print("📤 Output:", event.response.content)
            if event.tool_calls:
                planned_tools = [call.tool_name for call in event.tool_calls]
                print("🛠️  Planning to use tools:", planned_tools)
            continue

        if isinstance(event, ToolCallResult):
            print(f"🔧 Tool Result ({event.tool_name}):")
            print(f"  Arguments: {event.tool_kwargs}")
            print(f"  Output: {event.tool_output}")
            continue

        if isinstance(event, ToolCall):
            print(f"🔨 Calling Tool: {event.tool_name}")
            print(f"  With arguments: {event.tool_kwargs}")

"""   .py
if __name__ == '__main__':
    import asyncio 
    asyncio.run(main())
"""
# Top-level ``await`` only works inside a notebook/IPython cell; a plain .py
# script needs the ``asyncio.run(main())`` form kept in the string above.
await main()


In [None]:
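# Explicit workflow orchestration, because my local HF LLM does not support tool/function calling (no function-calling)  -- runs ok
# ### Drawback: the logic for reusing intermediate information is awkward to write by hand, so LlamaIndex (Context) is still preferable
# The local HF LLM only generates text (that behaviour is steered through the prompt);
# everything else is handled by my own code.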
from dotenv import load_dotenv
load_dotenv()

import os, json, re, textwrap, requests

# 1) Local/offline LLM (no tool-calling protocol required)
from llama_index.llms.huggingface import HuggingFaceLLM
# Qwen3-1.7B on CPU; sampling settings are passed through ``generate_kwargs``.
llm = HuggingFaceLLM(
    model_name     = r"Qwen/Qwen3-1.7B",
    tokenizer_name = r"Qwen/Qwen3-1.7B",
    context_window = 3900,
    max_new_tokens = 640,
    generate_kwargs={"temperature": 0.5, "top_k": 30, "top_p": 0.95},
    device_map     = "cpu",
)

# 2) Tavily search (plain REST call; does not depend on LlamaIndex tools)
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")

def tavily_search(query: str, max_results: int = 6, depth: str = "advanced"):
    """Call the Tavily search REST endpoint directly.

    Args:
        query: Search query text.
        max_results: Value sent as ``max_results``.
        depth: Value sent as ``search_depth`` ("basic" / "advanced").

    Returns:
        The ``results`` entry of the JSON response, or ``[]`` when absent.

    Raises:
        requests.HTTPError: Propagated from ``raise_for_status`` on an error
            status.
    """
    request_body = dict(
        api_key=TAVILY_API_KEY,
        query=query,
        search_depth=depth,
        include_images=False,
        include_answer=False,
        max_results=max_results,
    )
    response = requests.post(
        "https://api.tavily.com/search", json=request_body, timeout=60
    )
    response.raise_for_status()
    # "results" is expected to look like [{title, url, content, score?}, ...]
    return response.json().get("results", [])

def _trim(s: str, n=8000):
    return s if len(s) <= n else (s[:n] + " ...[truncated]")

# --------------------- 显式“工作流”步骤 ---------------------

def step_research(topic: str) -> dict:
    """Step 1: search the web and condense the hits into structured notes.

    Calls ``tavily_search`` explicitly, renders the hits as a markdown source
    list and asks the local LLM for bullet-point notes based on that list.

    Args:
        topic: Research topic; used both as search query and in the prompt.

    Returns:
        A new state dict with the keys ``topic``, ``search_results``,
        ``notes_md`` and ``sources_md``.
    """
    results = tavily_search(topic, max_results=6, depth="advanced")

    source_lines = []
    for it in results:
        # Build the snippet outside the f-string: a backslash inside an
        # f-string expression is a SyntaxError before Python 3.12. ``or ''``
        # also covers a hit whose content is present but null.
        snippet = (it.get('content') or '')[:240].replace('\n', ' ')
        source_lines.append(
            f"- [{it.get('title','(no title)')}]({it.get('url','')}) — {snippet}"
        )
    sources_md = "\n".join(source_lines)

    # NOTE(review): the prompt speaks of a numbered source list, while the
    # list above is bulleted with "-"; confirm whether numbering is wanted.
    prompt = f"""
你是研究助理。基于下列检索到的资料，为“{topic}”生成不超过 10 条的要点式研究笔记（Markdown）。
要求：
- 覆盖关键时间线/里程碑/核心人物或机构
- 突出 21 世纪的发展（若相关）
- 尽量引用来源（以 [编号] 形式对照下方“资料列表”序号）

资料列表（按顺序标号）：
{sources_md}

请输出：
### 笔记
1. ...
2. ...
3. ...
### 引用来源
- [1] 标题（链接）
- [2] ...
"""
    notes = llm.complete(prompt).text
    state = {
        "topic": topic,
        "search_results": results,
        "notes_md": notes,
        "sources_md": sources_md,
    }
    return state

def step_write_report(state: dict) -> dict:
    """Step 2: draft the report from the notes of the research step.

    Reads ``topic`` and ``notes_md`` from ``state``, asks the LLM for a
    structured markdown report and stores it under ``report_md``.

    Returns:
        The same ``state`` dict, updated in place.
    """
    subject = state["topic"]
    research_notes = state["notes_md"]

    prompt = f"""
你是一名技术写作者。请基于下方“研究笔记”为主题“{subject}”撰写一篇**结构化 Markdown 报告**。
要求：
- 结构示例：# 概览 / ## 早期发展 / ## 1990s / ## 2000s / ## 2010s-2020s / ## 参考资料
- 内容必须**紧密依赖**研究笔记，不要编造
- 语言简洁，段落短小

--- 研究笔记 ---
{research_notes}
"""
    state["report_md"] = llm.complete(prompt).text
    return state

def _extract_json_block(text: str) -> dict:
    """从模型输出中尽量提取 JSON（稳健解析）"""
    # 先找 ```json ... ``` 包裹
    m = re.search(r"```json\s*(\{.*?\})\s*```", text, flags=re.S)
    if not m:
        # 退化：找第一个 { ... } 块
        m = re.search(r"(\{.*\})", text, flags=re.S)
    if not m:
        return {}
    try:
        return json.loads(m.group(1))
    except Exception:
        return {}

def step_review(state: dict, strict: bool = False) -> dict:
    """Step 3: have the LLM review the report and parse its JSON verdict.

    Args:
        state: Workflow state; ``report_md`` is read and ``review`` is written.
        strict: Currently unused.

    Returns:
        The same ``state`` dict with ``review`` set to a dict holding ``raw``
        (model output), ``parsed`` (decoded JSON or ``{}``), ``approved``,
        ``changes`` and ``summary``.
    """
    report_md = state["report_md"]
    prompt = f"""
你是审稿人。请仅以 JSON 方式给出审稿结论。
规则：
- 字段：approved (bool), summary (string), changes (string[])
- 若 approved=false，请给出 3-6 条具体修改建议
- 输出必须放在一个 ```json 块中，不要出现多余文字

--- 待审报告 ---
{_trim(report_md, 7000)}
"""
    review_raw = llm.complete(prompt).text
    parsed = _extract_json_block(review_raw)
    # Fault tolerance: a missing verdict counts as approved.
    verdict = parsed.get("approved", True)
    if isinstance(verdict, str):
        # bool("false") is True, so read textual negatives as a rejection.
        approved = verdict.strip().lower() not in {"", "false", "no", "0"}
    else:
        approved = bool(verdict)
    changes = parsed.get("changes", [])
    summary = parsed.get("summary", "OK")

    state["review"] = {
        "raw": review_raw,
        "parsed": parsed,
        "approved": approved,
        "changes": changes,
        "summary": summary,
    }
    return state

def step_revise_if_needed(state: dict, max_rounds: int = 1) -> dict:
    """Optionally revise the report according to the review, 0..N rounds.

    While the review is not approved and fewer than ``max_rounds`` rounds
    were done, the LLM rewrites the report from the requested changes and
    the notes, and the new draft is reviewed again.

    Returns:
        The state after the last round (or unchanged when no round ran).
    """
    completed = 0
    while True:
        if completed >= max_rounds or state["review"]["approved"]:
            return state

        feedback = state["review"]["changes"]
        draft = state["report_md"]
        subject = state["topic"]
        research_notes = state["notes_md"]

        prompt = f"""
根据以下审稿意见修订报告《{subject}》。必须遵循审稿条目逐条修改：
- 审稿意见：
{json.dumps(feedback, ensure_ascii=False, indent=2)}

限制：
- 仍需严格依赖“研究笔记”，不要编造
- 保持 Markdown 结构清晰

--- 研究笔记 ---
{research_notes}

--- 旧版报告 ---
{_trim(draft, 7000)}
"""
        state["report_md"] = llm.complete(prompt).text
        # Review the revised draft again.
        state = step_review(state)
        completed += 1

# --------------------- 顶层 orchestrator ---------------------

def run_explicit_workflow(user_request: str, revise_rounds: int = 1) -> dict:
    """Run research -> draft -> review -> optional revision, in that order.

    Args:
        user_request: Topic handed to the research step.
        revise_rounds: Upper bound passed to the revision step.

    Returns:
        The final state dict produced by the last step.
    """
    # 1) research
    progress = step_research(user_request)
    print("✅ Research done.")

    # 2) drafting
    progress = step_write_report(progress)
    print("✅ Draft written.")

    # 3) review
    progress = step_review(progress)
    print(f"✅ Review: approved={progress['review']['approved']}")

    # 4) revise for a few rounds when the review asks for it
    progress = step_revise_if_needed(progress, max_rounds=revise_rounds)
    print(f"✅ Final approved={progress['review']['approved']}")

    return progress

# --------------------- Example run ---------------------
# Runs the explicit workflow for one topic and prints a truncated report plus
# the parsed review.
if __name__ == "__main__":
    topic = "History of the World Wide Web and key 21st-century developments"
    final_state = run_explicit_workflow(topic, revise_rounds=1)

    print("\n" + "="*80)
    print("# 最终报告（截断展示）\n")
    # Show at most 4000 characters of the report.
    print(_trim(final_state["report_md"], 4000))
    print("\n# 审稿解析：", final_state["review"]["parsed"])


In [None]:
# 用 LlamaIndex 的 Workflow（仍是显式路径，不让模型挑工具）
# explicit_workflow_llamaindex.py  —— 显式编排，无 function-calling

from dotenv import load_dotenv
load_dotenv()

import os, json, re, requests
from typing import Union

from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.workflow import (
    Workflow, StartEvent, StopEvent, step, Event
)

# ================== Basic components ==================
# Local Qwen3-1.7B model on CPU, used by every workflow step below.
llm = HuggingFaceLLM(
    model_name     = r"Qwen/Qwen3-1.7B",
    tokenizer_name = r"Qwen/Qwen3-1.7B",
    context_window = 3900,
    max_new_tokens = 640,
    generate_kwargs={"temperature": 0.5, "top_k": 30, "top_p": 0.95},
    device_map     = "cpu",
)

# Tavily API key, read from the environment.
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")

def tavily_search(query: str, max_results: int = 6):
    """Run an "advanced" Tavily search over REST and return its result list.

    Args:
        query: Search query text.
        max_results: Value sent as ``max_results``.

    Returns:
        The ``results`` entry of the JSON response, or ``[]`` when absent.
    """
    request_body = dict(
        api_key=TAVILY_API_KEY,
        query=query,
        search_depth="advanced",
        max_results=max_results,
    )
    response = requests.post(
        "https://api.tavily.com/search", json=request_body, timeout=60
    )
    # Surface HTTP error statuses as exceptions.
    response.raise_for_status()
    body = response.json()
    return body.get("results", [])

def _extract_json_block(text: str) -> dict:
    m = re.search(r"```json\s*(\{.*?\})\s*```", text, flags=re.S) or re.search(r"(\{.*\})", text, flags=re.S)
    if not m:
        return {}
    try:
        return json.loads(m.group(1))
    except Exception:
        return {}

# ================== Event types (explicitly defined) ==================
class ResearchDone(Event):
    """Event returned by the research step and consumed by the write step."""
    topic: str        # topic taken from the start event
    sources_md: str   # markdown list built from the search results
    notes_md: str     # notes text produced by the LLM

class DraftWritten(Event):
    """Event carrying a report draft; returned by write/revise, consumed by review."""
    topic: str       # topic carried along unchanged
    notes_md: str    # research notes carried along unchanged
    report_md: str   # report text produced by the LLM

class ReviewApproved(Event):
    """Event returned by the review step when the report is approved."""
    topic: str
    notes_md: str
    report_md: str
    review: dict  # {"approved": True, "summary": "...", "changes": []}

class ReviewChangesNeeded(Event):
    """Event returned by the review step when changes are requested."""
    topic: str
    notes_md: str
    report_md: str
    review: dict  # {"approved": False, "summary": "...", "changes": [...]}

# ================== Workflow & Steps ==================
class wf(Workflow):
    """Explicit research -> write -> review (-> revise -> review ...) workflow.

    Routing is fixed by the event types: every step consumes one event type
    and returns the next one, so the model only generates text and never
    selects a tool.
    """
    # The flow is a single forward path, so no Context is needed to share data.
    # NOTE(review): revise -> review has no round limit; a model that never
    # approves keeps the loop going.
    # NOTE: ``llm.complete`` is a blocking call inside the async steps.

    @step
    async def research(self, ev: StartEvent) -> ResearchDone:
        """Search for the topic and let the LLM turn the hits into notes."""
        topic = ev.input["topic"]
        results = tavily_search(topic, max_results=6)
        source_lines = []
        for it in results:
            # Built outside the f-string: a backslash inside an f-string
            # expression is a SyntaxError before Python 3.12. ``or ''`` also
            # covers a hit whose content is present but null.
            snippet = (it.get('content') or '')[:240].replace('\n', ' ')
            source_lines.append(
                f"- [{it.get('title','(no title)')}]({it.get('url','')}) — {snippet}"
            )
        sources_md = "\n".join(source_lines)
        prompt = f"请基于下列资料为“{topic}”生成 6-10 条要点式 Markdown 笔记：\n{sources_md}"
        notes_md = llm.complete(prompt).text
        return ResearchDone(topic=topic, sources_md=sources_md, notes_md=notes_md)

    @step
    async def write(self, ev: ResearchDone) -> DraftWritten:
        """Write the first report draft from the research notes."""
        topic, notes_md = ev.topic, ev.notes_md
        prompt = f"基于研究笔记为“{topic}”写一篇结构化 Markdown 报告：\n{notes_md}"
        report_md = llm.complete(prompt).text
        return DraftWritten(topic=topic, notes_md=notes_md, report_md=report_md)

    @step
    async def review(self, ev: DraftWritten) -> Union[ReviewApproved, ReviewChangesNeeded]:
        """Ask the LLM for a JSON verdict and route on ``approved``."""
        report_md = ev.report_md
        prompt = f"""你是审稿人，仅以 JSON 回答：{{"approved": true/false, "summary": "...", "changes": ["..."]}}。
    报告：
    {report_md}
    """
        parsed = _extract_json_block(llm.complete(prompt).text)
        if not parsed:
            # Unparseable answer: fall back to an approving default.
            parsed = {"approved": True, "summary": "OK", "changes": []}

        verdict = parsed.get("approved", True)
        if isinstance(verdict, str):
            # bool("false") is True, so read textual negatives as a rejection.
            approved = verdict.strip().lower() not in {"", "false", "no", "0"}
        else:
            approved = bool(verdict)

        if approved:
            return ReviewApproved(
                topic=ev.topic, notes_md=ev.notes_md, report_md=report_md, review=parsed
            )
        return ReviewChangesNeeded(
            topic=ev.topic, notes_md=ev.notes_md, report_md=report_md, review=parsed
        )

    @step
    async def revise(self, ev: ReviewChangesNeeded) -> DraftWritten:
        """Revise one round according to the review, then go back to review."""
        changes = ev.review.get("changes", [])
        prompt = (
            "根据以下审稿意见修订报告（保持 Markdown 结构，逐条落实）：\n"
            f"{json.dumps(changes, ensure_ascii=False, indent=2)}\n"
            "---\n旧版：\n"
            f"{ev.report_md}"
        )
        new_report = llm.complete(prompt).text
        return DraftWritten(topic=ev.topic, notes_md=ev.notes_md, report_md=new_report)

    @step
    async def end(self, ev: ReviewApproved) -> StopEvent:
        """Terminate: return the final report together with the review."""
        return StopEvent(result={"report_md": ev.report_md, "review": ev.review})


# =============== Run and print ===============
import asyncio
from concurrent.futures import ThreadPoolExecutor


async def _run_wf_demo(topic: str) -> dict:
    """Run ``wf`` once for ``topic``, echo streamed events, return the result."""
    # ``run`` is an instance method, so the workflow is instantiated first.
    # timeout=None lifts the workflow time limit (CPU generation can be slow).
    workflow = wf(timeout=None)
    handler = workflow.run(input={"topic": topic})

    # Optional: print the event stream. ``stream_events`` is an async
    # generator, so it has to be consumed with ``async for``.
    async for ev in handler.stream_events():
        name = type(ev).__name__
        payload = getattr(ev, "result", None)
        if payload:
            print(f"[{name}] result keys: {list(payload) if isinstance(payload, dict) else str(payload)[:60]}")
        else:
            print(f"[{name}]")

    # Awaiting the handler yields StopEvent.result.
    return await handler


def _run_blocking(coro):
    """Run ``coro`` to completion from synchronous code.

    Uses ``asyncio.run`` directly when no event loop is running (plain
    script). Inside an already running loop (e.g. a notebook cell) the
    coroutine is run on its own loop in a worker thread instead, because
    ``asyncio.run`` cannot be nested.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        return asyncio.run(coro)
    with ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(asyncio.run, coro).result()


# In a notebook ``final = await _run_wf_demo(...)`` works as well.
final = _run_blocking(
    _run_wf_demo("History of the World Wide Web and 21st-century developments")
)
print("\n=== FINAL REPORT (snippet) ===\n")
print(final["report_md"][:2000])
print("\n=== REVIEW ===\n", json.dumps(final["review"], ensure_ascii=False, indent=2))


In [None]:
# file 

from pathlib import Path 

# Root folders of the two PDF data sets; layout: root/subdir/<data files>.
# NOTE(review): the RAG cell reads from "../.log/SimplePDF", which differs from
# the path below in both prefix and letter case - confirm which one is right.
pdf_root = r"./log/simplePDF"    # ~root/subdir/ ..data
imPDF_root = r"./log/ImagePDF"   # ~root/subdir/ ..data




In [None]:
from pathlib import Path
# ``__file__`` is not defined when this runs as a notebook cell, so fall back
# to the parent of the current working directory there.
# NOTE(review): the fallback assumes the notebook is started one level below
# the project root - confirm.
try:
    project_root = Path(__file__).parent.parent
except NameError:
    project_root = Path.cwd().parent

# Placeholders for the text-PDF sample (markdown / pdf); still to be filled in.
file_md = r""
file_pdf = r""


# Placeholders for the image-PDF sample (markdown / pdf); still to be filled in.
file_md_im = r""
file_pdf_im = r""




### 部署  llama_deploy