In [None]:
# zzb  需要  settings.yaml
from pathlib import Path
from zzb import build_injector, IngestService

# Build the dependency-injection container and resolve the ingestion service.
inj = build_injector()
svc = inj.get(IngestService)

# Markdown document to ingest.
mdp = r"D:\codespace\fhfeishi\raga\scripts\PatentParser\full.md"
# Fix: the path argument must point at the real file. The original passed
# Path("mdp") -- the literal string "mdp" -- instead of the `mdp` variable.
docs = svc.ingest_file(mdp, Path(mdp))
print("ingested:", [d.doc_id for d in docs])

# Show at most the first three entries the service reports as ingested.
print("list:", svc.list_ingested()[:3])



In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# api  

# Local Hugging Face hub cache that holds (or will receive) the model files.
local_dir = r"E:\local_models\huggingface\cache\hub"

# All constructor options gathered in one mapping, then unpacked into the wrapper.
_embed_options = {
    "model_name": "Qwen/Qwen3-Embedding-0.6B",
    "device": "cpu",                 # passed as a top-level argument
    "trust_remote_code": True,       # passed as a top-level argument
    "cache_folder": local_dir,
    # local_files_only=False allows network access
    "model_kwargs": {"local_files_only": False},
}
embeds = HuggingFaceEmbedding(**_embed_options)

In [None]:
import os, torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM



import sys, transformers, huggingface_hub, sentence_transformers
# Diagnostics: interpreter, library versions and the Hugging Face settings
# inherited from the environment (printed before they are overridden below).
print("py:", sys.executable)
print("tf:", transformers.__version__, "hub:", huggingface_hub.__version__, "st:", sentence_transformers.__version__)
print("HF_HOME:", os.getenv("HF_HOME"))
print("HF_ENDPOINT:", os.getenv("HF_ENDPOINT"))


# -- Cache / mirror environment (also fine on Windows) --
# NOTE(review): transformers and huggingface_hub are already imported at this
# point, so these variables may not affect them in this kernel -- confirm.
os.environ["HF_HOME"] = r"E:\local_models\huggingface\cache"
# os.environ["HF_HUB_OFFLINE"] = "0"
# os.environ["TRANSFORMERS_OFFLINE"] = "1"
# os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"  # silence the symlink warning
os.environ["HF_ENDPOINT"] = "https://mirrors.tuna.tsinghua.edu.cn/hugging-face/"


name_or_path = "Qwen/Qwen3-Embedding-0.6B"  # may also be a local directory (see option 2)
device = "cpu"  # or "cuda:0" when a GPU is available

# Fix: load from the hub cache (<HF_HOME>/hub), the same directory the other
# cells of this notebook pass as cache_folder. The original used HF_HOME itself,
# so files cached by those cells were never found.
hub_cache_dir = os.path.join(os.environ["HF_HOME"], "hub")

# Fix: a single switch for both loads. The original let the tokenizer go online
# (False) while forcing the model offline (True), so on an incomplete cache the
# model load failed right after the tokenizer download succeeded.
# Set to True once the cache is complete to run fully offline.
local_files_only = False

tokenizer = AutoTokenizer.from_pretrained(
    name_or_path, trust_remote_code=True, local_files_only=local_files_only,
    cache_dir=hub_cache_dir,
)
model = AutoModelForCausalLM.from_pretrained(
    name_or_path, trust_remote_code=True, local_files_only=local_files_only,
    cache_dir=hub_cache_dir,
).to(device).eval()


In [None]:
# test zzc
import os
from pathlib import Path
from zzc import build_injector, IngestService
from zzc.di import EmbeddingComponent  # 仅用于冒烟

# --- Environment variables ---
# PGM_* settings; presumably read by zzc when the injector is built -- verify.
# NOTE(review): they are set after `from zzc import ...`; confirm zzc does not
# read them at import time.
os.environ['PGM_EMBED_MODEL'] = "Qwen/Qwen3-Embedding-0.6B"
os.environ['PGM_DEVICE'] = "cpu"             # "cpu" is the setting for machines without a GPU
os.environ['PGM_LOCAL_FILES_ONLY'] = "0"        # "0" is meant for a first run with nothing cached yet
os.environ['PGM_CHROMA_DIR'] = "/data/pgm/chroma"
os.environ['PGM_CHROMA_COLLECTION'] = "pgm_collection"
os.environ['PGM_PERSIST_DIR'] = "/data/pgm/storage"
os.environ['PGM_SENT_WINDOW'] = "3"

inj = build_injector()

# Smoke test first: resolve only the embedding component to check that
# loading the model (cache / offline settings) works before ingesting anything.
_ = inj.get(EmbeddingComponent)

svc: IngestService = inj.get(IngestService)

# Path: must match the operating system this kernel runs on.
mdp = r"D:\codespace\fhfeishi\raga\scripts\PatentParser\full.md"
p = Path(mdp)
# Fail early with a readable message when the file is not reachable.
assert p.exists(), f"文件不存在：{p}（注意当前运行环境的操作系统路径）"

# The file name is passed as the first argument and the full path as the second.
docs = svc.ingest_file(p.name, p)
print([d.doc_id for d in docs])
print(svc.list_ingested()[:3])


In [None]:
# test   zzc
import os 

# Environment variables (PGM_* settings), set before zzc is imported below.
# Presumably read by zzc when the injector is built -- verify.
os.environ['PGM_EMBED_MODEL'] = "Qwen/Qwen3-Embedding-0.6B"
os.environ['PGM_DEVICE'] = "cuda:0"
os.environ['PGM_LOCAL_FILES_ONLY'] = "1"    
os.environ['PGM_CHROMA_DIR'] = "/data/pgm/chroma"
os.environ['PGM_CHROMA_COLLECTION'] = "pgm_collection"
os.environ['PGM_PERSIST_DIR'] = "/data/pgm/storage"
os.environ['PGM_SENT_WINDOW'] = "3"


from pathlib import Path
from zzc import build_injector, IngestService

# Build the container and resolve the ingestion service.
inj = build_injector()
svc: IngestService = inj.get(IngestService)

# Here the full path string is passed as the first argument, together with
# the same path as a Path object.
mdp = r"D:\codespace\fhfeishi\raga\scripts\PatentParser\full.md"
docs = svc.ingest_file(mdp, Path(mdp))
print([d.doc_id for d in docs])

# Show at most the first three ingested entries.
print(svc.list_ingested()[:3])

# Delete an ingested document (kept disabled).
# svc.delete("demo.txt")


In [None]:
from pathlib import Path 


# ollama


# Shared local Hugging Face hub cache.
model_cache_root = r"E:\local_models\huggingface\cache\hub"
# Checkpoints used below: Qwen/Qwen3-1.7B  Qwen/Qwen3-Embedding-0.6B  Qwen/Qwen3-Reranker-0.6B

# LlamaIndex building blocks: prompt template, chat LLM, reranker, embedding.
from llama_index.core.prompts import PromptTemplate
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# --- chat model ---
# A hub id is enough here; a local directory would work as well.
_chat_repo = r"Qwen/Qwen3-1.7B"
_generation_options = {"temperature": 0.7, "top_k": 30, "top_p": 0.95}
chat_model = HuggingFaceLLM(
    model_name=_chat_repo,
    tokenizer_name=_chat_repo,
    context_window=3900,
    max_new_tokens=640,
    generate_kwargs=_generation_options,
    device_map='cpu',
)

# --- reranker ---
reranker = SentenceTransformerRerank(
    model="Qwen/Qwen3-Reranker-0.6B",
    top_n=4,
    trust_remote_code=True,
    device='cpu',
)

# --- embedding ---
# Loaded by hub id from the cache above; a local directory path could be
# passed as model_name instead.
_embedding_options = dict(
    model_name="Qwen/Qwen3-Embedding-0.6B",
    cache_folder=model_cache_root,
    max_length=1024,
    trust_remote_code=True,
    # local_files_only: False allows network access, True forbids it
    model_kwargs={"local_files_only": True},
    device='cpu',
)
embedding = HuggingFaceEmbedding(**_embedding_options)


In [None]:
# Requires transformers>=4.51.0   qwen-official
import torch
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM

def format_instruction(instruction, query, doc):
    """Render one (instruction, query, document) triple as reranker input text.

    Args:
        instruction: Task description; ``None`` selects the default web-search task.
        query: The search query.
        doc: The candidate document.

    Returns:
        A three-line string with ``<Instruct>``, ``<Query>`` and ``<Document>`` fields.
    """
    task_text = (
        'Given a web search query, retrieve relevant passages that answer the query'
        if instruction is None
        else instruction
    )
    return f"<Instruct>: {task_text}\n<Query>: {query}\n<Document>: {doc}"

def process_inputs(pairs):
    """Tokenize formatted pairs, wrap them in the chat template and pad the batch.

    Relies on the module-level ``tokenizer``, ``model``, ``max_length``,
    ``prefix_tokens`` and ``suffix_tokens``.

    Args:
        pairs: Strings produced by ``format_instruction``.

    Returns:
        The padded encoding with every entry moved to ``model.device``.
    """
    # Leave room for the template tokens that are added around each pair.
    body_budget = max_length - len(prefix_tokens) - len(suffix_tokens)
    inputs = tokenizer(
        pairs,
        padding=False,
        truncation='longest_first',
        return_attention_mask=False,
        max_length=body_budget,
    )

    # Surround every tokenized pair with the prefix / suffix token ids, in place.
    id_rows = inputs['input_ids']
    for row in range(len(id_rows)):
        id_rows[row] = prefix_tokens + id_rows[row] + suffix_tokens

    inputs = tokenizer.pad(inputs, padding=True, return_tensors="pt", max_length=max_length)
    for name in list(inputs.keys()):
        inputs[name] = inputs[name].to(model.device)
    return inputs

@torch.no_grad()
def compute_logits(inputs, **kwargs):
    """Score each row of ``inputs`` as the probability of "yes" versus "no".

    Relies on the module-level ``model``, ``token_true_id`` and ``token_false_id``.

    Args:
        inputs: Keyword arguments for ``model`` (as returned by ``process_inputs``).
        **kwargs: Accepted but not used.

    Returns:
        A list with one float per row.
    """
    # Only the logits at the last sequence position are used.
    last_position = model(**inputs).logits[:, -1, :]
    # Column 0 = "no" logit, column 1 = "yes" logit.
    no_yes = torch.stack(
        [last_position[:, token_false_id], last_position[:, token_true_id]],
        dim=1,
    )
    log_probs = torch.nn.functional.log_softmax(no_yes, dim=1)
    return log_probs[:, 1].exp().tolist()

# Module-level state read by process_inputs() and compute_logits() above:
# tokenizer, model, token ids for "yes"/"no", max_length and the template tokens.
# The tokenizer pads on the left; compute_logits() reads position -1 of each row.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-Reranker-0.6B", padding_side='left')
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-Reranker-0.6B").eval()
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-Reranker-0.6B", torch_dtype=torch.float16, attn_implementation="flash_attention_2").cuda().eval()
# Vocabulary ids of the two answer tokens compared by compute_logits().
token_false_id = tokenizer.convert_tokens_to_ids("no")
token_true_id = tokenizer.convert_tokens_to_ids("yes")
# Upper bound on the token count of one formatted input, template included.
max_length = 8192

# Chat template placed around every formatted pair: a system prompt asking for a
# yes/no judgement, followed by the assistant turn with an empty think block.
prefix = "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n<|im_start|>user\n"
suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
# Template token ids, encoded once without extra special tokens.
prefix_tokens = tokenizer.encode(prefix, add_special_tokens=False)
suffix_tokens = tokenizer.encode(suffix, add_special_tokens=False)
        
task = 'Given a web search query, retrieve relevant passages that answer the query'

# queries[i] is scored against documents[i] (paired with zip below).
queries = ["What is the capital of China?",
    "Explain gravity",
]

documents = [
    "The capital of China is Beijing.",
    "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
]

pairs = [format_instruction(task, query, doc) for query, doc in zip(queries, documents)]

# Tokenize the input texts
inputs = process_inputs(pairs)
# One "yes" probability per pair.
scores = compute_logits(inputs)

print("scores: ", scores)


In [None]:
# qwen3_rerank_official.py
# pip install "transformers>=4.51.0" torch

"""  
Qwen3-Reranker-0.6B 官方是按 CausalLM + yes/no 概率来打分的，
不是 AutoModelForSequenceClassification 那一套。因此我们前面用分类头取 logits 会得到乱序分数。

"""

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

def format_instruction(instruction, query, doc):
    """Build the reranker input text for one query/document pair.

    Args:
        instruction: Task description; ``None`` falls back to the default
            web-search retrieval task.
        query: The search query.
        doc: The candidate passage.

    Returns:
        The ``<Instruct>`` / ``<Query>`` / ``<Document>`` text, one field per line.
    """
    template = "<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {doc}"
    task_text = instruction
    if task_text is None:
        task_text = "Given a web search query, retrieve relevant passages that answer the query"
    return template.format(instruction=task_text, query=query, doc=doc)

def process_inputs(tokenizer, pairs, max_length=8192, prefix=None, suffix=None):
    """Tokenize formatted pairs, wrap each in the chat template and pad the batch.

    Args:
        tokenizer: Tokenizer used for encoding and padding.
        pairs: Strings produced by ``format_instruction``.
        max_length: Token budget for one full input, template included.
        prefix: Text placed before each pair; ``None`` selects the default template.
        suffix: Text placed after each pair; ``None`` selects the default template.

    Returns:
        Whatever ``tokenizer.pad`` returns for the wrapped batch (PyTorch tensors
        are requested).
    """
    # Default template (taken from the model card).
    default_prefix = "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n<|im_start|>user\n"
    default_suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"

    head_ids = tokenizer.encode(
        default_prefix if prefix is None else prefix, add_special_tokens=False
    )
    tail_ids = tokenizer.encode(
        default_suffix if suffix is None else suffix, add_special_tokens=False
    )

    # The pair itself may only use what is left after the template tokens.
    body_budget = max_length - len(head_ids) - len(tail_ids)
    enc = tokenizer(
        pairs,
        padding=False,
        truncation="longest_first",
        return_attention_mask=False,
        max_length=body_budget,
    )

    # Attach prefix / suffix ids by hand (in place); padding happens afterwards.
    id_rows = enc["input_ids"]
    for row in range(len(id_rows)):
        id_rows[row] = head_ids + id_rows[row] + tail_ids

    return tokenizer.pad(enc, padding=True, return_tensors="pt", max_length=max_length)

@torch.no_grad()
def compute_scores(model, tokenizer, inputs):
    """Return, per input row, the probability of "yes" versus "no".

    Args:
        model: Callable model exposing ``.device`` and returning an object with ``.logits``.
        tokenizer: Provides ``convert_tokens_to_ids`` for the "yes" / "no" tokens.
        inputs: Mapping of tensors passed to the model as keyword arguments.

    Returns:
        A list of floats, one per row.
    """
    on_device = {name: tensor.to(model.device) for name, tensor in inputs.items()}
    # Only the logits at the last sequence position matter: shape [N, vocab].
    final_logits = model(**on_device).logits[:, -1, :]

    yes_id = tokenizer.convert_tokens_to_ids("yes")
    no_id = tokenizer.convert_tokens_to_ids("no")

    # Two-way softmax over (no, yes); column 1 is the "yes" probability.
    no_yes = torch.stack([final_logits[:, no_id], final_logits[:, yes_id]], dim=1)
    return torch.softmax(no_yes, dim=1)[:, 1].cpu().tolist()

if __name__ == "__main__":
    # Demo: rank four candidate passages against one query.
    query = "什么是 LIPM（线性倒立摆模型），它在仿人行走里有什么作用？"
    docs = [
        "Qwen3 是一个大语言模型系列，与机器人动力学无关。",
        "LIPM（Linear Inverted Pendulum Model）将质心视为在常高平面上运动的倒立摆，常用于人形/双足步态规划与控制。",
        "SGLang 专注推理加速与 KV cache 管理，不涉及步态物理建模。",
        "在仿人机器人中，LIPM 常用于近似 ZMP 约束，从而生成可行的足底支持多边形内的质心轨迹。"
    ]

    # Left padding is requested for the tokenizer; append .cuda() to the model on a GPU machine.
    reranker_repo = "Qwen/Qwen3-Reranker-0.6B"
    tokenizer = AutoTokenizer.from_pretrained(reranker_repo, padding_side="left")
    model = AutoModelForCausalLM.from_pretrained(reranker_repo).eval()

    # format -> tokenize -> score; a larger score means more relevant.
    pairs = [format_instruction(None, query, passage) for passage in docs]
    inputs = process_inputs(tokenizer, pairs)
    scores = compute_scores(model, tokenizer, inputs)

    # Highest score first.
    ranked = list(zip(docs, scores))
    ranked.sort(key=lambda item: item[1], reverse=True)

    print("=== Qwen3 官方打分（高->低）===")
    for rank, (text, prob) in enumerate(ranked, start=1):
        print(f"[{rank}] score={prob:.4f} | {text}")


In [None]:
# demo_rerank_llamaindex.py
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.schema import TextNode
from llama_index.core import QueryBundle

# Node postprocessors consume and return NodeWithScore objects.
from llama_index.core.schema import NodeWithScore

# 1) Query and candidate passages
query = "什么是 LIPM（线性倒立摆模型），它在仿人行走里有什么作用？"
candidates = [
    "Qwen3 是一个大语言模型系列，与机器人动力学无关。",
    "LIPM（Linear Inverted Pendulum Model）将质心视为在常高平面上运动的倒立摆，常用于人形/双足步态规划与控制。",
    "SGLang 专注推理加速与 KV cache 管理，不涉及步态物理建模。",
    "在仿人机器人中，LIPM 常用于近似 ZMP 约束，从而生成可行的足底支持多边形内的质心轨迹。"
]

# Build the nodes.
# Fix: the reranker reads `node.node` and writes `node.score`, so each TextNode
# must be wrapped in a NodeWithScore; bare TextNodes raise AttributeError.
nodes = [NodeWithScore(node=TextNode(text=txt)) for txt in candidates]

# 2) Initialise the reranker
# NOTE(review): another cell of this notebook states that Qwen3-Reranker is
# officially scored as a CausalLM via yes/no probabilities; scores from this
# wrapper may therefore not be meaningful -- confirm.
reranker = SentenceTransformerRerank(
    model="Qwen/Qwen3-Reranker-0.6B",
    top_n=4,
    device="cpu",  # switch to "cuda" when a GPU is available
    keep_retrieval_score=False,
    trust_remote_code=True,
)

# 3) Synchronous API: postprocess_nodes (not apostprocess_nodes)
reranked = reranker.postprocess_nodes(
    nodes=nodes,
    query_bundle=QueryBundle(query)
)

# 4) Print the results
print("\n=== LlamaIndex Rerank 结果（高->低）===")
for i, node in enumerate(reranked, 1):
    score = getattr(node, 'score', None)
    if score is not None:
        print(f"[{i}] score={score:.4f} | {node.get_content()}")
    else:
        print(f"[{i}] score=None | {node.get_content()}")

In [None]:
# pip install -U sentence-transformers "llama-index-core>=0.10.64" transformers torch

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"   # 降低线程开销（笔记本/CPU友好）
os.environ["OMP_NUM_THREADS"] = "1"              # 可选
os.environ["MKL_NUM_THREADS"] = "1"              # 可选

from sentence_transformers import CrossEncoder
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.schema import TextNode
from llama_index.core import QueryBundle

# Node postprocessors consume and return NodeWithScore objects.
from llama_index.core.schema import NodeWithScore

# 1) A lighter reranker (good for validating the pipeline first)
MODEL_NAME = "BAAI/bge-reranker-base"  # alternative: "cross-encoder/ms-marco-MiniLM-L-6-v2"

# 2) LlamaIndex reranker
# Fix: SentenceTransformerRerank takes the model *name* and builds its own
# CrossEncoder; the original passed a CrossEncoder instance as `model`.
# The separate CrossEncoder (and its max_length=256 cap) is therefore dropped;
# memory is kept in check by trimming the texts below instead.
reranker = SentenceTransformerRerank(model=MODEL_NAME, top_n=3, device="cpu")

# 3) Prepare the data (keep texts short to avoid running out of memory)
query = "什么是 LIPM（线性倒立摆模型），它在仿人行走里有什么作用？"
docs = [
    "Qwen3 是一个大语言模型系列，与机器人动力学无关。",
    "LIPM（Linear Inverted Pendulum Model）将质心视为在常高平面上运动的倒立摆，常用于人形/双足步态规划与控制。",
    "SGLang 专注推理加速与 KV cache 管理，不涉及步态物理建模。",
    "在仿人机器人中，LIPM 常用于近似 ZMP 约束，从而生成可行的足底支持多边形内的质心轨迹。"
]
# Fix: wrap each TextNode in NodeWithScore, which is what the reranker expects.
nodes = [NodeWithScore(node=TextNode(text=t[:800])) for t in docs]  # texts trimmed to 800 characters

# 4) Rerank
# Fix: the public entry point is postprocess_nodes; `postprocess` does not exist.
out = reranker.postprocess_nodes(nodes, QueryBundle(query))

print("=== LlamaIndex + SentenceTransformerRerank（高->低）===")
for i, n in enumerate(out, 1):
    score = float(getattr(n, "score", 0.0))
    print(f"[{i}] score={score:.4f} | {n.get_content()}")


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# One query scored against four candidate passages.
query = "什么是 LIPM（线性倒立摆模型），它在仿人行走里有什么作用？"
docs = [
    "Qwen3 是一个大语言模型系列，与机器人动力学无关。",
    "LIPM（Linear Inverted Pendulum Model）将质心视为在常高平面上运动的倒立摆，常用于人形/双足步态规划与控制。",
    "SGLang 专注推理加速与 KV cache 管理，不涉及步态物理建模。",
    "在仿人机器人中，LIPM 常用于近似 ZMP 约束，从而生成可行的足底支持多边形内的质心轨迹。"
]

# Tokenizer and sequence-classification model come from the same checkpoint.
seq_cls_repo = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
tok = AutoTokenizer.from_pretrained(seq_cls_repo)
model = AutoModelForSequenceClassification.from_pretrained(seq_cls_repo).eval()

# Batched padding needs a pad token; fall back to EOS when none is defined.
if tok.pad_token is None:
    tok.pad_token = tok.eos_token
model.config.pad_token_id = tok.pad_token_id

# Encode (query, doc) pairs as one padded batch and run a single forward pass.
batch = tok([query] * len(docs), docs, padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    logits = model(**batch).logits
scores = logits.view(-1).tolist()  # a larger score means more relevant

# Highest score first.
ranked = list(zip(docs, scores))
ranked.sort(key=lambda item: item[1], reverse=True)

print("=== seq-cls 版打分（高->低）===")
for i, (t, s) in enumerate(ranked, start=1):
    print(f"[{i}] score={s:.4f} | {t}")


In [None]:
# demo_rerank_transformers_fixed2.py
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

def ensure_pad_token(tok, model):
    """Make sure the tokenizer has a pad token and the model config knows its id.

    When no pad token is set, the EOS token is reused; if there is no EOS token
    either, a new "[PAD]" token is added and the model embeddings are resized to
    the new vocabulary size. The tokenizer is always switched to right padding.

    Args:
        tok: Tokenizer, modified in place.
        model: Model whose ``config.pad_token_id`` is updated in place.
    """
    pad_missing = tok.pad_token is None
    if pad_missing and tok.eos_token is not None:
        # Reuse EOS as the padding token.
        tok.pad_token = tok.eos_token
    elif pad_missing:
        # No EOS either: register a dedicated pad token and grow the embeddings.
        tok.add_special_tokens({"pad_token": "[PAD]"})
        model.resize_token_embeddings(len(tok))

    model.config.pad_token_id = tok.pad_token_id
    tok.padding_side = "right"

# One query scored against four candidate passages.
query = "什么是 LIPM（线性倒立摆模型），它在仿人行走里有什么作用？"
passages = [
    "Qwen3 是一个大语言模型系列，与机器人动力学无关。",
    "LIPM（Linear Inverted Pendulum Model）将质心视为在常高平面上运动的倒立摆，常用于人形/双足步态规划与控制。",
    "SGLang 专注推理加速与 KV cache 管理，不涉及步态物理建模。",
    "在仿人机器人中，LIPM 常用于近似 ZMP 约束，从而生成可行的足底支持多边形内的质心轨迹。"
]

device = torch.device("cpu")  # switch to "cuda" when a GPU is available
# NOTE(review): another cell of this notebook states that this checkpoint is
# officially scored as a CausalLM (yes/no probabilities) and that the
# classification head yields unordered scores -- confirm the results are meaningful.
tok = AutoTokenizer.from_pretrained("Qwen/Qwen3-Reranker-0.6B", trust_remote_code=True)
model = AutoModelForSequenceClassification.from_pretrained(
    "Qwen/Qwen3-Reranker-0.6B", trust_remote_code=True
).to(device).eval()

# Batched padding requires a pad token on both tokenizer and model config.
ensure_pad_token(tok, model)

# The query is repeated so that every passage forms one (query, passage) pair.
pairs_q = [query] * len(passages)
pairs_p = passages

inputs = tok(
    pairs_q,
    pairs_p,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
).to(device)

with torch.no_grad():
    out = model(**inputs)
    logits = out.logits
    # Fix: flattening with view(-1) is only right for a single-logit head ([N, 1]).
    # With a multi-label head ([N, C]) it yields N*C values and zip() below would
    # silently pair passages with the wrong numbers; take the positive-class
    # column instead, as the per-document scoring cell of this notebook does.
    if logits.shape[-1] == 1:
        scores = logits.view(-1).cpu().tolist()
    else:
        scores = logits[:, 1].cpu().tolist()

# Exactly one score per passage, otherwise the ranking below is meaningless.
assert len(scores) == len(passages)

ranked = sorted(zip(passages, scores), key=lambda x: x[1], reverse=True)

print("\n=== Transformers 直算得分（高->低）===")
for i, (text, s) in enumerate(ranked, 1):
    print(f"[{i}] score={s:.4f} | {text}")


In [None]:
#  还可以吧 qwen

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from typing import List, Tuple

# Pick the device: GPU when available, CPU otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Checkpoint name
model_name = "Qwen/Qwen3-Reranker-0.6B"

# Load tokenizer and model; bfloat16 on GPU, float32 on CPU.
tokenizer = AutoTokenizer.from_pretrained(model_name)
load_dtype = torch.bfloat16 if device == "cuda" else torch.float32
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, torch_dtype=load_dtype
).to(device)

# Example data
query = "人工智能的发展趋势"
candidates = [
    "人工智能正在改变世界，深度学习、大模型推动技术进步。",
    "苹果公司发布了新款 iPhone，性能更强，摄像头更清晰。",
    "机器学习和神经网络在自然语言处理中广泛应用。",
    "天气预报说明天有雨，记得带伞。",
    "大模型如 Qwen、LLaMA 正在推动 AI 代理的发展。",
]

print("\nQuery:", query)
print("\nCandidates:")
for i, cand in enumerate(candidates):
    print(f"[{i}] {cand}")

# Score every candidate with its own forward pass.
scores = []
inputs_for_model = []

with torch.no_grad():  # inference only, no gradients
    for doc in candidates:
        # Join query and document into one text.
        text = f"Query: {query}\nDoc: {doc}"

        # Tokenize without padding (one sequence per forward pass).
        encoded = tokenizer(
            text,
            padding=False,
            truncation=True,
            return_tensors="pt",
            max_length=8192,
        ).to(device)

        # Forward pass
        outputs = model(**encoded)
        logits = outputs.logits

        # Single-logit head: use that value; otherwise use the class-1 logit.
        score = logits.item() if logits.shape[-1] == 1 else logits[0, 1].item()

        scores.append(score)
        inputs_for_model.append(encoded)

# Rank candidate indices by score, highest first.
ranked = list(enumerate(scores))
ranked.sort(key=lambda item: item[1], reverse=True)

# Print the ranking; each candidate is previewed with at most 100 characters.
divider = "=" * 50
print("\n" + divider)
print("🔍 Reranking Results (Higher score = more relevant)")
print(divider)
for rank, (idx, score) in enumerate(ranked, 1):
    preview = candidates[idx]
    tail = '...' if len(preview) > 100 else ''
    print(f"Rank {rank}: [Score: {score:+.3f}]")
    print(f"  {preview[:100]}{tail}")

In [None]:
"""
验证 bge-reranker-base 能否正常打分并重排
"""

from sentence_transformers import CrossEncoder
from llama_index.core.schema import TextNode
from llama_index.core.postprocessor import SentenceTransformerRerank

# Node postprocessors consume and return NodeWithScore objects.
from llama_index.core.schema import NodeWithScore

# 0. A few made-up candidate passages
query = "中国的首都在哪？"
candidates = [
    "北京是中国的首都，也是政治中心。",
    "上海是中国最大的经济中心。",
    "广州位于南方，气候温暖。",
    "北京有故宫、天安门等著名景点。",
]

# maidalun/bce-reranker-base_v1  tuned for Chinese
# BAAI/bge-reranker-base         Chinese + English
# BAAI/bge-reranker-large        large variant


# 1. Raw scores straight from sentence-transformers
model = CrossEncoder("BAAI/bge-reranker-base", device="cpu")
pairs = [(query, c) for c in candidates]
raw_scores = model.predict(pairs)
print("--- 原始打分 ---")
for c, s in zip(candidates, raw_scores):
    print(f"{s:.4f}  {c}")

# 2. Same model through the llama-index reranker wrapper
# Fix: the reranker reads `node.node` and writes `node.score`, so each TextNode
# must be wrapped in a NodeWithScore; bare TextNodes raise AttributeError.
nodes = [NodeWithScore(node=TextNode(text=c)) for c in candidates]
reranker = SentenceTransformerRerank(
    model="BAAI/bge-reranker-base",
    top_n=3,           # keep only the top 3
    device="cpu"
)
ranked_nodes = reranker.postprocess_nodes(
    nodes=nodes,
    query_str=query
)

print("\n--- llama-index 重排后（top_n=3）---")
for i, n in enumerate(ranked_nodes, 1):
    print(f"{i}. {n.score:.4f}  {n.text}")