## Model

In [1]:
import torch
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from modelscope import AutoTokenizer, AutoModelForCausalLM

model_id = 'Qwen/Qwen3-8B'
# model_id = 'Qwen/Qwen3-0.6B'
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
qwen_model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, torch_dtype=torch.float16).cuda()

  from .autonotebook import tqdm as notebook_tqdm


Downloading Model from https://www.modelscope.cn to directory: D:\modelscope\models\Qwen\Qwen3-8B




Downloading Model from https://www.modelscope.cn to directory: D:\modelscope\models\Qwen\Qwen3-8B


Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 5/5 [00:16<00:00,  3.31s/it]


In [23]:
# def qwen_llm(prompt):
#     inputs = tokenizer(prompt, return_tensors='pt').to('cuda')
#     outputs = model.generate(**inputs, max_new_tokens=32768)
#     return tokenizer.decode(outputs[0], skip_special_tokens=True)

def qwen_llm(prompt_str: str) -> str:
    print("Qwen...")
    if hasattr(prompt_str, 'to_string'):
        prompt_str = prompt_str.to_string()
    # ✅ 确保输入是字符串
    assert isinstance(prompt_str, str), f"Expected string, got {type(prompt_str)}"
    messages = [
        {"role": "user", "content": prompt_str}
    ]
    text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=True
        )
    inputs = tokenizer([text], return_tensors='pt').to(qwen_model.device)
    outputs = qwen_model.generate(**inputs, max_new_tokens=32768)  
    # return tokenizer.decode(outputs[0], skip_special_tokens=True).split('<think>\n\n</think>\n\n')[-1]
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

## Vector Store

In [18]:
from pymilvus import connections, Collection
connections.connect("default", host="localhost", port="19530")
col_name = "AllBiddings"

col = Collection(col_name)
col.load()

## Embedding Model

In [4]:
from FlagEmbedding import BGEM3FlagModel

model = BGEM3FlagModel('BAAI/bge-m3',
                      use_fp16=False,
                      pooling_method='cls',
                      devices=['cuda:0'])

def get_embeddings(text):
    embeddings = model.encode(
        text,
        return_dense=True,
        return_sparse=True,
        return_colbert_vecs=False
    )
    return embeddings

Fetching 30 files: 100%|███████████████████████████████████████████████████████████████████████| 30/30 [00:00<?, ?it/s]


## Retriever

In [21]:
from pymilvus import AnnSearchRequest, WeightedRanker

def hybrid_search(
    col,
    query_dense_embedding,
    query_sparse_embedding,
    sparse_weight=1.0,
    dense_weight=1.0,
    limit=5,
):
    print("Searching...")
    dense_req = AnnSearchRequest(
        [query_dense_embedding], "dense_vector", {"metric_type": "L2", "params": {}}, limit=limit
    )
    sparse_req = AnnSearchRequest(
        [query_sparse_embedding], "sparse_vector", {"metric_type": "IP", "params": {}}, limit=limit
    )
    rerank = WeightedRanker(sparse_weight, dense_weight)
    res = col.hybrid_search(
        [sparse_req, dense_req],
        rerank=rerank,
        limit=limit,
        output_fields=["text"]
    )
    return [
        {"text": hit.entity.get("text")}
        for hit in res[0]
    ]

def hybrid_search_pipeline(query):
    # 在这里调用你的 embedding 模型（比如 bge-m3）
    print("Embedding...")
    query_embeddings = get_embeddings([query])
    query_dense_embeddings = query_embeddings['dense_vecs'][0]
    query_sparse_embeddings = query_embeddings.get('lexical_weights')[0]

    # 调用原始的搜索方法
    return hybrid_search(
        col,
        query_dense_embeddings,
        query_sparse_embeddings,
        sparse_weight=1.0,
        dense_weight=1.0,
        limit=50
    )


## Reranker

In [6]:
from pymilvus.model.reranker import BGERerankFunction

bge_rf = BGERerankFunction(
    model_name="BAAI/bge-reranker-v2-m3",  # Specify the model name. Defaults to `BAAI/bge-reranker-v2-m3`.
    device="cuda:0" # Specify the device to use, e.g., 'cpu' or 'cuda:0'
)


In [24]:
def rerank(results):
    print("Reranking...")
    documents = [p['text'] for p in results]
    rerank_results = bge_rf(query=query, documents=documents, top_k=1)
    return rerank_results

## Chain

In [22]:
PROMPT_TEMPLATE = """
Human: You are an AI assistant, and provides answers to questions by using fact based and statistical information when possible.
Use the following pieces of information to provide a concise answer to the question enclosed in <question> tags.
If users ask logical question rather than contexual question, you 'd better provide the url of project to avoid protential mistake.
If the question is an inference question, you need to inferent step by step.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
<context>
{context}
</context>

<question>
{question}
</question>

The response should be specific and use statistics or numbers when possible. 
At last you may remind user to get more info from provided url.
Answer in Chinese.

Assistant:"""

# Create a PromptTemplate instance with the defined template and input variables
prompt = PromptTemplate(
    template=PROMPT_TEMPLATE, input_variables=["context", "question"]
)


# Define a function to format the retrieved documents
def format_docs(docs):
    return "\n\n".join(doc.text for doc in docs)

In [23]:
from langchain_core.runnables import RunnableLambda

context_chain = RunnableLambda(hybrid_search_pipeline) | RunnableLambda(rerank) | RunnableLambda(format_docs)
llm = RunnableLambda(qwen_llm)

## Query

In [19]:
query = input("Enter your search query: ")
# query_embeddings = get_embeddings([query])

Enter your search query:  最新三条政府采购信息是什么？


In [24]:
rag_chain = (
    {"context": context_chain, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)


# res = rag_chain.invoke(query)
# res
for s in rag_chain.stream(query):
    print(s, end="", flush=True)

Embedding...
Searching...
Reranking...
Qwen...
user

Human: You are an AI assistant, and provides answers to questions by using fact based and statistical information when possible.
Use the following pieces of information to provide a concise answer to the question enclosed in <question> tags.
If users ask logical question rather than contexual question, you 'd better provide the url of project to avoid protential mistake.
If the question is an inference question, you need to inferent step by step.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
<context>
项目编号：310107000250320195232-07224789。公告标题：采购悦心亭心理服务亭软件三期项目的中标（成交）结果公告。项目名称：采购悦心亭心理服务亭软件三期项目。采购项目子编号：1。标项名称：采购悦心亭心理服务亭软件三期项目。代理机构名称：上海市普陀区政府采购中心。代理机构代码：12310107764741781B。中标金额：1630000.00。中标供应商名称：上海园成医疗器械有限公司。中标供应商地址：上海市宝山区逸仙路2816号1幢9层A0902室。得分：85.27。链接：https://www.shggzy.com/jyxxzcgs/8231765?cExt=eyJhbGciOiJIUzI1NiJ9.eyJwYXRoIjoiL2p5eHh6YyIsInBhZ2VObyI6MSwiZXhwIjoxNzQ4MjY1NzgwMTIzfQ.U0z7uqmiYY

In [30]:
from google import genai

client = genai.Client(api_key="AIzaSyC_Zr5GS7vQNxOP8UNHeuPteCWHMR8QlVI")

response = client.models.generate_content(
    model="gemini-2.0-flash", contents="Explain how AI works in a few words"
)
print(response.text)

AI learns patterns from data to make predictions or decisions.



In [None]:
import jieba

def doc_text_formatting(query, docs):
    query_words = list(set(jieba.lcut(query)))  # 中文分词
    formatted_texts = []

    for doc in docs:
        try:
            highlighted = doc.get('text')
        except:
            highlighted = doc.text
        for word in query_words:
            if not word.strip():
                continue
            highlighted = re.sub(
                re.escape(word),
                f"<span style='color:red'>{word}</span>",
                highlighted
            )
        formatted_texts.append(highlighted)
    return formatted_texts


In [None]:
query_dense_embeddings = query_embeddings['dense_vecs']
query_sparse_embeddings = query_embeddings.get('lexical_weights')

In [None]:
hybrid_results = hybrid_search(
        col,
        query_dense_embeddings[0],
        query_sparse_embeddings[0],
        sparse_weight=1.0,
        dense_weight=1.0,
        limit=50
    )

In [None]:
import re
from IPython.display import Markdown, display

display(Markdown("### 🔄 **Hybrid Search Results:**"))
formatted_results = doc_text_formatting(query, hybrid_results)
for i, result in enumerate(formatted_results):
    display(Markdown(f"para_id: {hybrid_results[i].get('para_id')}"))
    display(Markdown(result))

In [None]:
hybrid_rerank_results = rerank(hybrid_results)

In [None]:
hybrid_rerank_results

In [None]:
display(Markdown("### 🔄 **Reranked Hybrid Search Results:**"))
formatted_results = doc_text_formatting(query, hybrid_rerank_results)
for i, result in enumerate(formatted_results):
    display(Markdown(f"para_id: {hybrid_results[i].get('para_id')}"))
    display(Markdown(result))

In [None]:
prompt

In [None]:
rag_sub_chain = (
    {"context": RunnableLambda(hybrid_results) | RunnableLambda(format_docs), "question": RunnablePassthrough()}
    | prompt
    | StrOutputParser()
)