In [2]:
import os

# Switch into the project root so the package import and the relative data
# path used by later cells resolve.
# Raw string literal: the previous non-raw form relied on "\学", "\p" etc.
# being *unrecognized* escape sequences, which Python only tolerates with a
# DeprecationWarning/SyntaxWarning. The resulting path is unchanged.
# NOTE(review): machine-specific absolute path — consider making it configurable.
os.chdir(r'E:\学习\python\py_codbase\PK_LLM')

# Read the key from the environment and fail with an actionable message when
# it is missing (assigning None into os.environ raises an opaque TypeError).
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
if OPENAI_API_KEY is None:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before running this notebook."
    )
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [3]:
from PK_LLM_endfront.D_retriever.retrivers import load_VBs

In [4]:
# load_VBs() yields two objects; d1 is later turned into a retriever via
# .as_retriever() — presumably both are vector stores (TODO confirm).
d1, d2 = load_VBs()

In [5]:
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

# RAG-Fusion: Related
template = """You are a helpful assistant that generates multiple search queries based on a single input query. \n
Generate multiple search queries related to: {question} \n
Output (4 queries):"""
prompt_rag_fusion = ChatPromptTemplate.from_template(template)


def _split_queries(llm_text: str) -> list[str]:
    """Split the model's reply into one search query per line.

    Blank or whitespace-only lines are dropped and surrounding whitespace is
    trimmed, so that an answer formatted with empty separator lines does not
    produce empty queries for the downstream retriever.

    Args:
        llm_text: raw text returned by the chat model.

    Returns:
        The non-empty lines of ``llm_text``, in their original order.
    """
    return [line.strip() for line in llm_text.split("\n") if line.strip()]


# prompt -> chat model -> plain string -> list of query strings.
# A plain function on the right-hand side of "|" is wrapped as a runnable,
# exactly like the lambda that was used here before.
generate_queries = (
    prompt_rag_fusion
    | ChatOpenAI(temperature=0)
    | StrOutputParser()
    | _split_queries
)

In [11]:
from PK_LLM_endfront.evaluation.eva_funcs import DataLoader

# Raw string literal: the previous literal mixed unrecognized escapes ("\Q",
# "\g", which trigger an invalid-escape warning) with a hand-escaped "\\1"
# (needed because "\1" would be an octal escape). The runtime value is the
# same path as before.
data = DataLoader(filepath=r"_data\QA\1\good_1.json")

# get_data() is consumed with next(), so only its first record is used here.
record = next(data.get_data())
q = record.data['Question']
print(q)

# Expand the sample question into several related search queries.
generate_queries.invoke(q)

What are the benefits and challenges of using synergistic sample preparation and multiple analytical techniques in untargeted metabolomics?


['1. What are the advantages of combining synergistic sample preparation techniques in untargeted metabolomics?',
 '2. How do multiple analytical techniques enhance the results of untargeted metabolomics studies?',
 '3. What are the challenges associated with integrating synergistic sample preparation methods and multiple analytical techniques in untargeted metabolomics?',
 '4. How can the benefits of using synergistic sample preparation and multiple analytical techniques in untargeted metabolomics outweigh the challenges?']

In [18]:
from langchain.load import dumps, loads
def reciprocal_rank_fusion(results: list[list], k=60) -> list[tuple]:
    """Fuse several ranked document lists with Reciprocal Rank Fusion (RRF).

    Every document receives ``1 / (rank + k)`` for each list it appears in,
    where ``rank`` is its 0-based position in that list. The contributions
    are summed per document and the documents are returned best-first.

    Args:
        results: Ranked document lists, each ordered best-first. Documents
            must be serializable with ``dumps`` and restorable with ``loads``;
            the serialized form is used as the identity of a document.
        k: Constant added to the rank in the RRF formula. Because ranks start
            at 0, ``k=0`` would divide by zero for the first document.

    Returns:
        A list of ``(document, fused_score)`` tuples sorted by descending
        score. Documents with equal scores keep their first-seen order.
    """
    # Serialized document -> accumulated fused score.
    fused_scores = {}
    for docs in results:
        for rank, doc in enumerate(docs):
            # The serialized string is hashable, so it can key the dict and
            # de-duplicate the same document across lists.
            doc_key = dumps(doc)
            fused_scores[doc_key] = fused_scores.get(doc_key, 0) + 1 / (rank + k)

    # Highest fused score first; sorted() is stable, so ties keep insertion order.
    ranked_items = sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)

    # Restore the document objects and pair each with its fused score.
    return [(loads(doc_key), score) for doc_key, score in ranked_items]

# Retriever backed by the first object returned by load_VBs().
retriever = d1.as_retriever()

# Generated queries -> one retrieval per query (retriever.map()) -> RRF fusion.
retrieval_chain_rag_fusion = (
    generate_queries
    | retriever.map()
    | reciprocal_rank_fusion
)

# Run the fusion retrieval for the sample question and show how many distinct
# documents survived de-duplication.
docs = retrieval_chain_rag_fusion.invoke({"question": q})
len(docs)

6

In [20]:
# Display the result of the RAG-Fusion retrieval chain: (document, fused score) pairs.
docs

[(Document(page_content='Metabolomics samples like human urine or serum contain upwards of a few thousand metabolites, but individual analytical techniques can only characterize a few hundred metabolites at best. The uncertainty in metabolite identification commonly encountered in untargeted metabolomics adds to this low coverage problem. A multiplatform (multiple analytical techniques) approach can improve upon the number of metabolites reliably detected and correctly assigned. This can be further improved by applying synergistic sample preparation along with the use of combinatorial or sequential non-destructive and destructive techniques. Similarly, peak detection and metabolite identification strategies that employ multiple probabilistic approaches have led to better annotation decisions. Applying these techniques also addresses the issues of reproducibility found in single platform methods. Nevertheless, the analysis of large data sets from disparate analytical techniques presents

In [19]:
from langchain_core.runnables import RunnablePassthrough
from operator import itemgetter
from langchain_openai import ChatOpenAI
# RAG
template = """Answer the following question based on this context:
使用中文回答

{context}

Question: {question}
"""
llm = ChatOpenAI()
prompt = ChatPromptTemplate.from_template(template)


def _format_fused_docs(ranked_docs) -> str:
    """Render fused retrieval results as plain text for the prompt.

    Without this step the ``{context}`` slot is filled with the Python repr
    of a list of ``(Document, score)`` tuples, i.e. constructor syntax,
    escaped characters and fusion scores instead of readable passages.

    Args:
        ranked_docs: ``(document, score)`` pairs as produced by the
            RAG-Fusion retrieval chain; each document exposes ``page_content``.

    Returns:
        The page contents, best-ranked first, separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc, _score in ranked_docs)


# "context" runs the fusion retrieval on the input mapping and flattens the
# result to text; "question" forwards the original question unchanged.
final_rag_chain = (
    {
        "context": retrieval_chain_rag_fusion | _format_fused_docs,
        "question": itemgetter("question"),
    }
    | prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"question": q})

'在未定向代谢组学中，使用协同样品准备和多种分析技术的好处包括可以提高可靠检测和正确分配的代谢物数量，改善代谢物鉴定的不确定性，并解决单一平台方法中的可重复性问题。另外，应用多种概率方法的峰检测和代谢物鉴定策略可以提高注释决策的准确性。然而，使用多平台方法分析来自不同分析技术的大型数据集也会带来独特的挑战，因为许多软件包只能完全处理来自单一分析仪器的数据类型。传统的统计方法如主成分分析并不适用于处理多个不同的数据集，而多元分析需要多块或其他类型的模型来理解多台仪器的贡献。因此，使用协同样品准备和多种分析技术的挑战包括数据处理中的复杂性和不同软件包的兼容性问题。'

STEP-BACK

In [21]:
# Few Shot Examples
from langchain_core.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate

# Each example pairs a specific question ("input") with its more generic
# step-back reformulation ("output").
examples = [
    {"input": "Could the members of The Police perform lawful arrests?",
     "output": "what can the members of The Police do?"},
    {"input": "Jan Sindel’s was born in what country?",
     "output": "what is Jan Sindel’s personal history?"},
]

# Render every example as a human turn followed by the AI's answer.
example_prompt = ChatPromptTemplate.from_messages([("human", "{input}"), ("ai", "{output}")])
few_shot_prompt = FewShotChatMessagePromptTemplate(
    examples=examples,
    example_prompt=example_prompt,
)

# System instruction; the line breaks and indentation are part of the text
# sent to the model and are kept exactly as written.
step_back_system_text = """You are an expert at world knowledge. 
            Your task is to step back and paraphrase a question to a more generic step-back question, 
            which is easier to answer. Here are a few examples:"""

# Message order: system instruction, few-shot examples, then the new question.
prompt = ChatPromptTemplate.from_messages(
    [("system", step_back_system_text), few_shot_prompt, ("user", "{question}")]
)

step_back_llm = ChatOpenAI(temperature=0)
generate_queries_step_back = prompt | step_back_llm | StrOutputParser()

question = "什么是Agent的任务分解"
generate_queries_step_back.invoke({"question": question})

In [None]:
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
# Response prompt
response_prompt_template = """You are an expert of world knowledge. 
I am going to ask you a question. 
Your response should be comprehensive and not contradicted with the following context 
if they are relevant. Otherwise, ignore them if they are not relevant.
请使用中文回答
# {normal_context}
# {step_back_context}

# Original Question: {question}
# Answer:"""
response_prompt = ChatPromptTemplate.from_template(response_prompt_template)


def _question_from(inputs):
    """Return the "question" entry of the chain's input mapping."""
    return inputs["question"]


# Context retrieved with the question exactly as asked.
normal_context_branch = RunnableLambda(_question_from) | retriever
# Context retrieved with the generated step-back question.
step_back_context_branch = generate_queries_step_back | retriever

# Fill the three prompt slots from the same input mapping, then answer.
chain = (
    {
        "normal_context": normal_context_branch,
        "step_back_context": step_back_context_branch,
        "question": _question_from,
    }
    | response_prompt
    | ChatOpenAI(temperature=0)
    | StrOutputParser()
)

chain.invoke({"question": question})