# DB+paperQA

这是一个极简的软件包，用于基于PDF或文本文件（也可以是原始HTML）进行问答。它力求给出高质量的回答，并通过引用原文来支撑答案，从而避免产生幻觉。

默认情况下，它使用OpenAI Embeddings与一个称为FAISS的向量数据库来嵌入和搜索文档。然而，通过langchain，您可以使用开源模型或嵌入（详见下文）。

PaperQA的工作流程如下：

1. 将文档嵌入到向量中
2. 将查询嵌入到向量中
3. 在文档中搜索前k个片段
4. 创建与查询相关的每个片段的摘要
5. 将摘要放入提示中
6. 用提示生成答案

## 配置chromaDB

In [1]:
import chromadb
import os
import pandas as pd

In [2]:
# Create the Chroma client used by the cells below; its data is persisted
# under the "client_test" path.
client = chromadb.PersistentClient(path="client_test")

In [9]:
# List every collection known to the client and print the result.
collections_list = client.list_collections()
print(collections_list)

[Collection(name=collection_test_4_literature)]


In [5]:
# Reset the literature collection: drop it when it already exists, then
# create it empty so this cell can be re-run from a clean state.
# (To reuse the stored collection instead, call
#  client.get_collection(name="collection_test_4_literature").)
#
# delete_collection() fails when the collection is missing (e.g. on the very
# first run), so it is only called for a collection that is actually present.
# list_collections() yields Collection objects or plain names depending on
# the chromadb version; getattr(..., "name", ...) copes with both.
_existing_names = {getattr(c, "name", c) for c in client.list_collections()}
if "collection_test_4_literature" in _existing_names:
    client.delete_collection("collection_test_4_literature")

collection = client.create_collection(name="collection_test_4_literature")

In [9]:
import os
def get_all_files_and_names_in_subfolders(directory):
    """Map file stems to file paths for every file below *directory*.

    The tree is walked recursively. Each key is a file name with its last
    extension removed; each value is the folder joined with the file name,
    so a path is absolute only when *directory* itself is absolute.
    Files that share a stem overwrite one another (the last one visited wins).
    """
    stem_to_path = {}
    for folder, _subfolders, filenames in os.walk(directory):
        for filename in filenames:
            stem, _extension = os.path.splitext(filename)
            stem_to_path[stem] = os.path.join(folder, filename)
    return stem_to_path

In [18]:
import pandas as pd

# Map every file below the literature folder: file stem -> file path.
file_name_path_dic = get_all_files_and_names_in_subfolders("literature_base/知识库文献集")

# The 'db_detail' workbook holds the per-paper details. Pop it so that only
# the literature files themselves remain in the map.
db_detail = file_name_path_dic.pop('db_detail')
df_01 = pd.read_excel(db_detail,sheet_name=0,index_col=0)
df_02 = pd.read_excel(db_detail,sheet_name=1,index_col=0)

# Parallel lists for the collection: entry i of each list describes the same
# paper (abstract text, id, metadata).
doc_lis = []
id_lis = []
meta_data_lis = []

# Take the rows in sheet order instead of looking them up by the labels
# 1..n, so a sheet whose index column is not exactly 1..n still loads.
for _sheet in (df_01, df_02):
    doc_lis.extend(_sheet['Abstract'].tolist())
    # The column key is 'id ' with a trailing space, as in the original lookup.
    id_lis.extend(_sheet['id '].tolist())
    meta_data_lis.extend(
        {'name': _name, 'Publication Date': _date}
        for _name, _date in zip(_sheet['name'], _sheet['Publication Date'])
    )

In [19]:
# Add the abstracts to the collection together with their ids and metadata.
# The three lists are parallel: entry i of each one belongs to the same paper.
collection.add(
    documents=doc_lis,
    ids=id_lis,
    metadatas = meta_data_lis
)


In [27]:
# Example query: ask the collection for the 2 best matches to one question.
a = collection.query(
    query_texts=['什么是纳米微气泡'],
    n_results=2,
)

In [28]:
# Ids returned for the first (and only) query text.
a['ids'][0]

['SL02-02-J', 'SL02-01-A']

In [22]:
# collection.get(ids=["id2"])

In [29]:
# Show the stem -> path map (the 'db_detail' entry was popped when the
# workbook was loaded, so only the literature files remain).
file_name_path_dic

{'SL01-01-J': 'literature_base/知识库文献集\\SL01\\SL01-01-J.pdf',
 'SL01-02-J': 'literature_base/知识库文献集\\SL01\\SL01-02-J.pdf',
 'SL02-01-A': 'literature_base/知识库文献集\\SL02\\SL02-01-A.pdf',
 'SL02-02-J': 'literature_base/知识库文献集\\SL02\\SL02-02-J.pdf'}

In [30]:
# 从db中查询文档，并且返回最相关的文献路径

def retriever_db(question,n_results=2):
    """Return the file paths of the papers that best match *question*.

    Queries the module-level ``collection`` for ``n_results`` matches and
    translates each returned id into a path via ``file_name_path_dic``.
    An id with no entry in that map raises ``KeyError``.
    """
    hits = collection.query(query_texts=[question], n_results=n_results)
    # Only one query text is sent, so its matches are the first entry of 'ids'.
    matched_ids = hits['ids'][0]
    return [file_name_path_dic[doc_id] for doc_id in matched_ids]

## 初始化paperQA

In [16]:
import os
from getpass import getpass

# SECURITY: the OpenAI key used to be hard-coded in this cell. A key that has
# been committed to a notebook must be treated as leaked and revoked.
# Take the key from the environment, and prompt for it (input is not echoed)
# only when it has not been set already.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OPENAI_API_KEY: ')

# Point the OpenAI client at the forwarding endpoint and the local proxy.
os.environ['OPENAI_API_BASE'] = "https://api.openai-forward.com/v1"
os.environ['OPENAI_PROXY'] = "http://localhost:33210"

In [17]:
from paperqa import Docs
import chromadb
import os
import pandas as pd

In [18]:
import os
def get_all_files_and_names_in_subfolders(directory):
    """Collect ``{file stem: file path}`` for all files under *directory*.

    Sub-folders are searched recursively. The stem is the file name minus
    its last extension; the path is the containing folder joined with the
    file name (absolute only if *directory* is absolute). When two files
    share a stem, the one visited later replaces the earlier entry.
    """
    paths_by_stem = {}
    for current_dir, _dirs, names in os.walk(directory):
        paths_by_stem.update(
            {os.path.splitext(name)[0]: os.path.join(current_dir, name) for name in names}
        )
    return paths_by_stem

In [19]:
class paperQA_bot():
    """Answer questions with paperQA over papers pre-selected through Chroma.

    ``run`` retrieves the best-matching papers for a question from the Chroma
    collection, loads their files into a fresh paperQA ``Docs`` object and
    returns the formatted answer. ``Continue_asking`` asks a follow-up
    question against the ``Docs`` object built by the latest ``run``.
    """

    def __init__(self,
                 db_path="client_test",
                 collection_name="collection_test_4_literature",
                 literature_dir="literature_base/知识库文献集",
                 llm='gpt-3.5-turbo') -> None:
        """Open the persistent Chroma store and fetch the existing collection.

        Args:
            db_path: Path of the persistent Chroma store.
            collection_name: Name of the collection to query; it must exist.
            literature_dir: Folder searched (recursively) for the paper files.
            llm: Model name handed to paperQA's ``Docs``.
        """
        self.literature_dir = literature_dir
        self.llm = llm
        # Set by run(); None means no question has been asked yet.
        self.docs = None
        self.client = chromadb.PersistentClient(path=db_path)
        self.collection = self.client.get_collection(name=collection_name)

    def run(self,question):
        """Retrieve papers for *question*, load them and return the answer text."""
        # Keep the Docs object so Continue_asking() can reuse it.
        self.docs = self.init_docs(question)
        return self.docs.query(question).formatted_answer

    def Continue_asking(self,question):
        """Ask a follow-up question against the documents loaded by ``run``.

        Raises:
            RuntimeError: if ``run`` has not been called yet, so there are no
                loaded documents to query.
        """
        docs = getattr(self, "docs", None)
        if docs is None:
            raise RuntimeError(
                "Continue_asking() needs the documents loaded by run(); "
                "call run() first."
            )
        return docs.query(question).formatted_answer

    def init_docs(self,question):
        """Build a paperQA ``Docs`` holding the files retrieved for *question*."""
        docs = Docs(llm=self.llm)
        for path in self.retriever_db(question=question):
            docs.add(path)
        return docs

    def retriever_db(self,question,n_results=2)->list :
        """Return the file paths of the ``n_results`` best matches for *question*.

        Raises:
            KeyError: if a returned id has no file with that stem under the
                literature folder.
        """
        # Re-scan on every call so files added since the last query are seen.
        file_name_path_dic = get_all_files_and_names_in_subfolders(self.literature_dir)

        result = self.collection.query(
            query_texts=[question],
            n_results=n_results,
        )
        path_lis = []
        # Only one query text is sent, so its matches are the first entry.
        for _id in result['ids'][0]:
            try:
                path_lis.append(file_name_path_dic[_id])
            except KeyError:
                raise KeyError(
                    f"no file named {_id!r} found under {self.literature_dir!r}"
                ) from None
        return path_lis

In [20]:
# Build the bot; its constructor opens the "client_test" Chroma store and
# fetches the existing "collection_test_4_literature" collection.
bot = paperQA_bot()

In [21]:
# NOTE(review): presumably needed because paperqa drives asyncio internally
# while the notebook already runs an event loop - confirm.
import nest_asyncio
nest_asyncio.apply()

In [22]:
# First question: run() retrieves matching papers, loads them into a fresh
# Docs object and returns the formatted answer.
a = bot.run('什么是QSAR')

In [24]:
# Follow-up question answered against the Docs object built by the previous
# run() call; no new retrieval from Chroma happens here.
bot.Continue_asking('QSAR模型的适用域是什么')

'Question: QSAR模型的适用域是什么\n\nQSAR模型的适用域是指模型在哪些情况下能够有效地进行预测和分类。适用域的确定对于模型的性能至关重要，特别是在监管应用中。文献提到，描述符系统和适用域表征方法之间的兼容性对于分类器的性能非常重要。例如，非三维Mordred描述符基于的分类器在平均AUC超过0.86的情况下表现良好，与先前报告的分类器相当。然而，基于ECFP4指纹的分类器在某些化合物上表现较差，这是由于训练中存在偏差。相比之下，MACCS键显示出比ECFP4指纹更好的泛化能力。因此，适用域的不当组合可能导致分类器的意外性能下降。(Wang2020 pages 4-4)\n\nReferences\n\n1. (Wang2020 pages 4-4): Wang, Zhongyu, et al. "Applicability Domains Enhance Application of PPARγ Agonist Classifiers Trained by Drug-like Compounds to Environmental Chemicals." Chem. Res. Toxicol., vol. 33, no. 6, 2020, pp. 1382-1388.\n'