In [1]:
from langchain.document_loaders import UnstructuredExcelLoader
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores.pgvector import PGVector
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
import pandas as pd
from dotenv import load_dotenv, find_dotenv
import os
import openai
import sys

# 1. 加载 openai 密钥
sys.path.append('../..')
_ = load_dotenv(find_dotenv())
openai.api_key = os.environ['OPENAI_API_KEY']

# 2. 加载指定路径下所有文件
workpath = "../data/output"
excel_file = f"{workpath}/wos_coredata_cleaned.xlsx"
csv_file = f"{workpath}/paper_title_abstract.csv"


def xlsx2csv():
    # 用pandas.read_excel()读取xlsx文件，并返回一个DataFrame对象
    df = pd.read_excel(excel_file).loc[:, ["Title", 'Abstract']]
    # 用pandas.DataFrame.to_csv()将DataFrame对象写入到csv文件中
    df.to_csv(csv_file, index=False)

def load_files():
    # 设置源信息为论文名称
    loader = CSVLoader(csv_file, source_column="Title")
    data = loader.load()
    return data


# 3. 分割文本


def split_doc():
    chunk_size = 200  # 设置块大小
    chunk_overlap = 40  # 设置块重叠大小
    # 初始化文本分割器
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # 设置 换行, 句号, 逗号, 空格 为分隔符 (按顺序切割)
        separators=["\n", "(?<=\. )", ",", " "]
    )
    # 调用分割器分割文本
    data = load_files()
    split_texts = text_splitter.split_documents(data)
    print("分割前:", len(data), "\n分割后:", len(split_texts))
    return split_texts

In [2]:
# 4. 加载 embedding 模型
embedding = OpenAIEmbeddings()

# 5. 连接向量数据库并创建表

# PGVector 数据库连接字符串
CONNECTION_STRING = "postgresql+psycopg2://postgres:123qwe@localhost:4321/essay_anaylsis"
# PGVector 模块将尝试创建一个表, 注意表的名称并不是集合的名称
# 因此, 请确保集合名称是唯一的, 并且用户具有创建表的权限.
COLLECTION_NAME = "llm_langchain_essay"

In [3]:
def create_vectorstore():
    split_texts = split_doc()
    vectordb = PGVector.from_documents(
        documents=split_texts,
        embedding=embedding,
        collection_name=COLLECTION_NAME,
        connection_string=CONNECTION_STRING,
    )
# create_vectorstore()

In [4]:
# 6. 连接已有的向量数据库, 作为检索器
def conn_vectorstore():
    store = PGVector(
        collection_name=COLLECTION_NAME,
        connection_string=CONNECTION_STRING,
        embedding_function=embedding,
    )
    # 使用矢量数据库作为检索器
    retriever = store.as_retriever()
    return retriever

In [11]:
# 8. 提出问题, 进行相似性检索
# question = "Please list the specific application of SIF in the field of agriculture."
# question = "Please list the application of SIF in agricultural yield estimation,"

In [10]:
# 9. 构建 Prompt 模板
qa_template = """结合以下上下文片段来回答最后的问题。
如果你不知道答案，只需说不知道，不要试图编造答案。
在回答的最后一定要说"感谢您的提问！"
{context}
问题：{question}
有用的回答："""
QA_CHAIN_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=qa_template
)

In [12]:
# 11. 生成 QA 问答链
# 加载模型和检索器
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.7)
retriever = conn_vectorstore()
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)
result = qa_chain({"query": question})
print(result["result"])

I'm sorry, but I don't know how many papers there are in the context. Thank you for your question!
