In [25]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from dotenv import load_dotenv
import os

# Pull settings from a local .env file into the process environment.
load_dotenv()

# The key is stored under the Azure-specific name; keep the module-level
# variable so any later cell that refers to it still works.
OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")

# The embedding and chat clients in this notebook are constructed without an
# explicit key, so the variable above alone has no effect on them. Export the
# value under OPENAI_API_KEY (the name those clients look up by default)
# unless a key is already configured there.
# NOTE(review): an Azure-issued key also needs the matching endpoint settings
# to be usable -- confirm the .env provides them.
if OPENAI_API_KEY and not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [None]:
# 1) Load the source PDF into LangChain documents.
loader = PyPDFLoader("./白话机器学习的数学_图灵图书_立石贤吾_Z_Library.pdf")
docs = loader.load()

# 2) Split the documents into overlapping chunks.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = splitter.split_documents(docs)
if not chunks:
    # With nothing to embed (e.g. a PDF without an extractable text layer),
    # building the index below would fail with an unhelpful error; stop here
    # with a clear message instead.
    raise ValueError("No text chunks were produced from the PDF; nothing to index.")

# 3) Embed the chunks and store the vectors in an in-memory FAISS index.
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(chunks, embeddings)

# 4) Build the retrieval-augmented QA chain on top of the vector store.
llm = ChatOpenAI(model="gpt-4o-mini")
qa = RetrievalQA.from_chain_type(llm=llm, retriever=vectorstore.as_retriever())

# 5) Ask a question against the indexed document.
query = "根据文档，绫乃是谁，有什么特色，学习到了什么？"
print("Q:", query)
# Chain.run() is deprecated; invoke() takes the input under the "query" key
# and returns a dict whose "result" entry holds the answer text.
answer = qa.invoke({"query": query})["result"]
print("A:", answer)