In [1]:
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings

# Embedding model shared by every vector store built later in this notebook
# (it is passed to Chroma.from_documents inside evaluate_chunk).
embedding = HuggingFaceEmbeddings(model_name='BAAI/bge-small-zh-v1.5')



In [2]:
from langchain.document_loaders.pdf import PyMuPDFLoader

# Create a PyMuPDFLoader instance; its argument is the path of the PDF to load.
loader = PyMuPDFLoader("../../../data_base/knowledge_db/pumkin_book/pumpkin_book.pdf")

# Call the loader's load() method to read the PDF file.
pdf_pages = loader.load()
# Per the original author's note: the main text of the Pumpkin Book starts at page 13,
# and the last 13 pages contain sensitive wording, so only pdf_pages[13:-13] is kept.
# NOTE(review): the slice is 0-based, so index 13 is the 14th element of pdf_pages —
# confirm this is the intended offset.
data_pages = pdf_pages[13:-13]

In [3]:
import re

def clean_text(text: str):
    """Remove the per-page banner blocks and all whitespace from ``text``.

    The substitutions are applied in a fixed order:

    1. the "buy the printed book" banner wrapped in ``→_→`` / ``←_←``,
    2. the "companion video course" banner wrapped in ``→_→`` / ``←_←``,
    3. every run of whitespace (``\\s+``) anywhere in the string,
    4. every run of newlines (``\\n+``).

    Step 3 already removes newlines, so step 4 never finds anything to replace;
    it is kept so the function applies exactly the same patterns as before.

    Args:
        text: Raw page text.

    Returns:
        The text with both banners and every whitespace character removed.
    """
    removal_patterns = (
        r'→_→\n欢迎去各大电商平台选购纸质版南瓜书《机器学习公式详解》\n←_←',
        r'→_→\n配套视频教程：https://www.bilibili.com/video/BV1Mh411e7VU\n←_←',
        r'\s+',
        r'\n+',
    )
    # The banners must be removed before whitespace is stripped, because the
    # banner patterns rely on the newlines that surround the slogan text.
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)
    return text

# Clean every retained page in place: each page's page_content is overwritten
# with the output of clean_text, so data_pages holds cleaned text from here on.
for page in data_pages:
    page.page_content = clean_text(page.page_content)

In [4]:
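# One-off generation of the QA evaluation set; kept commented out so that the
# notebook loads the saved train_dataset.json instead of regenerating it.
# NOTE(review): `train_pages` is not defined in the visible cells (the cleaned
# pages are stored in `data_pages`) — confirm the intended variable before
# re-enabling this cell.
# from generate_qa_pairs import docs_generate_pdf_qa_pairs

# qa_pairs = docs_generate_pdf_qa_pairs(pdf_pages=train_pages, num_questions_per_page=1)
# qa_pairs.save_json("train_dataset.json")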

In [5]:
from generate_qa_pairs import QaPairs
# Load the QA pairs from train_dataset.json in the working directory. Later cells
# iterate over qa_pairs.qa_pairs and read each entry's 'query' and 'page_num' keys.
qa_pairs = QaPairs.from_json('train_dataset.json')

In [6]:
from tqdm import tqdm

# Count the QA pairs whose query is longer than 10 characters and report that
# count as a fraction of all pairs (printed as the generation success rate).
i = sum(1 for qa_pair in tqdm(qa_pairs.qa_pairs) if len(qa_pair['query']) > 10)
success_rate = i / len(qa_pairs.qa_pairs)
print('问答对成功生成率：' + str(success_rate))

100%|██████████| 119/119 [00:00<00:00, 1890614.30it/s]

问答对成功生成率：0.9243697478991597





In [7]:
from langchain.vectorstores.chroma import Chroma
def calculat_recall(k: int, vectordb: Chroma, min_query_len: int = 10) -> float:
    """Compute top-k page recall of ``vectordb`` over the module-level ``qa_pairs``.

    For every QA pair whose query is longer than ``min_query_len`` characters,
    the query is sent to ``vectordb.similarity_search`` and counts as a hit when
    the pair's ``'page_num'`` equals the ``'page'`` metadata of any returned
    document.

    Args:
        k: Number of documents to retrieve per query.
        vectordb: Vector store to query.
        min_query_len: Queries with this many characters or fewer are skipped.
            The default of 10 matches the threshold used by the notebook's
            success-rate cell.

    Returns:
        hits / evaluated queries, or ``0.0`` when no query passes the length
        filter (previously this case raised ``ZeroDivisionError``).
    """
    hits = 0
    evaluated = 0
    for qa_pair in tqdm(qa_pairs.qa_pairs):
        query = qa_pair['query']
        if len(query) <= min_query_len:
            # Skipped pairs count neither as hits nor as evaluated queries.
            continue
        evaluated += 1
        sim_docs = vectordb.similarity_search(query, k=k)
        page_nums = [doc.metadata['page'] for doc in sim_docs]
        if qa_pair['page_num'] in page_nums:
            hits += 1
    # Guard the division: an empty dataset, or one with only short queries,
    # would otherwise crash the whole evaluation run.
    if evaluated == 0:
        return 0.0
    return hits / evaluated

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def evaluate_chunk(chunk_size: int, chunk_overlap: int = 50, top_ks=(1, 3, 5, 10)):
    """Split ``data_pages`` at ``chunk_size`` and measure recall at several k.

    Each call builds its vector store in a uniquely named Chroma collection and
    deletes that collection when done. ``Chroma.from_documents`` otherwise uses
    one default collection name, so repeated calls in the same kernel can keep
    adding chunks to the same collection; chunks from earlier chunk sizes would
    then stay searchable and distort the recall of later ones.

    Args:
        chunk_size: Maximum chunk length passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks (default 50, the
            value that was previously hard-coded).
        top_ks: The k values to evaluate (default ``(1, 3, 5, 10)``, the values
            that were previously hard-coded).

    Returns:
        A list with one recall score per entry of ``top_ks``, in the same order.
    """
    import uuid  # local import: only needed to make the collection name unique

    # Split the documents.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, separators=['。', '，', ''])
    split_docs = text_splitter.split_documents(data_pages)

    # Build the vector store in its own collection so no earlier call's chunks leak in.
    vectordb = Chroma.from_documents(
        documents=split_docs,
        embedding=embedding,
        collection_name=f'chunk_{chunk_size}_{uuid.uuid4().hex}',
    )
    try:
        return [calculat_recall(k, vectordb) for k in top_ks]
    finally:
        # Drop the temporary collection even if evaluation fails, so repeated
        # runs do not accumulate collections in the kernel.
        vectordb.delete_collection()


In [9]:
# One row of recall scores per chunk size, in the order listed here.
chunk_sizes = [200, 300, 400, 500]
chunksize_recall_scores = [evaluate_chunk(size) for size in chunk_sizes]

100%|██████████| 119/119 [00:07<00:00, 14.95it/s]
100%|██████████| 119/119 [00:00<00:00, 138.61it/s]
100%|██████████| 119/119 [00:00<00:00, 139.66it/s]
100%|██████████| 119/119 [00:00<00:00, 141.72it/s]
100%|██████████| 119/119 [00:00<00:00, 141.97it/s]
100%|██████████| 119/119 [00:00<00:00, 139.12it/s]
100%|██████████| 119/119 [00:00<00:00, 136.24it/s]
100%|██████████| 119/119 [00:00<00:00, 138.65it/s]
100%|██████████| 119/119 [00:00<00:00, 142.85it/s]
100%|██████████| 119/119 [00:00<00:00, 126.86it/s]
100%|██████████| 119/119 [00:00<00:00, 134.26it/s]
100%|██████████| 119/119 [00:00<00:00, 126.83it/s]
100%|██████████| 119/119 [00:00<00:00, 125.34it/s]
100%|██████████| 119/119 [00:00<00:00, 134.81it/s]
100%|██████████| 119/119 [00:00<00:00, 136.61it/s]
100%|██████████| 119/119 [00:01<00:00, 118.89it/s]


In [10]:
import csv

# Header row: an empty corner cell followed by one column per top-k cutoff.
column_names = ['', 'top_1', 'top_3', 'top_5', 'top_10']
# Row labels, matched by position to the rows of chunksize_recall_scores.
row_names = ['chunk_200', 'chunk_300', 'chunk_400', 'chunk_500']

# newline='' lets the csv module control line endings itself.
with open('chunksize_recall.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(column_names)
    # Each data row is its label followed by that chunk size's recall scores.
    writer.writerows(
        [row_names[idx]] + scores
        for idx, scores in enumerate(chunksize_recall_scores)
    )