In [1]:
import os
import uuid

## PDF読み込み

In [2]:
import fitz # pymupdfライブラリ

In [42]:
def pdf_reader(file_path,output_dir):
    """Dump the text and the tables of every page of a PDF into text files.

    For each 0-based page index ``n`` the plain page text is written to
    ``{output_dir}/{n}.txt``. When at least one table is detected on the page,
    its rows are additionally written to ``{output_dir}/table_{n}.txt``: one
    line per row with the cells joined by ``:``, and a blank line after each
    table.

    Args:
        file_path: Path of the PDF file to read.
        output_dir: Directory that receives the text files. It is created
            when it does not exist yet.
    """
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Open the path that was passed in (not a notebook-level global) and make
    # sure the document handle is released when we are done.
    with fitz.open(file_path) as doc:
        for page in range(len(doc)):
            page_data = doc[page]
            text = page_data.get_text()
            table_data = page_data.find_tables()

            if len(table_data.tables) > 0:
                table_chunks = []
                for tbl in table_data.tables:
                    for row in tbl.extract():
                        # Extracted rows may contain None cells; drop them
                        # before joining.
                        cells = [cell for cell in row if cell is not None]
                        # Newlines inside a cell are removed so that each
                        # table row stays on a single output line.
                        table_chunks.append(":".join(cells).replace("\n","") + "\n")
                    table_chunks.append("\n")
                with open(f"{output_dir}/table_{page}.txt",mode="w",encoding="utf-8") as f:
                    f.write("".join(table_chunks))

            with open(f"{output_dir}/{page}.txt",mode="w",encoding="utf-8") as f:
                f.write(text)

# Extract data_0.pdf into per-page text files under `output_dir`.
# NOTE(review): keep the module-level name `filename` as is; other cells in
# this notebook may rely on it.
filename = 'pdf_data/data_0.pdf'
output_dir = 'pdf_to_text/data0'
pdf_reader(filename,output_dir)

## 前処理

In [43]:
import re
from unicodedata import normalize
from glob import glob

In [47]:
def concat_pages(dir_path,skip_page=None):
    """Concatenate the per-page text files of a directory into one string.

    The number of pages is the number of files in ``dir_path`` matching
    ``[0-9]*.txt``; pages are then read as ``0.txt``, ``1.txt``, ... in
    numeric order, so the files must be numbered contiguously from 0.

    Args:
        dir_path: Directory containing the ``{page}.txt`` files.
        skip_page: Optional iterable of 0-based page indices to leave out.
            ``None`` (the default) skips nothing.

    Returns:
        The text of every kept page, each followed by a newline.

    Raises:
        FileNotFoundError: If a page file in the expected range is missing.
    """
    # A set gives O(1) membership tests and avoids a shared mutable default.
    skipped = set(skip_page) if skip_page is not None else set()
    page_count = len(glob(f"{dir_path}/[0-9]*.txt"))

    pages = []
    for page in range(page_count):
        if page in skipped:
            continue
        with open(f"{dir_path}/{page}.txt",mode="r",encoding="utf-8") as f:
            pages.append(f.read() + "\n")
    return "".join(pages)
        
def normalize_text(text_data,remove_str=r"\u3000"):
    """Apply NFKC normalization, then delete every match of ``remove_str``.

    Args:
        text_data: Text to clean.
        remove_str: Regular expression whose matches are removed from the
            normalized text.

    Returns:
        The normalized text with all matches of ``remove_str`` stripped.
    """
    # Normalization runs first, so the pattern is matched against the
    # NFKC form of the text, not against the raw input.
    nfkc_text = normalize('NFKC', text_data)
    return re.sub(remove_str, r'', nfkc_text)

def skip_text(text_data,skip_pattern):
    """Drop empty lines and lines that fully match ``skip_pattern``.

    Args:
        text_data: Newline-separated text.
        skip_pattern: Regular expression; a line is removed only when the
            whole line matches it (``re.fullmatch``).

    Returns:
        The surviving lines joined with ``"\\n"``.
    """
    kept_lines = [
        line
        for line in text_data.split("\n")
        if line != "" and not re.fullmatch(skip_pattern, line)
    ]
    return "\n".join(kept_lines)

def split_text(text_data,pattern):
    """Merge all lines into one string, breaking only before heading lines.

    Every line whose beginning matches ``pattern`` (``re.match``) is prefixed
    with a blank-line separator (``"\\n\\n"``); all other lines are appended
    directly, without any separator.

    Args:
        text_data: Newline-separated text.
        pattern: Regular expression identifying lines that start a new section.

    Returns:
        The re-joined text.
    """
    return "".join(
        ("\n\n" + line) if re.match(pattern, line) else line
        for line in text_data.split("\n")
    )
    
def sentence_split(text_data,split_str="。"):
    """Re-flow text so that a line break follows every sentence terminator.

    The input is split on newlines; lines ending with ``split_str`` get a
    trailing ``"\\n"``, all other lines are glued to the following line.

    Args:
        text_data: Newline-separated text.
        split_str: Sentence terminator. An empty value disables splitting.

    Returns:
        The re-joined text. It ends with ``"\\n"`` when the last line ends
        with ``split_str``.
    """
    pieces = []
    for line in text_data.split("\n"):
        # endswith() is safe for empty lines, where indexing line[-1] would
        # raise IndexError. The truthiness guard keeps an empty split_str
        # from matching every line.
        if split_str and line.endswith(split_str):
            pieces.append(line + "\n")
        else:
            pieces.append(line)
    return "".join(pieces)

In [48]:
# 0-based page indices left out of the concatenated text.
# NOTE(review): presumably cover / table-of-contents / appendix pages — TODO confirm.
skip_page = [0,1,2,3,21,29,46,48]
concat_data = concat_pages("./pdf_to_text/data0",skip_page)
# After NFKC normalization, remove ideographic (U+3000) and ASCII spaces.
clean_data = normalize_text(concat_data,remove_str="[\u3000 ]")
# Drop lines made up only of digits (presumably page numbers — TODO confirm),
# then re-flow the text so that a line break follows each "。".
clean_data = sentence_split(skip_text(clean_data,skip_pattern=r"^\d+"))
# Disabled chapter-level split. NOTE(review): if re-enabled, assign the result
# to a different name; as written it would shadow the split_text function.
# split_text = split_text(clean_data,pattern=r"^第[0-9]+章")

In [54]:
# Number of newline-separated pieces in the cleaned text.
len(clean_data.split("\n"))

922

## Qdrantに接続 & ベクトル化

In [93]:
# Chunking parameters.
# NOTE(review): none of these values is referenced by the cells visible in this
# notebook; presumably left over from a text-splitter configuration — TODO confirm.
paragraph_separator = "\n\n" # paragraph separator
chunk_size = 1024 # chunk size (in tokens)
chunk_overlap = 20 # how much of the previous chunk is carried over into the next one
secondary_chunking_regex = '[^,.．;。]+[,.．;。]?'# sentence-level splitting

In [95]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Qdrant
from langchain.schema import Document
# Embedding client created with default settings.
# NOTE(review): presumably picks up the OpenAI API key from the environment — confirm.
embeddings = OpenAIEmbeddings()
# One Document per newline-separated piece of clean_data, all carrying the same metadata.
# NOTE(review): clean_data.split("\n") may yield empty strings (e.g. a trailing
# one when the text ends with "\n"); consider filtering them before embedding.
documents = [Document(page_content = data,metadata ={"type":"related","filename":"ビジネス光ネットサービス契約約款"}) for data in clean_data.split("\n")]
# Embed the documents and store them in the Qdrant collection.
# NOTE(review): the request log of this cell shows a DELETE on the collection, so
# this call appears to recreate it — confirm before running against existing data.
qdrant = Qdrant.from_documents(documents, embeddings,host="qdrant",port=6333, collection_name= "DocumentsDB_Langchain")

INFO:httpx:HTTP Request: GET http://qdrant:6333/collections/DocumentsDB_Langchain "HTTP/1.1 404 Not Found"
HTTP Request: GET http://qdrant:6333/collections/DocumentsDB_Langchain "HTTP/1.1 404 Not Found"
HTTP Request: GET http://qdrant:6333/collections/DocumentsDB_Langchain "HTTP/1.1 404 Not Found"
HTTP Request: GET http://qdrant:6333/collections/DocumentsDB_Langchain "HTTP/1.1 404 Not Found"
HTTP Request: GET http://qdrant:6333/collections/DocumentsDB_Langchain "HTTP/1.1 404 Not Found"
INFO:httpx:HTTP Request: DELETE http://qdrant:6333/collections/DocumentsDB_Langchain "HTTP/1.1 200 OK"
HTTP Request: DELETE http://qdrant:6333/collections/DocumentsDB_Langchain "HTTP/1.1 200 OK"
HTTP Request: DELETE http://qdrant:6333/collections/DocumentsDB_Langchain "HTTP/1.1 200 OK"
HTTP Request: DELETE http://qdrant:6333/collections/DocumentsDB_Langchain "HTTP/1.1 200 OK"
HTTP Request: DELETE http://qdrant:6333/collections/DocumentsDB_Langchain "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: PUT http://qd

## 応答テスト

In [83]:
# NOTE(review): `index` is not defined in any cell visible in this notebook, and
# the request log of this cell targets the collection "DocumentDB" rather than
# "DocumentsDB_Langchain". This cell (execution count 83) appears to rely on
# state from an earlier session — it will fail on Restart & Run All. TODO: build
# `index` explicitly or port the query to the `qdrant` vector store.
query_engine = index.as_query_engine(similarity_top_k=10)
response = query_engine.query("ビジネス光ネットサービスはインターネットで解約できないの？")
print(response)

INFO:httpx:HTTP Request: POST http://qdrant:6333/collections/DocumentDB/points/search "HTTP/1.1 200 OK"
HTTP Request: POST http://qdrant:6333/collections/DocumentDB/points/search "HTTP/1.1 200 OK"
HTTP Request: POST http://qdrant:6333/collections/DocumentDB/points/search "HTTP/1.1 200 OK"
HTTP Request: POST http://qdrant:6333/collections/DocumentDB/points/search "HTTP/1.1 200 OK"
HTTP Request: POST http://qdrant:6333/collections/DocumentDB/points/search "HTTP/1.1 200 OK"
ビジネス光ネットサービスはインターネットで解約することはできません。光ネットサービス契約者は、解約を行うためにはあらかじめ書面により光ネットサービス取扱所に通知する必要があります。


## 類似度が高いNode取り出し

In [87]:
from llama_index.retrievers import VectorIndexRetriever

# Retrieve the 10 nodes most similar to the query, without generating an answer.
# NOTE(review): depends on `index`, which is not defined in any cell visible in
# this notebook — TODO define it explicitly before this cell.
retriever = VectorIndexRetriever(
    index=index, 
    similarity_top_k=10,
)
retriever.retrieve("ビジネス光ネットサービスとは？")

INFO:httpx:HTTP Request: POST http://qdrant:6333/collections/DocumentDB/points/search "HTTP/1.1 200 OK"
HTTP Request: POST http://qdrant:6333/collections/DocumentDB/points/search "HTTP/1.1 200 OK"
HTTP Request: POST http://qdrant:6333/collections/DocumentDB/points/search "HTTP/1.1 200 OK"
HTTP Request: POST http://qdrant:6333/collections/DocumentDB/points/search "HTTP/1.1 200 OK"
HTTP Request: POST http://qdrant:6333/collections/DocumentDB/points/search "HTTP/1.1 200 OK"


[NodeWithScore(node=TextNode(id_='481d0c75-7f1b-49e6-81b7-2c11fd265b4f', embedding=None, metadata={'name': 'related'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='8ac8a086-fd34-4ed3-bada-93e146b85b06', node_type=None, metadata={'name': 'related'}, hash='06fd0a46ff113f212d2d17f52f9f6ce0762f5fbe5305807eb4d5b1bffe1a42de')}, hash='06fd0a46ff113f212d2d17f52f9f6ce0762f5fbe5305807eb4d5b1bffe1a42de', text='(注)業務の遂行上必要な範囲での利用には、光ネットサービス契約者に係る情報を当社の業務を委託している者に提供する場合を含みます。', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), score=0.872484),
 NodeWithScore(node=TextNode(id_='2abdc908-4893-417e-bcc2-9ead5ddf914c', embedding=None, metadata={'name': 'related'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='09be3fa4-cb58-414e-9