# PDF 문서를 읽어 chunk로 분할하기

In [2]:
from langchain.document_loaders import PyPDFLoader
# Load the sample statement-of-work PDF. In the recorded run the loader
# returns 9 Document objects whose `page` metadata is 0-based.
pdf_path = "./PDFS/sample-statement-of-work.pdf"
loader = PyPDFLoader(pdf_path)
doc = loader.load()

In [3]:
# Number of Document objects returned by the loader.
len(doc)

9

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Split the loaded documents into overlapping chunks.
# Lengths are measured with len(), i.e. in characters.
CHUNK_SIZE = 400     # chunk_size passed to the splitter
CHUNK_OVERLAP = 100  # chunk_overlap between neighbouring chunks
text_splitter = RecursiveCharacterTextSplitter(
  chunk_size=CHUNK_SIZE,
  chunk_overlap=CHUNK_OVERLAP,
  length_function=len,
)
chunks = text_splitter.split_documents(doc)

In [5]:
# Number of chunks produced by the splitter.
len(chunks)

60

# Embedding

In [6]:
from langchain.embeddings import HuggingFaceEmbeddings

# Embedding model shared by the vector store for indexing and for queries.
EMBEDDING_MODEL = "BAAI/bge-small-en"
model_options = {'device': 'cpu'}                # run the model on CPU
encode_options = {'normalize_embeddings': True}  # ask the encoder for normalized vectors
embeddings = HuggingFaceEmbeddings(
  model_name=EMBEDDING_MODEL,
  model_kwargs=model_options,
  encode_kwargs=encode_options,
)

In [11]:
from pathlib import Path

from langchain.vectorstores import Chroma

PERSIST_DIR = "./vectorstore/example-embedding"

# Calling Chroma.from_documents() against an already populated persist
# directory embeds every chunk again and appends it to the stored collection,
# so each re-run of this cell would duplicate the index. Build the store only
# when nothing is persisted yet; otherwise reopen what is on disk.
# To rebuild after changing the PDF, the splitter settings or the embedding
# model, delete PERSIST_DIR first.
persist_path = Path(PERSIST_DIR)
if persist_path.is_dir() and any(persist_path.iterdir()):
  vector_db = Chroma(
    persist_directory=PERSIST_DIR,
    embedding_function=embeddings,
  )
else:
  vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=PERSIST_DIR,
  )
  # When running in Jupyter, persist() has to be called explicitly.
  # reference: https://stackoverflow.com/questions/77231763/cannot-load-persisted-db-using-chroma-langchain
  vector_db.persist()

In [8]:
# The query reuses a phrase from the PDF verbatim, including its typographic apostrophe (’).
query = "The Customer’s IT organization"
# Returns (Document, score) pairs. In the recorded run the closest match has the
# lowest score, so the score looks like a distance (lower = more similar) — TODO confirm.
docs = vector_db.similarity_search_with_score(query)

In [9]:
# Display the (Document, score) pairs returned by the similarity search.
docs

[(Document(page_content='Page 7 of 9 \n SAMPLE STATEMENT OF WORK  \nTechnology Assumptions  \n1. The Customer’s IT organization is responsible for workstation compliance to Globex  \nminimum requirements and any  pre-installation activities (if applicable).  \n2. Customer is responsible for the purchase and installation of any third -party', metadata={'source': './PDFS/sample-statement-of-work.pdf', 'page': 6}),
  0.29618027806282043),
 (Document(page_content='The Customer will designate one person to serve  as Project Manager .  The Customer’s \nproject manager  will have authority to approve project -related services and may designate \nother individuals to act as project manager s, subject -matter experts, and /or advisors  during \nthe engagement .  The Customer  will be responsible for the quality and timeliness of work', metadata={'source': './PDFS/sample-statement-of-work.pdf', 'page': 3}),
  0.3413243889808655),
 (Document(page_content='• Network Appliance setup/troubleshooting

In [10]:
# Wrap the vector store as a retriever with search_type="mmr"
# (presumably maximal marginal relevance, which favours diverse results — confirm in the LangChain docs).
retriever = vector_db.as_retriever(search_type="mmr")
# Unlike similarity_search_with_score, the recorded output here is a list of bare Document objects (no scores).
retriever.get_relevant_documents(query)

[Document(page_content='Page 7 of 9 \n SAMPLE STATEMENT OF WORK  \nTechnology Assumptions  \n1. The Customer’s IT organization is responsible for workstation compliance to Globex  \nminimum requirements and any  pre-installation activities (if applicable).  \n2. Customer is responsible for the purchase and installation of any third -party', metadata={'source': './PDFS/sample-statement-of-work.pdf', 'page': 6}),
 Document(page_content='Deliverable 4  Administration Support  4 € 4,800  \nDeliverable 5  Custom Report  6 € 7,200  \nTOTAL:  28 € 33,600', metadata={'source': './PDFS/sample-statement-of-work.pdf', 'page': 7}),
 Document(page_content='• Network Appliance setup/troubleshooting  \n• Web Server set -up and troubleshooting  \n• Remote Access Customer/server configuration  \n• Windows and/or any othe r OS Installation', metadata={'source': './PDFS/sample-statement-of-work.pdf', 'page': 6}),
 Document(page_content='Distribution List  ................................ ................