In [2]:
# 0. 필요한 라이브러리 설치

!pip install langchain-community langchain_text_splitters langchain_chroma langchain_openai unstructured tiktoken


In [2]:
# 1. Extract the content from the docx file.

from langchain_community.document_loaders import UnstructuredFileLoader

# mode="single": the loader is asked to return the file content as one
# Document rather than one Document per element.
loader = UnstructuredFileLoader(file_path="5G SA 도입관련 과금 세부화 내역 조회 시스템 요건(Ver.042).docx", mode="single")
docs = loader.load()

# Show only a bounded preview: printing the whole Document floods the cell
# output (and the saved notebook) with the full text of the source file.
PREVIEW_CHARS = 500
print(f"Loaded {len(docs)} document(s)")
if docs:
    first_doc = docs[0]
    print(first_doc.metadata)
    print(f"{len(first_doc.page_content)} characters; first {PREVIEW_CHARS}:")
    print(first_doc.page_content[:PREVIEW_CHARS])


In [3]:
# 2. Split the content into appropriately sized chunks.

from langchain_text_splitters import RecursiveCharacterTextSplitter

# chunk_size / chunk_overlap are measured in tokens by the tiktoken encoder.
# The encoding is pinned to cl100k_base (the tokenizer of the OpenAI
# text-embedding-3 models) because the splitter's default is "gpt2", whose
# token counts differ — especially for Korean text — so the 1500-token budget
# would not reflect what the embedding model actually sees.
splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    encoding_name="cl100k_base",
    chunk_size=1500,
    chunk_overlap=200,
)

# Flatten the chunks of every loaded document into one list of plain strings.
splits = [
    chunk
    for doc in docs
    for chunk in splitter.split_text(doc.page_content)
]

print(f"Number of splits: {len(splits)}")
# Guard the preview: an empty extraction would otherwise raise IndexError.
if splits:
    print(splits[0])
else:
    print("No text was extracted; nothing to split.")


In [4]:
# 3. Embed the chunks and store them in ChromaDB.

import getpass
import hashlib
import os

from langchain_chroma import Chroma
from langchain_openai.embeddings import OpenAIEmbeddings

llm_api_url = "https://aihub-api.sktelecom.com/aihub/v1/sandbox"
# Credentials must not live in the notebook: read the key from the
# environment and fall back to an interactive prompt. Any key that was
# previously committed in this cell should be treated as leaked and rotated.
llm_api_key = os.environ.get("AIHUB_API_KEY") or getpass.getpass("AI Hub API key: ")
persist_directory = "./daisy_demo_chromadb"
embedding_model_name = "text-embedding-3-large"
collection_name = "ds-dtseungbum-billing_granularify_system_requirements-text-4d94"

# OpenAI-compatible embedding client pointed at the AI Hub endpoint.
embeddings = OpenAIEmbeddings(
    model=embedding_model_name,
    openai_api_key=llm_api_key,
    openai_api_base=llm_api_url)

# Collection persisted on disk under `persist_directory`.
chroma = Chroma(
    collection_name=collection_name,
    embedding_function=embeddings,
    persist_directory=persist_directory)

# Use content-derived ids so that re-running this cell overwrites the same
# entries instead of appending duplicates to the persisted collection
# (without explicit ids every run stores a fresh copy of each chunk).
# Keying by id also collapses identical chunks, which avoids duplicate ids
# within a single batch.
# NOTE(review): entries stored by earlier runs under auto-generated ids are
# not removed by this; recreate the collection if it already has duplicates.
chunk_by_id = {
    hashlib.sha256(text.encode("utf-8")).hexdigest(): text
    for text in splits
}
if chunk_by_id:
    chroma.add_texts(list(chunk_by_id.values()), ids=list(chunk_by_id.keys()))


In [ ]:
# 4. Run a search.

# `search_type` was previously misspelled and `k` was passed as a top-level
# keyword; neither is a retriever field, so the query only worked through the
# defaults. The number of results belongs in `search_kwargs`.
retriever = chroma.as_retriever(search_type="similarity", search_kwargs={"k": 4})

# Print each retrieved chunk, delimited so the results are easy to tell apart.
for i, d in enumerate(retriever.invoke("과금 CDR 조회 기능의 제약 사항")):
    print("*************")
    print(f"Document {i}")
    print(d.page_content)
    print("*************")
