In [None]:
import os

# Physical GPU index this notebook is pinned to. It is exported through the
# environment, so it must be set before any CUDA library is initialised.
VISIBLE_GPU_INDEX = "3"
os.environ["CUDA_VISIBLE_DEVICES"] = VISIBLE_GPU_INDEX

In [None]:
import torch

# Report whether CUDA is usable before touching any device-specific API:
# torch.cuda.get_device_name() raises when no CUDA device is available, which
# would abort a "Restart & Run All" on a CPU-only machine.
cuda_available = torch.cuda.is_available()
print(cuda_available)

if cuda_available:
    # Name of logical device 0 as seen by this process.
    print(torch.cuda.get_device_name(0))

    # `_get_nvml_device_index` is a private torch helper and may be absent in
    # other torch versions, so look it up defensively instead of assuming it.
    _nvml_index_of = getattr(torch.cuda, "_get_nvml_device_index", None)
    if _nvml_index_of is not None:
        print(_nvml_index_of(0))
else:
    print("CUDA is not available; skipping device queries.")

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFDirectoryLoader, WebBaseLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader, WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path

In [None]:
# Model path setup: the project root is taken to be two directory levels above
# the current working directory, and the embedding checkpoint lives below it.
root_path = Path(".").resolve().parent.parent
embedding_model_path = root_path.joinpath(
    "ai_models", "base_models", "embeddings", "dragonkue", "BGE-m3-ko"
)

# Echo both locations so a wrong working directory is easy to spot.
print(root_path, embedding_model_path, sep="\n")

In [None]:
# Initialise the embedding model from the local checkpoint directory.
embedding_model_options = {"device": "cuda:0"}  # CUDA device 0 as seen by this process
embedding_encode_options = {"normalize_embeddings": True}  # request normalised vectors

embeddings = HuggingFaceEmbeddings(
    model_name=str(embedding_model_path),
    model_kwargs=embedding_model_options,
    encode_kwargs=embedding_encode_options,
)

In [None]:
# def load_pdf_directory(directory_path):
#     loader = PyPDFDirectoryLoader(directory_path)
#     pages = loader.load()
#     return pages

def _normalize_linebreaks(text):
    """Remove line-wrap noise from ``text`` while keeping paragraph breaks.

    * A hyphen directly followed by a newline is treated as a word broken
      across lines and is removed together with the newline.
    * A blank line (two newlines, optionally with whitespace in between) is a
      paragraph boundary and is normalised to exactly ``"\\n\\n"``.
    * Any remaining single newline is a soft wrap and becomes a space.
    """
    import re  # local import: the notebook's import cell does not provide `re`

    # Restore words that were hyphenated across a line break.
    text = text.replace("-\n", "")
    # Split on blank lines first so paragraph boundaries survive the cleanup.
    paragraphs = re.split(r"\n\s*\n", text)
    return "\n\n".join(paragraph.replace("\n", " ") for paragraph in paragraphs)


def load_pdf_directory(directory_path):
    """Load every PDF page under ``directory_path`` and clean its line breaks.

    The loader is created with ``recursive=True`` and ``silent_errors=True``.
    Each returned page has its ``page_content`` rewritten in place: wrapped
    lines are joined, but paragraph breaks are kept as ``"\\n\\n"`` so that a
    downstream text splitter configured with newline separators still has
    paragraph boundaries to split on.

    Args:
        directory_path: Directory handed to ``PyPDFDirectoryLoader``.

    Returns:
        The list of page documents produced by the loader (possibly empty).
    """
    loader = PyPDFDirectoryLoader(directory_path, recursive=True, silent_errors=True)
    pages = loader.load()

    for page in pages:
        page.page_content = _normalize_linebreaks(page.page_content)

    return pages
# Directory scanned for PDF files (relative to the notebook's working directory).
pdf_paths = "./data/pdf"
pdf_data = load_pdf_directory(pdf_paths)

# Fail fast with a clear message: an empty result (wrong working directory,
# missing folder, or only unreadable PDFs) would otherwise surface much later
# as an obscure error when the vector store is built from zero documents.
if not pdf_data:
    raise RuntimeError(f"No PDF pages could be loaded from {pdf_paths!r}")

In [None]:
# Preview the first three loaded page documents to sanity-check the extraction.
pdf_data[:3]

In [None]:

def split_documents(documents, chunk_size=400, chunk_overlap=50):
    """Split ``documents`` into overlapping chunks for embedding.

    Separators are tried from coarse to fine: blank lines, single newlines,
    sentence-ending punctuation, clause punctuation, spaces, and finally the
    empty string. The empty-string fallback lets a run of text that contains
    none of the other separators still be cut down to ``chunk_size``.

    Args:
        documents: Documents to split (passed to the splitter unchanged).
        chunk_size: Maximum chunk length, measured with ``len``. Defaults to
            400, the value previously hard-coded here.
        chunk_overlap: Number of characters shared between neighbouring
            chunks. Defaults to 50, the value previously hard-coded here.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` returns
        for ``documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # Regex separators, coarse to fine; "" is the last-resort per-character split.
        separators=[r"\n{2,}", r"\n", r"[.!?]", r"[,;:]", r" ", ""],
        is_separator_regex=True,
    )
    return text_splitter.split_documents(documents)

# Split every loaded PDF page into overlapping chunks ready for embedding.
chunks = split_documents(pdf_data)

# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=400,  # 한 청크에 너무 많은 문장이 담기지 않도록
#     chunk_overlap=50,  # 앞뒤 문맥 연결 위해 소폭 겹침
#     separators=[r"\n{2,}", r"\n", r"[.!?]", r"[,;:]", r" "],  # 문단, 줄, 문장, 쉼표, 공백 순으로 분할
#     is_separator_regex=True
# )

# chunks = text_splitter.split_documents(pdf_data)

# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=500,  # PDF 문서는 더 작은 청크로 나누는 것이 좋음
#     chunk_overlap=50, # 청크 간 중복도 줄임
#     length_function=len,
#     separators=["\n\n", "\n", ".", "!", "?", ";", ":", " ", ""],  # PDF 문서의 구조를 고려한 구분자 추가
#     is_separator_regex=False
# )

# chunks = text_splitter.split_documents(pdf_data)

In [None]:
# Create the FAISS vector store: embed every chunk and index the vectors in memory.
db = FAISS.from_documents(documents=chunks, embedding=embeddings)

In [None]:
# Save the FAISS vector store to disk so it can be reloaded without re-embedding.
# NOTE(review): no explicit os.makedirs here; save_local is presumed to create
# the target directory itself. Confirm against the installed langchain version.
faiss_index_directory = "./faiss_pdf_directory"
db.save_local(folder_path=faiss_index_directory)

In [None]:
# Similarity search against the in-memory FAISS vector store.
query = "피씨엔 회사소개"
docs = db.similarity_search(query, k=3)  # k = number of documents to return

# Print each hit with its 1-based rank, its text, and its source metadata.
for rank, doc in enumerate(docs, start=1):
    print(
        f"\n## 검색결과 {rank}",
        f"내용: {doc.page_content}",
        f"출처: {doc.metadata}",
        sep="\n",
    )

In [None]:
# Reload the persisted index from disk and wrap it in a thresholded retriever.
faiss_index_directory = "./faiss_pdf_directory"

# NOTE(review): allow_dangerous_deserialization=True opts in to loading pickled
# data from the index directory. Only do this for index files this notebook (or
# another trusted source) produced; never for downloaded or untrusted indexes.
vectorstore = FAISS.load_local(
    faiss_index_directory,
    embeddings,
    allow_dangerous_deserialization=True,
)

# Return at most 3 documents, and only those scoring at least 0.5.
# NOTE(review): presumably relies on relevance scores falling in [0, 1] for
# this index's distance metric; verify the scores actually returned.
retriever_search_options = {"score_threshold": 0.5, "k": 3}
retriever = vectorstore.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs=retriever_search_options,
)

In [None]:
# Query the reloaded vector store directly (top 3 hits, no score threshold);
# the result is shown as the cell output.
vectorstore.similarity_search("AI 프로젝트 소개", k=3)