In [None]:
from bs4 import SoupStrainer
from langchain_community.document_loaders import WebBaseLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
from bs4 import SoupStrainer
from langchain_community.document_loaders import WebBaseLoader, SeleniumURLLoader

# Every page to crawl lives on the same host, so the host is written once and
# the per-page paths (including any query string) are listed by site section.
PCN_BASE_URL = "https://www.pcninc.co.kr"
PCN_PAGE_PATHS = (
    # Landing page
    "/",
    # Digital
    "/digital/ai.do",
    "/digital/bigdata.do",
    "/digital/xrcontents.do",
    "/digital/portfolio/list.do",
    # SI / UX
    "/siux/public.do",
    "/siux/finance.do",
    "/siux/brand.do",
    "/siux/health.do",
    # Solutions
    "/solution/oasis.do",
    "/solution/apim.do",
    "/solution/esearch.do",
    "/solution/oasisx.do",
    "/solution/datamap.do",
    "/solution/trenddata.do",
    "/solution/ozai.do",
    # Company
    "/company/introduce.do",
    "/company/business.do?accYear=2023",
    "/company/benefit.do",
    "/company/history.do",
    "/company/location.do",
    # Investor relations
    "/ir/disinfo/list.do?page=1&pageSize=10",
    # Notices (first page of each board only)
    "/notice/press/list.do?page=1&pageSize=6",
    "/notice/plus/list.do?page=1&pageSize=6",
    "/notice/news/list.do?page=1&pageSize=6",
)

# Absolute URLs, in the same order as the paths above.
urls = [f"{PCN_BASE_URL}{page_path}" for page_path in PCN_PAGE_PATHS]
    
# Restrict BeautifulSoup parsing to the tags listed here (passed as parse_only).
content_tag_filter = SoupStrainer(["p", "h1", "h2", "h3", "div", "span"])

loader = WebBaseLoader(
    web_paths=urls,
    # Send a desktop Chrome User-Agent with every request.
    header_template={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36",
    },
    bs_kwargs={"parse_only": content_tag_filter},
)

# Fetch all pages, then report how many documents came back.
docs = loader.load()
print(len(docs))


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded pages into chunks of at most 1024 characters, with a
# 100-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=100,
    length_function=len,  # chunk size is measured with len(), i.e. in characters
    is_separator_regex=False,  # the separators below are literal strings, not regexes
    separators=["\n\n", "\n", " ", ""],
)

split_docs = text_splitter.split_documents(docs)

# Show the chunk count and a short preview instead of echoing every chunk:
# dumping the whole list floods the cell output and bloats the saved notebook.
print(len(split_docs))
split_docs[:3]

In [None]:
import os
from pathlib import Path
# root_path is the directory two levels above the current working directory;
# the embedding model is looked up under it at ollama-service/models/BGE-m3-ko.
root_path = Path.cwd().parent.parent
embeddings_path = root_path.joinpath("ollama-service", "models", "BGE-m3-ko")

# Print both paths, one per line, so they can be checked by eye.
print(root_path, embeddings_path, sep="\n")

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

# Load the HuggingFace embedding model from the local model directory.
# The Path is converted to str before being passed as model_name to avoid errors.
embedding_model = HuggingFaceEmbeddings(model_name=str(embeddings_path))

In [None]:
# Collect the text of every chunk under a descriptive name, then embed the
# whole batch in a single embed_documents call.
document_contents = [chunk.page_content for chunk in split_docs]
embeddings = embedding_model.embed_documents(document_contents)

In [None]:
from langchain_community.vectorstores import FAISS

In [None]:
# Build the index from the vectors already held in `embeddings` rather than
# calling FAISS.from_documents, which would embed every chunk a second time.
# The length check catches a stale `embeddings` list (e.g. the split cell was
# re-run without re-running the embedding cell).
assert len(embeddings) == len(split_docs), (
    "embeddings do not match split_docs; re-run the embedding cell"
)
db = FAISS.from_embeddings(
    text_embeddings=list(zip((doc.page_content for doc in split_docs), embeddings)),
    embedding=embedding_model,  # still needed so the index can embed queries
    metadatas=[doc.metadata for doc in split_docs],
)

In [None]:
# Write the FAISS index to the ./vector_db/pcn_web folder on disk.
db.save_local(folder_path="./vector_db/pcn_web")

In [None]:
# Reload the index from disk, rebinding `db` to the loaded copy.
# NOTE(review): allow_dangerous_deserialization=True opts in to pickle-based
# loading; only point this at index folders you produced yourself.
db = FAISS.load_local(
    folder_path="./vector_db/pcn_web",
    embeddings=embedding_model,
    allow_dangerous_deserialization=True,
)

In [None]:
# Embed the query text (Korean, roughly "PCN introduction") with the same
# embedding model that is attached to the index.
query_embedding = embedding_model.embed_query(text="피씨엔 소개")

# Vector similarity search: k=10 asks for the ten closest chunks.
results = db.similarity_search_by_vector(embedding=query_embedding, k=10)

In [None]:
# Display the documents returned by the similarity search as the cell output.
results