In [None]:
from langchain_community.document_loaders import WebBaseLoader
from sentence_transformers import SentenceTransformer
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

In [None]:
# Load the sentence-transformers model from the local directory ./models/BGE-m3-ko.
# The resulting object is shared by the cells below (vector store setup and
# the web-page upload helper), which call it to produce embeddings.
embedding_model = SentenceTransformer('./models/BGE-m3-ko')

In [None]:
# Qdrant client setup.
# langchain_core is the base package langchain_qdrant is built on, so it is
# already installed wherever QdrantVectorStore can be imported.
from langchain_core.embeddings import Embeddings


class _SentenceTransformerEmbeddings(Embeddings):
    """Expose a SentenceTransformer through the LangChain ``Embeddings`` API.

    ``QdrantVectorStore`` calls ``embed_documents`` / ``embed_query`` on the
    object passed as ``embedding``; a raw ``SentenceTransformer`` only offers
    ``encode``, so it cannot be passed directly.
    """

    def __init__(self, model):
        # Reuse the already-loaded model instead of loading it a second time.
        self._model = model

    def embed_documents(self, texts):
        """Return one embedding (list of floats) per input text."""
        return self._model.encode(list(texts)).tolist()

    def embed_query(self, text):
        """Return the embedding (list of floats) of a single query string."""
        return self._model.encode(text).tolist()


qdrant_client = QdrantClient(
    host="qdrant",
    port=6333,
)

# Create the collection only when it is missing so that re-running this cell
# does not fail, and take the vector size from the model itself so it cannot
# drift from what the model actually produces.
# NOTE: a "demo_collection" created earlier with a different vector size must
# be deleted manually; it is deliberately not dropped here to avoid data loss.
if not qdrant_client.collection_exists("demo_collection"):
    qdrant_client.create_collection(
        collection_name="demo_collection",
        vectors_config=VectorParams(
            size=embedding_model.get_sentence_embedding_dimension(),
            distance=Distance.COSINE,
        ),
    )

vector_store = QdrantVectorStore(
    client=qdrant_client,
    collection_name="demo_collection",
    embedding=_SentenceTransformerEmbeddings(embedding_model),
)

In [None]:


def embed_and_upload_from_web(collection_name, urls):
    """Load web pages, embed their text and upsert the vectors into Qdrant.

    Args:
        collection_name: Name of the target Qdrant collection. It is created
            (cosine distance, vector size taken from the embeddings) when it
            does not exist yet; an existing collection is left untouched.
        urls: URL or list of URLs handed to ``WebBaseLoader``.

    Returns:
        None. Prints how many documents were uploaded.
    """
    # Imported locally because the module-level name ``qdrant_client`` is the
    # QdrantClient *instance*, not the package, so ``qdrant_client.models``
    # does not exist on it.
    from qdrant_client.http.models import PointStruct

    # Load the documents from the web.
    docs = WebBaseLoader(urls).load()
    texts = [doc.page_content for doc in docs]

    points = []
    if texts:  # nothing to embed or upload when no page could be loaded
        # Embed every page text.
        embeddings = embedding_model.encode(texts).tolist()

        # Create the collection only if it is missing; never drop existing data.
        if not qdrant_client.collection_exists(collection_name):
            qdrant_client.create_collection(
                collection_name=collection_name,
                vectors_config=VectorParams(
                    size=len(embeddings[0]),
                    distance=Distance.COSINE,
                ),
            )

        # NOTE: ids are the positions in ``texts``, so a later call with a
        # different URL list overwrites the points stored under the same ids.
        points = [
            PointStruct(id=idx, vector=vector, payload={"text": text})
            for idx, (vector, text) in enumerate(zip(embeddings, texts))
        ]

        # Upload the points.
        qdrant_client.upsert(
            collection_name=collection_name,
            points=points,
        )

    print(f"{len(points)}개의 웹 문서가 Qdrant에 임베딩되어 업로드되었습니다.")

# Example usage.
# Each URL must be followed by a comma: adjacent string literals without one
# are concatenated by Python into a single (invalid) URL.
urls = [
    "https://www.pcninc.co.kr/",
    "https://www.pcninc.co.kr/digital/ai.do",
    "https://www.pcninc.co.kr/digital/bigdata.do",
    "https://www.pcninc.co.kr/digital/xrcontents.do",
    "https://www.pcninc.co.kr/digital/portfolio/list.do",
    "https://www.pcninc.co.kr/siux/public.do",
    "https://www.pcninc.co.kr/siux/finance.do",
    "https://www.pcninc.co.kr/siux/brand.do",
    "https://www.pcninc.co.kr/siux/health.do",
    "https://www.pcninc.co.kr/solution/oasis.do",
    "https://www.pcninc.co.kr/solution/apim.do",
    "https://www.pcninc.co.kr/solution/esearch.do",
    "https://www.pcninc.co.kr/solution/oasisx.do",
    "https://www.pcninc.co.kr/solution/datamap.do",
    "https://www.pcninc.co.kr/solution/trenddata.do",
    "https://www.pcninc.co.kr/solution/ozai.do",
    "https://www.pcninc.co.kr/company/introduce.do",
    "https://www.pcninc.co.kr/company/business.do?accYear=2023",
    "https://www.pcninc.co.kr/company/benefit.do",
    "https://www.pcninc.co.kr/company/history.do",
    "https://www.pcninc.co.kr/company/location.do",
    "https://www.pcninc.co.kr/ir/disinfo/list.do?page=1&pageSize=10",
    "https://www.pcninc.co.kr/notice/press/list.do?page=1&pageSize=6",
    "https://www.pcninc.co.kr/notice/plus/list.do?page=1&pageSize=6",
    "https://www.pcninc.co.kr/notice/news/list.do?page=1&pageSize=6",
]

embed_and_upload_from_web("pcn_web_intro", urls)

