# Docling

In [None]:
import os
# Set HF_HOME before the docling / langchain imports below run
# (presumably so Hugging Face downloads land in this directory - TODO confirm).
os.environ["HF_HOME"] = "../huggingface_data"

from pathlib import Path
from langchain_docling import DoclingLoader
from docling.chunking import HybridChunker

# Input location and chunking configuration handed to every DoclingLoader.
DATA_DIR = Path("../data")
EXPORT_TYPE = "doc_chunks"
EMBED_MODEL_ID = "Qwen/Qwen3-Embedding-8B"
CHUNKER = HybridChunker(tokenizer=EMBED_MODEL_ID, max_tokens=1000)

# Every regular file below DATA_DIR (recursive) whose own name does not
# start with a dot.
all_files = list(
    filter(
        lambda p: p.is_file() and not p.name.startswith("."),
        DATA_DIR.rglob("*"),
    )
)

# all_files = all_files[:3]

# Chunk the files one by one. A failing file is reported and skipped so the
# remaining files are still processed.
all_docs = []
for file_path in all_files:
    try:
        loader = DoclingLoader(file_path=str(file_path), export_type=EXPORT_TYPE, chunker=CHUNKER)
        docs = loader.load()
        all_docs += docs
        print(f"{file_path.name} → {len(docs)} chunks 생성 완료")
    except Exception as e:
        print(f"{file_path.name} 처리 중 오류 발생: {e}")

In [None]:
# Display how many chunks were collected across all files.
len(all_docs)

In [None]:
import hashlib
from langchain_core.documents import Document

def normalize_dedup_convert_inplace(docs: "list[Document | dict]") -> None:
    """Replace the contents of ``docs`` with unique ``text``/``pk``/``source`` dicts.

    Every item becomes a plain dict:

    * ``text``   - the item's ``page_content``.
    * ``pk``     - ``metadata["pk"]`` when present, otherwise the SHA-256 of
      the UTF-8 text reduced modulo ``2**63`` (fits a signed 64-bit integer).
    * ``source`` - ``metadata["source"]`` as a list with ``None`` entries
      dropped; a scalar becomes a one-element list, a missing value ``[]``.

    Items sharing a ``pk`` are merged: the first occurrence is kept, later
    ones only contribute sources that are not listed yet (encounter order).

    Items that are already dicts (e.g. when the notebook cell is run a second
    time) are accepted as records; they are shallow-copied and any extra keys
    such as an embedding are preserved.

    Args:
        docs: List to rewrite. The list object itself is kept; its contents
            are swapped in one step after all items were processed, so an
            exception part-way through leaves the list as it was.
    """
    # pk -> first record seen with that pk; dicts keep insertion order, which
    # gives the "first occurrence wins, original order preserved" result.
    records: dict = {}

    for doc in docs:
        if isinstance(doc, dict):
            record = dict(doc)
            text = record["text"]
            pk = record.get("pk")
            src = record.get("source")
        else:
            md = getattr(doc, "metadata", {}) or {}
            text = doc.page_content
            pk = md.get("pk")
            src = md.get("source")
            record = {"text": text}

        if isinstance(src, list):
            src_list = [s for s in src if s is not None]
        elif src is None:
            src_list = []
        else:
            src_list = [src]

        if pk is None:
            digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
            pk = int(digest, 16) % (1 << 63)

        base = records.get(pk)
        if base is None:
            record["pk"] = pk
            record["source"] = src_list
            records[pk] = record
            continue

        # Duplicate pk: fold any new sources into the first record.
        merged = base["source"]
        for s in src_list:
            if s not in merged:
                merged.append(s)

    # Single slice assignment instead of repeated `del docs[i]` (each of
    # which shifts the tail of the list).
    docs[:] = records.values()

In [None]:
# Convert the loaded chunks to plain dicts and merge duplicates, in place.
normalize_dedup_convert_inplace(all_docs)

In [None]:
# Display the number of records left after de-duplication.
len(all_docs)

In [None]:
# Display the first three converted records as a spot check.
all_docs[:3]

In [None]:
from typing import Any, Iterable
import time
from openai import OpenAI

# Endpoint and model used for computing the document embeddings.
OPENAI_URL = "http://127.0.0.1:9804/v1"
EMBED_MODEL_ID = "Qwen/Qwen3-Embedding-4B"

# Client bound to OPENAI_URL; the API key is the placeholder "EMPTY".
client = OpenAI(api_key="EMPTY", base_url=OPENAI_URL)

def _chunks(seq: list[dict[str, Any]], size: int) -> Iterable[list[dict[str, Any]]]:
    for i in range(0, len(seq), size):
        yield seq[i:i+size]

def embed_docs_inplace(
    docs: list[dict[str, Any]],
    *,
    model: str = EMBED_MODEL_ID,
    batch_size: int = 128,
    text_key: str = "text",
    vector_key: str = "vector",
    max_retries: int = 5,
    backoff_base: float = 1.5,
    embed_client: Any = None,
) -> None:
    """Attach an embedding to every dict in ``docs``, in place.

    The docs are processed in batches of ``batch_size``. Within a batch, each
    doc whose ``text_key`` value is a non-blank string is sent to
    ``embeddings.create``; the resulting vector is stored under
    ``vector_key``. Docs with a missing, non-string or blank text get
    ``vector_key = None`` without any request being made.

    A failing request is retried up to ``max_retries`` times, sleeping
    ``backoff_base ** attempt`` seconds between attempts. When the retries
    are exhausted the whole batch gets ``vector_key = None``, the error is
    printed, and processing continues with the next batch.

    Args:
        docs: Records to update; the dicts themselves are mutated.
        model: Model name passed to the embeddings endpoint.
        batch_size: Maximum number of docs per request.
        text_key: Key holding the text to embed.
        vector_key: Key that receives the embedding (or ``None``).
        max_retries: Number of retries after the first failed attempt.
        backoff_base: Base of the exponential back-off, in seconds.
        embed_client: Optional object exposing ``embeddings.create``. When
            omitted, the module-level ``client`` is looked up at call time.
            Passing it explicitly is useful because the notebook later
            rebinds ``client`` to a Milvus client.
    """
    api = client if embed_client is None else embed_client

    for batch in _chunks(docs, batch_size):
        inputs: list[str] = []
        idx_map: list[int] = []  # position in `batch` of each entry of `inputs`
        for j, d in enumerate(batch):
            text = d.get(text_key)
            if not isinstance(text, str) or not text.strip():
                d[vector_key] = None
                continue
            inputs.append(text)
            idx_map.append(j)

        if not inputs:
            continue

        attempt = 0
        while True:
            try:
                resp = api.embeddings.create(model=model, input=inputs)
                # A short or long response would silently mis-assign or
                # skip vectors, so treat it like any other failed attempt.
                if len(resp.data) != len(inputs):
                    raise ValueError(
                        f"expected {len(inputs)} embeddings, got {len(resp.data)}"
                    )
                for k, item in enumerate(resp.data):
                    batch[idx_map[k]][vector_key] = item.embedding
                break
            except Exception as e:
                attempt += 1
                if attempt > max_retries:
                    for j in idx_map:
                        batch[j][vector_key] = None
                    # Report the give-up instead of swallowing it silently.
                    print(f"임베딩 실패 ({len(inputs)}건, 재시도 {max_retries}회 초과): {e}")
                    break
                time.sleep(backoff_base ** attempt)

In [None]:
# Compute embeddings for all records; each dict gains a "vector" entry
# (None when the text is blank or the request ultimately failed).
embed_docs_inplace(all_docs)

In [None]:
from pymilvus import MilvusClient

# Milvus endpoint and client used by the cells that follow.
# NOTE(review): this rebinds the global `client` that embed_docs_inplace
# reads at call time; re-running the embedding cell after this one would
# hit the Milvus client instead of the OpenAI one.
URI = "http://localhost:19530"
client = MilvusClient(uri=URI, token="root:Milvus")

In [None]:
# Print the databases returned by the Milvus client.
databases = client.list_databases()
print(databases)

In [None]:
# Describe the "doc_embeddings" collection inside the "doc_embeddings" database.
client.use_database("doc_embeddings")

collection_name = "doc_embeddings"

desc = client.describe_collection(collection_name=collection_name)
desc  # last expression of the cell, so the notebook displays it

In [None]:
# Describe the "doc_embeddings" collection inside the "doc_embeddings2" database.
client.use_database("doc_embeddings2")

collection_name = "doc_embeddings"

desc = client.describe_collection(collection_name=collection_name)
desc  # last expression of the cell, so the notebook displays it

In [None]:
from pymilvus import connections, Collection

# 1) Milvus 접속
# 1) Connect to Milvus (database "doc_embeddings")
connections.connect(alias="default", uri="http://127.0.0.1:19530", token="root:Milvus", db_name="doc_embeddings")

# 2) Handle for the existing collection (only its name is required)
col = Collection(name="doc_embeddings")  # <- your collection name

# 3) Print every index defined on the collection
if col.indexes:
    for idx in col.indexes:
        print("=== Index Info ===")
        print("Collection :", col.name)
        print("Field      :", idx.field_name)   # field the index is built on (e.g. 'vector')
        print("Index Name :", idx.index_name)   # name of the index (e.g. 'vector')
        print("Params     :", idx.params)       # {'index_type': 'FLAT', 'metric_type': 'L2', ...}
        # Convenience output of the two most relevant parameters
        print("Index Type :", idx.params.get("index_type"))
        print("Metric Type:", idx.params.get("metric_type"))
        print()
else:
    print("인덱스가 없습니다.")


In [None]:
from pymilvus import connections, Collection

# 1) Milvus 접속
# 1) Connect to Milvus (database "doc_embeddings2")
connections.connect(alias="default", uri="http://127.0.0.1:19530", token="root:Milvus", db_name="doc_embeddings2")

# 2) Handle for the existing collection (only its name is required)
col = Collection(name="doc_embeddings")  # <- your collection name

# 3) Print every index defined on the collection
if col.indexes:
    for idx in col.indexes:
        print("=== Index Info ===")
        print("Collection :", col.name)
        print("Field      :", idx.field_name)   # field the index is built on (e.g. 'vector')
        print("Index Name :", idx.index_name)   # name of the index (e.g. 'vector')
        print("Params     :", idx.params)       # {'index_type': 'FLAT', 'metric_type': 'L2', ...}
        # Convenience output of the two most relevant parameters
        print("Index Type :", idx.params.get("index_type"))
        print("Metric Type:", idx.params.get("metric_type"))
        print()
else:
    print("인덱스가 없습니다.")

In [None]:
# Create a scratch database named "my_test".
db_name = "my_test"
client.create_database(db_name=db_name)

In [None]:
# Point the client at the "my_test" database for the following cells.
client.use_database("my_test")

In [None]:
from pymilvus import MilvusClient, DataType

schema = client.create_schema()

# Fields: chunk text, caller-supplied INT64 primary key, embedding, source list.
schema.add_field(field_name="text", datatype=DataType.VARCHAR, max_length=65535)
schema.add_field(field_name="pk", datatype=DataType.INT64, is_primary=True, auto_id=False)
# NOTE(review): dim has to equal the output size of the embedding model that
# produced all_docs - confirm 2560 for the model in use.
schema.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=2560)
schema.add_field(field_name="source", datatype=DataType.JSON)

# FLAT index with L2 distance on the embedding field.
index_params = client.prepare_index_params()
index_params.add_index(field_name="vector", index_name="vector", index_type="FLAT", metric_type="L2")

# Start from a clean slate: an existing "my_test" collection is dropped first.
if client.has_collection("my_test"):
    client.drop_collection("my_test")
client.create_collection(collection_name="my_test", schema=schema, index_params=index_params)

In [None]:
# embed_docs_inplace stores None under "vector" for blank texts and for
# batches whose request failed; such rows cannot populate the FLOAT_VECTOR
# field, so only records that actually carry an embedding are inserted.
res = client.insert(
    collection_name="my_test",
    data=[d for d in all_docs if d.get("vector") is not None],
)

In [None]:
# Display the value returned by client.insert.
res

In [None]:
from pymilvus import MilvusClient

# Fresh Milvus client, switched to the "doc_embeddings" database.
URI = "http://localhost:19530"
client = MilvusClient(uri=URI, token="root:Milvus")
client.use_database("doc_embeddings")

def fetch_any_vector(
    client: "MilvusClient",
    collection_name: str,
    vector_field: str = "vector",
    limit: int = 171,
):
    """Return up to ``limit`` rows of ``collection_name`` (scalar fields only).

    Runs ``client.query`` with the filter ``pk >= 0`` and requests the
    ``pk``, ``source`` and ``text`` fields; no vector field is fetched.

    Args:
        client: Object exposing a MilvusClient-style ``query`` method.
        collection_name: Collection to read from.
        vector_field: Unused; kept so existing calls keep working.
        limit: Maximum number of rows to request.

    Returns:
        Whatever ``client.query`` returned, or ``None`` when that result is
        empty.
    """
    rows = client.query(
        collection_name=collection_name,
        filter="pk >= 0",  # selects every row whose pk is non-negative
        output_fields=["pk", "source", "text"],
        limit=limit,
    )
    return rows or None

# Fetch rows from the "doc_embeddings" collection (None when nothing comes back).
rows = fetch_any_vector(client, "doc_embeddings")

In [None]:
# Preview the first 5 characters of each row's text, one blank line apart.
for row in rows:
    print(row["text"][:5], end="\n\n")

In [None]:
# Display how many rows the query returned.
len(rows)

In [None]:
from pymilvus import MilvusClient

from langchain_milvus import Milvus
from langchain_openai import OpenAIEmbeddings

# -------------------------------
# Milvus + Embedding + LLM 설정
# -------------------------------
# Settings for this search: Milvus endpoint/database/collection and the
# embedding endpoint/model used for the query vector.
MILVUS_URI = "http://localhost:19530"
DB_NAME = "doc_embeddings"
COLLECTION_NAME = "doc_embeddings"
EMBED_MODEL_ID = "Qwen/Qwen3-Embedding-4B"
OPENAI_URL = "http://127.0.0.1:9804/v1"

embeddings = OpenAIEmbeddings(api_key="EMPTY", base_url=OPENAI_URL, model=EMBED_MODEL_ID)

# (2) Connect to Milvus
vector_store = Milvus(
    embedding_function=embeddings,
    collection_name=COLLECTION_NAME,
    connection_args=dict(uri=MILVUS_URI, token="root:Milvus", db_name=DB_NAME),
    index_params=dict(index_type="FLAT", metric_type="L2"),
)

# Top-20 similarity search for the query string.
documents = vector_store.similarity_search("아순시온", k=20)

In [None]:
# Preview the first 10 characters of each hit, one blank line apart.
for doc in documents:
    print(doc.page_content[:10], end="\n\n")

In [None]:
# Display the full search result list.
documents

In [None]:
from pymilvus import MilvusClient

from langchain_milvus import Milvus
from langchain_openai import OpenAIEmbeddings

# -------------------------------
# Milvus + Embedding + LLM 설정
# -------------------------------
# Settings for this search: Milvus endpoint/database/collection and the
# embedding endpoint/model used for the query vector.
MILVUS_URI = "http://localhost:19530"
DB_NAME = "doc_embeddings"
COLLECTION_NAME = "doc_embeddings"
EMBED_MODEL_ID = "Qwen/Qwen3-Embedding-8B"
OPENAI_URL = "http://127.0.0.1:9804/v1"

embeddings = OpenAIEmbeddings(api_key="EMPTY", base_url=OPENAI_URL, model=EMBED_MODEL_ID)

# (2) Connect to Milvus
vector_store = Milvus(
    embedding_function=embeddings,
    collection_name=COLLECTION_NAME,
    connection_args=dict(uri=MILVUS_URI, token="root:Milvus", db_name=DB_NAME),
    index_params=dict(index_type="FLAT", metric_type="L2"),
)

# Top-20 similarity search for the query string.
documents = vector_store.similarity_search("스마트시티", k=20)

In [None]:
# Preview the first 10 characters of each hit, one blank line apart.
for doc in documents:
    print(doc.page_content[:10], end="\n\n")

In [None]:
from pymilvus import MilvusClient

from langchain_milvus import Milvus
from langchain_openai import OpenAIEmbeddings

# -------------------------------
# Milvus + Embedding + LLM 설정
# -------------------------------
# Settings for this search: Milvus endpoint/database/collection and the
# embedding endpoint/model used for the query vector.
MILVUS_URI = "http://localhost:19530"
DB_NAME = "doc_embeddings"
COLLECTION_NAME = "doc_embeddings"
EMBED_MODEL_ID = "Qwen/Qwen3-Embedding-8B"
OPENAI_URL = "http://127.0.0.1:9804/v1"

embeddings = OpenAIEmbeddings(api_key="EMPTY", base_url=OPENAI_URL, model=EMBED_MODEL_ID)

# (2) Connect to Milvus
vector_store = Milvus(
    embedding_function=embeddings,
    collection_name=COLLECTION_NAME,
    connection_args=dict(uri=MILVUS_URI, token="root:Milvus", db_name=DB_NAME),
    index_params=dict(index_type="FLAT", metric_type="L2"),
)

# Top-3 similarity search for a full-sentence question.
documents = vector_store.similarity_search("스마트시티의 진행은 어떻게 되가?", k=3)

In [None]:
# Preview the first 10 characters of each hit, one blank line apart.
for doc in documents:
    print(doc.page_content[:10], end="\n\n")

In [None]:
from pymilvus import MilvusClient

from langchain_milvus import Milvus
from langchain_openai import OpenAIEmbeddings

# -------------------------------
# Milvus + Embedding + LLM 설정
# -------------------------------
# Settings for this search; note that DB_NAME targets "doc_embeddings2".
MILVUS_URI = "http://localhost:19530"
DB_NAME = "doc_embeddings2"
COLLECTION_NAME = "doc_embeddings"
EMBED_MODEL_ID = "Qwen/Qwen3-Embedding-8B"
OPENAI_URL = "http://127.0.0.1:9804/v1"

embeddings = OpenAIEmbeddings(api_key="EMPTY", base_url=OPENAI_URL, model=EMBED_MODEL_ID)

# (2) Connect to Milvus
vector_store = Milvus(
    embedding_function=embeddings,
    collection_name=COLLECTION_NAME,
    connection_args=dict(uri=MILVUS_URI, token="root:Milvus", db_name=DB_NAME),
    index_params=dict(index_type="FLAT", metric_type="L2"),
)

# Top-50 similarity search for a full-sentence question.
documents = vector_store.similarity_search("스마트시티의 진행은 어떻게 되가?", k=50)

In [None]:
# Preview the first 10 characters of each hit, one blank line apart.
for doc in documents:
    print(doc.page_content[:10], end="\n\n")

In [None]:
from pymilvus import MilvusClient

from langchain_milvus import Milvus
from langchain_openai import OpenAIEmbeddings

# -------------------------------
# Milvus + Embedding + LLM 설정
# -------------------------------
# Settings for this search: Milvus endpoint/database/collection and the
# embedding endpoint/model used for the query vector.
MILVUS_URI = "http://localhost:19530"
DB_NAME = "doc_embeddings"
COLLECTION_NAME = "doc_embeddings"
EMBED_MODEL_ID = "Qwen/Qwen3-Embedding-8B"
OPENAI_URL = "http://127.0.0.1:9804/v1"

embeddings = OpenAIEmbeddings(api_key="EMPTY", base_url=OPENAI_URL, model=EMBED_MODEL_ID)

# (2) Connect to Milvus
vector_store = Milvus(
    embedding_function=embeddings,
    collection_name=COLLECTION_NAME,
    connection_args=dict(uri=MILVUS_URI, token="root:Milvus", db_name=DB_NAME),
    index_params=dict(index_type="FLAT", metric_type="L2"),
)

# Top-50 similarity search for a full-sentence question.
documents = vector_store.similarity_search("스마트시티의 진행은 어떻게 되가?", k=50)

In [None]:
# Preview the first 10 characters of each hit, one blank line apart.
for doc in documents:
    print(doc.page_content[:10], end="\n\n")

In [None]:
from pymilvus import MilvusClient

# Fresh Milvus client, switched to the "doc_embeddings" database.
URI = "http://localhost:19530"
client = MilvusClient(uri=URI, token="root:Milvus")
client.use_database("doc_embeddings")

def fetch_any_vector(
    client: "MilvusClient",
    collection_name: str,
    vector_field: str = "vector",
    limit: int = 9999,
):
    """Return up to ``limit`` rows of ``collection_name`` (scalar fields only).

    Runs ``client.query`` with the filter ``pk >= 0`` and requests the
    ``pk``, ``source`` and ``text`` fields; no vector field is fetched.

    Args:
        client: Object exposing a MilvusClient-style ``query`` method.
        collection_name: Collection to read from.
        vector_field: Unused; kept so existing calls keep working.
        limit: Maximum number of rows to request.

    Returns:
        Whatever ``client.query`` returned, or ``None`` when that result is
        empty.
    """
    rows = client.query(
        collection_name=collection_name,
        filter="pk >= 0",  # selects every row whose pk is non-negative
        output_fields=["pk", "source", "text"],
        limit=limit,
    )
    return rows or None

# Fetch rows from the "doc_embeddings" collection (None when nothing comes back).
rows = fetch_any_vector(client, "doc_embeddings")

In [None]:
# Preview the first 10 characters of each row's text, one blank line apart.
for row in rows:
    print(row["text"][:10], end="\n\n")

In [None]:
from pymilvus import MilvusClient

# Fresh Milvus client, switched to the "doc_embeddings2" database.
URI = "http://localhost:19530"
client = MilvusClient(uri=URI, token="root:Milvus")
client.use_database("doc_embeddings2")

def fetch_any_vector(
    client: "MilvusClient",
    collection_name: str,
    vector_field: str = "vector",
    limit: int = 9999,
):
    """Return up to ``limit`` rows of ``collection_name`` (scalar fields only).

    Runs ``client.query`` with the filter ``pk >= 0`` and requests the
    ``pk``, ``source`` and ``text`` fields; no vector field is fetched.

    Args:
        client: Object exposing a MilvusClient-style ``query`` method.
        collection_name: Collection to read from.
        vector_field: Unused; kept so existing calls keep working.
        limit: Maximum number of rows to request.

    Returns:
        Whatever ``client.query`` returned, or ``None`` when that result is
        empty.
    """
    rows = client.query(
        collection_name=collection_name,
        filter="pk >= 0",  # selects every row whose pk is non-negative
        output_fields=["pk", "source", "text"],
        limit=limit,
    )
    return rows or None

# Fetch rows from the "doc_embeddings" collection of the currently selected
# database (None when nothing comes back).
rows = fetch_any_vector(client, "doc_embeddings")

In [None]:
# Preview the first 10 characters of each row's text, one blank line apart.
for row in rows:
    print(row["text"][:10], end="\n\n")

In [None]:
from pymilvus import MilvusClient

from langchain_milvus import Milvus
from langchain_openai import OpenAIEmbeddings

# -------------------------------
# Milvus + Embedding + LLM 설정
# -------------------------------
# Settings for this search: Milvus endpoint/database/collection and the
# embedding endpoint/model used for the query vector.
MILVUS_URI = "http://localhost:19530"
DB_NAME = "doc_embeddings"
COLLECTION_NAME = "doc_embeddings"
EMBED_MODEL_ID = "Qwen/Qwen3-Embedding-8B"
OPENAI_URL = "http://127.0.0.1:9804/v1"

embeddings = OpenAIEmbeddings(api_key="EMPTY", base_url=OPENAI_URL, model=EMBED_MODEL_ID)

# (2) Connect to Milvus
vector_store = Milvus(
    embedding_function=embeddings,
    collection_name=COLLECTION_NAME,
    connection_args=dict(uri=MILVUS_URI, token="root:Milvus", db_name=DB_NAME),
    index_params=dict(index_type="FLAT", metric_type="L2"),
)

# Top-350 similarity search for a full-sentence question.
documents = vector_store.similarity_search("스마트시티의 진행은 어떻게 되가?", k=350)

In [None]:
# Preview the first 10 characters of each hit, one blank line apart.
for doc in documents:
    print(doc.page_content[:10], end="\n\n")

In [None]:
from pymilvus import MilvusClient

from langchain_milvus import Milvus
from langchain_openai import OpenAIEmbeddings

# -------------------------------
# Milvus + Embedding + LLM 설정
# -------------------------------
# Settings for this search: Milvus endpoint/database/collection and the
# embedding endpoint/model used for the query vector.
MILVUS_URI = "http://localhost:19530"
DB_NAME = "doc_embeddings"
COLLECTION_NAME = "doc_embeddings"
EMBED_MODEL_ID = "Qwen/Qwen3-Embedding-8B"
OPENAI_URL = "http://127.0.0.1:9804/v1"

embeddings = OpenAIEmbeddings(api_key="EMPTY", base_url=OPENAI_URL, model=EMBED_MODEL_ID)

# (2) Connect to Milvus
vector_store = Milvus(
    embedding_function=embeddings,
    collection_name=COLLECTION_NAME,
    connection_args=dict(uri=MILVUS_URI, token="root:Milvus", db_name=DB_NAME),
    index_params=dict(index_type="FLAT", metric_type="L2"),
)

# Top-171 similarity search for a full-sentence question.
documents = vector_store.similarity_search("스마트시티의 진행은 어떻게 되가?", k=171)

In [None]:
# Preview the first 10 characters of each hit, one blank line apart.
for doc in documents:
    print(doc.page_content[:10], end="\n\n")

In [None]:
from pymilvus import MilvusClient

from langchain_milvus import Milvus
from langchain_openai import OpenAIEmbeddings

# -------------------------------
# Milvus + Embedding + LLM 설정
# -------------------------------
# Settings for this search; note that DB_NAME targets "doc_embeddings2".
MILVUS_URI = "http://localhost:19530"
DB_NAME = "doc_embeddings2"
COLLECTION_NAME = "doc_embeddings"
EMBED_MODEL_ID = "Qwen/Qwen3-Embedding-8B"
OPENAI_URL = "http://127.0.0.1:9804/v1"

embeddings = OpenAIEmbeddings(api_key="EMPTY", base_url=OPENAI_URL, model=EMBED_MODEL_ID)

# (2) Connect to Milvus
vector_store = Milvus(
    embedding_function=embeddings,
    collection_name=COLLECTION_NAME,
    connection_args=dict(uri=MILVUS_URI, token="root:Milvus", db_name=DB_NAME),
    index_params=dict(index_type="FLAT", metric_type="L2"),
)

# Top-171 similarity search for a full-sentence question.
documents = vector_store.similarity_search("스마트시티의 진행은 어떻게 되가?", k=171)

In [None]:
# Preview the first 10 characters of each hit, one blank line apart.
for doc in documents:
    print(doc.page_content[:10], end="\n\n")

In [None]:
# Shell escape: list everything (hidden entries included) two directories up.
! ls -a ../../

In [None]:
import logging, time, threading
import os
from pathlib import Path
# from src.state import State
from langchain_milvus import Milvus
from langchain_openai import OpenAIEmbeddings
from langchain.messages import HumanMessage

# Create the directory (if needed) and publish it through HF_HOME; tokenizer
# parallelism is switched off via TOKENIZERS_PARALLELISM.
os.makedirs("../../huggingface_data", exist_ok=True)
os.environ.update(HF_HOME="../../huggingface_data", TOKENIZERS_PARALLELISM="false")

# logger = logging.getLogger(__name__)

# Embedding model / endpoints shared by the cells below.
EMBED_MODEL_ID = "Qwen/Qwen3-Embedding-8B"
# EMBED_MODEL_ID = "Qwen/Qwen3-Embedding-4B"
OPENAI_URL = "http://127.0.0.1:9804/v1"
MILVUS_URI = "http://127.0.0.1:19530"
TIMEOUT_SEC = 30

In [None]:
# Embedding client for the endpoint at OPENAI_URL, with tiktoken_enabled=False.
embeddings = OpenAIEmbeddings(api_key="EMPTY", base_url=OPENAI_URL, model=EMBED_MODEL_ID, tiktoken_enabled=False)

In [None]:
# Vector store over the "doc_embeddings" collection of the "doc_embeddings"
# database, configured with a FLAT / L2 index.
vector_store = Milvus(
    embedding_function=embeddings,
    collection_name="doc_embeddings",
    connection_args=dict(uri=MILVUS_URI, token="root:Milvus", db_name="doc_embeddings"),
    index_params=dict(index_type="FLAT", metric_type="L2"),
)

In [None]:
# Wrap the question in a HumanMessage and run a top-20 similarity search
# with the raw question string.
question = "스마트 시티"
human_message = HumanMessage(content=question)
documents = vector_store.similarity_search(question, k=20)

In [None]:
# Display the full search result list.
documents