In [None]:
import os

# Environment for the Hugging Face libraries used later in the notebook.
# NOTE(review): presumably HF_HOME relocates the Hugging Face cache directory
# and must be set before those libraries are imported — confirm.
os.environ["HF_HOME"] = "../huggingface_data"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from pathlib import Path

# Directory that is searched recursively for HWP files.
hwp_dir = Path("./hwp_files")

# rglob() yields entries in filesystem-dependent order; sort so that
# positional uses such as hwp_files[0] are reproducible between runs.
hwp_files = sorted(hwp_dir.rglob("*.hwp"))

In [None]:
import tempfile
import subprocess
from typing import Iterator
from langchain_core.documents import Document
from langchain_community.document_loaders import BSHTMLLoader

class HWPLoader(BSHTMLLoader):
    """Load a Hangul Word Processor (.hwp) file as LangChain documents.

    The file is converted to XML by the external ``hwp5proc xml`` command,
    the XML is staged in a temporary file, and that file is parsed through
    ``BSHTMLLoader`` with BeautifulSoup's ``xml`` feature set.

    The temporary file is created in ``__init__`` and removed when ``load()``
    returns or raises, so an instance supports a single ``load()`` call.
    """

    def __init__(self, file_path: str) -> None:
        """Convert ``file_path`` to XML and stage it in a temporary file.

        Args:
            file_path: Path of the ``.hwp`` file to load.

        Raises:
            subprocess.CalledProcessError: If ``hwp5proc`` exits non-zero.
            FileNotFoundError: If the ``hwp5proc`` executable cannot be found.
        """
        self.original_file_path = file_path

        # NOTE(review): text=True decodes the converter output with the locale
        # encoding; confirm that matches what hwp5proc emits on this platform.
        xml_text = subprocess.check_output(
            ["hwp5proc", "xml", file_path],
            text=True,
            stderr=subprocess.DEVNULL,
        )

        # delete=False: the file must outlive this `with` block so that the
        # base loader can reopen it by name later.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".xml", mode="w", encoding="utf-8"
        ) as tmp:
            tmp.write(xml_text)
            self._temp_path = tmp.name

        try:
            # The temp file was written as UTF-8, so it has to be read back as
            # UTF-8 instead of the platform default encoding.
            super().__init__(
                file_path=self._temp_path,
                open_encoding="utf-8",
                bs_kwargs={"features": "xml"},
            )
        except Exception:
            # Do not leave the staged file behind if the base loader cannot
            # be initialised.
            self._remove_temp_file()
            raise

    def _remove_temp_file(self) -> None:
        """Delete the staged XML file; a file that is already gone is ignored."""
        try:
            os.remove(self._temp_path)
        except FileNotFoundError:
            pass

    def load(self):
        """Parse the staged XML and return the resulting documents.

        The temporary file is removed afterwards, whether or not parsing
        succeeded.
        """
        try:
            return super().load()
        finally:
            self._remove_temp_file()

    def lazy_load(self) -> Iterator[Document]:
        """Yield the parsed documents with ``source`` set to the original path.

        The base loader would report the temporary XML file as the source;
        this replaces it with the path that was passed to the constructor.
        """
        for doc in super().lazy_load():
            doc.metadata["source"] = str(Path(self.original_file_path))
            yield doc

In [None]:
# Load every discovered HWP file into `all_docs`.
# Failures are recorded and reported instead of being silently discarded, so
# one unreadable file neither aborts the batch nor disappears without a trace.
# (A stray unconditional `break` used to stop the loop after the first file.)
all_docs = []
failed_files = []  # (path, exception) pairs for files that could not be loaded
for file_path in hwp_files:
    try:
        loader = HWPLoader(
            file_path=str(file_path),
        )
        docs = loader.load()
    except Exception as e:
        failed_files.append((file_path, e))
        print(f"Failed to load {file_path}: {e!r}")
        continue
    all_docs.extend(docs)

print(
    f"Loaded {len(all_docs)} document(s); "
    f"{len(failed_files)} of {len(hwp_files)} file(s) failed"
)

In [None]:
# Inspect the documents returned by the most recent `loader.load()` call.
docs

In [None]:
# Inspect everything accumulated in `all_docs` so far.
all_docs

In [None]:
from transformers import AutoTokenizer

# Hugging Face model id whose tokenizer is loaded below.
EMBED_MODEL_ID = "Qwen/Qwen3-Embedding-8B"

# Load the tokenizer published under EMBED_MODEL_ID.
tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_ID)

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter, TokenTextSplitter

# Measure chunk length with the embedding model's own tokenizer.
#
# TokenTextSplitter is deliberately not used here: per the LangChain docs it
# splits on tiktoken tokens, so `chunk_size` would not be counted in the
# Hugging Face tokenizer's tokens, and characters that encode to several
# tokens (e.g. Korean) can be cut in half, producing malformed Unicode.
# RecursiveCharacterTextSplitter splits on text boundaries and uses the
# Hugging Face tokenizer only as the length function.
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer,
    chunk_size=512,
    chunk_overlap=10,
)

# Split the loaded documents into chunks.
texts = text_splitter.split_documents(all_docs)


In [None]:
# Inspect the chunks produced by `text_splitter.split_documents(all_docs)`.
texts

In [None]:
import os

from pathlib import Path

# Environment for the Hugging Face stack, set ahead of the docling /
# langchain_docling imports below (same values as the notebook's first cell).
# NOTE(review): presumably HF_HOME relocates the Hugging Face cache — confirm.
os.environ["HF_HOME"] = "../huggingface_data"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import json
import hashlib
import time
from typing import Any, Iterable
from openai import OpenAI
from pymilvus import MilvusClient, DataType
from docling.chunking import HybridChunker
from langchain_docling import DoclingLoader
from langchain_core.documents import Document


# --- Configuration -----------------------------------------------------------
DATA_DIR = Path("data")
EMBED_MODEL_ID = "Qwen/Qwen3-Embedding-8B"
# EMBED_MODEL_ID = "Qwen/Qwen3-Embedding-4B"
OPENAI_URL = "http://127.0.0.1:9804/v1"
MILVUS_URI = "http://127.0.0.1:19530"
EXPORT_TYPE = "doc_chunks"
# Chunker handed to DoclingLoader; it is configured with the embedding model id
# as its tokenizer and a 1000-token ceiling.
CHUNKER = HybridChunker(tokenizer=EMBED_MODEL_ID, max_tokens=1000)

# Files to ingest with Docling.
all_files = ["./hwp_files/Consulting of Asuncion Smart City for Digital District 20221130_최종.pptx"]

# Load each file; failures are recorded and reported instead of being silently
# swallowed, so an empty `all_docs` can be traced back to its cause.
all_docs = []
failed_files = []  # (path, exception) pairs for files that could not be loaded
for file_path in all_files:
    try:
        loader = DoclingLoader(
            file_path=str(file_path),
            export_type=EXPORT_TYPE,
            chunker=CHUNKER,
        )
        docs = loader.load()
    except Exception as e:
        failed_files.append((file_path, e))
        print(f"Failed to load {file_path}: {e!r}")
        continue
    all_docs.extend(docs)

In [None]:
# Peek at the first loaded document (raises IndexError if nothing was loaded).
all_docs[0]

In [None]:
from transformers import AutoTokenizer
from langchain_text_splitters import TokenTextSplitter

# Step 1: load the tokenizer used both by the splitter and by the check below.
tokenizer = AutoTokenizer.from_pretrained("intfloat/e5-large-v2")

# Step 2: define the text splitter (chunk size 512, no overlap).
splitter = TokenTextSplitter.from_huggingface_tokenizer(
    tokenizer, chunk_size=512, chunk_overlap=0
)

# Step 3: synthetic test text — "hello world!" repeated 1000 times, space
# separated, wrapped in a <root> element.
xml_text = "".join(["<root>", " ".join(["hello world!"] * 1000), "</root>"])

# Step 4: run the split and report how many chunks were produced.
texts = splitter.split_text(xml_text)
print(f"{len(texts)} chunks created")

# Step 5: verification — count the actual number of tokens in each chunk.
# Only the first few chunks are checked.
for i in range(min(3, len(texts))):
    chunk = texts[i]
    input_ids = tokenizer.encode(chunk, add_special_tokens=False)
    print(f"Chunk {i}: {len(input_ids)} tokens")


In [None]:
from langchain_text_splitters import CharacterTextSplitter

# Build a character-based splitter whose length is measured with `tokenizer`.
text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer,
    chunk_size=512,       # adjust to the model's maximum token length (e.g. 512, 1024)
)

# Step 4: split the text
texts = text_splitter.split_text(xml_text)
print(len(texts), "chunks created")
print(texts[0][:200])  # print the beginning of the first chunk

In [None]:
from docling.document_converter import DocumentConverter

# Convert the source with docling and keep the resulting document object.
# NOTE(review): DOC_SOURCE is not defined in any cell visible here — confirm
# it is assigned before this cell is run.
doc = DocumentConverter().convert(source=DOC_SOURCE).document

In [None]:
import os
# Environment for the Hugging Face stack, set ahead of the docling imports
# below (same values as the notebook's first cell).
os.environ["HF_HOME"] = "../huggingface_data"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from docling.chunking import HybridChunker
from langchain_docling import DoclingLoader

# Model id that is passed to the chunker as its tokenizer.
EMBED_MODEL_ID = "Qwen/Qwen3-Embedding-8B"
# Export mode string intended for DoclingLoader (see the commented-out call below).
EXPORT_TYPE = "doc_chunks"
# Chunker configured with EMBED_MODEL_ID as tokenizer and a 1000-token ceiling.
CHUNKER = HybridChunker(tokenizer=EMBED_MODEL_ID, max_tokens=1000)

# loader = DoclingLoader(
#     file_path=str(_temp_path),
#     export_type=EXPORT_TYPE,
#     chunker=CHUNKER,
# )

# loader.load()

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer published under EMBED_MODEL_ID.
tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_ID)

In [None]:
# Display the chunker's `chunk` attribute (introspection only; nothing is called).
CHUNKER.chunk

In [None]:
import os
import tempfile
import subprocess
from pathlib import Path
from typing import Iterator
from langchain_core.documents import Document
from langchain_community.document_loaders import BSHTMLLoader

class HWPLoader(BSHTMLLoader):
    """Load a Hangul Word Processor (.hwp) file as LangChain documents.

    The file is converted to XML by the external ``hwp5proc xml`` command,
    the XML is staged in a temporary file, and that file is parsed through
    ``BSHTMLLoader`` with BeautifulSoup's ``xml`` feature set.

    The temporary file is created in ``__init__`` and removed when ``load()``
    returns or raises, so an instance supports a single ``load()`` call.
    """

    def __init__(self, file_path: str) -> None:
        """Convert ``file_path`` to XML and stage it in a temporary file.

        Args:
            file_path: Path of the ``.hwp`` file to load.

        Raises:
            subprocess.CalledProcessError: If ``hwp5proc`` exits non-zero.
            FileNotFoundError: If the ``hwp5proc`` executable cannot be found.
        """
        self.original_file_path = file_path

        # NOTE(review): text=True decodes the converter output with the locale
        # encoding; confirm that matches what hwp5proc emits on this platform.
        xml_text = subprocess.check_output(
            ["hwp5proc", "xml", file_path],
            text=True,
            stderr=subprocess.DEVNULL,
        )

        # delete=False: the file must outlive this `with` block so that the
        # base loader can reopen it by name later.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".xml", mode="w", encoding="utf-8"
        ) as tmp:
            tmp.write(xml_text)
            self._temp_path = tmp.name

        try:
            # The temp file was written as UTF-8, so it has to be read back as
            # UTF-8 instead of the platform default encoding.
            super().__init__(
                file_path=self._temp_path,
                open_encoding="utf-8",
                bs_kwargs={"features": "xml"},
            )
        except Exception:
            # Do not leave the staged file behind if the base loader cannot
            # be initialised.
            self._remove_temp_file()
            raise

    def _remove_temp_file(self) -> None:
        """Delete the staged XML file; a file that is already gone is ignored."""
        try:
            os.remove(self._temp_path)
        except FileNotFoundError:
            pass

    def load(self):
        """Parse the staged XML and return the resulting documents.

        The temporary file is removed afterwards, whether or not parsing
        succeeded.
        """
        try:
            return super().load()
        finally:
            self._remove_temp_file()

    def lazy_load(self) -> Iterator[Document]:
        """Yield the parsed documents with ``source`` set to the original path.

        The base loader would report the temporary XML file as the source;
        this replaces it with the path that was passed to the constructor.
        """
        for doc in super().lazy_load():
            doc.metadata["source"] = str(Path(self.original_file_path))
            yield doc

In [None]:
# Smoke-test the loader on the first discovered HWP file. The path is passed
# as a plain string to match the constructor's `file_path: str` signature.
HWPLoader(str(hwp_files[0])).load()