- dataset: https://korquad.github.io/category/1.0_KOR.html
- elasticsearch: https://www.elastic.co/docs/deploy-manage/deploy/self-managed/install-elasticsearch-docker-basic

In [None]:
# docker run -d --name es01 \
#   --net elastic \
#   -p 9200:9200 -m 1GB \
#   -e "discovery.type=single-node" \
#   -e "xpack.security.enabled=false" \
#   -e "xpack.security.http.ssl.enabled=false" \
#   docker.elastic.co/elasticsearch/elasticsearch:9.0.0


In [1]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

from elasticsearch import Elasticsearch
from datasets import load_dataset

In [2]:
ds = load_dataset('json', data_files='..\\data\\korquad\\KorQuAD_v1.0_train.json')

In [3]:
documents = []
korquad_data = ds['train'][0]['data']  # ✔️ 이게 실제 문서 리스트

for item in korquad_data:
    for para in item['paragraphs']:
        context = para['context']
        documents.append(Document(page_content=context))

In [4]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
    length_function=len,
    separators=["\n\n", "\n", ".", " "]
)

chunks = text_splitter.split_documents(documents)

In [5]:
embeddings = HuggingFaceEmbeddings(
    model_name="../ai_models/base_models/BGE-m3-ko",
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True}
)
es = Elasticsearch("http://127.0.0.1:9200")


In [6]:
es.options(ignore_status=400).indices.create(
    index="korquad",
    mappings={
        "properties": {
            "content": {"type": "text"},
            "embedding": {
                "type": "dense_vector",
                "dims": 1024,  # 모델에 따라 조정
                "index": True,
                "similarity": "cosine"
            }
        }
    }
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'korquad'})

In [None]:
from elasticsearch import helpers
from tqdm import tqdm

actions = []
for chunk in tqdm(chunks):
    text = chunk.page_content
    vector = embeddings.embed_documents([text])[0]  # ✅ 문서용

    actions.append({
        "_index": "korquad",
        "_source": {
            "content": text,
            "embedding": vector
        }
    })

helpers.bulk(es, actions)


  0%|          | 66/13981 [00:51<2:51:00,  1.36it/s]

In [None]:
query = "ChatGPT는 어떤 기술로 만들어졌나요?"
query_vector = embeddings.embed_query(query)

response = es.search(
    index="korquad",
    knn={
        "field": "embedding",
        "query_vector": query_vector,
        "k": 5,
        "num_candidates": 50
    }
)

# 결과 출력
for hit in response["hits"]["hits"]:
    print(f"점수: {hit['_score']:.4f}")
    print(f"문서 내용: {hit['_source']['content']}\n")
