#### 라이브러리 import

In [1]:
import pandas as pd
import numpy as np

from datasets import load_dataset
from langchain_text_splitters import RecursiveCharacterTextSplitter
from qdrant_client import QdrantClient, models
from qdrant_client.http.models import Distance, VectorParams
from sentence_transformers import SentenceTransformer

#### Method 정의

In [None]:
def chunking(documents_df_, chunk_size_, chunk_overlap_):
    """
    chunk_size_, chunk_overlap_에 따라, documents_df_의 텍스트를 Chunking
    """
    chunk_results = []
    chunk_idx = 0
    
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size_,
        chunk_overlap=chunk_overlap_,
        length_function=len
    )

    for idx, row in documents_df_.iterrows():
        origin_text = row['summary']

        chunks = text_splitter.create_documents([origin_text])

        for chunk in chunks:
            chunk_results.append({
                'chunk_id':chunk_idx,
                'text':chunk.page_content,
                'metadata':{
                    'category':row['category'],
                    'press':row['press'],
                    'title':row['title'],
                    'chunk_size':len(chunk.page_content)
                }
            })
            chunk_idx += 1

    return chunk_results

def retrieval(embedding_model_name: str, collection_name: str, query: str, top_k: int, use_instruct_prefix=False):
    """
    Query를 임베딩하고, Qdrant 컬렉션에서 관련 문서를 검색.
    """

    # 임베딩 모델 로드
    embedding_model = SentenceTransformer(embedding_model_name)

    # 쿼리 임베딩
    if use_instruct_prefix: # Instruct 모델을 사용하는 경우 'query: ' 접두사 추가
        query = f"query: {query}"

    query_vector = embedding_model.encode(query).tolist()

    # Qdrant Client 로드
    client = QdrantClient(host='localhost', port=6333)

    # Qdrant 검색 수행
    search_results = client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=top_k,
        with_payload=True, # 검색된 포인트의 메타 데이터
        with_vectors=False # 검색된 포인트의 임베딩 벡터 (필요 X)
    )

    # 결과 출력
    if not search_results:
        print(f"ERROR: 검색 결과가 없습니다.")
        
        return
    
    for rank, result in enumerate(search_results):
        payload = result.payload

        chunk_text = payload.get('text', '텍스트 없음')
        source_title = payload.get('title', '제목 없음')
        source_press = payload.get('press', '출처 없음')

        print(f"[{rank}]: \nchunk_text: {chunk_text}\nsource_title: {source_title}\nsource_press: {source_press}")

#### 데이터셋 로드

In [None]:
# 데이터셋 로드
dataset_id = 'daekeun-ml/naver-news-summarization-ko'
dataset = load_dataset(dataset_id, split='test')
documents_df = dataset.to_pandas()

# # Example
# print(f"문서 수 : {len(documents_df)}")
# print(f"문서 Columns : {documents_df.columns}")

#### CHUNKING -> EMBEDDING -> INDEXING
* Chunk Size, Chunk Overlap, Embedding Model, Collection Name 확인 필수!

In [None]:
# 변수 설정
CHUNK_SIZE, CHUNK_OVERLAP = 180, 36
EMBEDDING_MODEL_NAME = "BM-K/KoSimCSE-roberta-multitask"
COLLECTION_NAME = ''
USE_INSTRUCT_PREFIX = False

# Chunking
chunk_results = chunking(documents_df, CHUNK_SIZE, CHUNK_OVERLAP)

# Retriever 모델 선택 및 임베딩
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

EMBEDDING_DIM = embedding_model.get_sentence_embedding_dimension() # 임베딩 차원 확인

chunk_texts = [chunk['text'] for chunk in chunk_results] # 청크 텍스트 리스트

if USE_INSTRUCT_PREFIX:
    chunk_texts = [f"passage: {chunk_text}" for chunk_text in chunk_texts]

embeddings = embedding_model.encode(chunk_texts, show_progress_bar=True) # 임베딩 / 임베딩 과정 시각화

# Qdrant 컬렉션 구축
client = QdrantClient(host='localhost', port=6333)

client.recreate_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIM,
        distance=models.Distance.COSINE # 코사인 유사도
    )
)

# Indexing: Qdrant에 포인트 삽입
points = []

# Qdrant에 삽입할 PointStruct 리스트 생성
for idx, chunk in enumerate(chunk_results):
    # metadata를 Payload로 사용, 원문 'text'를 포함
    payload_data = chunk['metadata']
    payload_data['text'] = chunk['text']

    # PointStruct 생성: id는 chunk_id(0부터 시작) 사용
    points.append(
        models.PointStruct(
            id=chunk['chunk_id'], # 청크 ID를 Qdrant의 고유 ID로 사용
            vector=embeddings[idx].tolist(), # numpy 배열을 list로 변환하여 삽입
            payload=payload_data
        )
    )

# 데이터 삽입(upsert)
operation_info = client.upsert(
    collection_name=COLLECTION_NAME,
    wait=True, # 작업 완료 대기
    points=points
)

No sentence-transformers model found with name BM-K/KoSimCSE-roberta-multitask. Creating a new one with mean pooling.


Batches:   0%|          | 0/1755 [00:00<?, ?it/s]

#### Retrieval: Query -> Top K개의 Chunk 반환

In [None]:
EMBEDDING_MODEL_NAME = "BM-K/KoSimCSE-roberta-multitask"
COLLECTION_NAME = ''
QUERY = ''
TOP_K = 3

retrieval(EMBEDDING_MODEL_NAME, COLLECTION_NAME, QUERY, TOP_K)