In [None]:
import pandas as pd
import numpy as np

from datasets import load_dataset
from langchain_text_splitters import RecursiveCharacterTextSplitter
from qdrant_client import QdrantClient, models
from qdrant_client.http.models import Distance, VectorParams
from sentence_transformers import SentenceTransformer

In [5]:
# Connection settings for the Qdrant server, kept as named constants so the
# target instance can be changed in one place.
QDRANT_HOST = 'localhost'
QDRANT_PORT = 6333

# Client handle bound to the Qdrant server at QDRANT_HOST:QDRANT_PORT.
client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)

In [32]:
# Load the dataset: fetch the 'train' split with `load_dataset` and convert it
# to a pandas DataFrame so each row is one document.
dataset_id = "daekeun-ml/naver-news-summarization-ko"
dataset = load_dataset(dataset_id, split="train")
documents_df = dataset.to_pandas()

# Example: report how many documents were loaded and which columns exist.
print(
    f"문서 수 : {len(documents_df)}",
    f"문서 Columns : {documents_df.columns}",
    sep="\n",
)

문서 수 : 22194
문서 Columns : Index(['date', 'category', 'press', 'title', 'document', 'link', 'summary'], dtype='object')


In [33]:
# Document preprocessing and chunking.
# CHUNK_SIZE is the chunk length limit and CHUNK_OVERLAP the overlap between
# neighbouring chunks; both are handed to the text splitter below.
CHUNK_SIZE, CHUNK_OVERLAP = 100, 20

def chunking(documents_df_, chunk_size_, chunk_overlap_,
             text_column_='summary',
             metadata_columns_=('date', 'category', 'press', 'title')):
    """Split every document's text into chunks and attach the row's metadata.

    Parameters
    ----------
    documents_df_ : pandas.DataFrame
        One row per document. Must contain `text_column_` and every column
        listed in `metadata_columns_`.
    chunk_size_ : int
        Chunk length limit passed to the splitter; lengths are measured
        with `len`.
    chunk_overlap_ : int
        Overlap between consecutive chunks, passed to the splitter.
    text_column_ : str, optional
        Column holding the text to split. Defaults to 'summary'.
    metadata_columns_ : iterable of str, optional
        Columns copied verbatim into each chunk's metadata.

    Returns
    -------
    list of dict
        One dict per chunk with keys 'chunk_id' (running index across all
        documents, starting at 0), 'text' (the chunk string) and 'metadata'
        (the selected row columns plus 'chunk_size', the chunk's length).

    Raises
    ------
    KeyError
        If a row lacks `text_column_` or one of `metadata_columns_`.

    Notes
    -----
    Rows whose text is missing (None/NaN), not a string, or blank are
    skipped instead of being handed to the splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size_,
        chunk_overlap=chunk_overlap_,
        length_function=len,
    )

    chunk_results = []

    # Plain dict records avoid building a Series per row (as iterrows does)
    # and hand back native Python values for the metadata.
    for record in documents_df_.to_dict('records'):
        origin_text = record[text_column_]

        # Guard against missing or empty text, which the splitter cannot process.
        if not isinstance(origin_text, str) or not origin_text.strip():
            continue

        doc_metadata = {column: record[column] for column in metadata_columns_}

        for chunk_text in text_splitter.split_text(origin_text):
            chunk_results.append({
                # The list length before appending is the running chunk index.
                'chunk_id': len(chunk_results),
                'text': chunk_text,
                'metadata': {**doc_metadata, 'chunk_size': len(chunk_text)},
            })

    return chunk_results

# Example: keep only the first 10 documents, chunk them, and show the fields
# of the first chunk (chunk_id, text, metadata), one per line.
# NOTE: this rebinds `documents_df` to the 10-row subset.
documents_df = documents_df.head(10)
chunk_results = chunking(
    documents_df_=documents_df,
    chunk_size_=CHUNK_SIZE,
    chunk_overlap_=CHUNK_OVERLAP,
)
print(*chunk_results[0].values(), sep="\n")

0
올해 상반기 우리나라 무역수지는 역대 최악인 103억 달러 적자를 기록한 가운데, 정부가 하반기에 우리 경제의 버팀목인 수출 확대를 위해 총력을 기울이기로 결정한 가운데, 특히
{'date': '2022-07-03 17:14:37', 'category': 'economy', 'press': 'YTN ', 'title': '추경호 중기 수출지원 총력 무역금융 40조 확대', 'chunk_size': 98}


In [37]:
# Retriever model selection and embedding.
# NOTE(review): the cell output reports that no native sentence-transformers
# model was found and that a mean-pooling model was created instead — confirm
# mean pooling is the intended pooling for this checkpoint.
MODEL_NAME = "BM-K/KoSimCSE-roberta-multitask"
model = SentenceTransformer(MODEL_NAME)

# Embedding dimension as reported by the loaded model.
EMBEDDING_DIM = model.get_sentence_embedding_dimension()

# Text of every chunk, in the same order as `chunk_results`.
chunk_texts = [entry['text'] for entry in chunk_results]

# Encode all chunk texts, showing a progress bar while encoding.
embeddings = model.encode(
    chunk_texts,
    show_progress_bar=True,
)

No sentence-transformers model found with name BM-K/KoSimCSE-roberta-multitask. Creating a new one with mean pooling.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]