In [26]:
import pandas as pd
import os
from chunking import ParentChildChunking
from embedding import extract_table_token, bm25_embedding, openai_embeddings
from pymilvus import Collection, CollectionSchema, FieldSchema, DataType, connections
from pymilvus import connections, utility

In [90]:
# Car model (manual) handled by this run. The value is used as the sub-directory /
# file-name key for the context and BM25 files and is stored as `car_type` in Milvus.
# Switch models by leaving exactly one assignment uncommented.
# NAMESPACE = 'IONIQ5_2024'
# NAMESPACE = 'SANTAFE_MX5_2023'
# NAMESPACE = 'SONATA_DN8_2024'
NAMESPACE = 'TESLA_MODEL3'


# NOTE(review): absolute, machine-specific paths; anyone else running this notebook
# has to edit them. Consider deriving them from a configurable project root.
context_file_root_path = '/Users/yj/Kim/1.work/SKR/8.GenAI/my-small-mechanic/pdf_context'
bm25_model_root_path = '/Users/yj/Kim/1.work/SKR/8.GenAI/my-small-mechanic/vector_db/bm25'
context_text_dir = context_file_root_path + f'/text/{NAMESPACE}'

# Milvus connection settings. The host is deliberately pinned to the local instance
# (alternative: os.environ['MILVUS_HOST']).
MILVUS_HOST = 'localhost'
# Fall back to 19530 (the port used by the migration snippet at the end of this
# notebook) instead of raising KeyError when MILVUS_PORT is not exported.
MILVUS_PORT = os.environ.get('MILVUS_PORT', '19530')
COLLECTION_NAME = "HYUNDAI_CAR_MANUAL"

# Make sure the directory that holds the BM25 parameter files exists.
os.makedirs(bm25_model_root_path, exist_ok=True)

# Register the default pymilvus connection used by every Collection/utility call below.
connections.connect(alias="default", host=MILVUS_HOST, port=MILVUS_PORT)


In [91]:
# Resolve both input files first, then read them.
# df    : per-document extraction result for the selected NAMESPACE
# pc_df : previously saved chunk table (the file written by the "to_parquet" cell below)
doc_result_path = f'{context_text_dir}/doc_{NAMESPACE}_result.parquet'
chunk_embeddings_path = context_file_root_path+f'/embedding/{NAMESPACE}_embeddings.parquet'

df = pd.read_parquet(doc_result_path)
pc_df = pd.read_parquet(chunk_embeddings_path)

In [92]:
# Disabled: rebuilds pc_df from df through the chunking helpers instead of loading
# the saved parquet in the previous cell. Presumably only needed when a manual is
# processed for the first time -- TODO confirm.
# pcc = ParentChildChunking()
# semantic_df = pcc.colelct_context_by_semantic(df)
# pc_df = pcc.parent_child_chunking(semantic_df)
# pc_df = extract_table_token(pc_df)

In [93]:
# Sparse (BM25) vectors for every chunk. train=False is passed together with the
# per-NAMESPACE parameter file path (presumably reuses already-fitted parameters
# rather than refitting -- confirm in embedding.bm25_embedding).
bm25_model_path = bm25_model_root_path + f'/bm25_{NAMESPACE}_params.json'
bm25_corpus = pc_df['embedding_contents'].values.tolist()
bm25_embeddings = bm25_embedding(bm25_corpus, bm25_model_path, train=False)
pc_df['bm25_embeddings'] = bm25_embeddings

start to install package: unidic-lite
successfully installed package: unidic-lite


You should consider upgrading via the '/Users/yj/.pyenv/versions/3.9.11/envs/myMechanic_py39/bin/python3.9 -m pip install --upgrade pip' command.


In [None]:
# Dense vectors for every chunk, computed from the same text column as the BM25 vectors.
embedding_inputs = pc_df['embedding_contents'].values
oai_embeddings = openai_embeddings(embedding_inputs)
pc_df['embeddings'] = oai_embeddings

In [69]:
# Tag every chunk with the car model it belongs to (stored as `car_type` in Milvus).
pc_df['car_type'] = NAMESPACE

# Materialise a fresh 0..n-1 row number as an 'index' column; it is later used as the
# Milvus primary key. pc_df may come from the parquet this notebook itself writes
# (which already contains 'index'), and this cell may be re-run; a plain
# reset_index() would then raise "cannot insert index, already exists". Dropping any
# existing 'index' column first makes the cell idempotent.
pc_df = (
    pc_df
    .drop(columns='index', errors='ignore')
    .reset_index(drop=True)
    .reset_index()
)

In [71]:
# Persist the chunk table for later runs.
# - parquet: everything except the sparse BM25 vectors
# - csv    : additionally without the dense vectors
embeddings_parquet_path = context_file_root_path+f'/embedding/{NAMESPACE}_embeddings.parquet'
embeddings_csv_path = context_file_root_path+f'/embedding/{NAMESPACE}_embeddings.csv'

pc_df.drop(columns='bm25_embeddings').to_parquet(embeddings_parquet_path)
pc_df.drop(columns=['bm25_embeddings', 'embeddings']).to_csv(embeddings_csv_path)

In [42]:
# Field definitions for the shared manual collection. The order in which the fields
# are listed in `schema` below is the order the column lists must follow in
# collection.insert([...]).

# Primary key supplied by the notebook (auto_id=False); filled from pc_df['index'].
id_field = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=False)
# Car model (description: "car model"); also the partition key of the collection.
car_type_filed = FieldSchema(name="car_type", dtype=DataType.VARCHAR, max_length=100, is_primary=False, auto_id=False,
                           description='차종')
# Major heading of the manual; filled from pc_df['h1'].
major_h_filed = FieldSchema(name="majorheading", dtype=DataType.VARCHAR, max_length=100, is_primary=False, auto_id=False,
                           description='설명서의 대제목')
# Minor heading of the manual; filled from pc_df['h2'].
minor_h_filed = FieldSchema(name="minorheading", dtype=DataType.VARCHAR, max_length=100, is_primary=False, auto_id=False,
                           description='설명서의 소제목')
# Minor-heading ID; filled from pc_df['chunk_group2'].
# NOTE(review): max_length is passed to an INT64 field; presumably ignored by pymilvus -- confirm.
minor_sub_id_filed = FieldSchema(name="minorheading_sub_id", dtype=DataType.INT64, max_length=100, is_primary=False, auto_id=False,
                            description='소제목 ID')

# Parent document ID; filled from pc_df['doc_id'].
# NOTE(review): max_length on an INT64 field here as well -- confirm it has no effect.
parent_doc_id_filed = FieldSchema(name="parent_doc_id", dtype=DataType.INT64, max_length=1000, is_primary=False, auto_id=False,
                            description='Parent 문서ID ')

# Manual contents of the chunk (up to 40000 characters).
contents_filed = FieldSchema(name="doc_contents", dtype=DataType.VARCHAR, max_length=40000, is_primary=False, auto_id=False,
                            description='설명서의 내용')
# Image URL paths: up to 50 elements of at most 50 characters each.
# NOTE(review): 50 characters per URL is much tighter than the 3000 allowed for
# table_csv_urls below -- confirm real URLs fit.
imgurl_filed = FieldSchema(name="img_urls", dtype=DataType.ARRAY, element_type=DataType.VARCHAR, max_capacity=50, max_length=50,
                            description='이미지 URL 경로')
# Table image URL paths: up to 50 elements of at most 50 characters each.
tblimgurl_field = FieldSchema(name="table_img_urls", dtype=DataType.ARRAY, element_type=DataType.VARCHAR, max_capacity=50, max_length=50,
                            description='표 이미지 URL 경로')
# Table CSV paths: up to 50 elements of at most 3000 characters each.
tblcsvurl_filed = FieldSchema(name="table_csv_urls", dtype=DataType.ARRAY, element_type=DataType.VARCHAR, max_capacity=50, max_length=3000,
                            description='표 csv 경로')

# Text that was embedded, kept for verification (description: "embedding contents for verification").
embedding_contents_filed = FieldSchema(name="embedding_contents", dtype=DataType.VARCHAR, max_length=10000, is_primary=False, auto_id=False,
                            description='검증용 임베딩 내용')

# Dense vector; filled from pc_df['embeddings'].
# NOTE(review): dim=3072 must equal the length of the vectors returned by openai_embeddings -- confirm.
vector_filed = FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=3072)
# Sparse BM25 vector; filled from pc_df['bm25_embeddings'].
bm_25_vector_filed = FieldSchema(name="bm25_vector", dtype=DataType.SPARSE_FLOAT_VECTOR)
# Define the collection schema
schema = CollectionSchema(
    fields=[
        id_field, 
        car_type_filed,
        major_h_filed,
        minor_h_filed,
        minor_sub_id_filed,
        parent_doc_id_filed,
        contents_filed,
        imgurl_filed,
        tblimgurl_field,
        tblcsvurl_filed,
        embedding_contents_filed,
        vector_filed,
        bm_25_vector_filed
        ],
    description="현대 기아 차량 매뉴얼",
    partition_key_field="car_type"

)

# Create the collection with the schema above if it does not exist yet;
# otherwise open the existing one (the schema above is then not applied).
if COLLECTION_NAME not in utility.list_collections():
    collection = Collection(name=COLLECTION_NAME, schema=schema)
else:
    collection = Collection(name=COLLECTION_NAME)



In [10]:
# Disabled on purpose: this drops the entire COLLECTION_NAME collection, i.e. the data
# of every car_type stored in it, not just the current NAMESPACE.
# utility.drop_collection(COLLECTION_NAME)

In [94]:
# Pull every Milvus field out of pc_df as a plain Python list, one list per schema
# field. These lists are handed to collection.insert() in schema order further below.

# Scalar / key fields
ids = pc_df['index'].tolist()
car_types = pc_df['car_type'].tolist()
h1s = pc_df['h1'].tolist()
h2s = pc_df['h2'].fillna('').astype(str).tolist()   # missing minor headings become ''
h3s = pc_df['chunk_group2'].tolist()                # goes into minorheading_sub_id
parent_doc_id = pc_df['doc_id'].tolist()

# Content and asset-path fields
cons = pc_df['doc_contents'].tolist()
img_urls = pc_df['img_urls'].tolist()
table_img_urls = pc_df['table_img_urls'].tolist()
table_csv_urls = pc_df['table_csv_urls'].tolist()
embedding_cont = pc_df['embedding_contents'].tolist()

# Dense and sparse vectors
vectors = pc_df['embeddings'].tolist()
bm25_vectors = pc_df['bm25_embeddings'].tolist()

In [95]:
# Disabled: deletes every entity whose car_type equals the current NAMESPACE.
# Presumably meant to be run before re-inserting a car type, so that the ids
# (which restart at 0 for each pc_df) are not inserted twice -- TODO confirm.
# expr = f'''car_type=="{NAMESPACE}"'''
# collection.delete(expr)

In [103]:
# Car type to inspect with the count query in the next cell.
# NOTE(review): this overwrites the global NAMESPACE chosen in the config cell while
# pc_df and the extracted column lists still belong to the earlier value. Any cell
# re-run after this one that uses NAMESPACE (file paths, car_type tagging) will
# silently use the value set here; a separate variable would avoid this hidden state.
# NAMESPACE = 'IONIQ5_2024'
# NAMESPACE = 'SANTAFE_MX5_2023'
NAMESPACE = 'SONATA_DN8_2024'

In [104]:
# Sanity check: fetch the entities stored for the selected car type and show how many
# came back (the cell's last expression is its displayed output).
expr = f'''car_type=="{NAMESPACE}"'''
response = collection.query(
    expr=expr,
    output_fields=["car_type", "doc_contents"],
)
len(response)

583

In [96]:
# Column-based insert: one list per field, in exactly the order the fields are
# declared in the collection schema.
insert_columns = [
    ids,              # id
    car_types,        # car_type
    h1s,              # majorheading
    h2s,              # minorheading
    h3s,              # minorheading_sub_id
    parent_doc_id,    # parent_doc_id
    cons,             # doc_contents
    img_urls,         # img_urls
    table_img_urls,   # table_img_urls
    table_csv_urls,   # table_csv_urls
    embedding_cont,   # embedding_contents
    vectors,          # vector
    bm25_vectors,     # bm25_vector
]
mr = collection.insert(insert_columns)


In [38]:
# Build-time settings passed as "M" and "efConstruction" to the HNSW index below.
M, efConstruction = 20, 100

In [39]:
# HNSW index named "ann_index" over the dense "vector" field, using cosine similarity.
# M and efConstruction are HNSW graph-construction settings (they are not a cluster count).
ann_index_params = {
    "index_type": "HNSW",
    "index_name": "ann_index",
    "metric_type": "COSINE",
    "params": {"M": M, "efConstruction": efConstruction},
}
collection.create_index(field_name="vector", index_params=ann_index_params)

Status(code=0, message=)

In [40]:

# Inverted index named "bm25_index" over the sparse "bm25_vector" field.
bm25_index_params = {
    "index_name": "bm25_index",
    # Index type: either `SPARSE_INVERTED_INDEX` or `SPARSE_WAND`.
    "index_type": "SPARSE_INVERTED_INDEX",
    # Metric type: only `IP` (inner product) is currently supported for this index.
    "metric_type": "IP",
    # Ratio of small vector values dropped while the index is built.
    "params": {"drop_ratio_build": 0.01},
}
collection.create_index(field_name="bm25_vector", index_params=bm25_index_params)

Status(code=0, message=)

In [47]:
# Re-open the existing collection by name and load it so it can be queried/searched.
collection = Collection(name=COLLECTION_NAME)
collection.load()

In [42]:
# Collection migration (disabled). The snippet is kept inside a bare string literal,
# so none of it runs; it shows how entities were copied from collection "TEST" into
# "HYUNDAI_CAR_MANUAL".
# NOTE(review): the filter `id > 0` skips an entity whose id is 0, and the ids written
# by this notebook come from pc_df['index'] after reset_index(), which starts at 0 --
# use `id >= 0` if this snippet is ever revived.
'''
from pymilvus import Collection, connections

# Milvus에 연결
connections.connect("default", host="localhost", port="19530")

# 기존 컬렉션과 새 컬렉션 객체 생성
old_collection = Collection("TEST")
new_collection = Collection("HYUNDAI_CAR_MANUAL")

# 기존 컬렉션의 모든 데이터 검색
old_collection.load()
results = old_collection.query(expr="id > 0", output_fields=["id", "car_type", "majorheading", "minorheading", "minorheading_sub_id", "parent_doc_id", "doc_contents", "img_urls",
                                                             "table_img_urls", "table_csv_urls", "embedding_contents", "vector", "bm25_vector"])

# 새 컬렉션에 데이터 삽입
new_collection.insert(results)

# 변경사항 적용
new_collection.flush()

# 컬렉션 언로드
old_collection.release()
new_collection.release()
'''