In [None]:
from pathlib import Path

import pandas as pd
from pymilvus import connections, utility, FieldSchema, CollectionSchema, DataType, Collection
from pymilvus.model.hybrid import BGEM3EmbeddingFunction

In [None]:
# Load the 'page_content' column of the chunk file as a plain Python list.
chunks_path = r'autodl-tmp/splits_docs_1000chunks.json'
data_ = pd.read_json(chunks_path)['page_content'].to_list()


In [None]:
# Read the same chunk file and collect the 'metadata' entry of every row.
metadata_frame = pd.read_json(r'autodl-tmp/splits_docs_1000chunks.json')
metadata = metadata_frame['metadata'].tolist()

In [None]:
# Display the first metadata record to see which keys each entry carries.
# meta = [data['title'] for data in metadata]
metadata[0]

In [None]:
# `meta` was never assigned (its only definition is commented out), so this
# cell raised NameError on a fresh kernel. Build the list of titles here
# before previewing the first five.
meta = [doc_meta['title'] for doc_meta in metadata]
meta[:5]

In [None]:
# Load the BGE-M3 embedding model from the local checkpoint directory.
bge_m3_options = {
    'model_name': r'autodl-tmp/embedding_model/BAAI/bge-m3',
    'use_fp16': False,
    'device': 'cpu',
}
bge_m3_embedding = BGEM3EmbeddingFunction(**bge_m3_options)
# Show the embedding function object as the cell output.
bge_m3_embedding

In [None]:
# Dense-vector dimension reported by the embedding function; used later as
# the `dim` of the 'dense_vector' field in the collection schema.
dense_dim = bge_m3_embedding.dim['dense']
dense_dim

In [None]:
# Embed all texts that will be indexed. The result is read later through its
# 'sparse' and 'dense' entries when inserting into the collection.
docs_embeddings = bge_m3_embedding(data_)

In [None]:
# Connect to Milvus through the URI of a local database file.
milvus_uri = 'vectordb/milvus_mix/milvus_m3_2.db'
# Make sure the parent directory exists so that connecting does not fail on a
# fresh checkout where 'vectordb/milvus_mix' has not been created yet.
Path(milvus_uri).parent.mkdir(parents=True, exist_ok=True)
connections.connect(uri=milvus_uri)

# Declare the fields of the new collection.
fields = [
    # Auto-generated id used as the primary key.
    FieldSchema(name='pk', dtype=DataType.VARCHAR, is_primary=True, auto_id=True, max_length=10000),
    # Original chunk text, returned by the semantic search.
    FieldSchema(name='text', dtype=DataType.VARCHAR, max_length=8000),
    # Metadata columns stored alongside each chunk.
    FieldSchema(name='title', dtype=DataType.VARCHAR, max_length=8000),
    FieldSchema(name='time', dtype=DataType.VARCHAR, max_length=8000),
    FieldSchema(name='infosource', dtype=DataType.VARCHAR, max_length=8000),
    # Sparse and dense vectors live in separate fields so that a hybrid search
    # can query both of them.
    FieldSchema(name='sparse_vector', dtype=DataType.SPARSE_FLOAT_VECTOR),
    FieldSchema(name='dense_vector', dtype=DataType.FLOAT_VECTOR, dim=dense_dim),
]
# The schema acts as the template of the collection.
schema = CollectionSchema(fields)

# Create the collection, dropping any existing collection with the same name
# first. NOTE: this discards previously inserted data on every run.
col_name = 'hybrid_demo'
if utility.has_collection(col_name):
    Collection(col_name).drop()
col = Collection(col_name, schema, consistency_level='Strong')

# Vector fields need an index for efficient search; both use inner product (IP).
sparse_index = {'index_type': 'SPARSE_INVERTED_INDEX', 'metric_type': 'IP'}
col.create_index('sparse_vector', sparse_index)
dense_index = {'index_type': 'AUTOINDEX', 'metric_type': 'IP'}
col.create_index('dense_vector', dense_index)
col.load()

In [None]:
# Store every chunk's text, metadata columns and embeddings together in the
# collection. Columns are listed in the same order as the schema fields
# (without the auto-generated 'pk'): text, title, time, infosource,
# sparse_vector, dense_vector.
BATCH_SIZE = 50  # number of records sent per insert call

time = [data['time'] for data in metadata]
infosource = [data['infosource'] for data in metadata]
title = [data['title'] for data in metadata]

for start in range(0, len(data_), BATCH_SIZE):
    stop = start + BATCH_SIZE
    batched_entities = [
        data_[start:stop],
        title[start:stop],
        time[start:stop],
        infosource[start:stop],
        docs_embeddings['sparse'][start:stop],
        docs_embeddings['dense'][start:stop],
    ]
    col.insert(batched_entities)

# Flush pending inserts so the entity count printed below reflects them.
col.flush()
print('数据插入数量：', col.num_entities)


In [None]:
# Read a search query interactively for testing and echo it back.
# NOTE(review): input() blocks until the user types, so an unattended
# "Run All" will stop at this cell.
query = input('请输入你的问题：')
print(query)

# Embed the question with the same embedding function used for the documents.
query_embeddings = bge_m3_embedding([query])

In [None]:
# 运行搜索
from pymilvus import AnnSearchRequest, WeightedRanker

def hybrid_search(
    col,
    query_dense_embedding,
    query_sparse_embedding,
    sparse_weight=1.0,
    dense_weight=1.0,
    limit=10
):
    """Search the dense and sparse vector fields together and return hit texts.

    Args:
        col: Collection object exposing ``hybrid_search``.
        query_dense_embedding: Query vector searched against 'dense_vector'.
        query_sparse_embedding: Query vector searched against 'sparse_vector'.
        sparse_weight: Weight passed to the reranker for the sparse request.
        dense_weight: Weight passed to the reranker for the dense request.
        limit: Used both as the per-request limit and the final result limit.

    Returns:
        list: The 'text' field of every hit in the first result set.
    """
    # Dense request: inner-product search on the 'dense_vector' field.
    dense_req = AnnSearchRequest(
        [query_dense_embedding],
        'dense_vector',
        {'metric_type': 'IP', 'params': {}},
        limit=limit,
    )

    # Sparse request: inner-product search on the 'sparse_vector' field.
    sparse_req = AnnSearchRequest(
        [query_sparse_embedding],
        'sparse_vector',
        {'metric_type': 'IP', 'params': {}},
        limit=limit,
    )

    # Weights are given in the same order as the requests: sparse, then dense.
    weighted_ranker = WeightedRanker(sparse_weight, dense_weight)
    search_requests = [sparse_req, dense_req]
    result_sets = col.hybrid_search(
        search_requests, rerank=weighted_ranker, limit=limit, output_fields=['text']
    )

    # Only one query was submitted, so only the first result set is read.
    texts = []
    for hit in result_sets[0]:
        texts.append(hit.get('text'))
    return texts

In [None]:
# Run the hybrid search for the embedded question.
hybrid_results = hybrid_search(
    col,
    query_embeddings['dense'][0],
    # Select the first row with public list indexing instead of the private
    # `_getrow(0)` helper; assumes the sparse embeddings are a SciPy sparse
    # object, for which `[[0]]` keeps the row as a 2-D single-row result.
    query_embeddings['sparse'][[0]],
    sparse_weight=0.7,
    dense_weight=1.0,
)

In [None]:
# Quick sanity check: number of hits, container type, and type of one hit.
# NOTE(review): indexing [0] raises IndexError if the search returned no hits.
len(hybrid_results), type(hybrid_results), type(hybrid_results[0])