In [2]:
from pymilvus import AnnSearchRequest, DataType, MilvusClient, Function, FunctionType, RRFRanker
# Open the local database file and start an explicit-ID schema.
client = MilvusClient('./milvus_embeddings.db')
schema = client.create_schema(auto_id=False)

# Primary key: caller-supplied string identifier.
schema.add_field(
    field_name="id",
    datatype=DataType.VARCHAR,
    is_primary=True,
    max_length=1000,
    description="package id",
)

# Raw text; the analyzer is enabled because this field feeds the BM25 function below.
schema.add_field(
    field_name="text",
    datatype=DataType.VARCHAR,
    max_length=1000,
    enable_analyzer=True,
    description="title+question",
)

# Dense embedding of the text (1024 dimensions).
schema.add_field(
    field_name="text_dense",
    datatype=DataType.FLOAT_VECTOR,
    dim=1024,
    description="text dense embedding",
)

# Sparse embedding; filled in as the output of the BM25 function rather than inserted.
schema.add_field(
    field_name="text_sparse",
    datatype=DataType.SPARSE_FLOAT_VECTOR,
    description="text sparse embedding auto-generated by the built-in BM25 function",
)

# BM25 function mapping `text` -> `text_sparse`.
bm25_function = Function(
    name="text_bm25_emb",
    function_type=FunctionType.BM25,
    input_field_names=["text"],
    output_field_names=["text_sparse"],
)

# Last expression of the cell: registers the function and displays the resulting schema.
schema.add_function(bm25_function)

{'auto_id': False, 'description': '', 'fields': [{'name': 'id', 'description': 'package id', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 1000}, 'is_primary': True, 'auto_id': False}, {'name': 'text', 'description': 'title+question', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 1000, 'enable_analyzer': True}}, {'name': 'text_dense', 'description': 'text dense embedding', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 1024}}, {'name': 'text_sparse', 'description': 'text sparse embedding auto-generated by the built-in BM25 function', 'type': <DataType.SPARSE_FLOAT_VECTOR: 104>, 'is_function_output': True}], 'enable_dynamic_field': False, 'functions': [{'name': 'text_bm25_emb', 'description': '', 'type': <FunctionType.BM25: 1>, 'input_field_names': ['text'], 'output_field_names': ['text_sparse'], 'params': {}}]}

In [3]:
# Collect the index definitions for both vector fields of the collection.
index_params = client.prepare_index_params()

for index_spec in (
    # Dense embedding: automatically chosen index, inner-product metric.
    {
        "field_name": "text_dense",
        "index_name": "text_dense_index",
        "index_type": "AUTOINDEX",
        "metric_type": "IP",
    },
    # Sparse BM25 embedding: inverted index scored with the BM25 metric.
    {
        "field_name": "text_sparse",
        "index_name": "text_sparse_index",
        "index_type": "SPARSE_INVERTED_INDEX",
        "metric_type": "BM25",
        "params": {"inverted_index_algo": "DAAT_MAXSCORE"},  # or "DAAT_WAND" or "TAAT_NAIVE"
    },
):
    index_params.add_index(**index_spec)

In [4]:
# (Re)create the "packages" collection from `schema` and `index_params`.
#
# `MilvusClient.create_collection` has no documented `overwrite` option; the
# keyword was being passed through **kwargs without replacing anything. The
# replace-if-exists intent is therefore made explicit so the cell is
# idempotent under Restart & Run All.
#
# NOTE: this deletes any data previously stored in "packages".
if client.has_collection(collection_name="packages"):
    client.drop_collection(collection_name="packages")

client.create_collection(
    collection_name="packages",
    schema=schema,
    index_params=index_params,
)