In [1]:
import numpy as np
from milvus import default_server
from pymilvus import (
    connections,
    FieldSchema, CollectionSchema, DataType,
    Collection,
    utility
)
from utils import extract_sample_data, split_into_batches

2023-05-16 13:15:04.791638: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Clean up any lingering data left in the local milvus-lite server by a previous run.
# This has to happen before start() so the server comes up empty.
default_server.cleanup()

# Start the local Milvus server; its startup banner (version, process id,
# config and log locations) is printed as this cell's output.
default_server.start()




    __  _________ _   ____  ______
   /  |/  /  _/ /| | / / / / / __/
  / /|_/ // // /_| |/ / /_/ /\ \
 /_/  /_/___/____/___/\____/___/ {Lite}

 Welcome to use Milvus!

 Version:   v2.2.8-lite
 Process:   33255
 Started:   2023-05-16 13:15:09
 Config:    /Users/leo/.milvus.io/milvus-server/2.2.8/configs/milvus.yaml
 Logs:      /Users/leo/.milvus.io/milvus-server/2.2.8/logs

 Ctrl+C to exit ...


In [3]:
# Address of the Milvus server started in the previous cell:
# loopback host plus whatever port the server reports it is listening on.
HOST = "127.0.0.1"
PORT = default_server.listen_port

# Open a client connection to the already-running server (this does not start
# a server). No alias is passed; the output below shows it registered as 'default'.
# NOTE(review): no disconnect / server shutdown is visible in this notebook -
# remember to close the connection and stop the server when finished.
connections.connect(host=HOST, port=PORT)
print("Connections: ", connections.list_connections())

Connections:  [('default', <pymilvus.client.grpc_handler.GrpcHandler object at 0x12c705880>)]


In [4]:
# Name of the Milvus collection that stores the embeddings and their metadata
COLLECTION_NAME = "a2o_bioacoustics"

# Start from a clean slate: if a collection with this name survived an
# earlier run, drop it so the schema below is applied to a fresh collection.
if utility.has_collection(COLLECTION_NAME):
    Collection(COLLECTION_NAME).drop()

# define collection fields
# Primary-key field. With auto_id=True the ids are generated on insert, which
# is why the insert cell further down supplies no "id" column.
# Fix: the keyword was misspelled `descrition`, so the text was passed as an
# unrelated extra keyword argument and the field's description was never set.
id_field = FieldSchema(
    name="id",
    dtype=DataType.INT64,
    description="primary field",
    is_primary=True,
    auto_id=True
)

# Vector field that similarity searches run against (anns_field="embedding").
embedding_field = FieldSchema(
    name="embedding",
    dtype=DataType.FLOAT_VECTOR,
    description="Float32 vector with dim 1280",
    dim=1280,
    is_primary=False,
)

# --- Scalar metadata stored next to every embedding --------------------------
# These fields can be used in filter expressions and requested as output
# fields, as the search cells later in the notebook do.

# Integer-valued metadata
file_timestamp_field = FieldSchema(
    name="file_timestamp",
    dtype=DataType.INT64,
    description="File timestamp (in seconds since 1970-01-01T00:00:00)",
)
offset_field = FieldSchema(
    name="offset",
    dtype=DataType.INT64,
    description="Offset (in seconds) from start of file where embedding window starts",
)
site_id_field = FieldSchema(name="site_id", dtype=DataType.INT64, description="Site ID")

# String-valued metadata; VARCHAR fields declare a maximum length
site_name_field = FieldSchema(
    name="site_name", dtype=DataType.VARCHAR, description="Site name", max_length=1000
)
subsite_name_field = FieldSchema(
    name="subsite_name", dtype=DataType.VARCHAR, description="Subsite name", max_length=1000
)

file_seq_id_field = FieldSchema(
    name="file_seq_id", dtype=DataType.INT64, description="File sequence ID"
)

# The only field declared without a description
filename_field = FieldSchema(name="filename", dtype=DataType.VARCHAR, max_length=1000)

# Collection schema: the auto-generated primary key, the embedding vector, and
# the scalar metadata fields. The insert cell supplies its columns in this same
# field order, leaving out "id" because it is auto-generated.
# Fix: the description said "A20" (digit zero); the dataset is A2O, as in
# COLLECTION_NAME and the sample file names.
schema = CollectionSchema(
    fields=[
        id_field,
        embedding_field,
        file_timestamp_field,
        offset_field,
        site_id_field,
        site_name_field,
        subsite_name_field,
        file_seq_id_field,
        filename_field
    ],
    description="Collection for searching A2O bird embeddings"
)
# Create the collection from the schema defined above. No initial data is
# passed (data=None); entities are inserted by a later cell.
collection = Collection(
    name=COLLECTION_NAME, 
    data=None,
    schema=schema, 
    # Entity time-to-live in seconds; 0 disables TTL-based expiry.
    # (The set_properties helper at the end of the notebook uses this same key.)
    properties={"collection.ttl.seconds": 0}
)
# Sanity check: the collection is listed and reports its current entity count
# (0 in the recorded output, since nothing has been inserted yet).
print(f"Collections: {utility.list_collections()}")
print(f"Collection {COLLECTION_NAME} instantiated with {collection.num_entities} entities")

[93m[has_collection] retry:4, cost: 0.27s, reason: <_MultiThreadedRendezvous: StatusCode.UNAVAILABLE, internal: Milvus Proxy is not ready yet. please wait>[0m
[93m[has_collection] retry:5, cost: 0.81s, reason: <_MultiThreadedRendezvous: StatusCode.UNAVAILABLE, internal: Milvus Proxy is not ready yet. please wait>[0m


Collections: ['a2o_bioacoustics']
Collection a2o_bioacoustics instantiated with 0 entities


In [5]:
# Read the sample embedding records through the project-local helper in utils;
# the output below shows one "Processed data file" line per file it reads.
# NOTE(review): later cells use len(data) and data[i]["embedding"], so this is
# presumably a list of dict-like records keyed by schema field name - confirm in utils.
data = extract_sample_data()

2023-05-16 13:15:11.610981: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]


Processed data file: point_one_percent_embeddings/a2o_sample_embeddings-00000-of-00007, found 24903 records


2023-05-16 13:15:19.899305: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]


Processed data file: point_one_percent_embeddings/a2o_sample_embeddings-00001-of-00007, found 3102 records


2023-05-16 13:15:20.969666: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]


Processed data file: point_one_percent_embeddings/a2o_sample_embeddings-00002-of-00007, found 7740 records


2023-05-16 13:15:23.526890: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]


Processed data file: point_one_percent_embeddings/a2o_sample_embeddings-00003-of-00007, found 5981 records
Processed data file: point_one_percent_embeddings/a2o_sample_embeddings-00004-of-00007, found 424 records


2023-05-16 13:15:25.410433: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]
2023-05-16 13:15:25.594177: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]


Processed data file: point_one_percent_embeddings/a2o_sample_embeddings-00005-of-00007, found 21629 records


2023-05-16 13:15:33.254775: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]


Processed data file: point_one_percent_embeddings/a2o_sample_embeddings-00006-of-00007, found 9041 records


In [6]:
%%time
# Insert the records batch by batch instead of in one call.
# TODO: find documentation on why this is necessary; batching was introduced to
# get around the kernel dying when the entire collection was inserted at once.
# NOTE(review): the batch size is whatever split_into_batches defaults to -
# presumably 10_000 as the original note said; confirm in utils.

# Field names in the same order as the schema's fields, without the
# auto-generated "id" primary key.
insert_field_names = (
    "embedding",
    "file_timestamp",
    "offset",
    "site_id",
    "site_name",
    "subsite_name",
    "file_seq_id",
    "filename",
)

for _batch in split_into_batches(data):
    # Pivot the batch from a list of records into one list of values per field
    columns = [
        [record[field_name] for record in _batch]
        for field_name in insert_field_names
    ]
    collection.insert(columns)

# Flush once all batches are in, then report the resulting entity count
collection.flush()
print(f"Collection {COLLECTION_NAME} currently loaded with {collection.num_entities} entities")


Collection a2o_bioacoustics currently loaded with 872994 entities
CPU times: user 5min 42s, sys: 47 s, total: 6min 29s
Wall time: 7min 17s


In [7]:
%%time
# Index definition for the vector field: FLAT index type, L2 distance metric,
# and no index-specific build parameters.
# NOTE(review): FLAT is understood to be Milvus's exhaustive (exact) index,
# which would explain why the next search is stored as `reference_results` -
# confirm against the Milvus index documentation.
index_params = {
    "index_type": "FLAT",
    "params":{},
    "metric_type": "L2"
}
# Build the index on the "embedding" field, then echo back the parameters the
# collection reports for it.
collection.create_index("embedding", index_params)
print(f"Created index {collection.index().params}")

Created index {'index_type': 'FLAT', 'params': {}, 'metric_type': 'L2'}
CPU times: user 8.34 ms, sys: 12.7 ms, total: 21 ms
Wall time: 1.24 s


In [8]:
# Randomly select a fixed set of stored embeddings to use as query vectors, so
# that different indexing strategies can be compared on identical inputs.
#
# Fixes over the original np.random.choice(range(len(data)), size=100):
#   * a seeded generator makes the selection reproducible after a kernel restart;
#   * replace=False guarantees distinct vectors (np.random.choice samples WITH
#     replacement by default, so the same vector could be drawn twice);
#   * the sample size is capped at len(data), which replace=False requires.
N_SEARCH_VECTORS = 100
SEARCH_SEED = 42

_search_rng = np.random.default_rng(SEARCH_SEED)
_search_indices = _search_rng.choice(
    len(data),
    size=min(N_SEARCH_VECTORS, len(data)),
    replace=False,
)
search_vectors = [data[i]["embedding"] for i in _search_indices]

In [9]:
%%time
# Baseline search against the FLAT index: for every query vector fetch its 100
# nearest stored embeddings. The result is kept as `reference_results`.
# The collection has to be loaded before it can be searched.
collection.load()

search_param = {
    "data": search_vectors,
    "anns_field": "embedding",
    # Fix: Collection.search() receives the search settings through its `param`
    # argument. The dict previously used the key "params", which does not match
    # that argument name when unpacked with **. The nested "params" key (inside
    # the value) is correct and holds index-specific settings - none here.
    "param": {"metric_type": "L2", "params": {}},
    "limit": 100,
}

reference_results = collection.search(**search_param)

# Release the collection again now that the search is done
collection.release()

CPU times: user 85.1 ms, sys: 145 ms, total: 230 ms
Wall time: 50.1 s


In [10]:
# The result set behaves like a 2D list with dims (100, 100): one entry per
# query vector (100 were sampled), each holding the `limit` = 100 hits for
# that query. Print only the first 5 hits of the first 5 queries to keep the
# output short; each hit shows its id, distance and (here empty) entity.
print([v[0:5] for v in reference_results[0:5]])

# The result set is not a plain list but a custom Milvus type; show the type
# at each nesting level (result set -> per-query hits -> single hit).
print(f"{type(reference_results)}[{type(reference_results[0])}[{type(reference_results[0][0])}]]")

[[id: 441517948086289134, distance: 0.0, entity: {}, id: 441517948085808963, distance: 2.2333621978759766, entity: {}, id: 441517948085707353, distance: 2.244128942489624, entity: {}, id: 441517948085659490, distance: 2.2578206062316895, entity: {}, id: 441517948085981560, distance: 2.273188591003418, entity: {}], [id: 441517948086259738, distance: 0.0, entity: {}, id: 441517948086259737, distance: 0.9440094232559204, entity: {}, id: 441517948086259739, distance: 0.9482507109642029, entity: {}, id: 441517948085749689, distance: 1.0887136459350586, entity: {}, id: 441517948086073925, distance: 1.0941250324249268, entity: {}], [id: 441517948085612170, distance: 0.0, entity: {}, id: 441517948086248239, distance: 3.3319246768951416, entity: {}, id: 441517948085991406, distance: 3.396502733230591, entity: {}, id: 441517948085517466, distance: 4.2184624671936035, entity: {}, id: 441517948085681784, distance: 4.463274955749512, entity: {}], [id: 441517948085735201, distance: 0.0, entity: {}, 

In [11]:
%%time
# Example hybrid search: vector similarity restricted by a metadata filter,
# with selected scalar fields returned alongside every hit.
# Load collection into memory in order to be able to search
collection.load()

search_param = {
    "data": search_vectors,
    "anns_field": "embedding",
    # Fix: Collection.search() receives the search settings through its `param`
    # argument; the dict previously used the key "params", which does not match
    # that argument name when unpacked with **.
    # NOTE(review): nprobe is an IVF-style setting; presumably it has no effect
    # on the FLAT index built earlier in this notebook - confirm.
    "param": {"metric_type": "L2", "params": {"nprobe": 16}},
    "limit": 10,
    # Boolean filter: only entities whose subsite_name is "Wet-A" may be returned
    "expr": "subsite_name == \"Wet-A\"",
    # Scalar fields to attach to each hit's entity
    "output_fields": ["site_name", "subsite_name", "file_timestamp"]
}
results = collection.search(**search_param)

# One group of up to `limit` hits per query vector
for i, result in enumerate(results):
    print(f"Results for search vector {i}:")
    for j, res in enumerate(result):
        print(f"Top {j}: {res}")

Results for search vector 0:
Top 0: id: 441517948086289134, distance: 0.0, entity: {'subsite_name': 'Wet-A', 'file_timestamp': 1622224800, 'site_name': 'Cape-Barren-Island'}
Top 1: id: 441517948085672232, distance: 2.314201593399048, entity: {'subsite_name': 'Wet-A', 'file_timestamp': 1622224800, 'site_name': 'Cape-Barren-Island'}
Top 2: id: 441517948086047245, distance: 2.3200578689575195, entity: {'subsite_name': 'Wet-A', 'file_timestamp': 1585593000, 'site_name': 'Boolcoomatta'}
Top 3: id: 441517948085965537, distance: 2.33107852935791, entity: {'subsite_name': 'Wet-A', 'file_timestamp': 1622224800, 'site_name': 'Cape-Barren-Island'}
Top 4: id: 441517948086073119, distance: 2.3458263874053955, entity: {'subsite_name': 'Wet-A', 'file_timestamp': 1622224800, 'site_name': 'Cape-Barren-Island'}
Top 5: id: 441517948085874930, distance: 2.3469960689544678, entity: {'subsite_name': 'Wet-A', 'file_timestamp': 1622224800, 'site_name': 'Cape-Barren-Island'}
Top 6: id: 441517948085483170, dist

In [12]:
%%time
# Example search WITHOUT metadata filtering (no "expr" is passed) that still
# returns selected scalar fields with every hit. Compare with the filtered
# search in the previous cell. (The earlier comment here was copied from that
# cell and wrongly described this search as filtered.)
# Load collection into memory in order to be able to search
collection.load()

search_param = {
    "data": search_vectors,
    "anns_field": "embedding",
    # Fix: Collection.search() receives the search settings through its `param`
    # argument; the dict previously used the key "params", which does not match
    # that argument name when unpacked with **.
    # NOTE(review): nprobe is an IVF-style setting; presumably it has no effect
    # on the FLAT index built earlier in this notebook - confirm.
    "param": {"metric_type": "L2", "params": {"nprobe": 16}},
    "limit": 10,
    # Scalar fields to attach to each hit's entity
    "output_fields": ["site_name", "subsite_name", "file_timestamp"]
}
results = collection.search(**search_param)

# One group of up to `limit` hits per query vector
for i, result in enumerate(results):
    print(f"Results for search vector {i}:")
    for j, res in enumerate(result):
        print(f"Top {j}: {res}")

Results for search vector 0:
Top 0: id: 441517948086289134, distance: 0.0, entity: {'file_timestamp': 1622224800, 'site_name': 'Cape-Barren-Island', 'subsite_name': 'Wet-A'}
Top 1: id: 441517948085808963, distance: 2.2333621978759766, entity: {'file_timestamp': 1594958400, 'site_name': 'Toorale-National-Park', 'subsite_name': 'Wet-B'}
Top 2: id: 441517948085707353, distance: 2.244128942489624, entity: {'file_timestamp': 1586226600, 'site_name': 'Bon-Bon-Station', 'subsite_name': 'Wet-B'}
Top 3: id: 441517948085659490, distance: 2.2578206062316895, entity: {'file_timestamp': 1562140800, 'site_name': 'Sturt-National-Park', 'subsite_name': 'Dry-A'}
Top 4: id: 441517948085981560, distance: 2.273188591003418, entity: {'file_timestamp': 1596153600, 'site_name': 'Matuwa-Indigenous-Protected-Area', 'subsite_name': 'Dry-B'}
Top 5: id: 441517948086131331, distance: 2.2833311557769775, entity: {'file_timestamp': 1595320200, 'site_name': 'Boolcoomatta', 'subsite_name': 'Wet-B'}
Top 6: id: 44151794

In [13]:
# Misc helper functions
def drop_index(collection):
    """Drop the index of ``collection`` and print a confirmation line.

    Args:
        collection: Object exposing a ``drop_index()`` method (a pymilvus
            ``Collection`` in this notebook).

    Returns:
        None. Any exception raised by ``collection.drop_index()`` propagates,
        in which case no confirmation is printed.
    """
    collection.drop_index()
    # Fix: the message was misspelled "sucessfully".
    print("\nDrop index successfully")

def release_collection(collection):
    """Release ``collection`` by delegating to its ``release()`` method.

    Args:
        collection: Object exposing a ``release()`` method (a pymilvus
            ``Collection`` in this notebook).

    Returns:
        None.
    """
    collection.release()
    return None

def set_properties(collection, ttl_seconds=1800):
    """Set the ``collection.ttl.seconds`` property on ``collection``.

    The TTL used to be hard-coded to 1800; it is now a parameter whose default
    keeps the previous behaviour, so existing ``set_properties(collection)``
    calls are unaffected.

    Args:
        collection: Object exposing ``set_properties(properties=...)`` (a
            pymilvus ``Collection`` in this notebook).
        ttl_seconds: Value stored under ``"collection.ttl.seconds"``. The
            collection in this notebook is created with 0 to disable TTL.

    Returns:
        None.
    """
    collection.set_properties(properties={"collection.ttl.seconds": ttl_seconds})