In [10]:
import time
import numpy as np
from milvus import default_server
from pymilvus import (
    connections,
    FieldSchema, CollectionSchema, DataType,
    Collection,
    utility
)
from tqdm import tqdm
from utils import extract_sample_data, split_into_batches

In [2]:
# Connect to the remote Milvus instance through kubectl port forwarding.
# Run the following in a terminal before executing this cell:
#   gcloud auth login
#   gcloud container clusters get-credentials bioacoustics-devseed-staging-cluster --region=us-central1-f
#   kubectl port-forward service/milvus 9091:9091 & \
#   kubectl port-forward service/milvus 19530:19530 &

# The port-forward exposes the Milvus service on the local machine.
HOST = "127.0.0.1"
PORT = 19530

# Register the connection under pymilvus' default alias.
connections.connect(alias="default", host=HOST, port=PORT)

active_connections = connections.list_connections()
print("Connections: ", active_connections)

Connections:  [('default', <pymilvus.client.grpc_handler.GrpcHandler object at 0x10cd26520>)]


In [3]:
# Set up the collection that stores the embeddings together with their metadata.
COLLECTION_NAME = "a2o_bioacoustics"

# Start from a clean slate: drop any existing collection with the same name.
# NOTE: this removes the collection and every entity previously inserted into it.
if utility.has_collection(COLLECTION_NAME):
    print("Collection exists, dropping...")
    collection = Collection(COLLECTION_NAME)
    collection.drop()

# Define the collection fields.
# The primary key is generated by Milvus (auto_id=True), so it is never
# supplied when inserting data.
id_field = FieldSchema(
    name="id",
    dtype=DataType.INT64,
    # Fixed: was misspelled `descrition`, which was silently ignored as an
    # unknown extra keyword so the description was never stored.
    description="primary field",
    is_primary=True,
    auto_id=True
)

# The vector field that is indexed and searched.
embedding_field = FieldSchema(
    name="embedding",
    dtype=DataType.FLOAT_VECTOR,
    description="Float32 vector with dim 1280",
    dim=1280,
    is_primary=False
)

# Scalar metadata fields returned alongside search hits.
file_timestamp_field = FieldSchema(
    name="file_timestamp",
    dtype=DataType.INT64,
    description="File timestamp (in seconds since 1970-01-01T00:00:00)"
)
offset_field = FieldSchema(
    name="offset",
    dtype=DataType.INT64,
    description="Offset (in seconds) from start of file where embedding window starts"
)
site_id_field = FieldSchema(
    name="site_id",
    dtype=DataType.INT64,
    description="Site ID",
)
site_name_field = FieldSchema(
    name="site_name",
    dtype=DataType.VARCHAR,
    description="Site name",
    max_length=1000
)
subsite_name_field = FieldSchema(
    name="subsite_name",
    dtype=DataType.VARCHAR,
    description="Subsite name",
    max_length=1000
)
file_seq_id_field = FieldSchema(
    name="file_seq_id",
    dtype=DataType.INT64,
    description="File sequence ID",
)
filename_field = FieldSchema(
    name="filename",
    dtype=DataType.VARCHAR,
    # Added for consistency: every other field carries a description.
    description="Filename",
    max_length=1000
)

# The order of fields here defines the column order expected on insert.
schema = CollectionSchema(
    fields=[
        id_field,
        embedding_field,
        file_timestamp_field,
        offset_field,
        site_id_field,
        site_name_field,
        subsite_name_field,
        file_seq_id_field,
        filename_field
    ],
    # Fixed: "A20" (digit zero) -> "A2O", matching the collection and data file names.
    description="Collection for searching A2O bird embeddings"
)
collection = Collection(
    name=COLLECTION_NAME,
    data=None,
    schema=schema,
    # Set TTL to 0 to disable
    properties={"collection.ttl.seconds": 0}
)
print(f"Collections: {utility.list_collections()}")
print(f"Collection {COLLECTION_NAME} instantiated with {collection.num_entities} entities")

Collection exists, dropping...
Collections: ['a2o_bioacoustics']
Collection a2o_bioacoustics instantiated with 0 entities


In [4]:
%%time
# Load the sample records into memory. Later cells take len(data) and read
# each record by field name (e.g. record["embedding"]).
# NOTE(review): extract_sample_data comes from the local utils module; the
# exact record type is not visible here - confirm against utils.
data = extract_sample_data()

2023-05-16 14:08:39.423373: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]


Processed data file: point_one_percent_embeddings/a2o_sample_embeddings-00000-of-00007, found 24903 records


2023-05-16 14:08:48.467108: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]


Processed data file: point_one_percent_embeddings/a2o_sample_embeddings-00001-of-00007, found 3102 records


2023-05-16 14:08:49.568549: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]


Processed data file: point_one_percent_embeddings/a2o_sample_embeddings-00002-of-00007, found 7740 records


2023-05-16 14:08:52.070802: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]


Processed data file: point_one_percent_embeddings/a2o_sample_embeddings-00003-of-00007, found 5981 records
Processed data file: point_one_percent_embeddings/a2o_sample_embeddings-00004-of-00007, found 424 records


2023-05-16 14:08:54.066214: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]
2023-05-16 14:08:54.225559: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]


Processed data file: point_one_percent_embeddings/a2o_sample_embeddings-00005-of-00007, found 21629 records


2023-05-16 14:09:00.467281: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]


Processed data file: point_one_percent_embeddings/a2o_sample_embeddings-00006-of-00007, found 9041 records
CPU times: user 31.1 s, sys: 5.63 s, total: 36.7 s
Wall time: 23.9 s


In [5]:
%%time

# Insert the data into the Milvus collection in fixed-size batches.
# TODO: find documentation on why batching is necessary; it was introduced
# to get around the kernel dying when trying to insert the entire
# collection at once.
BATCH_SIZE = 5000

# Columns are sent in the same order as the schema fields; the primary key
# `id` is omitted because it is auto-generated (auto_id=True).
INSERT_FIELDS = (
    "embedding",
    "file_timestamp",
    "offset",
    "site_id",
    "site_name",
    "subsite_name",
    "file_seq_id",
    "filename",
)

# Ceiling division so the final, partially filled batch is counted too.
# The previous int(len(data) / 5000) under-counted by one whenever
# len(data) was not a multiple of the batch size, which made tqdm drop
# its total/ETA display.
num_batches = -(-len(data) // BATCH_SIZE)

for _batch in tqdm(split_into_batches(data, BATCH_SIZE), total=num_batches):
    # Pivot the row-oriented records into one list of values per field.
    collection.insert(
        [
            [_data[fieldname] for _data in _batch]
            for fieldname in INSERT_FIELDS
        ]
    )

# Flush once after all inserts instead of after every batch: each flush
# seals the current segment, so flushing per batch produces many small
# segments and slows ingestion down considerably.
collection.flush()
print(f"Collection {COLLECTION_NAME} currently loaded with {collection.num_entities} entities")

175it [1:09:27, 23.81s/it]                                                                           

Collection a2o_bioacoustics currently loaded with 872994 entities
CPU times: user 7min 5s, sys: 57.4 s, total: 8min 2s
Wall time: 1h 9min 27s





In [8]:
%%time
# Remove any index that already exists so the new one replaces it.
if collection.indexes:
    current_index = collection.index()
    print(f"Dropping current index on field: {current_index.field_name} -> {current_index.params}")
    collection.drop_index()
    print(f"Indexes remaining after dropping: {collection.indexes}")

print("Creating new index: IVF_SQ8 with params nlist:1024")

# Index definition for the vector field; L2 must also be used as the
# metric when searching.
index_params = {
    "index_type": "IVF_SQ8",
    "params": {"nlist": 1024},
    "metric_type": "L2",
}

# Left as the last expression so the returned status is displayed.
collection.create_index(field_name="embedding", index_params=index_params)

Creating new index: IVF_SQ8 with params nlist:1024
CPU times: user 2.41 s, sys: 1.1 s, total: 3.5 s
Wall time: 11min 48s


Status(code=0, message=)

In [26]:
%%time
# The collection must be loaded into memory before it can be searched.
collection.load()

# Pick a handful of random embeddings from the sample data as query vectors.
# - Passing the population size (an int) avoids materialising a large range.
# - replace=False guarantees distinct query vectors (the default samples
#   with replacement and may return the same record more than once).
# - The size is capped so small datasets do not raise with replace=False.
NUM_QUERY_VECTORS = 5
query_indices = np.random.choice(
    len(data),
    size=min(NUM_QUERY_VECTORS, len(data)),
    replace=False,
)
search_vectors = [data[i]["embedding"] for i in query_indices]

# Search arguments: the metric matches the one the index was built with,
# and the listed scalar fields are returned with every hit.
search_param = {
    "data": search_vectors,
    "anns_field": "embedding",
    "param": {"metric_type": "L2", "params": {"nprobe": 16}},
    "limit": 25,
    "output_fields": ["site_name", "subsite_name", "file_timestamp", "filename"]
}

search_results = collection.search(**search_param)

CPU times: user 83 ms, sys: 21.6 ms, total: 105 ms
Wall time: 435 ms


In [23]:
import json

# Persist the query vectors so the same search can be replayed later.
# Each vector is converted with .tolist() to make it JSON serialisable.
serializable_vectors = [vector.tolist() for vector in search_vectors]
with open("sample_search_vectors.json", "w") as out_file:
    json.dump(serializable_vectors, out_file)


In [18]:
# Materialise the search results: one entry of hits per query vector.
[hits for hits in search_results]

[<pymilvus.orm.search.Hits at 0x135baee80>,
 <pymilvus.orm.search.Hits at 0x1811e78e0>,
 <pymilvus.orm.search.Hits at 0x1811e7940>,
 <pymilvus.orm.search.Hits at 0x1811e7a00>,
 <pymilvus.orm.search.Hits at 0x1811e7b80>]