In [1]:
import os
import re
import datetime
import time
import numpy as np
import tensorflow as tf
from milvus import default_server
from pymilvus import (
    connections,
    FieldSchema, CollectionSchema, DataType,
    Collection,
    utility
)

2023-05-08 12:43:58.089159: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# quick and dirty download of sample data using `!` to execute 
# command directly against CLI (rather than through the python 
# interpreter)
EMBEDDINGS_DIR = "point_one_percent_embeddings"
if not os.path.exists(EMBEDDINGS_DIR): 
    !gsutil -m cp -r "gs://a20_dropbox/point_one_percent_embeddings" .

In [3]:
# Define the feature description for parsing the TFRecordDataset
feature_description = {
    'timestamp_s': tf.io.FixedLenFeature([], tf.float32),
    'filename': tf.io.FixedLenFeature([], tf.string),
    'embedding': tf.io.FixedLenFeature([], tf.string),
    'embedding_shape': tf.io.FixedLenFeature([3], tf.int64)
}

# Define a function to parse the TFRecordDataset
def parse_tfrecord(example_proto):
    # Parse the features from the serialized example
    features = tf.io.parse_single_example(example_proto, feature_description)
    
    # extract embedding as 3D array of float32, from byte string 
    embedding = tf.io.parse_tensor(features["embedding"], out_type=tf.float32)
    
    return features['timestamp_s'], features["filename"], embedding, features["embedding_shape"]

In [4]:
# Parse single dataset file as an example
dataset_file = f"point_one_percent_embeddings/a2o_sample_embeddings-00000-of-00007"
raw_dataset = tf.data.TFRecordDataset(dataset_file)
dataset = raw_dataset.map(parse_tfrecord)

for timestamp_s, filename, embedding, embedding_shape in dataset.take(1):
    print("Timestamp: ", timestamp_s)
    print("Filename: ", filename)
    print("Embedding: ", embedding)
    print("Embedding shape: ", embedding_shape)


Timestamp:  tf.Tensor(5640.0, shape=(), dtype=float32)
Filename:  tf.Tensor(b'site_0062/20200308T220000+0800_Uunguu-Indigenous-Protected-Area-Wunambal-Gaambera-Wet-A_192000.flac', shape=(), dtype=string)
Embedding:  tf.Tensor(
[[[ 1.2194262e-01 -5.2180082e-02  5.2327113e-03 ... -2.1131415e-02
   -2.7775053e-02 -1.7569216e-02]]

 [[ 8.3896384e-02  1.5045750e-01  4.5462926e-03 ... -6.7515844e-03
    1.8597849e-02 -2.3018550e-02]]

 [[ 1.0783694e-01  1.3868606e-01  2.0184470e-02 ... -8.1872819e-03
    2.5923029e-02 -2.4738500e-02]]

 ...

 [[ 5.9446793e-02  2.2982250e-01 -1.7121695e-04 ...  1.0457955e-02
   -7.0175850e-03 -1.4521907e-02]]

 [[ 1.1026376e-01 -3.6579993e-02  3.6871318e-02 ... -4.6614930e-03
    3.0870575e-02 -3.2219391e-03]]

 [[ 1.8443571e-01 -4.1682284e-02  2.3081422e-02 ... -8.0404608e-03
   -3.4469940e-02 -2.2840943e-02]]], shape=(12, 1, 1280), dtype=float32)
Embedding shape:  tf.Tensor([  12    1 1280], shape=(3,), dtype=int64)


2023-05-08 12:44:36.378329: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]


In [5]:
# convert extracted metadata to insertable data records
def prep_data_from_file(dataset_filename): 
    data = []
    raw_dataset = tf.data.TFRecordDataset(dataset_filename)
    count = 0
    for timestamp_s, filename, embedding, embedding_shape in raw_dataset.map(parse_tfrecord).as_numpy_iterator():
        [(
            site_id, 
            file_datetime, 
            timezone, 
            site_name, 
            subsite_name, 
            file_seq_id
        )] = re.findall(
            # I'm quite proud of myself for this regex, but if anyone can see 
            # a way to simplify it, please let me know!
            r"site_(?P<site_id>\d{4})\/(?P<datetime>\d{8}T\d{6})(?P<timezone>(?:\+\d{4})|Z)_(?P<site_name>(?:\w*|-)*)-(?P<subsite_name>(?:Wet|Dry)-(?:A|B))_(?P<file_seq_id>\d*).flac",
            filename.decode("utf-8")
        )
        
        # Some files have just "Z" as timezone
        timezone = "+0000" if timezone == "Z" else timezone
        file_datetime = datetime.datetime.strptime(f"{file_datetime}{timezone}", f"%Y%m%dT%H%M%S%z").timestamp()
        
        # `embedding` is a 3D array with Dims [12,1,1280]
        # We loop over the first dimension to "flatten" 
        # the 12 emebddings per minute
        # and extract the single channel (2nd dimension). 
        # We add each of the 12 embeddings as their own record
        for i, _embedding in enumerate(embedding[:,0]):
            _data = {
                "embedding": _embedding, 
                "file_timestamp": int(file_datetime), 
                "offset": int(timestamp_s + (5*i)), 
                "site_id": int(site_id), 
                "site_name": site_name, 
                "subsite_name": subsite_name, 
                "file_seq_id": int(file_seq_id),
                "filename": filename.decode("utf-8")
            }
            data.append(_data)
        count += 1
    print(f"Processed data file: {dataset_filename}, found {count} records")
    return data
    

In [6]:
# example 
data = prep_data_from_file("point_one_percent_embeddings/a2o_sample_embeddings-00000-of-00007")
data[0:5]

2023-05-08 12:44:36.499117: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]


Processed data file: point_one_percent_embeddings/a2o_sample_embeddings-00000-of-00007, found 24903 records


[{'embedding': array([ 0.12194262, -0.05218008,  0.00523271, ..., -0.02113141,
         -0.02777505, -0.01756922], dtype=float32),
  'file_timestamp': 1583676000,
  'offset': 5640,
  'site_id': 62,
  'site_name': 'Uunguu-Indigenous-Protected-Area-Wunambal-Gaambera',
  'subsite_name': 'Wet-A',
  'file_seq_id': 192000,
  'filename': 'site_0062/20200308T220000+0800_Uunguu-Indigenous-Protected-Area-Wunambal-Gaambera-Wet-A_192000.flac'},
 {'embedding': array([ 0.08389638,  0.1504575 ,  0.00454629, ..., -0.00675158,
          0.01859785, -0.02301855], dtype=float32),
  'file_timestamp': 1583676000,
  'offset': 5645,
  'site_id': 62,
  'site_name': 'Uunguu-Indigenous-Protected-Area-Wunambal-Gaambera',
  'subsite_name': 'Wet-A',
  'file_seq_id': 192000,
  'filename': 'site_0062/20200308T220000+0800_Uunguu-Indigenous-Protected-Area-Wunambal-Gaambera-Wet-A_192000.flac'},
 {'embedding': array([ 0.10783694,  0.13868606,  0.02018447, ..., -0.00818728,
          0.02592303, -0.0247385 ], dtype=float

In [7]:
# clean up any lingering data in milvus server
default_server.cleanup()

# startup milvus server
default_server.start()




    __  _________ _   ____  ______
   /  |/  /  _/ /| | / / / / / __/
  / /|_/ // // /_| |/ / /_/ /\ \
 /_/  /_/___/____/___/\____/___/ {Lite}

 Welcome to use Milvus!

 Version:   v2.2.8-lite
 Process:   72235
 Started:   2023-05-08 12:45:07
 Config:    /Users/leo/.milvus.io/milvus-server/2.2.8/configs/milvus.yaml
 Logs:      /Users/leo/.milvus.io/milvus-server/2.2.8/logs

 Ctrl+C to exit ...


In [8]:
# Set up milvus server
HOST = "127.0.0.1"
PORT = default_server.listen_port

# Spin up server (remember to close after)
connections.connect(host=HOST, port=PORT)
print("Connections: ", connections.list_connections())

Connections:  [('default', <pymilvus.client.grpc_handler.GrpcHandler object at 0x1330dcdc0>)]


In [9]:
# set up collection
COLLECTION_NAME = "a2o_bioacoustics"

# Drop collection if exists
if utility.has_collection(COLLECTION_NAME): 
    collection = Collection(COLLECTION_NAME)
    collection.drop()

# define collection fields
id_field = FieldSchema(
    name="id", 
    dtype=DataType.INT64, 
    descrition="primary field", 
    is_primary=True, 
    auto_id=True
)

embedding_field = FieldSchema(
    name="embedding", 
    dtype=DataType.FLOAT_VECTOR, 
    description="Float32 vector with dim 1280", 
    dim=1280,
    is_primary=False
)
file_timestamp_field = FieldSchema(
    name="file_timestamp", 
    dtype=DataType.INT64, 
    description="File timestamp (in seconds since 1970-01-01T00:00:00)"
)
offset_field = FieldSchema(
    name="offset", 
    dtype=DataType.INT64, 
    description="Offset (in seconds) from start of file where embedding window starts"
)
site_id_field = FieldSchema(
    name="site_id", 
    dtype=DataType.INT64, 
    description="Site ID", 
)
site_name_field = FieldSchema(
    name="site_name", 
    dtype=DataType.VARCHAR, 
    description="Site name", 
    max_length=1000
)
subsite_name_field = FieldSchema(
    name="subsite_name", 
    dtype=DataType.VARCHAR, 
    description="Subsite name", 
    max_length=1000
)
file_seq_id_field = FieldSchema(
    name="file_seq_id", 
    dtype=DataType.INT64, 
    description="File sequence ID", 
)
filename_field = FieldSchema(
    name="filename", 
    dtype=DataType.VARCHAR, 
    max_length=1000
)

schema = CollectionSchema(
    fields=[
        id_field,
        embedding_field, 
        file_timestamp_field,
        offset_field, 
        site_id_field, 
        site_name_field, 
        subsite_name_field, 
        file_seq_id_field, 
        filename_field
    ], 
    description="Collection for searching A20 bird embeddings"
)
collection = Collection(
    name=COLLECTION_NAME, 
    data=None,
    schema=schema, 
    # Set TTL to 0 to disable
    properties={"collection.ttl.seconds": 0}
)
print(f"Collections: {utility.list_collections()}")
print(f"Collection {COLLECTION_NAME} instantiated with {collection.num_entities} entities")

[93m[has_collection] retry:4, cost: 0.27s, reason: <_MultiThreadedRendezvous: StatusCode.UNAVAILABLE, internal: Milvus Proxy is not ready yet. please wait>[0m
[93m[has_collection] retry:5, cost: 0.81s, reason: <_MultiThreadedRendezvous: StatusCode.UNAVAILABLE, internal: Milvus Proxy is not ready yet. please wait>[0m


Collections: ['a2o_bioacoustics']
Collection a2o_bioacoustics instantiated with 0 entities


In [10]:
# helper function to batch data: 
def split_into_batches(data, n=10_000): 
    for i in range(0, len(data), n):
        yield data[i:i + n]
 

In [11]:
%%time
# generate insertable records from all files in the 0.1 percent sample
data = [
    prep_data_from_file(f"point_one_percent_embeddings/a2o_sample_embeddings-0000{i}-of-00007") 
    for i in range(0,7)
]

# flatten 2 list for of records 
data = [record for _data in data for record in _data]

print(f"{len(data)} records to insert into the milvus collection")

# split data into batches of 10_000 for insertion into Milvus collection
# TODO: find documentation on why this is necessary, I did this to try 
# to get around the kernel dying when trying to insert the entire 
# collection at once
for _batch in split_into_batches(data): 

    #insert 
    collection.insert(
        [
            [_data[fieldname] for _data in _batch] 
            for fieldname in (
                "embedding",
                "file_timestamp",
                "offset",
                "site_id",
                "site_name",
                "subsite_name", 
                "file_seq_id", 
                "filename"
            )
        ]
    ) 

collection.flush()
print(f"Collection {COLLECTION_NAME} currently loaded with {collection.num_entities} entities")


2023-05-08 12:45:09.460493: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]


Processed data file: point_one_percent_embeddings/a2o_sample_embeddings-00000-of-00007, found 24903 records


2023-05-08 12:45:25.681044: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]


Processed data file: point_one_percent_embeddings/a2o_sample_embeddings-00001-of-00007, found 3102 records


2023-05-08 12:45:27.582413: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]


Processed data file: point_one_percent_embeddings/a2o_sample_embeddings-00002-of-00007, found 7740 records


2023-05-08 12:45:32.265129: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]


Processed data file: point_one_percent_embeddings/a2o_sample_embeddings-00003-of-00007, found 5981 records


2023-05-08 12:45:35.722355: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]


Processed data file: point_one_percent_embeddings/a2o_sample_embeddings-00004-of-00007, found 424 records


2023-05-08 12:45:35.994218: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]


Processed data file: point_one_percent_embeddings/a2o_sample_embeddings-00005-of-00007, found 21629 records


2023-05-08 12:45:48.441521: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]


Processed data file: point_one_percent_embeddings/a2o_sample_embeddings-00006-of-00007, found 9041 records
872994 records to insert into the milvus collection
Collection a2o_bioacoustics currently loaded with 872994 entities
CPU times: user 8min 48s, sys: 1min 30s, total: 10min 19s
Wall time: 11min 14s


In [12]:
%%time
index_params = {
    "index_type": "FLAT",
    "params":{},
    "metric_type": "L2"
}
collection.create_index("embedding", index_params)
print(f"Created index {collection.index().params}")

Created index {'index_type': 'FLAT', 'params': {}, 'metric_type': 'L2'}
CPU times: user 5.94 ms, sys: 9.68 ms, total: 15.6 ms
Wall time: 905 ms


In [13]:
# randomly select a set of vectors to use as search
# vectors to compare the different indexing strategies
search_vectors = [data[i]["embedding"] for i in np.random.choice(range(len(data)), size=100)]

In [14]:
%%time
collection.load()

search_param = {
    "data": search_vectors,
    "anns_field": "embedding",
    "param": {"metric_type": "L2", "params": {}},
    "limit": 100,
}

reference_results = collection.search(**search_param)
collection.release()

CPU times: user 88.1 ms, sys: 111 ms, total: 199 ms
Wall time: 59.5 s


In [15]:
# Result set is a 2D list with dims (100,100). 
# For each of the 100 input vectors, the inner list contains the 100
# "closest" vectors
print([v[0:5] for v in reference_results[0:5]])

# the result set is a custom Milvus type:
print(f"{type(reference_results)}[{type(reference_results[0])}[{type(reference_results[0][0])}]]")

[[id: 441336281742832975, distance: 0.0, entity: {}, id: 441336281742832976, distance: 1.9803709983825684, entity: {}, id: 441336281742774363, distance: 3.019282579421997, entity: {}, id: 441336281743335561, distance: 3.0435564517974854, entity: {}, id: 441336281742975836, distance: 3.065373420715332, entity: {}], [id: 441336281743155740, distance: 0.0, entity: {}, id: 441336281743427851, distance: 0.4990261495113373, entity: {}, id: 441336281743591324, distance: 0.512794017791748, entity: {}, id: 441336281742791997, distance: 0.5312370657920837, entity: {}, id: 441336281743355754, distance: 0.5318313837051392, entity: {}], [id: 441336281743035609, distance: 0.0, entity: {}, id: 441336281743386831, distance: 0.7322882413864136, entity: {}, id: 441336281742953721, distance: 0.779429018497467, entity: {}, id: 441336281743060418, distance: 0.7794749736785889, entity: {}, id: 441336281743571204, distance: 0.7816017270088196, entity: {}], [id: 441336281743475037, distance: 0.0, entity: {}, 

In [16]:
def score_recall(search, reference, at=100):
    return len([i for i in search.ids[:at] if i in reference.ids[:at]])/at
    
# returns average recall@1, recall @10 and recall@100 
# for each search result (compared to the reference result
# set above)
def score(search_results):
    scores = [
        [score_recall(search,ref,at=i) 
        for i in (1,10,100)]
        # the search results class should be iterable, however something 
        # was affecting the orders of the ids when zipping the search
        # result class, so I convert them first to a List[Hits].
        for search, ref in zip(list(search_results), list(reference_results))
    ]
    return np.mean(np.array(scores), axis=0)

In [17]:
# Assert scoring method works by scoring the 
# reference set against itself.
score(reference_results)

array([1., 1., 1.])

In [18]:
def evaluate_index(index_name, index_params, search_params):
    if len(collection.indexes): 
        print(f"Dropping current index on field: {collection.index().field_name} -> {collection.index().params}")
        collection.drop_index()
        print(f"Indexes remaining after dropping: {collection.indexes}")
    
    print(f"Creating new index: {index_name} with params {index_params}")
    # create new index: 
    index_params = {
        "index_type": index_name,
        "params":index_params,
        "metric_type": "L2"
    }

    start = time.time()
    collection.create_index("embedding", index_params)
    index_build_time = round((time.time()-start), 2)
    
    start = time.time()
    collection.load()
    print(f"Loading collection took {round((time.time()-start), 2)} seconds")
    
    search_param = {
        "data": search_vectors,
        "anns_field": "embedding",
        "param": search_params,
        "limit": 100,
    }
    
    start = time.time()
    search_results = collection.search(**search_param)
    search_time = round((time.time()-start), 2)
    
    collection.release()
    recall_score = score(search_results)
    
    return {
        "index_build_time": index_build_time, 
        "search_time": search_time, 
        "search_scores":{
            "recall@1": recall_score[0],
            "recall@10": recall_score[1],
            "recall@100": recall_score[2]
        }
    }

In [19]:
# Evaluate 
ivf_flat_eval = evaluate_index(
    index_name="IVF_FLAT", 
    index_params={"nlist": 1024}, 
    search_params={"metric_type": "L2", "params": {"nprobe": 16}}
)
ivf_flat_eval

Dropping current index on field: embedding -> {'index_type': 'FLAT', 'params': {}, 'metric_type': 'L2'}
Indexes remaining after dropping: []
Creating new index: IVF_FLAT with params {'nlist': 1024}
Took 248.51


{'index_build_time': 1359.0,
 'search_time': 1.15,
 'search_scores': {'recall@1': 1.0,
  'recall@10': 0.9660000000000001,
  'recall@100': 0.9248999999999999}}

In [20]:
# create index IVF_SQ8 - good mix of improved speed + reduced memory footprint
# NOTE: further investigate alternative indexing strategies: 
# HSNW/ANNOY for search time reduction
# Product Quantization and PCA dimensionality reduction to reduce memory footprint
ivf_sq8_eval = evaluate_index(
    index_name="IVF_SQ8", 
    index_params={"nlist": 1024}, 
    search_params={"metric_type": "L2", "params": {"nprobe": 16}}
)
ivf_sq8_eval

Dropping current index on field: embedding -> {'index_type': 'IVF_FLAT', 'params': {'nlist': 1024}, 'metric_type': 'L2'}
Indexes remaining after dropping: []
Creating new index: IVF_SQ8 with params {'nlist': 1024}
Took 7.29


{'index_build_time': 1196.98,
 'search_time': 1.44,
 'search_scores': {'recall@1': 1.0,
  'recall@10': 0.954,
  'recall@100': 0.9189999999999996}}

In [21]:
ivf_pq4_eval = evaluate_index(
    index_name="IVF_PQ", 
    index_params={"nlist":1024, "m": 128, "nbits":8}, 
    search_params={"metric_type": "L2", "params": {"nprobe":16}}
)
ivf_pq4_eval

Dropping current index on field: embedding -> {'index_type': 'IVF_SQ8', 'params': {'nlist': 1024}, 'metric_type': 'L2'}
Indexes remaining after dropping: []
Creating new index: IVF_PQ with params {'nlist': 1024, 'm': 128, 'nbits': 8}
Took 5.15


{'index_build_time': 2394.84,
 'search_time': 0.4,
 'search_scores': {'recall@1': 0.99,
  'recall@10': 0.7320000000000003,
  'recall@100': 0.7466999999999996}}

In [22]:
%%time
# Example hybrid search (with metadata filtering) and specifying output values
# Load collection into memory in order to be able to search
collection.load()

search_param = {
    "data": search_vectors,
    "anns_field": "embedding",
    "param": {"metric_type": "L2", "params": {"nprobe": 16}},
    "limit": 10,
    "expr": "subsite_name == \"Wet-A\"", 
    "output_fields": ["site_name", "subsite_name", "file_timestamp"]
}
start = time.time()
results = collection.search(**search_param)
search_time = round((time.time()-start), 2)
print(f"Search (with metadata filtering) took: {search_time}s")

for i, result in enumerate(results):
    print(f"Results for search vector {i}:")
    for j, res in enumerate(result):
        print(f"Top {j}: {res}")

Search (with metadata filtering) took: 0.39s
Results for search vector 0:
Top 0: id: 441336281742959784, distance: 3.5741782188415527, entity: {'site_name': 'Katarapko', 'subsite_name': 'Wet-A', 'file_timestamp': 1591101000}
Top 1: id: 441336281743121763, distance: 3.6820573806762695, entity: {'site_name': 'Calperum-Mallee', 'subsite_name': 'Wet-A', 'file_timestamp': 1587083400}
Top 2: id: 441336281742980133, distance: 3.9008371829986572, entity: {'site_name': 'Calperum-Mallee', 'subsite_name': 'Wet-A', 'file_timestamp': 1597033800}
Top 3: id: 441336281743293978, distance: 3.921776533126831, entity: {'site_name': 'Calperum-Mallee', 'subsite_name': 'Wet-A', 'file_timestamp': 1589833800}
Top 4: id: 441336281743585600, distance: 4.006672382354736, entity: {'site_name': 'Calperum-Mallee', 'subsite_name': 'Wet-A', 'file_timestamp': 1597033800}
Top 5: id: 441336281743143482, distance: 4.061380863189697, entity: {'site_name': 'Binya', 'subsite_name': 'Wet-A', 'file_timestamp': 1618992000}
Top

In [23]:
%%time
# Example hybrid search (with metadata filtering) and specifying output values
# Load collection into memory in order to be able to search
collection.load()

search_param = {
    "data": search_vectors,
    "anns_field": "embedding",
    "param": {"metric_type": "L2", "params": {"nprobe": 16}},
    "limit": 10,
    "output_fields": ["site_name", "subsite_name", "file_timestamp"]
}
start = time.time()
results = collection.search(**search_param)
search_time = round((time.time()-start), 2)
print(f"Search (withOUT metadata filtering) took: {search_time}s")

for i, result in enumerate(results):
    print(f"Results for search vector {i}:")
    for j, res in enumerate(result):
        print(f"Top {j}: {res}")

Search (withOUT metadata filtering) took: 0.47s
Results for search vector 0:
Top 0: id: 441336281742832975, distance: 0.9387864470481873, entity: {'site_name': 'Binya', 'subsite_name': 'Wet-B', 'file_timestamp': 1622923200}
Top 1: id: 441336281742832976, distance: 2.3995063304901123, entity: {'site_name': 'Binya', 'subsite_name': 'Wet-B', 'file_timestamp': 1622923200}
Top 2: id: 441336281742832985, distance: 2.9995627403259277, entity: {'site_name': 'Binya', 'subsite_name': 'Wet-B', 'file_timestamp': 1622923200}
Top 3: id: 441336281742774363, distance: 3.1652634143829346, entity: {'site_name': 'Tarcutta-Hills', 'subsite_name': 'Dry-B', 'file_timestamp': 1620597600}
Top 4: id: 441336281743335561, distance: 3.2277541160583496, entity: {'site_name': 'Tarcutta-Hills', 'subsite_name': 'Dry-B', 'file_timestamp': 1593734400}
Top 5: id: 441336281742975836, distance: 3.2718679904937744, entity: {'site_name': 'Tarcutta-Hills', 'subsite_name': 'Dry-B', 'file_timestamp': 1593734400}
Top 6: id: 441

In [24]:
# Misc helper functions
def drop_index(collection):
    collection.drop_index()
    print("\nDrop index sucessfully")

def release_collection(collection):
    collection.release()

def set_properties(collection):
    collection.set_properties(properties={"collection.ttl.seconds": 1800})

In [25]:
# Next steps: 
# - include data consumption (ram/disk) for each indexing stratgy
# - Add PCA pre-computation step to index evaluation 
