In [2]:
import time
import json
import numpy as np
import faiss
from milvus import default_server
from pymilvus import (
    connections,
    FieldSchema, CollectionSchema, DataType,
    Collection,
    utility, 
    exceptions
)
from utils import extract_sample_data, split_into_batches

2023-05-22 14:54:59.529149: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
def _build_schema(dim):
    """Build the collection schema for embeddings of dimensionality ``dim``."""
    varchar_max_length = 1000
    fields = [
        FieldSchema(
            name="id",
            dtype=DataType.INT64,
            description="primary field",
            is_primary=True,
            auto_id=False,
        ),
        FieldSchema(
            name="embedding",
            dtype=DataType.FLOAT_VECTOR,
            description=f"Float32 vector with dim {dim}",
            dim=dim,
            is_primary=False,
        ),
        FieldSchema(
            name="file_timestamp",
            dtype=DataType.INT64,
            description="File timestamp (in seconds since 1970-01-01T00:00:00)",
        ),
        FieldSchema(
            name="offset",
            dtype=DataType.INT64,
            description="Offset (in seconds) from start of file where embedding window starts",
        ),
        FieldSchema(
            name="site_id",
            dtype=DataType.INT64,
            description="Site ID",
        ),
        FieldSchema(
            name="site_name",
            dtype=DataType.VARCHAR,
            description="Site name",
            max_length=varchar_max_length,
        ),
        FieldSchema(
            name="subsite_name",
            dtype=DataType.VARCHAR,
            description="Subsite name",
            max_length=varchar_max_length,
        ),
        FieldSchema(
            name="file_seq_id",
            dtype=DataType.INT64,
            description="File sequence ID",
        ),
        FieldSchema(
            name="filename",
            dtype=DataType.VARCHAR,
            max_length=varchar_max_length,
        ),
    ]
    return CollectionSchema(
        fields=fields,
        description="Collection for searching A2O bird embeddings",
    )


def prep_collection(data):
    """Return the ``a2o_bioacoustics`` collection populated with ``data`` and without an index.

    An existing collection is reused only when both its embedding
    dimensionality and its entity count match ``data``; any index on it is
    dropped (after releasing the collection) so the caller can build a new one.
    Otherwise the collection is dropped, re-created and ``data`` is ingested.

    Limitation: the reuse check cannot tell apart two datasets that share the
    same dimensionality and size. Drop the collection manually when switching
    between such datasets.

    Args:
        data: non-empty sequence of dict records with the keys ``id``,
            ``embedding``, ``file_timestamp``, ``offset``, ``site_id``,
            ``site_name``, ``subsite_name``, ``file_seq_id`` and ``filename``.

    Returns:
        The pymilvus ``Collection`` holding the records.

    Raises:
        ValueError: if ``data`` is empty (the embedding size is inferred from
            the first record).
    """
    if len(data) == 0:
        raise ValueError("prep_collection() needs at least one record to infer the embedding size")

    COLLECTION_NAME = "a2o_bioacoustics"
    dim = len(data[0]["embedding"])

    if utility.has_collection(COLLECTION_NAME):
        collection = Collection(COLLECTION_NAME)
        [embedding_field] = [f for f in collection.schema.fields if f.name == "embedding"]
        current_dim = embedding_field.params["dim"]
        # NOTE(review): num_entities is the count reported by Milvus; it is
        # assumed to be accurate here because ingestion below always flushes.
        current_size = collection.num_entities

        if current_dim == dim and current_size == len(data):
            if collection.indexes:
                # an index cannot be dropped while the collection is loaded
                collection.release()
                collection.drop_index()
            return collection

        if current_dim != dim:
            print(f"Collection embedding current size: {current_dim}, desired size: {dim}. Dropping and re-creating...")
        else:
            print(f"Collection currently holds {current_size} entities, expected {len(data)}. Dropping and re-creating...")
        collection.drop()

    schema = _build_schema(dim)
    collection = Collection(
        name=COLLECTION_NAME,
        data=None,
        schema=schema,
        # Set TTL to 0 to disable
        properties={"collection.ttl.seconds": 0}
    )

    # Column-wise insert: one list per field, in schema order.
    field_names = [field.name for field in schema.fields]

    # Insert in batches (10_000 records per the original note; the actual size
    # is decided inside split_into_batches).
    # TODO: find documentation on why this is necessary; inserting the entire
    # dataset at once made the kernel die.
    for _batch in split_into_batches(data):
        collection.insert(
            [[record[name] for record in _batch] for name in field_names]
        )

    collection.flush()
    return collection

In [4]:
# Milvus connection.
#
# The notebook talks either to a remote/deployed Milvus instance reached
# through kubectl port forwarding, or to a locally running Milvus Lite binary.
# To use the local Milvus Lite instance, uncomment the following lines:

# =============================================================================
# # clean up any lingering data in milvus server
# default_server.cleanup()
# # startup milvus server
# default_server.start()
# =============================================================================

# For the remote instance, run the port forwarding commands from the CLI first:
# $ kubectl port-forward service/milvus 9091:9091 & \
# $ kubectl port-forward service/milvus 19530:19530 &

# Address of the Milvus server (local binary or forwarded port)
HOST, PORT = "127.0.0.1", 19530

# Open the client connection (remember to close it when done)
connections.connect(host=HOST, port=PORT)
print("Connections: ", connections.list_connections())

Connections:  [('default', <pymilvus.client.grpc_handler.GrpcHandler object at 0x1034e1df0>)]


In [5]:
POINT_ONE_PERCENT_SAMPLE_DATA = extract_sample_data(file_ids=[2])

# Pick a fixed set of query vectors used to compare the different indexing
# strategies. The generator is seeded so that a Restart & Run All selects the
# same queries, and sampling is done without replacement so that no query is
# counted twice in the averaged recall.
N_SEARCH_VECTORS = 100
SEARCH_VECTORS_SEED = 42
_search_rng = np.random.default_rng(SEARCH_VECTORS_SEED)
SEARCH_VECTORS = [
    POINT_ONE_PERCENT_SAMPLE_DATA[i]["embedding"]
    for i in _search_rng.choice(
        len(POINT_ONE_PERCENT_SAMPLE_DATA),
        size=min(N_SEARCH_VECTORS, len(POINT_ONE_PERCENT_SAMPLE_DATA)),
        replace=False,
    )
]

print(f"Found {len(POINT_ONE_PERCENT_SAMPLE_DATA)} vectors to ingest")

2023-05-22 14:55:04.525693: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]


Processed data file: point_one_percent_embeddings/a2o_sample_embeddings-00002-of-00007, found 7740 records
Found 92793 vectors to ingest


In [6]:
%%time
index_build_params = {
    "index_type": "FLAT",
    "params":{},
    "metric_type": "L2"
}

collection = prep_collection(POINT_ONE_PERCENT_SAMPLE_DATA)

collection.create_index("embedding", index_build_params)

print(f"Created index {collection.index().params}")

Collection embedding current size: 512, desired size: 1280. Dropping and re-creating...
Created index {'index_type': 'FLAT', 'params': {}, 'metric_type': 'L2'}
CPU times: user 35.3 s, sys: 5.42 s, total: 40.7 s
Wall time: 6min 48s


In [7]:
%%time
collection.load()

search_params = {
    "data": SEARCH_VECTORS,
    "anns_field": "embedding",
    "param": {"metric_type": "L2", "params": {}},
    "limit": 100,
}

reference_results = collection.search(**search_params)
collection.release()

CPU times: user 105 ms, sys: 52.3 ms, total: 157 ms
Wall time: 33.1 s


In [8]:
# The result set behaves like a 2D list with dims (100, 100): one entry per
# query vector, each holding that query's 100 "closest" vectors.
# Show the first 5 hits of the first 5 queries.
print([top_hits[0:5] for top_hits in reference_results[0:5]])

# The containers are custom Milvus types rather than plain lists:
print(f"{type(reference_results)}[{type(reference_results[0])}[{type(reference_results[0][0])}]]")

[[id: 90303, distance: 0.0, entity: {}, id: 90304, distance: 1.558118224143982, entity: {}, id: 42219, distance: 1.5814385414123535, entity: {}, id: 40294, distance: 1.6141364574432373, entity: {}, id: 42220, distance: 1.6342560052871704, entity: {}], [id: 21401, distance: 0.0, entity: {}, id: 21407, distance: 1.191057562828064, entity: {}, id: 21400, distance: 1.1954035758972168, entity: {}, id: 58536, distance: 1.3918529748916626, entity: {}, id: 58534, distance: 1.4089027643203735, entity: {}], [id: 62133, distance: 0.0, entity: {}, id: 21602, distance: 1.6269677877426147, entity: {}, id: 70391, distance: 1.6901823282241821, entity: {}, id: 75983, distance: 1.7660950422286987, entity: {}, id: 3174, distance: 1.8164817094802856, entity: {}], [id: 24669, distance: 0.0, entity: {}, id: 58979, distance: 0.5075481534004211, entity: {}, id: 31824, distance: 0.5765305757522583, entity: {}, id: 30275, distance: 0.6378152966499329, entity: {}, id: 31820, distance: 0.6456799507141113, entity:

In [9]:
def score_recall(search, reference, at=100):
    """Return recall@``at`` of one query's hits against its reference hits.

    Args:
        search: object exposing the retrieved ids as ``.ids`` (best hit first).
        reference: object exposing the reference ids as ``.ids``.
        at: cutoff; only the first ``at`` ids of each side are compared.

    Returns:
        Fraction (0.0 - 1.0) of the first ``at`` retrieved ids that also occur
        in the first ``at`` reference ids. The denominator is always ``at``,
        even when fewer ids are available.
    """
    # Build the lookup once instead of re-slicing and scanning per retrieved id.
    reference_ids = set(reference.ids[:at])
    matches = sum(1 for found_id in search.ids[:at] if found_id in reference_ids)
    return matches / at
    
# returns average recall@1, recall@10 and recall@100
# for each search result (compared to a reference result set)
def score(search_results, reference=None):
    """Average recall@1, recall@10 and recall@100 over all queries.

    Args:
        search_results: per-query hits to evaluate, in query order.
        reference: per-query reference hits in the same query order. Defaults
            to the module-level ``reference_results``; pass it explicitly when
            the queries differ from the ones used to build that global,
            otherwise the recall values are meaningless.

    Returns:
        numpy array ``[recall@1, recall@10, recall@100]``.
    """
    if reference is None:
        reference = reference_results

    cutoffs = (1, 10, 100)
    scores = [
        [score_recall(search, ref, at=cutoff) for cutoff in cutoffs]
        # the search results class should be iterable, however something
        # was affecting the order of the ids when zipping the search
        # result class, so they are first converted to a List[Hits].
        for search, ref in zip(list(search_results), list(reference))
    ]
    return np.mean(np.array(scores), axis=0)

# Sanity check of the scoring method: scoring the reference set against
# itself should give a recall of 1.0 at every cutoff.
score(reference_results)

array([1., 1., 1.])

In [42]:
def calculate_memory_consumption(data, index_name, index_build_params):
    """Estimate the memory footprint of an index, in bytes.

    Formulas (ref: https://github.com/milvus-io/milvus/discussions/18719#discussioncomment-3428862):
        FLAT:     nb * d * 4
        IVF_FLAT: (nb + nlist) * d * 4
        IVF_SQ8:  (nb * d) + (nlist * d * 4)
        IVF_PQ:   (nb * m * nbits/8) + 2**nbits * d * 4 + nlist * d * 4

    Args:
        data: records; ``len(data)`` is the vector count and the length of the
            first record's ``"embedding"`` is the dimensionality.
        index_name: "FLAT", "IVF_FLAT", "IVF_SQ8" or "IVF_PQ".
        index_build_params: build parameters; ``nlist`` is read for the IVF
            indexes, plus ``m`` and ``nbits`` for IVF_PQ.

    Returns:
        Estimated size as an ``int`` number of bytes, or ``-1`` when
        ``index_name`` is not one of the supported index types.
    """
    nb = len(data)  # number of vectors
    d = len(data[0]["embedding"])  # dimensionality

    if index_name == "FLAT":
        return nb * d * 4

    if index_name == "IVF_FLAT":
        nlist = index_build_params["nlist"]
        return (nb + nlist) * d * 4

    if index_name == "IVF_SQ8":
        nlist = index_build_params["nlist"]
        return (nb * d) + (nlist * d * 4)

    if index_name == "IVF_PQ":
        nlist = index_build_params["nlist"]
        m = index_build_params["m"]
        nbits = index_build_params["nbits"]
        # Each vector is stored as m codes of nbits bits, rounded up to whole
        # bytes; integer arithmetic keeps the result an int like the other
        # branches (the previous nbits/8 produced a float).
        code_bytes = (m * nbits + 7) // 8
        return (nb * code_bytes) + ((2 ** nbits) * d * 4) + (nlist * d * 4)

    # unknown index type: no estimate available
    return -1
    
def evaluate_index(index_name, index_build_params, search_params, data, search_vectors):
    """Build one index configuration on ``data`` and measure it.

    Steps: prepare the collection, build the index, load the collection, run
    one search with ``search_vectors`` (top 100 hits each), release the
    collection, then score the hits.

    Args:
        index_name: Milvus index type, e.g. "IVF_FLAT".
        index_build_params: index build parameters, e.g. ``{"nlist": 1024}``.
        search_params: ``param`` argument of the search call (metric type plus
            search-time parameters).
        data: records to ingest (see ``prep_collection``).
        search_vectors: query vectors.

    Returns:
        dict with ``index_build_time``, ``collection_load_time`` and
        ``search_time`` (seconds, rounded to 2 decimals), ``search_scores``
        (recall@1/10/100) and ``memory_consumption`` (estimated bytes).

    Note:
        Recall is computed by ``score()`` against the module-level
        ``reference_results``. It is only meaningful when those were produced
        with the same query vectors, in the same order, over the same records.
    """
    result = {}

    collection = prep_collection(data)

    # perf_counter is monotonic, so the intervals are immune to clock changes
    start = time.perf_counter()
    collection.create_index(
        "embedding",
        {
            "index_type": index_name,
            "params": index_build_params,
            "metric_type": "L2"
        }
    )
    result["index_build_time"] = round(time.perf_counter() - start, 2)

    start = time.perf_counter()
    collection.load()
    result["collection_load_time"] = round(time.perf_counter() - start, 2)

    try:
        start = time.perf_counter()
        search_results = collection.search(
            data=search_vectors,
            anns_field="embedding",
            param=search_params,
            limit=100
        )
        result["search_time"] = round(time.perf_counter() - start, 2)
    finally:
        # release even when the search fails, so the collection does not stay
        # loaded on the server
        collection.release()

    recalls = score(search_results)
    # score() returns the averages for the cutoffs 1, 10 and 100, in that order
    result["search_scores"] = {
        f"recall@{cutoff}": round(float(recall), 4)
        for cutoff, recall in zip((1, 10, 100), recalls)
    }

    result["memory_consumption"] = calculate_memory_consumption(data, index_name, index_build_params)

    return result

In [10]:
# Experiment grid: for every PCA target dimensionality (0 = no reduction)
# evaluate FLAT, IVF_FLAT, IVF_SQ8 and a sweep of IVF_PQ settings.
pca_dims_opts = [0, 64, 128, 256, 512]
index_configs = []
for pca_dims in pca_dims_opts:

    # IVF_PQ sub-vector counts: fixed list for the raw embeddings, otherwise
    # 1/8, 1/4 and 1/2 of the reduced dimensionality
    if not pca_dims:
        m_opts = [32, 64, 128, 256]
    else:
        m_opts = [pca_dims // 8, pca_dims // 4, pca_dims // 2]

    index_configs.append(
        {
            "index_name": "FLAT",
            "index_build_params": {},
            "search_params": {"metric_type": "L2", "params": {}},
            "pca_dims": pca_dims,
        }
    )
    index_configs += [
        {
            "index_name": ivf_name,
            "index_build_params": {"nlist": 1024},
            "search_params": {"metric_type": "L2", "params": {"nprobe": 16}},
            "pca_dims": pca_dims,
        }
        for ivf_name in ("IVF_FLAT", "IVF_SQ8")
    ]
    index_configs += [
        {
            "index_name": "IVF_PQ",
            "index_build_params": {"nlist": 1024, "m": _m, "nbits": nbits},
            "search_params": {"metric_type": "L2", "params": {"nprobe": 16}},
            "pca_dims": pca_dims,
        }
        for _m in m_opts
        for nbits in (4, 8)
    ]
len(index_configs)

47

In [14]:
#with open("experiments.json", "w") as f: 
#    f.write(json.dumps(index_configs))

In [60]:
# Run every experiment that has no stored result yet. Results are checkpointed
# to experiments.json after each experiment, so an interrupted run can be
# resumed by re-running this cell.
try:
    with open("experiments.json", "r") as f:
        index_configs = json.load(f)
except FileNotFoundError:
    # first run: no checkpoint yet, use the configs built earlier in the notebook
    print("experiments.json not found, starting from the in-memory index_configs")

for index_config in index_configs:
    print("CONFIG: ", index_config)
    result = index_config.get("result")
    if not result:

        data = POINT_ONE_PERCENT_SAMPLE_DATA
        search_vectors = SEARCH_VECTORS

        pca_train_apply_time = None

        if index_config["pca_dims"] > 0:
            start = time.time()

            embeddings = np.asarray([d["embedding"] for d in data], dtype=np.float32)

            # input dimensionality comes from the data instead of a hard-coded 1280
            pca_matrix = faiss.PCAMatrix(embeddings.shape[1], index_config["pca_dims"])
            pca_matrix.train(embeddings)

            reduced_embeddings = pca_matrix.apply(embeddings)

            # same records, with the embedding swapped for its reduced version
            data = [
                {**metadata, "embedding": reduced_embedding}
                for metadata, reduced_embedding in zip(data, reduced_embeddings)
            ]

            # the queries must be projected with the same PCA matrix
            search_vectors = pca_matrix.apply(np.asarray(search_vectors, dtype=np.float32))

            pca_train_apply_time = round(time.time() - start, 2)

        _params = {k: v for k, v in index_config.items() if k in ["index_name", "index_build_params", "search_params"]}

        result = evaluate_index(
            **_params,
            data=data,
            search_vectors=search_vectors
        )
        # `is not None` so that a timing that rounds to 0.0 is still recorded
        if pca_train_apply_time is not None:
            result["pca_train_apply_time"] = pca_train_apply_time

        index_config["result"] = result

        # serialize before opening the file: a serialization error must not
        # truncate the results gathered so far
        serialized_configs = json.dumps(index_configs)
        with open("experiments.json", "w") as f:
            f.write(serialized_configs)

    print("RESULT: ", result)
    print("\n")


CONFIG:  {'index_name': 'FLAT', 'index_build_params': {}, 'search_params': {'metric_type': 'L2', 'params': {}}, 'pca_dims': 0, 'result': {'index_build_time': 0.68, 'collection_load_time': 29.13, 'search_time': 8.68, 'search_scores': {'recall@1': 1.0, 'recall@10': 1.0, 'recall@100': 1.0}, 'memory_consumption': 475100160}}
RESULT:  {'index_build_time': 0.68, 'collection_load_time': 29.13, 'search_time': 8.68, 'search_scores': {'recall@1': 1.0, 'recall@10': 1.0, 'recall@100': 1.0}, 'memory_consumption': 475100160}


CONFIG:  {'index_name': 'IVF_FLAT', 'index_build_params': {'nlist': 1024}, 'search_params': {'metric_type': 'L2', 'params': {'nprobe': 16}}, 'pca_dims': 0, 'result': {'index_build_time': 155.49, 'collection_load_time': 25.28, 'search_time': 0.99, 'search_scores': {'recall@1': 0.99, 'recall@10': 0.961, 'recall@100': 0.8838}, 'memory_consumption': 480343040}}
RESULT:  {'index_build_time': 155.49, 'collection_load_time': 25.28, 'search_time': 0.99, 'search_scores': {'recall@1': 0

RESULT:  {'index_build_time': 8.78, 'collection_load_time': 18.7, 'search_time': 0.22, 'search_scores': {'recall@1': 0.97, 'recall@10': 0.471, 'recall@100': 0.5504}, 'memory_consumption': 1008584.0, 'pca_train_apply_time': 3.21}


CONFIG:  {'index_name': 'IVF_PQ', 'index_build_params': {'nlist': 1024, 'm': 16, 'nbits': 8}, 'search_params': {'metric_type': 'L2', 'params': {'nprobe': 16}}, 'pca_dims': 64}
RESULT:  {'index_build_time': 40.11, 'collection_load_time': 17.46, 'search_time': 0.11, 'search_scores': {'recall@1': 1.0, 'recall@10': 0.539, 'recall@100': 0.597}, 'memory_consumption': 1812368.0, 'pca_train_apply_time': 3.18}


CONFIG:  {'index_name': 'IVF_PQ', 'index_build_params': {'nlist': 1024, 'm': 32, 'nbits': 4}, 'search_params': {'metric_type': 'L2', 'params': {'nprobe': 16}}, 'pca_dims': 64}
RESULT:  {'index_build_time': 9.14, 'collection_load_time': 18.2, 'search_time': 0.11, 'search_scores': {'recall@1': 1.0, 'recall@10': 0.533, 'recall@100': 0.5872}, 'memory_consumption':

RESULT:  {'index_build_time': 224.87, 'collection_load_time': 21.58, 'search_time': 0.22, 'search_scores': {'recall@1': 1.0, 'recall@10': 0.82, 'recall@100': 0.7891}, 'memory_consumption': 13188224.0, 'pca_train_apply_time': 4.28}


CONFIG:  {'index_name': 'FLAT', 'index_build_params': {}, 'search_params': {'metric_type': 'L2', 'params': {}}, 'pca_dims': 512}
Collection embedding current size: 256, desired size: 512. Dropping and re-creating...
RESULT:  {'index_build_time': 0.71, 'collection_load_time': 19.5, 'search_time': 2.5, 'search_scores': {'recall@1': 1.0, 'recall@10': 0.917, 'recall@100': 0.9274}, 'memory_consumption': 190040064, 'pca_train_apply_time': 4.48}


CONFIG:  {'index_name': 'IVF_FLAT', 'index_build_params': {'nlist': 1024}, 'search_params': {'metric_type': 'L2', 'params': {'nprobe': 16}}, 'pca_dims': 512}
RESULT:  {'index_build_time': 38.63, 'collection_load_time': 20.43, 'search_time': 0.34, 'search_scores': {'recall@1': 1.0, 'recall@10': 0.898, 'recall@100': 0.8329

In [93]:
# Render the stored experiment results as a markdown table.
with open("experiments.json", "r") as f:
    results = json.load(f)

# The first experiment is the reference for the relative memory column.
reference = results[0]
reference_memory = reference["result"]["memory_consumption"]

headers = [
    "Index Type",
    "PCA Dim. Reduction",
    "Build Params",
    "Recall (%)",
    "Memory (MB)",
    "Memory (% of reference index)",
    "Build Time (s)",
    "Search Time (s)",
    "Load Time (s)"
]
print(f"| {' | '.join(headers)} | ")
print(f"| {' | '.join(' --- ' for _ in headers)} |")
for config in results:
    metrics = config.get("result")
    if not metrics:
        # experiment not run yet: nothing to report
        continue

    memory_consumption = metrics["memory_consumption"]  # bytes
    memory_ratio = memory_consumption / reference_memory
    cells = [
        config["index_name"],
        config["pca_dims"] or "",
        ", ".join(k + ":" + str(v) for k, v in config["index_build_params"].items()),
        round(metrics["search_scores"]["recall@100"] * 100, 2),
        round(memory_consumption / 10**6, 2),
        round(memory_ratio * 100, 2),
        metrics["index_build_time"],
        metrics["search_time"],
        metrics["collection_load_time"],
    ]
    print("|" + " | ".join(str(cell) for cell in cells) + " |")
    
    

| Index Type | PCA Dim. Reduction | Build Params | Recall (%) | Memory (Mb) | Memory (% of reference index) | Build Time (s) | Search Time (s) | Load Time (s) | 
|  ---  |  ---  |  ---  |  ---  |  ---  |  ---  |  ---  |  ---  |  ---  |
|FLAT |  |  | 100.0 | 475.1 | 100.0 | 0.68 | 8.68 | 29.13 |
|IVF_FLAT |  | nlist:1024 | 88.38 | 480.34 | 101.1 | 155.49 | 0.99 | 25.28 |
|IVF_SQ8 |  | nlist:1024 | 88.01 | 124.02 | 26.1 | 153.71 | 0.88 | 20.22 |
|IVF_PQ |  | nlist:1024, m:32, nbits:4 | 47.87 | 6.81 | 1.43 | 143.62 | 0.57 | 17.32 |
|IVF_PQ |  | nlist:1024, m:32, nbits:8 | 59.35 | 9.52 | 2.0 | 431.66 | 0.85 | 20.16 |
|IVF_PQ |  | nlist:1024, m:64, nbits:4 | 54.95 | 8.29 | 1.75 | 145.09 | 1.87 | 15.77 |
|IVF_PQ |  | nlist:1024, m:64, nbits:8 | 67.45 | 12.49 | 2.63 | 707.45 | 0.78 | 14.59 |
|IVF_PQ |  | nlist:1024, m:128, nbits:4 | 63.2 | 11.26 | 2.37 | 147.51 | 0.72 | 22.36 |
|IVF_PQ |  | nlist:1024, m:128, nbits:8 | 76.45 | 18.43 | 3.88 | 866.46 | 1.42 | 15.32 |
|IVF_PQ |  | nlist:1024, m:

In [99]:
# Evaluate IVF_SQ8 on the data returned by extract_sample_data() without a
# file_ids filter. The data and the query vectors are defined here so that the
# cell also runs on a fresh kernel.
_data = extract_sample_data()

# select a set of vectors to use as search vectors (seeded for
# reproducibility, without replacement so that no query is repeated)
_search_vectors = [
    _data[i]["embedding"]
    for i in np.random.default_rng(42).choice(
        len(_data), size=min(100, len(_data)), replace=False
    )
]

# NOTE(review): evaluate_index() scores recall against the module-level
# reference_results, which were produced with SEARCH_VECTORS over
# POINT_ONE_PERCENT_SAMPLE_DATA. Those are different queries, so the
# "search_scores" of this result are not meaningful; only the timings and the
# memory estimate are.
result = evaluate_index(
    index_name="IVF_SQ8",
    index_build_params={"nlist": 1024},
    search_params={"metric_type": "L2", "params": {"nprobe": 16}},
    data=_data,
    search_vectors=_search_vectors,
)

In [102]:
# Display the metrics of the evaluation above.
# NOTE(review): the search_scores were computed by score() against the global
# reference_results, which come from a different set of query vectors
# (SEARCH_VECTORS), so the near-zero recall is not a property of the index.
result

{'index_build_time': 1050.89,
 'collection_load_time': 30.38,
 'search_time': 2.93,
 'search_scores': {'recall@1': 0.0, 'recall@10': 0.0, 'recall@100': 0.0001},
 'memory_consumption': 1122675200}