### Notebook to process benchmar results

Please run this notebook after running all the benchmarks and storing them in the `results` dir. This will export them in the desired format for the single node benchmark plots of [qdrant.tech/benchmarks](https://qdrant.tech/benchmarks)

In [256]:
from pathlib import Path
import re
import json
import pandas as pd
from datetime import datetime, timezone

In [257]:
DATA_DIR = Path().resolve().parent / "results"
DATA_DIR, list(DATA_DIR.glob("*.json"))[0].name

(PosixPath('/home/caiyd/work/vec/vector-db-benchmark/results'),
 'milvus-m-16-ef-128-glove-100-angular-search-9-2024-08-20-18-15-38.json')

In [258]:
PATH_REGEX = re.compile(r"(?P<engine_name>("
                        r"?P<engine>[a-z\-]+)"
                        r"\-m\-(?P<m>[0-9]+)"
                        r"\-ef\-(?P<ef>[0-9]+)"
                        r")"
                        r"\-(?P<dataset>[a-zA-Z0-9\-]+)"
                        r"\-(?P<operation>(search)|(upload))"
                        r"(\-(?P<search_index>[0-9]{1,2})\-)?"
                        r"\-?(?P<date>.*)\.json")

In [259]:
upload_results, search_results = [], []

for path in DATA_DIR.glob("*.json"):
    match = PATH_REGEX.match(path.name)
    if match is None:
        continue

    experiment = match.groupdict()

    with open(path, "r") as fp:
        stats = json.load(fp)

    params = stats["params"]
    dataset = params.pop("dataset")
    engine = params.pop("engine")

    entry = {
        "dataset": dataset,
        "engine": engine,
        "m": match["m"],
        "ef": match["ef"],
        "date": match["date"],
        "params": params,
        "results": stats["results"],
    }

    if experiment["operation"] == "search":
        entry.update({"search_index": match["search_index"]})
        search_results.append(entry)
    elif experiment["operation"] == "upload":
        upload_results.append(entry)
    else:
        raise Exception("Unknown operation")

len(upload_results), len(search_results)

(1, 15)

In [260]:
upload_results, search_results[0]

([{'dataset': 'glove-100-angular',
   'engine': 'milvus',
   'm': '16',
   'ef': '128',
   'date': '2024-08-20-17-49-05',
   'params': {'experiment': 'milvus-m-16-ef-128',
    'parallel': 16,
    'index_params': {'efConstruction': 128, 'M': 16}},
   'results': {'post_upload': {},
    'upload_time': 52.297504355199635,
    'total_time': 136.23506368184462}}],
 {'dataset': 'glove-100-angular',
  'engine': 'milvus',
  'm': '16',
  'ef': '128',
  'date': '2024-08-20-18-15-38',
  'params': {'experiment': 'milvus-m-16-ef-128',
   'parallel': 8,
   'config': {'ef': 128}},
  'results': {'total_time': 3.464979766868055,
   'mean_time': 0.0025059427607338875,
   'mean_precisions': 0.712674,
   'std_time': 0.0007535889975402178,
   'min_time': 0.0014076572842895985,
   'max_time': 0.017521413043141365,
   'rps': 2886.0197383024997,
   'p95_time': 0.003639045869931576,
   'p99_time': 0.005503611019812528},
  'search_index': '9'})

In [261]:
upload_df = pd.DataFrame(upload_results)
upload_df["date"] = pd.to_datetime(upload_df["date"], format="%Y-%m-%d-%H-%M-%S")
upload_df = upload_df.sort_values("date", ascending=False) \
    .groupby(["engine", "m", "ef", "dataset"]) \
    .first()

temp_df = upload_df.copy()
temp_df["total_time"] = temp_df["results"].apply(lambda x: x["total_time"])
temp_df.sort_values("total_time", ascending=True).head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,date,params,results,total_time
engine,m,ef,dataset,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
milvus,16,128,glove-100-angular,2024-08-20 17:49:05,"{'experiment': 'milvus-m-16-ef-128', 'parallel...","{'post_upload': {}, 'upload_time': 52.29750435...",136.235064


In [262]:
search_df = pd.DataFrame(search_results)
search_df["date"] = pd.to_datetime(search_df["date"], format="%Y-%m-%d-%H-%M-%S")
search_df = search_df.sort_values("date", ascending=False) \
    .groupby(["engine", "m", "ef", "dataset", "search_index"]) \
    .first()

temp_df = search_df.copy()
temp_df['rps'] = temp_df['results'].apply(lambda x: x["rps"])
temp_df.sort_values("rps", ascending=False).head(n=10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,date,params,results,rps
engine,m,ef,dataset,search_index,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
milvus,16,128,glove-100-angular,9,2024-08-20 18:15:38,"{'experiment': 'milvus-m-16-ef-128', 'parallel...","{'total_time': 3.464979766868055, 'mean_time':...",2886.019738
milvus,16,128,glove-100-angular,10,2024-08-20 18:15:57,"{'experiment': 'milvus-m-16-ef-128', 'parallel...","{'total_time': 4.260843075811863, 'mean_time':...",2346.953366
milvus,16,128,glove-100-angular,12,2024-08-20 18:16:38,"{'experiment': 'milvus-m-16-ef-128', 'parallel...","{'total_time': 4.3924609292298555, 'mean_time'...",2276.628105
milvus,16,128,glove-100-angular,14,2024-08-20 18:17:19,"{'experiment': 'milvus-m-16-ef-128', 'parallel...","{'total_time': 4.750134743284434, 'mean_time':...",2105.20344
milvus,16,128,glove-100-angular,13,2024-08-20 18:16:58,"{'experiment': 'milvus-m-16-ef-128', 'parallel...","{'total_time': 5.018590031657368, 'mean_time':...",1992.591532
milvus,16,128,glove-100-angular,11,2024-08-20 18:16:18,"{'experiment': 'milvus-m-16-ef-128', 'parallel...","{'total_time': 5.306716655381024, 'mean_time':...",1884.404359
milvus,16,128,glove-100-angular,6,2024-08-20 18:14:32,"{'experiment': 'milvus-m-16-ef-128', 'parallel...","{'total_time': 6.1497083911672235, 'mean_time'...",1626.093363
milvus,16,128,glove-100-angular,7,2024-08-20 18:14:55,"{'experiment': 'milvus-m-16-ef-128', 'parallel...","{'total_time': 7.403659658040851, 'mean_time':...",1350.683373
milvus,16,128,glove-100-angular,8,2024-08-20 18:15:19,"{'experiment': 'milvus-m-16-ef-128', 'parallel...","{'total_time': 8.594327203929424, 'mean_time':...",1163.558213
milvus,16,128,glove-100-angular,3,2024-08-20 18:13:11,"{'experiment': 'milvus-m-16-ef-128', 'parallel...","{'total_time': 13.03046287689358, 'mean_time':...",767.432446


In [263]:
_search = search_df.reset_index()
_upload = upload_df.reset_index()

joined_df = _search.merge(_upload, on=["engine", "m", "ef", "dataset"], how="left", suffixes=("_search", "_upload"))
print(len(joined_df))
joined_df

15


Unnamed: 0,engine,m,ef,dataset,search_index,date_search,params_search,results_search,date_upload,params_upload,results_upload
0,milvus,16,128,glove-100-angular,0,2024-08-20 18:12:00,"{'experiment': 'milvus-m-16-ef-128', 'parallel...","{'total_time': 14.422660088166595, 'mean_time'...",2024-08-20 17:49:05,"{'experiment': 'milvus-m-16-ef-128', 'parallel...","{'post_upload': {}, 'upload_time': 52.29750435..."
1,milvus,16,128,glove-100-angular,1,2024-08-20 18:12:18,"{'experiment': 'milvus-m-16-ef-128', 'parallel...","{'total_time': 17.706545684020966, 'mean_time'...",2024-08-20 17:49:05,"{'experiment': 'milvus-m-16-ef-128', 'parallel...","{'post_upload': {}, 'upload_time': 52.29750435..."
2,milvus,16,128,glove-100-angular,10,2024-08-20 18:15:57,"{'experiment': 'milvus-m-16-ef-128', 'parallel...","{'total_time': 4.260843075811863, 'mean_time':...",2024-08-20 17:49:05,"{'experiment': 'milvus-m-16-ef-128', 'parallel...","{'post_upload': {}, 'upload_time': 52.29750435..."
3,milvus,16,128,glove-100-angular,11,2024-08-20 18:16:18,"{'experiment': 'milvus-m-16-ef-128', 'parallel...","{'total_time': 5.306716655381024, 'mean_time':...",2024-08-20 17:49:05,"{'experiment': 'milvus-m-16-ef-128', 'parallel...","{'post_upload': {}, 'upload_time': 52.29750435..."
4,milvus,16,128,glove-100-angular,12,2024-08-20 18:16:38,"{'experiment': 'milvus-m-16-ef-128', 'parallel...","{'total_time': 4.3924609292298555, 'mean_time'...",2024-08-20 17:49:05,"{'experiment': 'milvus-m-16-ef-128', 'parallel...","{'post_upload': {}, 'upload_time': 52.29750435..."
5,milvus,16,128,glove-100-angular,13,2024-08-20 18:16:58,"{'experiment': 'milvus-m-16-ef-128', 'parallel...","{'total_time': 5.018590031657368, 'mean_time':...",2024-08-20 17:49:05,"{'experiment': 'milvus-m-16-ef-128', 'parallel...","{'post_upload': {}, 'upload_time': 52.29750435..."
6,milvus,16,128,glove-100-angular,14,2024-08-20 18:17:19,"{'experiment': 'milvus-m-16-ef-128', 'parallel...","{'total_time': 4.750134743284434, 'mean_time':...",2024-08-20 17:49:05,"{'experiment': 'milvus-m-16-ef-128', 'parallel...","{'post_upload': {}, 'upload_time': 52.29750435..."
7,milvus,16,128,glove-100-angular,2,2024-08-20 18:12:43,"{'experiment': 'milvus-m-16-ef-128', 'parallel...","{'total_time': 24.45869105728343, 'mean_time':...",2024-08-20 17:49:05,"{'experiment': 'milvus-m-16-ef-128', 'parallel...","{'post_upload': {}, 'upload_time': 52.29750435..."
8,milvus,16,128,glove-100-angular,3,2024-08-20 18:13:11,"{'experiment': 'milvus-m-16-ef-128', 'parallel...","{'total_time': 13.03046287689358, 'mean_time':...",2024-08-20 17:49:05,"{'experiment': 'milvus-m-16-ef-128', 'parallel...","{'post_upload': {}, 'upload_time': 52.29750435..."
9,milvus,16,128,glove-100-angular,4,2024-08-20 18:13:40,"{'experiment': 'milvus-m-16-ef-128', 'parallel...","{'total_time': 13.220554241910577, 'mean_time'...",2024-08-20 17:49:05,"{'experiment': 'milvus-m-16-ef-128', 'parallel...","{'post_upload': {}, 'upload_time': 52.29750435..."


In [264]:
json_results = []

for index, row in joined_df.reset_index().iterrows():
    engine_params = {}
    
    if isinstance(row['params_upload'], dict):
        engine_params.update(row['params_upload'])
    if isinstance(row['params_search'], dict):
        search_params = row['params_search']
        engine_params.update(search_params.get('config', {}))
        engine_params.update(search_params.get('params', {}))
        engine_params.update(search_params.get('search_params', {}))
        engine_params.update(search_params.get('vectorIndexConfig', {}))

    engine_params.pop('experiment')
    engine_params.pop('parallel')

    engine_name = row['engine']

    if engine_name.startswith("qdrant-"):
        engine_name = "qdrant"
    
    json_object = {
        "engine_name": engine_name,
        "setup_name": f"{row['params_search']['experiment']}",
        "dataset_name": row['dataset'],
        "search_idx": row['search_index'],
        "upload_time": row['results_upload']['upload_time'],
        "total_upload_time": row['results_upload']['total_time'],
        "p95_time": row['results_search']['p95_time'],
        "rps": row['results_search']['rps'],
        "parallel": row['params_search']['parallel'],
        "p99_time": row['results_search']['p99_time'],
        "mean_time": row['results_search']['mean_time'],
        "mean_precisions": row['results_search']['mean_precisions'],
        "engine_params": engine_params,
    }
    json_results.append(json_object)

format = '%Y-%M-%dT%H:%M:%S'
now = datetime.now().replace(tzinfo=timezone.utc).strftime(format)

Path(f"results.json").write_text(json.dumps(json_results, indent=2))
Path(f"results-{now}.json").write_text(json.dumps(json_results, indent=2))

print(json_results[-1], len(json_results))

results_df = pd.DataFrame(json_results).sort_values("p99_time", ascending=True)
# results_df.to_csv('results.csv')
results_df

{'engine_name': 'milvus', 'setup_name': 'milvus-m-16-ef-128', 'dataset_name': 'glove-100-angular', 'search_idx': '9', 'upload_time': 52.297504355199635, 'total_upload_time': 136.23506368184462, 'p95_time': 0.003639045869931576, 'rps': 2886.0197383024997, 'parallel': 8, 'p99_time': 0.005503611019812528, 'mean_time': 0.0025059427607338875, 'mean_precisions': 0.712674, 'engine_params': {'index_params': {'efConstruction': 128, 'M': 16}, 'ef': 128}} 15


Unnamed: 0,engine_name,setup_name,dataset_name,search_idx,upload_time,total_upload_time,p95_time,rps,parallel,p99_time,mean_time,mean_precisions,engine_params
0,milvus,milvus-m-16-ef-128,glove-100-angular,0,52.297504,136.235064,0.00168,693.353372,1,0.001972,0.001415,0.712674,"{'index_params': {'efConstruction': 128, 'M': ..."
8,milvus,milvus-m-16-ef-128,glove-100-angular,3,52.297504,136.235064,0.001739,767.432446,2,0.001974,0.00155,0.712674,"{'index_params': {'efConstruction': 128, 'M': ..."
1,milvus,milvus-m-16-ef-128,glove-100-angular,1,52.297504,136.235064,0.001989,564.762895,1,0.00216,0.001743,0.790302,"{'index_params': {'efConstruction': 128, 'M': ..."
9,milvus,milvus-m-16-ef-128,glove-100-angular,4,52.297504,136.235064,0.002202,756.39794,2,0.002433,0.001934,0.790302,"{'index_params': {'efConstruction': 128, 'M': ..."
11,milvus,milvus-m-16-ef-128,glove-100-angular,6,52.297504,136.235064,0.00205,1626.093363,4,0.002557,0.001822,0.712674,"{'index_params': {'efConstruction': 128, 'M': ..."
12,milvus,milvus-m-16-ef-128,glove-100-angular,7,52.297504,136.235064,0.002574,1350.683373,4,0.003016,0.002261,0.790302,"{'index_params': {'efConstruction': 128, 'M': ..."
7,milvus,milvus-m-16-ef-128,glove-100-angular,2,52.297504,136.235064,0.002822,408.852623,1,0.003079,0.002417,0.854088,"{'index_params': {'efConstruction': 128, 'M': ..."
10,milvus,milvus-m-16-ef-128,glove-100-angular,5,52.297504,136.235064,0.00311,637.924413,2,0.003319,0.002652,0.854088,"{'index_params': {'efConstruction': 128, 'M': ..."
13,milvus,milvus-m-16-ef-128,glove-100-angular,8,52.297504,136.235064,0.00358,1163.558213,4,0.004073,0.003068,0.854088,"{'index_params': {'efConstruction': 128, 'M': ..."
14,milvus,milvus-m-16-ef-128,glove-100-angular,9,52.297504,136.235064,0.003639,2886.019738,8,0.005504,0.002506,0.712674,"{'index_params': {'efConstruction': 128, 'M': ..."
