In [None]:
import gzip
import json
import os
import pprint
import subprocess
import time
from datetime import timedelta
from pathlib import Path

import numpy as np
import pandas as pd
import requests

In [None]:
# Name given to the Docker container (`docker run --name ...` / `docker stop ...`).
qdrant_name = "benchmark_qdrant"
# Host and host-side port used for every HTTP request in this notebook;
# run_qdrant() maps this port to the container's port 6333.
qdrant_host = "localhost"
qdrant_port = 6344
# Image tag suffix: the container is started from qdrant/qdrant:v<qdrant_version>.
qdrant_version = "1.8.4"

In [None]:
def get_dataset_config(target_name):
    """Look up the benchmark settings registered under ``target_name``.

    The known targets differ only in ``index_size``; every other value is
    shared. A fresh dict is built on each call.

    Args:
        target_name: one of the keys of ``index_sizes`` below.

    Returns:
        dict of settings, or ``None`` when ``target_name`` is not registered.
    """
    # NOTE: the target names say "m49" while hnsw_m is 48; both are kept as-is.
    index_sizes = {
        "100k-768-m49-ef100-ip": 100000,
        "1m-768-m49-ef100-ip": 1000000,
        "5m-768-m49-ef100-ip": 5000000,
    }
    if target_name not in index_sizes:
        return None

    dataset_dir = "dataset/passages-c400-jawiki-20230403"
    return {
        "content_path": dataset_dir,
        "embedding_path": f"{dataset_dir}/multilingual-e5-base-passage",
        "num_of_docs": 5555583,
        "index_size": index_sizes[target_name],
        "bulk_size": 1000,
        "index_name": "contents",
        "distance": "Dot",  # alternative: "Cosine"
        "dimension": 768,
        "hnsw_m": 48,
        "hnsw_ef_construction": 200,
        "hnsw_ef": 100,
    }

# Storage directory for Qdrant; in this notebook it is only referenced by the
# commented-out volume mount in run_qdrant().
volume_dir = os.getenv("VOLUME_DIR", "./data")

# Select the benchmark target from the environment and fail early, with a
# readable message, when the name is not registered in get_dataset_config().
target_config = os.getenv("TARGET_CONFIG", "100k-768-m49-ef100-ip")
dataset_config = get_dataset_config(target_config)
if dataset_config is None:
    raise ValueError(f"Unknown TARGET_CONFIG: {target_config!r}")
pprint.pprint(dataset_config)

# Dataset location and sizing.
content_path = Path(dataset_config.get("content_path"))
embedding_path = Path(dataset_config.get("embedding_path"))
num_of_docs = int(dataset_config.get("num_of_docs"))
index_size = int(dataset_config.get("index_size"))
bulk_size = int(dataset_config.get("bulk_size"))

# Collection and HNSW parameters.
index_name = dataset_config.get("index_name")
distance = dataset_config.get("distance")
dimension = int(dataset_config.get("dimension"))
hnsw_m = int(dataset_config.get("hnsw_m"))
hnsw_ef_construction = int(dataset_config.get("hnsw_ef_construction"))
hnsw_ef = int(dataset_config.get("hnsw_ef"))

# Accumulates the measurements that save_results() writes to results.json.
results = {}

In [None]:
def run_qdrant():
    """Start a detached Qdrant container and print ``[OK]`` or ``[FAIL]``.

    The container is named ``qdrant_name``, publishes ``qdrant_port`` on the
    host to port 6333 in the container, and runs the image
    ``qdrant/qdrant:v<qdrant_version>``. On failure the captured stdout and
    stderr of ``docker run`` are printed; nothing is raised or returned.
    """
    print(f"Starting {qdrant_name}... ", end="")
    port_mapping = f"{qdrant_port}:6333"
    image = f"qdrant/qdrant:v{qdrant_version}"
    # A "sudo" prefix and a "-v {volume_dir}:/qdrant/storage" mount were left
    # disabled in the original command; they stay disabled here.
    completed = subprocess.run(
        ["docker", "run", "-d", "--name", qdrant_name, "-p", port_mapping, image],
        capture_output=True,
        text=True,
    )
    if completed.returncode != 0:
        print("[FAIL]", "STDOUT:", completed.stdout, "STDERR:", completed.stderr, sep="\n")
        return
    print("[OK]")


In [None]:
def stop_qdrant():
    """Stop the container named ``qdrant_name`` and print ``[OK]`` or ``[FAIL]``.

    On failure the captured stdout and stderr of ``docker stop`` are printed;
    nothing is raised or returned.
    """
    print(f"Stopping {qdrant_name}... ", end="")
    # A "sudo" prefix was left disabled in the original command.
    completed = subprocess.run(
        ["docker", "stop", qdrant_name],
        capture_output=True,
        text=True,
    )
    if completed.returncode != 0:
        print("[FAIL]", "STDOUT:", completed.stdout, "STDERR:", completed.stderr, sep="\n")
        return
    print("[OK]")


In [None]:
def prune_docker():
    """Run ``docker system prune -f`` and print ``[OK]`` or ``[FAIL]``.

    The command is issued without any filter, so it is not restricted to
    resources created by this notebook. On failure the captured stdout and
    stderr are printed; nothing is raised or returned.
    """
    print("Cleaning up... ", end="")
    # A "sudo" prefix was left disabled in the original command.
    completed = subprocess.run(
        ["docker", "system", "prune", "-f"],
        capture_output=True,
        text=True,
    )
    if completed.returncode != 0:
        print("[FAIL]", "STDOUT:", completed.stdout, "STDERR:", completed.stderr, sep="\n")
        return
    print("[OK]")


In [None]:
def print_docker_system_df():
    """Print the output of ``docker system df``.

    Prints the command's stdout when it exits with 0, otherwise its stderr.
    """
    # A "sudo" prefix was left disabled in the original command.
    completed = subprocess.run(
        ["docker", "system", "df"],
        capture_output=True,
        text=True,
    )
    succeeded = completed.returncode == 0
    print(completed.stdout if succeeded else completed.stderr)


In [None]:
def print_docker_container_stats():
    """Print ``docker container stats --no-stream`` and parse it per container.

    The table is split on whitespace, so a complete row has 14 fields:
    id, name, CPU %, mem usage, "/", mem limit, MEM %, net in, "/", net out,
    block in, "/", block out, pids. Rows with fewer fields (for example rows
    where Docker prints "--" placeholders instead of "a / b" pairs) are
    skipped instead of raising ``IndexError``.

    Returns:
        dict mapping container name to a dict with the keys ``container_id``,
        ``cpu``, ``mem`` (the MEM % column), ``mem_usage``, ``mem_limit``,
        ``net_in``, ``net_out``, ``block_in``, ``block_out`` and ``pids``.
        All values are the raw strings from the table. The dict is empty when
        the command fails (its stderr is printed in that case).
    """
    expected_fields = 14
    # A "sudo" prefix was left disabled in the original command.
    docker_cmd = ["docker", "container", "stats", "--no-stream"]
    result = subprocess.run(docker_cmd, capture_output=True, text=True)
    containers = {}
    if result.returncode != 0:
        print(result.stderr)
        return containers

    print(result.stdout)
    for line in result.stdout.splitlines():
        if not line or line.startswith("CONTAINER"):
            continue  # blank line or header row
        values = line.split()
        if len(values) < expected_fields:
            continue  # incomplete row: nothing reliable to record
        containers[values[1]] = {
            "container_id": values[0],
            "cpu": values[2],
            "mem": values[6],
            "mem_usage": values[3],
            "mem_limit": values[5],
            "net_in": values[7],
            "net_out": values[9],
            "block_in": values[10],
            "block_out": values[12],
            "pids": values[13],
        }
    return containers


In [None]:
def create_index():
    """Create the benchmark collection and its payload indexes.

    Sends one PUT for the collection (vector size and distance, HNSW ``m`` and
    ``ef_construction``, int8 scalar quantization kept in RAM) followed by one
    PUT per payload index: integer indexes on ``page_id`` and ``rev_id``, a
    keyword index on ``section`` and full-text indexes on ``title`` and
    ``text``. Each request prints ``[OK]`` on HTTP 200 and otherwise
    ``[FAIL]`` plus the response body; failures do not stop later requests.
    """
    collection_url = f"http://{qdrant_host}:{qdrant_port}/collections/{index_name}"
    json_headers = {"Content-Type": "application/json"}

    def put_and_report(label, url, body):
        # One progress line per request: "<label>... [OK]" or "[FAIL]" + body.
        print(f"{label}... ", end="")
        response = requests.put(url, headers=json_headers, json=body)
        if response.status_code == 200:
            print("[OK]")
        else:
            print(f"[FAIL]\n{response.text}")

    put_and_report(
        f"Creating Collection {index_name}",
        collection_url,
        {
            "vectors": {
                "size": dimension,
                "distance": distance,
                "hnsw_config": {
                    "m": hnsw_m,
                    "ef_construction": hnsw_ef_construction,
                },
            },
            "quantization_config": {
                "scalar": {
                    "type": "int8",
                    "quantile": 0.99,
                    "always_ram": True,
                }
            },
        },
    )

    # NOTE(review): min_token_len == max_token_len == 2 presumably restricts
    # the text index to two-character tokens; confirm this is intended.
    # insert_data() currently leaves "title" and "text" out of the payload.
    text_schema = {
        "type": "text",
        "tokenizer": "word",
        "min_token_len": 2,
        "max_token_len": 2,
        "lowercase": True,
    }
    payload_indexes = [
        ("integer", "page_id", "integer"),
        ("integer", "rev_id", "integer"),
        ("keyword", "section", "keyword"),
        ("text", "title", text_schema),
        ("text", "text", text_schema),
    ]
    for kind, field_name, field_schema in payload_indexes:
        # The label names the field so the progress lines can be told apart.
        put_and_report(
            f"Creating Payload {kind}:{index_name}.{field_name}",
            f"{collection_url}/index",
            {
                "field_name": field_name,
                "field_schema": field_schema,
            },
        )


In [None]:
def delete_index():
    """Delete the benchmark collection and print the outcome.

    Prints ``[OK]`` on HTTP 200, otherwise ``[FAIL]`` followed by the
    response body. Nothing is raised or returned for a non-200 status.
    """
    print(f"Deleting Collection {index_name}... ", end="")
    collection_url = f"http://{qdrant_host}:{qdrant_port}/collections/{index_name}"
    response = requests.delete(collection_url)
    outcome = "[OK]" if response.status_code == 200 else f"[FAIL]\n{response.text}"
    print(outcome)


In [None]:
def print_indices():
    """Pretty-print the collection info and return its point count.

    Returns:
        ``{"num_of_docs": <points_count>}``. The value is ``None`` when the
        response carries no ``result`` object (for example an error response),
        instead of failing with ``AttributeError``.
    """
    response = requests.get(f"http://{qdrant_host}:{qdrant_port}/collections/{index_name}")
    obj = json.loads(response.text)
    pprint.pprint(obj)
    # Treat a missing or null "result" as an empty one.
    collection_info = obj.get("result") or {}
    return {
        "num_of_docs": collection_info.get("points_count"),
    }


In [None]:
def wait_for_qdrant(retry_count=60):
    """Poll the ``/cluster`` endpoint until it answers with HTTP 200.

    Makes up to ``retry_count`` attempts, printing a dot and sleeping one
    second after each unsuccessful one. Prints ``[OK]`` and returns as soon
    as an attempt succeeds, or prints ``[FAIL]`` once all attempts are used.

    Only request-level errors (connection refused, timeout, ...) are treated
    as "not ready yet"; ``KeyboardInterrupt`` and other unrelated exceptions
    propagate so the wait can be interrupted.

    Args:
        retry_count: maximum number of attempts.
    """
    print(f"Waiting for {qdrant_name}", end="")
    for _ in range(retry_count):
        try:
            # A timeout keeps one stalled connection from blocking the whole
            # retry loop.
            response = requests.get(f"http://{qdrant_host}:{qdrant_port}/cluster", timeout=5)
            if response.status_code == 200:
                print("[OK]")
                return
        except requests.exceptions.RequestException:
            pass  # server not reachable yet; retry below
        print(".", end="")
        time.sleep(1)
    print("[FAIL]")


In [None]:
def get_embedding(embedding_index, embedding_data, id):
    """Return the embedding for document ``id``, reusing the loaded shard.

    Embeddings are read from ``<embedding_path>/<shard_start>.npz`` (array
    ``embs``), where ``shard_start`` is ``id`` rounded down to a multiple of
    100000. The shard is loaded only when no data was passed in or the
    passed-in data belongs to a different shard.

    Args:
        embedding_index: shard start of ``embedding_data`` from the last call.
        embedding_data: embeddings array of that shard, or ``None``.
        id: document id; row ``id - shard_start`` of the shard is used.

    Returns:
        ``(shard_start, embedding_data, embedding)``. The embedding is divided
        by its L2 norm when the global ``distance`` is ``"Dot"``.
    """
    shard_start = int(id / 100000) * 100000
    shard_is_loaded = embedding_data is not None and embedding_index == shard_start
    if not shard_is_loaded:
        with np.load(embedding_path / f"{shard_start}.npz") as shard:
            embedding_data = shard["embs"]
    vector = embedding_data[id - shard_start]
    if distance == "Dot":
        vector = vector / np.linalg.norm(vector)
    return shard_start, embedding_data, vector


def insert_data(bulk_size, max_size):
    """Upsert up to ``max_size`` documents in batches of ``bulk_size`` points.

    Rows are read from the ``*.parquet`` files under ``content_path`` in
    sorted order. Each point gets a 1-based running id, the embedding returned
    by ``get_embedding`` for ``row.id``, and a payload holding ``page_id``,
    ``rev_id`` and ``section``. Reading stops as soon as ``max_size`` rows
    have been queued, so later parquet files are not loaded at all.

    After the last batch the function polls the collection once per second
    until its status has been ``"green"`` for 30 consecutive polls.
    NOTE: that wait has no upper bound.

    Args:
        bulk_size: number of points per upsert request.
        max_size: maximum number of documents to index.

    Returns:
        dict with ``execution_time`` (wall-clock seconds, including the final
        wait) and ``process_time`` (sum of the ``time`` values reported in the
        upsert responses).
    """
    start_time = time.time()
    collection_url = f"http://{qdrant_host}:{qdrant_port}/collections/{index_name}"

    ids = []
    vectors = []
    payloads = []

    def send_data(pos):
        # Upsert the currently buffered batch; returns the reported time
        # (0 when the response does not carry one).
        print(f"Sending {len(ids)} docs ({pos}/{max_size})... ", end="")
        response = requests.put(f"{collection_url}/points",
                                headers={"Content-Type": "application/json"},
                                params={
                                    "wait": "true",
                                },
                                data=json.dumps({
                                    "batch": {
                                        "ids": ids,
                                        "vectors": vectors,
                                        "payloads": payloads,
                                    }
                                }))
        try:
            t = json.loads(response.text).get("time")
        except ValueError:
            t = None  # body was not JSON (e.g. a proxy error page)
        if response.status_code == 200:
            print(f"[OK] {t}")
        else:
            print(f"[FAIL] {t} {response.status_code} {response.text}")
        return t if t is not None else 0

    total_time = 0
    count = 0
    embedding_index = -1
    embedding_data = None
    for content_file in sorted(content_path.glob("*.parquet")):
        if count >= max_size:
            break  # quota reached: skip reading the remaining files
        df = pd.read_parquet(content_file)
        for _, row in df.iterrows():
            if count >= max_size:
                break
            embedding_index, embedding_data, embedding = get_embedding(embedding_index, embedding_data, row.id)
            count += 1
            ids.append(count)
            vectors.append(embedding.tolist())
            payloads.append({
                "page_id": row.pageid,
                "rev_id": row.revid,
                # "title" and "text" are intentionally left out of the payload.
                "section": row.section,
            })
            if len(ids) >= bulk_size:
                total_time += send_data(count)
                # Rebind (rather than clear) so send_data sees fresh buffers.
                ids = []
                vectors = []
                payloads = []

    if len(ids) > 0:
        total_time += send_data(count)

    # Wait for the collection to stay "green" for 30 polls in a row; any
    # other status resets the streak.
    green_count = 0
    while green_count < 30:
        response = requests.get(collection_url)
        obj = json.loads(response.text)
        if (obj.get("result") or {}).get("status") == "green":
            green_count += 1
        else:
            green_count = 0  # reset
        print(".", end="")
        time.sleep(1)
    print(".")

    execution_time = time.time() - start_time
    hours, remainder = divmod(execution_time, 3600)
    minutes, seconds = divmod(remainder, 60)
    # Width 5 zero-pads the seconds ("05.23"); the old width of 2 never did.
    print(f"Execution Time: {int(hours):02d}:{int(minutes):02d}:{seconds:05.2f} ({timedelta(seconds=total_time)})")
    return {
        "execution_time": execution_time,
        "process_time": total_time,
    }


In [None]:
def search(query):
    """Run one search request against the collection.

    Args:
        query: JSON-serialisable body for ``.../points/search``.

    Returns:
        ``(took_ms, hit_count, ids, scores)`` where ``took_ms`` is the
        response's ``time`` field multiplied by 1000. On any failure (non-200
        status, or a body whose ``status`` is not ``"ok"``) the failure is
        printed and ``(-1, -1, [], [])`` is returned. Every path returns four
        values, matching what the caller unpacks.
    """
    response = requests.post(f"http://{qdrant_host}:{qdrant_port}/collections/{index_name}/points/search",
                             headers={"Content-Type": "application/json"},
                             json=query)

    if response.status_code != 200:
        print(f"[FAIL][{response.status_code}] {response.text}")
        return -1, -1, [], []

    obj = json.loads(response.text)
    if obj.get("status") != "ok":
        print(f"[FAIL] {response.text}")
        return -1, -1, [], []

    points = obj.get("result")
    point_ids = [point.get("id") for point in points]
    scores = [point.get("score") for point in points]
    return obj.get("time") * 1000, len(points), point_ids, scores


In [None]:
def search_with_knn_queries(output_path, max_size=10000, page_size=100, offset=0):
    """Send kNN queries built from stored embeddings and log each result.

    Query vectors come from ``<embedding_path>/<pos>.npz`` (array ``embs``),
    starting at ``pos = offset`` and advancing by 100000 per file; ``pos``
    wraps to 0 once it exceeds ``num_of_docs``. Vectors are L2-normalised
    when the global ``distance`` is ``"Dot"``. One JSON line per successful
    query is written to the gzip file ``output_path``, which is overwritten.
    The parent directory is created if it does not exist.

    Failed queries (``search`` returning ``took == -1``) are skipped and do
    not count towards ``max_size``.
    NOTE: if every query keeps failing this loop does not end by itself.

    Args:
        output_path: destination ``.jsonl.gz`` file.
        max_size: number of successful queries to record.
        page_size: ``limit`` sent with each query.
        offset: name (start index) of the first embeddings file to use.
    """
    print("Sending knn queries...")
    start_time = time.time()
    # gzip.open() does not create missing directories (e.g. "output/").
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    pos = offset
    count = 0
    running = True
    with gzip.open(output_path, "wt", encoding="utf-8") as f:
        while running:
            with np.load(embedding_path / f"{pos}.npz") as data:
                embedding_data = data["embs"]
            for embedding in embedding_data:
                if count >= max_size:
                    running = False
                    break
                if distance == "Dot":
                    embedding = embedding / np.linalg.norm(embedding)
                query = {
                    "vector": embedding.tolist(),
                    "limit": page_size,
                    "params": {
                        "hnsw_ef": hnsw_ef,
                    },
                }
                took, hits, ids, scores = search(query=query)
                if took == -1:
                    continue  # failed query: skip it without counting it
                result = {
                    "id": (count + 1),
                    "took": took,
                    "hits": hits,
                    "ids": ids,
                    "scores": scores,
                }
                f.write(json.dumps(result, ensure_ascii=False))
                f.write("\n")
                count += 1
                if count % 10000 == 0:
                    print(f"Sent {count}/{max_size} queries.")

            pos += 100000
            if pos > num_of_docs:
                pos = 0

    execution_time = time.time() - start_time
    hours, remainder = divmod(execution_time, 3600)
    minutes, seconds = divmod(remainder, 60)
    # Width 5 zero-pads the seconds ("05.23"); the old width of 2 never did.
    print(f"Execution Time: {int(hours):02d}:{int(minutes):02d}:{seconds:05.2f}")


In [None]:
def get_output_filename(qdrant_version, name):
    """Build the result file path for a Qdrant version and a run name.

    Dots in ``qdrant_version`` become underscores, giving
    ``output/qdrant<version>_<name>.jsonl.gz``.
    """
    version_tag = "_".join(qdrant_version.split("."))
    return f"output/qdrant{version_tag}_{name}.jsonl.gz"


In [None]:
def print_took_and_total_hits(filename):
    """Summarise the ``took`` and ``hits`` values of a query result file.

    Reads the gzip JSON-lines file ``filename``, prints
    ``DataFrame.describe()`` as a markdown table, and returns the statistics.

    Returns:
        dict with ``num_of_queries`` plus two summaries: ``took`` (mean, std,
        min, 25/50/75/90/99 % quantiles, max) and ``hits`` (mean, std, min,
        25/50/75 % quantiles, max). The ``hits`` summary is computed from the
        ``hits`` column; it used to repeat the ``took`` figures.
    """
    tooks = []
    hits = []
    with gzip.open(filename, "rt", encoding="utf-8") as f:
        for line in f:
            obj = json.loads(line)
            tooks.append(obj.get("took"))
            hits.append(obj.get("hits"))
    df = pd.DataFrame({"took": tooks, "hits": hits})
    print(df.describe().to_markdown())

    def summarize(column, quantiles):
        # mean/std/min, the requested quantiles in order, then max.
        stats = {
            "mean": column.mean(),
            "std": column.std(),
            "min": column.min(),
        }
        for label, q in quantiles:
            stats[label] = column.quantile(q)
        stats["max"] = column.max()
        return stats

    quartiles = [("25%", 0.25), ("50%", 0.5), ("75%", 0.75)]
    return {
        "num_of_queries": len(df),
        "took": summarize(df.took, quartiles + [("90%", 0.9), ("99%", 0.99)]),
        "hits": summarize(df.hits, quartiles),
    }


In [None]:
def save_results():
    """Write version, settings and collected results to ``results.json``.

    Values that the ``json`` module cannot encode are passed through
    ``to_jsonable``: NumPy scalars of any type become the matching Python
    scalar and NumPy arrays become lists. Anything else is still written as
    ``null``, as before (when only ``np.int64`` was converted).
    """
    def to_jsonable(value):
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        return None  # best effort: unknown types are stored as null

    with open("results.json", "wt", encoding="utf-8") as f:
        json.dump({
            "version": qdrant_version,
            "settings": dataset_config,
            "results": results,
        }, f, ensure_ascii=False, default=to_jsonable)


In [None]:
# Clean up Docker, start the Qdrant container and wait for its HTTP API.
# NOTE: prune_docker() runs `docker system prune -f` without any filter, so it
# is not limited to resources created by this notebook.
prune_docker()
print(f"<<<Qdrant {qdrant_version}>>>")
run_qdrant()
wait_for_qdrant()

In [None]:
# Baseline container and disk usage before the collection is created.
print_docker_container_stats()
print_docker_system_df()

In [None]:
# Create the collection and its payload indexes.
create_index()

In [None]:
# Container usage, collection info and disk usage right after collection
# creation, before any points are inserted.
print_docker_container_stats()
print_indices()
print_docker_system_df()

In [None]:
# Index up to index_size documents in batches of bulk_size and keep the
# timing summary returned by insert_data().
results["indexing"] = insert_data(bulk_size=bulk_size, max_size=index_size)

In [None]:
# Store the container stats captured right after indexing alongside the
# indexing timings, then show collection info and disk usage.
results["indexing"]["container"] = print_docker_container_stats()
print_indices()
print_docker_system_df()

In [None]:
# For each result-page size: run a warm-up pass, then the measured pass, and
# summarise the measured pass. Both passes write to the same file; the second
# call reopens it in "wt" mode, so only the measured results remain.
# The measured pass uses the default max_size and takes its query vectors
# from the embeddings file named after index_size (offset=index_size).
for page_size in [10, 100, 400]:
    print(f"page size: {page_size}")
    filename = get_output_filename(qdrant_version, f"knn_{page_size}")
    search_with_knn_queries(filename, page_size=page_size, max_size=1000) # warmup: 1000 queries from offset 0
    search_with_knn_queries(filename, page_size=page_size, offset=index_size)
    results[f"top_{page_size}"] = print_took_and_total_hits(filename)

In [None]:
# Container stats, collection info and disk usage after the search runs.
# NOTE(review): this assigns results["indexing"]["container"], the same key
# that was set right after indexing, so the post-indexing stats are replaced
# by post-search stats. Confirm whether a separate key was intended.
results["indexing"]["container"] = print_docker_container_stats()
print_indices()
print_docker_system_df()

In [None]:
# Persist the Qdrant version, the dataset settings and all collected results
# to results.json in the current working directory.
save_results()

In [None]:
# Tear down: delete the collection, then stop the container.
delete_index()
stop_qdrant()