In [151]:
from qdrant_client import QdrantClient, models
from openai import OpenAI
import os
from dotenv import load_dotenv
from typing import Union, Set, List, Dict
import json
import time

In [56]:
# Host and port passed to QdrantClient when the shared client is created.
QDRANT_HOST = "localhost"
QDRANT_PORT = 6333
# Collection targeted by the query helpers; reassigned further down to switch collections.
COLLECTION_NAME = 'arxiv_papers'
# Number of results requested per query; also the denominator used by precision_k.
k = 10
# File name of the JSON holding the query embeddings (the same name get_test_data_set opens).
QUERIES_FILE = "queries_embeddings.json"

In [38]:
# Shared Qdrant client used by the query helpers and the collection updates in this notebook.
client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)

In [39]:
# Path of the .env file holding the credentials (relative to the working directory)
env_path = os.path.expanduser("./.env")

# Load environment variables from the .env file; override=True lets values from the file
# replace variables that are already set in the process environment
load_dotenv(dotenv_path=env_path, override=True)

# Read environment variables (each one is None when the variable is not set)
OPENAI_API_KEY: str | None = os.environ.get("OPENAI_API_KEY")
HF_API_KEY: str | None = os.environ.get("HF_API_KEY")
# Backward-compatible alias: this value used to be stored under the misspelled name HF_API_KEX
HF_API_KEX: str | None = HF_API_KEY
BASE_URL: str | None = os.environ.get("BASE_URL")

In [40]:
def get_embedding(text: str) -> Union[List[float], None]:
    """
    Request the embedding of ``text`` from the OpenAI embeddings API.

    A new OpenAI client is created on every call from the OPENAI_API_KEY environment
    variable, and newlines in the text are replaced by spaces before the request is sent.

    :param text: Text to embed.
    :return: The embedding of the text, or None when the request raised an exception
             (the error message is printed).
    """
    openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    single_line_text = text.replace("\n", " ")
    try:
        reply = openai_client.embeddings.create(
            input=[single_line_text],
            model="text-embedding-ada-002",
        )
        # Exactly one input was sent, so the first data entry carries its embedding.
        return reply.data[0].embedding
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

In [48]:
def precision_k(ann_results: Set, exact_results: Set):
    return len(ann_results.intersection(exact_results)) / k

In [59]:
def get_ann_points(embedding: List) -> tuple[list, float]:
    """
    Query the collection named by the global COLLECTION_NAME without any explicit
    search parameters and time the request.

    :param embedding: Query vector.
    :return: Tuple ``(ids, elapsed_seconds)`` where ``ids`` holds the ``'id'`` payload
             field of every returned point, in result order.
    """
    # perf_counter is a monotonic high-resolution clock, so the measured interval is not
    # affected by system clock adjustments and does not collapse to 0 on coarse wall clocks.
    start_time_ann = time.perf_counter()
    ann_result = client.query_points(
        collection_name=COLLECTION_NAME,
        query=embedding,
        limit=k
    ).points
    ann_time = time.perf_counter() - start_time_ann
    ids = [res.payload['id'] for res in ann_result]
    return ids, ann_time

In [127]:
def get_hnsw_points(embedding: List, hnsw_ef: int, k: int = 10) -> tuple[list, float]:
    """
    Query the collection named by the global COLLECTION_NAME with an explicit ``hnsw_ef``
    search parameter and time the request.

    :param embedding: Query vector.
    :param hnsw_ef: Value forwarded to Qdrant as ``SearchParams(hnsw_ef=...)``.
    :param k: Maximum number of points to return (this parameter shadows the global ``k``).
    :return: Tuple ``(ids, elapsed_seconds)`` where ``ids`` holds the ``'id'`` payload
             field of every returned point, in result order.
    """
    # perf_counter is a monotonic high-resolution clock, better suited to measuring
    # short intervals than the wall-clock time.time().
    start_time_hnsw = time.perf_counter()
    hnsw_result = client.query_points(
        collection_name=COLLECTION_NAME,
        query=embedding,
        limit=k,
        search_params=models.SearchParams(hnsw_ef=hnsw_ef)
    ).points
    hnsw_time = time.perf_counter() - start_time_hnsw
    ids = [res.payload['id'] for res in hnsw_result]
    return ids, hnsw_time

In [133]:
def get_knn_points(embedding: List) -> tuple[list, float]:
    """
    Query the collection named by the global COLLECTION_NAME with
    ``SearchParams(exact=True)`` and time the request. The result serves as the
    reference set for the precision computations in this notebook.

    :param embedding: Query vector.
    :return: Tuple ``(ids, elapsed_seconds)`` where ``ids`` holds the ``'id'`` payload
             field of every returned point, in result order.
    """
    # perf_counter is a monotonic high-resolution clock, better suited to measuring
    # short intervals than the wall-clock time.time().
    start_time_knn = time.perf_counter()
    knn_result = client.query_points(
        collection_name=COLLECTION_NAME,
        query=embedding,
        limit=k,
        search_params=models.SearchParams(exact=True),
    ).points
    knn_time = time.perf_counter() - start_time_knn
    ids = [res.payload['id'] for res in knn_result]
    return ids, knn_time

In [50]:
def result_formatting(k, avg_precision, avg_ann_time, avg_knn_time):
    """
    Print a three-line summary of an evaluation run.

    :param k: Cut-off shown in the precision label.
    :param avg_precision: Average precision, printed with four decimals.
    :param avg_ann_time: Average ANN query time in seconds (printed in milliseconds).
    :param avg_knn_time: Average exact k-NN query time in seconds (printed in milliseconds).
    """
    ann_ms = avg_ann_time * 1000
    knn_ms = avg_knn_time * 1000
    summary_lines = (
        f'Average precision@{k}: {avg_precision:.4f}',
        f'Average ANN query time: {ann_ms:.2f} ms',
        f'Average exact k-NN query time: {knn_ms:.2f} ms',
    )
    for line in summary_lines:
        print(line)

In [44]:
def get_test_data_set() -> str:
    with open("queries_embeddings.json", 'r', encoding='utf-8') as file:
        test_dataset = json.load(file)
        return test_dataset

In [58]:
# Load the query embeddings and run every query through both search helpers.
embeddings = get_test_data_set()

ann_results = [get_ann_points(query_vector) for query_vector in embeddings.values()]
knn_results = [get_knn_points(query_vector) for query_vector in embeddings.values()]

# One (precision@k, knn_time / ann_time) pair per query; each run is an (ids, seconds) tuple.
results = [
    (precision_k(set(ann_run[0]), set(knn_run[0])), knn_run[1] / ann_run[1])
    for ann_run, knn_run in zip(ann_results, knn_results)
]

precisions, times = zip(*results)
avg_precision = sum(precisions) / len(precisions)
avg_ratio = sum(times) / len(times)

print(f"Avg Precision@k: {avg_precision:.4f}")
print(f"Avg execution time ratio (knn vs. ann): {avg_ratio:.4f}")

Avg Precision@k: 0.9980
Avg execution time ratio (knn vs. ann): 6.0790


In [162]:
def evaluate_hnsw_ef(hnsw_ef_values: List[int] | None = None):
    """
    Measure precision@k and query latency of the HNSW search for several ``hnsw_ef`` values.

    Every query vector in the global ``embeddings`` is searched once with
    ``get_knn_points`` (reference) and once per ``hnsw_ef`` value with ``get_hnsw_points``.

    :param hnsw_ef_values: ``hnsw_ef`` values to sweep. Defaults to ``[10, 20, 50, 100, 200]``.
                           Previously this list was immediately overwritten by a leftover
                           ``[10]``; pass ``[10]`` to reproduce that single-value run.
    :return: One dict per ``hnsw_ef`` value with the keys ``hnsw_ef``, ``avg_precision``
             and ``avg_query_time_ms``.
    :raises ValueError: If ``embeddings`` contains no query vectors.
    """
    if hnsw_ef_values is None:
        hnsw_ef_values = [10, 20, 50, 100, 200]

    query_vectors = list(embeddings.values())
    if not query_vectors:
        raise ValueError("no query embeddings available to evaluate")

    # The reference results do not depend on hnsw_ef, so they are computed once.
    knn_results = [get_knn_points(vector) for vector in query_vectors]

    results_list = []
    for hnsw_ef in hnsw_ef_values:
        # Pass the global k explicitly so the result size matches the reference search
        # and the denominator used by precision_k.
        hnsw_results = [get_hnsw_points(vector, hnsw_ef, k=k) for vector in query_vectors]

        result_hnsw = [
            (precision_k(set(hnsw_id_list), set(knn_id_list)), hnsw_execution_time)
            for (hnsw_id_list, hnsw_execution_time), (knn_id_list, _)
            in zip(hnsw_results, knn_results)]

        prec, hnsw_exe_time = zip(*result_hnsw)
        avg_precision = sum(prec) / len(prec)
        # Timings are collected in seconds; report milliseconds.
        avg_query_time_ms = sum(hnsw_exe_time) / len(hnsw_exe_time) * 1000

        results_list.append({
            "hnsw_ef": hnsw_ef,
            "avg_precision": avg_precision,
            "avg_query_time_ms": avg_query_time_ms
        })

    return results_list

In [181]:
def evaluate_ann():
    """
    Compare ``get_ann_points`` against ``get_knn_points`` for every query vector in the
    global ``embeddings``.

    :return: Dict with ``avg_precision`` (mean precision@k over all queries) and
             ``avg_query_time_ms`` (mean ``get_ann_points`` latency in milliseconds).
    """
    ann_results = [get_ann_points(query_vector) for query_vector in embeddings.values()]
    knn_results = [get_knn_points(query_vector) for query_vector in embeddings.values()]

    # Pair each ANN run with the reference run of the same query.
    scored = []
    for (ann_ids, ann_seconds), (knn_ids, _knn_seconds) in zip(ann_results, knn_results):
        scored.append((precision_k(set(ann_ids), set(knn_ids)), ann_seconds))

    precisions, exec_time = zip(*scored)

    return {
        "avg_precision": sum(precisions) / len(precisions),
        "avg_query_time_ms": sum(exec_time) / len(exec_time) * 1000,
    }

In [174]:
# Run the hnsw_ef evaluation; the resulting list of dicts is what the DataFrame cell tabulates.
evaluation = evaluate_hnsw_ef()

In [95]:
# Change the HNSW settings of the collection named by COLLECTION_NAME to m=16, ef_construct=32.
client.update_collection(
    collection_name=COLLECTION_NAME,
    hnsw_config=models.HnswConfigDiff(m=16, ef_construct=32)
)

True

In [101]:
# Change the HNSW settings of the collection named by COLLECTION_NAME to m=160, ef_construct=500.
# NOTE(review): m=160 is ten times the m=16 used in the preceding update cell — confirm this
# is intended and not a typo.
client.update_collection(
    collection_name=COLLECTION_NAME,
    hnsw_config=models.HnswConfigDiff(m=160, ef_construct=500)
)

True

In [182]:
# The query helpers read the global COLLECTION_NAME at call time, so reassigning it
# redirects evaluate_ann() to this collection.
# NOTE(review): the suffix presumably encodes m=8, ef_construct=100 — confirm.
COLLECTION_NAME = 'arxiv_papers_8_100'
evaluation_8_100 = evaluate_ann()

In [183]:
evaluation_8_100

{'avg_precision': 0.997, 'avg_query_time_ms': 4.1156005859375}

In [184]:
# The query helpers read the global COLLECTION_NAME at call time, so reassigning it
# redirects evaluate_ann() to this collection.
# NOTE(review): the suffix presumably encodes m=8, ef_construct=50 — confirm.
COLLECTION_NAME = 'arxiv_papers_8_50'
evaluation_8_50 = evaluate_ann()

In [185]:
evaluation_8_50

{'avg_precision': 0.99, 'avg_query_time_ms': 3.6455631256103516}

In [186]:
# The query helpers read the global COLLECTION_NAME at call time, so reassigning it
# redirects evaluate_ann() to this collection.
# NOTE(review): the suffix presumably encodes m=16, ef_construct=32 — confirm.
COLLECTION_NAME = 'arxiv_papers_16_32'
evaluation_16_32 = evaluate_ann()

In [187]:
evaluation_16_32

{'avg_precision': 0.993, 'avg_query_time_ms': 3.522627353668213}

In [188]:
# The query helpers read the global COLLECTION_NAME at call time, so reassigning it
# redirects evaluate_ann() to this collection.
# NOTE(review): the suffix presumably encodes m=16, ef_construct=50 — confirm.
COLLECTION_NAME = 'arxiv_papers_16_50'
evaluation_16_50 = evaluate_ann()

In [189]:
evaluation_16_50

{'avg_precision': 0.996, 'avg_query_time_ms': 3.9110779762268066}

In [152]:
def compute_avg_metrics(data: List[Dict]) -> Dict[str, float]:
    """
    Average every numeric metric whose key starts with 'avg' over a list of result dicts.

    The divisor is always the number of dictionaries in ``data``, including entries in
    which a given key is missing or holds a non-numeric value.

    :param data: List of dictionaries with metrics.
    :return: Mapping from each 'avg*' key to its average; empty when ``data`` is empty.
    """
    if not data:
        return {}

    entry_count = len(data)
    sums: Dict[str, float] = {}

    for record in data:
        for name, metric in record.items():
            if not name.startswith('avg'):
                continue
            if not isinstance(metric, (int, float)):
                continue
            sums[name] = sums.get(name, 0.0) + metric

    averages = {}
    for name, running_sum in sums.items():
        averages[name] = running_sum / entry_count
    return averages


In [190]:
[evaluation_8_100, evaluation_8_50, evaluation_16_32, evaluation_16_50]

[{'avg_precision': 0.997, 'avg_query_time_ms': 4.1156005859375},
 {'avg_precision': 0.99, 'avg_query_time_ms': 3.6455631256103516},
 {'avg_precision': 0.993, 'avg_query_time_ms': 3.522627353668213},
 {'avg_precision': 0.996, 'avg_query_time_ms': 3.9110779762268066}]

In [172]:
# Average the 'avg*' metrics of each evaluation result separately.
# NOTE(review): compute_avg_metrics iterates its argument and calls .items() on every entry,
# so each item here must be a list of dicts. evaluate_ann() returns a single dict; if the
# evaluation_* variables hold evaluate_ann() results this cell raises AttributeError. The
# output shown presumably comes from an earlier kernel state — confirm.
[compute_avg_metrics(item) for item in [evaluation_8_100, evaluation_8_50, evaluation_16_32, evaluation_16_50]]

[{'avg_precision': 0.935, 'avg_query_time_ms': 3.2199811935424805},
 {'avg_precision': 0.925, 'avg_query_time_ms': 3.1613779067993164},
 {'avg_precision': 0.948, 'avg_query_time_ms': 3.6258912086486816},
 {'avg_precision': 0.966, 'avg_query_time_ms': 3.345036506652832}]

In [86]:
import pandas as pd

In [87]:
# Tabulate the evaluation results, tagging every row with m=8 and ef_construct=100
df = pd.DataFrame([dict(entry, m=8, ef_construct=100) for entry in evaluation])
# Keep six decimals in the two metric columns so the table is easier to read
df[['avg_precision', 'avg_query_time_ms']] = df[['avg_precision', 'avg_query_time_ms']].round(6)
df

Unnamed: 0,hnsw_ef,avg_precision,avg_query_time_ms,m,ef_construct
0,10,0.933,4.029753,8,100
1,20,0.969,3.52042,8,100
2,50,0.99,4.272552,8,100
3,100,0.996,4.632151,8,100
4,200,0.998,5.858958,8,100


In [191]:
# Search parameters for quantized search with rescoring enabled and an oversampling factor of 2.0.
# NOTE(review): no function visible in this notebook reads this variable —
# get_ann_points_quantized builds its own SearchParams with rescore=False.
search_params = models.SearchParams(
    quantization=models.QuantizationSearchParams(
        rescore=True,
        oversampling=2.0,
    )
)

In [202]:
# Point the query helpers back at the 'arxiv_papers' collection (they read this global at call time).
COLLECTION_NAME = 'arxiv_papers'

In [201]:
def get_knn_points_ignoring_quantization(embedding: List) -> tuple[list, float]:
    """
    Run an exact search that bypasses quantized vectors and time the request.

    The result is used as the reference set when evaluating quantized ANN search, so the
    query asks for ``exact=True`` in addition to ``quantization.ignore=True``. Without
    ``exact=True`` the search would still go through the approximate HNSW index and would
    not be a true k-NN baseline.

    :param embedding: Query vector.
    :return: Tuple ``(ids, elapsed_seconds)`` where ``ids`` holds the ``'id'`` payload
             field of every returned point, in result order.
    """
    # perf_counter is a monotonic high-resolution clock, better suited to measuring
    # short intervals than the wall-clock time.time().
    start_time_knn = time.perf_counter()
    knn_result = client.query_points(
        collection_name=COLLECTION_NAME,
        query=embedding,
        limit=k,
        search_params=models.SearchParams(
            exact=True,
            quantization=models.QuantizationSearchParams(ignore=True)  # ignore is False by default
        ),
    ).points
    knn_time = time.perf_counter() - start_time_knn
    ids = [res.payload['id'] for res in knn_result]
    return ids, knn_time

In [206]:
def get_ann_points_quantized(embedding: List, rescore: bool = False, oversampling: float = 2.0) -> tuple[list, float]:
    """
    Query the collection named by the global COLLECTION_NAME with explicit quantization
    search parameters and time the request.

    :param embedding: Query vector.
    :param rescore: Forwarded to ``QuantizationSearchParams(rescore=...)``. Defaults to
                    False, the value that was previously hard-coded.
    :param oversampling: Forwarded to ``QuantizationSearchParams(oversampling=...)``.
                         Defaults to 2.0, the value that was previously hard-coded.
    :return: Tuple ``(ids, elapsed_seconds)`` where ``ids`` holds the ``'id'`` payload
             field of every returned point, in result order.
    """
    # perf_counter is a monotonic high-resolution clock, better suited to measuring
    # short intervals than the wall-clock time.time().
    start_time_ann = time.perf_counter()
    ann_result = client.query_points(
        collection_name=COLLECTION_NAME,
        query=embedding,
        limit=k,
        search_params=models.SearchParams(
            quantization=models.QuantizationSearchParams(
                rescore=rescore,
                oversampling=oversampling,
            )
        )
    ).points
    ann_time = time.perf_counter() - start_time_ann
    ids = [res.payload['id'] for res in ann_result]
    return ids, ann_time

In [203]:
def evaluate_ann_quantized():
    """
    Compare ``get_ann_points_quantized`` against ``get_knn_points_ignoring_quantization``
    for every query vector in the global ``embeddings``.

    :return: Dict with ``avg_precision`` (mean precision@k over all queries) and
             ``avg_query_time_ms`` (mean ``get_ann_points_quantized`` latency in milliseconds).
    """
    quantized_runs = [get_ann_points_quantized(query_vector) for query_vector in embeddings.values()]
    reference_runs = [get_knn_points_ignoring_quantization(query_vector) for query_vector in embeddings.values()]

    # Pair each quantized run with the reference run of the same query.
    scored = []
    for (quantized_ids, quantized_seconds), (reference_ids, _reference_seconds) in zip(quantized_runs, reference_runs):
        scored.append((precision_k(set(quantized_ids), set(reference_ids)), quantized_seconds))

    precisions, exec_time = zip(*scored)

    return {
        "avg_precision": sum(precisions) / len(precisions),
        "avg_query_time_ms": sum(exec_time) / len(exec_time) * 1000,
    }

In [204]:
evaluate_ann_quantized()

{'avg_precision': 1.0, 'avg_query_time_ms': 13.554525375366211}

In [205]:
evaluate_ann()

{'avg_precision': 1.0, 'avg_query_time_ms': 3.635427951812744}

In [207]:
evaluate_ann_quantized()


{'avg_precision': 0.835, 'avg_query_time_ms': 5.193431377410889}