In [9]:
import sys

sys.path.append("../")

import json

import numpy as np

from qdrant_client import QdrantClient
from qdrant_client.http import models

from config import CohereConfig, QdrantConfig, Paths

In [10]:
data_path = Paths.data / "handbook_merged" / "handbook_with_embeddings.json"

with open(data_path, "r") as f:
    data = json.load(f)

In [11]:
client = QdrantClient(
    host=QdrantConfig.host, api_key=QdrantConfig.api_key, port=QdrantConfig.port
)

In [16]:
client.recreate_collection(
    collection_name=QdrantConfig.collection_name, 
    vectors_config=models.VectorParams(size=CohereConfig.vector_size, distance=models.Distance.COSINE),
)

my_collection_info = client.http.collections_api.get_collection(QdrantConfig.collection_name)
print(my_collection_info.dict())

{'time': 1.9264e-05, 'status': 'ok', 'result': {'status': <CollectionStatus.GREEN: 'green'>, 'optimizer_status': <OptimizersStatusOneOf.OK: 'ok'>, 'vectors_count': 0, 'indexed_vectors_count': 0, 'points_count': 0, 'segments_count': 2, 'config': {'params': {'vectors': {'size': 4096, 'distance': <Distance.COSINE: 'Cosine'>}, 'shard_number': 1, 'replication_factor': 1, 'write_consistency_factor': 1, 'on_disk_payload': True}, 'hnsw_config': {'m': 16, 'ef_construct': 100, 'full_scan_threshold': 10000, 'max_indexing_threads': 0, 'on_disk': False, 'payload_m': None}, 'optimizer_config': {'deleted_threshold': 0.2, 'vacuum_min_vector_number': 1000, 'default_segment_number': 0, 'max_segment_size': None, 'memmap_threshold': None, 'indexing_threshold': 20000, 'flush_interval_sec': 5, 'max_optimization_threads': 1}, 'wal_config': {'wal_capacity_mb': 32, 'wal_segments_ahead': 0}}, 'payload_schema': {}}}


In [17]:
limit = len(data) + 1
ids = [i for i in range(len(data))][:limit]
docs = [{k: v for k, v in row.items() if k not in ("vector", "elements")} for row in data][:limit]
vectors = [np.array(d["vector"]) for d in data][:limit]

In [19]:
# insert data in batches of 500
for i in range(18500, len(ids), 500):
    print(f"Inserting batch {i} to {i + 500} of {len(ids)}")
    client.upsert(
        collection_name=QdrantConfig.collection_name,
        points=models.Batch(
            ids=ids[i : i + 500],
            payloads=docs[i : i + 500],
            vectors=[v.tolist() for v in vectors[i : i + 500]],
        ),
    )

Inserting batch 18500 to 19000 of 26384
Inserting batch 19000 to 19500 of 26384
Inserting batch 19500 to 20000 of 26384
Inserting batch 20000 to 20500 of 26384
Inserting batch 20500 to 21000 of 26384
Inserting batch 21000 to 21500 of 26384
Inserting batch 21500 to 22000 of 26384
Inserting batch 22000 to 22500 of 26384
Inserting batch 22500 to 23000 of 26384
Inserting batch 23000 to 23500 of 26384
Inserting batch 23500 to 24000 of 26384
Inserting batch 24000 to 24500 of 26384
Inserting batch 24500 to 25000 of 26384
Inserting batch 25000 to 25500 of 26384
Inserting batch 25500 to 26000 of 26384
Inserting batch 26000 to 26500 of 26384
