In [1]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, OptimizersConfigDiff
import pandas as pd
import pyarrow.parquet as pq
from tqdm.notebook import tqdm
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the environment; the Qdrant URL
# and API key are read from the environment when the client is created.
load_dotenv()

True

In [20]:
# Connection settings come from the environment (QDRANT_URL / QDRANT_API_KEY)
qdrant_url = os.getenv("QDRANT_URL")
qdrant_api_key = os.getenv("QDRANT_API_KEY")

client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key)

# Listing the collections doubles as a quick connectivity check
client.get_collections()

CollectionsResponse(collections=[])

In [27]:
# Target collection name and the number of embedding components kept per vector
collection, dim = "library-v1", 256

In [28]:
# One vector of `dim` components per point, compared with cosine distance
vector_params = VectorParams(size=dim, distance=Distance.COSINE)
client.create_collection(collection_name=collection, vectors_config=vector_params)

True

In [29]:
# Settings for the streamed (batch-wise) upload
PARQUET_FILE = 'embeddings_results_conclusions_585k_cs1024_ov100_qw3-06B.parquet'
BATCH_SIZE = 5_000  # rows pulled from the parquet file per iteration

# Open the file and take the row count from its metadata
parquet_file = pq.ParquetFile(PARQUET_FILE)
total_rows = parquet_file.metadata.num_rows

# Ceiling division: a trailing partial batch still counts as one batch
n_batches = -(-total_rows // BATCH_SIZE)

print(
    f"Total rows to upload: {total_rows}",
    f"Batch size: {BATCH_SIZE}",
    f"Number of batches: {n_batches}",
    sep="\n",
)

Total rows to upload: 2703358
Batch size: 5000
Number of batches: 541


In [30]:
# Stream the parquet file batch by batch and upload each batch to Qdrant.
# `uploaded_count` tracks how many vectors were handed to the client in total.
uploaded_count = 0

for batch in tqdm(parquet_file.iter_batches(BATCH_SIZE), desc="Uploading batches", total=n_batches):

    # Only the current batch is converted to pandas; reset_index() turns the
    # frame's index into a regular column so it is carried into the payload
    batch_df = batch.to_pandas().reset_index()

    # Split off the embeddings and keep only their first `dim` components,
    # i.e. the vector size the collection was created with
    embeddings = batch_df.pop('embedding')
    vectors = [v[:dim] for v in embeddings.values]

    # One payload dict per row, built from the remaining columns.
    # to_dict(orient="records") does this in a single pass; the previous
    # iterrows() loop built a Series per row, which is slow and does not
    # preserve column dtypes across a row.
    payloads = batch_df.to_dict(orient="records")

    # Upload this batch
    client.upload_collection(
        collection,
        vectors=vectors,
        payload=payloads,
        parallel=4,  # worker count used for each call
        batch_size=100,  # points per request sent by the client
    )

    uploaded_count += len(vectors)

print(f"Successfully uploaded {uploaded_count} vectors")

Uploading batches:   0%|          | 0/541 [00:00<?, ?it/s]

  return self._bootstrap(parent_sentinel)


Successfully uploaded 2703358 vectors


In [26]:
# Inspection cell: in this notebook `batch_df` is only assigned inside the
# batch-upload loop, so that loop must have run before this cell.
batch_df.reset_index()

Unnamed: 0,openalex_id,chunk_idx,text
0,W4205471817,1,"shrub vegetation\n(i.e., Poaceae, Cyperaceae,..."
1,W4205471817,2,endorf and Freimuth (2017).\nA study by Feakin...
2,W4205471817,3,) and odd n-alkanes (b) in fog-affected Tillan...
3,W4205471817,4,Andrae et al. 2019).\nIt is thought that unde...
4,W4205471817,5,021-​01800-0.\n124:77–87. https://​doi.​org/​1...
...,...,...,...
3353,W4388621958,0,"RESULTS\nTo validate the model prediction, we..."
3354,W4388621958,1,region. In\n130 the second underestimated hot...
3355,W4388621958,2,effects of these features. It is furthermore\...
3356,W4388621958,3,ol pollution will like impact the production o...


## Old approach: only if the whole dataset fits in memory

In [6]:
# Switch indexing off (threshold 0) before the initial data is inserted
no_indexing = OptimizersConfigDiff(indexing_threshold=0)
client.update_collection(collection_name=collection, optimizers_config=no_indexing)

True

In [None]:
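# Uncomment ONE of the lines below to load the full dataset into `df`;
# the cells that follow read from `df`.
#df = pd.read_parquet('embeddings_results_conclusions_585k_cs1024_ov100_qw3-06B.parquet')
#df = pd.read_parquet('embeddings_abstracts_library_v1_cs1024_ov100_qw3-06B.parquet')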

In [31]:
# Work on the first N rows of `df`.
# NOTE(review): rows look like individual chunks (see `chunk_idx`), so this
# presumably caps the number of chunks, not of publications — confirm.
N = 500_000
sample = df.head(N)
# earlier variant, kept for reference:
#sample = df.loc[df.index[:N]].reset_index()

# pop() removes the 'embedding' column from `sample` and returns it
embeddings = sample.pop('embedding').values
len(sample)

500000

In [32]:
# Number of embedding components kept per vector. It follows `dim`, the size
# the collection was created with; the previous hard-coded 128 did not match
# that size.
D = dim

# The tqdm wrapper is not consumed here: the bar advances while the upload
# call iterates over the vectors.
vectors = tqdm([v[:D] for v in embeddings])

# One payload dict per row of `sample`; to_dict(orient="records") avoids the
# per-row Series construction of an iterrows() loop.
payloads = sample.to_dict(orient="records")

  0%|          | 0/500000 [00:00<?, ?it/s]

In [33]:
# Upload the prepared vectors with their payloads (parallel=6)
client.upload_collection(
    collection_name=collection,
    vectors=vectors,
    payload=payloads,
    parallel=6,
)

In [None]:
# Bulk load finished: switch indexing back on with a 20_000 threshold
indexing_on = OptimizersConfigDiff(indexing_threshold=20_000)
client.update_collection(collection_name=collection, optimizers_config=indexing_on)