In [1]:
!pip install pypdf sentence-transformers qdrant-client tqdm python-dotenv

Collecting pypdf
  Downloading pypdf-6.4.0-py3-none-any.whl.metadata (7.1 kB)
Collecting sentence-transformers
  Using cached sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Collecting qdrant-client
  Downloading qdrant_client-1.16.1-py3-none-any.whl.metadata (11 kB)
Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting python-dotenv
  Using cached python_dotenv-1.2.1-py3-none-any.whl.metadata (25 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.9.1-cp311-none-macosx_11_0_arm64.whl.metadata (30 kB)
Collecting scikit-learn (from sentence-transformers)
  Using cached scikit_learn-1.7.2-cp311-cp311-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting scipy (from sentence-transformers)
  Using cached scipy-1.16.3-cp311-cp311-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting hugging

In [None]:
import os
import hashlib
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient, models
from tqdm import tqdm
import os 
from dotenv import load_dotenv

# Read key=value pairs from a local .env file (if one exists) into the process
# environment so that settings can be supplied without editing the notebook.
load_dotenv()



# Directory that is scanned for input PDF files.
PDF_DIR = "downloaded"
# Name of the Qdrant collection that receives one point per PDF page.
COLLECTION_NAME = "astro_physics_papers"
# Connection settings are read from the environment (populated from .env by the
# load_dotenv() call above) so real credentials never need to be written into the
# notebook. The placeholder defaults keep the previous literal values when the
# variables are not set.
QDRANT_URL = os.getenv("QDRANT_URL", "YOUR_QDRANT_URL_HERE")
QDRANT_KEY = os.getenv("QDRANT_API_KEY", "YOUR_QDRANT_API_KEY_HERE")
# Identifier of the sentence-embedding model handed to SentenceTransformer.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# Load model
# Leaving `device` unset lets sentence-transformers pick the best available backend
# itself and fall back to CPU when no accelerator is present. The previous
# device="gpu" is not a valid torch device string, so the constructor always raised
# and the bare `except:` silently forced CPU (while also hiding real load errors).
model = SentenceTransformer(EMBEDDING_MODEL)
# Report the device that was actually selected instead of assuming one.
print("Using CPU" if model.device.type == "cpu" else "Using GPU")

# Client for the Qdrant instance described by QDRANT_URL / QDRANT_KEY.
qdrant = QdrantClient(url=QDRANT_URL, api_key=QDRANT_KEY)

# Create the target collection only when it is not already listed by the server.
# The vector size is taken from the loaded model so the two cannot drift apart.
existing_collections = {c.name for c in qdrant.get_collections().collections}
if COLLECTION_NAME not in existing_collections:
    vector_params = models.VectorParams(
        size=model.get_sentence_embedding_dimension(),
        distance=models.Distance.COSINE,
    )
    qdrant.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=vector_params,
    )

def extract_pages(pdf_path):
    """Read a PDF and return its text page by page.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``PdfReader``.

    Returns:
        A list of ``(page_number, text)`` tuples in document order, with page
        numbers starting at 1. A page whose ``extract_text()`` result is falsy
        (``None`` or an empty string) is still included, with ``""`` as its text.
    """
    pages = []
    for page_number, page in enumerate(PdfReader(pdf_path).pages, start=1):
        content = page.extract_text()
        pages.append((page_number, content if content else ""))
    return pages

def deterministic_id(filename, page_number):
    """Derive a reproducible integer id from a file name and a page number.

    The id is the SHA-256 digest of ``"<filename>-<page_number>"`` (UTF-8 encoded),
    read as a big-endian integer and reduced modulo 2**63.

    Args:
        filename: Name of the source document.
        page_number: Page number within that document.

    Returns:
        An ``int`` in the range ``[0, 2**63)``; equal inputs always give equal ids.
    """
    digest = hashlib.sha256(f"{filename}-{page_number}".encode()).digest()
    return int.from_bytes(digest, "big") % (1 << 63)

# --------------- Batching setup ---------------
# Number of pages collected before they are embedded and uploaded together.
BATCH_SIZE = 64
# Staging list of (point id, vector, payload) tuples; emptied by flush_buffer().
buffer: list = list()

def flush_buffer():
    """Upsert every staged point into the collection, then empty the buffer.

    Reads the module-level ``buffer`` of ``(id, vector, payload)`` tuples and
    returns immediately when it is empty. The buffer is cleared only after
    ``qdrant.upsert`` has returned, so its contents are kept if the call raises.
    """
    if len(buffer) == 0:
        return

    points = []
    for point_id, vector, payload in buffer:
        points.append(
            models.PointStruct(id=point_id, vector=vector, payload=payload)
        )

    qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
    buffer.clear()


# ----------------------------------------------
# PROCESS ALL PDFs
# ----------------------------------------------

def _embed_and_upload(ids, texts, payloads):
    """Embed one batch of page texts and upsert the resulting points.

    The three lists are parallel: ``ids[i]``, ``texts[i]`` and ``payloads[i]``
    describe the same page. An empty batch is a no-op.
    """
    if not texts:
        return
    embeddings = model.encode(texts).tolist()
    buffer.extend(zip(ids, embeddings, payloads))
    flush_buffer()


# Filter and sort up front so the progress bar counts only PDFs and the
# processing order is reproducible (os.listdir returns entries in arbitrary order).
pdf_files = sorted(
    name for name in os.listdir(PDF_DIR) if name.lower().endswith(".pdf")
)

for filename in tqdm(pdf_files):
    # tqdm.write prints the message without breaking the progress bar rendering.
    tqdm.write(f"Processing: {filename}")

    texts, ids, payloads = [], [], []

    for page_number, text in extract_pages(os.path.join(PDF_DIR, filename)):
        # Pages without any extractable text are not embedded.
        if not text.strip():
            continue

        ids.append(deterministic_id(filename, page_number))
        texts.append(text)
        payloads.append({
            "document": filename,
            "page": page_number,
            "text": text
        })

        # Upload as soon as a full batch has been collected, then start a new one.
        if len(texts) == BATCH_SIZE:
            _embed_and_upload(ids, texts, payloads)
            texts, ids, payloads = [], [], []

    # Leftover pages of this document (fewer than BATCH_SIZE).
    _embed_and_upload(ids, texts, payloads)

# Safety net: every batch is flushed as it is produced, so this is normally a no-op.
flush_buffer()
print("DONE — All documents uploaded.")
