## Imports

In [12]:
import os
import gc
from pathlib import Path
from datasets import load_dataset
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams

## Loading Dataset

In [5]:
# Configuration — every tunable for the notebook lives in this cell.

# Dataset source
DATASET_NAME = "florin-hf/wiki_dump2018_nq_open"
TEXT_COLUMN = "text"  # record field used as the document body
MAX_SAMPLES = None  # None => index the full dataset; an int caps the run

# Storage locations
PERSIST_DIR = "/mnt/d/datasets"
CACHE_DIR = "/mnt/d/datasets/wiki_dump2018_nq_open"  # passed to load_dataset as cache_dir
COLLECTION_NAME = "wiki_dump2018_nq_open"

# Batching
BATCH_SIZE = 100_000  # documents gathered before each split-and-index pass
CHUNK_BATCH_SIZE = 500  # chunks handed to the vector store per add call

# Make sure the output directory is present (no-op when it already exists).
Path(PERSIST_DIR).mkdir(parents=True, exist_ok=True)
print(f"Vector store will be saved to: {os.path.abspath(PERSIST_DIR)}")

Vector store will be saved to: /mnt/d/datasets


In [6]:
# Load the training split.
# - MAX_SAMPLES set:   materialise only the first MAX_SAMPLES rows (no streaming).
# - MAX_SAMPLES unset: stream the split so the full dump is never held in memory.
dataset = load_dataset(
    DATASET_NAME,
    split=f"train[:{MAX_SAMPLES}]" if MAX_SAMPLES else "train",
    streaming=not MAX_SAMPLES,
    cache_dir=CACHE_DIR,
)
# Total row count is only known up front for a capped run.
total_samples = MAX_SAMPLES if MAX_SAMPLES else None


## Indexing Dataset

In [13]:
# Initialize components

# Embedding model used to embed chunks at indexing time and queries at search time.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": "cuda"}
)

# Initialize local Qdrant client
qdrant_client = QdrantClient(url="http://localhost:6333")

# Create the collection only when it is genuinely missing.
# An explicit existence check (rather than catching every exception from
# get_collection) lets connection/auth/timeout errors surface as themselves
# instead of being misread as "collection not found".
if qdrant_client.collection_exists(COLLECTION_NAME):
    print(f"Collection '{COLLECTION_NAME}' already exists")
else:
    qdrant_client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(
            size=384,  # all-MiniLM-L6-v2 embedding dimension
            distance=Distance.COSINE
        )
    )
    print(f"Created collection '{COLLECTION_NAME}'")

# Initialize Qdrant vectorstore
vectorstore = QdrantVectorStore(
    client=qdrant_client,
    collection_name=COLLECTION_NAME,
    embedding=embeddings,
)

# Initialize text splitter: 1000-character chunks, 200 characters of overlap
# between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)

Created collection 'wiki_dump2018_nq_open'


In [14]:
# Turn one batch of raw records into splitter chunks
def process_batch(batch_items, start_idx):
    """Convert a batch of dataset records into text chunks.

    Args:
        batch_items: Sequence of dict-like records; each must contain TEXT_COLUMN.
        start_idx: Index of the first record of this batch, used to build
            each document's 'source' metadata value.

    Returns:
        The list produced by text_splitter.split_documents for this batch.
    """
    scalar_types = (str, int, float, bool)

    def to_document(offset, record):
        # Carry over only scalar fields as metadata; the text column becomes the body.
        meta = {}
        for key, value in record.items():
            if key == TEXT_COLUMN or not isinstance(value, scalar_types):
                continue
            meta[key] = value
        meta['source'] = f"{DATASET_NAME}_{start_idx + offset}"
        return Document(page_content=record[TEXT_COLUMN], metadata=meta)

    documents = [to_document(offset, record) for offset, record in enumerate(batch_items)]

    # Break the documents into chunks
    chunks = text_splitter.split_documents(documents)

    # Release the intermediate documents before handing the chunks back
    del documents
    gc.collect()

    return chunks

In [None]:
# Main processing loop with batch processing
def _iter_batches(items, batch_size, limit=None):
    """Yield lists of up to `batch_size` items, including the final short batch.

    Args:
        items: Any iterable of records.
        batch_size: Maximum number of records per yielded list.
        limit: Optional cap on the total number of records consumed; a falsy
            value (None or 0) means no cap.

    Yields:
        Non-empty lists of records, in iteration order.
    """
    batch = []
    for count, item in enumerate(items, start=1):
        batch.append(item)
        at_limit = bool(limit) and count >= limit
        if len(batch) >= batch_size or at_limit:
            yield batch
            batch = []
        if at_limit:
            return
    # Without this flush, the trailing records of an input whose length is not
    # a multiple of batch_size would never be indexed.
    if batch:
        yield batch


processed_docs = 0
total_chunks = 0

for batch_items in _iter_batches(dataset, BATCH_SIZE, MAX_SAMPLES):
    print(f"Processing documents at iteration: {processed_docs + len(batch_items)}")
    # Split the current batch of documents into chunks
    chunks = process_batch(batch_items, processed_docs)

    print("Adding documents")
    # Add chunks to vector store in smaller batches
    for j in range(0, len(chunks), CHUNK_BATCH_SIZE):
        vectorstore.add_documents(chunks[j:j + CHUNK_BATCH_SIZE])

    processed_docs += len(batch_items)
    total_chunks += len(chunks)

    print(f"Processed {processed_docs} documents, created {total_chunks} chunks")

    # Clear memory before the next batch is assembled
    del chunks
    gc.collect()

# Drop the reference to the last batch so it can be reclaimed
batch_items = []

print(f"\n✅ Completed processing {processed_docs} documents into {total_chunks} chunks")

Processing documents at iteration: 99999
Adding documents
Processed 100000 documents, created 100002 chunks
Processing documents at iteration: 199999
Adding documents
Processed 200000 documents, created 200002 chunks
Processing documents at iteration: 299999
Adding documents
Processed 300000 documents, created 300003 chunks
Processing documents at iteration: 399999
Adding documents
Processed 400000 documents, created 400009 chunks
Processing documents at iteration: 499999
Adding documents
Processed 500000 documents, created 500010 chunks
Processing documents at iteration: 599999
Adding documents
Processed 600000 documents, created 600010 chunks
Processing documents at iteration: 699999
Adding documents
Processed 700000 documents, created 700012 chunks
Processing documents at iteration: 799999
Adding documents
Processed 800000 documents, created 800017 chunks
Processing documents at iteration: 899999
Adding documents
Processed 900000 documents, created 900022 chunks
Processing documents

In [None]:
# Summary of the indexing run, one line per fact.
print(
    f"Vector store persisted to: {PERSIST_DIR}",
    f"Total documents processed: {processed_docs}",
    f"Total chunks created: {total_chunks}",
    sep="\n",
)

## Test Vector Store

In [None]:
# Smoke test: fetch the three nearest chunks for a sample query.
test_query = "George Washington"
results = vectorstore.similarity_search(test_query, k=3)

print(f"Found {len(results)} results:")

for rank, hit in enumerate(results, start=1):
    preview = hit.page_content[:200]  # first 200 characters only
    origin = hit.metadata.get('source', 'Unknown')
    print(f"\n--- Result {rank} ---")
    print(f"Content: {preview}...")
    print(f"Source: {origin}")