## Imports

In [None]:
import os
from pathlib import Path
from datasets import load_dataset
from langchain.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

## Loading Dataset

In [None]:
# --- Run configuration ------------------------------------------------------
# Identifier passed to load_dataset() when the data is fetched.
DATASET_NAME = "florin-hf/wiki_dump2018_nq_open"
# Directory handed to Chroma as its persist_directory.
PERSIST_DIR = "./chroma_db"
# Row cap for the train split: a number like 10000 for testing, None for the
# full dataset.
MAX_SAMPLES = 1000
# Dataset column whose value becomes each Document's page_content.
TEXT_COLUMN = "text"

# Embedding model shared by indexing and querying.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Make sure the persist directory (and any missing parents) exists up front.
os.makedirs(PERSIST_DIR, exist_ok=True)

print(f"📁 Vector store will be saved to: {os.path.abspath(PERSIST_DIR)}")

In [None]:
# Load the train split, optionally capped at the first MAX_SAMPLES rows.
# The check is `is not None` rather than truthiness: only None means "load
# everything", so MAX_SAMPLES = 0 yields an empty slice instead of silently
# falling through to the full dataset.
if MAX_SAMPLES is not None:
    dataset = load_dataset(DATASET_NAME, split=f"train[:{MAX_SAMPLES}]")
    print(f"Loaded {len(dataset)} samples (limited for testing)")
else:
    dataset = load_dataset(DATASET_NAME, split="train")
    print(f"Loaded full dataset: {len(dataset)} samples")

## Indexing Dataset

In [None]:
# Wrap every dataset row in a Document: the TEXT_COLUMN value is the content,
# the remaining scalar columns travel along as metadata.
documents = []

for position, row in enumerate(dataset):
    row_metadata = {}
    for column, value in row.items():
        if column == TEXT_COLUMN:
            continue  # the text itself is stored as page_content, not metadata
        if not isinstance(value, (str, int, float, bool)):
            continue  # keep only plain scalar values
        row_metadata[column] = value
    # Tag each document with the dataset name and its row position
    # (replaces any dataset column that is itself named 'source').
    row_metadata['source'] = f"{DATASET_NAME}_{position}"

    documents.append(Document(page_content=row[TEXT_COLUMN], metadata=row_metadata))

    processed = position + 1
    if processed % 10000 == 0:
        print(f"Processed {processed} documents...")

print(f"Created {len(documents)} documents")

In [None]:
# Cut the documents into overlapping chunks before embedding them.
splitter_settings = {
    "chunk_size": 1000,    # upper bound on the size of a single chunk
    "chunk_overlap": 200,  # amount shared between neighbouring chunks
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

chunks = text_splitter.split_documents(documents)
print(f"Created {len(chunks)} chunks")

In [None]:
# Open the Chroma vector store backed by PERSIST_DIR; added texts are
# embedded with `embeddings`.
vectorstore = Chroma(
    persist_directory=PERSIST_DIR,
    embedding_function=embeddings
)

# Add documents to vector store in batches
print("Adding chunks to vector store...")
batch_size = 100
progress_every = 1000  # report each time this many more chunks have been added
total_chunks = len(chunks)

for i in range(0, total_chunks, batch_size):
    batch = chunks[i:i + batch_size]
    # Deterministic ids (dataset name + chunk position) make re-running this
    # cell against an existing PERSIST_DIR overwrite the same entries instead
    # of appending duplicates under fresh random ids.
    # NOTE: the ids are only stable while the dataset slice and the splitter
    # settings stay the same; clear PERSIST_DIR after changing either.
    batch_ids = [f"{DATASET_NAME}:{i + offset}" for offset in range(len(batch))]
    vectorstore.add_documents(batch, ids=batch_ids)

    added = i + len(batch)
    # Report whenever a progress_every boundary is crossed, and always after
    # the final batch, so output does not depend on batch_size dividing 1000.
    crossed_boundary = added // progress_every > i // progress_every
    if crossed_boundary or added == total_chunks:
        print(f"Added {added}/{total_chunks} chunks...")

# Persist the vector store.
# Chroma >= 0.4 writes to persist_directory automatically and newer LangChain
# Chroma wrappers no longer expose persist(), so only call it where it exists.
persist = getattr(vectorstore, "persist", None)
if callable(persist):
    persist()
print(f"Vector store persisted to: {PERSIST_DIR}")

## Test Vector Store

In [None]:
# Smoke test: fetch the three nearest chunks for a sample query and show a
# short preview of each together with its source tag.
test_query = "artificial intelligence"
results = vectorstore.similarity_search(test_query, k=3)

print(f"Found {len(results)} results:")

for rank, hit in enumerate(results, start=1):
    print(f"\n--- Result {rank} ---")
    preview = hit.page_content[:200]  # first 200 characters only
    print(f"Content: {preview}...")
    origin = hit.metadata.get('source', 'Unknown')
    print(f"Source: {origin}")