## Imports

In [2]:
import os
from pathlib import Path
from datasets import load_dataset
from langchain.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

## Loading Dataset

In [22]:
DATASET_NAME = "florin-hf/wiki_dump2018_nq_open"
CACHE_DIR = "/mnt/d/datasets/wiki_dump2018_nq_open"
PERSIST_DIR = "/mnt/d/datasets/wiki_dump2018_nq_open/chroma_db"
MAX_SAMPLES = None  # Set to a number like 10000 for testing, None for full dataset
TEXT_COLUMN = "text"

# Initialize embeddings
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'cpu'}
)

# Create persist directory
Path(PERSIST_DIR).mkdir(parents=True, exist_ok=True)

print(f"Vector store will be saved to: {os.path.abspath(PERSIST_DIR)}")

Vector store will be saved to: /mnt/d/datasets/wiki_dump2018_nq_open/chroma_db


In [None]:
dataset = load_dataset(
    DATASET_NAME, 
    split=f"train",
    cache_dir=CACHE_DIR
)

Resolving data files:   0%|          | 0/28 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/28 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/27 [00:00<?, ?it/s]

In [None]:
if MAX_SAMPLES:
    dataset = dataset.select(range(MAX_SAMPLES))

## Indexing Dataset

In [14]:
# Convert dataset into Document objects
documents = []

for i, item in enumerate(dataset):
    # Create metadata from other columns
    metadata = {k: v for k, v in item.items() 
                if k != TEXT_COLUMN and isinstance(v, (str, int, float, bool))}
    metadata['source'] = f"{DATASET_NAME}_{i}"
    
    doc = Document(
        page_content=item[TEXT_COLUMN],
        metadata=metadata
    )
    documents.append(doc)
    
    if (i + 1) % 10000 == 0:
        print(f"Processed {i + 1} documents...")

print(f"Created {len(documents)} documents")

Created 1000 documents


In [15]:
# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)

chunks = text_splitter.split_documents(documents)
print(f"Created {len(chunks)} chunks")

Created 1000 chunks


In [19]:
# Initialize Chroma vector store
vectorstore = Chroma(
    persist_directory=PERSIST_DIR,
    embedding_function=embeddings
)

# Add documents to vector store in batches
print(f"Adding chunks to vector store...")
batch_size = 100

for i in range(0, len(chunks), batch_size):
    batch = chunks[i:i + batch_size]
    vectorstore.add_documents(batch)
    
    if (i + batch_size) % 1000 == 0:
        print(f"Added {min(i + batch_size, len(chunks))}/{len(chunks)} chunks...")

# Persist the vector store
vectorstore.persist()
print(f"Vector store persisted to: {PERSIST_DIR}")

Adding chunks to vector store...
Added 1000/1000 chunks...
Vector store persisted to: /mnt/d/datasets/wiki_dump2018_nq_open/chroma_db


  vectorstore.persist()


## Test Vector Store

In [1]:
test_query = "Who invented the light bulb?"
results = vectorstore.similarity_search(test_query, k=3)

print(f"Found {len(results)} results:")

for i, doc in enumerate(results):
    print(f"\n--- Result {i+1} ---")
    print(f"Content: {doc.page_content[:200]}...")
    print(f"Source: {doc.metadata.get('source', 'Unknown')}")

NameError: name 'vectorstore' is not defined