In [None]:
%pip install llama-index-core llama-index-embeddings-ollama qdrant-client IPython

# Local RAG Playground: chunk → embed → Qdrant → retrieve

We load every dependency we need up front so the chunker, embedding model, and Qdrant client are ready for the rest of the workflow.

In [None]:
import uuid
import textwrap
import atexit
from typing import Any, Dict, List

from IPython.display import Markdown, display
from llama_index.core import Document
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.embeddings.ollama import OllamaEmbedding
from qdrant_client import QdrantClient
from qdrant_client.http.models import (
    Distance,
    VectorParams,
    PointStruct,
    Filter,
    FieldCondition,
    MatchValue,
)

# Notebook-wide state. Each handle starts out as None and is assigned by a later
# cell; cleanup_resources() resets them again when the interpreter exits.
qdrant_client: QdrantClient | None = None  # Qdrant connection handle
embedding_model: OllamaEmbedding | None = None  # embedding model used for chunks and queries
chunk_records: List[Dict[str, Any]] | None = None  # chunk dicts (chunk_id, text, source_doc_id)
embedded_chunks: List[Dict[str, Any]] | None = None  # chunk dicts plus their "vector"
collection_name = "video_chunks"  # Qdrant collection shared by the setup, upsert, and search cells

def cleanup_resources() -> None:
    """Close the Qdrant client and drop the notebook's large in-memory objects.

    Each global is handled in turn and reset to ``None``; globals that are
    already ``None`` are skipped, so calling this more than once is harmless.
    If closing the client raises, the client reference is still cleared and
    the exception propagates, leaving the remaining globals untouched.
    """
    global qdrant_client, embedding_model, chunk_records, embedded_chunks

    # Detach the client from the global first so the reference is cleared
    # even when close() raises.
    active_client, qdrant_client = qdrant_client, None
    if active_client is not None:
        active_client.close()
        print("Closed Qdrant client.")

    if embedding_model is not None:
        embedding_model = None
        print("Released embedding model reference.")

    if embedded_chunks is not None:
        embedded_total = len(embedded_chunks)
        print(f"Releasing {embedded_total} embedded chunks from memory.")
        embedded_chunks = None

    if chunk_records is not None:
        record_total = len(chunk_records)
        print(f"Releasing {record_total} chunk records from memory.")
        chunk_records = None


# Run the cleanup automatically when the interpreter shuts down.
atexit.register(cleanup_resources)


We load the video transcript from `data/video.txt` and wrap it as a single document so the rest of the pipeline can treat it as a corpus.

In [None]:
# Read the video transcript from the data directory.
# The encoding is pinned so the text decodes the same way on every platform
# instead of depending on the locale's default codec.
with open('data/video.txt', 'r', encoding='utf-8') as f:
    video_transcript = f.read()

# One entry per source document; `doc_id` is carried into chunk metadata later.
documents = [
    {"doc_id": "video-transcript", "text": video_transcript},
]

# Flat, tagged view of the transcript text.
corpus = f"[video-transcript]\n{video_transcript}"

display(Markdown(f"Loaded corpus with {len(documents)} documents."))

We chunk each document semantically so downstream retrieval works with coherent slices instead of arbitrary fixed windows.

In [None]:
# Semantic splitter backed by the local nomic-embed-text embedding model.
splitter = SemanticSplitterNodeParser(
    embed_model=OllamaEmbedding(model_name="nomic-embed-text"),
    buffer_size=1,
    breakpoint_percentile_threshold=70,
)

# Wrap each raw document so its id travels with every node as metadata.
llama_documents = []
for doc in documents:
    llama_documents.append(
        Document(text=doc["text"], metadata={"source_doc_id": doc["doc_id"]})
    )
nodes = splitter.get_nodes_from_documents(llama_documents)

# Flatten the nodes into plain dicts, giving each chunk a fresh random UUID.
chunk_records = [
    {
        "chunk_id": str(uuid.uuid4()),
        "text": node.get_content(),
        "source_doc_id": node.metadata.get("source_doc_id"),
    }
    for node in nodes
]

display(Markdown(f"Generated {len(chunk_records)} semantic chunks."))

# Show a shortened preview of the first three chunks.
for position, record in enumerate(chunk_records[:3], start=1):
    snippet = textwrap.shorten(record["text"], width=200, placeholder="...")
    display(Markdown(f"**Chunk {position}** — source `{record['source_doc_id']}`\n\n{snippet}"))


We embed each chunk with Ollama's local nomic-embed-text model so we can store vectors for similarity search.

In [None]:
embedding_model = OllamaEmbedding(model_name="nomic-embed-text")

# Embed every chunk, keeping its id, source document, and text next to the vector.
embedded_chunks = [
    {
        "chunk_id": record["chunk_id"],
        "source_doc_id": record["source_doc_id"],
        "text": record["text"],
        "vector": embedding_model.get_text_embedding(record["text"]),
    }
    for record in chunk_records
]

# Sanity check: report the dimensionality and the first few components.
first_vector = embedded_chunks[0]["vector"]
leading_values = [round(component, 4) for component in first_vector[:5]]
print(f"Vector length: {len(first_vector)}")
print("First 5 values:", leading_values)


We set up a fresh Qdrant collection using cosine similarity so the entire notebook shares one vector space.

In [None]:
qdrant_client = QdrantClient(url="http://localhost:6333")

# The collection's vector size must match the embedding dimensionality.
vector_size = len(embedded_chunks[0]["vector"])

# `recreate_collection` is deprecated in qdrant-client; drop any existing
# collection explicitly and create it again so every run starts empty.
if qdrant_client.collection_exists(collection_name=collection_name):
    qdrant_client.delete_collection(collection_name=collection_name)
qdrant_client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
)
print(f"Collection `{collection_name}` ready with vector size {vector_size}.")


We upsert every embedded chunk into Qdrant, attaching the original text and source document metadata.

In [None]:
# Build one Qdrant point per embedded chunk: the chunk id is the point id and
# the text plus source document id are stored as payload.
points = []
for embedded in embedded_chunks:
    point_payload = {
        "text": embedded["text"],
        "source_doc_id": embedded["source_doc_id"],
    }
    points.append(
        PointStruct(
            id=embedded["chunk_id"],
            vector=embedded["vector"],
            payload=point_payload,
        )
    )

qdrant_client.upsert(collection_name=collection_name, points=points)
print(f"Inserted {len(points)} points into `{collection_name}`.")


We embed a sample question, retrieve similar chunks from Qdrant, and show how to apply an optional metadata filter.

In [None]:
test_query = "What's the recommended RAG application DB to use ?"
query_vector = embedding_model.get_query_embedding(test_query)

# Unfiltered search: the three nearest chunks across the whole collection.
# `query_points` replaces the deprecated `search`; the scored hits are on the
# response's `.points` attribute.
results = qdrant_client.query_points(
    collection_name=collection_name,
    query=query_vector,
    limit=3,
).points
for idx, result in enumerate(results, start=1):
    preview = textwrap.shorten(result.payload["text"], width=200, placeholder="...")
    print(f"Result {idx}")
    print(f"  Score: {result.score:.4f}")
    print(f"  Source doc: {result.payload.get('source_doc_id')}")
    print(f"  Text: {preview}")
    print()

# Filtered search: restrict hits to chunks whose payload `source_doc_id`
# matches the first embedded chunk's document. Previously no filter was passed,
# so this query returned exactly the same hits as the unfiltered one.
filter_doc_id = embedded_chunks[0]["source_doc_id"]
source_filter = Filter(
    must=[
        FieldCondition(key="source_doc_id", match=MatchValue(value=filter_doc_id)),
    ]
)
filtered_results = qdrant_client.query_points(
    collection_name=collection_name,
    query=query_vector,
    query_filter=source_filter,
    limit=3,
).points
print(
    f"Filtered results (source_doc_id={embedded_chunks[0]['source_doc_id']}): {len(filtered_results)}"
)
for idx, result in enumerate(filtered_results, start=1):
    preview = textwrap.shorten(result.payload["text"], width=200, placeholder="...")
    print(f"  Result {idx} | Score: {result.score:.4f} | Text: {preview}")
