In [1]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# Fail fast: the timeout keeps the kernel from hanging on a stalled connection, and
# raise_for_status() reports an HTTP error here instead of as a JSON decode error below.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course structure into one list of FAQ records, tagging each
# record with the name of the course it came from. Each record is copied, so
# documents_raw stays untouched and re-running the cell is idempotent.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_raw
    for doc in course['documents']
]

In [2]:
documents[2]

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

In [3]:
# download the search engine
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

import minsearch

index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

index.fit(documents)

--2025-06-23 12:42:29--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4073 (4.0K) [text/plain]
Saving to: ‘minsearch.py.1’


2025-06-23 12:42:30 (46.2 MB/s) - ‘minsearch.py.1’ saved [4073/4073]





<minsearch.Index at 0x7e6308551130>

In [4]:
from openai import OpenAI

# Shared client used by llm(). No credentials are passed here, so none are hardcoded
# in the notebook.
# NOTE(review): presumably the API key is read from the environment (OPENAI_API_KEY) — confirm.
openai_client = OpenAI()

In [5]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Keyword search over the minsearch FAQ index.

    Args:
        query: Free-text user question.
        course: Value the ``course`` field of a record must match. Defaults
            to the course that used to be hard-coded here.
        num_results: Maximum number of records requested from the index.

    Returns:
        The result of ``index.search`` for the given query and filter.
    """
    # Boost values handed to the index: 'question' is weighted 3.0 and
    # 'section' 0.5; 'text' is not listed.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [6]:
def build_prompt(query, search_results):
    """Render the LLM prompt for ``query`` from the retrieved FAQ entries.

    Each item of ``search_results`` must provide the keys ``section``,
    ``question`` and ``text``. Entries are rendered in order, separated by a
    blank line, and placed under the CONTEXT heading. Leading and trailing
    whitespace of the final prompt is stripped.
    """
    # Written line by line so the trailing space after "CONTEXT:" is explicit.
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    entries = (
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}"
        for hit in search_results
    )
    rendered_context = "\n\n".join(entries)

    return template.format(question=query, context=rendered_context).strip()

In [7]:
def llm(prompt, model='gpt-4o-mini'):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text, e.g. the output of ``build_prompt``.
        model: Chat model name passed to the API. Defaults to the
            previously hard-coded 'gpt-4o-mini'.

    Returns:
        The ``content`` of the message in the first returned choice.
    """
    response = openai_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content

In [8]:
def rag(query):
    """Answer ``query`` end to end: keyword search -> prompt -> LLM reply."""
    hits = search(query)
    return llm(build_prompt(query, hits))

In [9]:
rag('how do I run kafka?')

'To run Kafka, you can execute the Java producer in the terminal by navigating to your project directory and running the following command:\n\n```\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n``` \n\nFor Python, ensure you create a virtual environment and install the necessary dependencies as follows:\n\n1. Create a virtual environment:\n   ```\n   python -m venv env\n   ```\n\n2. Activate the virtual environment:\n   ```\n   source env/bin/activate\n   ```\n\n3. Install the required packages:\n   ```\n   pip install -r ../requirements.txt\n   ```\n\nRemember to activate the virtual environment each time you need to run your Python script. On Windows, use `env\\Scripts\\activate` instead.'

In [10]:
rag('the course has already started, can I still enroll?')

"Yes, you can still enroll in the course after it has started. Even if you don't register, you are still eligible to submit the homeworks. Just be aware that there will be deadlines for turning in the final projects, so it's important not to leave everything for the last minute."

## RAG with Vector Search

In [11]:
# Import the modules from the qdrant-client package
from qdrant_client import QdrantClient, models

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
qd_client = QdrantClient("http://localhost:6333") #connecting to local Qdrant instance

In [15]:
from fastembed import TextEmbedding

In [17]:
# Vector size used when the Qdrant collection is created.
# NOTE(review): must equal the output size of the model below — 512 is assumed
# for jina-embeddings-v2-small-en; confirm against the model card.
EMBEDDING_DIMENSIONALITY = 512
# Name of the embedding model attached to every models.Document (indexing and querying).
model_handle = "jinaai/jina-embeddings-v2-small-en"


In [18]:
# Define the collection name
collection_name = "zoomcamp-faq"

In [34]:
# Drop the collection left over from a previous run so the create_collection call
# that follows starts from scratch.
# NOTE(review): behaviour when the collection does not exist yet is not visible here — confirm.
qd_client.delete_collection(collection_name=collection_name)

True

In [35]:
# Create the collection: vector size comes from EMBEDDING_DIMENSIONALITY and
# similarity is measured with cosine distance.
vector_params = models.VectorParams(
    size=EMBEDDING_DIMENSIONALITY,
    distance=models.Distance.COSINE,
)
qd_client.create_collection(
    collection_name=collection_name,
    vectors_config=vector_params,
)

True

In [36]:
# Index the "course" payload field; it is the field the vector queries below filter on.
qd_client.create_payload_index(
    collection_name=collection_name,
    field_name="course",
    field_schema="keyword" # exact matching on string metadata fields
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [37]:
# One Qdrant point per FAQ record, with the list position as the point id. The vector
# is declared as a models.Document (question + answer text tagged with model_handle),
# so the text is embedded locally with "jinaai/jina-embeddings-v2-small-en" from FastEmbed.
# The whole record is stored as the payload.
points = [
    models.PointStruct(
        id=idx,
        vector=models.Document(
            text=record['question'] + ' ' + record['text'],
            model=model_handle,
        ),
        payload=record,
    )
    for idx, record in enumerate(documents)
]

In [38]:
points[0]

PointStruct(id=0, vector=Document(text="Course - When will the course start? The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.", model='jinaai/jina-embeddings-v2-small-en', options=None), payload={'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with ann

In [39]:
# Send every point built above to the collection in a single upsert call.
qd_client.upsert(
    collection_name=collection_name,
    points=points
)

UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

In [40]:
question = 'I just discovered the course, can i still join?'

In [48]:
# Exploratory one-off version of the query that vector_search() wraps further down:
# embed `question`, keep only points whose "course" payload equals `course`,
# and ask for the 5 closest matches together with their payloads.
course = 'data-engineering-zoomcamp'
query_points = qd_client.query_points(
    collection_name=collection_name,
    query=models.Document(
        text=question,
        model=model_handle 
    ),
    query_filter=models.Filter( # filter by course name
        must=[
            models.FieldCondition(
                key="course",
                match=models.MatchValue(value=course)
            )
        ]
    ),
    limit=5, # top closest matches
    with_payload=True #to get metadata in the results
)
   

In [49]:
# Keep only the stored payload (the original FAQ record) of every returned point.
results = [hit.payload for hit in query_points.points]


In [55]:
def vector_search(question, course='data-engineering-zoomcamp', limit=5):
    """Vector search over the Qdrant FAQ collection.

    Args:
        question: Free-text user question; sent as a ``models.Document``
            tagged with ``model_handle``.
        course: Value the ``course`` payload field must match. Defaults to
            the course that used to be hard-coded here.
        limit: Maximum number of closest matches to request.

    Returns:
        A list with the payload of every returned point, in the order the
        client returned them.
    """
    print('Vector search is used')
    response = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=question,
            model=model_handle,
        ),
        # Restrict the search to points belonging to the requested course.
        query_filter=models.Filter(
            must=[
                models.FieldCondition(
                    key="course",
                    match=models.MatchValue(value=course),
                )
            ]
        ),
        limit=limit,
        with_payload=True,  # payloads carry the FAQ fields build_prompt() reads
    )

    return [point.payload for point in response.points]

In [56]:
# vector_search('How do i run kafka?')

In [57]:
def rag(query):
    """Answer ``query`` end to end: vector search -> prompt -> LLM reply.

    Note: this definition shadows the earlier keyword-search ``rag`` defined
    higher up in the notebook; from this cell on, ``rag`` uses ``vector_search``.
    """
    hits = vector_search(query)
    return llm(build_prompt(query, hits))

In [58]:
rag('How do i run kafka')

Vector search is used


"To run Kafka, you need to execute your producer or consumer Java scripts from the terminal. In the project directory, run the following command:\n\n```\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n```\n\nMake sure to replace `<jar_name>` with the actual name of your jar file. Additionally, ensure that your Kafka broker is running; you can check this by using `docker ps`, and if it's not running, go to the docker compose yaml file folder and run `docker compose up -d` to start it."