In [1]:
import minsearch
import json

In [4]:
# Load the raw FAQ data from disk. The encoding is pinned to UTF-8 so the file
# is decoded the same way on every platform instead of depending on the
# locale's default encoding.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [5]:
# Flatten the per-course groups in docs_raw into one list of FAQ entries, each
# tagged with the 'course' value of the group it came from. Every entry is
# copied, so docs_raw stays exactly as loaded and this cell can be re-run
# without side effects on the raw data.
documents = [
    {**doc, 'course': course_dict['course']}
    for course_dict in docs_raw
    for doc in course_dict['documents']
]

In [17]:
# Configure the search index over the FAQ entries: 'question', 'text' and
# 'section' are registered as text fields and 'course' as a keyword field.
# search() later filters on 'course' and boosts 'question' / 'section'.
# NOTE(review): presumably text fields are ranked by relevance while keyword
# fields are matched exactly — confirm against the minsearch documentation.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

In [None]:
# Fit the index on the flattened FAQ entries.
# NOTE(review): presumably this has to run before index.search() is called — confirm.
index.fit(documents)

In [6]:
from openai import OpenAI

In [13]:
client = OpenAI()  # make sure that the OPENAI_API_KEY environment variable is set

In [14]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Look up the FAQ entries that best match ``query`` in the global index.

    Args:
        query: Free-text question to search for.
        course: Value the 'course' keyword field must equal. Defaults to
            'data-engineering-zoomcamp', the filter that used to be
            hard-coded. Pass ``None`` to search without a course filter.
        num_results: Maximum number of entries to request from the index.
            Defaults to 5.

    Returns:
        The value returned by ``index.search``, unchanged. ``build_prompt``
        iterates it and reads the 'section', 'question' and 'text' keys of
        each item.
    """
    # Matches in the question are weighted up and matches in the section
    # title are weighted down; 'text' is not listed, so it keeps whatever
    # weight the index applies by default.
    boost = {'question': 3.0, 'section': 0.5}

    # An empty filter dict means "no filtering".
    filter_dict = {} if course is None else {'course': course}

    return index.search(
        query=query,
        filter_dict=filter_dict,
        boost_dict=boost,
        num_results=num_results,
    )

In [8]:
def build_prompt(query, search_results):
    """Render the LLM prompt for ``query`` from the retrieved FAQ entries.

    Args:
        query: The user's question; substituted into the QUESTION slot.
        search_results: Iterable of mappings, each providing the keys
            'section', 'question' and 'text'.

    Returns:
        The filled-in prompt with leading and trailing whitespace removed.
        Entries appear in the CONTEXT slot in iteration order, separated by
        a blank line.
    """
    template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database. 
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: {context}
""".strip()

    # One formatted chunk per entry; each chunk ends with a blank line.
    entries = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    ]

    filled = template.format(question=query, context="".join(entries))

    # The final strip drops the blank line left after the last entry.
    return filled.strip()

In [9]:
def llm(prompt):
    """Send ``prompt`` to the chat model and return the text of its reply.

    The prompt is submitted as a single user-role message to the 'gpt-4o'
    model through the global ``client``; the content of the first returned
    choice is handed back to the caller.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model='gpt-4o',
        messages=[user_message],
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

In [11]:
query = 'how do I run kafka?'

def rag(query):
    """Answer ``query`` end to end: retrieve, build the prompt, ask the LLM.

    Calls ``search`` for the relevant FAQ entries, passes them with the
    query to ``build_prompt``, and returns whatever ``llm`` produces for
    the resulting prompt.
    """
    retrieved = search(query)
    return llm(build_prompt(query, retrieved))

In [19]:
# Run the whole pipeline on the sample Kafka question stored in `query`.
rag(query)

'To run Kafka, start by ensuring you have a suitable environment. For Python Kafka, create a virtual environment and install the dependencies as listed in the `requirements.txt`. You can do this by running the following commands:\n\n1. Create and activate a virtual environment:\n   - On MacOS and Linux:\n     ```bash\n     python -m venv env\n     source env/bin/activate\n     ```\n   - On Windows:\n     ```bash\n     python -m venv env\n     env\\Scripts\\activate\n     ```\n\n2. Install the required packages:\n   ```bash\n   pip install -r ../requirements.txt\n   ```\n\n3. Activate the virtual environment whenever you need to work within this environment:\n   - On MacOS and Linux:\n     ```bash\n     source env/bin/activate\n     ```\n   - On Windows:\n     ```bash\n     env\\Scripts\\activate\n     ```\n\n4. Deactivate when done:\n   ```bash\n   deactivate\n   ```\n\nEnsure any Docker images are up and running before proceeding if you are using Docker. For Java Kafka, run the produc

In [20]:
# Same pipeline, this time with an enrollment question passed inline.
rag('the course has already started, can I still enroll?')

"Yes, you can still enroll in the course after it has started. You are eligible to submit the homework even if you haven't registered yet. However, keep in mind that there will be deadlines for turning in homework and the final projects, so it's advisable not to leave everything for the last minute."