In [None]:
# Shell escape: download minsearch.py from GitHub so that the `import minsearch` in the imports cell can find it locally.
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

In [None]:
import json
import os

from dotenv import load_dotenv
import minsearch
import openai

In [None]:
def parse_documents(file_path) -> list:
    """Load the FAQ JSON file and flatten it into a single list of documents.

    The file is expected to contain a list of course entries, each a dict
    with a ``'course'`` value and a ``'documents'`` list of dicts. Every
    document is tagged with its parent entry's ``'course'`` value (any
    existing ``'course'`` key on the document is overwritten) and appended
    to one flat list.

    Args:
        file_path: Path to the JSON file to read.

    Returns:
        list: All document dicts in file order, each with a ``'course'`` key.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
        json.JSONDecodeError: If the content is not valid JSON.
        KeyError: If a course entry lacks ``'documents'``, or lacks
            ``'course'`` while having at least one document.
    """
    # Decode explicitly as UTF-8: relying on the platform default encoding
    # (e.g. cp1252 on Windows) garbles or rejects non-ASCII FAQ text.
    with open(file_path, 'rt', encoding='utf-8') as f_in:
        docs_raw = json.load(f_in)

    documents = []

    for course_dict in docs_raw:
        for doc in course_dict['documents']:
            # The dicts were just loaded from disk, so tagging them in place
            # does not affect any caller-owned data.
            doc['course'] = course_dict['course']
            documents.append(doc)

    return documents

In [None]:
def fit_index(documents: list) -> minsearch.Index:
    """Create a ``minsearch.Index`` and fit it on ``documents``.

    The index is configured with ``question``, ``text`` and ``section`` as
    its ``text_fields`` and ``course`` as its only ``keyword_fields`` entry,
    so each document is expected to provide those keys.

    Args:
        documents: List of document dicts to index.

    Returns:
        minsearch.Index: The index after ``fit(documents)`` has been called.
    """
    searchable_fields = ["question", "text", "section"]
    filterable_fields = ["course"]

    faq_index = minsearch.Index(
        text_fields=searchable_fields,
        keyword_fields=filterable_fields,
    )
    faq_index.fit(documents)

    return faq_index

In [None]:
def search(
    query: str,
    index: "minsearch.Index",
    course: str = 'data-engineering-zoomcamp',
    num_results: int = 5,
) -> list:
    """Query the index for documents relevant to ``query`` within one course.

    Args:
        query: Free-text question to search for.
        index: A fitted index exposing
            ``search(query=, filter_dict=, boost_dict=, num_results=)``.
        course: Value the documents' ``course`` field must match. Defaults
            to ``'data-engineering-zoomcamp'``.
        num_results: Maximum number of hits requested from the index.
            Defaults to 5.

    Returns:
        list: Whatever ``index.search`` returns (the rest of the notebook
        treats it as a list of document dicts).
    """
    # Relative field weights handed to the index: `question` at 3.0 and
    # `section` at 5.0. `text` is not listed, so the index applies its own
    # default weight to it.
    boost = {'question': 3.0, 'section': 5.0}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results
    )

In [None]:
def build_prompt(query, search_results):
    """Render the LLM prompt from the user's question and the retrieved FAQ entries.

    Each search result is formatted as a ``section / question / answer``
    entry; entries are separated by a blank line and placed under the
    ``CONTEXT:`` heading of the prompt.

    Args:
        query: The user's question, inserted verbatim after ``QUESTION:``.
        search_results: Iterable of dicts, each providing the keys
            ``'section'``, ``'question'`` and ``'text'``.

    Returns:
        str: The complete prompt, without leading or trailing whitespace.

    Raises:
        KeyError: If a result is missing one of the required keys.
    """
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the question.
If the CONTEXT doesn't contain the answer output NONE

QUESTION: {question}

CONTEXT:
{context}
""".strip()

    # Build every entry once and join them, instead of growing a string
    # inside a loop; the separator is the blank line between entries.
    context = "\n\n".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}"
        for doc in search_results
    )

    # Values are substituted as-is, so braces inside `query` or the context
    # are safe. The final strip drops the dangling newline left when there
    # are no search results.
    return prompt_template.format(question=query, context=context).strip()

In [None]:
def llm(prompt: str, client: "openai.Client", model: str = "gpt-4o") -> str:
    """Send ``prompt`` as a single user message and return the model's reply.

    Args:
        prompt: Full prompt text, sent as the content of one ``user`` message.
        client: Client object exposing ``chat.completions.create``.
        model: Name of the chat model to request. Defaults to ``"gpt-4o"``.

    Returns:
        str: The ``message.content`` of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    # Only the first choice is used; no `n` is requested, so a single
    # completion is expected.
    return response.choices[0].message.content

In [None]:
def rag(
    query: str,
    index: "minsearch.Index | None" = None,
    client: "openai.Client | None" = None,
    documents_path: str = 'faq.json',
) -> str:
    """Answer ``query`` with retrieval-augmented generation over the FAQ.

    Pipeline: search the index for relevant FAQ entries, build a prompt from
    them, and ask the LLM to answer using only that context.

    With only ``query`` given, each call loads ``.env``, creates an OpenAI
    client, parses ``documents_path`` and fits a fresh index. When asking
    several questions, build the index and client once and pass them in to
    skip that repeated work.

    Args:
        query: The user's question.
        index: A fitted search index to reuse. When ``None``, one is built
            from ``documents_path``.
        client: An OpenAI client to reuse. When ``None``, one is created
            from the ``OPENAI_API_KEY`` environment variable after loading
            ``.env``.
        documents_path: Path to the FAQ JSON file; only read when ``index``
            is ``None``. Defaults to ``'faq.json'``.

    Returns:
        str: The LLM's answer.
    """
    if client is None:
        # Pull OPENAI_API_KEY (and anything else) from a local .env file
        # into the environment before reading it.
        load_dotenv()
        client = openai.Client(
            api_key=os.getenv("OPENAI_API_KEY")
        )

    if index is None:
        # Load, flatten and index the FAQ documents.
        documents = parse_documents(documents_path)
        index = fit_index(documents)

    # Retrieve the relevant entries and turn them into a prompt.
    search_results = search(query, index)
    prompt = build_prompt(query, search_results)

    # Generate the response.
    return llm(prompt, client)

In [None]:
# Example run: send one question through the whole pipeline.
# rag() returns the answer string; as the cell's last expression it is shown as the cell output.
query = "How do I enroll in the course?"
rag(query)