In [1]:
import minsearch
import json

from openai import OpenAI

In [2]:
# Load the raw FAQ dump. JSON text is UTF-8, so the encoding is given
# explicitly instead of depending on the platform's locale default.
with open('./documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

# Flatten the per-course groups into a single list, tagging every document
# with the course it belongs to.
documents = []
for course_dict in docs_raw:
    for doc in course_dict['documents']:
        # Assigned in place: the same dict objects remain reachable through
        # docs_raw, so they gain the 'course' key there as well.
        doc['course'] = course_dict['course']
        documents.append(doc)

In [3]:
def execute_search(index, query, filter_options, boost_options, num_results):
    """Run `query` against `index` and return whatever `index.search` returns.

    Args:
        index: Object exposing a `search` method that accepts the keyword
            arguments `query`, `filter_dict`, `boost_dict` and `num_results`.
        query: Forwarded as `query`.
        filter_options: Forwarded as `filter_dict`.
        boost_options: Forwarded as `boost_dict`.
        num_results: Forwarded as `num_results`.

    Returns:
        The unmodified return value of `index.search`.
    """
    # Translate this function's argument names into the keyword names that
    # the index's search method is called with.
    search_kwargs = {
        'query': query,
        'filter_dict': filter_options,
        'boost_dict': boost_options,
        'num_results': num_results,
    }
    return index.search(**search_kwargs)

def format_prompt(search_results, question, context_template):
    """Render `context_template` with the question and the search hits.

    Args:
        search_results: Iterable of mappings; each must provide the keys
            'section', 'question' and 'text'.
        question: Value substituted for the `{question}` placeholder.
        context_template: Format string filled in via `str.format` with the
            keyword arguments `question` and `context`.

    Returns:
        The formatted template. The context is the concatenation of one
        three-line entry per hit, each entry followed by a blank line.
    """
    entries = [
        f"section: {hit['section']}\nquestion: {hit['question']}\ntext: {hit['text']}\n\n"
        for hit in search_results
    ]
    context_text = "".join(entries)
    return context_template.format(context=context_text, question=question)

# Prompt sent to the chat model. Written at column 0 so that source-code
# indentation does not leak into the text the model receives.
_PROMPT_TEMPLATE = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.
If the CONTEXT doesn't contain the answer, output NONE.

QUESTION: {question}

CONTEXT:
{context}
""".strip()


def ask_question_with_context(model, question, context, client=None):
    """Ask a chat model to answer `question` using the given search results.

    Args:
        model: Model name passed to `client.chat.completions.create`.
        question: The user's question; inserted into the prompt.
        context: The search results to ground the answer on. They are handed
            to `format_prompt` as its `search_results` argument, so each item
            must provide whatever keys that function reads.
        client: Optional client object exposing `chat.completions.create`.
            When omitted, a new `OpenAI()` client is created for this call;
            pass one in to reuse a single client across calls or to
            substitute a stub.

    Returns:
        The `content` of the first choice's message in the response.
    """
    if client is None:
        # No arguments are given, so the client relies on its own default
        # configuration for credentials.
        client = OpenAI()
    prompt = format_prompt(context, question, _PROMPT_TEMPLATE)
    response = client.chat.completions.create(
        model=model,
        messages=[{'role': 'user', 'content': prompt}],
    )
    return response.choices[0].message.content

# End-to-end example tying the helpers together: search first, then answer.
def rag(model, index, question, filter_dict, boost_dict, num_results):
    """Answer `question` from the documents retrieved out of `index`.

    The question is used as the search query; the hits returned by
    `execute_search` are passed to `ask_question_with_context`, whose
    return value is returned unchanged.
    """
    return ask_question_with_context(
        model,
        question,
        execute_search(index, question, filter_dict, boost_dict, num_results),
    )

In [4]:
# Declare which document fields the index treats as text fields and which as
# keyword fields, then fit it on the flattened document list.
index = minsearch.Index(
    keyword_fields=['course'],
    text_fields=['question', 'text', 'section'],
)
# Kept as the cell's last expression so the value returned by fit() is shown.
index.fit(documents)

<minsearch.Index at 0x76f4600d03d0>

In [5]:
# Model name handed through rag() to the chat-completions call.
model = 'gpt-4o'

# Search settings handed through rag() to index.search() as filter_dict,
# boost_dict and num_results.
filter_dict = dict(course='data-engineering-zoomcamp')
boost_dict = dict(question=3.0, section=0.5)
num_results = 5

# The question to answer; it is also used as the search query.
question = 'The course has already started. Can I still enroll?'

In [6]:
# Run the full pipeline; as the cell's last expression the answer is displayed.
rag(
    model=model,
    index=index,
    question=question,
    filter_dict=filter_dict,
    boost_dict=boost_dict,
    num_results=num_results,
)

"Yes, you can still enroll in the course even if it has already started. Be aware, however, that there will be deadlines for turning in the final projects, so don't leave everything for the last minute."