In [1]:
import minsearch
import json
import openai
from openai import OpenAI
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm

In [2]:
# Load the raw FAQ export. The encoding is pinned to UTF-8 so the result does
# not depend on the platform's default locale encoding.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

# Flatten the per-course grouping into one list of documents, tagging each
# document with the course it came from.
documents = []

for course_dict in docs_raw:
    course_name = course_dict['course']
    for doc in course_dict['documents']:
        # The document dict is annotated in place, so entries inside
        # `docs_raw` gain the 'course' key as well.
        doc['course'] = course_name
        documents.append(doc)

In [None]:
# Build the minsearch index over the flattened FAQ documents.
# "question", "text" and "section" are registered as text fields and "course"
# as a keyword field (the minsearch-based search helper filters on it).
index = minsearch.Index(
    keyword_fields=["course"],
    text_fields=["question", "text", "section"],
)

index.fit(documents)

In [3]:
# Connect to the local Elasticsearch node.
es_client = Elasticsearch('http://localhost:9200')

index_name = "course-questions"

# Single-shard, no-replica index: three full-text fields plus an exact-match
# `course` keyword used for filtering.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

# Make the cell re-runnable: creating an index that already exists raises, so
# drop any index left over from a previous run first. WARNING: this discards
# everything previously stored under `index_name`; the indexing cell that
# follows repopulates it from `documents`.
es_client.indices.delete(index=index_name, ignore_unavailable=True)

es_client.indices.create(index = index_name, body = index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [4]:
# Push every FAQ document into the Elasticsearch index, one request per
# document, with a tqdm progress bar. No explicit id is passed, so re-running
# this cell against the same index presumably stores the documents again.
for faq_entry in tqdm(documents):
    es_client.index(document=faq_entry, index=index_name)

  0%|          | 0/948 [00:00<?, ?it/s]

In [5]:
def es_search(query, course="data-engineering-zoomcamp", size=5):
    """Search the Elasticsearch FAQ index and return the matching documents.

    Args:
        query: Free-text user question.
        course: Value the `course` keyword field must equal. Defaults to
            "data-engineering-zoomcamp".
        size: Maximum number of hits to request. Defaults to 5.

    Returns:
        A list with the `_source` dict of every hit, in the order
        Elasticsearch returned them.
    """
    search_query = {
        "size": size,
        "query": {
            "bool": {
                # Full-text match across the three text fields; "question^3"
                # weights matches in the question field three times higher.
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                # Restrict results to a single course.
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    # Keep only the stored documents; drop scores and other hit metadata.
    return [hit['_source'] for hit in response['hits']['hits']]

In [6]:
def minsearch(query, course="data-engineering-zoomcamp", num_results=10):
    """Search the minsearch `index` for documents relevant to `query`.

    Args:
        query: Free-text user question.
        course: Value the `course` field is filtered on. Defaults to
            "data-engineering-zoomcamp".
        num_results: Maximum number of results requested from the index.
            Defaults to 10.

    Returns:
        Whatever `index.search` returns for the query (the matching
        documents).
    """
    # NOTE(review): this function shares its name with the imported
    # `minsearch` module. Once this cell runs, the global name refers to the
    # function, so re-running the cell that calls `minsearch.Index(...)`
    # fails until the import cell is executed again. The name is kept for
    # backward compatibility; consider renaming in a follow-up.
    boost = {"question": 3, "section": 0.5}

    return index.search(
        query=query,
        filter_dict={"course": course},
        boost_dict=boost,
        num_results=num_results,
    )

In [7]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from the user question and retrieved documents.

    Args:
        query: The user's question; inserted verbatim after "QUESTION:".
        search_results: Iterable of dicts, each providing the keys
            'section', 'question' and 'text'.

    Returns:
        The prompt string with surrounding whitespace stripped. When
        `search_results` is empty the prompt ends with "CONTEXT:".

    Raises:
        KeyError: If a document lacks one of the required keys.
    """
    # The template is assembled line by line so that the indentation of this
    # source file does not leak into the prompt sent to the model.
    prompt_template = (
        "You're a course teaching assistant.\n"
        "Answer the QUESTION based on the CONTEXT.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "If the CONTEXT doesn't contain the answer, output IDK\n"
        "Answer in at most three sentences.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT:\n"
        "{context}"
    )

    # One entry per retrieved document, separated by a blank line.
    context = "\n\n".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}"
        for doc in search_results
    )

    # str.format only interprets braces in the template, so braces inside the
    # query or the documents are inserted literally.
    return prompt_template.format(question=query, context=context).strip()

In [8]:
def llm(prompt, model='gpt-4o', client=None):
    """Send `prompt` as a single user message and return the model's reply.

    Args:
        prompt: Full prompt text; sent as the only message, with role "user".
        model: Name of the chat model to query. Defaults to 'gpt-4o'.
        client: Optional pre-built client exposing
            `chat.completions.create`. Pass one to reuse a client across
            calls; when omitted, a new `OpenAI()` client is created for this
            call (presumably configured from the environment - verify).

    Returns:
        The text content of the first choice in the response.
    """
    if client is None:
        client = OpenAI()

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [9]:
def rag(query, search=None):
    """Answer `query` with retrieval-augmented generation.

    The pipeline is: retrieve documents, build a prompt from them with
    `build_prompt`, then ask the model via `llm`.

    Args:
        query: The user's question.
        search: Optional callable taking the query and returning the
            retrieved documents. Defaults to `es_search`; pass another
            retrieval function (for example the minsearch-based one) to
            switch backends without editing this function.

    Returns:
        The answer produced by `llm` for the assembled prompt.
    """
    if search is None:
        # Resolved at call time so a re-defined `es_search` cell is picked up
        # without having to re-run this cell.
        search = es_search

    search_results = search(query)
    prompt = build_prompt(query, search_results)
    return llm(prompt)

In [10]:
# Sample question used to exercise the RAG pipeline end to end.
query = 'if the course already started, may I still join?'

In [11]:
# Run the full pipeline for the sample question and print the returned answer.
print(rag(query))

Yes, even if the course has already started, you can still join and submit the homework. Be mindful of the deadlines for the final projects.
