# Introduction

In [47]:
from utils.search import minsearch
import json
from openai import OpenAI
from dotenv import load_dotenv
import os

# Load the .env file located two directories above the working directory.
dotenv_path = "./../../.env"
load_dotenv(dotenv_path)
# os.environ.get returns None (it does not raise) when OPENAI_API_KEY is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [2]:
# Load the raw FAQ data. The encoding is given explicitly: JSON text is UTF-8,
# the records contain non-ASCII punctuation (curly quotes), and the platform
# default encoding used by open() is not guaranteed to be UTF-8.
with open('./../data/documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [3]:
# Flatten the per-course structure into a single list of FAQ records.
documents = []

for course_dict in docs_raw:
    # Tag every record (in place) with the course it belongs to ...
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
    # ... then add the whole course's records to the flat list in one step.
    documents.extend(course_dict['documents'])

In [4]:
# Inspect the first flattened record; it should now carry a 'course' key.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [25]:
# Create the in-memory search index: "question", "text" and "section" are
# registered as text fields, "course" as a keyword field.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

In [26]:
# Sample user question reused by the index search and the context-free LLM call below.
q = 'the course has already started, can I still enroll?'

In [27]:
# Fit the index on the flattened FAQ records.
index.fit(documents)

<utils.search.minsearch.Index at 0x2d791ef3810>

In [48]:
# Per-field boosts: "question" gets 5.0 and "section" 0.5.
# NOTE(review): "text" is not listed — presumably it keeps minsearch's default weight; confirm.
boost = {"question": 5.0, "section": 0.5}

# No filter_dict is passed here, so hits are not restricted to a single course.
results = index.search(
    query=q,
    boost_dict=boost,
    num_results=5
)

results

[{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'section': 'General course-related questions',
  'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp'},
 {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the cour

# RAG

In [28]:
# OpenAI client configured with the key read from the environment earlier.
client = OpenAI(api_key = OPENAI_API_KEY)

In [29]:
# Baseline: send the bare question to the model with no FAQ context attached.
response = client.chat.completions.create(
    model='gpt-4o',
    messages=[{"role": "user", "content": q}]
)

# Display the text of the first returned choice.
response.choices[0].message.content

"It depends on the specific course and institution offering it. Here are some factors to consider:\n\n1. **Course Enrollment Policies:** Check with the institution or course administrator. Some courses have a fixed enrollment period, while others may allow late registration.\n  \n2. **Availability:** Ensure there are still open spots in the course. Popular courses may fill up quickly.\n\n3. **Prerequisites and Preparedness:** Make sure you meet any prerequisites and consider how much of the course material you have missed. Catching up may require extra effort.\n\n4. **Impact on Grades:** Determine if the institution allows late enrollees to make up for missed work and whether this might impact your grade.\n\n5. **Instructor Approval:** Sometimes, enrolling late requires approval from the instructor, who can provide guidance on how to catch up with coursework.\n\n6. **Fees and Deadlines:** Be aware of any additional fees that might apply to late registration and check deadlines for addi

In [30]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Look up FAQ records for ``query`` in the in-memory ``index``.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` keyword field must match. Defaults to
            the course that used to be hard-coded, so ``search(query)``
            behaves as before.
        num_results: Maximum number of records requested from the index.

    Returns:
        Whatever ``index.search`` returns for these arguments.
    """
    # "question" matches count more than "section" matches.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [31]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from a question and retrieved FAQ records.

    Args:
        query: The user's question; substituted for QUESTION in the prompt.
        search_results: Iterable of dicts, each providing the keys
            'section', 'question' and 'text'.

    Returns:
        The filled-in prompt with surrounding whitespace stripped.
    """
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    # One "section / question / answer" entry per record, each followed by a blank line.
    entries = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    ]

    # The final strip() removes the blank line left after the last entry.
    return template.format(question=query, context="".join(entries)).strip()

In [32]:
def llm(prompt, model='gpt-4o'):
    """Send ``prompt`` as a single user message and return the model's reply.

    Args:
        prompt: Text sent as the content of the only message (role "user").
        model: Name of the chat model to call. Defaults to the model that
            used to be hard-coded, so ``llm(prompt)`` behaves as before.

    Returns:
        The message content of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [33]:
# Example question passed to rag() in the next cell.
query = 'how do I run kafka?'

def rag(query):
    """Answer ``query`` by chaining search -> build_prompt -> llm.

    The records returned by ``search(query)`` are turned into a prompt by
    ``build_prompt`` and that prompt is passed to ``llm``; the value
    returned by ``llm`` is returned unchanged.
    """
    return llm(build_prompt(query, search(query)))

In [34]:
# Run the full retrieve -> prompt -> LLM pipeline on the example question.
rag(query)

'To run Kafka, follow these steps based on the provided CONTEXT:\n\nIf you are running a Java producer/consumer/kstreams application in the terminal:\n\nIn the project directory, execute the following command:\n```bash\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n```\n\nIf you are running a Python producer:\n\n1. Create and activate a virtual environment:\n   ```bash\n   python -m venv env\n   source env/bin/activate\n   ```\n\n2. Install the required packages:\n   ```bash\n   pip install -r ../requirements.txt\n   ```\n\n3. Make sure the docker images for your Kafka setup are up and running.\n\nTo activate the virtual environment later:\n```bash\nsource env/bin/activate\n```\n\nTo deactivate the virtual environment when done:\n```bash\ndeactivate\n```\n\nFor Windows, the activation path differs:\n```bash\nenv/Scripts/activate\n```\n\nMake sure you adjust paths and filenames according to your specific setup and operating system.'

In [35]:
# Same question as the earlier context-free LLM call, now answered from retrieved FAQ entries.
rag('the course has already started, can I still enroll?')

'Yes, you can still enroll in the course even if it has already started. You are eligible to submit the homeworks, but keep in mind that there will be deadlines for the final projects. So, make sure to manage your time effectively.'

In [36]:
# Re-display the first record as a reminder of the document schema.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

# ElasticSearch

In [49]:
from elasticsearch import Elasticsearch

In [50]:
# Client for an Elasticsearch node expected at localhost:9200.
es_client = Elasticsearch('http://localhost:9200') 

In [51]:
# Index definition: a single primary shard with no replicas. The three
# free-text fields are mapped as "text"; "course" is mapped as "keyword",
# which is what the term filter in the search query matches against.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Creating an index that already exists raises an error, which made this cell
# fail on every re-run. Drop any index left over from a previous run first;
# ignore_unavailable=True turns the delete into a no-op when there is none.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [52]:
# Show the record shape that is about to be sent to Elasticsearch.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [53]:
from tqdm.auto import tqdm

In [54]:
# Send every FAQ record to the index, one request per record, with a progress bar.
# NOTE(review): no explicit id is passed, so re-running this cell presumably adds
# duplicate documents to the index — confirm against the Elasticsearch index API.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [55]:
# Example question for the Elasticsearch-backed pipeline. The spelling of
# "discovered" is corrected because a misspelled term cannot match the FAQ text lexically.
query = 'I just discovered the course. Can I still join it?'

In [56]:
def elastic_search(query, course="data-engineering-zoomcamp", num_results=5):
    """Look up FAQ records for ``query`` in the Elasticsearch index.

    Args:
        query: Free-text question to search for.
        course: Exact value the ``course`` field must have. Defaults to the
            course that used to be hard-coded, so ``elastic_search(query)``
            behaves as before.
        num_results: Value sent as the request's ``size`` (previously fixed at 5).

    Returns:
        A list with the ``_source`` of every hit, in the order returned.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                # Match the question text across three fields; "question^3"
                # weights matches in the question field three times as much.
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                # Restrict hits to the requested course.
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    # Keep only the stored documents, dropping the per-hit metadata.
    return [hit['_source'] for hit in response['hits']['hits']]

In [57]:
def rag(query):
    """Answer ``query`` using Elasticsearch for retrieval.

    Replaces the earlier ``rag`` definition: retrieval goes through
    ``elastic_search``; prompt construction and the LLM call are unchanged.
    """
    retrieved = elastic_search(query)
    return llm(build_prompt(query, retrieved))

In [58]:
# Ask the example question through the Elasticsearch-backed pipeline.
rag(query)

"Yes, you can still join the course. Even if you haven't registered, you're eligible to submit the homework. Just be aware of the deadlines for turning in the final projects, and try not to leave everything for the last minute."