In [1]:
# !wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

In [1]:
import minsearch

In [2]:
import json

In [3]:
# Load the raw FAQ dump. JSON text is UTF-8, so pin the encoding instead of
# relying on the platform's default locale encoding.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [4]:
# Flatten the per-course groups into one list of FAQ entries, stamping each
# entry with the course it belongs to.
documents = []
for course_dict in docs_raw:
    for doc in course_dict['documents']:
        # update() writes into the entry itself, so the copy held by docs_raw
        # gains the 'course' key too.
        doc.update(course=course_dict['course'])
        documents.append(doc)

In [5]:
# 'course' is declared as a keyword field; search() below filters on it via
# filter_dict, while the three text fields carry the free-text content.
index = minsearch.Index(
    text_fields=['question', 'text', 'section'],
    keyword_fields=['course'],
)

In [6]:
# Build the index over the flattened FAQ entries. fit() returns an Index
# object, which is what the cell displays.
index.fit(documents)

<minsearch.Index at 0x7e0584c40730>

In [7]:
from openai import OpenAI

In [10]:
# NOTE(review): no API key is passed explicitly — presumably the client reads
# it from the environment; confirm before running.
client = OpenAI()

In [11]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Look up FAQ entries for ``query`` in the minsearch ``index``.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` keyword field must match. Defaults to
            ``'data-engineering-zoomcamp'``, the value that was previously
            hard-coded.
        num_results: Maximum number of entries to request (default 5, as
            before).

    Returns:
        Whatever ``index.search`` returns for these arguments.
    """
    # Matches in 'question' are weighted up and matches in 'section' down;
    # 'text' is left without an explicit boost.
    boost = {'question': 3.0, 'section': 0.5}
    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [12]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt for ``query`` from the retrieved FAQ entries.

    Each entry in ``search_results`` must provide the keys ``section``,
    ``question`` and ``text``; the entries are rendered one after another,
    separated by blank lines, under the CONTEXT heading. Returns the prompt
    as a string with surrounding whitespace stripped.
    """
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database. 
Use only the facts from the CONTEXT when answering the question.

QUESTION : {question}

CONTEXT:
{context}
""".strip()

    entries = [
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    ]
    # The final strip() drops the blank lines left after the last entry.
    return prompt_template.format(question=query, context="".join(entries)).strip()

In [13]:
def llm(prompt, model='gpt-4o'):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text, sent as the only message (role ``user``).
        model: Chat model name. Defaults to ``'gpt-4o'``, the value that was
            previously hard-coded.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{'role': 'user', 'content': prompt}],
    )
    return response.choices[0].message.content


In [14]:
query = 'how do I run kafka?'
def rag(query):
    """Answer ``query``: retrieve with ``search``, build the prompt, ask the LLM."""
    return llm(build_prompt(query, search(query)))

In [15]:
rag('the course has already started, can I still enroll?')

"Yes, you can still enroll in the course even after it has started. You are eligible to submit the homework, but be mindful of the deadlines for turning in the final projects. It's advised not to leave everything to the last minute."

In [17]:
from elasticsearch import Elasticsearch

In [45]:
# Client for the Elasticsearch node expected at localhost:9200 (plain HTTP,
# no credentials passed).
es_client = Elasticsearch('http://localhost:9200')

In [48]:
es_client.info()

ObjectApiResponse({'name': '9fbe22548c63', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'wU2FPpDNTjmBOs2UEeI3Vw', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [49]:
# Single shard, no replicas, and an explicit mapping for the four fields each
# FAQ entry carries. 'course' is mapped as a keyword; the three content fields
# are mapped as text.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course_questions"

# The Elasticsearch node outlives the notebook kernel, so on any run after the
# first the index already exists and create() fails. Drop any previous copy
# first so this cell can be re-run from a fresh kernel.
# WARNING: this discards everything previously stored in the index; the
# indexing cell has to be run again afterwards.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course_questions'})

In [50]:
from tqdm.auto import tqdm

In [51]:
# Send the FAQ entries to Elasticsearch one at a time (one index call per
# entry), with a progress bar.
# NOTE(review): no document id is supplied — presumably re-running this cell
# adds duplicate entries to the index; confirm.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 948/948 [00:28<00:00, 33.71it/s]


In [52]:
query = 'I just discovered the course. Can I still join?'

In [53]:
def elastic_search(query, course='data-engineering-zoomcamp', num_results=5):
    """Look up FAQ entries for ``query`` in the Elasticsearch index ``index_name``.

    Args:
        query: Free-text question to search for.
        course: Exact value the ``course`` field must have. Defaults to
            ``'data-engineering-zoomcamp'``, the value that was previously
            hard-coded.
        num_results: Value sent as the request's ``size`` (default 5, as
            before).

    Returns:
        A list with the ``_source`` dict of every hit, in response order.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # "^3" boosts the question field relative to the others.
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields",
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                },
            }
        },
    }
    response = es_client.search(index=index_name, body=search_query)
    return [hit['_source'] for hit in response['hits']['hits']]

In [54]:
elastic_search('How do I run kafka?')

[{'text': "Answer: To run the provided code, ensure that the 'dlt[duckdb]' package is installed. You can do this by executing the provided installation command: !pip install dlt[duckdb]. If you’re doing it locally, be sure to also have duckdb pip installed (even before the duckdb package is loaded).",
  'section': 'Workshop 1 - dlthub',
  'question': 'How do I install the necessary dependencies to run the code?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'In the project directory, run:\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java',
  'section': 'Module 6: streaming with kafka',
  'question': 'Java Kafka: How to run producer/consumer/kstreams/etc in terminal',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'After you create a GitHub account, you should clone the course repo to your local machine using the process outlined in this video: Git for Everybody: How to Clone a Repository from GitHub\nHaving this local repositor

In [55]:
query = 'how do I run kafka?'
def rag(query):
    """Answer ``query`` using Elasticsearch retrieval.

    Same pipeline shape as the earlier ``rag`` in this notebook, which this
    definition replaces once the cell runs; only the retrieval step differs.
    """
    context_docs = elastic_search(query)
    return llm(build_prompt(query, context_docs))

In [56]:
rag(query)

'To run Kafka, based on the provided context from Module 6: streaming with kafka, you can follow the instructions specific to running Java Kafka components like producer/consumer/kstreams in the terminal. Here’s the relevant information extracted from the context:\n\n1. For running a Java Kafka component like a producer in the terminal, navigate to your project directory.\n2. Execute the following command:\n   ```plaintext\n   java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n   ```\n\nEnsure you replace `<jar_name>` with the actual name of your jar file.\n\nIf you are dealing with a Python Kafka producer, make sure to:\n1. Create a virtual environment:\n   ```plaintext\n   python -m venv env\n   source env/bin/activate  # On Windows use env\\Scripts\\activate\n   ```\n2. Install the necessary dependencies:\n   ```plaintext\n   pip install -r ../requirements.txt\n   ```\n\n3. Activate the virtual environment whenever you need to run your K