In [10]:
!pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.1.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [55]:
from dotenv import load_dotenv
import os
from openai import OpenAI

In [56]:
load_dotenv()

True

In [57]:
client = OpenAI()

In [58]:
# Baseline call: the question is sent on its own, with no FAQ context attached.
response = client.responses.create(
    model="gpt-4.1",
    input=[{"role": "user", "content": "Is it too late to join the course ?"}],
)

In [21]:
response.output_text

'Could you clarify which course you are referring to? If you provide the course name or details, I can help check deadlines and enrollment options for you!'

In [14]:
import minsearch

In [15]:
document_url = "https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/refs/heads/main/01-intro/documents.json"

In [16]:
import requests
import json

In [17]:
# Download the FAQ documents as JSON.
# - `timeout` stops the cell from hanging indefinitely on a stalled connection.
# - The error is reported and then re-raised, so later cells never run against
#   an undefined (or stale) `data` variable.
try:
    response = requests.get(document_url, timeout=30)
    response.raise_for_status()
    print("Status Code: ", response.status_code)
    print("content-Type: ", response.headers.get('Content-Type'))
    data = response.json()
except requests.exceptions.RequestException as e:
    print(f"Error fetching file: {e}")
    raise

Status Code:  200
content-Type:  text/plain; charset=utf-8


In [18]:
# Flatten the per-course FAQ lists into one list, tagging every entry with the
# name of the course it belongs to.
documents = []

for course in data:
    course_name = course['course']
    for item in course['documents']:
        item['course'] = course_name
    documents.extend(course['documents'])

In [19]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [45]:
index = minsearch.Index(
    text_fields = ['text','section','question'],
    keyword_fields = ['course']
)

In [46]:
q = "can still enroll in the course ?"

In [47]:
index.fit(documents)

<minsearch.minsearch.Index at 0x790cb53e66c0>

In [48]:
# Per-field boosts handed to minsearch via `boost_dict`.
boost = dict(question=3.0, section=0.5)

# Top 5 matches for `q`, restricted to the data-engineering course.
results = index.search(
    query=q,
    filter_dict={'course': 'data-engineering-zoomcamp'},
    boost_dict=boost,
    num_results=5,
)


In [49]:
results

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 202

In [50]:
# Prompt skeleton with two placeholders: {question} and {context}.
# The leading/trailing newlines are removed later by `.strip()` after formatting.
prompt_template = (
    "\n"
    "You're course teaching assistant. Answer the question based on the CONTEXT from the faq database.\n"
    "use only the facts from the CONTEXT when answering the QUESTION\n"
    "if CONTEXT doesn't contain the answer, output NONE\n"
    "\n"
    "QUESTION: {question}\n"
    "\n"
    "CONTEXT: {context}\n"
)

In [51]:
# Render each retrieved FAQ entry as a small text block and append it to the context.
context = ""

for doc in results:
    context += (
        f"section: {doc['section']}\n"
        f"question:{doc['question']}\n"
        f"answer:{doc['text']}\n\n"
    )

In [52]:
prompt = prompt_template.format(question=q,context=context).strip()

In [53]:
prompt

"You're course teaching assistant. Answer the question based on the CONTEXT from the faq database.\nuse only the facts from the CONTEXT when answering the QUESTION\nif CONTEXT doesn't contain the answer, output NONE\n\nQUESTION: can still enroll in the course ?\n\nCONTEXT: section: General course-related questions\nquestion:Course - Can I still join the course after the start date?\nanswer:Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.\n\nsection: General course-related questions\nquestion:Course - Can I follow the course after it finishes?\nanswer:Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone pro

In [59]:
# Same model call as the baseline, but now the user message is the
# context-augmented prompt built above.
response = client.responses.create(
    model="gpt-4.1",
    input=[{"role": "user", "content": prompt}],
)

In [60]:
response.output_text

"Yes, even if you don't register, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute."

# 1.5 Clean Code Version below

In [61]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Look up FAQ entries for `query` in the in-memory minsearch index.

    Args:
        query: Free-text question to search for.
        course: Value the `course` keyword field must equal. The default
            keeps the previously hard-coded data-engineering filter.
        num_results: Maximum number of entries to return (default 5, as before).

    Returns:
        Whatever `index.search` returns for these arguments (a list of FAQ
        dicts in this notebook).
    """
    # Per-field boosts passed through to minsearch.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        boost_dict=boost,
        num_results=num_results,
        filter_dict={'course': course},
    )

In [62]:
def build_prompt(query, search_results):
    """Build the LLM prompt for `query` from the retrieved FAQ entries.

    Args:
        query: The user's question; substituted for {question}.
        search_results: Iterable of dicts with 'section', 'question' and
            'text' keys; each one becomes a block of the CONTEXT.

    Returns:
        The formatted prompt with surrounding whitespace stripped.
    """
    # The template is assembled from flush-left lines on purpose: a triple-quoted
    # string indented with the function body would put four leading spaces on
    # every prompt line.
    prompt_template = (
        "You're course teaching assistant. Answer the question based on the CONTEXT from the faq database.\n"
        "use only the facts from the CONTEXT when answering the QUESTION\n"
        "if CONTEXT doesn't contain the answer, output NONE\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: {context}"
    )

    context = "".join(
        f"section: {doc['section']}\nquestion:{doc['question']}\nanswer:{doc['text']}\n\n"
        for doc in search_results
    )

    return prompt_template.format(question=query, context=context).strip()

In [63]:
def llm(prompt):
    """Send `prompt` as a single user message to gpt-4.1 and return the reply text."""
    user_message = {"role": "user", "content": prompt}
    completion = client.responses.create(model="gpt-4.1", input=[user_message])
    return completion.output_text

In [57]:
def rag(query):
    """Answer `query` end to end: search the FAQ, build the prompt, ask the LLM."""
    return llm(build_prompt(query, search(query)))

In [58]:
query = "How do i run kafka ?"

In [59]:
rag(query)

'To run Kafka:\n\n- For Java Kafka, from the project directory, use:\n  ```\n  java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n  ```\n- For Python Kafka, create a virtual environment, activate it, and then run the python files:\n  ```\n  python -m venv env\n  source env/bin/activate\n  pip install -r ../requirements.txt\n  ```\n  (On Windows, activate with: env\\Scripts\\activate)\n\nDocker images should first all be up and running if using the docker setup.\n\n(Note: use only the steps from CONTEXT above; replace <jar_name> with your actual jar name.)'

# 1.6 With Elasticsearch

In [8]:
from elasticsearch import Elasticsearch

In [9]:
es_client = Elasticsearch('http://localhost:9200')

In [10]:
es_client.info()

ObjectApiResponse({'name': '625c979509fb', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'fXXSBbItTMmZlV-lojEKdg', 'version': {'number': '9.0.1', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '73f7594ea00db50aa7e941e151a5b3985f01e364', 'build_date': '2025-04-30T10:07:41.393025990Z', 'build_snapshot': False, 'lucene_version': '10.1.0', 'minimum_wire_compatibility_version': '8.18.0', 'minimum_index_compatibility_version': '8.0.0'}, 'tagline': 'You Know, for Search'})

In [11]:
# Settings and field mappings used when creating the Elasticsearch index below.
index_settings = {
    # One shard and no replicas.
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    # text/section/question are mapped as "text" (they are the fields the
    # multi_match queries in this notebook search); course is mapped as
    # "keyword" and is used in the `term` filters.
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

In [12]:
index_name = "course-questions"

In [13]:
# Elasticsearch state outlives the kernel, so a plain create fails on re-run once
# the index exists. Drop any leftover index first (a no-op when it is absent),
# then create it fresh. The indexing cell must be run again after this one.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [22]:
!pip install tqdm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [25]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [26]:
# Send every FAQ document to the index, one request per document; tqdm wraps
# the iteration to show progress.
# NOTE(review): no explicit document id is passed, so re-running this cell
# presumably stores a second copy of every document — confirm, or recreate the
# index before re-running.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|█████████████████████████████████████████████████████████████████████████████████| 948/948 [00:04<00:00, 217.87it/s]


In [39]:
def elastic_search(query, course="data-engineering-zoomcamp", size=5):
    """Search the Elasticsearch FAQ index and return the matching documents.

    Args:
        query: Free-text question, matched against question (boosted x3),
            text and section with a `best_fields` multi_match.
        course: Exact value the `course` field must have. The default keeps
            the previously hard-coded data-engineering filter.
        size: Maximum number of hits to return (default 5, as before).

    Returns:
        A list with the `_source` dict of each hit, in the order returned.

    Note:
        Later cells in this notebook define `elastic_search` again; whichever
        definition ran last is the one `elastic_rag` will call.
    """
    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    # Keep only the stored documents; scores and ids are dropped.
    return [hit['_source'] for hit in response['hits']['hits']]
    

In [43]:
def elastic_rag(query):
    """RAG pipeline backed by Elasticsearch: retrieve, build the prompt, ask the LLM."""
    retrieved_docs = elastic_search(query)
    return llm(build_prompt(query, retrieved_docs))

In [None]:
query = 'I have just came across this course can i still join ?'

In [64]:
elastic_rag(query)

"Yes, you can still join the course even if you came across it after the start date. You are eligible to submit the homeworks even if you don't register, but be mindful of the deadlines for turning in the final projects. Don't leave everything for the last minute."

In [71]:
# Q3: Now let's search in our index.

# We will execute a query "How do execute a command on a Kubernetes pod?".

# Use only question and text fields and give question a boost of 4, and use "type": "best_fields".

# What's the score for the top ranking result?

# 84.50
# 64.50
# 44.50
# 24.50
# Look at the _score field.
query = 'How do execute a command on a Kubernetes pod?'

In [72]:
def elastic_search(query):
    """Search question (boosted x4) and text across all courses; return up to 5 documents.

    The raw Elasticsearch response is printed as well, so the `_score` of each
    hit can be read from the cell output.
    """
    multi_match = {
        "query": query,
        "fields": ["question^4", "text"],
        "type": "best_fields",
    }
    search_query = {
        "size": 5,
        "query": {"bool": {"must": {"multi_match": multi_match}}},
    }

    response = es_client.search(index=index_name, body=search_query)
    print(response)

    # Return just the stored documents of each hit.
    return [hit['_source'] for hit in response['hits']['hits']]

In [73]:
elastic_search(query)

{'took': 18, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 739, 'relation': 'eq'}, 'max_score': 44.50556, 'hits': [{'_index': 'course-questions', '_id': 'Fw1zYpcB_Qgn4GzDkSoW', '_score': 44.50556, '_source': {'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)', 'section': '5. Deploying Machine Learning Models', 'question': 'How do I debug a docker container?', 'course': 'machine-learning-zoomcamp'}}, {'_index': 'course-questions', '_id': 'pg1zYpcB_Qgn4GzDkyog', '_score': 35.433445, '_source': {'text': 'Deploy and Access the Kubernetes Dashboard\nLuke', 'section': '10. Kubernetes and TensorFlow Serving', 'question': 'Kubernetes-dashboard', '

[{'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)',
  'section': '5. Deploying Machine Learning Models',
  'question': 'How do I debug a docker container?',
  'course': 'machine-learning-zoomcamp'},
 {'text': 'Deploy and Access the Kubernetes Dashboard\nLuke',
  'section': '10. Kubernetes and TensorFlow Serving',
  'question': 'Kubernetes-dashboard',
  'course': 'machine-learning-zoomcamp'},
 {'text': 'You can copy files from your local machine into a Docker container using the docker cp command. Here\'s how to do it:\nIn the Dockerfile, you can provide the folder containing the files that you want to copy over. The basic syntax is as follows:\nCOPY ["src/predict.py", "models/xgb_model.bin", "./"]\t\t\t\t\t

# Q4 Filtering
Now ask a different question: "How do copy a file to a Docker container?".

This time we are only interested in questions from machine-learning-zoomcamp.

Return 3 results. What's the 3rd question returned by the search engine?

How do I debug a docker container?
How do I copy files from a different folder into docker container’s working directory?
How do Lambda container images work?
How can I annotate a graph?

In [75]:
def elastic_search(query):
    """Return the top 3 machine-learning-zoomcamp documents matching `query`.

    Matches question (boosted x3), text and section with a `best_fields`
    multi_match and filters on the exact course name.
    """
    match_clause = {
        "multi_match": {
            "query": query,
            "fields": ["question^3", "text", "section"],
            "type": "best_fields",
        }
    }
    course_filter = {"term": {"course": "machine-learning-zoomcamp"}}
    search_query = {
        "size": 3,
        "query": {"bool": {"must": match_clause, "filter": course_filter}},
    }

    hits = es_client.search(index=index_name, body=search_query)['hits']['hits']

    # Strip the Elasticsearch metadata and keep only the stored documents.
    return [hit['_source'] for hit in hits]
    

In [76]:
elastic_search('How do copy a file to a Docker container?')

[{'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)',
  'section': '5. Deploying Machine Learning Models',
  'question': 'How do I debug a docker container?',
  'course': 'machine-learning-zoomcamp'},
 {'text': "You can copy files from your local machine into a Docker container using the docker cp command. Here's how to do it:\nTo copy a file or directory from your local machine into a running Docker container, you can use the `docker cp command`. The basic syntax is as follows:\ndocker cp /path/to/local/file_or_directory container_id:/path/in/container\nHrithik Kumar Advani",
  'section': '5. Deploying Machine Learning Models',
  'question': 'How do I copy files from my local machine to docker container?',
 