In [1]:
import json

# Read the FAQ export explicitly as UTF-8: the FAQ text contains non-ASCII
# characters (e.g. typographic quotes), and the platform default encoding is
# not UTF-8 on every system.
with open('./documents.json', 'rt', encoding='utf-8') as f_in:
    documents_file = json.load(f_in)

# Flatten the per-course structure into one list, tagging each document with
# the course it belongs to.
documents = []

for course in documents_file:
    course_name = course['course']

    for doc in course['documents']:
        # Build a new dict rather than writing into `doc`, so the data loaded
        # into `documents_file` is left exactly as it was read.
        documents.append({**doc, 'course': course_name})

In [2]:
from elasticsearch import Elasticsearch

# Connect to the Elasticsearch node listening on localhost:9200.
es = Elasticsearch("http://localhost:9200")
# Ask the node for its basic info; as the last expression of the cell the
# response is displayed as the cell output.
es.info()

ObjectApiResponse({'name': 'f8c6e6fa4a94', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'gU-3p0O3TFyXtfctHoPYHA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [6]:
# Name of the index that stores every FAQ entry.
index_name = "course-questions"

# Single shard, no replicas. The free-text fields are mapped as `text`;
# `course` is mapped as `keyword`.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            **{field: {"type": "text"} for field in ("text", "section", "question")},
            "course": {"type": "keyword"},
        }
    },
}

# Drop any existing index of that name (a missing index is not an error),
# then create it again with the settings above.
response = es.indices.delete(index=index_name, ignore_unavailable=True)
response = es.indices.create(index=index_name, body=index_settings)

response

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [7]:
from tqdm.auto import tqdm

# Index the documents one request at a time; tqdm renders a progress bar.
# No explicit id is passed to `es.index`.
for doc in tqdm(documents):
    es.index(index=index_name, document=doc)

  from .autonotebook import tqdm as notebook_tqdm


100%|██████████| 948/948 [00:02<00:00, 407.72it/s]


In [8]:
user_question = "How do I join the course after it has started?"

# Ask for the 5 best hits. The `must` clause matches the question text against
# three fields (`question^3` boosts the question field by a factor of 3); the
# `filter` clause restricts the results to a single course.
search_query = {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": user_question,
                    "fields": ["question^3", "text", "section"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "data-engineering-zoomcamp"
                }
            }
        }
    }
}

response = es.search(index=index_name, body=search_query)

# Each hit carries the stored document under `_source`; print a short summary
# of it, with the answer cut to its first 60 characters.
for hit in response['hits']['hits']:
    doc = hit['_source']
    print(f"Section: {doc['section']}")
    print(f"Question: {doc['question']}")
    print(f"Answer: {doc['text'][:60]}...\n")

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to su...

Section: General course-related questions
Question: Course - Can I follow the course after it finishes?
Answer: Yes, we will keep all the materials after the course finishe...

Section: General course-related questions
Question: Course - What can I do before the course starts?
Answer: You can start by installing and setting up all the dependenc...

Section: General course-related questions
Question: How do I use Git / GitHub for this course?
Answer: After you create a GitHub account, you should clone the cour...

Section: Workshop 1 - dlthub
Question: How do I install the necessary dependencies to run the code?
Answer: Answer: To run the provided code, ensure that the 'dlt[duckd...



In [9]:
def retrieve_documents(query, index_name="course-questions", max_results=5,
                       course="data-engineering-zoomcamp", es_client=None):
    """Return the FAQ documents that best match ``query`` within one course.

    Args:
        query: Free-text user question.
        index_name: Elasticsearch index to search.
        max_results: Maximum number of hits to return.
        course: Value the ``course`` field must equal. The default keeps the
            behaviour of the earlier hard-coded filter.
        es_client: Optional client object exposing
            ``search(index=..., body=...)``. When omitted, a client for
            http://localhost:9200 is created for this call and closed again
            before returning.

    Returns:
        A list with the ``_source`` dict of every hit, in the order the
        search response lists them.
    """
    search_query = {
        "size": max_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    # Only close a client this function created; a caller-supplied client
    # stays open so it can be reused across calls.
    owns_client = es_client is None
    if owns_client:
        es_client = Elasticsearch("http://localhost:9200")

    try:
        response = es_client.search(index=index_name, body=search_query)
    finally:
        if owns_client:
            es_client.close()

    return [hit['_source'] for hit in response['hits']['hits']]

In [10]:
# Run a sample question through the retrieval helper.
user_question = "How do I join the course after it has started?"
response = retrieve_documents(user_question)

# One three-line summary per returned document; the answer is cut to its
# first 60 characters and followed by a blank line.
for doc in response:
    print(
        f"Section: {doc['section']}",
        f"Question: {doc['question']}",
        f"Answer: {doc['text'][:60]}...\n",
        sep="\n",
    )

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to su...

Section: General course-related questions
Question: Course - Can I follow the course after it finishes?
Answer: Yes, we will keep all the materials after the course finishe...

Section: General course-related questions
Question: Course - What can I do before the course starts?
Answer: You can start by installing and setting up all the dependenc...

Section: General course-related questions
Question: How do I use Git / GitHub for this course?
Answer: After you create a GitHub account, you should clone the cour...

Section: Workshop 1 - dlthub
Question: How do I install the necessary dependencies to run the code?
Answer: Answer: To run the provided code, ensure that the 'dlt[duckd...



In [67]:
user_question = "How do I execute a command in a running docker container?"

# Search the FAQ through the helper, using its default arguments.
response = retrieve_documents(user_question)

# `retrieve_documents` returns only each hit's `_source`, so the items here are
# the stored documents themselves (no `_score` is available on them).
# Print a short summary of each, with the answer cut to 60 characters.
for doc in response:
    print(f"Section: {doc['section']}")
    print(f"Question: {doc['question']}")
    print(f"Answer: {doc['text'][:60]}...\n")

{'took': 5, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 407, 'relation': 'eq'}, 'max_score': 56.655964, 'hits': [{'_index': 'course-questions', '_id': 'Pk0nRZABTi0tDO8K3PzR', '_score': 56.655964, '_source': {'text': 'In case running pgcli  locally causes issues or you do not want to install it locally you can use it running in a Docker container instead.\nBelow the usage with values used in the videos of the course for:\nnetwork name (docker network)\npostgres related variables for pgcli\nHostname\nUsername\nPort\nDatabase name\n$ docker run -it --rm --network pg-network ai2ys/dockerized-pgcli:4.0.1\n175dd47cda07:/# pgcli -h pg-database -U root -p 5432 -d ny_taxi\nPassword for root:\nServer: PostgreSQL 16.1 (Debian 16.1-1.pgdg120+1)\nVersion: 4.0.1\nHome: http://pgcli.com\nroot@pg-database:ny_taxi> \\dt\n+--------+------------------+-------+-------+\n| Schema | Name             | Type  | Owner |\n|--------+-------

In [11]:
def retrieve_documents_mlzoomcamp(query, index_name="course-questions", max_results=5,
                                  es_client=None):
    """Return the FAQ documents of "machine-learning-zoomcamp" that best match ``query``.

    Args:
        query: Free-text user question.
        index_name: Elasticsearch index to search.
        max_results: Maximum number of hits to return.
        es_client: Optional client object exposing
            ``search(index=..., body=...)``. When omitted, a client for
            http://localhost:9200 is created for this call and closed again
            before returning.

    Returns:
        A list with the ``_source`` dict of every hit, in the order the
        search response lists them.
    """
    search_query = {
        "size": max_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "machine-learning-zoomcamp"
                    }
                }
            }
        }
    }

    # Only close a client this function created; a caller-supplied client
    # stays open so it can be reused across calls.
    owns_client = es_client is None
    if owns_client:
        es_client = Elasticsearch("http://localhost:9200")

    try:
        response = es_client.search(index=index_name, body=search_query)
    finally:
        if owns_client:
            es_client.close()

    return [hit['_source'] for hit in response['hits']['hits']]

In [12]:
user_question = "How do I execute a command in a running docker container?"

# Same question, restricted to the machine-learning course and to 3 results.
response = retrieve_documents_mlzoomcamp(user_question, max_results=3)

# The helper returns only each hit's `_source`, so the items here are the
# stored documents themselves (no `_score` is available on them).
# Print a short summary of each, with the answer cut to 60 characters.
for doc in response:
    print(f"Section: {doc['section']}")
    print(f"Question: {doc['question']}")
    print(f"Answer: {doc['text'][:60]}...\n")

Section: 5. Deploying Machine Learning Models
Question: How do I debug a docker container?
Answer: Launch the container image in interactive mode and overridin...

Section: 5. Deploying Machine Learning Models
Question: How do I copy files from my local machine to docker container?
Answer: You can copy files from your local machine into a Docker con...

Section: 5. Deploying Machine Learning Models
Question: How do I copy files from a different folder into docker container’s working directory?
Answer: You can copy files from your local machine into a Docker con...



In [71]:
# Layout of one retrieved FAQ entry inside the prompt context.
context_template = """
Section: {section}
Question: {question}
Answer: {text}
""".strip()

# `user_question` is not set in this cell; it is whatever an earlier cell
# last assigned.
context_docs = retrieve_documents(user_question)

# Render every document, separate the entries with a blank line and trim the
# surrounding whitespace.
rendered_docs = [context_template.format(**doc) for doc in context_docs]
context = "\n\n".join(rendered_docs).strip()

print(f"{context=}")
print(len(context))

context="Section: Module 1: Docker and Terraform\nQuestion: PGCLI - running in a Docker container\nAnswer: In case running pgcli  locally causes issues or you do not want to install it locally you can use it running in a Docker container instead.\nBelow the usage with values used in the videos of the course for:\nnetwork name (docker network)\npostgres related variables for pgcli\nHostname\nUsername\nPort\nDatabase name\n$ docker run -it --rm --network pg-network ai2ys/dockerized-pgcli:4.0.1\n175dd47cda07:/# pgcli -h pg-database -U root -p 5432 -d ny_taxi\nPassword for root:\nServer: PostgreSQL 16.1 (Debian 16.1-1.pgdg120+1)\nVersion: 4.0.1\nHome: http://pgcli.com\nroot@pg-database:ny_taxi> \\dt\n+--------+------------------+-------+-------+\n| Schema | Name             | Type  | Owner |\n|--------+------------------+-------+-------|\n| public | yellow_taxi_data | table | root  |\n+--------+------------------+-------+-------+\nSELECT 1\nTime: 0.009s\nroot@pg-database:ny_taxi>\n\nSectio

In [13]:
# Layout of a single FAQ entry inside the prompt context; the placeholders are
# filled from a document's `section`, `question` and `text` keys.
context_template = """
Section: {section}
Question: {question}
Answer: {text}
""".strip()

# Full prompt sent to the model: instructions first, then the user's question,
# then the rendered FAQ entries. `.strip()` removes the newlines that surround
# the triple-quoted literal.
prompt_template = """
You're a course teaching assistant.
Answer the user QUESTION based on CONTEXT - the documents retrieved from our FAQ database.
Don't use other information outside of the provided CONTEXT.  

QUESTION: {user_question}

CONTEXT:

{context}
""".strip()


def build_context(documents):
    """Render every document with ``context_template`` and join them.

    Entries are separated by a blank line and the combined text is stripped
    of surrounding whitespace; an empty ``documents`` gives an empty string.
    """
    rendered = (context_template.format(**entry) for entry in documents)
    return "\n\n".join(rendered).strip()


def build_prompt(user_question, documents):
    """Fill ``prompt_template`` with the question and the context built from ``documents``."""
    return prompt_template.format(
        context=build_context(documents),
        user_question=user_question,
    )

def ask_ollama(prompt, model="llama3"):
    """Send ``prompt`` as a single user message to a local Ollama server.

    Args:
        prompt: Text used as the content of the one user message.
        model: Name of the model to query.

    Returns:
        The message content of the first choice in the chat-completion
        response.
    """
    # Imported here so the function works on a fresh kernel: elsewhere in this
    # notebook `OpenAI` is only imported in a cell that sits below the first
    # calls to this function.
    from openai import OpenAI

    # The OpenAI client is pointed at the local server's `/v1/` endpoint.
    # NOTE(review): the key looks like a placeholder the client requires but
    # the server does not check — confirm against the Ollama docs.
    client = OpenAI(
        base_url='http://localhost:11434/v1/',
        api_key='ollama',
    )

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content


def qa_bot(user_question):
    """Answer ``user_question``: retrieve FAQ documents, build the prompt, ask the model."""
    retrieved = retrieve_documents(user_question)
    return ask_ollama(build_prompt(user_question, retrieved))

In [79]:
# Run the full retrieve -> prompt -> model pipeline on a sample error message.
qa_bot("I'm getting invalid reference format: repository name must be lowercase")

'I see you\'re getting an error message "invalid reference format: repository name must be lowercase". According to our FAQ database, this is likely due to the way you\'re formatting your repository names. Specifically, Docker repository names should be in all lowercase.\n\nTry rewriting your command using all lowercase letters for the repository name. For example:\n```\ndocker run -it \\\n-e POSTGRES_USER="root" \\\n-e POSTGRES_PASSWORD="root" \\\n-e POSTGRES_DB="ny_taxi" \\\n-v "/c:/some/path/ny_taxi_postgres_data:/var/lib/postgresql/data" \\\n-p 5432:5432 \\\npostgres:13\n```\nMake sure to replace `"/c:/some/path/` with the actual path where your data is located. If you\'re still having issues, try one of the alternative formats listed in the original response:\n```\n-v /”$(pwd)”/ny_taxi_postgres_data:/var/lib/postgresql/data\n```\nI hope this helps! Let me know if you have any further questions.'

In [80]:
# Second end-to-end check, this time with a connection problem as the question.
qa_bot("I can't connect to postgres port 5432, my password doesn't work")

'Based on the provided context, I understand that you\'re experiencing issues connecting to Postgres using port 5432.\n\nYour current issue is:\n\n"I can\'t connect to postgres port 5432, my password doesn\'t work"\n\nFrom the FAQ database, I found a possible solution for you:\n\n"If this error is still persistent , kindly check if you have a service in windows running postgres , Stopping that service will resolve the issue\nOr Changing the port from 5432:5432 to 5431:5432 might help. Try substituting the new port number in your connection string"\n\nIn other words, it\'s possible that another Postgres instance is using port 5432 on your machine, causing the authentication failure. Stopping this service or switching to a different port (like 5431) might resolve the issue.\n\nPlease try these suggestions and let me know if you need further assistance!'

In [81]:
# Third end-to-end check with a short, open-ended question.
qa_bot("how can I run kafka?")

'Based on the provided context from the FAQ database, I can answer your question about how to run Kafka.\n\nThe relevant section is:\n\nSection: Module 6: streaming with kafka\nQuestion: Java Kafka: How to run producer/consumer/kstreams/etc in terminal\nAnswer: In the project directory, run:\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n\nThis answer provides instructions on how to run a Java-based Kafka application (producer or consumer) from the terminal.'

In [55]:
# NOTE(review): this rebinds `prompt_template`, the name `build_prompt` reads,
# to a version with the question written out literally instead of a
# `{user_question}` placeholder. Any `qa_bot` call made after this cell runs
# will embed this fixed question regardless of its argument — confirm that
# this is intended.
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: How do I execute a command in a running docker container?

CONTEXT:
{context}
""".strip()

In [72]:
# Character count of `prompt_template` as currently bound — this is the raw
# template, not a formatted prompt with the context filled in.
len(prompt_template)

253

In [57]:
import os 


Didn't use OpenAI, since free accounts don't have API access. Used Ollama with llama3 instead.

In [21]:
# Question sent to the model on its own, without any FAQ context.
prompt = "The course already started. Can I still join?"

In [82]:
from openai import OpenAI

# Point the OpenAI client at the server on localhost:11434 (`/v1/` endpoint).
client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',
)

# Send `prompt` as a single user message, with no FAQ context attached.
response = client.chat.completions.create(
    model='llama3',
    messages=[{"role": "user", "content": prompt}]
)

# Show the text of the first choice as the cell output.
response.choices[0].message.content

'I\'m just an AI, I don\'t have any information about a specific course that has already started. However, I can provide some general guidance.\n\nIf the course has already started, it\'s generally not possible to officially "join" because you would have missed the initial sessions and introduction. But it\'s always worth reaching out to the course organizer or instructor to inquire about their policy on late enrollment or joining a course that has already begun.\n\nSome possibilities might include:\n\n1. Contacting the instructor or course administrator: Reach out to them directly via email or phone and ask if they can accommodate a late enrollment. They may be willing to make an exception, especially if you\'re willing to catch up on missed material.\n2. Requesting permission to audit the course: If you\'re not able to officially enroll in the course, you might consider requesting permission to audit it instead. This would allow you to attend sessions and participate without earning 