In [2]:
import json

In [4]:
with open("./documents.json", "rt") as f_in:
    documents_file = json.load(f_in)


In [5]:
documents_file[0]["documents"][0]


{'text': "Data Engineering Zoomcamp FAQ\nData Engineering Zoomcamp FAQ\nThe purpose of this document is to capture Frequently asked technical questions\nEditing guidelines:\nWhen adding a new FAQ entry, make sure the question is “Heading 2”\nFeel free to improve if you see something is off\nDon’t change the formatting in the Data document or add any visual “improvements” (make a copy for yourself first if you need to do it for whatever reason)\nDon’t change the pages format (it should be “pageless”)\nAdd name and date for reference, if possible\nThe next cohort starts January 13th 2025. More info at DTC.\nRegister before the course starts using this link.\nJoint the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When does the course start?'}

In [6]:
documents = []

for course in documents_file:
    course_name = course["course"]

    for doc in course["documents"]:
        doc["course"] = course_name
        documents.append(doc)


In [None]:
import time
from elasticsearch import Elasticsearch, ConnectionError

es = Elasticsearch("http://localhost:9200")
index_name = "course-questions"

for _ in range(30):
    try:
        if es.ping():
            print("✔ ES reachable")
            break
    except ConnectionError:
        pass
    print("…waiting for ping()…")
    time.sleep(1)
else:
    raise RuntimeError("ES did not respond to ping in 30s")

✔ ES reachable


In [47]:
if not es.indices.exists(index="course-questions"):
    print("Index 'course-questions' does not exist—creating it now.")
    index_settings = {
        "settings": {"number_of_shards": 1, "number_of_replicas": 0},
        "mappings": {
            "properties": {
                "text": {"type": "text"},
                "section": {"type": "text"},
                "question": {"type": "text"},
                "course": {"type": "keyword"},
            }
        },
    }
    es.indices.create(index="course-questions", body=index_settings)
    print("✔ Created index 'course-questions'")
else:
    print("✔ Index 'course-questions' already exists.")


✔ Index 'course-questions' already exists.


In [48]:
health = es.cluster.health(
    index="course-questions",
    wait_for_status="yellow",
    timeout="30s",
    level="indices",
)
status = health["indices"]["course-questions"]["status"]
print(f"Index 'course-questions' is now {status!r}")

if status not in ("yellow", "green"):
    raise RuntimeError(f"Index stayed {status!r} after waiting 30 seconds.")


Index 'course-questions' is now 'green'


In [49]:
settings = es.indices.get_settings(index="course-questions")
readonly_flag = settings["course-questions"]["settings"]["index"].get(
    "blocks.read_only_allow_delete"
)
print("Current blocks.read_only_allow_delete flag:", readonly_flag)


Current blocks.read_only_allow_delete flag: None


In [50]:
from elasticsearch.helpers import bulk, BulkIndexError

actions = [{"_index": "course-questions", "_source": doc} for doc in documents]
es_with_timeout = es.options(request_timeout=600)

try:
    success_count, errors = bulk(
        client=es_with_timeout,
        actions=actions,
        chunk_size=500,
    )
    print(f"Successfully indexed {success_count} documents into 'course-questions'.")
    if errors:
        print("Bulk returned errors (unexpected):")
        for err in errors[:5]:  # show up to the first 5
            print(err)
except BulkIndexError as bulk_err:
    print(f"Bulk indexing failed: {bulk_err.args[0]}")
    for i, item_err in enumerate(bulk_err.errors[:10], start=1):
        print(f"\nError #{i}:")
        print(item_err)
    raise


Successfully indexed 1217 documents into 'course-questions'.


In [51]:
doc_count_response = es.count(index="course-questions")
count = doc_count_response["count"]
print(f"Total documents in 'course-questions': {count}")


Total documents in 'course-questions': 2434


In [55]:
def retrieve_documents(query, index_name="course-questions", max_results=5):
    search_query = {
        "size": max_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields",
                    }
                },
                "filter": {"term": {"course": "data-engineering-zoomcamp"}},
            }
        },
    }

    response = es.search(index=index_name, body=search_query)
    documents = [hit["_source"] for hit in response["hits"]["hits"]]
    return documents


In [56]:
user_question = "How do I join the course after it has started?"

response = retrieve_documents(user_question)

for doc in response:
    print(
        f"Section: {doc['section']}\nQuestion: {doc['question']}\nAnswer: {doc['text']}\n\n"
    )


Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to submit the homework.
Be aware, however, that there will be deadlines for turning in homeworks and the final projects. So don't leave everything for the last minute.


Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to submit the homework.
Be aware, however, that there will be deadlines for turning in homeworks and the final projects. So don't leave everything for the last minute.


Section: General course-related questions
Question: Course - Can I follow the course after it finishes?
Answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing fo

In [74]:
from llm_rag_workshop.settings import OPENAI_API_KEY
from openai import OpenAI

client = OpenAI(api_key=OPENAI_API_KEY)


def ask_openai(prompt, model="gpt-4o"):
    print(f"Asking OpenAI with {model} model...")
    completion = client.chat.completions.create(
        model=model, messages=[{"role": "user", "content": prompt}]
    )
    print(completion)
    content = completion.choices[0].message.content.strip()
    return content


In [75]:

def build_prompt(question, context):
    prompt = f"""
    Answer the following question based on the provided context:
    
    Question: {question}
    Context: {context}
    
    Answer:
    """ 

    return ask_openai(prompt)

In [76]:
def qa_bot(user_question):
    context_docs = retrieve_documents(user_question)
    prompt = build_prompt(user_question, context_docs)
    answer = ask_openai(prompt)
    return answer


In [77]:
qa_bot("I can't connect to postgres port 5432, my password doesn't work")

Asking OpenAI with gpt-4o model...
ChatCompletion(id='chatcmpl-BdOlaQFdQKzzwwbrSozCFvRWpTHTt', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="Based on the provided context, there are several potential solutions to your problem of not being able to connect to the PostgreSQL database on port 5432 due to password issues:\n\n1. **Port Conflict**: Ensure that the port 5432 is not being used by another instance of PostgreSQL or any other application. If it is, you could stop the existing service that is using this port or change the port number for your current instance.\n\n2. **Service Management on Windows**: If you're on Windows, check if a PostgreSQL service is running that might be conflicting with your current setup:\n   - Press `Win + R`, type `services.msc`, and press Enter.\n   - Look for services like `PostgreSQL` and stop them if necessary.\n\n3. **Change Port**: If the port is still being used by another instance, consider cha

"Given the context you've provided, it seems like you're encountering issues connecting to a PostgreSQL database, potentially due to password or port conflicts. Here’s a breakdown of solutions and checks you can perform:\n\n1. **Check for Port Conflicts**:\n   - Ensure no other applications or instances of PostgreSQL are using port 5432. You can use command-line tools like `netstat` on Linux (`sudo netstat -tuln | grep 5432`) or Windows (`netstat -a -n | findstr 5432`) to see what's using that port.\n\n2. **Manage Services on Windows**:\n   - Use the `Services` console to check for running PostgreSQL services. Stop any that are conflicting with your desired PostgreSQL setup.\n\n3. **Change the Port**:\n   - If another application uses port 5432, consider changing the port in your `postgresql.conf` file:\n     - Find and open the `postgresql.conf` file.\n     - Change the line `#port = 5432` to `port = 5431` or another available port.\n     - Restart the PostgreSQL service for the chang