In [1]:
!pip install minsearch



In [2]:
import minsearch

In [3]:
import json

In [4]:
with open('documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [5]:
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [6]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [7]:
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

SELECT * WHERE course = 'data-engineering-zoomcamp';

In [8]:
q = 'the course has already started, can I still enroll?'

In [9]:
index.fit(documents)

<minsearch.Index at 0x7d3c0d4ca6c0>

In [10]:
import openai
from openai import OpenAI

In [11]:
import os

In [13]:
client = OpenAI()

In [14]:
# !pip install google-generativeai
import google.generativeai as genai



  from .autonotebook import tqdm as notebook_tqdm


In [15]:
try:
   genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
except KeyError:
   # Otherwise, prompt the user for the key
   api_key = input("Please enter your Google API Key: ")
   genai.configure(api_key=api_key)

model = genai.GenerativeModel('gemini-1.5-flash-latest')

response = model.generate_content(q)

try:
    result = response.text
    print(result)
except ValueError:
    print("Response was blocked by safety settings.")
    print(response.prompt_feedback)

It depends on the course.  Some courses allow late enrollment, while others do not.  You need to check with the course instructor or the institution offering the course.  Look for information on their website or contact them directly.



In [16]:
response = client.chat.completions.create(
    model='gpt-4o',
    messages=[{"role": "user", "content": q}]
)

response.choices[0].message.content

'Whether you can enroll in a course that has already started depends on the specific policies of the institution or platform offering the course. Here are some steps you can take:\n\n1. **Check the Official Website:** Look for information regarding late enrollment or the last date to join the course.\n\n2. **Contact the Instructor or Institution:** Reach out directly to the course instructor or the admissions office. They may allow late enrollment on a case-by-case basis.\n\n3. **Review the Course Syllabus:** Check if you’ll be able to catch up on any missed material without too much difficulty.\n\n4. **Consider Online Courses:** Some online platforms offer more flexible enrollment options.\n\n5. **Look for Audit Options:** Some courses allow you to audit, which means you can access course materials without official enrollment or credit.\n\nMake sure to review any specific policies about catch-up work or penalties for starting late.'

In [17]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )

    return results

In [18]:
def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    context = ""
    
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [19]:
def llm_1(prompt):
    response_1 = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response_1.choices[0].message.content

In [20]:
def llm_2(prompt):
    try:
       genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
    except KeyError:
       # Otherwise, prompt the user for the key
       api_key = input("Please enter your Google API Key: ")
       genai.configure(api_key=api_key)
    
    model = genai.GenerativeModel('gemini-1.5-flash-latest')
    
    response_2 = model.generate_content(prompt)
    
    try:
        result = response_2.text
        print(result)
    except ValueError:
        print("Response was blocked by safety settings.")
        print(response_2.prompt_feedback)

In [21]:
query = 'how do I run kafka?'

In [22]:
def rag_1(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer_1 = llm_1(prompt)
    return answer_1

In [103]:
rag_1(query)

Total hits: 668
Max score: 32.00351


'The provided context does not contain information on how to execute a command on a Kubernetes pod. Please refer to Kubernetes documentation or other relevant resources for instructions on executing commands on a Kubernetes pod.'

In [24]:
def rag_2(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer_2 = llm_2(prompt)
    return answer_2

In [25]:
rag_2(query)

The provided text gives instructions for running Kafka producers and consumers in Java and Python.  For Java, run `java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java` in the project directory. For Python, create a virtual environment using `python -m venv env`, activate it with `source env/bin/activate` (or `env/Scripts/activate` on Windows), install requirements with `pip install -r ../requirements.txt`, and then run your Python files within that environment.  Before running Python files, ensure all Docker images are running.  If you encounter a "ModuleNotFoundError: No module named 'kafka.vendor.six.moves'" error, use `pip install kafka-python-ng` instead.



In [26]:
rag_1('the course has already started, can I still enroll?')

"Yes, you can still enroll in the course after it has started. You are eligible to submit the homeworks. However, be mindful of the deadlines for the final projects, so it's advisable not to leave everything for the last minute."

In [27]:
rag_2('the course has already started, can I still enroll?')

Yes, you can still submit homeworks even if you don't register.  However, be aware of the deadlines for final projects.



In [28]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [32]:
from elasticsearch import Elasticsearch

In [36]:
es_client = Elasticsearch('http://localhost:9200')

In [40]:
es_client.info()

ObjectApiResponse({'name': '3a844f84e77f', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'dbRKHmApQTG0N5lm6zSlDw', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [37]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

In [41]:
# Check if index exists, create if it doesn't
if es_client.indices.exists(index=index_name):
    print(f"Index '{index_name}' already exists")
else:
    es_client.indices.create(
        index=index_name,
        settings=index_settings["settings"],
        mappings=index_settings["mappings"]
    )
    print(f"Index '{index_name}' created successfully!")

# Verify it was created
if es_client.indices.exists(index=index_name):
    print(f"✓ Index '{index_name}' is ready to use!")

Index 'course-questions' already exists
✓ Index 'course-questions' is ready to use!


In [42]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [43]:
from tqdm.auto import tqdm

In [44]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|█████████| 948/948 [00:02<00:00, 364.39it/s]


In [106]:
es_query = 'How do execute a command on a Kubernetes pod?'

In [114]:
def elastic_search(query):
    search_query = {
        "size": 10,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [115]:
def rag_1(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer_1 = llm_1(prompt)
    return answer_1

In [116]:
def rag_2(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer_2 = llm_2(prompt)
    return answer_2

In [117]:
rag_1(es_query)

'The CONTEXT does not provide specific information about executing a command on a Kubernetes pod. For executing commands on a Kubernetes pod, generally, you would use the `kubectl exec` command in the following manner:\n\n```bash\nkubectl exec -it <pod_name> -- <command>\n```\n\nReplace `<pod_name>` with the name of the pod and `<command>` with the command you want to execute. For example, to open a shell session, you might use:\n\n```bash\nkubectl exec -it my-pod -- /bin/bash\n```\n\nPlease ensure you have the necessary permissions and configurations set up to use `kubectl` with your Kubernetes cluster.'

In [118]:
rag_2(es_query)

This FAQ database does not contain information on how to execute a command on a Kubernetes pod.



In [107]:
elastic_search(es_query)

[{'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)',
  'section': '5. Deploying Machine Learning Models',
  'question': 'How do I debug a docker container?',
  'course': 'machine-learning-zoomcamp'},
 {'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)',
  'section': '5. Deploying Machine Learning Models',
  'question': 'How do I debug a docker container?',
  'course': 'machine-learning-zoomcamp'},
 {'text': 'Deploy and Access the K

In [80]:
top_result = elastic_search(query)
top_result[0]
print(f"Top result (score: {top_result['score']}):")
print(top_result['document'])

# Print all results with their scores
print("\nAll results:")
for result in results_with_scores:
    print(f"Rank {result['rank']}: Score {result['score']:.2f}")
    print(f"Question: {result['document']['question']}")
    print(f"Section: {result['document']['section']}")
    print("---")

TypeError: list indices must be integers or slices, not str