In [1]:
# !pip install minsearch

In [2]:
import minsearch

In [3]:
import json

In [4]:
with open('documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [5]:
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [6]:
# documents[0]

In [7]:
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

SELECT * WHERE course = 'data-engineering-zoomcamp';

In [8]:
q = 'the course has already started, can I still enroll?'

In [9]:
index.fit(documents)

<minsearch.Index at 0x7b06c1bf4530>

In [10]:
import openai
from openai import OpenAI

In [11]:
import os

In [13]:
client = OpenAI()

In [14]:
# !pip install google-generativeai
import google.generativeai as genai

In [15]:
try:
   genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
except KeyError:
   # Otherwise, prompt the user for the key
   api_key = input("Please enter your Google API Key: ")
   genai.configure(api_key=api_key)

model = genai.GenerativeModel('gemini-1.5-flash-latest')

response = model.generate_content(q)

try:
    result = response.text
    print(result)
except ValueError:
    print("Response was blocked by safety settings.")
    print(response.prompt_feedback)

It depends on the course.  Some courses allow late enrollment, while others do not.  You should check with the instructor or the institution offering the course to find out their policy on late enrollment.



In [16]:
response = client.chat.completions.create(
    model='gpt-4o',
    messages=[{"role": "user", "content": q}]
)

response.choices[0].message.content

"It depends on the policies of the institution or organization offering the course. Some courses allow late enrollment, often with certain conditions or deadlines for catching up on missed work. It's best to contact the course administrator or visit the course's website for specific information on late enrollment."

In [17]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )

    return results

In [18]:
def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    context = ""
    
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [19]:
def llm_1(prompt):
    response_1 = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response_1.choices[0].message.content

In [20]:
def llm_2(prompt):
    try:
       genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
    except KeyError:
       # Otherwise, prompt the user for the key
       api_key = input("Please enter your Google API Key: ")
       genai.configure(api_key=api_key)
    
    model = genai.GenerativeModel('gemini-1.5-flash-latest')
    
    response_2 = model.generate_content(prompt)
    
    try:
        result = response_2.text
        print(result)
    except ValueError:
        print("Response was blocked by safety settings.")
        print(response_2.prompt_feedback)

In [21]:
query = 'how do I run kafka?'

In [22]:
def rag_1(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer_1 = llm_1(prompt)
    return answer_1

In [23]:
rag_1(query)

'To run Kafka using Java in the terminal, navigate to the project directory and execute the following command:\n\n```bash\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n```\n\nFor running Python Kafka scripts in a virtual environment, create a virtual environment, activate it, and install the necessary packages:\n\n1. Create and activate the virtual environment (run only once):\n   ```bash\n   python -m venv env\n   source env/bin/activate\n   pip install -r ../requirements.txt\n   ```\n\n2. Activate the virtual environment each time you need it:\n   ```bash\n   source env/bin/activate\n   ```\n\n3. Deactivate the virtual environment when done:\n   ```bash\n   deactivate\n   ```\n\nEnsure Docker images are up and running before using the virtual environment for Python scripts.'

In [24]:
def rag_2(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer_2 = llm_2(prompt)
    return answer_2

In [25]:
rag_2(query)

The provided text gives instructions for running Kafka producers and consumers in Java and Python.  For Java, run `java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java` from the project directory.  For Python, first create a virtual environment using `python -m venv env`, then activate it using `source env/bin/activate` (or `env/Scripts/activate` on Windows), install dependencies with `pip install -r ../requirements.txt`, and then run the Python files.  If you encounter a "ModuleNotFoundError: No module named 'kafka.vendor.six.moves'" error, use `pip install kafka-python-ng` instead.



In [26]:
rag_1('the course has already started, can I still enroll?')

'Yes, you can still enroll in the course after it has started. You are eligible to submit the homework even without registering. However, please be mindful of the deadlines for turning in the final projects and try not to leave everything until the last minute.'

In [27]:
rag_2('the course has already started, can I still enroll?')

Yes, you can still submit homeworks even if you don't register.  However, be aware of the deadlines for final projects.



In [28]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [29]:
from elasticsearch import Elasticsearch

In [30]:
es_client = Elasticsearch('http://localhost:9200')

In [31]:
# es_client.info()

In [32]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

In [33]:
# Check if index exists, create if it doesn't
if es_client.indices.exists(index=index_name):
    print(f"Index '{index_name}' already exists")
else:
    es_client.indices.create(
        index=index_name,
        settings=index_settings["settings"],
        mappings=index_settings["mappings"]
    )
    print(f"Index '{index_name}' created successfully!")

# Verify it was created
if es_client.indices.exists(index=index_name):
    print(f"✓ Index '{index_name}' is ready to use!")

Index 'course-questions' already exists
✓ Index 'course-questions' is ready to use!


In [34]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [35]:
from tqdm.auto import tqdm

In [36]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [37]:
es_query = 'How do execute a command on a Kubernetes pod?'

In [38]:
def elastic_search(query):
    search_query = {
        "size": 10,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [39]:
def rag_1(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer_1 = llm_1(prompt)
    return answer_1

In [40]:
def rag_2(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer_2 = llm_2(prompt)
    return answer_2

In [41]:
rag_1(es_query)

'The provided CONTEXT does not include information on executing a command on a Kubernetes pod. Please refer to Kubernetes documentation or other reliable sources for guidance on how to perform this action.'

In [42]:
rag_2(es_query)

This question cannot be answered from the given context.  The provided text focuses on using dbt-core projects in Airflow and handling type errors in PySpark, but it does not contain information about executing commands on Kubernetes pods.



In [43]:
elastic_search(es_query)

[{'text': 'Install the astronomer-cosmos package as a dependency. (see Terraform example).\nMake a new folder, dbt/, inside the dags/ folder of your Composer GCP bucket and copy paste your dbt-core project there. (see example)\nEnsure your profiles.yml is configured to authenticate with a service account key. (see BigQuery example)\nCreate a new DAG using the DbtTaskGroup class and a ProfileConfig specifying a profiles_yml_filepath that points to the location of your JSON key file. (see example)\nYour dbt lineage graph should now appear as tasks inside a task group like this:',
  'section': 'Course Management Form for Homeworks',
  'question': 'How to run a dbt-core project as an Airflow Task Group on Google Cloud Composer using a service account JSON key',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Install the astronomer-cosmos package as a dependency. (see Terraform example).\nMake a new folder, dbt/, inside the dags/ folder of your Composer GCP bucket and copy paste your d

In [44]:
top_result = elastic_search(query)
top_result[0]
print(f"Top result (score: {top_result['score']}):")
print(top_result['document'])

# Print all results with their scores
print("\nAll results:")
for result in results_with_scores:
    print(f"Rank {result['rank']}: Score {result['score']:.2f}")
    print(f"Question: {result['document']['question']}")
    print(f"Section: {result['document']['section']}")
    print("---")

TypeError: list indices must be integers or slices, not str