In [1]:
#!pip install OpenAI deepseek
#!pip install minsearch
#!pip install dotenv

In [2]:
import minsearch
import requests 
import json

In [3]:
with open('documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [4]:
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [5]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [6]:
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

SELECT * WHERE course = 'data-engineering-zoomcamp';

In [7]:
q = 'the course has already started, can I still enroll?'

In [8]:
index.fit(documents)

<minsearch.minsearch.Index at 0x75d7ae038c20>

In [9]:
from openai import OpenAI

In [10]:
from dotenv import load_dotenv
import os

load_dotenv()
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")


In [11]:
client = OpenAI(api_key=DEEPSEEK_API_KEY, base_url="https://api.deepseek.com/v1")

In [12]:
response = client.chat.completions.create(
    model='deepseek-chat',
    messages=[{"role": "user", "content": q}]
)

response.choices[0].message.content

"Whether you can still enroll in a course after it has started depends on the institution or platform offering it. Here’s what you can do:\n\n1. **Check the Enrollment Policy**:  \n   - Many online platforms (like Coursera, Udemy, or edX) allow late enrollment with full access to previous materials.  \n   - Universities or live courses may have strict deadlines but sometimes permit late joiners with instructor approval.\n\n2. **Contact Support or the Instructor**:  \n   - Email the course provider or instructor to ask if late enrollment is possible. Some may allow it if you can catch up.\n\n3. **Review the Syllabus**:  \n   - If the course is self-paced, you may still be able to join. If it’s cohort-based with deadlines, ask if assignments can be submitted late.\n\n4. **Audit vs. Full Enrollment**:  \n   - Some institutions let you audit the course (access materials without credit) even after it starts.\n\n**Quick Action**: Visit the course website or contact support directly for the f

In [13]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )

    return results

In [14]:
def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    context = ""
    
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [15]:
def llm(prompt):
    response = client.chat.completions.create(
        model='deepseek-chat',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [16]:
query = 'how do I run kafka?'

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [17]:
rag(query)

'To run Kafka, follow the instructions based on your use case:\n\n1. **For Java Kafka (e.g., running a producer/consumer/KStreams in terminal)**:  \n   Navigate to your project directory and run:  \n   ```bash\n   java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n   ```\n\n2. **For Python Kafka**:  \n   - If you encounter a "Module \'kafka\' not found" error, create a virtual environment and install dependencies:  \n     ```bash\n     python -m venv env\n     source env/bin/activate  # On Windows: env\\Scripts\\activate\n     pip install -r ../requirements.txt\n     ```  \n   - If you see `./build.sh: Permission denied`, run:  \n     ```bash\n     chmod +x build.sh\n     ```  \n   - For the error `ModuleNotFoundError: No module named \'kafka.vendor.six.moves\'`, use:  \n     ```bash\n     pip install kafka-python-ng\n     ```\n\n3. **Ensure Docker is running** if your setup depends on it.  \n\nFor additional dependencies (e.g., `dlt[duckdb

In [18]:
rag('the course has already started, can I still enroll?')

"Yes, you can still enroll in the course even after it has started. You are eligible to submit homeworks, but be mindful of the deadlines for final projects. \n\nThe course materials will remain available after the course finishes, allowing you to follow at your own pace. Additionally, you can join the Slack channel for support, though it's recommended to search for answers in the FAQ or channel history first. \n\nFor reference, the course started on **15th Jan 2024 at 17:00**. If you haven't already, you can register using the provided link and join the Telegram channel for announcements."

In [19]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [20]:
from elasticsearch import Elasticsearch

#### **run elastic search on docker**
docker run -it \
    --rm \
    --name elasticsearch \
    -m 4GB \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.4.3

In [21]:
es_client = Elasticsearch('http://localhost:9200') 

In [22]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"

es_client.indices.create(index=index_name, body=index_settings)

BadRequestError: BadRequestError(400, 'resource_already_exists_exception', 'index [course-questions/qM7Yl9-8RayxMKndjFDmtg] already exists')

In [23]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [24]:
from tqdm.auto import tqdm

In [25]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [26]:
query = 'I just disovered the course. Can I still join it?'

In [29]:
def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [30]:
def rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [31]:
rag(query)

'Yes, you can still join the course even after the start date. You are eligible to submit homeworks, but be mindful of the deadlines for final projects. Additionally, all course materials will remain available after the course finishes, allowing you to follow the course at your own pace.  \n\nBefore diving in, you can also prepare by setting up the required tools and reviewing the prerequisites and syllabus.'