# RETRIEVAL & SEARCH

In [1]:
# Download minsearch.py file if not already in repository (toy search to be replaced later by Elasticsearch).
# !wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

In [2]:
import minsearch

In [3]:
import json

In [4]:
with open('documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [None]:
docs_raw

In [5]:
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [6]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [7]:
index = minsearch.Index(
    text_fields = ["question", "text", "section"],
    keyword_fields = ["course"]
)

SELECT * WHERE course = 'data-engineering-zoomcamp';

In [8]:
q = 'the course has already started, can I still enroll?'

In [9]:
index.fit(documents)

<minsearch.Index at 0x74942c0189b0>

In [10]:
boost = {'question': 3.0, 'section': 0.5}

results = index.search(
    query = q,
    filter_dict = {'course': 'data-engineering-zoomcamp'},
    boost_dict = boost,
    num_results = 5
)

In [11]:
results

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 202

# GENERATING ANSWERS

In [12]:
from openai import OpenAI

In [13]:
client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',
)

In [14]:
response = client.chat.completions.create(
    model = 'phi3',
    messages = [{"role": "user", "content": q}]
)

In [15]:
response.choices[0].message.content

"Once a college or university course begins for the semester, students generally cannot join if they were not in the admission queue first. However, each institution may have different policies and some might allow late registration under certain conditions after classes begin. Here's what you can do:\n\n1. **Check with Academic Advisors** - They will be able to inform about any possible extension for enrollment and if there is a possibility to join the course or take an alternative available with sufficient credits towards your graduation requirements, subject to institutional policies during ongoing classes. \n\n2. **Reach Out to Departments/Academic Advisors** - Sometimes students at similar stages might have been able to secure earlier enrollment through different channels (eportfolio submissions or research grants etc.). It's worth inquiring these departments as exceptions may be considered on case by case basis. \n\n3. **Contact the Registrar of your college/university** - They h

In [16]:
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database. 
Use only the facts from the CONTEXT when answering the QUESTION.
If the CONTEXT doesn't contain the answer, output NONE.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

In [17]:
context = ""

for doc in results:
    context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"

In [18]:
print(context)

section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

section: General course-related questions
question: Course - Can I follow the course after it finishes?
answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.

section: General course-related questions
question: Course - When will the course start?
answer: The purpose of this document is to capture frequently asked technical questions
The exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start wit

In [19]:
prompt = prompt_template.format(question=q, context=context).strip()

In [20]:
print(prompt)

You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database. 
Use only the facts from the CONTEXT when answering the QUESTION.
If the CONTEXT doesn't contain the answer, output NONE.

QUESTION: the course has already started, can I still enroll?

CONTEXT:
section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

section: General course-related questions
question: Course - Can I follow the course after it finishes?
answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your f

In [21]:
response = client.chat.completions.create(
    model = 'phi3',
    messages = [{"role": "user", "content": prompt}]
)

response.choices[0].message.content

'As a course teaching assistant for a platform that offers live courses with registration options and post-course materials available afterward (as described in QUESTIONs from "When will the course start?" to "What can I do before the course starts?"), here\'s what you need to know: Courses cannot be joined once they have started, according to FAQ entries. However, if interested students learn about this afterwards and wish to participate, a limited number may still register for submission of homeworks which will adhere strictly to strict deadlines as per the "Course - Can I join after?" answer provided in the context documents that also confirm continuous availability of course material post-course. Should you decide not to wait until next time and instead want self-paced learning materials, look for this particular resource: an open slack channel which remains accessible with guidance available via FAQs or by interacting directly using support bots on the platform as detailed in "Cou

# RAG FLOW CLEANING & MODULARIZING

In [22]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )

    return results

In [23]:
def build_prompt(query, search_results):
    prompt_template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION.
    If the CONTEXT doesn't contain the answer, output NONE.
    
    QUESTION: {question}
    
    CONTEXT: 
    {context}
    """.strip()

    context = ""
    
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [24]:
def llm(prompt):
    response = client.chat.completions.create(
        model='phi3',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [25]:
query = 'how do I run kafka?'

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [26]:
rag(query)

"To run the Java Kafka producer code mentioned in Module 6's FAQ, execute the following command within your project directory: java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n\nIn situations where you encounter a module not found error for Python Kafka, the solution provided on GitHub is to use kafka-python-ng instead of kafka-python by replacing your current package installation with pip install kafka-python-ng as an interim workaround until stable releases are available again."

In [27]:
rag('the course has already started, can I still enroll?')

"Yes, even if a course starts and someone wants to join later, they are still eligible for homeworks unless registration closes already or an enrollment cap has been set beforehand in the FAQ database details provided (since that information isn't explicitly stated). However, make sure not to wait until last minute because there will be clear deadlines when turning in final projects.\n"

# Search with ElasticSearch

Start by running ElasticSearch in terminal:

```
docker run -it \
    --rm \
    --name elasticsearch \
    -m 4GB \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.4.3
```

Or if there is a "error pulling image configuration" message try:

```
docker run -it \
    --rm \
    --name elasticsearch \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    elasticsearch:8.4.3
```

Check connection:
```
curl http://localhost:9200
```

In [28]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [29]:
from elasticsearch import Elasticsearch

In [30]:
es_client = Elasticsearch('http://localhost:9200')

In [31]:
es_client.info()

ObjectApiResponse({'name': 'ebc10cfcc132', 'cluster_name': 'docker-cluster', 'cluster_uuid': '9qhH5nUATYK5R5CLTSNAng', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [32]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"

es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [33]:
from tqdm.auto import tqdm

In [34]:
for doc in documents:
    es_client.index(index=index_name, document=doc)

In [35]:
query = 'I just disovered the course. Can I still join it?'

In [36]:
def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [37]:
def rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [38]:
rag(query)

"You can still join the course even if you discover it later, as long as there are available slots or spots for more students. You will be able to submit homeworks and meet deadlines without needing a confirmation email since registration is not confirmed against any list prior this stage (beyond gauging interest). However, ensure that all course dependencies like Google Cloud account setup and learning prerequisites are completed in advance of joining the class. Remember your submissions will be time-bound for final projects to avoid leaving everything until the last minute. You have access to questions via Slack or using @ZoomcampQABot during self-paced study, although it’s advisable not always rely on bot answers completely as they might contain discrepancies sometimes and may be incorrect at times. The sDocker containers are meant for the course context only but you can still follow along if interested once classes conclude before final projects submission is due by May 21st, which