In [None]:
# Start an Elasticsearch 8.4.3 container named "elasticsearch" with a 4GB
# memory setting (-m), ports 9200 and 9300 published to the host, and
# discovery.type=single-node / xpack.security.enabled=false passed as
# environment settings. Intended to be run in a terminal, not in the kernel.
docker run -it \
    --rm \
    --name elasticsearch \
    -m 4GB \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.4.3

### Next Step: Confirm with an API Ping
To confirm that Elasticsearch is up and responding, run this in your terminal:

In [None]:
# Send a GET request to the root endpoint on the published port 9200.
curl -X GET http://localhost:9200


In [1]:
import json
from openai import OpenAI
from elasticsearch import Elasticsearch

In [2]:
# Load the raw FAQ data. The encoding is given explicitly because the FAQ text
# contains non-ASCII characters (curly quotes, apostrophes); without it the
# file would be decoded with the platform's default encoding.
with open('document.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [3]:
# Inspect the top-level container type of the loaded JSON.
print(type(docs_raw))

<class 'list'>


In [4]:
# Number of top-level entries in the loaded data.
print(len(docs_raw))

3


In [5]:
# Type of the first top-level entry.
print(type(docs_raw[0]))

<class 'dict'>


In [6]:
# Keys available on the first top-level entry.
print(docs_raw[0].keys())

dict_keys(['course', 'documents'])


In [7]:
# Check what kind of value each of the two keys of the first entry holds.
print("course key is of type: ", type(docs_raw[0]['course']))
print("documents key is of type: ", type(docs_raw[0]['documents']))

course key is of type:  <class 'str'>
documents key is of type:  <class 'list'>


In [8]:
# Type of the first item in the first entry's 'documents' list.
# NOTE: the label text contains typos; it is left unchanged so the cell still
# matches its recorded output.
print("The firts elemenr in the documents lis is of type: ", type(docs_raw[0]['documents'][0]))

The firts elemenr in the documents lis is of type:  <class 'dict'>


In [9]:
# Keys of the first item in the first entry's 'documents' list.
print("The keys in the documents dictionary are : ", docs_raw[0]['documents'][0].keys())

The keys in the documents dictionary are :  dict_keys(['text', 'section', 'question'])


## Findings of the data structure
- **docs_raw** is a list; each element of the list holds the data for one course<br>
- Each element is a dictionary with the keys:<br>
    + **course:** The course that the entries are grouped under, e.g. **Data Engineering Zoomcamp**<br>
    + **documents:** A list of dictionaries, one for each question asked (together with the answer provided) for that **course**<br>
- Each dictionary in the **documents** list contains the keys 'text', 'section', 'question'

### Data formatting
Next, we flatten the data into a single list that contains every question from every course, with the course name added to each entry:<br>
document = [{ 'text': "...", 'section': "...", 'question': "...", 'course': "..."}, ...{}]

In [10]:
# Flatten the per-course groups into one list of FAQ entries, tagging each
# entry with the course it came from.
# A new dict is built for every entry instead of adding the key in place, so
# docs_raw stays exactly as loaded and the inspection cells above produce the
# same output if they are re-run after this cell.
combined_doc = [
    {**doc, 'course': course_section['course']}
    for course_section in docs_raw
    for doc in course_section['documents']
]
print("combined_doc has a length of: ", len(combined_doc))

combined_doc has a length of:  948


In [11]:
# Display the first flattened entry to check that the 'course' key is present.
combined_doc[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [12]:
# Display the last flattened entry.
combined_doc[-1]

{'text': 'Problem description\nInfrastructure created in AWS with CD-Deploy Action needs to be destroyed\nSolution description\nFrom local:\nterraform init -backend-config="key=mlops-zoomcamp-prod.tfstate" --reconfigure\nterraform destroy --var-file vars/prod.tfvars\nAdded by Erick Calderin',
 'section': 'Module 6: Best practices',
 'question': 'How to destroy infrastructure created via GitHub Actions',
 'course': 'mlops-zoomcamp'}

In [13]:
# Sample user question; it is sent to the LLM directly below and later passed
# through the rag() pipeline.
q = 'the course has already started, can I still enroll?'

In [14]:
# Client used for the chat-completion calls below. No credentials are passed
# here, so they presumably come from the environment — TODO confirm.
client = OpenAI()

In [15]:
# Baseline: send the question to the model on its own, with no FAQ context,
# then display the text of the first returned choice.
user_message = {"role": "user", "content": q}
response = client.chat.completions.create(model='gpt-4o', messages=[user_message])

response.choices[0].message.content

"Whether you can still enroll in a course after it has started depends on several factors, including the policies of the institution offering the course, the specific course structure, and how far along the course is. Here are some general steps you can take:\n\n1. **Check the Institution's Policies**: Look at the institution’s registration and enrollment policies. Some institutions allow late enrollments while others have strict deadlines.\n\n2. **Contact the Instructor**: Reach out to the course instructor to express your interest. They might allow you to join if you've missed only a few sessions and if it’s feasible to catch up.\n\n3. **Consult with Academic Advising**: Speak with an academic advisor or the registrar's office. They can provide guidance on how to proceed with enrollment after the deadline.\n\n4. **Review Course Requirements**: Ensure you can meet all course requirements and deadlines, and that you're able to catch up on any missed material.\n\n5. **Consider Online or

In [16]:
# Client for the Elasticsearch instance reachable at localhost:9200.
es_client = Elasticsearch('http://localhost:9200') 

In [17]:
# Index definition: a single shard with no replicas.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # keyword type: the search function filters on this field with a
            # term query against the exact course name.
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Drop any index left over from a previous run so that this cell and the
# indexing loop after it can be re-executed: creating an index that already
# exists raises an error, and indexing into the old one again would store
# every document twice. ignore_unavailable makes this a no-op on a first run.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [18]:
from tqdm.auto import tqdm

In [20]:
# Send every flattened FAQ entry to the index, one request per entry;
# tqdm renders a progress bar over the list.
for faq_entry in tqdm(combined_doc):
    es_client.index(document=faq_entry, index=index_name)

  0%|          | 0/948 [00:00<?, ?it/s]

In [21]:
def elastic_search(query, course="data-engineering-zoomcamp", size=5):
    """Search the FAQ index for entries matching ``query``.

    Args:
        query: Free-text question matched against the ``question``, ``text``
            and ``section`` fields.
        course: Exact value of the ``course`` field that results are
            restricted to. Defaults to "data-engineering-zoomcamp", the value
            that was previously hard-coded.
        size: Maximum number of hits requested. Defaults to 5, the value that
            was previously hard-coded.

    Returns:
        A list containing the ``_source`` dict of each hit, in the order the
        hits appear in the response.
    """
    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # "^3" boosts matches in the question field.
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [22]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from the user question and the retrieved FAQ entries.

    Args:
        query: The user's question, inserted after ``QUESTION:``.
        search_results: Iterable of dicts with 'section', 'question' and
            'text' keys; each is rendered as a three-line block followed by a
            blank line under ``CONTEXT:``.

    Returns:
        The filled-in prompt with leading and trailing whitespace stripped.
    """
    context = "".join(
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    )

    # Written line by line so the trailing space after "CONTEXT:" stays visible.
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    filled = template.format(question=query, context=context)
    return filled.strip()

In [23]:
def llm(prompt):
    """Send ``prompt`` as a single user message to the 'gpt-4o' model.

    Returns the message content of the first choice in the response.
    """
    user_turn = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(model='gpt-4o', messages=[user_turn])
    first_choice = completion.choices[0]
    return first_choice.message.content

In [24]:
def rag(query):
    """Answer ``query`` end to end: retrieve FAQ entries, build the prompt, ask the LLM.

    Returns whatever ``llm`` returns for the prompt built from the search hits.
    """
    retrieved = elastic_search(query)
    return llm(build_prompt(query, retrieved))

In [25]:
# Run the full pipeline on the sample question; compare the result with the
# answer from the direct, context-free model call earlier in the notebook.
rag(q)

"Yes, even if the course has already started, you can still enroll and submit the homeworks. However, be aware that there will be deadlines for turning in the final projects, so it's important not to leave everything for the last minute."