In [1]:
from openai import OpenAI

# OpenAI-compatible client aimed at a server on localhost:11434.
# NOTE(review): presumably a local Ollama instance, with 'ollama' acting as a
# placeholder key rather than a real credential — confirm.
client = OpenAI(api_key='ollama', base_url='http://localhost:11434/v1/')

In [2]:
# Pin the client version and use %pip so the package is installed into the
# kernel's own environment; an unpinned install pulls the newest release.
%pip install elasticsearch==8.18.0



In [7]:
# %pip (rather than !pip) guarantees the install targets the running kernel's environment.
%pip install elasticsearch==8.18.0

Collecting elasticsearch==8.18.0
  Downloading elasticsearch-8.18.0-py3-none-any.whl.metadata (9.2 kB)
Downloading elasticsearch-8.18.0-py3-none-any.whl (895 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m895.2/895.2 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: elasticsearch
  Attempting uninstall: elasticsearch
    Found existing installation: elasticsearch 9.0.1
    Uninstalling elasticsearch-9.0.1:
      Successfully uninstalled elasticsearch-9.0.1
Successfully installed elasticsearch-8.18.0


In [2]:
# List the packages visible to the running kernel (%pip inspects the kernel's environment).
%pip list

Package                      Version
---------------------------- --------------
absl-py                      2.2.2
accelerate                   1.6.0
annotated-types              0.7.0
anyio                        4.9.0
appnope                      0.1.4
argon2-cffi                  23.1.0
argon2-cffi-bindings         21.2.0
arrow                        1.3.0
asttokens                    3.0.0
astunparse                   1.6.3
async-lru                    2.0.5
attrs                        25.3.0
babel                        2.17.0
beautifulsoup4               4.13.4
bitsandbytes                 0.42.0
bleach                       6.2.0
certifi                      2025.4.26
cffi                         1.17.1
charset-normalizer           3.4.2
comm                         0.2.2
debugpy                      1.8.14
decorator                    5.2.1
defusedxml                   0.7.1
distro                       1.9.0
einops                       0.8.1
elastic-transport            8.1

In [3]:
from elasticsearch import Elasticsearch



In [4]:
# Elasticsearch client for the node listening on localhost:9200; shared by the cells below.
es_client = Elasticsearch('http://localhost:9200')

In [5]:
index_name = "course-questions"

# Single-shard, zero-replica index.
# Free-text fields are analysed ("text"); "course" is a "keyword" so it can be
# used in exact-match term filters.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

# Drop any index left over from a previous run so this cell can be re-executed:
# creating an index that already exists raises an error, and a surviving index
# would accumulate duplicate documents when the indexing loop runs again.
# ignore_unavailable=True makes the delete a no-op when the index is absent.
es_client.indices.delete(index=index_name, ignore_unavailable=True)

es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [6]:
import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'

# Bound the request time and fail fast on a non-2xx response, so an error page
# never reaches .json() and surfaces later as a confusing decode/key error.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course groups into one list of records, tagging every record
# with the name of the course it came from.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [7]:
from tqdm.auto import tqdm

In [10]:
# Display the first flattened record as a quick sanity check of its fields.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [8]:
# Send every flattened record to the index, one request per document,
# with tqdm reporting progress.
for doc in tqdm(documents):
    es_client.index(document=doc, index=index_name)

  0%|          | 0/948 [00:00<?, ?it/s]

In [11]:
def elastic_search(query, course="data-engineering-zoomcamp", size=5):
    """Retrieve the best-matching FAQ records for ``query`` from Elasticsearch.

    Runs a ``multi_match`` (``best_fields``) query over the ``question``,
    ``text`` and ``section`` fields, with ``question`` boosted by 3, and
    restricts the hits to a single course with a ``term`` filter.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` field must equal. Defaults to the
            course that was previously hard-coded.
        size: Maximum number of hits to request. Defaults to 5.

    Returns:
        A list with the ``_source`` dict of every hit, in the order
        Elasticsearch returned them.

    Uses the module-level ``es_client`` and ``index_name``.
    """
    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [22]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from the user question and retrieved FAQ records.

    Args:
        query: The user's question; substituted for ``{question}``.
        search_results: Iterable of dicts, each providing the ``section``,
            ``question`` and ``text`` keys.

    Returns:
        The formatted prompt with leading and trailing whitespace stripped.
        Records are rendered as ``section``/``question``/``answer`` triples
        separated by a blank line.
    """
    # Written line by line so the trailing spaces inside the prompt stay
    # visible instead of hiding at the end of a triple-quoted line.
    prompt_template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database. \n"
        "Use only the facts from the CONTEXT when answering the QUESTION. Make your answer concise.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    # One join instead of repeated string concatenation; the final strip()
    # makes this equivalent to appending "\n\n" after every record.
    context = "\n\n".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}"
        for doc in search_results
    )

    return prompt_template.format(question=query, context=context).strip()

def llm(prompt, model='deepseek-r1'):
    """Send ``prompt`` to the chat-completions endpoint and return the reply text.

    Args:
        prompt: Text sent as a single ``user`` message.
        model: Name of the model to query. Defaults to the model that was
            previously hard-coded.

    Returns:
        The ``content`` of the first choice's message, returned verbatim.
        Any ``<think>...</think>`` section the model emits is therefore
        included in the result.

    Uses the module-level ``client``.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [23]:
def rag(query):
    """Answer ``query`` end to end: retrieve, build the prompt, ask the LLM.

    Passes the records returned by ``elastic_search`` to ``build_prompt``
    together with the query, then returns whatever ``llm`` produces for
    that prompt.
    """
    retrieved = elastic_search(query)
    return llm(build_prompt(query, retrieved))

In [24]:
# Sample learner question used to exercise the whole pipeline end to end.
# Spelled correctly so its terms can match the indexed FAQ text.
query = 'I just discovered the course. Can I still join it?'
rag(query)

'<think>\nOkay, so a user just discovered my course and wants to know if they can still join it. I remember reading through the context provided for this question.\n\nLooking at the FAQ section titled "Can I still join the course after the start date?" The answer clearly states that yes, even without formally registering, you\'re eligible to submit homeworks. But there\'s a note about deadlines for final projects, so they should plan accordingly and not wait until the last minute.\n\nI need to make sure my response is concise and only uses facts from this context. I\'ll confirm their eligibility but remind them about the submission deadlines as that\'s important.\n</think>\n\nYes, you can still join the course after the start date, although homework submissions must be completed by specific deadlines. Keep an eye on the schedule for project submissions to avoid last-minute stress.'

In [25]:
# Bind the answer to a name instead of relying on IPython's `_` (last output),
# which silently points at something else if any other cell runs in between.
# Printing renders the newlines of the reply instead of showing the repr.
answer = rag(query)
print(answer)

<think>
Okay, so a user just discovered my course and wants to know if they can still join it. I remember reading through the context provided for this question.

Looking at the FAQ section titled "Can I still join the course after the start date?" The answer clearly states that yes, even without formally registering, you're eligible to submit homeworks. But there's a note about deadlines for final projects, so they should plan accordingly and not wait until the last minute.

I need to make sure my response is concise and only uses facts from this context. I'll confirm their eligibility but remind them about the submission deadlines as that's important.
</think>

Yes, you can still join the course after the start date, although homework submissions must be completed by specific deadlines. Keep an eye on the schedule for project submissions to avoid last-minute stress.


In [26]:
# Pinned to the version this notebook was run with, and installed via %pip so
# it lands in the kernel's own environment.
%pip install streamlit==1.45.0

Collecting streamlit
  Downloading streamlit-1.45.0-py3-none-any.whl.metadata (8.9 kB)
Collecting altair<6,>=4.0 (from streamlit)
  Downloading altair-5.5.0-py3-none-any.whl.metadata (11 kB)
Collecting blinker<2,>=1.5.0 (from streamlit)
  Downloading blinker-1.9.0-py3-none-any.whl.metadata (1.6 kB)
Collecting cachetools<6,>=4.0 (from streamlit)
  Downloading cachetools-5.5.2-py3-none-any.whl.metadata (5.4 kB)
Collecting click<9,>=7.0 (from streamlit)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting packaging<25,>=20 (from streamlit)
  Downloading packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Collecting pyarrow>=7.0 (from streamlit)
  Downloading pyarrow-20.0.0-cp39-cp39-macosx_12_0_arm64.whl.metadata (3.3 kB)
Collecting tenacity<10,>=8.1.0 (from streamlit)
  Downloading tenacity-9.1.2-py3-none-any.whl.metadata (1.2 kB)
Collecting toml<2,>=0.10.1 (from streamlit)
  Downloading toml-0.10.2-py2.py3-none-any.whl.metadata (7.1 kB)
Collecting gitpython!=3.1.19,<4,>