In [18]:
from openai import OpenAI

In [19]:
# OpenAI-compatible client aimed at a server on localhost:11434
# (presumably a local Ollama instance; 'ollama' looks like a placeholder
# key rather than a real secret — confirm before pointing this elsewhere).
client = OpenAI(api_key='ollama', base_url='http://localhost:11434/v1/')

In [21]:
from elasticsearch import Elasticsearch

In [22]:
# Client for the Elasticsearch node expected at localhost:9200; every indexing
# and search call in this notebook goes through this object.
es_client = Elasticsearch('http://localhost:9200')

In [23]:
index_name = 'course-questions'

# One shard and no replicas: sized for a single local node.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            # Free-text fields that are searched with full-text matching.
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # Exact-value field, used for term filtering by course.
            "course": {"type": "keyword"},
        }
    },
}

# Create the index only when it is not already present on the cluster.
if not es_client.indices.exists(index=index_name):
    es_client.indices.create(index=index_name, body=index_settings)

In [24]:
import requests 


docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
# Without a timeout a stalled connection would block this cell indefinitely.
docs_response = requests.get(docs_url, timeout=30)
# Surface HTTP errors (404, rate limiting, ...) here with their status code
# instead of letting an error page fail later as a confusing JSON decode error.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course groups into a single list, tagging every document
# with the name of the course it belongs to.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [25]:
from tqdm.auto import tqdm

In [26]:
# Use each document's position in `documents` as its Elasticsearch id so that
# re-running this cell overwrites the same entries. With auto-generated ids
# every re-run appended another full copy of the documents to the index,
# which then shows up as duplicated search hits.
for doc_id, doc in enumerate(tqdm(documents)):
    es_client.index(index=index_name, id=str(doc_id), body=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [27]:
def elastic_search(query, course="data-engineering-zoomcamp", size=5):
    """Search the FAQ index and return the stored documents of the top hits.

    Args:
        query: Free-text question to match against the indexed documents.
        course: Value the ``course`` field must equal. Defaults to
            ``"data-engineering-zoomcamp"``, the filter this function
            previously hard-coded.
        size: Maximum number of hits to request. Defaults to 5.

    Returns:
        A list with the ``_source`` dict of each hit, in the order
        Elasticsearch returned them.
    """
    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # "^3" boosts matches in the question field threefold.
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                # Restrict results to a single course.
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [28]:
def build_prompt(query, search_results):
    """Build the LLM prompt from the user question and the retrieved FAQ entries.

    Args:
        query: The user's question; inserted after ``QUESTION:``.
        search_results: Iterable of dicts, each providing the keys
            ``section``, ``question`` and ``text``.

    Returns:
        The prompt string with surrounding whitespace stripped. With no
        search results the prompt ends with ``CONTEXT:``.

    Raises:
        KeyError: If a search result lacks one of the required keys.
    """
    # The template is assembled from flush-left pieces on purpose: an indented
    # triple-quoted string would embed the source indentation in every line of
    # the prompt sent to the model.
    prompt_template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: {context}"
    )

    # One "section / question / answer" paragraph per retrieved document.
    context_str = "".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    )

    return prompt_template.format(question=query, context=context_str).strip()


In [29]:
def llm(prompt, model='phi3'):
    """Send a single-turn user message to the chat endpoint and return the reply text.

    Args:
        prompt: Text sent as the content of one ``user`` message.
        model: Name of the model to query. Defaults to ``'phi3'``, the value
            this function previously hard-coded.

    Returns:
        The message content of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [30]:
def rag(query):
    """Answer `query` by chaining retrieval, prompt construction and generation.

    The question is passed to `elastic_search`, the hits and the question are
    turned into a prompt by `build_prompt`, and the value returned by `llm`
    for that prompt is returned unchanged.
    """
    retrieved_docs = elastic_search(query)
    return llm(build_prompt(query, retrieved_docs))

In [31]:
# End-to-end check of the pipeline with a sample learner question.
query = 'I just discovered the course. Can I still join?'
# Kept as the cell's final bare expression so the notebook displays the returned answer.
rag(query)

" Yes, according to our FAQ database, even after the course starts, as a new student or someone who discovered the course later on, I am still eligible to submit homework assignments provided that we don't leave everything until the last minute and meet project deadlines:\n- For projects with immediate due dates. If you find yourself in need of assistance immediately after registration, please do not hesitate to contact me or visit my office hours for additional support during class time."