In [1]:
import elasticsearch

In [2]:
# Start from a clean slate: remove any documents.json left in the current
# directory by an earlier run. Attempting the removal and tolerating
# "file not found" avoids the gap between checking for the file and deleting it.
import os

try:
    os.remove("documents.json")
except FileNotFoundError:
    # Nothing to clean up.
    pass

In [3]:
# Download documents.json from the llm-rag-workshop repository into the current working directory.
!wget https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json

--2024-06-27 11:47:33--  https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json
Resolving github.com (github.com)... 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/alexeygrigorev/llm-rag-workshop/main/notebooks/documents.json [following]
--2024-06-27 11:47:34--  https://raw.githubusercontent.com/alexeygrigorev/llm-rag-workshop/main/notebooks/documents.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 658332 (643K) [text/plain]
Saving to: ‘documents.json’


2024-06-27 11:47:34 (52.3 MB/s) - ‘documents.json’ saved [658332/658332]



In [4]:
import json

In [5]:
# Parse the downloaded file into Python objects. The encoding is pinned to
# UTF-8 so that reading does not depend on the platform's default locale encoding.
with open('./documents.json', 'rt', encoding='utf-8') as f_in:
    documents_all = json.load(f_in)

In [6]:
# Flat, cross-course list of documents (populated by the following cell).
documents = []

In [7]:
# Flatten the per-course structure into one list, tagging every document with
# the name of the course it belongs to. Building the list in a single
# expression (instead of appending to an existing list) means re-running this
# cell cannot duplicate entries, and copying each document leaves the dicts
# inside documents_all unmodified.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_all
    for doc in course['documents']
]

In [8]:
from elasticsearch import Elasticsearch

# Client for the Elasticsearch node at localhost:9200.
es = Elasticsearch("http://localhost:9200")
# Connectivity check: as the cell's last expression, the response is displayed as the cell output.
es.info()

ObjectApiResponse({'name': '8c069d42c0ef', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'xxmFKwqCShWhem_7BBKwXQ', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [9]:
# Name of the index that holds the course documents.
index_name = "course-questions"

# Index definition: a single shard with no replicas. The three free-text
# fields are mapped as analysed "text"; "course" is a "keyword" so it can be
# matched exactly (it is used in a term filter when searching).
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            **{field: {"type": "text"} for field in ("text", "section", "question")},
            "course": {"type": "keyword"},
        }
    },
}


In [10]:
# Drop the index if it already exists, so this cell can be re-run and always
# starts from an empty index.
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)

# Recreate the index with the settings and mappings defined in index_settings.
response = es.indices.create(index=index_name, body=index_settings)
# Last expression of the cell: displays the creation response.
response

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [11]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Index the documents with one es.index call per document; tqdm shows progress.
for doc in tqdm(documents):
    es.index(index=index_name, document=doc)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:32<00:00, 29.05it/s]


In [13]:
def retrieve_documents(query, index_name="course-questions", max_results=5,
                       course="data-engineering-zoomcamp"):
    """Return the best-matching documents for ``query`` from Elasticsearch.

    Runs a ``multi_match`` query (type ``best_fields``) over the ``question``,
    ``text`` and ``section`` fields, with ``question`` boosted by a factor of
    3, and restricts the hits to a single course through a ``term`` filter.

    Args:
        query: Free-text search string.
        index_name: Name of the index to search.
        max_results: Maximum number of hits to return.
        course: Value the ``course`` field must equal. Defaults to the course
            that used to be hard-coded, so existing calls behave the same.

    Returns:
        A list with the ``_source`` of each hit, in the order Elasticsearch
        returned them.
    """
    # A client is created on every call, so the function does not depend on
    # a client object defined elsewhere in the notebook.
    es = Elasticsearch("http://localhost:9200")

    search_query = {
        "size": max_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es.search(index=index_name, body=search_query)
    # Named `hits_sources` rather than `documents` to avoid shadowing the
    # notebook-level list of the same name.
    hits_sources = [hit['_source'] for hit in response['hits']['hits']]
    return hits_sources

In [14]:
from openai import AzureOpenAI
# Sample question; the later cells visible in this notebook do not read this variable.
user_question = "How do I join the course after it has started?"
# Azure OpenAI client; the API key and the endpoint are read from environment variables.
client = AzureOpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    api_version="2024-02-01",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
)

# Passed as the `model` argument of the chat completion call in ask_openai.
deployment_name = "Demo"

In [15]:
# Layout of one retrieved document inside the prompt context: three labelled
# lines, filled from the document's "section", "question" and "text" fields.
context_template = "\n".join([
    "Section: {section}",
    "Question: {question}",
    "Answer: {text}",
])

In [16]:
# Layout of the user message: the question, a "CONTEXT:" heading and the
# rendered context, each separated by a blank line.
prompt_template = "\n\n".join([
    "QUESTION: {user_question}",
    "CONTEXT:",
    "{context}",
])

In [17]:
# System message that qa_bot passes to ask_openai alongside the generated user prompt.
system_prompt = "You're a course teaching assistant. Answer the user QUESTION from students based on the provided CONTEXT."

In [18]:
def build_context(documents):
    """Render documents into a single context string for the prompt.

    Each document is formatted with ``context_template`` (it must provide the
    ``section``, ``question`` and ``text`` keys; extra keys are ignored) and
    the rendered entries are separated by a blank line.

    Args:
        documents: Iterable of dict-like documents.

    Returns:
        The joined context with leading and trailing whitespace removed;
        an empty string when ``documents`` is empty.

    Raises:
        KeyError: If a document lacks a key the template refers to.
    """
    # str.join builds the result in one pass instead of growing a string
    # with repeated concatenation.
    rendered = "\n\n".join(context_template.format(**doc) for doc in documents)
    return rendered.strip()

In [19]:
def build_prompt(user_question, documents):
    """Fill ``prompt_template`` with the question and the rendered context.

    Args:
        user_question: The question to place in the prompt.
        documents: Documents handed to ``build_context`` to produce the context.

    Returns:
        The formatted prompt string.
    """
    rendered_context = build_context(documents)
    return prompt_template.format(context=rendered_context, user_question=user_question)

In [20]:
def ask_openai(system_prompt, prompt):
    """Send a system/user message pair to the chat model and return its reply.

    The request goes through the notebook-level ``client`` using
    ``deployment_name`` as the model and a temperature of 0.7.

    Args:
        system_prompt: Content of the system message.
        prompt: Content of the user message.

    Returns:
        The message content of the first choice in the response.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(
        model=deployment_name,
        messages=conversation,
        temperature=0.7,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [21]:
def qa_bot(user_question):
    """Answer a question using retrieval followed by a chat completion.

    Retrieves documents for the question, builds the prompt from them and
    sends it to the model together with the notebook-level ``system_prompt``.

    Args:
        user_question: The question to answer.

    Returns:
        Whatever ``ask_openai`` returns for the generated prompt.
    """
    prompt = build_prompt(user_question, retrieve_documents(user_question))
    return ask_openai(system_prompt, prompt)

In [22]:
# End-to-end check: retrieve context, query the model, and show the answer as the cell output.
qa_bot("can I still join the course after it started?")

"Answer: Yes, you can still join the course after it has started. Even if you don't register, you can still submit the homeworks. However, keep in mind that there will be deadlines for turning in the final projects, so don't wait until the last minute."