In [11]:
import minsearch
import json

In [12]:
# Load the raw FAQ data. The encoding is pinned to UTF-8 (the JSON interchange
# encoding) so the read does not depend on the platform's locale default.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [13]:
# Inspect the `course` field of the second entry in the raw data.
docs_raw[1]['course']

'machine-learning-zoomcamp'

In [14]:
# Flatten the per-course structure into one list of FAQ records, tagging each
# record with the course it belongs to. A new dict is built per record so the
# entries inside `docs_raw` are not mutated and the raw data stays as loaded.
documents = [
    {**doc, 'course': course_dict['course']}
    for course_dict in docs_raw
    for doc in course_dict['documents']
]


In [15]:
# Check which fields the first flattened record carries.
documents[0].keys()

dict_keys(['text', 'section', 'question', 'course'])

In [16]:
# Create the minsearch index: question/text/section are registered as text
# fields, and course as a keyword field (used as a filter in `search`).
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

In [34]:
# Fit the index on the flattened FAQ records.
index.fit(documents)

<minsearch.Index at 0x1156347f0>

In [37]:
from openai import OpenAI

# NOTE(review): no API key is passed explicitly, so credentials presumably
# come from the environment — confirm before running on a new machine.
client = OpenAI()

In [47]:
def search(query, course="data-engineering-zoomcamp", num_results=5):
    """Retrieve FAQ entries matching ``query`` from the minsearch index.

    Args:
        query: Free-text user question.
        course: Value the ``course`` keyword field is filtered on. Defaults
            to the data-engineering course, as in the original behaviour.
        num_results: Maximum number of results requested from the index.

    Returns:
        The result of ``index.search`` (consumed downstream by
        ``build_prompt`` as an iterable of document dicts).
    """
    # Boost factors handed to minsearch: question 3.0, section 0.5.
    boost = {"question": 3.0, "section": 0.5}

    return index.search(
        query=query,
        filter_dict={"course": course},
        boost_dict=boost,
        num_results=num_results,
    )

In [54]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from the user question and retrieved FAQ entries.

    Each item of ``search_results`` is read through its ``section``,
    ``question`` and ``text`` keys and rendered as one context entry. The
    filled-in template is returned with surrounding whitespace stripped.
    """
    template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT. 
Use only the facts from the CONTEXT when answering the QUESTION. 
If the CONTEXT doesn't have answer, output NONE.

QUESTION: {question}

CONTEXT: {context}

    """

    # One rendered entry per retrieved document, each terminated by a blank line.
    entries = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    ]
    context = "\n" + "".join(entries)

    return template.format(question=query, context=context).strip()

In [57]:
def llm(prompt, model="gpt-3.5-turbo-0125"):
    """Send ``prompt`` as a single user message and return the model's reply.

    Args:
        prompt: Full prompt text to send.
        model: Chat model identifier. Defaults to the model originally
            hard-coded here, so existing calls behave as before.

    Returns:
        The message content of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content

In [60]:
def rag(query, search_fn=None):
    """Answer ``query`` with retrieval-augmented generation.

    Args:
        query: User question.
        search_fn: Optional callable taking the query and returning the
            documents to use as context. When omitted, the minsearch-backed
            ``search`` is used, as in the original behaviour.

    Returns:
        The answer produced by ``llm`` for the assembled prompt.
    """
    if search_fn is None:
        # Looked up at call time so a later redefinition of `search` is used.
        search_fn = search

    search_results = search_fn(query)
    prompt = build_prompt(query, search_results)

    return llm(prompt)

In [61]:
# Run the minsearch-backed RAG pipeline end to end on a sample question.
q = "how do i run kafka?"

answer = rag(q)

print(answer)

To run Kafka, you can use the command: java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java as mentioned in Module 6 under streaming with kafka.


In [63]:
from elasticsearch import Elasticsearch



In [64]:
# Client pointed at an Elasticsearch node on localhost:9200 over plain HTTP.
es_client = Elasticsearch("http://localhost:9200")

In [66]:
# Request the server's info to confirm the connection works.
es_client.info()

ObjectApiResponse({'name': 'e24fa7a0e67d', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'krg7iUBxR3CcjrONMlKAQg', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [67]:
# Index definition: one shard, no replicas. The three free-text fields are
# mapped as `text`, and `course` as `keyword` (it is used in a term filter).
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Drop any index left over from a previous run before creating it again.
# Without this, re-running the cell fails because the index already exists.
# `ignore_unavailable=True` makes the delete a no-op on a fresh cluster.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [69]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [71]:
# Send every FAQ record to Elasticsearch, one request per document, with a
# tqdm progress bar.
# NOTE(review): no explicit document id is passed, so re-running this cell
# presumably stores duplicates — confirm before re-ingesting.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 948/948 [00:02<00:00, 380.73it/s]


In [74]:
# Sample question used to exercise the Elasticsearch-backed pipeline.
query = "I just discovered the course. Can I still join it?"

In [86]:
def elastic_search(query, course="data-engineering-zoomcamp", num_results=5):
    """Retrieve FAQ entries matching ``query`` from Elasticsearch.

    Args:
        query: Free-text user question.
        course: Value the ``course`` field is filtered on. Defaults to the
            data-engineering course, as in the original behaviour.
        num_results: Value sent as the request's ``size``.

    Returns:
        A list with the ``_source`` document of every hit in the response.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # "question^3" weights question matches 3x.
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                # Restrict hits to a single course.
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [87]:
def rag_elastic(query):
    """Answer ``query`` using Elasticsearch retrieval followed by the LLM.

    Retrieves documents with ``elastic_search``, turns them into a prompt
    with ``build_prompt`` and returns what ``llm`` produces for that prompt.
    """
    retrieved = elastic_search(query)
    return llm(build_prompt(query, retrieved))

In [89]:
# Run the Elasticsearch-backed RAG pipeline on the sample question.
rag_elastic(query)

"Yes, you can still join the course even after the start date. You'll be able to submit homeworks and work towards completing the final projects, but make sure to manage your time effectively to meet the deadlines."