In [10]:
import minsearch
import json
import os
from dotenv import load_dotenv
import requests


dotenv_path = '/Users/dandyrahman/Documents/Projects/LLM-ZoomCamp/.env'
load_dotenv(dotenv_path)

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
DATASAUR_API_KEY = os.environ.get("DATASAUR_API_KEY")

In [8]:
API_URL = "https://llm.datasaur.ai/api/llm-applications/2419/1260/playground-1"
headers = {
  'Authorization': f'Bearer {DATASAUR_API_KEY}',
  'Content-Type': 'application/json'
}

In [26]:
def llm_openai_datasaur(prompt):
  response = requests.post(API_URL, headers=headers, json={"prompt":prompt})
  answer = response.json()['message']

  return answer

output = llm_openai_datasaur("Hello!")
print(output['message'])

TypeError: string indices must be integers

In [11]:
with open('documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [12]:
docs_raw[1]['course']

'machine-learning-zoomcamp'

In [13]:
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)


In [14]:
documents[0].keys()

dict_keys(['text', 'section', 'question', 'course'])

In [15]:
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

In [16]:
index.fit(documents)

<minsearch.Index at 0x12aafb7c0>

In [18]:
from openai import OpenAI

client = OpenAI(api_key=OPENAI_API_KEY)

In [19]:
def search(query):
    boost = {"question": 3.0, "section": 0.5}

    results = index.search(
        query=query,
        filter_dict={"course": "data-engineering-zoomcamp"},
        boost_dict=boost,
        num_results=5
    )

    return results

In [20]:
def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT. 
Use only the facts from the CONTEXT when answering the QUESTION. 
If the CONTEXT doesn't have answer, output NONE.

QUESTION: {question}

CONTEXT: {context}

    """

    context = "\n"

    for doc in search_results:
        context += f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()

    return prompt

In [21]:
def llm(prompt):
    response = client.chat.completions.create(
                    model= "gpt-3.5-turbo-0125",
                    messages=[{"role": "user",
                            "content": prompt}]
                )
    
    return response.choices[0].message.content

In [24]:
def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm_openai_datasaur(prompt)

    return answer

In [27]:
q = "how do i run kafka?"

answer = rag(q)

print(answer)

To run Kafka's producer, consumer, kstreams, etc., in the terminal for your project, navigate to the project directory and use the command: `java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java`. This will start your Kafka components as needed. If you have any specific requirements or dependencies, make sure they are all properly set up before running the command.


In [28]:
from elasticsearch import Elasticsearch

In [29]:
es_client = Elasticsearch("http://localhost:9200")

In [30]:
es_client.info()

ObjectApiResponse({'name': 'e24fa7a0e67d', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'krg7iUBxR3CcjrONMlKAQg', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [31]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"

es_client.indices.create(index=index_name, body=index_settings)

BadRequestError: BadRequestError(400, 'resource_already_exists_exception', 'index [course-questions/NZpPiXJrQGmhkCrNIx3vVA] already exists')

In [32]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [33]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 948/948 [00:03<00:00, 286.32it/s]


In [34]:
query = "I just discovered the course. Can I still join it?"

In [35]:
def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    result_docs = []

    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

In [36]:
def rag_elastic(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm_openai_datasaur(prompt)

    return answer

In [89]:
rag_elastic(query)

"Yes, you can still join the course even after the start date. You'll be able to submit homeworks and work towards completing the final projects, but make sure to manage your time effectively to meet the deadlines."