# RAG Intro using Elasticsearch

* [DataTalksClub: LLM RAG Workshop](https://github.com/alexeygrigorev/llm-rag-workshop/tree/main/notebooks) – parse_faq.ipynb and documents.json
* Elasticsearch 8.4.3

To run Elasticsearch locally:

```bash
docker run -it \
    --rm \
    --name elasticsearch \
    -m 4GB \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.4.3
```

In [6]:
!pip install elasticsearch==8.12.1 load_dotenv openai tqdm ipywidgets

Collecting ipywidgets
  Using cached ipywidgets-8.1.7-py3-none-any.whl.metadata (2.4 kB)
Collecting widgetsnbextension~=4.0.14 (from ipywidgets)
  Using cached widgetsnbextension-4.0.14-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab_widgets~=3.0.15 (from ipywidgets)
  Using cached jupyterlab_widgets-3.0.15-py3-none-any.whl.metadata (20 kB)
Using cached ipywidgets-8.1.7-py3-none-any.whl (139 kB)
Using cached jupyterlab_widgets-3.0.15-py3-none-any.whl (216 kB)
Using cached widgetsnbextension-4.0.14-py3-none-any.whl (2.2 MB)
Installing collected packages: widgetsnbextension, jupyterlab_widgets, ipywidgets
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [ipywidgets]3[0m [ipywidgets]
[1A[2KSuccessfully installed ipywidgets-8.1.7 jupyterlab_widgets-3.0.15 widgetsnbextension-4.0.14


In [1]:
from elasticsearch import Elasticsearch, RequestError

ELASTICSEARCH_URI = 'http://localhost:9200'

def get_elasticsearch_client():
    """Build an Elasticsearch client for ``ELASTICSEARCH_URI`` and print the cluster info.

    Returns:
        The client instance, constructed with certificate verification disabled.
    """
    client = Elasticsearch(ELASTICSEARCH_URI, verify_certs=False)
    # Calling info() up front surfaces connection problems immediately.
    cluster_info = client.info()
    print("Elasticsearch client info: ", cluster_info)
    return client

In [2]:
ELASTICSEARCH_INDEX_NAME = 'course-questions'

def setup_elasticsearch_index(es_client):
    """Create the ``ELASTICSEARCH_INDEX_NAME`` index unless it already exists.

    Args:
        es_client: Elasticsearch client; ``indices.exists`` and
            ``indices.create`` are called on it.

    Returns:
        None. The outcome (created / already exists / error) is printed.

    A ``RequestError`` raised while creating the index is reported on stdout
    and not re-raised, so the notebook keeps running.
    """
    if es_client.indices.exists(index=ELASTICSEARCH_INDEX_NAME):
        print(f"Index '{ELASTICSEARCH_INDEX_NAME}' already exists.")
        return

    index_settings = {
        "settings": {
            "number_of_shards": 1,
            "number_of_replicas": 0
        },
        "mappings": {
            "properties": {
                "text": {"type": "text"},
                "section": {"type": "text"},
                "question": {"type": "text"},
                # `course` is a keyword so it can be matched exactly by a term filter.
                "course": {"type": "keyword"}
            }
        }
    }

    try:
        es_client.indices.create(index=ELASTICSEARCH_INDEX_NAME, body=index_settings)
    except RequestError as e:
        # RequestError is imported from `elasticsearch` next to the client class;
        # without that import this handler itself would fail with a NameError.
        print(f"Error creating index: {e.info}")
    else:
        print(f"Index '{ELASTICSEARCH_INDEX_NAME}' created.")

In [3]:
import json
from tqdm.auto import tqdm

RAG_DOCUMENT_FILENAME = 'documents.json'

def load_documents_into_elastcsearch(es_client):
    """Index the documents from ``RAG_DOCUMENT_FILENAME`` unless the index already holds data.

    Loading happens when the ``ELASTICSEARCH_INDEX_NAME`` index is missing or
    exists but contains no documents. Checking only for existence would skip
    the load whenever the (empty) index had just been created.

    Args:
        es_client: Elasticsearch client; ``indices.exists``, ``count``,
            ``index`` and ``indices.refresh`` are called on it.

    Returns:
        None. Progress is reported on stdout and through a tqdm progress bar.
    """
    if es_client.indices.exists(index=ELASTICSEARCH_INDEX_NAME):
        doc_count = es_client.count(index=ELASTICSEARCH_INDEX_NAME)['count']
        if doc_count > 0:
            print(f"Index '{ELASTICSEARCH_INDEX_NAME}' already contains {doc_count} documents; skipping load.")
            return

    with open(RAG_DOCUMENT_FILENAME, 'rt', encoding='utf-8') as f_in:
        docs_raw = json.load(f_in)

    # Flatten the per-course structure and tag every document with its course.
    documents = [
        {**doc, 'course': course_dict['course']}
        for course_dict in docs_raw
        for doc in course_dict['documents']
    ]

    if not documents:
        print(f"No documents found in '{RAG_DOCUMENT_FILENAME}'.")
        return

    print("Document example: ", documents[0])

    for doc in tqdm(documents):
        es_client.index(index=ELASTICSEARCH_INDEX_NAME, body=doc)

    # Make the new documents visible to searches and counts right away.
    es_client.indices.refresh(index=ELASTICSEARCH_INDEX_NAME)

In [4]:
RAG_DOCUMENT_COURSE_NAME = 'data-engineering-zoomcamp'

def search(es_client, query, course=None, size=5):
    """Run a full-text search over the FAQ index, restricted to one course.

    Args:
        es_client: Elasticsearch client; ``search`` is called on it.
        query: Free-text question to match against the documents.
        course: Value the ``course`` field must equal. Defaults to
            ``RAG_DOCUMENT_COURSE_NAME`` (looked up at call time).
        size: Maximum number of hits to return.

    Returns:
        A list with the ``_source`` dict of every hit, in the order returned
        by Elasticsearch.
    """
    if course is None:
        course = RAG_DOCUMENT_COURSE_NAME

    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # `question` is boosted 3x relative to `text` and `section`.
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }
    results = es_client.search(index=ELASTICSEARCH_INDEX_NAME, body=search_query)

    return [hit['_source'] for hit in results['hits']['hits']]

In [5]:
def build_prompt(query, search_results):
    """Render the LLM prompt for ``query`` using the retrieved FAQ entries.

    Args:
        query: The user's question; substituted for ``{question}``.
        search_results: Iterable of dicts with ``section``, ``question`` and
            ``text`` keys; each one becomes a three-line entry in the context.

    Returns:
        The prompt string with the question and the concatenated context filled in.
    """
    template = (
        "\n"
        "        You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from FAQ database.\n"
        "        Use only the facts from the CONTEXT to answer the QUESTION.\n"
        "        If the CONTEXT doesn't contain the answer, output NONE\n"
        "\n"
        "        QUESTION: {question}\n"
        "        CONTEXT: {context}\n"
        "    "
    )

    entries = (
        f'section: {entry["section"]}\nquestion: {entry["question"]}\nanswer: {entry["text"]}\n'
        for entry in search_results
    )
    return template.format(question=query, context="".join(entries))

In [6]:
import os
from load_dotenv import load_dotenv
# Load environment variables via load_dotenv (presumably from a local .env file).
load_dotenv()

# Fail with a readable message, rather than a bare KeyError, when the key is missing.
assert len(os.environ.get('OPENAI_API_KEY', '')) > 1, "OPENAI_API_KEY is not set; export it or add it to your .env file"

In [7]:
from openai import OpenAI

def llm(prompt, model='gpt-4o-mini'):
    """Send ``prompt`` as a single user message to the chat completions API.

    Args:
        prompt: Text placed in the content of the user message.
        model: Name of the chat model to call.

    Returns:
        The message content of the first choice in the response.
    """
    user_message = {'role': 'user', 'content': prompt}
    completion = OpenAI().chat.completions.create(
        model=model,
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [8]:
# Wire everything up: create the client first, then run the index setup and the
# document loader against it. The order of these calls matters.
es_client = get_elasticsearch_client()
setup_elasticsearch_index(es_client)
load_documents_into_elastcsearch(es_client)

def rag(es_client, query):
    """Answer ``query`` with retrieval-augmented generation.

    Retrieves matching FAQ entries for the query, builds a prompt from them
    and returns the LLM's answer.

    Args:
        es_client: Elasticsearch client passed through to ``search``.
        query: The user's question; used both for retrieval and in the prompt.

    Returns:
        Whatever ``llm`` returns for the generated prompt.
    """
    # Retrieve context for the question actually asked, not a fixed sample question.
    search_results = search(es_client, query)
    prompt = build_prompt(query, search_results)
    return llm(prompt)

Elasticsearch client info:  {'name': '743a0f47d938', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'VQRfejziSPWcLEWQCxE9MA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}
Index 'course-questions' already exists.


In [10]:
rag(es_client, 'The course already started, can I still enroll?')


"Yes, even if you don't register, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects."