# RAG Intro using minsearch

* [DataTalksClub: LLM RAG Workshop](https://github.com/alexeygrigorev/llm-rag-workshop/tree/main/notebooks) – parse_faq.ipynb and documents.json
* Elasticsearch 8.4.3

To run Elasticsearch locally:

```bash
docker run -it \
    --rm \
    --name elasticsearch \
    -m 4GB \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.4.3
```

In [31]:
!pip install elasticsearch==8.12.1 load_dotenv openai tiktoken tqdm ipywidgets requests

Collecting tiktoken
  Using cached tiktoken-0.9.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (6.7 kB)
Collecting regex>=2022.1.18 (from tiktoken)
  Using cached regex-2024.11.6-cp313-cp313-macosx_11_0_arm64.whl.metadata (40 kB)
Using cached tiktoken-0.9.0-cp313-cp313-macosx_11_0_arm64.whl (1.0 MB)
Using cached regex-2024.11.6-cp313-cp313-macosx_11_0_arm64.whl (284 kB)
Installing collected packages: regex, tiktoken
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [tiktoken]
[1A[2KSuccessfully installed regex-2024.11.6 tiktoken-0.9.0


In [3]:
from elasticsearch import Elasticsearch

ELASTICSEARCH_URI = 'http://localhost:9200'

def get_elasticsearch_client():
    es_client = Elasticsearch(ELASTICSEARCH_URI, verify_certs=False)
    print("Elasticsearch client info: ", es_client.info())
    return es_client

In [16]:
ELASTICSEARCH_INDEX_NAME = 'module-1-homework-course-questions-v2'

def setup_elasticsearch_index(es_client):
    ELASTICSEARCH_INDEX_SETTINGS = {
        "settings": {
            "number_of_shards": 1,
            "number_of_replicas": 0
        },
        "mappings": {
            "properties": {
                "text": {"type": "text"},
                "section": {"type": "text"},
                "question": {"type": "text"},
                "course": {"type": "keyword"} 
            }
        }
    }

    if not es_client.indices.exists(index=ELASTICSEARCH_INDEX_NAME):
        try:
            es_client.indices.create(index=ELASTICSEARCH_INDEX_NAME, body=ELASTICSEARCH_INDEX_SETTINGS)
            print(f"Index '{ELASTICSEARCH_INDEX_NAME}' created.")
        except RequestError as e:
            print(f"Error creating index: {e.info}")
    else:
        print(f"Index '{ELASTICSEARCH_INDEX_NAME}' already exists.")

In [17]:
import requests

HOMEWORK_DOCS_URL = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'

def load_documents():
    docs_response = requests.get(HOMEWORK_DOCS_URL)
    documents_raw = docs_response.json()

    documents = []

    for course in documents_raw:
        course_name = course['course']

        for doc in course['documents']:
            doc['course'] = course_name
            documents.append(doc)

    return documents

In [22]:
from tqdm.auto import tqdm

def index_documents_into_elastcsearch(es_client, documents):
        if es_client.count(index=ELASTICSEARCH_INDEX_NAME)['count'] == 0:
            print("Indexing documents, document example: ", documents[0])

            for doc in tqdm(documents):
                es_client.index(index=ELASTICSEARCH_INDEX_NAME, body=doc)

In [27]:
RAG_DOCUMENT_COURSE_NAME = 'machine-learning-zoomcamp' #'data-engineering-zoomcamp'

def search(es_client, query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"], # # `question`` field is 3 times more important than `text``
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": RAG_DOCUMENT_COURSE_NAME
                    }
                }
            }
        }
    }
    results = es_client.search(index=ELASTICSEARCH_INDEX_NAME, body=search_query)

    result_docs = []
    for hit in results['hits']['hits']:
        print(hit)
        result_docs.append(hit['_source'])

    return result_docs

Q3. Searching

In [28]:
query = 'How do copy a file to a Docker container?'
es_client = get_elasticsearch_client()
setup_elasticsearch_index(es_client)
documents = load_documents()
index_documents_into_elastcsearch(es_client, documents)
search_results = search(es_client, query)
print(search_results)

Elasticsearch client info:  {'name': '743a0f47d938', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'VQRfejziSPWcLEWQCxE9MA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}
Index 'module-1-homework-course-questions-v2' already exists.
{'_index': 'module-1-homework-course-questions-v2', '_id': 'fLm6iJcBNMnxywQVvsal', '_score': 73.38676, '_source': {'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Ma

In [29]:
def build_prompt(query, search_results):
    context_template = """
    Q: {question}
    A: {text}
    """.strip()

    prompt_template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION.

    QUESTION: {question}

    CONTEXT:
    {context}
    """.strip()
    
    context = ""
    for doc in search_results:
        context += context_template.format(question=doc['question'], text=doc['text']) + '\n\n'

    prompt = prompt_template.format(question=query, context=context)

    print(len(prompt))

    return prompt

Q5. Building a prompt

In [34]:
prompt = build_prompt(query, search_results)

2197


Q6. Tokens

In [37]:
import tiktoken

encoding = tiktoken.encoding_for_model("gpt-4o-mini")

encoded = encoding.encode(prompt)
print(len(encoded))
encoded


500


[63842,
 261,
 4165,
 14029,
 29186,
 13,
 30985,
 290,
 150339,
 4122,
 402,
 290,
 31810,
 8099,
 591,
 290,
 40251,
 7862,
 558,
 271,
 7649,
 1606,
 290,
 19719,
 591,
 290,
 31810,
 8099,
 1261,
 55959,
 290,
 150339,
 364,
 271,
 150339,
 25,
 3253,
 621,
 5150,
 261,
 1974,
 316,
 261,
 91238,
 9282,
 1715,
 271,
 31810,
 8099,
 734,
 271,
 1486,
 25,
 3253,
 621,
 357,
 15199,
 261,
 62275,
 9282,
 3901,
 271,
 355,
 25,
 41281,
 290,
 9282,
 3621,
 306,
 25383,
 6766,
 326,
 151187,
 290,
 7251,
 4859,
 11,
 813,
 484,
 480,
 13217,
 261,
 38615,
 6348,
 558,
 68923,
 2461,
 533,
 278,
 2230,
 7962,
 4859,
 38615,
 464,
 3365,
 523,
 3335,
 290,
 9282,
 382,
 4279,
 6788,
 11,
 15792,
 261,
 6348,
 306,
 290,
 4857,
 9282,
 734,
 68923,
 10942,
 350,
 6555,
 290,
 9282,
 26240,
 446,
 68923,
 25398,
 533,
 278,
 464,
 6896,
 26240,
 29,
 38615,
 198,
 6103,
 277,
 10732,
 391,
 79771,
 1029,
 48,
 25,
 3253,
 621,
 357,
 5150,
 6291,
 591,
 922,
 2698,
 7342,
 316,
 62275,
 92

In [6]:
import os
from load_dotenv import load_dotenv
load_dotenv()

assert len(os.environ['OPENAI_API_KEY']) > 1

In [7]:
from openai import OpenAI

def llm(prompt, model='gpt-4o-mini'):
    client = OpenAI()
    response = client.chat.completions.create(
        model=model,
        messages=[{'role': 'user', 'content': prompt}]
    )
    return response.choices[0].message.content

In [8]:
es_client = get_elasticsearch_client()
setup_elasticsearch_index(es_client)
load_documents_into_elastcsearch(es_client)

def rag(es_client, query):
    search_results = search(es_client, 'The course already started, can I still enroll?')
    prompt = build_prompt(query, search_results)
    return llm(prompt)

Elasticsearch client info:  {'name': '743a0f47d938', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'VQRfejziSPWcLEWQCxE9MA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}
Index 'course-questions' already exists.


In [10]:
rag(es_client, 'The course already started, can I still enroll?')


"Yes, even if you don't register, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects."