## Creating an index and populating it with documents using Elasticsearch

A simple example of how to ingest PDF documents, and then web page content, into an Elasticsearch vector store.

Requirements:
- An Elasticsearch cluster deployed on OpenShift


### Needed packages

In [None]:
!pip install --upgrade pip
!pip install accelerate
!pip install elasticsearch
!pip install huggingface
!pip install langchain
!pip install langchain_community
!pip install langchain-huggingface
!pip install openai
!pip install pypdf
!pip install sentence-transformers
!pip install transformers
!pip install torch



### Base parameters, the Elasticsearch info

In [2]:
import os as os
# Documentation release to ingest. Kept as a string, not a float, so it is
# interpolated into URLs, folder names and index names exactly as written
# (a float such as 2.10 would be rendered as "2.1").
product_version = "2.11"

# Elasticsearch endpoint (without the scheme; "https://" is prepended when the
# client is created) and the password of the "elastic" user. Both are read from
# the environment and fall back to the literal 'default' when unset.
CONNECTION_STRING = os.environ.get('CONNECTION_STRING', 'default')
PASSWORD = os.environ.get('PASSWORD', 'default')

# Version-specific name derived from the release above.
COLLECTION_NAME = f"rhoai-doc-{product_version}"

### Verify Elasticsearch connection

In [7]:
from elasticsearch import Elasticsearch
import sys as sys

# Client for the cluster, authenticated as the "elastic" user.
# TLS certificate verification is switched off (verify_certs=False).
es = Elasticsearch(
    f"https://{CONNECTION_STRING}",
    basic_auth=("elastic", PASSWORD),
    verify_certs=False,
)

# Guard clause: stop the notebook here when the cluster does not answer.
if not es.ping():
    print("Could not connect to Elasticsearch. Terminating execution of notebook.")
    sys.exit()

print("Connected to Elasticsearch!")

Connected to Elasticsearch!




#### Imports

In [None]:
# Document loaders, text splitter and embedding wrapper used by the ingestion
# cells. They are imported from the split LangChain packages
# (langchain_community, langchain-huggingface, langchain-text-splitters)
# because the equivalent `langchain.*` re-export paths are deprecated.
# NOTE(review): langchain-text-splitters is not installed explicitly by this
# notebook; it is expected to arrive as a dependency of the LangChain packages.
from langchain_community.document_loaders import PyPDFDirectoryLoader, WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings

## Initial index creation and document ingestion

#### Download and load PDFs

In [None]:
# Names of the product guides to ingest, as they appear in the documentation URLs.
documents = [
    "release_notes",
    "introduction_to_red_hat_openshift_ai",
    "getting_started_with_red_hat_openshift_ai_self-managed",
    "openshift_ai_tutorial_-_fraud_detection_example",
    "developing_a_model",
    "integrating_data_from_amazon_s3",
    "working_on_data_science_projects",
    "serving_models",
    "monitoring_data_science_models",
    "managing_users",
    "managing_resources",
    "installing_and_uninstalling_openshift_ai_self-managed",
    "installing_and_uninstalling_openshift_ai_self-managed_in_a_disconnected_environment",
    "upgrading_openshift_ai_self-managed",
    "upgrading_openshift_ai_self-managed_in_a_disconnected_environment",
]

# Download URL of the PDF edition of each guide for the selected product version.
pdfs = [
    "https://access.redhat.com/documentation/en-us/red_hat_openshift_ai_self-managed/"
    f"{product_version}/pdf/{guide}/"
    f"red_hat_openshift_ai_self-managed-{product_version}-{guide}-en-us.pdf"
    for guide in documents
]

# Maps each PDF file name (without the ".pdf" extension) to the URL of the
# single-page HTML edition of the same guide.
pdfs_to_urls = {
    f"red_hat_openshift_ai_self-managed-{product_version}-{guide}-en-us": (
        "https://access.redhat.com/documentation/en-us/red_hat_openshift_ai_self-managed/"
        f"{product_version}/html-single/{guide}/index"
    )
    for guide in documents
}

In [None]:
import requests
import os

# Local folder that receives the downloaded PDFs (one folder per product version).
docs_dir = f"rhoai-doc-{product_version}"

# exist_ok=True makes the cell safe to re-run and avoids a check-then-create race.
os.makedirs(docs_dir, exist_ok=True)

# Download every PDF; a document that cannot be fetched is reported and skipped
# so that one failure does not abort the whole ingestion.
for pdf in pdfs:
    try:
        # The timeout keeps an unresponsive server from hanging the notebook.
        response = requests.get(pdf, timeout=60)
    except requests.RequestException as exc:
        # Only request-level failures are skipped; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        print(f"Skipped {pdf} ({exc})")
        continue
    if response.status_code != 200:
        print(f"Skipped {pdf} (HTTP {response.status_code})")
        continue
    # The file is named after the last path segment of the URL.
    with open(f"{docs_dir}/{pdf.split('/')[-1]}", 'wb') as f:
        f.write(response.content)

In [None]:
# Folder holding the downloaded PDFs, relative to the current working directory.
pdf_folder_path = f"./rhoai-doc-{product_version}"

# Load the PDF files found in that folder through LangChain's directory loader.
pdf_loader = PyPDFDirectoryLoader(pdf_folder_path)
pdf_docs = pdf_loader.load()

#### Inject metadata

In [None]:
from pathlib import Path

# Replace the "source" of each loaded PDF document (a local file path) with the
# URL of the matching HTML guide, looked up by the file name without extension.
for doc in pdf_docs:
    local_source = doc.metadata["source"]
    # A PDF that has no entry in `pdfs_to_urls` (for example a file that was
    # already present in the folder) keeps its local path instead of raising
    # a KeyError and aborting the cell.
    doc.metadata["source"] = pdfs_to_urls.get(Path(local_source).stem, local_source)

#### Load websites

In [None]:
# Web pages to ingest in addition to the PDF guides.
websites = [
    "https://ai-on-openshift.io/getting-started/openshift/",
    "https://ai-on-openshift.io/getting-started/opendatahub/",
    "https://ai-on-openshift.io/getting-started/openshift-ai/",
    "https://ai-on-openshift.io/odh-rhoai/configuration/",
    "https://ai-on-openshift.io/odh-rhoai/custom-notebooks/",
    "https://ai-on-openshift.io/odh-rhoai/nvidia-gpus/",
    "https://ai-on-openshift.io/odh-rhoai/custom-runtime-triton/",
    "https://ai-on-openshift.io/odh-rhoai/openshift-group-management/",
    "https://ai-on-openshift.io/tools-and-applications/minio/minio/",
    "https://access.redhat.com/articles/7047935",
    "https://access.redhat.com/articles/rhoai-supported-configs",
]

In [None]:
# Fetch the pages listed in `websites` and load them as documents.
website_loader = WebBaseLoader(websites)
website_docs = website_loader.load()

#### Merge both types of docs

In [None]:
# Single list with the PDF documents first, followed by the web pages.
docs = [*pdf_docs, *website_docs]

#### Split documents into chunks with some overlap

In [None]:
# Split the documents into chunks of at most 1024 characters, with 40
# characters of overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=40,
)
all_splits = text_splitter.split_documents(docs)

# Display the first chunk as a quick look at the result of the split.
all_splits[0]

#### Clean up the documents: remove the NUL character, '\x00', from text fields.

In [None]:
# Strip every NUL character from the text of each chunk, in place.
for chunk in all_splits:
    cleaned_text = chunk.page_content.replace('\x00', '')
    chunk.page_content = cleaned_text

#### Create the index and ingest the documents

In [None]:
# ElasticsearchStore is imported from langchain_community (installed by this
# notebook); the `langchain.vectorstores` re-export path is deprecated.
from langchain_community.vectorstores import ElasticsearchStore
from elasticsearch import Elasticsearch

# Embedding model, created with the wrapper's default settings.
embeddings = HuggingFaceEmbeddings()

# Client for the cluster, authenticated as the "elastic" user.
# NOTE(review): verify_certs=False turns off TLS certificate verification;
# confirm this is acceptable for the target cluster.
es = Elasticsearch(
    "https://" + CONNECTION_STRING,
    basic_auth=("elastic", PASSWORD),
    verify_certs=False
)

# Vector store backed by the "rhoai-docs" index.
# NOTE(review): COLLECTION_NAME is defined in the parameters cell but is not
# used here; the index name is hard-coded.
db = ElasticsearchStore(
    embedding=embeddings,
    index_name="rhoai-docs",
    es_connection=es
)

# Embed and index all chunks. The returned IDs are kept in a variable and only
# their count is printed, so the cell output is not flooded with one ID per chunk.
ingested_ids = db.add_documents(all_splits)
print(f"Ingested {len(ingested_ids)} chunks into the index.")

#### Test query

In [None]:
# Sample question used to check that the ingested content can be retrieved.
query = "How can I work with GPU and taints in OpenShift AI?"
# Run a similarity search that returns each matching document with its score.
docs_with_score = db.similarity_search_with_score(query)

In [None]:
# Print every hit (score first, then the chunk text) framed by separator lines.
separator = "-" * 80
for hit, hit_score in docs_with_score:
    print(separator)
    print("Score: ", hit_score)
    print(hit.page_content)
    print(separator)