# Prepare environment

* https://cookbook.openai.com/examples/vector_databases/elasticsearch/elasticsearch-semantic-search

## Init base libs

* Install libs

In [None]:
%%capture
%pip install \
    python-dotenv \
    pyyaml

In [None]:
import os
from dotenv import load_dotenv

# Load variables from a .env file (if one is found) into the process
# environment; later cells read OPENAI_API_KEY through os.getenv.
load_dotenv()

In [None]:
import yaml

def read_yaml_file(file_path):
    """
    Reads a YAML file and returns its parsed content.

    Args:
        file_path: Path of the YAML file to read.

    Returns:
        The object produced by ``yaml.safe_load`` for the document.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        return yaml.safe_load(file)
    
# config = read_yaml_file("./config.yml")
# config["cloud_id"], api_key=config["api_key"]

In [None]:
import json

def read_json_file(file_path):
    """
    Reads and loads a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON document (dict, list, or scalar).
    """
    # JSON text is UTF-8; decode explicitly instead of relying on the
    # platform's default locale encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)

## Init ElasticSearch

In [None]:
%%capture
%pip install \
    elasticsearch

In [None]:
import elasticsearch
from elasticsearch import Elasticsearch, helpers

def get_client_es(hosts: str = "http://elasticsearch:9200", max_retries: int = 5, request_timeout: int = 600):
    """
    Builds an Elasticsearch client that connects to ``hosts``.

    A new client is created on every call; no credentials or config file
    are involved.

    Args:
        hosts: URL of the Elasticsearch node to connect to.
        max_retries: Retry count applied to the returned client through
            ``Elasticsearch.options``.
        request_timeout: Request timeout handed to the ``Elasticsearch``
            constructor.

    Returns:
        The client returned by ``Elasticsearch.options(max_retries=...)``.
    """
    client = Elasticsearch(hosts=hosts, request_timeout=request_timeout)
    return client.options(max_retries=max_retries)

# Notebook-level client reused by later cells (bulk load and searches).
es = get_client_es()
es.info() # should return cluster info

In [None]:
# https://www.elastic.co/search-labs/tutorials/search-tutorial/full-text-search/create-index
def create_index(index_name: str, mappings):
    """
    Creates ``index_name`` when it is missing, otherwise fetches it.

    Args:
        index_name: Name of the index to create or look up.
        mappings: Request body passed as-is to ``indices.create``.

    Returns:
        The ``indices.create`` response for a new index, or the
        ``indices.get`` response when the index already exists.

    Raises:
        RuntimeError: If the create call answers with a status other than 200.
    """
    client = get_client_es()

    # Existing index: report it and hand back its current definition.
    if client.indices.exists(index=index_name):
        print(f"Index '{index_name}' already exists.")
        return client.indices.get(index=index_name)

    created = client.indices.create(index=index_name, body=mappings)
    if created.meta.status != 200:
        raise RuntimeError("failed to create index")
    print(f"Index '{index_name}' created successfully.")
    return created

In [None]:
def count_index(index_name: str) -> int:
    """
    Returns the document count of ``index_name`` as reported by the cat count API.
    """
    rows = get_client_es().cat.count(index=index_name, format="json")
    # The JSON form of the cat API is a list of rows; the count sits in the first one.
    first_row = rows[0]
    return int(first_row["count"])

In [None]:
def search_index(index_name: str, body):
    """
    Runs a search with request ``body`` against ``index_name`` and returns the raw response.
    """
    return get_client_es().search(index=index_name, body=body)

## Init openai

In [None]:
%%capture
# https://platform.openai.com/docs/libraries
%pip install \
    openai

In [None]:
import openai  # for calling the OpenAI API
from getpass import getpass
import os

# Load your API key from an environment variable or secret management service
# openai.api_key = getpass("Enter OpenAI API key") # os.getenv("OPENAI_API_KEY")

from openai import OpenAI

# OpenAI client used by get_embedding; the key is read from the
# OPENAI_API_KEY environment variable.
client = OpenAI(
  api_key = os.getenv("OPENAI_API_KEY") # getpass("Enter OpenAI API key")
)

### OpenAI

#### Embedding

In [None]:
def normalize_text(text):
    """
    Collapses every run of whitespace in ``text`` to a single space and trims both ends.
    """
    tokens = text.split()
    return " ".join(tokens)

# Quick check: line breaks and repeated blanks collapse to single spaces.
normalize_text("""
Apple is a corporate structure
 that
 
 is famous
""")

In [None]:
# Using the OpenAI client syntax (client.embeddings.create)
def get_embedding(text, model="text-embedding-3-small"):
    """
    Returns the embedding vector of ``text`` computed by the OpenAI ``model``.

    The text is whitespace-normalized first and sent as a single-item batch.
    """
    cleaned = normalize_text(text)
    result = client.embeddings.create(input=[cleaned], model=model)
    # One input was sent, so the vector is in the first data entry.
    return result.data[0].embedding

# Smoke test: embed one question and display the length of the vector.
embeddings = get_embedding("Is the Atlantic the biggest ocean in the world?", model='text-embedding-3-small')
len(embeddings)

## Loading dataset from Kaggle

## Loading dataset from OpenAI Cookbook

In [None]:
%%capture
%pip install \
    wget

In [None]:
import wget
import zipfile

# Directory the archive content is extracted into.
folder = "/data/wikipedia/"

# Fetch the pre-embedded Wikipedia articles archive.
embeddings_url = 'https://cdn.openai.com/API/examples/data/vector_database_wikipedia_articles_embedded.zip'
wget.download(embeddings_url)

# The archive is opened by its bare file name, i.e. relative to the current
# working directory.
with zipfile.ZipFile("vector_database_wikipedia_articles_embedded.zip",
"r") as zip_ref:
    zip_ref.extractall(folder)

In [None]:
import pandas as pd

# Same extraction directory as the download cell; redefined so this cell can
# run on its own.
folder = "/data/wikipedia/"

# Load the extracted CSV into a DataFrame and preview the first row.
dataset_file = folder + "vector_database_wikipedia_articles_embedded.csv"
df = pd.read_csv(dataset_file)
df.head(1)

In [None]:
# https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search-api.html

# Index holding the Wikipedia articles. NOTE: `index_name` is a notebook
# global that later cells reassign.
index_name = "wikipedia_vector_index"
# Two indexed 1536-dimension dense vectors (title and content) compared with
# cosine similarity, plus the article text fields and identifiers.
mapping = {
  "mappings": {
    "properties": {
      "title_vector": {
          "type": "dense_vector",
          "dims": 1536,
          "index": "true",
          "similarity": "cosine"
      },
      "content_vector": {
          "type": "dense_vector",
          "dims": 1536,
          "index": "true",
          "similarity": "cosine"
      },
      "text": {"type": "text"},
      "title": {"type": "text"},
      "url": { "type": "keyword"},
      "vector_id": {"type": "long"}
    }
  }
}
# Creates the index if it does not exist yet (see create_index).
create_index(index_name, mapping)

In [None]:
import json

def dataframe_to_bulk_actions(df, target_index=None):
    """
    Yields one Elasticsearch bulk action per row of ``df``.

    Args:
        df: DataFrame with the columns ``id``, ``url``, ``title``, ``text``,
            ``title_vector``, ``content_vector`` and ``vector_id``. The two
            vector columns hold JSON-encoded lists.
        target_index: Index the documents are written to. When omitted, the
            notebook-level ``index_name`` is used. Pass it explicitly to stay
            independent of later cells that reassign ``index_name``.

    Yields:
        Dicts with ``_index``, ``_id`` and ``_source`` keys.
    """
    # Only fall back to the global when no explicit index was given.
    destination = index_name if target_index is None else target_index
    for _, row in df.iterrows():
        yield {
            "_index": destination,
            "_id": row['id'],
            "_source": {
                'url': row["url"],
                'title': row["title"],
                'text': row["text"],
                # The vectors are stored as JSON text in the CSV; decode them into lists.
                'title_vector': json.loads(row["title_vector"]),
                'content_vector': json.loads(row["content_vector"]),
                'vector_id': row["vector_id"],
            },
        }

In [None]:
# Index the whole DataFrame in slices of `batch_size` rows.
start = 0
end = len(df)
batch_size = 100

for batch_start in range(start, end, batch_size):
    # Clamp the last slice to the end of the DataFrame.
    batch_end = min(batch_start + batch_size, end)
    batch_dataframe = df.iloc[batch_start:batch_end]
    # Lazy generator of bulk actions for this slice.
    actions = dataframe_to_bulk_actions(batch_dataframe)
    
    helpers.bulk(es, actions)

In [None]:
# Full-text check: match "Hummingbird" in the `text` field and leave the two
# vector fields out of the returned _source.
print(es.search(index=index_name, body={
    "_source": {
        "excludes": ["title_vector", "content_vector"]
    },
    "query": {
        "match": {
            "text": {
                "query": "Hummingbird"
            }
        }
    }
}))

In [None]:
# Function to pretty print Elasticsearch results

def pretty_response(response):
    """
    Prints the ID, title, text and score of every hit in a search response.
    """
    hits = response['hits']['hits']
    for entry in hits:
        doc_id, score = entry['_id'], entry['_score']
        source = entry['_source']
        print(f"\nID: {doc_id}\nTitle: {source['title']}\nSummary: {source['text']}\nScore: {score}")

In [None]:
# Embed the question with the OpenAI helper, then run a kNN search on the
# `content_vector` field (top 10 hits out of 100 candidates).
question_embedding = get_embedding('Is the Atlantic the biggest ocean in the world?')

response = es.search(
  index = "wikipedia_vector_index",
  knn={
      "field": "content_vector",
      "query_vector": question_embedding,
      "k": 10,
      "num_candidates": 100
    }
)
pretty_response(response)

### Unstructured IO

https://github.com/Unstructured-IO/unstructured

In [None]:
%%capture
%pip install \
    unstructured-ingest[chroma, confluence, elasticsearch, gcs, github, gitlab, google-drive, jira, kafka, notion, onedrive, openai, postgres, qdrant, reddit, slack, wikipedia] \
    unstructured[pdf,embed-huggingface]<0.16.0

In [None]:
from typing import List
from unstructured.embed.huggingface import (
    HuggingFaceEmbeddingConfig,
    HuggingFaceEmbeddingEncoder,
)

def embeddings_for_text(text: str) -> List[float]:
    """
    Returns the embedding of ``text`` from the default HuggingFace encoder.

    Args:
        text: Text to embed.

    Returns:
        The vector produced by ``HuggingFaceEmbeddingEncoder.embed_query``.
    """
    # Build the encoder once and keep it on the function object, so repeated
    # calls reuse it instead of constructing a new encoder every time
    # (presumably construction loads the model - confirm).
    encoder = getattr(embeddings_for_text, "_encoder", None)
    if encoder is None:
        encoder = HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig())
        embeddings_for_text._encoder = encoder
    return encoder.embed_query(text)

def get_embeddings_len(index_name: str):
    """
    Returns the length of the ``embeddings`` field of one document of ``index_name``.

    A single document is fetched with a match_all query; an empty index makes
    the first-hit lookup raise ``IndexError``.
    """
    result = get_client_es().search(index=index_name, size=1, query={"match_all": {}})
    first_hit = result["hits"]["hits"][0]
    return len(first_hit["_source"]["embeddings"])

def query(index_name: str, search_text: str):
    """
    Scores every document of ``index_name`` against ``search_text``.

    The text is embedded with ``embeddings_for_text`` and each document is
    scored with ``cosineSimilarity`` on its ``embeddings`` field, plus 1.0.

    Returns:
        The raw response of ``search_index``.
    """
    query_vector = embeddings_for_text(search_text)

    # Script evaluated for each document matched by the inner query.
    similarity_script = {
        "source": "cosineSimilarity(params.query_vector, 'embeddings') + 1.0",
        "params": {"query_vector": query_vector},
    }
    request_body = {
        "query": {
            "script_score": {
                "query": {"match_all": {}},
                "script": similarity_script,
            }
        }
    }
    return search_index(index_name=index_name, body=request_body)

# Sanity check: run the encoder once, here on an empty string.
embeddings_for_text("")

# Create tables

### Unstructured

Example: https://github.com/Unstructured-IO/unstructured/blob/main/scripts/elasticsearch-test-helpers/destination_connector/test-ingest-elasticsearch-output.py

In [None]:
# Reassigns the notebook-level `index_name`; the mappings are read from a JSON
# file stored under a directory named after the index.
index_name = "unstructured-ingest-test"
mappings = read_json_file("/data/search/" + index_name + "/_index_mappings.json")

print("== Connecting to the Elasticsearch cluster ==")
es = get_client_es()
print(f"{es.info()}")

print("== Creating an Elasticsearch index for testing ingest elasticsearch destination connector ==")
response = create_index(index_name=index_name, mappings=mappings)

# Refresh the index, then show its document count (the cat count response
# replaces the create/get response held in `response`).
es.indices.refresh(index=index_name)
response = es.cat.count(index=index_name, format="json")
response

In [None]:
# NOTE(review): this cell comes before the one that creates "products-catalog";
# presumably it fails on a cluster where the index does not exist yet - confirm
# the intended cell order.
count_index("products-catalog")

### ES Product catalog

 Example : https://github.com/elastic/elasticsearch-labs/tree/main/supporting-blog-content/hybrid-search-for-an-e-commerce-product-catalogue/product-store-search

In [None]:
# Reassigns the notebook-level `index_name` to the product catalogue index.
index_name = "products-catalog"
# Single-shard index without replicas. `brand` is searchable as text and also
# exposed as `brand.keyword` (used for filters and aggregations);
# `description_embeddings` is a 384-dimension dense vector.
mapping = {
    "settings": {
        "index": {
            "number_of_replicas": 0,
            "number_of_shards": 1,
        }
    },
    "mappings": {
        "properties": {
            "id": {"type": "keyword"},
            "brand": {
                "type": "text",
                "fields": {"keyword": {"type": "keyword"}},
            },
            "name": {"type": "text"},
            "price": {"type": "float"},
            "price_sign": {"type": "keyword"},
            "currency": {"type": "keyword"},
            "image_link": {"type": "keyword"},
            "description": {"type": "text"},
            "description_embeddings": {"type": "dense_vector", "dims": 384},
            "rating": {"type": "keyword"},
            "category": {"type": "keyword"},
            "product_type": {"type": "keyword"},
            "tag_list": {"type": "keyword"},
        }
    },
}
# Creates the index if it does not exist yet (see create_index).
create_index(index_name, mapping)

In [None]:
def build_query(term=None, categories=None, product_types=None, brands=None):
    """
    Builds the lexical (non-vector) search body for the product catalogue.

    Args:
        term: Free-text search term. When falsy, every document matches.
        categories: Values allowed for ``category`` (no filter when falsy).
        product_types: Values allowed for ``product_type`` (no filter when falsy).
        brands: Values allowed for ``brand.keyword`` (no filter when falsy).

    Returns:
        A dict with the ``_source`` field list and a ``bool`` query made of
        one ``must`` clause and the ``terms`` filters.
    """
    if term:
        must_clauses = [
            {
                "multi_match": {
                    "query": term,
                    "fields": ["name", "category", "description"],
                }
            }
        ]
    else:
        must_clauses = [{"match_all": {}}]

    # (indexed field, requested values); only non-empty selections become filters.
    facet_selections = (
        ("category", categories),
        ("product_type", product_types),
        ("brand.keyword", brands),
    )
    term_filters = [
        {"terms": {field: values}} for field, values in facet_selections if values
    ]

    returned_fields = [
        "id",
        "brand",
        "name",
        "price",
        "currency",
        "image_link",
        "category",
        "tag_list",
    ]
    bool_query = {"must": must_clauses, "filter": term_filters}
    return {"_source": returned_fields, "query": {"bool": bool_query}}


def build_hybrid_query(term=None, categories=None, product_types=None, brands=None, hybrid=False):
    """
    Builds either the lexical query or an RRF hybrid (lexical + kNN) query.

    The hybrid form is produced only when ``hybrid`` is exactly ``True`` and
    ``term`` is truthy; otherwise the ``build_query`` result is returned
    unchanged.

    Args:
        term: Free-text search term.
        categories: Values allowed for ``category``.
        product_types: Values allowed for ``product_type``.
        brands: Values allowed for ``brand.keyword``.
        hybrid: Set to ``True`` to request the RRF retriever query.

    Returns:
        The search body as a dict.
    """
    organic_query = build_query(term, categories, product_types, brands)

    # Lexical-only path.
    if not (hybrid is True and term):
        return organic_query

    # get_text_vector is defined outside this cell; it is called with a
    # one-item list and its first result is used as the query vector.
    vector = get_text_vector([term])[0]

    # The kNN retriever gets the same facet filters as the lexical query.
    knn_filters = []
    for field, values in (
        ("category", categories),
        ("product_type", product_types),
        ("brand.keyword", brands),
    ):
        if values:
            knn_filters.append({"terms": {field: values}})

    lexical_retriever = {"standard": {"query": organic_query["query"]}}
    vector_retriever = {
        "knn": {
            "field": "description_embeddings",
            "query_vector": vector,
            "k": 5,
            "num_candidates": 20,
            "filter": {"bool": {"filter": knn_filters}},
        }
    }

    # Hybrid query with RRF (Reciprocal Rank Fusion).
    return {
        "retriever": {
            "rrf": {
                "retrievers": [lexical_retriever, vector_retriever],
                "rank_window_size": 20,
                "rank_constant": 5,
            }
        },
        "_source": organic_query["_source"],
    }

def search_products(
    term,
    categories=None,
    product_types=None,
    brands=None,
    promote_products=None,
    hybrid=False,
):
    """
    Searches the "products-catalog" index and returns simplified hits.

    Args:
        term: Free-text search term.
        categories: Values allowed for ``category``.
        product_types: Values allowed for ``product_type``.
        brands: Values allowed for ``brand.keyword``.
        promote_products: Document ids to pin at the top of the results.
            Applied only when ``hybrid`` is falsy. ``None`` or an empty list
            means no pinning.
        hybrid: Passed to ``build_hybrid_query`` to request the RRF query.

    Returns:
        A list of dicts with the keys ``id``, ``brand``, ``name``, ``price``,
        ``currency`` (falls back to "USD"), ``image_link``, ``category`` and
        ``tags``.
    """
    query = build_hybrid_query(term, categories, product_types, brands, hybrid)

    if promote_products and not hybrid:
        # Wrap the lexical query in a pinned query so the promoted ids come first.
        query = {
            "query": {"pinned": {"ids": promote_products, "organic": query["query"]}},
            "_source": query["_source"],
        }

    print(query)
    response = get_client_es().search(index="products-catalog", body=query, size=20)

    results = []
    for hit in response["hits"]["hits"]:
        source = hit["_source"]
        print(f"Product Name: {source['name']}, Score: {hit['_score']}")

        results.append(
            {
                "id": source["id"],
                "brand": source["brand"],
                "name": source["name"],
                "price": source["price"],
                # A missing or empty currency falls back to USD.
                "currency": source.get("currency") or "USD",
                "image_link": source["image_link"],
                "category": source["category"],
                "tags": source.get("tag_list", []),
            }
        )

    return results


def get_facets_data(term, categories=None, product_types=None, brands=None):
    """
    Returns facet counts (product types, categories, brands) for a search.

    The lexical query from ``build_query`` is run with ``size=0`` and three
    ``terms`` aggregations; each bucket becomes a ``{<label>: key, "count": n}``
    dict.

    Returns:
        A dict with the keys ``product_types``, ``categories`` and ``brands``,
        each mapping to a list of bucket dicts.
    """
    # (aggregation name, aggregated field, label used in the returned rows)
    facets = (
        ("product_types", "product_type", "product_type"),
        ("categories", "category", "category"),
        ("brands", "brand.keyword", "brand"),
    )

    query = build_query(term, categories, product_types, brands)
    query["aggs"] = {
        agg_name: {"terms": {"field": field}} for agg_name, field, _ in facets
    }
    response = get_client_es().search(index="products-catalog", body=query, size=0)

    facet_data = {}
    for agg_name, _, label in facets:
        buckets = response["aggregations"][agg_name]["buckets"]
        facet_data[agg_name] = [
            {label: bucket["key"], "count": bucket["doc_count"]} for bucket in buckets
        ]
    return facet_data

## Chunk data for batch processing

In [None]:
def chunk_data(data, batch_size):
    """
    Lazily splits ``data`` into consecutive slices of at most ``batch_size`` items.

    Used to feed Elasticsearch bulk indexing one batch at a time; the last
    slice may be shorter.
    """
    total = len(data)
    yield from (
        data[offset : offset + batch_size] for offset in range(0, total, batch_size)
    )

## Generate bulk actions for Elasticsearch indexing

In [None]:
def generate_bulk_actions(index_name, data_batch):
    """
    Yields one bulk action per item of ``data_batch`` for ``index_name``.

    Each item is modified in place: a ``description_embeddings`` entry is set
    from ``get_text_vector(item["description"])``. The item's ``id`` becomes
    the document id and the item itself is the document source.
    """
    for record in data_batch:
        record_id = record["id"]
        record["description_embeddings"] = get_text_vector(record["description"])
        action = {"_index": index_name, "_id": record_id, "_source": record}
        yield action

## Indexing data in batches to Elasticsearch

In [None]:
def index_data_in_batches(file_path, index_name, batch_size=100):
    """
    Indexes data from the JSON file in batches using Elasticsearch helpers.bulk.

    Args:
        file_path: Path of the JSON file holding the list of documents.
        index_name: Index the documents are written to.
        batch_size: Number of documents sent per ``helpers.bulk`` call.

    Prints one summary line per batch.
    """
    data = read_json_file(file_path)
    # One client for the whole run instead of a new one per batch.
    es_client = get_client_es()

    for batch in chunk_data(data, batch_size):
        actions = generate_bulk_actions(index_name, batch)
        # helpers.bulk returns (success_count, errors); errors is a list
        # unless stats_only=True, so report its length rather than the list.
        success, errors = helpers.bulk(es_client, actions)
        print(f"Batch indexed: {success} successful, {len(errors)} failed")


# main execution block
# if __name__ == '__main__':
#     index_data_in_batches("../files/dataset/products.json", "products-catalog", batch_size=100)

In [None]:
# Load the product dataset into "products-catalog", 100 documents per bulk call.
index_data_in_batches(
    "/data/search/product-store-search/products.json", "products-catalog", batch_size=100
)

# Semantic Search

## Sentence transformer

https://sbert.net/docs/quickstart.html

* max_seq_length: maximum number of tokens encoded into a single embedding vector; longer input is truncated
* word_embedding_dimension: dimensionality of the embedding vector
* Normalize: the final step normalizes the embedding vector

In [None]:
# The sentences to encode
# NOTE(review): get_text_vector is not defined in the visible part of this
# notebook; presumably it wraps a sentence-transformers model - confirm.
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
]
embeddings = get_text_vector(sentences)
print(embeddings.shape)
# [3, 384]

# => Calculate the embedding similarities
# similarities = model.similarity(embeddings, embeddings)
# print(similarities)
# tensor([[1.0000, 0.6660, 0.1046],
#         [0.6660, 1.0000, 0.1411],
#         [0.1046, 0.1411, 1.0000]])

In [None]:
# NOTE(review): this assignment rebinds the name `query`, which an earlier cell
# defines as a function; that function is no longer callable after this cell.
query = 'which city is the most populated in the world'

# NOTE(review): `model` is not defined in the visible part of this notebook.
embeddings = model.encode(query)
print(embeddings.shape)
# presumably (384,) for a single string, not [3, 384] - TODO confirm

In [None]:
import torch

# One record made of an id, the embedding of `query` and its metadata.
# A tuple is used because a set literal ({...}) raises TypeError here: the
# metadata dict is unhashable.
# NOTE(review): presumably the (id, vector, metadata) upsert layout of the
# target vector store - confirm.
_id = '0'
metadata = {'text': query}
vectors = [(_id, embeddings, metadata)]