# Data Ingestion

## Install the required libraries (skip this step if they are already installed)

In [None]:
pip install elasticsearch sentence-transformers pyyaml

In [8]:
import json
import yaml
from elasticsearch import Elasticsearch, helpers
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Step 1: Elasticsearch client setup using cloud configuration

In [11]:
def get_client_es():
    """
    Build an Elasticsearch client from the settings stored in ../config.yml.

    The YAML file is parsed with ``yaml.safe_load`` and must provide the
    ``cloud_id`` and ``api_key`` keys, which are forwarded to the
    ``Elasticsearch`` constructor.

    Returns:
        The ``Elasticsearch`` instance created from those two settings.

    Raises:
        KeyError: If ``cloud_id`` or ``api_key`` is missing from the config.
    """
    with open("../config.yml", "r") as config_file:
        settings = yaml.safe_load(config_file)

    cloud_id = settings["cloud_id"]
    api_key = settings["api_key"]
    return Elasticsearch(cloud_id=cloud_id, api_key=api_key)

# Step 2: Text Vectorization using SentenceTransformers


In [2]:
def get_text_vector(sentences):
    """
    Generates sentence embeddings using pre-trained model 'all-MiniLM-L6-v2'.

    The model is instantiated on the first call only and then cached on the
    function object, so repeated calls (for example one per document during
    indexing) reuse the same instance instead of reloading it every time.

    Args:
        sentences: The text to embed; passed unchanged to ``model.encode``.

    Returns:
        The value returned by ``model.encode(sentences)``.
    """
    model = getattr(get_text_vector, "_model", None)
    if model is None:
        # Instantiate once; every later call finds the cached instance above.
        model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        get_text_vector._model = model
    return model.encode(sentences)

# Step 3: Read JSON file containing the dataset


In [3]:
def read_json_file(file_path):
    """
    Reads and loads the dataset from a JSON file.

    The file is decoded explicitly as UTF-8 so that non-ASCII text is read
    the same way regardless of the platform's default encoding.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file's content.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)
    return data

# Step 4: Chunk data for batch processing


In [4]:
def chunk_data(data, batch_size):
    """
    Lazily split ``data`` into consecutive slices of at most ``batch_size`` items.

    Args:
        data: A sliceable sequence that supports ``len()``.
        batch_size: Maximum number of items per yielded slice.

    Yields:
        Successive slices ``data[start : start + batch_size]``; the last one
        may be shorter than ``batch_size``.
    """
    offsets = range(0, len(data), batch_size)
    yield from (data[start : start + batch_size] for start in offsets)

# Step 5: Generate bulk actions for Elasticsearch indexing


In [5]:
def generate_bulk_actions(index_name, data_batch):
    """
    Generates bulk actions for Elasticsearch from data batches.
    Adds 'description_embeddings' by encoding the 'description' field.

    All descriptions of the batch are encoded with a single call to
    ``get_text_vector`` instead of one call per document, and the input
    documents are left untouched: each action carries a shallow copy of the
    document extended with its embedding.

    Args:
        index_name: Name of the target index, used as ``_index``.
        data_batch: Iterable of dicts, each with an ``id`` and a
            ``description`` key.

    Yields:
        One action dict per document with the keys ``_index``, ``_id`` and
        ``_source``.

    Raises:
        KeyError: If a document lacks the ``id`` or ``description`` key.
    """
    items = list(data_batch)
    if not items:
        # Nothing to encode; skip the embedding call for an empty batch.
        return

    descriptions = [item["description"] for item in items]
    embeddings = get_text_vector(descriptions)

    for item, embedding in zip(items, embeddings):
        # Build a new dict so the caller's document is not mutated.
        source = {**item, "description_embeddings": embedding}
        yield {"_index": index_name, "_id": item["id"], "_source": source}

# Step 6: Indexing data in batches to Elasticsearch


In [6]:
def index_data_in_batches(file_path, index_name, batch_size=100):
    """
    Indexes data from the JSON file in batches using Elasticsearch helpers.bulk.

    A single Elasticsearch client is created for the whole run and reused for
    every batch. After each batch a progress line is printed with the number
    of successfully indexed documents and the number of reported errors.

    Args:
        file_path: Path of the JSON file holding the documents.
        index_name: Name of the index the documents are written to.
        batch_size: Number of documents sent per bulk request (default 100).
    """
    data = read_json_file(file_path)

    # Create the client once instead of re-reading the config for every batch.
    client = get_client_es()

    for batch in chunk_data(data, batch_size):
        actions = generate_bulk_actions(index_name, batch)
        # The second element is used as a collection of errors, so report its length.
        success, errors = helpers.bulk(client, actions)
        print(f"Batch indexed: {success} successful, {len(errors)} failed")


# main execution block
# if __name__ == '__main__':
#     index_data_in_batches("../files/dataset/products.json", "products-catalog", batch_size=100)

In [10]:
# Run the ingestion: read the products dataset and index it into the
# "products-catalog-2" index, sending 100 documents per bulk request.
index_data_in_batches(
    "../files/dataset/products.json", "products-catalog-2", batch_size=100
)

Batch indexed: 100 successful, [] failed
Batch indexed: 100 successful, [] failed


KeyboardInterrupt: 