In [None]:
# empty code cell to avoid a known issue with some IDEs: https://github.com/astral-sh/ruff-vscode/issues/593

# Setting up Elasticsearch

The process we'll follow is:

1. Create an Elasticsearch client
2. Deploy the embedding model and create its inference endpoint, if they are not already in place
3. Check if the required ingest pipeline exists, and create it if not
4. Check if the index with proper mappings exists, and create it if not
5. Load the data into the index

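This ensures we don't overwrite existing settings if the index is already properly configured - and, importantly, don't ingest duplicate data! (Note that the load step itself does not check for documents that are already in the index, so avoid re-running it against an index that has already been populated.)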


In [None]:
import glob
import json
import os
import uuid
from collections.abc import Iterator
from datetime import UTC, datetime, timezone

from decouple import config
from tqdm.auto import tqdm  # Use tqdm.auto for notebook-friendly bars

from elasticsearch import Elasticsearch, NotFoundError
from elasticsearch.helpers import bulk

# Connection settings are read from the environment / .env file; both fall
# back to an empty string so the client cell can report what is missing.
ES_API_KEY = config("ES_API_KEY", default="")
ES_ENDPOINT = config("ES_ENDPOINT", default="")

# Target index and the JSON file holding its settings and mappings
es_index_name = "data.service-legislation-ukpga"
es_index_settings_file = "../elasticsearch/indices/data.service-legislation-ukpga.json"

# Ingest pipeline and the JSON file holding its definition
es_ingest_pipeline_name = "data.service-legislation-ukpga-pipeline"
es_ingest_pipeline_file = "../elasticsearch/pipelines/data.service-legislation-ukpga-pipeline.json"

# Trained model id (as known to the cluster) and the inference endpoint id
inference_id = "iai-ukpga"
model_id = "i-dot-ai__all-minilm-l6-v2-ukpga-6k-finetune"

# Directory scanned for the .json / .jsonl source files
data_dir = "../data/source"

In [None]:
# Fail fast when either connection setting is missing, before building a client
if not (ES_ENDPOINT and ES_API_KEY):
    raise ValueError(
        "ES_ENDPOINT and ES_API_KEY must be set in the environment variables."
    )

# Build the client used by every later cell
es_client = Elasticsearch(hosts=[ES_ENDPOINT], api_key=ES_API_KEY)

# Round-trip to the cluster; the response is shown as this cell's output
es_client.info()

In [None]:
def create_ingest_pipeline(
    es_client: Elasticsearch, pipeline_file: str, pipeline_name: str
) -> bool:
    """Create the ingest pipeline if it doesn't already exist.

    Only a "not found" response from the existence check triggers creation.
    Any other failure of that check (authentication, connectivity, ...) is
    propagated to the caller rather than being mistaken for a missing
    pipeline.

    Args:
        es_client: Elasticsearch client
        pipeline_file: Path to the pipeline definition file (JSON)
        pipeline_name: Name of the pipeline to create

    Returns:
        bool: True if pipeline was created, False if it already existed

    Raises:
        OSError: If the pipeline definition file cannot be read.
        json.JSONDecodeError: If the pipeline definition file is not valid JSON.
        Exception: Any Elasticsearch client error other than "not found" on
            the existence check, and any error from the create request.
    """
    try:
        es_client.ingest.get_pipeline(id=pipeline_name)
    except NotFoundError:
        # The only outcome that means "go ahead and create it".
        print(f"Pipeline '{pipeline_name}' not found, creating it...")
    else:
        print(f"Pipeline '{pipeline_name}' already exists")
        return False

    # Load pipeline definition from file
    with open(pipeline_file, encoding="utf-8") as file:
        pipeline_definition = json.load(file)

    # Create the pipeline
    es_client.ingest.put_pipeline(id=pipeline_name, body=pipeline_definition)

    print(f"Pipeline '{pipeline_name}' created successfully")
    return True

In [None]:
def create_index(es_client: Elasticsearch, index_file: str, index_name: str) -> bool:
    """Create the index if it doesn't already exist.

    Args:
        es_client: Elasticsearch client
        index_file: Path to the index definition file (JSON settings/mappings)
        index_name: Name of the index to create

    Returns:
        bool: True if index was created, False if it already existed or the
        create request returned an empty response body.

    Raises:
        OSError: If the index definition file cannot be read.
        json.JSONDecodeError: If the index definition file is not valid JSON.
    """
    # An existing index is left untouched so its settings are never overwritten.
    if es_client.indices.exists(index=index_name).body:
        print(f"Index '{index_name}' already exists")
        return False

    print(f"Index '{index_name}' not found, creating it with proper mappings...")

    # Load index definition from file
    with open(index_file, encoding="utf-8") as file:
        index_definition = json.load(file)

    # Create the index with settings and mappings
    if es_client.indices.create(index=index_name, body=index_definition).body:
        print(f"Index '{index_name}' created successfully")
        return True

    print(f"Failed to create index '{index_name}'")
    return False

In [None]:
def generate_es_actions(filepath: str, index_name: str) -> Iterator[dict]:
    """Reads documents from a file and yields Elasticsearch bulk actions.

    Two formats are supported, selected by file extension:

    * ``.jsonl`` - one JSON object per line, streamed line by line. Blank lines
      are ignored. A malformed line is reported and skipped; the remaining
      lines are still processed.
    * ``.json`` - a single JSON list of objects, loaded in full. A file whose
      top-level value is not a list is reported and skipped.

    Files that cannot be opened, ``.json`` files that cannot be parsed, and
    files with any other extension are reported and yield nothing.

    Args:
        filepath (str): The path to the input file (.json or .jsonl).
        index_name (str): The name of the Elasticsearch index to target.

    Yields:
        dict: An Elasticsearch bulk action dictionary in the format:
              {"_op_type": "index", "_index": index_name, "_source": doc}.
    """
    filename = os.path.basename(filepath)

    def as_action(doc) -> dict:
        """Wrap a single document in a bulk "index" action."""
        return {
            "_op_type": "index",
            "_index": index_name,
            "_source": doc,
        }

    # Handle JSONL files
    if filepath.endswith(".jsonl"):
        try:
            with open(filepath, encoding="utf-8") as f:
                for line_number, line in enumerate(f, start=1):
                    if not line.strip():
                        continue
                    try:
                        doc = json.loads(line)
                    except json.JSONDecodeError as e:
                        # Drop only this line; later lines are still ingested.
                        print(
                            f"Skipping malformed line {line_number} in {filename}: {e}"
                        )
                        continue
                    yield as_action(doc)
        except OSError as e:
            print(f"Skipping unreadable file {filename}: {e}")

    # Handle standard JSON files
    elif filepath.endswith(".json"):
        try:
            with open(filepath, encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, json.JSONDecodeError) as e:
            print(f"Skipping malformed or unreadable file {filename}: {e}")
            return

        # The file is fully parsed and closed before any action is yielded.
        if isinstance(data, list):
            for doc in data:
                yield as_action(doc)
        else:
            print(f"Skipping {filename} as it does not contain a JSON list.")

    else:
        print(f"Skipping {filename}: unsupported extension (expected .json or .jsonl).")

In [None]:
def load_data_to_elasticsearch(
    es_client: Elasticsearch,
    es_index_name: str,
    data_dir: str,
    chunk_size: int = 500,
    request_timeout: int = 60,
):
    """Bulk-load every .json / .jsonl file found in a directory into an index.

    Files are discovered with ``glob`` (all ``*.json`` first, then all
    ``*.jsonl``) and processed one at a time. Each file is turned into a
    stream of bulk actions by ``generate_es_actions`` and sent through
    ``elasticsearch.helpers.bulk`` in chunks.

    Per-document errors returned by the bulk helper are written, one JSON
    object per line, to a timestamped log file in a ``logs`` directory
    relative to the current working directory. An exception raised while
    processing a file is appended to that same log and reported, after which
    processing continues with the next file.

    Args:
        es_client: An initialized Elasticsearch client instance.
        es_index_name: The name of the index where data will be loaded.
        data_dir: The path to the directory containing the data files.
        chunk_size (int): The number of documents to send in each bulk request.
        request_timeout (int): The timeout in seconds for each bulk request.
    """
    all_files = []
    for pattern in ("*.json", "*.jsonl"):
        all_files.extend(glob.glob(os.path.join(data_dir, pattern)))

    if not all_files:
        print(f"No .json or .jsonl files found in '{data_dir}'")
        return

    print(f"Found {len(all_files)} files to process...")
    total_successes = 0

    # Error logs go into ./logs, created on first use.
    logs_dir = "logs"
    os.makedirs(logs_dir, exist_ok=True)

    # Derive a client whose requests carry the bulk timeout.
    bulk_client = es_client.options(request_timeout=request_timeout)

    for filepath in tqdm(all_files, desc="Processing Files"):
        filename = os.path.basename(filepath)
        # The timestamp is taken before ingestion starts for this file.
        error_log_path = os.path.join(
            logs_dir,
            f"{filename}.errors.{datetime.now(UTC).strftime('%Y%m%dT%H%M%SZ')}.log",
        )

        try:
            # raise_on_error=False makes the helper collect per-document
            # errors and return them instead of raising on the first one.
            success_count, errors = bulk(
                client=bulk_client,
                actions=generate_es_actions(filepath, es_index_name),
                chunk_size=chunk_size,
                raise_on_error=False,
            )
            total_successes += success_count

            if not errors:
                tqdm.write(
                    f"✅ Finished {filename}: Loaded {success_count} documents successfully."
                )
                continue

            # Header line followed by one JSON-encoded error per line.
            with open(error_log_path, "w", encoding="utf-8") as error_log:
                error_log.write(
                    f"Ingestion errors for file '{filename}' into index '{es_index_name}'\n"
                )
                error_log.writelines(
                    json.dumps(err, ensure_ascii=False) + "\n" for err in errors
                )

            tqdm.write(
                f"⚠️  Finished {filename}: Loaded {success_count} documents with {len(errors)} errors. "
                f"Details logged to '{error_log_path}'."
            )

        except Exception as e:
            # Append mode keeps any error lines already written for this file.
            with open(error_log_path, "a", encoding="utf-8") as error_log:
                error_log.write(
                    f"Critical error while ingesting '{filename}' into '{es_index_name}': {e}\n"
                )
            tqdm.write(
                f"❌ A critical error occurred with {filename}: {e}. "
                f"See '{error_log_path}' for details."
            )

    print("\n--- Ingestion Complete ---")
    print(f"Total documents loaded successfully from all files: {total_successes}")

In [None]:
def deploy_ml_model(
    es_client: Elasticsearch,
    model_id: str = model_id,
    priority: str = "normal",
    hf_model_id: str | None = None,
) -> dict:
    """Ensure a trained model exists (uploading from Hugging Face if needed) and deploy it.

    This function focuses purely on the ML model lifecycle:
    1. Verifies that the trained model is present in the cluster
    2. If it is missing, imports it from Hugging Face using Eland
    3. Starts a deployment with the given priority unless one is already in
       the ``started`` state (a running deployment is left as it is)

    Args:
        es_client: Elasticsearch client instance
        model_id: The trained model ID to check and deploy (with underscores, not slashes)
        priority: Deployment priority passed to the start-deployment API
        hf_model_id: Optional Hugging Face model identifier to import if the model
            is not yet present in the cluster. If omitted, we fall back to a simple
            mapping of ``model_id.replace("__", "/")``.

    Returns:
        dict: A dictionary containing:
            - model_exists (bool): Whether the model is now present in the cluster
            - deployment_status (str): ``"started"``, ``"already_started"`` or
              ``"error: <message>"``. Deployment failures are reported here
              rather than raised.

    Raises:
        ImportError: If the model must be imported but ``eland`` is not installed
        ValueError: If the model does not exist and cannot be imported
        Exception: For other Elasticsearch API errors during the existence check
    """
    result = {
        "model_exists": False,
        "deployment_status": "not_checked",
    }

    # Step 1: Check if the trained model exists, importing from Hugging Face if needed.
    # Both "unknown id" (NotFoundError) and an empty result set count as missing.
    # A plain flag is used instead of raising NotFoundError by hand, because the
    # client's API error classes cannot be constructed from two strings.
    print(f"Checking for trained model: {model_id}")
    try:
        models_response = es_client.ml.get_trained_models(model_id=model_id)
        model_found = models_response.body.get("count", 0) > 0
    except NotFoundError:
        model_found = False

    if model_found:
        result["model_exists"] = True
        print(f"✓ Model '{model_id}' found in cluster")
    else:
        effective_hf_model_id = hf_model_id or model_id.replace("__", "/")
        print(
            f"Model '{model_id}' not found in cluster. Attempting import from Hugging Face "
            f"as '{effective_hf_model_id}'..."
        )

        # Imported lazily so the notebook only needs eland when an import is required.
        try:
            from pathlib import Path

            from eland.ml.pytorch import PyTorchModel
            from eland.ml.pytorch.transformers import TransformerModel
        except ImportError as import_err:  # pragma: no cover - environment-specific
            raise ImportError(
                "The 'eland' package is required to import models from Hugging Face. "
                "Install it in this environment, for example with 'uv add eland' or "
                "'pip install eland', and re-run this cell."
            ) from import_err

        try:
            # Download and export the Hugging Face model via Eland. The
            # TransformerModel API expects keyword-only arguments, so we pass the
            # Hugging Face identifier as `model_id` rather than a positional arg.
            transformer_model = TransformerModel(
                model_id=effective_hf_model_id,
                task_type="text_embedding",
            )

            models_dir = Path("models")
            models_dir.mkdir(parents=True, exist_ok=True)

            model_path, model_config, vocab_path = transformer_model.save(models_dir)

            # Import the TorchScript model into Elasticsearch as a trained model.
            ptm = PyTorchModel(es_client, model_id)
            ptm.import_model(
                model_path=model_path,
                config_path=None,
                vocab_path=vocab_path,
                config=model_config,
            )

            result["model_exists"] = True
            print(
                f"✓ Hugging Face model '{effective_hf_model_id}' imported into Elasticsearch "
                f"as trained model '{model_id}'"
            )

        except Exception as import_error:
            raise ValueError(
                f"Failed to import Hugging Face model '{effective_hf_model_id}' into "
                f"Elasticsearch as '{model_id}': {import_error}"
            ) from import_error

    # Step 2: Start the model deployment with the requested priority (best effort:
    # failures are recorded in the result instead of being raised).
    try:
        print(
            f"Starting/updating deployment for model: {model_id} with {priority} priority"
        )

        deployment_stats = es_client.ml.get_trained_models_stats(
            model_id=model_id
        ).body
        current_deployment = None

        if deployment_stats.get("count", 0) > 0:
            trained_models = deployment_stats.get("trained_model_stats", [])
            if trained_models and "deployment_stats" in trained_models[0]:
                current_deployment = trained_models[0]["deployment_stats"]

        current_state = None
        if current_deployment:
            current_state = current_deployment.get("state", "")
            print(f"  Current deployment state: {current_state}")

        if current_state == "started":
            result["deployment_status"] = "already_started"
            print("  Model deployment already active")
        else:
            # No deployment yet, or one that is not in the "started" state.
            es_client.ml.start_trained_model_deployment(
                model_id=model_id,
                priority=priority,
                wait_for="started",
            )
            result["deployment_status"] = "started"
            print(f"✓ Model deployment started with {priority} priority")

    except Exception as e:
        if "resource_already_exists_exception" in str(e):
            # The cluster reports that a deployment for this model already exists.
            result["deployment_status"] = "already_started"
            print("  Model deployment already exists and is active")
        else:
            print(f"⚠️  Error starting model deployment: {e}")
            result["deployment_status"] = f"error: {e}"

    return result

In [None]:
def create_inference_endpoint(
    es_client: Elasticsearch,
    model_id: str = model_id,
    inference_id: str = inference_id,
    request_timeout: int = 30,
) -> dict:
    """Create a text-embedding inference endpoint for the given model.

    The current model deployment stats are read and their number of
    allocations and threads per allocation are mirrored into the endpoint's
    service settings. Whenever either value cannot be determined (the stats
    call fails, or the stats contain no usable deployment information) it
    falls back to 1.

    Args:
        es_client: Elasticsearch client instance
        model_id: The trained model ID that the endpoint should use
        inference_id: The ID to use for the inference endpoint
        request_timeout: Timeout in seconds. Used both as the client-side HTTP
            request timeout and as the API ``timeout`` parameter of the create
            request.

    Returns:
        dict: A dictionary containing:
            - inference_created (bool): Whether the create request was sent
            - inference_status (str): ``"creation_requested"`` or
              ``"already_exists"``

    Raises:
        Exception: For Elasticsearch API errors other than "already exists"
    """
    result = {
        "inference_created": False,
        "inference_status": "not_attempted",
    }

    try:
        print(f"Creating inference endpoint: {inference_id}")

        # NotFoundError just means there is nothing to reuse. Any other lookup
        # error is unexpected and falls through to the outer handler.
        try:
            existing_inference = es_client.inference.get(inference_id=inference_id)
        except NotFoundError:
            existing_inference = None

        if existing_inference:
            result["inference_created"] = False
            result["inference_status"] = "already_exists"
            print(f"  Inference endpoint '{inference_id}' already exists")
            return result

        # Build service_settings based on the current model deployment so we
        # never exceed the allocations/threads that the deployment has.
        service_settings: dict[str, object] = {"model_id": model_id}

        try:
            stats = es_client.ml.get_trained_models_stats(model_id=model_id).body
            trained_models = stats.get("trained_model_stats", [])
            deployment_stats = None
            if trained_models and "deployment_stats" in trained_models[0]:
                deployment_stats = trained_models[0]["deployment_stats"]

            if deployment_stats:
                num_allocations = deployment_stats.get("number_of_allocations")
                num_threads = deployment_stats.get("threads_per_allocation")

                if isinstance(num_allocations, int) and num_allocations > 0:
                    service_settings["num_allocations"] = num_allocations
                if isinstance(num_threads, int) and num_threads > 0:
                    service_settings["num_threads"] = num_threads
        except Exception as stats_error:
            print(
                f"⚠️  Could not read deployment stats for '{model_id}', "
                f"using defaults: {stats_error}"
            )

        # Conservative single-allocation, single-thread fallback for anything
        # the deployment stats did not provide, so the request always carries
        # both settings.
        service_settings.setdefault("num_allocations", 1)
        service_settings.setdefault("num_threads", 1)

        inference_config = {
            "service": "elasticsearch",
            "service_settings": service_settings,
        }

        # The same number of seconds is used twice: as the HTTP request timeout
        # of the client, and as the API-level `timeout` parameter (per the
        # Elasticsearch docs, how long the server waits for the endpoint to be
        # created).
        print(
            f"  Initiating inference endpoint creation (timeout={request_timeout}s)..."
        )
        client_with_timeout = es_client.options(request_timeout=request_timeout)
        client_with_timeout.inference.put(
            inference_id=inference_id,
            task_type="text_embedding",
            inference_config=inference_config,
            timeout=f"{request_timeout}s",
        )

        result["inference_created"] = True
        result["inference_status"] = "creation_requested"
        print(
            f"✓ Inference endpoint creation request for '{inference_id}' sent (not waiting for full creation)"
        )

    except Exception as e:
        if "resource_already_exists_exception" in str(e):
            result["inference_created"] = False
            result["inference_status"] = "already_exists"
            print(f"  Inference endpoint '{inference_id}' already exists")
        else:
            print(f"⚠️  Error creating inference endpoint: {e}")
            result["inference_status"] = f"error: {e}"
            raise

    return result

In [None]:
def setup_inference_endpoint(
    es_client: Elasticsearch,
    model_id: str = model_id,
    inference_id: str = inference_id,
    priority: str = "normal",
    inference_timeout: int = 600,
) -> dict:
    """Convenience wrapper that deploys the model then creates an inference endpoint.

    Delegates to two smaller helpers and merges their results:

    1. ``deploy_ml_model`` ensures the trained model exists and is deployed
    2. ``create_inference_endpoint`` requests creation of a text-embedding
       inference endpoint for that model.

    Args:
        es_client: Elasticsearch client instance
        model_id: The trained model ID to check and deploy (with underscores, not slashes)
        inference_id: The ID to use for the inference endpoint
        priority: Deployment priority forwarded to ``deploy_ml_model``
        inference_timeout: Forwarded to ``create_inference_endpoint`` as its
            ``request_timeout`` (in seconds).

    Returns:
        dict: A dictionary containing:
            - model_exists (bool): Whether the model is present in the cluster
            - deployment_status (str): Status of the model deployment
            - inference_created (bool): Whether the endpoint creation was requested
            - inference_status (str): Status message for inference endpoint
    """
    # Forward the caller's model and priority instead of hardcoded values, so
    # the model that gets deployed is the one the endpoint is created for.
    model_result = deploy_ml_model(
        es_client=es_client,
        model_id=model_id,
        priority=priority,
    )

    endpoint_result = create_inference_endpoint(
        es_client=es_client,
        model_id=model_id,
        inference_id=inference_id,
        request_timeout=inference_timeout,
    )

    return {
        "model_exists": model_result["model_exists"],
        "deployment_status": model_result["deployment_status"],
        "inference_created": endpoint_result["inference_created"],
        "inference_status": endpoint_result["inference_status"],
    }

In [None]:
# Deploy the fine-tuned UK legislation model, then request its inference endpoint
result = setup_inference_endpoint(
    es_client,
    model_id,
    inference_id,
    priority="normal",
    inference_timeout=600,
)

# Summarise the outcome, one line per field
print(
    "\n--- Setup Result ---",
    f"Model exists: {result['model_exists']}",
    f"Deployment status: {result['deployment_status']}",
    f"Inference created: {result['inference_created']}",
    f"Inference status: {result['inference_status']}",
    sep="\n",
)

In [None]:
# Make sure the ingest pipeline is in place (created only when missing)
create_ingest_pipeline(es_client, es_ingest_pipeline_file, es_ingest_pipeline_name)

# Make sure the index is in place (created only when missing)
create_index(es_client, es_index_settings_file, es_index_name)

In [None]:
# Bulk-load every source file from the data directory into the index
load_data_to_elasticsearch(es_client, es_index_name, data_dir)