# Setting up Elasticsearch

The process we'll follow is:

1. Create an Elasticsearch client
2. Check if the required ingest pipeline exists, and create it if not
3. Check if the index with proper mappings exists, and create it if not
4. Load the intelligence data into the index

This ensures that we don't overwrite existing settings if the index is already properly configured and, importantly, that we don't ingest duplicate data!

# Notebook stages

1. **Connect to Elasticsearch**: Create a client using cloud credentials.
2. **Prepare ML + ingest**: Import and deploy the NER model `conll03_english_ner_ingest` (Hugging Face: `elastic/distilbert-base-uncased-finetuned-conll03-english`), then ensure the ingest pipeline and index exist.
3. **Ingest intelligence reports**: Load NDJSON files from `../data/intel-reports` into the index using the bulk API with progress reporting and basic error logging.



In [None]:
import glob
import json
import os
import time
import uuid
from collections.abc import Iterator

from decouple import config
from tqdm.auto import tqdm
from datetime import UTC, datetime

from elasticsearch import Elasticsearch, NotFoundError
from elasticsearch.helpers import bulk

# Read the Elasticsearch credentials through python-decouple's `config`.
# Both default to an empty string, so a missing value does not raise here;
# the client-initialisation cell checks for empty values before connecting.
ES_CLOUD_ID = config("ES_CLOUD_ID", default="")
ES_API_KEY = config("ES_API_KEY", default="")

# Target index name and the JSON file holding its definition
# (passed as the request body when the index is created).
es_index_name = "data.service-intelligence.reports-force1"
es_index_settings_file = (
    "../elasticsearch/indices/demo-investigation-intel.reports.json"
)

# Ingest pipeline name and the JSON file holding its definition.
es_ingest_pipeline_name = "investigation-intel.reports-pipeline"
es_ingest_pipeline_file = (
    "../elasticsearch/pipelines/investigation-intel.reports-pipeline.json"
)

# NER model configuration (Hugging Face: elastic/distilbert-base-uncased-finetuned-conll03-english)
# `ner_model_id` is the trained-model id used inside Elasticsearch;
# `ner_hf_model_id` is the Hugging Face repository the weights are imported from.
ner_model_id = "conll03_english_ner_ingest"
ner_hf_model_id = "elastic/distilbert-base-uncased-finetuned-conll03-english"

# Directory scanned for `*.ndjson` files to ingest (relative to the notebook).
data_dir = "../data/intel-reports"

In [None]:
# Build the Elasticsearch client, failing fast when a credential is missing.
if not (ES_CLOUD_ID and ES_API_KEY):
    raise ValueError(
        "ES_CLOUD_ID and ES_API_KEY must be set in the environment variables."
    )

# Both credentials are present: connect using the cloud ID and API key.
es_client = Elasticsearch(cloud_id=ES_CLOUD_ID, api_key=ES_API_KEY)

# Query the cluster info as a connectivity check; as the last expression in
# the cell, its result is what the notebook displays.
es_client.info()

In [None]:
def create_ingest_pipeline(
    es_client: Elasticsearch, pipeline_file: str, pipeline_name: str
) -> bool:
    """Create the ingest pipeline if it doesn't already exist.

    Only a "not found" response from Elasticsearch triggers creation. Any
    other failure while checking for the pipeline (authentication,
    connectivity, timeouts, ...) propagates to the caller instead of being
    mistaken for a missing pipeline.

    Args:
        es_client: Elasticsearch client
        pipeline_file: Path to the pipeline definition file (JSON)
        pipeline_name: Name of the pipeline to create

    Returns:
        bool: True if pipeline was created, False if it already existed

    Raises:
        OSError: If the pipeline definition file cannot be opened.
        json.JSONDecodeError: If the pipeline definition file is not valid JSON.
    """
    # Check if pipeline exists; a missing pipeline surfaces as NotFoundError
    try:
        es_client.ingest.get_pipeline(id=pipeline_name)
    except NotFoundError:
        print(f"Pipeline '{pipeline_name}' not found, creating it...")
    else:
        print(f"Pipeline '{pipeline_name}' already exists")
        return False

    # Creation happens outside the except handler so that a failure here is
    # reported on its own rather than chained onto the "not found" lookup.
    # Load pipeline definition from file
    with open(pipeline_file, encoding="utf-8") as file:
        pipeline_definition = json.load(file)

    # Create the pipeline
    es_client.ingest.put_pipeline(id=pipeline_name, body=pipeline_definition)

    print(f"Pipeline '{pipeline_name}' created successfully")
    return True

In [None]:
def create_index(
    es_client: Elasticsearch, index_file: str, index_name: str
) -> bool:
    """Create the index if it doesn't already exist.

    Args:
        es_client: Elasticsearch client
        index_file: Path to the index definition file (JSON)
        index_name: Name of the index to create

    Returns:
        bool: True if index was created, False if it already existed or the
        create call returned an empty response body

    Raises:
        OSError: If the index definition file cannot be opened.
        json.JSONDecodeError: If the index definition file is not valid JSON.
    """
    # Nothing to do when the index is already there; return False explicitly
    # so the result matches the documented contract (previously None).
    if es_client.indices.exists(index=index_name).body:
        print(f"Index '{index_name}' already exists")
        return False

    print(f"Index '{index_name}' not found, creating it with proper mappings...")

    # Load index definition from file
    with open(index_file, encoding="utf-8") as file:
        index_definition = json.load(file)

    # Create the index with settings and mappings; a non-empty response body
    # is treated as success.
    if es_client.indices.create(index=index_name, body=index_definition).body:
        print(f"Index '{index_name}' created successfully")
        return True

    print(f"Failed to create index '{index_name}'")
    return False

In [None]:
def reddit_ndjson_generator(filepath: str) -> Iterator[dict]:
    """Yield one JSON object at a time from an NDJSON file.

    Blank (whitespace-only) lines are skipped silently. Lines that are not
    valid JSON are skipped with a printed warning naming the file and the
    1-based line number.

    Args:
        filepath: Path to a UTF-8 encoded NDJSON file

    Yields:
        The decoded JSON value of each non-blank, well-formed line, in file
        order.
    """
    with open(filepath, encoding="utf-8") as file:
        for line_number, line in enumerate(file, start=1):
            # Empty lines (e.g. a trailing newline at end of file) are not
            # data and should not be reported as malformed.
            if not line.strip():
                continue

            try:
                doc = json.loads(line)
            except json.JSONDecodeError:
                print(f"Skipping malformed JSON in {filepath} (line {line_number})")
                continue

            # Yield outside the try block so the handler only ever covers
            # parsing, never an exception raised by the consumer.
            yield doc

In [None]:
def load_data_to_elasticsearch(
    es_client: Elasticsearch,
    es_index_name: str,
    data_dir: str,
    chunk_size: int = 500,
    request_timeout: int = 120,
    max_logged_errors: int = 5,
) -> None:
    """Read NDJSON files and load them into Elasticsearch using the bulk API.

    This mirrors the more robust ingestion pattern from ``ingest_data_elasticsearch.ipynb``
    and avoids per-document timeouts by batching requests and setting an explicit
    HTTP request timeout.

    Files are processed in sorted path order. A failure while processing one
    file is reported and does not stop the remaining files. Per-document
    errors returned by the bulk helper are counted and a bounded sample of
    them is printed.

    NOTE(review): the bulk actions carry no ``_id`` and no ``pipeline``, so
    de-duplication on re-runs and ingest-pipeline enrichment depend entirely
    on how the target index is configured - confirm against the index
    definition.

    Args:
        es_client: Elasticsearch client
        es_index_name: The index name to load data into
        data_dir: Directory containing ndjson files
        chunk_size: Number of documents per bulk request
        request_timeout: Timeout in seconds for each bulk request
        max_logged_errors: Maximum number of per-document error details to
            print for each file (values below 1 print none)
    """
    # Sort so that files are processed in a stable, reproducible order
    all_files = sorted(glob.glob(f"{data_dir}/*.ndjson"))

    if not all_files:
        print(f"No .ndjson files found in '{data_dir}'")
        return

    print(f"Found {len(all_files)} files to process...")
    total_successes = 0

    # Create a client with timeout options
    client_with_timeout = es_client.options(request_timeout=request_timeout)

    def generate_es_actions(filepath: str, index_name: str, pbar: tqdm):
        """Yield Elasticsearch bulk actions from an NDJSON file and update a progress bar."""
        for doc in reddit_ndjson_generator(filepath):
            pbar.update(1)
            yield {
                "_op_type": "index",
                "_index": index_name,
                "_source": doc,
            }

    # show progress for file processing
    for filepath in tqdm(all_files, desc="Processing files"):
        filename = os.path.basename(filepath)

        # Pre-count lines so we can show doc-level progress per file.
        # The bar advances once per yielded document, so skipped lines leave
        # it short of the total.
        with open(filepath, encoding="utf-8") as f:
            total_lines = sum(1 for _ in f)

        try:
            with tqdm(
                total=total_lines,
                desc=f"Uploading {filename}",
                unit="doc",
            ) as pbar:
                action_generator = generate_es_actions(filepath, es_index_name, pbar)

                # raise_on_error=False: collect per-document failures instead
                # of aborting the whole file on the first rejected document.
                success_count, errors = bulk(
                    client=client_with_timeout,
                    actions=action_generator,
                    chunk_size=chunk_size,
                    raise_on_error=False,
                )
            total_successes += success_count

            if errors:
                tqdm.write(
                    f"⚠️  Finished {filename}: Loaded {success_count} documents with {len(errors)} errors."
                )
                # Surface a bounded sample of the failures so they can be
                # diagnosed without flooding the notebook output.
                shown_errors = errors[: max(max_logged_errors, 0)]
                for error in shown_errors:
                    tqdm.write(f"    {error}")
                hidden_count = len(errors) - len(shown_errors)
                if shown_errors and hidden_count:
                    tqdm.write(f"    ... and {hidden_count} more errors not shown")
            else:
                tqdm.write(
                    f"✅ Finished {filename}: Loaded {success_count} documents successfully."
                )

        except Exception as e:
            # Deliberate best-effort: report the failure and move on to the
            # next file rather than aborting the whole ingestion run.
            tqdm.write(f"❌ A critical error occurred with {filename}: {e}")

    print("\n--- Ingestion Complete ---")
    print(f"Total documents loaded successfully from all files: {total_successes}")

In [None]:
def _import_ner_model_from_hugging_face(
    es_client: Elasticsearch, model_id: str, hf_model_id: str
) -> None:
    """Export a Hugging Face NER model with Eland and import it into Elasticsearch."""
    from pathlib import Path

    try:
        from eland.ml.pytorch import PyTorchModel
        from eland.ml.pytorch.transformers import TransformerModel
    except ImportError as import_err:  # pragma: no cover - environment-specific
        raise ImportError(
            "The 'eland' package is required to import models from Hugging Face. "
            "Install it in this environment, for example with 'uv add eland' or "
            "'pip install eland', and re-run this cell."
        ) from import_err

    try:
        # Download and export the Hugging Face model via Eland.
        transformer_model = TransformerModel(
            model_id=hf_model_id,
            task_type="ner",
        )

        # Exported artefacts are written to a local "models" directory
        # (relative to the current working directory).
        models_dir = Path("models")
        models_dir.mkdir(parents=True, exist_ok=True)

        model_path, model_config, vocab_path = transformer_model.save(models_dir)

        # Import the TorchScript model into Elasticsearch as a trained model.
        ptm = PyTorchModel(es_client, model_id)
        ptm.import_model(
            model_path=model_path,
            config_path=None,
            vocab_path=vocab_path,
            config=model_config,
        )
    except Exception as import_error:
        raise ValueError(
            f"Failed to import Hugging Face model '{hf_model_id}' into "
            f"Elasticsearch as '{model_id}': {import_error}"
        ) from import_error

    print(
        f"✓ Hugging Face model '{hf_model_id}' imported into Elasticsearch "
        f"as trained model '{model_id}'"
    )


def _ensure_ner_deployment_started(es_client: Elasticsearch, model_id: str) -> str:
    """Start the model deployment unless it is already started; return the status label."""
    deployment_stats = es_client.ml.get_trained_models_stats(model_id=model_id).body
    current_deployment = None

    if deployment_stats.get("count", 0) > 0:
        trained_models = deployment_stats.get("trained_model_stats", [])
        if trained_models and "deployment_stats" in trained_models[0]:
            current_deployment = trained_models[0]["deployment_stats"]

    # If already deployed, check current state
    if current_deployment:
        current_state = current_deployment.get("state", "")
        print(f"  Current deployment state: {current_state}")

        if current_state == "started":
            print("  Model deployment already active")
            return "already_started"

    # No deployment yet, or one that is not in the "started" state.
    es_client.ml.start_trained_model_deployment(
        model_id=model_id,
        wait_for="started",
    )
    print("✓ Model deployment started")
    return "started"


def deploy_ner_model(
    es_client: Elasticsearch,
    model_id: str = ner_model_id,
    hf_model_id: str = ner_hf_model_id,
) -> dict:
    """Ensure the CONLL03 NER model exists and is deployed for ingest pipelines.

    The model weights are imported from the Hugging Face model
    ``elastic/distilbert-base-uncased-finetuned-conll03-english``
    (see `https://huggingface.co/elastic/distilbert-base-uncased-finetuned-conll03-english`).

    Args:
        es_client: Elasticsearch client
        model_id: Trained-model id to look up (and create) in Elasticsearch
        hf_model_id: Hugging Face model id to import when the trained model
            is not present in the cluster

    Returns:
        dict: ``model_exists`` (bool) and ``deployment_status`` (str), the
        latter being ``"started"``, ``"already_started"`` or
        ``"error: <message>"``.

    Raises:
        ImportError: If the model must be imported but ``eland`` is not installed.
        ValueError: If exporting or importing the Hugging Face model fails.
    """
    result = {
        "model_exists": False,
        "deployment_status": "not_checked",
    }

    # Step 1: Check if the trained model exists, importing from Hugging Face if needed.
    print(f"Checking for trained model: {model_id}")
    try:
        models_response = es_client.ml.get_trained_models(model_id=model_id)
        # The API may respond successfully but return no models; treat that
        # the same as "not found".
        model_found = models_response.body.get("count", 0) > 0
    except NotFoundError:
        # The client raises NotFoundError when the model id is unknown.
        model_found = False

    if model_found:
        print(f"✓ Model '{model_id}' found in cluster")
    else:
        print(
            f"Model '{model_id}' not found in cluster. Attempting import from Hugging Face "
            f"as '{hf_model_id}'..."
        )
        _import_ner_model_from_hugging_face(es_client, model_id, hf_model_id)
    result["model_exists"] = True

    # Step 2: Start or update model deployment. Failures here are reported in
    # the result instead of raised, so the caller can inspect the status.
    try:
        print(f"Starting/updating deployment for model: {model_id}")
        result["deployment_status"] = _ensure_ner_deployment_started(
            es_client, model_id
        )
    except Exception as e:
        if "resource_already_exists_exception" in str(e):
            # A deployment for this model already exists; treat as success.
            result["deployment_status"] = "already_started"
            print("  Model deployment already exists and is active")
        else:
            print(f"⚠️  Error starting model deployment: {e}")
            result["deployment_status"] = f"error: {e}"

    return result

In [None]:
# Ensure the NER model is available and deployed for ingest.
# deploy_ner_model returns a dict with "model_exists" and "deployment_status".
ner_result = deploy_ner_model(es_client=es_client)

# Print a short summary of the returned status
print("\n--- NER Model Setup ---")
print(f"Model exists: {ner_result['model_exists']}")
print(f"Deployment status: {ner_result['deployment_status']}")

In [None]:
# Create pipeline and index if they don't exist.
# The pipeline is created first, then the index; both helpers leave an
# existing resource untouched.
create_ingest_pipeline(
    es_client=es_client,
    pipeline_file=es_ingest_pipeline_file,
    pipeline_name=es_ingest_pipeline_name,
)

# As the last expression in the cell, the return value of create_index is
# what the notebook displays.
create_index(
    es_client=es_client, index_file=es_index_settings_file, index_name=es_index_name
)

In [None]:
# Load every NDJSON file found in `data_dir` into the index via the bulk API,
# using the function's default chunk size and request timeout.
load_data_to_elasticsearch(
    es_client=es_client,
    es_index_name=es_index_name,
    data_dir=data_dir,
)