# HIERARCHY ENGINE.ipynb 
# CORE BACKEND LOGIC

---
## Overview
- Docstring and Imports
- Class Initialization
- Build Semantic Layers
- Extract Semantic Layer
- Get Semantic Layer
- Rename and Reassignment
- Apply Semantic Layer Updates
- Cleanup/Normalization
- Recluster
- Merge
- Split
- Build Attribute Layer
- Attribute Layer Helpers
- Final Summarization

## 1 - Docstring and Imports
The Hierarchy Engine is the heart of the application. This script is responsible for building the Category Layer, Semantic Layer, and Attribute Layer. Additionally, the script implements a human-supervision element, allowing users to edit semantic layers by moving items between clusters OR merging/splitting/renaming clusters. The Hierarchy Engine acts as the middleman between the UI and the layer-specific logic. 

The Hierarchy Engine is constructed using internal functions imported from the Semantic Layer, Attribute Layer, and Category Layer. Pandas is used for data management, and annotations/type hints are incorporated for clarity. 

In [None]:
# core/hierarchy_engine.py

"""
This file implements the Hierarchy Engine, the engine responsible for
implementing layer-specific logic within the application. 

Currently, the engine builds a hierarchy of the form:

    Root (Semantic Layer)
        → Level 1 (Semantic Layer)
            → Level 2 (Category Layer)
                → Level 3 (Attribute Layer, cached)

Future versions of this project will allow greater flexibility,
such as specifying desired number of layers and the type of layer
for each level.

It also supports interactive operations such as renaming clusters,
reassigning labels, merging and splitting semantic clusters, and
re-running clustering logic from scratch. These capabilities are designed
to support both backend pipelines and UI-driven editing workflows.

Public API used by the hierarchy engine consumer (e.g., UI or pipeline):

    HierarchyEngine(
        df,
        category_col,
        attribute_method="sparsity",
        attribute_excluded_cols=None
    )
        Initializes the hierarchy engine and builds semantic layers.

    get_semantic_layer(level) -> pd.DataFrame
        Returns a dataframe describing the requested semantic layer
        (level 0 or level 1) with columns: label, id, name.

    apply_semantic_changes(level, rename_map, reassignment_map) -> None
        Applies user-driven renames and label reassignments to a semantic
        layer and propagates changes through dependent layers.

    semantic_recluster(level) -> None
        Rebuilds semantic clustering from scratch for the specified level.

    merge_semantic_clusters(level, from_cluster, to_cluster) -> None
        Merges one semantic cluster into another.

    split_semantic_cluster(level, from_cluster, labels_to_move) -> None
        Manually splits a semantic cluster by moving selected labels into
        a new cluster.

    get_attribute_layer() -> pd.DataFrame
        Returns the attribute-layer view with cluster IDs and names,
        computing it lazily if necessary.

    attribute_recluster(method=None) -> None
        Re-runs attribute-based clustering using the specified method.

    apply_attribute_changes(rename_map, reassignment_map) -> None
        Applies user edits to attribute cluster names and assignments.

    get_hierarchy_df() -> pd.DataFrame
        Returns the full hierarchy dataframe with all active layers.

Internal helpers:

    _build_semantic_layers() -> None
        Constructs semantic layers in sequence.

    _extract_semantic_layer(level) -> pd.DataFrame
        Extracts a compact label/id/name table for a semantic layer.

    _apply_layer_back_to_hierarchy(level, df_layer) -> None
        Writes updated semantic layer assignments back into the hierarchy.

    _cleanup_semantic_layer_ids(df_layer) -> pd.DataFrame
        Normalizes cluster IDs to a contiguous 1..K range.

    _build_attribute_layer() -> None
        Computes and caches attribute-based clustering and naming.
"""

# Type hints
from __future__ import annotations
from typing import Dict, Optional, List

# External dependencies
import pandas as pd

# Internal dependencies
from .semantic_layer import build_semantic_layer
from .attribute_layer import assign_all_clusters, make_cluster_names
from .category_layer import ensure_or_generate_category_name

# 2 - Class Docstring and Initialization
This section outlines the class HierarchyEngine's responsibilities and initialization.

Initialization line-by-line:
- Class definition. HierarchyEngine accepts `df` (pd.DataFrame), `category_col` (string), `attribute_method` (string defaulting to "sparsity"), and `attribute_excluded_cols` (list of strings). In practice, `df` is the base dataset, `category_col` is the column containing category assignments in the base dataset, `attribute_method` is the desired Attribute Layer clustering method to use (sparsity-based vs. value-based), and `attribute_excluded_cols` is the list of columns to exclude from consideration when building the Attribute Layer.
- A defensive copy of `df` is created.
- Run `ensure_or_generate_category_name` to identify the category column OR create one if needed. Store the resulting dataframe as `df`.
- Store `df` as the full hierarchy `self._hier_df`.
- Initialize the cached Semantic layer and Attribute layer views to `None` (they are populated later).
- Store Attribute Layer configurations.
- Build semantic layers.

In [None]:
class HierarchyEngine:
    """
    Unified backend engine that owns the hierarchy dataframe.

    Responsibilities:
        - Build the current hierarchy
        - Support rename / reassign / merge / split operations
        - Keep the hierarchy dataframe consistent after each edit
        - Serve semantic layers to the UI
        - Produce attribute-layer clustering and expose it to the UI
    """

    def __init__(
        self,
        df: pd.DataFrame,
        category_col: Optional[str],
        attribute_method: str = "sparsity",
        attribute_excluded_cols: Optional[List[str]] = None,
    ):
        """
        Prepare the hierarchy dataframe and build both semantic layers.

        Parameters
        ----------
        df :
            Base dataset. It is copied, so the caller's frame is not modified
            by this constructor.
        category_col :
            Passed to `ensure_or_generate_category_name` together with the
            excluded columns; may be None.
        attribute_method :
            Attribute clustering method, "sparsity" or "value".
        attribute_excluded_cols :
            Extra columns to exclude; an empty or missing list is stored
            as None.
        """
        # Work on a private copy and make sure `category_name` is available.
        prepared = ensure_or_generate_category_name(
            df.copy(),
            category_col,
            extra_excluded_cols=attribute_excluded_cols,
        )

        # Full hierarchy (the attribute layer is added to it later).
        self._hier_df: pd.DataFrame = prepared

        # Cached layer views; filled in by the builders.
        self._semantic_layer_0: pd.DataFrame | None = None
        self._semantic_layer_1: pd.DataFrame | None = None
        self._attribute_layer_df: pd.DataFrame | None = None

        # Attribute layer configuration.
        if attribute_excluded_cols:
            excluded: list[str] | None = list(attribute_excluded_cols)
        else:
            excluded = None
        self._attribute_excluded_cols: list[str] | None = excluded
        self._attribute_method: str = attribute_method  # "sparsity" or "value"

        # Semantic layers are built eagerly.
        self._build_semantic_layers()

# 3 - Build Semantic Layers

This function is responsible for building semantic layers. Currently, the function builds the top layer (level 0) and the next layer (level 1). Future versions of this project will allow for users to specify how many layers to build and the order to place them in.

Line-by-line breakdown:
- Private function definition with no return object.
- Build `df1` using `build_semantic_layer`, passing in `self._hier_df`. 
- Build `df0` using `build_semantic_layer`, passing `df1`.
- Store `df0` as `self._hier_df`.
- Extract unique label tables for both levels.

NOTE: The layer immediately above the Category Layer is built first, since it is based on the names of categories. Then, the highest layer (level 0) is built second.

In [None]:
    # ============================================================
    # Build semantic layers
    # ============================================================

    def _build_semantic_layers(self) -> None:
        """
        Build both semantic layers and refresh their cached views.

        category_name → level_1 (Semantic Layer 1) → level_0 (Semantic Layer 0)

        Level 1 is built first because level 0 uses `level_1_name` as its
        input label column.
        """
        # Semantic Layer 1: built from the category names.
        with_level_1 = build_semantic_layer(
            self._hier_df,
            input_label_col="category_name",
            n_clusters=None,
            output_prefix="level_1",
        )

        # Semantic Layer 0: built from the level-1 cluster names.
        self._hier_df = build_semantic_layer(
            with_level_1,
            input_label_col="level_1_name",
            n_clusters=None,
            output_prefix="level_0",
        )

        # Refresh the compact label/id/name tables for both levels.
        self._semantic_layer_1 = self._extract_semantic_layer(1)
        self._semantic_layer_0 = self._extract_semantic_layer(0)

# 4 - Extract Semantic Layer

This function is responsible for extracting a specified layer as a dataframe. The function takes an integer `level` and returns a dataframe. In practice, `level` is the desired Semantic layer to extract, and the return dataframe is the layer itself containing the item label, cluster ID, and cluster name. 

Line-by-line breakdown:
- Declaration and docstring.
- Check if `level` is 1. If so, then store self._hier_df (containing the category column, level 1 cluster IDs and level 1 cluster names) as `df`. Drop any duplicates and sort on ID number, and rename columns appropriately. Then, return the dataframe.
- Otherwise, if `level` is 0, then store self._hier_df (containing the level 1 cluster names, level 0 cluster IDs and level 0 cluster names) as `df`. Drop any duplicates and sort on ID number, and rename columns appropriately. Then, return the dataframe.
- Otherwise, raise a descriptive error statement.

In [None]:
    def _extract_semantic_layer(self, level: int) -> pd.DataFrame:
        """
        level=1 → [category_name, level_1_id, level_1_name]
        level=0 → [level_1_name, level_0_id, level_0_name]
        """
        if level == 1:
            df = self._hier_df[["category_name", "level_1_id", "level_1_name"]]
            df = df.drop_duplicates().sort_values("level_1_id")
            df.columns = ["label", "id", "name"]
            return df.reset_index(drop=True)

        if level == 0:
            df = self._hier_df[["level_1_name", "level_0_id", "level_0_name"]]
            df = df.drop_duplicates().sort_values("level_0_id")
            df.columns = ["label", "id", "name"]
            return df.reset_index(drop=True)

        raise ValueError("Semantic level must be 0 or 1")

# 5 - Get Semantic Layers

This simple public function returns an extracted semantic layer as a dataframe, based on the integer argument `level`. If an invalid number is passed in, then a descriptive error statement is raised.

In [None]:
    def get_semantic_layer(self, level: int) -> pd.DataFrame:
        if level == 1:
            return self._semantic_layer_1.copy()
        if level == 0:
            return self._semantic_layer_0.copy()
        raise ValueError("Semantic level must be 0 or 1")

# 6 - Rename and Reassignment

This public function modifies a Semantic layer, normalizes it, and pushes it back into `_hier_df`. This function takes an integer `level` (which layer to modify), a dictionary of integer and string pairs `rename_map`, and a dictionary of string and integer pairs `reassignment_map`. In practice, `rename_map` is a dictionary of cluster IDs and desired new names, and `reassignment_map` is a dictionary of item labels and desired new cluster assignments. The function does not have a return object since it is just modifying an object attribute in place.

Line-by-line breakdown:
- Retrieve a level-specific layer dataframe.
- Iterate through all requested cluster renames, and set the `name` column of each cluster to the desired new name.
- Iterate through all requested item reassignments, and set the `id` column of each item to the desired new cluster assignment.
- Cleanup cluster IDs to ensure it is ascending from 1 to the total number of clusters.
- Apply the changes to the actual hierarchy.
- Re-store the layer that was modified.

In [None]:
    def apply_semantic_changes(
        self,
        level: int,
        rename_map: dict[int, str],
        reassignment_map: dict[str, int],
    ) -> None:
        df_layer = self.get_semantic_layer(level)

        # -------------------------------------
        # Rename clusters
        # -------------------------------------
        for cid, new_name in rename_map.items():
            df_layer.loc[df_layer["id"] == cid, "name"] = new_name

        # -------------------------------------
        # Move items
        # -------------------------------------
        for label, new_cid in reassignment_map.items():
            df_layer.loc[df_layer["label"] == label, "id"] = new_cid

        # -------------------------------------
        # Cleanup ids (ensure ascending 1..K)
        # -------------------------------------
        df_layer = self._cleanup_semantic_layer_ids(df_layer)

        # -------------------------------------
        # Push changes into hierarchy_df
        # -------------------------------------
        self._apply_layer_back_to_hierarchy(level, df_layer)

        # -------------------------------------
        # Re-store layer
        # -------------------------------------
        if level == 1:
            self._semantic_layer_1 = df_layer
        else:
            self._semantic_layer_0 = df_layer

# 7 - Apply Semantic Layer to Hierarchy DF

This private function handles updating the hierarchy directly after updating a Semantic layer. The function accepts an integer `level` (specified modified layer) and a dataframe `df_layer` (the modified layer itself), and it does not return an object.

Line-by-line breakdown:
- Check if the desired level is 1. If so, build a mapping from the label (category name) to the new cluster IDs/names, depending on the edit.
- Write updated level 1 ID into the main hierarchy dataframe.
- Write updated level 1 name into the main hierarchy dataframe.
- Rebuild level 0 based on the updated level 1.
- Extract the updated layer.
- If the desired level is not 1, then perform the same sequence of events on level 0, using level 1 names instead of category names. In this case, level 1 does not need to be rebuilt since it is not reliant on level 0 labels.
- Erase Attribute Layer, if it exists.

In [None]:
    def _apply_layer_back_to_hierarchy(self, level: int, df_layer: pd.DataFrame) -> None:
        if level == 1:
            # Map category_name → new ids/names
            m = df_layer.set_index("label")[["id", "name"]].to_dict(orient="index")
            self._hier_df["level_1_id"] = self._hier_df["category_name"].map(
                lambda x: m[x]["id"]
            )
            self._hier_df["level_1_name"] = self._hier_df["category_name"].map(
                lambda x: m[x]["name"]
            )

            # Level 0 depends on level 1 labels → must rebuild
            self._hier_df = build_semantic_layer(
                self._hier_df,
                input_label_col="level_1_name",
                n_clusters=None,
                output_prefix="level_0",
            )
            self._semantic_layer_0 = self._extract_semantic_layer(0)

        else:
            # Level 0 maps from level_1_name
            m = df_layer.set_index("label")[["id", "name"]].to_dict(orient="index")
            self._hier_df["level_0_id"] = self._hier_df["level_1_name"].map(
                lambda x: m[x]["id"]
            )
            self._hier_df["level_0_name"] = self._hier_df["level_1_name"].map(
                lambda x: m[x]["name"]
            )

        # Attribute layer cache is now stale (if it existed)
        self._attribute_layer_df = None

# 8 - Cleanup: Ensure Contiguous IDs

This private function normalizes cluster IDs to ensure that there are no gaps, zeros, or negative numbers. It accepts a dataframe `df_layer` (the level to normalize) and returns it upon updating.

Line-by-line breakdown:
- Get unique cluster IDs.
- Remap cluster IDs into a new sequential order from 1 to K.
- Update ID columns in the current layer's dataframe and return the updated dataframe.

In [None]:
    def _cleanup_semantic_layer_ids(self, df_layer: pd.DataFrame) -> pd.DataFrame:
 
        unique_ids = sorted(df_layer["id"].unique())
        remap = {old: i + 1 for i, old in enumerate(unique_ids)}
        df_layer["id"] = df_layer["id"].map(remap)
        return df_layer

# 9 - Recluster Semantic Layers

This public function rebuilds a specified semantic layer. It accepts an integer `level` (layer to rebuild), and it does not return an object since it is modifying self objects.

Line-by-line breakdown:
- Check if desired level is level 1. If so, run `build_semantic_layer` and store as `df1`, then assign to self._hier_df. Extract the layer to store it as self._semantic_layer_1. Do the same for level 0, since it is dependent on level 1.
- Otherwise, only perform the level 0 applications.
- Erase Attribute layer, if it exists.

NOTE: This function is similar to `_build_semantic_layers` and `_apply_layer_back_to_hierarchy`, but it is public instead of private.

In [None]:
    def semantic_recluster(self, level: int) -> None:

        if level == 1:
            df1 = build_semantic_layer(
                self._hier_df,
                input_label_col="category_name",
                n_clusters=None,
                output_prefix="level_1",
            )
            self._hier_df = df1
            self._semantic_layer_1 = self._extract_semantic_layer(1)

            df0 = build_semantic_layer(
                self._hier_df,
                input_label_col="level_1_name",
                n_clusters=None,
                output_prefix="level_0",
            )
            self._hier_df = df0
            self._semantic_layer_0 = self._extract_semantic_layer(0)
        else:
            df0 = build_semantic_layer(
                self._hier_df,
                input_label_col="level_1_name",
                n_clusters=None,
                output_prefix="level_0",
            )
            self._hier_df = df0
            self._semantic_layer_0 = self._extract_semantic_layer(0)

        # Semantic changes invalidate attribute layer cache
        self._attribute_layer_df = None

# 10 - Merge Semantic Clusters

This public function handles semantic cluster merging logic. The function accepts an integer `level` (layer), an integer `from_cluster` (the cluster to be absorbed), and an integer `to_cluster` (the cluster absorbing). The function does not return anything.

Line-by-line breakdown:
- Retrieve the Semantic layer for the specified level.
- For every row in the layer table whose cluster ID equals `from_cluster`, set it to `to_cluster`.
- Clean up and normalize cluster IDs.
- Push new structure into the main hierarchy.
- Store the updated layer directly.

In [None]:
    def merge_semantic_clusters(self, level: int, from_cluster: int, to_cluster: int) -> None:
        df_layer = self.get_semantic_layer(level)

        # Update IDs
        df_layer.loc[df_layer["id"] == from_cluster, "id"] = to_cluster

        # Clean IDs
        df_layer = self._cleanup_semantic_layer_ids(df_layer)

        # Write back to hierarchy
        self._apply_layer_back_to_hierarchy(level, df_layer)

        # Store updated layer
        if level == 1:
            self._semantic_layer_1 = df_layer
        else:
            self._semantic_layer_0 = df_layer

# 11 - Split Semantic Clusters

This public function splits a semantic cluster into 2 separate clusters by moving selected items/labels/categories into a brand new cluster. The function accepts an integer `level` (layer), an integer `from_cluster` (cluster ID to split), and a list of strings `labels_to_move` (labels in `from_cluster` to be peeled off into a new cluster). The function does not return anything.

Line-by-line breakdown:
- Store the desired layer in `df_layer`.
- Raise a clear error statement if `labels_to_move` did not receive a parameter.
- Store convenience aliases for the strings "label" and "id", so the column names are defined in one place.
- Create boolean mask selecting rows whose ID equals `from_cluster`.
- Pull the "label" values from the selected rows and convert to a set.
- Iterate through each label in `labels_to_move`, and include it in a list `invalid` if the label is not in the set `labels_in_cluster`.
- If anything is added to `invalid`, then raise a clear error statement.
- If all labels are selected to be split off, then raise a clear error statement (no logical reason for a user to do this).
- Create a new cluster ID (the value is 1 more than the current highest cluster ID).
- Move the selected labels into the new cluster.
- Normalize IDs and push back into main hierarchy.
- Store modified layer as self object.

In [None]:
    def split_semantic_cluster(
        self,
        level: int,
        from_cluster: int,
        labels_to_move: list[str],
    ) -> None:
    
        df_layer = self.get_semantic_layer(level)

        if not labels_to_move:
            raise ValueError("No labels provided to move.")

        # Short aliases
        label_col = "label"
        id_col = "id"

        # Validate labels belong to from_cluster
        in_cluster_mask = df_layer[id_col] == from_cluster
        labels_in_cluster = set(df_layer.loc[in_cluster_mask, label_col].tolist())

        invalid = [lbl for lbl in labels_to_move if lbl not in labels_in_cluster]
        if invalid:
            raise ValueError(
                f"Cannot split: these labels are not in cluster {from_cluster}: {invalid}"
            )

        if len(labels_in_cluster) == len(labels_to_move):
            raise ValueError(
                "Cannot move all items out of the cluster. At least one item must remain."
            )

        # Assign a fresh new cluster ID
        max_id = int(df_layer[id_col].max())
        new_id = max_id + 1

        df_layer.loc[df_layer[label_col].isin(labels_to_move), id_col] = new_id

        # Cleanup & write back
        df_layer = self._cleanup_semantic_layer_ids(df_layer)
        self._apply_layer_back_to_hierarchy(level, df_layer)

        if level == 1:
            self._semantic_layer_1 = df_layer
        else:
            self._semantic_layer_0 = df_layer

# 12 - Build Attribute Layer

This private function utilizes the Attribute Layer functions to build the Attribute Layer. 

Line-by-line breakdown:
- Create a safe copy of the main hierarchy.
- Retrieve columns to be excluded from clustering.
- Retrieve method to be used for clustering.
- Run `assign_all_clusters` and store the resulting dataframe as `df`.
- Run `make_cluster_names` to create cluster names.
- Create a stable column name for the engine level.
- Update self._hier_df and store the Attribute layer view.

In [None]:
    def _build_attribute_layer(self) -> None:
        """
        Compute attribute-based clusters and names, then cache the results.

        Replaces `self._hier_df` with the named frame (which gains
        `attribute_cluster_id` / `attribute_cluster_name`) and stores a
        de-duplicated [category_name, attribute_cluster_id,
        attribute_cluster_name] view in `self._attribute_layer_df`.
        """
        # Configuration; the defaults cover instances missing these attributes.
        excluded_cols = getattr(self, "_attribute_excluded_cols", None)
        clustering_method = getattr(self, "_attribute_method", "sparsity")

        # 1) Cluster assignment runs on a copy of the hierarchy.
        clustered = assign_all_clusters(
            self._hier_df.copy(),
            random_state=42,
            extra_excluded_cols=excluded_cols,
            method=clustering_method,
        )

        # 2) Cluster naming; only the second returned element is used here.
        _, named = make_cluster_names(
            clustered,
            extra_excluded_cols=excluded_cols,
        )

        # 3) Mirror the cluster columns under the engine's own column names.
        named["attribute_cluster_id"] = named["category_cluster"]
        named["attribute_cluster_name"] = named["category_cluster_name"]

        # Cache full hierarchy + attribute-layer view.
        view_cols = [
            "category_name",
            "attribute_cluster_id",
            "attribute_cluster_name",
        ]
        self._hier_df = named
        self._attribute_layer_df = (
            named[view_cols].drop_duplicates().reset_index(drop=True)
        )

# 13 - Attribute Layer Getters/Mutators

These functions are useful helper functions for managing the Attribute Layer. See the beginning docstring for information on each.

In [None]:
    def get_attribute_layer(self) -> pd.DataFrame:
        """
        Returns a dataframe for the attribute layer:
            category_name, attribute_cluster_id, attribute_cluster_name
        """
        if self._attribute_layer_df is None:
            self._build_attribute_layer()
        return self._attribute_layer_df.copy()
    
    def get_attribute_method(self) -> str:
        """
        Returns the currently configured attribute clustering method:
        'sparsity' or 'value'.
        """
        return getattr(self, "_attribute_method", "sparsity")
    

    def attribute_recluster(self, method: str | None = None) -> None:
        """
        Discard the cached attribute layer and rebuild it.

        Parameters
        ----------
        method :
            'sparsity' or 'value'. When given, it becomes the configured
            method before rebuilding; None keeps the current method.
        """
        method_given = method is not None
        if method_given:
            self._attribute_method = method

        # Clear the cache first so a failed rebuild does not leave a stale view.
        self._attribute_layer_df = None
        self._build_attribute_layer()


    def apply_attribute_changes(
        self,
        rename_map: Dict[int, str],
        reassignment_map: Dict[str, int],
    ) -> None:
        """
        Apply user edits to the attribute layer:

        - rename_map: attribute_cluster_id → new name
        - reassignment_map: category_name → new attribute_cluster_id

        Note: reassignment by category_name updates all rows belonging
        to that category to the new attribute cluster.
        """
        if self._attribute_layer_df is None:
            self._build_attribute_layer()

        df = self._hier_df.copy()
        layer_df = self._attribute_layer_df.copy()

        # 1) Reassign categories → new attribute_cluster_id
        for cat_name, new_cid in reassignment_map.items():
            new_cid = int(new_cid)

            mask_h = df["category_name"] == cat_name
            df.loc[mask_h, "attribute_cluster_id"] = new_cid

            mask_l = layer_df["category_name"] == cat_name
            layer_df.loc[mask_l, "attribute_cluster_id"] = new_cid

        # 2) Apply renames: attribute_cluster_id → new name
        for cid, new_name in rename_map.items():
            cid = int(cid)

            mask_l = layer_df["attribute_cluster_id"] == cid
            layer_df.loc[mask_l, "attribute_cluster_name"] = new_name

            mask_h = df["attribute_cluster_id"] == cid
            df.loc[mask_h, "attribute_cluster_name"] = new_name

        # 3) Save back
        self._hier_df = df
        self._attribute_layer_df = (
            layer_df[
                [
                    "category_name",
                    "attribute_cluster_id",
                    "attribute_cluster_name",
                ]
            ]
            .drop_duplicates()
            .reset_index(drop=True)
        )

    def set_attribute_excluded_columns(self, cols: list[str] | None) -> None:
        """
        Set additional columns to exclude from attribute clustering and naming.
        Passing None or [] resets to default behavior.
        """
        if cols is None:
            self._attribute_excluded_cols = None
        else:
            # Store as simple list of strings
            self._attribute_excluded_cols = [str(c) for c in cols]

        # Changing exclusions invalidates any cached attribute layer
        self._attribute_layer_df = None



# 14 - Final Summary

This public function retrieves the current main hierarchy to be used for displaying and exporting results in the application.

In [None]:
    def get_hierarchy_df(self) -> pd.DataFrame:
        return self._hier_df.copy()