# 01_tag_id_features preview

Quick inspection of the tag lookup table and the train/test wide matrices generated in `data/interim/01_tag_id_features/`.



In [None]:
from collections import OrderedDict
from pathlib import Path

import pandas as pd
from IPython.display import display

# Show every column and up to 200 rows when frames are rendered below.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 200)

# Location of this notebook relative to the directory it is assumed to be
# launched from when `__file__` is unavailable.
NOTEBOOK_RELATIVE_PATH = Path("notebooks/data/interim/01_tag_id_features/preview.ipynb")
try:
    NOTEBOOK_PATH = Path(__file__)
except NameError:  # running interactively: no __file__, anchor on the cwd
    NOTEBOOK_PATH = Path.cwd().joinpath(NOTEBOOK_RELATIVE_PATH)
NOTEBOOK_PATH = NOTEBOOK_PATH.resolve()


def resolve_project_root(notebook_path: Path) -> Path:
    """Return the nearest ancestor of *notebook_path* holding `data` and `src`.

    Ancestors are checked from the closest parent outwards. If none of them
    contains both entries, the outermost ancestor is returned instead.
    """
    markers = ("data", "src")
    found = next(
        (
            ancestor
            for ancestor in notebook_path.parents
            if all((ancestor / marker).exists() for marker in markers)
        ),
        None,
    )
    if found is not None:
        return found
    # No ancestor qualifies: fall back to the top of the parent chain.
    return notebook_path.parents[-1]


# Root directory found by walking up from the notebook location.
PROJECT_ROOT = resolve_project_root(NOTEBOOK_PATH)
# Directory holding the parquet artefacts previewed by this notebook.
OUTPUT_DIR = PROJECT_ROOT.joinpath("data", "interim", "01_tag_id_features")

# Maps each feature name in the tag lookup to the prefix of its matrix columns
# (columns are addressed as "<column_prefix>_<tag_id>").
TAG_FEATURES = OrderedDict(
    (feature_name, {"column_prefix": prefix})
    for feature_name, prefix in (
        ("unit_tag_id", "unit_tag"),
        ("building_tag_id", "building_tag"),
        ("statuses", "status_tag"),
    )
)

TAG_IDS_PATH = OUTPUT_DIR.joinpath("tag_ids.parquet")
TRAIN_MATRIX_PATH = OUTPUT_DIR.joinpath("train_tag_ids.parquet")
TEST_MATRIX_PATH = OUTPUT_DIR.joinpath("test_tag_ids.parquet")
# Number of rows shown in each matrix preview.
HEAD_ROWS = 10


def read_parquet_or_warn(path: Path) -> pd.DataFrame | None:
    """Load *path* as a parquet file, or warn and return ``None`` if absent.

    The warning prints the path relative to ``PROJECT_ROOT``.
    """
    if path.exists():
        return pd.read_parquet(path)
    print(f"⚠️ Missing file: {path.relative_to(PROJECT_ROOT)}")
    return None



In [None]:
def summarize_tag_lookup(df: pd.DataFrame) -> None:
    """Print the lookup table's location and shape, then display a summary.

    Shows the number of lookup rows per ``feature_name`` (labelled
    ``unique_tag_ids``) followed by the first 20 rows of *df*.
    """
    n_rows, n_cols = df.shape
    print("=== tag_ids lookup ===")
    print(f"path: {TAG_IDS_PATH.relative_to(PROJECT_ROOT)}")
    print(f"shape: {n_rows} rows x {n_cols} cols")

    # One row per feature with the count of lookup entries it owns.
    rows_per_feature = df.groupby("feature_name").size()
    display(rows_per_feature.reset_index(name="unique_tag_ids"))
    display(df.head(20))


def summarize_matrix(label: str, path: Path, df: pd.DataFrame, tag_lookup: pd.DataFrame) -> None:
    """Print and display a per-feature summary of one wide tag matrix.

    For every feature in ``TAG_FEATURES`` the expected matrix columns are
    derived from *tag_lookup* as ``"<column_prefix>_<tag_id>"``. Columns that
    are absent from *df* are reported; the present ones are aggregated into
    one summary row per feature. A preview of the first ``HEAD_ROWS`` rows
    (``data_id`` plus the first 15 other columns) is displayed last.

    Args:
        label: Name shown in the section header (e.g. ``"train"``).
        path: File the matrix was loaded from; printed relative to
            ``PROJECT_ROOT``.
        df: The wide matrix to summarise.
        tag_lookup: Lookup table with ``feature_name`` and ``tag_id`` columns.
    """
    print(f"\n=== {label} matrix ===")
    print(f"path: {path.relative_to(PROJECT_ROOT)}")
    print(f"shape: {df.shape[0]} rows x {df.shape[1]} cols")

    block_summaries = []
    for feature_name, meta in TAG_FEATURES.items():
        feature_mask = tag_lookup["feature_name"] == feature_name
        tag_ids = tag_lookup.loc[feature_mask, "tag_id"].tolist()
        columns = [f"{meta['column_prefix']}_{tag}" for tag in tag_ids]
        present_cols = [col for col in columns if col in df.columns]
        n_missing = len(columns) - len(present_cols)
        if n_missing:
            print(f"⚠️ {feature_name}: {n_missing} columns missing from matrix (unexpected)")
        if not present_cols:
            continue
        # Row-wise totals skip NaN cells (pandas default), and every other
        # statistic is derived from them so the figures stay consistent.
        per_row = df[present_cols].sum(axis=1)
        block_summaries.append(
            {
                "feature_name": feature_name,
                "n_columns": len(present_cols),
                "mean_tags_per_row": per_row.mean(),
                "rows_with_any_tag_pct": (per_row > 0).mean() * 100,
                # Reusing per_row avoids a second pass over the block and the
                # int(nan) failure a raw array sum hits on any NaN cell.
                "total_tag_hits": int(per_row.sum()),
            }
        )
    if block_summaries:
        display(pd.DataFrame(block_summaries))

    # Lead the preview with data_id when the matrix has it as a column;
    # otherwise just show the first feature columns instead of raising.
    id_cols = ["data_id"] if "data_id" in df.columns else []
    other_cols = [col for col in df.columns if col != "data_id"][:15]
    display(df[[*id_cols, *other_cols]].head(HEAD_ROWS))



In [None]:
# Load the lookup table; later cells skip their work when it is unavailable.
tag_lookup = read_parquet_or_warn(TAG_IDS_PATH)
if tag_lookup is None:
    print("⚠️ Skipping matrix inspection because tag lookup is missing.")
else:
    summarize_tag_lookup(tag_lookup)



In [None]:
# Matrix summaries need the lookup table, so do nothing when it failed to load.
if tag_lookup is not None:
    matrices = [("train", TRAIN_MATRIX_PATH), ("test", TEST_MATRIX_PATH)]
    for label, path in matrices:
        matrix_df = read_parquet_or_warn(path)
        # A missing file has already been reported; move on to the next one.
        if matrix_df is not None:
            summarize_matrix(label, path, matrix_df, tag_lookup)

