# Devices Report (CSV Exports)

This notebook rebuilds the DisMAL `devices` report using raw Discovery CSV exports.
It reads appliance details from `config.yaml`, loops over each instance, and
writes the merged dataset to the standard output folders.


## Requirements

We rely on `pandas` for data wrangling and `PyYAML` for reading `config.yaml`. Uncomment the `%pip` line at the top of the cell below to install them if needed.


In [None]:
# %pip install -q pandas pyyaml

import math
from ast import literal_eval
from pathlib import Path
from typing import Any, Dict, List

import pandas as pd
import yaml


## Configuration

Adjust these values to control where CSV exports are loaded from and where results are written.


In [None]:
# Root folder that contains one sub-folder per appliance (e.g., raw_exports/dev).
# A relative path is resolved against the notebook's working directory, and each
# sub-folder name is matched against the appliance `name` from config.yaml.
RAW_EXPORT_ROOT = Path("../raw_exports")

# Optional filters (set to a list like ["prod"] to limit processing).
# An empty/None include list means "no include filter"; excludes always apply.
INCLUDE_INSTANCES = None  # or list of appliance names to process exclusively
EXCLUDE_INSTANCES = []    # names to skip even if present in config

# Optional credential UUID filter. Only the text after the last "/" is compared
# (case-insensitively), so the value may be given with or without a path prefix.
DEVICES_WITH_CRED_UUID = None  # e.g., "7636fe3b4bd69466ab487f0000010700"

# Optional base directory override for outputs. When None, results are written
# under the repository root; either way an "output_<target>" sub-folder is
# created per appliance and OUTPUT_FILENAME is written inside it.
OUTPUT_BASE_DIR = None  # e.g., Path("../../csv_outputs")
OUTPUT_FILENAME = "devices.csv"


In [None]:
def find_repo_root(start: Path) -> Path:
    """Return the closest directory at or above *start* that looks like the repo root.

    A directory qualifies when it holds a ``config.yaml`` file or a ``.git``
    directory. When no ancestor qualifies, *start* itself is returned.
    """
    for candidate in (start, *start.parents):
        has_config = (candidate / "config.yaml").exists()
        if has_config or (candidate / ".git").is_dir():
            return candidate
    return start

# --- Locate the repository and load config.yaml ------------------------------
NOTEBOOK_DIR = Path.cwd()
REPO_ROOT = find_repo_root(NOTEBOOK_DIR)
CONFIG_PATH = REPO_ROOT / "config.yaml"

if not CONFIG_PATH.exists():
    raise FileNotFoundError(f"config.yaml not found at {CONFIG_PATH}")

with CONFIG_PATH.open("r", encoding="utf-8") as fh:
    cfg = yaml.safe_load(fh) or {}

# A YAML document may legally be a list or scalar; fail with a clear message
# instead of an AttributeError on the first `.get` call.
if not isinstance(cfg, dict):
    raise ValueError(
        f"config.yaml must contain a mapping at the top level, got {type(cfg).__name__}"
    )

# --- Collect appliance definitions -------------------------------------------
# `appliances` may be a list of mappings or a single mapping.
appliance_entries = cfg.get("appliances") or []
if isinstance(appliance_entries, dict):
    appliance_entries = [appliance_entries]

# Fall back to the top-level `name` / `target` keys when no list is configured.
if not appliance_entries:
    default_target = cfg.get("target")
    name = cfg.get("name") or (default_target or "default")
    appliance_entries = [{"name": name, "target": default_target}]

available_appliances: List[Dict[str, Any]] = []
for entry in appliance_entries:
    # Ignore malformed entries (e.g. bare strings) rather than crashing on `.get`.
    if not isinstance(entry, dict):
        continue
    name = str(entry.get("name") or "").strip()
    target = str(entry.get("target") or "").strip()
    if not name:
        continue
    available_appliances.append({"name": name, "target": target})

if not available_appliances:
    raise ValueError("No appliances with a name found in config.yaml")

# --- Discover raw export folders ---------------------------------------------
# Relative export roots are interpreted relative to the notebook directory.
exports_root = RAW_EXPORT_ROOT if RAW_EXPORT_ROOT.is_absolute() else (NOTEBOOK_DIR / RAW_EXPORT_ROOT).resolve()
if not exports_root.exists():
    raise FileNotFoundError(f"Raw export root not found: {exports_root}")

available_dirs = {path.name: path for path in exports_root.iterdir() if path.is_dir()}

# --- Apply include / exclude filters -----------------------------------------
# A bare string is treated as a single appliance name; passing it straight to
# set() would split it into individual characters and filter everything out.
include_set = {INCLUDE_INSTANCES} if isinstance(INCLUDE_INSTANCES, str) else set(INCLUDE_INSTANCES or [])
exclude_set = {EXCLUDE_INSTANCES} if isinstance(EXCLUDE_INSTANCES, str) else set(EXCLUDE_INSTANCES or [])

selected_appliances: List[Dict[str, Any]] = []
skipped_missing: List[str] = []
skipped_filtered: List[str] = []

for appliance in available_appliances:
    name = appliance["name"]
    if include_set and name not in include_set:
        skipped_filtered.append(name)
        continue
    if name in exclude_set:
        skipped_filtered.append(name)
        continue
    # An appliance is only processed when a same-named export folder exists.
    export_dir = available_dirs.get(name)
    if not export_dir:
        skipped_missing.append(name)
        continue
    selected_appliances.append({
        "name": name,
        "target": appliance.get("target", ""),
        "export_dir": export_dir,
    })

# --- Report what was found ---------------------------------------------------
print(f"Repo root     : {REPO_ROOT}")
print(f"Config path   : {CONFIG_PATH}")
print(f"Exports root  : {exports_root}")
print(f"Appliances in config : {[a['name'] for a in available_appliances]}")
print(f"Raw export dirs      : {sorted(available_dirs)}")
print(f"Selected appliances  : {[a['name'] for a in selected_appliances]}")
if skipped_missing:
    print(f"Missing export folders: {skipped_missing}")
if skipped_filtered:
    print(f"Skipped by filter     : {skipped_filtered}")

if not selected_appliances:
    raise RuntimeError("No appliances selected for processing – check raw exports and filters.")


In [None]:
# Lower-cased string spellings that are treated as "no value".
INVALID_STRINGS = {"", "none", "nan", "null"}


def is_missing(value) -> bool:
    """Tell whether *value* represents an absent cell.

    ``None``, a float NaN, and strings that (after stripping and lower-casing)
    appear in ``INVALID_STRINGS`` count as missing; anything else does not.
    """
    if isinstance(value, str):
        return value.strip().lower() in INVALID_STRINGS
    if isinstance(value, float):
        return math.isnan(value)
    return value is None

def to_clean_str(value):
    """Return *value* as a stripped string, or ``None`` when it is missing."""
    return None if is_missing(value) else str(value).strip()

def parse_listish(value):
    """Normalise a cell that may hold a list (or its text form) into a list of strings.

    * A real ``list`` is returned with missing items dropped and the rest
      converted to stripped strings.
    * A missing value yields ``[]``.
    * Text that looks like ``[...]`` is parsed with ``ast.literal_eval``; when
      that yields a list it is cleaned the same way as a real list.
    * Anything else, including bracketed text that cannot be parsed, is
      returned as a single-element list containing the stripped text.
    """
    if isinstance(value, list):
        return [str(v).strip() for v in value if not is_missing(v)]
    if is_missing(value):
        return []
    text = str(value).strip()
    if text.startswith("[") and text.endswith("]"):
        try:
            parsed = literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # literal_eval is documented to raise any of these on malformed
            # input (e.g. "[{[]: 1}]" raises TypeError). One odd cell should
            # degrade to its raw text rather than abort the whole report.
            return [text]
        if isinstance(parsed, list):
            return [str(v).strip() for v in parsed if not is_missing(v)]
    return [text]

def merge_lists(series) -> List[str]:
    """Flatten the list entries of *series* into one sorted list of unique items.

    Falsy entries and entries that are not lists are ignored; missing items
    inside the lists are dropped.
    """
    unique_items = {
        item
        for values in series
        if values and isinstance(values, list)
        for item in values
        if not is_missing(item)
    }
    return sorted(unique_items)

def gather_values(row, columns) -> List[str]:
    """Collect the distinct, non-missing values found in *columns* of *row*.

    Columns absent from *row* are skipped. List cells contribute each of their
    items; scalar cells contribute their stripped string form. The result is
    sorted.
    """
    found = set()
    for col in columns:
        if col in row:
            cell = row[col]
            if isinstance(cell, list):
                found.update(cell)
            elif not is_missing(cell):
                found.add(str(cell).strip())
    return sorted(item for item in found if not is_missing(item))

def clean_uuid(value):
    """Return the lower-cased final ``/``-separated segment of *value*.

    Missing values give ``None``.
    """
    text = to_clean_str(value)
    return None if text is None else text.split("/")[-1].lower()

def to_bool(value) -> bool:
    """Interpret *value* as a boolean flag.

    Real booleans pass through unchanged. Otherwise only the case-insensitive
    strings ``"true"``, ``"1"`` and ``"yes"`` are truthy; missing values and
    every other spelling are ``False``.
    """
    if isinstance(value, bool):
        return value
    text = to_clean_str(value)
    return text is not None and text.lower() in {"true", "1", "yes"}

def collect_unique(series) -> List[str]:
    """Return the sorted, de-duplicated stripped strings of the non-missing items in *series*."""
    seen = set()
    for item in series:
        if not is_missing(item):
            seen.add(str(item).strip())
    return sorted(seen)


In [None]:
# Identity-export columns whose values are pooled into each row's "ips" list.
# Columns that are absent from a given export are simply skipped.
IDENTITY_IP_COLUMNS = [
    "DiscoveryAccess.endpoint",
    "Endpoint.endpoint",
    "DiscoveredIPAddress.ip_addr",
    "InferredElement.__all_ip_addrs",
    "NetworkInterface.ip_addr",
]
# Identity-export columns whose values are pooled into each row's "names" list.
IDENTITY_NAME_COLUMNS = [
    "InferredElement.name",
    "InferredElement.hostname",
    "InferredElement.local_fqdn",
    "InferredElement.sysname",
    "NetworkInterface.fqdns",
]


def build_output_dir(target: str) -> Path:
    """Return the output folder path for *target* (the folder is not created here).

    ``.``, ``:`` and ``/`` in the target are replaced by ``_`` and an empty
    target becomes ``unknown``. The folder lives under ``OUTPUT_BASE_DIR`` when
    that is set (expanded and resolved), otherwise under ``REPO_ROOT``.
    """
    safe_target = target or "unknown"
    for char in (".", ":", "/"):
        safe_target = safe_target.replace(char, "_")
    folder_name = f"output_{safe_target}"

    if OUTPUT_BASE_DIR is not None:
        return Path(OUTPUT_BASE_DIR).expanduser().resolve() / folder_name
    return REPO_ROOT / folder_name


def process_instance(instance: Dict[str, Any]) -> Dict[str, Any]:
    """Build and save the devices report for one appliance from its CSV exports.

    Reads ``devices_report_identities.csv`` and
    ``devices_report_last_discovery.csv`` from ``instance["export_dir"]``,
    joins every identity IP to its discovery-access rows, and writes one row
    per identity endpoint to ``build_output_dir(target) / OUTPUT_FILENAME``.

    Args:
        instance: Mapping with ``name``, ``export_dir`` (a ``Path``) and an
            optional ``target``; ``name`` is used when ``target`` is empty.

    Returns:
        A summary dict with keys ``instance``, ``target``, ``output_path``,
        ``rows`` and ``status``. ``status`` is ``"missing_csv"`` (with
        ``output_path`` ``None`` and ``rows`` 0) when either CSV is absent,
        otherwise ``"ok"``.

    Raises:
        ValueError: If the identities export has no
            ``DiscoveryAccess.endpoint`` column.
    """
    name = instance["name"]
    target = instance.get("target") or name
    export_dir: Path = instance["export_dir"]
    print(f"=== Processing {name} ({target}) ===")
    identities_csv = export_dir / "devices_report_identities.csv"
    last_discovery_csv = export_dir / "devices_report_last_discovery.csv"

    # Both exports are required; report the gap instead of raising so the
    # caller can continue with other appliances.
    missing_files = [str(p) for p in [identities_csv, last_discovery_csv] if not p.exists()]
    if missing_files:
        print(f"Missing required CSV files for {name}: {missing_files}")
        return {"instance": name, "target": target, "output_path": None, "rows": 0, "status": "missing_csv"}

    # ---- Identities: one entry per endpoint with all known IPs and names ----
    id_df = pd.read_csv(identities_csv)

    if "DiscoveryAccess.endpoint" not in id_df.columns:
        raise ValueError("Identities export missing 'DiscoveryAccess.endpoint' column")

    # The endpoint cell may be list-like; its first entry becomes the grouping
    # key while the full parsed list is kept for IP collection.
    identity_lists = id_df["DiscoveryAccess.endpoint"].apply(parse_listish)
    id_df["identity_endpoint"] = identity_lists.apply(lambda values: values[0] if values else None)
    id_df["identity_endpoint"] = id_df["identity_endpoint"].apply(to_clean_str)
    id_df["DiscoveryAccess.endpoint"] = identity_lists

    # Parse the remaining identity columns into lists (the endpoint column was
    # already handled above).
    for col in IDENTITY_IP_COLUMNS + IDENTITY_NAME_COLUMNS:
        if col == "DiscoveryAccess.endpoint":
            continue
        if col in id_df.columns:
            id_df[col] = id_df[col].apply(parse_listish)

    id_df["ips"] = id_df.apply(lambda row: gather_values(row, IDENTITY_IP_COLUMNS), axis=1)
    id_df["names"] = id_df.apply(lambda row: gather_values(row, IDENTITY_NAME_COLUMNS), axis=1)

    # Collapse rows sharing an endpoint; dropna=False keeps rows whose endpoint
    # is missing as their own group.
    identities_df = (
        id_df[["identity_endpoint", "ips", "names"]]
        .rename(columns={"identity_endpoint": "Identities.endpoint"})
        .groupby("Identities.endpoint", dropna=False)
        .agg({
            "ips": merge_lists,
            "names": merge_lists,
        })
        .reset_index()
        .rename(columns={"ips": "list_of_ips", "names": "list_of_names"})
    )

    print(f"Identity rows: {len(id_df)} | Unique endpoints: {len(identities_df)}")

    # ---- Last discovery: normalise the discovery-access export ----
    ld_df = pd.read_csv(last_discovery_csv)

    # Columns selected later on; any that the export lacks are added as empty
    # columns so the selections below cannot raise KeyError.
    required_ld_columns = [
        "DiscoveryAccess.endpoint",
        "DeviceInfo.hostname",
        "DiscoveryAccess.node_kind",
        "DiscoveryAccess.scan_starttime",
        "DiscoveryRun.label",
        "DiscoveryAccess.end_state",
        "DiscoveryAccess.result",
        "DiscoveryAccess.current_access",
        "DiscoveryAccess.host_node_updated",
        "DiscoveryAccess.scan_endtime_raw",
        "label",
        "username",
    ]
    for col in required_ld_columns:
        if col not in ld_df.columns:
            ld_df[col] = None

    # Reduce the credential reference to its final path segment, lower-cased.
    if "DeviceInfo.last_credential" in ld_df.columns:
        ld_df["last_cred_short"] = ld_df["DeviceInfo.last_credential"].apply(clean_uuid)
    else:
        ld_df["last_cred_short"] = None

    if "DiscoveryAccess.host_node_updated" in ld_df.columns:
        ld_df["DiscoveryAccess.host_node_updated"] = ld_df["DiscoveryAccess.host_node_updated"].apply(to_bool)

    # Unparseable end times become NaT and get rank -1, so any row with a
    # valid timestamp outranks them when picking the latest scan.
    ld_df["scan_end_ts"] = pd.to_datetime(ld_df.get("DiscoveryAccess.scan_endtime_raw"), utc=True, errors="coerce")
    ld_df["scan_end_rank"] = ld_df["scan_end_ts"].apply(lambda ts: ts.value if pd.notna(ts) else -1)

    # Optional credential filter. It narrows the discovery rows only; the left
    # join below still keeps every identity IP.
    if DEVICES_WITH_CRED_UUID:
        want_uuid = clean_uuid(DEVICES_WITH_CRED_UUID)
        ld_df = ld_df[ld_df["last_cred_short"] == want_uuid].copy()
        print(f"Filtered last discovery rows to credential {want_uuid} -> {len(ld_df)} rows")
    else:
        print(f"Last discovery rows: {len(ld_df)}")

    output_dir = build_output_dir(target)
    output_dir.mkdir(parents=True, exist_ok=True)

    # ---- Join: one row per (identity endpoint, IP, discovery access) ----
    # Explode to one row per IP and drop identities that have no usable IP.
    ip_map = identities_df[["Identities.endpoint", "list_of_ips"]].explode("list_of_ips", ignore_index=True)
    ip_map["list_of_ips"] = ip_map["list_of_ips"].apply(to_clean_str)
    ip_map = ip_map.dropna(subset=["list_of_ips"])

    merged = ip_map.merge(
        ld_df,
        left_on="list_of_ips",
        right_on="DiscoveryAccess.endpoint",
        how="left",
    )

    # Rows without a discovery match carry missing values after the left join;
    # clean_uuid maps those to None.
    if "last_cred_short" not in merged.columns:
        merged["last_cred_short"] = None
    merged["last_cred_short"] = merged["last_cred_short"].apply(clean_uuid)

    if "label" not in merged.columns:
        merged["label"] = None
    if "username" not in merged.columns:
        merged["username"] = None

    merged["cred_display"] = merged["last_cred_short"].apply(to_clean_str)

    # Unmatched rows have no rank; give them -1 so idxmax below never sees NaN.
    if "scan_end_rank" not in merged.columns:
        merged["scan_end_rank"] = -1
    else:
        merged["scan_end_rank"] = merged["scan_end_rank"].fillna(-1)

    print(f"Merged rows: {len(merged)}")

    # ---- Aggregate per identity endpoint ----
    grp = merged.groupby("Identities.endpoint", dropna=False)

    agg_device_names = grp["DeviceInfo.hostname"].apply(collect_unique).reset_index(name="all_device_names")
    agg_endpoints = grp["DiscoveryAccess.endpoint"].apply(collect_unique).reset_index(name="all_endpoints")
    agg_runs = grp["DiscoveryRun.label"].apply(collect_unique).reset_index(name="all_discovery_runs")
    agg_creds = grp["cred_display"].apply(collect_unique).reset_index(name="all_credentials_used")

    # Latest access per endpoint: the row with the highest scan-end rank.
    latest_idx = grp["scan_end_rank"].idxmax()
    latest = merged.loc[latest_idx.tolist(), [
        "Identities.endpoint",
        "DiscoveryAccess.endpoint",
        "DeviceInfo.hostname",
        "DiscoveryAccess.node_kind",
        "last_cred_short",
        "label",
        "username",
        "DiscoveryAccess.scan_starttime",
        "DiscoveryRun.label",
        "DiscoveryAccess.end_state",
        "DiscoveryAccess.result",
        "DiscoveryAccess.current_access",
    ]].copy()

    latest = latest.rename(columns={
        "DiscoveryAccess.endpoint": "last_scanned_ip",
        "DeviceInfo.hostname": "last_identity",
        "DiscoveryAccess.node_kind": "last_kind",
        "last_cred_short": "last_credential",
        "label": "last_credential_label",
        "username": "last_credential_username",
        "DiscoveryAccess.scan_starttime": "last_start_time",
        "DiscoveryRun.label": "last_run",
        "DiscoveryAccess.end_state": "last_endstate",
        "DiscoveryAccess.result": "last_result",
        "DiscoveryAccess.current_access": "last_access_method",
    })

    # Latest *successful* access per endpoint: same selection, restricted to
    # rows where host_node_updated is True.
    succ = merged[merged.get("DiscoveryAccess.host_node_updated") == True].copy()
    if succ.empty:
        # Empty frame with the final column names so the merge below still
        # produces the last_successful_* columns.
        last_succ = pd.DataFrame(columns=[
            "Identities.endpoint",
            "last_successful_identity",
            "last_successful_ip",
            "last_successful_credential",
            "last_successful_credential_label",
            "last_successful_credential_username",
            "last_successful_start_time",
            "last_successful_run",
            "last_successful_endstate",
        ])
    else:
        succ_idx = succ.groupby("Identities.endpoint", dropna=False)["scan_end_rank"].idxmax()
        last_succ = succ.loc[succ_idx.tolist(), [
            "Identities.endpoint",
            "DeviceInfo.hostname",
            "DiscoveryAccess.endpoint",
            "last_cred_short",
            "label",
            "username",
            "DiscoveryAccess.scan_starttime",
            "DiscoveryRun.label",
            "DiscoveryAccess.end_state",
        ]].copy()
        last_succ = last_succ.rename(columns={
            "DeviceInfo.hostname": "last_successful_identity",
            "DiscoveryAccess.endpoint": "last_successful_ip",
            "last_cred_short": "last_successful_credential",
            "label": "last_successful_credential_label",
            "username": "last_successful_credential_username",
            "DiscoveryAccess.scan_starttime": "last_successful_start_time",
            "DiscoveryRun.label": "last_successful_run",
            "DiscoveryAccess.end_state": "last_successful_endstate",
        })

    # ---- Assemble the report: latest access plus per-endpoint aggregates ----
    df_out = (
        latest
        .merge(agg_device_names, on="Identities.endpoint", how="left")
        .merge(agg_endpoints, on="Identities.endpoint", how="left")
        .merge(agg_runs, on="Identities.endpoint", how="left")
        .merge(agg_creds, on="Identities.endpoint", how="left")
        .merge(last_succ, on="Identities.endpoint", how="left")
        .merge(identities_df[["Identities.endpoint", "list_of_names"]], on="Identities.endpoint", how="left")
    )

    def fallback_identity(row):
        """Return the row's last_identity, else the first known name, else None."""
        primary = to_clean_str(row.get("last_identity"))
        if primary:
            return primary
        names = row.get("list_of_names")
        if isinstance(names, list) and names:
            return names[0]
        return None

    df_out["last_identity"] = df_out.apply(fallback_identity, axis=1)

    # Final column order of the report; helper columns such as
    # "Identities.endpoint" and "list_of_names" are dropped by this selection.
    desired_columns = [
        "last_scanned_ip",
        "last_identity",
        "last_kind",
        "all_device_names",
        "all_endpoints",
        "all_credentials_used",
        "all_discovery_runs",
        "last_credential",
        "last_credential_label",
        "last_credential_username",
        "last_start_time",
        "last_run",
        "last_endstate",
        "last_result",
        "last_access_method",
        "last_successful_identity",
        "last_successful_ip",
        "last_successful_credential",
        "last_successful_credential_label",
        "last_successful_credential_username",
        "last_successful_start_time",
        "last_successful_run",
        "last_successful_endstate",
    ]

    for col in desired_columns:
        if col not in df_out.columns:
            df_out[col] = None

    df_out = df_out[desired_columns]
    # The appliance target becomes the first column of every row.
    df_out.insert(0, "Discovery Instance", target)

    output_csv = output_dir / OUTPUT_FILENAME
    df_out.to_csv(output_csv, index=False)

    print(f"Output rows: {len(df_out)} | Saved to {output_csv}")
    # NOTE(review): `display` is not defined in this file; presumably the
    # IPython/Jupyter builtin - confirm before running outside a notebook.
    display(df_out.head(5))

    return {
        "instance": name,
        "target": target,
        "output_path": output_csv,
        "rows": int(len(df_out)),
        "status": "ok",
    }


In [None]:
# Run every selected appliance through the pipeline, then tabulate the
# per-appliance outcomes.
results = []
for appliance in selected_appliances:
    results.append(process_instance(appliance))

summary_df = pd.DataFrame(results)
display(summary_df)
