# Devices Report (BMC Discovery)

This notebook reproduces the DisMAL `devices` report using the Tideway Python library to run Discovery Data API searches.
It reads connection details from `config.yaml`, supports an optional `devices_with_cred` filter,
and writes a CSV under the standard `output_<target>` folder.

> **NOTE:** Due to API limits, this may take a while to run.

## Requirements

We use `tideway` (local package in this repo or PyPI), `pandas`, and `PyYAML`.
Uncomment the following to install in your environment if needed.

In [None]:
# %pip install -q tideway pandas pyyaml

import os, sys, json
from pathlib import Path
import pandas as pd
import yaml
from typing import Any, Dict, List
import os
from datetime import datetime, UTC
import json
# Optional: for safely parsing list-like columns later
from ast import literal_eval


## Select Appliance (optional)

If your `config.yaml` defines multiple appliances under the `appliances:` list,
set `APPLIANCE_NAME` to one of their names (e.g., 'prod' or 'dev') or use the index.
Defaults to the first appliance if neither is set.

In [None]:
# Appliance selection: when APPLIANCE_NAME is set it is used and the index is ignored.
APPLIANCE_NAME = None   # e.g., 'prod' or 'dev'
APPLIANCE_INDEX = 0     # integer index if not using name selection

# Optional filter: a credential UUID; when set, later cells keep only the
# last-discovery rows whose credential matches it
DEVICES_WITH_CRED_UUID = None  # e.g., '7636fe3b4bd69466ab487f0000010700'


## Configuration (from config.yaml)

Reads settings from the nearest `config.yaml` (searching the working directory, then its parents),
including target, token/token_file, API version, and SSL verification preference.
Saves the CSV to `../output_<target>/devices.csv` (or `devices_with_cred.csv`).

In [None]:
def _find_repo_root(start: Path) -> Path:
    """Return the closest directory at or above *start* that holds ``config.yaml``.

    Falls back to ``start.parent`` when no such directory is found.
    """
    candidates = (start, *start.parents)
    hit = next((d for d in candidates if (d / 'config.yaml').exists()), None)
    return start.parent if hit is None else hit

# Locate and parse config.yaml; an empty file yields an empty dict
repo_root = _find_repo_root(Path.cwd())
config_path = repo_root / 'config.yaml'
with open(config_path, 'r') as fh:
    cfg = yaml.safe_load(fh) or {}

# Appliance selection
# A name match is mandatory when APPLIANCE_NAME is set; otherwise the index is
# used, and any failure to resolve it falls back to the first appliance.
apps = cfg.get('appliances') or []
selected = None
if isinstance(apps, list) and apps:
    if APPLIANCE_NAME:
        selected = next((a for a in apps if a.get('name') == APPLIANCE_NAME), None)
        if selected is None:
            raise ValueError(f"No appliance named '{APPLIANCE_NAME}' in config.yaml")
    else:
        try:
            selected = apps[int(APPLIANCE_INDEX)]
        except Exception:
            selected = apps[0]

# Per-appliance values take precedence over top-level config values
target = ((selected or {}).get('target') or cfg.get('target') or '').strip()
if not target:
    raise ValueError('config.yaml missing "target"')

# Token handling: inline token or token file
token = (((selected or {}).get('token') or cfg.get('token') or '').strip())
token_file = (selected or {}).get('token_file') or cfg.get('token_file') or cfg.get('f_token')
if not token and token_file:
    tf_path = Path(token_file)
    # Relative token paths are resolved against the directory holding config.yaml
    if not tf_path.is_absolute():
        tf_path = repo_root / tf_path
    with open(tf_path, 'r') as tf:
        token = tf.read().strip()
if not token:
    raise ValueError('API token not found in config.yaml (token or token_file)')

# Version and SSL
# NOTE(review): bool() of a non-empty string such as "false" is True — confirm
# verify_ssl is always a real YAML boolean.
API_VERSION = str((selected or {}).get('api_version') or cfg.get('api_version') or 'v1.14')
VERIFY_SSL = bool((selected or {}).get('verify_ssl', cfg.get('verify_ssl', True)))

# Output path
# '.', ':' and '/' in the target are replaced so it can be used in a folder name
sanitized = target.replace('.', '_').replace(':', '_').replace('/', '_')
output_dir = repo_root / f'output_{sanitized}'
output_dir.mkdir(parents=True, exist_ok=True)

# Summary of the effective settings (only whether a token is set is printed)
print('Base Host      :', target)
print('API Version    :', API_VERSION)
print('Verify SSL     :', VERIFY_SSL)
print('Output folder  :', output_dir)
print('Token set      :', bool(token))

# Prefer local Tideway package in this repo if available
local_tideway = repo_root / 'Tideway'
if local_tideway.exists():
    sys.path.insert(0, str(local_tideway))

import importlib
tideway = importlib.import_module('tideway')

# The appliance is given the version without leading 'v' characters
API_VERSION_NUM = API_VERSION.lstrip('v')
app = tideway.appliance(target, token, api_version=API_VERSION_NUM, ssl_verify=VERIFY_SSL)
twsearch = app.data()
twcreds = app.credentials()

# Quick probe (optional)
# Failure only prints a warning; the notebook carries on.
try:
    about = app.api_about
    print('Appliance reachable:', about.status_code)
except Exception as e:
    print('Warning: failed to contact appliance /api/about:', e)


## Normalization

In [None]:
# Minimal pandas-based normalizer for Tideway responses
def df_from_tw(resp):
    """Convert a Tideway search/endpoint response into a pandas DataFrame.

    Accepted payload shapes:
    - dict carrying both 'headings' and 'results'
    - list whose first item is the heading row, followed by data rows
    - list of dicts (one per record)
    - any other dict (flattened with ``pd.json_normalize``)
    - Response-like object exposing ``.json()``, which is decoded first

    Anything else, including a payload whose ``.json()`` call raises,
    produces an empty DataFrame.
    """
    payload = resp
    if hasattr(resp, 'json'):
        try:
            payload = resp.json()
        except Exception:
            payload = None

    if isinstance(payload, dict):
        if 'headings' in payload and 'results' in payload:
            return pd.DataFrame(
                payload.get('results') or [],
                columns=payload.get('headings') or [],
            )
        return pd.json_normalize(payload)

    if isinstance(payload, list) and payload:
        first = payload[0]
        if isinstance(first, list):
            header, *body = payload
            return pd.DataFrame(body, columns=header)
        if isinstance(first, dict):
            return pd.DataFrame(payload)

    return pd.DataFrame()


## Cache Builder

In [None]:
from pathlib import Path
from datetime import datetime, UTC
import pandas as pd
import json, hashlib
from ast import literal_eval

# Cache folder, relative to the current working directory
CACHE_DIR = Path(".cache")

def _paths(name: str):
    """Return ``(csv_path, meta_path)`` for the cache entry *name*.

    The cache directory is created if it does not exist yet.
    """
    CACHE_DIR.mkdir(exist_ok=True)
    csv_file, meta_file = (CACHE_DIR / f"{name}{ext}" for ext in (".csv", ".meta.json"))
    return csv_file, meta_file

def _now_utc_iso():
    return datetime.now(UTC).isoformat(timespec="seconds")

def _age_hours(iso_ts: str) -> float:
    try:
        dt = datetime.fromisoformat(iso_ts)
        return (datetime.now(UTC) - dt).total_seconds() / 3600.0
    except Exception:
        return 1e9  # force stale

def _hash(s: str) -> str:
    """Return the first 12 hex characters of the SHA-1 digest of *s* (UTF-8 encoded)."""
    digest = hashlib.sha1(s.encode("utf-8"))
    return digest.hexdigest()[:12]

def get_or_build_df(
    name: str,
    build_fn,                     # () -> pd.DataFrame  (calls API + normalises)
    parse_list_cols=None,         # ['ips_all', ...] if you stored lists as strings
    transform_fn=None,            # (pd.DataFrame) -> pd.DataFrame (post-load tweaks)
    use_cache=True,
    force_refresh=False,
    max_age_hours=None,           # e.g. 12 (if None: never expires)
    query_text: str | None = None, # if provided, ties cache to query hash
    prefer_csv_without_meta=True,
    debug=True):
    """Return a DataFrame from the on-disk CSV cache, rebuilding it when needed.

    Args:
        name: Cache entry name; selects the CSV and meta file via ``_paths``.
        build_fn: Zero-argument callable returning a fresh DataFrame.
        parse_list_cols: Columns whose cells were lists before being written
            to CSV. On cache loads, cells that look like a list literal are
            converted back to lists. Columns missing from the CSV are ignored.
        transform_fn: Optional post-processing applied to the returned frame
            on every path (cache load and rebuild).
        use_cache: When False, always rebuild.
        force_refresh: When True, always rebuild.
        max_age_hours: Rebuild when the cached entry is older than this;
            ``None`` disables expiry.
        query_text: When given, the cache is reused only if the stored query
            hash matches the hash of this text (stripped).
        prefer_csv_without_meta: When the CSV exists but its meta file does
            not, load the CSV as-is and write a reconstructed meta file.
        debug: Print progress messages.

    Returns:
        The cached or freshly built DataFrame, after ``transform_fn``.

    A cached CSV that cannot be parsed, or a meta file that is unreadable or
    not a JSON object, is treated as a cache miss and triggers a rebuild.
    """
    csv_path, meta_path = _paths(name)

    def _debug(msg):
        if debug:
            print(f"[cache:{name}] {msg}")

    def _parse_list_cell(value):
        # Only strings shaped like a list literal are parsed; NaN, numbers and
        # ordinary text pass through untouched.
        if isinstance(value, str) and value.startswith("[") and value.endswith("]"):
            try:
                parsed = literal_eval(value)
            except (ValueError, SyntaxError):
                return value
            if isinstance(parsed, list):
                return parsed
        return value

    def _load_cached():
        # Returns None when the CSV has no parseable columns, which is what
        # read_csv reports for a file written from an empty DataFrame.
        try:
            cached = pd.read_csv(csv_path)
        except pd.errors.EmptyDataError:
            return None
        for col in parse_list_cols or ():
            if col in cached.columns:
                cached[col] = cached[col].map(_parse_list_cell)
        return transform_fn(cached) if transform_fn else cached

    def _write_meta(frame, **extra):
        meta = {
            "saved_at_utc": _now_utc_iso(),
            "row_count": int(len(frame)),
            "columns": list(frame.columns),
        }
        meta.update(extra)
        with open(meta_path, "w") as f:
            json.dump(meta, f, indent=2)

    cache_usable = use_cache and not force_refresh and csv_path.exists()

    if cache_usable and not meta_path.exists():
        # CSV exists but meta missing
        if prefer_csv_without_meta:
            _debug(f"meta missing; loading CSV-only at {csv_path}")
            df = _load_cached()
            if df is not None:
                # Write a minimal meta so future loads are clean.
                # NOTE(review): no query hash is stored here, so a later call
                # that passes query_text will rebuild — confirm this is intended.
                _write_meta(df, note="meta reconstructed from CSV-only cache")
                return df
            _debug("cached CSV unreadable -> rebuild")
    elif cache_usable:
        # Both CSV and meta are present
        try:
            with open(meta_path) as f:
                meta = json.load(f)
        except (OSError, ValueError):
            meta = None
        if not isinstance(meta, dict):
            _debug("meta unreadable -> rebuild")
        elif query_text and meta.get("query_hash") != _hash(query_text.strip()):
            _debug("query hash mismatch -> rebuild")
        elif max_age_hours is not None and _age_hours(meta.get("saved_at_utc", "1970-01-01T00:00:00+00:00")) > max_age_hours:
            _debug("cache expired -> rebuild")
        else:
            _debug(f"loading from cache {csv_path}")
            df = _load_cached()
            if df is not None:
                return df
            _debug("cached CSV unreadable -> rebuild")

    # Rebuild
    _debug("building via API")
    df = build_fn()
    df.to_csv(csv_path, index=False)
    extra = {"query_hash": _hash(query_text.strip())} if query_text else {}
    _write_meta(df, **extra)
    if transform_fn:
        df = transform_fn(df)
    _debug(f"cached -> {csv_path.name}")
    return df

# --- Usage ---
# 1) Use the cache if present, else call the API:
# id_df = get_or_build_df("identities", build_identities_df, use_cache=True, force_refresh=False)

# 2) Force refresh (ignore the cache and re-save):
# id_df = get_or_build_df("identities", build_identities_df, use_cache=True, force_refresh=True)

# 3) Load only from the cache without any API call:
# csv_path, _ = _paths("identities")
# id_df = pd.read_csv(csv_path) if csv_path.exists() else None
# if id_df is None:
#     print("No cache found. Run get_or_build_df(..., force_refresh=True) once to create it.")

## Queries

These TWQL queries mirror the DisMAL devices flow and the optional devices_with_cred lookup.

In [None]:
# devices_with_cred flow
def qry_sessions_for_cred(uuid):
    """Return the TWQL search for SessionResult nodes whose credential equals *uuid*.

    *uuid* is inserted verbatim between single quotes; no escaping is applied.
    """
    return f"""
search SessionResult where credential = '{uuid}'
show
(#Detail:Metadata:DiscoveryAccess:DiscoveryAccess.#Associate:Inference:InferredElement:.name
  or #Detail:Metadata:DiscoveryAccess:DiscoveryAccess.#DiscoveryAccess:DiscoveryAccessResult:DiscoveryResult:DeviceInfo.hostname) as 'device_name',
(kind(#Detail:Metadata:DiscoveryAccess:DiscoveryAccess.#Associate:Inference:InferredElement:)
  or #Detail:Metadata:DiscoveryAccess:DiscoveryAccess.inferred_kind
  or #Detail:Metadata:DiscoveryAccess:DiscoveryAccess.#DiscoveryAccess:DiscoveryAccessResult:DiscoveryResult:DeviceInfo.kind) as 'inferred_node',
#Detail:Metadata:DiscoveryAccess:DiscoveryAccess.endpoint as 'scanned_endpoint',
credential as 'credential',
success as 'success',
message as 'message',
friendlyTime(time_index) as 'date_time',
#Detail:Metadata:DiscoveryAccess:DiscoveryAccess.#id as 'node_id'
"""

def qry_di_for_cred(uuid):
    """Return the TWQL search for DeviceInfo nodes that reference credential *uuid*.

    The query matches *uuid* against ``last_credential``, ``last_slave`` and
    ``__preserved_last_credential``. *uuid* is inserted verbatim between
    single quotes; no escaping is applied.
    """
    return f"""
search DeviceInfo where last_credential = '{uuid}' or last_slave = '{uuid}' or __preserved_last_credential = '{uuid}'
ORDER BY hostname
show
(hostname or sysname) as 'device_name',
kind as 'inferred_node',
#DiscoveryResult:DiscoveryAccessResult:DiscoveryResult:DiscoveryAccess.endpoint as 'scanned_endpoint',
#DiscoveryResult:DiscoveryAccessResult:DiscoveryResult:DiscoveryAccess.#id as 'da_node_id',
#DiscoveryResult:DiscoveryAccessResult:DiscoveryResult:DiscoveryAccess.reason as 'message',
method_success as 'success',
method_failure as 'failure',
friendlyTime(request_time) as 'date_time'
"""


## Run Report

When `DEVICES_WITH_CRED_UUID` is set, the last-discovery rows are limited to those whose last credential matches that UUID; otherwise the consolidated devices report is generated.

In [None]:
# --- Setup ---
print(f"- Target: {target}")

# Optional credential filter (UUID or None)
cred_uuid = DEVICES_WITH_CRED_UUID if DEVICES_WITH_CRED_UUID else None

# Load credentials as DataFrame (replace custom mapping with pandas)
try:
    creds_payload = twcreds.get_vault_credentials  # Response-like
    creds_list = creds_payload.json() if hasattr(creds_payload, "json") else (creds_payload or [])
except Exception as exc:
    # Best effort: carry on without credentials, but report why
    print(f"Warning: failed to load vault credentials: {exc}")
    creds_list = []

creds_df = pd.DataFrame(creds_list)

if creds_df.empty:
    print("Issue retrieving credentials!")
else:
    # sample() raises ValueError when asked for more rows than exist
    display(creds_df.sample(min(5, len(creds_df))))

In [None]:
# Short UUID and coalesced username
# short_uuid is the lower-cased text after the last '/' of the uuid column
if 'uuid' in creds_df.columns:
    uuid_values = creds_df['uuid']
else:
    # No uuid column (e.g. credentials failed to load): use blanks instead of failing
    uuid_values = pd.Series('', index=creds_df.index, dtype=object)
creds_df['short_uuid'] = uuid_values.astype(str).str.split('/').str[-1].str.lower()

# username = first non-null value among the identity columns that are present
user_cols = ['username', 'snmp.v3.securityname', 'aws.access_key_id', 'azure.application_id']
username = pd.Series(None, index=creds_df.index, dtype=object)
for col in user_cols:
    if col in creds_df.columns:
        username = username.where(username.notna(), creds_df[col])
creds_df['username'] = username

# Later cells select 'label' unconditionally, so make sure it exists
if 'label' not in creds_df.columns:
    creds_df['label'] = None

print(f"- Credentials loaded: {len(creds_df)}")
# sample() raises ValueError when asked for more rows than exist
display(creds_df[['short_uuid','label','username']].sample(min(5, len(creds_df))))

In [None]:
# Credential filter
# Normalises the requested UUID the same way as creds_df['short_uuid']
# (text after the last '/', lower-cased) and prints the matching credential.
if cred_uuid:
    want = str(cred_uuid).split('/')[-1].lower()
    # First credential row with that short UUID, if any
    detail = creds_df.loc[creds_df['short_uuid'] == want, ['short_uuid','label','username']].head(1)
    if not detail.empty:
        row = detail.iloc[0]
        print(f"- Credential UUID: {row['short_uuid']}")
        print(f"- Label: {row.get('label')}")
    else:
        print(f"- Credential UUID: {want} (not found in creds_df)")
    # keep normalized short UUID for later filtering
    cred_uuid = want


In [None]:
# Build Identities

# One row per DiscoveryAccess: names, FQDNs and IP addresses reachable from it
qry_device_ids = '''
                    search DiscoveryAccess
                    show
                    #::InferredElement:.name as 'InferredElement.name',
                    #::InferredElement:.hostname as 'InferredElement.hostname',
                    #::InferredElement:.local_fqdn as 'InferredElement.local_fqdn',
                    #::InferredElement:.sysname as 'InferredElement.sysname',
                    endpoint as 'DiscoveryAccess.endpoint',
                    #DiscoveryAccess:Endpoint:Endpoint:Endpoint.endpoint as 'Endpoint.endpoint',
                    #DiscoveryAccess:DiscoveryAccessResult:DiscoveryResult:DiscoveredIPAddressList.#List:List:Member:DiscoveredIPAddress.ip_addr as 'DiscoveredIPAddress.ip_addr',
                    #::InferredElement:.__all_ip_addrs as 'InferredElement.__all_ip_addrs',
                    #::InferredElement:.#DeviceWithInterface:DeviceInterface:InterfaceOfDevice:NetworkInterface.ip_addr as 'NetworkInterface.ip_addr',
                    #::InferredElement:.#DeviceWithInterface:DeviceInterface:InterfaceOfDevice:NetworkInterface.fqdns as 'NetworkInterface.fqdns'
                    '''

def build_identities_df():
    """Run the identities query against the appliance and return it as a DataFrame."""
    # NOTE(review): limit=500 is passed to a single search call — confirm the
    # library pages through all results rather than returning only the first 500.
    resp = twsearch.search({'query': qry_device_ids}, format='object', limit=500)
    return df_from_tw(resp)

def _restore_list_cells(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with list-literal strings converted back to lists.

    List cells are written to the CSV cache as their text form, so a cached
    load returns strings such as "['a', 'b']". Cells that are not list-shaped
    strings, or that fail to parse, are left unchanged.
    """
    def _parse(value):
        if isinstance(value, str) and value.startswith('[') and value.endswith(']'):
            try:
                parsed = literal_eval(value)
            except (ValueError, SyntaxError):
                return value
            if isinstance(parsed, list):
                return parsed
        return value

    restored = df.copy()
    for col in restored.columns:
        restored[col] = restored[col].map(_parse)
    return restored

id_df = get_or_build_df(
    name="identities",
    build_fn=build_identities_df,
    transform_fn=_restore_list_cells,  # same cell types from cache and from API
    use_cache=True,
    force_refresh=False,
    max_age_hours=24,        # refresh daily
    query_text=qry_device_ids
)
print(f"- Identity rows: {len(id_df)}")

In [None]:
# Check random rows (sample() raises ValueError when asked for more rows than exist)
display(id_df.sample(min(5, len(id_df))))

In [None]:
# Aggregate names and IPs per originating DiscoveryAccess.endpoint using pandas
# Columns whose values are collected into the per-row 'ips' list
ip_fields = [
    'DiscoveryAccess.endpoint', 'Endpoint.endpoint',
    'DiscoveredIPAddress.ip_addr', 'InferredElement.__all_ip_addrs',
    'NetworkInterface.ip_addr'
]
# Columns whose values are collected into the per-row 'names' list
name_fields = [
    'InferredElement.name', 'InferredElement.hostname',
    'InferredElement.local_fqdn', 'InferredElement.sysname',
    'NetworkInterface.fqdns'
]
# Make sure every expected column exists so the selections below cannot fail
for col in ip_fields + name_fields + ['DiscoveryAccess.endpoint']:
    if col not in id_df.columns:
        id_df[col] = None
# Wrap scalars in a list, keep lists as they are, map missing/empty values to []
to_list = lambda x: x if isinstance(x, list) else ([] if pd.isna(x) or x == '' else [x])
ips_lists = id_df[ip_fields].map(to_list)
names_lists = id_df[name_fields].map(to_list)
# Per row: sorted, de-duplicated string values across all source columns
id_df['ips'] = ips_lists.apply(lambda row: sorted({str(v) for lst in row for v in lst if v is not None}), axis=1)
id_df['names'] = names_lists.apply(lambda row: sorted({str(v) for lst in row for v in lst if v is not None}), axis=1)

# Per endpoint: union of the row-level lists, again sorted and de-duplicated
identities_df = (
    id_df[['DiscoveryAccess.endpoint','ips','names']]
    .groupby('DiscoveryAccess.endpoint', as_index=False)
    .agg({
        'ips': lambda s: sorted({v for lst in s for v in lst}),
        'names': lambda s: sorted({v for lst in s for v in lst}),
    })
    .rename(columns={
        'DiscoveryAccess.endpoint': 'Identities.endpoint',
        'ips': 'list_of_ips',
        'names': 'list_of_names',
    })
)

print(f"- Unique endpoints: {len(identities_df)}")
display(identities_df.head(5))

In [None]:
# Trimmed last discovery access view with key fields needed for devices summary
# The query text is also hashed for the cache, so editing it forces a rebuild.
qry_last_disco = '''
search DiscoveryAccess where endtime
ORDER BY discovery_endtime DESC
show
endpoint as 'DiscoveryAccess.endpoint',
#DiscoveryAccess:DiscoveryAccessResult:DiscoveryResult:DeviceInfo.hostname as 'DeviceInfo.hostname',
#Member:List:List:DiscoveryRun.label as 'DiscoveryRun.label',
friendlyTime(discovery_starttime) as 'DiscoveryAccess.scan_starttime',
friendlyTime(discovery_endtime) as 'DiscoveryAccess.scan_endtime',
discovery_endtime as 'DiscoveryAccess.scan_endtime_raw',
whenWasThat(discovery_endtime) as 'DiscoveryAccess.when_last_scan',
(#DiscoveryAccess:DiscoveryAccessResult:DiscoveryResult:DeviceInfo.last_access_method in ['windows', 'rcmd']
    and #DiscoveryAccess:DiscoveryAccessResult:DiscoveryResult:DeviceInfo.last_slave
        or #DiscoveryAccess:DiscoveryAccessResult:DiscoveryResult:DeviceInfo.probed_os and 'Probe'
            or #DiscoveryAccess:DiscoveryAccessResult:DiscoveryResult:DeviceInfo.last_access_method) as 'DiscoveryAccess.current_access',
(kind(#Associate:Inference:InferredElement:)
    or inferred_kind
        or #DiscoveryAccess:DiscoveryAccessResult:DiscoveryResult:DeviceInfo.kind) as 'DiscoveryAccess.node_kind',
(#DiscoveryAccess:Metadata:Detail:SessionResult.credential and success
    or #DiscoveryAccess:DiscoveryAccessResult:DiscoveryResult:DeviceInfo.last_credential
    or #DiscoveryAccess:DiscoveryAccessResult:DiscoveryResult:DeviceInfo.last_slave
    or #DiscoveryAccess:DiscoveryAccessResult:DiscoveryResult:DeviceInfo.__preserved_last_credential) as 'DeviceInfo.last_credential',
#DiscoveryAccess:DiscoveryAccessResult:DiscoveryResult:DeviceInfo.os_version as 'DeviceInfo.os_version',
(nodecount(traverse DiscoveryAccess:DiscoveryAccessResult:DiscoveryResult:DeviceInfo
  traverse flags(include_destroyed) Primary:Inference:InferredElement: where not destroyed(#)) > 0) as 'DiscoveryAccess.host_node_updated',
end_state as 'DiscoveryAccess.end_state',
result as 'DiscoveryAccess.result'
'''

#resp_ld = twsearch.search({'query': qry_last_disco}, format='object', limit=500)
#ld_df = df_from_tw(resp_ld)

def build_last_discovery_df():
    """Run the last-discovery query against the appliance and return it as a DataFrame."""
    # NOTE(review): limit=500 is passed to a single search call — confirm the
    # library pages through all results rather than returning only the first 500.
    resp = twsearch.search({'query': qry_last_disco}, format='object', limit=500)
    return df_from_tw(resp)

def normalise_last_discovery(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with a ``last_cred_short`` column added.

    ``last_cred_short`` is the lower-cased text after the last ``/`` of
    ``DeviceInfo.last_credential``. Rows with no credential keep a missing
    value instead of the literal strings 'none' or 'nan'. The source column
    is created (all missing) when absent. The input frame is not modified.
    """
    df = df.copy()
    if 'DeviceInfo.last_credential' not in df.columns:
        df['DeviceInfo.last_credential'] = None
    raw = df['DeviceInfo.last_credential']
    short = raw.astype(str).str.split('/').str[-1].str.lower()
    # astype(str) stringifies None/NaN; mask those rows back to missing
    df['last_cred_short'] = short.where(raw.notna())
    return df

# Load the last-discovery table from cache or rebuild it; the transform adds
# 'last_cred_short' whether the data came from the cache or from the API
ld_df = get_or_build_df(
    name="last_discovery",
    build_fn=build_last_discovery_df,
    transform_fn=normalise_last_discovery,
    use_cache=True,
    force_refresh=False,
    max_age_hours=12,        # refresh twice a day
    query_text=qry_last_disco
)

# Optional filter by credential
# cred_uuid is compared against the normalised short UUID of each row
if cred_uuid:
    ld_df = ld_df[ld_df['last_cred_short'] == cred_uuid].copy()

print(f"- Last Discovery rows: {len(ld_df)}")

In [None]:
# View results (sample() raises ValueError when asked for more rows than exist)

display(ld_df.sample(min(5, len(ld_df))))

In [None]:
# Sanity Check for dupes
# Counts how often each endpoint appears in ld_df and keeps those seen more than once

dup_counts = ld_df['DiscoveryAccess.endpoint'].value_counts()
dup_counts = dup_counts[dup_counts > 1]
print(f"Endpoints with duplicates: {len(dup_counts)}")

# Inspect a few duplicated endpoints
# (first five entries of dup_counts; rows ordered by endpoint, then raw end time descending)
sample_dups = dup_counts.index[:5]
display(ld_df[ld_df['DiscoveryAccess.endpoint'].isin(sample_dups)]
        .sort_values(['DiscoveryAccess.endpoint','DiscoveryAccess.scan_endtime_raw'], ascending=[True, False]))

In [None]:
# Merge identities with last discovery by endpoint

# Explode identity IPs and join to discovery rows
# Left join: every identity IP is kept, with missing discovery columns where
# no DiscoveryAccess.endpoint equals that IP.
# NOTE(review): with a credential filter active, unmatched identities still
# appear here as blank rows — confirm that is wanted.
ip_map = identities_df[['Identities.endpoint','list_of_ips']].explode('list_of_ips')
merged = ip_map.merge(ld_df, left_on='list_of_ips', right_on='DiscoveryAccess.endpoint', how='left')

# Enrich with credential label/username for convenience
merged = merged.merge(
    creds_df[['short_uuid','label','username']],
    left_on='last_cred_short',
    right_on='short_uuid',
    how='left'
)

# Numeric scan end time for ranking
# Values that cannot be converted to a number become -1.
# NOTE(review): presumably scan_endtime_raw is numeric; if it is a date string
# every row ranks equally — verify against the API output.
merged['scan_end_raw'] = pd.to_numeric(merged.get('DiscoveryAccess.scan_endtime_raw'), errors='coerce').fillna(-1)

print(f"- Merged rows: {len(merged)}")

In [None]:
# Review some samples (sample() raises ValueError when asked for more rows than exist)

display(merged.sample(min(5, len(merged))))

# ip_map holds one IP per row after explode(), so the IP counts are taken from
# the per-endpoint lists in identities_df instead
ips_per_endpoint = identities_df.set_index('Identities.endpoint')['list_of_ips'].apply(
    lambda x: len(x) if isinstance(x, list) else (0 if pd.isna(x) else 1)
)

# Keep only endpoints whose list_of_ips has >1 values
multi_ip_mask = identities_df['list_of_ips'].apply(lambda x: isinstance(x, list) and len(x) > 1)
multi_ip_rows = identities_df.loc[multi_ip_mask, ['Identities.endpoint', 'list_of_ips']]

print(f"Rows with multiple IPs: {len(multi_ip_rows)}")
display(multi_ip_rows.head())

# Number of IPs known for the endpoint each exploded row belongs to
ip_map['ip_count'] = ip_map['Identities.endpoint'].map(ips_per_endpoint)

# Distribution of IPs per endpoint
print(ips_per_endpoint.value_counts())

In [None]:
print("\n=== Aggregate per originating endpoint and select latest/last successful ===")


def _unique_sorted(values):
    """Return the distinct non-null values of a Series as a sorted list of strings."""
    return sorted(set(values.dropna().astype(str)))


def _is_true(value):
    """Interpret a host_node_updated cell as a strict boolean.

    Missing values are False (``astype(bool)`` would turn NaN into True);
    strings count as True only when they read 'true', case-insensitively.
    """
    if isinstance(value, str):
        return value.strip().lower() == 'true'
    if isinstance(value, (list, tuple)):
        return any(_is_true(item) for item in value)
    return bool(value) if pd.notna(value) else False


# Credential display text: "<label> (<short uuid>)" when both are known,
# otherwise the bare short UUID. Built before grouping so the groupby object
# is created from the complete frame.
has_label = merged['label'].notna() & merged['last_cred_short'].notna()
labelled = merged['label'].astype(str) + ' (' + merged['last_cred_short'].astype(str) + ')'
merged['cred_display'] = labelled.where(has_label, merged['last_cred_short'])

grp = merged.groupby('Identities.endpoint', dropna=False)

# Aggregated sets
agg_device_names = grp['DeviceInfo.hostname'].apply(_unique_sorted).rename('all_device_names')
agg_endpoints = grp['DiscoveryAccess.endpoint'].apply(_unique_sorted).rename('all_endpoints')
agg_runs = grp['DiscoveryRun.label'].apply(_unique_sorted).rename('all_discovery_runs')
agg_creds = grp['cred_display'].apply(_unique_sorted).rename('all_credentials_used')

# Latest rows: per originating endpoint, the row with the highest scan end time
idx_latest = grp['scan_end_raw'].idxmax()
latest = merged.loc[idx_latest, [
    'Identities.endpoint',
    'DiscoveryAccess.endpoint',
    'DeviceInfo.hostname',
    'DiscoveryAccess.node_kind',
    'last_cred_short',
    'label',
    'username',
    'DiscoveryAccess.scan_starttime',
    'DiscoveryRun.label',
    'DiscoveryAccess.end_state',
    'DiscoveryAccess.result',
    'DiscoveryAccess.current_access'
]].rename(columns={
    'DiscoveryAccess.endpoint': 'last_scanned_ip',
    'DeviceInfo.hostname': 'last_identity',
    'DiscoveryAccess.node_kind': 'last_kind',
    'last_cred_short': 'last_credential',
    'label': 'last_credential_label',
    'username': 'last_credential_username',
    'DiscoveryAccess.scan_starttime': 'last_start_time',
    'DiscoveryRun.label': 'last_run',
    'DiscoveryAccess.end_state': 'last_endstate',
    'DiscoveryAccess.result': 'last_result',
    'DiscoveryAccess.current_access': 'last_access_method',
})

# Latest successful rows: only rows whose host_node_updated flag is really
# true, so identities without a matching discovery row are excluded
is_success = merged['DiscoveryAccess.host_node_updated'].map(_is_true).astype(bool)
succ = merged[is_success]
idx_succ = succ.groupby('Identities.endpoint')['scan_end_raw'].idxmax()
last_succ = succ.loc[idx_succ, [
    'Identities.endpoint',
    'DeviceInfo.hostname',
    'DiscoveryAccess.endpoint',
    'last_cred_short',
    'label',
    'username',
    'DiscoveryAccess.scan_starttime',
    'DiscoveryRun.label',
    'DiscoveryAccess.end_state'
]].rename(columns={
    'DeviceInfo.hostname': 'last_successful_identity',
    'DiscoveryAccess.endpoint': 'last_successful_ip',
    'last_cred_short': 'last_successful_credential',
    'label': 'last_successful_credential_label',
    'username': 'last_successful_credential_username',
    'DiscoveryAccess.scan_starttime': 'last_successful_start_time',
    'DiscoveryRun.label': 'last_successful_run',
    'DiscoveryAccess.end_state': 'last_successful_endstate',
})

# Assemble final DataFrame
df_out = (
    latest
    .merge(agg_device_names, on='Identities.endpoint', how='left')
    .merge(agg_endpoints, on='Identities.endpoint', how='left')
    .merge(agg_runs, on='Identities.endpoint', how='left')
    .merge(agg_creds, on='Identities.endpoint', how='left')
    .merge(last_succ, on='Identities.endpoint', how='left')
    .merge(identities_df[['Identities.endpoint', 'list_of_names']], on='Identities.endpoint', how='left')
)

# Fallback last_identity to a name seen in identities if missing.
# A plain loop over the two columns also works when df_out has no rows,
# where a row-wise apply() would not return a Series.
df_out['last_identity'] = [
    ident if pd.notna(ident) and str(ident).strip()
    else (names[0] if isinstance(names, list) and names else None)
    for ident, names in zip(df_out['last_identity'], df_out['list_of_names'])
]

# Reorder columns to original schema
df_out = df_out[[
    'last_scanned_ip',
    'last_identity',
    'last_kind',
    'all_device_names',
    'all_endpoints',
    'all_credentials_used',
    'all_discovery_runs',
    'last_credential',
    'last_credential_label',
    'last_credential_username',
    'last_start_time',
    'last_run',
    'last_endstate',
    'last_result',
    'last_access_method',
    'last_successful_identity',
    'last_successful_ip',
    'last_successful_credential',
    'last_successful_credential_label',
    'last_successful_credential_username',
    'last_successful_start_time',
    'last_successful_run',
    'last_successful_endstate',
]]

# Add Discovery Instance col and sample
df_out.insert(0, 'Discovery Instance', target)
# A credential-filtered run gets its own file name so it does not overwrite
# the full devices report
REPORT_NAME = 'devices_with_cred' if cred_uuid else 'devices'
print(f"- Output shape: {df_out.shape}")
display(df_out.head(5))


## Save to CSV

Writes the report to the standard output folder in the project root.

In [None]:
# Write the report as <REPORT_NAME>.csv inside output_dir, without the index column
OUTPUT_CSV = str(output_dir / f'{REPORT_NAME}.csv')
df_out.to_csv(OUTPUT_CSV, index=False)
print(f'Saved to {OUTPUT_CSV}')
