# Discovery Analysis Report (BMC Discovery)

This notebook reproduces the DisMAL `discovery_analysis` report but runs independently of DisMAL modules.
It uses the Tideway Python library for queries, is broken into small sections,
and saves the final CSV to the standard `output_<target>` folder.

## Requirements

Uncomment the `%pip install` line at the top of the next cell to install dependencies in your environment if needed.

> **NOTE:** This currently takes a long time to run over the API when there are many DiscoveryAccess nodes. That is a consequence of how the API and search are designed, and there is not much of a way around it at present.

In [None]:
# %pip install -q tideway pandas pyyaml

import os, sys, json, datetime, importlib
from pathlib import Path
from typing import Any, Dict, List
import pandas as pd
import numpy as np
import yaml
from collections.abc import Mapping, Sequence
from pandas.errors import EmptyDataError
import hashlib


## Select Appliance (optional)

If `config.yaml` has multiple appliances, set by name or index. Defaults to the first.

In [None]:
# Appliance selectors read by the configuration cell. A non-empty APPLIANCE_NAME
# is matched against each appliance's `name`; otherwise APPLIANCE_INDEX is used
# as a position in the `appliances` list from config.yaml.
APPLIANCE_NAME = None   # e.g., 'prod' or 'dev'
APPLIANCE_INDEX = 0     # integer index if not using name selection


## Configuration (from config.yaml)

Finds `config.yaml` by searching the current working directory and then each of its parents, and reads `target`, token/token_file, API version, and SSL flag from it.
Saves the CSV to `output_<target>/discovery_analysis.csv`.

In [None]:
def _find_repo_root(start: Path) -> Path:
    """Return the closest directory at or above ``start`` containing ``config.yaml``.

    ``start`` itself is checked first, then each parent in turn. When no
    directory on that chain holds a ``config.yaml``, ``start.parent`` is
    returned as the fallback.
    """
    candidates = (start, *start.parents)
    match = next((folder for folder in candidates if (folder / 'config.yaml').exists()), None)
    return start.parent if match is None else match

# Locate and load config.yaml; an empty file yields an empty dict.
repo_root = _find_repo_root(Path.cwd())
config_path = repo_root / 'config.yaml'
with open(config_path, 'r') as fh:
    cfg = yaml.safe_load(fh) or {}

# Pick one appliance entry: by name when APPLIANCE_NAME is set, otherwise by
# index, falling back to the first entry if the index cannot be used.
apps = cfg.get('appliances') or []
selected = None
if isinstance(apps, list) and apps:
    if APPLIANCE_NAME:
        selected = next((a for a in apps if a.get('name') == APPLIANCE_NAME), None)
        if selected is None:
            raise ValueError(f"No appliance named '{APPLIANCE_NAME}' in config.yaml")
    else:
        try:
            selected = apps[int(APPLIANCE_INDEX)]
        except Exception:
            selected = apps[0]

# Per-appliance settings win over the top-level keys of config.yaml.
target = ((selected or {}).get('target') or cfg.get('target') or '').strip()
if not target:
    raise ValueError('config.yaml missing "target"')

# The token may be given inline or through a file; a relative token file path
# is resolved against the directory that holds config.yaml.
token = (((selected or {}).get('token') or cfg.get('token') or '').strip())
token_file = (selected or {}).get('token_file') or cfg.get('token_file') or cfg.get('f_token')
if not token and token_file:
    tf_path = Path(token_file)
    if not tf_path.is_absolute():
        tf_path = repo_root / tf_path
    with open(tf_path, 'r') as tf:
        token = tf.read().strip()
if not token:
    raise ValueError('API token not found in config.yaml (token or token_file)')

API_VERSION = str((selected or {}).get('api_version') or cfg.get('api_version') or 'v1.14')
# NOTE(review): bool() of a non-empty string is True, so a quoted "false" in
# config.yaml would still enable verification - confirm the value is a YAML boolean.
VERIFY_SSL = bool((selected or {}).get('verify_ssl', cfg.get('verify_ssl', True)))

# Build a filesystem-safe folder name from the target and create it if needed.
sanitized = target.replace('.', '_').replace(':', '_').replace('/', '_')
output_dir = repo_root / f'output_{sanitized}'
output_dir.mkdir(parents=True, exist_ok=True)

print('Base Host     :', target)
print('API Version   :', API_VERSION)
print('Verify SSL    :', VERIFY_SSL)
print('Output folder :', output_dir)

# Prefer local Tideway package if present
local_tideway = repo_root / 'Tideway'
if local_tideway.exists():
    sys.path.insert(0, str(local_tideway))
tideway = importlib.import_module('tideway')
# The appliance constructor receives the version without its leading 'v' characters.
API_VERSION_NUM = API_VERSION.lstrip('v')
app = tideway.appliance(target, token, api_version=API_VERSION_NUM, ssl_verify=VERIFY_SSL)
twsearch = app.data()
twcreds = app.credentials()
# Connectivity probe: a failure is reported as a warning and does not stop the notebook.
try:
    about = app.api_about
    print('Appliance reachable:', about.status_code)
except Exception as e:
    print('Warning: failed to contact appliance /api/about:', e)


## Helpers

Utility helpers for Tideway result normalization, identity building, and credential mapping.

In [None]:
def list_table_to_json(table_like):
    """Convert a header-plus-rows table (list of lists) into a list of dicts.

    The first inner list supplies the keys; every following entry is zipped
    against it. Anything that is not a non-empty list whose first element is a
    list yields ``[]``. Rows that cannot be zipped into a dict are skipped.
    """
    if not table_like:
        return []
    if not (isinstance(table_like, list) and isinstance(table_like[0], list)):
        return []

    header, *body = table_like
    records = []
    for row in body:
        try:
            record = dict(zip(header, row))
        except Exception:
            # Best effort: a malformed row is dropped rather than aborting the table.
            continue
        records.append(record)
    return records

def to_rows(payload):
    """Normalise a search result into a list of row dicts.

    Accepted shapes, either passed directly or returned by ``payload.json()``:
    a list of dicts (returned as is), a list of lists with a header row, or a
    dict holding both ``headings`` and ``results``. Every other shape, and a
    ``json()`` call that raises, produces ``[]``.
    """
    data = payload
    if not isinstance(data, list) and hasattr(data, 'json'):
        # Response-like object: decode it once, then treat the body like any other payload.
        try:
            data = data.json()
        except Exception:
            return []

    if isinstance(data, list):
        if not data:
            return []
        first = data[0]
        if isinstance(first, list):
            return list_table_to_json(data)
        return data if isinstance(first, dict) else []

    if isinstance(data, dict) and 'results' in data and 'headings' in data:
        table = [data['headings']]
        table += list(data.get('results') or [])
        return list_table_to_json(table)

    return []

def tw_search_all(search, query: str, limit: int = 500):
    """Run ``query`` through ``search.search`` and return the rows as dicts.

    The call asks for ``format='object'`` and forwards ``limit``; the response
    is normalised with ``to_rows``.

    NOTE(review): whether one call returns every matching row or only the first
    ``limit`` rows depends on the Tideway client - confirm.
    """
    request = {'query': query}
    response = search.search(request, format='object', limit=limit)
    return to_rows(response)

def flatten_list(value):
    """Return ``value`` as a list, flattening one level of nested lists.

    ``None`` becomes ``[]`` and any non-list value is wrapped in a single-item
    list. Only ``list`` instances are expanded, and only one level deep.
    """
    if value is None:
        return []
    if not isinstance(value, list):
        return [value]

    flat = []
    for item in value:
        flat.extend(item if isinstance(item, list) else [item])
    return flat

def sort_unique(items):
    """Return the distinct non-``None`` entries of ``items`` in sorted order."""
    distinct = {item for item in items if item is not None}
    return sorted(distinct)

def get_credential_map(twcreds_handle):
    """Map each vault credential's short UUID to its label and login name.

    Reads ``twcreds_handle.get_vault_credentials`` and decodes it with
    ``.json()``; a decoding failure is treated as an empty list. The short UUID
    is the last ``/``-separated segment of the ``uuid`` field. The login name is
    the first truthy value among the username-like fields listed below.
    """
    response = twcreds_handle.get_vault_credentials
    try:
        entries = response.json()
    except Exception:
        entries = []

    login_fields = (
        'username',
        'snmp.v3.securityname',
        'aws.access_key_id',
        'azure.application_id',
    )

    lookup = {}
    for entry in entries or []:
        if not isinstance(entry, dict):
            continue
        short_id = str(entry.get('uuid') or '').split('/')[-1]
        login = None
        for field in login_fields:
            login = entry.get(field)
            if login:
                break
        lookup[short_id] = {'label': entry.get('label'), 'username': login}
    return lookup

def build_identities(id_rows: pd.DataFrame):
    """Collapse identity rows into one record per endpoint.

    Each row contributes its IP addresses and names to the endpoint named by
    ``DiscoveryAccess.endpoint`` (or ``Endpoint.endpoint`` when the former is
    missing). Only the identity columns actually present in ``id_rows`` are
    used.

    Missing cells are ignored: ``None``, ``NaN``/``pd.NA`` and empty values are
    never used as an endpoint key and never added to the IP or name sets.
    Without that filter a ``NaN`` (truthy in Python) could become an endpoint
    key or end up beside strings in a set, which makes sorting raise
    ``TypeError``.

    Cells are flattened one level when they are ``list`` objects; any other
    value, including a string, counts as a single value.

    Args:
        id_rows: DataFrame of identity rows, or ``None``.

    Returns:
        A list of dicts with keys ``originating_endpoint``, ``list_of_ips`` and
        ``list_of_names`` (both lists sorted and de-duplicated), in order of
        first appearance of each endpoint. ``[]`` for ``None`` or empty input.
    """
    if id_rows is None or id_rows.empty:
        return []

    endpoint_fields = ('DiscoveryAccess.endpoint', 'Endpoint.endpoint')
    ip_candidates = (
        'DiscoveredIPAddress.ip_addr',
        'InferredElement.__all_ip_addrs',
        'NetworkInterface.ip_addr',
    )
    name_candidates = (
        'InferredElement.name',
        'InferredElement.hostname',
        'InferredElement.local_fqdn',
        'InferredElement.sysname',
        'NetworkInterface.fqdns',
    )

    def _usable(value):
        # NaN compares unequal to itself; it is truthy, so it must be rejected explicitly.
        if value is None or value is pd.NA:
            return False
        if isinstance(value, float) and value != value:
            return False
        return bool(value)

    def _values(raw):
        # Flatten one level of nesting, then drop missing/empty entries.
        items = raw if isinstance(raw, list) else [raw]
        flat = []
        for item in items:
            flat.extend(item if isinstance(item, list) else [item])
        return [v for v in flat if _usable(v)]

    # Restrict to the relevant columns that are present in this extract.
    wanted = [*endpoint_fields, *ip_candidates, *name_candidates]
    df = id_rows[[c for c in wanted if c in id_rows.columns]].copy()
    ip_fields = [f for f in ip_candidates if f in df.columns]
    name_fields = [f for f in name_candidates if f in df.columns]

    endpoint_map: Dict[str, Dict[str, set]] = {}
    for rec in df.to_dict('records'):
        ep = next((rec.get(f) for f in endpoint_fields if _usable(rec.get(f))), None)
        if ep is None:
            continue
        data = endpoint_map.setdefault(ep, {'ips': set(), 'names': set()})
        for f in ip_fields:
            data['ips'].update(_values(rec.get(f)))
        for f in name_fields:
            data['names'].update(_values(rec.get(f)))

    return [
        {
            'originating_endpoint': ep,
            'list_of_ips': sorted(sets['ips']),
            'list_of_names': sorted(sets['names']),
        }
        for ep, sets in endpoint_map.items()
    ]

def calc_when(ts: datetime.datetime):
    """Describe how long ago ``ts`` was, as one of a fixed set of age labels.

    The age is measured against ``datetime.now`` in the timestamp's own
    timezone (naive timestamps are compared with the naive local time), so both
    naive and aware values are accepted.

    The previous implementation binned with ``pd.cut`` using a final bin only
    one minute wide, so ages beyond a year were left unlabelled; here the last
    bucket is open-ended. The first boundary is 60 minutes, matching its label.

    Args:
        ts: The timestamp to classify, or ``None``.

    Returns:
        The label string, or ``None`` when ``ts`` is ``None`` or lies in the
        future.
    """
    if ts is None:
        return None

    # (exclusive upper bound in minutes, label), in ascending order.
    buckets = (
        (60, 'Less than 60 minutes ago'),
        (1440, 'Less than 24 hours ago'),
        (10080, 'Less than 7 days ago'),
        (43830, 'Less than 1 month ago'),
        (131487, 'Less than 3 months ago'),
        (262974, 'Less than 6 months ago'),
        (525949, 'Less than 12 months ago'),
    )

    elapsed = datetime.datetime.now(ts.tzinfo) - ts
    minutes = elapsed.total_seconds() / 60
    if minutes < 0:
        # A timestamp ahead of the local clock has no meaningful age bucket.
        return None
    for upper_bound, label in buckets:
        if minutes < upper_bound:
            return label
    return 'Over a year ago'


## Cache Builder

In [None]:
# Cache folder, relative to the current working directory.
CACHE_DIR = Path(".cache")


def _paths(name: str):
    """Return ``(csv_path, meta_path)`` for cache entry ``name``.

    The cache directory is created on demand as a side effect.
    """
    CACHE_DIR.mkdir(exist_ok=True)
    data_file = CACHE_DIR / f"{name}.csv"
    meta_file = CACHE_DIR / f"{name}.meta.json"
    return data_file, meta_file

def _now_utc_iso():
    """Return the current UTC time as an ISO-8601 string with second precision."""
    current = datetime.datetime.now(tz=datetime.timezone.utc)
    return current.isoformat(timespec="seconds")

def _age_hours(iso_ts: str) -> float:
    """Return the hours elapsed since the ISO timestamp ``iso_ts``.

    Any failure to parse the value or to subtract it from the current aware UTC
    time yields ``1e9`` so that the caller treats the entry as stale.
    """
    try:
        saved_at = datetime.datetime.fromisoformat(iso_ts)
        elapsed = datetime.datetime.now(datetime.timezone.utc) - saved_at
    except Exception:
        return 1e9  # force stale
    return elapsed.total_seconds() / 3600.0

def _hash(s: str) -> str:
    """Return the first 12 hex characters of the SHA-1 digest of ``s`` (UTF-8 encoded)."""
    digest = hashlib.sha1(s.encode("utf-8"))
    return digest.hexdigest()[:12]

def get_or_build_df(
    name: str,
    build_fn,                      # () -> pd.DataFrame | None
    parse_list_cols=None,          # kept for compatibility (unused here)
    transform_fn=None,             # (pd.DataFrame) -> pd.DataFrame
    use_cache=True,
    force_refresh=False,
    max_age_hours=None,            # None = never expires
    query_text: str | None = None, # ties cache to query hash
    prefer_csv_without_meta=True,
    expected_columns: list[str] | None = None,
    trust_existing_csv=True,      # if CSV exists and not forcing, load & update meta
    debug=True):
    """Return a DataFrame from the CSV cache, or build it and cache the result.

    The cache entry is a ``<name>.csv`` file plus a ``<name>.meta.json`` file
    (save time, row count, columns and, when ``query_text`` is given, a hash of
    the stripped query). Resolution order when ``use_cache`` is true and
    ``force_refresh`` is false:

    1. CSV present, meta absent, ``prefer_csv_without_meta`` true: load the CSV
       and write a reconstructed meta file.
    2. CSV and meta present and the entry is fresh with a matching query hash:
       load the CSV.
    3. CSV and meta present but stale or mismatched: if ``trust_existing_csv``
       is true (the default) the CSV is still loaded and the meta rewritten
       with the current time and hash; otherwise fall through to a rebuild.
    4. Otherwise call ``build_fn``, write the CSV and meta, and return the data.

    With the default ``trust_existing_csv=True``, ``max_age_hours`` and
    ``query_text`` therefore never trigger a rebuild on their own; pass
    ``trust_existing_csv=False`` or ``force_refresh=True`` to refresh.

    Args:
        name: Cache entry name, used for both file names.
        build_fn: Zero-argument callable returning a DataFrame, ``None``, or
            something accepted by ``pd.DataFrame``.
        parse_list_cols: Accepted but not used by this function.
        transform_fn: Optional callable applied to the DataFrame just before it
            is returned; the cached CSV holds the untransformed data.
        use_cache: Whether an existing cache entry may be used.
        force_refresh: Skip every cache path and rebuild.
        max_age_hours: Maximum age for a fresh entry; ``None`` never expires.
        query_text: Query whose hash ties the cache entry to the query.
        prefer_csv_without_meta: Allow path 1 above.
        expected_columns: Columns for the empty frame written when ``build_fn``
            returns ``None`` or a frame with no columns.
        trust_existing_csv: Allow path 3 above to reuse the CSV.
        debug: Print a ``[cache:<name>]`` line for each decision.

    Returns:
        The loaded or newly built DataFrame, after ``transform_fn`` if given.
    """
    csv_path, meta_path = _paths(name)

    def _debug(msg):
        if debug: print(f"[cache:{name}] {msg}")

    def _read_csv_with_schema(columns_from_meta=None):
        # A completely empty CSV file raises EmptyDataError; return an empty
        # frame with the columns recorded in the meta file instead.
        try:
            df = pd.read_csv(csv_path)
        except EmptyDataError:
            df = pd.DataFrame(columns=columns_from_meta or [])
        return df

    # CSV exists but meta missing
    if use_cache and not force_refresh and csv_path.exists() and (prefer_csv_without_meta and not meta_path.exists()):
        _debug(f"meta missing; loading CSV-only at {csv_path}")
        df = _read_csv_with_schema()
        if transform_fn: df = transform_fn(df)
        # Meta is rebuilt from the frame as returned, i.e. after transform_fn.
        meta = {
            "saved_at_utc": _now_utc_iso(),
            "row_count": int(len(df)),
            "columns": list(df.columns),
            "no_results": len(df) == 0,
            "note": "meta reconstructed from CSV-only cache"
        }
        if query_text: meta["query_hash"] = _hash(query_text.strip())
        with open(meta_path, "w") as f: json.dump(meta, f, indent=2)
        return df

    # Normal cached path
    if use_cache and not force_refresh and csv_path.exists() and meta_path.exists():
        with open(meta_path) as f: meta = json.load(f)
        # Without query_text the hash check is skipped; a missing save time is
        # treated as the Unix epoch, which makes the entry stale for any finite max age.
        qh_ok = (query_text is None) or (meta.get("query_hash") == _hash(query_text.strip()))
        fresh = (max_age_hours is None) or (_age_hours(meta.get("saved_at_utc","1970-01-01T00:00:00+00:00")) <= max_age_hours)

        if qh_ok and fresh:
            _debug(f"loading from cache {csv_path}")
            df = _read_csv_with_schema(columns_from_meta=meta.get("columns", []))
            if transform_fn: df = transform_fn(df)
            return df
        elif trust_existing_csv:
            # The data is NOT refreshed here: only the meta is rewritten, which
            # resets the recorded age and query hash for the existing CSV.
            _debug("stale/mismatch meta; trusting existing CSV and updating meta")
            df = _read_csv_with_schema(columns_from_meta=meta.get("columns", []))
            new_meta = {
                "saved_at_utc": _now_utc_iso(),
                "row_count": int(len(df)),
                "columns": list(df.columns),
                "no_results": len(df) == 0,
            }
            if query_text: new_meta["query_hash"] = _hash(query_text.strip())
            with open(meta_path, "w") as f: json.dump(new_meta, f, indent=2)
            if transform_fn: df = transform_fn(df)
            return df
        else:
            _debug("query hash mismatch or cache expired -> rebuild")

    # Rebuild
    _debug("building via API")
    df = build_fn()
    if df is None:
        df = pd.DataFrame(columns=expected_columns or [])
    elif not isinstance(df, pd.DataFrame):
        df = pd.DataFrame(df)

    # Ensure we always write a CSV with headers, even for 0 rows
    if df.empty and len(df.columns) == 0 and expected_columns:
        df = pd.DataFrame(columns=expected_columns)

    df.to_csv(csv_path, index=False)

    meta = {
        "saved_at_utc": _now_utc_iso(),
        "row_count": int(len(df)),
        "columns": list(df.columns),
        "no_results": len(df) == 0,
    }
    if query_text: meta["query_hash"] = _hash(query_text.strip())
    with open(meta_path, "w") as f: json.dump(meta, f, indent=2)

    # The transform is applied after caching, so the CSV keeps the raw build output.
    if transform_fn: df = transform_fn(df)
    _debug(f"cached -> {csv_path.name}")
    return df


## Fetch Discovery Extracts

Retrieves the minimal key mapping and the fact tables used for the analysis.

In [None]:
def df_of(q, label, n=5):
    """Run query ``q``, drop duplicate rows, preview the result and return it.

    Prints the raw and unique row counts under ``label`` and displays the first
    ``n`` unique rows. An empty result is reported and returned unchanged.
    """
    raw_rows = tw_search_all(twsearch, q) or []
    frame = pd.DataFrame(raw_rows)

    if frame.empty:
        print(f'- {label}: 0 rows')
        return frame

    def _freeze(cell):
        # Lists, tuples and dicts are turned into nested tuples so rows can be compared.
        if isinstance(cell, dict):
            return tuple(sorted((key, _freeze(val)) for key, val in cell.items()))
        if isinstance(cell, (list, tuple)):
            return tuple(_freeze(item) for item in cell)
        return cell

    # De-duplicate on the hashable view, but return the original cell values.
    keep = frame.map(_freeze).drop_duplicates().index
    unique = frame.loc[keep].reset_index(drop=True)

    print(f'- {label}: {len(raw_rows)} rows (raw), {len(unique)} rows (unique)')
    display(unique.head(n))
    return unique

In [None]:
#search DiscoveryAccess where endtime and #id = "0684b5680100264824e0a6256e446973636f76657279416363657373"
# Key-map query: for every DiscoveryAccess with an endtime, the IDs of the
# related nodes that the later cells join on.
qry_key = '''
search DiscoveryAccess where endtime
show
#id as "DiscoveryAccess.id",
#Next:Sequential:Previous:DiscoveryAccess.#id as "DiscoveryAccess.previous_id",
#Previous:Sequential:Next:DiscoveryAccess.#id as "DiscoveryAccess.next_id",
#DiscoveryAccess:DiscoveryAccessResult:DiscoveryResult:DeviceInfo.#id as "DeviceInfo.id",
#Member:List:List:DiscoveryRun.#id as "DiscoveryRun.id",
#::InferredElement:.#id as "InferredElement.id",
#DiscoveryAccess:Metadata:Detail:SessionResult.#id as "SessionResult.id",
explode #::InferredElement:.#DeviceWithInterface:DeviceInterface:InterfaceOfDevice:NetworkInterface.#id as "NetworkInterface.id"
'''

print('Running extracts…')


def build_key_map_df():
    """Run ``qry_key`` and return the resulting rows as a DataFrame."""
    return pd.DataFrame(tw_search_all(twsearch, qry_key) or [])


# Cached under the name "key_map"; entries older than 24 hours count as stale.
key_df = get_or_build_df(
    "key_map",
    build_key_map_df,
    query_text=qry_key,
    max_age_hours=24,
    use_cache=True,
    force_refresh=False,
)

In [None]:
# Check query results: preview the first rows, then the (rows, columns) shape.

display(key_df.head(10))
display(key_df.shape)

In [None]:
# Make columns hashable

def _hashable(x):
    """Return ``x`` with mappings, lists, tuples and sets converted to nested tuples.

    A mapping becomes a tuple of its ``(key, value)`` pairs sorted by key; lists,
    tuples and sets become tuples of their converted items. Any other value is
    returned unchanged.
    """
    if isinstance(x, Mapping):
        pairs = [(key, _hashable(val)) for key, val in x.items()]
        pairs.sort()
        return tuple(pairs)
    if isinstance(x, (list, tuple, set)):
        return tuple(map(_hashable, x))
    return x

# Convert every cell of the object-dtype columns with _hashable, element by element.
obj_cols = key_df.select_dtypes(include="object").columns
key_df[obj_cols] = key_df[obj_cols].map(_hashable)

In [None]:
# Dedupe and preview (mirror df_of): drop exact duplicate rows, renumber the
# index from zero, then show the first rows and the resulting shape.
key_df = key_df.drop_duplicates().reset_index(drop=True)
print(f'- Key Map: {len(key_df)} rows (unique)')
display(key_df.head(10))
display(key_df.shape)

In [None]:
# DiscoveryAccess facts: endpoint, scan times, end state and access outcome
# for every access that has an endtime, newest first.
qry_access = '''
search DiscoveryAccess where endtime
ORDER BY discovery_endtime DESC
show
#id as "DiscoveryAccess.id",
#Next:Sequential:Previous:DiscoveryAccess.#id as "DiscoveryAccess.previous_id",
#Previous:Sequential:Next:DiscoveryAccess.#id as "DiscoveryAccess.next_id",
endpoint as 'DiscoveryAccess.endpoint',
friendlyTime(discovery_starttime) as 'DiscoveryAccess.scan_starttime',
friendlyTime(discovery_endtime) as 'DiscoveryAccess.scan_endtime',
discovery_endtime as 'DiscoveryAccess.scan_endtime_raw',
discovery_endtime as 'DiscoveryAccess.discovery_endtime',
whenWasThat(discovery_endtime) as 'DiscoveryAccess.when_last_scan',
end_state as 'DiscoveryAccess.end_state',
reason as 'DiscoveryAccess.reason_not_updated',
result as 'DiscoveryAccess.result',
_last_marker as 'DiscoveryAccess._last_marker',
_first_marker as 'DiscoveryAccess._first_marker',
_last_interesting as 'DiscoveryAccess._last_interesting',
__had_inference as 'DiscoveryAccess.__had_inference',
best_ip_score as 'DiscoveryAccess.best_ip_score',
(#DiscoveryAccess:Metadata:Detail:SessionResult.success or access_success) as 'DiscoveryAccess.access_success',
access_failure as 'DiscoveryAccess.access_failure',
message as 'DiscoveryAccess.message'
'''


def build_access_df():
    """Run ``qry_access`` and return the resulting rows as a DataFrame."""
    return pd.DataFrame(tw_search_all(twsearch, qry_access) or [])


# Cached under the name "access"; entries older than 24 hours count as stale.
access_df = get_or_build_df(
    "access",
    build_access_df,
    query_text=qry_access,
    max_age_hours=24,
    use_cache=True,
    force_refresh=False,
)

# Check query results
display(access_df.head(5))
display(access_df.shape)

In [None]:
# DeviceInfo facts: host identity, OS details, access method and last credential.
qry_deviceinfo = '''
search DeviceInfo
show
#id as "DeviceInfo.id",
hostname as 'DeviceInfo.hostname',
os_type as 'DeviceInfo.os_type',
os_class as 'DeviceInfo.os_class',
os_version as 'DeviceInfo.os_version',
kind as 'DeviceInfo.kind',
inferred_kind as 'DeviceInfo.inferred_kind',
last_access_method as 'DeviceInfo.last_access_method',
probed_os as 'DeviceInfo.probed_os',
last_credential as 'DeviceInfo.last_credential',
last_slave as 'DeviceInfo.last_slave',
__preserved_last_credential as 'DeviceInfo.__preserved_last_credential'
'''


def build_device_df():
    """Run ``qry_deviceinfo`` and return the resulting rows as a DataFrame."""
    return pd.DataFrame(tw_search_all(twsearch, qry_deviceinfo) or [])


# Cached under the name "device_info"; entries older than 24 hours count as stale.
device_df = get_or_build_df(
    "device_info",
    build_device_df,
    query_text=qry_deviceinfo,
    max_age_hours=24,
    use_cache=True,
    force_refresh=False,
)

# Check query results
display(device_df.head(5))
display(device_df.shape)

In [None]:
# DiscoveryRun facts: label plus friendly start and end times.
qry_run = '''
search DiscoveryRun
show
#id as "DiscoveryRun.id",
label as 'DiscoveryRun.label',
friendlyTime(starttime) as 'DiscoveryRun.starttime',
friendlyTime(endtime) as 'DiscoveryRun.endtime'
'''


def build_run_df():
    """Run ``qry_run`` and return the resulting rows as a DataFrame."""
    return pd.DataFrame(tw_search_all(twsearch, qry_run) or [])


# Cached under the name "runs"; entries older than 24 hours count as stale.
run_df = get_or_build_df(
    "runs",
    build_run_df,
    query_text=qry_run,
    max_age_hours=24,
    use_cache=True,
    force_refresh=False,
)

# Check query results
display(run_df.head(5))
display(run_df.shape)

In [None]:
# SessionResult facts: success flag, session type and provider.
qry_session = '''
search SessionResult
show
#id as "SessionResult.id",
success as "SessionResult.success",
session_type as "SessionResult.session_type",
provider as "SessionResult.provider"
'''


def build_session_df():
    """Run ``qry_session`` and return the resulting rows as a DataFrame."""
    return pd.DataFrame(tw_search_all(twsearch, qry_session) or [])


# Cached under the name "sessions"; entries older than 24 hours count as stale.
session_df = get_or_build_df(
    "sessions",
    build_session_df,
    query_text=qry_session,
    max_age_hours=24,
    use_cache=True,
    force_refresh=False,
)

# Check query results
display(session_df.head(5))
display(session_df.shape)

In [None]:
# InferredElement facts: the node ID and its full list of IP addresses.
qry_inferred = '''
search InferredElement
show
#id as "InferredElement.id",
__all_ip_addrs as 'InferredElement.__all_ip_addrs'
'''


def build_inferred_df():
    """Run ``qry_inferred`` and return the resulting rows as a DataFrame."""
    return pd.DataFrame(tw_search_all(twsearch, qry_inferred) or [])


# Cached under the name "inferred"; entries older than 24 hours count as stale.
inferred_df = get_or_build_df(
    "inferred",
    build_inferred_df,
    query_text=qry_inferred,
    max_age_hours=24,
    use_cache=True,
    force_refresh=False,
)

# Check query results
display(inferred_df.head(5))
display(inferred_df.shape)

In [None]:
# NetworkInterface facts: the node ID and its IP address.
qry_interface = '''
search NetworkInterface
show
#id as "NetworkInterface.id",
ip_addr as 'NetworkInterface.ip_addr'
'''


def build_interface_df():
    """Run ``qry_interface`` and return the resulting rows as a DataFrame."""
    return pd.DataFrame(tw_search_all(twsearch, qry_interface) or [])


# Cached under the name "interfaces"; entries older than 24 hours count as stale.
interface_df = get_or_build_df(
    "interfaces",
    build_interface_df,
    query_text=qry_interface,
    max_age_hours=24,
    use_cache=True,
    force_refresh=False,
)

# Check query results
display(interface_df.head(5))
display(interface_df.shape)

## Merge Extracts

Merges the key map with fact tables to reassemble the wide view, adds helper fields
(session_results_logged, previous_end_state, access_method/current_access).

In [None]:
# --- Prep: index each fact table by its ID so it can be joined onto the key map ---
acc = access_df.set_index('DiscoveryAccess.id')
dev = device_df.set_index('DeviceInfo.id')
run = run_df.set_index('DiscoveryRun.id')

# Optional tables (only if the key map has the ID column and the table is non-empty)
ses = session_df.set_index('SessionResult.id') if 'SessionResult.id' in key_df and not session_df.empty else None
inf = inferred_df.set_index('InferredElement.id') if 'InferredElement.id' in key_df and not inferred_df.empty else None
nic = interface_df.set_index('NetworkInterface.id') if 'NetworkInterface.id' in key_df and not interface_df.empty else None

# --- Join chain (all left joins on their IDs) ---
# Columns that already exist on the left keep their name; the right-hand copy gets the suffix.
merged = (
    key_df
      .join(acc, on='DiscoveryAccess.id', how='left', rsuffix='_acc')
      .join(dev, on='DeviceInfo.id', how='left', rsuffix='_dev')
      .join(run, on='DiscoveryRun.id', how='left', rsuffix='_run')
)

if ses is not None:
    merged = merged.join(ses, on='SessionResult.id', how='left', rsuffix='_ses')
if inf is not None:
    merged = merged.join(inf, on='InferredElement.id', how='left', rsuffix='_inf')
if nic is not None:
    merged = merged.join(nic, on='NetworkInterface.id', how='left', rsuffix='_nic')

# --- Session results flag (fully vectorised) ---
# True when at least one row of the DiscoveryAccess carries a session provider.
# (notna, not isna: a missing provider means nothing was logged.)
if 'SessionResult.provider' in merged.columns:
    merged['DiscoveryAccess.session_results_logged'] = (
        merged['SessionResult.provider'].notna()
        .groupby(merged['DiscoveryAccess.id'])
        .transform('any')
    )
else:
    merged['DiscoveryAccess.session_results_logged'] = False

# --- Previous end state: look up the end_state of the access named by previous_id ---
if 'DiscoveryAccess.previous_id' in merged.columns and 'DiscoveryAccess.end_state' in access_df.columns:
    prev_map = access_df.set_index('DiscoveryAccess.id')['DiscoveryAccess.end_state']
    merged['DiscoveryAccess.previous_end_state'] = merged['DiscoveryAccess.previous_id'].map(prev_map)
else:
    merged['DiscoveryAccess.previous_end_state'] = None

# --- Access method fields ---
merged['DiscoveryAccess.access_method'] = merged.get('DeviceInfo.last_access_method')


def _flag_series(series):
    """Return the truthiness of each value, counting missing values as False.

    A plain ``astype(bool)`` turns NaN into True, which would mark every row
    without DeviceInfo as probed / proxied.
    """
    return pd.Series(
        [bool(v) if pd.notna(v) else False for v in series],
        index=series.index,
        dtype=bool,
    )


lam = merged['DeviceInfo.last_access_method']
slave = _flag_series(merged['DeviceInfo.last_slave'])
probed = _flag_series(merged['DeviceInfo.probed_os'])

# 'windows'/'rcmd' with a recorded last_slave keeps the access method; otherwise
# a probed OS is reported as 'Probe', and everything else keeps the access method.
cond1 = lam.isin(['windows', 'rcmd']) & slave
merged['DiscoveryAccess.current_access'] = np.where(
    cond1, lam, np.where(probed, 'Probe', lam)
)

print('Merged rows:', len(merged))
merged.head(3)


## Build Identities

Retrieves identity hints per endpoint (IPs and names) and builds a simple identity list.

In [None]:
# Identity hints per DiscoveryAccess: names and IP addresses gathered from the
# related InferredElement, Endpoint, DiscoveredIPAddress and NetworkInterface nodes.
qry_device_ids = '''
search DiscoveryAccess
show
#::InferredElement:.name as 'InferredElement.name',
#::InferredElement:.hostname as 'InferredElement.hostname',
#::InferredElement:.local_fqdn as 'InferredElement.local_fqdn',
#::InferredElement:.sysname as 'InferredElement.sysname',
endpoint as 'DiscoveryAccess.endpoint',
#DiscoveryAccess:Endpoint:Endpoint:Endpoint.endpoint as 'Endpoint.endpoint',
#DiscoveryAccess:DiscoveryAccessResult:DiscoveryResult:DiscoveredIPAddressList.#List:List:Member:DiscoveredIPAddress.ip_addr as 'DiscoveredIPAddress.ip_addr',
#::InferredElement:.__all_ip_addrs as 'InferredElement.__all_ip_addrs',
#::InferredElement:.#DeviceWithInterface:DeviceInterface:InterfaceOfDevice:NetworkInterface.ip_addr as 'NetworkInterface.ip_addr',
#::InferredElement:.#DeviceWithInterface:DeviceInterface:InterfaceOfDevice:NetworkInterface.fqdns as 'NetworkInterface.fqdns'
'''


def build_id_rows():
    """Run ``qry_device_ids`` and return the resulting rows as a DataFrame."""
    return pd.DataFrame(tw_search_all(twsearch, qry_device_ids) or [])


# Cached under the name "identity_rows"; entries older than 24 hours count as stale.
id_rows = get_or_build_df(
    "identity_rows",
    build_id_rows,
    query_text=qry_device_ids,
    max_age_hours=24,
    use_cache=True,
    force_refresh=False,
)

print('Identity rows:', len(id_rows))
identities = build_identities(id_rows)
print('Unique identities:', len(identities))
identities[:3]


## Dropped Endpoints

Fetches dropped endpoints to include in the endpoint-level view.

In [None]:
# Dropped endpoints: one row per endpoint (exploded) with the reason, times and run label.
qry_dropped = '''
search DroppedEndpoints
show explode endpoints as 'Endpoint',
reason as 'Reason_Not_Updated',
__reason as 'End_State',
friendlyTime(starttime) as 'Start',
friendlyTime(endtime) as 'End',
endtime as 'End_Raw',
whenWasThat(endtime) as 'When_Last_Scan',
#EndpointRange:EndpointRange:DiscoveryRun:DiscoveryRun.label as "Run"
'''


def build_dropped():
    """Run ``qry_dropped`` and return the resulting rows as a DataFrame."""
    return pd.DataFrame(tw_search_all(twsearch, qry_dropped) or [])


# Cached under the name "dropped"; entries older than 24 hours count as stale.
dropped = get_or_build_df(
    "dropped",
    build_dropped,
    query_text=qry_dropped,
    max_age_hours=24,
    use_cache=True,
    force_refresh=False,
)

print('Dropped rows:', len(dropped))
dropped[:3]

## Assemble Endpoint Records

Groups merged facts and dropped entries by endpoint, enriches with credential labels,
computes consistency and "when" buckets, then selects the best/latest record per endpoint.

In [None]:
# Credential UUID -> {label, username}, and IP address -> identity record.
cred_map = get_credential_map(twcreds)
id_map = {ip: ident for ident in identities for ip in (ident.get('list_of_ips') or []) if ip}


def _clean_record(record):
    """Return a copy of ``record`` with missing scalars (NaN, NA, NaT) set to None.

    NaN is truthy, so leaving it in place would let missing endpoints,
    hostnames and credentials pass the ``if value`` / ``a or b`` checks used
    when the endpoint records are assembled.
    """
    cleaned = {}
    for key, value in record.items():
        missing = (
            value is None
            or value is pd.NA
            or value is pd.NaT
            or (isinstance(value, float) and value != value)
        )
        cleaned[key] = None if missing else value
    return cleaned


# Group the merged discovery rows and the dropped-endpoint rows by endpoint.
by_endpoint = {}
for rec in merged.to_dict('records'):
    rec = _clean_record(rec)
    ep = rec.get('DiscoveryAccess.endpoint')
    if not ep:
        continue
    by_endpoint.setdefault(ep, {'discos': [], 'dropped': []})['discos'].append(rec)

# `dropped` is a DataFrame: its truth value is ambiguous and iterating it yields
# column names, so convert it to row dicts explicitly.
if isinstance(dropped, pd.DataFrame):
    dropped_records = dropped.to_dict('records')
else:
    dropped_records = list(dropped or [])
for rec in dropped_records:
    rec = _clean_record(rec)
    ep = rec.get('Endpoint')
    if not ep:
        continue
    by_endpoint.setdefault(ep, {'discos': [], 'dropped': []})['dropped'].append(rec)

def _parse_ts(scan_end_raw, friendly_end):
    """Parse a scan end time, preferring the raw value over the friendly one.

    ``scan_end_raw`` is read as ISO-8601 (a ``Z`` is rewritten to ``+00:00``
    first). If that is missing or unparsable, the first two space-separated
    tokens of ``friendly_end`` are parsed as ``YYYY-MM-DD HH:MM:SS``, which
    gives a naive datetime. Returns ``None`` when neither value can be parsed.
    """
    if scan_end_raw:
        try:
            return datetime.datetime.fromisoformat(str(scan_end_raw).replace('Z','+00:00'))
        except Exception:
            pass  # fall back to the friendly representation

    if friendly_end:
        date_and_time = ' '.join(str(friendly_end).split(' ')[:2])
        try:
            return datetime.datetime.strptime(date_and_time, '%Y-%m-%d %H:%M:%S')
        except Exception:
            pass

    return None

def _consistency(states: List[str]):
    """Summarise how consistently one end state occurs in ``states``.

    Falsy entries are ignored. Returns ``None`` when nothing remains;
    ``'Always <state>'`` when every entry is the most common state;
    ``'Usually <state>'`` when at most two entries differ from it; otherwise
    ``'Most Often <state>'``. On a tie the state seen first wins.
    """
    observed = [state for state in states if state]
    if not observed:
        return None

    tally = {}
    for state in observed:
        tally[state] = tally.get(state, 0) + 1

    dominant = max(tally, key=tally.get)
    hits = tally[dominant]
    total = len(observed)

    if hits == total:
        return f'Always {dominant}'
    if hits >= total - 2:
        return f'Usually {dominant}'
    return f'Most Often {dominant}'

def _ts_sort_key(record):
    """Sort key that orders records by ``timestamp`` without naive/aware clashes.

    Records without a timestamp sort before all others. Naive timestamps are
    treated as UTC for ordering only; the stored value is not modified.
    Comparing a naive ``datetime.min`` fallback with timezone-aware timestamps
    would raise ``TypeError``.
    """
    ts = record.get('timestamp')
    if ts is None:
        return (0, datetime.datetime.min.replace(tzinfo=datetime.timezone.utc))
    if ts.tzinfo is None:
        ts = ts.replace(tzinfo=datetime.timezone.utc)
    return (1, ts)


# One output row per endpoint: build a candidate record for every discovery
# access and every dropped entry, then keep the best one.
endpoint_rows = []
for ep, recs in by_endpoint.items():
    # Identity hints are looked up by treating the endpoint as an IP address.
    ident = id_map.get(ep, {})
    names = ident.get('list_of_names') or []
    eps = ident.get('list_of_ips') or []

    # Consistency is computed once per endpoint over all of its end states.
    states = [d.get('DiscoveryAccess.end_state') for d in recs['discos']] + [d.get('End_State') for d in recs['dropped']]
    consistency = _consistency(states)

    endpoint_records = []

    for d in recs['discos']:
        scan_end_raw = d.get('DiscoveryAccess.scan_endtime_raw')
        friendly_end = d.get('DiscoveryAccess.scan_endtime')
        ts = _parse_ts(scan_end_raw, friendly_end)
        when = calc_when(ts) if ts else None
        hostname = d.get('DeviceInfo.hostname') or (names[0] if names else None)
        node_kind = d.get('DeviceInfo.kind') or d.get('DeviceInfo.inferred_kind')
        last_cred = d.get('DeviceInfo.last_credential')
        # Credentials are keyed by the last path segment of their identifier.
        last_cred_uuid = str(last_cred).split('/')[-1] if last_cred else None
        cred_info = cred_map.get(last_cred_uuid or '', {})
        endpoint_records.append({
            'endpoint': ep,
            'hostname': hostname,
            'list_of_names': names,
            'list_of_endpoints': eps,
            'node_kind': node_kind,
            'os_type': d.get('DeviceInfo.os_type'),
            'os_version': d.get('DeviceInfo.os_version'),
            'os_class': d.get('DeviceInfo.os_class'),
            'disco_run': d.get('DiscoveryRun.label'),
            'run_start': d.get('DiscoveryRun.starttime'),
            'run_end': d.get('DiscoveryRun.endtime'),
            'scan_start': d.get('DiscoveryAccess.scan_starttime'),
            'scan_end': d.get('DiscoveryAccess.scan_endtime'),
            'scan_end_raw': scan_end_raw,
            'when_was_that': when,
            'consistency': consistency,
            'current_access': d.get('DiscoveryAccess.current_access'),
            'access_method': d.get('DiscoveryAccess.access_method') or d.get('DeviceInfo.last_access_method'),
            # NOTE(review): no query visible in this notebook selects
            # 'DiscoveryAccess.host_node_updated', so this looks like it is
            # always None - confirm whether the column should be added.
            'inferred_node_updated': d.get('DiscoveryAccess.host_node_updated'),
            'reason_not_updated': d.get('DiscoveryAccess.reason_not_updated'),
            'end_state': d.get('DiscoveryAccess.end_state'),
            'previous_end_state': d.get('DiscoveryAccess.previous_end_state'),
            'session_results_logged': d.get('DiscoveryAccess.session_results_logged'),
            'last_credential': last_cred_uuid,
            'credential_name': cred_info.get('label'),
            'credential_login': cred_info.get('username'),
            'timestamp': ts,
            'da_id': d.get('DiscoveryAccess.id'),
            'prev_da_id': d.get('DiscoveryAccess.previous_id'),
            'next_node_id': d.get('DiscoveryAccess.next_id'),
        })

    for d in recs['dropped']:
        ts = _parse_ts(d.get('End_Raw'), d.get('End'))
        when = calc_when(ts) if ts else None
        endpoint_records.append({
            'endpoint': ep,
            'hostname': names[0] if names else None,
            'list_of_names': names,
            'list_of_endpoints': eps,
            'disco_run': d.get('Run'),
            'run_start': d.get('Start'),
            'run_end': d.get('End'),
            'when_was_that': when,
            'consistency': consistency,
            'reason_not_updated': d.get('Reason_Not_Updated'),
            'end_state': d.get('End_State'),
            'timestamp': ts,
            'scan_end_raw': d.get('End_Raw'),
        })

    if not endpoint_records:
        continue
    # choose latest, but keep identifying fields when available
    latest = max(endpoint_records, key=_ts_sort_key)
    named = [r for r in endpoint_records if r.get('hostname') or r.get('credential_name')]
    chosen = max(named, key=_ts_sort_key) if named else latest
    # backfill from latest: only fields that are empty on the chosen record
    for k, v in latest.items():
        if chosen.get(k) in (None, '') and v not in (None, ''):
            chosen[k] = v
    endpoint_rows.append(chosen)

len(endpoint_rows)


## Format Output

Builds the final DataFrame with the same columns as the DisMAL `discovery_analysis` CSV.

In [None]:
def _state_change(rec):
    """Return ``'<previous> -> <current>'`` end state text, or None without an end state."""
    if rec.get('end_state') is None:
        return None
    return f"{rec.get('previous_end_state')} -> {rec.get('end_state')}"


# (CSV header, endpoint-record key) in output order. A key of None marks the
# one derived column, which is filled by _state_change.
_OUTPUT_COLUMNS = (
    ('endpoint', 'endpoint'),
    ('device_name', 'hostname'),
    ('list_of_device_names', 'list_of_names'),
    ('list_of_endpoints', 'list_of_endpoints'),
    ('node_kind', 'node_kind'),
    ('os_type', 'os_type'),
    ('os_version', 'os_version'),
    ('os_class', 'os_class'),
    ('discovery_run', 'disco_run'),
    ('discovery_run_start', 'run_start'),
    ('discovery_run_end', 'run_end'),
    ('scan_start', 'scan_start'),
    ('scan_end', 'scan_end'),
    ('scan_end_raw', 'scan_end_raw'),
    ('when_was_that', 'when_was_that'),
    ('consistency', 'consistency'),
    ('current_access', 'current_access'),
    ('access_method', 'access_method'),
    ('inferred_node_updated', 'inferred_node_updated'),
    ('reason_not_updated', 'reason_not_updated'),
    ('end_state', 'end_state'),
    ('previous_end_state', 'previous_end_state'),
    ('end_state_change', None),
    ('session_results_logged', 'session_results_logged'),
    ('last_credential', 'last_credential'),
    ('credential_name', 'credential_name'),
    ('credential_login', 'credential_login'),
    ('timestamp', 'timestamp'),
    ('da_id', 'da_id'),
    ('prev_da_id', 'prev_da_id'),
    ('next_node_id', 'next_node_id'),
)

headers = [header for header, _key in _OUTPUT_COLUMNS]
rows = [
    [_state_change(rec) if key is None else rec.get(key) for _header, key in _OUTPUT_COLUMNS]
    for rec in endpoint_rows
]

df_out = pd.DataFrame(rows, columns=headers)
# The appliance target goes in as the first column of every row.
df_out.insert(0, 'Discovery Instance', target)
df_out.head(10)


## Save to CSV

Writes to the standard output folder.

In [None]:
# Write the report into the per-target output folder, without the DataFrame index.
OUTPUT_CSV = str(output_dir / 'discovery_analysis.csv')
df_out.to_csv(OUTPUT_CSV, index=False)
print(f'Saved to {OUTPUT_CSV}')
