# Device Identities (device_ids)

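Load device identity rows for every configured BMC Discovery appliance, reading a raw CSV export when one exists and otherwise querying the appliance through the Tideway SDK (a search with `limit=0`). Then build the unique IPs and names seen per originating endpoint, and save one CSV per appliance to the standard DisMAL output folders.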

> **NOTE:** Due to limitations of the API, this may take a while to run.


In [None]:
# %pip install -q pandas pyyaml
import pandas as pd
import yaml
from pathlib import Path
from typing import Any, Dict, List, Optional
import xml.etree.ElementTree as ET
import tideway
import re
import ast


## Query configuration

Set the query title and optionally restrict the appliances to run against by name. Leave `APPLIANCE_NAMES` empty to run against every appliance defined in `config.yaml`.


In [None]:
# Title of the <query> element whose search text is loaded from dismal_queries.xml.
QUERY_TITLE: str = 'Device IDs'
# Appliance names to run against; compared exactly (after trimming) with each
# appliance's resolved name.
APPLIANCE_NAMES: List[str] = []  # e.g., ['prod', 'dev']; empty -> all appliances


## Optional filters

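Filters are applied to each appliance's rows after they are loaded and before aggregation. `DEVICE_NAME_FILTER` is matched case-insensitively against the device name columns, `INCLUDE_ENDPOINTS` keeps only the listed endpoints, and `ENDPOINT_PREFIX` is consulted only when `INCLUDE_ENDPOINTS` is empty. Leave a filter as `None` or empty to disable it.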


In [None]:
# Text looked for (case-insensitively) in the device name columns; None disables the filter.
DEVICE_NAME_FILTER: Optional[str] = None  # e.g., 'host-name'
# Keep only rows whose originating endpoint is one of these values; empty disables the filter.
INCLUDE_ENDPOINTS: List[str] = []     # e.g., ['10.1.2.3']
# Keep only rows whose endpoint starts with this text; only consulted when
# INCLUDE_ENDPOINTS is empty.
ENDPOINT_PREFIX: Optional[str] = None     # e.g., '10.1.'


## Load configuration and prepare helpers

Locate the repository root, resolve the shared XML query definitions and raw export directories, and build helper utilities for connecting to each appliance when necessary.


In [None]:
def _find_repo_root(start: Path) -> Path:
    """Return the closest directory, from ``start`` upwards, that holds ``config.yaml``.

    ``start`` itself is checked first, then each of its ancestors in order.
    When none of them contains the file, ``start.parent`` is returned.
    """
    search_path = (start, *start.parents)
    located = next(
        (folder for folder in search_path if (folder / 'config.yaml').exists()),
        None,
    )
    return start.parent if located is None else located

# Every relative path in this notebook is anchored on the directory holding config.yaml.
repo_root = _find_repo_root(Path.cwd())

# The shared query catalogue is looked for under queries/ first, then DisMAL/queries/.
queries_xml_candidates = [
    repo_root / 'queries' / 'dismal_queries.xml',
    repo_root / 'DisMAL' / 'queries' / 'dismal_queries.xml',
]
queries_xml = next((path for path in queries_xml_candidates if path.exists()), None)
if queries_xml is None:
    raise FileNotFoundError('Unable to locate dismal_queries.xml')

def load_query_from_xml(title: str, xml_path: Optional[Path] = None) -> str:
    """Return the stripped ``<search>`` text of the ``<query>`` whose title is ``title``.

    Args:
        title: Exact title to look for; each element's ``title`` attribute is
            stripped before the comparison.
        xml_path: XML file to read. Defaults to the module-level ``queries_xml``.

    Raises:
        ValueError: If the first matching query has no search text, or if no
            direct ``<query>`` child of the root element carries that title.
    """
    if not xml_path:
        xml_path = queries_xml

    for node in ET.parse(xml_path).getroot().findall('query'):
        node_title = node.get('title') or ''
        if node_title.strip() != title:
            continue
        # The first title match decides the outcome: either its text or an error.
        body = node.findtext('search') or ''
        body = body.strip()
        if body:
            return body
        raise ValueError(f"Query '{title}' missing search text")

    raise ValueError(f"Query '{title}' not found in {xml_path}")

# Search text for the configured query title, loaded once and reused for every appliance.
QUERY_TEXT = load_query_from_xml(QUERY_TITLE)

# Parsed config.yaml; an empty file (safe_load returns None) becomes an empty mapping.
cfg_path = repo_root / 'config.yaml'
cfg: Dict[str, Any] = yaml.safe_load(cfg_path.read_text()) or {}
# Parent folder of the per-appliance raw export directories.
RAW_EXPORT_ROOT = repo_root / 'raw_exports'

def _resolve_token(appliance_cfg: Dict[str, Any], global_cfg: Dict[str, Any]) -> str:
    """Return the API token for an appliance.

    An inline ``token`` (appliance first, then global) wins. Otherwise the token
    is read from ``token_file`` (appliance, then global) or the global
    ``f_token`` path; relative paths are resolved against ``repo_root``.

    Raises:
        ValueError: If neither an inline token nor a token file yields a value.
    """
    inline = appliance_cfg.get('token') or global_cfg.get('token') or ''
    token = str(inline).strip()

    token_file = (
        appliance_cfg.get('token_file')
        or global_cfg.get('token_file')
        or global_cfg.get('f_token')
    )
    if token_file and not token:
        tf_path = Path(token_file)
        token = (tf_path if tf_path.is_absolute() else repo_root / tf_path).read_text().strip()

    if token:
        return token
    raise ValueError('API token not found for appliance (token or token_file)')

def _sanitize_target(target: str) -> str:
    """Return ``target`` with every '.', ':' and '/' replaced by an underscore."""
    return target.translate(str.maketrans({'.': '_', ':': '_', '/': '_'}))

def _slugify(value: str) -> str:
    """Return a lowercase slug of ``value`` with alphanumeric runs joined by '_'.

    Falls back to ``'unnamed'`` when ``value`` contains no ASCII letter or digit.
    """
    # Splitting on the non-alphanumeric runs leaves empty pieces at the edges; drop them.
    pieces = [piece for piece in re.split(r'[^A-Za-z0-9]+', value) if piece]
    if not pieces:
        return 'unnamed'
    return '_'.join(pieces).lower()

def build_appliance_contexts(include_names: Optional[List[str]] = None) -> List[Dict[str, Any]]:
    """Build one context dict per appliance described in ``cfg``.

    Each context carries the connection settings (``target``, ``token``,
    ``api_version``, ``verify_ssl``), the per-appliance paths (``output_dir``,
    ``raw_export_dir``, ``raw_csv_path``) and empty ``app`` / ``data`` slots.
    The output directory is created as a side effect. A token lookup failure
    does not skip the appliance: ``token`` is left as ``None`` and the
    exception is kept in ``token_error``.

    Args:
        include_names: Appliance names to keep; empty or ``None`` keeps all.

    Raises:
        RuntimeError: If no context could be built, with a message that
            depends on whether a name filter was supplied.
    """
    include = [name.strip() for name in (include_names or []) if str(name).strip()]

    # Without an 'appliances' list the top-level config describes a single appliance.
    entries = cfg.get('appliances') or [cfg]

    contexts: List[Dict[str, Any]] = []
    for index, raw in enumerate(entries):
        name = str(raw.get('name') or raw.get('target') or f'appliance_{index + 1}').strip()
        if include and name not in include:
            continue

        target = str(raw.get('target') or cfg.get('target') or '').strip()
        if not target:
            print(f"Skipping {name or f'appliance_{index + 1}'}: missing target")
            continue

        # Keep the failure instead of raising so a raw CSV export can still be used.
        try:
            token, token_error = _resolve_token(raw, cfg), None
        except Exception as exc:
            token, token_error = None, exc

        output_dir = repo_root / ('output_' + _sanitize_target(target))
        output_dir.mkdir(parents=True, exist_ok=True)

        label = name or target
        raw_export_dir = RAW_EXPORT_ROOT / _slugify(label)

        context: Dict[str, Any] = {
            'name': label,
            'target': target,
            'token': token,
            'token_error': token_error,
            'api_version': str(raw.get('api_version') or cfg.get('api_version') or 'v1.14'),
            'verify_ssl': bool(raw.get('verify_ssl', cfg.get('verify_ssl', True))),
            'output_dir': output_dir,
            'raw_export_dir': raw_export_dir,
            'raw_csv_path': raw_export_dir / 'device_ids.csv',
            'app': None,
            'data': None,
        }
        contexts.append(context)

    if not contexts:
        if include:
            raise RuntimeError(f"No appliances matched the provided names: {include}")
        raise RuntimeError('No appliances could be initialised from config.yaml')

    return contexts

def initialise_instances(include_names: Optional[List[str]] = None) -> List[Dict[str, Any]]:
    """Build the appliance contexts and print a short status summary for each.

    The summary shows whether a raw CSV export exists, whether the API
    fallback has a token available, and where results will be written.

    Args:
        include_names: Forwarded to ``build_appliance_contexts``.

    Returns:
        The contexts exactly as returned by ``build_appliance_contexts``.
    """
    contexts = build_appliance_contexts(include_names=include_names)
    for ctx in contexts:
        print(f"Configured {ctx['name']} ({ctx['target']}):")
        export_path = ctx['raw_csv_path']
        if not export_path.exists():
            print('  Raw export    : missing -> will attempt API fallback')
            if ctx['token'] is not None:
                print('  API fallback  : available')
            else:
                # Show the stored lookup failure when there is one.
                print('  Warning       :', ctx.get('token_error') or 'API token not available')
        else:
            print('  Raw export    :', export_path)
        print('  Output dir    :', ctx['output_dir'])
    return contexts

# Column holding the endpoint a row was discovered through; results are grouped on it.
ENDPOINT_COLUMN = 'DiscoveryAccess.endpoint'

# Columns whose values are pooled into the per-endpoint list of IPs.
IP_COLUMNS = [
    ENDPOINT_COLUMN,
    'Endpoint.endpoint',
    'DiscoveredIPAddress.ip_addr',
    'InferredElement.__all_ip_addrs',
    'NetworkInterface.ip_addr',
]

# Columns whose values are pooled into the per-endpoint list of names.
NAME_COLUMNS = [
    'InferredElement.name',
    'InferredElement.hostname',
    'InferredElement.local_fqdn',
    'InferredElement.sysname',
    'NetworkInterface.fqdns',
]

def _coerce_iterable(value):
    """Normalise a single cell value into a list of items.

    * lists are returned unchanged;
    * array-likes with at least one dimension (e.g. numpy arrays) become lists;
    * ``None``, NaN/NA and the strings ``''``, ``'none'``, ``'nan'`` (any case)
      become ``[]``;
    * a string that looks like a list literal (``"['a', 'b']"``) is parsed and
      its ``None`` / ``'None'`` entries dropped; if parsing fails the stripped
      text is kept as a single item;
    * any other string becomes a one-item list holding the stripped text;
    * any other value is wrapped in a one-item list.
    """
    if isinstance(value, list):
        return value
    if value is None:
        return []
    if getattr(value, 'ndim', 0):
        # pd.isna() on an array-like returns an array, whose truth value is
        # ambiguous inside an ``if``; convert it before the missing-value test.
        return value.tolist() if hasattr(value, 'tolist') else list(value)
    if pd.isna(value):
        return []
    if not isinstance(value, str):
        return [value]

    stripped = value.strip()
    if not stripped or stripped.lower() in {'none', 'nan'}:
        return []
    if stripped.startswith('[') and stripped.endswith(']'):
        try:
            parsed = ast.literal_eval(stripped)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a valid literal (e.g. unquoted items): keep the text as one item.
            return [stripped]
        if isinstance(parsed, list):
            return [item for item in parsed if item not in (None, 'None')]
        return [parsed]
    # Return the stripped text so ' host ' and 'host' de-duplicate to one entry.
    return [stripped]

def _ensure_api_data(instance: Dict[str, Any]):
    """Return the data endpoint for ``instance``, connecting on first use.

    A previously stored ``instance['data']`` is returned as-is. Otherwise a
    ``tideway.appliance`` is created from the context, its ``data()`` endpoint
    is obtained, and both are stored back on ``instance`` under ``'app'`` and
    ``'data'``. A failure while reading ``api_about`` is only printed as a
    warning.

    Raises:
        RuntimeError: If the context holds no API token.
    """
    cached = instance.get('data')
    if cached is not None:
        return cached

    if instance.get('token') is None:
        token_error = instance.get('token_error') or 'API token unavailable'
        raise RuntimeError(f"API fallback unavailable: {token_error}")

    # The SDK is given the version without its leading 'v' (or None when unset).
    api_number = (instance.get('api_version') or '').lstrip('vV')
    app = tideway.appliance(
        instance['target'],
        instance['token'],
        api_version=api_number or None,
        ssl_verify=instance['verify_ssl'],
    )
    data_ep = app.data()

    try:
        status = getattr(app.api_about, 'status_code', 'ok')
    except Exception as exc:
        print('  Warning        : failed to reach /api/about ->', exc)
    else:
        print('  About status   :', status)

    instance.update(app=app, data=data_ep)
    return data_ep

def _result_set_rows(result_set: Dict[str, Any]) -> Any:
    """Return the rows held by one result-set mapping (``results`` or ``result``)."""
    rows = result_set.get('results') or result_set.get('result') or []
    headings = result_set.get('headings')
    if headings and isinstance(rows, list) and rows and isinstance(rows[0], list):
        # Positional rows accompanied by their column headings: pair them up.
        return [dict(zip(headings, row)) for row in rows]
    return rows


def _rows_from_payload(payload: Any) -> List[Dict[str, Any]]:
    """Flatten a decoded search payload into a list of row dicts.

    Recognised shapes:

    * a mapping holding its rows under ``results`` / ``result``;
    * a list of such mappings (one per result set), whose rows are concatenated;
    * a list of lists whose first entry is the header row;
    * a list of row dicts, returned unchanged.

    Anything else yields ``[]``.
    """
    if isinstance(payload, dict):
        payload = _result_set_rows(payload)
    if not isinstance(payload, list) or not payload:
        return []

    first = payload[0]
    if isinstance(first, list):
        headers, *rows = payload
        return [dict(zip(headers, row)) for row in rows]
    if not isinstance(first, dict):
        return []

    # NOTE(review): the search endpoint appears to answer with a list of
    # result-set wrappers rather than bare rows; confirm against the REST docs.
    # Unwrap them so the wrappers themselves are not mistaken for device rows.
    if all(isinstance(item, dict) and isinstance(item.get('results'), list) for item in payload):
        flattened: List[Dict[str, Any]] = []
        for result_set in payload:
            flattened.extend(row for row in _result_set_rows(result_set) if isinstance(row, dict))
        return flattened
    return payload


def fetch_device_rows(instance: Dict[str, Any]) -> pd.DataFrame:
    """Load the device identity rows for one appliance.

    The raw CSV export at ``instance['raw_csv_path']`` is used when it exists
    (the literal text ``None`` is read as a missing value). Otherwise the
    appliance is queried with ``QUERY_TEXT`` through the data endpoint.

    Args:
        instance: Appliance context as built by ``build_appliance_contexts``.

    Returns:
        One row per result; an empty frame when the API is unavailable, the
        query fails, or the response holds no recognisable rows.
    """
    csv_path = instance.get('raw_csv_path')
    if csv_path and csv_path.exists():
        print('  Source         : raw CSV export')
        return pd.read_csv(csv_path, na_values=['None'])

    print('  Source         : API query (CSV missing)')
    try:
        data_ep = _ensure_api_data(instance)
        payload = data_ep.search({'query': QUERY_TEXT}, format='object', limit=0)
        if hasattr(payload, 'json'):
            payload = payload.json()
    except Exception as exc:
        # Treat a failed query like a failed connection: report it and let the
        # remaining appliances run instead of aborting the whole notebook.
        print('  Error          :', exc)
        return pd.DataFrame()

    rows = _rows_from_payload(payload)
    return pd.json_normalize(rows) if rows else pd.DataFrame()

def _unique_values_by_endpoint(df: pd.DataFrame, list_col: str) -> pd.Series:
    """Return, per endpoint, the sorted unique string values found in ``list_col``."""
    # Exploding twice also flattens cells that hold a list nested inside a list.
    exploded = df[[ENDPOINT_COLUMN, list_col]].explode(list_col).explode(list_col)
    exploded = exploded[exploded[list_col].notna()]
    if exploded.empty:
        # Name the index so reset_index() still produces the endpoint column.
        return pd.Series(dtype=object, name=list_col, index=pd.Index([], name=ENDPOINT_COLUMN))
    return exploded.groupby(ENDPOINT_COLUMN, dropna=True)[list_col].agg(
        lambda values: sorted(pd.unique(values.astype(str)))
    )


def prepare_device_identities(rows: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Filter the raw rows and aggregate unique IPs and names per endpoint.

    The module-level filters are applied in this order: ``DEVICE_NAME_FILTER``
    (case-insensitive literal substring of any name column), then either
    ``INCLUDE_ENDPOINTS`` or, only when that list is empty, ``ENDPOINT_PREFIX``.
    Missing values never match a filter.

    Args:
        rows: Raw rows for one appliance; columns listed in ``NAME_COLUMNS`` /
            ``IP_COLUMNS`` that are absent are added as all-missing.

    Returns:
        ``(filtered, aggregated)``. ``filtered`` is the surviving rows, with
        helper columns ``ips_all`` / ``names_all`` when non-empty.
        ``aggregated`` has one row per endpoint with the columns
        ``ENDPOINT_COLUMN``, ``'List of IPs'`` and ``'List of Names'``.
    """
    empty_cols = [ENDPOINT_COLUMN, 'List of IPs', 'List of Names']
    if rows.empty:
        return rows.copy(), pd.DataFrame(columns=empty_cols)

    df = rows.copy()

    # dict.fromkeys de-duplicates while keeping a stable order, so the added
    # columns appear in the same order on every run (a set would not guarantee it).
    for col in dict.fromkeys(NAME_COLUMNS + IP_COLUMNS + [ENDPOINT_COLUMN]):
        if col not in df.columns:
            df[col] = pd.NA

    if DEVICE_NAME_FILTER:
        needle = str(DEVICE_NAME_FILTER).lower()
        name_mask = pd.Series(False, index=df.index)
        for col in NAME_COLUMNS:
            # astype(str) turns missing values into text such as 'nan', so they
            # are excluded explicitly; regex=False keeps '.', '(' etc. literal.
            present = df[col].notna()
            text = df[col].astype(str).str.lower()
            name_mask |= present & text.str.contains(needle, regex=False, na=False)
        df = df[name_mask]

    if INCLUDE_ENDPOINTS:
        df = df[df[ENDPOINT_COLUMN].isin(INCLUDE_ENDPOINTS)]
    elif ENDPOINT_PREFIX:
        endpoints = df[ENDPOINT_COLUMN]
        prefix_hit = endpoints.astype(str).str.startswith(str(ENDPOINT_PREFIX))
        df = df[endpoints.notna() & prefix_hit]

    if df.empty:
        return df, pd.DataFrame(columns=empty_cols)

    # Filtering returns a slice of the working copy; copy again before adding
    # columns to avoid pandas' chained-assignment warning.
    df = df.copy()

    def _pooled(columns: List[str]):
        """Build a row function that concatenates the values of ``columns``."""
        return lambda row: [item for col in columns for item in _coerce_iterable(row[col])]

    df['ips_all'] = df.apply(_pooled(IP_COLUMNS), axis=1)
    df['names_all'] = df.apply(_pooled(NAME_COLUMNS), axis=1)

    ips_agg = _unique_values_by_endpoint(df, 'ips_all')
    names_agg = _unique_values_by_endpoint(df, 'names_all')

    agg_df = pd.concat([ips_agg, names_agg], axis=1).reset_index()
    agg_df = agg_df.rename(columns={'ips_all': 'List of IPs', 'names_all': 'List of Names'})

    return df, agg_df


## Initialise appliances and run query

Load each appliance's device ID data from raw CSV exports when present, falling back to live API queries only when necessary.


In [None]:
instances = initialise_instances(include_names=APPLIANCE_NAMES)

# One entry per appliance: the context plus its raw, filtered and aggregated frames.
results: List[Dict[str, Any]] = []
for inst in instances:
    print()
    print(f"Running query for {inst['name']} ({inst['target']}):")

    # Raw rows, as exported or as returned by the API, before any filtering.
    rows = fetch_device_rows(inst)
    print('  Raw rows         :', len(rows))
    if not rows.empty:
        display(rows.head(5))

    filtered_rows, agg_df = prepare_device_identities(rows)
    print('  Rows after filter:', len(filtered_rows))
    if not filtered_rows.empty:
        display(filtered_rows.head(5))

    # Tag the aggregate with its appliance so the combined view stays attributable.
    agg_df = agg_df.copy()
    agg_df.insert(0, 'Discovery Instance', inst['target'])
    if agg_df.empty:
        print('  Aggregated results are empty.')
    else:
        display(agg_df.head(5))

    results.append(dict(instance=inst, rows=rows, filtered=filtered_rows, aggregated=agg_df))


## Combined results preview

Concatenate the aggregated outputs for a quick multi-appliance view.


In [None]:
empty_cols = ['Discovery Instance', ENDPOINT_COLUMN, 'List of IPs', 'List of Names']

# Stack only the non-empty aggregates; with none, show a header-only frame instead.
combined_frames = [entry['aggregated'] for entry in results if not entry['aggregated'].empty]
if combined_frames:
    combined_results = pd.concat(combined_frames, ignore_index=True)
else:
    combined_results = pd.DataFrame(columns=empty_cols)

display(combined_results.head(10))


## Save outputs

Persist each appliance's aggregated results to its configured output directory.


In [None]:
for item in results:
    inst, out_df = item['instance'], item['aggregated']
    # Each appliance writes into its own output directory from its context.
    output_csv = inst['output_dir'] / 'device_ids.csv'
    out_df.to_csv(output_csv, index=False)
    print(f"Saved to {output_csv} ({len(out_df)} rows)")
