# Expected Agents (BMC Discovery)

This notebook reproduces the DisMAL `expected_agents` report by:
- Gathering the software instances running on each host via the Tideway Data API.
- Determining which software appears on a significant proportion of hosts ("expected").
- Listing hosts missing any of those expected agents.

It reads connection details from `config.yaml` and writes a CSV under `output_<target>`.

## Requirements

We use `tideway` from pip (remote), plus `pandas` and `PyYAML`.
Uncomment the following to install in your environment if needed.

In [None]:
# %pip install -q tideway pandas pyyaml

import os, sys
from pathlib import Path
from typing import Any, Dict, List, Iterable, Set
import pandas as pd
import yaml


## Parameters

- `THRESHOLD`: A software title is "expected" if present on at least this fraction of hosts (default 0.5).
- `WINDOWS_ONLY`: Restrict analysis to Windows hosts (default True).

In [None]:
# Minimum fraction of hosts (0-1) on which a software name must appear
# for it to be treated as "expected".
THRESHOLD = 0.5
# When True, the query also requests each host's os_type and only rows whose
# os_type contains 'Windows' are kept.
WINDOWS_ONLY = True


## Select Appliance (optional)

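If your `config.yaml` defines multiple appliances under the `appliances:` list,
set `APPLIANCE_NAME` to one of their names (e.g., 'prod' or 'dev'), or leave it as `None` and set `APPLIANCE_INDEX` to a zero-based position in that list (0 is the first appliance).
If the index is invalid or out of range, the first appliance is used.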

In [None]:
# Name of the appliance entry to use; compared against each entry's `name`
# key in config.yaml. When set, it takes precedence over APPLIANCE_INDEX.
APPLIANCE_NAME = None   # e.g., 'prod' or 'dev'
# Zero-based position in the `appliances:` list, used only when
# APPLIANCE_NAME is not set. An invalid or out-of-range index falls back to
# the first appliance.
# NOTE(review): 1 selects the second entry; use 0 for the first - confirm
# that this default is intended.
APPLIANCE_INDEX = 1     # integer index if not using name selection


## Configuration (from config.yaml)

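Reads settings from `config.yaml`, which is located by searching the current working directory and then its parent directories (typically `../config.yaml` when the notebook runs from a subfolder). The settings include target, token/token_file,
API version, and SSL verification preference.
Saves the CSV to `output_<target>/expected_agents.csv` next to that `config.yaml`, with `.`, `:` and `/` in the target replaced by `_`.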

In [None]:
def _find_repo_root(start: Path) -> Path:
    for p in [start] + list(start.parents):
        if (p / 'config.yaml').exists():
            return p
    return start.parent

# Locate config.yaml by walking up from the current working directory, then
# parse it into `cfg`. An empty file yields an empty mapping.
repo_root = _find_repo_root(Path.cwd())
config_path = repo_root / 'config.yaml'
# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# locale encoding (which is not UTF-8 on many Windows setups).
with open(config_path, 'r', encoding='utf-8') as fh:
    cfg = yaml.safe_load(fh) or {}
# Every later lookup uses cfg.get(...); fail here with a clear message rather
# than with an AttributeError further down.
if not isinstance(cfg, dict):
    raise ValueError(f'{config_path} must contain a YAML mapping at the top level')

# Appliance selection: pick one entry from the optional `appliances:` list.
# A name match takes precedence; otherwise APPLIANCE_INDEX (zero-based) is used.
apps = cfg.get('appliances') or []
selected = None
if isinstance(apps, list) and apps:
    if APPLIANCE_NAME:
        selected = next((a for a in apps if a.get('name') == APPLIANCE_NAME), None)
        if selected is None:
            raise ValueError(f"No appliance named '{APPLIANCE_NAME}' in config.yaml")
    else:
        try:
            selected = apps[int(APPLIANCE_INDEX)]
        except (IndexError, TypeError, ValueError):
            # Keep the best-effort fallback, but say so: silently switching
            # appliances can point the report at an unintended target.
            print(
                f'Warning: APPLIANCE_INDEX={APPLIANCE_INDEX!r} is not valid for '
                f'{len(apps)} appliance(s); using the first appliance.'
            )
            selected = apps[0]

# The selected appliance's target wins over the top-level `target` key.
target = ((selected or {}).get('target') or cfg.get('target') or '').strip()
if not target:
    raise ValueError('config.yaml missing "target"')

# Token handling: an inline token is preferred; a token file is only read
# when no inline token is configured. Appliance-level keys win over
# top-level ones.
_appliance_cfg = selected or {}
token = (_appliance_cfg.get('token') or cfg.get('token') or '').strip()
token_file = (
    _appliance_cfg.get('token_file')
    or cfg.get('token_file')
    or cfg.get('f_token')
)
if token_file and not token:
    tf_path = Path(token_file)
    # Relative token paths are resolved against the folder holding config.yaml.
    tf_path = tf_path if tf_path.is_absolute() else repo_root / tf_path
    with open(tf_path, 'r') as tf:
        token = tf.read().strip()
if not token:
    raise ValueError('API token not found in config.yaml (token or token_file)')

# Version and SSL
# API version: appliance-level value, then top-level value, then 'v1.14'.
API_VERSION = str((selected or {}).get('api_version') or cfg.get('api_version') or 'v1.14')
# SSL verification: the appliance-level setting wins over the top-level one.
# A key that is absent or explicitly null at both levels means "verify";
# previously a null value was coerced to False and silently disabled
# certificate checks.
_verify_ssl = (selected or {}).get('verify_ssl')
if _verify_ssl is None:
    _verify_ssl = cfg.get('verify_ssl')
VERIFY_SSL = True if _verify_ssl is None else bool(_verify_ssl)

# Output path: derive a folder name from the target by replacing the
# characters '.', ':' and '/' with underscores, then create the folder.
sanitized = target
for _ch in ('.', ':', '/'):
    sanitized = sanitized.replace(_ch, '_')
output_dir = repo_root / f'output_{sanitized}'
output_dir.mkdir(parents=True, exist_ok=True)

# Summary of the effective settings; only whether a token is set is shown,
# never the token itself.
for _label, _value in (
    ('Base Host      :', target),
    ('API Version    :', API_VERSION),
    ('Verify SSL     :', VERIFY_SSL),
    ('Output folder  :', output_dir),
    ('Token set      :', bool(token)),
):
    print(_label, _value)

# Import tideway from pip; install if needed (respects PIP_INDEX_URL/PIP_EXTRA_INDEX_URL)
try:
    import tideway  # type: ignore
except ImportError:
    # Only a missing package justifies a pip install; any other error raised
    # while importing is left to propagate unchanged.
    import importlib
    import subprocess
    print('Installing tideway via pip...')
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tideway'])
    # Packages installed while the interpreter is running may not be seen by
    # the import system until its finder caches are invalidated.
    importlib.invalidate_caches()
    import tideway  # retry

# tideway is given the version without its leading 'v' characters
# (e.g. 'v1.14' -> '1.14').
API_VERSION_NUM = API_VERSION.lstrip('v')
# Appliance handle built from the settings loaded above.
app = tideway.appliance(target, token, api_version=API_VERSION_NUM, ssl_verify=VERIFY_SSL)
# Presumably the Data API endpoint wrapper; its `search` method is what the
# query cell calls.
twsearch = app.data()

# Quick probe (optional)
# Best effort: any failure here only prints a warning and does not stop the
# notebook.
try:
    about = app.api_about
    print('Appliance reachable:', about.status_code)
except Exception as e:
    print('Warning: failed to contact appliance /api/about:', e)


## Query

Fetch host/software pairs then assemble per-host lists.

In [None]:
# Search over SoftwareInstance nodes showing the related host's name as
# 'Host_Name' and the software name as 'Running_Software'.
# NOTE: the WINDOWS_ONLY branch below matches the literal text "show\n", so
# the line layout of this string matters.
qry_sw_host = '''
                    search SoftwareInstance
                    show
                    #RunningSoftware:HostedSoftware:Host:.name as 'Host_Name',
                    name as 'Running_Software'
                '''
# For Windows-only runs, add the host's os_type as an extra 'Host.os_type'
# column right after `show`; rows are filtered on that column after the search.
if WINDOWS_ONLY:
    qry_sw_host = qry_sw_host.replace("show\n", "show\n#RunningSoftware:HostedSoftware:Host:.os_type as 'Host.os_type',\n")

def list_table_to_json(table_like: List[List[Any]]) -> List[Dict[str, Any]]:
    """Turn a header-plus-rows table into a list of row dictionaries.

    The first element of *table_like* is the header row and every following
    element is a data row. Each data row is zipped with the headers, so a row
    shorter or longer than the header is truncated to the shorter length.

    Returns an empty list when *table_like* is empty, is not a list, or its
    first element is not a list. Rows that cannot be paired with the headers
    are skipped.
    """
    if not table_like or not isinstance(table_like, list):
        return []
    header_row, *data_rows = table_like
    if not isinstance(header_row, list):
        return []

    records: List[Dict[str, Any]] = []
    for values in data_rows:
        try:
            record = dict(zip(header_row, values))
        except Exception:
            # Skip rows that are not iterable or cannot form a mapping.
            continue
        records.append(record)
    return records

def to_rows(payload: Any) -> List[Dict[str, Any]]:
    """Normalise a search result into a list of row dictionaries.

    Accepted shapes, either passed directly or obtained by calling
    ``payload.json()`` on a non-list object that has a ``json`` attribute:

    - a list whose first element is a list: a header row followed by data rows;
    - a list whose first element is a dict: returned unchanged;
    - a dict with both ``'headings'`` and ``'results'`` keys.

    Anything else, including a ``json()`` call that raises, yields ``[]``.
    """

    def _from_parsed(data: Any) -> List[Dict[str, Any]]:
        # Shared handling for data that is already plain lists and dicts.
        if isinstance(data, list):
            first = data[0] if data else None
            if isinstance(first, list):
                return list_table_to_json(data)
            if isinstance(first, dict):
                return data
            return []
        if isinstance(data, dict) and 'results' in data and 'headings' in data:
            table_like = [data['headings']] + list(data.get('results') or [])
            return list_table_to_json(table_like)
        return []

    # Lists are used as they are; other objects exposing `json` are decoded first.
    if not isinstance(payload, list) and hasattr(payload, 'json'):
        try:
            payload = payload.json()
        except Exception:
            return []
    return _from_parsed(payload)

# Run the search and normalise the response into row dictionaries.
_search_result = twsearch.search({'query': qry_sw_host}, format='object', limit=0)
rows = to_rows(_search_result)
df = pd.DataFrame(rows)

# Keep only rows whose host os_type mentions Windows, when that column exists.
if WINDOWS_ONLY and 'Host.os_type' in df.columns:
    _is_windows = df['Host.os_type'].astype(str).str.contains('Windows', na=False)
    df = df[_is_windows]

# Reduce to the two columns used downstream and drop incomplete pairs.
if not df.empty:
    df = df[['Host_Name', 'Running_Software']].dropna()

print(f'Raw software rows: {len(df)}')
if df.empty:
    print('No records returned.')
else:
    display(df.head(20))


## Compute expected and missing agents

In [None]:
# Build per-host software lists
host_sw = (
    df.groupby('Host_Name')['Running_Software']
      .apply(lambda s: sorted(set([str(x) for x in s if str(x).strip()])))
) if not df.empty else pd.Series(dtype=object)

# Determine expected set
total_hosts = len(host_sw)
freq = {}
for softwares in host_sw.tolist():
    for name in set(softwares):
        freq[name] = freq.get(name, 0) + 1
expected = {n for n, c in freq.items() if total_hosts and (c / total_hosts) >= THRESHOLD}
print('Total hosts:', total_hosts)
print('Expected agents (threshold', THRESHOLD, '):', len(expected))
print(sorted(list(expected))[:25])

# Find missing per host
rows_out = []
for host, softwares in host_sw.items():
    missing = sorted(list(expected - set(softwares)))
    if missing:
        rows_out.append({'Host Name': host, 'Missing Agents': ';'.join(missing)})
missing_df = pd.DataFrame(rows_out)
print('Hosts missing agents:', len(missing_df))
display(missing_df.head(20)) if not missing_df.empty else print('No hosts are missing expected agents.')


## Save CSV

In [None]:
# Work on a copy of the results; if the compute cell has not been run, fall
# back to an empty frame with the expected columns.
if 'missing_df' in locals():
    df_out = missing_df.copy()
else:
    df_out = pd.DataFrame(columns=['Host Name','Missing Agents'])

# Record the appliance the data came from in the first column.
df_out.insert(0, 'Discovery Instance', target)

OUTPUT_CSV = str(output_dir / 'expected_agents.csv')
df_out.to_csv(OUTPUT_CSV, index=False)
print(f'Saved to {OUTPUT_CSV}')


---
### Notes
- Increase `THRESHOLD` to focus on agents nearly ubiquitous in your environment.
- Set `WINDOWS_ONLY = False` to include non-Windows hosts.
- The approach mirrors DisMAL's logic in `core/common_agents.py`.