# Sensitive Data (BMC Discovery)

This notebook reproduces the DisMAL `sensitive_data` report using the Discovery Data API.
It runs the same TWQL query, normalizes the results, and writes `sensitive_data.csv` to `output_<target>`.

> **NOTE:** The query may time out, due to limitations of the API, if there is a large set of records.

## Requirements
We use `requests` for HTTP, `pandas` for tabular handling, and `PyYAML` for configuration.
Uncomment below to install if needed.

In [None]:
# %pip install -q requests pandas pyyaml

import pandas as pd
import requests
import yaml
from pathlib import Path
from urllib.parse import urljoin
import os


## Select Appliance (optional)
If your `config.yaml` defines multiple appliances, set `APPLIANCE_NAME` or `APPLIANCE_INDEX`.

In [None]:
# Appliance selectors, consulted only when config.yaml has a non-empty
# `appliances` list. A non-empty APPLIANCE_NAME is matched against each
# entry's `name` key and takes precedence over APPLIANCE_INDEX.
APPLIANCE_NAME = None   # e.g., 'prod' or 'dev'
# Position within the `appliances` list, used when APPLIANCE_NAME is unset.
# An index that cannot be used falls back to the first entry.
APPLIANCE_INDEX = 0     # integer index if not using name selection


## Configuration (from config.yaml)
Locates `config.yaml` by searching the current working directory and then each of its parent directories, reads the target and token, and prepares the output directory.

In [None]:
def _find_repo_root(start: Path) -> Path:
    """Return the closest directory at or above *start* that holds ``config.yaml``.

    *start* itself is checked first, then each ancestor from nearest to
    farthest. When none of them contains the file, ``start.parent`` is
    returned instead.
    """
    candidates = (start, *start.parents)
    return next(
        (folder for folder in candidates if (folder / 'config.yaml').exists()),
        start.parent,
    )

# --- Load configuration ------------------------------------------------------
# The repository root is the nearest directory at or above the current working
# directory that contains config.yaml (see _find_repo_root).
repo_root = _find_repo_root(Path.cwd())
config_path = repo_root / 'config.yaml'
# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# default locale encoding.
with open(config_path, 'r', encoding='utf-8') as fh:
    cfg = yaml.safe_load(fh) or {}

# --- Choose an appliance -----------------------------------------------------
# When config.yaml has a non-empty `appliances` list, pick one entry: by name
# if APPLIANCE_NAME is set, otherwise by APPLIANCE_INDEX. `selected` stays
# None when there is no such list, and top-level keys are used instead.
apps = cfg.get('appliances') or []
selected = None
if isinstance(apps, list) and apps:
    if APPLIANCE_NAME:
        selected = next((a for a in apps if a.get('name') == APPLIANCE_NAME), None)
        if selected is None:
            raise ValueError(f"No appliance named '{APPLIANCE_NAME}' in config.yaml")
    else:
        try:
            selected = apps[int(APPLIANCE_INDEX)]
        except (TypeError, ValueError, OverflowError, IndexError):
            # Best-effort fallback for a non-integer or out-of-range index;
            # announce it so the run is not silently pointed at another appliance.
            print(f'APPLIANCE_INDEX {APPLIANCE_INDEX!r} is not usable; '
                  'falling back to the first appliance')
            selected = apps[0]

# --- Target and base URL -----------------------------------------------------
# Appliance-level values win over top-level ones.
target = ((selected or {}).get('target') or cfg.get('target') or '').strip()
if not target:
    raise ValueError('config.yaml missing "target"')
# A bare host name gets an https:// scheme; a full URL is used as given.
BASE_URL = target if ('://' in target) else f'https://{target}'

# --- API token ---------------------------------------------------------------
# An inline token is preferred; otherwise the token is read from a file whose
# path may be relative to the repository root.
token = (((selected or {}).get('token') or cfg.get('token') or '').strip())
token_file = (selected or {}).get('token_file') or cfg.get('token_file') or cfg.get('f_token')
if not token and token_file:
    tf_path = Path(token_file)
    if not tf_path.is_absolute():
        tf_path = repo_root / tf_path
    with open(tf_path, 'r', encoding='utf-8') as tf:
        token = tf.read().strip()
if not token:
    raise ValueError('API token not found in config.yaml (token or token_file)')

# --- API version and TLS verification ----------------------------------------
API_VERSION = str((selected or {}).get('api_version') or cfg.get('api_version') or 'v1.14')
# .get() with a default (rather than `or`) keeps an explicit `verify_ssl: false`.
VERIFY_SSL = bool((selected or {}).get('verify_ssl', cfg.get('verify_ssl', True)))

# --- Output directory --------------------------------------------------------
# Replace '.', ':' and '/' in the target so it can be used in a folder name.
sanitized = target.replace('.', '_').replace(':', '_').replace('/', '_')
output_dir = repo_root / f'output_{sanitized}'
output_dir.mkdir(parents=True, exist_ok=True)

print('Base URL      :', BASE_URL)
print('API Version   :', API_VERSION)
print('Verify SSL    :', VERIFY_SSL)
print('Output folder :', output_dir)


## Session and helpers
Creates an authenticated session and a helper that returns search results as a single table of the form `{'headings': [...], 'results': [...]}`.

In [None]:
# One shared HTTP session: every request sent through it carries the API
# token, asks for JSON, and applies the configured TLS verification setting.
session = requests.Session()
session.verify = VERIFY_SSL

# Accept a token supplied with or without a leading "Bearer " prefix.
if token.lower().startswith('bearer '):
    auth_value = token
else:
    auth_value = f'Bearer {token}'

session.headers.update({
    'Authorization': auth_value,
    'Accept': 'application/json',
})

def api_url(path: str) -> str:
    """Build the full URL for an endpoint *path* under the versioned API root.

    The root is ``<BASE_URL>/api/<API_VERSION>/``. Leading slashes are removed
    from *path* so that it is joined below that root rather than replacing it.
    """
    api_root = f"{BASE_URL.rstrip('/')}/api/{API_VERSION}/"
    relative = path.lstrip('/')
    return urljoin(api_root, relative)

def _extract_table(data):
    """Return the first dict in *data* that has both 'headings' and 'results'.

    *data* may be a single dict or a list of candidates; any other value, or
    no matching dict, yields ``None``.
    """
    candidates = data if isinstance(data, list) else [data]
    for item in candidates:
        if isinstance(item, dict) and 'headings' in item and 'results' in item:
            return item
    return None


def post_search(query: str, *, limit: int | None = None, page_size: int = 500):
    """POST *query* to the ``data/search`` endpoint and return one merged table.

    Args:
        query: Query text, sent under the ``query`` key of the JSON payload.
        limit: ``0`` pages through the results ``page_size`` rows at a time
            until a page shorter than ``page_size`` arrives. Any other integer
            is sent as the ``limit`` of a single request. ``None`` sends a
            single request with no ``limit``.
        page_size: Rows requested per page when ``limit == 0``.

    Returns:
        ``{'headings': [...], 'results': [...]}``. Headings come from the
        first page. Both lists are empty when the first request fails or its
        body contains no table. When a later page fails, the rows collected
        so far are returned and a warning is printed.

    Raises:
        ValueError: If ``limit == 0`` and ``page_size`` is not positive; the
            paging loop could otherwise never terminate.
    """
    fetch_all = (limit == 0)
    if fetch_all and page_size <= 0:
        raise ValueError('page_size must be a positive integer when limit=0')

    url = api_url('data/search')
    headings = None
    results = []
    offset = 0
    while True:
        # NOTE(review): limit/offset/format are sent in the JSON body; confirm
        # against the API reference that the endpoint honours them there.
        payload = {'query': query, 'format': 'object'}
        if fetch_all:
            payload['limit'] = page_size
            if offset:
                payload['offset'] = offset
        elif limit is not None:
            payload['limit'] = limit

        r = session.post(url, json=payload)
        if r.status_code >= 400:
            print(f'Error {r.status_code} POST {url}: {r.text[:200]}')
            table = None
        else:
            try:
                data = r.json()
            except ValueError:
                # Body was not valid JSON; treat it as "no table".
                data = None
            table = _extract_table(data)

        if table is None:
            # Keep rows from earlier pages instead of discarding them.
            if results:
                print(f'Warning: search stopped at offset {offset}; '
                      f'returning the {len(results)} rows fetched so far')
            break

        if headings is None:
            headings = table.get('headings', [])
        page_rows = table.get('results') or []
        results.extend(page_rows)
        # A short page means the result set is exhausted.
        if not fetch_all or len(page_rows) < page_size:
            break
        offset += page_size
    return {'headings': (headings or []), 'results': results}


## TWQL for Sensitive Data
Mirrors `core/queries.py:sensitive_data`.

In [None]:
# TWQL query text for the sensitive-data report.
# It is a raw string so the regex backslashes (\s, \S, \1) reach the query
# unchanged instead of being read as Python escape sequences.
#
# where: DiscoveredProcess nodes whose `args` either
#   - contain both a user-like term ('user' / 'username') and a pass-like
#     term ('pass' / 'password'), or
#   - match both the `-u <value>` and the `-p <value>` flag patterns.
# show: the 'Host' and 'Endpoint' traversals, username, cmd, args, and two
#   extract() columns holding the matched username and password fragments.
qry_sensitive = r'''
search DiscoveredProcess
where ((args has subword 'user' or args has substring 'username')
    and (args has subword 'pass' or args has substring 'password'))
  or (args matches regex '(?i)\s-u(\s+|=)\S+'
    and args matches regex '(?i)\s-p(\s+|=)\S+')
show
  #Member:List:List:ProcessList.#DiscoveryResult:DiscoveryAccessResult:DiscoveryAccess:DiscoveryAccess.#DiscoveryAccess:DiscoveryAccessResult:DiscoveryResult:DeviceInfo.hostname as 'Host',
  #Member:List:List:ProcessList.#DiscoveryResult:DiscoveryAccessResult:DiscoveryAccess:DiscoveryAccess.endpoint as 'Endpoint',
  username,
  cmd,
  args,
  (extract(args, regex '(?i)(user(name)?.*?\S+)', raw '\1')
      or extract(args, regex '(?i)(-u.*?\S+)', raw '\1')) as 'Matched Username String',
  extract(args, regex '(?i)(password.*?\S+|\s-p.*?\S+)', raw '\1') as 'Matched Password String'
'''


## Execute and save
Execute the query, build a DataFrame, insert 'Discovery Instance' as the first column, and write CSV.

In [None]:
# Run the search (limit=0 asks post_search to page through every row) and
# unpack the returned table into column headings and rows.
tbl = post_search(qry_sensitive, limit=0)
if isinstance(tbl, dict):
    heads = tbl.get('headings', [])
    rows = tbl.get('results', [])
else:
    heads = []
    rows = []
df = pd.DataFrame(rows, columns=heads) if rows else pd.DataFrame()

OUTPUT_CSV = str(output_dir / 'sensitive_data.csv')

if df.empty:
    print('No rows returned')
    # Still write a header-only file so the output always exists.
    header_only = pd.DataFrame(columns=['Discovery Instance'] + heads)
    header_only.to_csv(OUTPUT_CSV, index=False)
    print(f'No data; created empty file at {OUTPUT_CSV}')
else:
    # Tag every row with the appliance it came from, as the first column.
    df.insert(0, 'Discovery Instance', target)
    display(df.head(20))
    df.to_csv(OUTPUT_CSV, index=False)
    print(f'Saved to {OUTPUT_CSV} (rows: {len(df)})')
