# IP Analysis (BMC Discovery)

This notebook reproduces the DisMAL `ip_analysis` report logic using the Discovery Data API.
It reads configuration from `../config.yaml`, evaluates the relevant TWQL queries,
derives overlapping schedules and gaps, then writes `ip_analysis.csv` to the standard output directory.

## Requirements

We use `requests`, `pandas`, and `PyYAML`. Uncomment below to install if needed.

In [None]:
# %pip install -q requests pandas pyyaml

import pandas as pd
import requests
import yaml
from pathlib import Path
from urllib.parse import urljoin
import ipaddress
from typing import List, Dict, Any, Tuple


## Select Appliance (optional)
If `config.yaml` defines multiple `appliances:`, set `APPLIANCE_NAME` or `APPLIANCE_INDEX`.

In [None]:
# Appliance selection, consulted only when config.yaml has a non-empty
# `appliances:` list. A truthy APPLIANCE_NAME selects the entry whose `name`
# matches (and raises if none does); otherwise APPLIANCE_INDEX picks an entry
# by position, falling back to the first entry when the index is unusable.
APPLIANCE_NAME = None   # e.g., 'prod' or 'dev'
APPLIANCE_INDEX = 0     # integer index if not using name selection


## Configuration (from config.yaml)
Searches the current working directory and its parents for `config.yaml` (typically `../config.yaml`), reads the connection details, and creates the output folder.

In [None]:
def _find_repo_root(start: Path) -> Path:
    """Return the nearest directory at or above *start* that holds config.yaml.

    The search checks *start* itself first and then each of its parents in
    order. When no candidate contains a ``config.yaml``, the parent of
    *start* is returned as a fallback.
    """
    candidates = (start, *start.parents)
    return next(
        (folder for folder in candidates if folder.joinpath('config.yaml').exists()),
        start.parent,
    )

# Locate config.yaml by walking up from the working directory, then load it.
repo_root = _find_repo_root(Path.cwd())
config_path = repo_root / 'config.yaml'
# Explicit encoding so parsing does not depend on the platform's locale default.
with open(config_path, 'r', encoding='utf-8') as fh:
    cfg = yaml.safe_load(fh) or {}

# Optional multi-appliance support: pick one entry from `appliances:` by name
# or by index. Per-appliance values take precedence over top-level ones below.
apps = cfg.get('appliances') or []
selected = None
if isinstance(apps, list) and apps:
    if APPLIANCE_NAME:
        selected = next((a for a in apps if a.get('name') == APPLIANCE_NAME), None)
        if selected is None:
            raise ValueError(f"No appliance named '{APPLIANCE_NAME}' in config.yaml")
    else:
        try:
            selected = apps[int(APPLIANCE_INDEX)]
        except (IndexError, TypeError, ValueError):
            # Best-effort: an unusable index falls back to the first appliance.
            selected = apps[0]

target = ((selected or {}).get('target') or cfg.get('target') or '').strip()
if not target:
    raise ValueError('config.yaml missing "target"')
# A bare host name is assumed to be served over HTTPS.
BASE_URL = target if ('://' in target) else f'https://{target}'

# The token may be given inline or through a file; a relative token file path
# is resolved against the directory that holds config.yaml.
token = (((selected or {}).get('token') or cfg.get('token') or '').strip())
token_file = (selected or {}).get('token_file') or cfg.get('token_file') or cfg.get('f_token')
if not token and token_file:
    tf_path = Path(token_file)
    if not tf_path.is_absolute():
        tf_path = repo_root / tf_path
    with open(tf_path, 'r', encoding='utf-8') as tf:
        token = tf.read().strip()
if not token:
    raise ValueError('API token not found in config.yaml (token or token_file)')

API_VERSION = str((selected or {}).get('api_version') or cfg.get('api_version') or 'v1.14')

# TLS verification stays ON unless it is explicitly switched off. A key that
# is present but empty (YAML null) is treated as "not set" rather than as
# False, and quoted strings such as "false" / "no" are honoured.
_verify_raw = (selected or {}).get('verify_ssl')
if _verify_raw is None:
    _verify_raw = cfg.get('verify_ssl')
if _verify_raw is None:
    VERIFY_SSL = True
elif isinstance(_verify_raw, str):
    VERIFY_SSL = _verify_raw.strip().lower() not in ('', '0', 'false', 'no', 'off')
else:
    VERIFY_SSL = bool(_verify_raw)

# The output folder name is derived from the target with '.', ':' and '/'
# replaced by underscores.
sanitized = target.replace('.', '_').replace(':', '_').replace('/', '_')
output_dir = repo_root / f'output_{sanitized}'
output_dir.mkdir(parents=True, exist_ok=True)

print('Base URL      :', BASE_URL)
print('API Version   :', API_VERSION)
print('Verify SSL    :', VERIFY_SSL)
print('Output folder :', output_dir)


## Session and helpers
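`post_search` posts a query to `data/search` requesting the `'object'` format and returns `{'headings': [...], 'results': [...]}`; when called with `limit=0` it pages through the results and concatenates the rows.
`table_to_dicts` converts that table into a list of row dicts keyed by heading.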

In [None]:
# One shared HTTP session carries the auth header and TLS setting for every call.
session = requests.Session()
# Accept tokens given with or without a leading "Bearer " prefix.
if token.lower().startswith('bearer '):
    auth_value = token
else:
    auth_value = f'Bearer {token}'
session.headers.update({'Authorization': auth_value, 'Accept': 'application/json'})
session.verify = VERIFY_SSL

def api_url(path: str) -> str:
    """Return the absolute URL for *path* under ``<BASE_URL>/api/<API_VERSION>/``.

    Leading slashes on *path* are dropped so it is always resolved relative
    to the versioned API root instead of the server root.
    """
    api_root = f"{BASE_URL.rstrip('/')}/api/{API_VERSION}/"
    relative = path.lstrip('/')
    return urljoin(api_root, relative)

def post_search(query: str, *, limit: int | None = None, page_size: int = 500) -> Dict[str, Any]:
    """POST *query* to the ``data/search`` endpoint and collect the result table.

    Args:
        query: Query text sent as the ``query`` field of the JSON payload.
        limit: ``None`` sends no ``limit``; a non-zero value is sent as-is with
            a single request; ``0`` means "fetch everything" and pages through
            the results ``page_size`` rows at a time.
        page_size: Rows requested per page when ``limit == 0``.

    Returns:
        ``{'headings': [...], 'results': [...]}``. Headings are taken from the
        first page; rows from all pages are concatenated. If a request fails
        (HTTP status >= 400) or a response contains no result table, paging
        stops and whatever was collected so far is returned (empty when the
        very first request fails).

    Raises:
        ValueError: If ``limit == 0`` and ``page_size`` is less than 1, which
            would otherwise never satisfy the end-of-data check.
    """
    fetch_all = (limit == 0)
    if fetch_all and page_size < 1:
        raise ValueError('page_size must be a positive integer when limit=0')

    url = api_url('data/search')
    headings = None
    results: List[Any] = []
    offset = 0
    while True:
        payload: Dict[str, Any] = {'query': query, 'format': 'object'}
        if fetch_all:
            payload['limit'] = page_size
            if offset:
                payload['offset'] = offset
        elif limit is not None:
            payload['limit'] = limit

        r = session.post(url, json=payload)
        if r.status_code >= 400:
            print(f'Error {r.status_code} POST {url}: {r.text[:200]}')
            if results:
                # Keep the pages that were already fetched instead of discarding them.
                print(f'Returning {len(results)} row(s) fetched before the failure')
            break
        try:
            data = r.json()
        except ValueError:
            # Body was not valid JSON; handled below as "no table".
            data = None

        # The response is either a single table dict or a list containing one.
        # NOTE(review): 'headings' is treated as optional here because it is
        # unconfirmed whether the 'object' format always includes it.
        candidates = data if isinstance(data, list) else [data]
        table = next(
            (x for x in candidates if isinstance(x, dict) and 'results' in x),
            None,
        )
        if table is None:
            break

        if headings is None:
            headings = table.get('headings') or []
        page_rows = table.get('results') or []
        results.extend(page_rows)
        # A short (or empty) page marks the end of the data.
        if not fetch_all or len(page_rows) < page_size:
            break
        offset += page_size
    return {'headings': (headings or []), 'results': results}

def table_to_dicts(table: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Convert a ``{'headings': [...], 'results': [...]}`` table into row dicts.

    Rows given as sequences are zipped with the headings. Rows that are
    already dicts are copied through unchanged, because zipping headings with
    a dict would pair them with the row's *keys* and lose the values.
    Missing or empty ``headings`` / ``results`` yield an empty list.
    """
    heads = table.get('headings') or []
    rows = table.get('results') or []
    return [
        dict(row) if isinstance(row, dict) else dict(zip(heads, row))
        for row in rows
    ]


## TWQL Queries
These mirror `core/queries.py` entries used by the CLI implementation.

In [None]:
# ScanRange nodes whose scan_type is 'Scheduled', exposed through the columns
# ID, Label, Scan_Range, Level and Date_Rules.
qry_scanrange = '''
search ScanRange where scan_type = 'Scheduled'
show
    range_id as 'ID',
    label as 'Label',
    (range_strings or provider) as 'Scan_Range',
    scan_level as 'Level',
    recurrenceDescription(schedule) as 'Date_Rules'
'''

# ExcludeRange nodes searched in '_System'. The Label and Scan_Range column
# names match qry_scanrange so both result sets can be processed together.
qry_excludes = '''
search in '_System' ExcludeRange
show
    exrange_id as 'ID',
    name as 'Label',
    range_strings as 'Scan_Range',
    recurrenceDescription(schedule) as 'Date_Rules'
'''

# One row per DiscoveryAccess endpoint with 'schedules', a count of related
# DiscoveryRun nodes whose scan_type is 'Scheduled'.
qry_ip_schedules = '''
search DiscoveryAccess
show endpoint,
     nodecount(traverse Member:List:List:DiscoveryRun where scan_type = 'Scheduled') as 'schedules'
process with unique()
'''

# Walks from Host to DiscoveredNetworkConnection nodes and shows remote_ip_addr
# as 'Unscanned Host IP Address'.
# NOTE(review): presumably connectionsToUnseen keeps only remote addresses
# that have not been scanned - confirm against the TWQL function reference.
qry_connections_unscanned = '''
search Host
traverse InferredElement:Inference:Associate:DiscoveryAccess
traverse DiscoveryAccess:DiscoveryAccessResult:DiscoveryResult:NetworkConnectionList
traverse List:List:Member:DiscoveredNetworkConnection
order by remote_ip_addr
show remote_ip_addr as 'Unscanned Host IP Address'
processwith connectionsToUnseen
'''


## Build report data
- Collect scan ranges and excludes; prepare matchers.
- For each endpoint, find the labels of the ranges that contain it; record endpoints covered by more than one range (overlaps) and endpoints that have no scheduled runs.
- Add 'seen but unscanned' IPs.

In [None]:
def _range_item(token: str):
    """Return a ``('range', (start, end, version))`` item for 'A-B', else None."""
    first, _, last = token.partition('-')
    last = last.strip()
    try:
        start = ipaddress.ip_address(first.strip())
    except ValueError:
        return None
    try:
        end = ipaddress.ip_address(last)
    except ValueError:
        # IPv4 shorthand such as '192.168.1.100-105': the right-hand side
        # replaces only the last octet of the start address.
        if start.version != 4 or not (last.isascii() and last.isdigit()):
            return None
        last_octet = int(last)
        if last_octet > 255:
            return None
        return ('range', (int(start), (int(start) & ~0xFF) | last_octet, 4))
    if end.version != start.version:
        # A range mixing IPv4 and IPv6 bounds is meaningless.
        return None
    return ('range', (int(start), int(end), start.version))


def _wildcard_item(token: str):
    """Return a range item for IPv4 forms like '192.168.1.*', else None."""
    octets = token.split('.')
    if len(octets) != 4 or '*' not in octets:
        return None
    first_star = octets.index('*')
    # Only trailing wildcards describe one contiguous block of addresses.
    if any(octet != '*' for octet in octets[first_star:]):
        return None
    wild_count = 4 - first_star
    try:
        low = int(ipaddress.ip_address('.'.join(octets[:first_star] + ['0'] * wild_count)))
    except ValueError:
        return None
    high = low | ((1 << (8 * wild_count)) - 1)
    return ('range', (low, high, 4))


def _parse_scan_tokens(tokens: List[str]):
    """Parse range strings into matcher items.

    Recognised token forms (blank and ``None`` tokens are ignored):
      * ``0.0.0.0/0`` or ``::/0`` - sets the returned wildcard flag;
      * ``A-B`` with two full addresses of the same IP version;
      * ``a.b.c.d-e`` - IPv4 last-octet shorthand;
      * CIDR networks (host bits allowed, parsed with ``strict=False``);
      * ``a.b.c.*`` style IPv4 wildcards (trailing ``*`` octets only);
      * single IPv4 / IPv6 addresses.
    Anything else is skipped silently.

    Returns:
        ``(wildcard, items)`` where *items* is a list, in input order, of
        ``('range', (start_int, end_int, version))``,
        ``('network', (ip_network, version))`` or
        ``('single', (ip_int, version))`` tuples.
    """
    items = []
    wildcard = False
    for raw in tokens:
        token = (raw or '').strip()
        if not token:
            continue
        if token in ('0.0.0.0/0', '::/0'):
            wildcard = True
            continue

        # Try the forms from most to least specific; each failure falls
        # through to the next interpretation.
        item = _range_item(token) if '-' in token else None
        if item is None and '/' in token:
            try:
                net = ipaddress.ip_network(token, strict=False)
            except ValueError:
                pass
            else:
                item = ('network', (net, net.version))
        if item is None and '*' in token:
            item = _wildcard_item(token)
        if item is None:
            try:
                ip = ipaddress.ip_address(token)
            except ValueError:
                # Unknown token (e.g. a host name); skip it.
                continue
            item = ('single', (int(ip), ip.version))
        items.append(item)
    return wildcard, items

def _endpoint_in_items(ep: str, wildcard: bool, items) -> bool:
    """Tell whether endpoint *ep* is covered by a parsed set of range items.

    A true *wildcard* matches every endpoint, including ones that are not
    valid IP addresses. Otherwise *ep* must parse as an IP address and fall
    inside at least one 'network', 'range' or 'single' item of the same IP
    version; items of any other kind are ignored.
    """
    if wildcard:
        return True
    try:
        addr = ipaddress.ip_address(ep)
    except Exception:
        return False
    number = int(addr)
    family = addr.version

    def _covers(kind, spec) -> bool:
        # Each item kind carries its own payload layout.
        if kind == 'network':
            net, net_family = spec
            return family == net_family and addr in net
        if kind == 'range':
            low, high, range_family = spec
            return family == range_family and low <= number <= high
        if kind == 'single':
            value, single_family = spec
            return family == single_family and number == value
        return False

    return any(_covers(kind, spec) for kind, spec in items)

# --- Scan ranges and exclude ranges -----------------------------------------
t_scan = post_search(qry_scanrange, limit=0)
t_exc = post_search(qry_excludes, limit=0)
rows_scan = table_to_dicts(t_scan)
rows_exc = table_to_dicts(t_exc)

# One (label, wildcard, items) matcher per range row that parsed to something.
label_matchers: List[Tuple[str, bool, list]] = []
for range_row in [*rows_scan, *rows_exc]:
    raw_ranges = range_row.get('Scan_Range')
    # Normalise to a list of strings; each may itself hold a comma-separated list.
    if isinstance(raw_ranges, str):
        chunks = [raw_ranges]
    elif isinstance(raw_ranges, list):
        chunks = [chunk for chunk in raw_ranges if isinstance(chunk, str)]
    else:
        chunks = []
    tokens = [part.strip() for chunk in chunks for part in chunk.split(',') if part.strip()]
    wildcard, items = _parse_scan_tokens(tokens)
    if not (items or wildcard):
        continue
    label_matchers.append((range_row.get('Label') or '', wildcard, items))

# --- Endpoints and how many scheduled runs reference them -------------------
t_sched = post_search(qry_ip_schedules, limit=0)
rows_sched = table_to_dicts(t_sched)

data_rows = []  # each entry: [ip, list-of-labels or message]
for sched_row in rows_sched:
    ep = sched_row.get('endpoint')
    if not ep:
        continue
    raw_count = sched_row.get('schedules')
    try:
        sched_n = 0 if raw_count is None else int(raw_count)
    except Exception:
        sched_n = 0
    if sched_n == 0:
        data_rows.append([ep, 'Endpoint has previous DiscoveryAccess, but not currently scheduled.'])
        continue
    # Labels of every range that contains this endpoint; more than one is an overlap.
    labels = sorted(
        name
        for name, wildcard, items in label_matchers
        if _endpoint_in_items(ep, wildcard, items)
    )
    if len(labels) > 1:
        data_rows.append([ep, labels])

# --- Addresses seen in connections but never scanned ------------------------
t_unseen = post_search(qry_connections_unscanned, limit=0)
rows_unseen = table_to_dicts(t_unseen)
existing_ips = {entry[0] for entry in data_rows}
for conn_row in rows_unseen:
    ip = conn_row.get('Unscanned Host IP Address')
    if not ip or ip in existing_ips:
        continue
    existing_ips.add(ip)
    data_rows.append([ip, 'Seen but unscanned.'])

# --- Assemble the table and write the CSV -----------------------------------
df = pd.DataFrame(data_rows, columns=['IP Address', 'Scan Schedules'])
df.insert(0, 'Discovery Instance', target)
OUTPUT_CSV = str(output_dir.joinpath('ip_analysis.csv'))
df.to_csv(OUTPUT_CSV, index=False)
print(f'Saved to {OUTPUT_CSV} (rows: {len(df)})')
