# Discovery Schedules (BMC Discovery)

This notebook reproduces the DisMAL `schedules` report using the raw CSV exports generated by the CLI.
It loads `raw_exports/<appliance>/scheduled_scan_ranges.csv`, `exclude_ranges.csv`, and `discoveryaccess_schedule_counts.csv`
to summarise scheduled and excluded ranges along with the number of discovery endpoints that fall within each range.


## Requirements

We rely on `pandas`, `PyYAML`, and the standard library. If they are not installed, uncomment the `%pip install` line at the top of the next cell.


In [None]:
# %pip install -q pandas pyyaml

import ast
import ipaddress
from pathlib import Path
from typing import Iterable, List, Tuple

import pandas as pd
import yaml


## Configuration (from config.yaml)

Locates the repository root, reads configuration, and determines the raw export and output directories for each appliance.


In [None]:
def load_config_params(
    start: Path,
    appliance_name: str = None,
    appliance_index: int = 0,
) -> dict:
    """Resolve config.yaml settings and working directories for one appliance.

    Args:
        start: Directory from which the search for ``config.yaml`` begins;
            the directory itself and then each of its parents are checked.
        appliance_name: ``name`` of the entry to pick from the ``appliances``
            list. Takes precedence over ``appliance_index`` when given.
        appliance_index: Position in the ``appliances`` list to use when no
            name is given. An unusable index falls back to the first entry.

    Returns:
        A dict with the keys ``repo_root``, ``config_path``, ``cfg``,
        ``selected``, ``target``, ``output_dir`` and ``raw_export_dir``.

    Raises:
        ValueError: If the config is not a mapping, the named appliance does
            not exist, or no ``target`` can be determined.
        OSError: If ``config.yaml`` cannot be opened.

    Side effects:
        Creates ``output_<sanitized target>`` under the repository root.
    """
    def _find_repo_root(path: Path) -> Path:
        # First directory (walking upwards) that contains config.yaml.
        for candidate in [path] + list(path.parents):
            if (candidate / 'config.yaml').exists():
                return candidate
        return path.parent

    def _slugify(value: str) -> str:
        # Lower-case, non-alphanumerics replaced by '_', never empty.
        return ''.join(ch if ch.isalnum() else '_' for ch in value).strip('_').lower() or 'default'

    repo_root = _find_repo_root(start)
    config_path = repo_root / 'config.yaml'

    # Read as UTF-8 explicitly so parsing does not depend on the platform locale.
    with open(config_path, 'r', encoding='utf-8') as fh:
        cfg = yaml.safe_load(fh) or {}

    if not isinstance(cfg, dict):
        # Fail with a clear message instead of an AttributeError on cfg.get below.
        raise ValueError('config.yaml must contain a mapping at the top level')

    appliances = cfg.get('appliances') or []
    selected = None
    if isinstance(appliances, list) and appliances:
        if appliance_name:
            selected = next((a for a in appliances if a.get('name') == appliance_name), None)
            if selected is None:
                raise ValueError(f"No appliance named '{appliance_name}' in config.yaml")
        else:
            try:
                selected = appliances[int(appliance_index)]
            except (IndexError, TypeError, ValueError):
                # Deliberate best-effort: an out-of-range or non-numeric index
                # falls back to the first configured appliance.
                selected = appliances[0]

    # The per-appliance target wins over the top-level one.
    target = ((selected or {}).get('target') or cfg.get('target') or '').strip()
    if not target:
        raise ValueError('config.yaml missing "target"')

    sanitized = target.replace('.', '_').replace(':', '_').replace('/', '_')
    output_dir = repo_root / f'output_{sanitized}'
    output_dir.mkdir(parents=True, exist_ok=True)

    export_name = ((selected or {}).get('name') or appliance_name or sanitized)
    raw_export_dir = repo_root / 'raw_exports' / _slugify(export_name)

    return {
        'repo_root': repo_root,
        'config_path': config_path,
        'cfg': cfg,
        'selected': selected,
        'target': target,
        'output_dir': output_dir,
        'raw_export_dir': raw_export_dir,
    }


## Initialise Instances


In [None]:
def _load_instance(preferred_name: str, fallback_index: int) -> dict:
    """Load an appliance by name, or by list position if that raises ValueError."""
    try:
        return load_config_params(Path.cwd(), appliance_name=preferred_name)
    except ValueError:
        return load_config_params(Path.cwd(), appliance_index=fallback_index)


# Production appliance: entry named 'prod', else the first configured appliance.
twprod = _load_instance('prod', 0)
print('Prod Target  :', twprod['target'])
print('Prod Exports :', twprod['raw_export_dir'])
print('Prod Output  :', twprod['output_dir'])

# Development appliance: entry named 'dev', else the second configured appliance.
twdev = _load_instance('dev', 1)
print('Dev Target   :', twdev['target'])
print('Dev Exports  :', twdev['raw_export_dir'])
print('Dev Output   :', twdev['output_dir'])


## Helper Functions


In [None]:
# Columns that load_ranges and load_schedule_counts drop from an export when present.
BASE_EXPORT_COLUMNS = ['Appliance Target', 'Appliance Name', 'Query Title']
# File names looked up inside an instance's 'raw_export_dir'.
SCHEDULE_FILE = 'scheduled_scan_ranges.csv'
EXCLUDE_FILE = 'exclude_ranges.csv'
SCHEDULE_COUNTS_FILE = 'discoveryaccess_schedule_counts.csv'
# Columns each range export is reduced to by load_ranges (kept as lists because
# they are concatenated with ['Type'] there). The exclude export has no 'Level'.
EXPECTED_SCHEDULE_COLUMNS = ['ID', 'Label', 'Scan_Range', 'Level', 'Date_Rules']
EXPECTED_EXCLUDE_COLUMNS = ['ID', 'Label', 'Scan_Range', 'Date_Rules']


def _parse_range_list(value) -> List[str]:
    """Normalise a raw ``Scan_Range`` cell into a list of range tokens.

    Accepted inputs:
        * ``None``, ``NaN`` or ``pd.NA``: returns an empty list.
        * A list or tuple: each item is stringified and stripped.
        * A string holding a Python-style list literal such as
          ``"['10.0.0.1', '10.0.0.0/24']"``.
        * A plain comma-separated string.

    Returns:
        The non-empty tokens, with surrounding whitespace removed.
    """
    # pd.NA is not a float, so it needs its own check; otherwise it would be
    # stringified to '<NA>' and counted as one (bogus) range.
    if value is None or value is pd.NA:
        return []
    if isinstance(value, float) and pd.isna(value):
        return []
    if isinstance(value, (list, tuple)):
        return [str(v).strip() for v in value if str(v).strip()]

    text = str(value).strip()
    if not text:
        return []

    if text.startswith('[') and text.endswith(']'):
        try:
            evaluated = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a valid literal (e.g. unquoted addresses); split manually below.
            evaluated = None
        if isinstance(evaluated, (list, tuple)):
            return [str(v).strip() for v in evaluated if str(v).strip()]
        text = text.strip('[]')

    # Manual split: also drop stray quotes left over from a malformed literal.
    tokens = (token.strip().strip('\'"').strip() for token in text.split(','))
    return [token for token in tokens if token]


def _parse_range_token(token: str):
    """Turn one range token into a tagged spec, or ``None`` if it is unusable.

    Recognised forms and their results:
        * contains ``/``: ``('network', ip_network)``, parsed non-strictly.
        * contains ``-``: ``('range', (low_int, high_int, version))``, with the
          bounds ordered low to high; ``None`` if the two ends differ in IP version.
        * anything else: ``('single', (address_int, version))``.

    Blank tokens and tokens that fail to parse yield ``None``.
    """
    cleaned = token.strip()
    if not cleaned:
        return None
    try:
        if '/' in cleaned:
            return ('network', ipaddress.ip_network(cleaned, strict=False))
        if '-' not in cleaned:
            address = ipaddress.ip_address(cleaned)
            return ('single', (int(address), address.version))
        low_txt, _, high_txt = cleaned.partition('-')
        low_ip = ipaddress.ip_address(low_txt.strip())
        high_ip = ipaddress.ip_address(high_txt.strip())
    except Exception:
        return None
    if low_ip.version != high_ip.version:
        return None
    # Accept reversed ranges by ordering the integer bounds.
    lower, upper = sorted((int(low_ip), int(high_ip)))
    return ('range', (lower, upper, low_ip.version))


def _build_range_specs(tokens: Iterable[str]):
    specs = []
    for token in tokens:
        parsed = _parse_range_token(token)
        if parsed:
            specs.append(parsed)
    return specs


def load_ranges(instance: dict, filename: str, expected: List[str], range_type: str) -> pd.DataFrame:
    """Load one range export and normalise it to a fixed set of columns.

    Args:
        instance: Mapping with a ``raw_export_dir`` path entry.
        filename: CSV file name inside ``raw_export_dir``.
        expected: Column names the result must contain, in order.
        range_type: Value written to the added ``Type`` column.

    Returns:
        A new DataFrame holding exactly ``expected + ['Type']``. Columns absent
        from the export are filled with ``pd.NA``. A missing or zero-byte file
        yields an empty frame with those columns.
    """
    csv_path = instance['raw_export_dir'] / filename
    if csv_path.exists():
        try:
            df = pd.read_csv(csv_path)
        except pd.errors.EmptyDataError:
            # A zero-byte export has no header row; treat it like a missing file.
            df = pd.DataFrame(columns=expected)
    else:
        df = pd.DataFrame(columns=expected)

    df = df.drop(columns=[c for c in BASE_EXPORT_COLUMNS if c in df.columns], errors='ignore')
    for col in expected:
        if col not in df.columns:
            df[col] = pd.NA
    df['Type'] = range_type
    # Return an independent copy so callers can add columns to the result safely.
    return df[list(expected) + ['Type']].copy()


def load_schedule_counts(instance: dict) -> List[Tuple[ipaddress._BaseAddress, int]]:
    """Load the per-endpoint schedule counts export.

    Args:
        instance: Mapping with a ``raw_export_dir`` path entry.

    Returns:
        ``(ip_address, count)`` pairs, one per row whose ``endpoint`` parses as
        an IP address. Counts that are missing, non-numeric or not positive are
        reported as ``1``. The list is empty when the file is missing, is
        empty, or has no ``endpoint`` column.
    """
    csv_path = instance['raw_export_dir'] / SCHEDULE_COUNTS_FILE
    if not csv_path.exists():
        return []
    try:
        df = pd.read_csv(csv_path)
    except pd.errors.EmptyDataError:
        # Zero-byte export: nothing to count.
        return []
    df = df.drop(columns=[c for c in BASE_EXPORT_COLUMNS if c in df.columns], errors='ignore')
    if 'endpoint' not in df.columns:
        return []

    if 'schedules' in df.columns:
        counts = pd.to_numeric(df['schedules'], errors='coerce').fillna(0).astype(int)
    else:
        # pd.to_numeric returns a scalar for scalar input, so a scalar default
        # cannot go through .fillna(); build the default counts directly.
        counts = [1] * len(df)

    parsed: List[Tuple[ipaddress._BaseAddress, int]] = []
    for endpoint, count in zip(df['endpoint'], counts):
        try:
            ip_obj = ipaddress.ip_address(str(endpoint).strip())
        except ValueError:
            # Hostnames, blanks and NaN cells are not matchable against ranges.
            continue
        parsed.append((ip_obj, int(count) if count > 0 else 1))
    return parsed


def endpoint_matches(ip_obj: ipaddress._BaseAddress, specs: List[Tuple[str, object]]) -> bool:
    """Report whether an address is covered by at least one range spec.

    Each spec is a ``(kind, data)`` pair where kind is ``'network'``,
    ``'range'`` or ``'single'``. A spec only matches an address of the same IP
    version; specs of any other kind never match.
    """
    wanted_version = ip_obj.version
    wanted_int = int(ip_obj)

    def _covers(kind: str, data) -> bool:
        if kind == 'network':
            return data.version == wanted_version and ip_obj in data
        if kind == 'range':
            low, high, ver = data
            return ver == wanted_version and low <= wanted_int <= high
        if kind == 'single':
            value, ver = data
            return ver == wanted_version and value == wanted_int
        return False

    return any(_covers(kind, data) for kind, data in specs)


def count_matched_endpoints(range_tokens: List[str], endpoints: List[Tuple[ipaddress._BaseAddress, int]]) -> int:
    """Add up the schedule counts of the endpoints covered by the given ranges.

    Returns ``0`` when none of the tokens parses into a usable range spec.
    """
    range_specs = _build_range_specs(range_tokens)
    if not range_specs:
        return 0
    return sum(
        schedule_count
        for address, schedule_count in endpoints
        if endpoint_matches(address, range_specs)
    )


## Load and Preview


In [None]:
def build_schedule_dataframe(instance: dict) -> pd.DataFrame:
    """Build the schedules report for one appliance instance.

    Scheduled scan ranges and exclude ranges are loaded, stacked, and turned
    into one report row each. Every row carries the number of range tokens and
    the summed schedule count of the endpoints those tokens cover. Rows are
    ordered by ``Type`` then ``Range ID``, with missing values last.

    Returns:
        The report frame; an empty frame with the report columns when neither
        export contains any rows.
    """
    report_columns = [
        'Discovery Instance', 'Name', 'Type', 'Range ID',
        'Ranges', 'Scan Level', 'When', 'Scheduled Endpoints',
    ]

    scheduled = load_ranges(instance, SCHEDULE_FILE, EXPECTED_SCHEDULE_COLUMNS, 'Scan Range')
    excluded = load_ranges(instance, EXCLUDE_FILE, EXPECTED_EXCLUDE_COLUMNS, 'Exclude Range')

    # Exclude ranges carry no scan level; add the column so both frames line up.
    if 'Level' not in excluded.columns:
        excluded['Level'] = pd.NA

    stacked = pd.concat([scheduled, excluded], ignore_index=True, sort=False)
    if stacked.empty:
        return pd.DataFrame(columns=report_columns)

    endpoint_counts = load_schedule_counts(instance)

    def _to_record(entry) -> dict:
        range_tokens = _parse_range_list(entry.get('Scan_Range'))
        token_total = len(range_tokens)
        covered = count_matched_endpoints(range_tokens, endpoint_counts)
        return {
            'Discovery Instance': instance['target'],
            'Name': entry.get('Label'),
            'Type': entry.get('Type'),
            'Range ID': entry.get('ID'),
            'Ranges': token_total,
            'Scan Level': entry.get('Level'),
            'When': entry.get('Date_Rules'),
            'Scheduled Endpoints': covered,
        }

    report = pd.DataFrame([_to_record(entry) for _, entry in stacked.iterrows()])
    ordered = report.sort_values(['Type', 'Range ID'], na_position='last')
    return ordered.reset_index(drop=True)

# Build the report for the production instance and show its first ten rows.
# NOTE(review): `display` is not imported in this file; presumably supplied by
# the notebook environment — confirm before running as a plain script.
prod_df = build_schedule_dataframe(twprod)
print(twprod['target'])
display(prod_df.head(10))

# Same for the development instance.
dev_df = build_schedule_dataframe(twdev)
print(twdev['target'])
display(dev_df.head(10))


## Save CSV


In [None]:
# File name written inside each instance's output directory.
OUTPUT_FILE = 'schedules.csv'

# Write the production report without the DataFrame index column.
prod_path = twprod['output_dir'] / OUTPUT_FILE
prod_df.to_csv(prod_path, index=False)
print(f'Saved prod schedules to {prod_path} (rows: {len(prod_df)})')

# Write the development report the same way.
dev_path = twdev['output_dir'] / OUTPUT_FILE
dev_df.to_csv(dev_path, index=False)
print(f'Saved dev schedules to {dev_path} (rows: {len(dev_df)})')


---
### Notes
- Scheduled endpoint counts are derived from `discoveryaccess_schedule_counts.csv` by matching endpoints to each range definition.
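- Update the export file-name constants (`SCHEDULE_FILE`, `EXCLUDE_FILE`, `SCHEDULE_COUNTS_FILE`) and the expected-column lists in the Helper Functions cell if additional schedule-related queries are added to the CLI workflow.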
