# DisMAL Bulk Query Export


This notebook loads every query defined in `queries/dismal_queries.xml`, connects to each appliance configured in `config.yaml`, and writes the results as CSVs under `raw_exports/<appliance>/<query>.csv`. The Tideway Python SDK handles authentication and API calls; adjust paths or limits as needed before running all cells.


In [None]:
# %pip install -q tideway pandas pyyaml


In [None]:
import re
import xml.etree.ElementTree as ET
from pathlib import Path
from typing import Any, Dict, List, Optional

import pandas as pd
import tideway
import yaml
from IPython.display import display


In [None]:
# Folder name, joined onto the repo root, under which each appliance gets its own export directory.
RAW_EXPORT_DIR_NAME = 'raw_exports'
# Row limit handed to run_tideway_query; when None, no 'limit' argument is passed to the search call.
DEFAULT_QUERY_LIMIT: Optional[int] = None  # Set to an integer to restrict rows per query

def find_repo_root(start: Path) -> Path:
    """Return the closest directory at or above ``start`` that holds ``config.yaml``.

    ``start`` is checked first, then each of its parents in order. When none
    of them contains a ``config.yaml``, ``start`` itself is returned.
    """
    search_path = (start, *start.parents)
    hits = (folder for folder in search_path if (folder / 'config.yaml').exists())
    return next(hits, start)

def slugify(value: str) -> str:
    """Turn ``value`` into a lowercase, underscore-separated identifier.

    Every run of characters other than ASCII letters and digits becomes a
    single underscore, and leading/trailing underscores are dropped. If
    nothing is left, ``'unnamed'`` is returned.
    """
    collapsed = re.sub(r'[^A-Za-z0-9]+', '_', value)
    trimmed = collapsed.strip('_').lower()
    if not trimmed:
        return 'unnamed'
    return trimmed

def load_query_definitions(xml_path: Path) -> List[Dict[str, str]]:
    """Parse the query XML file into a list of query definitions.

    Only ``<query>`` elements that are direct children of the document root
    are read. Each result has the keys ``title`` (from the ``title``
    attribute, defaulting to ``'Untitled Query'``), ``description`` and
    ``search`` (from the child elements of those names), all stripped of
    surrounding whitespace. Queries whose ``search`` text is empty are left
    out.

    Raises:
        FileNotFoundError: If ``xml_path`` does not exist.
    """
    if not xml_path.exists():
        raise FileNotFoundError(f'Missing query file: {xml_path}')

    def child_text(node, tag):
        # Missing or empty child elements are treated as an empty string.
        return (node.findtext(tag) or '').strip()

    document_root = ET.parse(xml_path).getroot()
    definitions: List[Dict[str, str]] = []
    for node in document_root.findall('query'):
        search_text = child_text(node, 'search')
        if search_text:
            definitions.append({
                'title': (node.get('title') or 'Untitled Query').strip(),
                'description': child_text(node, 'description'),
                'search': search_text,
            })
    return definitions

def resolve_token(appliance_cfg: Dict[str, Any], global_cfg: Dict[str, Any], repo_root: Path) -> str:
    """Find the API token for one appliance.

    An inline ``token`` (appliance setting first, then the global one) wins.
    Otherwise the token is read from a file named by ``token_file`` on the
    appliance, or ``token_file`` / ``f_token`` in the global settings;
    relative file paths are resolved against ``repo_root``.

    Raises:
        ValueError: If neither an inline token nor a token file yields a
            non-empty value.
    """
    inline_value = appliance_cfg.get('token') or global_cfg.get('token') or ''
    token = str(inline_value).strip()

    if not token:
        token_file = (
            appliance_cfg.get('token_file')
            or global_cfg.get('token_file')
            or global_cfg.get('f_token')
        )
        if token_file:
            candidate = Path(token_file)
            location = candidate if candidate.is_absolute() else repo_root / candidate
            with location.open('r') as handle:
                token = handle.read().strip()

    if token:
        return token
    raise ValueError('API token not found (token or token_file)')

def build_appliance_contexts(repo_root: Path) -> List[Dict[str, Any]]:
    """Build one connection context per usable appliance in ``config.yaml``.

    Each appliance entry may override the top-level ``target``, ``token`` /
    ``token_file``, ``api_version`` and ``verify_ssl`` settings. Entries that
    are not mappings, have no target, or have no resolvable token are reported
    with ``print`` and skipped. An export directory is created on disk for
    every returned context.

    Args:
        repo_root: Directory containing ``config.yaml``. Export directories
            are created beneath ``repo_root / RAW_EXPORT_DIR_NAME``.

    Returns:
        A list of dicts with the keys ``name``, ``target``, ``token``,
        ``api_version``, ``api_number`` (the version without a leading
        ``v``/``V``), ``verify_ssl``, ``export_dir`` and ``raw_config``.

    Raises:
        FileNotFoundError: If ``config.yaml`` does not exist.
        ValueError: If the file's top level is not a mapping, or it does not
            define a non-empty list of appliances.
    """
    config_path = repo_root / 'config.yaml'
    if not config_path.exists():
        raise FileNotFoundError(f'config.yaml not found at {config_path}')
    # Read as UTF-8 explicitly so non-ASCII names do not depend on the
    # platform's default encoding.
    with open(config_path, 'r', encoding='utf-8') as fh:
        cfg = yaml.safe_load(fh) or {}
    if not isinstance(cfg, dict):
        raise ValueError('config.yaml must contain a mapping at the top level')
    appliances = cfg.get('appliances') or []
    if not appliances:
        raise ValueError('config.yaml does not define any appliances')
    if not isinstance(appliances, list):
        raise ValueError('config.yaml "appliances" must be a list of mappings')

    contexts: List[Dict[str, Any]] = []
    used_dir_names = set()
    for index, raw in enumerate(appliances):
        fallback_name = f'appliance_{index + 1}'
        if not isinstance(raw, dict):
            # A bare string or number here would otherwise fail on .get().
            print(f'Skipping {fallback_name}: entry is not a mapping')
            continue
        raw_name = raw.get('name') or fallback_name
        name = str(raw_name).strip() or fallback_name
        raw_target = raw.get('target') or cfg.get('target') or ''
        target = str(raw_target).strip()
        if not target:
            print(f'Skipping {name}: missing target')
            continue
        try:
            token = resolve_token(raw, cfg, repo_root)
        except Exception as exc:
            # Best effort: one misconfigured appliance must not block the rest.
            print(f'Skipping {name}: {exc}')
            continue

        # NOTE(review): an unquoted YAML value such as 1.10 is parsed as the
        # float 1.1 before it reaches this code; quote versions in config.yaml.
        api_version = str(raw.get('api_version') or cfg.get('api_version') or 'v1.14')
        api_number = api_version.lstrip('vV')

        # Treat an explicit null (e.g. an empty "verify_ssl:" line) as unset so
        # it cannot silently turn certificate verification off.
        verify_setting = raw.get('verify_ssl')
        if verify_setting is None:
            verify_setting = cfg.get('verify_ssl')
        verify_ssl = True if verify_setting is None else bool(verify_setting)

        # Different names can slugify to the same value; give each appliance
        # its own directory so their exports never collide.
        base_dir_name = slugify(name)
        dir_name = base_dir_name
        suffix = 1
        while dir_name in used_dir_names:
            suffix += 1
            dir_name = f'{base_dir_name}_{suffix}'
        if dir_name != base_dir_name:
            print(f'Note: {name} shares a directory name with another appliance; using {dir_name}')
        used_dir_names.add(dir_name)

        export_dir = repo_root / RAW_EXPORT_DIR_NAME / dir_name
        export_dir.mkdir(parents=True, exist_ok=True)
        contexts.append({
            'name': name,
            'target': target,
            'token': token,
            'api_version': api_version,
            'api_number': api_number,
            'verify_ssl': verify_ssl,
            'export_dir': export_dir,
            'raw_config': raw,
        })
    return contexts

def run_tideway_query(search_api: Any, query_text: str, limit: Optional[int] = DEFAULT_QUERY_LIMIT) -> List[Dict[str, Any]]:
    """Run one search through ``search_api`` and return whatever it returns.

    The query is sent as ``{'query': query_text}`` with ``format='object'``.
    A ``limit`` keyword is forwarded only when ``limit`` is not ``None``.

    NOTE(review): with ``limit=None`` the SDK's own default applies; confirm
    whether that default caps the number of rows returned.
    """
    if limit is None:
        options = {'format': 'object'}
    else:
        options = {'format': 'object', 'limit': limit}
    payload = {'query': query_text}
    return search_api.search(payload, **options)

def prepare_dataframe(rows: List[Dict[str, Any]], context: Dict[str, Any], query_title: str) -> pd.DataFrame:
    """Convert query rows to a DataFrame tagged with their origin.

    Three columns are inserted at the front, in this order:
    ``Appliance Target`` and ``Appliance Name`` (from ``context['target']``
    and ``context['name']``) and ``Query Title``. A falsy ``rows`` value
    produces a frame with those columns and no rows.
    """
    records = rows if rows else []
    table = pd.DataFrame(records)
    leading_columns = (
        ('Appliance Target', context['target']),
        ('Appliance Name', context['name']),
        ('Query Title', query_title),
    )
    for position, (label, value) in enumerate(leading_columns):
        table.insert(position, label, value)
    return table


In [None]:
# Find the checkout that holds config.yaml, starting from the working directory.
repo_root = find_repo_root(Path.cwd())
print(f'Repo root: {repo_root}')

# Load the query catalogue from the repository.
queries_path = repo_root / 'queries/dismal_queries.xml'
queries = load_query_definitions(queries_path)
print(f'Loaded {len(queries)} queries.')

# Read the appliance settings and list where each appliance will export to.
appliance_contexts = build_appliance_contexts(repo_root)
print('Configured appliances:')
for ctx in appliance_contexts:
    export_dir = ctx['export_dir']
    try:
        rel_dir = export_dir.relative_to(repo_root)
    except ValueError:
        # Show the full path when it is not located under the repo root.
        rel_dir = export_dir
    print(f"  - {ctx['name']} ({ctx['target']}) -> {rel_dir}")
if not appliance_contexts:
    raise RuntimeError('No appliances available from config.yaml')

# Open one SDK client per appliance; an appliance that fails to initialise is
# reported and left out so the others can still be queried.
clients: List[Dict[str, Any]] = []
for ctx in appliance_contexts:
    print(f"Connecting to {ctx['name']} ({ctx['target']})")
    try:
        app = tideway.appliance(
            ctx['target'],
            ctx['token'],
            api_version=ctx['api_number'],
            ssl_verify=ctx['verify_ssl'],
        )
        search = app.data()
    except Exception as exc:
        print(f'  Failed to initialise: {exc}')
        continue

    # The /api/about probe is informational only; a failure here is a warning.
    try:
        status = getattr(app.api_about, 'status_code', 'ok')
        print(f'  API reachable (status: {status})')
    except Exception as exc:
        print(f'  Warning: unable to read /api/about ({exc})')

    clients.append({'app': app, 'search': search, 'context': ctx})

if not clients:
    raise RuntimeError('Unable to connect to any appliances')


In [None]:
# Run every query on every connected appliance and write one CSV per pair.
# Each outcome is appended to execution_summary with the same keys
# (query, appliance, rows, status, path) so the summary table has a stable shape.
execution_summary: List[Dict[str, Any]] = []

# Give every query its own file stem up front. Titles that slugify to the same
# value (for example several queries using the default 'Untitled Query' title)
# would otherwise map to one file, and every query after the first would be
# skipped as "already exists". The first occurrence keeps the plain stem.
used_stems = set()
query_stems: List[str] = []
for query in queries:
    base_stem = slugify(query['title'])
    stem = base_stem
    suffix = 1
    while stem in used_stems:
        suffix += 1
        stem = f'{base_stem}_{suffix}'
    used_stems.add(stem)
    query_stems.append(stem)

for query, stem in zip(queries, query_stems):
    title = query['title']
    print(f'=== {title} ===')
    for client in clients:
        ctx = client['context']
        outfile = ctx['export_dir'] / f'{stem}.csv'
        if outfile.exists():
            # Existing exports are never refreshed; delete the file to re-run.
            print(f"Skipping {ctx['name']} ({ctx['target']}): {outfile.name} already exists")
            execution_summary.append({'query': title, 'appliance': ctx['name'], 'rows': None, 'status': 'skipped_existing', 'path': str(outfile)})
            continue
        print(f"Running on {ctx['name']} ({ctx['target']})")
        try:
            rows = run_tideway_query(client['search'], query['search'], limit=DEFAULT_QUERY_LIMIT)
        except Exception as exc:
            # Best effort: record the failure and move on to the next appliance.
            print(f'  ! Query failed: {exc}')
            execution_summary.append({'query': title, 'appliance': ctx['name'], 'rows': 0, 'status': f'error: {exc}', 'path': None})
            continue
        df = prepare_dataframe(rows, ctx, title)
        if df.empty:
            print('  No rows returned.')
        else:
            display(df.head(10))
        # Write to a sibling temp file and rename it into place, so that an
        # interrupted or failed write never leaves a partial CSV that later
        # runs would skip as if it were complete.
        partial_path = outfile.with_name(outfile.name + '.part')
        try:
            df.to_csv(partial_path, index=False)
            partial_path.replace(outfile)
        except Exception as exc:
            print(f'  ! Could not write {outfile}: {exc}')
            execution_summary.append({'query': title, 'appliance': ctx['name'], 'rows': len(df), 'status': f'error: {exc}', 'path': None})
            continue
        print(f'  Saved {len(df)} rows to {outfile}')
        execution_summary.append({'query': title, 'appliance': ctx['name'], 'rows': len(df), 'status': 'ok', 'path': str(outfile)})

print('Exports complete.')


In [None]:
# Turn the outcome records collected in execution_summary into a table and show it in the notebook.
summary_df = pd.DataFrame(execution_summary)
display(summary_df)
