# Open Ports (CSV Exports)

This notebook rebuilds the DisMAL `open_ports` report from raw Discovery CSV exports.
It reads appliance entries from `config.yaml`, loops over matching export folders,
and writes the per-instance CSVs without calling the Discovery API.


## Requirements

We rely on `pandas` for data wrangling and `PyYAML` for reading `config.yaml`. Uncomment the `%pip` line below to install them if needed.


In [None]:
# %pip install -q pandas pyyaml

from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional

import pandas as pd
import yaml


## Configuration

Adjust these values to control which instances are processed and where outputs are written.


In [None]:
# Root folder containing raw_exports/<instance> subdirectories
RAW_EXPORT_ROOT = Path("../../raw_exports")

# Optional filters (set INCLUDE_INSTANCES to something like ["prod"] to limit processing)
INCLUDE_INSTANCES: Optional[Iterable[str]] = None
EXCLUDE_INSTANCES: Iterable[str] = ()

# Optional override for outputs (per appliance sub-folder is created automatically)
OUTPUT_BASE_DIR = None  # e.g., Path("../../csv_outputs")
OUTPUT_FILENAME = "open_ports.csv"


In [None]:
def find_repo_root(start: Path) -> Path:
    """Return the closest directory at or above *start* that looks like the repo root.

    A directory qualifies when it contains a ``config.yaml`` entry or a
    ``.git`` directory. The search begins at *start* itself and walks up
    through its parents; when nothing qualifies, *start* is returned as-is.
    """
    matches = (
        folder
        for folder in (start, *start.parents)
        if (folder / "config.yaml").exists() or (folder / ".git").is_dir()
    )
    # The generator is lazy, so the walk stops at the first qualifying folder.
    return next(matches, start)

# Resolve the repository root from the notebook's working directory and load config.yaml.
NOTEBOOK_DIR = Path.cwd()
REPO_ROOT = find_repo_root(NOTEBOOK_DIR)
CONFIG_PATH = REPO_ROOT / "config.yaml"

if not CONFIG_PATH.exists():
    raise FileNotFoundError(f"config.yaml not found at {CONFIG_PATH}")

with CONFIG_PATH.open("r", encoding="utf-8") as fh:
    # An empty YAML document loads as None; treat it as an empty mapping.
    cfg = yaml.safe_load(fh) or {}

# "appliances" may hold a list of mappings or a single mapping; normalise to a list.
appliance_entries = cfg.get("appliances") or []
if isinstance(appliance_entries, dict):
    appliance_entries = [appliance_entries]

# Without an appliance list, fall back to the top-level "name"/"target" keys.
if not appliance_entries:
    fallback_target = cfg.get("target")
    fallback_name = cfg.get("name") or (fallback_target or "default")
    appliance_entries = [{"name": fallback_name, "target": fallback_target}]

# Keep only entries that carry a non-blank name; target defaults to the name.
available_appliances: List[Dict[str, Any]] = []
for entry in appliance_entries:
    if not isinstance(entry, dict):
        # A malformed item (e.g. a bare string) has no .get(); report and skip it
        # instead of aborting the whole cell with an AttributeError.
        print(f"Ignoring non-mapping appliance entry: {entry!r}")
        continue
    name = str(entry.get("name") or "").strip()
    target = str(entry.get("target") or "").strip()
    if not name:
        continue
    available_appliances.append({"name": name, "target": target or name})

if not available_appliances:
    raise ValueError("No appliances with a name found in config.yaml")

# A relative RAW_EXPORT_ROOT is interpreted relative to the notebook directory.
exports_root = RAW_EXPORT_ROOT if RAW_EXPORT_ROOT.is_absolute() else (NOTEBOOK_DIR / RAW_EXPORT_ROOT).resolve()
if not exports_root.exists():
    raise FileNotFoundError(f"Raw export root not found: {exports_root}")

# Normalise the include/exclude filters to sets of stripped, non-blank names.
include_set = {str(v).strip() for v in (INCLUDE_INSTANCES or []) if str(v).strip()}
exclude_set = {str(v).strip() for v in (EXCLUDE_INSTANCES or []) if str(v).strip()}

# Map each immediate sub-directory of the exports root by its folder name.
available_dirs = {path.name: path for path in exports_root.iterdir() if path.is_dir()}

selected_appliances: List[Dict[str, Any]] = []
skipped_missing: List[str] = []
skipped_filtered: List[str] = []

# An appliance is selected when it passes both filters and has a matching export folder.
for appliance in available_appliances:
    name = appliance["name"]
    if include_set and name not in include_set:
        skipped_filtered.append(name)
        continue
    if name in exclude_set:
        skipped_filtered.append(name)
        continue
    export_dir = available_dirs.get(name)
    if not export_dir:
        skipped_missing.append(name)
        continue
    selected_appliances.append({
        "name": name,
        "target": appliance.get("target") or name,
        "export_dir": export_dir,
    })

print(f"Repo root         : {REPO_ROOT}")
print(f"Config path       : {CONFIG_PATH}")
print(f"Exports root      : {exports_root}")
print(f"Config appliances : {[a['name'] for a in available_appliances]}")
print(f"Export directories: {sorted(available_dirs)}")
print(f"Selected          : {[a['name'] for a in selected_appliances]}")
if skipped_missing:
    print(f"Missing export dirs: {skipped_missing}")
if skipped_filtered:
    print(f"Skipped by filter  : {skipped_filtered}")

if not selected_appliances:
    # Plain ASCII hyphen: the previous text contained a mis-decoded en dash.
    raise RuntimeError("No appliances selected for processing - check raw exports and filters.")


In [None]:
# Columns that drop_metadata() strips from the raw export before further processing.
METADATA_COLUMNS = ["Appliance Target", "Appliance Name", "Query Title"]

# File name that process_instance() looks up inside each appliance's export directory.
CSV_FILENAME = "open_ports_default_services.csv"

# Columns that process_instance() passes to ensure_columns(); any that the export
# lacks are added filled with None.
REQUIRED_COLUMNS = [
    "DiscoveryAccess.endpoint",
    "OpenPort.port",
    "OpenPort.protocol",
    "OpenPort.service",
    "DeviceInfo.hostname",
    "DiscoveryAccess.node_kind",
    "DiscoveryAccess.scan_starttime",
    "DiscoveryAccess.scan_endtime",
    "DiscoveryRun.label",
]

from pandas.errors import EmptyDataError


def load_csv(path: Path) -> pd.DataFrame:
    """Read *path* into a DataFrame, tolerating a missing or empty file.

    A notice is printed and an empty DataFrame is returned when the file
    does not exist or when pandas reports it as containing no data.
    """
    if path.exists():
        try:
            frame = pd.read_csv(path, low_memory=False)
        except EmptyDataError:
            print(f"Empty CSV: {path}")
        else:
            return frame
    else:
        print(f"Missing CSV: {path}")
    # Shared fallback for both the missing-file and empty-file cases.
    return pd.DataFrame()


def drop_metadata(df: pd.DataFrame) -> pd.DataFrame:
    """Return *df* without whichever ``METADATA_COLUMNS`` it contains."""
    present = [label for label in METADATA_COLUMNS if label in df.columns]
    return df.drop(columns=present, errors="ignore")


def ensure_columns(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
    """Add every label in *columns* that *df* lacks, filled with ``None``.

    The frame is modified in place and the same object is returned.
    Columns already present are left untouched.
    """
    absent = [label for label in columns if label not in df.columns]
    for label in absent:
        df[label] = None
    return df


In [None]:
def build_output_dir(target: str) -> Path:
    """Return the ``output_<target>`` directory path for an appliance target.

    Every ``.``, ``:`` and ``/`` in *target* is replaced by ``_``; a falsy
    target becomes ``unknown``. The folder sits under ``REPO_ROOT`` unless
    ``OUTPUT_BASE_DIR`` is set, in which case it sits under that location
    after user expansion and resolution. The directory is not created here.
    """
    # One translation pass swaps all three separator characters for underscores.
    folder_name = "output_" + (target or "unknown").translate(str.maketrans(".:/", "___"))
    if OUTPUT_BASE_DIR is None:
        return REPO_ROOT / folder_name
    return Path(OUTPUT_BASE_DIR).expanduser().resolve() / folder_name


def process_instance(instance: Dict[str, Any]) -> Dict[str, Any]:
    """Build and save the open-ports report for one appliance.

    Args:
        instance: Mapping with ``"name"`` and ``"export_dir"`` keys and an
            optional ``"target"`` key; the name is used when the target is
            missing or falsy.

    Returns:
        A summary dict with the keys ``instance``, ``target``,
        ``output_path``, ``rows`` and ``status`` (always ``"ok"``).

    The export CSV is loaded, stripped of metadata columns, and padded with
    any missing required columns. It is then written with a leading
    ``Discovery Instance`` column followed by the remaining columns in
    sorted order. The first 20 rows are displayed.
    """
    name = instance["name"]
    target = instance.get("target") or name
    export_dir: Path = instance["export_dir"]
    print(f"=== Processing {name} ({target}) ===")

    output_dir = build_output_dir(target)
    output_dir.mkdir(parents=True, exist_ok=True)

    df = drop_metadata(load_csv(export_dir / CSV_FILENAME))
    df = ensure_columns(df, REQUIRED_COLUMNS)

    if df.empty:
        print("No open port rows found.")
    else:
        print(f"Rows loaded: {len(df)}")

    instance_col = "Discovery Instance"
    other_cols = sorted(c for c in df.columns if c != instance_col)

    # Select the remaining columns *before* inserting the instance column:
    # DataFrame.insert() raises ValueError when the label already exists, so
    # an export that already carries "Discovery Instance" must have it
    # discarded first rather than crash the run.
    df_out = df[other_cols].copy()
    df_out.insert(0, instance_col, target)

    output_csv = output_dir / OUTPUT_FILENAME
    df_out.to_csv(output_csv, index=False)

    print(f"Saved to {output_csv}")
    display(df_out.head(20))

    return {
        "instance": name,
        "target": target,
        "output_path": output_csv,
        "rows": int(len(df_out)),
        "status": "ok",
    }


In [None]:
# Process each selected appliance in order, collecting one summary record apiece.
results: List[Dict[str, Any]] = []
for appliance in selected_appliances:
    results.append(process_instance(appliance))

# Tabulate the outcomes; output paths are shown as plain strings.
summary_df = pd.DataFrame(results)
if "output_path" in summary_df.columns:
    path_text = summary_df["output_path"].map(lambda p: None if p is None else str(p))
    summary_df["output_path"] = path_text
display(summary_df)
