# Data Validation (LEMMA + Nezha)

Checks all 98 converted scenarios for structural correctness, temporal alignment, and signal quality.

Validation tiers:
1. Structure — manifest/ground_truth JSON coherence
2. Temporal — monotonic indices, no gaps, fault_bin in range
3. Metrics — mean shift (shift_sigma) around fault
4. Logs — textual pattern/volume changes

Each scenario gets an A/B/C/D grade based on signal strength (A = strong metrics + logs, D = both weak).

LEMMA scenarios match `YYYYMMDD`, Nezha match `YYYYMMDD_nezha_N`.


In [1]:
import json
import re
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

try:
    from sentence_transformers import SentenceTransformer
    HAS_SBERT = True
    print("[INFO] sentence_transformers available - using MiniLM embeddings")
except ImportError:
    HAS_SBERT = False
    print("[INFO] sentence_transformers NOT available - falling back to TF-IDF")


  from .autonotebook import tqdm as notebook_tqdm


[INFO] sentence_transformers available - using MiniLM embeddings


In [2]:
# === CONFIGURATION ===
ROOT = Path("/root/lemm/core_multimodal_tmp")
METRICS_BASE = Path("/root/lemm/core_metrics_tmp")
LOGS_BASE = Path("/root/lemm/core_logs_tmp")

WINDOW_BEFORE = pd.Timedelta("10min")
WINDOW_AFTER = pd.Timedelta("10min")

OUTDIR = Path("./unified_rca_reports")
OUTDIR.mkdir(parents=True, exist_ok=True)
(OUTDIR / "figs").mkdir(exist_ok=True)

# Patterns to identify datasets
NEZHA_PATTERN = re.compile(r"^\d{8}_nezha_\d+$")
LEMMA_PATTERN = re.compile(r"^\d{8}$")

# Excluded scenarios (known bad data)
EXCLUDED_SCENARIOS = {
    # Corrupted data
    "20220822_nezha_22",  # adservice timestamps corrupted (year 2028)
    "20220822_nezha_23",  # adservice timestamps corrupted (year 2028)
    "20230130_nezha_15",  # ts-security-service CPU=0 in original dataset
    "20230130_nezha_16",  # ts-security-service CPU=0 in original dataset
    # No signal
    "20220822_nezha_14",  # network_delay no signal
    "20220823_nezha_21",  # network_delay no signal
    "20220823_nezha_24",  # network_delay no signal
}

print(f"ROOT: {ROOT}")
print(f"OUTDIR: {OUTDIR.absolute()}")
print(f"Excluded scenarios: {len(EXCLUDED_SCENARIOS)}")


ROOT: /root/lemm/core_multimodal_tmp
OUTDIR: /root/lemm/notebooks/03-Data-Validation/unified_rca_reports
Excluded scenarios: 7


## Dataset Detection & Scenario Discovery


In [3]:
def detect_dataset(scenario_id: str) -> str:
    """Detect if scenario is from LEMMA or Nezha."""
    if NEZHA_PATTERN.match(scenario_id):
        return "nezha"
    elif LEMMA_PATTERN.match(scenario_id):
        return "lemma"
    else:
        return "unknown"

def get_metric_files_map(dataset: str) -> dict:
    """Get the correct metric file names based on dataset."""
    if dataset == "lemma":
        return {
            "cpu": "pod_level_data_pod_cpu_usage_total.parquet",
            "memory": "pod_level_data_pod_memory_working_set.parquet",
            "rx_bytes": "pod_level_data_pod_network_rx_bytes.parquet",
            "tx_bytes": "pod_level_data_pod_network_tx_bytes.parquet",
        }
    else:  # nezha
        return {
            "cpu": "pod_cpu_usage_total.parquet",
            "memory": "pod_memory_working_set.parquet",
            "rx_bytes": "pod_network_rx_bytes.parquet",
            "tx_bytes": "pod_network_tx_bytes.parquet",
            # Extra metrics only in Nezha
            "latency_server": "pod_latency_server_p95.parquet",
            "latency_client": "pod_latency_client_p95.parquet",
            "workload": "pod_workload_ops.parquet",
        }

def get_metrics_dir(scenario_id: str, dataset: str) -> Path:
    """Get the correct metrics directory based on dataset."""
    if dataset == "lemma":
        return ROOT / scenario_id  # LEMMA has metrics in multimodal dir
    else:
        return METRICS_BASE / scenario_id  # Nezha has separate metrics dir

def get_logs_path(scenario_id: str, dataset: str) -> Path:
    """Get the correct logs file path based on dataset."""
    if dataset == "lemma":
        return None  # Will be read from manifest
    else:
        return LOGS_BASE / scenario_id / "logs_service_texts.parquet"

# Discover all scenarios
all_scenarios = []
lemma_count = 0
nezha_count = 0
excluded_count = 0

for d in sorted(ROOT.iterdir()):
    if d.is_dir():
        has_manifest = (d / "manifest.json").exists()
        has_gt = (d / "ground_truth.json").exists()
        
        if has_manifest and has_gt:
            dataset = detect_dataset(d.name)
            
            if d.name in EXCLUDED_SCENARIOS:
                excluded_count += 1
                print(f"[EXCLUDED] {d.name}")
                continue
            
            if dataset == "lemma":
                lemma_count += 1
            elif dataset == "nezha":
                nezha_count += 1
            
            all_scenarios.append({"dir": d, "id": d.name, "dataset": dataset})

print(f"\n{'='*60}")
print(f"SCENARIO DISCOVERY SUMMARY")
print(f"{'='*60}")
print(f"LEMMA scenarios:    {lemma_count}")
print(f"Nezha scenarios:    {nezha_count}")
print(f"Excluded:           {excluded_count}")
print(f"TOTAL valid:        {len(all_scenarios)}")



SCENARIO DISCOVERY SUMMARY
LEMMA scenarios:    4
Nezha scenarios:    94
Excluded:           0
TOTAL valid:        98


In [4]:
def load_json(path):
    with open(path) as f:
        return json.load(f)

def load_manifest_and_gt(scen_dir):
    manifest = load_json(scen_dir / "manifest.json")
    gt = load_json(scen_dir / "ground_truth.json")
    return manifest, gt

def reconstruct_ts_index(time_start, time_end, n_timesteps):
    start = pd.Timestamp(time_start)
    end = pd.Timestamp(time_end)
    return pd.date_range(start=start, end=end, periods=n_timesteps)

def safe_std(arr, min_samples=3):
    """Calculate std safely. Returns None if not enough data.
    Note: min_samples=3 for Nezha which has short PRE windows (~3 min = 6 points at 30s).
    """
    arr = np.asarray(arr)
    valid = arr[~np.isnan(arr)]
    if len(valid) < min_samples:
        return None
    s = np.std(valid)
    if s == 0 or s < 1e-10:
        return None
    return s

def safe_mad(arr, min_samples=3):
    """Calculate MAD (Median Absolute Deviation) safely.
    Note: min_samples=3 for Nezha which has short PRE windows.
    """
    arr = np.asarray(arr)
    valid = arr[~np.isnan(arr)]
    if len(valid) < min_samples:
        return None
    median = np.median(valid)
    mad = np.median(np.abs(valid - median))
    if mad == 0 or mad < 1e-10:
        return None
    return mad

def robust_zscore(value, median, mad):
    """Calculate robust z-score using MAD. Scale factor = 1.4826 for normal dist."""
    if mad is None or mad == 0:
        return np.nan
    return (value - median) / (1.4826 * mad)

def cap_zscore(z, cap=25.0):
    """Cap z-score to reasonable range for display."""
    if np.isnan(z):
        return np.nan
    return np.clip(z, -cap, cap)

def is_valid_metric(value, max_reasonable=1000):
    """Verify that a sigma/z-score value is reasonable."""
    if value is None or np.isnan(value):
        return False
    return abs(value) < max_reasonable

def detect_root_service_log_col(logs_df, service_name):
    """Find the log column for a service."""
    if service_name in logs_df.columns:
        return service_name
    for col in logs_df.columns:
        if service_name.lower() in col.lower():
            return col
    return None

print("✓ Utilities loaded")


✓ Utilities loaded


## TIER 1: Structural Validation


In [5]:
# Keys required in both datasets
COMMON_MANIFEST_KEYS = ["scenario_id", "time_start", "time_end", "n_timesteps", "n_pods", "n_services", "pods", "services"]
COMMON_GT_KEYS = ["fault_timestamp_raw", "fault_time_idx", "root_cause_service", "root_cause_pods"]

# Additional keys by dataset
LEMMA_MANIFEST_KEYS = ["metrics_files", "logs_texts_file"]
NEZHA_MANIFEST_KEYS = ["metrics_files", "logs_texts_file"]
NEZHA_GT_KEYS = ["fault_type"]

struct_results = []

for scen in all_scenarios:
    scen_dir = scen["dir"]
    sid = scen["id"]
    dataset = scen["dataset"]
    
    manifest, gt = load_manifest_and_gt(scen_dir)
    errors = []
    
    # Check common keys
    for k in COMMON_MANIFEST_KEYS:
        if k not in manifest:
            errors.append(f"manifest missing {k}")
    
    for k in COMMON_GT_KEYS:
        if k not in gt:
            errors.append(f"gt missing {k}")
    
    # Check dataset-specific keys
    if dataset == "nezha":
        for k in NEZHA_GT_KEYS:
            if k not in gt:
                errors.append(f"gt missing {k}")
    
    # Coherence checks
    if not errors:
        if len(manifest["pods"]) != manifest["n_pods"]:
            errors.append(f"len(pods)={len(manifest['pods'])} != n_pods={manifest['n_pods']}")
        
        if len(manifest["services"]) != manifest["n_services"]:
            errors.append(f"len(services)={len(manifest['services'])} != n_services={manifest['n_services']}")
        
        # Check fault timestamp in range
        fault_ts = pd.Timestamp(gt["fault_timestamp_raw"])
        ts_start = pd.Timestamp(manifest["time_start"])
        ts_end = pd.Timestamp(manifest["time_end"])
        if not (ts_start <= fault_ts <= ts_end):
            errors.append(f"fault_ts outside window")
        
        # Check root cause pod exists
        for rc_pod in gt["root_cause_pods"]:
            if rc_pod not in manifest["pods"]:
                errors.append(f"rc_pod {rc_pod[:20]}... not in pods")
    
    status = "OK" if not errors else "; ".join(errors[:3])
    struct_results.append({
        "scenario": sid,
        "dataset": dataset,
        "fault_type": gt.get("fault_type", "N/A"),
        "n_pods": manifest.get("n_pods", 0),
        "n_services": manifest.get("n_services", 0),
        "status": status,
        "n_errors": len(errors)
    })

struct_df = pd.DataFrame(struct_results)

# Summary
print(f"{'='*70}")
print("STRUCTURAL VALIDATION SUMMARY")
print(f"{'='*70}")

for ds in ["lemma", "nezha"]:
    ds_df = struct_df[struct_df["dataset"] == ds]
    if len(ds_df) == 0:
        continue
    n_ok = (ds_df["status"] == "OK").sum()
    print(f"\n{ds.upper()}: {n_ok}/{len(ds_df)} OK")
    if n_ok < len(ds_df):
        print("  Errors:")
        for _, row in ds_df[ds_df["status"] != "OK"].iterrows():
            print(f"    {row['scenario']}: {row['status'][:50]}")

struct_df.to_csv(OUTDIR / "struct_validation.csv", index=False)
print(f"\nSaved: {OUTDIR / 'struct_validation.csv'}")


STRUCTURAL VALIDATION SUMMARY

LEMMA: 4/4 OK

NEZHA: 94/94 OK

Saved: unified_rca_reports/struct_validation.csv


## TIER 1: Temporal Alignment


In [6]:
temporal_results = []

for scen in all_scenarios:
    scen_dir = scen["dir"]
    sid = scen["id"]
    dataset = scen["dataset"]
    
    manifest, gt = load_manifest_and_gt(scen_dir)
    errors = []
    
    try:
        ts_start = pd.Timestamp(manifest["time_start"])
        ts_end = pd.Timestamp(manifest["time_end"])
        n_ts = manifest["n_timesteps"]
        
        ts_index = reconstruct_ts_index(manifest["time_start"], manifest["time_end"], n_ts)
        
        # Check monotonicity
        if not ts_index.is_monotonic_increasing:
            errors.append("index not monotonic")
        
        # Check step consistency (should be 30s)
        diffs = ts_index.to_series().diff().dropna()
        expected_step = pd.Timedelta("30s")
        max_deviation = (diffs - expected_step).abs().max()
        if max_deviation > pd.Timedelta("1s"):
            errors.append(f"step deviation: {max_deviation}")
        
        # Check fault timestamp in window
        fault_ts = pd.Timestamp(gt["fault_timestamp_raw"])
        if not (ts_start <= fault_ts <= ts_end):
            errors.append("fault_ts outside window")
        
        # Check fault_time_idx validity
        fault_idx = gt["fault_time_idx"]
        if not (0 <= fault_idx < n_ts):
            errors.append(f"fault_idx={fault_idx} out of range [0,{n_ts})")
        
    except Exception as e:
        errors.append(f"exception: {str(e)[:30]}")
    
    status = "OK" if not errors else "; ".join(errors)
    temporal_results.append({
        "scenario": sid,
        "dataset": dataset,
        "n_timesteps": manifest.get("n_timesteps", 0),
        "duration_min": (ts_end - ts_start).total_seconds() / 60 if 'ts_end' in dir() else 0,
        "fault_idx": gt.get("fault_time_idx", -1),
        "status": status
    })

temporal_df = pd.DataFrame(temporal_results)

print(f"{'='*70}")
print("TEMPORAL ALIGNMENT SUMMARY")
print(f"{'='*70}")

for ds in ["lemma", "nezha"]:
    ds_df = temporal_df[temporal_df["dataset"] == ds]
    if len(ds_df) == 0:
        continue
    n_ok = (ds_df["status"] == "OK").sum()
    print(f"\n{ds.upper()}: {n_ok}/{len(ds_df)} OK")
    if n_ok < len(ds_df):
        print("  Errors:")
        for _, row in ds_df[ds_df["status"] != "OK"].iterrows():
            print(f"    {row['scenario']}: {row['status'][:50]}")

temporal_df.to_csv(OUTDIR / "temporal_validation.csv", index=False)
print(f"\nSaved: {OUTDIR / 'temporal_validation.csv'}")


TEMPORAL ALIGNMENT SUMMARY

LEMMA: 4/4 OK

NEZHA: 94/94 OK

Saved: unified_rca_reports/temporal_validation.csv


## TIER 2: Metric Evaluation


In [7]:
metrics_eval_results = []

for scen in all_scenarios:
    scen_dir = scen["dir"]
    sid = scen["id"]
    dataset = scen["dataset"]
    
    manifest, gt = load_manifest_and_gt(scen_dir)
    fault_type = gt.get("fault_type", "N/A")
    
    fault_ts = pd.Timestamp(gt["fault_timestamp_raw"])
    fault_bin = fault_ts.floor("30s")
    
    # Get actual scenario time bounds
    ts_start = pd.Timestamp(manifest["time_start"])
    ts_end = pd.Timestamp(manifest["time_end"])
    
    rc_pods = gt["root_cause_pods"]
    metrics_dir = get_metrics_dir(sid, dataset)
    metric_files = get_metric_files_map(dataset)
    
    # Windows based on dataset - USE AVAILABLE DATA
    if dataset == "lemma":
        # LEMMA has long windows, use fixed offsets
        pre_start = fault_bin - 2 * WINDOW_BEFORE
        pre_end = fault_bin - WINDOW_BEFORE
        near_start = fault_bin - WINDOW_BEFORE
        near_end = fault_bin + WINDOW_AFTER
    else:
        # Nezha has short windows (PRE~3min, POST~7min)
        # Use all available PRE data as baseline, excluding 30s before fault
        pre_start = ts_start
        pre_end = fault_bin - pd.Timedelta("30s")
        # NEAR = from fault_bin to end of scenario
        near_start = fault_bin
        near_end = ts_end
    
    for metric_key in ["cpu", "memory", "rx_bytes", "tx_bytes"]:
        metric_file = metric_files.get(metric_key)
        if not metric_file:
            continue
        
        metric_path = metrics_dir / metric_file
        if not metric_path.exists():
            continue
        
        try:
            df = pd.read_parquet(metric_path)
            
            for rc_pod in rc_pods:
                if rc_pod not in df.columns:
                    continue
                
                series = df[rc_pod]
                
                pre_vals = series.loc[pre_start:pre_end].values
                near_vals = series.loc[near_start:near_end].values
                
                pre_mean = np.nanmean(pre_vals) if len(pre_vals) > 0 else np.nan
                near_mean = np.nanmean(near_vals) if len(near_vals) > 0 else np.nan
                
                std = safe_std(pre_vals)
                shift_sigma = (near_mean - pre_mean) / std if std else np.nan
                
                # z-score in window
                mad = safe_mad(pre_vals)
                median = np.nanmedian(pre_vals) if len(pre_vals) > 0 else np.nan
                
                z_scores = []
                for v in near_vals:
                    if not np.isnan(v):
                        z = robust_zscore(v, median, mad)
                        if not np.isnan(z):
                            z_scores.append(abs(z))
                
                max_z = max(z_scores) if z_scores else np.nan
                
                metrics_eval_results.append({
                    "scenario": sid,
                    "dataset": dataset,
                    "fault_type": fault_type,
                    "metric": metric_key,
                    "pod": rc_pod[:40],
                    "pre_mean": pre_mean,
                    "near_mean": near_mean,
                    "shift_sigma": shift_sigma if is_valid_metric(shift_sigma) else np.nan,
                    "max_z_window": cap_zscore(max_z) if is_valid_metric(max_z) else np.nan,
                })
        
        except Exception as e:
            pass  # Skip metrics with errors

metrics_eval_df = pd.DataFrame(metrics_eval_results)
metrics_eval_df.to_csv(OUTDIR / "metrics_window_eval.csv", index=False)

print(f"{'='*70}")
print("METRIC EVALUATION SUMMARY")
print(f"{'='*70}")

for ds in ["lemma", "nezha"]:
    ds_df = metrics_eval_df[metrics_eval_df["dataset"] == ds]
    if len(ds_df) == 0:
        continue
    print(f"\n{ds.upper()}:")
    print(f"  Metrics evaluated: {len(ds_df)}")
    
    valid_shifts = ds_df["shift_sigma"].dropna()
    if len(valid_shifts) > 0:
        strong = (valid_shifts.abs() >= 3.0).sum()
        print(f"  Strong shift (|σ| ≥ 3): {strong} ({100*strong/len(valid_shifts):.0f}%)")
    
    valid_z = ds_df["max_z_window"].dropna()
    if len(valid_z) > 0:
        high_z = (valid_z >= 5.0).sum()
        print(f"  High z-score (z ≥ 5): {high_z} ({100*high_z/len(valid_z):.0f}%)")

print(f"\nSaved: {OUTDIR / 'metrics_window_eval.csv'}")


METRIC EVALUATION SUMMARY

NEZHA:
  Metrics evaluated: 376
  Strong shift (|σ| ≥ 3): 101 (30%)
  High z-score (z ≥ 5): 181 (59%)

Saved: unified_rca_reports/metrics_window_eval.csv


## TIER 2: Log Evaluation


In [8]:
STRICT_ERROR_KEYWORDS = [
    'error', 'exception', 'failed', 'failure', 'timeout', 'refused',
    'unavailable', 'unreachable', 'denied', 'rejected', 'crash',
    '500', '502', '503', '504', 'critical', 'fatal', 'panic', 'traceback'
]
EXCLUDE_PATTERNS = ['deprecated', 'health', 'kube-probe', 'liveness', 'readiness']

def count_strict_errors(texts):
    count = 0
    for t in texts:
        t_lower = str(t).lower()
        if any(ex in t_lower for ex in EXCLUDE_PATTERNS):
            continue
        if any(kw in t_lower for kw in STRICT_ERROR_KEYWORDS):
            count += 1
    return count

_LOG_NORM_PATTERNS = [
    (re.compile(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b'), '<IP>'),
    (re.compile(r'\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b', re.I), '<UUID>'),
    (re.compile(r'\b\d+\b'), '<NUM>'),
    (re.compile(r'-[a-z0-9]{5,10}-[a-z0-9]{5}'), '<POD>'),
]

def normalize_log_template(text):
    t = str(text)
    for pattern, replacement in _LOG_NORM_PATTERNS:
        t = pattern.sub(replacement, t)
    return t[:100].strip()

def count_unique_templates(texts):
    templates = set()
    for t in texts:
        template = normalize_log_template(t)
        if template:
            templates.add(template)
    return len(templates)

logs_eval_results = []

for scen in all_scenarios:
    scen_dir = scen["dir"]
    sid = scen["id"]
    dataset = scen["dataset"]
    
    manifest, gt = load_manifest_and_gt(scen_dir)
    fault_type = gt.get("fault_type", "N/A")
    
    fault_ts = pd.Timestamp(gt["fault_timestamp_raw"])
    fault_bin = fault_ts.floor("30s")
    
    # Get actual scenario time bounds
    ts_start = pd.Timestamp(manifest["time_start"])
    ts_end = pd.Timestamp(manifest["time_end"])
    
    # Windows - USE AVAILABLE DATA
    if dataset == "lemma":
        pre_start = fault_bin - 2 * WINDOW_BEFORE
        pre_end = fault_bin - WINDOW_BEFORE
        near_start = fault_bin - WINDOW_BEFORE
        near_end = fault_bin + WINDOW_AFTER
    else:
        # Nezha: use all available PRE data, excluding 30s before fault
        pre_start = ts_start
        pre_end = fault_bin - pd.Timedelta("30s")
        near_start = fault_bin
        near_end = ts_end
    
    rc_service = gt["root_cause_service"]
    
    # Get logs path
    if dataset == "lemma":
        logs_path = Path(manifest.get("logs_texts_file", ""))
    else:
        logs_path = LOGS_BASE / sid / "logs_service_texts.parquet"
    
    if not logs_path or not logs_path.exists():
        logs_eval_results.append({
            "scenario": sid, "dataset": dataset, "fault_type": fault_type,
            "status": "logs file not found"
        })
        continue
    
    logs_df = pd.read_parquet(logs_path)
    log_col = detect_root_service_log_col(logs_df, rc_service)
    
    if not log_col:
        logs_eval_results.append({
            "scenario": sid, "dataset": dataset, "fault_type": fault_type,
            "status": f"service {rc_service} not in logs"
        })
        continue
    
    pre_logs = logs_df.loc[pre_start:pre_end, log_col].dropna()
    near_logs = logs_df.loc[near_start:near_end, log_col].dropna()
    
    pre_texts = [str(t) for t in pre_logs.values if str(t).strip()]
    near_texts = [str(t) for t in near_logs.values if str(t).strip()]
    
    vol_pre = sum(len(t) for t in pre_texts)
    vol_near = sum(len(t) for t in near_texts)
    delta_vol = (vol_near - vol_pre) / max(vol_pre, 1) * 100
    
    errors_pre = count_strict_errors(pre_texts)
    errors_near = count_strict_errors(near_texts)
    
    templates_pre = count_unique_templates(pre_texts)
    templates_near = count_unique_templates(near_texts)
    new_templates = max(0, templates_near - templates_pre)
    
    has_log_change = (errors_near > errors_pre or new_templates > 2 or abs(delta_vol) > 100)
    
    logs_eval_results.append({
        "scenario": sid,
        "dataset": dataset,
        "fault_type": fault_type,
        "log_col": log_col,
        "vol_pre": vol_pre,
        "vol_near": vol_near,
        "delta_vol_pct": delta_vol,
        "errors_pre": errors_pre,
        "errors_near": errors_near,
        "templates_pre": templates_pre,
        "templates_near": templates_near,
        "new_templates": new_templates,
        "has_log_change": has_log_change,
        "status": "OK"
    })

logs_eval_df = pd.DataFrame(logs_eval_results)
logs_eval_df.to_csv(OUTDIR / "logs_window_eval.csv", index=False)

print(f"{'='*70}")
print("LOG EVALUATION SUMMARY")
print(f"{'='*70}")

for ds in ["lemma", "nezha"]:
    ds_df = logs_eval_df[logs_eval_df["dataset"] == ds]
    if len(ds_df) == 0:
        continue
    ok_df = ds_df[ds_df["status"] == "OK"]
    print(f"\n{ds.upper()}:")
    print(f"  Processed: {len(ok_df)}/{len(ds_df)}")
    if len(ok_df) > 0:
        print(f"  With log change: {ok_df['has_log_change'].sum()} ({100*ok_df['has_log_change'].mean():.0f}%)")

print(f"\nSaved: {OUTDIR / 'logs_window_eval.csv'}")


LOG EVALUATION SUMMARY

LEMMA:
  Processed: 4/4
  With log change: 4 (100%)

NEZHA:
  Processed: 94/94
  With log change: 73 (78%)

Saved: unified_rca_reports/logs_window_eval.csv


## A/B/C/D Classification


In [9]:
METRIC_THRESHOLD_SHIFT = 3.0
METRIC_THRESHOLD_Z = 5.0

quality_results = []

for scen in all_scenarios:
    scen_dir = scen["dir"]
    sid = scen["id"]
    dataset = scen["dataset"]
    
    manifest, gt = load_manifest_and_gt(scen_dir)
    fault_type = gt.get("fault_type", "N/A")
    
    # Metrics strength
    scen_metrics = metrics_eval_df[metrics_eval_df["scenario"] == sid]
    metrics_strong = False
    metrics_reasons = []
    
    if len(scen_metrics) > 0:
        valid_metrics = scen_metrics[scen_metrics["shift_sigma"].notna()]
        if len(valid_metrics) > 0:
            shifts = valid_metrics["shift_sigma"].abs()
            has_shift = (shifts >= METRIC_THRESHOLD_SHIFT).any()
            if has_shift:
                max_shift = shifts.max()
                metrics_reasons.append(f"shift={max_shift:.1f}σ")
            
            z_vals = valid_metrics["max_z_window"].dropna()
            has_z = (z_vals >= METRIC_THRESHOLD_Z).any() if len(z_vals) > 0 else False
            if has_z:
                max_z = z_vals.max()
                metrics_reasons.append(f"z={max_z:.1f}")
            
            metrics_strong = has_shift or has_z
    
    # Logs strength
    scen_logs = logs_eval_df[logs_eval_df["scenario"] == sid]
    logs_strong = False
    logs_reasons = []
    
    if len(scen_logs) > 0 and scen_logs.iloc[0].get("status") == "OK":
        row = scen_logs.iloc[0]
        logs_strong = row.get("has_log_change", False)
        
        if row.get("errors_near", 0) > row.get("errors_pre", 0):
            logs_reasons.append(f"errors:{row['errors_pre']}->{row['errors_near']}")
        if row.get("new_templates", 0) > 2:
            logs_reasons.append(f"new_tmpl={row['new_templates']}")
    
    # Classification
    if metrics_strong and logs_strong:
        classification = "A"
    elif metrics_strong and not logs_strong:
        classification = "B"
    elif not metrics_strong and logs_strong:
        classification = "C"
    else:
        classification = "D"
    
    quality_results.append({
        "scenario": sid,
        "dataset": dataset,
        "fault_type": fault_type,
        "metrics_strong": bool(metrics_strong),
        "metrics_reasons": ", ".join(metrics_reasons) if metrics_reasons else "none",
        "logs_strong": bool(logs_strong),
        "logs_reasons": ", ".join(logs_reasons) if logs_reasons else "none",
        "classification": classification
    })

quality_df = pd.DataFrame(quality_results)
quality_df.to_csv(OUTDIR / "scenario_quality.csv", index=False)

# Save as JSON
quality_dict = {r["scenario"]: r for r in quality_results}
with open(OUTDIR / "scenario_quality.json", "w") as f:
    json.dump(quality_dict, f, indent=2)

print(f"{'='*70}")
print("A/B/C/D CLASSIFICATION SUMMARY")
print(f"{'='*70}")

for ds in ["lemma", "nezha"]:
    ds_df = quality_df[quality_df["dataset"] == ds]
    if len(ds_df) == 0:
        continue
    print(f"\n{ds.upper()} ({len(ds_df)} scenarios):")
    for cls in ['A', 'B', 'C', 'D']:
        n = (ds_df['classification'] == cls).sum()
        pct = 100 * n / len(ds_df)
        print(f"  {cls}: {n:3d} ({pct:5.1f}%)")

print(f"\n{'='*70}")
print("COMBINED SUMMARY:")
print(f"{'='*70}")
print(quality_df["classification"].value_counts().sort_index())

print(f"\nSaved: {OUTDIR / 'scenario_quality.csv'} and {OUTDIR / 'scenario_quality.json'}")


A/B/C/D CLASSIFICATION SUMMARY

LEMMA (4 scenarios):
  A:   0 (  0.0%)
  B:   0 (  0.0%)
  C:   4 (100.0%)
  D:   0 (  0.0%)

NEZHA (94 scenarios):
  A:  69 ( 73.4%)
  B:  20 ( 21.3%)
  C:   4 (  4.3%)
  D:   1 (  1.1%)

COMBINED SUMMARY:
classification
A    69
B    20
C     8
D     1
Name: count, dtype: int64

Saved: unified_rca_reports/scenario_quality.csv and unified_rca_reports/scenario_quality.json


In [10]:
# Combine all results
summary_df = quality_df[["scenario", "dataset", "fault_type", "classification", "metrics_strong", "logs_strong"]].copy()

# Add validation status
summary_df = summary_df.merge(
    struct_df[["scenario", "status"]].rename(columns={"status": "struct_status"}),
    on="scenario", how="left"
)
summary_df = summary_df.merge(
    temporal_df[["scenario", "n_timesteps", "status"]].rename(columns={"status": "temporal_status"}),
    on="scenario", how="left"
)

# Add max shift
max_shifts = metrics_eval_df.groupby("scenario")["shift_sigma"].apply(
    lambda x: x.abs().max()
).reset_index()
max_shifts.columns = ["scenario", "max_shift_sigma"]
summary_df = summary_df.merge(max_shifts, on="scenario", how="left")

summary_df.to_csv(OUTDIR / "report_summary.csv", index=False)

print(f"{'='*70}")
print("FINAL UNIFIED VALIDATION SUMMARY")
print(f"{'='*70}")

print(f"\nTotal scenarios validated: {len(summary_df)}")
print(f"  LEMMA:  {len(summary_df[summary_df['dataset'] == 'lemma'])}")
print(f"  Nezha:  {len(summary_df[summary_df['dataset'] == 'nezha'])}")

print(f"\nStructural validation:")
print(f"  OK: {(summary_df['struct_status'] == 'OK').sum()}")

print(f"\nTemporal validation:")
print(f"  OK: {(summary_df['temporal_status'] == 'OK').sum()}")

print(f"\nClassification (all datasets):")
for cls in ['A', 'B', 'C', 'D']:
    n = (summary_df['classification'] == cls).sum()
    pct = 100 * n / len(summary_df)
    print(f"  {cls}: {n:3d} ({pct:5.1f}%)")

print(f"\n{'='*70}")
print("TRAINING RECOMMENDATIONS")
print(f"{'='*70}")
n_a = (summary_df['classification'] == 'A').sum()
n_b = (summary_df['classification'] == 'B').sum()
n_c = (summary_df['classification'] == 'C').sum()
n_d = (summary_df['classification'] == 'D').sum()

print(f"\n✅ Class A (ideal multimodal): {n_a} scenarios")
print(f"✅ Class B (metrics focus):    {n_b} scenarios")
print(f"✅ Class C (logs focus):       {n_c} scenarios")
print(f"⚠️  Class D (consider exclude): {n_d} scenarios")
print(f"\nTotal trainable (A+B+C): {n_a + n_b + n_c} scenarios")

print(f"\nSaved: {OUTDIR / 'report_summary.csv'}")


FINAL UNIFIED VALIDATION SUMMARY

Total scenarios validated: 98
  LEMMA:  4
  Nezha:  94

Structural validation:
  OK: 98

Temporal validation:
  OK: 98

Classification (all datasets):
  A:  69 ( 70.4%)
  B:  20 ( 20.4%)
  C:   8 (  8.2%)
  D:   1 (  1.0%)

TRAINING RECOMMENDATIONS

✅ Class A (ideal multimodal): 69 scenarios
✅ Class B (metrics focus):    20 scenarios
✅ Class C (logs focus):       8 scenarios
⚠️  Class D (consider exclude): 1 scenarios

Total trainable (A+B+C): 97 scenarios

Saved: unified_rca_reports/report_summary.csv


## Sanity Checks


In [11]:
all_ok = True
errors = []

for scen in all_scenarios:
    scen_dir = scen["dir"]
    sid = scen["id"]
    dataset = scen["dataset"]
    
    try:
        manifest, gt = load_manifest_and_gt(scen_dir)
        
        ts_index = reconstruct_ts_index(manifest["time_start"], manifest["time_end"], manifest["n_timesteps"])
        assert ts_index.is_monotonic_increasing, f"{sid}: ts_index not monotonic"
        
        fault_ts = pd.Timestamp(gt["fault_timestamp_raw"])
        fault_bin = fault_ts.floor("30s")
        window_start = fault_bin - WINDOW_BEFORE
        window_end = fault_bin + WINDOW_AFTER
        window_mask = (ts_index >= window_start) & (ts_index <= window_end)
        assert window_mask.sum() > 0, f"{sid}: window empty"
        
        rc_pods = gt["root_cause_pods"]
        metrics_dir = get_metrics_dir(sid, dataset)
        metric_files = get_metric_files_map(dataset)
        
        cpu_path = metrics_dir / metric_files["cpu"]
        if cpu_path.exists():
            cpu_df = pd.read_parquet(cpu_path)
            for pod in rc_pods:
                assert pod in cpu_df.columns, f"{sid}: rc_pod {pod[:20]}... not in metrics"
    
    except AssertionError as e:
        all_ok = False
        errors.append(str(e))

if all_ok:
    print("✅ All sanity checks passed successfully")
else:
    print(f"❌ {len(errors)} checks failed:")
    for e in errors[:10]:
        print(f"  - {e}")


✅ All sanity checks passed successfully
