# Nezha – Data Transformation

Converts raw Nezha (OnlineBoutique + TrainTicket) into the unified multimodal format:
metrics aligned to 30s bins, logs aggregated at service level, manifest/ground_truth JSONs.

Produces up to 19 metric parquets (4 core + 3 Nezha-only extras + 12 additional), service-level logs, and per-scenario JSONs.

| Date | App | Scenarios | Fault types |
|------|-----|-----------|-------------|
| 2022-08-22 | OnlineBoutique | 24 | cpu_consumed, cpu_contention, network_delay, exception, return |
| 2022-08-23 | OnlineBoutique | 32 | cpu_consumed, cpu_contention, network_delay, exception, return |
| 2023-01-29 | TrainTicket | 28 | cpu_contention, exception, return |
| 2023-01-30 | TrainTicket | 17 | cpu_contention, network_delay |

In [10]:
# === CONFIGURATION ===
from pathlib import Path
import json
import pandas as pd
import numpy as np

# Filesystem locations
NEZHA_DIR = Path("/root/lemm/Nezha")  # root of the raw Nezha dataset
OUTPUT_BASE = Path("/root/lemm")  # root under which the outputs are written

# ============================================================================
# ALL NEZHA DATES
# ============================================================================
# Collection dates, grouped by benchmark application.
_ONLINE_BOUTIQUE_DATES = ["2022-08-22", "2022-08-23"]
_TRAIN_TICKET_DATES = ["2023-01-29", "2023-01-30"]
ALL_DATES = _ONLINE_BOUTIQUE_DATES + _TRAIN_TICKET_DATES

# Known problematic scenarios (skipped by the pipeline), grouped by reason.
_BAD_DATA_SCENARIOS = {
    "20220822_nezha_22",  # adservice timestamps corrupted (year 2028)
    "20220822_nezha_23",  # adservice timestamps corrupted (year 2028)
    "20230130_nezha_15",  # ts-security-service CPU=0 in original dataset
    "20230130_nezha_16",  # ts-security-service CPU=0 in original dataset
}
_POOR_SIGNAL_SCENARIOS = {  # "Class D"
    "20220822_nezha_14",  # network_delay no signal
    "20220823_nezha_21",  # network_delay no signal
    "20220823_nezha_24",  # network_delay no signal
}
EXCLUDED_SCENARIOS = _BAD_DATA_SCENARIOS | _POOR_SIGNAL_SCENARIOS

# Time window around each fault injection
PRE_FAULT_MINUTES = 3  # minutes kept before the injection time
POST_FAULT_MINUTES = 7  # minutes kept after the injection time
BIN_SECONDS = 30  # width of one time bin, in seconds

# ============================================================================
# COMPLETE METRIC MAPPING: ALL 19 METRICS AVAILABLE IN NEZHA
# ============================================================================
# Every metric is used (not just CORE + EXTRA) so that anomalies visible only
# in the previously ignored metrics can be detected.
#
# Each group below maps a Nezha CSV column name to the internal metric name.
# The groups are the single source of truth: the complete map, the file map
# and the metric order are all derived from them, so they cannot drift apart.

# CORE (4 metrics - for LEMMA compatibility)
CORE_COLUMN_MAP = {
    "CpuUsageRate(%)": "cpu_usage_rate",
    "MemoryUsage(Mi)": "memory_usage",
    "NetworkReceiveBytes": "network_rx_bytes",
    "NetworkTransmitBytes": "network_tx_bytes",
}

# EXTRA (3 metrics - Nezha-specific)
EXTRA_COLUMN_MAP = {
    "PodServerLatencyP95(s)": "latency_server_p95",
    "PodClientLatencyP95(s)": "latency_client_p95",
    "PodWorkload(Ops)": "workload_ops",
}

# ADDITIONAL (12 metrics that were not used before - may contain anomalies)
ADDITIONAL_COLUMN_MAP = {
    "CpuUsage(m)": "cpu_usage_m",
    "MemoryUsageRate(%)": "memory_usage_rate",
    "PodSuccessRate(%)": "pod_success_rate",  # IMPORTANT: success rate for fault detection
    "SyscallRead": "syscall_read",
    "SyscallWrite": "syscall_write",
    "NodeCpuUsageRate(%)": "node_cpu_usage_rate",  # Node-level metrics
    "NodeMemoryUsageRate(%)": "node_memory_usage_rate",
    "NodeNetworkReceiveBytes": "node_network_rx_bytes",
    "PodClientLatencyP90(s)": "latency_client_p90",  # Other latency percentiles
    "PodClientLatencyP99(s)": "latency_client_p99",
    "PodServerLatencyP90(s)": "latency_server_p90",
    "PodServerLatencyP99(s)": "latency_server_p99",
}

# Complete mapping: Nezha column -> internal name (CORE, then EXTRA, then ADDITIONAL)
COMPLETE_COLUMN_MAP = {
    **CORE_COLUMN_MAP,
    **EXTRA_COLUMN_MAP,
    **ADDITIONAL_COLUMN_MAP,
}

# Internal name -> parquet file name, for all metrics
ALL_FILE_MAP = {
    internal_name: f"pod_{internal_name}.parquet"
    for internal_name in COMPLETE_COLUMN_MAP.values()
}

# Metric order: CORE first (for LEMMA compatibility), then EXTRA, then ADDITIONAL.
# Dicts preserve insertion order, so this follows the group order above.
METRIC_ORDER = list(COMPLETE_COLUMN_MAP.values())

# Backward compatibility: file map restricted to the CORE metrics
CORE_FILE_MAP = {
    internal_name: ALL_FILE_MAP[internal_name]
    for internal_name in CORE_COLUMN_MAP.values()
}

# Verify paths: report whether each expected location exists. The tag follows
# the actual result, so a missing path is flagged "[WARN]" instead of "[OK]".
fault_list_path = NEZHA_DIR / "rca_data" / ALL_DATES[0] / f"{ALL_DATES[0]}-fault_list.json"
for label, location in (
    ("Nezha dir", NEZHA_DIR),
    ("Fault list", fault_list_path),
    ("Output base", OUTPUT_BASE),
):
    found = location.exists()
    print(f"{'[OK]' if found else '[WARN]'} {label} exists: {found}")


[OK] Nezha dir exists: True
[OK] Fault list exists: True
[OK] Output base exists: True


## 1. Load Functions

Uses `inject_time` (string, parsed as UTC) instead of `inject_timestamp` (epoch) because of timezone inconsistencies in some Nezha dates.

In [11]:
# === LOAD FUNCTIONS ===
def load_and_flatten_faults(date: str) -> list[dict]:
    """Read ``{date}-fault_list.json`` and return its faults as one flat list.

    The JSON file is a mapping whose values are lists of fault records; the
    keys (hours, judging by the original variable names) are discarded.

    NOTE: ``inject_timestamp`` in the result is recomputed from the
    ``inject_time`` string interpreted as UTC rather than taken from the
    file's epoch field, because some Nezha datasets (Train-Ticket) carry an
    epoch with an incorrect timezone.

    Args:
        date: Date folder name, e.g. ``"2022-08-22"``.

    Returns:
        One dict per fault with keys ``date``, ``inject_time``,
        ``inject_timestamp`` (int epoch seconds), ``inject_pod`` and
        ``inject_type``, sorted by ``inject_timestamp`` ascending.
    """
    fault_file = NEZHA_DIR / "rca_data" / date / f"{date}-fault_list.json"
    grouped = json.loads(fault_file.read_text())

    flat = [
        {
            "date": date,
            "inject_time": entry["inject_time"],
            # Epoch seconds derived from the UTC-interpreted time string
            "inject_timestamp": int(pd.Timestamp(entry["inject_time"], tz="UTC").timestamp()),
            "inject_pod": entry["inject_pod"],
            "inject_type": entry["inject_type"],
        }
        for entries in grouped.values()
        for entry in entries
    ]
    flat.sort(key=lambda item: item["inject_timestamp"])
    return flat

# Print, for every date, the number of faults and the fault types present
print("="*70, "NEZHA DATASET SUMMARY", "="*70, sep="\n")
for date in ALL_DATES:
    path = NEZHA_DIR / "rca_data" / date / f"{date}-fault_list.json"
    if not path.exists():
        # No fault list for this date: report it and move on
        print(f"   {date}: [WARN] Not found")
        continue
    faults = load_and_flatten_faults(date)
    fault_types = {f["inject_type"] for f in faults}
    print(f"   {date}: {len(faults):2} faults | Types: {', '.join(sorted(fault_types))}")


NEZHA DATASET SUMMARY
   2022-08-22: 24 faults | Types: cpu_consumed, cpu_contention, exception, network_delay, return
   2022-08-23: 32 faults | Types: cpu_consumed, cpu_contention, exception, network_delay, return
   2023-01-29: 28 faults | Types: cpu_contention, exception, return
   2023-01-30: 17 faults | Types: cpu_contention, network_delay


In [12]:
# === VALIDATION AND BINNING FUNCTIONS ===

def get_available_pods(date: str) -> set[str]:
    """Return the names of the pods that have a metrics file for ``date``.

    Args:
        date: Date folder name, e.g. ``"2022-08-22"``.

    Returns:
        The set of ``<pod>`` names taken from every ``<pod>_metric.csv`` file
        in the date's ``metric`` directory.
    """
    metric_dir = NEZHA_DIR / "rca_data" / date / "metric"
    # removesuffix drops only the trailing marker. str.replace would also
    # remove "_metric" from inside a pod name, producing a name that no longer
    # matches its "<pod>_metric.csv" file.
    return {f.stem.removesuffix("_metric") for f in metric_dir.glob("*_metric.csv")}

def get_available_log_minutes(date: str) -> set[str]:
    """Return the minute keys that have a log file for ``date``.

    Args:
        date: Date folder name, e.g. ``"2022-08-22"``.

    Returns:
        The set of ``HH_MM`` strings taken from every ``HH_MM_log.csv`` file
        in the date's ``log`` directory.
    """
    log_dir = NEZHA_DIR / "rca_data" / date / "log"
    # removesuffix drops only the trailing "_log" marker of the file stem
    return {f.stem.removesuffix("_log") for f in log_dir.glob("*_log.csv")}

def calculate_bins(fault: dict) -> tuple:
    """Build the time bins around a fault and locate the fault's bin.

    The window runs from ``PRE_FAULT_MINUTES`` before the injection (floored
    to the bin size) to ``POST_FAULT_MINUTES`` after it (ceiled to the bin
    size), in steps of ``BIN_SECONDS`` seconds, all in UTC.

    Args:
        fault: Fault record; only ``fault["inject_timestamp"]`` (epoch
            seconds) is read.

    Returns:
        ``(bins, fault_bin_ts, fault_idx, is_valid)`` where ``bins`` is the
        ``DatetimeIndex`` of bin starts, ``fault_bin_ts`` is the injection time
        floored to the bin size, ``fault_idx`` is its position in ``bins`` and
        ``is_valid`` tells whether ``bins[fault_idx]`` really is that bin.
    """
    freq = f"{BIN_SECONDS}s"
    injected_at = pd.Timestamp(fault["inject_timestamp"], unit="s", tz="UTC")

    window_start = (injected_at - pd.Timedelta(minutes=PRE_FAULT_MINUTES)).floor(freq)
    window_end = (injected_at + pd.Timedelta(minutes=POST_FAULT_MINUTES)).ceil(freq)
    bins = pd.date_range(start=window_start, end=window_end, freq=freq, tz="UTC")

    fault_bin_ts = injected_at.floor(freq)
    # argmax yields 0 when no bin matches, hence the explicit check below
    fault_idx = int((bins == fault_bin_ts).argmax())

    is_valid = False
    if 0 <= fault_idx < len(bins):
        is_valid = bins[fault_idx] == fault_bin_ts

    return bins, fault_bin_ts, fault_idx, is_valid

def check_log_coverage(bins: pd.DatetimeIndex, available_minutes: set[str]) -> tuple[int, int]:
    """Count how many minutes of the window have logs available.

    Args:
        bins: Bin start timestamps of the window.
        available_minutes: ``HH_MM`` keys for which a log file exists.

    Returns:
        ``(found, total)``: the number of distinct window minutes present in
        ``available_minutes`` and the number of distinct window minutes.
    """
    window_minutes = {f"{stamp.hour:02d}_{stamp.minute:02d}" for stamp in bins}
    covered = window_minutes & available_minutes
    return len(covered), len(window_minutes)

# Print, for every date, how many pods have metrics and how many minutes have logs
print("="*70, "STATISTICS PER DATE", "="*70, sep="\n")
for date in ALL_DATES:
    pods, log_mins = get_available_pods(date), get_available_log_minutes(date)
    print(f"   {date}: {len(pods):2} pods with metrics | {len(log_mins):3} log minutes")


STATISTICS PER DATE
   2022-08-22: 10 pods with metrics |  72 log minutes
   2022-08-23: 10 pods with metrics |  96 log minutes
   2023-01-29: 46 pods with metrics |  84 log minutes
   2023-01-30: 46 pods with metrics |  51 log minutes


## Processing Functions

Includes log cleaning identical to LEMMA (`clean_log_content_vectorized`) and template counting for full consistency.

In [13]:


import json as json_lib
import re
from collections import Counter

# ============================================================================
# LOG CLEANING (IDENTICAL TO LEMMA)
# ============================================================================

# Whitelist of common ports to preserve (not replaced by <NUM>)
_WHITELIST_PORTS = r'80|443|8080|8443|3000|3306|5432|5631|5672|5900|6379|7001|7070|8081|9000|9001|9090|9200|9300|9411|9999'


def clean_log_content_vectorized(series: pd.Series) -> pd.Series:
    """
    Normalize log messages with vectorized pandas string operations.
    IDENTICAL TO LEMMA - the rules and their order must not change.

    Missing values become empty strings, then these substitutions are applied
    in order:
    - ANSI colour codes are removed
    - Timestamps → <TS>
    - UUIDs → <UUID>
    - Hex strings of 24+ characters → <HEX>
    - IPv4 addresses (with optional port) → <HOST>
    - Pod/deployment random suffixes → -<RAND>
    - Numeric IDs in URL paths → :id
    - Numbers of 4+ digits → <NUM>, except ports, HTTP status codes and
      version numbers, which are shielded by temporary markers

    The result is finally lowercased (so the placeholders come out lowercase
    too) and whitespace runs are collapsed to a single space and trimmed.
    """
    # Ordered (pattern, replacement) rules; later rules rely on earlier ones
    rules = [
        # Remove ANSI escape codes
        (r'\x1b\[[0-9;]*m', ''),
        # Timestamps
        (r'\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(?:\.\d+)?Z?', '<TS>'),
        # UUIDs
        (r'[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}', '<UUID>'),
        # Long hex strings
        (r'\b[a-fA-F0-9]{24,}\b', '<HEX>'),
        # IP addresses (with optional port)
        (r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(:\d{2,5})?', '<HOST>'),
        # Pod/deployment suffixes (random hashes)
        (r'(?<=[a-zA-Z0-9])-[a-z0-9]{6,12}-[a-z0-9]{4,6}\b', '-<RAND>'),
        # IDs in URL paths
        (r'(?<=/)\d{4,}(?=/|$|\s|")', ':id'),
        # Shield ports behind a temporary marker
        (rf'(?i)(\bport\b|listen(?:ing)?(?:\s+on)?|bind(?:ing)?(?:\s+to)?|socket)[:\s]+(\d{{2,5}})', r'\1 __PRT_\2__'),
        (rf':({_WHITELIST_PORTS})\b', r':__PRT_\1__'),
        # Shield HTTP status codes
        (r'(?i)(https?|status|code)[:\s]*([1-5]\d\d)\b', r'\1 __HTTP_\2__'),
        # Shield version numbers (an optional leading "v" is outside the
        # captured group, so it is not restored)
        (r'\bv?(\d+(?:\.\d+){1,3})\b', r'__VER_\1__'),
        # Replace large numbers
        (r'\b\d{4,}\b', '<NUM>'),
        # Strip the temporary markers, keeping the shielded values
        (r'__PRT_(\d+)__', r'\1'),
        (r'__HTTP_(\d+)__', r'\1'),
        (r'__VER_([^_]+)__', r'\1'),
    ]

    cleaned = series.fillna('')
    for pattern, replacement in rules:
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)

    # Lowercase and normalize whitespace
    cleaned = cleaned.str.lower()
    return cleaned.str.replace(r'\s+', ' ', regex=True).str.strip()


# ============================================================================
# METRIC LOADING
# ============================================================================

def load_metrics_for_pod(date: str, pod: str, bins: pd.DatetimeIndex) -> dict[str, pd.Series]:
    """Load one pod's metrics and align them to the given time bins.

    Every column listed in ``COMPLETE_COLUMN_MAP`` that is present in the
    pod's CSV is loaded (not only CORE + EXTRA).

    Args:
        date: Date folder name, e.g. ``"2022-08-22"``.
        pod: Pod name; the file read is ``<pod>_metric.csv``.
        bins: Bin start timestamps the series are aligned to.

    Returns:
        Internal metric name -> series indexed by ``bins``. Empty dict when
        the file does not exist or has no ``TimeStamp`` column.
    """
    csv_path = NEZHA_DIR / "rca_data" / date / "metric" / f"{pod}_metric.csv"
    if not csv_path.exists():
        return {}

    raw = pd.read_csv(csv_path)
    if "TimeStamp" not in raw.columns:
        return {}

    # TimeStamp is read as epoch seconds (UTC)
    raw["ts"] = pd.to_datetime(raw["TimeStamp"], unit="s", utc=True)
    raw = raw.set_index("ts").sort_index()

    # Keep only the window, padded by two minutes on each side
    pad = pd.Timedelta(minutes=2)
    inside = (raw.index >= bins[0] - pad) & (raw.index <= bins[-1] + pad)
    raw = raw[inside]

    # UNIFIED WITH LEMMA: ffill(limit=1) and NO fillna(0).
    # limit=1 because, per the original note, Nezha has data every 60s, so
    # resampling to 30s leaves only one empty bin between samples.
    # NaN is kept where there is no data (the dataloader is expected to mask
    # and then nan_to_num).
    freq = f"{BIN_SECONDS}s"
    return {
        internal_name: raw[nezha_col].resample(freq).mean().reindex(bins).ffill(limit=1)
        for nezha_col, internal_name in COMPLETE_COLUMN_MAP.items()
        if nezha_col in raw.columns
    }


# ============================================================================
# LOG LOADING & EXTRACTION
# ============================================================================

def load_logs_for_window(date: str, bins: pd.DatetimeIndex) -> pd.DataFrame:
    """Load the log rows that fall inside the window from ``HH_MM_log.csv`` files.

    For every bin, the file of the bin's own minute and the file of the
    preceding minute are read (when they exist). File names carry only the
    hour and minute, so all files are taken from the ``date`` folder.

    Loading is best effort: a file that cannot be read or parsed is skipped
    with a ``[WARN]`` message instead of aborting the scenario.

    Args:
        date: Date folder name, e.g. ``"2022-08-22"``.
        bins: Bin start timestamps of the window.

    Returns:
        The rows whose ``bin_start`` lies within ``[bins[0], bins[-1]]``, with
        columns ``Timestamp``, ``PodName``, ``Log``, ``ts`` (parsed UTC
        timestamp) and ``bin_start`` (``ts`` floored to the bin size). When no
        file could be loaded, an empty frame with only the three raw columns.
    """
    log_dir = NEZHA_DIR / "rca_data" / date / "log"

    # Identify needed minutes: each bin's minute plus the one before it
    one_minute = pd.Timedelta(minutes=1)
    needed = {
        f"{moment.hour:02d}_{moment.minute:02d}"
        for b in bins
        for moment in (b, b - one_minute)
    }

    dfs = []
    for hh_mm in sorted(needed):
        path = log_dir / f"{hh_mm}_log.csv"
        if not path.exists():
            continue
        try:
            df = pd.read_csv(path, usecols=["Timestamp", "PodName", "Log"],
                             encoding='utf-8', encoding_errors='ignore')
        except (OSError, ValueError) as exc:
            # ValueError also covers pandas' ParserError / EmptyDataError and
            # a missing expected column; report the file instead of hiding it.
            print(f"   [WARN] Skipping unreadable log file {path.name}: {exc}")
            continue
        dfs.append(df)

    if not dfs:
        return pd.DataFrame(columns=["Timestamp", "PodName", "Log"])

    logs = pd.concat(dfs, ignore_index=True)
    # Unparseable timestamps become NaT and are dropped
    logs["ts"] = pd.to_datetime(logs["Timestamp"], utc=True, errors='coerce')
    logs = logs.dropna(subset=["ts"])
    logs["bin_start"] = logs["ts"].dt.floor(f"{BIN_SECONDS}s")

    return logs[(logs["bin_start"] >= bins[0]) & (logs["bin_start"] <= bins[-1])]


def extract_log_message(log_json: str) -> str:
    """Extract the message text from Nezha's nested JSON log record.

    The record is a JSON object whose ``"log"`` field is a string. That string
    is either itself a JSON object, in which case its ``"message"`` field (or
    ``"msg"`` as a fallback) is returned, or plain text, in which case it is
    returned stripped.

    Args:
        log_json: Raw content of the ``Log`` column.

    Returns:
        The message as a string. ``""`` when the input is not a string, is not
        valid JSON, is not a JSON object, or has a non-string ``"log"`` field.
        Always a ``str``, so downstream string operations never see other types.
    """
    if not isinstance(log_json, str):
        return ""

    try:
        outer = json_lib.loads(log_json)
    except (ValueError, RecursionError):  # JSONDecodeError is a ValueError
        return ""
    if not isinstance(outer, dict):
        return ""

    inner_raw = outer.get("log", "")
    if not isinstance(inner_raw, str):
        return ""

    try:
        inner = json_lib.loads(inner_raw)
    except (ValueError, RecursionError):
        # Not JSON: the "log" field is the plain-text line itself
        return inner_raw.strip()
    if not isinstance(inner, dict):
        # Valid JSON but not an object (e.g. a bare number): treat as text
        return inner_raw.strip()

    message = inner.get("message", inner.get("msg", ""))
    if message is None:
        return ""
    # Coerce non-string payloads (numbers, nested objects) to text
    return message if isinstance(message, str) else str(message)


# ============================================================================
# LOG AGGREGATION (IDÉNTICO A LEMMA - con limpieza + template counting)
# ============================================================================

def aggregate_logs_to_services(logs_df: pd.DataFrame, bins: pd.DatetimeIndex, services: list[str]) -> pd.DataFrame:
    """
    Aggregate logs per SERVICE and time bin (IDENTICAL TO LEMMA).

    Pipeline:
    1. Extract the message from Nezha's nested JSON
    2. Clean it with clean_log_content_vectorized (normalizes timestamps, IPs, etc.)
    3. Derive the service from the pod name
    4. Count occurrences of each cleaned template per (service, bin_start)
    5. Format the 10 most common templates of each cell as "template:x{count}",
       joined with " | "

    Rows whose service is not in ``services``, whose bin is not in ``bins`` or
    whose cleaned text is empty are ignored.

    Returns:
        DataFrame indexed by ``bins`` with one column per service; a cell is
        the formatted template summary, or "" when there were no logs.
    """
    # Layout: counters[service][bin] = Counter({template: count})
    counters = {svc: {b: Counter() for b in bins} for svc in services}

    if not logs_df.empty:
        work = logs_df.copy()
        # 1. Extract the message from the JSON
        work["text"] = work["Log"].apply(extract_log_message)
        # 2. Clean with clean_log_content_vectorized (same as LEMMA)
        work["clean_text"] = clean_log_content_vectorized(work["text"])
        # 3. Extract service from pod name
        work["service"] = work["PodName"].apply(extract_service_from_pod)

        # 4. Count templates per (service, bin_start)
        for svc, bin_start, template in zip(work["service"], work["bin_start"], work["clean_text"]):
            if svc in counters and bin_start in counters[svc] and template:
                counters[svc][bin_start][template] += 1

    # 5. Build the "template:x{count}" cells (same as LEMMA); an empty Counter
    #    joins to the empty string
    columns_data = {
        svc: [
            " | ".join(f"{t}:x{c}" for t, c in counters[svc][b].most_common(10))
            for b in bins
        ]
        for svc in services
    }

    return pd.DataFrame(columns_data, index=bins)


# ============================================================================
# UTILITIES
# ============================================================================

def generate_scenario_id(date: str, idx: int) -> str:
    """Build the scenario ID ``YYYYMMDD_nezha_{idx}`` from a ``YYYY-MM-DD`` date."""
    return f"{date.replace('-', '')}_nezha_{idx}"


def extract_service_from_pod(pod_name: str) -> str:
    """Derive the service name from a pod name (e.g. frontend-579b9bff58-t2dbm → frontend).

    The last two dash-separated parts (replicaset hash + pod hash) are
    dropped. Names with fewer than three parts are returned unchanged.
    """
    pieces = pod_name.rsplit("-", 2)
    if len(pieces) < 3:
        return pod_name
    # Everything before the last two parts is the deployment/service name
    return pieces[0]


def build_service_mappings(pods: list[str]) -> tuple[list[str], dict, list[int]]:
    """Build the service list and the pod/service index mappings.

    Returns:
        ``(services, service_to_idx, pod_to_service_idx)``: the sorted unique
        service names, a service name -> position mapping, and, for each pod
        in ``pods`` order, the position of its service.
    """
    service_of_pod = list(map(extract_service_from_pod, pods))
    services = sorted(set(service_of_pod))
    service_to_idx = dict(zip(services, range(len(services))))
    pod_to_service_idx = [service_to_idx[name] for name in service_of_pod]
    return services, service_to_idx, pod_to_service_idx


# Confirmation message printed once every helper in this cell has been defined
print("[OK] Processing functions defined (log cleaning identical to LEMMA)")


[OK] Processing functions defined (log cleaning identical to LEMMA)


## 4. Execute Pipeline

Loops over all dates, loads fault lists, and for each fault: calculates 30s bins, aligns metrics, aggregates logs by service, and saves everything (up to 19 metric parquets + logs + JSONs).

### Excluded Scenarios

Known problematic scenarios are automatically skipped:
- Corrupted timestamps (adservice 2028)
- Missing metrics (CPU=0)
- No detectable signal (Class D)

In [14]:
# === PROCESS ALL SCENARIOS FROM ALL DATES ===

def process_single_scenario(fault: dict, idx: int, pods: list[str]) -> dict:
    """Process one fault and write all of its scenario files.

    Written under ``OUTPUT_BASE``: one parquet per available metric
    (``core_metrics_tmp/<id>``), the service-level log texts
    (``core_logs_tmp/<id>``) and ``ground_truth.json`` + ``manifest.json``
    (``core_multimodal_tmp/<id>``).

    Args:
        fault: Fault record with ``date``, ``inject_timestamp``,
            ``inject_pod`` and ``inject_type``.
        idx: Position of the fault within its date; used in the scenario ID.
        pods: Pod names of the date; their order defines the pod indices.

    Returns:
        For an excluded scenario: ``scenario_id``, ``status="EXCLUDED"`` and
        ``reason``. Otherwise: ``scenario_id``, ``status="OK"``,
        ``fault_type``, ``inject_pod``, ``n_bins`` and ``log_coverage`` (the
        fraction of (bin, service) cells with non-empty log text).

    Raises:
        ValueError: If ``fault["inject_pod"]`` is not in ``pods``. This is
            checked before anything is written, so no partial scenario
            directory is left behind.
    """
    scenario_id = generate_scenario_id(fault["date"], idx)

    # Check if in excluded list
    if scenario_id in EXCLUDED_SCENARIOS:
        return {"scenario_id": scenario_id, "status": "EXCLUDED", "reason": "Known bad data"}

    # 1. Resolve the root cause first: fail fast, with a clear message, before
    #    any file is written
    root_cause_pod = fault["inject_pod"]
    if root_cause_pod not in pods:
        raise ValueError(
            f"{scenario_id}: inject_pod {root_cause_pod!r} is not among the "
            f"pods with metrics for {fault['date']}"
        )
    root_cause_pod_idx = pods.index(root_cause_pod)

    # Mappings (also needed for the service-level logs)
    services, service_to_idx, pod_to_service_idx = build_service_mappings(pods)
    root_cause_service = extract_service_from_pod(root_cause_pod)
    root_cause_service_idx = service_to_idx[root_cause_service]

    # 2. Compute the bins
    bins, fault_bin_ts, fault_idx, _ = calculate_bins(fault)

    # 3. Load metrics (ALL available metrics - up to 19 total)
    all_metric_names = list(COMPLETE_COLUMN_MAP.values())
    metrics = {name: {} for name in all_metric_names}
    for pod in pods:
        pod_metrics = load_metrics_for_pod(fault["date"], pod, bins)
        for metric_name, series in pod_metrics.items():
            metrics[metric_name][pod] = series

    # UNIFIED WITH LEMMA: keep NaN for pods without data
    # (the dataloader is expected to use mask = ~isnan() and then nan_to_num())
    metrics_dfs = {}
    for metric_name, pod_dict in metrics.items():
        if pod_dict:
            df = pd.DataFrame(pod_dict)
            df = df.reindex(columns=pods)  # NO fillna(0) - keep NaN
            metrics_dfs[metric_name] = df

    # 4. Load and aggregate logs by SERVICE (consistent with LEMMA)
    logs_raw = load_logs_for_window(fault["date"], bins)
    logs_texts = aggregate_logs_to_services(logs_raw, bins, services)

    # 5. Create the output directories
    metrics_dir = OUTPUT_BASE / "core_metrics_tmp" / scenario_id
    logs_dir = OUTPUT_BASE / "core_logs_tmp" / scenario_id
    multimodal_dir = OUTPUT_BASE / "core_multimodal_tmp" / scenario_id
    metrics_dir.mkdir(parents=True, exist_ok=True)
    logs_dir.mkdir(parents=True, exist_ok=True)
    multimodal_dir.mkdir(parents=True, exist_ok=True)

    # 6. Save metrics (ALL available metrics - up to 19 files)
    for metric_name, df in metrics_dfs.items():
        if metric_name in ALL_FILE_MAP:
            df.to_parquet(metrics_dir / ALL_FILE_MAP[metric_name])

    # 7. Save logs by SERVICE (consistent with LEMMA)
    logs_texts.to_parquet(logs_dir / "logs_service_texts.parquet")

    # 8. Build the pod mapping
    pod_to_idx = {p: i for i, p in enumerate(pods)}

    # 9. Save ground_truth.json
    ground_truth = {
        "scenario_id": scenario_id,
        "fault_timestamp_raw": str(pd.Timestamp(fault["inject_timestamp"], unit="s", tz="UTC")),
        "fault_bin": str(fault_bin_ts),
        "fault_time_idx": fault_idx,
        "root_cause_service": root_cause_service,
        "root_cause_service_idx": root_cause_service_idx,
        # NOTE(review): this drops only the last dash-separated part, so for
        # "frontend-579b9bff58-t2dbm" it yields "frontend-579b9bff58" -
        # confirm that this is the intended "deployment" value.
        "root_cause_deployment": root_cause_pod.rsplit("-", 1)[0],
        "root_cause_pods": [root_cause_pod],
        "root_cause_pod_indices": [root_cause_pod_idx],
        "pod_to_idx": pod_to_idx,
        "fault_type": fault["inject_type"],
    }
    with open(multimodal_dir / "ground_truth.json", "w") as f:
        json.dump(ground_truth, f, indent=2)

    # 10. Save manifest.json (unified schema - ALL metrics)
    # Only include metrics that were actually loaded
    available_metrics = [k for k in METRIC_ORDER if k in metrics_dfs]
    manifest = {
        "scenario_id": scenario_id,
        "dataset": "nezha",
        "time_start": str(bins[0]),
        "time_end": str(bins[-1]),
        "n_timesteps": len(bins),
        "n_pods": len(pods),
        "n_services": len(services),
        "n_metrics": len(available_metrics),
        "window_T": f"{BIN_SECONDS}s",
        "metrics_files": [str(metrics_dir / ALL_FILE_MAP[k]) for k in available_metrics if k in ALL_FILE_MAP],
        "logs_texts_file": str(logs_dir / "logs_service_texts.parquet"),
        "pods": pods,
        "services": services,
        "service_to_idx": service_to_idx,
        "pod_to_service_idx": pod_to_service_idx,
    }
    with open(multimodal_dir / "manifest.json", "w") as f:
        json.dump(manifest, f, indent=2)

    # Log coverage: share of (bin, service) cells holding any log text
    log_coverage = (logs_texts.astype(str).apply(lambda x: x.str.len()) > 0).values.mean()

    return {
        "scenario_id": scenario_id,
        "status": "OK",
        "fault_type": fault["inject_type"],
        "inject_pod": fault["inject_pod"],
        "n_bins": len(bins),
        "log_coverage": log_coverage,
    }


def process_all_dates():
    """Run the transformation for every date in ``ALL_DATES``.

    For each date whose fault list exists, every fault is processed with
    ``process_single_scenario`` and one progress line is printed per scenario,
    followed by a per-date summary. A final summary is printed at the end.

    Returns:
        The list of result dicts of all scenarios, in processing order.
    """

    def tally(rows, status):
        # Number of result dicts carrying the given status
        return sum(1 for row in rows if row.get("status") == status)

    collected = []

    print("="*80)
    print("PROCESSING ALL NEZHA DATASETS")
    print("="*80)
    print(f"   Dates: {ALL_DATES}")
    print(f"   Excluded: {len(EXCLUDED_SCENARIOS)} known problematic scenarios")
    print()

    for date in ALL_DATES:
        print(f"\n{'='*60}")
        print(f"Processing {date}...")
        print(f"{'='*60}")

        # Skip dates without a fault list
        fault_list_path = NEZHA_DIR / "rca_data" / date / f"{date}-fault_list.json"
        if not fault_list_path.exists():
            print(f"   [WARN] Not found: {fault_list_path}")
            continue

        faults = load_and_flatten_faults(date)
        pods = sorted(get_available_pods(date))
        print(f"   Faults: {len(faults)} | Pods: {len(pods)}")

        # One scenario per fault; the index within the date goes into the ID
        per_date = []
        for idx, fault in enumerate(faults):
            outcome = process_single_scenario(fault, idx, pods)
            per_date.append(outcome)

            if outcome.get("status") != "EXCLUDED":
                print(f"   [OK] {outcome['scenario_id']} | {outcome['fault_type']:15} | logs: {outcome['log_coverage']:.1%}")
            else:
                print(f"   [SKIP] {outcome['scenario_id']} | EXCLUDED")

        collected += per_date

        # Day summary
        print(f"\n   {date}: {tally(per_date, 'OK')} processed, {tally(per_date, 'EXCLUDED')} excluded")

    # Final summary
    print("\n" + "="*80)
    print("FINAL SUMMARY")
    print("="*80)

    print(f"   Total scenarios: {len(collected)}")
    print(f"   Processed:       {tally(collected, 'OK')}")
    print(f"   Excluded:        {tally(collected, 'EXCLUDED')}")

    # Per date: scenario IDs start with the compact date
    for date in ALL_DATES:
        prefix = date.replace("-", "")
        n_ok = sum(
            1
            for row in collected
            if row["scenario_id"].startswith(prefix) and row.get("status") == "OK"
        )
        print(f"   {date}: {n_ok} valid scenarios")

    return collected


# ============================================================================
# RUN THE FULL PROCESSING
# ============================================================================
# Runs the pipeline over every date and keeps the per-scenario result dicts.
results = process_all_dates()


PROCESSING ALL NEZHA DATASETS
   Dates: ['2022-08-22', '2022-08-23', '2023-01-29', '2023-01-30']
   Excluded: 7 known problematic scenarios


Processing 2022-08-22...
   Faults: 24 | Pods: 10


   [OK] 20220822_nezha_0 | cpu_contention  | logs: 31.8%
   [OK] 20220822_nezha_1 | return          | logs: 31.8%
   [OK] 20220822_nezha_2 | cpu_consumed    | logs: 31.8%
   [OK] 20220822_nezha_3 | exception       | logs: 31.8%
   [OK] 20220822_nezha_4 | network_delay   | logs: 31.8%
   [OK] 20220822_nezha_5 | cpu_contention  | logs: 31.8%
   [OK] 20220822_nezha_6 | cpu_contention  | logs: 31.8%
   [OK] 20220822_nezha_7 | network_delay   | logs: 31.8%
   [OK] 20220822_nezha_8 | exception       | logs: 28.2%
   [OK] 20220822_nezha_9 | return          | logs: 30.5%
   [OK] 20220822_nezha_10 | cpu_contention  | logs: 32.7%
   [OK] 20220822_nezha_11 | network_delay   | logs: 31.8%
   [OK] 20220822_nezha_12 | cpu_contention  | logs: 45.5%
   [OK] 20220822_nezha_13 | network_delay   | logs: 31.8%
   [SKIP] 20220822_nezha_14 | EXCLUDED
   [OK] 20220822_nezha_15 | cpu_contention  | logs: 31.8%
   [OK] 20220822_nezha_16 | cpu_consumed    | logs: 31.8%
   [OK] 20220822_nezha_17 | network_delay  

## 3. Processing Functions

Log cleaning is identical to LEMMA (`clean_log_content_vectorized`). Metrics are loaded from CSV, resampled to 30s, and aligned with `ffill(limit=1)`.