# LEMMA-RCA Format Converter

LEMMA scenarios have inconsistent directory structures across dates. This notebook normalizes everything to the `20240215` layout (renames files, reorganizes folders). Data content is not modified.

| Aspect | Source | Target |
|--------|--------|--------|
| Metrics | `pod_level_data_cpu_usage.npy` (and similar), in various subfolders | `pod_level_data_pod_cpu_usage_total.npy` (and similar) in `metrics_data/{scenario}/` |
| Logs | `pod_message/`, `pod_removed/`, `pod/` | `log_data/{scenario}/log_data/pod/` |
| Log frequency | various paths | `{MMDD}_log_frequency_pod_level_removed.npy` in `metrics_data/{scenario}/` |


## 1. Configuration and Directory Setup


In [1]:
from pathlib import Path
import shutil
import zipfile
import numpy as np
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
import subprocess
import gc

# --- Extraction tuning ---
# Thread count used by the pure-Python log extraction fallback.
NUM_WORKERS = 20

# When True, log extraction tries the system `unzip` binary first.
USE_NATIVE_UNZIP = True

# --- Dataset locations ---
# Root folder that holds the LEMMA-RCA dataset.
BASE_DIR = Path("/root/lemm")

# Inputs: the original preprocessed ZIP archives.
ZIPS_METRICS = BASE_DIR.joinpath("Cloud_Computing_Preprocessed", "Metrics Data")
ZIPS_LOGS = BASE_DIR.joinpath("Cloud_Computing_Preprocessed", "Log Data")

# Outputs: the unified layout written by this notebook.
OUT_METRICS = BASE_DIR.joinpath("metrics_data")
OUT_LOGS = BASE_DIR.joinpath("log_data")

# Echo the resolved paths; labels are left-aligned so the paths line up.
for _label, _location in (
    ("Source metrics:", ZIPS_METRICS),
    ("Source logs:", ZIPS_LOGS),
    ("Output metrics:", OUT_METRICS),
    ("Output logs:", OUT_LOGS),
):
    print(f"{_label:<15} {_location}")


Source metrics: /root/lemm/Cloud_Computing_Preprocessed/Metrics Data
Source logs:    /root/lemm/Cloud_Computing_Preprocessed/Log Data
Output metrics: /root/lemm/metrics_data
Output logs:    /root/lemm/log_data


## 2. Scenario Configuration

ZIP structures vary per scenario — the paths below were found by manual inspection.


In [2]:
# Legacy metric filename -> filename used by the target (20240215) layout.
METRIC_MAPPING = {
    "pod_level_data_cpu_usage.npy": "pod_level_data_pod_cpu_usage_total.npy",
    "pod_level_data_memory_usage.npy": "pod_level_data_pod_memory_working_set.npy",
    "pod_level_data_received_bandwidth.npy": "pod_level_data_pod_network_rx_bytes.npy",
    "pod_level_data_transmit_bandwidth.npy": "pod_level_data_pod_network_tx_bytes.npy",
}


def _scenario(date: str, *, metrics_prefix: str, logs_prefix: str,
              log_frequency_path: str, **flags) -> dict:
    """Build one SCENARIOS entry.

    Both archives of a scenario are named ``<date>.zip`` (one under the
    metrics source folder, one under the logs source folder), and every
    scenario keeps its log-frequency file in the logs archive.
    Extra keyword arguments are appended to the entry as-is.
    """
    archive = f"{date}.zip"
    return {
        "zip_metrics": archive,
        "metrics_prefix": metrics_prefix,
        "zip_logs": archive,
        "logs_prefix": logs_prefix,
        "log_frequency_zip": "logs",
        "log_frequency_path": log_frequency_path,
        **flags,
    }


# Per-scenario member paths inside the ZIPs (exact, verified via `unzip -l`).
SCENARIOS = {
    # Metrics ZIP: 20231207/pod_level_data_*.npy
    # Logs ZIP:    log_data/pod_removed/*_structured.csv
    "20231207": _scenario(
        "20231207",
        metrics_prefix="20231207/",
        logs_prefix="log_data/pod_removed/",
        log_frequency_path="log_data/pod_level_log_frequency.npy",
    ),
    # Metrics ZIP: 20231221/metric_error/pod_level_data_*.npy
    # Logs ZIP:    20231221/20231221/log_data/pod_message/*_structured.csv
    "20231221": _scenario(
        "20231221",
        metrics_prefix="20231221/metric_error/",
        logs_prefix="20231221/20231221/log_data/pod_message/",
        log_frequency_path="20231221/20231221/log_data/pod_level_log_frequency.npy",
    ),
    # Metrics ZIP: 20240115/latency/pod_level_data_*.npy
    # Logs ZIP:    20240115/log_data/pod_message/*_structured.csv
    "20240115": _scenario(
        "20240115",
        metrics_prefix="20240115/latency/",
        logs_prefix="20240115/log_data/pod_message/",
        log_frequency_path="20240115/pod_level_log_frequency.npy",
    ),
    # This scenario already uses the target layout: its metric filenames need
    # no renaming, but its log data still has to be relocated.
    # Metrics ZIP: 20240215/pod_level_data_pod_*.npy
    # Logs ZIP:    log_data/pod/*_structured.csv
    "20240215": _scenario(
        "20240215",
        metrics_prefix="20240215/",
        logs_prefix="log_data/pod/",
        log_frequency_path="log_data/pod_level_log_frequency.npy",
        # Metric filenames are already in the target format.
        skip_metric_mapping=True,
    ),
}

print(f"Scenarios configured: {list(SCENARIOS)}")


Scenarios configured: ['20231207', '20231221', '20240115', '20240215']


## 3. Metric Extraction

In [3]:
def extract_metrics(scenario_id: str, config: dict) -> Path:
    """
    Extract the pod-level metric files from a scenario's metrics ZIP.

    Files are written to ``OUT_METRICS / scenario_id`` under the target
    (20240215) filenames. When ``config["skip_metric_mapping"]`` is set the
    archive is expected to already use the target names; otherwise the legacy
    names from ``METRIC_MAPPING`` are looked up and renamed on the way out.
    Members are streamed to disk in 8 MB chunks instead of being read fully
    into memory.

    A missing member is reported with an ``[ERROR]`` line and skipped; it does
    not raise.

    Args:
        scenario_id: Scenario date string (e.g. "20231207"). Its last four
            characters prefix the log-frequency output filename.
        config: Entry from ``SCENARIOS``. Reads "zip_metrics" and
            "metrics_prefix", and optionally "skip_metric_mapping",
            "log_frequency_zip" and "log_frequency_path".

    Returns:
        The output directory for this scenario's metrics.
    """
    zip_path = ZIPS_METRICS / config["zip_metrics"]
    out_dir = OUT_METRICS / scenario_id
    out_dir.mkdir(parents=True, exist_ok=True)
    prefix = config['metrics_prefix']

    print(f"\n{'='*60}")
    print(f"Extracting metrics: {scenario_id}")
    print(f"{'='*60}")
    print(f"Source ZIP: {zip_path.name}")
    print(f"Prefix: {prefix}")

    chunk_size = 8 * 1024 * 1024

    with zipfile.ZipFile(zip_path, 'r') as zf:
        # Set for O(1) membership tests against the archive listing.
        members = set(zf.namelist())

        def copy_member(member: str, dst_name: str) -> None:
            """Stream one archive member to ``out_dir / dst_name``."""
            with zf.open(member) as src, open(out_dir / dst_name, 'wb') as dst:
                shutil.copyfileobj(src, dst, chunk_size)

        if config.get('skip_metric_mapping', False):
            # Archive already uses the target names: identity mapping.
            renames = {name: name for name in METRIC_MAPPING.values()}
        else:
            renames = METRIC_MAPPING

        for src_name, dst_name in renames.items():
            src_path = prefix + src_name
            if src_path not in members:
                print(f"  [ERROR] {src_name} not found at {src_path}")
                continue
            copy_member(src_path, dst_name)
            # Report success only once the file has actually been written.
            if src_name == dst_name:
                print(f"  [OK] {dst_name} (no rename needed)")
            else:
                print(f"  [OK] {src_name} -> {dst_name}")

        # The log-frequency file lives in this ZIP only when the config says
        # so ('metrics' is the default when the key is absent).
        if config.get('log_frequency_zip', 'metrics') == 'metrics':
            log_freq_path = config.get('log_frequency_path')
            if log_freq_path:
                if log_freq_path in members:
                    log_freq_dst = f"{scenario_id[-4:]}_log_frequency_pod_level_removed.npy"
                    copy_member(log_freq_path, log_freq_dst)
                    print(f"  [OK] log_frequency -> {log_freq_dst}")
                else:
                    print(f"  [ERROR] log_frequency not found at {log_freq_path}")

    return out_dir



## 4. Log Extraction

Streams CSVs in 8MB chunks to keep memory low. Also extracts log_frequency NPY.


In [4]:
def extract_logs_native(scenario_id: str, config: dict, zip_path: Path, out_dir: Path, metrics_out_dir: Path) -> int:
    """
    Extract structured log CSVs with the native ``unzip`` command.

    All members matching ``<logs_prefix>*_structured.csv`` are extracted flat
    into ``out_dir`` (existing files are overwritten). When the config places
    the log-frequency file in the logs ZIP, it is also copied to
    ``metrics_out_dir`` under ``{MMDD}_log_frequency_pod_level_removed.npy``.

    Args:
        scenario_id: Scenario date string; its last four characters prefix
            the log-frequency output filename.
        config: Scenario entry. Reads "logs_prefix" and optionally
            "log_frequency_zip" and "log_frequency_path".
        zip_path: Logs ZIP archive to read.
        out_dir: Existing directory that receives the CSV files.
        metrics_out_dir: Existing directory that receives the log-frequency
            file.

    Returns:
        Number of matching CSV members of the archive that are present in
        ``out_dir`` after extraction. Unlike a before/after directory diff,
        this stays correct when a re-run overwrites files that already exist.
    """
    prefix = config['logs_prefix']

    with zipfile.ZipFile(zip_path, 'r') as zf:
        members = zf.namelist()

        # Flattened names unzip is expected to produce (-j drops directories).
        # Same selection rule as the Python fallback extractor.
        expected_names = {
            Path(name).name
            for name in members
            if name.startswith(prefix) and name.endswith('_structured.csv')
        }

        # -j: junk paths (extract flat, ignore directory structure in ZIP)
        # -q: quiet
        # -o: overwrite existing files
        # -d: destination directory
        pattern = f"{prefix}*_structured.csv"
        cmd = ["unzip", "-j", "-q", "-o", str(zip_path), pattern, "-d", str(out_dir)]

        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode not in (0, 11):  # 11 = no files matched (not an error for us)
            print(f"  [WARNING] unzip returned {result.returncode}: {result.stderr}")

        count = sum((out_dir / name).is_file() for name in expected_names)

        # Copy the log-frequency file (single member) when it lives in this ZIP.
        if config.get('log_frequency_zip', 'metrics') == 'logs':
            log_freq_path = config.get('log_frequency_path')
            if log_freq_path:
                if log_freq_path in members:
                    log_freq_dst = f"{scenario_id[-4:]}_log_frequency_pod_level_removed.npy"
                    print(f"  [OK] log_frequency -> {log_freq_dst}")
                    with zf.open(log_freq_path) as src, open(metrics_out_dir / log_freq_dst, 'wb') as dst:
                        shutil.copyfileobj(src, dst)
                else:
                    print(f"  [ERROR] log_frequency not found at {log_freq_path}")

    return count


def extract_logs_python(scenario_id: str, config: dict, zip_path: Path, out_dir: Path, metrics_out_dir: Path) -> int:
    """
    Extract structured log CSVs with Python's zipfile and a thread pool.

    Fallback for when the native ``unzip`` command is not used. Members whose
    name starts with ``config["logs_prefix"]`` and ends with
    ``_structured.csv`` are written flat into ``out_dir``, streamed in 8 MB
    chunks so no CSV is held fully in memory. When the config places the
    log-frequency file in the logs ZIP, it is also copied to
    ``metrics_out_dir`` under ``{MMDD}_log_frequency_pod_level_removed.npy``.

    Failures are reported with ``[ERROR]`` lines and do not abort the run: a
    failing file is skipped, and a batch that fails as a whole (e.g. its ZIP
    handle cannot be opened) is reported instead of being silently dropped.

    Args:
        scenario_id: Scenario date string; its last four characters prefix
            the log-frequency output filename.
        config: Scenario entry. Reads "logs_prefix" and optionally
            "log_frequency_zip" and "log_frequency_path".
        zip_path: Logs ZIP archive to read.
        out_dir: Existing directory that receives the CSV files.
        metrics_out_dir: Existing directory that receives the log-frequency
            file.

    Returns:
        Number of CSV files successfully extracted.
    """
    prefix = config['logs_prefix']
    chunk_size = 8 * 1024 * 1024

    # First pass: list the archive once and pick the members to extract.
    with zipfile.ZipFile(zip_path, 'r') as zf:
        all_files = zf.namelist()
    csv_files = [f for f in all_files if f.startswith(prefix) and f.endswith('_structured.csv')]

    total_files = len(csv_files)
    print(f"  Found {total_files} CSV files (using {NUM_WORKERS} Python workers)")

    def extract_batch(batch: list) -> int:
        """Extract a batch of CSV files. Each thread opens its own ZIP handle."""
        count = 0
        with zipfile.ZipFile(zip_path, 'r') as zf:
            for src_path in batch:
                try:
                    with zf.open(src_path) as src, open(out_dir / Path(src_path).name, 'wb') as dst:
                        shutil.copyfileobj(src, dst, chunk_size)
                    count += 1
                except Exception as e:
                    print(f"  [ERROR] Failed to extract {src_path}: {e}")
        return count

    # Split files into batches
    batch_size = max(1, total_files // NUM_WORKERS)
    batches = [csv_files[i:i + batch_size] for i in range(0, total_files, batch_size)]

    # Results are tallied here in the calling thread, so no lock is needed.
    extracted = 0
    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
        futures = [executor.submit(extract_batch, batch) for batch in batches]
        for future in as_completed(futures):
            try:
                done = future.result()
            except Exception as e:
                print(f"  [ERROR] Extraction batch failed: {e}")
                continue
            extracted += done
            # Print whenever this batch carried the total across a multiple of 500.
            if extracted % 500 < done:
                print(f"  Progress: {extracted}/{total_files} CSVs extracted")

    # Copy the log-frequency file (single member) when it lives in this ZIP.
    if config.get('log_frequency_zip', 'metrics') == 'logs':
        log_freq_path = config.get('log_frequency_path')
        if log_freq_path and log_freq_path in all_files:
            log_freq_dst = f"{scenario_id[-4:]}_log_frequency_pod_level_removed.npy"
            print(f"  [OK] log_frequency -> {log_freq_dst}")
            with zipfile.ZipFile(zip_path, 'r') as zf:
                with zf.open(log_freq_path) as src, open(metrics_out_dir / log_freq_dst, 'wb') as dst:
                    shutil.copyfileobj(src, dst, chunk_size)
        else:
            print(f"  [ERROR] log_frequency not found at {log_freq_path}")

    return extracted


def extract_logs(scenario_id: str, config: dict) -> Path:
    """
    Extract a scenario's structured log CSVs into the unified layout.

    CSVs go to ``OUT_LOGS / scenario_id / "log_data" / "pod"``; the extractors
    also place the log-frequency file in ``OUT_METRICS / scenario_id``. The
    native ``unzip`` command is used when ``USE_NATIVE_UNZIP`` is set and the
    binary can be run; otherwise the threaded Python extractor is used.

    Only the availability probe is guarded: an error raised by the extraction
    itself (for example a missing ZIP file) propagates to the caller instead
    of being mistaken for "unzip not installed".

    Args:
        scenario_id: Scenario date string (e.g. "20231207").
        config: Entry from ``SCENARIOS``. Reads "zip_logs" and "logs_prefix"
            here; the extractors read further keys.

    Returns:
        The directory the CSV files were extracted to.
    """
    zip_path = ZIPS_LOGS / config["zip_logs"]
    out_dir = OUT_LOGS / scenario_id / "log_data" / "pod"
    out_dir.mkdir(parents=True, exist_ok=True)

    metrics_out_dir = OUT_METRICS / scenario_id
    metrics_out_dir.mkdir(parents=True, exist_ok=True)

    print(f"\n{'='*60}")
    print(f"Extracting logs: {scenario_id}")
    print(f"{'='*60}")
    print(f"Source ZIP: {zip_path.name}")
    print(f"Prefix: {config['logs_prefix']}")

    # Probe for the native binary, if preferred. OSError covers both
    # "not installed" (FileNotFoundError) and "not executable".
    use_native = USE_NATIVE_UNZIP
    if use_native:
        try:
            subprocess.run(["unzip", "-v"], capture_output=True, check=True)
        except (subprocess.CalledProcessError, OSError):
            print("  Native unzip not available, using Python fallback")
            use_native = False

    if use_native:
        print("  Using native unzip (fastest)")
        count = extract_logs_native(scenario_id, config, zip_path, out_dir, metrics_out_dir)
    else:
        count = extract_logs_python(scenario_id, config, zip_path, out_dir, metrics_out_dir)

    print(f"  Total CSVs extracted: {count}")
    return out_dir



## 5. Verification

Checks that all expected files exist and NPYs load without corruption.


In [5]:
def verify_extraction(scenario_id: str) -> bool:
    """
    Check that a scenario's converted output is complete and loadable.

    Looks in ``OUT_METRICS / scenario_id`` for the four pod-level metric files
    plus the log-frequency file, loading each one to confirm it parses, and in
    ``OUT_LOGS / scenario_id / "log_data" / "pod"`` for at least one
    ``*_structured.csv`` file. Each loaded array is released before the next
    one is read.

    Args:
        scenario_id: Scenario identifier; its last four characters prefix the
            log-frequency filename.

    Returns:
        True if every file is present and loads cleanly and at least one log
        CSV exists, False otherwise.
    """
    rule = '=' * 60
    print(f"\n{rule}")
    print(f"Verifying: {scenario_id}")
    print(rule)

    metrics_dir = OUT_METRICS / scenario_id
    logs_dir = OUT_LOGS / scenario_id / "log_data" / "pod"

    expected_npys = (
        "pod_level_data_pod_cpu_usage_total.npy",
        "pod_level_data_pod_memory_working_set.npy",
        "pod_level_data_pod_network_rx_bytes.npy",
        "pod_level_data_pod_network_tx_bytes.npy",
        f"{scenario_id[-4:]}_log_frequency_pod_level_removed.npy",
    )

    healthy = True

    print("\nMetrics:")
    for npy_name in expected_npys:
        npy_path = metrics_dir / npy_name
        if npy_path.exists():
            try:
                # NOTE(review): allow_pickle=True unpickles the file contents;
                # only run this on archives from a trusted source.
                payload = np.load(npy_path, allow_pickle=True).item()
                record = payload[list(payload.keys())[0]]
                # Entity names sit under 'Pod_Name', or 'Node_Name' as fallback.
                names = record.get('Pod_Name', record.get('Node_Name', []))
                n_entities = len(names)
                n_time = len(record['time'])
                # Drop every reference so the array can be freed right away.
                del payload, record, names
                print(f"  [OK] {npy_name}: {n_entities} entities, {n_time} timestamps")
            except Exception as e:
                print(f"  [CORRUPT] {npy_name}: {e}")
                healthy = False
        else:
            print(f"  [MISSING] {npy_name}")
            healthy = False

    print("\nLogs:")
    if logs_dir.exists():
        n_csv = len(list(logs_dir.glob("*_structured.csv")))
        if n_csv:
            print(f"  [OK] Directory exists: {n_csv} structured.csv files")
        else:
            print("  [WARNING] No structured CSV files found")
            healthy = False
    else:
        print(f"  [MISSING] Directory: {logs_dir}")
        healthy = False

    return healthy


## 6. Run

Processes all scenarios one after another. Log extraction uses the native `unzip` command when available; the Python fallback is parallelized with ThreadPoolExecutor.


In [6]:
import traceback

# Outcome per scenario id: "OK", "PARTIAL" (verification failed) or "ERROR".
results = {}

for scenario_id, config in SCENARIOS.items():
    try:
        extract_metrics(scenario_id, config)
        extract_logs(scenario_id, config)
        results[scenario_id] = "OK" if verify_extraction(scenario_id) else "PARTIAL"
    except Exception as exc:
        # Keep going with the remaining scenarios; record and show the failure.
        print(f"\n[ERROR] {scenario_id}: {exc}")
        traceback.print_exc()
        results[scenario_id] = "ERROR"
    finally:
        # Release memory before the next scenario is processed.
        gc.collect()

_rule = "=" * 60
print("\n" + _rule)
print("SUMMARY")
print(_rule)
for _name, _outcome in results.items():
    print(f"  {_name}: {_outcome}")



Extracting metrics: 20231207
Source ZIP: 20231207.zip
Prefix: 20231207/
  [OK] pod_level_data_cpu_usage.npy -> pod_level_data_pod_cpu_usage_total.npy
  [OK] pod_level_data_memory_usage.npy -> pod_level_data_pod_memory_working_set.npy
  [OK] pod_level_data_received_bandwidth.npy -> pod_level_data_pod_network_rx_bytes.npy
  [OK] pod_level_data_transmit_bandwidth.npy -> pod_level_data_pod_network_tx_bytes.npy

Extracting logs: 20231207
Source ZIP: 20231207.zip
Prefix: log_data/pod_removed/
  Using native unzip (fastest)
  [OK] log_frequency -> 1207_log_frequency_pod_level_removed.npy
  Total CSVs extracted: 195

Verifying: 20231207

Metrics:
  [OK] pod_level_data_pod_cpu_usage_total.npy: 196 entities, 79288 timestamps
  [OK] pod_level_data_pod_memory_working_set.npy: 196 entities, 79299 timestamps
  [OK] pod_level_data_pod_network_rx_bytes.npy: 196 entities, 79509 timestamps
  [OK] pod_level_data_pod_network_tx_bytes.npy: 196 entities, 79509 timestamps
  [OK] 1207_log_frequency_pod_level

In [7]:
# %reset -f 
# If you are on a Linux machine, you can run this to free up memory:
# sync && echo 3 | sudo tee /proc/sys/vm/drop_caches