# In [None]:
import os
import glob
import re
import pandas as pd
import matplotlib.pyplot as plt

# ==========================================
# CONFIGURATION
# ==========================================
# Directory containing the log files ('.' for current dir).
# NOTE: this is an absolute, machine-specific path; change it before running
# on another machine. Only files matching "*.log" in this directory are read.
LOG_DIR = '/Users/rikiotaki/workspace/es/logs/resource_bench_2025-11-24_03-16-40' 
# Output filename for the plot (a relative path, so it is resolved against
# the current working directory).
OUTPUT_PLOT = 'benchmark_results_parsed.png'

# ==========================================
# PARSER LOGIC
# ==========================================
def parse_log_file(filepath):
    """Extract thread count, memory budget and timings from one benchmark log.

    The thread count and memory budget are taken from the file name, which
    must contain ``Thr<N>_Mem<M>GB`` (e.g. ``Exp1_Thr8_Mem2GB.log``).  The
    timings are taken from the first line that contains ``[avg]`` and whose
    whitespace-separated columns 5, 6 and 7 all parse as floats, e.g.::

        Exp1_Thr8_Mem2GB[avg] 256.0 440 8 8 1880.56 1275.20 605.36 ...

    Column layout of that line:
        0: Name, 1: Run Size, 2: RG Runs, 3: Gen Thr, 4: Merge Thr,
        5: Total (s), 6: RunGen (s), 7: Merge (s)

    Args:
        filepath: Path to the log file.

    Returns:
        A dict with keys ``filename``, ``threads``, ``memory_gb``,
        ``total_time``, ``rungen_time`` and ``merge_time``; or ``None`` when
        the file name does not match the expected pattern, or when no usable
        ``[avg]`` line exists (a warning is printed in the latter case).
    """
    filename = os.path.basename(filepath)

    # 1. Parse metadata (threads, memory) from the file name.
    name_match = re.search(r"Thr(\d+)_Mem(\d+)GB", filename)
    if not name_match:
        return None

    threads = int(name_match.group(1))
    memory_gb = int(name_match.group(2))

    # 2. Parse metrics from the file content.
    # Decode as UTF-8 and replace undecodable bytes so that stray binary
    # output in a log cannot abort the whole load with UnicodeDecodeError;
    # the numeric columns we need are plain ASCII either way.
    with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            if "[avg]" not in line:
                continue

            parts = line.split()
            try:
                total_time = float(parts[5])
                rungen_time = float(parts[6])
                merge_time = float(parts[7])
            except (IndexError, ValueError):
                # Too few columns or non-numeric fields: keep scanning for
                # a later "[avg]" line that is well-formed.
                continue

            return {
                'filename': filename,
                'threads': threads,
                'memory_gb': memory_gb,
                'total_time': total_time,
                'rungen_time': rungen_time,
                'merge_time': merge_time,
            }

    print(f"Warning: Could not parse summary line in {filename}")
    return None

def load_data(directory):
    """Parse every ``*.log`` file in *directory* into one DataFrame.

    Each file is handed to ``parse_log_file``; files for which it returns a
    falsy result are skipped.  A progress line is printed for the directory
    and for every file that was loaded.

    Args:
        directory: Directory to search (non-recursively) for ``*.log`` files.

    Returns:
        A ``pandas.DataFrame`` with one row per successfully parsed log.
    """
    log_paths = glob.glob(os.path.join(directory, "*.log"))
    print(f"Found {len(log_paths)} log files in '{directory}'...")

    records = []
    for log_path in log_paths:
        record = parse_log_file(log_path)
        if not record:
            continue
        records.append(record)
        print(f"  Loaded: {record['filename']} -> T={record['threads']}, M={record['memory_gb']}GB, Time={record['total_time']}s")

    return pd.DataFrame(records)

# ==========================================
# PLOTTING LOGIC
# ==========================================
def plot_results(df):
    """Draw the two benchmark panels, save them to OUTPUT_PLOT and show them.

    Left panel (Experiment 1): rows with ``memory_gb == 2``, plotting total,
    run-generation and merge time against thread count.
    Right panel (Experiment 2): rows with ``threads == 40``, plotting total
    time against memory budget.

    Annotations are attached to fixed data points (24 and 40 threads on the
    left; 2, 4 and 8 GB on the right) and only when such a row exists.  A
    panel with no matching rows shows a "No Data" message instead.

    Args:
        df: DataFrame with columns ``threads``, ``memory_gb``,
            ``total_time``, ``rungen_time`` and ``merge_time``.  If it is
            empty, a message is printed and nothing is drawn.
    """
    if df.empty:
        print("No data found to plot.")
        return

    def first_row(frame, column, value):
        # First row of `frame` whose `column` equals `value`, or None.
        hits = frame[frame[column] == value]
        return None if hits.empty else hits.iloc[0]

    plt.rcParams.update({'font.size': 12, 'font.family': 'sans-serif'})
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

    # ---------------------------------------------------------
    # Experiment 1 (Scalability): memory fixed at 2GB, ordered by threads
    # ---------------------------------------------------------
    by_threads = df[df['memory_gb'] == 2].sort_values('threads')

    if by_threads.empty:
        ax1.text(0.5, 0.5, "No Data for Mem=2GB", ha='center', transform=ax1.transAxes)
    else:
        thread_axis = by_threads['threads']
        ax1.plot(thread_axis, by_threads['total_time'], 'o-', linewidth=2, color='#1f77b4', label='Total Time')
        ax1.plot(thread_axis, by_threads['rungen_time'], '--', color='#2ca02c', alpha=0.6, label='Run Gen Phase')
        ax1.plot(thread_axis, by_threads['merge_time'], '--', color='#d62728', alpha=0.6, label='Merge Phase')

        # Annotate the 24-thread point, if that run is present.
        best = first_row(by_threads, 'threads', 24)
        if best is not None:
            best_time = best['total_time']
            ax1.annotate(
                f"Optimal (24 Thr)\n{best_time:.0f}s",
                xy=(24, best_time),
                xytext=(24, best_time - 300),
                arrowprops={'facecolor': 'black', 'shrink': 0.05},
                ha='center',
                fontsize=10,
                fontweight='bold',
            )

        # Annotate the 40-thread point, if that run is present.
        worst = first_row(by_threads, 'threads', 40)
        if worst is not None:
            worst_time = worst['total_time']
            ax1.annotate(
                f"Scalability Trap!\n{worst_time:.0f}s",
                xy=(40, worst_time),
                xytext=(35, worst_time + 400),
                arrowprops={'facecolor': 'red', 'shrink': 0.05},
                ha='center',
                color='red',
                fontweight='bold',
            )

        ax1.set_title("Exp 1: The Scalability Trap (Fixed 2GB RAM)", fontsize=14, fontweight='bold')
        ax1.set_xlabel("Number of Threads", fontsize=12)
        ax1.set_ylabel("Execution Time (s)", fontsize=12)
        ax1.grid(True, which='both', linestyle='--', alpha=0.7)
        ax1.legend()

    # ---------------------------------------------------------
    # Experiment 2 (Memory Cliff): threads fixed at 40, ordered by memory
    # ---------------------------------------------------------
    by_memory = df[df['threads'] == 40].sort_values('memory_gb')

    if by_memory.empty:
        ax2.text(0.5, 0.5, "No Data for Threads=40", ha='center', transform=ax2.transAxes)
    else:
        ax2.plot(by_memory['memory_gb'], by_memory['total_time'], 's-', linewidth=2, color='#ff7f0e', label='Total Time (40 Threads)')

        # 2GB point
        cliff = first_row(by_memory, 'memory_gb', 2)
        if cliff is not None:
            cliff_time = cliff['total_time']
            ax2.annotate(
                f"The Cliff\n{cliff_time:.0f}s",
                xy=(2, cliff_time),
                xytext=(2.5, cliff_time - 100),
                arrowprops={'facecolor': 'red', 'shrink': 0.05},
                ha='left',
                color='red',
                fontweight='bold',
            )

        # 4GB point
        safe = first_row(by_memory, 'memory_gb', 4)
        if safe is not None:
            safe_time = safe['total_time']
            ax2.annotate(
                f"Safe Zone\n{safe_time:.0f}s",
                xy=(4, safe_time),
                xytext=(4, safe_time - 150),
                arrowprops={'facecolor': 'green', 'shrink': 0.05},
                ha='center',
                color='green',
                fontweight='bold',
            )

        # 8GB point (CPU cache note)
        roomy = first_row(by_memory, 'memory_gb', 8)
        if roomy is not None:
            roomy_time = roomy['total_time']
            ax2.annotate(
                'CPU Cache\nOverhead',
                xy=(8, roomy_time),
                xytext=(7, roomy_time + 150),
                arrowprops={'facecolor': 'gray', 'shrink': 0.05},
                ha='center',
                fontsize=9,
                style='italic',
            )

        ax2.set_title("Exp 2: The Memory Cliff (Fixed 40 Threads)", fontsize=14, fontweight='bold')
        ax2.set_xlabel("Memory Budget (GB)", fontsize=12)
        ax2.set_ylabel("Execution Time (s)", fontsize=12)
        ax2.set_xticks([2, 4, 6, 8])
        ax2.grid(True, which='both', linestyle='--', alpha=0.7)
        ax2.legend()

    plt.tight_layout()
    plt.savefig(OUTPUT_PLOT, dpi=300)
    print(f"Success! Plot saved to: {OUTPUT_PLOT}")
    plt.show()

# ==========================================
# MAIN EXECUTION
# ==========================================
# Runs at module/cell level: parse every log under LOG_DIR, then render,
# save and show the figure. `df` stays bound at top level after this runs.
df = load_data(LOG_DIR)
plot_results(df)