# UMAP WebAssembly vs JavaScript Performance Analysis
Performance-only analysis from benchmark JSON logs. Focus metrics: runtimeMs, memoryDeltaMb, fpsAvg, responsivenessMs.
Trustworthiness is used only as a sanity check (no correctness analysis).


In [None]:
%matplotlib inline
import json
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display, Markdown

# Notebook-wide matplotlib defaults: default figure size and font sizes for
# titles, axis labels and tick labels.
plt.rcParams['figure.figsize'] = (10, 4)
plt.rcParams['axes.titlesize'] = 12
plt.rcParams['axes.labelsize'] = 10
plt.rcParams['xtick.labelsize'] = 9
plt.rcParams['ytick.labelsize'] = 9

# Benchmark run logs to analyse, in load order. Each file is named
# 'bench-runs-<id>.json'; the numeric id is presumably a millisecond
# timestamp of the run (confirm against the benchmark harness).
DATA_FILES = [
    Path('/mnt/data') / f'bench-runs-{run_id}.json'
    for run_id in (
        1769593958138,
        1769594451146,
        1769595026443,
        1769597840823,
        1769598240434,
        1769598749010,
    )
]


In [None]:
# Flatten every benchmark file into three lists of plain dicts:
#   records           - one entry per uiMetrics row, annotated with the
#                       file-level and result-level context it came from
#   results_records   - one entry per item of the file's 'results' list
#   file_meta_records - one entry per file (machine and git metadata)
records = []
results_records = []
file_meta_records = []

for path in DATA_FILES:
    if not path.exists():
        print(f'Missing file: {path}')
        continue
    with path.open('r', encoding='utf-8') as f:
        data = json.load(f)

    file_wasmFeatures = data.get('wasmFeatures')
    machine = data.get('machine') or {}
    git = data.get('git') or {}

    # Machine/git context is constant within a file, so build it once and
    # splice it into every record that needs it.
    host_context = {
        'machine_cpuModel': machine.get('cpuModel'),
        'machine_cpuCores': machine.get('cpuCores'),
        'machine_totalMemBytes': machine.get('totalMemBytes'),
        'git_commit': git.get('commit'),
        'git_branch': git.get('branch'),
        'git_statusDirty': git.get('statusDirty'),
    }
    file_meta_records.append({
        'source_file': path.name,
        'file_wasmFeatures': file_wasmFeatures,
        **host_context,
    })

    for r_idx, result in enumerate(data.get('results') or []):
        result_exitCode = result.get('exitCode')
        # Tri-state: True/False when an exit code was recorded, else None.
        result_success = None if result_exitCode is None else (result_exitCode == 0)
        result_context = {
            'result_exitCode': result_exitCode,
            'result_success': result_success,
            'result_durationMs': result.get('durationMs'),
            'resultLabel': result.get('resultLabel'),
        }

        results_records.append({
            'source_file': path.name,
            'file_wasmFeatures': file_wasmFeatures,
            'result_index': r_idx,
            **result_context,
        })

        for ui in result.get('uiMetrics') or []:
            for row in ui.get('rows') or []:
                # Anything that is not a dict cannot be flattened; skip it.
                if isinstance(row, dict):
                    # The row's own keys come first; context keys override
                    # same-named row keys, as a dict.update() would.
                    records.append({
                        **row,
                        'source_file': path.name,
                        'file_wasmFeatures': file_wasmFeatures,
                        **result_context,
                        **host_context,
                    })

# Materialise the flattened records. file_meta_df is de-duplicated so that
# identical per-file metadata rows collapse into one.
golden_df = pd.DataFrame(records)
results_df = pd.DataFrame(results_records)
file_meta_df = pd.DataFrame(file_meta_records).drop_duplicates()

# Columns that must be numeric for the analysis below. Values that cannot be
# parsed are coerced to NaN rather than raising.
numeric_cols = [
    'runtimeMs', 'memoryDeltaMb', 'fpsAvg', 'responsivenessMs', 'trustworthiness',
    'datasetSize', 'dimensions', 'machine_cpuCores', 'machine_totalMemBytes'
]
for col in numeric_cols:
    if col not in golden_df.columns:
        continue
    golden_df[col] = pd.to_numeric(golden_df[col], errors='coerce')

# runtimeMs is the primary metric: without it nothing downstream can run.
if 'runtimeMs' not in golden_df.columns:
    raise ValueError('runtimeMs column missing from flattened rows')
# Keep only rows with a strictly positive runtime (NaN compares False and is
# therefore dropped as well).
positive_runtime = golden_df['runtimeMs'] > 0
golden_df = golden_df.loc[positive_runtime].copy()

def infer_mode(row):
    """Classify one flattened benchmark row as 'js' or 'wasm'.

    The file-level ``file_wasmFeatures`` value takes precedence
    (case-insensitive): ``'none'`` means 'js' and ``'all'`` means 'wasm'.
    Otherwise the row's ``wasmMode`` string decides: a value containing
    ``'wasm:'`` is 'wasm'; a value containing ``'js'`` but not ``'wasm'``
    is 'js'. A non-string ``wasmMode`` is treated as empty.

    Args:
        row: Any mapping-like object with a ``.get`` method (a dict or a
            pandas row).

    Returns:
        ``'js'``, ``'wasm'``, or ``np.nan`` when the mode cannot be determined.
    """
    feature_flag = str(row.get('file_wasmFeatures') or '').lower()
    from_file = {'none': 'js', 'all': 'wasm'}.get(feature_flag)
    if from_file is not None:
        return from_file

    raw_mode = row.get('wasmMode')
    mode_text = raw_mode.lower() if isinstance(raw_mode, str) else ''
    if 'wasm:' in mode_text:
        return 'wasm'
    if 'js' in mode_text and 'wasm' not in mode_text:
        return 'js'
    return np.nan

# Tag every row with its execution mode ('js', 'wasm', or NaN if unknown).
golden_df['mode'] = golden_df.apply(infer_mode, axis=1)

# Build a display label "<datasetName> | <scope>", falling back to the
# dataset name alone when the scope is missing or blank.
if 'datasetName' in golden_df.columns:
    dataset_base = golden_df['datasetName']
else:
    # The fallback must share golden_df's index. golden_df has been filtered,
    # so its index is no longer 0..n-1; a default RangeIndex would misalign
    # during the concatenation/assignment below and yield NaN labels or a
    # shape mismatch.
    dataset_base = pd.Series('unknown', index=golden_df.index)
dataset_base = dataset_base.fillna('unknown').astype(str)
if 'scope' in golden_df.columns:
    scope_col = golden_df['scope']
    scope_text = scope_col.astype(str)
    # astype(str) renders NaN as the literal 'nan', hence the extra check.
    scope_ok = scope_col.notna() & (scope_text != '') & (scope_text != 'nan')
    golden_df['datasetLabel'] = np.where(scope_ok, dataset_base + ' | ' + scope_text, dataset_base)
else:
    golden_df['datasetLabel'] = dataset_base

# Persist the cleaned, labelled rows for use outside the notebook.
golden_df.to_csv('/mnt/data/bench_golden_rows.csv', index=False)

group_cols = ['datasetName', 'scope', 'mode']

# groupby() and pivot_table() silently drop rows whose key is NaN. Rows with
# a missing scope or dataset name would therefore vanish from the runtime
# summary while still showing up in the datasetLabel-based sections.
# Normalise the keys on a working copy first: missing dataset names become
# 'unknown' and missing scopes become '' (which the label logic treats as
# "no scope"). Rows with an undetermined mode are still excluded.
if 'datasetName' in golden_df.columns:
    name_key = golden_df['datasetName'].fillna('unknown').astype(str)
else:
    name_key = pd.Series('unknown', index=golden_df.index)
if 'scope' in golden_df.columns:
    scope_text = golden_df['scope'].astype(str)
    scope_key = scope_text.where(golden_df['scope'].notna() & (scope_text != 'nan'), '')
else:
    scope_key = pd.Series('', index=golden_df.index)
stats_source = golden_df.assign(datasetName=name_key, scope=scope_key)

# Per (dataset, scope, mode) runtime statistics; 'n' is the number of runs.
runtime_stats = (
    stats_source
    .groupby(group_cols)['runtimeMs']
    .agg(['mean', 'median', 'std', 'count'])
    .reset_index()
    .rename(columns={'count': 'n'})
)

# Wide layout: one row per (dataset, scope), one column per statistic/mode
# pair, flattened to names such as 'median_js' and 'mean_wasm'.
runtime_wide = runtime_stats.pivot_table(index=['datasetName', 'scope'], columns='mode')
if not runtime_wide.empty:
    runtime_wide.columns = [f'{stat}_{mode}' for stat, mode in runtime_wide.columns]
    runtime_summary = runtime_wide.reset_index()
else:
    runtime_summary = pd.DataFrame(columns=['datasetName', 'scope'])

def calc_speedup(row):
    """Return the js/wasm runtime ratio for one summary row.

    The median-based ratio (``median_js / median_wasm``) is preferred. If
    either median is missing, or ``median_wasm`` is not strictly positive,
    the mean-based ratio is tried under the same conditions.

    Args:
        row: Mapping-like object with a ``.get`` method that may hold
            ``median_js``, ``median_wasm``, ``mean_js`` and ``mean_wasm``.

    Returns:
        The ratio as a float, or ``np.nan`` when neither pair is usable.
    """
    for stat in ('median', 'mean'):
        js_value = row.get(f'{stat}_js')
        wasm_value = row.get(f'{stat}_wasm')
        if pd.notna(js_value) and pd.notna(wasm_value) and wasm_value > 0:
            return js_value / wasm_value
    return np.nan

# Attach a display label and the js/wasm speedup to every summary row.
if not runtime_summary.empty:
    base = runtime_summary['datasetName'].fillna('unknown').astype(str)
    scope_series = runtime_summary.get('scope')
    if scope_series is None:
        runtime_summary['datasetLabel'] = base
    else:
        scope_text = scope_series.astype(str)
        # A scope only counts when it is present, non-blank, and not the
        # literal 'nan' that astype(str) produces for missing values.
        scope_ok = scope_series.notna() & ~scope_text.isin(['', 'nan'])
        runtime_summary['datasetLabel'] = np.where(scope_ok, base + ' | ' + scope_text, base)
    runtime_summary['speedup'] = runtime_summary.apply(calc_speedup, axis=1)

# Persist the summary table and report what was written.
runtime_summary.to_csv('/mnt/data/bench_runtime_summary.csv', index=False)

print('Golden rows:', len(golden_df))
print('Runtime summary rows:', len(runtime_summary))
print('Saved: /mnt/data/bench_golden_rows.csv and /mnt/data/bench_runtime_summary.csv')


## A. Data overview


In [None]:
# Row counts per execution mode, including rows whose mode is unknown (NaN).
print('Counts by mode:')
display(golden_df['mode'].value_counts(dropna=False))

# Top-20 value counts for the grouping columns. 'scope' is optional and is
# skipped when absent; 'datasetName' is always reported.
for count_col, optional in (('scope', True), ('datasetName', False)):
    if optional and count_col not in golden_df.columns:
        continue
    print(f'Counts by {count_col} (top 20):')
    display(golden_df[count_col].value_counts(dropna=False).head(20))

print('Machine and git metadata by file:')
display(file_meta_df)


## B. Sanity checks


In [None]:
# Use the (at most) two datasets with the most rows as representatives;
# value_counts() is sorted by descending frequency.
dataset_counts = golden_df['datasetName'].value_counts()
rep_datasets = dataset_counts.index[:2].tolist()

in_rep_dataset = golden_df['datasetName'].isin(rep_datasets)
has_known_mode = golden_df['mode'].isin(['js', 'wasm'])
rep_df = golden_df[in_rep_dataset & has_known_mode].copy()
if 'timestamp' in rep_df.columns:
    rep_df = rep_df.sort_values('timestamp')
# Sequential run number within each (dataset, mode) series.
rep_df['run_index'] = rep_df.groupby(['datasetName', 'mode']).cumcount()

if rep_datasets:
    n_panels = len(rep_datasets)
    # squeeze=False always returns a 2-D array of axes, so the single-panel
    # case needs no special handling once the array is flattened.
    fig, axes = plt.subplots(1, n_panels, figsize=(6 * n_panels, 4), sharey=False, squeeze=False)
    axes = axes.ravel()
    for ax, ds in zip(axes, rep_datasets):
        sub = rep_df[rep_df['datasetName'] == ds]
        for mode in ('js', 'wasm'):
            msub = sub[sub['mode'] == mode]
            ax.plot(msub['run_index'], msub['runtimeMs'], marker='o', linestyle='-', label=mode)
        ax.set_title(f'Runtime vs run index: {ds}')
        ax.set_xlabel('Run index')
        ax.set_ylabel('runtimeMs')
        ax.legend()
    plt.tight_layout()
    plt.show()

# Flag runtime outliers per (datasetName, mode) with the 1.5 * IQR rule.
# Groups with fewer than 4 rows, or with a zero/undefined IQR, are skipped
# and contribute no outliers.
outlier_mask = pd.Series(False, index=golden_df.index)
for (ds, mode), group in golden_df[golden_df['mode'].isin(['js', 'wasm'])].groupby(['datasetName', 'mode']):
    if len(group) < 4:
        continue
    q1 = group['runtimeMs'].quantile(0.25)
    q3 = group['runtimeMs'].quantile(0.75)
    iqr = q3 - q1
    if iqr == 0 or pd.isna(iqr):
        continue
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    mask = (group['runtimeMs'] < lower) | (group['runtimeMs'] > upper)
    outlier_mask.loc[group.index] = mask

outliers = golden_df[outlier_mask]
print(f'Outliers by IQR: {len(outliers)} rows out of {len(golden_df)}')
# 'scope' and 'datasetSize' are optional in the benchmark rows (other cells
# guard for them), so show only the columns that actually exist instead of
# raising KeyError.
outlier_cols = [
    col for col in ['datasetName', 'scope', 'mode', 'runtimeMs', 'datasetSize']
    if col in outliers.columns
]
display(outliers[outlier_cols].head(10))

# Trustworthiness is summarised only as a sanity check.
if 'trustworthiness' in golden_df.columns:
    print('Trustworthiness summary (sanity check only):')
    display(golden_df['trustworthiness'].describe())


## C. Core runtime comparison (primary)


In [None]:
# Grouped bar chart of mean runtime per dataset label, js next to wasm.
# Work on a copy so sorting does not reorder runtime_summary itself.
plot_summary = runtime_summary.copy()
plot_summary = plot_summary.sort_values('datasetLabel') if not plot_summary.empty else plot_summary

if not plot_summary.empty:
    x = np.arange(len(plot_summary))
    width = 0.35
    # If a mode has no mean column at all, plot NaN so its bars are simply absent.
    mean_js = plot_summary['mean_js'] if 'mean_js' in plot_summary.columns else np.full(len(plot_summary), np.nan)
    mean_wasm = plot_summary['mean_wasm'] if 'mean_wasm' in plot_summary.columns else np.full(len(plot_summary), np.nan)
    # Figure width grows with the number of datasets, with a minimum of 8.
    fig, ax = plt.subplots(figsize=(max(8, len(plot_summary) * 1.2), 4))
    # Offset the two series by half a bar width either side of each tick.
    ax.bar(x - width/2, mean_js, width, label='js')
    ax.bar(x + width/2, mean_wasm, width, label='wasm')
    ax.set_title('Mean runtimeMs by dataset (js vs wasm)')
    ax.set_ylabel('Mean runtimeMs')
    ax.set_xticks(x)
    ax.set_xticklabels(plot_summary['datasetLabel'], rotation=45, ha='right')
    ax.legend()
    plt.tight_layout()
    plt.show()

# Box plots of the runtime distribution per dataset label, one js box and
# one wasm box side by side for each dataset.
plot_df = golden_df[golden_df['mode'].isin(['js', 'wasm']) & golden_df['datasetLabel'].notna()]
datasets = sorted(plot_df['datasetLabel'].unique().tolist())
if datasets:
    data = []
    positions = []
    centers = []
    for i, ds in enumerate(datasets):
        for j, mode in enumerate(['js', 'wasm']):
            vals = plot_df[(plot_df['datasetLabel'] == ds) & (plot_df['mode'] == mode)]['runtimeMs'].dropna().values
            data.append(vals)
            # Each dataset occupies a slot of width 3: js at 3*i, wasm at 3*i + 1.
            positions.append(i * 3 + j)
        # The tick label sits midway between the dataset's two boxes.
        centers.append(i * 3 + 0.5)

    fig, ax = plt.subplots(figsize=(max(8, len(datasets) * 1.2), 5))
    # showfliers=False hides the outlier markers.
    bp = ax.boxplot(data, positions=positions, widths=0.6, patch_artist=True, showfliers=False)
    # Boxes were appended in js, wasm order, so even indices are js (C0)
    # and odd indices are wasm (C1).
    for i, box in enumerate(bp['boxes']):
        box.set_facecolor('C0' if i % 2 == 0 else 'C1')
        box.set_alpha(0.6)
    ax.set_title('Runtime distribution by dataset (js vs wasm)')
    ax.set_ylabel('runtimeMs')
    ax.set_xticks(centers)
    ax.set_xticklabels(datasets, rotation=45, ha='right')
    # Proxy line handles give the legend one entry per mode colour.
    ax.legend([
        plt.Line2D([0], [0], color='C0', lw=6),
        plt.Line2D([0], [0], color='C1', lw=6)
    ], ['js', 'wasm'])
    plt.tight_layout()
    plt.show()

# Median runtime against dataset size (log-scaled x axis), one line per mode.
if 'datasetSize' in golden_df.columns:
    # Only strictly positive sizes can be placed on a log axis.
    agg = golden_df[(golden_df['mode'].isin(['js', 'wasm'])) & (golden_df['datasetSize'] > 0)]
    agg = agg.groupby(['datasetSize', 'mode'])['runtimeMs'].median().reset_index()
    if not agg.empty:
        fig, ax = plt.subplots(figsize=(7, 4))
        for mode in ['js', 'wasm']:
            sub = agg[agg['mode'] == mode].sort_values('datasetSize')
            ax.plot(sub['datasetSize'], sub['runtimeMs'], marker='o', label=mode)
        ax.set_title('Median runtime vs datasetSize (log-x)')
        ax.set_xlabel('datasetSize')
        ax.set_ylabel('runtimeMs')
        ax.set_xscale('log')
        ax.legend()
        plt.tight_layout()
        plt.show()
else:
    print('datasetSize column not available for runtime vs size plot.')

# Tabular companion to the plots above.
print('Runtime summary (mean/median/std/n) by datasetName + scope + mode:')
display(runtime_stats)


## D. Scaling behaviour
Log-log plots and simple linear regression on log(runtimeMs) ~ log(datasetSize) per mode.
Interpretation is cautious: slope ~1 suggests roughly linear scaling, >1 super-linear, <1 sub-linear.


In [None]:
# Scaling analysis: scatter runtime against dataset size on log-log axes and
# estimate a per-mode slope of log10(runtimeMs) ~ log10(datasetSize).
if 'datasetSize' in golden_df.columns:
    # Logs need strictly positive sizes and runtimes.
    scale_df = golden_df[(golden_df['mode'].isin(['js', 'wasm'])) & (golden_df['datasetSize'] > 0) & (golden_df['runtimeMs'] > 0)]

    if not scale_df.empty:
        fig, ax = plt.subplots(figsize=(7, 4))
        for mode in ['js', 'wasm']:
            sub = scale_df[scale_df['mode'] == mode]
            ax.scatter(sub['datasetSize'], sub['runtimeMs'], alpha=0.4, label=mode)
        ax.set_title('Runtime vs datasetSize (log-log)')
        ax.set_xlabel('datasetSize')
        ax.set_ylabel('runtimeMs')
        ax.set_xscale('log')
        ax.set_yscale('log')
        ax.legend()
        plt.tight_layout()
        plt.show()

    scaling_slopes = {}
    for mode in ['js', 'wasm']:
        sub = scale_df[scale_df['mode'] == mode]
        # A slope is only defined across at least two DISTINCT sizes. With a
        # single size (however many repeated runs) the linear fit is
        # rank-deficient and np.polyfit returns a number that says nothing
        # about scaling, so such modes are left out.
        if sub['datasetSize'].nunique() >= 2:
            x = np.log10(sub['datasetSize'])
            y = np.log10(sub['runtimeMs'])
            slope, intercept = np.polyfit(x, y, 1)
            scaling_slopes[mode] = slope

    if scaling_slopes:
        print('Estimated log-log slopes (runtime vs datasetSize):')
        for mode, slope in scaling_slopes.items():
            print(f'  {mode}: {slope:.3f}')
    else:
        print('Insufficient data for slope estimates.')
else:
    # Keep the name defined so later cells can test it unconditionally.
    scaling_slopes = {}
    print('datasetSize column not available for scaling analysis.')


## E. Memory behaviour (secondary)
Negative memory deltas can occur because of garbage collection (GC) or allocator behaviour, so interpret the sign of a delta cautiously.


In [None]:
# Memory delta statistics per dataset label and mode, plus a grouped bar
# chart of the means.
if 'memoryDeltaMb' in golden_df.columns:
    mem_df = golden_df[golden_df['mode'].isin(['js', 'wasm'])]
    mem_stats = (
        mem_df.groupby(['datasetLabel', 'mode'])['memoryDeltaMb']
        .agg(['mean', 'median', 'std', 'count'])
        .reset_index()
    )
    display(mem_stats)

    # Wide layout for plotting: one row per dataset label, one column per mode.
    mem_wide = mem_stats.pivot_table(index='datasetLabel', columns='mode', values='mean')
    if not mem_wide.empty:
        labels = mem_wide.index.tolist()
        # If a mode is absent from the pivot, plot NaN so its bars are absent.
        mean_js = mem_wide['js'] if 'js' in mem_wide.columns else np.full(len(labels), np.nan)
        mean_wasm = mem_wide['wasm'] if 'wasm' in mem_wide.columns else np.full(len(labels), np.nan)
        x = np.arange(len(labels))
        width = 0.35
        fig, ax = plt.subplots(figsize=(max(8, len(labels) * 1.2), 4))
        ax.bar(x - width/2, mean_js, width, label='js')
        ax.bar(x + width/2, mean_wasm, width, label='wasm')
        ax.set_title('Mean memoryDeltaMb by dataset (js vs wasm)')
        ax.set_ylabel('Mean memoryDeltaMb')
        ax.set_xticks(x)
        ax.set_xticklabels(labels, rotation=45, ha='right')
        ax.legend()
        plt.tight_layout()
        plt.show()
else:
    print('memoryDeltaMb column not available in data.')


## F. Responsiveness & FPS (supporting)
Responsiveness thresholds: <16ms ~ smooth, >50ms noticeable.


In [None]:
# Grouped bar chart of mean fpsAvg per dataset label (js vs wasm).
if 'fpsAvg' in golden_df.columns:
    # Rows with a non-positive or missing fpsAvg are excluded from the mean.
    fps_df = golden_df[(golden_df['mode'].isin(['js', 'wasm'])) & (golden_df['fpsAvg'] > 0)]
    fps_stats = fps_df.groupby(['datasetLabel', 'mode'])['fpsAvg'].mean().reset_index()
    # Wide layout for plotting: one row per dataset label, one column per mode.
    fps_wide = fps_stats.pivot_table(index='datasetLabel', columns='mode', values='fpsAvg')
    if not fps_wide.empty:
        labels = fps_wide.index.tolist()
        # If a mode is absent from the pivot, plot NaN so its bars are absent.
        mean_js = fps_wide['js'] if 'js' in fps_wide.columns else np.full(len(labels), np.nan)
        mean_wasm = fps_wide['wasm'] if 'wasm' in fps_wide.columns else np.full(len(labels), np.nan)
        x = np.arange(len(labels))
        width = 0.35
        fig, ax = plt.subplots(figsize=(max(8, len(labels) * 1.2), 4))
        ax.bar(x - width/2, mean_js, width, label='js')
        ax.bar(x + width/2, mean_wasm, width, label='wasm')
        ax.set_title('Mean fpsAvg by dataset (js vs wasm)')
        ax.set_ylabel('fpsAvg')
        ax.set_xticks(x)
        ax.set_xticklabels(labels, rotation=45, ha='right')
        ax.legend()
        plt.tight_layout()
        plt.show()
else:
    print('fpsAvg column not available in data.')

# Grouped bar chart of median responsivenessMs per dataset label, with
# reference lines at the 16 ms and 50 ms thresholds.
if 'responsivenessMs' in golden_df.columns:
    # Rows with a non-positive or missing responsivenessMs are excluded.
    resp_df = golden_df[(golden_df['mode'].isin(['js', 'wasm'])) & (golden_df['responsivenessMs'] > 0)]
    resp_stats = resp_df.groupby(['datasetLabel', 'mode'])['responsivenessMs'].median().reset_index()
    # Wide layout for plotting: one row per dataset label, one column per mode.
    resp_wide = resp_stats.pivot_table(index='datasetLabel', columns='mode', values='responsivenessMs')
    if not resp_wide.empty:
        labels = resp_wide.index.tolist()
        # If a mode is absent from the pivot, plot NaN so its bars are absent.
        median_js = resp_wide['js'] if 'js' in resp_wide.columns else np.full(len(labels), np.nan)
        median_wasm = resp_wide['wasm'] if 'wasm' in resp_wide.columns else np.full(len(labels), np.nan)
        x = np.arange(len(labels))
        width = 0.35
        fig, ax = plt.subplots(figsize=(max(8, len(labels) * 1.2), 4))
        ax.bar(x - width/2, median_js, width, label='js')
        ax.bar(x + width/2, median_wasm, width, label='wasm')
        ax.axhline(16, color='green', linestyle='--', linewidth=1, label='16ms (smooth)')
        ax.axhline(50, color='red', linestyle='--', linewidth=1, label='50ms (noticeable)')
        ax.set_title('Median responsivenessMs by dataset (js vs wasm)')
        ax.set_ylabel('responsivenessMs')
        ax.set_xticks(x)
        ax.set_xticklabels(labels, rotation=45, ha='right')
        ax.legend()
        plt.tight_layout()
        plt.show()
else:
    print('responsivenessMs column not available in data.')


## G. Confidence intervals (lightweight)


In [None]:
def bootstrap_ci(data, n_resamples=1000, ci=95, seed=0):
    """Percentile-bootstrap confidence interval for the mean of ``data``.

    Args:
        data: Sequence or array of numeric observations.
        n_resamples: Number of bootstrap resamples to draw.
        ci: Confidence level in percent (95 gives the 2.5th and 97.5th
            percentiles of the resampled means).
        seed: Seed for the random generator, so results are reproducible.

    Returns:
        A ``(mean, low, high)`` tuple: the sample mean and the lower and
        upper interval bounds. All three are ``np.nan`` for empty input.
    """
    values = np.asarray(data)
    n_obs = len(values)
    if n_obs == 0:
        return np.nan, np.nan, np.nan

    rng = np.random.default_rng(seed)
    # One same-size resample (with replacement) per iteration, drawn
    # sequentially from a single generator.
    resampled_means = np.array([
        rng.choice(values, size=n_obs, replace=True).mean()
        for _ in range(n_resamples)
    ])

    tail = (100 - ci) / 2
    low = np.percentile(resampled_means, tail)
    high = np.percentile(resampled_means, 100 - tail)
    return values.mean(), low, high

# Bootstrap a confidence interval for the mean runtime of every
# (datasetLabel, mode) group with a known mode.
ci_records = []
for (ds_label, mode), group in golden_df[golden_df['mode'].isin(['js', 'wasm'])].groupby(['datasetLabel', 'mode']):
    runtimes = group['runtimeMs'].dropna().values
    mean, low, high = bootstrap_ci(runtimes)
    ci_records.append({
        'datasetLabel': ds_label,
        'mode': mode,
        'mean': mean,
        'ci_low': low,
        'ci_high': high,
        # Count the values that actually entered the bootstrap.
        'n': len(runtimes)
    })

ci_df = pd.DataFrame(ci_records)
# With no records the frame has no columns at all, and pivot_table would
# raise KeyError on the missing value columns; check first so the
# "no data" message below is actually reachable.
if ci_df.empty:
    ci_wide = pd.DataFrame()
else:
    ci_wide = ci_df.pivot_table(index='datasetLabel', columns='mode', values=['mean', 'ci_low', 'ci_high'])
if not ci_wide.empty:
    # Flatten the (statistic, mode) column pairs to names like 'mean_js'.
    ci_wide.columns = [f'{stat}_{mode}' for stat, mode in ci_wide.columns]
    # A speedup needs both modes; when one is missing entirely, report NaN
    # instead of failing on the absent column.
    if 'mean_js' in ci_wide.columns and 'mean_wasm' in ci_wide.columns:
        ci_wide['speedup'] = ci_wide['mean_js'] / ci_wide['mean_wasm']
    else:
        ci_wide['speedup'] = np.nan
    display(ci_wide.reset_index())
else:
    print('No CI data available (insufficient rows).')


## H. Failure/timeout analysis


In [None]:
# Failure rate per file-level wasmFeatures setting, from result exit codes.
if not results_df.empty:
    exit_codes = results_df['result_exitCode']
    # A run fails only when an exit code was recorded AND it is non-zero.
    # Missing codes are stored as NaN once the column also holds numbers,
    # and NaN != 0 is True, so presence must be tested with notna() rather
    # than "is not None". Missing codes are reported separately below.
    results_df['failure'] = exit_codes.notna() & (exit_codes != 0)
    failure_summary = (
        results_df
        # dropna=False keeps runs from files with no wasmFeatures value, so
        # the totals cover every recorded run.
        .groupby('file_wasmFeatures', dropna=False)
        .agg(
            total=('result_exitCode', 'size'),
            failures=('failure', 'sum'),
            missing_exitCode=('result_exitCode', lambda s: s.isna().sum())
        )
        .reset_index()
    )
    failure_summary['failure_rate'] = failure_summary['failures'] / failure_summary['total']
    display(failure_summary)
else:
    # Keep the name defined so later cells can test it.
    failure_summary = pd.DataFrame()
    print('No results-level data found for failure analysis.')


## I. Thesis-ready summary


In [None]:
# Assemble a Markdown bullet list summarising the headline numbers.
summary_lines = []

# One bullet per dataset: its speedup, or a note when it cannot be computed.
if not runtime_summary.empty:
    for _, row in runtime_summary.sort_values('datasetLabel').iterrows():
        label = row['datasetLabel']
        sp = row.get('speedup')
        if pd.isna(sp):
            summary_lines.append(f"- {label}: speedup n/a (missing mode)")
        else:
            summary_lines.append(f"- {label}: {sp:.2f}x median-based speedup (js/wasm)")

# Median of the per-dataset speedups.
if 'speedup' in runtime_summary.columns:
    overall_median_speedup = runtime_summary['speedup'].median()
else:
    overall_median_speedup = np.nan
if pd.notna(overall_median_speedup):
    summary_lines.append(f"- Overall median speedup: {overall_median_speedup:.2f}x")

# Log-log scaling slopes, when any could be estimated.
if scaling_slopes:
    slope_text = ", ".join(f"{mode} slope {slope:.2f}" for mode, slope in scaling_slopes.items())
    summary_lines.append("- Scaling (log-log): " + slope_text + ". Interpret cautiously.")

known_mode = golden_df['mode'].isin(['js', 'wasm'])

# Overall mean memory delta per mode.
if 'memoryDeltaMb' in golden_df.columns:
    mem_overall = golden_df[known_mode].groupby('mode')['memoryDeltaMb'].mean()
    if not mem_overall.empty:
        js_mem = mem_overall.get('js', np.nan)
        wasm_mem = mem_overall.get('wasm', np.nan)
        summary_lines.append(
            f"- Mean memoryDeltaMb: js={js_mem:.2f}, wasm={wasm_mem:.2f} (negatives can reflect GC)"
        )

# Overall median responsiveness per mode, over strictly positive samples.
if 'responsivenessMs' in golden_df.columns:
    positive_resp = golden_df['responsivenessMs'] > 0
    resp_overall = golden_df[known_mode & positive_resp].groupby('mode')['responsivenessMs'].median()
    if not resp_overall.empty:
        js_resp = resp_overall.get('js', np.nan)
        wasm_resp = resp_overall.get('wasm', np.nan)
        summary_lines.append(
            f"- Median responsivenessMs: js={js_resp:.1f}, wasm={wasm_resp:.1f} (16ms smooth, 50ms noticeable)"
        )

# Failure totals; the failure-analysis cell may not have been run, hence
# the globals() check.
if 'failure_summary' in globals() and not failure_summary.empty:
    total_failures = failure_summary['failures'].sum()
    total_runs = failure_summary['total'].sum()
    if total_runs > 0:
        summary_lines.append(
            f"- Failures/timeouts: {int(total_failures)} of {int(total_runs)} runs ({total_failures/total_runs:.1%})"
        )

display(Markdown("\n".join(summary_lines)))
