In [9]:
# Imports
import os, glob, re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Paths (relative to repo root); adjust if needed
RESULTS_DIR = '../results'
PLOTS_DIR = '../plots3'

os.makedirs(PLOTS_DIR, exist_ok=True)

print('Results dir:', os.path.abspath(RESULTS_DIR))
print('Plots dir  :', os.path.abspath(PLOTS_DIR))

Results dir: /workspaces/mcp-vs-dta/analysis/results
Plots dir  : /workspaces/mcp-vs-dta/analysis/plots3


In [10]:
def load_summaries(results_dir=RESULTS_DIR):
    import os, glob, pandas as pd
    paths = glob.glob(os.path.join(results_dir, '*', 'bench.csv'))
    frames = []
    for p in paths:
        profile = os.path.basename(os.path.dirname(p))
        df = pd.read_csv(p)
        df['profile'] = profile
        frames.append(df)
    if not frames:
        raise FileNotFoundError(f'No bench.csv files under {results_dir}/<profile>/')
    out = pd.concat(frames, ignore_index=True)
    # Ensure expected dtypes
    for col in ['sizeKB','meanMs','p50Ms','p95Ms','p99Ms','hz','iterations']:
        if col in out.columns:
            out[col] = pd.to_numeric(out[col], errors='coerce')
    return out

summary = load_summaries()
summary.head()

Unnamed: 0,timestamp,profile,section,implementation,operation,sizeKB,meanMs,p50Ms,p95Ms,p99Ms,hz,iterations,url
0,2025-10-07T14:03:02.387Z,delay40,HTTP,dta,,1,82.059189,82.291862,94.753367,98.02334,12.186326,122,http://localhost:8080/blob?size=512
1,2025-10-07T14:03:02.387Z,delay40,HTTP,dta,,1,80.854407,81.157213,94.545917,97.805563,12.36791,124,http://localhost:8080/blob?size=1024
2,2025-10-07T14:03:02.387Z,delay40,HTTP,dta,,2,82.005159,82.004658,95.527942,97.286851,12.194355,123,http://localhost:8080/blob?size=2048
3,2025-10-07T14:03:02.387Z,delay40,HTTP,dta,,4,80.99288,81.540715,94.998968,97.221742,12.346764,124,http://localhost:8080/blob?size=4096
4,2025-10-07T14:03:02.387Z,delay40,HTTP,dta,,8,82.329188,82.594712,94.603948,97.122271,12.146361,122,http://localhost:8080/blob?size=8192


In [11]:
# Uses your existing `summary = load_summaries()` and RESULTS_DIR/PLOTS_DIR set to '../*'

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

os.makedirs(PLOTS_DIR, exist_ok=True)

# Keep only the two impls we compare
summary = summary[summary["implementation"].isin(["dta", "mcp"])].copy()

# Convenient splits
http = summary[summary["section"] == "HTTP"].copy()
fs   = summary[summary["section"] == "FS"].copy()

http["sizeB"] = http["sizeKB"] * 1024
fs["sizeB"]   = fs["sizeKB"] * 1024

def compute_overhead(df, keys, metrics=("p50Ms","p95Ms","p99Ms")):
    piv = df.pivot_table(
        index=keys, columns="implementation", values=list(metrics), aggfunc="median"
    )
    piv.columns = [f"{m}_{impl}" for m, impl in piv.columns]
    piv = piv.reset_index()
    for m in metrics:
        dcol = f"{m}_dta"
        mcol = f"{m}_mcp"
        if dcol in piv and mcol in piv:
            piv[f"{m}_overhead_pct"] = (piv[mcol] - piv[dcol]) / piv[dcol] * 100.0
            piv[f"{m}_delta_ms"]     =  piv[mcol] - piv[dcol]
    return piv

http_overhead = compute_overhead(http, ["profile","sizeKB"])
fs_overhead   = compute_overhead(fs,   ["profile","operation","sizeKB"])

# Summaries you can cite in the paper
def summarize_overhead(ov_df, group_cols):
    return ov_df.groupby(group_cols).agg(
        p50_overhead_med_pct=("p50Ms_overhead_pct","median"),
        p95_overhead_med_pct=("p95Ms_overhead_pct","median"),
        p99_overhead_med_pct=("p99Ms_overhead_pct","median"),
        p50_overhead_mean_pct=("p50Ms_overhead_pct","mean"),
        p95_overhead_mean_pct=("p95Ms_overhead_pct","mean"),
        p99_overhead_mean_pct=("p99Ms_overhead_pct","mean"),
    ).reset_index()

http_summary = summarize_overhead(http_overhead, ["profile"])
fs_summary   = summarize_overhead(fs_overhead,   ["profile","operation"])

# Persist machine-readable outputs
Path(PLOTS_DIR, "derived").mkdir(exist_ok=True)
http.to_csv(Path(PLOTS_DIR, "derived", "http_all.csv"), index=False)
fs.to_csv(Path(PLOTS_DIR, "derived", "fs_all.csv"), index=False)
http_overhead.to_csv(Path(PLOTS_DIR, "derived", "http_overhead.csv"), index=False)
fs_overhead.to_csv(Path(PLOTS_DIR, "derived", "fs_overhead.csv"), index=False)
http_summary.to_csv(Path(PLOTS_DIR, "derived", "http_overhead_summary_by_profile.csv"), index=False)
fs_summary.to_csv(Path(PLOTS_DIR, "derived", "fs_overhead_summary_by_profile_operation.csv"), index=False)

http_summary, fs_summary.head(9)


(    profile  p50_overhead_med_pct  p95_overhead_med_pct  p99_overhead_med_pct  \
 0    bursty              2.432060              2.380241              3.321294   
 1   default             57.546702             51.120946             47.571166   
 2   delay40              1.539131             -0.324490              1.105978   
 3     loss1             45.071703             18.941397              0.020971   
 4   netem40              1.249350              0.164040             -0.277541   
 5  slowlink             -3.018334             -1.376755             -1.102582   
 
    p50_overhead_mean_pct  p95_overhead_mean_pct  p99_overhead_mean_pct  
 0               2.305442               2.430323               3.178355  
 1              53.756253              66.897141              45.398263  
 2               1.176644               0.593084               1.226940  
 3              47.145394              23.360688              -8.784960  
 4               3.138729               7.551529      

In [12]:
def lineplot_xy(df, x, y_cols, title, xlabel, ylabel, outfile):
    plt.figure(figsize=(9,6))
    for label, col in y_cols.items():
        if col in df and not pd.isna(df[col]).all():
            plt.plot(df[x].values, df[col].values, marker="o", label=label)
    # sizes double → log2 x-axis makes trends clearer across payloads
    if x.lower().endswith("kb") or x.lower().endswith("b"):
        try:
            plt.xscale("log", base=2)
        except Exception:
            pass
    plt.grid(True, which="both", linewidth=0.5, alpha=0.6)
    plt.xlabel(xlabel); plt.ylabel(ylabel); plt.title(title)
    if len(y_cols) > 1: plt.legend()
    plt.tight_layout()
    plt.savefig(outfile, dpi=180)
    plt.close()


In [13]:
for prof in sorted(http["profile"].unique()):
    dprof = http[http["profile"] == prof].sort_values("sizeKB")
    agg = dprof.groupby(["profile","sizeKB","implementation"], as_index=False)[["p95Ms"]].median()
    piv = agg.pivot_table(index="sizeKB", columns="implementation", values="p95Ms", aggfunc="first").reset_index().sort_values("sizeKB")

    outfile = os.path.join(PLOTS_DIR, f"combined_http_p95_{prof}.png")
    ycols = {}
    if "dta" in piv: ycols["DTA"] = "dta"
    if "mcp" in piv: ycols["MCP"] = "mcp"

    lineplot_xy(
        piv, x="sizeKB", y_cols=ycols,
        title=f"HTTP p95 vs Size — {prof}",
        xlabel="Payload size (KB)", ylabel="Latency p95 (ms)", outfile=outfile
    )
outfile


'../plots3/combined_http_p95_slowlink.png'

In [14]:
for prof in sorted(fs["profile"].unique()):
    for op in ["read","write","remove"]:
        sub = fs[(fs["profile"] == prof) & (fs["operation"] == op)]
        if sub.empty:
            continue
        agg = sub.groupby(["profile","operation","sizeKB","implementation"], as_index=False)[["p95Ms"]].median()
        piv = agg.pivot_table(index="sizeKB", columns="implementation", values="p95Ms", aggfunc="first").reset_index().sort_values("sizeKB")

        outfile = os.path.join(PLOTS_DIR, f"combined_fs_{op}_p95_{prof}.png")
        ycols = {}
        if "dta" in piv: ycols["DTA"] = "dta"
        if "mcp" in piv: ycols["MCP"] = "mcp"

        lineplot_xy(
            piv, x="sizeKB", y_cols=ycols,
            title=f"FS {op} p95 vs Size — {prof}",
            xlabel="Payload size (KB)", ylabel="Latency p95 (ms)", outfile=outfile
        )
outfile


'../plots3/combined_fs_remove_p95_slowlink.png'

In [15]:
# HTTP
for prof in sorted(http_overhead["profile"].unique()):
    sub = http_overhead[http_overhead["profile"] == prof].sort_values("sizeKB")
    outfile = os.path.join(PLOTS_DIR, f"combined_http_overhead_p95_{prof}.png")
    lineplot_xy(
        sub, x="sizeKB",
        y_cols={"MCP overhead vs DTA (p95, %)": "p95Ms_overhead_pct"},
        title=f"HTTP MCP overhead vs DTA (p95) — {prof}",
        xlabel="Payload size (KB)", ylabel="Overhead (%)", outfile=outfile
    )

# FS
for prof in sorted(fs_overhead["profile"].unique()):
    for op in ["read","write","remove"]:
        sub = fs_overhead[(fs_overhead["profile"] == prof) & (fs_overhead["operation"] == op)].sort_values("sizeKB")
        if sub.empty:
            continue
        outfile = os.path.join(PLOTS_DIR, f"combined_fs_overhead_{op}_p95_{prof}.png")
        lineplot_xy(
            sub, x="sizeKB",
            y_cols={"MCP overhead vs DTA (p95, %)": "p95Ms_overhead_pct"},
            title=f"FS {op} MCP overhead vs DTA (p95) — {prof}",
            xlabel="Payload size (KB)", ylabel="Overhead (%)", outfile=outfile
        )


In [16]:
import pandas as pd
from pathlib import Path

def _read_latency_series(path: Path):
    df = pd.read_csv(path)
    # Guess: any numeric col with 'ms' or 'lat' in name; else first numeric column
    cands = [c for c in df.columns if ("ms" in c.lower() or "lat" in c.lower()) and pd.api.types.is_numeric_dtype(df[c])]
    col = cands[0] if cands else next((c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])), None)
    if col is None:
        return None
    return pd.to_numeric(df[col], errors="coerce").dropna()

def plot_ecdf_overlay(profile: str, sizeB: int):
    raw_dir = Path(RESULTS_DIR) / profile / "raw"
    if not raw_dir.exists():
        return
    series = {}
    for impl in ["dta","mcp"]:
        p = raw_dir / f"HTTP_{impl}_{sizeB}B_http.csv"
        if p.exists():
            s = _read_latency_series(p)
            if s is not None and len(s) > 0:
                series[impl] = s
    if not series:
        return

    xs, ys, labels = [], [], []
    plt.figure(figsize=(9,6))
    for impl, s in series.items():
        x = np.sort(s.values)
        y = np.arange(1, len(x)+1) / len(x)
        lbl = "DTA" if impl=="dta" else "MCP"
        plt.plot(x, y, label=lbl)

    plt.grid(True, which="both", linewidth=0.5, alpha=0.6)
    plt.xlabel("Per-request latency (ms)"); plt.ylabel("ECDF")
    plt.title(f"HTTP ECDF — {profile}, size={sizeB} B")
    plt.legend(); plt.tight_layout()
    out = Path(PLOTS_DIR) / f"combined_http_ecdf_{profile}_{sizeB}B.png"
    plt.savefig(out, dpi=180); plt.close()
    return out

ecdfs = []
for prof in sorted(http["profile"].unique()):
    for sizeB in [512, 1024, 1048576]:
        outp = plot_ecdf_overlay(prof, sizeB)
        if outp: ecdfs.append(outp)
ecdfs[:5]


[PosixPath('../plots3/combined_http_ecdf_bursty_512B.png'),
 PosixPath('../plots3/combined_http_ecdf_bursty_1024B.png'),
 PosixPath('../plots3/combined_http_ecdf_bursty_1048576B.png'),
 PosixPath('../plots3/combined_http_ecdf_default_512B.png'),
 PosixPath('../plots3/combined_http_ecdf_default_1024B.png')]