In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib import pyplot

from parse import parse

import os
import re

In [None]:
# Global matplotlib styling shared by every figure in this notebook.
plt.rcParams.update({
    'font.size': 16,              # base font size
    'legend.fontsize': 14,        # legend entries
    'legend.title_fontsize': 14,  # legend titles
    'pdf.fonttype': 42,           # font type used when saving PDFs (42 = TrueType per matplotlib docs)
})

In [None]:
# Memory accounting (sizes in KB):
#   ML pipeline overhead = FM_SIZE + ET_SIZE + model size = 290 + 117 + model size.
# When the model size is 531 KB, the CMS size is 1 MB (same as the paper setup).
# For any other model size, the CMS is given more or less memory according to the
# difference. The baseline and Elastic Sketch are given 1 MB plus the additional
# memory used by the Elephant Tracker.
CODA_NAME = 'DUMBO'   # label shown in the figure legends
CODA_SIZE = 531       # model size of the CODA_NAME data point
ET_SIZE = 117         # Elephant Tracker share of the pipeline overhead
FM_SIZE = 290         # fixed share of the pipeline overhead

# Locations of the simulation outputs and of the trace-side CSV files.
results_dir = '../../results/simu_output'
traces_dir = f'{results_dir}/../../traces'

In [None]:
# Trace selection. Exactly one (hh_frac, trace_fmt, minutes) triple is active:
#   hh_frac   -> appears in result/trace paths as 'top_{hh_frac}_pct'
#   trace_fmt -> trace path template; '{}' is filled with each entry of `minutes`
#   minutes   -> identifiers of the traces whose results are aggregated

# CAIDA (active)
hh_frac = 0.02
trace_fmt = 'traces/caida/20160121-13{}00.UTC.anon.pcap'
minutes = list(range(46, 56))

# Alternative datasets (uncomment one line and comment out the CAIDA block above):
#hh_frac, trace_fmt, minutes = 0.015, 'traces/mawi/{}.pcap', [1916, 1917, 1918, 1919, 1920, 1921, 1922, 1923, 1924, 1925]
#hh_frac, trace_fmt, minutes = 0.03, 'traces/uni/{}.pcap', [146, 147, 148, 149, 151, 152, 153, 154, 155, 156]

# Single-trace variants. NOTE(review): these set `trace`, not `trace_fmt`/`minutes`,
# which the cells below read -- confirm before re-enabling.
#hh_frac, trace = 0.02, 'traces/caida/20160121-135000.UTC.anon.pcap'
#hh_frac, trace = 0.015, 'traces/mawi/201904091920.pcap'

proto = 'all'

# Dataset name = directory that directly contains the trace file (e.g. 'caida').
trace_set = trace_fmt.rsplit('/', 2)[-2]

## AP score

In [None]:
# Collect the "Total Error" value from every result file of the AP-score experiment
# (one folder per trace minute) and reduce it to the median across minutes.
#
# Results used by the plotting cell below:
#   baseline, es, coda, pheavy -> scalar median errors
#   errors_df                  -> median error per (AP, size), from 'sim_ap*' files
#   oracle_errors_df           -> median error per size, from 'oracle_*' files

# Matches plain decimals and scientific notation with either exponent sign
# (e.g. "1.5e+06"). The former character class `[\d\.e-]+` had no '+', so such a
# value was cut at "1.5e" and float() raised.
TOTAL_ERROR_RE = re.compile(r'Total Error:\s+([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)')

data = []      # rows: [minute, AP, size, error]
oracle = []    # rows: [minute, size, error]
# One flat list of errors per fixed-configuration model, keyed by file-name prefix.
scalar_errors = {'baseline': [], 'es': [], 'coda': [], 'pheavy': []}

for mm in minutes:
    trace = trace_fmt.format(mm)
    trace_set, trace_name = trace.split('/')[-2:]
    trace_id = f"{trace_set}-{trace_name.split('.')[0]}"
    folder = f'{results_dir}/{trace_id}/{proto}/top_{hh_frac}_pct/fse/memory_1.0MB/error'

    for filename in os.listdir(folder):
        tokens = filename.split('_')
        model = tokens[0].replace('.txt', '')

        # Classify the file from its name first; irrelevant files are never opened.
        if model in scalar_errors:
            pass
        elif model == 'oracle':
            size, = parse("oracle_{:d}KB.txt", filename)
        elif model == 'sim' and len(tokens) > 1 and tokens[1][:2] == 'ap':
            ap, size = parse("sim_ap{:f}_{:d}KB.txt", filename)
        else:
            continue

        # Single place where the error value is read and validated.
        with open(f"{folder}/{filename}", 'r') as f:
            match = TOTAL_ERROR_RE.search(f.read())
        if match is None:
            raise ValueError(f"no 'Total Error:' value found in {folder}/{filename}")
        error = float(match.group(1))

        if model in scalar_errors:
            scalar_errors[model].append(error)
        elif model == 'oracle':
            oracle.append([mm, size, error])
        else:
            data.append([mm, ap, size, error])

errors_df = pd.DataFrame(data, columns=['minute', 'AP', 'size', 'error'])
oracle_errors_df = pd.DataFrame(oracle, columns=['minute', 'size', 'error'])

# median on multiple minutes
baseline = np.median(scalar_errors['baseline'])
es = np.median(scalar_errors['es'])
coda = np.median(scalar_errors['coda'])
pheavy = np.median(scalar_errors['pheavy'])
errors_df = errors_df.groupby(['AP', 'size']).median().reset_index()[['AP', 'size', 'error']]
oracle_errors_df = oracle_errors_df.groupby(['size']).median().reset_index()[['size', 'error']]

In [None]:
# AP-score figure: AWAE (y) against ML pipeline overhead in KB (bottom x axis), with the
# corresponding model size on the top x axis. Uses the medians computed in the cell above.
errors_df = errors_df.sort_values(by=['AP', 'size'])
oracle_errors_df = oracle_errors_df.sort_values(by=['size'])

fig, ax = plt.subplots(1, 1, figsize=(6,4))
plot_lines_model = {}

# Iterate over each AP and plot the corresponding line (only the selected AP values)
for ap in errors_df['AP'].unique():
    if ap in [0.1, 0.2, 0.4, 0.6, 0.8, 0.9]:
        ap_data = errors_df[errors_df['AP'] == ap]
        l, = ax.plot(ap_data['size'] + FM_SIZE + ET_SIZE, ap_data['error'], zorder=0)
        plot_lines_model[f'{ap}'] = l

# oracle
ax.plot(oracle_errors_df['size'] + FM_SIZE + ET_SIZE, oracle_errors_df['error'], color='gold', label='oracle', zorder=0)

# baseline
ax.axhline(y=baseline, color='black', linestyle='--', label='CMS', zorder=1)

# Elastic Sketch
ax.axhline(y=es, color='black', linestyle=':', label='ElasticSketch', zorder=1)

# Each extra legend is re-attached with add_artist so that the next legend call
# on the same axes does not replace it.
legend_model = ax.legend(plot_lines_model.values(), plot_lines_model.keys(), title='Model AP-score:',
                         ncol=2, columnspacing=0.9, frameon=False, borderaxespad=0.0, loc='upper left')
ax.add_artist(legend_model)

# Coda
l1 = ax.scatter(CODA_SIZE + FM_SIZE + ET_SIZE, coda, marker="*", color='black', s=120, zorder=2)
# pHeavy: the marker is empty and x=1805 lies beyond x_max, so nothing is drawn at the
# point; the handle only carries the legend text with the pHeavy value.
l2 = ax.scatter(1805, pheavy, marker="", color='black', s=120, zorder=2)
ax.add_artist(ax.legend([l1], [CODA_NAME],
                        ncol=1, columnspacing=0.9, frameon=False, borderaxespad=0.05, handletextpad=0.2,
                        bbox_to_anchor=(1, 0.2), loc='lower right'))
ax.add_artist(ax.legend([l2], [f'(pHeavy AWAE={pheavy:.0f})'],
                        ncol=1, columnspacing=0.9, frameon=False, borderaxespad=0.05, handletextpad=0.2,
                        bbox_to_anchor=(1, 0.1), loc='lower right'))

def second_xaxis(x):
    """Map pipeline overhead (bottom axis) to model size (top axis)."""
    return x - FM_SIZE - ET_SIZE

def second_xaxis_inverse(x):
    """Map model size (top axis) back to pipeline overhead; inverse of `second_xaxis`."""
    return x + FM_SIZE + ET_SIZE

# secondary_xaxis expects (forward, inverse); the second function must undo the first.
ax2 = ax.secondary_xaxis('top', functions=(second_xaxis, second_xaxis_inverse))

# Legend for the labelled artists (oracle, CMS, ElasticSketch).
ax.legend(ncol=3, columnspacing=0.9, frameon=False, loc='lower center', borderaxespad=0.00)

x_min = 600
x_max = 1450
y_max = es*2.5
# y_min is solved so that the oracle error at size `x_min - 450` sits at 10% of the
# y-range: (v - y_min) / (y_max - y_min) = 0.1. Presumably this keeps the bottom legend
# clear of the curves -- TODO confirm. Raises IndexError if no oracle row has that size.
y_min = (oracle_errors_df.loc[oracle_errors_df['size'] == x_min - 450, 'error'].values[0] - 0.1*y_max)/0.9

ax.set_xlim([x_min, x_max])
ax.set_ylim([y_min, y_max])

ax.set_xlabel('ML pipeline overhead [KB]')
ax.set_ylabel('AWAE')
ax2.set_xlabel('Model size [KB]')
ax.grid(alpha=0.5)

# Explicit margins are used here in place of plt.tight_layout().
fig.subplots_adjust(bottom=0.15, top=0.86, left=0.12, right=0.96)
# Make sure the output directory exists so savefig does not fail on a fresh checkout.
os.makedirs('out', exist_ok=True)
fig.savefig(f'out/fse_sim_ap.{trace_set}.{proto}.{hh_frac}.pdf', format='pdf', dpi=1200)
plt.show()

## FNR

In [None]:
# Collect the "Total Error" value from every result file of the FNR experiment
# (one folder per trace minute) and reduce it to the median across minutes.
#
# Results used by the plotting cell below:
#   baseline, es, coda, pheavy -> scalar median errors
#   errors_df                  -> median error per (FNR, size), from 'sim_fnr*' files
#   oracle_errors_df           -> median error per size, from 'oracle_*' files

# Matches plain decimals and scientific notation with either exponent sign
# (e.g. "1.5e+06"). The former character class `[\d\.e-]+` had no '+', so such a
# value was cut at "1.5e" and float() raised.
TOTAL_ERROR_RE = re.compile(r'Total Error:\s+([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)')

data = []      # rows: [minute, FNR, FPR, size, error]
oracle = []    # rows: [minute, size, error]
# One flat list of errors per fixed-configuration model, keyed by file-name prefix.
scalar_errors = {'baseline': [], 'es': [], 'coda': [], 'pheavy': []}

for mm in minutes:
    trace = trace_fmt.format(mm)
    trace_set, trace_name = trace.split('/')[-2:]
    trace_id = f"{trace_set}-{trace_name.split('.')[0]}"
    folder = f'{results_dir}/{trace_id}/{proto}/top_{hh_frac}_pct/fse/memory_1.0MB/error'

    # FNR -> FPR pairs for this trace, both rounded to 3 decimals.
    # `trace_name[:-5]` drops the last five characters of the trace file name ('.pcap').
    fnr_to_fpr_file = f'{traces_dir}/{trace_set}/{trace_name[:-5]}.{proto}.{hh_frac}_fnr_to_fpr.csv'
    fnr_to_fpr = {round(k, 3): round(v, 3) for k, v in pd.read_csv(fnr_to_fpr_file, usecols=["fnr", "fpr"], index_col=0).squeeze("columns").to_dict().items()}

    for filename in os.listdir(folder):
        tokens = filename.split('_')
        model = tokens[0].replace('.txt', '')

        # Classify the file from its name first; irrelevant files are never opened.
        if model in scalar_errors:
            pass
        elif model == 'oracle':
            size, = parse("oracle_{:d}KB.txt", filename)
        elif model == 'sim' and len(tokens) > 1 and tokens[1][:3] == 'fnr':
            fnr, fpr, size = parse("sim_fnr{:f}_fpr{:f}_{:d}KB.txt", filename)
            # Keep only the (FNR, FPR) combinations listed in the CSV for this trace.
            if (fnr, fpr) not in fnr_to_fpr.items():
                continue
        else:
            continue

        # Single place where the error value is read and validated.
        with open(f"{folder}/{filename}", 'r') as f:
            match = TOTAL_ERROR_RE.search(f.read())
        if match is None:
            raise ValueError(f"no 'Total Error:' value found in {folder}/{filename}")
        error = float(match.group(1))

        if model in scalar_errors:
            scalar_errors[model].append(error)
        elif model == 'oracle':
            oracle.append([mm, size, error])
        else:
            data.append([mm, fnr, fpr, size, error])

errors_df = pd.DataFrame(data, columns=['minute', 'FNR', 'FPR', 'size', 'error'])
oracle_errors_df = pd.DataFrame(oracle, columns=['minute', 'size', 'error'])

# median on various minutes
baseline = np.median(scalar_errors['baseline'])
es = np.median(scalar_errors['es'])
coda = np.median(scalar_errors['coda'])
pheavy = np.median(scalar_errors['pheavy'])
errors_df = errors_df.groupby(['FNR', 'size']).median().reset_index()[['FNR', 'size', 'error']]
oracle_errors_df = oracle_errors_df.groupby(['size']).median().reset_index()[['size', 'error']]

In [None]:
# FNR figure: AWAE (y) against ML pipeline overhead in KB (bottom x axis), with the
# corresponding model size on the top x axis. Uses the medians computed in the cell above.
errors_df = errors_df.sort_values(by=['FNR', 'size'])
oracle_errors_df = oracle_errors_df.sort_values(by=['size'])

fig, ax = plt.subplots(1, 1, figsize=(6,4))
model_lines = {}

# Iterate over each FNR and plot the corresponding line
for fnr in errors_df['FNR'].unique():
    # Snap the FNR to the nearest multiple of 0.05 before selecting / labelling it.
    fnr_label = float(f'{fnr*2:.1f}')/2
    if fnr_label in [0.1, 0.2, 0.3, 0.4, 0.6, 0.8]:
        ap_data = errors_df[errors_df['FNR'] == fnr]
        l, = ax.plot(ap_data['size'] + FM_SIZE + ET_SIZE, ap_data['error'])
        # Two FNR values snapping to the same label share one legend entry (last one wins).
        model_lines[f'{fnr_label*100:.0f}%'] = l

# oracle
ax.plot(oracle_errors_df['size'] + FM_SIZE + ET_SIZE, oracle_errors_df['error'], color='gold', label='oracle')

# baseline
ax.axhline(y=baseline, color='black', linestyle='--', label='CMS')

# Elastic Sketch
ax.axhline(y=es, color='black', linestyle=':', label='ElasticSketch')

# Each extra legend is re-attached with add_artist so that the next legend call
# on the same axes does not replace it.
legend_model = ax.legend(model_lines.values(), model_lines.keys(), title='Elephants mispredictions:',
                         ncol=2, columnspacing=0.9, frameon=False, borderaxespad=0.0)
ax.add_artist(legend_model)

# Coda
l1 = ax.scatter(CODA_SIZE + FM_SIZE + ET_SIZE, coda, marker="*", color='black', s=120, zorder=2)
# pHeavy: the marker is empty and x=1805 lies beyond x_max, so nothing is drawn at the
# point; the handle only carries the legend text with the pHeavy value.
l2 = ax.scatter(1805, pheavy, marker="", color='black', s=120, zorder=2)
ax.add_artist(ax.legend([l1], [CODA_NAME],
                        ncol=1, columnspacing=0.9, frameon=False, borderaxespad=0.05, handletextpad=0.2,
                        bbox_to_anchor=(1, 0.2), loc='lower right'))
ax.add_artist(ax.legend([l2], [f'(pHeavy AWAE={pheavy:.0f})'],
                        ncol=1, columnspacing=0.9, frameon=False, borderaxespad=0.05, handletextpad=0.2,
                        bbox_to_anchor=(1, 0.1), loc='lower right'))

def second_xaxis(x):
    """Map pipeline overhead (bottom axis) to model size (top axis)."""
    return x - FM_SIZE - ET_SIZE

def second_xaxis_inverse(x):
    """Map model size (top axis) back to pipeline overhead; inverse of `second_xaxis`."""
    return x + FM_SIZE + ET_SIZE

# secondary_xaxis expects (forward, inverse); the second function must undo the first.
ax2 = ax.secondary_xaxis('top', functions=(second_xaxis, second_xaxis_inverse))

# Legend for the labelled artists (oracle, CMS, ElasticSketch).
ax.legend(ncol=3, columnspacing=0.9, frameon=False, loc='lower center', borderaxespad=0.00)

x_min = 600
x_max = 1450
y_max = es*2.5
# y_min is solved so that the oracle error at size `x_min - 450` sits at 10% of the
# y-range: (v - y_min) / (y_max - y_min) = 0.1. Presumably this keeps the bottom legend
# clear of the curves -- TODO confirm. Raises IndexError if no oracle row has that size.
y_min = (oracle_errors_df.loc[oracle_errors_df['size'] == x_min - 450, 'error'].values[0] - 0.1*y_max)/0.9

ax.set_xlim([x_min, x_max])
ax.set_ylim([y_min, y_max])

ax.set_xlabel('ML pipeline overhead [KB]')
ax.set_ylabel('AWAE')
ax2.set_xlabel('Model size [KB]')

ax.grid(alpha=0.5)

# Explicit margins are used here in place of plt.tight_layout().
fig.subplots_adjust(bottom=0.15, top=0.86, left=0.12, right=0.96)
# Make sure the output directory exists so savefig does not fail on a fresh checkout.
os.makedirs('out', exist_ok=True)
fig.savefig(f'out/fse_sim_fnr.{trace_set}.{proto}.{hh_frac}.pdf', format='pdf', dpi=1200)
plt.show()

## FPR

In [None]:
# Collect the "Total Error" value from every result file of the FPR experiment
# (one folder per trace minute) and reduce it to the median across minutes.
#
# Results used by the plotting cell below:
#   baseline, es, coda, pheavy -> scalar median errors
#   errors_df                  -> median error per (FPR, size), from 'sim_fnr*' files
#   oracle_errors_df           -> median error per size, from 'oracle_*' files

# Matches plain decimals and scientific notation with either exponent sign
# (e.g. "1.5e+06"). The former character class `[\d\.e-]+` had no '+', so such a
# value was cut at "1.5e" and float() raised.
TOTAL_ERROR_RE = re.compile(r'Total Error:\s+([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)')

data = []      # rows: [minute, FNR, FPR, size, error]
oracle = []    # rows: [minute, size, error]
# One flat list of errors per fixed-configuration model, keyed by file-name prefix.
scalar_errors = {'baseline': [], 'es': [], 'coda': [], 'pheavy': []}

for mm in minutes:
    trace = trace_fmt.format(mm)
    trace_set, trace_name = trace.split('/')[-2:]
    trace_id = f"{trace_set}-{trace_name.split('.')[0]}"
    folder = f'{results_dir}/{trace_id}/{proto}/top_{hh_frac}_pct/fse/memory_1.0MB/error'

    # FPR -> FNR pairs for this trace, both rounded to 3 decimals.
    # `trace_name[:-5]` drops the last five characters of the trace file name ('.pcap').
    fpr_to_fnr_file = f'{traces_dir}/{trace_set}/{trace_name[:-5]}.{proto}.{hh_frac}_fpr_to_fnr.csv'
    fpr_to_fnr = {round(k, 3): round(v, 3) for k, v in pd.read_csv(fpr_to_fnr_file, usecols=["fpr", "fnr"], index_col=0).squeeze("columns").to_dict().items()}

    for filename in os.listdir(folder):
        tokens = filename.split('_')
        model = tokens[0].replace('.txt', '')

        # Classify the file from its name first; irrelevant files are never opened.
        if model in scalar_errors:
            pass
        elif model == 'oracle':
            size, = parse("oracle_{:d}KB.txt", filename)
        elif model == 'sim' and len(tokens) > 1 and tokens[1][:3] == 'fnr':
            fnr, fpr, size = parse("sim_fnr{:f}_fpr{:f}_{:d}KB.txt", filename)
            # Keep only the (FPR, FNR) combinations listed in the CSV for this trace.
            if (fpr, fnr) not in fpr_to_fnr.items():
                continue
        else:
            continue

        # Single place where the error value is read and validated.
        with open(f"{folder}/{filename}", 'r') as f:
            match = TOTAL_ERROR_RE.search(f.read())
        if match is None:
            raise ValueError(f"no 'Total Error:' value found in {folder}/{filename}")
        error = float(match.group(1))

        if model in scalar_errors:
            scalar_errors[model].append(error)
        elif model == 'oracle':
            oracle.append([mm, size, error])
        else:
            data.append([mm, fnr, fpr, size, error])

errors_df = pd.DataFrame(data, columns=['minute', 'FNR', 'FPR', 'size', 'error'])
oracle_errors_df = pd.DataFrame(oracle, columns=['minute', 'size', 'error'])

# median on various minutes
baseline = np.median(scalar_errors['baseline'])
es = np.median(scalar_errors['es'])
coda = np.median(scalar_errors['coda'])
pheavy = np.median(scalar_errors['pheavy'])
errors_df = errors_df.groupby(['FPR', 'size']).median().reset_index()[['FPR', 'size', 'error']]
oracle_errors_df = oracle_errors_df.groupby(['size']).median().reset_index()[['size', 'error']]

In [None]:
# FPR figure: AWAE (y) against ML pipeline overhead in KB (bottom x axis), with the
# corresponding model size on the top x axis. Uses the medians computed in the cell above.
errors_df = errors_df.sort_values(by=['FPR', 'size'])
oracle_errors_df = oracle_errors_df.sort_values(by=['size'])

fig, ax = plt.subplots(1, 1, figsize=(6,4))
model_lines = {}

# Plot one line for each of the first seven FPR values (ascending, after the sort above)
for fpr in errors_df['FPR'].unique()[:7]:
    fpr_data = errors_df[errors_df['FPR'] == fpr]
    l, = ax.plot(fpr_data['size'] + FM_SIZE + ET_SIZE, fpr_data['error'])
    model_lines[f'{fpr*100:.1f}%'] = l

# oracle
ax.plot(oracle_errors_df['size'] + FM_SIZE + ET_SIZE, oracle_errors_df['error'], color='gold', label='oracle')

# baseline
ax.axhline(y=baseline, color='black', linestyle='--', label='CMS')

# Elastic Sketch
ax.axhline(y=es, color='black', linestyle=':', label='ElasticSketch')

# Each extra legend is re-attached with add_artist so that the next legend call
# on the same axes does not replace it.
legend_model = ax.legend(model_lines.values(), model_lines.keys(), title='Mice mispredictions:',
                         ncol=2, columnspacing=0.9, frameon=False, borderaxespad=0.00, loc='upper left')
ax.add_artist(legend_model)

# Coda
l1 = ax.scatter(CODA_SIZE + FM_SIZE + ET_SIZE, coda, marker="*", color='black', s=120, zorder=2)
# pHeavy: the marker is empty and x=1805 lies beyond x_max, so nothing is drawn at the
# point; the handle only carries the legend text with the pHeavy value.
l2 = ax.scatter(1805, pheavy, marker="", color='black', s=120, zorder=2)
ax.add_artist(ax.legend([l1], [CODA_NAME],
                        ncol=1, columnspacing=0.9, frameon=False, borderaxespad=0.05, handletextpad=0.2,
                        bbox_to_anchor=(1, 0.2), loc='lower right'))
ax.add_artist(ax.legend([l2], [f'(pHeavy AWAE={pheavy:.0f})'],
                        ncol=1, columnspacing=0.9, frameon=False, borderaxespad=0.05, handletextpad=0.2,
                        bbox_to_anchor=(1, 0.1), loc='lower right'))

def second_xaxis(x):
    """Map pipeline overhead (bottom axis) to model size (top axis)."""
    return x - FM_SIZE - ET_SIZE

def second_xaxis_inverse(x):
    """Map model size (top axis) back to pipeline overhead; inverse of `second_xaxis`."""
    return x + FM_SIZE + ET_SIZE

# secondary_xaxis expects (forward, inverse); the second function must undo the first.
ax2 = ax.secondary_xaxis('top', functions=(second_xaxis, second_xaxis_inverse))

# Legend for the labelled artists (oracle, CMS, ElasticSketch).
ax.legend(ncol=3, columnspacing=0.9, frameon=False, loc='lower center', borderaxespad=0.0)

x_min = 600
x_max = 1450
y_max = es*2.5
# y_min is solved so that the oracle error at size `x_min - 450` sits at 10% of the
# y-range: (v - y_min) / (y_max - y_min) = 0.1. Presumably this keeps the bottom legend
# clear of the curves -- TODO confirm. Raises IndexError if no oracle row has that size.
y_min = (oracle_errors_df.loc[oracle_errors_df['size'] == x_min - 450, 'error'].values[0] - 0.1*y_max)/0.9

ax.set_xlim([x_min, x_max])
ax.set_ylim([y_min, y_max])

ax.set_xlabel('ML pipeline overhead [KB]')
ax.set_ylabel('AWAE')
ax2.set_xlabel('Model size [KB]')

ax.grid(alpha=0.5)

# Explicit margins are used here in place of plt.tight_layout().
fig.subplots_adjust(bottom=0.15, top=0.86, left=0.12, right=0.96)
# Make sure the output directory exists so savefig does not fail on a fresh checkout.
os.makedirs('out', exist_ok=True)
fig.savefig(f'out/fse_sim_fpr.{trace_set}.{proto}.{hh_frac}.pdf', format='pdf', dpi=1200)
plt.show()