In [None]:
import os
import pandas as pd
import numpy as np
import json
from datetime import datetime
import copy

In [None]:
# Directory that the loading cell scans for the raw experiment exports.
root_data_path = "../data"

# Job name -> result DataFrame; populated by the loading cell.
df_dict: dict = dict()

# Target runtime per job. The loading cell compares run durations expressed
# in minutes against these values.
runtime_targets: dict = dict([
    ("LR", 5),
    ("MPC", 9),
    ("K-Means", 14),
    ("GBT", 16),
])

# Names as they appear in the raw data -> short names used in this notebook.
# Names without an entry here are kept unchanged by the loading cell.
paper_names = dict(
    LogisticRegression="LR",
    KMeans="K-Means",
    GradientBoostedTrees="GBT",
)

# Jobs in presentation order; identical to the key order of runtime_targets.
job_names = list(runtime_targets)

In [None]:
# Load the Ellis runs (the first ten ids are labelled as profiling runs) from
# the pipe-delimited CSV exports found in `root_data_path`.
cols = ["APP_EVENT_ID", "JOB_ID", "DURATION_MS", "APP_ID", "SCALE_OUT"]

# DURATION_MS is converted to minutes before it is compared with runtime_targets.
MS_PER_MINUTE = 60000


def calculate_ellis_cvc(row: pd.Series) -> int:
    """Return 1 if the run took longer than its job's target runtime, else 0.

    `row` must provide "DURATION_MS" and "APP_ID". "APP_ID" has to be a key
    of `runtime_targets`; an unknown job raises KeyError instead of failing
    later on a comparison with None.
    """
    duration_m = row["DURATION_MS"] / MS_PER_MINUTE
    return 1 if duration_m > runtime_targets[row["APP_ID"]] else 0


def calculate_ellis_cvs(row: pd.Series):
    """Return by how many minutes the run missed its target runtime.

    `row` must provide "DURATION_MS", "APP_ID" and the "CVC" flag computed by
    `calculate_ellis_cvc`. Runs whose flag is not 1 yield 0.
    """
    if row["CVC"] != 1:
        return 0
    return abs(row["DURATION_MS"] / MS_PER_MINUTE - runtime_targets[row["APP_ID"]])


# sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
# printed shapes in a reproducible order across machines.
for file in sorted(os.listdir(root_data_path)):
    # Match the extension only, so that e.g. "runs.csv.bak" is not parsed.
    if not file.endswith(".csv"):
        continue

    raw_df = pd.read_csv(os.path.join(root_data_path, file), delimiter="|", usecols=cols)
    if raw_df.empty:
        # Nothing to aggregate; the job name below is taken from the first row.
        print(f"Skipping {file}: no rows")
        continue

    raw_df["APP_ID"] = raw_df["APP_ID"].map(lambda n: paper_names.get(n, n))
    raw_df = raw_df.drop(columns=['SCALE_OUT'])

    # Collapse to one row per APP_EVENT_ID, summing the durations of its rows.
    # String aliases are used because passing the builtins sum/max to agg is
    # deprecated in recent pandas versions.
    df = raw_df.groupby(by=["APP_EVENT_ID"], as_index=False).agg({"DURATION_MS": "sum",
                                                                  "APP_ID": "max"})

    df["CVC"] = df[['DURATION_MS', 'APP_ID']].apply(calculate_ellis_cvc, axis=1)
    df["CVS"] = df[['DURATION_MS', 'APP_ID', 'CVC']].apply(calculate_ellis_cvs, axis=1)

    df = df.sort_values(["APP_EVENT_ID"])

    # The frame is stored under the job name of its first row.
    # NOTE(review): presumably every CSV holds exactly one job - confirm.
    job_name = str(df["APP_ID"].values[0])

    df["duration"] = df["DURATION_MS"] / MS_PER_MINUTE
    df = df.drop(columns=['DURATION_MS', 'APP_ID'])
    df = df.rename(columns={"APP_EVENT_ID": "id"})

    # Runs with id 1..10 are labelled as profiling runs, the rest as Ellis.
    df["Method"] = "Ellis"
    df.loc[df["id"] < 11, "Method"] = "Profiling Runs"
    print(df.shape)

    df_dict[job_name] = df
        
# Load the Enel runs from "enel_runs.json" and append them to the per-job
# frames already stored in `df_dict`.


def get_duration(row: pd.Series) -> pd.Series:
    """Parse the timestamps of one record and compute its duration.

    `row["start_time"]` and `row["end_time"]` must each be a mapping with a
    "$date" entry formatted as "%Y-%m-%dT%H:%M:%S.%fZ".

    Returns a Series holding [start, end, duration in minutes].
    """
    date_format = "%Y-%m-%dT%H:%M:%S.%fZ"
    start = datetime.strptime(row["start_time"]["$date"], date_format)
    end = datetime.strptime(row["end_time"]["$date"], date_format)
    duration = (end - start).total_seconds() / 60
    return pd.Series([start, end, duration])


def calculate_enel_cvc(row: pd.Series) -> int:
    """Return 1 if the run took longer than its job's target runtime, else 0.

    `row` must provide "duration" (minutes) and "application_signature",
    which has to be a key of `runtime_targets` (KeyError otherwise).
    """
    return 1 if row["duration"] > runtime_targets[row["application_signature"]] else 0


def calculate_enel_cvs(row: pd.Series):
    """Return by how many minutes the run missed its target runtime.

    Runs whose "CVC" flag is not 1 yield 0.
    """
    if row["CVC"] != 1:
        return 0
    return abs(row["duration"] - runtime_targets[row["application_signature"]])


enel_path = os.path.join(root_data_path, "enel_runs.json")

# As before, a missing file is skipped silently and df_dict is left untouched.
if os.path.isfile(enel_path):
    keys = ["application_execution_id", "application_signature",
            "end_time", "start_time"]

    with open(enel_path, "r") as f:
        obj_list = json.load(f)

    # Every record must provide all of `keys`; a missing key raises KeyError.
    json_df = pd.DataFrame.from_dict({k: [obj[k] for obj in obj_list] for k in keys})
    json_df["application_signature"] = json_df["application_signature"].map(lambda n: paper_names.get(n, n))
    json_df[['start_time', 'end_time', "duration"]] = json_df[['start_time', 'end_time']].apply(get_duration, axis=1)

    # One row per execution id, with the durations of its records summed up.
    # groupby sorts by the group key, so the earliest start time is carried
    # through the aggregation and the chronological sort is applied afterwards.
    # Sorting before the groupby would be discarded, and the sequential run
    # numbers assigned below would follow the id order instead.
    json_df = (
        json_df
        .drop(columns=['end_time'])
        .rename(columns={"application_execution_id": "id"})
        .groupby(by=["id"], as_index=False)
        .agg({"duration": "sum", "application_signature": "max", "start_time": "min"})
        .sort_values(by=['start_time'])
        .drop(columns=['start_time'])
    )

    json_df["Method"] = "Enel"

    for job_name in job_names:
        sub_df = json_df.loc[json_df["application_signature"] == job_name, :].copy()
        if sub_df.empty:
            # Leave any already loaded data for this job as it is.
            print(f"No Enel runs found for {job_name}")
            continue

        # Replace the execution ids by sequential run numbers (1-based).
        sub_df["id"] = list(range(1, len(sub_df) + 1))

        sub_df["CVC"] = sub_df[['duration', 'application_signature']].apply(calculate_enel_cvc, axis=1)
        sub_df["CVS"] = sub_df[['duration', 'application_signature', 'CVC']].apply(calculate_enel_cvs, axis=1)

        # The first ten Enel runs are discarded, so only runs with id >= 11
        # are appended to the frame loaded for this job.
        # NOTE(review): presumably these are Enel's own profiling runs - confirm.
        sub_df = sub_df.loc[sub_df["id"] >= 11, :]

        merge = pd.concat([df_dict.get(job_name), sub_df], ignore_index=True)

        print(merge.shape)
        df_dict[job_name] = merge

In [None]:
from collections import OrderedDict

# Rebuild df_dict so that it iterates in the order given by job_names.
# A job without loaded data raises KeyError here.
new_df_dict = OrderedDict((name, df_dict[name]) for name in job_names)
df_dict = new_df_dict

# Show the number of rows available per job.
for name, frame in df_dict.items():
    print(name, len(frame))

In [None]:
# Line colour per value of the "Method" column.
palette: dict = dict([
    ("Profiling Runs", "grey"),
    ("Ellis", "#4878D0"),
    ("Enel", "#EE854A"),
])

# Per-job [lower, upper] y-axis limits.
# NOTE(review): no cell visible in this notebook reads y_lim_dict - confirm
# whether it is still needed.
y_lim_dict: dict = {
    job: [lower, upper]
    for job, lower, upper in (
        ("LR", 2, 12),
        ("MPC", 2, 25),
        ("K-Means", 0, 100),
        ("GBT", 10, 43),
    )
}

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.ticker import MaxNLocator

# Runtime per run for every job: one stacked panel per entry of df_dict,
# all panels sharing the run-number axis.
with sns.axes_style("whitegrid"):

    # Size the grid from the data (3 inches per panel, i.e. 12x12 for four
    # jobs). squeeze=False keeps `axes` an array even for a single job.
    n_jobs = len(df_dict)
    fig, axes = plt.subplots(n_jobs, 1, figsize=(12, 3 * n_jobs), sharex=True, squeeze=False)
    axes = axes.reshape(-1)

    for idx, ((job_name, df), ax) in enumerate(zip(df_dict.items(), axes)):
        target = runtime_targets[job_name]

        # Reference lines at 1x, 1.5x and 2x the target runtime; only the
        # first one is labelled, so only it shows up in the legend.
        ax.axhline(target, color="tab:red", label="Target Runtime", linestyle='dashed', linewidth=2)
        ax.axhline(target * 1.5, color="tab:red", linestyle='dashed', linewidth=1)
        ax.axhline(target * 2, color="tab:red", linestyle='dashed', linewidth=0.5)

        # Measured runtimes, one line per method.
        sns.lineplot(ax=ax, data=df, x="id", y="duration", hue="Method", palette=palette, linewidth=2)
        # The per-panel legend is removed; one shared legend is added below.
        panel_legend = ax.get_legend()
        if panel_legend is not None:
            panel_legend.remove()
        ax.set_title(job_name, fontsize=16)
        ax.xaxis.set_tick_params(labelsize=14)
        ax.yaxis.set_tick_params(labelsize=14)

        if job_name == "K-Means":
            # Mark the two longest Ellis runs with a red cross. The x position
            # is read from the "id" column, the same column the line plot
            # uses, so ties and gaps in the ids cannot misplace a marker.
            ellis_top2 = df.loc[df["Method"] == "Ellis"].nlargest(2, "duration")
            ax.scatter(ellis_top2["id"], ellis_top2["duration"],
                       marker="x", color="red", s=100, zorder=10)

        # Only the bottom panel carries the x-axis label.
        if idx == n_jobs - 1:
            ax.set_xlabel("# Run", fontsize=16)
        else:
            ax.set_xlabel(None)
        ax.set_ylabel("Runtime [m]", fontsize=16)

        # Grey background for the profiling runs (ids below 11).
        ax.axvspan(xmin=0, xmax=10, color=(.9, .9, .9), zorder=-1)
        # NOTE(review): the meaning of the two highlighted run ranges is not
        # visible in this notebook - document what happened in runs 41-50
        # and 61-65.
        ax.axvspan(xmin=41, xmax=50, color="mistyrose", zorder=-1)
        ax.axvspan(xmin=61, xmax=65, color="mistyrose", zorder=-1)
        ax.set_xlim(0, 65)
        # NOTE(review): y_lim_dict defines limits for every job (LR: [2, 12])
        # but is not applied; only LR gets an explicit, different range here.
        if job_name == "LR":
            ax.set_ylim(1,16)
        # Integer tick positions on both axes.
        ax.xaxis.set_major_locator(MaxNLocator(13, integer=True))
        ax.yaxis.set_major_locator(MaxNLocator(5, integer=True))

    # Shared legend below the bottom panel, attached to that axes explicitly
    # instead of relying on pyplot's current-axes state.
    legend = axes[-1].legend(loc='lower center',
                             ncol=4,
                             fancybox=True,
                             fontsize=16,
                             bbox_to_anchor=(0.5, -0.6))

    fig.savefig("complete_experiment_runtime_target_comparison.pdf", bbox_inches='tight')
    plt.show()

In [None]:
# Print, per job, the mean and median of CVC and CVS of the Enel runs for
# five consecutive windows of eleven run ids each (11-21, 22-32, ..., 55-65),
# formatted as LaTeX table cells.
for job_name, df in df_dict.items():
    offsets = list(range(11, 56, 11))

    cvc_res, cvs_res = [], []

    for off in offsets:
        # Runs whose id falls into [off, off + 11).
        window_mask = (df["id"] >= off) & (df["id"] < (off + 11))
        sub_df = df.loc[window_mask, :].copy()

        enel_df = sub_df.loc[sub_df["Method"] == "Enel", :]

        # (mean, std, median) per metric; the std is not printed below.
        for metric, collected in (("CVC", cvc_res), ("CVS", cvs_res)):
            values = enel_df[metric].values
            collected.append((np.mean(values), np.std(values), np.median(values)))

    cvc_cells = [f"${tup[0]:.2f}$ & ${tup[2]:.2f}$" for tup in cvc_res]
    cvs_cells = [f"${tup[0]:.2f}$m & ${tup[2]:.2f}$m" for tup in cvs_res]

    print(job_name)
    print("--> CVC:", " & ".join(cvc_cells))
    # NOTE(review): this separator lacks the leading space used for the CVC
    # row; kept as is so the printed output does not change.
    print("--> CVS:", "& ".join(cvs_cells))
    print("\n")