In [None]:
import os
import pandas as pd
import numpy as np
import json
from datetime import datetime
import copy

In [None]:
# Directory that is scanned for the exported run data (relative to the notebook).
root_data_path = "../data"

# Short job name -> DataFrame; starts empty and is filled by the loading cell.
df_dict: dict = {}

# Short job names, in the order they are processed and plotted.
job_names = ["LR", "MPC", "K-Means", "GBT"]

# Numeric runtime target per job.
# NOTE(review): the unit is not stated anywhere in this notebook — confirm before use.
runtime_targets: dict = {"LR": 5, "MPC": 9, "K-Means": 14, "GBT": 16}

# Application signature as stored in the run records -> short name used in the figure.
# Signatures without an entry (e.g. "MPC") are kept unchanged by the lookup in the loading cell.
paper_names = {"LogisticRegression": "LR", "KMeans": "K-Means", "GradientBoostedTrees": "GBT"}

In [None]:
# Every application execution id that has been passed to get_id so far.
my_set = set()
# Application execution id -> 1-based ordinal assigned the first time it was seen.
_id_by_app_id: dict = {}
# strptime layout used to parse the "$date" strings of the run records.
date_format = "%Y-%m-%dT%H:%M:%S.%fZ"


def get_id(app_id):
    """Return a stable, 1-based ordinal for ``app_id``.

    The first distinct id passed in gets 1, the next new one 2, and so on.
    Asking again for an id that was already seen returns the ordinal it was
    given originally, regardless of how many other ids were registered in
    between. (Returning the current number of distinct ids instead would hand
    a second, larger number to an execution whose records are interleaved
    with those of another execution.)

    Args:
        app_id: Hashable identifier of an application execution.

    Returns:
        int: The ordinal assigned to ``app_id``.
    """
    my_set.add(app_id)  # kept in sync for code that inspects the set directly
    if app_id not in _id_by_app_id:
        _id_by_app_id[app_id] = len(_id_by_app_id) + 1
    return _id_by_app_id[app_id]

# Load the run export, reshape it to one row per (run, timing type) and
# store one filtered DataFrame per benchmark job in df_dict.
for file in os.listdir(root_data_path):
    # Every directory entry other than the run export is ignored.
    if file != "enel_runs.json":
        continue

    keys = ["job_id", "fit_time", "predict_time", "application_signature", "start_time", "application_execution_id"]

    with open(os.path.join(root_data_path, file), "r") as f:
        records = json.load(f)

    # Column-wise view of the records; a record lacking one of `keys` raises KeyError.
    column_values = {key: [record[key] for record in records] for key in keys}

    json_df = (
        pd.DataFrame(column_values)
        .assign(
            # Each start_time entry is a mapping whose "$date" item holds the timestamp string.
            start_time=lambda frame: frame["start_time"].map(
                lambda stamp: datetime.strptime(stamp["$date"], date_format)
            ),
            # Translate known signatures to their short names, leave the rest untouched.
            application_signature=lambda frame: frame["application_signature"].map(
                lambda signature: paper_names.get(signature, signature)
            ),
        )
        .sort_values("start_time")
        # Ids are numbered only after sorting, so they follow start_time order.
        .assign(application_execution_id=lambda frame: frame["application_execution_id"].map(get_id))
        .rename(columns={"application_execution_id": "id"})
    )

    # Two views of the same rows: one exposes predict_time as "time", the other fit_time.
    # The column that is not renamed stays in place, so after the concat it is
    # NaN for the rows that came from the other view.
    prediction_df = json_df.rename(columns={"predict_time": "time"}).assign(Type="Prediction")
    training_df = json_df.rename(columns={"fit_time": "time"}).assign(Type="Training")
    json_df = pd.concat([prediction_df, training_df], ignore_index=True)

    for job_name in job_names:
        # Keep this job's rows whose execution ordinal is above 10.
        # NOTE(review): presumably the first ten executions are warm-up runs — confirm.
        selected = (json_df["application_signature"] == job_name) & (json_df["id"] > 10)
        sub_df = json_df.loc[selected, :]
        print(sub_df.shape)  # shape is reported before the "Job" column is appended

        sub_df = sub_df.assign(Job=job_name)
        df_dict[job_name] = sub_df

In [None]:
from collections import OrderedDict

# Re-key the per-job frames in the order given by job_names.
# A job without an entry in df_dict raises KeyError here, before df_dict is rebound.
new_df_dict = OrderedDict((job, df_dict[job]) for job in job_names)
df_dict = new_df_dict

# Row count per job as a quick sanity check.
for job, frame in df_dict.items():
    print(job, len(frame))

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

# Stack the per-job frames into a single long-format frame for plotting.
new_df = pd.concat(list(df_dict.values()), ignore_index=True)

# Grouped bar chart of the "time" column per job, split by timing type
# (Training vs. Prediction), written to a PDF and shown inline.
with sns.axes_style("whitegrid"):
    fig, ax = plt.subplots(figsize=(10, 4))
    ax = sns.barplot(data=new_df, x="Job", y="time", hue="Type", palette="Blues", ax=ax)

    # Same tick label size on both axes.
    for axis in (ax.xaxis, ax.yaxis):
        axis.set_tick_params(labelsize=14)

    ax.set_xlabel("Benchmark Job", fontsize=16)
    ax.set_ylabel("Duration [s]", fontsize=16)

    # Enlarge the legend entries and the legend title.
    legend = ax.get_legend()
    plt.setp(legend.get_texts(), fontsize="14")
    plt.setp(legend.get_title(), fontsize='16')

    fig.savefig("duration_fit_predict.pdf", bbox_inches='tight')

    plt.show()