# Analysis notebook for Fives simulation output

In [1]:
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

sns.set_theme()

# --- Settings that change only when the calibration dataset is modified ---
CALIBRATION_MONTH = 11 # Month used for calibration (e.g. 11 for November)
CALIB_CAT = 1          # Job category used during calibration
VALID_CAT = 1          # Job category simulated after calibration, using the calibrated configuration
                       # (it is possible to simulate, e.g., cat. 0 jobs with a cat. 1 calibrated file)

# --- Settings that change with each calibration run (time of calibration and results) ---
ID = 955  # Calibration identifier: update with your own here
EXP_UID = f"para{ID}" # Experiment identifier embedded in result file names
RES_DIR = f"months_{ID}" # Sub-directory (next to this notebook) in which the Fives output files are looked up

# Static overheads added to each batch of I/O actions when computing simulated I/O times,
# to compensate for the lack of metadata operations and other related phenomena. Values
# can be obtained from the calibrated configuration files of Fives.
overhead_read = 2
overhead_write = 3

In [2]:
# 'Constants' for the Theta2022 dataset, computed inside the ThetaDarshanCompositeLogs notebook (Artefact A_1)
# m_mean is the exponent applied to the I/O duration in compute_b().
m_mean = 0.9985043721898039 
# Thresholds compared against a job's `b` coefficient in computeCategory():
# at or above the 75th percentile -> category 2, below the 25th percentile -> category 0.
non0_percentile75__bf = 23931362.014779434   # Value after filtering on original dataset
non0_percentile25__bf = 3162775.2197486456   # Value after filtering on original dataset

def compute_b(p0):
    """Return the coefficient ``b`` such that ``y0 == b * x0 ** m_mean``.

    ``p0`` is an ``(x0, y0)`` pair (only its first two items are read) and the
    exponent is the module-level ``m_mean``.
    """
    abscissa = p0[0]
    ordinate = p0[1]
    return ordinate / (abscissa ** m_mean)

def computeCategory(io_volume, io_duration, job_id):
    """Output the category a job is in (2-fast ; 1-regular ; 0-slow)
    based on its I/O volume and time spent in I/O.

    The job's ``b`` coefficient (see ``compute_b``) is compared against the
    25th/75th percentile thresholds of the filtered dataset.

    Args:
        io_volume: total I/O volume of the job.
        io_duration: total time the job spent in I/O.
        job_id: job identifier, only used in the error message.

    Returns:
        2, 1 or 0, or ``None`` (after printing an error) when the coefficient
        cannot be ordered against the thresholds, e.g. when it is NaN.
    """
    b_job = compute_b((io_duration, io_volume))

    if b_job >= non0_percentile75__bf:
        return 2
    if b_job >= non0_percentile25__bf:
        # Already known to be below the 75th percentile at this point.
        return 1
    if b_job < non0_percentile25__bf:
        return 0

    # Every comparison above was false: report the offending job.
    print(f"ERROR for job {job_id} -> b_job = {b_job} (io_volume = {io_volume} and io_duration = {io_duration})")
    return None

def cohend(data1: list, data2: list):
    """Compute Cohen's d effect size between two lists of values.

    The difference of the two means is divided by the pooled standard
    deviation, itself built from the unbiased (``ddof=1``) sample variances.
    """
    size_a, size_b = len(data1), len(data2)
    variance_a = np.var(data1, ddof=1)
    variance_b = np.var(data2, ddof=1)
    degrees_of_freedom = size_a + size_b - 2
    pooled_std = np.sqrt(
        ((size_a - 1) * variance_a + (size_b - 1) * variance_b) / degrees_of_freedom
    )
    mean_a = np.mean(data1)
    mean_b = np.mean(data2)
    return (mean_a - mean_b) / pooled_std

In [3]:
import yaml

# Column-wise tables filled with one entry per month.
months = {"month_nb": [], "corr": []}
read_months = {"month_nb": [], "corr": []}
write_months = {"month_nb": [], "corr": []}
job_count_months = {"month_nb": [], "value": []}
cohen = {"month_nb": [], "value": []}

# (destination table, value column, key read from the metrics file)
_metric_targets = (
    (months, "corr", "iotime_correlation"),
    (read_months, "corr", "iotime_read_correlation"),
    (write_months, "corr", "iotime_write_correlation"),
    (job_count_months, "value", "job_count"),
    (cohen, "value", "iovolume_cohend_effect"),
)

for i in range(1, 13):
    metrics_path = f"./{RES_DIR}/analysis_month{i}_cat{VALID_CAT}/{EXP_UID}_month{i}_metrics.yaml"
    with open(metrics_path, "r", encoding="utf-8") as analysis:
        metrics = yaml.load(analysis, Loader=yaml.SafeLoader)
        for table, column, metric_key in _metric_targets:
            table["month_nb"].append(i)
            table[column].append(metrics[metric_key])

In [4]:
# Prepare stats values per month
import random
import pandas as pd

# Every monthly table gets the same grid coordinates: the 12 months are laid
# out four per line ("row_index" runs 1..4, "col_index" runs 1..3).
_monthly_tables = (months, read_months, write_months, job_count_months, cohen)
for table in _monthly_tables:
    table["row_index"] = []
    table["col_index"] = []

col = 0
for i in range(0, 12):
    position_in_line = i % 4
    if position_in_line == 0:
        col += 1
    for table in _monthly_tables:
        table["row_index"].append(position_in_line + 1)
        table["col_index"].append(col)


# Pivot each table into the 2D layout expected by the heatmaps.
pd_months = pd.DataFrame(months)
pmonths = pd_months.pivot(index="col_index", columns="row_index", values="corr")

pd_Readmonths = pd.DataFrame(read_months)
read_pmonths = pd_Readmonths.pivot(index="col_index", columns="row_index", values="corr")

pd_Writemonths = pd.DataFrame(write_months)
write_pmonths = pd_Writemonths.pivot(index="col_index", columns="row_index", values="corr")


pd_JCmonths = pd.DataFrame(job_count_months)
JC_pmonths = pd_JCmonths.pivot(index="col_index", columns="row_index", values="value")

# NOTE: unlike the other tables, the pivoted frame replaces the flat one here.
pd_Cohenmonths = pd.DataFrame(cohen)
pd_Cohenmonths = pd_Cohenmonths.pivot(index="col_index", columns="row_index", values="value")

## Job count per month in the calibration category

In [5]:
# Heatmap of the number of jobs per month (months laid out on the 3x4 grid).
f, ax = plt.subplots(figsize=(9, 3))
heatmap_options = dict(annot=True, fmt="d", linewidths=.5, cmap="viridis")
g = sns.heatmap(JC_pmonths, ax=ax, **heatmap_options)
g.set_title(f"Job count per month (calibration {ID})")
# Grid indices are meaningless to the reader: blank out axis labels and tick labels.
g.set(xlabel="", ylabel="", xticklabels=[], yticklabels=[])

f.savefig(f"{RES_DIR}/{ID}_jobCountPerMonth.png", dpi=200)

## Global I/O time correlation between simulation and real traces (R/W I/O)

In [6]:
# Heatmap of the per-month combined (read + write) I/O time correlation.
f, ax = plt.subplots(figsize=(9, 3))

heatmap_options = dict(annot=True, fmt=".2f", linewidths=.5, cmap="crest")
g = sns.heatmap(pmonths, ax=ax, **heatmap_options)
g.set_title(f"R/W Correlation (calibration {ID})")
# Grid indices are meaningless to the reader: blank out axis labels and tick labels.
g.set(xlabel="", ylabel="", xticklabels=[], yticklabels=[])
f.savefig(f"{RES_DIR}/{ID}_rwCorrMonth.png", dpi=200)

## Read I/O time correlation between simulation and real traces 

In [7]:
# Heatmap of the per-month read I/O time correlation.
f, ax = plt.subplots(figsize=(9, 3))
heatmap_options = dict(annot=True, fmt=".2f", linewidths=.5, cmap="crest")
g = sns.heatmap(read_pmonths, ax=ax, **heatmap_options)
g.set_title(f"Read Correlation (calibration {ID})")
# Grid indices are meaningless to the reader: blank out axis labels and tick labels.
g.set(xlabel="", ylabel="", xticklabels=[], yticklabels=[])
f.savefig(f"{RES_DIR}/{ID}_readCorrMonth.png", dpi=200)

## Write I/O time correlation between simulation and real traces 

In [8]:
# Heatmap of the per-month write I/O time correlation.
f, ax = plt.subplots(figsize=(9, 3))
heatmap_options = dict(annot=True, fmt=".2f", linewidths=.5, cmap="crest")
g = sns.heatmap(write_pmonths, ax=ax, **heatmap_options)
g.set_title(f"Write Correlation (calibration {ID})")
# Grid indices are meaningless to the reader: blank out axis labels and tick labels.
g.set(xlabel="", ylabel="", xticklabels=[], yticklabels=[])
f.savefig(f"{RES_DIR}/{ID}_writeCorrMonth.png", dpi=200)

## Simulated vs real cumulative I/O time, per month

In [9]:
sns.set(rc={'figure.figsize':(24, 12)})

# One panel per month; months fill a 3x4 grid line by line.
gridspec = {"hspace":0.4}
figure, axis = plt.subplots(3, 4, gridspec_kw=gridspec)

for i in range(1, 13):

    # Grid position of month i (1-based month, 0-based grid indices).
    row, col = divmod(i - 1, 4)
    panel = axis[row, col]

    # The calibrated configuration is identified by the calibration month and category.
    results_path = (
        f"./{RES_DIR}/simulatedJobs_theta2022_aggMonth{i}_cat{VALID_CAT}"
        f"__Fives_C_theta2022_aggMonth{CALIBRATION_MONTH}_cat{CALIB_CAT}_0.0.1_month{i}.yml"
    )
    # NOTE(review): yaml.CLoader is not a safe loader; only use it on trusted, locally produced files.
    with open(results_path, "r", encoding="utf-8") as job_results:
        results = yaml.load(job_results, Loader=yaml.CLoader)

    # Per-job cumulative I/O times for this month
    sim_io_time = []
    sim_read_time = []
    sim_write_time = []
    real_io_time = []
    real_read_time = []
    real_write_time = []

    for job in results:

        if len(job['actions']) == 0:
            print(f"Job {job['job_id']} has 0 actions")
            continue

        # Simulated: only completed read / write actions contribute.
        s_r_time = 0
        s_w_time = 0
        for action in job["actions"]:
            if action["act_type"] in ("COMPUTE", "SLEEP"):
                continue
            if action["act_status"] != "COMPLETED":
                continue
            if action["act_type"] == "FILEREAD":
                s_r_time += (action["act_duration"]  + overhead_read) * action["nb_stripes"]
            if action["act_type"] == "CUSTOM" and "write" in str(action["sub_job"]):
                s_w_time += (action["act_duration"]  + overhead_write) * action["nb_stripes"]
        s_io_time = (s_r_time + s_w_time)

        # "Real"
        r_io_time = ( job["real_cReadTime_s"]
                    + job["real_cWriteTime_s"])
        real_io_time.append(r_io_time)
        real_read_time.append(job["real_cReadTime_s"])
        real_write_time.append(job["real_cWriteTime_s"])

        sim_io_time.append(s_io_time)
        sim_read_time.append(s_r_time)
        sim_write_time.append(s_w_time)

    if not real_io_time:
        # No usable job this month: keep an empty, titled panel instead of
        # failing on max() of an empty list.
        panel.set(title=f"Month {i}")
        continue

    max_target = max(max(real_io_time), max(sim_io_time))
    line = {"x": [0, max_target], "y": [0, max_target]}

    scatter = sns.scatterplot(
        x=real_io_time,
        y=sim_io_time,
        s=40,
        color=".15",
        alpha=0.5,
        ax=panel,
        label="Read/Write")
    read_scatter = sns.scatterplot(
        x=real_read_time, y=sim_read_time, s=20, ax=panel, facecolors="red", marker="+", alpha=0.6, label="read")
    write_scatter = sns.scatterplot(
        x=real_write_time, y=sim_write_time, s=20, color=".10", ax=panel, facecolors="blue", marker="x", alpha=0.3, label="write")
    target_line = sns.lineplot(
        line, x="x", y="y", color="red", linestyle="--", ax=panel, label="Real == Sim target")
    scatter.set(xlabel="Real", ylabel="Simulated")
    scatter.set(title=f"Month {i}")
    panel.set_xscale('log')
    panel.set_xlim([0.0001, max_target*1.05])
    panel.set_yscale('log')
    panel.set_ylim([0.0001, max_target*1.05])

plt.savefig(f"{RES_DIR}/{ID}_simToRealIotimes.png", dpi=300)
plt.show()

## Figure 7 - Calibration dataset - Simulated vs real cumulative I/O time

In [10]:
sns.set(rc={'figure.figsize':(12, 12)})
sns.set_style("white")

# Per-job cumulative I/O times for the calibration month
sim_io_time = []
sim_read_time = []
sim_write_time = []
real_io_time = []
real_read_time = []
real_write_time = []

# Single source of truth for the calibration month (see the configuration cell).
calib_month = CALIBRATION_MONTH

results_path = (
    f"./{RES_DIR}/simulatedJobs_theta2022_aggMonth{calib_month}_cat{VALID_CAT}"
    f"__Fives_C_theta2022_aggMonth{CALIBRATION_MONTH}_cat{CALIB_CAT}_0.0.1_month{calib_month}.yml"
)
with open(results_path, "r", encoding="utf-8") as job_results:
    results = yaml.load(job_results, Loader=yaml.CLoader)

for job in results:

    if len(job['actions']) == 0:
        print(f"Job {job['job_id']} has 0 actions")
        continue

    # Simulated: only completed read / write actions contribute.
    s_r_time = 0
    s_w_time = 0
    for action in job["actions"]:
        if action["act_type"] in ("COMPUTE", "SLEEP"):
            continue
        if action["act_status"] != "COMPLETED":
            continue
        if action["act_type"] == "FILEREAD":
            s_r_time += (action["act_duration"]  + overhead_read) * action["nb_stripes"]
        if action["act_type"] == "CUSTOM" and "write" in str(action["sub_job"]):
            s_w_time += (action["act_duration"]  + overhead_write) * action["nb_stripes"]
    s_io_time = (s_r_time + s_w_time)

    # "Real"
    r_io_time = ( job["real_cReadTime_s"]
                + job["real_cWriteTime_s"])
    real_io_time.append(r_io_time)
    real_read_time.append(job["real_cReadTime_s"])
    real_write_time.append(job["real_cWriteTime_s"])

    sim_io_time.append(s_io_time)
    sim_read_time.append(s_r_time)
    sim_write_time.append(s_w_time)

number_of_jobs = len(real_io_time)
max_target = max(max(real_io_time), max(sim_io_time))
min_target = min(min(real_io_time), min(sim_io_time))
line = {"x": [0, max_target], "y": [0, max_target]}

# Explicit figure/axes so the plot does not depend on pyplot's "current axes".
fig, ax = plt.subplots()

scatter = sns.scatterplot(
    x=real_io_time,
    y=sim_io_time,
    s=800,
    color=".15",
    alpha=0.5,
    label="Jobs",
    zorder=20,
    ax=ax,
)
target_line = sns.lineplot(
    line, x="x", y="y", color="red", linestyle="--", label="Sim. I/O == Real I/O target", linewidth=3.5, zorder=10, ax=ax)
ax.set_xlabel("Cumul. Real I/O Time (s)", fontsize=26)
ax.set_ylabel("Cumul. Simulated I/O Time (s)", fontsize=26)
ax.tick_params(labelsize=26)
ax.legend(fontsize='26')
# Same log-scaled range on both axes so the target line is the diagonal.
ax.set_xscale('log')
ax.set_xlim([min_target, max_target*1.1])
ax.set_yscale('log')
ax.set_ylim([min_target, max_target*1.1])
ax.grid(visible=True, which="both", axis="both", zorder=-10.0, alpha=0.4, linewidth=1)
ax.set_frame_on(False)

fig.tight_layout()
fig.savefig(f"{RES_DIR}/{ID}_simToRealIotimes_fcalibrationMonth.pdf", dpi=300)
fig.savefig(f"{RES_DIR}/{ID}_simToRealIotimes_fcalibrationMonth.png", dpi=300)
plt.show()

In [11]:
sns.set(rc={'figure.figsize':(12, 12)})
sns.set_style("white")

from scipy.stats import pearsonr, ttest_rel, wilcoxon
import statsmodels.api as sm

# Per-job cumulative I/O times, all twelve months pooled together
sim_io_time_cat1 = []
sim_read_time = []
sim_write_time = []
real_io_time_cat1 = []
real_read_time = []
real_write_time = []
cat1_jobs = []

for i in range(1, 13):

    # The calibrated configuration is identified by the calibration month and category.
    results_path = (
        f"./{RES_DIR}/simulatedJobs_theta2022_aggMonth{i}_cat{VALID_CAT}"
        f"__Fives_C_theta2022_aggMonth{CALIBRATION_MONTH}_cat{CALIB_CAT}_0.0.1_month{i}.yml"
    )
    with open(results_path, "r", encoding="utf-8") as job_results:
        results = yaml.load(job_results, Loader=yaml.CLoader)

    for job in results:

        cat1_jobs.append(job["job_uid"])

        if len(job['actions']) == 0:
            print(f"Job {job['job_id']} has 0 actions")
            continue

        # Simulated: only completed read / write actions contribute.
        s_r_time = 0
        s_w_time = 0
        for action in job["actions"]:
            if action["act_type"] in ("COMPUTE", "SLEEP"):
                continue
            if action["act_status"] != "COMPLETED":
                continue
            if action["act_type"] == "FILEREAD":
                s_r_time += (action["act_duration"]  + overhead_read) * action["nb_stripes"]
            if action["act_type"] == "CUSTOM" and "write" in str(action["sub_job"]):
                s_w_time += (action["act_duration"]  + overhead_write) * action["nb_stripes"]
        s_io_time = (s_r_time + s_w_time)

        # "Real"
        r_io_time = ( job["real_cReadTime_s"]
                    + job["real_cWriteTime_s"])
        real_io_time_cat1.append(r_io_time)
        real_read_time.append(job["real_cReadTime_s"])
        real_write_time.append(job["real_cWriteTime_s"])

        sim_io_time_cat1.append(s_io_time)
        sim_read_time.append(s_r_time)
        sim_write_time.append(s_w_time)

io_time_corr_cat1, _ = pearsonr(sim_io_time_cat1, real_io_time_cat1)
# The simulated category is configurable, so it is not hard-coded in the label.
print(f"Global correlation CAT {VALID_CAT}: {io_time_corr_cat1}")

number_of_jobs = len(real_io_time_cat1)
max_target = max(max(real_io_time_cat1), max(sim_io_time_cat1))
min_target = min(min(real_io_time_cat1), min(sim_io_time_cat1))
line = {"x": [0, max_target], "y": [0, max_target]}

# Explicit figure/axes so the plot does not depend on pyplot's "current axes".
fig, ax = plt.subplots()

scatter = sns.scatterplot(
    x=real_io_time_cat1,
    y=sim_io_time_cat1,
    s=180,
    color=".15",
    alpha=0.5,
    label="Jobs",
    zorder=20,
    ax=ax,
)
target_line = sns.lineplot(
    line, x="x", y="y", color="red", linestyle="--", label="Sim. I/O == Real I/O target", linewidth=3.5, zorder=10, ax=ax)
ax.set_xlabel("Cumulated real I/O Time (s)", fontsize=24)
ax.set_ylabel("Cumulated simulated I/O Time (s)", fontsize=24)
ax.tick_params(labelsize=20)
ax.legend(fontsize='26')
# Same log-scaled range on both axes so the target line is the diagonal.
ax.set_xscale('log')
ax.set_xlim([min_target, max_target*1.05])
ax.set_yscale('log')
ax.set_ylim([min_target, max_target*1.05])
ax.minorticks_on()
ax.grid(visible=True, which="both", axis="both", zorder=-10.0, alpha=0.4, linewidth=1)
ax.set_frame_on(False)

fig.savefig(f"{RES_DIR}/{ID}_simToRealIotimes_fullYearCat{VALID_CAT}.pdf", dpi=300)
fig.savefig(f"{RES_DIR}/{ID}_simToRealIotimes_fullYearCat{VALID_CAT}.png", dpi=300)
plt.show()

In [12]:
sns.set(rc={'figure.figsize':(12, 12)})
sns.set_style("white")

from scipy.stats import pearsonr, ttest_rel, wilcoxon
import statsmodels.api as sm

# Per-job cumulative I/O times, all months pooled except the calibration one
sim_io_time_cat1 = []
sim_read_time = []
sim_write_time = []
real_io_time_cat1 = []
real_read_time = []
real_write_time = []
cat1_jobs = []

for i in range(1, 13):

    # Leave the training set (calibration month) out of this validation plot.
    if i == CALIBRATION_MONTH:
        continue

    # The calibrated configuration is identified by the calibration month and category.
    results_path = (
        f"./{RES_DIR}/simulatedJobs_theta2022_aggMonth{i}_cat{VALID_CAT}"
        f"__Fives_C_theta2022_aggMonth{CALIBRATION_MONTH}_cat{CALIB_CAT}_0.0.1_month{i}.yml"
    )
    with open(results_path, "r", encoding="utf-8") as job_results:
        results = yaml.load(job_results, Loader=yaml.CLoader)

    for job in results:

        cat1_jobs.append(job["job_uid"])

        if len(job['actions']) == 0:
            print(f"Job {job['job_id']} has 0 actions")
            continue

        # Simulated: only completed read / write actions contribute.
        s_r_time = 0
        s_w_time = 0
        for action in job["actions"]:
            if action["act_type"] in ("COMPUTE", "SLEEP"):
                continue
            if action["act_status"] != "COMPLETED":
                continue
            if action["act_type"] == "FILEREAD":
                s_r_time += (action["act_duration"]  + overhead_read) * action["nb_stripes"]
            if action["act_type"] == "CUSTOM" and "write" in str(action["sub_job"]):
                s_w_time += (action["act_duration"]  + overhead_write) * action["nb_stripes"]
        s_io_time = (s_r_time + s_w_time)

        # "Real"
        r_io_time = ( job["real_cReadTime_s"]
                    + job["real_cWriteTime_s"])
        real_io_time_cat1.append(r_io_time)
        real_read_time.append(job["real_cReadTime_s"])
        real_write_time.append(job["real_cWriteTime_s"])

        sim_io_time_cat1.append(s_io_time)
        sim_read_time.append(s_r_time)
        sim_write_time.append(s_w_time)

io_time_corr_cat1, _ = pearsonr(sim_io_time_cat1, real_io_time_cat1)
# The simulated category is configurable, so it is not hard-coded in the label.
print(f"Global correlation CAT {VALID_CAT}: {io_time_corr_cat1}")

number_of_jobs = len(real_io_time_cat1)
max_target = max(max(real_io_time_cat1), max(sim_io_time_cat1))
min_target = min(min(real_io_time_cat1), min(sim_io_time_cat1))
line = {"x": [0, max_target], "y": [0, max_target]}

# Explicit figure/axes so the plot does not depend on pyplot's "current axes".
fig, ax = plt.subplots()

scatter = sns.scatterplot(
    x=real_io_time_cat1,
    y=sim_io_time_cat1,
    s=180,
    color=".15",
    alpha=0.5,
    label="Jobs",
    zorder=20,
    ax=ax,
)
target_line = sns.lineplot(
    line, x="x", y="y", color="red", linestyle="--", label="Sim. I/O == Real I/O target", linewidth=3.5, zorder=10, ax=ax)
ax.set_xlabel("Cumulated real I/O Time (s)", fontsize=24)
ax.set_ylabel("Cumulated simulated I/O Time (s)", fontsize=24)
ax.tick_params(labelsize=20)
ax.legend(fontsize='26')
# Same log-scaled range on both axes so the target line is the diagonal.
ax.set_xscale('log')
ax.set_xlim([min_target, max_target*1.05])
ax.set_yscale('log')
ax.set_ylim([min_target, max_target*1.05])
ax.minorticks_on()
ax.grid(visible=True, which="both", axis="both", zorder=-10.0, alpha=0.4, linewidth=1)
ax.set_frame_on(False)

fig.savefig(f"{RES_DIR}/{ID}_simToRealIotimes_fullYearCat{VALID_CAT}_noTrainSet.pdf", dpi=300)
fig.savefig(f"{RES_DIR}/{ID}_simToRealIotimes_fullYearCat{VALID_CAT}_noTrainSet.png", dpi=300)
plt.show()

## Figure 9 - Entire year, all job cat. - Simulated vs real cumulative I/O time

In [13]:
sns.set(rc={'figure.figsize':(12, 12), 'figure.dpi':300})
sns.set_style("white")

import matplotlib as mpl
import yaml
from scipy.stats import pearsonr

# All job categories are drawn on one explicit figure/axes pair.
fig, ax = plt.subplots()

# Jobs of the calibration-month dataset are excluded from this figure.
calibration_file = (
    f"{RES_DIR}/simulatedJobs_theta2022_aggMonth{CALIBRATION_MONTH}_cat1"
    f"__Fives_C_theta2022_aggMonth{CALIBRATION_MONTH}_cat1_0.0.1_month{CALIBRATION_MONTH}.yml"
)
with open(calibration_file, "r", encoding="utf-8") as job_results:
    november = yaml.load(job_results, Loader=yaml.CLoader)
november_id = {job["job_uid"] for job in november}

# One result file per job category (index == category).
# NOTE(review): the f"p{ID}" suffix assumes result files are tagged with the
# calibration ID, as the original "p955" was for ID = 955 -- confirm.
cat_files = [
    f"{RES_DIR}/simulatedJobs_theta2022_0__Fives_C_theta2022_aggMonth{CALIBRATION_MONTH}_cat1_0.0.1_p{ID}.yml",
    f"{RES_DIR}/simulatedJobs_theta2022_1__Fives_C_theta2022_aggMonth{CALIBRATION_MONTH}_cat1_0.0.1_p{ID}.yml",
    "months_657/simulatedJobs_theta2022_2__Fives_C_theta2022_aggMonth10_cat2_0.0.1_657C2.yml",
]

# Read / write overheads per category; the third file comes from another
# calibration and uses its own values.
r_oh = [overhead_read, overhead_read, 1]
w_oh = [overhead_write, overhead_write, 3]

colors = [
    "#196A9F",
    "#464B4F",
    "#C42626",
]

labels = [
    "Slow jobs",
    "Regular jobs",
    "Fast jobs",
]

markers = [
    "X",
    "o",
    "P",
]

# Value range over ALL categories: axis limits and the target line must cover
# every plotted point, not only those of the last category processed.
overall_min = None
overall_max = None

for cat, file in enumerate(cat_files):

    # Per-job cumulative I/O times for this category
    sim_io_time = []
    sim_read_time = []
    sim_write_time = []
    real_io_time = []
    real_read_time = []
    real_write_time = []

    with open(f"./{file}", "r", encoding="utf-8") as job_results:
        results = yaml.load(job_results, Loader=yaml.CLoader)

    for job in results:

        if job["job_uid"] in november_id:
            continue

        if len(job['actions']) == 0:
            print(f"Job {job['job_id']} has 0 actions")
            continue

        # Simulated: only completed read / write actions contribute.
        s_r_time = 0
        s_w_time = 0
        for action in job["actions"]:
            if action["act_type"] in ("COMPUTE", "SLEEP"):
                continue
            if action["act_status"] != "COMPLETED":
                continue
            if action["act_type"] == "FILEREAD":
                s_r_time += (action["act_duration"]  + r_oh[cat]) * action["nb_stripes"]
            if action["act_type"] == "CUSTOM" and "write" in str(action["sub_job"]):
                s_w_time += (action["act_duration"]  + w_oh[cat]) * action["nb_stripes"]
        s_io_time = (s_r_time + s_w_time)

        # "Real"
        r_io_time = ( job["real_cReadTime_s"]
                    + job["real_cWriteTime_s"])
        real_io_time.append(r_io_time)
        real_read_time.append(job["real_cReadTime_s"])
        real_write_time.append(job["real_cWriteTime_s"])

        sim_io_time.append(s_io_time)
        sim_read_time.append(s_r_time)
        sim_write_time.append(s_w_time)

    number_of_jobs = len(real_io_time)
    print(f"{number_of_jobs} jobs plotted")
    if number_of_jobs == 0:
        # Nothing to correlate or plot for this category.
        continue

    max_target = max(max(real_io_time), max(sim_io_time))
    min_target = min(min(real_io_time), min(sim_io_time))
    overall_max = max_target if overall_max is None else max(overall_max, max_target)
    overall_min = min_target if overall_min is None else min(overall_min, min_target)

    io_time_corr, _ = pearsonr(sim_io_time, real_io_time)
    print(f"Global correlation cat {cat}: {io_time_corr}")

    scatter = sns.scatterplot(
        x=real_io_time,
        y=sim_io_time,
        s=125,
        c=colors[cat],
        alpha=0.5,
        label=labels[cat],
        zorder=10,
        marker=markers[cat],
        ax=ax,
    )


# From here on, the extents are those of all categories together.
min_target, max_target = overall_min, overall_max
line = {"x": [0, max_target], "y": [0, max_target]}

target_line = sns.lineplot(
    line, x="x", y="y", color="red", linestyle="--", label="Sim. I/O == Real I/O target", linewidth=3.5, zorder=-10, ax=ax)
ax.set_xlabel("Cumulated real I/O Time (s)", fontsize=24)
ax.set_ylabel("Cumulated simulated I/O Time (s)", fontsize=24)
ax.tick_params(labelsize=20)
ax.set_xscale('log')
ax.set_xlim([min_target, max_target*1.05])
ax.set_yscale('log')
ax.set_ylim([min_target, max_target*1.05])
ax.grid(visible=True, which="both", axis="both", zorder=-10.0, alpha=0.4, linewidth=1)
ax.set_frame_on(False)

# Dashed box (data coordinates) highlighting a region of the plot.
rect = mpl.patches.Rectangle(
    xy=(1000, 50000),
    width=3000,
    height=350000,
    alpha=1,
    edgecolor="black",
    fill=False,
    linewidth=2,
    clip_on=False,
    zorder=50,
    linestyle="--",
)
ax.add_artist(rect)


ax.legend(fontsize='20')
fig.tight_layout()
fig.savefig(f"{RES_DIR}/{ID}_simToRealIotimes_fullYearAllCat.pdf", dpi=300)
fig.savefig(f"{RES_DIR}/{ID}_simToRealIotimes_fullYearAllCat.png", dpi=300)
plt.show()

In [14]:
sns.set(rc={'figure.figsize':(12, 12), 'figure.dpi':300})
sns.set_style("white")

import matplotlib as mpl
import yaml
from scipy.stats import pearsonr

# Jobs of the calibration-month dataset are excluded from these figures.
calibration_file = (
    f"{RES_DIR}/simulatedJobs_theta2022_aggMonth{CALIBRATION_MONTH}_cat1"
    f"__Fives_C_theta2022_aggMonth{CALIBRATION_MONTH}_cat1_0.0.1_month{CALIBRATION_MONTH}.yml"
)
with open(calibration_file, "r", encoding="utf-8") as job_results:
    november = yaml.load(job_results, Loader=yaml.CLoader)
november_id = {job["job_uid"] for job in november}

# One result file per job category (index == category).
# NOTE(review): the f"p{ID}" suffix assumes result files are tagged with the
# calibration ID, as the original "p955" was for ID = 955 -- confirm.
cat_files = [
    f"{RES_DIR}/simulatedJobs_theta2022_0__Fives_C_theta2022_aggMonth{CALIBRATION_MONTH}_cat1_0.0.1_p{ID}.yml",
    f"{RES_DIR}/simulatedJobs_theta2022_1__Fives_C_theta2022_aggMonth{CALIBRATION_MONTH}_cat1_0.0.1_p{ID}.yml",
    "months_657/simulatedJobs_theta2022_2__Fives_C_theta2022_aggMonth10_cat2_0.0.1_657C2.yml",
]

# Read / write overheads per category; the third file comes from another
# calibration and uses its own values.
r_oh = [overhead_read, overhead_read, 1]
w_oh = [overhead_write, overhead_write, 3]

colors = [
    "#196A9F",
    "#464B4F",
    "#C42626",
]

labels = [
    "Slow jobs",
    "Regular jobs",
    "Fast jobs",
]

markers = [
    "X",
    "o",
    "P",
]

for cat, file in enumerate(cat_files):

    # Per-job cumulative I/O times for this category
    sim_io_time = []
    sim_read_time = []
    sim_write_time = []
    real_io_time = []
    real_read_time = []
    real_write_time = []

    with open(f"./{file}", "r", encoding="utf-8") as job_results:
        results = yaml.load(job_results, Loader=yaml.CLoader)

    for job in results:

        if job["job_uid"] in november_id:
            continue

        if len(job['actions']) == 0:
            print(f"Job {job['job_id']} has 0 actions")
            continue

        # Simulated: only completed read / write actions contribute.
        s_r_time = 0
        s_w_time = 0
        for action in job["actions"]:
            if action["act_type"] in ("COMPUTE", "SLEEP"):
                continue
            if action["act_status"] != "COMPLETED":
                continue
            if action["act_type"] == "FILEREAD":
                s_r_time += (action["act_duration"]  + r_oh[cat]) * action["nb_stripes"]
            if action["act_type"] == "CUSTOM" and "write" in str(action["sub_job"]):
                s_w_time += (action["act_duration"]  + w_oh[cat]) * action["nb_stripes"]
        s_io_time = (s_r_time + s_w_time)

        # "Real"
        r_io_time = ( job["real_cReadTime_s"]
                    + job["real_cWriteTime_s"])
        real_io_time.append(r_io_time)
        real_read_time.append(job["real_cReadTime_s"])
        real_write_time.append(job["real_cWriteTime_s"])

        sim_io_time.append(s_io_time)
        sim_read_time.append(s_r_time)
        sim_write_time.append(s_w_time)

    number_of_jobs = len(real_io_time)
    print(f"{number_of_jobs} jobs plotted")
    if number_of_jobs == 0:
        # Nothing to correlate or plot for this category.
        continue

    max_target = max(max(real_io_time), max(sim_io_time))
    min_target = min(min(real_io_time), min(sim_io_time))
    line = {"x": [0, max_target], "y": [0, max_target]}

    io_time_corr, _ = pearsonr(sim_io_time, real_io_time)
    print(f"Global correlation cat {cat}: {io_time_corr}")

    # A fresh figure per category, so each saved file holds exactly one
    # category whatever the matplotlib backend does on plt.show().
    fig, ax = plt.subplots()

    scatter = sns.scatterplot(
        x=real_io_time,
        y=sim_io_time,
        s=125,
        c=colors[cat],
        alpha=0.5,
        label=labels[cat],
        zorder=10,
        marker=markers[cat],
        ax=ax,
    )

    target_line = sns.lineplot(
        line, x="x", y="y", color="red", linestyle="--", label="Sim. I/O == Real I/O target", linewidth=3.5, zorder=-10, ax=ax)
    ax.set_xlabel("Cumulated real I/O Time (s)", fontsize=40)
    ax.set_ylabel("Cumulated simulated I/O Time (s)", fontsize=40)
    ax.tick_params(labelsize=32)
    ax.set_xscale('log')
    ax.set_xlim([min_target, max_target*1.05])
    ax.set_yscale('log')
    ax.set_ylim([min_target, max_target*1.05])
    ax.grid(visible=True, which="both", axis="both", zorder=-10.0, alpha=0.4, linewidth=1)
    ax.set_frame_on(False)

    if cat == 1:
        # Dashed box (data coordinates), drawn on the regular-jobs figure only.
        rect = mpl.patches.Rectangle(
            xy=(1000, 50000),
            width=3000,
            height=350000,
            alpha=1,
            edgecolor="black",
            fill=False,
            linewidth=2,
            clip_on=False,
            zorder=50,
            linestyle="--",
        )
        ax.add_artist(rect)

    ax.legend(fontsize='36')
    fig.tight_layout()
    fig.savefig(f"{RES_DIR}/{ID}_simToRealIotimes_fullYearAllCat{cat}.pdf", dpi=300)
    fig.savefig(f"{RES_DIR}/{ID}_simToRealIotimes_fullYearAllCat{cat}.png", dpi=300)
    plt.show()
    plt.close(fig)

## I/O volume over I/O duration (simulated, per month and per job category)

In [15]:
import numpy as np
from yaml import load, CLoader
import pathlib

sns.set(rc={'figure.figsize':(12, 12)})
sns.set_style("white")

directory = f"./{RES_DIR}"

result_path = pathlib.Path(directory)
# Keep the simulated-job result files, except those tagged with the
# calibration ID (original filter: "p955" for ID = 955).
# NOTE(review): assumes that tag is always f"p{ID}" -- confirm.
excluded_tag = f"p{ID}"
files = sorted(
    path for path in result_path.iterdir()
    if path.is_file() and "simulatedJobs" in str(path) and excluded_tag not in str(path)
)

for idx, filename in enumerate(files):

    # Plain lists while accumulating (np.append copies the array on every call).
    sim_io_times = []
    sim_io_volumes = []
    sim_io_bw_mb = []

    out_of_class = 0
    no_io_jobs = 0

    results = None
    # NOTE(review): CLoader is not a safe loader; only use it on trusted, locally produced files.
    with open(filename, "r", encoding="utf-8") as job_results:
        results = load(job_results, Loader=CLoader)

    for job in results:

        # Simulated only
        s_io_time = 0
        s_io_vol = 0
        for action in job["actions"]:
            if action["act_type"] in ("COMPUTE", "SLEEP"):
                continue
            if action["act_status"] != "COMPLETED":
                continue
            if action["act_type"] == "FILEREAD":
                s_io_time += (action["act_duration"] + overhead_read)* action["nb_stripes"]
                s_io_vol += action["io_size_bytes"]
            if action["act_type"] == "CUSTOM" and "write" in str(action["sub_job"]):
                s_io_time += (action["act_duration"] + overhead_write)* action["nb_stripes"]
                s_io_vol += action["io_size_bytes"]

        if s_io_time == 0:
            # No completed I/O action: bandwidth and category are undefined
            # (both would divide by zero), so the job is left out.
            no_io_jobs += 1
            continue

        sim_io_times.append(s_io_time)
        sim_io_volumes.append(s_io_vol)
        sim_io_bw_mb.append((s_io_vol / 1_000_000) / s_io_time)

        sim_category = computeCategory(s_io_vol, s_io_time, job['job_uid'])
        # A job that could not be categorised counts as out of its class.
        if sim_category is None or int(sim_category) != int(job["category"]):
            out_of_class += 1

    sim_io_times = np.array(sim_io_times)
    sim_io_volumes = np.array(sim_io_volumes)
    sim_io_bw_mb = np.array(sim_io_bw_mb)

    if no_io_jobs:
        print(f"{no_io_jobs} job(s) without simulated I/O skipped in {filename}")
    if len(sim_io_times) == 0:
        print(f"No job to plot for {filename}")
        continue

    print(f"Jobs out of their class : {out_of_class} / {len(results)}")
    print(f"({(out_of_class * 100) / len(results)} % out of class)")


    fig, ax = plt.subplots(figsize=(8,5), frameon=True, linewidth=0, layout='constrained')

    # Scatter plot of total simulated I/O time against total simulated I/O volume,
    # coloured by the resulting bandwidth (MB/s)
    g = sns.scatterplot(x=sim_io_times, y=sim_io_volumes, palette=sns.color_palette("crest", as_cmap=True), hue=sim_io_bw_mb, ax=ax, zorder=10)
    g.set(xscale='log', yscale='log')
    g.set(xlabel="Total simulated I/O duration per job (s)", ylabel="Total simulated I/O volume per job (Bytes)")
    # Last "_"-separated token of the file name (without extension), e.g. "month3".
    month = filename.stem.rsplit("_", 1)[-1]
    g.set(title=month)

    # Category thresholds drawn as lines in the (duration, volume) plane.
    line_75_f = {'x': [0.1, 1000000.0], 'y': [2401391.9159769486, 23441945375976.598]}
    line_25_f = {'x': [0.1, 1000000.0], 'y': [317368.59941636777, 3098093785554.3726]}
    sns.lineplot(x="x", y="y", data=line_75_f, legend=None, color="#e91140", zorder=20, label="Q3 I/O BW Filtered", ax=ax, linestyle="-", linewidth=1, alpha=0.8)
    sns.lineplot(x="x", y="y", data=line_25_f, legend=None, color="#2b54a5", zorder=20, label="Q1 I/O BW Filtered", ax=ax, linestyle="-", linewidth=1, alpha=0.8)

    ax.minorticks_on()
    ax.grid(visible=True, which="both", axis="both", zorder=-10.0, alpha=0.4, linewidth=1)
    ax.set_frame_on(False)

    legend = ax.legend(loc="best", bbox_to_anchor=(1, 0.5, 0, 0.4), fontsize=10, frameon=False, handletextpad=0.2)

    # Insert a pseudo sub-title before the two threshold entries.
    handles, labels = ax.get_legend_handles_labels()
    handles.insert(-2, handles[0])
    labels.insert(-2, "Bandwidth\nthresholds")

    # NOTE(review): rebuilds the legend through private matplotlib attributes;
    # this may break with a matplotlib upgrade.
    legend._legend_box = None
    legend._init_legend_box(handles, labels)
    legend._set_loc(legend._loc)
    legend.set_title(legend.get_title().get_text())

    # Integer-looking entries (hue levels) are rewritten as bandwidth bounds.
    texts = legend.get_texts()
    for text in texts:
        if text.get_text() == "CUMUL_READ_BW_MB":
            text.set_text("Cumulated read\nbandwidth")
        if text.get_text() == "NODES_USED":
            text.set_text("Nodes in\nreservation")
            break
        try:
            int(text.get_text())
        except ValueError:
            continue
        else:
            text.set_text(f"<= {int(text.get_text())} MB/s")

    plt.savefig(f"{RES_DIR}/{ID}_IOVolumeToTime_Simulated_{month}.png", dpi=300)
    plt.show()
    # One figure per file: release it once displayed and saved.
    plt.close(fig)

In [16]:
import numpy as np
from yaml import load, CLoader
import pathlib

# One figure per Fives result file of the current calibration: total simulated
# I/O volume against total simulated I/O time for every job, coloured by the
# resulting bandwidth, with the Q1/Q3 bandwidth thresholds drawn on top.
directory = f"./{RES_DIR}"

result_path = pathlib.Path(directory)
files = [f for f in result_path.iterdir() if f.is_file()]
files = sorted(files)
# Result files of the calibration under study carry a "p<ID>" tag in their name.
# The tag is derived from ID (configuration cell) instead of being hard-coded,
# so the selection follows the calibration when ID is changed.
files = [file for file in files if f"p{ID}" in str(file)]

for idx, filename in enumerate(files):

    # Per-job totals are accumulated in plain lists and converted to arrays once
    # (np.append reallocates the whole array at every call).
    io_times = []
    io_volumes = []
    io_bw_mb = []

    out_of_class = 0
    skipped_jobs = 0  # jobs without any completed I/O action

    # NOTE(review): CLoader is not the "safe" YAML loader (CSafeLoader is); fine for
    # locally generated simulation output, do not point it at untrusted files.
    results = None
    with open(filename, "r", encoding="utf-8") as job_results:
        results = load(job_results, Loader=CLoader)

    for job in results:

        # Simulated only: sum duration (plus static overhead, weighted by the
        # stripe count) and volume over the COMPLETED read / write actions.
        s_io_time = 0
        s_io_vol = 0
        for action in job["actions"]:
            if action["act_type"] == "COMPUTE" or action["act_type"] == "SLEEP":
                continue
            if action["act_status"] != "COMPLETED":
                continue
            if action["act_type"] == "FILEREAD":
                s_io_time += (action["act_duration"] + overhead_read) * action["nb_stripes"]
                s_io_vol += action["io_size_bytes"]
            if action["act_type"] == "CUSTOM" and "write" in str(action["sub_job"]):
                s_io_time += (action["act_duration"] + overhead_write) * action["nb_stripes"]
                s_io_vol += action["io_size_bytes"]

        # A job with no completed I/O has no bandwidth and no category: both the
        # bandwidth below and computeCategory() would divide by zero. It cannot be
        # placed on a log-log plot either, so it is skipped and reported.
        if s_io_time <= 0:
            skipped_jobs += 1
            continue

        io_times.append(s_io_time)
        io_volumes.append(s_io_vol)
        io_bw_mb.append((s_io_vol / 1_000_000) / s_io_time)

        sim_category = computeCategory(s_io_vol, s_io_time, job['job_uid'])
        if int(sim_category) != int(job["category"]):
            out_of_class += 1
            # print(f"Sim category {sim_category} != real category {job['category']}")

    sim_io_times = np.array(io_times, dtype=float)
    sim_io_volumes = np.array(io_volumes, dtype=float)
    sim_io_bw_mb = np.array(io_bw_mb, dtype=float)

    print(f"Jobs out of their class : {out_of_class} / {len(results)}")
    if skipped_jobs:
        print(f"Jobs skipped (no completed I/O) : {skipped_jobs}")

    # Pre-computed end points of the bandwidth quartile lines. Only the "_f"
    # (filtered) pair is drawn below; the other pair is kept for reference.
    line_75 = {'x': [0.1, 1000000.0], 'y': [5071992.807945422, 49511859168074.27]}
    line_25 = {'x': [0.1, 1000000.0], 'y': [242675.1002595881, 2368949610659.6855]}
    line_75_f = {'x': [0.1, 1000000.0], 'y': [2401391.9159769486, 23441945375976.598]}
    line_25_f = {'x': [0.1, 1000000.0], 'y': [317368.59941636777, 3098093785554.3726]}


    fig, ax = plt.subplots(figsize=(8,5), frameon=True, linewidth=0, layout='constrained')

    # Scatter plot of total simulated I/O time to total simulated I/O volume
    g = sns.scatterplot(x=sim_io_times, y=sim_io_volumes, palette=sns.color_palette("crest", as_cmap=True), hue=sim_io_bw_mb, ax=ax, zorder=10)
    g.set(xscale='log', yscale='log')
    g.set(xlabel="Total simulated I/O duration per job (s)", ylabel="Total simulated I/O volume per job (Bytes)")
    # Title = the part of the file name that follows "theta2022_", cut at the first
    # underscore found at least two characters further (fragile string slicing).
    pos = str(filename).find("theta2022_")
    title = str(filename)[pos + 10:str(filename).find("_", pos + 12)]
    print(title)
    g.set(title=title)

    sns.lineplot(x="x", y="y", data=line_75_f, legend=None, color="#e91140", zorder=20, label="Q3 I/O BW Filtered", ax=ax, linestyle="-", linewidth=1, alpha=0.8)
    sns.lineplot(x="x", y="y", data=line_25_f, legend=None, color="#2b54a5", zorder=20, label="Q1 I/O BW Filtered", ax=ax, linestyle="-", linewidth=1, alpha=0.8)


    ax.minorticks_on()
    ax.grid(visible=True, which="both", axis="both", zorder=-10.0, alpha=0.4, linewidth=1)
    ax.set_frame_on(False)
    # ax.set_xlim(10e-5)

    legend = ax.legend(loc="best", bbox_to_anchor=(1, 0.5, 0, 0.4), fontsize=10, frameon=False, handletextpad=0.2)

    # Insert a "Bandwidth thresholds" pseudo-title before the last two legend
    # entries (the two quartile lines), reusing the first handle as placeholder.
    handles, labels = ax.get_legend_handles_labels()
    handles.insert(-2, handles[0])
    labels.insert(-2, "Bandwidth\nthresholds")

    # NOTE(review): rebuilding the legend box relies on private matplotlib
    # attributes (_legend_box, _init_legend_box, _set_loc) and may break on upgrade.
    legend._legend_box = None
    legend._init_legend_box(handles, labels)
    legend._set_loc(legend._loc)
    legend.set_title(legend.get_title().get_text())

    # Relabel legend entries: known column names get a readable text, purely
    # numeric entries (bandwidth levels of the hue) get a "<= N MB/s" form.
    texts = legend.get_texts()
    for text in texts:
        if text.get_text() == "CUMUL_READ_BW_MB":
            text.set_text("Cumulated read\nbandwidth")
        if text.get_text() == "NODES_USED":
            text.set_text("Nodes in\nreservation")
            break
        try:
            int(text.get_text())
        except ValueError:
            continue
        else:
            text.set_text(f"<= {int(text.get_text())} MB/s")

    plt.savefig(f"{RES_DIR}/{ID}_IOVolumeToTime_Simulated_{title}.png", dpi=300)
    plt.show()
    # Release the figure: one is created per result file.
    plt.close(fig)

## Figure 7 (legacy) - I/O volume over I/O duration - Simulation vs traces comparison

In [18]:
### REAL PART
import numpy as np
from yaml import load, CLoader
import pathlib
import matplotlib as mpl

# Compare, for the month-11 / category-1 dataset, the cumulative I/O volume and
# I/O time of the jobs in the dataset file ("Real") with the values rebuilt
# from the Fives simulation output ("Simulated").

# `load` / `CLoader` are the names this cell imports; the previous
# `yaml.load(..., Loader=yaml.CLoader)` only worked if another cell had run
# `import yaml` beforehand.
# NOTE(review): CLoader is not the "safe" YAML loader; only use on trusted files.
with open(f"../exp_datasets/theta2022_aggMonth11_cat1.yaml", "r", encoding="utf-8") as job_dataset:
    dataset = load(job_dataset, Loader=CLoader)

out_of_class = 0

real_jobs = {"io_vol_bytes": [], "io_time_s": []}

for job in dataset["jobs"]:

    # Real I/O time includes the metadata time recorded in the dataset
    sum_io_volume = job["readBytes"] + job["writtenBytes"]
    sum_io_time =  job["readTimeSeconds"] + job["writeTimeSeconds"] + job["metaTimeSeconds"]
    real_jobs["io_vol_bytes"].append(sum_io_volume)
    real_jobs["io_time_s"].append(sum_io_time)

## SIMULATION PART
results = None
with open(f"./{RES_DIR}/simulatedJobs_theta2022_aggMonth11_cat1__Fives_C_theta2022_aggMonth11_cat1_0.0.1_month11.yml", "r", encoding="utf-8") as job_results:
    results = load(job_results, Loader=CLoader)

io_times = []
io_volumes = []
io_bw_mb = []
classified_job_count = 0  # jobs for which a simulated category could be computed
skipped_jobs = 0          # jobs without any completed I/O action

for job in results:

    # Sum duration (plus static overhead, weighted by the stripe count) and
    # volume over the COMPLETED read / write actions of the job.
    s_io_time = 0
    s_io_vol = 0
    for action in job["actions"]:
        if action["act_type"] == "COMPUTE" or action["act_type"] == "SLEEP":
            continue
        if action["act_status"] != "COMPLETED":
            continue
        if action["act_type"] == "FILEREAD":
            s_io_time += (action["act_duration"] + overhead_read)* action["nb_stripes"]
            s_io_vol += action["io_size_bytes"]
        if action["act_type"] == "CUSTOM" and "write" in str(action["sub_job"]):
            s_io_time += (action["act_duration"] + overhead_write)* action["nb_stripes"]
            s_io_vol += action["io_size_bytes"]

    # Without completed I/O there is no bandwidth and no category (both would
    # divide by zero) and nothing to place on a log-log plot: skip and report.
    if s_io_time <= 0:
        skipped_jobs += 1
        continue

    io_times.append(s_io_time)
    io_volumes.append(s_io_vol)
    io_bw_mb.append((s_io_vol / 1_000_000) / s_io_time)

    classified_job_count += 1
    sim_category = computeCategory(s_io_vol, s_io_time, job['job_uid'])
    if int(sim_category) != int(job["category"]):
        out_of_class += 1

sim_io_times = np.array(io_times, dtype=float)
sim_io_volumes = np.array(io_volumes, dtype=float)
sim_io_bw_mb = np.array(io_bw_mb, dtype=float)

# Pre-computed end points of the Q3 / Q1 bandwidth threshold lines
line_75_f = {'x': [0.1, 1000000.0], 'y': [2401391.9159769486, 23441945375976.598]}
line_25_f = {'x': [0.1, 1000000.0], 'y': [317368.59941636777, 3098093785554.3726]}
# Axis bounds covering both the real and the simulated point clouds
x_min = min(min(real_jobs["io_time_s"]), min(sim_io_times))
x_max = max(max(real_jobs["io_time_s"]), max(sim_io_times))
y_min = min(min(real_jobs["io_vol_bytes"]), min(sim_io_volumes))
y_max = max(max(real_jobs["io_vol_bytes"]), max(sim_io_volumes))

# Scatter plot of total I/O time to total I/O volume (real jobs)
g = sns.scatterplot(x=real_jobs["io_time_s"], y=real_jobs["io_vol_bytes"], color="green", zorder=10, alpha=0.4, s=800, label="Real")
g.set_xlabel("Cumul. IO time (R/W Sec. from all processes)", fontsize=26)
g.set_ylabel("Cumul. IO volume (R/W Bytes from all processes)", fontsize=26)
g.set(xscale='log', yscale='log')
g.tick_params(labelsize=26)
g.set_xlim([x_min*0.90, x_max*1.15])
g.set_ylim([y_min*0.90, y_max*1.15])
#g.minorticks_on()
g.grid(visible=True, which="both", axis="both", zorder=-10.0, alpha=0.4, linewidth=1)
g.set_frame_on(False)

# Simulated jobs are drawn on the same (current) axes
g2 = sns.scatterplot(x=sim_io_times, y=sim_io_volumes, color='orange', alpha=0.4, zorder=10, s=800,label="Simulated")

sns.lineplot(x="x", y="y", data=line_75_f, legend=None, color="#e91140", zorder=-20, label="Q3 I/O BW Filtered", linestyle="-", linewidth=2, alpha=0.8)
sns.lineplot(x="x", y="y", data=line_25_f, legend=None, color="#2b54a5", zorder=-20, label="Q1 I/O BW Filtered", linestyle="-", linewidth=2, alpha=0.8)
# Single legend built once every artist has been added
plt.legend(fontsize='26')

plt.tight_layout()
plt.savefig(f"{RES_DIR}/{ID}_volumeToTime_realVSsim_month11.png", dpi=300)
plt.savefig(f"{RES_DIR}/{ID}_volumeToTime_realVSsim_month11.pdf", dpi=300)
plt.show()


print(f"Out of class : {out_of_class}")
tt_job_count = len(results)
print(f"Total number of jobs: {tt_job_count}")
if skipped_jobs:
    print(f"Jobs skipped (no completed I/O): {skipped_jobs}")
# The rate is computed over the jobs that were actually classified
print(f"%tage of simulated jobs IN rightfull class: {(classified_job_count - out_of_class) / (classified_job_count) * 100}")

### Figure 7 HiPC Paper (Cumulative I/O volume vs. I/O time for real (green) and simulated (grey) regular jobs. Year of 2022, excl. training set (November).)

In [19]:
## 
import numpy as np
from yaml import load, CLoader
import pathlib
import matplotlib as mpl

# Real vs simulated cumulative I/O volume / I/O time for the full-year category-1
# dataset, with the jobs of the calibration run (month 11) removed from both sides.

# Job ids of the month-11 simulation output: these jobs are excluded below.
# NOTE(review): CLoader is not the "safe" YAML loader; only use on trusted files.
with open(f"{RES_DIR}/simulatedJobs_theta2022_aggMonth11_cat1__Fives_C_theta2022_aggMonth11_cat1_0.0.1_month11.yml", "r", encoding="utf-8") as job_results:
    november = load(job_results, Loader=CLoader)
november_id = set()
for job in november:
    november_id.add(job["job_uid"])

### REAL PART
# `load` / `CLoader` are the names this cell imports; the previous
# `yaml.load(..., Loader=yaml.CLoader)` only worked if another cell had run
# `import yaml` beforehand.
with open(f"../exp_datasets/theta2022_1.yaml", "r", encoding="utf-8") as job_dataset:
    dataset = load(job_dataset, Loader=CLoader)

dataset_without_november = {'jobs': []}
for job in dataset['jobs']:
    if job['id'] not in november_id:
        dataset_without_november['jobs'].append(job)

out_of_class = 0

real_jobs = {"io_vol_bytes": [], "io_time_s": []}

for job in dataset_without_november["jobs"]:
    # Real I/O time includes the metadata time recorded in the dataset
    sum_io_volume = job["readBytes"] + job["writtenBytes"]
    sum_io_time =  job["readTimeSeconds"] + job["writeTimeSeconds"] + job["metaTimeSeconds"]
    real_jobs["io_vol_bytes"].append(sum_io_volume)
    real_jobs["io_time_s"].append(sum_io_time)

## SIMULATION PART
results = None
with open(f"{RES_DIR}/simulatedJobs_theta2022_1__Fives_C_theta2022_aggMonth11_cat1_0.0.1_p955.yml", "r", encoding="utf-8") as job_results:
    results = load(job_results, Loader=CLoader)

io_times = []
io_volumes = []
io_bw_mb = []
classified_job_count = 0  # jobs outside the training set that were classified
skipped_jobs = 0          # jobs outside the training set without completed I/O

for job in results:

    # Training-set jobs are neither plotted nor classified
    if job["job_uid"] in november_id:
        continue

    # Sum duration (plus static overhead, weighted by the stripe count) and
    # volume over the COMPLETED read / write actions of the job.
    s_io_time = 0
    s_io_vol = 0
    for action in job["actions"]:
        if action["act_type"] == "COMPUTE" or action["act_type"] == "SLEEP":
            continue
        if action["act_status"] != "COMPLETED":
            continue
        if action["act_type"] == "FILEREAD":
            s_io_time += (action["act_duration"] + overhead_read)* action["nb_stripes"]
            s_io_vol += action["io_size_bytes"]
        if action["act_type"] == "CUSTOM" and "write" in str(action["sub_job"]):
            s_io_time += (action["act_duration"] + overhead_write)* action["nb_stripes"]
            s_io_vol += action["io_size_bytes"]

    # Without completed I/O there is no bandwidth and no category (both would
    # divide by zero) and nothing to place on a log-log plot: skip and report.
    if s_io_time <= 0:
        skipped_jobs += 1
        continue

    io_times.append(s_io_time)
    io_volumes.append(s_io_vol)
    io_bw_mb.append((s_io_vol / 1_000_000) / s_io_time)

    classified_job_count += 1
    sim_category = computeCategory(s_io_vol, s_io_time, job['job_uid'])
    if int(sim_category) != int(job["category"]):
        out_of_class += 1

sim_io_times = np.array(io_times, dtype=float)
sim_io_volumes = np.array(io_volumes, dtype=float)
sim_io_bw_mb = np.array(io_bw_mb, dtype=float)

# Pre-computed end points of the Q3 / Q1 bandwidth threshold lines
line_75_f = {'x': [0.1, 1000000.0], 'y': [2401391.9159769486, 23441945375976.598]}
line_25_f = {'x': [0.1, 1000000.0], 'y': [317368.59941636777, 3098093785554.3726]}

# Axis bounds covering both the real and the simulated point clouds
x_min = min(min(real_jobs["io_time_s"]), min(sim_io_times))
x_max = max(max(real_jobs["io_time_s"]), max(sim_io_times))
y_min = min(min(real_jobs["io_vol_bytes"]), min(sim_io_volumes))
y_max = max(max(real_jobs["io_vol_bytes"]), max(sim_io_volumes))

# Scatter plot of total I/O time to total I/O volume (real jobs)
g = sns.scatterplot(x=real_jobs["io_time_s"], y=real_jobs["io_vol_bytes"], color="green", zorder=10, alpha=0.4, s=150, label="Real")
g.set_xlabel("Cumul. IO time (R/W Sec. from all processes)", fontsize=26)
g.set_ylabel("Cumul. IO volume (R/W Bytes from all processes)", fontsize=26)
g.set(xscale='log', yscale='log')
g.tick_params(labelsize=26)
g.set_xlim([x_min*0.90, x_max*1.15])
g.set_ylim([y_min*0.90, y_max*1.15])
#g.minorticks_on()
g.grid(visible=True, which="both", axis="both", zorder=-10.0, alpha=0.4, linewidth=1)
g.set_frame_on(False)

# Simulated jobs are drawn on the same (current) axes
g2 = sns.scatterplot(x=sim_io_times, y=sim_io_volumes, color='orange', alpha=0.4, zorder=10, s=150,label="Simulated", marker="v")

sns.lineplot(x="x", y="y", data=line_75_f, legend=None, color="#e91140", zorder=-20, label="Q3 I/O BW Filtered", linestyle="-", linewidth=3, alpha=0.8)
sns.lineplot(x="x", y="y", data=line_25_f, legend=None, color="#2b54a5", zorder=-20, label="Q1 I/O BW Filtered", linestyle="-", linewidth=3, alpha=0.8)
# Single legend built once every artist has been added
plt.legend(fontsize='26')

# Dashed box highlighting a hand-picked region of the plot (data coordinates)
rect = mpl.patches.Rectangle(
    xy=(45000, 2.5e11),
    width=2.9e5,
    height=0.9e11,
    alpha=1,
    edgecolor="black",
    fill=False,
    linewidth=2,
    clip_on=False,
    zorder=50,
    linestyle="--",
)
g2.axes.add_artist(rect)

plt.tight_layout()
plt.savefig(f"{RES_DIR}/{ID}_volumeToTime_realVSsim_FullYearC1_noTrainSet.png", dpi=300)
plt.savefig(f"{RES_DIR}/{ID}_volumeToTime_realVSsim_FullYearC1_noTrainSet.pdf", dpi=300)
plt.show()

print(f"Out of class : {out_of_class}")
tt_job_count = len(results)
print(f"Total number of jobs: {tt_job_count}")
print(f"Jobs classified (training set excluded): {classified_job_count}")
if skipped_jobs:
    print(f"Jobs skipped (no completed I/O): {skipped_jobs}")
# The rate is computed over the classified jobs only: len(results) still contains
# the training-set jobs skipped above, which used to inflate the percentage.
print(f"%tage of simulated jobs IN rightfull class: {(classified_job_count - out_of_class) / (classified_job_count) * 100}")

In [23]:
## Same as above but with simulated jobs only, and for the 3 classes
import numpy as np
from yaml import load, CLoader
import pathlib
import matplotlib as mpl


def _simulated_io_totals(job, read_overhead, write_overhead):
    """Return (io_time, io_volume) summed over the COMPLETED I/O actions of a job.

    COMPUTE and SLEEP actions are ignored. For FILEREAD actions, and for CUSTOM
    actions whose "sub_job" contains "write", the duration plus the given static
    overhead is multiplied by "nb_stripes" and "io_size_bytes" is added to the volume.
    """
    io_time = 0
    io_vol = 0
    for action in job["actions"]:
        if action["act_type"] == "COMPUTE" or action["act_type"] == "SLEEP":
            continue
        if action["act_status"] != "COMPLETED":
            continue
        if action["act_type"] == "FILEREAD":
            io_time += (action["act_duration"] + read_overhead) * action["nb_stripes"]
            io_vol += action["io_size_bytes"]
        if action["act_type"] == "CUSTOM" and "write" in str(action["sub_job"]):
            io_time += (action["act_duration"] + write_overhead) * action["nb_stripes"]
            io_vol += action["io_size_bytes"]
    return io_time, io_vol


def _collect_class_results(jobs, read_overhead, write_overhead):
    """Return (io_times, io_volumes, out_of_class) for a list of simulated jobs.

    io_times / io_volumes are float arrays with one entry per job that has
    completed I/O; out_of_class counts the jobs whose category recomputed with
    computeCategory() differs from their recorded "category".
    Jobs without completed I/O are left out: they have no bandwidth, and
    computeCategory() would divide by zero on them.
    """
    times = []
    volumes = []
    misclassified = 0
    for job in jobs:
        io_time, io_vol = _simulated_io_totals(job, read_overhead, write_overhead)
        if io_time <= 0:
            continue
        times.append(io_time)
        volumes.append(io_vol)
        if int(computeCategory(io_vol, io_time, job['job_uid'])) != int(job["category"]):
            misclassified += 1
    return np.array(times, dtype=float), np.array(volumes, dtype=float), misclassified


# Each class now fills its own arrays. The previous version appended every job to
# the `sim_io_times` / `sim_io_volumes` arrays left over from another cell, so the
# three plotted series were that stale data plus the last job of each class.

# NOTE(review): CLoader is not the "safe" YAML loader; only use on trusted files.

## SIMULATION JOBS, CAT 1
results = None
with open(f"{RES_DIR}/simulatedJobs_theta2022_1__Fives_C_theta2022_aggMonth11_cat1_0.0.1_p955.yml", "r", encoding="utf-8") as job_results:
    results = load(job_results, Loader=CLoader)

sim_io_times_reg, sim_io_volumes_reg, out_of_class_regular = _collect_class_results(
    results, overhead_read, overhead_write
)


## SIMULATION JOBS, CAT 2
# This file comes from another calibration (months_657); its overheads were
# hard-coded as 1 (read) and 3 (write) and are kept as such.
# NOTE(review): presumably the overheads of calibration 657 - confirm.
overhead_read_fast = 1
overhead_write_fast = 3

results_fast = None
with open("months_657/simulatedJobs_theta2022_2__Fives_C_theta2022_aggMonth10_cat2_0.0.1_657C2.yml", "r", encoding="utf-8") as job_results:
    results_fast = load(job_results, Loader=CLoader)

sim_io_times_fast, sim_io_volumes_fast, out_of_class_fast = _collect_class_results(
    results_fast, overhead_read_fast, overhead_write_fast
)


## SIMULATION JOBS, CAT 0
results_slow = None
with open(f"{RES_DIR}/simulatedJobs_theta2022_0__Fives_C_theta2022_aggMonth11_cat1_0.0.1_p955.yml", "r", encoding="utf-8") as job_results:
    results_slow = load(job_results, Loader=CLoader)

sim_io_times_slow, sim_io_volumes_slow, out_of_class_slow = _collect_class_results(
    results_slow, overhead_read, overhead_write
)


# Min-max plot bounds over the three classes (the upper bounds used to take the
# fast class twice and ignore the slow one)
x_min = min(min(sim_io_times_reg), min(sim_io_times_fast), min(sim_io_times_slow))
x_max = max(max(sim_io_times_reg), max(sim_io_times_fast), max(sim_io_times_slow))
y_min = min(min(sim_io_volumes_reg), min(sim_io_volumes_fast), min(sim_io_volumes_slow))
y_max = max(max(sim_io_volumes_reg), max(sim_io_volumes_fast), max(sim_io_volumes_slow))

# Plot everything

# Quartile lines (pre-computed end points of the Q3 / Q1 bandwidth thresholds)
line_75_f = {'x': [0.1, 1000000.0], 'y': [2401391.9159769486, 23441945375976.598]}
line_25_f = {'x': [0.1, 1000000.0], 'y': [317368.59941636777, 3098093785554.3726]}
sns.lineplot(x="x", y="y", data=line_75_f, legend=None, color="#e91140", zorder=-20, label="Q3 I/O BW Filtered", linestyle="-", linewidth=3, alpha=0.8)
sns.lineplot(x="x", y="y", data=line_25_f, legend=None, color="#2b54a5", zorder=-20, label="Q1 I/O BW Filtered", linestyle="-", linewidth=3, alpha=0.8)

g = sns.scatterplot(x=sim_io_times_reg, y=sim_io_volumes_reg, color='grey', alpha=0.4, zorder=10, s=150,label="Regular", marker="o")
g2 = sns.scatterplot(x=sim_io_times_fast, y=sim_io_volumes_fast, color='red', alpha=0.4, zorder=10, s=150,label="Fast", marker="+")
g3 = sns.scatterplot(x=sim_io_times_slow, y=sim_io_volumes_slow, color='blue', alpha=0.4, zorder=10, s=150,label="Slow", marker="v")

# Plot Settings
plt.legend(fontsize='26')
g.set_xlabel("Cumul. IO time (R/W Sec. from all processes)", fontsize=26)
g.set_ylabel("Cumul. IO volume (R/W Bytes from all processes)", fontsize=26)
g.set(xscale='log', yscale='log')
g.tick_params(labelsize=26)
g.set_xlim([x_min*0.90, x_max*1.15])
g.set_ylim([y_min*0.90, y_max*1.15])
g.grid(visible=True, which="both", axis="both", zorder=-10.0, alpha=0.4, linewidth=1)
g.set_frame_on(False)

# Print plot
plt.tight_layout()
plt.savefig(f"{RES_DIR}/{ID}_volumeToTime_realVSsim_FullYearAllCat.png", dpi=300)
plt.savefig(f"{RES_DIR}/{ID}_volumeToTime_realVSsim_FullYearAllCat.pdf", dpi=300)
plt.show()

print(f"Out of class (regular): {out_of_class_regular}")
print(f"Out of class (fast): {out_of_class_fast}")
print(f"Out of class (slow): {out_of_class_slow}")

## Figure 10 - Variation of the OST count of the platform, based on calibrated configuration

In [18]:
sns.set(rc={'figure.figsize':(12, 12), 'figure.dpi':300})
sns.set_style("white")

import numpy as np
from yaml import load, CLoader
import pathlib
import matplotlib as mpl
from  scipy.stats import ttest_rel, ttest_ind

# Simulation outputs to overlay; colors[i] and labels[i] describe ost_files[i]
ost_files = [
    "ost_count_inc/simulatedJobs_theta2022_aggMonth11_cat1__Fives_C_theta2022_aggMonth11_cat1_0.0.1_955_O7F.yml",
    "ost_count_inc/simulatedJobs_theta2022_aggMonth11_cat1__Fives_C_theta2022_aggMonth11_cat1_0.0.1_955_O14F.yml",
    "ost_count_inc/simulatedJobs_theta2022_aggMonth11_cat1__Fives_C_theta2022_aggMonth11_cat1_0.0.1_955_O28F.yml",
    "ost_count_inc/simulatedJobs_theta2022_aggMonth11_cat1__Fives_C_theta2022_aggMonth11_cat1_0.0.1_955_O56F.yml",
    "ost_count_inc/simulatedJobs_theta2022_aggMonth11_cat1__Fives_C_theta2022_aggMonth11_cat1_0.0.1_955_O84F.yml",
]

colors = ["#70A9A1", "#EDAE49", "#FF6B6B", "#464B4F", "#8390FA"]   # 4th entry: OST =56

labels = ["7", "14", "28", "56", "84"]

# Per-file arrays, kept for the statistical tests after the loop
all_sim_io_times = []
all_sim_io_volumes = []

for ost_inc, file in enumerate(ost_files):

    sim_io_times = np.array([])
    sim_io_volumes = np.array([])

    results = None
    with open(f"./{file}", "r", encoding="utf-8") as job_results:
        results = load(job_results, Loader=CLoader)

    out_of_class = 0
    for job in results:

        # Totals over the COMPLETED read / write actions of the job
        s_io_time = s_r_time = s_w_time = s_io_vol = 0
        for action in job["actions"]:
            if action["act_type"] in ("COMPUTE", "SLEEP"):
                continue
            if action["act_status"] != "COMPLETED":
                continue
            if action["act_type"] == "FILEREAD":
                s_io_time += (action["act_duration"] + overhead_read) * action["nb_stripes"]
                s_io_vol += action["io_size_bytes"]
            elif action["act_type"] == "CUSTOM" and "write" in str(action["sub_job"]):
                s_io_time += (action["act_duration"] + overhead_write) * action["nb_stripes"]
                s_io_vol += action["io_size_bytes"]

        # Jobs without any action are not plotted
        if len(job['actions']) > 0:
            sim_io_times = np.append(sim_io_times, s_io_time)
            sim_io_volumes = np.append(sim_io_volumes, s_io_vol)

    x_min, x_max = min(sim_io_times), max(sim_io_times)
    y_min, y_max = min(sim_io_volumes), max(sim_io_volumes)

    # Later files are drawn above earlier ones (increasing zorder)
    scatter = sns.scatterplot(
        x=sim_io_times,
        y=sim_io_volumes,
        s=800,
        c=colors[ost_inc],
        alpha=0.8,
        label=labels[ost_inc],
        zorder=10 * ost_inc,
    )
    scatter.set_xlabel("Cumul. IO time (R/W Sec. from all processes)", fontsize=26)
    scatter.set_ylabel("Cumul. IO volume (R/W Bytes from all processes)", fontsize=26)
    scatter.set(xscale='log', yscale='log')
    scatter.tick_params(labelsize=26)
    scatter.set_xscale('log')
    scatter.set_yscale('log')

    all_sim_io_times.append(sim_io_times)
    all_sim_io_volumes.append(sim_io_volumes)
    print(f"Mean cumulative I/O times for jobs with {labels[ost_inc]} osts: {sim_io_times.mean()}s")



# Indices follow ost_files: 0 -> 7, 3 -> 56, 4 -> 84
print(f"T test 56 <-> 84 : {ttest_rel(all_sim_io_times[3], all_sim_io_times[4])}")
print(f"T test 7 <-> 56 : {ttest_ind(all_sim_io_times[0], all_sim_io_times[3])}")

#scatter.set_xlim([x_min*0.80, x_max*1.3])
#scatter.set_ylim([y_min*0.80, y_max*1.3])
scatter.grid(visible=True, which="both", axis="both", zorder=-10.0, alpha=0.4, linewidth=1)

scatter.set_frame_on(False)

plt.legend(fontsize='26')
plt.tight_layout()
plt.savefig(f"{RES_DIR}/{ID}_ostCountInc.pdf", dpi=300)
plt.savefig(f"{RES_DIR}/{ID}_ostCountInc.png", dpi=300)
plt.show()

In [19]:
# Bar plot titled "Failed jobs": one bar per OST count.
colors = [
    "#70A9A1",
    "#EDAE49",
    "#FF6B6B",
    "#464B4F",   # OST =56
    "#8390FA",
    "#ED474A",
]


# OST counts (x) and the value plotted for each of them (y).
# NOTE(review): the y values are hard-coded; presumably the number of failed jobs
# observed in each run - confirm against the simulation outputs.
x=[7, 14, 28, 56, 84]
y=[35, 24, 10, 0 ,0]

# Only as many colours as there are bars are handed to seaborn: a palette list
# longer than the number of hue levels makes it emit a UserWarning.
bar = sns.barplot(x=x, y=y, hue=x, palette=colors[:len(x)], legend=False)
bar.tick_params(labelsize=26)
bar.set_title("Failed jobs", fontsize=26)
plt.tight_layout()
plt.savefig(f"{RES_DIR}/{ID}_ostCountIncFailedJobs.pdf", dpi=300)
plt.savefig(f"{RES_DIR}/{ID}_ostCountIncFailedJobs.png", dpi=300)
plt.show()