# Simulation output analysis

## Cost function 

For each job, we want to minimize the gap between:

- simulated job's wait time and real wait time before execution -> Wj = abs(job_waiting_time_s - real_waiting_time_s)
- simulated job's run time and real runtime -> Rj = abs(job_runtime_s -  real_runtime_s)
- simulated job's total IO time and real IO time -> IOj = abs(sim_io_time_s - real_io_time_s)

 

In [1]:
from yaml import load, CLoader

# Location of the simulator output trace (YAML) analysed by the cells below.
simulated_job_trace = "./simulatedJobs_theta2022_week4__StorAlloc_Theta_Lustre_fs0_0.0.1_i.yml"

# `results` is pre-set to None so the name exists even if loading fails.
# NOTE(review): CLoader is not PyYAML's safe loader -- only load trusted trace files.
results = None
with open(simulated_job_trace, encoding="utf-8") as trace_file:
    results = load(trace_file, Loader=CLoader)

print(f"Loaded result dataset with {len(results)} jobs")

In [2]:
import numpy as np
from scipy.stats import pearsonr

def cohend(d1, d2):
    """Return Cohen's d effect size between the samples *d1* and *d2*.

    The difference of the sample means (d1 minus d2) is divided by the pooled
    standard deviation, which is built from the unbiased (ddof=1) variances of
    both samples weighted by their degrees of freedom.
    """
    size_1 = len(d1)
    size_2 = len(d2)

    # Unbiased sample variances
    var_1 = np.var(d1, ddof=1)
    var_2 = np.var(d2, ddof=1)

    # Pooled standard deviation
    weighted_var = (size_1 - 1) * var_1 + (size_2 - 1) * var_2
    pooled_std = np.sqrt(weighted_var / (size_1 + size_2 - 2))

    return (np.mean(d1) - np.mean(d2)) / pooled_std

In [3]:
## Wait time: per-job gap between simulation and real trace, plus summary stats

# Per-job series, in the order the jobs appear in `results`
sim_wait_time = [job["job_waiting_time_s"] for job in results]
real_wait_time = [job["real_waiting_time_s"] for job in results]

# Absolute simulated-vs-real gap for every job, and its mean
wait_time_diffs = [
    abs(simulated - real)
    for simulated, real in zip(sim_wait_time, real_wait_time)
]
mean_wait_time_difference = np.mean(wait_time_diffs)

# Pearson's correlation (only the coefficient is used, the second value is ignored)
wait_time_corr, _p_value = pearsonr(sim_wait_time, real_wait_time)

# Cohen's D
wait_time_cohen_d = cohend(sim_wait_time, real_wait_time)

print(f"The mean wait time difference between simulated and real values for all jobs is {mean_wait_time_difference}s (we want a mean difference as close to 0 as possible)")
print(f"The Pearson's corr is {wait_time_corr} (we want a correlation as high as possible, simulated and real values should be close)")
print(f"The Cohen d effect size is {wait_time_cohen_d} (we want an effect size as low as possible, the use of the simulator should lead to values close to real world traces)")

In [4]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
sns.set_theme(style="ticks")

# RGBA colours shared by the histograms of the entire notebook
real_color = (0.1, 0.4, 0.8, 0.5)
sim_color = (1, 0.5, 0.2, 0.5)

# One row of three panels: real-vs-simulated scatter, real histogram, simulated histogram
fig, axs = plt.subplots(ncols=3, figsize=(20, 6), tight_layout=True)
scatter_ax, real_ax, sim_ax = axs

# End points of the x=y target line (a perfect simulation would put every point on it)
max_target = max(real_wait_time + sim_wait_time)
line = {"x": [0, max_target], "y": [0, max_target]}

scatter = sns.scatterplot(x=real_wait_time, y=sim_wait_time, s=15, color=".15", ax=scatter_ax)
target_line = sns.lineplot(data=line, x="x", y="y", color="red", linestyle="--", ax=scatter_ax)
scatter.set(xlabel="Real", ylabel="Simulated")

# Same bin width for both histograms so that they can be compared side by side
binwidth = 10000
real_hist = sns.histplot(data=real_wait_time, binwidth=binwidth, color=real_color, ax=real_ax)
real_hist.set(xlabel=f"Real wait time - binwidth = {binwidth}s")

sim_hist = sns.histplot(data=sim_wait_time, binwidth=binwidth, color=sim_color, ax=sim_ax)
sim_hist.set(xlabel=f"Simulated wait time - binwidth = {binwidth}s")

In [18]:
## Run time: per-job gap between simulation and real trace, plus summary stats

# Per-job series, in the order the jobs appear in `results`
sim_runtime = [job["job_runtime_s"] for job in results]
real_runtime = [job["real_runtime_s"] for job in results]

# Absolute simulated-vs-real gap for every job
runtime_diffs = [
    abs(simulated - real)
    for simulated, real in zip(sim_runtime, real_runtime)
]

# Means of each series and of the per-job gap
mean_real_runtime = np.mean(real_runtime)
mean_sim_runtime = np.mean(sim_runtime)
mean_runtime_difference = np.mean(runtime_diffs)

# Pearson's correlation (only the coefficient is used, the second value is ignored)
runtime_corr, _p_value = pearsonr(sim_runtime, real_runtime)

# Cohen's D
runtime_cohen_d = cohend(sim_runtime, real_runtime)

print(f"Mean runtime for simulation : {mean_sim_runtime}s")
print(f"Mean runtime in traces : {mean_real_runtime}s")
print(f"The mean run time difference between simulated and real values for all jobs is {mean_runtime_difference}s")
print(f"The Pearson's corr is {runtime_corr} (we want a correlation as high as possible)")
print(f"The Cohen d effect size is {runtime_cohen_d} (we want an effect size as low as possible, the use of the simulator should lead to values close to real world traces)")

In [6]:
# One row of three panels: real-vs-simulated scatter, real histogram, simulated histogram
fig, axs = plt.subplots(ncols=3, figsize=(20, 6), tight_layout=True)
scatter_ax, real_ax, sim_ax = axs

# End points of the x=y target line
max_target = max(real_runtime + sim_runtime)
line = {"x": [0, max_target], "y": [0, max_target]}

scatter = sns.scatterplot(x=real_runtime, y=sim_runtime, s=15, color=".15", ax=scatter_ax)
target_line = sns.lineplot(data=line, x="x", y="y", color="red", linestyle="--", ax=scatter_ax)
scatter.set(xlabel="Real", ylabel="Simulated")

# Same bin width for both histograms so that they can be compared side by side
binwidth = 1000
real_hist = sns.histplot(data=real_runtime, binwidth=binwidth, color=real_color, ax=real_ax)
real_hist.set(xlabel=f"Real runtime - binwidth = {binwidth}s")

sim_hist = sns.histplot(data=sim_runtime, binwidth=binwidth, color=sim_color, ax=sim_ax)
sim_hist.set(xlabel=f"Simulated runtime - binwidth = {binwidth}s")

# Export the current figure in both a vector and a raster format
for out_path, out_format in (("runtimes_xreal_ysim.pdf", "pdf"), ("runtimes_xreal_ysim.png", "png")):
    plt.savefig(out_path, format=out_format)

In [7]:
## Compute the IO duration differences and stats for all jobs

# Simulated action types that are not IO and are therefore left out of the IO time
non_io_act_types = ("COMPUTE", "SLEEP")

io_time_diff = []
sim_io_time = []
sim_read_time = []
sim_write_time = []
real_io_time = []
real_read_time = []
real_write_time = []

for job in results:

    # "Real": trace times divided by the number of cores used by the job
    # (presumably the trace values are summed over all cores -- TODO confirm)
    cores_used = job["real_cores_used"]
    r_io_time = (job["real_cReadTime_s"] + job["real_cWriteTime_s"] + job["real_cMetaTime_s"]) / cores_used
    real_io_time.append(r_io_time)
    real_read_time.append(job["real_cReadTime_s"] / cores_used)
    real_write_time.append(job["real_cWriteTime_s"] / cores_used)

    # Simulated: only completed actions that are neither COMPUTE nor SLEEP count as IO
    io_actions = [
        action
        for action in job["actions"]
        if action["act_type"] not in non_io_act_types and action["act_status"] == "COMPLETED"
    ]
    # Total IO time covers every remaining action type, not only reads and writes
    s_io_time = sum(action["act_duration"] for action in io_actions)
    s_r_time = sum(action["act_duration"] for action in io_actions if action["act_type"] == "FILEREAD")
    s_w_time = sum(action["act_duration"] for action in io_actions if action["act_type"] == "FILEWRITE")

    sim_io_time.append(s_io_time)
    sim_read_time.append(s_r_time)
    sim_write_time.append(s_w_time)

    io_time_diff.append(abs(s_io_time - r_io_time))

mean_io_time_difference = np.mean(io_time_diff)

# Pearson's correlation (only the coefficient is used, the second value is ignored)
io_time_corr, _ = pearsonr(sim_io_time, real_io_time)

# Cohen's D
io_time_cohen_d = cohend(sim_io_time, real_io_time)

# The message names the IO time metric (it used to say "wait time", a copy/paste leftover)
print(f"The mean IO time difference between simulated and real values for all jobs is {mean_io_time_difference}s (we want a mean difference as close to 0 as possible)")
print(f"The Pearson's corr is {io_time_corr} (we want a correlation as high as possible)")
print(f"The Cohen d effect size is {io_time_cohen_d} (we want an effect size as low as possible, the use of the simulator should lead to values close to real world traces)")

In [8]:
# One row of three panels: real-vs-simulated scatter, real histogram, simulated histogram
fig, axs = plt.subplots(ncols=3, figsize=(20, 6), tight_layout=True)
scatter_ax, real_ax, sim_ax = axs

# Quick sanity check on the lower bound of each read / write series
print(f"Min value for real_read_time : {min(real_read_time)}")
print(f"Min value for sim_read_time : {min(sim_read_time)}")
print(f"Min value for real_write_time : {min(real_write_time)}")
print(f"Min value for sim_write_time : {min(sim_write_time)}")

# End points of the x=y target line
max_target = max(real_io_time + sim_io_time)
line = {"x": [0, max_target], "y": [0, max_target]}

# Total IO time (dots), read time (+) and write time (x) all share the first panel
scatter = sns.scatterplot(x=real_io_time, y=sim_io_time, s=40, color=".15", alpha=0.5, ax=scatter_ax)
read_scatter = sns.scatterplot(x=real_read_time, y=sim_read_time, s=20, ax=scatter_ax, facecolors="red", marker="+", alpha=0.6)
write_scatter = sns.scatterplot(x=real_write_time, y=sim_write_time, s=20, color=".10", ax=scatter_ax, facecolors="blue", marker="x", alpha=0.3)
target_line = sns.lineplot(data=line, x="x", y="y", color="red", linestyle="--", ax=scatter_ax)
scatter.set(xlabel="Real", ylabel="Simulated")

# Both axes get the same limits, with 20% headroom above the largest value
# (optional log scales are kept here, disabled)
axis_limits = [0.0001, max_target*1.2]
#scatter_ax.set_xscale('log')
scatter_ax.set_xlim(axis_limits)
#scatter_ax.set_yscale('log')
scatter_ax.set_ylim(axis_limits)

# Same bin width for both histograms so that they can be compared side by side
binwidth = 100
real_hist = sns.histplot(data=real_io_time, binwidth=binwidth, color=real_color, ax=real_ax)
real_hist.set(xlabel=f"Real IO time - binwidth = {binwidth}s")

sim_hist = sns.histplot(data=sim_io_time, binwidth=binwidth, color=sim_color, ax=sim_ax)
sim_hist.set(xlabel=f"Simulated IO time - binwidth = {binwidth}s")

In [9]:
## Compute the IO volume differences and stats for all jobs (here we're just checking that simulated values are
## coherent, as the simulation should always read / write the data volume specified in the dataset anyway).

# Volumes are reported in decimal gigabytes
bytes_per_gb = 1_000_000_000

io_volume_diff = []
sim_io_volume_gb = []
real_io_volume_gb = []

for job in results:

    # Real: bytes read plus bytes written, as recorded in the trace
    r_io_volume_gb = job["real_read_bytes"] / bytes_per_gb + job["real_written_bytes"] / bytes_per_gb
    real_io_volume_gb.append(r_io_volume_gb)

    # Simulated: volume moved by completed "FILEREAD" and "CUSTOM" actions.
    # NOTE(review): the IO time cell counts writes under "FILEWRITE", whereas only
    # "FILEREAD" / "CUSTOM" are summed here although the real volume includes written
    # bytes -- confirm that writes are emitted as "CUSTOM" actions by the simulator.
    s_io_volume_gb = sum(
        action["io_size_bytes"] / bytes_per_gb
        for action in job["actions"]
        if action["act_type"] in ("FILEREAD", "CUSTOM") and action["act_status"] == "COMPLETED"
    )
    sim_io_volume_gb.append(s_io_volume_gb)

    io_volume_diff.append(abs(s_io_volume_gb - r_io_volume_gb))

mean_io_volume_difference = np.mean(io_volume_diff)

# Pearson's correlation (only the coefficient is used, the second value is ignored)
io_vol_corr, _ = pearsonr(sim_io_volume_gb, real_io_volume_gb)

# Cohen's D
io_vol_cohen_d = cohend(sim_io_volume_gb, real_io_volume_gb)

# The message names the IO volume metric and its GB unit (it used to say "wait time" in seconds)
print(f"The mean IO volume difference between simulated and real values for all jobs is {mean_io_volume_difference}GB (we want a mean difference as close to 0 as possible)")
print(f"The Pearson's corr is {io_vol_corr} (we want a correlation as high as possible)")
print(f"The Cohen d effect size is {io_vol_cohen_d} (we want an effect size as low as possible, the use of the simulator should lead to values close to real world traces)")

In [10]:
# One row of three panels: real-vs-simulated scatter, real histogram, simulated histogram
fig, axs = plt.subplots(ncols=3, figsize=(20, 6), tight_layout=True)
scatter_ax, real_ax, sim_ax = axs

# End points of the x=y target line
max_target = max(real_io_volume_gb + sim_io_volume_gb)
line = {"x": [0, max_target], "y": [0, max_target]}

scatter = sns.scatterplot(x=real_io_volume_gb, y=sim_io_volume_gb, s=15, color=".15", ax=scatter_ax)
target_line = sns.lineplot(data=line, x="x", y="y", color="red", linestyle="--", linewidth=0.3, ax=scatter_ax)
scatter.set(xlabel="Real", ylabel="Simulated")

# Same bin width for both histograms so that they can be compared side by side
binwidth = 100
real_hist = sns.histplot(data=real_io_volume_gb, binwidth=binwidth, color=real_color, ax=real_ax)
real_hist.set(xlabel=f"Real IO Volume - binwidth = {binwidth}GB")

sim_hist = sns.histplot(data=sim_io_volume_gb, binwidth=binwidth, color=sim_color, ax=sim_ax)
sim_hist.set(xlabel=f"Simulated IO Volume - binwidth = {binwidth}GB")

In [11]:
import pandas as pd

# Timeline of the simulated jobs: one marker at each job start and one horizontal
# line spanning the job from start to end, stacked by job rank.
fig, axs = plt.subplots(ncols=1)
fig.set_tight_layout(tight=True)
fig.set_figheight(6)
fig.set_figwidth(20)

# I manually set the origin date by looking at the start ts of the first job in the dataset
trace_origin = "2022-01-29 23:16:48"

# Convert all timestamps in one call each instead of once per job
job_start_times = pd.to_datetime([job["job_start_ts"] for job in results], unit='s', origin=trace_origin)
job_end_times = pd.to_datetime([job["job_end_ts"] for job in results], unit='s', origin=trace_origin)

# 1-based rank shared by the start marker and the duration line of a job, so that
# both are drawn on the same row (the lines used to be one row below their marker)
job_ranks = range(1, len(results) + 1)

lines = [
    {"x": [start, end], "y": [rank, rank]}
    for rank, start, end in zip(job_ranks, job_start_times, job_end_times)
]

scatter = sns.scatterplot(x=job_start_times, y=job_ranks, s=15, color=".15", ax=axs)
for line in lines:
    sns.lineplot(line, x="x", y="y", color="red", linestyle="-", linewidth=0.5, ax=axs)
scatter.set(xlabel="Date (origin approx)", ylabel="Nb of jobs run")

In [12]:
# Echo the path of the simulated trace file that was loaded at the top of the notebook
simulated_job_trace

In [13]:
# The dataset name is embedded in the trace file name, which is expected to look like
# "<prefix>_<dataset name>__<rest>" (e.g. "simulatedJobs_theta2022_week4__...").
from pathlib import Path

# Only the file name is parsed, so underscores in a directory name cannot interfere
_prefix, _first_sep, _after_prefix = Path(simulated_job_trace).name.partition('_')
ds_name, _second_sep, _rest = _after_prefix.partition('__')

# Fail loudly rather than silently building a wrong dataset name
if not (_first_sep and _second_sep and ds_name):
    raise ValueError(
        f"Cannot extract a dataset name from trace file name {simulated_job_trace!r}"
    )

In [14]:
# Path of the dataset YAML file matching the dataset name extracted from the trace file name
dataset_path = f"../../raw_data_processing/theta/{ds_name}.yaml"
# Echo the resolved path for a quick visual check
dataset_path

In [15]:
# Load the dataset file; `dataset_yaml` is pre-set to None so the name exists even if loading fails.
# NOTE(review): CLoader is not PyYAML's safe loader -- only load trusted dataset files.
dataset_yaml = None
with open(dataset_path, encoding="utf-8") as dataset_file:
    dataset_yaml = load(dataset_file, Loader=CLoader)

In [16]:
import datetime as dt
# Timeline of the jobs as recorded in the dataset: one marker at each job start and
# one horizontal line spanning the job from start to end, stacked by job rank.
fig, axs = plt.subplots(ncols=1)
fig.set_tight_layout(tight=True)
fig.set_figheight(6)
fig.set_figwidth(20)

# (start, end) of every dataset job, each ISO timestamp being parsed exactly once
job_spans = [
    (dt.datetime.fromisoformat(job["startTime"]), dt.datetime.fromisoformat(job["endTime"]))
    for job in dataset_yaml["jobs"]
]
job_start_times = [start for start, _end in job_spans]

# 1-based rank sized on the dataset itself (not on `results`, whose length may differ)
# and shared by the start marker and the duration line so that both sit on the same row
job_ranks = range(1, len(job_spans) + 1)

lines = [
    {"x": [start, end], "y": [rank, rank]}
    for rank, (start, end) in zip(job_ranks, job_spans)
]

scatter = sns.scatterplot(x=job_start_times, y=job_ranks, s=15, color=".15", ax=axs)
for line in lines:
    sns.lineplot(line, x="x", y="y", color="red", linestyle="-", linewidth=0.5, ax=axs)
scatter.set(xlabel="Date (origin approx)", ylabel="Nb of jobs run")