In [None]:
import pandas as pd
from rich.progress import Progress
import re
import networkx as nx
import matplotlib.pyplot as plt
from networkx.drawing.nx_agraph import graphviz_layout
from pathlib import Path
from collections import defaultdict
from collections import deque
import json
from plotly.offline import init_notebook_mode; init_notebook_mode()
from IPython.display import display
import seaborn as sns
import matplotlib.font_manager as font_manager

In [None]:
!mkdir -p data && cd data && wget -c --retry-connrefused --tries=0 --timeout=50 http://aliopentrace.oss-cn-beijing.aliyuncs.com/v2018Traces/batch_task.tar.gz

In [None]:
!cd data && tar -xvzf batch_task.tar.gz

In [None]:
# Column names taken from
# https://github.com/alibaba/clusterdata/blob/master/cluster-trace-v2018/fetchData.sh
# (the CSV is read with explicit `names`, i.e. it is treated as header-less).
df = pd.read_csv(
    'data/batch_task.csv',
    names=[
        'task_name',
        'instance_num',
        'job_name',
        'task_type',
        'status',
        'start_time',
        'end_time',
        'plan_cpu',
        'plan_mem',
    ],
)
# Elapsed time per task row, in the same unit as start_time/end_time
# (presumably seconds -- TODO confirm against the trace documentation).
df['duration'] = df['end_time'].sub(df['start_time'])

In [None]:
# df = df.head(n=1000)

In [None]:
# A task name takes one of three forms:
# a) a name that encodes dependencies (like 'J4_2_3' -> task 4 depends on 2 and 3).
#    Raw strings are used so that `\d` reaches the regex engine untouched instead of
#    being treated as an (invalid) string escape sequence.
#    NOTE(review): the character class `[\d+_]` also accepts a literal '+'; this looks
#    unintended but is kept so that exactly the same names match as before.
TASK_NAME_RE = re.compile(r'^[^_]*[A-Z](?P<task_id>\d+)(_(?P<deps>[\d+_]+))?(_Stg\d+)?$') # Note: sometimes job ends with _Stg*
# b) an independent task (like 'task_LTE4NjUxMjg5NDY5MDI4NjAzNzU=')
SINGLE_TASK_RE = re.compile(r'^task_[a-zA-Z0-9]+=*$')
# c) the literal name 'MergeTask'

# check that we cover all cases:
assert df.task_name.apply(lambda f: TASK_NAME_RE.match(f) is not None or SINGLE_TASK_RE.match(f) is not None or f == 'MergeTask').all()

In [None]:
# Check that 'MergeTask's are in fact independent tasks (i.e. the only task in a job)
def get_merge_task_stats(df):
    """Count, per job, all tasks and the tasks literally named 'MergeTask'.

    Args:
        df: frame with at least the columns 'job_name' and 'task_name'.
            It is not modified.

    Returns:
        DataFrame indexed by 'job_name' with two columns: 'count' (number of
        task rows in the job) and 'mergeCount' (how many of them are
        'MergeTask').
    """
    tasks = df.loc[:, ['job_name', 'task_name']].assign(
        is_merge_task=lambda frame: frame['task_name'].eq('MergeTask')
    )
    per_job = tasks.groupby('job_name')
    return per_job.agg(
        count=('task_name', 'count'),
        mergeCount=('is_merge_task', 'sum'),
    )

assert len(get_merge_task_stats(df).query('mergeCount > 0 and count > 1')) == 0

In [None]:
# Extract dependency info
def get_task_index_and_deps(row):
    """Split a task name into its own index and the indices it depends on.

    Args:
        row: object exposing a `task_name` attribute (a DataFrame row here).

    Returns:
        (task_index, deps): for names matching TASK_NAME_RE, the integer task
        id and the list of integer dependency ids (empty when the name carries
        none); for every other name, `(1, [])`.
    """
    match = TASK_NAME_RE.match(row.task_name)
    if match is None:
        # Names without the dependency encoding are treated as a lone task #1.
        return 1, []

    raw_deps = match.group('deps')
    if raw_deps:
        # Empty fragments (from doubled or trailing '_') are dropped.
        deps = [int(fragment) for fragment in raw_deps.split('_') if fragment != '']
    else:
        deps = []
    return int(match.group('task_id')), deps

df[['task_index', 'task_deps']] = df[['task_name']].apply(get_task_index_and_deps, result_type='expand', axis=1)

In [None]:
df[['job_name', 'task_index', 'task_deps', 'duration', 'instance_num']].head()

In [None]:
# Sample job
df[['job_name', 'task_index', 'task_deps', 'duration', 'instance_num']].query('job_name == "j_3"')

In [None]:
# Filter jobs with 10 or more tasks
jobs = df.groupby("job_name").filter(lambda x: len(x) >= 10)

In [None]:
len(jobs)

In [None]:
# Filter jobs that are uninteresting, such as simple lines or parallel dag covered by other experiments.
# A job is kept when at least one of its tasks depends on more than one other task.

grouped = jobs.groupby("job_name")
filtered = []

with Progress() as progress:
    task = progress.add_task("[red]Removing simple DAGs", total=len(grouped))
    for group_name, df_group in grouped:
        progress.update(task, advance=1)

        # any() stops at the first task with several dependencies, and iterating the
        # column directly avoids building a Series per row as iterrows() does.
        if any(len(deps) > 1 for deps in df_group["task_deps"]):
            filtered.append(df_group)

# pd.concat raises on an empty list; fall back to an empty frame with the same columns.
jobs = pd.concat(filtered) if filtered else jobs.iloc[0:0]

In [None]:
len(jobs)

In [None]:
job_names = jobs["job_name"]

In [None]:
# Get a sample
# NOTE(review): job_names is jobs["job_name"], which has one entry per task row, so
# this draws 30 task rows rather than 30 distinct jobs: jobs with more tasks are more
# likely to be picked and the same job name can be drawn more than once. Confirm this
# is intended before changing it -- job ids are hard-coded further down the notebook.
# random_state is fixed so that the same rows are drawn on every run.
sample_job_names = pd.DataFrame({"job_name": job_names.sample(n=30, random_state=1337)})
# Keep every task row that belongs to one of the sampled job names.
sample_jobs = jobs[jobs["job_name"].isin(sample_job_names["job_name"])]

In [None]:
# Persist the sampled frames so later runs can skip the expensive processing above.
# The pickles are only (re)written when at least one of the two files is missing;
# afterwards both frames are always reloaded from disk, so an existing cache wins
# over whatever was computed in this session.
jobs_cache_location = "data/jobs_cache"
jobs_names_cache_location = "data/jobs_names_cache"

cache_complete = all(
    Path(location).is_file()
    for location in (jobs_cache_location, jobs_names_cache_location)
)
if not cache_complete:
    sample_jobs.to_pickle(jobs_cache_location)
    sample_job_names.to_pickle(jobs_names_cache_location)

# NOTE(review): read_pickle executes arbitrary code from the file; only load caches
# that were produced locally.
sample_jobs = pd.read_pickle(jobs_cache_location)
sample_job_names = pd.read_pickle(jobs_names_cache_location)

In [None]:
sample_jobs["job_name"].value_counts()

In [None]:
sample_jobs.query('job_name == "j_3302772"')

In [None]:
# Ensure each task takes no more than MAX_TIME (the cap is applied per task row,
# not per job).

MAX_TIME = 60

# Remember the untouched durations exactly once: without this guard, re-running the
# cell would overwrite 'original_duration' with the already-capped values.
if 'original_duration' not in sample_jobs.columns:
    sample_jobs['original_duration'] = sample_jobs['duration']

# Vectorised cap; values at or below MAX_TIME (and missing values) are left as they are.
sample_jobs['duration'] = sample_jobs['duration'].clip(upper=MAX_TIME)
    


In [None]:
sample_jobs.query('job_name == "j_3302772"')

In [None]:
# Visualize the DAGs
%matplotlib inline


def build_graph(dag):
    """Build a directed dependency graph for one job.

    Args:
        dag: DataFrame with the columns 'task_name', 'task_index', 'task_deps'
            (iterable of task indices) and 'duration'.

    Returns:
        nx.DiGraph with one node per task name (node attribute 'duration') and
        an edge `dependency -> task` for every dependency index that matches a
        task of the same job. Dependency indices that match no task are
        ignored; an index shared by several tasks yields an edge from each.
    """
    graph = nx.DiGraph()

    # Index the job once so dependency resolution does not rescan the whole
    # frame for every (task, dependency) pair.
    names_by_index = defaultdict(list)
    for task_index, task_name in zip(dag["task_index"], dag["task_name"]):
        names_by_index[task_index].append(task_name)

    dependencies = []
    for task_name, duration, task_deps in zip(dag["task_name"], dag["duration"], dag["task_deps"]):
        graph.add_node(task_name, duration=duration)
        # set() drops indices that are listed more than once.
        for dependency_index in set(task_deps):
            # .get() keeps the defaultdict from growing on unknown indices.
            for dependency_name in names_by_index.get(dependency_index, ()):
                dependencies.append((dependency_name, task_name))

    graph.add_edges_from(dependencies)
    return graph


def max_parallel(graph):
    """Return the maximum number of tasks running at the same time.

    The graph is simulated with unlimited workers: a task starts as soon as
    all of its predecessors have ended and runs for its 'duration' node
    attribute.

    Events that share a timestamp are processed "ends before starts", so a
    task that ends at time t is never counted as concurrent with a task that
    starts at t. (Ordering ties by node name, as a plain
    `(time, node, action)` tuple would, makes the result depend on how the
    nodes happen to be named.)

    Args:
        graph: directed acyclic graph offering `edges()`, `edges(node)`,
            `nodes()` and `nodes[node]["duration"]`.

    Returns:
        The peak number of simultaneously running tasks (0 for an empty graph).
    """
    import heapq
    from itertools import count

    END, START = 0, 1  # END sorts first at equal timestamps

    indegree = defaultdict(int)
    for _, target in graph.edges():
        indegree[target] += 1

    # The running sequence number keeps heap ordering well-defined without
    # ever having to compare node objects with each other.
    sequence = count()
    events = []
    for node in graph.nodes():
        if indegree[node] == 0:
            heapq.heappush(events, (0, START, next(sequence), node))

    max_concurrency = 0
    concurrency = 0

    while events:
        current_time, kind, _, node = heapq.heappop(events)

        if kind == START:
            concurrency += 1
            max_concurrency = max(max_concurrency, concurrency)
            finish_time = current_time + graph.nodes[node]["duration"]
            heapq.heappush(events, (finish_time, END, next(sequence), node))
            continue

        concurrency -= 1
        # A finished task may unblock its successors at this very moment.
        for _, next_node in graph.edges(node):
            indegree[next_node] -= 1
            if indegree[next_node] == 0:
                heapq.heappush(events, (current_time, START, next(sequence), next_node))

    return max_concurrency


def critical_path(graph):
    """Find the longest path through the graph measured in task duration.

    Nodes are visited in topological order (Kahn's algorithm); for every node
    the best `(duration, node count)` of any path ending there is tracked.

    A node's entry is created the first time it is reached, so paths running
    through zero-duration tasks are still recorded and counted. (Comparing
    against a default distance of 0 would skip them and under-report the
    number of nodes on the path.)

    Args:
        graph: directed acyclic graph offering `edges()`, `edges(node)`,
            `nodes()` and `nodes[node]["duration"]`.

    Returns:
        (path_duration, nodes_on_path) of the longest path; `(0, 0)` for a
        graph without nodes. When several paths share the maximum duration the
        one reached first is reported.
    """
    # node -> (duration of the longest path ending here, number of nodes on it)
    distance = {}
    indegree = defaultdict(int)
    queue = deque()

    for _, target in graph.edges():
        indegree[target] += 1

    for node in graph.nodes():
        if indegree[node] == 0:
            queue.append(node)
            distance[node] = (graph.nodes[node]["duration"], 1)

    while queue:
        top = queue.popleft()
        top_duration, top_nodes = distance[top]

        for source, target in graph.edges(top):
            assert source == top

            indegree[target] -= 1
            candidate = top_duration + graph.nodes[target]["duration"]

            # First visit always records a path; later visits only improve it.
            if target not in distance or candidate > distance[target][0]:
                distance[target] = (candidate, top_nodes + 1)

            if indegree[target] == 0:
                queue.append(target)

    return max(distance.values(), key=lambda entry: entry[0], default=(0, 0))


def longest_path_in_nodes(graph):
    """Return the number of nodes on the longest path of the graph.

    Nodes are processed in topological order; every node's depth is one more
    than the largest depth among its already-processed predecessors, with
    source nodes at depth 1.

    Args:
        graph: directed acyclic graph offering `edges()`, `edges(node)` and
            `nodes()`.

    Returns:
        The largest depth found, or 0 when the graph has no nodes.
    """
    depth = defaultdict(int)
    remaining_inputs = defaultdict(int)

    for _, target in graph.edges():
        remaining_inputs[target] += 1

    ready = deque()
    for node in graph.nodes():
        if remaining_inputs[node] == 0:
            depth[node] = 1
            ready.append(node)

    while ready:
        current = ready.popleft()
        for source, target in graph.edges(current):
            assert source == current

            remaining_inputs[target] -= 1
            depth[target] = max(depth[target], depth[current] + 1)

            # All predecessors handled: the node's depth is final.
            if remaining_inputs[target] == 0:
                ready.append(target)

    return max(depth.values(), default=0)


def total_work(graph):
    """Return the sum of the 'duration' attribute over all nodes of the graph."""
    return sum(graph.nodes[node]["duration"] for node in graph.nodes())


def adjusted_tasks(dag):
    """Count the task rows whose 'duration' differs from 'original_duration'.

    Args:
        dag: DataFrame with the columns 'duration' and 'original_duration'.

    Returns:
        Number of rows where the two columns are not equal.
    """
    changed = dag["duration"] != dag["original_duration"]
    return int(changed.sum())


def _draw_graph(graph, name, *, node_size, width, font_size, figsize=None, label_font_size=None):
    """Lay out `graph` with graphviz 'dot', draw it and save it as a PDF.

    Shared implementation of draw / draw_highly_parallel / draw_just_shape.

    Args:
        graph: networkx graph whose nodes carry a 'duration' attribute.
        name: matplotlib figure identifier, also used as the output file stem.
        node_size, width, font_size: forwarded to `nx.draw`.
        figsize: optional figure size; None keeps matplotlib's default.
        label_font_size: when given, every node is labelled with its duration
            in this font size; None draws the bare shape.
    """
    plt.figure(name, figsize=figsize)
    pos = graphviz_layout(graph, prog='dot')
    nx.draw(
        graph,
        pos,
        with_labels=False,
        node_size=node_size,
        node_color="#000000",
        width=width,
        font_size=font_size,
    )
    if label_font_size is not None:
        labels = {node: graph.nodes[node]["duration"] for node in graph.nodes()}
        nx.draw_networkx_labels(graph, pos, labels, font_size=label_font_size, font_color="whitesmoke")
    # savefig does not create missing directories.
    Path("charts").mkdir(parents=True, exist_ok=True)
    plt.savefig(f"charts/{name}.pdf", format="pdf")


def draw(graph, name):
    """Draw the DAG with each node labelled by its duration; saved to charts/<name>.pdf."""
    _draw_graph(graph, name, node_size=200, width=1.4, font_size=14, label_font_size=9)


def draw_highly_parallel(graph, name):
    """Like `draw`, but on a wide 10x3 figure with smaller nodes and labels for very wide DAGs."""
    _draw_graph(graph, name, node_size=60, width=0.8, font_size=3, figsize=(10, 3), label_font_size=6)


def draw_just_shape(graph, name):
    """Draw only the shape of the DAG (no node labels); saved to charts/<name>.pdf."""
    _draw_graph(graph, name, node_size=250, width=1.8, font_size=14)


# Collect one record per sampled job and build the frame once at the end; growing a
# DataFrame with pd.concat inside the loop copies it on every iteration.
job_info_records = []

for job_name in sample_job_names["job_name"]:
    job_data = sample_jobs[sample_jobs["job_name"] == job_name].copy()
    graph = build_graph(job_data)

    path_duration, path_nodes = critical_path(graph)

    job_info_records.append({
        'job_name': job_name,
        'critical_path': path_duration,
        'critical_path_in_nodes': path_nodes,
        'longest_path_in_nodes_number': longest_path_in_nodes(graph),
        'tasks': len(job_data),
        'total_work': total_work(graph),
        'adjusted_tasks': adjusted_tasks(job_data),
        'parallelisation_degree': max_parallel(graph),
    })

    # This job is drawn with the wide layout only; it gets no "_raw" shape chart.
    if job_name == "j_3302772":
        draw_highly_parallel(graph, job_name)
        continue
    draw(graph, job_name)
    draw_just_shape(graph, f"{job_name}_raw")

# Explicit columns keep the frame well-formed even when no job was processed.
job_info = pd.DataFrame(
    job_info_records,
    columns=[
        'job_name',
        'critical_path',
        'critical_path_in_nodes',
        'longest_path_in_nodes_number',
        'tasks',
        'total_work',
        'adjusted_tasks',
        'parallelisation_degree',
    ],
)
    
    
    


In [None]:
job_info = job_info.sort_values('tasks', ascending=False)
job_info

In [None]:
# Export to Airflow's format

from pathlib import Path
import shutil


MAX_DURATION = 60
INDENT = "    "
DIRECTORY = "data/generated_dags"

def task_template(task_data, job_data) -> (str, [str]):
    """Render one task as Airflow source code.

    Args:
        task_data: row with 'task_name', 'duration' and 'task_deps'.
        job_data: DataFrame with all task rows of the same job; used to
            resolve dependency indices ('task_index') to task names.

    Returns:
        (operator_source, dependency_lines): the BashOperator definition that
        sleeps for the task duration (capped at MAX_DURATION), and one
        `task_<upstream> >> task_<this>` line per distinct dependency.

    Raises:
        AssertionError: if a dependency index matches no task of the job.
    """
    base_task = f"""
task_{task_data['task_name']} = BashOperator(
    task_id='{task_data['task_name']}',
    bash_command='sleep {min(task_data['duration'], MAX_DURATION)}',
)
    """.strip()

    dependencies = []
    for dependency_index in set(task_data["task_deps"]):
        # First task of the job carrying the requested index, if any.
        dependency = next(
            (task for _, task in job_data.iterrows() if task["task_index"] == dependency_index),
            None,
        )
        assert dependency is not None

        dependencies.append(f"""
task_{dependency['task_name']} >> task_{task_data['task_name']}
""".strip())

    return base_task, dependencies

# Write one Airflow DAG file per sampled job, both as <DIRECTORY>/<job>.py and as a
# copy inside its own <DIRECTORY>/<job>/ folder.
for job_name in sample_job_names["job_name"]:
    job_data = sample_jobs[sample_jobs["job_name"] == job_name].copy()
    info = job_info[job_info["job_name"] == job_name].copy().iloc[0]
    # Jobs with a longer critical path are scheduled less often (every 10 instead of 5 minutes).
    if info["critical_path"] < 201:
        schedule = 5
    else:
        schedule = 10

    imports = f"""
import pendulum
from airflow import DAG
from airflow.operators.bash import BashOperator


with DAG(
    dag_id='{job_name}',
    schedule_interval='*/{schedule} * * * *',
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    catchup=False,
) as dag:
""".strip()

    templated_tasks = []
    templated_dependencies = []
    for _, task_data in job_data.iterrows():
        task_templated, dependencies_templated = task_template(task_data, job_data)
        templated_tasks.append(task_templated)
        templated_dependencies.extend(dependencies_templated)

    # parents=True also creates DIRECTORY itself when it is missing.
    Path(f'{DIRECTORY}/{job_name}').mkdir(parents=True, exist_ok=True)

    # Everything after the `with DAG(...)` header is indented by one level,
    # line by line (an empty block still yields a single indented line).
    tasks = "\n".join(
        f"{INDENT}{line}" for line in "\n".join(templated_tasks).split("\n")
    )
    deps = "\n".join(
        f"{INDENT}{line}" for line in "\n".join(templated_dependencies).split("\n")
    )

    with open(f'{DIRECTORY}/{job_name}.py', 'w+') as f:
        f.write("\n".join([imports, tasks, deps]))
        f.write("\n")

    shutil.copyfile(f'{DIRECTORY}/{job_name}.py', f'{DIRECTORY}/{job_name}/{job_name}.py')
        
    

In [None]:
# Get beeflow experiments config: one experiment per job, largest job (by task count) first.

experiments = []

jobs_by_size = job_info.sort_values('tasks', ascending=False)["job_name"]
for job_name in jobs_by_size:
    experiment = dict(
        dags_local_path=f'notebooks/{DIRECTORY}/{job_name}',
        dag_ids=[job_name],
        metrics_collection_time_seconds=3600,
        experiment_id=job_name,
    )
    experiments.append(experiment)

print(json.dumps(experiments, indent=2))

## Metrics visualisation

In [None]:
# visualising dag experimentation

BUCKET_NAME="beeflow-dev-metadata-dumps"
CHARTS_DIRECTORY = "charts/" 

import pandas as pd
import boto3
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

s3 = boto3.client('s3') 

s3_cache = {}

def get_s3_csv(file: str):
    """Load a CSV object from the BUCKET_NAME bucket, memoised per key.

    Args:
        file: S3 object key.

    Returns:
        The parsed DataFrame. Repeated calls with the same key return the very
        same object from `s3_cache`, so callers share it.
    """
    cached = s3_cache.get(file)
    if file in s3_cache:
        return cached

    response = s3.get_object(Bucket=BUCKET_NAME, Key=file)
    frame = pd.read_csv(response['Body'])
    s3_cache[file] = frame
    return frame


from enum import Enum

class System(Enum):
    """Systems whose exported metrics are compared.

    Each value is used by get_stats as the leading segment of the S3 object key.
    """
    MWAA = "mwaa"
    # The member is called BEEFLOW but its exports live under the "serverless" prefix.
    BEEFLOW = "serverless"
    
class Stats(Enum):
    """Kinds of exported metrics tables.

    Each value is used by get_stats both as a folder name and as the CSV file
    stem inside the S3 object key.
    """
    DAG_RUNS = "dagrun"
    TASKS = "taskinstance"
    
def get_stats(system: System, date: str, stats: Stats):
    """Fetch one exported metrics table as a DataFrame.

    Args:
        system: which system's export to read.
        date: export identifier, placed in the key as `dt=<date>`.
        stats: which table to read.

    Returns:
        Whatever get_s3_csv returns for the composed object key.
    """
    object_key = f"{system.value}/export/{stats.value}/dt={date}/{stats.value}.csv"
    return get_s3_csv(object_key)

def filter_on_dag_ids(df, dag_ids):
    """Keep only the rows whose 'dag_id' is one of `dag_ids`."""
    wanted = df['dag_id'].isin(dag_ids)
    return df[wanted]

def filter_on_not_task_ids(df, task_ids):
    """Drop the rows whose 'task_id' is one of `task_ids`."""
    excluded = df['task_id'].isin(task_ids)
    return df[~excluded]

def filter_on_not_dag_run_id(df, dag_run_ids):
    """Drop the rows whose 'run_id' is one of `dag_run_ids`."""
    excluded = df['run_id'].isin(dag_run_ids)
    return df[~excluded]

def filter_not_on_running(df):
    """Drop the rows that are still in the "running" state.

    The state is read from the '_state' column when the frame has one and from
    'state' otherwise. The column is chosen explicitly rather than with a bare
    `except:`, which would also swallow unrelated errors (including
    KeyboardInterrupt).

    Args:
        df: DataFrame with a '_state' or a 'state' column.

    Returns:
        The rows whose state is not "running".

    Raises:
        KeyError: if the frame has neither column.
    """
    state_column = '_state' if '_state' in df.columns else 'state'
    return df[~df[state_column].isin(["running"])]

def enhance_with_duration(df):
    """Add a 'duration' column (seconds between 'start_date' and 'end_date').

    The frame is modified in place; nothing is returned.
    """
    started = pd.to_datetime(df['start_date'])
    ended = pd.to_datetime(df['end_date'])
    elapsed = ended - started
    df['duration'] = elapsed.dt.total_seconds()
    
def enhance_with_wait_time_dag_runs(df):
    """Add a 'wait_time' column (seconds between 'queued_at' and 'start_date').

    The frame is modified in place; nothing is returned.
    """
    queued = pd.to_datetime(df['queued_at'])
    started = pd.to_datetime(df['start_date'])
    waited = started - queued
    df['wait_time'] = waited.dt.total_seconds()
    
def enhance_with_wait_time_tasks(df, G: nx.DiGraph, dag_runs_df):
    """Add a 'wait_time' column to a task-instance frame, in place.

    For every DAG run, a task's wait time is the number of seconds between the
    moment it could have started and its actual 'start_date'. The earliest
    possible start is the latest 'end_date' among its upstream tasks in `G`,
    but never earlier than the run's 'queued_at'.

    Args:
        df: task instances with 'run_id', 'task_id', 'start_date', 'end_date'.
        G: dependency graph whose nodes are task ids.
        dag_runs_df: DAG runs with 'run_id' and 'queued_at'.

    Raises:
        AssertionError: if a run of `df` does not appear exactly once in
            `dag_runs_df`, or a graph node does not appear exactly once in a run.
        KeyError: if `df` holds a task id that is not a node of `G`.
    """
    wait_time = defaultdict(dict)

    for run_id, run_tasks in df.groupby("run_id"):
        matching_runs = dag_runs_df[dag_runs_df["run_id"] == run_id]
        assert len(matching_runs) == 1, f"{run_id} missing"
        queued_at = pd.to_datetime(matching_runs["queued_at"]).iloc[0]

        def only_row(task_id):
            # The single task-instance row of `task_id` within this run.
            rows = run_tasks[run_tasks["task_id"] == task_id]
            assert len(rows) == 1, f"len {len(rows)}, node {task_id}"
            return rows

        for node in G.nodes():
            started_at = pd.to_datetime(only_row(node)["start_date"]).iloc[0]

            ready_at = queued_at
            for upstream, _ in G.in_edges(node):
                finished_at = pd.to_datetime(only_row(upstream)["end_date"]).iloc[0]
                ready_at = max(ready_at, finished_at)

            wait_time[run_id][node] = (started_at - ready_at).total_seconds()

    # Align the computed values with the row order of df.
    wait_time_column = [
        wait_time[run_id][task_id]
        for run_id, task_id in zip(df["run_id"], df["task_id"])
    ]

    assert len(wait_time_column) == len(df)
    df['wait_time'] = wait_time_column
    
from dataclasses import dataclass
from typing import List

@dataclass
class BoxPlotEntry:
    """One data series to be shown as a box in a box plot."""
    # Rows to plot; the plotting helpers read the measured column and 'run_id' from it.
    data: pd.DataFrame
    # Series label used by box_plot_for (x position, trace name, colour choice).
    identifier: str
    # Display name of the system; box_plot_for_thesis uses it as the series label.
    system: str = ""
    
def identifier_to_color(identifier):
    """Return "purple" for identifiers starting with "mwaa", "blue" for all others."""
    return "purple" if identifier.startswith("mwaa") else "blue"
    
def box_plot_for(entries: List[BoxPlotEntry], fig, col, measured_column="duration", hide_x_axis=False, **kwargs):
    """Add one box trace per entry identifier to column `col` of `fig`.

    Args:
        entries: series to plot; each `data` frame must contain
            `measured_column` and 'run_id'.
        fig: plotly figure with subplots; traces are added to row 1.
        col: 1-based subplot column.
        measured_column: column plotted on the y axis.
        hide_x_axis: accepted for call compatibility; not used here.
        **kwargs: accepted for call compatibility; not used here.
    """
    # assign() returns a new frame. Indexing with `.loc[:, ]` and then setting a
    # column can, depending on the pandas version, operate on the caller's own
    # frame and silently add an 'identifier' column to it.
    labelled = [entry.data.assign(identifier=entry.identifier) for entry in entries]
    transformed_data = pd.concat(labelled)

    for name, group in transformed_data.groupby("identifier"):
        fig.add_trace(
            go.Box(
                x=group["identifier"],
                y=group[measured_column],
                boxpoints="all",
                name=name,
                line=dict(color=identifier_to_color(name)),
                customdata=group['run_id'],
                hovertemplate = "ID:%{x}: <br>VAL: %{y} </br> RUN:%{customdata}",
            ),
            row=1,
            col=col,
        )

@dataclass
class BoxPlotsEntry:
    """Identifies one metrics export to include in a comparison plot."""
    # System that produced the export.
    system: System
    # Export identifier passed to get_stats as `date`.
    export_date: str
    # DAG ids whose rows are kept from the export.
    dag_ids: List[str]
    
def system_to_title(system):
    """Return the display name of a system: "MWAA" for System.MWAA, "sAirflow" otherwise."""
    return "MWAA" if system == System.MWAA else "sAirflow"
    
def three_box_plots_for(entries: List[BoxPlotsEntry], not_task_ids, graph_shape, not_dag_run_ids=[], hide_x_axis=False, experiment_id="", postprocess=None, **kwargs):
    """Show DAG-run duration, task duration and task wait time side by side.

    Builds a 1x3 subplot figure (one box per system in each panel) and
    displays it.

    Args:
        entries: exports to compare.
        not_task_ids: task ids to leave out of the task panels.
        graph_shape: dependency graph of the DAG, used for task wait times.
        not_dag_run_ids: run ids to leave out everywhere.
        hide_x_axis: forwarded to box_plot_for.
        experiment_id: suffix of the series identifiers.
        postprocess: accepted for call compatibility; not used here.
        **kwargs: forwarded to box_plot_for.
    """
    def load(entry, stats):
        # Rows of the requested export table that belong to the entry's DAGs.
        exported = get_stats(system=entry.system, date=entry.export_date, stats=stats)
        return filter_on_dag_ids(df=exported, dag_ids=entry.dag_ids)

    def as_series(entry, frame):
        return BoxPlotEntry(
            data=frame,
            identifier=f"{entry.system.value}_{experiment_id}",
            system=system_to_title(entry.system),
        )

    fig = make_subplots(rows=1, cols=3, figure=go.Figure())

    # Panel 1: finished DAG runs.
    dags_data = []
    for entry in entries:
        kept_runs = filter_on_not_dag_run_id(df=load(entry, Stats.DAG_RUNS), dag_run_ids=not_dag_run_ids)
        finished_runs = filter_not_on_running(kept_runs)
        enhance_with_duration(finished_runs)
        enhance_with_wait_time_dag_runs(finished_runs)
        dags_data.append(as_series(entry, finished_runs))

    box_plot_for(
        dags_data,
        fig,
        1,
        measured_column="duration",
        title="DAG runs comparison: duration",
        hover_data=["run_id"],
        hide_x_axis=hide_x_axis,
        **kwargs,
    )

    # Panels 2 and 3: finished task instances.
    tasks_data = []
    for entry in entries:
        kept_tasks = filter_on_not_task_ids(df=load(entry, Stats.TASKS), task_ids=not_task_ids)
        kept_tasks = filter_on_not_dag_run_id(df=kept_tasks, dag_run_ids=not_dag_run_ids)
        finished_tasks = filter_not_on_running(kept_tasks)

        # The runs are looked up for their 'queued_at'; running ones are kept here.
        runs_of_entry = filter_on_not_dag_run_id(df=load(entry, Stats.DAG_RUNS), dag_run_ids=not_dag_run_ids)

        enhance_with_duration(finished_tasks)
        enhance_with_wait_time_tasks(finished_tasks, graph_shape, runs_of_entry)
        tasks_data.append(as_series(entry, finished_tasks))

    box_plot_for(
        tasks_data,
        fig,
        2,
        measured_column="duration",
        title=f"Tasks runs comparison: task duration",
        hide_x_axis=hide_x_axis,
        hover_data=["task_id", "run_id"],
        **kwargs,
    )
    box_plot_for(
        tasks_data,
        fig,
        3,
        measured_column="wait_time",
        title="Tasks run comparision: wait time (amount of time passed from task being marked as queued to started)",
        hide_x_axis=hide_x_axis,
        hover_data=["task_id", "run_id"],
        **kwargs,
    )

    fig.update_layout(height=600, showlegend=False)
    fig.show()
    
# Shared keyword arguments for the arrow annotations added to the thesis figures:
# the `postprocess` callback spreads this dict into `fig.add_annotation(...)` and
# supplies only the position (x, y) and the text itself.
arrow_standard_args = dict(
            xref="x",
            yref="y",
            showarrow=True,
            font=dict(
                family='Latin Modern Math',
                color='black',
                size=32,
                ),
            align="center",
            arrowhead=2,
            arrowsize=1,
            arrowwidth=2,
            arrowcolor="black",
            ax=100,
            ay=0,
            borderwidth=0,
            borderpad=4,
            opacity=0.8
)

In [None]:
def box_plot_for_thesis(entries: List[BoxPlotEntry], measured_column="duration", hide_x_axis=False, experiment_id="", postprocess=None, **kwargs):
    """Render a publication-style box plot, show it and save it as a PDF.

    Args:
        entries: series to plot, one box per `entry.system`.
        measured_column: column plotted on the y axis.
        hide_x_axis: hide the x axis and its tick labels.
        experiment_id: suffix of the output file name.
        postprocess: optional callback `postprocess(title, fig)` invoked just
            before the figure is shown.
        **kwargs: forwarded to `px.box`; must contain "title", which is also
            used for the output file name.

    Raises:
        KeyError: if "title" is missing from `kwargs`.
    """
    # assign() returns a new frame. Indexing with `.loc[:, ]` and then setting a
    # column can, depending on the pandas version, operate on the caller's own
    # frame and silently add an 'identifier' column to it.
    labelled = [entry.data.assign(identifier=entry.system) for entry in entries]
    transformed_data = pd.concat(labelled)

    fig = px.box(transformed_data, x="identifier", y=measured_column, color="identifier", points="all", **kwargs)
    if hide_x_axis:
        fig.update_xaxes(visible=False, showticklabels=False)
    fig.update_layout(showlegend=False)
    fig.update_layout(title_text=kwargs["title"], title_x=0.5)
    fig.update_layout(xaxis_title=None,
                      paper_bgcolor='rgb(255, 255, 255)',
                      plot_bgcolor='rgb(255, 255, 255)',
                      font=dict(
                          family='Latin Modern Math',
                          color='black',
                          size=32,
                      ),)
    fig.update_yaxes(showgrid=False, gridwidth=1, gridcolor='LightGrey')
    fig.update_xaxes(showline=True, linewidth=2, linecolor='LightGrey', mirror=True)
    fig.update_yaxes(showline=True, linewidth=2, linecolor='LightGrey', mirror=True)
    fig.update_yaxes(rangemode="nonnegative")
    if postprocess is not None:
        postprocess(kwargs["title"], fig)
    fig.show()
    # Make sure the output directory exists before exporting the figure.
    Path(CHARTS_DIRECTORY).mkdir(parents=True, exist_ok=True)
    fig.write_image(f'{CHARTS_DIRECTORY}{kwargs["title"]}-{experiment_id}.pdf')
    
def box_plots_for_thesis(entries: List[BoxPlotsEntry], 
                  not_task_ids, 
                  graph_shape,
                  not_dag_run_ids=[], 
                  hide_x_axis=False, 
                  experiment_id="", 
                  postprocess=None,
                  **kwargs):
    """Produce the two thesis figures: "DAG makespan" and "Task wait time".

    Args:
        entries: exports to compare.
        not_task_ids: task ids left out of the wait-time figure (they still
            take part in the wait-time computation itself).
        graph_shape: dependency graph of the DAG, used for task wait times.
        not_dag_run_ids: run ids to leave out everywhere.
        hide_x_axis, experiment_id, postprocess, **kwargs: forwarded to
            box_plot_for_thesis.
    """
    def runs_of(entry):
        # DAG runs of the entry's DAGs, minus the excluded run ids.
        exported = get_stats(system=entry.system, date=entry.export_date, stats=Stats.DAG_RUNS)
        own_runs = filter_on_dag_ids(df=exported, dag_ids=entry.dag_ids)
        return filter_on_not_dag_run_id(df=own_runs, dag_run_ids=not_dag_run_ids)

    def as_series(entry, frame):
        return BoxPlotEntry(
            data=frame,
            identifier=f"{entry.system.value}_{entry.export_date}",
            system=system_to_title(entry.system),
        )

    # Figure 1: duration of whole DAG runs.
    dags_data = []
    for entry in entries:
        entry_dag_runs = runs_of(entry)
        enhance_with_duration(entry_dag_runs)
        enhance_with_wait_time_dag_runs(entry_dag_runs)
        dags_data.append(as_series(entry, entry_dag_runs))

    box_plot_for_thesis(
        dags_data,
        measured_column="duration",
        title="DAG makespan",
        hover_data=["run_id"],
        labels={
            "identifier": "System",
            "duration": "Duration [s]",
        },
        experiment_id=experiment_id,
        hide_x_axis=hide_x_axis,
        postprocess=postprocess,
        **kwargs,
    )

    # Figure 2: wait time of individual task instances.
    tasks_data = []
    for entry in entries:
        exported_tasks = get_stats(system=entry.system, date=entry.export_date, stats=Stats.TASKS)
        entry_tasks = filter_on_dag_ids(df=exported_tasks, dag_ids=entry.dag_ids)
        # No task is excluded yet: the wait-time computation needs every graph node.
        entry_tasks = filter_on_not_task_ids(df=entry_tasks, task_ids=[])
        entry_tasks = filter_on_not_dag_run_id(df=entry_tasks, dag_run_ids=not_dag_run_ids)

        entry_dag_runs = runs_of(entry)

        enhance_with_duration(entry_tasks)
        enhance_with_wait_time_tasks(entry_tasks, graph_shape, entry_dag_runs)

        # Only now drop the tasks that should not be plotted.
        entry_tasks = filter_on_not_task_ids(entry_tasks, task_ids=not_task_ids)
        tasks_data.append(as_series(entry, entry_tasks))

    box_plot_for_thesis(
        tasks_data,
        measured_column="wait_time",
        title="Task wait time",
        hover_data=["run_id"],
        labels={
            "identifier": "System",
            "wait_time": "Wait time [s]",
        },
        experiment_id=experiment_id,
        postprocess=postprocess,
        hide_x_axis=hide_x_axis,
        **kwargs,
    )
        

In [None]:
experiment_result_serverless_dates = [
    "2023-04-01_23-07",
    "2023-04-02_00-36",
    "2023-04-02_02-05",
    "2023-04-02_03-34",
    "2023-04-02_05-03",
    "2023-04-02_06-32",
    "2023-04-02_12-36",
    "2023-04-02_14-05",
    "2023-04-02_15-34",
    "2023-04-02_17-03",
    "2023-04-02_18-32",
    "2023-04-02_20-01",
    "2023-04-02_21-30",
    "2023-04-02_22-59",
    "2023-04-03_00-28",
    "2023-04-03_01-57",
    "2023-04-03_03-26",
    "2023-04-03_04-55",
    "2023-04-03_06-24",
    "2023-04-03_07-53",
    "2023-04-03_09-22",
    "2023-04-03_10-51",
    "2023-04-03_12-20",
    "2023-04-03_13-49",
    "2023-04-03_15-18",
    "2023-04-03_22-48",
    "2023-04-04_00-17",
    "2023-04-04_01-46",
    "2023-04-04_03-15",
    "2023-04-04_04-44"
]

experiment_result_mwaa_dates = [
    "2023-04-01_23-18",
    "2023-04-02_00-48",
    "2023-04-02_02-17",
    "2023-04-02_03-46",
    "2023-04-02_05-15",
    "2023-04-02_06-44",
    "2023-04-02_12-48",
    "2023-04-02_14-16",
    "2023-04-02_15-45",
    "2023-04-02_17-14",
    "2023-04-02_18-43",
    "2023-04-02_20-12",
    "2023-04-02_21-41",
    "2023-04-02_23-10",
    "2023-04-03_00-39",
    "2023-04-03_02-08",
    "2023-04-03_03-37",
    "2023-04-03_05-06",
    "2023-04-03_06-35",
    "2023-04-03_08-04",
    "2023-04-03_09-33",
    "2023-04-03_11-02",
    "2023-04-03_12-31",
    "2023-04-03_14-00",
    "2023-04-03_15-29",
    "2023-04-03_22-59",
    "2023-04-04_00-28",
    "2023-04-04_01-57",
    "2023-04-04_03-26",
    "2023-04-04_04-56",
]

In [None]:
def draw_one_trace(selected):
    """Show the DAG, the comparison box plots and the summary row of one job.

    Jobs are taken in the row order of `job_info`, paired position by position
    with the serverless and MWAA export dates.

    Args:
        selected: 0-based position of the job to show. Nothing is drawn when
            the position lies beyond the available jobs/dates.
    """
    traces = zip(job_info["job_name"], experiment_result_serverless_dates, experiment_result_mwaa_dates)
    chosen = next(
        (trace for position, trace in enumerate(traces) if position >= selected),
        None,
    )
    if chosen is None:
        return
    job_name, serverless_date, mwaa_date = chosen

    job_data = sample_jobs[sample_jobs["job_name"] == job_name].copy()
    graph = build_graph(job_data)
    specific_job_info = job_info[job_info["job_name"] == job_name].copy()

    draw(graph, job_name)

    entries = [
        BoxPlotsEntry(system=System.BEEFLOW, export_date=serverless_date, dag_ids=[job_name]),
        BoxPlotsEntry(system=System.MWAA, export_date=mwaa_date, dag_ids=[job_name]),
    ]

    three_box_plots_for(
        entries=entries,
        not_task_ids=[],
        graph_shape=graph,
        height=700,
        width=700,
        not_dag_run_ids=[],
        experiment_id=job_name,
    )

    # Summary of the job rendered as a plotly table.
    table = go.Table(
        header=dict(
            values=list(specific_job_info.columns),
            fill_color='white',
            font_color='red',
            align='left',
        ),
        cells=dict(
            values=specific_job_info.transpose().values.tolist(),
            fill_color='white',
            font_color='red',
            align='left',
        ),
    )
    fig = go.Figure(data=[table])

    fig.show()

In [None]:
draw_one_trace(6)

In [None]:
def thesis_metrics(job_name: str, postprocess=None):
    """Produce the thesis figures for one job.

    The job's export dates are looked up by pairing `job_info["job_name"]`
    position by position with the serverless and MWAA date lists; when the
    name occurs more than once, the last occurrence is used.

    Args:
        job_name: job (and DAG id) to plot.
        postprocess: optional callback forwarded to box_plots_for_thesis.

    Raises:
        AssertionError: if the job has no matching pair of export dates.
    """
    found = None
    candidates = zip(job_info["job_name"], experiment_result_serverless_dates, experiment_result_mwaa_dates)
    for candidate in candidates:
        if candidate[0] == job_name:
            found = candidate
    assert found is not None

    _, serverless_date, mwaa_date = found

    entries = [
        BoxPlotsEntry(system=System.BEEFLOW, export_date=serverless_date, dag_ids=[job_name]),
        BoxPlotsEntry(system=System.MWAA, export_date=mwaa_date, dag_ids=[job_name]),
    ]
    job_data = sample_jobs[sample_jobs["job_name"] == job_name].copy()
    graph = build_graph(job_data)

    box_plots_for_thesis(
        entries=entries,
        not_task_ids=[],
        graph_shape=graph,
        height=700,
        width=700,
        not_dag_run_ids=[],
        experiment_id=job_name,
        postprocess=postprocess,
    )
    

In [None]:
def postprocess(title, fig):
    """Add a cold-start arrow annotation to the figure with the given title.

    Figures titled "DAG makespan" and "Task wait time" each get one annotation
    at a hard-coded position; any other title leaves the figure untouched.
    """
    # title -> (y position of the annotated point, annotation text)
    annotations = {
        "DAG makespan": (487.92, "cold start"),
        "Task wait time": (12.00, "cold starts"),
    }
    if title not in annotations:
        return

    y_position, label = annotations[title]
    fig.add_annotation(
        x=-0.30,
        y=y_position,
        text=label,
        **arrow_standard_args,
    )

thesis_metrics("j_3441830", postprocess)

In [None]:
thesis_metrics("j_3302772")

In [None]:
thesis_metrics("j_581851")

In [None]:
job_info

In [None]:
def get_dag_run_traces_data():
    """Build one table with the DAG-run durations of MWAA and BeeFlow for every job.

    For each job in ``job_info`` (paired by position with the serverless and
    MWAA export dates) the DAG-run statistics of both systems are loaded,
    restricted to that job, passed through ``filter_not_on_running`` and
    ordered by ``queued_at``. Runs are then placed side by side by position:
    row *i* of a job holds the *i*-th run of each system, with NaN where one
    system has fewer runs. Job-level attributes from ``job_info`` are repeated
    on every row.

    Returns:
        pandas.DataFrame with the columns ``mwaa``, ``beeflow``,
        ``mwaa_normalized``, ``beeflow_normalized`` (duration divided by the
        job's ``total_work``), ``critical_path``, ``job_name``, ``cron_used``,
        ``label``, ``critical_path_in_nodes``,
        ``longest_path_in_nodes_number``, ``parallelisation_degree``,
        ``tasks`` and ``is_sairflow_first_run``. An empty frame is returned
        when there are no jobs.
    """
    per_job_frames = []
    for job_name, serverless_date, mwaa_date in zip(
            job_info["job_name"],
            experiment_result_serverless_dates,
            experiment_result_mwaa_dates):
        mwaa_dag_run = get_stats(system=System.MWAA, date=mwaa_date, stats=Stats.DAG_RUNS)
        mwaa_dag_run = filter_on_dag_ids(mwaa_dag_run, [job_name])
        mwaa_dag_run = filter_not_on_running(mwaa_dag_run)
        # Called for its side effect — presumably adds the "duration" column read below; verify.
        enhance_with_duration(mwaa_dag_run)

        serverless_dag_run = get_stats(system=System.BEEFLOW, date=serverless_date, stats=Stats.DAG_RUNS)
        serverless_dag_run = filter_on_dag_ids(serverless_dag_run, [job_name])
        # NOTE(review): this single run id is excluded by hand; the reason is not visible here.
        serverless_dag_run = filter_on_not_dag_run_id(serverless_dag_run, ["scheduled__2023-04-02T15:10:00+00:00"])
        serverless_dag_run = filter_not_on_running(serverless_dag_run)
        enhance_with_duration(serverless_dag_run)

        # Order both systems chronologically so that position i means "i-th run".
        mwaa_dag_run["queued_at"] = pd.to_datetime(mwaa_dag_run["queued_at"])
        serverless_dag_run["queued_at"] = pd.to_datetime(serverless_dag_run["queued_at"])
        mwaa_dag_run = mwaa_dag_run.sort_values(by=['queued_at'], ascending=True).reset_index(drop=True)
        serverless_dag_run = serverless_dag_run.sort_values(by=['queued_at'], ascending=True).reset_index(drop=True)

        specific_job_info = job_info[job_info["job_name"] == job_name].copy()
        assert len(specific_job_info) == 1
        specific_job_info = specific_job_info.iloc[0]

        # The first run of every job is tagged as the cold start, all others as standard.
        is_cold = pd.Series("standard", index=range(max(len(mwaa_dag_run), len(serverless_dag_run))))
        is_cold[0] = "cold_start"

        # The original literal listed "critical_path" twice; the dead first value
        # (critical_path_in_nodes) is dropped and the effective one kept in place,
        # so column order and contents are unchanged.
        per_job_frames.append(pd.DataFrame.from_dict(
            {
                "mwaa": mwaa_dag_run["duration"],
                "beeflow": serverless_dag_run["duration"],
                "mwaa_normalized": mwaa_dag_run["duration"].div(specific_job_info["total_work"]),
                "beeflow_normalized": serverless_dag_run["duration"].div(specific_job_info["total_work"]),
                "critical_path": specific_job_info["critical_path"],
                "job_name": job_name,
                "cron_used": 5 if specific_job_info["critical_path"] < 201 else 10,
                "label": is_cold,
                "critical_path_in_nodes": specific_job_info["critical_path_in_nodes"],
                "longest_path_in_nodes_number": specific_job_info["longest_path_in_nodes_number"],
                "parallelisation_degree": specific_job_info["parallelisation_degree"],
                "tasks": specific_job_info["tasks"],
                "is_sairflow_first_run": is_cold,
            }))

    if not per_job_frames:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(per_job_frames, ignore_index=True)

In [None]:
# Normalization 1: overhead (makespan minus critical path) divided by
# parallelisation_degree / longest_path_in_nodes_number, MWAA vs. BeeFlow.
df = get_dag_run_traces_data()

df["mwaa_norm"] = (df["mwaa"] - df["critical_path"]) / (df["parallelisation_degree"] / df["longest_path_in_nodes_number"])
df["beeflow_norm"] = (df["beeflow"] - df["critical_path"]) / (df["parallelisation_degree"] / df["longest_path_in_nodes_number"])

fig = px.scatter(
    df,
    x="mwaa_norm",
    y="beeflow_norm",
    height=700,
    width=700,
    # The keys must name the plotted columns; otherwise plotly ignores the
    # mapping and the axes show the raw column names.
    labels={
        "mwaa_norm": "MWAA",
        "beeflow_norm": "sAirflow",
    },
    trendline="ols",
    color="label",
    trendline_scope="overall",
    trendline_color_override="red",
)

# Assumes the traces come out as: cold-start points, standard points, overall trendline.
for idx, name in enumerate(["Cold start", "Warm start", "Trendline"]):
    fig.data[idx].name = name

fig.update_layout(
    showlegend=True,
    legend=dict(title_text="", yanchor="top", y=0.99, xanchor="left", x=0.01),
    title_text="normalization 1",
    title_x=0.5,
    paper_bgcolor='rgb(255, 255, 255)',
    plot_bgcolor='rgb(255, 255, 255)',
    font=dict(family='Latin Modern Math', color='black', size=32),
)
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey',
                 showline=True, linewidth=2, linecolor='LightGrey', mirror=True,
                 range=[0, 125])
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey',
                 showline=True, linewidth=2, linecolor='LightGrey', mirror=True,
                 range=[0, 125])
fig.update_traces(marker=dict(size=12))
fig.show()

# Show the summary of the overall trendline fit.
results = px.get_trendline_results(fig)
trendline = results["px_fit_results"].iloc[0]
display(trendline.summary())

In [None]:
# Normalization 2: overhead (makespan minus critical path) divided by
# longest_path_in_nodes_number / parallelisation_degree (the inverse ratio of
# normalization 1). Adds the *_norm_rev columns to the notebook-level `df`.
df["mwaa_norm_rev"] = (df["mwaa"] - df["critical_path"]) / (df["longest_path_in_nodes_number"] / df["parallelisation_degree"])
df["beeflow_norm_rev"] = (df["beeflow"] - df["critical_path"]) / (df["longest_path_in_nodes_number"] / df["parallelisation_degree"])

fig = px.scatter(
    df,
    x="mwaa_norm_rev",
    y="beeflow_norm_rev",
    height=700,
    width=700,
    # The keys must name the plotted columns; otherwise plotly ignores the
    # mapping and the axes show the raw column names.
    labels={
        "mwaa_norm_rev": "MWAA",
        "beeflow_norm_rev": "sAirflow",
    },
    trendline="ols",
    color="label",
    trendline_scope="overall",
    trendline_color_override="red",
)

# Assumes the traces come out as: cold-start points, standard points, overall trendline.
for idx, name in enumerate(["Cold start", "Warm start", "Trendline"]):
    fig.data[idx].name = name

fig.update_layout(
    showlegend=True,
    legend=dict(title_text="", yanchor="top", y=0.99, xanchor="left", x=0.01),
    title_text="normalization 2",
    title_x=0.5,
    paper_bgcolor='rgb(255, 255, 255)',
    plot_bgcolor='rgb(255, 255, 255)',
    font=dict(family='Latin Modern Math', color='black', size=32),
)
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey',
                 showline=True, linewidth=2, linecolor='LightGrey', mirror=True,
                 range=[0, 125])
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey',
                 showline=True, linewidth=2, linecolor='LightGrey', mirror=True,
                 range=[0, 125])
fig.update_traces(marker=dict(size=12))
fig.show()

# Show the summary of the overall trendline fit.
results = px.get_trendline_results(fig)
trendline = results["px_fit_results"].iloc[0]
display(trendline.summary())

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# There are outliers for the DAG that is highly parallelizable (j_3302772);
# let's remove it and redo the "normalization 2" chart on the remaining jobs.

dft = df[df["job_name"] != "j_3302772"]

fig = px.scatter(
    dft,
    x="mwaa_norm_rev",
    y="beeflow_norm_rev",
    height=700,
    width=700,
    # The keys must name the plotted columns; otherwise plotly ignores the
    # mapping and the axes show the raw column names.
    labels={
        "mwaa_norm_rev": "MWAA",
        "beeflow_norm_rev": "sAirflow",
    },
    trendline="ols",
    color="label",
    trendline_scope="overall",
    trendline_color_override="red",
)

# Assumes the traces come out as: cold-start points, standard points, overall trendline.
for idx, name in enumerate(["Cold start", "Warm start", "Trendline"]):
    fig.data[idx].name = name

fig.update_layout(
    showlegend=True,
    legend=dict(title_text="", yanchor="top", y=0.99, xanchor="left", x=0.01),
    title_text="normalization 2",
    title_x=0.5,
    paper_bgcolor='rgb(255, 255, 255)',
    plot_bgcolor='rgb(255, 255, 255)',
    font=dict(family='Latin Modern Math', color='black', size=32),
)
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey',
                 showline=True, linewidth=2, linecolor='LightGrey', mirror=True,
                 range=[0, 125])
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey',
                 showline=True, linewidth=2, linecolor='LightGrey', mirror=True,
                 range=[0, 125])
fig.update_traces(marker=dict(size=12))
fig.show()

# Show the summary of the overall trendline fit.
results = px.get_trendline_results(fig)
trendline = results["px_fit_results"].iloc[0]
display(trendline.summary())

In [None]:
def draw_norm_overhead():
    """Draw the seaborn regression chart of ``beeflow_norm_rev`` against ``mwaa_norm_rev``.

    Reads the notebook-level ``df``, which must already contain the
    ``*_norm_rev`` columns and the ``label`` column. The regression line with
    its confidence band is drawn in black and the points are coloured by
    ``label``. Nothing is saved; the figure is only created.

    NOTE(review): despite the name, this function plots the ``*_norm_rev``
    columns, while ``draw_norm_overhead_rev`` plots ``*_norm`` — confirm the
    names are not swapped.
    """
    fig, ax = plt.subplots(figsize=(6.5, 6.5))
    sns.regplot(x="mwaa_norm_rev", y="beeflow_norm_rev", data=df, ax=ax, scatter=False, color="black")
    chart = sns.scatterplot(x="mwaa_norm_rev", y="beeflow_norm_rev", hue="label", data=df, ax=ax,
                            palette=['#3ee67b', '#ab63fa'], s=100)
    chart.legend_.set_title(None)

    ax.set_xlim(0, 125)
    ax.set_ylim(0, 125)
    ax.set_xlabel('MWAA', fontsize=24, fontfamily='Latin Modern Math')
    ax.set_ylabel('sAirflow', fontsize=24, fontfamily='Latin Modern Math')
    ax.tick_params(axis='both', which='major', labelsize=20)
    ax.tick_params(axis='both', which='minor', labelsize=20)
    ax.title.set_size(32)
    ax.set_xticks([25, 50, 75, 100])
    ax.set_yticks([25, 50, 75, 100])
    ax.set_xticklabels(ax.get_xticks(), fontdict={"family": 'Latin Modern Math'})
    ax.set_yticklabels(ax.get_yticks(), fontdict={"family": 'Latin Modern Math'})
    # NOTE(review): the third label is None here but '_nolegend_' in the sibling
    # charts; how each is rendered depends on the matplotlib version — confirm
    # which one is intended.
    ax.legend(title='', labels=['Trendline', 'Confidence Interval', None, 'sAirflow cold start', 'sAirflow warm start'],
              loc='lower right', prop={'size': 12})


with sns.axes_style("whitegrid"):
    sns.set_palette(sns.color_palette(["#636EFA"]))
    sns.set_context("notebook", rc={"grid.linewidth": 1, "grid.color": "#D3D3D3", "font.family": 'Latin Modern Math'})
    draw_norm_overhead()

In [None]:
def draw_norm_overhead_rev():
    """Draw and save the seaborn regression chart of ``beeflow_norm`` against ``mwaa_norm``.

    Reads the notebook-level ``df``, which must already contain the ``*_norm``
    columns and the ``label`` column. The regression line with its confidence
    band is drawn in black and the points are coloured by ``label``. The chart
    is written to
    ``{CHARTS_DIRECTORY}/traces_dag_overhead__norm_metric_formatted.pdf``.

    NOTE(review): despite the ``_rev`` suffix, this function plots the
    ``*_norm`` columns, while ``draw_norm_overhead`` plots ``*_norm_rev`` —
    confirm the names are not swapped.
    """
    fig, ax = plt.subplots(figsize=(6.5, 6.5))
    sns.regplot(x="mwaa_norm", y="beeflow_norm", data=df, ax=ax, scatter=False, color="black")
    chart = sns.scatterplot(x="mwaa_norm", y="beeflow_norm", hue="label", data=df, ax=ax,
                            palette=['#3ee67b', '#ab63fa'], s=100)
    chart.legend_.set_title(None)

    ax.set_xlim(0, 125)
    ax.set_ylim(0, 125)
    ax.set_xlabel('MWAA', fontsize=24, fontfamily='Latin Modern Math')
    ax.set_ylabel('sAirflow', fontsize=24, fontfamily='Latin Modern Math')
    ax.tick_params(axis='both', which='major', labelsize=20)
    ax.tick_params(axis='both', which='minor', labelsize=20)
    ax.title.set_size(32)
    ax.set_xticks([25, 50, 75, 100])
    ax.set_yticks([25, 50, 75, 100])
    ax.set_xticklabels(ax.get_xticks(), fontdict={"family": 'Latin Modern Math'})
    ax.set_yticklabels(ax.get_yticks(), fontdict={"family": 'Latin Modern Math'})
    ax.legend(title='', labels=['Trendline', 'Confidence Interval', '_nolegend_', 'sAirflow cold start', 'sAirflow warm start'],
              loc='lower right', prop={'size': 12})
    # Widen the left margin before saving.
    fig.subplots_adjust(left=0.2)
    fig.savefig(f'{CHARTS_DIRECTORY}/traces_dag_overhead__norm_metric_formatted.pdf', format="pdf")


with sns.axes_style("whitegrid"):
    sns.set_palette(sns.color_palette(["#636EFA"]))
    sns.set_context("notebook", rc={"grid.linewidth": 1, "grid.color": "#D3D3D3", "font.family": 'Latin Modern Math', 'figure.figsize':(7, 7)})
    draw_norm_overhead_rev()

In [None]:
# Raw DAG makespan: MWAA (x) against BeeFlow/sAirflow (y), with an overall OLS trendline.

fig = px.scatter(
    df,
    x="mwaa",
    y="beeflow",
    height=700,
    width=700,
    labels={
        "mwaa": "MWAA",
        "beeflow": "sAirflow",
        "Overall Trendline": "Trendline",
    },
    trendline="ols",
    color="label",
    trendline_scope="overall",
    trendline_color_override="black",
    color_discrete_sequence=['#3ee67b', '#ab63fa'],
)

# Assumes the traces come out as: cold-start points, standard points, overall trendline.
for idx, name in enumerate(["sAirflow cold start", "sAirflow warm start", "Trendline"]):
    fig.data[idx].name = name

fig.update_layout(
    showlegend=True,
    legend=dict(title_text="", yanchor="top", y=0.99, xanchor="left", x=0.01),
    title_text="DAG makespan [s]",
    title_x=0.5,
    paper_bgcolor='rgb(255, 255, 255)',
    plot_bgcolor='rgb(255, 255, 255)',
    font=dict(family='Latin Modern Math', color='black', size=32),
)
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey',
                 showline=True, linewidth=2, linecolor='LightGrey', mirror=True,
                 range=[0, 500])
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey',
                 showline=True, linewidth=2, linecolor='LightGrey', mirror=True,
                 range=[0, 500])
fig.update_traces(marker=dict(size=12))

# Thicken every line trace (the trendline). The loop variable is no longer
# called `id`, which used to shadow the builtin for the rest of the notebook.
for trace in fig.data:
    if trace.mode == 'lines':
        trace.update(line_width=4)

fig.show()

fig.write_image(f'{CHARTS_DIRECTORY}/traces_dag_makespan_raw_comparison.pdf')

# Show the summary of the overall trendline fit.
results = px.get_trendline_results(fig)
trendline = results["px_fit_results"].iloc[0]
display(trendline.summary())

In [None]:
def _show_trendline_scatter(data, x, y, title, axis_max, trendline_color="red"):
    """Show a 700x700 plotly scatter of ``y`` against ``x`` with an overall OLS trendline.

    Points are coloured by the ``label`` column, both axes span
    ``[0, axis_max]`` and are titled "MWAA" / "sAirflow". After the chart the
    summary of the trendline fit is displayed.

    Returns:
        The plotly figure, so the caller can export it.
    """
    fig = px.scatter(
        data,
        x=x,
        y=y,
        height=700,
        width=700,
        # Keyed by the plotted columns; a mapping for other column names is ignored by plotly.
        labels={x: "MWAA", y: "sAirflow"},
        trendline="ols",
        color="label",
        trendline_scope="overall",
        trendline_color_override=trendline_color,
    )

    # Assumes the traces come out as: cold-start points, standard points, overall trendline.
    for idx, name in enumerate(["Cold start", "Warm start", "Trendline"]):
        fig.data[idx].name = name

    fig.update_layout(
        showlegend=True,
        legend=dict(title_text="", yanchor="top", y=0.99, xanchor="left", x=0.01),
        title_text=title,
        title_x=0.5,
        paper_bgcolor='rgb(255, 255, 255)',
        plot_bgcolor='rgb(255, 255, 255)',
        font=dict(family='Latin Modern Math', color='black', size=32),
    )
    fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey',
                     showline=True, linewidth=2, linecolor='LightGrey', mirror=True,
                     range=[0, axis_max])
    fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey',
                     showline=True, linewidth=2, linecolor='LightGrey', mirror=True,
                     range=[0, axis_max])
    fig.update_traces(marker=dict(size=12))
    fig.show()

    results = px.get_trendline_results(fig)
    display(results["px_fit_results"].iloc[0].summary())
    return fig


def _save_regression_chart(data, x, y, axis_max, ticks, filename,
                           legend_labels=None, left_margin=None, tight=False):
    """Draw a 6.5x6.5 seaborn regression chart of ``y`` against ``x`` and save it as PDF.

    Without ``legend_labels`` a plain ``sns.regplot`` (points plus fit) is
    drawn. With ``legend_labels`` the fit is drawn in black without points,
    the points are added separately coloured by ``label``, and a legend with
    exactly those labels is placed in the lower right corner.

    Args:
        data: Frame holding the ``x``, ``y`` (and ``label``) columns.
        axis_max: Upper limit of both axes (lower limit is 0).
        ticks: Tick positions used on both axes.
        filename: File name inside ``CHARTS_DIRECTORY``.
        legend_labels: Labels passed to the legend, or None for no legend.
        left_margin: If given, passed to ``subplots_adjust(left=...)``.
        tight: If True, save with ``bbox_inches='tight'``.
    """
    fig, ax = plt.subplots(figsize=(6.5, 6.5))
    if legend_labels is None:
        sns.regplot(x=x, y=y, data=data, ax=ax)
    else:
        sns.regplot(x=x, y=y, data=data, ax=ax, scatter=False, color="black")
        chart = sns.scatterplot(x=x, y=y, hue="label", data=data, ax=ax,
                                palette=['#3ee67b', '#ab63fa'], s=100)
        chart.legend_.set_title(None)

    ax.set_xlim(0, axis_max)
    ax.set_ylim(0, axis_max)
    ax.set_xlabel('MWAA', fontsize=24, fontfamily='Latin Modern Math')
    ax.set_ylabel('sAirflow', fontsize=24, fontfamily='Latin Modern Math')
    ax.tick_params(axis='both', which='major', labelsize=20)
    ax.tick_params(axis='both', which='minor', labelsize=20)
    ax.title.set_size(32)
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    ax.set_xticklabels(ax.get_xticks(), fontdict={"family": 'Latin Modern Math'})
    ax.set_yticklabels(ax.get_yticks(), fontdict={"family": 'Latin Modern Math'})

    if legend_labels is not None:
        ax.legend(title='', labels=legend_labels, loc='lower right', prop={'size': 12})
    if left_margin is not None:
        fig.subplots_adjust(left=left_margin)

    save_options = {"bbox_inches": 'tight'} if tight else {}
    fig.savefig(f'{CHARTS_DIRECTORY}/{filename}', format="pdf", **save_options)


def draw_scatter_plot_dag_data():
    """Render every MWAA-vs-BeeFlow DAG-run comparison chart and return the data.

    The data comes from ``get_dag_run_traces_data()``. Five interactive plotly
    scatters (each followed by its trendline fit summary) are shown first,
    then five seaborn regression charts are written as PDFs into
    ``CHARTS_DIRECTORY``. Three overhead metrics are added to the frame:

    * ``*_norm_2``: makespan minus critical path,
    * ``*_norm_3``: that overhead divided by the number of tasks,
    * ``*_norm_4``: that overhead divided by ``longest_path_in_nodes_number``.

    Returns:
        The frame from ``get_dag_run_traces_data()`` extended with the
        ``mwaa_norm_2..4`` and ``beeflow_norm_2..4`` columns.
    """
    df = get_dag_run_traces_data()

    # --- plotly: makespan divided by total work ---
    fig = _show_trendline_scatter(df, "mwaa_normalized", "beeflow_normalized",
                                  title="DAG duration over total work",
                                  axis_max=3.5, trendline_color="black")
    # NOTE(review): the first seaborn chart below is saved to the same path and
    # overwrites this export.
    fig.write_image(f'{CHARTS_DIRECTORY}/traces_dag_run_normalized_comparison.pdf')

    # --- overhead metrics ---
    df["mwaa_norm_2"] = (df["mwaa"] - df["critical_path"])
    df["beeflow_norm_2"] = (df["beeflow"] - df["critical_path"])

    df["mwaa_norm_3"] = (df["mwaa"] - df["critical_path"]) / df["tasks"]
    df["beeflow_norm_3"] = (df["beeflow"] - df["critical_path"]) / df["tasks"]

    df["mwaa_norm_4"] = (df["mwaa"] - df["critical_path"]) / df["longest_path_in_nodes_number"]
    df["beeflow_norm_4"] = (df["beeflow"] - df["critical_path"]) / df["longest_path_in_nodes_number"]

    # --- plotly: overhead, overhead per task, overhead per longest-path node ---
    _show_trendline_scatter(df, "mwaa_norm_2", "beeflow_norm_2",
                            title="DAG duration overhead", axis_max=65)
    _show_trendline_scatter(df, "mwaa_norm_3", "beeflow_norm_3",
                            title="DAG duration overhead over tasks num", axis_max=5)
    _show_trendline_scatter(df, "mwaa_norm_4", "beeflow_norm_4",
                            title="DAG duration overhead over longest path", axis_max=15)

    # Same metric restricted to the longer DAGs.
    # NOTE(review): the title says "olp > 120" but the filter is
    # critical_path_in_nodes > 8 — confirm which one is intended.
    long_dags = df[df["critical_path_in_nodes"] > 8]
    _show_trendline_scatter(long_dags, "mwaa_norm_4", "beeflow_norm_4",
                            title="DAG duration overhead olp > 120", axis_max=15)

    # --- seaborn: makespan over total work, full range and zoomed to <= 2.0 ---
    _save_regression_chart(df, "mwaa_normalized", "beeflow_normalized",
                           axis_max=3.5, ticks=[0.7, 1.4, 2.1, 2.8],
                           filename="traces_dag_run_normalized_comparison.pdf",
                           tight=True)

    zoomed = df[(df["beeflow_normalized"] <= 2.0) & (df["mwaa_normalized"] <= 2.0)]
    _save_regression_chart(zoomed, "mwaa_normalized", "beeflow_normalized",
                           axis_max=2.0, ticks=[0.4, 0.8, 1.2, 1.6],
                           filename="traces_dag_run_normalized_comparison_zoom.pdf",
                           tight=True)

    # --- seaborn: overhead charts with cold/warm start colouring ---
    # The third label belongs to the full scatter collection, which the first
    # two charts exclude with '_nolegend_'.
    labels_hiding_points = ['Trendline', 'Confidence Interval', '_nolegend_',
                            'sAirflow cold start', 'sAirflow warm start']
    # NOTE(review): the last chart passes None instead of '_nolegend_' (kept as
    # in the original); confirm which one is intended.
    labels_with_none = ['Trendline', 'Confidence Interval', None,
                        'sAirflow cold start', 'sAirflow warm start']

    _save_regression_chart(df, "mwaa_norm_2", "beeflow_norm_2",
                           axis_max=70, ticks=[14, 28, 42, 56],
                           filename="traces_dag_overhead_formatted.pdf",
                           legend_labels=labels_hiding_points, left_margin=0.2)

    _save_regression_chart(df, "mwaa_norm_3", "beeflow_norm_3",
                           axis_max=5, ticks=[1, 2, 3, 4],
                           filename="traces_dag_overhead__task_average_formatted.pdf",
                           legend_labels=labels_hiding_points, left_margin=0.2)

    _save_regression_chart(df, "mwaa_norm_4", "beeflow_norm_4",
                           axis_max=15, ticks=[0, 3, 6, 9, 12, 15],
                           filename="traces_dag_overhead__norm_by_parallel_metric.pdf",
                           legend_labels=labels_with_none, tight=True)

    return df
        
    

In [None]:
# Render all DAG-run comparison charts with the white-grid style, a
# single-colour palette and the Latin Modern Math font; keep the returned table in `df`.
with sns.axes_style("whitegrid"):
    sns.set_palette(sns.color_palette(["#636EFA"]))
    sns.set_context(
        "notebook",
        rc={
            "grid.linewidth": 1,
            "grid.color": "#D3D3D3",
            "font.family": 'Latin Modern Math',
            'figure.figsize': (7, 7),
        },
    )
    df = draw_scatter_plot_dag_data()

In [None]:
# Overhead (makespan minus critical path) against makespan, one chart per system.
df = get_dag_run_traces_data()

df["mwaa_overhead"] = (df["mwaa"] - df["critical_path"])
df["beeflow_overhead"] = (df["beeflow"] - df["critical_path"])


# --- MWAA ---

fig = px.scatter(
    df,
    x="mwaa",
    y="mwaa_overhead",
    height=700,
    width=700,
    labels={
        "mwaa": "DAG makespan [s]",
        "mwaa_overhead": "DAG overhead [s]",
    },
    color_discrete_sequence=['#ef553b'],
)
fig.update_layout(
    showlegend=True,
    legend=dict(title_text="", yanchor="top", y=0.99, xanchor="left", x=0.01),
    title_text="MWAA",
    title_x=0.5,
    paper_bgcolor='rgb(255, 255, 255)',
    plot_bgcolor='rgb(255, 255, 255)',
    font=dict(family='Latin Modern Math', color='black', size=32),
)
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey',
                 showline=True, linewidth=2, linecolor='LightGrey', mirror=True,
                 range=[0, 500])
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey',
                 showline=True, linewidth=2, linecolor='LightGrey', mirror=True,
                 range=[0, 60])
fig.update_traces(marker=dict(size=12))

# Thicken any line traces. The loop variable is no longer called `id`, which
# used to shadow the builtin for the rest of the notebook.
for trace in fig.data:
    if trace.mode == 'lines':
        trace.update(line_width=4)

fig.show()

fig.write_image(f'{CHARTS_DIRECTORY}/traces_dag_makespan_to_overhead_mwaa.pdf')


# --- sAirflow ---

fig = px.scatter(
    df,
    x="beeflow",
    y="beeflow_overhead",
    height=700,
    width=700,
    labels={
        "beeflow": "DAG makespan [s]",
        "beeflow_overhead": "DAG overhead [s]",
    },
    color_discrete_sequence=['#636EFA'],
)
fig.update_layout(
    showlegend=True,
    legend=dict(title_text="", yanchor="bottom", y=0.01, xanchor="right", x=0.99),
    title_text="sAirflow",
    title_x=0.5,
    paper_bgcolor='rgb(255, 255, 255)',
    plot_bgcolor='rgb(255, 255, 255)',
    font=dict(family='Latin Modern Math', color='black', size=32),
)
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey',
                 showline=True, linewidth=2, linecolor='LightGrey', mirror=True,
                 range=[0, 500])
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey',
                 showline=True, linewidth=2, linecolor='LightGrey', mirror=True,
                 range=[0, 60])
fig.update_traces(marker=dict(size=12))

for trace in fig.data:
    if trace.mode == 'lines':
        trace.update(line_width=4)

fig.show()

fig.write_image(f'{CHARTS_DIRECTORY}/traces_dag_makespan_to_overhead_sairflow.pdf')


In [None]:
# Load the per-DAG-run measurements and derive each system's absolute overhead:
# the measured makespan minus the DAG's critical-path length.
df = get_dag_run_traces_data()

df["mwaa_overhead"] = df["mwaa"] - df["critical_path"]
df["beeflow_overhead"] = df["beeflow"] - df["critical_path"]

# Reshape to long form so both systems can share one chart: every source row
# contributes an "MWAA" point followed by an "sAirflow" point. Flattening the
# two-column arrays row by row yields exactly that interleaved order without
# iterating over the frame in Python.
df2 = pd.DataFrame({
    "xs": df[["mwaa", "beeflow"]].to_numpy().ravel(),
    "ys": df[["mwaa_overhead", "beeflow_overhead"]].to_numpy().ravel(),
    "ls": ["MWAA", "sAirflow"] * len(df),
})


# Plot DAG overhead against DAG makespan for both systems on one chart.
# No matplotlib figure is opened here: the chart is a Plotly figure, so a
# plt.figure() call would only create an empty, unused matplotlib figure.
fig = px.scatter(df2,
                 x="xs",
                 y="ys",
                 height=700,
                 width=700,
                 labels={
                     "xs": "DAG makespan [s]",
                     "ys": "DAG overhead [s]",
                 },
                 color_discrete_sequence=['#ef553b', '#636EFA'],
                 color="ls",
                 )
fig.update_layout(
    showlegend=True,
    # Legend sits in the bottom-right corner of the plotting area.
    legend=dict(
        title_text="",
        yanchor="bottom",
        y=0.01,
        xanchor="right",
        x=0.99,
    ),
    title_text="",
    title_x=0.5,
    paper_bgcolor='rgb(255, 255, 255)',
    plot_bgcolor='rgb(255, 255, 255)',
    font=dict(
        family='Latin Modern Math',
        color='black',
        size=32,
    ),
)
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey',
                 showline=True, linewidth=2, linecolor='LightGrey', mirror=True,
                 range=[0, 500])
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey',
                 showline=True, linewidth=2, linecolor='LightGrey', mirror=True,
                 range=[0, 60])
# Smaller, semi-transparent markers because the two point clouds overlap.
fig.update_traces(marker=dict(size=9))
fig.update_traces(marker_opacity=0.7)
# Thicken any trace drawn in 'lines' mode; marker-only traces are untouched.
fig.update_traces(line_width=4, selector=dict(mode='lines'))

fig.show()

fig.write_image(f'{CHARTS_DIRECTORY}/traces_dag_makespan_to_overhead_both.pdf')


In [None]:
# Average per-task overhead: (makespan - critical path) spread over the number
# of tasks in the DAG. This cell extends the `df` produced by an earlier cell.
df["mwaa_average_task_overhead"] = (df["mwaa"] - df["critical_path"]) / df["tasks"]
df["beeflow_average_task_overhead"] = (df["beeflow"] - df["critical_path"]) / df["tasks"]


# Plot average task overhead against DAG makespan, one chart per system.
# Each entry: (makespan column, chart title, marker colour, legend placement,
# output file suffix). The "beeflow" columns are labelled "sAirflow" on the charts.
# No matplotlib figure is opened here: the charts are Plotly figures, so a
# plt.figure() call would only create an empty, unused matplotlib figure.
average_overhead_chart_specs = [
    ("mwaa", "MWAA", "#ef553b",
     dict(yanchor="top", y=0.99, xanchor="left", x=0.01), "mwaa"),
    ("beeflow", "sAirflow", "#636EFA",
     dict(yanchor="bottom", y=0.01, xanchor="right", x=0.99), "sairflow"),
]

for makespan_col, chart_title, marker_colour, legend_anchor, file_suffix in average_overhead_chart_specs:
    average_overhead_col = f"{makespan_col}_average_task_overhead"

    fig = px.scatter(df,
                     x=makespan_col,
                     y=average_overhead_col,
                     height=700,
                     width=700,
                     labels={
                         makespan_col: "DAG makespan [s]",
                         average_overhead_col: "Average task overhead [s]",
                     },
                     color_discrete_sequence=[marker_colour],
                     )
    fig.update_layout(
        showlegend=True,
        legend=dict(title_text="", **legend_anchor),
        title_text=chart_title,
        title_x=0.5,
        paper_bgcolor='rgb(255, 255, 255)',
        plot_bgcolor='rgb(255, 255, 255)',
        font=dict(
            family='Latin Modern Math',
            color='black',
            size=32,
        ),
    )
    # Light-grey grid and a mirrored frame on both axes; both charts use the
    # same fixed axis ranges.
    fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey',
                     showline=True, linewidth=2, linecolor='LightGrey', mirror=True,
                     range=[0, 500])
    fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey',
                     showline=True, linewidth=2, linecolor='LightGrey', mirror=True,
                     range=[0, 5])
    fig.update_traces(marker=dict(size=12))
    # Thicken any trace drawn in 'lines' mode; marker-only traces are untouched.
    fig.update_traces(line_width=4, selector=dict(mode='lines'))

    fig.show()

    fig.write_image(f'{CHARTS_DIRECTORY}/traces_dag_makespan_to_average_task_overhead_{file_suffix}.pdf')

In [None]:
# Normalise each system's absolute overhead (makespan - critical path) by a
# DAG "features factor": parallelisation degree divided by the number of nodes
# on the longest path. This cell extends the `df` produced by an earlier cell.
features_factor = df["parallelisation_degree"] / df["longest_path_in_nodes_number"]
df["mwaa_norm"] = (df["mwaa"] - df["critical_path"]) / features_factor
df["beeflow_norm"] = (df["beeflow"] - df["critical_path"]) / features_factor


# Plot the normalised overhead against DAG makespan, one chart per system.
# Each entry: (makespan column, chart title, marker colour, legend placement,
# output file suffix). The "beeflow" columns are labelled "sAirflow" on the charts.
# No matplotlib figure is opened here: the charts are Plotly figures, so a
# plt.figure() call would only create an empty, unused matplotlib figure.
norm_overhead_chart_specs = [
    ("mwaa", "MWAA", "#ef553b",
     dict(yanchor="top", y=0.99, xanchor="left", x=0.01), "mwaa"),
    ("beeflow", "sAirflow", "#636EFA",
     dict(yanchor="bottom", y=0.01, xanchor="right", x=0.99), "sairflow"),
]

for makespan_col, chart_title, marker_colour, legend_anchor, file_suffix in norm_overhead_chart_specs:
    norm_col = f"{makespan_col}_norm"

    fig = px.scatter(df,
                     x=makespan_col,
                     y=norm_col,
                     height=700,
                     width=700,
                     labels={
                         makespan_col: "DAG makespan [s]",
                         norm_col: "DAG overhead by features factor [s]",
                     },
                     color_discrete_sequence=[marker_colour],
                     )
    fig.update_layout(
        showlegend=True,
        legend=dict(title_text="", **legend_anchor),
        title_text=chart_title,
        title_x=0.5,
        paper_bgcolor='rgb(255, 255, 255)',
        plot_bgcolor='rgb(255, 255, 255)',
        font=dict(
            family='Latin Modern Math',
            color='black',
            size=32,
        ),
    )
    # Light-grey grid and a mirrored frame on both axes; both charts use the
    # same fixed axis ranges.
    fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey',
                     showline=True, linewidth=2, linecolor='LightGrey', mirror=True,
                     range=[0, 500])
    # The y axis additionally gets a tick every 25 units starting at 0.
    fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey',
                     showline=True, linewidth=2, linecolor='LightGrey', mirror=True,
                     range=[0, 125],
                     tickmode='linear', tick0=0, dtick=25)
    fig.update_traces(marker=dict(size=12))
    # Thicken any trace drawn in 'lines' mode; marker-only traces are untouched.
    fig.update_traces(line_width=4, selector=dict(mode='lines'))

    fig.show()

    fig.write_image(f'{CHARTS_DIRECTORY}/traces_dag_makespan_to_norm_overhead_{file_suffix}.pdf')

In [None]:
# Normalise each system's absolute overhead (makespan - critical path) by a
# DAG "features factor": parallelisation degree divided by the number of nodes
# on the longest path. This cell extends the `df` produced by an earlier cell.
features_factor = df["parallelisation_degree"] / df["longest_path_in_nodes_number"]
df["mwaa_norm"] = (df["mwaa"] - df["critical_path"]) / features_factor
df["beeflow_norm"] = (df["beeflow"] - df["critical_path"]) / features_factor


# Reshape to long form so both systems can share one chart: every source row
# contributes an "MWAA" point followed by an "sAirflow" point. Flattening the
# two-column arrays row by row yields exactly that interleaved order without
# iterating over the frame in Python.
df2 = pd.DataFrame({
    "xs": df[["mwaa", "beeflow"]].to_numpy().ravel(),
    "ys": df[["mwaa_norm", "beeflow_norm"]].to_numpy().ravel(),
    "ls": ["MWAA", "sAirflow"] * len(df),
})


# Plot the normalised overhead against DAG makespan for both systems on one chart.
# No matplotlib figure is opened here: the chart is a Plotly figure, so a
# plt.figure() call would only create an empty, unused matplotlib figure.
fig = px.scatter(df2,
                 x="xs",
                 y="ys",
                 height=700,
                 width=700,
                 labels={
                     "xs": "DAG makespan [s]",
                     "ys": "DAG normalized overhead [s]",
                 },
                 color_discrete_sequence=['#ef553b', '#636EFA'],
                 color="ls",
                 )
fig.update_layout(
    showlegend=True,
    # Legend sits in the top-left corner of the plotting area.
    legend=dict(
        title_text="",
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.01,
    ),
    title_text="",
    title_x=0.5,
    paper_bgcolor='rgb(255, 255, 255)',
    plot_bgcolor='rgb(255, 255, 255)',
    font=dict(
        family='Latin Modern Math',
        color='black',
        size=32,
    ),
)
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey',
                 showline=True, linewidth=2, linecolor='LightGrey', mirror=True,
                 range=[0, 500])
# The y axis additionally gets a tick every 25 units starting at 0.
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey',
                 showline=True, linewidth=2, linecolor='LightGrey', mirror=True,
                 range=[0, 125],
                 tickmode='linear', tick0=0, dtick=25)
# Smaller, semi-transparent markers because the two point clouds overlap.
fig.update_traces(marker=dict(size=9))
fig.update_traces(marker_opacity=0.7)
# Thicken any trace drawn in 'lines' mode; marker-only traces are untouched.
fig.update_traces(line_width=4, selector=dict(mode='lines'))

fig.show()

fig.write_image(f'{CHARTS_DIRECTORY}/traces_dag_makespan_to_norm_overhead_both.pdf')


In [None]:
# Preview the first rows of `df` as left by the cells above.
df.head()

In [None]:
def prepare_draw_scatter_plot_task_data():
    """Collect task wait times measured on MWAA and sAirflow for every sampled job.

    Iterates over the ``(job_name, serverless_date, mwaa_date)`` triples zipped
    from ``job_info["job_name"]``, ``experiment_result_serverless_dates`` and
    ``experiment_result_mwaa_dates``. For each job the task statistics of both
    systems are loaded, restricted to that job, sorted by ``queued_dttm`` and
    re-indexed from zero, so the two ``wait_time`` columns are paired purely by
    position in queue order (not by task id).

    Returns:
        pandas.DataFrame with columns ``mwaa`` and ``beeflow`` holding the
        paired wait times of all jobs, indexed 0..n-1. An empty DataFrame is
        returned when there are no jobs to process.
    """
    # Per-job frames are gathered and concatenated once at the end; growing a
    # frame with pd.concat on every iteration re-copies all earlier rows and
    # starts from an empty seed frame.
    per_job_frames = []

    for job_name, serverless_date, mwaa_date in zip(
        job_info["job_name"],
        experiment_result_serverless_dates,
        experiment_result_mwaa_dates):

        mwaa_dag_run = get_stats(system=System.MWAA, date=mwaa_date, stats=Stats.DAG_RUNS)
        mwaa_dag_run = filter_on_dag_ids(mwaa_dag_run, [job_name])

        serverless_dag_run = get_stats(system=System.BEEFLOW, date=serverless_date, stats=Stats.DAG_RUNS)
        serverless_dag_run = filter_on_dag_ids(serverless_dag_run, [job_name])
        # One specific scheduled sAirflow run is excluded from the comparison.
        serverless_dag_run = filter_on_not_dag_run_id(serverless_dag_run, ["scheduled__2023-04-02T15:10:00+00:00"])

        job_data = sample_jobs[sample_jobs["job_name"] == job_name].copy()
        graph = build_graph(job_data)

        mwaa_tasks = get_stats(system=System.MWAA, date=mwaa_date, stats=Stats.TASKS)
        mwaa_tasks = filter_on_dag_ids(mwaa_tasks, [job_name])
        # Return value is ignored; presumably adds the "wait_time" column in
        # place -- TODO confirm against enhance_with_wait_time_tasks.
        enhance_with_wait_time_tasks(mwaa_tasks, graph, mwaa_dag_run)

        serverless_tasks = get_stats(system=System.BEEFLOW, date=serverless_date, stats=Stats.TASKS)
        serverless_tasks = filter_on_dag_ids(serverless_tasks, [job_name])
        serverless_tasks = filter_on_not_dag_run_id(serverless_tasks, ["scheduled__2023-04-02T15:10:00+00:00"])
        serverless_tasks = filter_not_on_running(serverless_tasks)
        enhance_with_wait_time_tasks(serverless_tasks, graph, serverless_dag_run)

        # Order both task tables by queueing time and reset the index so that
        # the two wait_time Series below align row-for-row.
        mwaa_tasks["queued_dttm"] = pd.to_datetime(mwaa_tasks["queued_dttm"])
        serverless_tasks["queued_dttm"] = pd.to_datetime(serverless_tasks["queued_dttm"])
        mwaa_tasks = mwaa_tasks.sort_values(by=['queued_dttm'], ascending=True).reset_index(drop=True)
        serverless_tasks = serverless_tasks.sort_values(by=['queued_dttm'], ascending=True).reset_index(drop=True)

        per_job_frames.append(pd.DataFrame.from_dict(
            {
                "mwaa": mwaa_tasks["wait_time"],
                "beeflow": serverless_tasks["wait_time"],
            }))

    # pd.concat raises on an empty list, so handle "no jobs" explicitly.
    if not per_job_frames:
        return pd.DataFrame()
    return pd.concat(per_job_frames, ignore_index=True)

def draw_scatter_plot_task_data():
    """Show and export a scatter chart of MWAA vs sAirflow task wait times.

    The plotted frame is always read from the pickle file
    ``data/draw_scatter_plot_task_data``. When that file does not exist yet it
    is first produced by ``prepare_draw_scatter_plot_task_data()`` and pickled
    to that location. The chart is displayed and written to
    ``{CHARTS_DIRECTORY}/traces_wait_time_comparison.pdf``.

    Returns:
        The pandas.DataFrame loaded from the cache file.
    """
    cache_file = Path("data/draw_scatter_plot_task_data")

    # Populate the cache on first use, then read it back.
    if not cache_file.is_file():
        prepare_draw_scatter_plot_task_data().to_pickle(cache_file)
    df = pd.read_pickle(cache_file)

    # Identical styling for both axes: light-grey grid, mirrored frame and a
    # fixed 0-25 range.
    axis_style = dict(
        showgrid=True, gridwidth=1, gridcolor='LightGrey',
        showline=True, linewidth=2, linecolor='LightGrey', mirror=True,
        range=[0, 25],
    )

    fig = px.scatter(
        df,
        x="mwaa",
        y="beeflow",
        height=700,
        width=700,
        labels={"mwaa": "MWAA", "beeflow": "sAirflow"},
        color_discrete_sequence=["#ffa15a"],
    )
    fig.update_layout(
        showlegend=False,
        title_text="Task wait time [s]",
        title_x=0.5,
        paper_bgcolor='rgb(255, 255, 255)',
        plot_bgcolor='rgb(255, 255, 255)',
        font=dict(family='Latin Modern Math', color='black', size=32),
    )
    fig.update_xaxes(**axis_style)
    fig.update_yaxes(**axis_style)
    fig.show()

    fig.write_image(f'{CHARTS_DIRECTORY}/traces_wait_time_comparison.pdf')

    return df

In [None]:
# Draw the task wait-time comparison chart and keep the frame it returns in `df`.
df = draw_scatter_plot_task_data()

In [None]:
def prepare_draw_scatter_plot_task_data_duration():
    """Collect per-task durations measured on MWAA and sAirflow for every sampled job.

    Iterates over the ``(job_name, serverless_date, mwaa_date)`` triples zipped
    from ``job_info["job_name"]``, ``experiment_result_serverless_dates`` and
    ``experiment_result_mwaa_dates``. For each job the DAG runs of both systems
    are sorted by ``queued_at`` and paired by position; inside every run pair
    the tasks are matched on ``task_id``.

    A diagnostic dict is printed for every task whose measured durations on the
    two systems differ by more than 4.

    Returns:
        pandas.DataFrame with one row per (run pair, task) and the columns
        ``mwaa`` / ``sairflow`` (measured ``duration`` on each system),
        ``task_duration`` (the ``duration`` attribute of the task's graph
        node), ``mwaa_run_ids``, ``s_run_ids``, ``task_id``, ``job_id``,
        ``indegree_label`` (``"0"`` for tasks without incoming edges in the
        job graph, ``"other"`` otherwise), plus ``normalized_mwaa`` and
        ``normalized_sairflow`` (measured duration divided by
        ``task_duration``). With no jobs at all, an empty frame carrying these
        columns is returned.

    Raises:
        AssertionError: if an MWAA run has no tasks, if paired runs have a
            different number of tasks, or if a task id does not occur exactly
            once in each run of a pair.
    """
    result_columns = [
        "mwaa",
        "sairflow",
        "task_duration",
        "mwaa_run_ids",
        "s_run_ids",
        "task_id",
        "job_id",
        "indegree_label",
    ]
    # Per-job frames are gathered and concatenated once at the end instead of
    # re-copying the accumulated frame on every iteration.
    per_job_frames = []

    for job_name, serverless_date, mwaa_date in zip(
        job_info["job_name"],
        experiment_result_serverless_dates,
        experiment_result_mwaa_dates):

        # The job graph is built once per job and reused for both the
        # in-degree lookup and the expected task durations.
        job_data = sample_jobs[sample_jobs["job_name"] == job_name].copy()
        graph = build_graph(job_data)

        # Tasks without incoming edges; a set keeps the membership test in the
        # inner loop constant-time.
        indegree_zero_nodes = {node for node in graph.nodes() if graph.in_degree(node) == 0}

        mwaa_dag_run = get_stats(system=System.MWAA, date=mwaa_date, stats=Stats.DAG_RUNS)
        mwaa_dag_run = filter_on_dag_ids(mwaa_dag_run, [job_name])

        serverless_dag_run = get_stats(system=System.BEEFLOW, date=serverless_date, stats=Stats.DAG_RUNS)
        serverless_dag_run = filter_on_dag_ids(serverless_dag_run, [job_name])
        # One specific scheduled sAirflow run is excluded from the comparison.
        serverless_dag_run = filter_on_not_dag_run_id(serverless_dag_run, ["scheduled__2023-04-02T15:10:00+00:00"])

        mwaa_tasks = get_stats(system=System.MWAA, date=mwaa_date, stats=Stats.TASKS)
        mwaa_tasks = filter_on_dag_ids(mwaa_tasks, [job_name])
        # Return value is ignored; presumably adds the "duration" column in
        # place -- TODO confirm against enhance_with_duration.
        enhance_with_duration(mwaa_tasks)

        serverless_tasks = get_stats(system=System.BEEFLOW, date=serverless_date, stats=Stats.TASKS)
        serverless_tasks = filter_on_dag_ids(serverless_tasks, [job_name])
        serverless_tasks = filter_on_not_dag_run_id(serverless_tasks, ["scheduled__2023-04-02T15:10:00+00:00"])
        serverless_tasks = filter_not_on_running(serverless_tasks)
        enhance_with_duration(serverless_tasks)

        # Order the runs of both systems by queueing time so that the zip
        # below pairs the i-th MWAA run with the i-th sAirflow run.
        mwaa_dag_run["queued_at"] = pd.to_datetime(mwaa_dag_run["queued_at"])
        serverless_dag_run["queued_at"] = pd.to_datetime(serverless_dag_run["queued_at"])
        mwaa_dag_run = mwaa_dag_run.sort_values(by=['queued_at'], ascending=True).reset_index(drop=True)
        serverless_dag_run = serverless_dag_run.sort_values(by=['queued_at'], ascending=True).reset_index(drop=True)

        records = []

        for mwaa_run, serverless_run in zip(mwaa_dag_run["run_id"], serverless_dag_run["run_id"]):
            serverless_tasks_run = serverless_tasks[serverless_tasks["run_id"] == serverless_run]
            mwaa_tasks_run = mwaa_tasks[mwaa_tasks["run_id"] == mwaa_run]

            assert len(mwaa_tasks_run) > 0
            assert len(serverless_tasks_run) == len(mwaa_tasks_run)

            for task in serverless_tasks_run["task_id"]:
                serverless_tasks_run_task = serverless_tasks_run[serverless_tasks_run["task_id"] == task]
                mwaa_tasks_run_task = mwaa_tasks_run[mwaa_tasks_run["task_id"] == task]

                # Each task must appear exactly once in both runs of the pair.
                assert len(serverless_tasks_run_task) == 1
                assert len(mwaa_tasks_run_task) == 1

                mwaa_duration = mwaa_tasks_run_task.iloc[0]["duration"]
                sairflow_duration = serverless_tasks_run_task.iloc[0]["duration"]
                expected_duration = graph.nodes[task]["duration"]

                # Report tasks whose measured durations diverge noticeably
                # between the two systems.
                if abs(mwaa_duration - sairflow_duration) > 4:
                    print({
                        "duration": expected_duration,
                        "task": task,
                        "mwaa": mwaa_duration,
                        "serverless": sairflow_duration,
                        "job": job_name,
                        "mwaa_run": mwaa_run,
                        "serverless_run": serverless_run,
                    })

                records.append({
                    "mwaa": mwaa_duration,
                    "sairflow": sairflow_duration,
                    "task_duration": expected_duration,
                    "mwaa_run_ids": mwaa_run,
                    "s_run_ids": serverless_run,
                    "task_id": task,
                    "job_id": job_name,
                    "indegree_label": "0" if task in indegree_zero_nodes else "other",
                })

        per_job_frames.append(pd.DataFrame(records, columns=result_columns))

    # pd.concat raises on an empty list, so fall back to an empty frame that
    # still carries the expected columns.
    if per_job_frames:
        df = pd.concat(per_job_frames, ignore_index=True)
    else:
        df = pd.DataFrame(columns=result_columns)

    df["normalized_mwaa"] = df["mwaa"] / df["task_duration"]
    df["normalized_sairflow"] = df["sairflow"] / df["task_duration"]
    return df

def _show_task_scatter(data, x, y, title, axis_max, color=None,
                       palette=("#ffa15a",), marker_size=None, output_file=None):
    """Build, show and optionally export one MWAA-vs-sAirflow task scatter chart.

    Args:
        data: DataFrame to plot.
        x: Column plotted on the x axis (axis label "MWAA").
        y: Column plotted on the y axis (axis label "sAirflow").
        title: Chart title.
        axis_max: Upper bound of both axes; both start at 0.
        color: Optional column used to colour the points. When given, the
            legend is shown (titled "DAG", top-right corner); otherwise the
            legend is hidden.
        palette: Discrete colour sequence for the markers.
        marker_size: Optional marker size; Plotly's default is kept when None.
        output_file: Optional file name under ``CHARTS_DIRECTORY`` to export
            the chart to; the chart is only displayed when None.

    Returns:
        The Plotly figure.
    """
    grouped = color is not None

    fig = px.scatter(data,
                     x=x,
                     y=y,
                     height=700,
                     width=700,
                     labels={
                         x: "MWAA",
                         y: "sAirflow",
                     },
                     color=color,
                     color_discrete_sequence=list(palette),
                     )
    fig.update_layout(
        showlegend=grouped,
        title_text=title,
        title_x=0.5,
        paper_bgcolor='rgb(255, 255, 255)',
        plot_bgcolor='rgb(255, 255, 255)',
        font=dict(
            family='Latin Modern Math',
            color='black',
            size=32,
        ),
    )
    # Light-grey grid and a mirrored frame on both axes, same range on each.
    fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey',
                     showline=True, linewidth=2, linecolor='LightGrey', mirror=True,
                     range=[0, axis_max])
    fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey',
                     showline=True, linewidth=2, linecolor='LightGrey', mirror=True,
                     range=[0, axis_max])
    if marker_size is not None:
        fig.update_traces(marker=dict(size=marker_size))
    if grouped:
        fig.update_layout(legend=dict(
            title_text="DAG",
            yanchor="top",
            y=0.99,
            xanchor="right",
            x=0.99
        ))
    fig.show()

    if output_file is not None:
        fig.write_image(f'{CHARTS_DIRECTORY}/{output_file}')

    return fig


def draw_scatter_plot_task_data_duration():
    """Show and export the MWAA-vs-sAirflow task duration and overhead charts.

    The plotted frame is always read from the pickle file
    ``data/draw_scatter_duration_task_i``. When that file does not exist yet it
    is first produced by ``prepare_draw_scatter_plot_task_data_duration()`` and
    pickled to that location. Three columns are then added: ``mwaa_overhead``
    and ``sairflow_overhead`` (measured duration minus ``task_duration``) and
    ``label`` (``"j_3302772"`` for that job, ``"Other"`` for all others).

    Charts produced, in order:
      1. measured task duration, MWAA vs sAirflow;
      2. task duration overhead coloured by ``label`` (axes 0-10);
      3. the same chart zoomed to 0-5;
      4. overhead coloured by ``indegree_label`` (displayed only, not exported);
      5-7. overhead for the single jobs j_3302772, j_3441830 and j_581851.

    No matplotlib figures are opened: all charts are Plotly figures, so a
    plt.figure() call would only create empty, unused matplotlib figures.

    Returns:
        The pandas.DataFrame loaded from the cache, including the added columns.
    """
    df_cache_location = "data/draw_scatter_duration_task_i"

    if not Path(df_cache_location).is_file():
        df = prepare_draw_scatter_plot_task_data_duration()
        df.to_pickle(df_cache_location)

    df = pd.read_pickle(df_cache_location)
    df["mwaa_overhead"] = df["mwaa"] - df["task_duration"]
    df["sairflow_overhead"] = df["sairflow"] - df["task_duration"]
    df["label"] = df["job_id"].apply(lambda j: "Other" if j != "j_3302772" else "j_3302772")

    overhead_title = "Task duration overhead [s]"
    group_palette = ('#3ee67b', '#ab63fa')

    # Measured task duration on both systems.
    _show_task_scatter(df, "mwaa", "sairflow", "Task duration [s]", 60,
                       output_file="traces_task_duration.pdf")

    # Overhead, highlighting job j_3302772 against all other jobs.
    _show_task_scatter(df, "mwaa_overhead", "sairflow_overhead", overhead_title, 10,
                       color="label", palette=group_palette, marker_size=6,
                       output_file="traces_duration_overhead.pdf")

    # Same chart, zoomed in.
    _show_task_scatter(df, "mwaa_overhead", "sairflow_overhead", overhead_title, 5,
                       color="label", palette=group_palette, marker_size=6,
                       output_file="traces_duration_overhead_zoom.pdf")

    # Zoomed overhead coloured by whether the task has no incoming edges.
    # Displayed only; this chart is not exported.
    # NOTE(review): the trailing "bb" in the title looks like a leftover --
    # confirm before publishing this figure.
    _show_task_scatter(df, "mwaa_overhead", "sairflow_overhead",
                       "Task duration overhead (zoom) bb", 5,
                       color="indegree_label", palette=group_palette, marker_size=6)

    # Overhead for individual jobs: (job id, upper axis bound).
    for job_id, axis_max in (("j_3302772", 5), ("j_3441830", 3), ("j_581851", 3)):
        _show_task_scatter(df[df["job_id"] == job_id],
                           "mwaa_overhead", "sairflow_overhead", overhead_title, axis_max,
                           marker_size=6,
                           output_file=f"traces_duration_overhead_{job_id}.pdf")

    return df

In [None]:
# Draw the task duration / overhead charts and keep the frame they were built from in `df`.
df = draw_scatter_plot_task_data_duration()