# Helper code

In [None]:
from plotly.offline import init_notebook_mode

# Set up plotly's notebook integration once, before any figure is shown.
init_notebook_mode()

In [None]:
# Path prefix for exported PDF charts (see box_plot_for_thesis); keep the trailing slash,
# because the file name is appended to it by plain string formatting.
CHARTS_DIRECTORY = "charts/" 

In [None]:
# S3 bucket that get_s3_csv reads the exported CSV files from.
BUCKET_NAME="beeflow-dev-metadata-dumps"

import pandas as pd
import boto3
import plotly.express as px

# One S3 client shared by every download in this notebook.
s3 = boto3.client('s3')

# Parsed CSV exports, keyed by their S3 object key.
s3_cache = {}


def get_s3_csv(file: str):
    """Return the CSV stored under key ``file`` in ``BUCKET_NAME`` as a DataFrame.

    The object is downloaded and parsed only on a cache miss; afterwards the
    very same DataFrame object is returned from ``s3_cache`` on every call.
    """
    try:
        return s3_cache[file]
    except KeyError:
        pass
    response = s3.get_object(Bucket=BUCKET_NAME, Key=file)
    s3_cache[file] = pd.read_csv(response['Body'])
    return s3_cache[file]


from enum import Enum

class System(Enum):
    """Systems under comparison; each value is the first path segment of that system's export key (see get_stats)."""
    MWAA = "mwaa"
    BEEFLOW = "serverless"
    
class Stats(Enum):
    """Exported tables; each value is used both as a folder name and as the CSV file stem (see get_stats)."""
    DAG_RUNS = "dagrun"
    TASKS = "taskinstance"
    
def get_stats(system: System, date: str, stats: Stats):
    """Load one exported table for ``system`` from the ``dt=<date>`` partition.

    The object key has the form ``<system>/export/<table>/dt=<date>/<table>.csv``.
    """
    table = stats.value
    key = f"{system.value}/export/{table}/dt={date}/{table}.csv"
    return get_s3_csv(key)

def filter_on_dag_ids(df, dag_ids):
    """Keep only the rows whose ``dag_id`` is listed in ``dag_ids``."""
    wanted = df['dag_id'].isin(dag_ids)
    return df[wanted]

def filter_on_not_task_ids(df, task_ids):
    """Drop the rows whose ``task_id`` is listed in ``task_ids``."""
    excluded = df['task_id'].isin(task_ids)
    return df[~excluded]

def filter_on_not_dag_run_id(df, dag_run_ids):
    """Drop the rows whose ``run_id`` is listed in ``dag_run_ids``."""
    excluded = df['run_id'].isin(dag_run_ids)
    return df[~excluded]
    

In [None]:
import networkx as nx
from collections import defaultdict

In [None]:
def enhance_with_duration(df):
    """Add a ``duration`` column to ``df`` in place: seconds from ``start_date`` to ``end_date``."""
    started = pd.to_datetime(df['start_date'])
    finished = pd.to_datetime(df['end_date'])
    df['duration'] = (finished - started).dt.total_seconds()
    
def enhance_with_wait_time_dag_runs(df):
    """Add a ``wait_time`` column to ``df`` in place: seconds from ``queued_at`` to ``start_date``."""
    queued = pd.to_datetime(df['queued_at'])
    started = pd.to_datetime(df['start_date'])
    df['wait_time'] = (started - queued).dt.total_seconds()
    
def enhance_with_wait_time_tasks(df, G: nx.DiGraph, dag_runs_df):
    """Add a ``wait_time`` column (seconds) to the task-instance frame ``df``, in place.

    A task's wait time runs from the moment it could first have started until
    its own ``start_date``. That moment is the latest of the DAG run's
    ``queued_at`` and the ``end_date`` of every upstream task in ``G``.

    Args:
        df: task instances with ``run_id``, ``task_id``, ``start_date`` and
            ``end_date`` columns. Every ``task_id`` must be a node of ``G``.
        G: dependency graph; nodes are task ids, edges point from the
            upstream task to the downstream task.
        dag_runs_df: DAG runs with ``run_id`` and ``queued_at`` columns,
            holding exactly one row for every run id present in ``df``.

    Raises:
        AssertionError: if a run does not match exactly one DAG-run row, or a
            graph node does not occur exactly once within a run.
        KeyError: if ``df`` contains a task id that is not a node of ``G``.
    """
    wait_time = defaultdict(dict)

    for run_id, one_run in df.groupby("run_id"):
        dag_run = dag_runs_df[dag_runs_df["run_id"] == run_id]
        assert len(dag_run) == 1

        dag_run_queued_at = pd.to_datetime(dag_run["queued_at"]).iloc[0]

        # Index the run by task id once, instead of re-filtering (and copying)
        # the frame for every node and again for every incoming edge.
        task_ids = one_run["task_id"]
        rows_per_task = task_ids.value_counts().to_dict()
        raw_start = dict(zip(task_ids, one_run["start_date"]))
        raw_end = dict(zip(task_ids, one_run["end_date"]))

        for node in G.nodes():
            found = rows_per_task.get(node, 0)
            assert found == 1, f"len {found}, node {node}"

            node_started_at = pd.to_datetime(raw_start[node])
            node_last_finished_at = dag_run_queued_at

            for upstream, _ in G.in_edges(node):
                found = rows_per_task.get(upstream, 0)
                assert found == 1, f"len {found}, node {upstream}"

                node_last_finished_at = max(node_last_finished_at, pd.to_datetime(raw_end[upstream]))

            wait_time[run_id][node] = (node_started_at - node_last_finished_at).total_seconds()

    # Same row order as df, so the list can be assigned as a column directly.
    df['wait_time'] = [
        wait_time[run_id][task_id]
        for run_id, task_id in zip(df["run_id"], df["task_id"])
    ]

In [None]:
def linear_graph(tasks: int):
    """Return a directed chain ``runme_0 -> runme_1 -> ...`` with ``tasks`` nodes."""
    names = [f"runme_{index}" for index in range(tasks)]
    chain = nx.DiGraph()
    chain.add_nodes_from(names)
    # Pair every node with its successor.
    chain.add_edges_from(zip(names, names[1:]))
    return chain

def parallel_graph(tasks: int):
    """Return a fan-out graph: ``run_before_loop`` points at each of ``tasks`` ``runme_<i>`` nodes."""
    root = "run_before_loop"
    leaves = [f"runme_{index}" for index in range(tasks)]
    fan_out = nx.DiGraph()
    fan_out.add_node(root)
    fan_out.add_nodes_from(leaves)
    fan_out.add_edges_from((root, leaf) for leaf in leaves)
    return fan_out
    

In [None]:
from dataclasses import dataclass
from typing import List

@dataclass
class BoxPlotEntry:
    """One series of a box plot: the rows to draw plus the labels they are drawn under."""
    data: pd.DataFrame  # rows to plot; must contain the measured column
    identifier: str  # label used by box_plot_for
    system: str = ""  # label used by box_plot_for_thesis

In [None]:
def box_plot_for(entries: List[BoxPlotEntry], measured_column="duration", hide_x_axis=False, **kwargs):
    """Show one box (with all points) per entry, labelled and coloured by ``entry.identifier``.

    Args:
        entries: series to draw; each ``entry.data`` must contain ``measured_column``.
        measured_column: column plotted on the y axis.
        hide_x_axis: hide the x axis and its tick labels when true.
        **kwargs: forwarded unchanged to ``plotly.express.box`` (e.g. ``title``,
            ``height``, ``width``, ``hover_data``).

    Raises:
        ValueError: from ``pd.concat`` when ``entries`` is empty.
    """
    # assign() always returns a new frame. The previous `entry.data.loc[:, ]`
    # may hand back the caller's own frame (this depends on the pandas version),
    # in which case the "identifier" column was written into the caller's data.
    labelled = pd.concat(
        [entry.data.assign(identifier=entry.identifier) for entry in entries]
    )
    fig = px.box(labelled, x="identifier", y=measured_column, color="identifier", points="all", **kwargs)
    if hide_x_axis:
        fig.update_xaxes(visible=False, showticklabels=False)
    fig.show()

In [None]:
def box_plot_for_thesis(entries: List[BoxPlotEntry], measured_column="duration", hide_x_axis=False, experiment_id="", postprocess=None, **kwargs):
    """Show a print-styled box plot, one box per ``entry.system``, and export it as PDF.

    The figure is written to ``<CHARTS_DIRECTORY><title>-<experiment_id>.pdf``.

    Args:
        entries: series to draw; each ``entry.data`` must contain ``measured_column``.
        measured_column: column plotted on the y axis.
        hide_x_axis: hide the x axis and its tick labels when true.
        experiment_id: suffix of the exported file name.
        postprocess: optional ``callable(title, fig)`` invoked just before the
            figure is shown, e.g. to add annotations.
        **kwargs: forwarded to ``plotly.express.box``; must contain ``title``.

    Raises:
        KeyError: if ``title`` is not passed.
        ValueError: from ``pd.concat`` when ``entries`` is empty.
    """
    import os  # local import: only needed to prepare the export directory

    title = kwargs["title"]

    # assign() always returns a new frame. The previous `entry.data.loc[:, ]`
    # may hand back the caller's own frame (this depends on the pandas version),
    # in which case the "identifier" column was written into the caller's data.
    labelled = pd.concat(
        [entry.data.assign(identifier=entry.system) for entry in entries]
    )
    fig = px.box(labelled, x="identifier", y=measured_column, color="identifier", points="all", **kwargs)
    if hide_x_axis:
        fig.update_xaxes(visible=False, showticklabels=False)
    fig.update_layout(
        showlegend=False,
        title_text=title,
        title_x=0.5,
        xaxis_title=None,
        paper_bgcolor='rgb(255, 255, 255)',
        plot_bgcolor='rgb(255, 255, 255)',
        font=dict(
            family='Latin Modern Math',
            color='black',
            size=32,
        ),
    )
    fig.update_yaxes(showgrid=False, gridwidth=1, gridcolor='LightGrey')
    fig.update_xaxes(showline=True, linewidth=2, linecolor='LightGrey', mirror=True)
    fig.update_yaxes(showline=True, linewidth=2, linecolor='LightGrey', mirror=True)
    fig.update_yaxes(rangemode="nonnegative")
    if postprocess is not None:
        postprocess(title, fig)
    fig.show()
    # write_image does not create missing directories, so make sure the target exists.
    if CHARTS_DIRECTORY:
        os.makedirs(CHARTS_DIRECTORY, exist_ok=True)
    fig.write_image(f'{CHARTS_DIRECTORY}{title}-{experiment_id}.pdf')

In [None]:
@dataclass
class BoxPlotsEntry:
    """Describes one exported experiment run that should be loaded and compared."""
    system: System  # which system's export to read
    export_date: str  # value of the dt=... partition in the export key (see get_stats)
    dag_ids: List[str]  # only rows of these DAGs are kept
    task_duration: int  # used in labels and titles only; a title prints it with an "s" suffix
    tasks_count: int  # used in labels and as the x value of trend plots
    additional_metadata: str = ""  # free-form text appended to plot identifiers

In [None]:
def system_to_title(system):
    """Return the display name for ``system``: "MWAA" for ``System.MWAA``, "sAirflow" for anything else."""
    return "MWAA" if system == System.MWAA else "sAirflow"

In [None]:
def _box_plot_identifier(entry):
    """Build the verbose per-entry label used by the exploratory (non-thesis) plots."""
    return f"{entry.system.value}_tasks:{entry.tasks_count}_duration:{entry.task_duration}_additional:{entry.additional_metadata}"


def _dag_runs_for_entry(entry, excluded_run_ids):
    """Load the entry's DAG runs, keeping only its dag ids and dropping the excluded run ids."""
    return filter_on_not_dag_run_id(
        df=filter_on_dag_ids(
            df=get_stats(system=entry.system, date=entry.export_date, stats=Stats.DAG_RUNS),
            dag_ids=entry.dag_ids,
        ),
        dag_run_ids=excluded_run_ids,
    )


def box_plots_for(entries: List[BoxPlotsEntry], 
                  not_task_ids, 
                  graph_shape,
                  not_dag_run_ids=None, 
                  hide_x_axis=False, 
                  experiment_id="", 
                  postprocess=None,
                  **kwargs):
    """Draw the full set of comparison box plots for ``entries``.

    Three measurements are drawn, each as an exploratory chart followed by a
    thesis-styled chart (which is also exported as PDF): DAG run duration,
    task duration and task wait time.

    Args:
        entries: exports to compare.
        not_task_ids: task ids left out of the task charts. They are removed
            only after wait times are computed, because the wait-time
            computation needs every node of ``graph_shape`` to be present.
        graph_shape: dependency graph (task ids as nodes) shared by all entries.
        not_dag_run_ids: run ids to exclude everywhere; ``None`` excludes nothing.
        hide_x_axis: hide the x axis of every chart when true.
        experiment_id: suffix of the exported PDF file names.
        postprocess: optional ``callable(title, fig)`` applied to thesis charts.
        **kwargs: forwarded to ``plotly.express.box`` (e.g. ``height``, ``width``).
            Must not contain ``title``, ``hover_data`` or ``labels``, which are
            set here.
    """
    # None instead of a shared mutable [] default.
    excluded_run_ids = [] if not_dag_run_ids is None else not_dag_run_ids

    dags_data = []
    task_duration = None

    for entry in entries:
        task_duration = entry.task_duration
        entry_dag_runs = _dag_runs_for_entry(entry, excluded_run_ids)
        enhance_with_duration(entry_dag_runs)
        enhance_with_wait_time_dag_runs(entry_dag_runs)

        dags_data.append(
            BoxPlotEntry(
                data=entry_dag_runs,
                identifier=_box_plot_identifier(entry),
                system=system_to_title(entry.system),
            )
        )

    box_plot_for(dags_data, 
                 measured_column="duration", 
                 title="DAG runs comparison: duration",
                 hover_data=["run_id"],
                 hide_x_axis=hide_x_axis,
                 **kwargs)
    box_plot_for_thesis(dags_data, 
                 measured_column="duration", 
                 title="DAG makespan",
                 hover_data=["run_id"],
                 labels={
                   "identifier": "System",
                   "duration": "Duration [s]",
                 },
                 experiment_id=experiment_id,
                 hide_x_axis=hide_x_axis,
                 postprocess=postprocess,
                 **kwargs)

    tasks_data = []

    for entry in entries:
        # Keep every task for now: wait times need all nodes of graph_shape.
        entry_tasks = filter_on_not_dag_run_id(
            df=filter_on_dag_ids(
                df=get_stats(system=entry.system, date=entry.export_date, stats=Stats.TASKS), 
                dag_ids=entry.dag_ids,
            ),
            dag_run_ids=excluded_run_ids,
        )
        enhance_with_duration(entry_tasks)
        enhance_with_wait_time_tasks(entry_tasks, graph_shape, _dag_runs_for_entry(entry, excluded_run_ids))

        entry_tasks = filter_on_not_task_ids(entry_tasks, task_ids=not_task_ids)

        tasks_data.append(
            BoxPlotEntry(
                data=entry_tasks,
                identifier=_box_plot_identifier(entry),
                system=system_to_title(entry.system),
            )
        )

    box_plot_for(tasks_data,
                 measured_column="duration", 
                 title=f"Tasks runs comparison: task duration (ideal: {task_duration}s)",
                 hide_x_axis=hide_x_axis,
                 hover_data=["task_id", "run_id"],
                 **kwargs)
    box_plot_for_thesis(tasks_data, 
             measured_column="duration", 
             title="Task duration",
             hover_data=["run_id"],
             labels={
               "identifier": "System",
               "duration": "Duration [s]",
             },
             experiment_id=experiment_id,
             postprocess=postprocess,
             hide_x_axis=hide_x_axis,
             **kwargs)
    box_plot_for(tasks_data,
                 measured_column="wait_time", 
                 title="Tasks run comparision: wait time (amount of time passed from task being marked as queued to started)",
                 hide_x_axis=hide_x_axis,
                 hover_data=["task_id", "run_id"],
                 **kwargs)
    box_plot_for_thesis(tasks_data, 
         measured_column="wait_time", 
         title="Task wait time",
         hover_data=["run_id"],
         labels={
           "identifier": "System",
           "wait_time": "Wait time [s]",
         },
         experiment_id=experiment_id,
         postprocess=postprocess,
         hide_x_axis=hide_x_axis,
         **kwargs)
        

In [None]:
def trend_for(data: pd.DataFrame, x_col="tasks_count", measured_column="duration", hide_x_axis=False, **kwargs):
    """Show a line chart of ``measured_column`` over ``x_col``, one line per ``identifier``.

    Args:
        data: frame with ``x_col``, ``measured_column`` and ``identifier`` columns.
        x_col: column plotted on the x axis.
        measured_column: column plotted on the y axis.
        hide_x_axis: hide the x axis and its tick labels when true. It is a
            named parameter so that it is not forwarded to ``px.line``, which
            does not accept such a keyword.
        **kwargs: forwarded unchanged to ``plotly.express.line``.
    """
    fig = px.line(data, x=x_col, y=measured_column, color="identifier", **kwargs)
    if hide_x_axis:
        fig.update_xaxes(visible=False, showticklabels=False)
    fig.show()

In [None]:
def trends_for(entries: List[BoxPlotsEntry], not_task_ids, not_dag_run_ids=None, graph_shape_for=None, **kwargs):
    """Plot how median DAG and task timings change with the number of tasks.

    Charts drawn, in order: median DAG run duration, median task duration
    and, only when ``graph_shape_for`` is given, median task wait time.

    Args:
        entries: exports to summarise; ``tasks_count`` is the x value.
        not_task_ids: task ids left out of the task medians. They are removed
            after wait times are computed, because that computation needs
            every node of the dependency graph to be present.
        not_dag_run_ids: run ids to exclude everywhere; ``None`` excludes nothing.
        graph_shape_for: optional ``callable(tasks_count)`` returning the
            dependency graph of an entry (e.g. ``linear_graph``). Wait times
            cannot be computed without a graph, so the wait-time chart is
            skipped when this is ``None``.
        **kwargs: forwarded to ``trend_for``.
    """
    # None instead of a shared mutable [] default.
    excluded_run_ids = [] if not_dag_run_ids is None else not_dag_run_ids
    with_wait_time = graph_shape_for is not None

    dags_data = []
    dag_runs_per_entry = []

    for entry in entries:
        entry_dag_runs = filter_on_not_dag_run_id(
            df=filter_on_dag_ids(
                df=get_stats(system=entry.system, date=entry.export_date, stats=Stats.DAG_RUNS), 
                dag_ids=entry.dag_ids,
            ),
            dag_run_ids=excluded_run_ids,
        )
        enhance_with_duration(entry_dag_runs)
        enhance_with_wait_time_dag_runs(entry_dag_runs)
        dag_runs_per_entry.append(entry_dag_runs)

        dags_data.append(
            {
                "median_duration": entry_dag_runs["duration"].median(),
                "identifier": f"{entry.system.value}:{entry.additional_metadata}",
                "tasks_count": entry.tasks_count,
            }
        )

    trend_for(data=pd.DataFrame(dags_data), measured_column="median_duration", title="Median DAG duration", **kwargs)

    tasks_data = []
    for entry, entry_dag_runs in zip(entries, dag_runs_per_entry):
        entry_tasks = filter_on_not_dag_run_id(
            df=filter_on_dag_ids(
                df=get_stats(system=entry.system, date=entry.export_date, stats=Stats.TASKS), 
                dag_ids=entry.dag_ids,
            ),
            dag_run_ids=excluded_run_ids,
        )
        enhance_with_duration(entry_tasks)
        if with_wait_time:
            # The previous call passed neither the graph nor the DAG runs and
            # therefore always raised TypeError.
            enhance_with_wait_time_tasks(entry_tasks, graph_shape_for(entry.tasks_count), entry_dag_runs)
        entry_tasks = filter_on_not_task_ids(entry_tasks, task_ids=not_task_ids)

        summary = {
            "median_duration": entry_tasks["duration"].median(),
            "identifier": f"{entry.system.value}:{entry.additional_metadata}",
            "tasks_count": entry.tasks_count,
        }
        if with_wait_time:
            summary["median_wait_time"] = entry_tasks["wait_time"].median()
        tasks_data.append(summary)

    trend_for(data=pd.DataFrame(tasks_data), measured_column="median_duration", title="Median Tasks duration", **kwargs)
    if with_wait_time:
        trend_for(data=pd.DataFrame(tasks_data), measured_column="median_wait_time", title="Median Tasks waittime", **kwargs)
    
        

In [None]:
from IPython.display import display


def print_tables(entries: List[BoxPlotsEntry]):
    """For every entry, print its system and display its DAG runs with their computed duration."""
    shown_columns = ["duration", "run_id", "start_date", "end_date", "_state"]
    for entry in entries:
        all_dag_runs = get_stats(system=entry.system, date=entry.export_date, stats=Stats.DAG_RUNS)
        dag_runs = filter_on_dag_ids(df=all_dag_runs, dag_ids=entry.dag_ids)
        enhance_with_duration(dag_runs)
        print(entry.system)
        display(dag_runs[shown_columns])

In [None]:
@dataclass
class HistogramEntry:
    """Describes one exported experiment run to include in the histogram comparison."""
    system: System  # which system's export to read
    export_date: str  # value of the dt=... partition in the export key (see get_stats)
    dag_ids: List[str]  # only rows of these DAGs are kept
    task_duration: int  # copied into the plot label
    tasks_count: int  # copied into the plot label

In [None]:
@dataclass
class HistogramPlotEntry:
    """One histogram series: loaded rows plus the values that make up its label."""
    system: System  # its value is the first part of the series label
    data: pd.DataFrame  # rows to plot; must contain the measured column
    task_duration: int  # part of the series label
    tasks_count: int  # part of the series label


def cross_histogram_plot_for(entries: List[HistogramPlotEntry], 
                             nbins=30, 
                             measurment_column_name="duration", 
                             **kwargs):
    """Show grouped histograms (with marginal box plots) of one column, one colour per entry.

    Only the measured column of each entry's data is used. Further keyword
    arguments are forwarded to ``plotly.express.histogram``.
    """
    def labelled(entry):
        label = f"{entry.system.value}_tasks:{entry.tasks_count}_duration:{entry.task_duration}"
        return entry.data[[measurment_column_name]].assign(identifier=label)

    combined = pd.concat([labelled(entry) for entry in entries])
    fig = px.histogram(
        combined,
        y=measurment_column_name,
        color="identifier",
        marginal="box",
        barmode="group",
        nbins=nbins,
        **kwargs,
    )
    fig.show()

In [None]:
def compare_runs_by_histograms_for(entries: List[HistogramEntry], not_task_ids, nbins=30, graph_shape_for=None, **kwargs):
    """Compare exports with histograms of DAG run duration, task duration and task wait time.

    Args:
        entries: exports to compare.
        not_task_ids: task ids left out of the task histograms. They are
            removed after wait times are computed, because that computation
            needs every node of the dependency graph to be present.
        nbins: number of histogram bins.
        graph_shape_for: optional ``callable(tasks_count)`` returning the
            dependency graph of an entry (e.g. ``parallel_graph``). Wait times
            cannot be computed without a graph, so the wait-time histogram is
            skipped when this is ``None``.
        **kwargs: forwarded to ``cross_histogram_plot_for``.
    """
    with_wait_time = graph_shape_for is not None

    # Draw dags
    dags_data = []
    for entry in entries:
        entry_dag_runs = filter_on_dag_ids(
            df=get_stats(system=entry.system, date=entry.export_date, stats=Stats.DAG_RUNS), 
            dag_ids=entry.dag_ids,
        )
        enhance_with_duration(entry_dag_runs)
        enhance_with_wait_time_dag_runs(entry_dag_runs)
        dags_data.append(
            HistogramPlotEntry(
                system=entry.system,
                data=entry_dag_runs,
                task_duration=entry.task_duration,
                tasks_count=entry.tasks_count,
            )
        )
    cross_histogram_plot_for(dags_data, 
                             nbins, 
                             measurment_column_name="duration", 
                             title="DAG runs comparison: duration",
                             **kwargs)
    # Tasks data
    tasks_data = []
    for entry, dag_entry in zip(entries, dags_data):
        entry_tasks = filter_on_dag_ids(
            df=get_stats(system=entry.system, date=entry.export_date, stats=Stats.TASKS), 
            dag_ids=entry.dag_ids,
        )
        enhance_with_duration(entry_tasks)
        if with_wait_time:
            # The previous call passed neither the graph nor the DAG runs and
            # therefore always raised TypeError.
            enhance_with_wait_time_tasks(entry_tasks, graph_shape_for(entry.tasks_count), dag_entry.data)
        entry_tasks = filter_on_not_task_ids(entry_tasks, task_ids=not_task_ids)
        tasks_data.append(
            HistogramPlotEntry(
                system=entry.system,
                data=entry_tasks,
                task_duration=entry.task_duration,
                tasks_count=entry.tasks_count,
            )
        )
    cross_histogram_plot_for(tasks_data, 
                             nbins, 
                             measurment_column_name="duration", 
                             title="Tasks runs comparison: duration",
                             **kwargs)
    if with_wait_time:
        cross_histogram_plot_for(tasks_data, 
                                 nbins, 
                                 measurment_column_name="wait_time", 
                                 title="Tasks run comparision: wait time (start - queued)",
                                 **kwargs)
        

In [None]:
def gantt_task(df, range_x=None):
    """Show a timeline of task instances: one row per ``task_id``, coloured by ``run_id``."""
    timeline = px.timeline(
        df,
        x_start="start_date",
        x_end="end_date",
        y="task_id",
        color="run_id",
        range_x=range_x,
    )
    # Reverse the y axis so the rows read top-down.
    timeline.update_yaxes(autorange="reversed")
    timeline.show()

In [None]:
@dataclass
class TasksChartEntry:
    """Describes one exported experiment run whose tasks are drawn by tasks_plot."""
    system: System  # which system's export to read
    export_date: str  # value of the dt=... partition in the export key (see get_stats)
    dag_ids: List[str]  # only tasks of these DAGs are kept
    task_duration: int  # not read by tasks_plot
    tasks_count: int  # not read by tasks_plot

In [None]:
def tasks_plot(entry: TasksChartEntry, not_task_ids=None, **kwargs):
    """Show a timeline (Gantt-style) chart of the task instances of one export.

    Args:
        entry: export to load; only tasks of ``entry.dag_ids`` are drawn.
        not_task_ids: task ids to leave out; ``None`` leaves nothing out.
        **kwargs: accepted for call compatibility but currently not used.
    """
    # None instead of a shared mutable [] default.
    excluded_task_ids = [] if not_task_ids is None else not_task_ids
    entry_tasks = filter_on_not_task_ids(
        df=filter_on_dag_ids(
            df=get_stats(system=entry.system, date=entry.export_date, stats=Stats.TASKS), 
            dag_ids=entry.dag_ids,
        ),
        task_ids=excluded_task_ids,
    )
    gantt_task(entry_tasks)
    

In [None]:
# Shared keyword arguments for fig.add_annotation(): a black arrow with large
# "Latin Modern Math" text, positioned in data coordinates of the x/y axes.
arrow_standard_args = {
    "xref": "x",
    "yref": "y",
    "showarrow": True,
    "font": {
        "family": 'Latin Modern Math',
        "color": 'black',
        "size": 32,
    },
    "align": "center",
    "arrowhead": 2,
    "arrowsize": 1,
    "arrowwidth": 2,
    "arrowcolor": "black",
    "ax": 100,
    "ay": 0,
    "borderwidth": 0,
    "borderpad": 4,
    "opacity": 0.8,
}

# Points

## Beeflow's performance for line DAGs rivals MWAA

### 1 task

In [None]:
# DAG "10s_1t_line" (1 task, task_duration=10): one export per system.
entries = [
    BoxPlotsEntry(
        system=system,
        export_date=exported_at,
        tasks_count=1,
        task_duration=10,
        dag_ids=["10s_1t_line"],
    )
    for system, exported_at in (
        (System.BEEFLOW, "2023-03-12_18-48"),
        (System.MWAA, "2023-03-12_19-01"),
    )
]

In [None]:
# 1-task line DAG; two scheduled runs are excluded from every chart.
box_plots_for(
    entries=entries,
    graph_shape=linear_graph(1),
    not_task_ids=[],
    not_dag_run_ids=[
        "scheduled__2023-03-12T17:35:00+00:00",
        "scheduled__2023-03-12T17:40:00+00:00",
    ],
    experiment_id="1task_line",
    height=600,
    width=700,
)

In [None]:
def postprocess(title, fig):
    """Point a "cold start" arrow at the outlier on the 1-task line charts.

    Intended as the ``postprocess`` hook of box_plots_for, which calls it with
    the title of each thesis chart and the figure itself. The x/y positions
    are hard-coded for this experiment's data.
    """
    # box_plots_for titles the DAG chart "DAG makespan". The previous check for
    # "DAG run duration" matched no chart, so this annotation was never drawn.
    if title == "DAG makespan":
        fig.add_annotation(
            x=-0.30,
            y=24.95,
            **arrow_standard_args,
            text="cold start",
        )

    if title == "Task wait time":
        fig.add_annotation(
            x=-0.30,
            y=11.46,
            **arrow_standard_args,
            text="cold start",
        )
        

# Same 1-task comparison, but only the 17:35 run is excluded; charts are annotated via postprocess.
box_plots_for(
    entries=entries,
    not_task_ids=[],
    graph_shape=linear_graph(1),
    not_dag_run_ids=["scheduled__2023-03-12T17:35:00+00:00"],
    postprocess=postprocess,
    experiment_id="1task_line_withcold",
    width=700,
    height=600,
)

### 5 tasks

In [None]:
# DAG "10s_5t_line" (5 tasks, task_duration=10): one export per system.
entries = [
    BoxPlotsEntry(
        system=system,
        export_date=exported_at,
        tasks_count=5,
        task_duration=10,
        dag_ids=["10s_5t_line"],
    )
    for system, exported_at in (
        (System.BEEFLOW, "2023-03-12_20-19"),
        (System.MWAA, "2023-03-12_20-30"),
    )
]

In [None]:
# 5-task line DAG; the 19:05 scheduled run is excluded from every chart.
box_plots_for(
    entries=entries,
    not_task_ids=[],
    graph_shape=linear_graph(5),
    not_dag_run_ids=["scheduled__2023-03-12T19:05:00+00:00"],
    experiment_id="5task_line",
    width=700,
    height=700,
)

### 10 tasks

In [None]:
# DAG "10s_10t_line" (10 tasks, task_duration=10): one export per system.
entries = [
    BoxPlotsEntry(
        system=system,
        export_date=exported_at,
        tasks_count=10,
        task_duration=10,
        dag_ids=["10s_10t_line"],
    )
    for system, exported_at in (
        (System.BEEFLOW, "2023-03-12_21-48"),
        (System.MWAA, "2023-03-12_21-59"),
    )
]

In [None]:
# 10-task line DAG; the 20:35 scheduled run is excluded from every chart.
box_plots_for(
    entries=entries,
    not_task_ids=[],
    graph_shape=linear_graph(10),
    not_dag_run_ids=["scheduled__2023-03-12T20:35:00+00:00"],
    experiment_id="10task_line",
    width=700,
    height=700,
)

In [None]:
# DAG "10s_15t_line" (15 tasks, task_duration=10), taken from the December 2022 exports.
entries = [
    BoxPlotsEntry(
        system=system,
        export_date=exported_at,
        tasks_count=15,
        task_duration=10,
        dag_ids=["10s_15t_line"],
    )
    for system, exported_at in (
        (System.MWAA, "2022-12-11_20-45"),
        (System.BEEFLOW, "2022-12-28_16-43"),
    )
]

In [None]:
# graph_shape is a required argument of box_plots_for; without it this call raised TypeError.
# The entries describe the 15-task line DAG, hence a 15-node chain.
# NOTE(review): assumes the task ids in these 2022 exports are runme_0..runme_14 - confirm.
box_plots_for(entries=entries, not_task_ids=[], graph_shape=linear_graph(15), height=700, width=1500)

In [None]:
# Line DAGs with 5, 10 and 15 tasks (task_duration=10): one MWAA and one Beeflow export per size.
entries = [
    BoxPlotsEntry(
        system=system,
        export_date=exported_at,
        tasks_count=tasks_count,
        task_duration=10,
        dag_ids=[f"10s_{tasks_count}t_line"],
    )
    for system, exported_at, tasks_count in (
        (System.MWAA, "2022-12-11_17-47", 5),
        (System.BEEFLOW, "2022-12-28_13-52", 5),
        (System.MWAA, "2022-12-11_19-16", 10),
        (System.BEEFLOW, "2022-12-28_15-24", 10),
        (System.MWAA, "2022-12-11_20-45", 15),
        (System.BEEFLOW, "2022-12-28_16-43", 15),
    )
]

In [None]:
# Median-timing trends over the 5/10/15-task line DAG exports; no task ids are excluded.
trends_for(entries=entries, not_task_ids=[])

## Beeflow is more performant for warm starts (parallel)

### 16 tasks

In [None]:
# DAG "10s_16t" (16 tasks, task_duration=10): one export per system.
entries = [
    BoxPlotsEntry(
        system=system,
        export_date=exported_at,
        tasks_count=16,
        task_duration=10,
        dag_ids=["10s_16t"],
        additional_metadata=setup,
    )
    for system, exported_at, setup in (
        (System.BEEFLOW, "2023-03-14_15-04", "db.small"),
        (System.MWAA, "2023-03-14_15-15", "20workers{min==max}_10task{for worker}"),
    )
]

In [None]:
# 16 parallel tasks; the root task and the 13:50 run are kept out of the task charts.
box_plots_for(
    entries=entries,
    not_task_ids=["run_before_loop"],
    graph_shape=parallel_graph(16),
    not_dag_run_ids=["scheduled__2023-03-14T13:50:00+00:00"],
    experiment_id="16task_parallel",
    width=700,
    height=700,
)

### 32 tasks

In [None]:
# DAG "10s_32t" (32 tasks, task_duration=10): one export per system.
entries = [
    BoxPlotsEntry(
        system=system,
        export_date=exported_at,
        tasks_count=32,
        task_duration=10,
        dag_ids=["10s_32t"],
        additional_metadata=setup,
    )
    for system, exported_at, setup in (
        (System.BEEFLOW, "2023-03-14_16-33", "db.small"),
        (System.MWAA, "2023-03-14_16-44", "20workers{min==max}_10task{for worker}"),
    )
]

In [None]:
# 32 parallel tasks; the root task and the 15:20 run are kept out of the task charts.
box_plots_for(
    entries=entries,
    not_task_ids=["run_before_loop"],
    graph_shape=parallel_graph(32),
    not_dag_run_ids=["scheduled__2023-03-14T15:20:00+00:00"],
    experiment_id="32task_parallel",
    width=700,
    height=700,
)

### 64 tasks

In [None]:
# DAG "10s_64t" (64 tasks, task_duration=10): one export per system.
entries = [
    BoxPlotsEntry(
        system=system,
        export_date=exported_at,
        tasks_count=64,
        task_duration=10,
        dag_ids=["10s_64t"],
        additional_metadata=setup,
    )
    for system, exported_at, setup in (
        (System.BEEFLOW, "2023-03-14_18-02", "db.small"),
        (System.MWAA, "2023-03-14_18-13", "20workers{min==max}_10task{for worker}"),
    )
]

In [None]:
def postprocess(title, fig):
    """Add the cold-start arrows for the 64-task parallel charts (positions are hard-coded for this data)."""
    arrows = (
        ("Task wait time", 14.46, "cold starts"),
        ("DAG makespan", 31.06, "1+ cold start"),
    )
    for chart_title, y_position, caption in arrows:
        if title == chart_title:
            fig.add_annotation(
                x=-0.30,
                y=y_position,
                **arrow_standard_args,
                text=caption,
            )

# 64 parallel tasks; the root task and the 16:50 run are left out, charts are annotated via postprocess.
box_plots_for(
    entries=entries,
    not_task_ids=["run_before_loop"],
    graph_shape=parallel_graph(64),
    not_dag_run_ids=["scheduled__2023-03-14T16:50:00+00:00"],
    postprocess=postprocess,
    experiment_id="64task_parallel",
    width=700,
    height=700,
)

### 125 tasks

In [None]:
# DAG "10s_125t" (125 tasks, task_duration=10): one export per system.
entries = [
    BoxPlotsEntry(
        system=system,
        export_date=exported_at,
        tasks_count=125,
        task_duration=10,
        dag_ids=["10s_125t"],
        additional_metadata=setup,
    )
    for system, exported_at, setup in (
        (System.BEEFLOW, "2023-03-14_19-31", "db.small"),
        (System.MWAA, "2023-03-14_19-42", "20workers{min==max}_10task{for worker}"),
    )
]

In [None]:
def postprocess(title, fig):
    """Add the cold-start arrows for the 125-task parallel charts (positions are hard-coded for this data)."""
    arrows = (
        ("Task wait time", 18.06, "cold starts"),
        ("DAG makespan", 36.06, "1+ cold start"),
    )
    for chart_title, y_position, caption in arrows:
        if title == chart_title:
            fig.add_annotation(
                x=-0.30,
                y=y_position,
                **arrow_standard_args,
                text=caption,
            )

# 125 parallel tasks; the root task and three scheduled runs are left out, charts are annotated via postprocess.
box_plots_for(
    entries=entries,
    not_task_ids=["run_before_loop"],
    graph_shape=parallel_graph(125),
    not_dag_run_ids=[
        "scheduled__2023-03-14T18:20:00+00:00",
        "scheduled__2023-03-14T18:15:00+00:00",
        "scheduled__2023-03-14T18:25:00+00:00",
    ],
    postprocess=postprocess,
    experiment_id="125task_parallel",
    width=700,
    height=700,
)

## Beeflow's autoscaling is faster (parallel)

### 16 tasks

In [None]:
# DAG "10s_16t_30cron" (16 tasks, task_duration=10): one export per system.
entries = [
    BoxPlotsEntry(
        system=system,
        export_date=exported_at,
        tasks_count=16,
        task_duration=10,
        dag_ids=["10s_16t_30cron"],
        additional_metadata=setup,
    )
    for system, exported_at, setup in (
        (System.BEEFLOW, "2023-03-21_15-40", "coldstart"),
        (System.MWAA, "2023-03-21_15-51", "1workermin_20workersmax_10task{for worker}"),
    )
]

In [None]:
# 16 parallel tasks, "30cron" DAG; no runs are excluded, only the root task is hidden.
box_plots_for(
    entries=entries,
    not_task_ids=["run_before_loop"],
    graph_shape=parallel_graph(16),
    experiment_id="16task_parallel_cold",
    width=700,
    height=700,
)

### 32 tasks

In [None]:
# DAG "10s_32t_30cron" (32 tasks, task_duration=10): one export per system.
entries = [
    BoxPlotsEntry(
        system=system,
        export_date=exported_at,
        tasks_count=32,
        task_duration=10,
        dag_ids=["10s_32t_30cron"],
        additional_metadata=setup,
    )
    for system, exported_at, setup in (
        (System.BEEFLOW, "2023-03-21_17-41", "coldstarts"),
        (System.MWAA, "2023-03-21_17-52", "1workermin_20workersmax_10task{for worker}"),
    )
]

In [None]:
# 32 parallel tasks, "30cron" DAG.
# NOTE(review): the excluded run id is dated 2023-01-08 while the exports are from
# 2023-03-21 - confirm this filter still matches a run.
box_plots_for(
    entries=entries,
    not_task_ids=["run_before_loop"],
    graph_shape=parallel_graph(32),
    not_dag_run_ids=["scheduled__2023-01-08T18:00:00+00:00"],
    experiment_id="32task_parallel_cold",
    width=700,
    height=700,
)

### 64 tasks

In [None]:
# DAG "10s_64t_30cron" (64 tasks, task_duration=10): one export per system.
entries = [
    BoxPlotsEntry(
        system=system,
        export_date=exported_at,
        tasks_count=64,
        task_duration=10,
        dag_ids=["10s_64t_30cron"],
        additional_metadata=setup,
    )
    for system, exported_at, setup in (
        (System.BEEFLOW, "2023-03-21_19-41", "coldstarts"),
        (System.MWAA, "2023-03-21_19-52", "1workermin_20workersmax_10task{for worker}"),
    )
]

In [None]:
# 64 parallel tasks, "30cron" DAG.
# NOTE(review): the excluded run id is dated 2023-01-08 while the exports are from
# 2023-03-21 - confirm this filter still matches a run.
box_plots_for(
    entries=entries,
    not_task_ids=["run_before_loop"],
    graph_shape=parallel_graph(64),
    not_dag_run_ids=["scheduled__2023-01-08T18:30:00+00:00"],
    experiment_id="64task_parallel_cold",
    width=700,
    height=700,
)

### 125 tasks

In [None]:
# DAG "10s_125t_30cron" (125 tasks, task_duration=10): one export per system.
# NOTE(review): the MWAA label says "20workers{min==max}" whereas the 16/32/64-task
# cells of this section say "1workermin_20workersmax" - confirm which setup was used.
entries = [
    BoxPlotsEntry(
        system=system,
        export_date=exported_at,
        tasks_count=125,
        task_duration=10,
        dag_ids=["10s_125t_30cron"],
        additional_metadata=setup,
    )
    for system, exported_at, setup in (
        (System.BEEFLOW, "2023-03-21_21-42", "db.small.coldstarts"),
        (System.MWAA, "2023-03-21_21-53", "20workers{min==max}_10task{for worker}"),
    )
]

In [None]:
# 125 parallel tasks, "30cron" DAG.
# NOTE(review): the excluded run ids are dated 2023-01-18 while the exports are from
# 2023-03-21 - confirm these filters still match a run.
box_plots_for(
    entries=entries,
    graph_shape=parallel_graph(125),
    not_task_ids=["run_before_loop"],
    not_dag_run_ids=[
        "scheduled__2023-01-18T17:00:00+00:00",
        "scheduled__2023-01-18T16:30:00+00:00",
        "scheduled__2023-01-18T17:30:00+00:00",
    ],
    experiment_id="125task_parallel_cold",
    width=700,
    height=700,
)

In [None]:
# "30cron" DAGs with 16, 32, 64 and 125 tasks (task_duration=10) from the 2023-01-18 exports.
# Beeflow entries carry the "coldstarts" label; MWAA entries have no additional metadata.
entries = [
    BoxPlotsEntry(
        system=system,
        export_date=exported_at,
        tasks_count=tasks_count,
        task_duration=10,
        dag_ids=[f"10s_{tasks_count}t_30cron"],
        additional_metadata="coldstarts" if system == System.BEEFLOW else "",
    )
    for system, exported_at, tasks_count in (
        (System.BEEFLOW, "2023-01-18_12-18", 16),
        (System.MWAA, "2023-01-18_12-29", 16),
        (System.BEEFLOW, "2023-01-18_14-19", 32),
        (System.MWAA, "2023-01-18_14-30", 32),
        (System.BEEFLOW, "2023-01-18_16-20", 64),
        (System.MWAA, "2023-01-18_16-31", 64),
        (System.BEEFLOW, "2023-01-18_18-21", 125),
        (System.MWAA, "2023-01-18_18-32", 125),
    )
]

In [None]:
# Median-timing trends over the "30cron" exports; three scheduled runs are excluded.
trends_for(
    entries,
    not_task_ids=[],
    not_dag_run_ids=[
        "scheduled__2023-01-18T17:00:00+00:00",
        "scheduled__2023-01-18T16:30:00+00:00",
        "scheduled__2023-01-18T17:30:00+00:00",
    ],
)

In [None]:
# Tests

In [None]:
# Beeflow only: DAG "30s_125t" (125 tasks, task_duration=30), one export per metadata label.
entries = [
    BoxPlotsEntry(
        system=System.BEEFLOW,
        export_date=exported_at,
        tasks_count=125,
        task_duration=30,
        dag_ids=["30s_125t"],
        additional_metadata=setup,
    )
    for exported_at, setup in (
        ("2023-01-19_15-42", "db.small"),
        ("2023-01-21_16-27", "db.m5.large"),
        ("2023-01-21_17-42", "db.m4.xlarge"),
    )
]


In [None]:
# graph_shape is a required argument of box_plots_for; without it this call raised TypeError.
# NOTE(review): parallel_graph(125) is chosen because the other 125-task cells that hide
# "run_before_loop" use it - confirm that DAG "30s_125t" has the same shape.
box_plots_for(entries=entries, 
              not_task_ids=["run_before_loop"],
              graph_shape=parallel_graph(125),
              height=700, 
              width=1500)

In [None]:
# Export dates passed as export_date for System.BEEFLOW; the loop below zips this
# list with experiment_result_mwaa_dates and job_names position by position.
experiment_result_serverless_dates = [
    "2023-04-01_23-07",
    "2023-04-02_00-36",
    "2023-04-02_02-05",
    "2023-04-02_03-34",
    "2023-04-02_05-03",
    "2023-04-02_06-32",
    "2023-04-02_12-36",
    "2023-04-02_14-05",
    "2023-04-02_15-34",
    "2023-04-02_17-03",
    "2023-04-02_18-32",
    "2023-04-02_20-01",
    "2023-04-02_21-30",
    "2023-04-02_22-59",
    "2023-04-03_00-28",
    "2023-04-03_01-57",
    "2023-04-03_03-26",
    "2023-04-03_04-55",
    "2023-04-03_06-24",
    "2023-04-03_07-53",
    "2023-04-03_09-22",
    "2023-04-03_10-51",
    "2023-04-03_12-20",
    "2023-04-03_13-49",
    "2023-04-03_15-18",
    "2023-04-03_22-48",
    "2023-04-04_00-17",
    "2023-04-04_01-46",
    "2023-04-04_03-15",
    "2023-04-04_04-44"
]

# Export dates passed as export_date for System.MWAA, in the same positional order
# as experiment_result_serverless_dates and job_names.
experiment_result_mwaa_dates = [
    "2023-04-01_23-18",
    "2023-04-02_00-48",
    "2023-04-02_02-17",
    "2023-04-02_03-46",
    "2023-04-02_05-15",
    "2023-04-02_06-44",
    "2023-04-02_12-48",
    "2023-04-02_14-16",
    "2023-04-02_15-45",
    "2023-04-02_17-14",
    "2023-04-02_18-43",
    "2023-04-02_20-12",
    "2023-04-02_21-41",
    "2023-04-02_23-10",
    "2023-04-03_00-39",
    "2023-04-03_02-08",
    "2023-04-03_03-37",
    "2023-04-03_05-06",
    "2023-04-03_06-35",
    "2023-04-03_08-04",
    "2023-04-03_09-33",
    "2023-04-03_11-02",
    "2023-04-03_12-31",
    "2023-04-03_14-00",
    "2023-04-03_15-29",
    "2023-04-03_22-59",
    "2023-04-04_00-28",
    "2023-04-04_01-57",
    "2023-04-04_03-26",
    "2023-04-04_04-56",
]

# DAG ids of the jobs; the loop below uses each one as dag_ids=[job] and as the experiment_id.
job_names = [
  "j_3302772",
  "j_440175",
  "j_861714",
  "j_3441830",
  "j_1657836",
  "j_3672699",
  "j_581851",
  "j_1393008",
  "j_2779966",
  "j_1507987",
  "j_2875170",
  "j_2208835",
  "j_3962513",
  "j_3197222",
  "j_125835",
  "j_2749804",
  "j_3684272",
  "j_964836",
  "j_3505701",
  "j_18436",
  "j_1526622",
  "j_1231082",
  "j_1518466",
  "j_189171",
  "j_1903033",
  "j_410587",
  "j_2094884",
  "j_1043738",
  "j_1760399",
  "j_2821440"
]

In [None]:
# The three lists are zipped position by position, so they must have equal length.
# (The previous check referenced `experiment_result_dates`, which is never defined.)
assert len(job_names) == len(experiment_result_serverless_dates) == len(experiment_result_mwaa_dates)

In [None]:
# Draw the comparison charts for each job, pairing the Beeflow and MWAA export of that job.
# NOTE(review): job_names[:10] limits the loop to the first 10 of the 30 jobs - confirm this is intended.
# NOTE(review): box_plots_for requires a graph_shape argument that is not passed here, so this
# call raises TypeError as written; the jobs' DAG shapes are not visible in this notebook.
for serverless_date, mwaa_date, job in zip(experiment_result_serverless_dates, experiment_result_mwaa_dates, job_names[:10]):
    entries = [
        BoxPlotsEntry(
            system=System.BEEFLOW,
            export_date=serverless_date,
            tasks_count=1,
            task_duration=10,
            dag_ids=[job]
        ),
        BoxPlotsEntry(
            system=System.MWAA,
            export_date=mwaa_date,
            tasks_count=1,
            task_duration=10,
            dag_ids=[job]
        ),
    ]
    
    box_plots_for(
        entries=entries, 
        not_task_ids=[], 
        height=700, 
        width=700, 
        not_dag_run_ids=[], 
        experiment_id=job)
    