# Helper code

In [None]:
from plotly.offline import init_notebook_mode; init_notebook_mode()

In [None]:
# Path prefix for exported chart PDFs. It is concatenated directly with the file name, so it must end with "/".
CHARTS_DIRECTORY = "charts/" 

In [None]:
# S3 bucket that get_s3_csv reads the exported CSV files from.
BUCKET_NAME="beeflow-dev-metadata-dumps"

import pandas as pd
import boto3
import plotly.express as px

# Shared S3 client used by get_s3_csv.
s3 = boto3.client('s3') 

# In-memory cache of downloaded CSVs, keyed by S3 object key (filled by get_s3_csv).
s3_cache = {}

def get_s3_csv(file: str):
    """Return the CSV stored under key ``file`` in ``BUCKET_NAME`` as a DataFrame.

    Results are memoised in ``s3_cache``: every key is fetched at most once and
    repeated calls hand back the very same DataFrame object.
    """
    try:
        return s3_cache[file]
    except KeyError:
        pass
    response = s3.get_object(Bucket=BUCKET_NAME, Key=file)
    frame = s3_cache[file] = pd.read_csv(response['Body'])
    return frame


from enum import Enum

class System(Enum):
    """Systems under comparison; the value is the first path segment of the export key built by get_stats."""
    MWAA = "mwaa"
    # Beeflow exports are stored under the "serverless" prefix.
    BEEFLOW = "serverless"
    
class Stats(Enum):
    """Exported tables; the value is used as both the directory and the CSV file name in get_stats."""
    DAG_RUNS = "dagrun"
    TASKS = "taskinstance"
    
def get_stats(system: System, date: str, stats: Stats):
    """Load one exported table for ``system`` from the export partition ``dt=<date>``.

    The object key is ``<system>/export/<table>/dt=<date>/<table>.csv``; the
    download itself (and its caching) is delegated to ``get_s3_csv``.
    """
    table = stats.value
    key = f"{system.value}/export/{table}/dt={date}/{table}.csv"
    return get_s3_csv(key)

def filter_on_dag_ids(df, dag_ids):
    """Return the rows of ``df`` whose ``dag_id`` is one of ``dag_ids``."""
    wanted = df['dag_id'].isin(dag_ids)
    return df[wanted]

def filter_on_not_task_ids(df, task_ids):
    """Return the rows of ``df`` whose ``task_id`` is NOT one of ``task_ids``."""
    excluded = df['task_id'].isin(task_ids)
    return df[~excluded]

def filter_on_not_dag_run_id(df, dag_run_ids):
    """Return the rows of ``df`` whose ``run_id`` is NOT one of ``dag_run_ids``."""
    excluded = df['run_id'].isin(dag_run_ids)
    return df[~excluded]
    

In [None]:
import networkx as nx
from collections import defaultdict
import datetime

In [None]:
def enhance_with_duration(df):
    """Add a ``duration`` column to ``df`` in place.

    The value is the number of seconds between ``start_date`` and ``end_date``
    of each row; nothing is returned.
    """
    started = pd.to_datetime(df['start_date'])
    finished = pd.to_datetime(df['end_date'])
    df['duration'] = (finished - started).dt.total_seconds()
    
def enhance_with_wait_time_dag_runs(df):
    """Add a ``wait_time`` column to ``df`` in place.

    The value is the number of seconds between ``queued_at`` and ``start_date``
    of each row; nothing is returned.
    """
    queued = pd.to_datetime(df['queued_at'])
    started = pd.to_datetime(df['start_date'])
    df['wait_time'] = (started - queued).dt.total_seconds()
    
def enhance_with_wait_time_tasks(df, G: nx.DiGraph, dag_runs_df):
    """Add a ``wait_time`` column (in seconds) to the task-instance frame ``df`` in place.

    For every DAG run and every node of ``G`` the wait time is the task's
    ``start_date`` minus the moment it became runnable, where "runnable" is the
    latest of the DAG run's ``queued_at`` and the ``end_date`` of every direct
    upstream task (the sources of the node's in-edges in ``G``).

    Args:
        df: task instances with ``dag_id``, ``run_id``, ``task_id``,
            ``start_date`` and ``end_date`` columns. Every run must contain
            exactly one row per node of ``G``.
        G: dependency graph whose node names are task ids.
        dag_runs_df: DAG runs with ``dag_id``, ``run_id`` and ``queued_at``
            columns; exactly one row per (dag_id, run_id) present in ``df``.

    Raises:
        AssertionError: if a DAG has no runs, or a run / task row is missing
            or duplicated.
        KeyError: if ``df`` contains a task id that is not a node of ``G``.
    """
    # Keyed by (dag_id, run_id, task_id). Scheduled run ids repeat across DAGs,
    # so keying by run_id alone would let one DAG overwrite another's values.
    wait_time = {}

    for dag_id, one_dag in df.groupby("dag_id"):
        by_dag = dag_runs_df[dag_runs_df["dag_id"] == dag_id]
        assert len(by_dag) > 0

        for run_id, one_run in one_dag.groupby("run_id"):
            dag_run = by_dag[by_dag["run_id"] == run_id]
            assert len(dag_run) == 1, f"{len(dag_run)}, for {run_id}"

            dag_run_queued_at = pd.to_datetime(dag_run["queued_at"]).iloc[0]

            for node in G.nodes():
                current_node = one_run[one_run["task_id"] == node]
                assert len(current_node) == 1, f"len {len(current_node)}, node {node}"

                node_started_at = pd.to_datetime(current_node["start_date"]).iloc[0]
                # A task with no upstream becomes runnable when the DAG run is queued.
                node_last_finished_at = dag_run_queued_at

                for upstream, _ in G.in_edges(node):
                    upstream_node = one_run[one_run["task_id"] == upstream]
                    assert len(upstream_node) == 1, f"len {len(upstream_node)}, node {upstream}"

                    node_last_finished_at = max(
                        node_last_finished_at,
                        pd.to_datetime(upstream_node["end_date"]).iloc[0],
                    )

                wait_time[(dag_id, run_id, node)] = (node_started_at - node_last_finished_at).total_seconds()

    # Column-wise zip instead of iterrows: same row order, far less overhead.
    df['wait_time'] = [
        wait_time[key]
        for key in zip(df["dag_id"], df["run_id"], df["task_id"])
    ]

In [None]:
def linear_graph(tasks: int):
    """Build a chain ``runme_0 -> runme_1 -> ... -> runme_{tasks-1}`` as a DiGraph."""
    names = [f"runme_{index}" for index in range(tasks)]
    chain = nx.DiGraph()
    chain.add_nodes_from(names)
    # Pair every task with its successor; yields nothing for 0 or 1 tasks.
    chain.add_edges_from(zip(names, names[1:]))
    return chain

def parallel_graph(tasks: int):
    """Build a fan-out DiGraph: ``run_before_loop`` points at each of ``runme_0..runme_{tasks-1}``."""
    root = "run_before_loop"
    leaves = [f"runme_{index}" for index in range(tasks)]
    fan_out = nx.DiGraph()
    fan_out.add_nodes_from([root])
    fan_out.add_nodes_from(leaves)
    fan_out.add_edges_from((root, leaf) for leaf in leaves)
    return fan_out
    

In [None]:
from dataclasses import dataclass
from typing import List

@dataclass
class BoxPlotEntry:
    """One data set drawn as a single box by box_plot_for / box_plot_for_thesis."""
    # Rows to plot; must contain the column passed as `measured_column`.
    data: pd.DataFrame
    # Label used as the box name by box_plot_for.
    identifier: str
    # Label used as the box name by box_plot_for_thesis.
    system: str = ""

In [None]:
def box_plot_for(entries: List[BoxPlotEntry], measured_column="duration", hide_x_axis=False, **kwargs):
    """Show an exploratory box plot with one box per entry, labelled by ``entry.identifier``.

    Args:
        entries: data sets to compare; each ``data`` frame must contain
            ``measured_column``.
        measured_column: column plotted on the y axis.
        hide_x_axis: hide the x axis (useful when identifiers are long).
        **kwargs: forwarded to ``plotly.express.box`` (title, hover_data, height, ...).

    Raises:
        ValueError: from ``pd.concat`` when ``entries`` is empty.
    """
    # assign() returns a new frame, so the caller's data is never modified;
    # writing into `entry.data.loc[:, ]` could touch the original frame.
    labelled = [entry.data.assign(identifier=entry.identifier) for entry in entries]
    combined = pd.concat(labelled)
    fig = px.box(combined, x="identifier", y=measured_column, color="identifier", points="all", **kwargs)
    if hide_x_axis:
        fig.update_xaxes(visible=False, showticklabels=False)
    fig.show()

In [None]:
def box_plot_for_thesis(entries: List[BoxPlotEntry], measured_column="duration", hide_x_axis=False, experiment_id="", postprocess=None, **kwargs):
    """Show a publication-styled box plot (one box per ``entry.system``) and export it as PDF.

    The figure is written to ``{CHARTS_DIRECTORY}{title}-{experiment_id}.pdf``.

    Args:
        entries: data sets to compare; each ``data`` frame must contain
            ``measured_column``.
        measured_column: column plotted on the y axis.
        hide_x_axis: hide the x axis entirely.
        experiment_id: suffix of the exported file name.
        postprocess: optional ``callable(title, fig)`` invoked just before the
            figure is shown, e.g. to add annotations.
        **kwargs: forwarded to ``plotly.express.box``; must include ``title``.

    Raises:
        KeyError: if ``title`` is not passed in ``kwargs``.
        ValueError: from ``pd.concat`` when ``entries`` is empty.
    """
    # Read the title first so a missing one fails before any plotting work.
    title = kwargs["title"]
    # assign() returns a new frame, so the caller's data is never modified;
    # writing into `entry.data.loc[:, ]` could touch the original frame.
    labelled = [entry.data.assign(identifier=entry.system) for entry in entries]
    combined = pd.concat(labelled)
    fig = px.box(combined, x="identifier", y=measured_column, color="identifier", points="all", **kwargs)
    if hide_x_axis:
        fig.update_xaxes(visible=False, showticklabels=False)
    fig.update_layout(showlegend=False)
    fig.update_layout(title_text=title, title_x=0.5)
    fig.update_layout(
        xaxis_title=None,
        paper_bgcolor='rgb(255, 255, 255)',
        plot_bgcolor='rgb(255, 255, 255)',
        font=dict(
            family='Latin Modern Math',
            color='black',
            size=32,
        ),
    )
    fig.update_yaxes(showgrid=False, gridwidth=1, gridcolor='LightGrey')
    fig.update_xaxes(showline=True, linewidth=2, linecolor='LightGrey', mirror=True)
    fig.update_yaxes(showline=True, linewidth=2, linecolor='LightGrey', mirror=True)
    fig.update_yaxes(rangemode="nonnegative")
    if postprocess is not None:
        postprocess(title, fig)
    fig.show()
    fig.write_image(f'{CHARTS_DIRECTORY}{title}-{experiment_id}.pdf')

In [None]:
@dataclass
class BoxPlotsEntry:
    """One experiment export (a system plus an export partition) to include in a comparison."""
    # Which system produced the export.
    system: System
    # Export partition name, passed to get_stats as `date`.
    export_date: str
    # DAG ids whose runs and tasks are kept from the export.
    dag_ids: List[str]
    # Shown in plot identifiers and titles; not used in any computation here.
    task_duration: int
    # Shown in plot identifiers; also the x value in trend plots.
    tasks_count: int
    # Free-form text appended to plot identifiers.
    additional_metadata: str = ""

In [None]:
def system_to_title(system):
    """Map a ``System`` to its display name: "MWAA" for MWAA, "sAirflow" for anything else."""
    return "MWAA" if system == System.MWAA else "sAirflow"

In [None]:
def _box_plots_identifier(entry):
    """Build the verbose per-entry label used by the exploratory (non-thesis) plots."""
    return f"{entry.system.value}_tasks:{entry.tasks_count}_duration:{entry.task_duration}_additional:{entry.additional_metadata}"


def _box_plots_load(entry, stats, not_dag_run_ids):
    """Load one export table for ``entry``, limited to its DAG ids and minus the excluded runs."""
    filtered = filter_on_not_dag_run_id(
        df=filter_on_dag_ids(
            df=get_stats(system=entry.system, date=entry.export_date, stats=stats),
            dag_ids=entry.dag_ids,
        ),
        dag_run_ids=not_dag_run_ids,
    )
    # Explicit copy: the enhance_* helpers add columns in place and must never
    # write into (a slice of) the frame held in the S3 cache.
    return filtered.copy()


def box_plots_for(entries: List[BoxPlotsEntry], 
                  not_task_ids, 
                  graph_shape,
                  not_dag_run_ids=None, 
                  hide_x_axis=False, 
                  experiment_id="", 
                  postprocess=None,
                  sairflow_graph_shape_override=None,
                  **kwargs):
    """Draw every box plot of one experiment: DAG duration, task duration and task wait time.

    Each metric is drawn twice: an exploratory plot (``box_plot_for``) and a
    thesis-styled plot exported to PDF (``box_plot_for_thesis``).

    Args:
        entries: exports to compare.
        not_task_ids: task ids removed from the task plots. They are removed
            only after wait times are computed, because the computation needs
            every node of the graph.
        graph_shape: task dependency graph used to compute task wait times.
        not_dag_run_ids: run ids excluded from every plot (defaults to none).
        hide_x_axis: hide the x axis on all plots.
        experiment_id: suffix of the exported PDF file names.
        postprocess: optional ``callable(title, fig)`` applied to thesis plots.
        sairflow_graph_shape_override: graph used instead of ``graph_shape``
            for ``System.BEEFLOW`` entries.
        **kwargs: forwarded to every plotting call (e.g. ``height``, ``width``).
    """
    if not_dag_run_ids is None:
        not_dag_run_ids = []

    dags_data = []
    # Title of the exploratory task plot shows the last entry's task duration.
    task_duration = None

    for entry in entries:
        task_duration = entry.task_duration
        entry_dag_runs = _box_plots_load(entry, Stats.DAG_RUNS, not_dag_run_ids)
        enhance_with_duration(entry_dag_runs)
        enhance_with_wait_time_dag_runs(entry_dag_runs)

        dags_data.append(
            BoxPlotEntry(
                data=entry_dag_runs,
                identifier=_box_plots_identifier(entry),
                system=system_to_title(entry.system),
            )
        )

    box_plot_for(dags_data, 
                 measured_column="duration", 
                 title="DAG runs comparison: duration",
                 hover_data=["run_id"],
                 hide_x_axis=hide_x_axis,
                 **kwargs)
    box_plot_for_thesis(dags_data, 
                 measured_column="duration", 
                 title="DAG makespan",
                 hover_data=["run_id"],
                 labels={
                   "identifier": "System",
                   "duration": "Duration [s]",
                 },
                 experiment_id=experiment_id,
                 hide_x_axis=hide_x_axis,
                 postprocess=postprocess,
                 **kwargs)

    tasks_data = []

    for entry in entries:
        entry_tasks = _box_plots_load(entry, Stats.TASKS, not_dag_run_ids)
        entry_dag_runs = _box_plots_load(entry, Stats.DAG_RUNS, not_dag_run_ids)
        enhance_with_duration(entry_tasks)

        gs = graph_shape
        if sairflow_graph_shape_override is not None and entry.system == System.BEEFLOW:
            gs = sairflow_graph_shape_override
        enhance_with_wait_time_tasks(entry_tasks, gs, entry_dag_runs)

        # Drop helper tasks only now: the wait-time computation needed them.
        entry_tasks = filter_on_not_task_ids(entry_tasks, task_ids=not_task_ids)

        tasks_data.append(
            BoxPlotEntry(
                data=entry_tasks,
                identifier=_box_plots_identifier(entry),
                system=system_to_title(entry.system),
            )
        )

    box_plot_for(tasks_data,
                 measured_column="duration", 
                 title=f"Tasks runs comparison: task duration (ideal: {task_duration}s)",
                 hide_x_axis=hide_x_axis,
                 hover_data=["task_id", "run_id"],
                 **kwargs)
    box_plot_for_thesis(tasks_data, 
             measured_column="duration", 
             title="Task duration",
             hover_data=["run_id"],
             labels={
               "identifier": "System",
               "duration": "Duration [s]",
             },
             experiment_id=experiment_id,
             postprocess=postprocess,
             hide_x_axis=hide_x_axis,
             **kwargs)
    box_plot_for(tasks_data,
                 measured_column="wait_time", 
                 title="Tasks run comparision: wait time (amount of time passed from task being marked as queued to started)",
                 hide_x_axis=hide_x_axis,
                 hover_data=["task_id", "run_id"],
                 **kwargs)
    box_plot_for_thesis(tasks_data, 
         measured_column="wait_time", 
         title="Task wait time",
         hover_data=["run_id"],
         labels={
           "identifier": "System",
           "wait_time": "Wait time [s]",
         },
         experiment_id=experiment_id,
         postprocess=postprocess,
         hide_x_axis=hide_x_axis,
         **kwargs)
        

In [None]:
def trend_for(data: pd.DataFrame, x_col="tasks_count", measured_column="duration", hide_x_axis=False, **kwargs):
    """Show a line chart of ``measured_column`` against ``x_col``, one line per ``identifier``.

    Args:
        data: frame with ``x_col``, ``measured_column`` and ``identifier`` columns.
        x_col: column plotted on the x axis.
        measured_column: column plotted on the y axis.
        hide_x_axis: hide the x axis. It is an explicit parameter so that it is
            not forwarded to ``plotly.express.line``, which does not accept it.
        **kwargs: forwarded to ``plotly.express.line`` (title, height, ...).
    """
    fig = px.line(data, x=x_col, y=measured_column, color="identifier", **kwargs)
    if hide_x_axis:
        fig.update_xaxes(visible=False, showticklabels=False)
    fig.show()

In [None]:
def trends_for(entries: List[BoxPlotsEntry], not_task_ids, not_dag_run_ids=None, graph_shape_factory=None, **kwargs):
    """Plot median DAG duration, task duration and task wait time against the task count.

    Args:
        entries: exports to summarise; each contributes one point per chart.
        not_task_ids: task ids left out of the task medians. They are removed
            only after wait times are computed, because the computation needs
            every node of the graph.
        not_dag_run_ids: run ids excluded from all medians (defaults to none).
        graph_shape_factory: ``callable(tasks_count) -> DiGraph`` giving the
            dependency graph of an entry's DAG. Defaults to ``linear_graph``,
            i.e. entries are assumed to be line DAGs unless stated otherwise.
        **kwargs: forwarded to ``trend_for``.
    """
    if not_dag_run_ids is None:
        not_dag_run_ids = []
    if graph_shape_factory is None:
        graph_shape_factory = linear_graph

    def load(entry, stats):
        # Copy so the in-place enhance_* helpers never touch the cached export.
        return filter_on_not_dag_run_id(
            df=filter_on_dag_ids(
                df=get_stats(system=entry.system, date=entry.export_date, stats=stats),
                dag_ids=entry.dag_ids,
            ),
            dag_run_ids=not_dag_run_ids,
        ).copy()

    dags_data = []

    for entry in entries:
        entry_dag_runs = load(entry, Stats.DAG_RUNS)
        enhance_with_duration(entry_dag_runs)
        enhance_with_wait_time_dag_runs(entry_dag_runs)

        dags_data.append(
            {
                "median_duration": entry_dag_runs["duration"].median(),
                "identifier": f"{entry.system.value}:{entry.additional_metadata}",
                "tasks_count": entry.tasks_count,
            }
        )

    trend_for(data=pd.DataFrame(dags_data), measured_column="median_duration", title="Median DAG duration", **kwargs)

    tasks_data = []
    for entry in entries:
        entry_tasks = load(entry, Stats.TASKS)
        entry_dag_runs = load(entry, Stats.DAG_RUNS)
        enhance_with_duration(entry_tasks)
        # The wait-time helper needs the dependency graph and the DAG runs.
        enhance_with_wait_time_tasks(entry_tasks, graph_shape_factory(entry.tasks_count), entry_dag_runs)
        entry_tasks = filter_on_not_task_ids(entry_tasks, task_ids=not_task_ids)

        tasks_data.append(
            {
                "median_duration": entry_tasks["duration"].median(),
                "median_wait_time": entry_tasks["wait_time"].median(),
                "identifier": f"{entry.system.value}:{entry.additional_metadata}",
                "tasks_count": entry.tasks_count,
            }
        )

    tasks_frame = pd.DataFrame(tasks_data)
    trend_for(data=tasks_frame, measured_column="median_duration", title="Median Tasks duration", **kwargs)
    trend_for(data=tasks_frame, measured_column="median_wait_time", title="Median Tasks waittime", **kwargs)
    
        

In [None]:
from IPython.display import display


def print_tables(entries: List[BoxPlotsEntry]):
    """Print each entry's system and display its DAG runs with their computed duration."""
    shown_columns = ["duration", "run_id", "start_date", "end_date", "_state"]
    for entry in entries:
        all_dag_runs = get_stats(system=entry.system, date=entry.export_date, stats=Stats.DAG_RUNS)
        entry_dag_runs = filter_on_dag_ids(df=all_dag_runs, dag_ids=entry.dag_ids)
        enhance_with_duration(entry_dag_runs)
        print(entry.system)
        display(entry_dag_runs[shown_columns])

In [None]:
@dataclass
class HistogramEntry:
    """One experiment export to include in a histogram comparison."""
    # Which system produced the export.
    system: System
    # Export partition name, passed to get_stats as `date`.
    export_date: str
    # DAG ids whose runs and tasks are kept from the export.
    dag_ids: List[str]
    # Carried over to HistogramPlotEntry for the plot label.
    task_duration: int
    # Carried over to HistogramPlotEntry for the plot label.
    tasks_count: int

In [None]:
@dataclass
class HistogramPlotEntry:
    """Loaded data for one system, ready to be drawn by cross_histogram_plot_for."""
    # System the data belongs to; its value starts the plot label.
    system: System
    # Rows to plot; must contain the measured column.
    data: pd.DataFrame
    # Shown in the plot label.
    task_duration: int
    # Shown in the plot label.
    tasks_count: int


def cross_histogram_plot_for(entries: List[HistogramPlotEntry], 
                             nbins=30, 
                             measurment_column_name="duration", 
                             **kwargs):
    """Show grouped histograms (with marginal box plots) of one column, one colour per entry.

    Only ``measurment_column_name`` is taken from each entry's data; extra
    ``kwargs`` go straight to ``plotly.express.histogram``.
    """
    labelled = [
        entry.data.loc[:, [measurment_column_name]].assign(
            identifier=f"{entry.system.value}_tasks:{entry.tasks_count}_duration:{entry.task_duration}"
        )
        for entry in entries
    ]
    combined = pd.concat(labelled)
    fig = px.histogram(
        combined,
        y=measurment_column_name,
        color="identifier",
        marginal="box",
        barmode="group",
        nbins=nbins,
        **kwargs,
    )
    fig.show()

In [None]:
def compare_runs_by_histograms_for(entries: List[HistogramEntry], not_task_ids, nbins=30, graph_shape_factory=None, **kwargs):
    """Draw histograms comparing DAG duration, task duration and task wait time across entries.

    Args:
        entries: exports to compare.
        not_task_ids: task ids left out of the task histograms. They are
            removed only after wait times are computed, because the
            computation needs every node of the graph.
        nbins: number of histogram bins.
        graph_shape_factory: ``callable(tasks_count) -> DiGraph`` giving the
            dependency graph of an entry's DAG. Defaults to ``linear_graph``,
            i.e. entries are assumed to be line DAGs unless stated otherwise.
        **kwargs: forwarded to ``cross_histogram_plot_for``.
    """
    if graph_shape_factory is None:
        graph_shape_factory = linear_graph

    def load(entry, stats):
        # Copy so the in-place enhance_* helpers never touch the cached export.
        return filter_on_dag_ids(
            df=get_stats(system=entry.system, date=entry.export_date, stats=stats),
            dag_ids=entry.dag_ids,
        ).copy()

    # Draw dags
    dags_data = []
    for entry in entries:
        entry_dag_runs = load(entry, Stats.DAG_RUNS)
        enhance_with_duration(entry_dag_runs)
        enhance_with_wait_time_dag_runs(entry_dag_runs)
        dags_data.append(
            HistogramPlotEntry(
                system=entry.system,
                data=entry_dag_runs,
                task_duration=entry.task_duration,
                tasks_count=entry.tasks_count,
            )
        )
    cross_histogram_plot_for(dags_data, 
                             nbins, 
                             measurment_column_name="duration", 
                             title="DAG runs comparison: duration",
                             **kwargs)
    # Tasks data
    tasks_data = []
    for entry in entries:
        entry_tasks = load(entry, Stats.TASKS)
        entry_dag_runs = load(entry, Stats.DAG_RUNS)
        enhance_with_duration(entry_tasks)
        # The wait-time helper needs the dependency graph and the DAG runs.
        enhance_with_wait_time_tasks(entry_tasks, graph_shape_factory(entry.tasks_count), entry_dag_runs)
        entry_tasks = filter_on_not_task_ids(df=entry_tasks, task_ids=not_task_ids)
        tasks_data.append(
            HistogramPlotEntry(
                system=entry.system,
                data=entry_tasks,
                task_duration=entry.task_duration,
                tasks_count=entry.tasks_count,
            )
        )
    cross_histogram_plot_for(tasks_data, 
                             nbins, 
                             measurment_column_name="duration", 
                             title="Tasks runs comparison: duration",
                             **kwargs)
    cross_histogram_plot_for(tasks_data, 
                             nbins, 
                             measurment_column_name="wait_time", 
                             title="Tasks run comparision: wait time (start - queued)",
                             **kwargs)
        

In [None]:
def gantt_task(df, range_x=None):
    """Show a Gantt chart of task instances: one row per ``task_id``, coloured by ``run_id``."""
    timeline_options = dict(
        x_start="start_date",
        x_end="end_date",
        color="run_id",
        y="task_id",
        range_x=range_x,
    )
    fig = px.timeline(df, **timeline_options)
    # Reversed y axis lists the tasks top-down.
    fig.update_yaxes(autorange="reversed")
    fig.show()

In [None]:
@dataclass
class TasksChartEntry:
    """One experiment export whose task instances are drawn by tasks_plot."""
    # Which system produced the export.
    system: System
    # Export partition name, passed to get_stats as `date`.
    export_date: str
    # DAG ids whose tasks are kept from the export.
    dag_ids: List[str]
    # Not read by tasks_plot.
    task_duration: int
    # Not read by tasks_plot.
    tasks_count: int

In [None]:
def tasks_plot(entry: TasksChartEntry, not_task_ids=[], **kwargs):
    """Show a Gantt chart of the entry's task instances, leaving out ``not_task_ids``.

    Extra keyword arguments are accepted but not used.
    """
    all_tasks = get_stats(system=entry.system, date=entry.export_date, stats=Stats.TASKS)
    dag_tasks = filter_on_dag_ids(df=all_tasks, dag_ids=entry.dag_ids)
    gantt_task(filter_on_not_task_ids(df=dag_tasks, task_ids=not_task_ids))
    

In [None]:
def gantt_task_chart(entry_tasks, experiment_id="", tickformat="", xaxis_title="", postprocess=None, **kwargs):
    """Show a thesis-styled Gantt chart (one row per ``task_id``) and export it as PDF.

    ``kwargs`` are forwarded to ``plotly.express.timeline`` and must include
    ``title``, which is also used in the output file name
    ``{CHARTS_DIRECTORY}{title}-{experiment_id}.pdf``. ``postprocess``, when
    given, is called as ``postprocess(title, fig)`` right before showing.
    """
    fig = px.timeline(entry_tasks, x_start="start_date", x_end="end_date", y="task_id", **kwargs)
    chart_title = kwargs["title"]

    thesis_font = dict(family='Latin Modern Math', color='black', size=32)
    hidden_grid = dict(showgrid=False, gridwidth=1, gridcolor='LightGrey')
    axis_frame = dict(showline=True, linewidth=2, linecolor='LightGrey', mirror=True)

    fig.update_layout(
        showlegend=False,
        title_text=chart_title,
        title_x=0.5,
        xaxis_title=xaxis_title,
        paper_bgcolor='rgb(255, 255, 255)',
        plot_bgcolor='rgb(255, 255, 255)',
        font=thesis_font,
    )
    fig.update_xaxes(**hidden_grid, **axis_frame, tickformat=tickformat)
    # Task names are hidden; the reversed axis lists the tasks top-down.
    fig.update_yaxes(**hidden_grid, **axis_frame, showticklabels=False, autorange="reversed")

    if postprocess is not None:
        postprocess(chart_title, fig)
    fig.show()
    fig.write_image(f'{CHARTS_DIRECTORY}{chart_title}-{experiment_id}.pdf')


def thesis_gantt(entries, specific_run, experiment_id, xaxis_title="", tickformat="", sairflow_run_override=None, **kwargs):
    """Draw one thesis-styled Gantt chart per entry for a single DAG run.

    Each entry's tasks are shifted so that its first task starts at the Unix
    epoch, which makes the x axis read as elapsed execution time. All charts
    share one x range: from the epoch to the distance between the earliest
    start and the latest end over all entries.

    Args:
        entries: exports to draw, one chart each.
        specific_run: ``run_id`` of the DAG run to draw.
        experiment_id: suffix of the exported PDF file names.
        xaxis_title: x axis title.
        tickformat: x axis tick format.
        sairflow_run_override: ``run_id`` used instead of ``specific_run`` for
            ``System.BEEFLOW`` entries.
        **kwargs: forwarded to ``gantt_task_chart``.
    """
    # Naive epoch; the same value the deprecated
    # datetime.datetime.utcfromtimestamp(0) returns.
    epoch = datetime.datetime(1970, 1, 1)
    charts_data = []

    for entry in entries:
        entry_tasks = filter_on_dag_ids(
            df=get_stats(system=entry.system, date=entry.export_date, stats=Stats.TASKS),
            dag_ids=entry.dag_ids,
        )
        sr = specific_run
        if sairflow_run_override is not None and entry.system == System.BEEFLOW:
            sr = sairflow_run_override
        # Copy so the column rewrites below never touch the cached export.
        entry_tasks = entry_tasks[entry_tasks["run_id"] == sr].copy()
        entry_tasks["start_date"] = pd.to_datetime(entry_tasks["start_date"])
        entry_tasks["end_date"] = pd.to_datetime(entry_tasks["end_date"])
        charts_data.append(entry_tasks.sort_values("start_date", ascending=True))

    if not charts_data:
        return

    lowest_date = min(frame["start_date"].min() for frame in charts_data)
    further_date = max(frame["end_date"].max() for frame in charts_data)

    color_discrete_sequence = ["#636EFA", "#ef553b"]

    for i, (entry, entry_tasks) in enumerate(zip(entries, charts_data)):
        # Re-base this entry on its own first start, then anchor at the epoch.
        first_start = entry_tasks["start_date"].min()
        entry_tasks["start_date"] = epoch + (entry_tasks["start_date"] - first_start)
        entry_tasks["end_date"] = epoch + (entry_tasks["end_date"] - first_start)

        gantt_task_chart(entry_tasks,
                         title=system_to_title(entry.system),
                         experiment_id=experiment_id,
                         xaxis_title=xaxis_title,
                         tickformat=tickformat,
                         labels={
                           "task_id": "Tasks",
                           "duration": "Duration [s]",
                         },
                         # Cycle the palette so more than two entries do not raise IndexError.
                         color_discrete_sequence=[color_discrete_sequence[i % len(color_discrete_sequence)]],
                         range_x=[epoch, epoch + (further_date - lowest_date)],
                         **kwargs)

In [None]:
def gantt_task_chart_multi(entry_tasks_multi, experiment_id="", tickformat="", xaxis_title="", postprocess=None, **kwargs):
    """Show one thesis-styled Gantt chart with a row per system, and export it as PDF.

    Args:
        entry_tasks_multi: iterable of ``(entry, tasks_frame)`` pairs; each
            frame needs ``start_date`` and ``end_date`` columns.
        experiment_id: suffix of the exported file name.
        tickformat: x axis tick format.
        xaxis_title: x axis title.
        postprocess: optional ``callable(title, fig)`` invoked before showing.
        **kwargs: forwarded to ``plotly.express.timeline``; must include
            ``title``, which is also used in the output file name
            ``{CHARTS_DIRECTORY}{title}-{experiment_id}.pdf``.

    Raises:
        KeyError: if ``title`` is not passed in ``kwargs``.
        ValueError: from ``pd.concat`` when ``entry_tasks_multi`` is empty.
    """
    # Read the title first so a missing one fails before any plotting work.
    title = kwargs["title"]
    # assign() returns a new frame, so the caller's data is never modified;
    # writing into `data.loc[:, ]` could touch the original frame.
    labelled = [
        data.assign(identifier=system_to_title(entry.system))
        for (entry, data) in entry_tasks_multi
    ]
    transformed_data = pd.concat(labelled).sort_values('start_date', ascending=True)
    fig = px.timeline(transformed_data, x_start="start_date", x_end="end_date", y="identifier", color="identifier", **kwargs)
    fig.update_layout(showlegend=False)
    fig.update_layout(title_text=title, title_x=0.5)
    fig.update_layout(
        xaxis_title=xaxis_title,
        paper_bgcolor='rgb(255, 255, 255)',
        plot_bgcolor='rgb(255, 255, 255)',
        font=dict(
            family='Latin Modern Math',
            color='black',
            size=32,
        ),
    )
    fig.update_layout(yaxis_title="")
    fig.update_yaxes(showgrid=False, gridwidth=1, gridcolor='LightGrey')
    fig.update_xaxes(showgrid=False, gridwidth=1, gridcolor='LightGrey')
    fig.update_xaxes(showline=True, linewidth=2, linecolor='LightGrey', mirror=True)
    fig.update_yaxes(showline=True, linewidth=2, linecolor='LightGrey', mirror=True, showticklabels=True)
    fig.update_xaxes(
        tickformat=tickformat,
    )
    fig.update_yaxes(autorange="reversed")
    if postprocess is not None:
        postprocess(title, fig)
    fig.show()
    fig.write_image(f'{CHARTS_DIRECTORY}{title}-{experiment_id}.pdf')


def thesis_gantt_multi(entries, specific_run, experiment_id, xaxis_title="", tickformat="", **kwargs):
    """Draw a single thesis-styled Gantt chart comparing one DAG run across all entries.

    Each entry's tasks are shifted so that its first task starts at the Unix
    epoch, which makes the x axis read as elapsed execution time. The x range
    runs from the epoch to the distance between the earliest start and the
    latest end over all entries.

    Args:
        entries: exports to draw, one chart row each.
        specific_run: ``run_id`` of the DAG run to draw.
        experiment_id: suffix of the exported PDF file name.
        xaxis_title: x axis title.
        tickformat: x axis tick format.
        **kwargs: forwarded to ``gantt_task_chart_multi``.

    Raises:
        ValueError: when ``entries`` is empty (nothing to concatenate).
    """
    # Naive epoch; the same value the deprecated
    # datetime.datetime.utcfromtimestamp(0) returns.
    epoch = datetime.datetime(1970, 1, 1)
    lowest_date = None
    further_date = None
    charts_data = []

    for entry in entries:
        entry_tasks = filter_on_dag_ids(
            df=get_stats(system=entry.system, date=entry.export_date, stats=Stats.TASKS),
            dag_ids=entry.dag_ids,
        )
        # Copy so the column rewrites below never touch the cached export.
        entry_tasks = entry_tasks[entry_tasks["run_id"] == specific_run].copy()
        entry_tasks["start_date"] = pd.to_datetime(entry_tasks["start_date"])
        entry_tasks["end_date"] = pd.to_datetime(entry_tasks["end_date"])
        entry_tasks = entry_tasks.sort_values("start_date", ascending=True)
        charts_data.append(entry_tasks)

        first_start = entry_tasks["start_date"].min()
        last_end = entry_tasks["end_date"].max()
        lowest_date = first_start if lowest_date is None else min(lowest_date, first_start)
        further_date = last_end if further_date is None else max(further_date, last_end)

    color_discrete_sequence = ["#636EFA", "#ef553b"]
    data = []

    for entry, entry_tasks in zip(entries, charts_data):
        # Re-base this entry on its own first start, then anchor at the epoch.
        first_start = entry_tasks["start_date"].min()
        entry_tasks["start_date"] = epoch + (entry_tasks["start_date"] - first_start)
        entry_tasks["end_date"] = epoch + (entry_tasks["end_date"] - first_start)
        data.append((entry, entry_tasks))

    if not data:
        # Keep the original failure mode for empty input without hitting
        # `None - None` when building the x range below.
        raise ValueError("No objects to concatenate")

    gantt_task_chart_multi(data,
                     title="",
                     experiment_id=experiment_id,
                     xaxis_title=xaxis_title,
                     tickformat=tickformat,
                     labels={
                       "task_id": "Tasks",
                       "duration": "Duration [s]",
                     },
                     color_discrete_sequence=color_discrete_sequence,
                     range_x=[epoch, epoch + (further_date - lowest_date)],
                     **kwargs)

In [None]:
# Shared keyword arguments for fig.add_annotation, used by the postprocess
# callbacks below: a black arrow with the same font as the thesis charts.
arrow_standard_args = dict(
            xref="x",
            yref="y",
            showarrow=True,
            font=dict(
                family='Latin Modern Math',
                color='black',
                size=32,
                ),
            align="center",
            arrowhead=2,
            arrowsize=1,
            arrowwidth=2,
            arrowcolor="black",
            ax=100,
            ay=0,
            borderwidth=0,
            borderpad=4,
            opacity=0.8
)

# Points

## Beeflow's performance for line DAGs rivals MWAA's

### 1 task

In [None]:
# 1-task line DAG ("10s_1t_line"): one Beeflow export and one MWAA export to compare.
entries = [
    BoxPlotsEntry(
        system=System.BEEFLOW,
        export_date="2023-03-12_18-48",
        tasks_count=1,
        task_duration=10,
        dag_ids=["10s_1t_line"]
    ),
    BoxPlotsEntry(
        system=System.MWAA,
        export_date="2023-03-12_19-01",
        tasks_count=1,
        task_duration=10,
        dag_ids=["10s_1t_line"]
    ),
]

In [None]:
# Box plots for the 1-task line DAG with the 17:35 and 17:40 scheduled runs excluded.
box_plots_for(entries=entries, 
              not_task_ids=[], 
              graph_shape=linear_graph(1),
              height=600, 
              width=700, 
              not_dag_run_ids=["scheduled__2023-03-12T17:35:00+00:00", "scheduled__2023-03-12T17:40:00+00:00"], 
              experiment_id="1task_line")

In [None]:
def postprocess(title, fig):
    """Add "cold start" arrows to the thesis charts of the 1-task experiment.

    Passed to ``box_plots_for`` as the ``postprocess`` callback, which calls it
    with the chart title and the figure; charts with other titles are left
    untouched.
    """
    # box_plots_for titles the DAG-level thesis chart "DAG makespan". The
    # previous check for "DAG run duration" never matched, so this annotation
    # was never drawn.
    if title == "DAG makespan":
        fig.add_annotation(
            x=-0.30,
            y=24.95,
            **arrow_standard_args,
            text="cold start",
        )

    if title == "Task wait time":
        fig.add_annotation(
            x=-0.30,
            y=11.46,
            **arrow_standard_args,
            text="cold start",
        )
        

# Box plots for the 1-task line DAG with only the 17:35 run excluded;
# `postprocess` adds the cold-start annotations.
box_plots_for(
    entries=entries, 
    graph_shape=linear_graph(1),
    not_task_ids=[], 
    height=600, 
    width=700, 
    not_dag_run_ids=["scheduled__2023-03-12T17:35:00+00:00"], 
    experiment_id="1task_line_withcold", 
    postprocess=postprocess)

### 5 tasks

In [None]:
# 5-task line DAG ("10s_5t_line"): one Beeflow export and one MWAA export to compare.
entries = [
    BoxPlotsEntry(
        system=System.BEEFLOW,
        export_date="2023-03-12_20-19",
        tasks_count=5,
        task_duration=10,
        dag_ids=["10s_5t_line"]
    ),
    BoxPlotsEntry(
        system=System.MWAA,
        export_date="2023-03-12_20-30",
        tasks_count=5,
        task_duration=10,
        dag_ids=["10s_5t_line"]
    ),
]

In [None]:
# Box plots for the 5-task line DAG with the 19:05 scheduled run excluded.
box_plots_for(
    entries=entries, 
    graph_shape=linear_graph(5),
    not_task_ids=[], 
    height=700, 
    width=700, 
    not_dag_run_ids=["scheduled__2023-03-12T19:05:00+00:00"], 
    experiment_id="5task_line")

In [None]:
# Combined Gantt chart of the 19:15 scheduled run of the 5-task line DAG for both systems.
thesis_gantt_multi(
    entries, 
    specific_run="scheduled__2023-03-12T19:15:00+00:00", 
    experiment_id="5task_line",
    tickformat="%s", 
    xaxis_title="Execution time [s]",
    height=700, 
    width=700,
)

### 10 tasks

In [None]:
# 10-task line DAG ("10s_10t_line"): one Beeflow export and one MWAA export to compare.
entries = [
    BoxPlotsEntry(
        system=System.BEEFLOW,
        export_date="2023-03-12_21-48",
        tasks_count=10,
        task_duration=10,
        dag_ids=["10s_10t_line"]
    ),
    BoxPlotsEntry(
        system=System.MWAA,
        export_date="2023-03-12_21-59",
        tasks_count=10,
        task_duration=10,
        dag_ids=["10s_10t_line"]
    ),
]

In [None]:
# Box plots for the 10-task line DAG with the 20:35 scheduled run excluded.
box_plots_for(
    entries=entries, 
    graph_shape=linear_graph(10),
    not_task_ids=[], 
    height=700, 
    width=700, 
    not_dag_run_ids=["scheduled__2023-03-12T20:35:00+00:00"], 
    experiment_id="10task_line")

In [None]:
# Combined Gantt chart of the 20:45 scheduled run of the 10-task line DAG for both systems.
thesis_gantt_multi(
    entries, 
    specific_run="scheduled__2023-03-12T20:45:00+00:00", 
    experiment_id="10task_line", 
    tickformat="%s", 
    xaxis_title="Execution time [s]",
    height=700, 
    width=700,
)

In [None]:
# 15-task line DAG ("10s_15t_line"): one MWAA export and one Beeflow export to compare.
entries = [
    BoxPlotsEntry(
        system=System.MWAA,
        export_date="2022-12-11_20-45",
        tasks_count=15,
        task_duration=10,
        dag_ids=["10s_15t_line"]
    ),
    BoxPlotsEntry(
        system=System.BEEFLOW,
        export_date="2022-12-28_16-43",
        tasks_count=15,
        task_duration=10,
        dag_ids=["10s_15t_line"]
    ),
]

In [None]:
# Box plots for the 15-task line DAG. `graph_shape` is a required argument of
# box_plots_for; the entries above describe a 15-task line DAG.
box_plots_for(entries=entries, not_task_ids=[], graph_shape=linear_graph(15), height=700, width=1500)

In [None]:
# Line DAGs with 5, 10 and 15 tasks, each exported from both MWAA and Beeflow (six entries).
entries = [
    BoxPlotsEntry(
        system=System.MWAA,
        export_date="2022-12-11_17-47",
        tasks_count=5,
        task_duration=10,
        dag_ids=["10s_5t_line"]
    ),
    BoxPlotsEntry(
        system=System.BEEFLOW,
        export_date="2022-12-28_13-52",
        tasks_count=5,
        task_duration=10,
        dag_ids=["10s_5t_line"]
    ),
    BoxPlotsEntry(
        system=System.MWAA,
        export_date="2022-12-11_19-16",
        tasks_count=10,
        task_duration=10,
        dag_ids=["10s_10t_line"]
    ),
    BoxPlotsEntry(
        system=System.BEEFLOW,
        export_date="2022-12-28_15-24",
        tasks_count=10,
        task_duration=10,
        dag_ids=["10s_10t_line"]
    ),
    BoxPlotsEntry(
        system=System.MWAA,
        export_date="2022-12-11_20-45",
        tasks_count=15,
        task_duration=10,
        dag_ids=["10s_15t_line"]
    ),
    BoxPlotsEntry(
        system=System.BEEFLOW,
        export_date="2022-12-28_16-43",
        tasks_count=15,
        task_duration=10,
        dag_ids=["10s_15t_line"]
    ),
]

In [None]:
# Median trends against task count for the entries above; no tasks are excluded.
trends_for(entries=entries, not_task_ids=[])

## Beeflow is more performant for warm starts (parallel)

### 16 tasks

In [None]:
# 16-task DAG ("10s_16t"): one Beeflow export and one MWAA export to compare.
# `additional_metadata` is free text that ends up in the plot identifiers.
entries = [
    BoxPlotsEntry(
        system=System.BEEFLOW,
        export_date="2023-03-14_15-04",
        tasks_count=16,
        task_duration=10,
        dag_ids=["10s_16t"],
        additional_metadata="db.small",
    ),
    BoxPlotsEntry(
        system=System.MWAA,
        export_date="2023-03-14_15-15",
        tasks_count=16,
        task_duration=10,
        dag_ids=["10s_16t"],
        additional_metadata="20workers{min==max}_10task{for worker}",
    ),
]

In [None]:
# Box plots for the 16-task parallel DAG; the "run_before_loop" task is left
# out of the task plots and the 13:50 scheduled run is excluded.
box_plots_for(
    entries=entries,
    graph_shape=parallel_graph(16),
    not_task_ids=["run_before_loop"], 
    height=700, 
    width=700, 
    not_dag_run_ids=["scheduled__2023-03-14T13:50:00+00:00"], 
    experiment_id="16task_parallel")

### 32 tasks

In [None]:
# 32-task DAG ("10s_32t"): one Beeflow export and one MWAA export to compare.
entries = [
    BoxPlotsEntry(
        system=System.BEEFLOW,
        export_date="2023-03-14_16-33",
        tasks_count=32,
        task_duration=10,
        dag_ids=["10s_32t"],
        additional_metadata="db.small",
    ),
    BoxPlotsEntry(
        system=System.MWAA,
        export_date="2023-03-14_16-44",
        tasks_count=32,
        task_duration=10,
        dag_ids=["10s_32t"],
        additional_metadata="20workers{min==max}_10task{for worker}",
    ),
]

In [None]:
# Box plots for the 32-task parallel DAG; the "run_before_loop" task is left
# out of the task plots and the 15:20 scheduled run is excluded.
box_plots_for(
    entries=entries,
    graph_shape=parallel_graph(32),
    not_task_ids=["run_before_loop"], 
    height=700, 
    width=700, 
    not_dag_run_ids=["scheduled__2023-03-14T15:20:00+00:00"], 
    experiment_id="32task_parallel")

### 64 tasks

In [None]:
# 64-task DAG ("10s_64t"): one Beeflow export and one MWAA export to compare.
entries = [
    BoxPlotsEntry(
        system=System.BEEFLOW,
        export_date="2023-03-14_18-02",
        tasks_count=64,
        task_duration=10,
        dag_ids=["10s_64t"],
        additional_metadata="db.small",
    ),
    BoxPlotsEntry(
        system=System.MWAA,
        export_date="2023-03-14_18-13",
        tasks_count=64,
        task_duration=10,
        dag_ids=["10s_64t"],
        additional_metadata="20workers{min==max}_10task{for worker}",
    ),
]

In [None]:
def postprocess(title, fig):
    """Add the cold-start arrow annotation that belongs to the chart ``title``.

    Only the "Task wait time" and "DAG makespan" charts are annotated; any
    other title leaves ``fig`` untouched. Arrow styling comes from the
    module-level ``arrow_standard_args``.

    Args:
        title: Title of the chart being post-processed.
        fig: Figure object exposing ``add_annotation``.
    """
    # (chart title, y coordinate of the arrow tip, annotation label)
    markers = (
        ("Task wait time", 14.46, "cold starts"),
        ("DAG makespan", 31.06, "1+ cold start"),
    )
    for chart_title, y_position, label in markers:
        if title == chart_title:
            fig.add_annotation(x=-0.30, y=y_position, text=label, **arrow_standard_args)

# Box plots for the 64-task parallel experiment; the postprocess hook adds the
# cold-start arrow annotations to the charts it recognises by title.
box_plots_for(
    entries=entries, 
    graph_shape=parallel_graph(64),
    not_task_ids=["run_before_loop"], 
    height=700, 
    width=700, 
    not_dag_run_ids=["scheduled__2023-03-14T16:50:00+00:00"], 
    experiment_id="64task_parallel",
    postprocess=postprocess)

### 125 tasks

In [None]:
# Exports compared for the 125-task DAG "10s_125t": one Beeflow export and one
# MWAA export of the same DAG id.
entries = [
    BoxPlotsEntry(
        system=System.BEEFLOW,
        export_date="2023-03-14_19-31",
        tasks_count=125,
        task_duration=10,
        dag_ids=["10s_125t"],
        additional_metadata="db.small",
    ),
    BoxPlotsEntry(
        system=System.MWAA,
        export_date="2023-03-14_19-42",
        tasks_count=125,
        task_duration=10,
        dag_ids=["10s_125t"],
        additional_metadata="20workers{min==max}_10task{for worker}",
    ),
]

In [None]:
def postprocess(title, fig):
    """Add the cold-start arrow annotation that belongs to the chart ``title``.

    Only the "Task wait time" and "DAG makespan" charts are annotated; any
    other title leaves ``fig`` untouched. Arrow styling comes from the
    module-level ``arrow_standard_args``.

    Args:
        title: Title of the chart being post-processed.
        fig: Figure object exposing ``add_annotation``.
    """
    # (chart title, y coordinate of the arrow tip, annotation label)
    markers = (
        ("Task wait time", 18.06, "cold starts"),
        ("DAG makespan", 36.06, "1+ cold start"),
    )
    for chart_title, y_position, label in markers:
        if title == chart_title:
            fig.add_annotation(x=-0.30, y=y_position, text=label, **arrow_standard_args)

# Box plots for the 125-task parallel experiment; three scheduled runs are
# excluded and the postprocess hook adds the cold-start arrow annotations.
box_plots_for(
    entries=entries, 
    graph_shape=parallel_graph(125),
    not_task_ids=["run_before_loop"], 
    height=700, 
    width=700, 
    not_dag_run_ids=["scheduled__2023-03-14T18:20:00+00:00", "scheduled__2023-03-14T18:15:00+00:00", "scheduled__2023-03-14T18:25:00+00:00"], 
    experiment_id="125task_parallel",
    postprocess=postprocess)

## Beeflow's autoscaling is faster (parallel)

### 16 tasks

In [None]:
# Autoscaling comparison, 16 tasks: the DAG "10s_16t_30cron" exported from
# Beeflow and from MWAA.
# NOTE(review): the "_30cron" suffix presumably means a 30-minute schedule —
# confirm against the DAG definitions.
entries = [
    BoxPlotsEntry(
        system=System.BEEFLOW,
        export_date="2023-03-21_15-40",
        tasks_count=16,
        task_duration=10,
        dag_ids=["10s_16t_30cron"],
        additional_metadata="coldstart",
    ),
    BoxPlotsEntry(
        system=System.MWAA,
        export_date="2023-03-21_15-51",
        tasks_count=16,
        task_duration=10,
        dag_ids=["10s_16t_30cron"],
        additional_metadata="1workermin_20workersmax_10task{for worker}",
    ),
]

In [None]:
# Box plots for the 16-task cold-start experiment; no DAG runs are excluded.
box_plots_for(entries=entries, 
              graph_shape=parallel_graph(16),
              not_task_ids=["run_before_loop"],
              height=700, 
              width=700,
              experiment_id="16task_parallel_cold")

### 32 tasks

In [None]:
# Autoscaling comparison, 32 tasks: the DAG "10s_32t_30cron" exported from
# Beeflow and from MWAA.
entries = [
    BoxPlotsEntry(
        system=System.BEEFLOW,
        export_date="2023-03-21_17-41",
        tasks_count=32,
        task_duration=10,
        dag_ids=["10s_32t_30cron"],
        additional_metadata="coldstarts",
    ),
    BoxPlotsEntry(
        system=System.MWAA,
        export_date="2023-03-21_17-52",
        tasks_count=32,
        task_duration=10,
        dag_ids=["10s_32t_30cron"],
        additional_metadata="1workermin_20workersmax_10task{for worker}",
    ),
]

In [None]:
# Box plots for the 32-task cold-start experiment.
# NOTE(review): the excluded run id is dated 2023-01-08 while the exports used
# here are dated 2023-03-21. If no run with this id exists in those exports the
# exclusion filters nothing — verify the id.
box_plots_for(entries=entries, 
              graph_shape=parallel_graph(32),
              not_task_ids=["run_before_loop"],
              not_dag_run_ids=["scheduled__2023-01-08T18:00:00+00:00"],
              height=700, 
              width=700,
              experiment_id="32task_parallel_cold")

### 64 tasks

In [None]:
# Autoscaling comparison, 64 tasks: the DAG "10s_64t_30cron" exported from
# Beeflow and from MWAA.
entries = [
    BoxPlotsEntry(
        system=System.BEEFLOW,
        export_date="2023-03-21_19-41",
        tasks_count=64,
        task_duration=10,
        dag_ids=["10s_64t_30cron"],
        additional_metadata="coldstarts",
    ),
    BoxPlotsEntry(
        system=System.MWAA,
        export_date="2023-03-21_19-52",
        tasks_count=64,
        task_duration=10,
        dag_ids=["10s_64t_30cron"],
        additional_metadata="1workermin_20workersmax_10task{for worker}",
    ),
]

In [None]:
# Box plots for the 64-task cold-start experiment; the empty not_dag_run_ids
# list means no DAG runs are excluded.
box_plots_for(entries=entries, 
              graph_shape=parallel_graph(64),
              not_task_ids=["run_before_loop"],
              not_dag_run_ids=[],
              height=700, 
              width=700,
              experiment_id="64task_parallel_cold")

In [None]:
# Gantt charts of one scheduled run, drawn for every entry currently bound to
# `entries` (the 64-task cold-start exports when the cells run in order).
thesis_gantt(
    entries, 
    specific_run="scheduled__2023-03-21T18:30:00+00:00", 
    experiment_id="64task_parallel_cold_fmt", 
    tickformat="%s", 
    xaxis_title="",
    height=700, 
    width=700,
)

### 125 tasks

In [None]:
# Autoscaling comparison, 125 tasks: the DAG "10s_125t_30cron" exported from
# Beeflow and from MWAA.
# NOTE(review): the MWAA metadata here says "20workers{min==max}", whereas the
# 16/32/64-task cells of this section say "1workermin_20workersmax" — confirm
# which worker configuration this export was actually taken with.
entries = [
    BoxPlotsEntry(
        system=System.BEEFLOW,
        export_date="2023-03-21_21-42",
        tasks_count=125,
        task_duration=10,
        dag_ids=["10s_125t_30cron"],
        additional_metadata="db.small.coldstarts",
    ),
    BoxPlotsEntry(
        system=System.MWAA,
        export_date="2023-03-21_21-53",
        tasks_count=125,
        task_duration=10,
        dag_ids=["10s_125t_30cron"],
        additional_metadata="20workers{min==max}_10task{for worker}",
    ),
]

In [None]:
# Box plots for the 125-task cold-start experiment; the empty not_dag_run_ids
# list means no DAG runs are excluded.
box_plots_for(entries=entries, 
              not_task_ids=["run_before_loop"],
              graph_shape=parallel_graph(125),
              not_dag_run_ids=[],
              height=700, 
              width=700,
              experiment_id="125task_parallel_cold")

In [None]:
# Gantt charts of one scheduled run, drawn for every entry currently bound to
# `entries` (the 125-task cold-start exports when the cells run in order).
# NOTE(review): tickformat is "%s " with a trailing space here, while the other
# thesis_gantt calls in this notebook pass "%s" — confirm whether the space is
# intentional.
thesis_gantt(
    entries, 
    specific_run="scheduled__2023-03-21T20:30:00+00:00", 
    experiment_id="125task_parallel_cold_fmt", 
    tickformat="%s ", 
    xaxis_title="",
    height=700, 
    width=700,
)

## Multiple DAGs at the same time

In [None]:
# Export timestamps for the "multiple DAGs at the same time" experiment.
# The two lists are paired by position (zip), and the position i also selects
# the number of concurrent DAGs (2**i) when the entries are built from them, so
# the order of the elements matters. Note that it is not chronological.

mwaa_dates = [
    "2023-04-22_16-38",
    "2023-04-22_18-08",
    "2023-04-22_22-08",
    "2023-04-22_19-55",
]

beeflow_dates = [
    "2023-04-22_16-28",
    "2023-04-22_17-56",
    "2023-04-22_21-57",
    "2023-04-22_19-43",
]

In [None]:
def box_plot_for_thesis(entries: List[BoxPlotEntry], measured_column="duration", hide_x_axis=False, experiment_id="", postprocess=None, **kwargs):
    """Draw one thesis-styled box plot, display it and save it as a PDF.

    The legend is placed inside the plot area, in the top-left corner.

    Args:
        entries: Box-plot entries. Each contributes ``entry.data`` (a
            DataFrame) tagged with ``entry.identifier`` (x axis group) and
            ``entry.system`` (colour group).
        measured_column: Column of the data plotted on the y axis.
        hide_x_axis: When true, the x axis and its tick labels are hidden.
        experiment_id: Suffix of the output file name.
        postprocess: Optional ``callable(title, fig)`` invoked right before the
            figure is shown, e.g. to add annotations.
        **kwargs: Forwarded to ``plotly.express.box``. ``title`` is required
            because it is also used as layout title and in the file name.

    Raises:
        KeyError: If ``title`` is not passed in ``kwargs``.
    """
    # Fail before any plotting work instead of half-way through the styling.
    if "title" not in kwargs:
        raise KeyError("box_plot_for_thesis requires a 'title' keyword argument")
    title = kwargs["title"]

    # assign() returns a new frame, so the caller's entry.data is never
    # modified and pandas has no reason to emit SettingWithCopyWarning.
    transformed_data = pd.concat(
        [
            entry.data.assign(identifier=entry.identifier, system=entry.system)
            for entry in entries
        ]
    )

    fig = px.box(transformed_data, x="identifier", y=measured_column, color="system", points="all",
                 **kwargs)
    if hide_x_axis:
        fig.update_xaxes(visible=False, showticklabels=False)

    fig.update_layout(title_text=title, title_x=0.5)
    fig.update_layout(xaxis_title=None,
                      paper_bgcolor='rgb(255, 255, 255)',
                      plot_bgcolor='rgb(255, 255, 255)',
                      font=dict(
                          family='Latin Modern Math',
                          color='black',
                          size=21,
                      ))
    fig.update_yaxes(showgrid=False, gridwidth=1, gridcolor='LightGrey')
    # Light grey frame around the plot area.
    fig.update_xaxes(showline=True, linewidth=2, linecolor='LightGrey', mirror=True)
    fig.update_yaxes(showline=True, linewidth=2, linecolor='LightGrey', mirror=True)
    fig.update_yaxes(rangemode="nonnegative")
    fig.update_layout(showlegend=True,
                      legend=dict(
                          title_text="",
                          yanchor="top",
                          y=0.99,
                          xanchor="left",
                          x=0.01,
                      ))

    if postprocess is not None:
        postprocess(title, fig)
    fig.show()
    fig.write_image(f'{CHARTS_DIRECTORY}{title}-{experiment_id}.pdf')

In [None]:
def box_plots_for(entries: List[BoxPlotsEntry], 
                  not_task_ids, 
                  graph_shape,
                  not_dag_run_ids=None, 
                  hide_x_axis=False, 
                  experiment_id="", 
                  postprocess=None,
                  sairflow_graph_shape_override=None,
                  **kwargs):
    """Plot DAG makespan, task duration and task wait time for ``entries``.

    Three box plots are produced through ``box_plot_for_thesis``. Boxes are
    grouped by the number of DAGs of an entry and coloured by system.

    Args:
        entries: Exports to compare.
        not_task_ids: Task ids removed from the task plots. They are removed
            only after wait times have been computed, so they still take part
            in that computation.
        graph_shape: Task dependency graph handed to
            ``enhance_with_wait_time_tasks``.
        not_dag_run_ids: Run ids excluded from every plot (default: none).
        hide_x_axis: Forwarded to ``box_plot_for_thesis``.
        experiment_id: Forwarded to ``box_plot_for_thesis`` (file name suffix).
        postprocess: Forwarded to ``box_plot_for_thesis``.
        sairflow_graph_shape_override: Optional graph used instead of
            ``graph_shape`` for entries whose system is ``System.BEEFLOW``.
            The 1-task AWS Batch comparison in this notebook passes this
            keyword; without a named parameter it would be forwarded through
            ``**kwargs`` to ``plotly.express.box``, which rejects unknown
            keyword arguments.
            NOTE(review): semantics mirror ``thesis_gantt``'s
            ``sairflow_run_override`` — confirm this is the intended meaning.
        **kwargs: Forwarded to ``box_plot_for_thesis`` / ``plotly.express.box``.
    """
    excluded_run_ids = [] if not_dag_run_ids is None else not_dag_run_ids

    def load(entry, stats):
        # Restrict the export to the entry's DAGs and drop the excluded runs.
        selected = filter_on_not_dag_run_id(
            df=filter_on_dag_ids(
                df=get_stats(system=entry.system, date=entry.export_date, stats=stats),
                dag_ids=entry.dag_ids,
            ),
            dag_run_ids=excluded_run_ids,
        )
        # The enhance_* helpers add columns in place; work on a private copy so
        # they never write into a slice of the cached export.
        return selected.copy()

    def plot(data, measured_column, title, y_label):
        box_plot_for_thesis(data,
                            measured_column=measured_column,
                            title=title,
                            hover_data=["run_id"],
                            labels={
                                "identifier": "System",
                                measured_column: y_label,
                            },
                            experiment_id=experiment_id,
                            hide_x_axis=hide_x_axis,
                            postprocess=postprocess,
                            **kwargs)

    dags_data = []
    for entry in entries:
        entry_dag_runs = load(entry, Stats.DAG_RUNS)
        enhance_with_duration(entry_dag_runs)
        enhance_with_wait_time_dag_runs(entry_dag_runs)
        dags_data.append(
            BoxPlotEntry(
                data=entry_dag_runs,
                identifier=f"{len(entry.dag_ids)} DAGs",
                system=system_to_title(entry.system),
            )
        )

    plot(dags_data, "duration", "DAG makespan", "Duration [s]")

    tasks_data = []
    for entry in entries:
        entry_tasks = load(entry, Stats.TASKS)
        entry_dag_runs = load(entry, Stats.DAG_RUNS)

        entry_graph = graph_shape
        if sairflow_graph_shape_override is not None and entry.system == System.BEEFLOW:
            entry_graph = sairflow_graph_shape_override

        enhance_with_duration(entry_tasks)
        enhance_with_wait_time_tasks(entry_tasks, entry_graph, entry_dag_runs)

        # Excluded tasks are dropped only now, after the wait-time computation.
        entry_tasks = filter_on_not_task_ids(entry_tasks, task_ids=not_task_ids)

        tasks_data.append(
            BoxPlotEntry(
                data=entry_tasks,
                identifier=f"{len(entry.dag_ids)} DAGs",
                system=system_to_title(entry.system),
            )
        )

    plot(tasks_data, "duration", "Task duration", "Duration [s]")
    plot(tasks_data, "wait_time", "Task wait time", "Wait time [s]")
        

In [None]:
# One Beeflow and one MWAA entry per export pair; pair i covers 2**i
# concurrent copies of the 8-task DAG ("10s_8t_0" ... "10s_8t_{2**i - 1}").
entries = []

for i, (mwaa_date, beeflow_date) in enumerate(zip(mwaa_dates, beeflow_dates)):
    dag_ids = [f"10s_8t_{d}" for d in range(2**i)]
    # Beeflow first, then MWAA, for every pair.
    for target_system, target_date in ((System.BEEFLOW, beeflow_date), (System.MWAA, mwaa_date)):
        entries.append(
            BoxPlotsEntry(
                system=target_system,
                export_date=target_date,
                tasks_count=8,
                task_duration=10,
                dag_ids=dag_ids,
                additional_metadata="",
            )
        )

In [None]:
# Keyword arguments shared by every arrow annotation added to the charts
# (passed as **arrow_standard_args to fig.add_annotation).
arrow_standard_args = {
    "xref": "x",
    "yref": "y",
    "showarrow": True,
    "font": {
        "family": 'Latin Modern Math',
        "color": 'black',
        "size": 21,
    },
    "align": "center",
    "arrowhead": 1,
    "arrowsize": 0.3,
    "arrowwidth": 0.1,
    "arrowcolor": "black",
    "ax": 50,
    "ay": 0,
    "borderwidth": 0,
    "borderpad": 4,
    "opacity": 0.8,
}

def postprocess(title, fig):
    """Add the four cold-start arrow annotations to the "DAG makespan" chart.

    Charts with any other title are left untouched. Arrow styling comes from
    the module-level ``arrow_standard_args``.

    Args:
        title: Title of the chart being post-processed.
        fig: Figure object exposing ``add_annotation``.
    """
    if title != "DAG makespan":
        return

    # (x, y, label) of each arrow, in the order they are added.
    arrows = (
        (-0.3, 29.45, "cold start"),
        (0.7, 36.5, "cold starts"),
        (1.7, 40.5, "cold starts"),
        (2.7, 43.05, "cold starts"),
    )
    for x_position, y_position, label in arrows:
        fig.add_annotation(x=x_position, y=y_position, text=label, **arrow_standard_args)
        
        

In [None]:
# Box plots for the forest of 8-task DAGs (1, 2, 4 and 8 concurrent DAGs);
# two scheduled runs are excluded and postprocess adds the arrow annotations.
box_plots_for(entries=entries, 
              not_task_ids=["run_before_loop"],
              graph_shape=parallel_graph(8),
              not_dag_run_ids=["scheduled__2023-04-22T16:40:00+00:00", "scheduled__2023-04-22T20:40:00+00:00"],
              height=400, 
              width=750,
              experiment_id="8t_forest",
              postprocess=postprocess)

In [None]:
def box_plot_for_thesis(entries: List[BoxPlotEntry], measured_column="duration", hide_x_axis=False, experiment_id="", postprocess=None, **kwargs):
    """Draw one thesis-styled box plot, display it and save it as a PDF.

    The legend is anchored at x=1.01, i.e. just to the right of the plot area.

    Args:
        entries: Box-plot entries. Each contributes ``entry.data`` (a
            DataFrame) tagged with ``entry.identifier`` (x axis group) and
            ``entry.system`` (colour group).
        measured_column: Column of the data plotted on the y axis.
        hide_x_axis: When true, the x axis and its tick labels are hidden.
        experiment_id: Suffix of the output file name.
        postprocess: Optional ``callable(title, fig)`` invoked right before the
            figure is shown, e.g. to add annotations.
        **kwargs: Forwarded to ``plotly.express.box``. ``title`` is required
            because it is also used as layout title and in the file name.

    Raises:
        KeyError: If ``title`` is not passed in ``kwargs``.
    """
    # Fail before any plotting work instead of half-way through the styling.
    if "title" not in kwargs:
        raise KeyError("box_plot_for_thesis requires a 'title' keyword argument")
    title = kwargs["title"]

    # assign() returns a new frame, so the caller's entry.data is never
    # modified and pandas has no reason to emit SettingWithCopyWarning.
    transformed_data = pd.concat(
        [
            entry.data.assign(identifier=entry.identifier, system=entry.system)
            for entry in entries
        ]
    )

    fig = px.box(transformed_data, x="identifier", y=measured_column, color="system", points="all",
                 **kwargs)
    if hide_x_axis:
        fig.update_xaxes(visible=False, showticklabels=False)

    fig.update_layout(title_text=title, title_x=0.5)
    fig.update_layout(xaxis_title=None,
                      paper_bgcolor='rgb(255, 255, 255)',
                      plot_bgcolor='rgb(255, 255, 255)',
                      font=dict(
                          family='Latin Modern Math',
                          color='black',
                          size=21,
                      ))
    fig.update_yaxes(showgrid=False, gridwidth=1, gridcolor='LightGrey')
    # Light grey frame around the plot area.
    fig.update_xaxes(showline=True, linewidth=2, linecolor='LightGrey', mirror=True)
    fig.update_yaxes(showline=True, linewidth=2, linecolor='LightGrey', mirror=True)
    fig.update_yaxes(rangemode="nonnegative")
    fig.update_layout(showlegend=True,
                      legend=dict(
                          title_text="",
                          yanchor="top",
                          y=1.00,
                          xanchor="left",
                          x=1.01,
                      ))

    if postprocess is not None:
        postprocess(title, fig)
    fig.show()
    fig.write_image(f'{CHARTS_DIRECTORY}{title}-{experiment_id}.pdf')

In [None]:
def box_plots_for(entries: List[BoxPlotsEntry], 
                  not_task_ids, 
                  graph_shape,
                  not_dag_run_ids=None, 
                  hide_x_axis=False, 
                  experiment_id="", 
                  postprocess=None,
                  **kwargs):
    """Plot makespan, task duration and wait time for sAirflow-only comparisons.

    Variant of the system-comparison plots: boxes are grouped by the total
    number of tasks of an entry (``len(dag_ids) * tasks_count``) and coloured
    by ``entry.additional_metadata``. Chart titles carry a "(sAirflow)" suffix.

    Args:
        entries: Exports to compare.
        not_task_ids: Task ids removed from the task plots. They are removed
            only after wait times have been computed.
        graph_shape: Accepted for signature compatibility but not used: every
            entry's wait times are computed against
            ``parallel_graph(entry.tasks_count)``, because entries of one call
            may have different task counts.
        not_dag_run_ids: Run ids excluded from every plot (default: none).
        hide_x_axis: Forwarded to ``box_plot_for_thesis``.
        experiment_id: Forwarded to ``box_plot_for_thesis`` (file name suffix).
        postprocess: Forwarded to ``box_plot_for_thesis``.
        **kwargs: Forwarded to ``box_plot_for_thesis`` / ``plotly.express.box``.
    """
    excluded_run_ids = [] if not_dag_run_ids is None else not_dag_run_ids

    def load(entry, stats):
        # Restrict the export to the entry's DAGs and drop the excluded runs.
        selected = filter_on_not_dag_run_id(
            df=filter_on_dag_ids(
                df=get_stats(system=entry.system, date=entry.export_date, stats=stats),
                dag_ids=entry.dag_ids,
            ),
            dag_run_ids=excluded_run_ids,
        )
        # The enhance_* helpers add columns in place; work on a private copy so
        # they never write into a slice of the cached export.
        return selected.copy()

    def plot(data, measured_column, title, y_label):
        box_plot_for_thesis(data,
                            measured_column=measured_column,
                            title=title,
                            hover_data=["run_id"],
                            labels={
                                "identifier": "System",
                                measured_column: y_label,
                            },
                            experiment_id=experiment_id,
                            hide_x_axis=hide_x_axis,
                            postprocess=postprocess,
                            **kwargs)

    dags_data = []
    for entry in entries:
        entry_dag_runs = load(entry, Stats.DAG_RUNS)
        enhance_with_duration(entry_dag_runs)
        enhance_with_wait_time_dag_runs(entry_dag_runs)
        dags_data.append(
            BoxPlotEntry(
                data=entry_dag_runs,
                identifier=f"{len(entry.dag_ids) * entry.tasks_count} tasks",
                system=entry.additional_metadata,
            )
        )

    plot(dags_data, "duration", "DAG makespan (sAirflow)", "Duration [s]")

    tasks_data = []
    for entry in entries:
        entry_tasks = load(entry, Stats.TASKS)
        entry_dag_runs = load(entry, Stats.DAG_RUNS)

        enhance_with_duration(entry_tasks)
        enhance_with_wait_time_tasks(entry_tasks, parallel_graph(entry.tasks_count), entry_dag_runs)

        # Excluded tasks are dropped only now, after the wait-time computation.
        entry_tasks = filter_on_not_task_ids(entry_tasks, task_ids=not_task_ids)

        tasks_data.append(
            BoxPlotEntry(
                data=entry_tasks,
                identifier=f"{len(entry.dag_ids) * entry.tasks_count} tasks",
                system=entry.additional_metadata,
            )
        )

    plot(tasks_data, "duration", "Task duration (sAirflow)", "Duration [s]")
    plot(tasks_data, "wait_time", "Task wait time (sAirflow)", "Wait time [s]")
        

In [None]:
# Beeflow-only comparison: for each forest of 2**i eight-task DAGs (i = 1..3)
# add the single parallel DAG with the same total number of tasks.
# Maps i -> (export date, task count) of that single parallel DAG.
single_dag_exports = {
    1: ("2023-03-14_15-04", 16),
    2: ("2023-03-14_16-33", 32),
    3: ("2023-03-14_18-02", 64),
}

entries = []

for i, (mwaa_date, beeflow_date) in enumerate(zip(mwaa_dates, beeflow_dates)):
    # The first pair (a single 8-task DAG) has no counterpart and is skipped.
    if i < 1:
        continue

    dag_ids = [f"10s_8t_{d}" for d in range(2**i)]
    entries.append(
        BoxPlotsEntry(
            system=System.BEEFLOW,
            export_date=beeflow_date,
            tasks_count=8,
            task_duration=10,
            dag_ids=dag_ids,
            additional_metadata="Parallel Forest",
        )
    )

    if i in single_dag_exports:
        single_dag_date, single_dag_tasks = single_dag_exports[i]
        entries.append(
            BoxPlotsEntry(
                system=System.BEEFLOW,
                export_date=single_dag_date,
                tasks_count=single_dag_tasks,
                task_duration=10,
                dag_ids=[f"10s_{single_dag_tasks}t"],
                additional_metadata=f"Parallel DAG n={single_dag_tasks}",
            )
        )
    

In [None]:
# Box plots comparing the Beeflow forests with the single parallel DAGs; an
# explicit colour sequence is forwarded to the plotting call and no
# postprocess hook is used.
box_plots_for(entries=entries, 
              not_task_ids=["run_before_loop"],
              graph_shape=parallel_graph(8),
              not_dag_run_ids=["scheduled__2023-04-22T16:40:00+00:00", "scheduled__2023-04-22T20:40:00+00:00"],
              height=400, 
              width=750,
              experiment_id="sairflow_cross",
              color_discrete_sequence=['#636EFA', '#00CC96', '#ab63fa', '#ffa15a'],
              postprocess=None)

## AWS Batch comparison

### linear 1 v 1

In [None]:
# Single-task comparison: Beeflow export of "10s_1t_batch" against the MWAA
# export of "10s_1t_line". The DAG ids and the export dates differ between the
# two systems, and additional_metadata is not set for either entry.
entries = [
    BoxPlotsEntry(
        system=System.BEEFLOW,
        export_date="2023-05-01_16-42",
        tasks_count=1,
        task_duration=10,
        dag_ids=["10s_1t_batch"]
    ),
    BoxPlotsEntry(
        system=System.MWAA,
        export_date="2023-03-12_19-01",
        tasks_count=1,
        task_duration=10,
        dag_ids=["10s_1t_line"]
    ),
]

In [None]:
def go():
    """Return a directed graph whose only node is ``run_before_loop`` (no edges)."""
    single_node_graph = nx.DiGraph()
    single_node_graph.add_nodes_from(("run_before_loop",))
    return single_node_graph

In [None]:
# Box plots for the single-task AWS Batch comparison; nothing is removed via
# not_task_ids and two scheduled runs are excluded.
# NOTE(review): verify that the box_plots_for definition active when this cell
# runs declares `sairflow_graph_shape_override`. If it does not, the keyword
# travels through **kwargs into the plotting call.
box_plots_for(entries=entries, 
              not_task_ids=[], 
              graph_shape=linear_graph(1),
              height=600, 
              width=700, 
              not_dag_run_ids=["scheduled__2023-03-12T17:35:00+00:00", "scheduled__2023-03-12T17:40:00+00:00"], 
              sairflow_graph_shape_override=go(),
              experiment_id="1task_line_batch")

### 16 parallel

In [None]:
# 16-task AWS Batch comparison: Beeflow export of "10s_16t_batch" against the
# MWAA export of "10s_16t_30cron" dated 2023-03-21_15-51 (the same export the
# 16-task autoscaling comparison uses).
entries = [
    BoxPlotsEntry(
        system=System.BEEFLOW,
        export_date="2023-05-01_18-06",
        tasks_count=16,
        task_duration=10,
        dag_ids=["10s_16t_batch"],
        additional_metadata="db.small",
    ),
    BoxPlotsEntry(
        system=System.MWAA,
        export_date="2023-03-21_15-51",
        tasks_count=16,
        task_duration=10,
        dag_ids=["10s_16t_30cron"],
        additional_metadata="1workermin_20workersmax_10task{for worker}",
    ),
]

In [None]:
# Box plots for the 16-task AWS Batch comparison; "run_before_loop" and one
# scheduled run are excluded from the plotted data.
box_plots_for(
    entries=entries,
    graph_shape=parallel_graph(16),
    not_task_ids=["run_before_loop"], 
    height=700, 
    width=700, 
    not_dag_run_ids=["scheduled__2023-05-01T17:10:00+00:00"], 
    experiment_id="16task_parallel_batch")

In [None]:
def thesis_gantt(entries, specific_run, experiment_id, xaxis_title="", tickformat="", sairflow_run_override=None, **kwargs):
    """Draw one task Gantt chart per entry for a single DAG run.

    Every entry's task times are shifted so that its earliest task starts at
    the Unix epoch, and all charts share the x range of the longest run, which
    makes the charts directly comparable.

    Args:
        entries: Exports to chart; one chart is drawn per entry.
        specific_run: ``run_id`` of the DAG run to chart.
        experiment_id: Forwarded to ``gantt_task_chart``.
        xaxis_title: Forwarded to ``gantt_task_chart``.
        tickformat: Forwarded to ``gantt_task_chart``.
        sairflow_run_override: Optional ``run_id`` used instead of
            ``specific_run`` for entries whose system is ``System.BEEFLOW``.
        **kwargs: Forwarded to ``gantt_task_chart``.
    """
    # Naive 1970-01-01 00:00:00; the same value the deprecated
    # datetime.utcfromtimestamp(0) returned.
    epoch = datetime.datetime(1970, 1, 1)
    palette = ["#636EFA", "#ef553b"]

    charts_data = []
    highest_diff = None

    for entry in entries:
        run_id = specific_run
        if sairflow_run_override is not None and entry.system == System.BEEFLOW:
            run_id = sairflow_run_override

        dag_tasks = filter_on_dag_ids(
            df=get_stats(system=entry.system, date=entry.export_date, stats=Stats.TASKS),
            dag_ids=entry.dag_ids,
        )
        # Copy before adding columns so the cached export is never written to.
        entry_tasks = dag_tasks[dag_tasks["run_id"] == run_id].copy()
        entry_tasks["start_date"] = pd.to_datetime(entry_tasks["start_date"])
        entry_tasks["end_date"] = pd.to_datetime(entry_tasks["end_date"])
        entry_tasks = entry_tasks.sort_values("start_date", ascending=True)
        charts_data.append(entry_tasks)

        # Track the longest run; it defines the shared x range.
        run_span = entry_tasks["end_date"].max() - entry_tasks["start_date"].min()
        highest_diff = run_span if highest_diff is None else max(highest_diff, run_span)

    for i, (entry, entry_tasks) in enumerate(zip(entries, charts_data)):
        # Re-base the run on the epoch. Must be read before start_date is
        # overwritten below.
        first_start = entry_tasks["start_date"].min()
        entry_tasks["start_date"] = epoch + (entry_tasks["start_date"] - first_start)
        entry_tasks["end_date"] = epoch + (entry_tasks["end_date"] - first_start)

        gantt_task_chart(entry_tasks,
                         title=system_to_title(entry.system),
                         experiment_id=experiment_id,
                         xaxis_title=xaxis_title,
                         tickformat=tickformat,
                         labels={
                             "task_id": "Tasks",
                             "duration": "Duration [s]",
                         },
                         # Cycle the palette so more than two entries still get a colour.
                         color_discrete_sequence=[palette[i % len(palette)]],
                         range_x=[epoch, epoch + highest_diff],
                         **kwargs)

In [None]:
# Gantt charts for the 16-task AWS Batch comparison. The two systems were
# exported on different days, so the Beeflow entry uses the run given by
# sairflow_run_override and the MWAA entry uses specific_run.
thesis_gantt(
    entries=entries, 
    specific_run="scheduled__2023-03-21T14:30:00+00:00",
    sairflow_run_override="scheduled__2023-05-01T16:40:00+00:00",
    experiment_id="16task_parallel_batch_gantt_fmt", 
    tickformat="%s", 
    xaxis_title="",
    height=700, 
    width=700,
)

### 32 parallel

In [None]:
# 32-task AWS Batch comparison: Beeflow export of "10s_32t_batch" against the
# MWAA export of "10s_32t_30cron" dated 2023-03-21_17-52 (the same export the
# 32-task autoscaling comparison uses).
entries = [
    BoxPlotsEntry(
        system=System.BEEFLOW,
        export_date="2023-05-01_19-25",
        tasks_count=32,
        task_duration=10,
        dag_ids=["10s_32t_batch"],
        additional_metadata="db.small",
    ),
     BoxPlotsEntry(
        system=System.MWAA,
        export_date="2023-03-21_17-52",
        tasks_count=32,
        task_duration=10,
        dag_ids=["10s_32t_30cron"],
        additional_metadata="1workermin_20workersmax_10task{for worker}",
    ),
]

In [None]:
# Box plots for the 32-task AWS Batch comparison; "run_before_loop" and one
# scheduled run are excluded from the plotted data.
box_plots_for(
    entries=entries,
    graph_shape=parallel_graph(32),
    not_task_ids=["run_before_loop"], 
    height=700, 
    width=700, 
    not_dag_run_ids=["scheduled__2023-05-01T18:20:00+00:00"], 
    experiment_id="32task_parallel_batch")

In [None]:
# Gantt charts for the 32-task AWS Batch comparison. The Beeflow entry uses
# the run given by sairflow_run_override and the MWAA entry uses specific_run.
thesis_gantt(
    entries=entries, 
    specific_run="scheduled__2023-03-21T17:00:00+00:00",
    sairflow_run_override="scheduled__2023-05-01T18:30:00+00:00",
    experiment_id="32task_parallel_batch_gantt_fmt", 
    tickformat="%s", 
    xaxis_title="",
    height=700, 
    width=700,
)