# Helper code

In [None]:
from plotly.offline import init_notebook_mode; init_notebook_mode()

In [None]:
# Path prefix (directory with trailing slash) that box_plot_for_thesis puts in front of exported PDF names.
CHARTS_DIRECTORY = "charts/" 

In [None]:
# S3 bucket that get_s3_csv downloads the exported CSV files from.
BUCKET_NAME="beeflow-dev-metadata-dumps"

import pandas as pd
import boto3
import plotly.express as px

# Shared S3 client used by get_s3_csv.
s3 = boto3.client('s3') 

# In-memory cache for get_s3_csv: S3 key -> parsed DataFrame.
s3_cache = {}

def get_s3_csv(file: str):
    """Return the CSV stored under key ``file`` in ``BUCKET_NAME`` as a DataFrame.

    Parsed frames are memoised in the module-level ``s3_cache`` dict, so a key
    is downloaded at most once and repeated calls return the same DataFrame
    object.
    """
    try:
        return s3_cache[file]
    except KeyError:
        pass  # not downloaded yet

    response = s3.get_object(Bucket=BUCKET_NAME, Key=file)
    frame = pd.read_csv(response['Body'])
    s3_cache[file] = frame
    return frame


from enum import Enum

class System(Enum):
    """Systems under comparison; the value is the first path segment of the S3 key built by get_stats."""
    MWAA = "mwaa"
    BEEFLOW = "serverless"
    
class Stats(Enum):
    """Exported tables; the value is used by get_stats both as a key segment and as the CSV file name."""
    DAG_RUNS = "dagrun"
    TASKS = "taskinstance"
    
def get_stats(system: System, date: str, stats: Stats):
    """Load the ``stats`` table exported for ``system`` under the partition ``dt=<date>``.

    The frame comes from ``get_s3_csv`` and is therefore cached per S3 key.
    """
    table = stats.value
    key = f"{system.value}/export/{table}/dt={date}/{table}.csv"
    return get_s3_csv(key)

def filter_on_dag_ids(df, dag_ids):
    """Return the rows of ``df`` whose ``dag_id`` is one of ``dag_ids``.

    The result is an explicit copy: callers add columns to it in place (see
    the ``enhance_with_*`` helpers), and doing that on a bare boolean-mask
    slice makes pandas emit ``SettingWithCopyWarning``.
    """
    return df[df['dag_id'].isin(dag_ids)].copy()

def filter_on_not_task_ids(df, task_ids):
    """Return the rows of ``df`` whose ``task_id`` is NOT in ``task_ids``.

    The result is an explicit copy so that callers can add columns to it in
    place without pandas emitting ``SettingWithCopyWarning``.
    """
    return df[~df['task_id'].isin(task_ids)].copy()

def filter_on_not_dag_run_id(df, dag_run_ids):
    """Return the rows of ``df`` whose ``run_id`` is NOT in ``dag_run_ids``.

    The result is an explicit copy so that callers can add columns to it in
    place without pandas emitting ``SettingWithCopyWarning``.
    """
    return df[~df['run_id'].isin(dag_run_ids)].copy()
    

In [None]:
def enhance_with_duration(df):
    """Add a ``duration`` column to ``df`` in place: seconds from ``start_date`` to ``end_date``."""
    started_at = pd.to_datetime(df['start_date'])
    finished_at = pd.to_datetime(df['end_date'])
    elapsed = finished_at - started_at
    df['duration'] = elapsed.dt.total_seconds()
    
def enhance_with_wait_time_dag_runs(df):
    """Add a ``wait_time`` column to ``df`` in place: seconds from ``queued_at`` to ``start_date``."""
    queued_at = pd.to_datetime(df['queued_at'])
    started_at = pd.to_datetime(df['start_date'])
    waited = started_at - queued_at
    df['wait_time'] = waited.dt.total_seconds()
    
def enhance_with_wait_time_tasks(df):
    """Add a ``wait_time`` column to ``df`` in place: seconds from ``queued_dttm`` to ``start_date``."""
    queued_at = pd.to_datetime(df['queued_dttm'])
    started_at = pd.to_datetime(df['start_date'])
    waited = started_at - queued_at
    df['wait_time'] = waited.dt.total_seconds()

In [None]:
from dataclasses import dataclass
from typing import List

@dataclass
class BoxPlotEntry:
    """One data series (one box) for box_plot_for / box_plot_for_thesis."""
    # Rows to plot; must contain the column passed as ``measured_column``.
    data: pd.DataFrame
    # Series label used by box_plot_for.
    identifier: str
    # Series label used by box_plot_for_thesis.
    system: str = ""

In [None]:
def box_plot_for(entries: List[BoxPlotEntry], measured_column="duration", hide_x_axis=False, **kwargs):
    """Show a box plot with one box (and all points) per entry, labelled by ``entry.identifier``.

    Args:
        entries: series to compare; every ``data`` frame must contain ``measured_column``.
        measured_column: column plotted on the y axis.
        hide_x_axis: hide the x axis and its tick labels.
        **kwargs: forwarded to ``plotly.express.box`` (e.g. ``title``, ``hover_data``, ``height``).

    Raises:
        ValueError: from ``pd.concat`` when ``entries`` is empty.
    """
    # ``assign`` always returns a new frame. The previous ``.loc[:, ]`` + item
    # assignment could alias (and therefore modify) the caller's DataFrame,
    # depending on the pandas version.
    labelled = [entry.data.assign(identifier=entry.identifier) for entry in entries]
    fig = px.box(pd.concat(labelled), x="identifier", y=measured_column, color="identifier", points="all", **kwargs)
    if hide_x_axis:
        fig.update_xaxes(visible=False, showticklabels=False)
    fig.show()

In [None]:
def box_plot_for_thesis(entries: List[BoxPlotEntry], measured_column="duration", hide_x_axis=False, experiment_id="", postprocess=None, **kwargs):
    """Show a publication-styled box plot (one box per ``entry.system``) and save it as a PDF.

    The figure is written to ``f"{CHARTS_DIRECTORY}{title}-{experiment_id}.pdf"``.

    Args:
        entries: series to compare; every ``data`` frame must contain ``measured_column``.
        measured_column: column plotted on the y axis.
        hide_x_axis: hide the x axis and its tick labels.
        experiment_id: suffix of the exported file name.
        postprocess: optional ``postprocess(title, fig)`` hook called just before the figure is shown.
        **kwargs: forwarded to ``plotly.express.box``; ``title`` is mandatory.

    Raises:
        KeyError: if ``title`` is not passed.
        ValueError: from ``pd.concat`` when ``entries`` is empty.
    """
    import os  # local import so this cell does not depend on another cell's imports

    # Look the title up first so a missing one fails before any plotting work.
    title = kwargs["title"]

    # ``assign`` always returns a new frame. The previous ``.loc[:, ]`` + item
    # assignment could alias (and therefore modify) the caller's DataFrame,
    # depending on the pandas version.
    labelled = [entry.data.assign(identifier=entry.system) for entry in entries]
    fig = px.box(pd.concat(labelled), x="identifier", y=measured_column, color="identifier", points="all", **kwargs)
    if hide_x_axis:
        fig.update_xaxes(visible=False, showticklabels=False)
    fig.update_layout(
        showlegend=False,
        title_text=title,
        title_x=0.5,  # centre the title
        xaxis_title=None,
        paper_bgcolor='rgb(255, 255, 255)',
        plot_bgcolor='rgb(255, 255, 255)',
        font=dict(
            family='Latin Modern Math',
            color='black',
            size=32,
        ),
    )
    # Light grey grid plus a mirrored frame around the plotting area.
    fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey')
    fig.update_xaxes(showline=True, linewidth=2, linecolor='LightGrey', mirror=True)
    fig.update_yaxes(showline=True, linewidth=2, linecolor='LightGrey', mirror=True)
    if postprocess is not None:
        postprocess(title, fig)
    fig.show()

    target = f'{CHARTS_DIRECTORY}{title}-{experiment_id}.pdf'
    # Create the export directory on first use instead of failing on the write.
    parent = os.path.dirname(target)
    if parent:
        os.makedirs(parent, exist_ok=True)
    fig.write_image(target)

In [None]:
@dataclass
class BoxPlotsEntry:
    """One export (system + export date + DAG ids) to include in box_plots_for / trends_for / print_tables."""
    # System whose export is loaded.
    system: System
    # Export partition, passed to get_stats as ``date``.
    export_date: str
    # Only rows with one of these dag_id values are kept.
    dag_ids: List[str]
    # Used in identifiers and in the "ideal: ...s" plot title; presumably the configured task length in seconds.
    task_duration: int
    # Used in identifiers and as the x value in trends_for; presumably the number of tasks in the DAG.
    tasks_count: int
    # Free-form text appended to the series identifier.
    additional_metadata: str = ""

In [None]:
def system_to_title(system):
    """Return the display name of ``system``: "MWAA" for ``System.MWAA``, "Serverless" for anything else."""
    return "MWAA" if system == System.MWAA else "Serverless"

In [None]:
def box_plots_for(entries: List[BoxPlotsEntry], not_task_ids, not_dag_run_ids=None, hide_x_axis=False, experiment_id="", postprocess=None, **kwargs):
    """Draw the DAG-run and task box plots comparing the given exports.

    Every measurement is drawn twice: an exploratory figure (``box_plot_for``,
    labelled by a verbose identifier) and a thesis-styled figure
    (``box_plot_for_thesis``, labelled by system name and saved as a PDF).
    The measurements are, in order: DAG-run duration, task duration and task
    wait time.

    Args:
        entries: exports to compare, one box per entry.
        not_task_ids: task ids excluded from the task-level plots.
        not_dag_run_ids: run ids excluded from every plot; ``None`` (the default) excludes nothing.
        hide_x_axis: forwarded to both plotting helpers.
        experiment_id: forwarded to ``box_plot_for_thesis`` (part of the PDF file name).
        postprocess: optional ``postprocess(title, fig)`` hook forwarded to ``box_plot_for_thesis``.
        **kwargs: forwarded to every plotting call (e.g. ``height``, ``width``).
    """
    # ``None`` replaces the former mutable ``[]`` default argument.
    excluded_run_ids = [] if not_dag_run_ids is None else not_dag_run_ids

    def to_plot_entry(entry, data):
        # Wrap a prepared frame together with the labels both plot flavours need.
        return BoxPlotEntry(
            data=data,
            identifier=f"{entry.system.value}_tasks:{entry.tasks_count}_duration:{entry.task_duration}_additional:{entry.additional_metadata}",
            system=system_to_title(entry.system),
        )

    def thesis_plot(data, measured_column, title, axis_label):
        # Thesis-styled figure; only the column, title and y-axis label vary.
        box_plot_for_thesis(
            data,
            measured_column=measured_column,
            title=title,
            hover_data=["run_id"],
            labels={
                "identifier": "System",
                measured_column: axis_label,
            },
            experiment_id=experiment_id,
            hide_x_axis=hide_x_axis,
            postprocess=postprocess,
            **kwargs,
        )

    # --- DAG runs -----------------------------------------------------------
    task_duration = None  # the last entry's value ends up in the task plot title
    dags_data = []
    for entry in entries:
        task_duration = entry.task_duration
        dag_runs = filter_on_not_dag_run_id(
            df=filter_on_dag_ids(
                df=get_stats(system=entry.system, date=entry.export_date, stats=Stats.DAG_RUNS),
                dag_ids=entry.dag_ids,
            ),
            dag_run_ids=excluded_run_ids,
        )
        enhance_with_duration(dag_runs)
        enhance_with_wait_time_dag_runs(dag_runs)
        dags_data.append(to_plot_entry(entry, dag_runs))

    box_plot_for(dags_data,
                 measured_column="duration",
                 title="DAG runs comparison: duration",
                 hover_data=["run_id"],
                 hide_x_axis=hide_x_axis,
                 **kwargs)
    thesis_plot(dags_data, "duration", "DAG run", "Duration [s]")

    # --- Task instances -----------------------------------------------------
    tasks_data = []
    for entry in entries:
        tasks = filter_on_not_dag_run_id(
            df=filter_on_not_task_ids(
                df=filter_on_dag_ids(
                    df=get_stats(system=entry.system, date=entry.export_date, stats=Stats.TASKS),
                    dag_ids=entry.dag_ids,
                ),
                task_ids=not_task_ids,
            ),
            dag_run_ids=excluded_run_ids,
        )
        enhance_with_duration(tasks)
        enhance_with_wait_time_tasks(tasks)
        tasks_data.append(to_plot_entry(entry, tasks))

    box_plot_for(tasks_data,
                 measured_column="duration",
                 title=f"Tasks runs comparison: task duration (ideal: {task_duration}s)",
                 hide_x_axis=hide_x_axis,
                 hover_data=["task_id", "run_id"],
                 **kwargs)
    thesis_plot(tasks_data, "duration", "Task duration", "Duration [s]")

    box_plot_for(tasks_data,
                 measured_column="wait_time",
                 title="Tasks run comparision: wait time (amount of time passed from task being marked as queued to started)",
                 hide_x_axis=hide_x_axis,
                 hover_data=["task_id", "run_id"],
                 **kwargs)
    thesis_plot(tasks_data, "wait_time", "Wait time", "Wait time [s]")
        

In [None]:
def trend_for(data: pd.DataFrame, x_col="tasks_count", measured_column="duration", hide_x_axis=False, **kwargs):
    """Show a line chart of ``measured_column`` over ``x_col``, one line per ``identifier`` value.

    Args:
        data: frame with the columns ``x_col``, ``measured_column`` and ``identifier``.
        x_col: column plotted on the x axis.
        measured_column: column plotted on the y axis.
        hide_x_axis: hide the x axis and its tick labels. This used to be read
            from ``kwargs``, but ``kwargs`` is forwarded to ``px.line`` first and
            ``px.line`` rejects the unknown keyword, so the option could never
            take effect.
        **kwargs: forwarded to ``plotly.express.line`` (e.g. ``title``).
    """
    fig = px.line(data, x=x_col, y=measured_column, color="identifier", **kwargs)
    if hide_x_axis:
        fig.update_xaxes(visible=False, showticklabels=False)
    fig.show()

In [None]:
def trends_for(entries: List[BoxPlotsEntry], not_task_ids, not_dag_run_ids=None, **kwargs):
    """Plot median DAG duration, median task duration and median task wait time against ``tasks_count``.

    Each entry contributes one point; points sharing the same
    ``"<system value>:<additional_metadata>"`` identifier form one line.

    Args:
        entries: exports to summarise.
        not_task_ids: task ids excluded from the task-level medians.
        not_dag_run_ids: run ids excluded from every median; ``None`` (the default) excludes nothing.
        **kwargs: forwarded to every ``trend_for`` call.
    """
    # ``None`` replaces the former mutable ``[]`` default argument.
    excluded_run_ids = [] if not_dag_run_ids is None else not_dag_run_ids

    dag_rows = []
    for entry in entries:
        dag_runs = filter_on_not_dag_run_id(
            df=filter_on_dag_ids(
                df=get_stats(system=entry.system, date=entry.export_date, stats=Stats.DAG_RUNS),
                dag_ids=entry.dag_ids,
            ),
            dag_run_ids=excluded_run_ids,
        )
        enhance_with_duration(dag_runs)
        enhance_with_wait_time_dag_runs(dag_runs)
        dag_rows.append(
            {
                "median_duration": dag_runs["duration"].median(),
                "identifier": f"{entry.system.value}:{entry.additional_metadata}",
                "tasks_count": entry.tasks_count,
            }
        )

    trend_for(data=pd.DataFrame(dag_rows), measured_column="median_duration", title="Median DAG duration", **kwargs)

    task_rows = []
    for entry in entries:
        tasks = filter_on_not_dag_run_id(
            df=filter_on_not_task_ids(
                df=filter_on_dag_ids(
                    df=get_stats(system=entry.system, date=entry.export_date, stats=Stats.TASKS),
                    dag_ids=entry.dag_ids,
                ),
                task_ids=not_task_ids,
            ),
            dag_run_ids=excluded_run_ids,
        )
        enhance_with_duration(tasks)
        enhance_with_wait_time_tasks(tasks)
        task_rows.append(
            {
                "median_duration": tasks["duration"].median(),
                "median_wait_time": tasks["wait_time"].median(),
                "identifier": f"{entry.system.value}:{entry.additional_metadata}",
                "tasks_count": entry.tasks_count,
            }
        )

    # Both task charts read the same summary frame, so build it once.
    task_summary = pd.DataFrame(task_rows)
    trend_for(data=task_summary, measured_column="median_duration", title="Median Tasks duration", **kwargs)
    trend_for(data=task_summary, measured_column="median_wait_time", title="Median Tasks waittime", **kwargs)
    
        

In [None]:
from IPython.display import display


def print_tables(entries: List[BoxPlotsEntry]):
    """For each entry, print its system and display its DAG runs with their computed durations."""
    shown_columns = ["duration", "run_id", "start_date", "end_date", "_state"]
    for entry in entries:
        all_runs = get_stats(system=entry.system, date=entry.export_date, stats=Stats.DAG_RUNS)
        dag_runs = filter_on_dag_ids(df=all_runs, dag_ids=entry.dag_ids)
        enhance_with_duration(dag_runs)
        print(entry.system)
        display(dag_runs[shown_columns])

In [None]:
@dataclass
class HistogramEntry:
    """One export (system + export date + DAG ids) to include in compare_runs_by_histograms_for."""
    # System whose export is loaded.
    system: System
    # Export partition, passed to get_stats as ``date``.
    export_date: str
    # Only rows with one of these dag_id values are kept.
    dag_ids: List[str]
    # Copied into the series identifier; presumably the configured task length in seconds.
    task_duration: int
    # Copied into the series identifier; presumably the number of tasks in the DAG.
    tasks_count: int

In [None]:
@dataclass
class HistogramPlotEntry:
    """One prepared data series for cross_histogram_plot_for."""
    # Used (via ``.value``) in the series identifier.
    system: System
    # Rows to plot; must contain the measured column.
    data: pd.DataFrame
    # Used in the series identifier.
    task_duration: int
    # Used in the series identifier.
    tasks_count: int


def cross_histogram_plot_for(entries: List[HistogramPlotEntry],
                             nbins=30,
                             measurment_column_name="duration",
                             **kwargs):
    """Show grouped histograms (with marginal box plots) of one column, one colour per entry.

    Args:
        entries: prepared series; each ``data`` frame must contain ``measurment_column_name``.
        nbins: forwarded to ``plotly.express.histogram``.
        measurment_column_name: column whose distribution is plotted (on the y axis).
        **kwargs: forwarded to ``plotly.express.histogram`` (e.g. ``title``).
    """
    def labelled(entry):
        # Keep only the measured column and tag every row with the series name.
        series_name = f"{entry.system.value}_tasks:{entry.tasks_count}_duration:{entry.task_duration}"
        frame = entry.data.loc[:, [measurment_column_name]]
        frame["identifier"] = series_name
        return frame

    combined = pd.concat([labelled(entry) for entry in entries])
    fig = px.histogram(
        combined,
        y=measurment_column_name,
        color="identifier",
        marginal="box",
        barmode="group",
        nbins=nbins,
        **kwargs,
    )
    fig.show()

In [None]:
def compare_runs_by_histograms_for(entries: List[HistogramEntry], not_task_ids, nbins=30, **kwargs):
    """Show histograms comparing DAG-run duration, task duration and task wait time across entries.

    Args:
        entries: exports to compare.
        not_task_ids: task ids excluded from the task-level histograms.
        nbins: forwarded to ``cross_histogram_plot_for``.
        **kwargs: forwarded to every ``cross_histogram_plot_for`` call.
    """
    def wrap(entry, frame):
        # Pair a prepared frame with the metadata used for its series label.
        return HistogramPlotEntry(
            system=entry.system,
            data=frame,
            task_duration=entry.task_duration,
            tasks_count=entry.tasks_count,
        )

    # DAG runs
    dag_series = []
    for entry in entries:
        runs = get_stats(system=entry.system, date=entry.export_date, stats=Stats.DAG_RUNS)
        runs = filter_on_dag_ids(df=runs, dag_ids=entry.dag_ids)
        enhance_with_duration(runs)
        enhance_with_wait_time_dag_runs(runs)
        dag_series.append(wrap(entry, runs))
    cross_histogram_plot_for(
        dag_series,
        nbins,
        measurment_column_name="duration",
        title="DAG runs comparison: duration",
        **kwargs,
    )

    # Task instances
    task_series = []
    for entry in entries:
        tasks = get_stats(system=entry.system, date=entry.export_date, stats=Stats.TASKS)
        tasks = filter_on_dag_ids(df=tasks, dag_ids=entry.dag_ids)
        tasks = filter_on_not_task_ids(df=tasks, task_ids=not_task_ids)
        enhance_with_duration(tasks)
        enhance_with_wait_time_tasks(tasks)
        task_series.append(wrap(entry, tasks))
    cross_histogram_plot_for(
        task_series,
        nbins,
        measurment_column_name="duration",
        title="Tasks runs comparison: duration",
        **kwargs,
    )
    cross_histogram_plot_for(
        task_series,
        nbins,
        measurment_column_name="wait_time",
        title="Tasks run comparision: wait time (start - queued)",
        **kwargs,
    )
        

In [None]:
def gantt_task(df, range_x=None):
    """Show a timeline of the rows in ``df``: one bar per row from ``start_date`` to ``end_date``,
    placed on the row of its ``task_id`` and coloured by ``run_id``.

    ``range_x`` is forwarded to ``plotly.express.timeline``.
    """
    timeline = px.timeline(
        df,
        x_start="start_date",
        x_end="end_date",
        y="task_id",
        color="run_id",
        range_x=range_x,
    )
    # Flip the y axis so the categories read top-down.
    timeline.update_yaxes(autorange="reversed")
    timeline.show()

In [None]:
@dataclass
class TasksChartEntry:
    """One export (system + export date + DAG ids) to draw with tasks_plot."""
    # System whose export is loaded.
    system: System
    # Export partition, passed to get_stats as ``date``.
    export_date: str
    # Only rows with one of these dag_id values are kept.
    dag_ids: List[str]
    # Not read by tasks_plot.
    task_duration: int
    # Not read by tasks_plot.
    tasks_count: int

In [None]:
def tasks_plot(entry: TasksChartEntry, not_task_ids=None, **kwargs):
    """Show a Gantt-style timeline of the task instances of ``entry``.

    Args:
        entry: export to draw.
        not_task_ids: task ids to leave out; ``None`` (the default) leaves nothing out.
        **kwargs: accepted for call-site compatibility but currently ignored.
    """
    # ``None`` replaces the former mutable ``[]`` default argument.
    excluded_task_ids = [] if not_task_ids is None else not_task_ids
    entry_tasks = filter_on_not_task_ids(
        df=filter_on_dag_ids(
            df=get_stats(system=entry.system, date=entry.export_date, stats=Stats.TASKS),
            dag_ids=entry.dag_ids,
        ),
        task_ids=excluded_task_ids,
    )
    gantt_task(entry_tasks)
    

In [None]:
# Keyword arguments shared by the "cold start" arrow annotations added in
# postprocess; callers supply only the x/y position of the arrow.
arrow_standard_args = {
    "xref": "x",
    "yref": "y",
    "text": "cold start",
    "showarrow": True,
    "font": {
        "family": 'Latin Modern Math',
        "color": 'black',
        "size": 28,
    },
    "align": "center",
    "arrowhead": 2,
    "arrowsize": 1,
    "arrowwidth": 2,
    "arrowcolor": "black",
    "ax": 100,
    "ay": 0,
    "borderwidth": 0,
    "borderpad": 4,
    "opacity": 0.8,
}

# Points

## Beeflow's performance for line DAGs rivals MWAA

A simple line DAG is executed every 5 minutes (so the lambda is WARM for every DAG run except the first; the only cold start is the first task of the first DAG run).
One task runs at a time; the chain has 1/5/10/15 tasks.
Beeflow finishes the chain in roughly `N*15s`, while MWAA takes roughly `N*13s`.
With Beeflow cold starts that would be roughly `(N-1)*15s + 1*(15s+coldstart)`, where `coldstart` is around `8-10s` (shown on the diagram as well).

As mentioned, the lambda will _possibly_ be cold only for the first task in the DAG run (subsequent tasks
should reuse the container, and they do).

![Untitled Diagram-Page-2.drawio (1).png](attachment:c28181e2-402d-403f-bdde-7ecc5436749a.png)

The following config was applied for the systems

1. Beeflow
- `db.micro` (free tier), enough to handle 1 concurrent task!
- Beeflow scheduler lambdas use: 512MB mem -> ~1/3 vCPU
- beeflow worker lambdas use: 512MB mem -> ~1/3 vCPU (but the only thing they do is sleep)

2. MWAA
- small environment (limits scheduler to 1vCPU)
- 1 worker min, 1 worker max
- 5 tasks per worker (default)
- one worker has 1 vCPU and 2GB RAM 

### 1 task

In [None]:
# Line DAG "10s_1t_line" (1 task): Beeflow export vs. MWAA export.
entries = [
    BoxPlotsEntry(
        system=system,
        export_date=export_date,
        tasks_count=1,
        task_duration=10,
        dag_ids=["10s_1t_line"],
    )
    for system, export_date in (
        (System.BEEFLOW, "2023-03-12_18-48"),
        (System.MWAA, "2023-03-12_19-01"),
    )
]

In [None]:
# Warm-only comparison: two scheduled runs are excluded here, one more than in
# the "1task_line_withcold" variant.
box_plots_for(
    entries=entries,
    not_task_ids=[],
    not_dag_run_ids=[
        "scheduled__2023-03-12T17:35:00+00:00",
        "scheduled__2023-03-12T17:40:00+00:00",
    ],
    height=600,
    width=700,
    experiment_id="1task_line",
)

In [None]:
def postprocess(title, fig):
    """Add a "cold start" arrow to the thesis figures titled "DAG run" and "Wait time".

    Figures with any other title are left untouched. The arrow positions are
    hard-coded for the 1-task line experiment plotted below.
    """
    arrow_positions = (
        ("DAG run", 24.95),
        ("Wait time", 11.36),
    )
    for annotated_title, arrow_y in arrow_positions:
        if title == annotated_title:
            fig.add_annotation(
                x=-0.30,
                y=arrow_y,
                **arrow_standard_args,
            )
        

# Same comparison, but only one scheduled run is excluded and the thesis plots
# are annotated with "cold start" arrows via postprocess.
box_plots_for(
    entries=entries,
    not_task_ids=[],
    not_dag_run_ids=["scheduled__2023-03-12T17:35:00+00:00"],
    height=600,
    width=700,
    experiment_id="1task_line_withcold",
    postprocess=postprocess,
)

### 5 tasks

In [None]:
# Line DAG "10s_5t_line" (5 tasks): Beeflow export vs. MWAA export.
entries = [
    BoxPlotsEntry(
        system=system,
        export_date=export_date,
        tasks_count=5,
        task_duration=10,
        dag_ids=["10s_5t_line"],
    )
    for system, export_date in (
        (System.BEEFLOW, "2023-03-12_20-19"),
        (System.MWAA, "2023-03-12_20-30"),
    )
]

In [None]:
# Box plots for the 5-task line DAG; one scheduled run is excluded.
box_plots_for(
    entries=entries,
    not_task_ids=[],
    not_dag_run_ids=["scheduled__2023-03-12T19:05:00+00:00"],
    height=700,
    width=700,
    experiment_id="5task_line",
)

### 10 tasks

In [None]:
# Line DAG "10s_10t_line" (10 tasks): Beeflow export vs. MWAA export.
entries = [
    BoxPlotsEntry(
        system=system,
        export_date=export_date,
        tasks_count=10,
        task_duration=10,
        dag_ids=["10s_10t_line"],
    )
    for system, export_date in (
        (System.BEEFLOW, "2023-03-12_21-48"),
        (System.MWAA, "2023-03-12_21-59"),
    )
]

In [None]:
# Box plots for the 10-task line DAG; one scheduled run is excluded.
box_plots_for(
    entries=entries,
    not_task_ids=[],
    not_dag_run_ids=["scheduled__2023-03-12T20:35:00+00:00"],
    height=700,
    width=700,
    experiment_id="10task_line",
)

### 15 tasks

In [None]:
# Line DAG "10s_15t_line" (15 tasks): MWAA export vs. Beeflow export.
entries = [
    BoxPlotsEntry(
        system=system,
        export_date=export_date,
        tasks_count=15,
        task_duration=10,
        dag_ids=["10s_15t_line"],
    )
    for system, export_date in (
        (System.MWAA, "2022-12-11_20-45"),
        (System.BEEFLOW, "2022-12-28_16-43"),
    )
]

In [None]:
# NOTE(review): no experiment_id is passed, so the thesis PDFs get an empty
# suffix and share their file names with every other call that omits it.
box_plots_for(
    entries=entries,
    not_task_ids=[],
    height=700,
    width=1500,
)

### Trends

In [None]:
# Line DAGs with 5, 10 and 15 tasks, one MWAA and one Beeflow export each.
entries = [
    BoxPlotsEntry(
        system=system,
        export_date=export_date,
        tasks_count=tasks_count,
        task_duration=10,
        dag_ids=[dag_id],
    )
    for system, export_date, tasks_count, dag_id in (
        (System.MWAA, "2022-12-11_17-47", 5, "10s_5t_line"),
        (System.BEEFLOW, "2022-12-28_13-52", 5, "10s_5t_line"),
        (System.MWAA, "2022-12-11_19-16", 10, "10s_10t_line"),
        (System.BEEFLOW, "2022-12-28_15-24", 10, "10s_10t_line"),
        (System.MWAA, "2022-12-11_20-45", 15, "10s_15t_line"),
        (System.BEEFLOW, "2022-12-28_16-43", 15, "10s_15t_line"),
    )
]

In [None]:
# Median duration / wait-time trends over the number of tasks in the line DAG.
trends_for(
    entries=entries,
    not_task_ids=[],
)

## Beeflow is more performant for warm starts (parallel)

![Untitled Diagram.drawio (2).png](attachment:2fcbc691-7c93-453d-9b96-834330bec01b.png)

For this experiment, the following configuration was applied:

1. Beeflow
- `db.small` was used instead of `db.micro`
- the DB allows for around 170 concurrent connections; the cost of `db.micro` is `~14$`, while `db.small` is double that (`~28$`)
- `db.micro` is free under the free tier, that allows for `~85` concurrent connections, so highly parallel tests were throttled heavily
- `MWAA` seems to use something between `db.small` and `db.medium` (pulled from metrics, there is no official info on this)
- thus the comparison is fair (more on this later); the DB is the single point of failure and the bottleneck of the system
- another possibility and what MWAA leverages is [a managed proxy service on AWS](https://aws.amazon.com/rds/proxy/) but GCP doesn't seem to have the equivalent (TO CONFIRM)
- the proxy costs `21$+` a month, so the `db.small` upgrade was chosen
- the rest of the setup stays the same
- Beeflow scheduler lambdas use: 512MB mem -> ~1/3 vCPU
- beeflow worker lambdas use: 512MB mem -> ~1/3 vCPU (but the only thing they do is sleep)

2. MWAA
- small environment (limits scheduler to 1vCPU)
- 20 workers min, 20 workers max
- 10 tasks per worker
- one worker has 1 vCPU and 2GB RAM (but again, the tasks in both sleep, so it doesn't matter much)

### 16 tasks

In [None]:
# Parallel DAG "10s_16t" (16 tasks): Beeflow export vs. MWAA export.
entries = [
    BoxPlotsEntry(
        system=system,
        export_date=export_date,
        tasks_count=16,
        task_duration=10,
        dag_ids=["10s_16t"],
        additional_metadata=additional_metadata,
    )
    for system, export_date, additional_metadata in (
        (System.BEEFLOW, "2023-03-14_15-04", "db.small"),
        (System.MWAA, "2023-03-14_15-15", "20workers{min==max}_10task{for worker}"),
    )
]

In [None]:
# The "run_before_loop" task is left out of the task-level plots.
box_plots_for(
    entries=entries,
    not_task_ids=["run_before_loop"],
    not_dag_run_ids=["scheduled__2023-03-14T13:50:00+00:00"],
    height=700,
    width=700,
    experiment_id="16task_parallel",
)

### 32 tasks

In [None]:
# Parallel DAG "10s_32t" (32 tasks): Beeflow export vs. MWAA export.
entries = [
    BoxPlotsEntry(
        system=system,
        export_date=export_date,
        tasks_count=32,
        task_duration=10,
        dag_ids=["10s_32t"],
        additional_metadata=additional_metadata,
    )
    for system, export_date, additional_metadata in (
        (System.BEEFLOW, "2023-03-14_16-33", "db.small"),
        (System.MWAA, "2023-03-14_16-44", "20workers{min==max}_10task{for worker}"),
    )
]

In [None]:
# The "run_before_loop" task is left out of the task-level plots.
box_plots_for(
    entries=entries,
    not_task_ids=["run_before_loop"],
    not_dag_run_ids=["scheduled__2023-03-14T15:20:00+00:00"],
    height=700,
    width=700,
    experiment_id="32task_parallel",
)

### 64 tasks

In [None]:
# Parallel DAG "10s_64t" (64 tasks): Beeflow export vs. MWAA export.
entries = [
    BoxPlotsEntry(
        system=system,
        export_date=export_date,
        tasks_count=64,
        task_duration=10,
        dag_ids=["10s_64t"],
        additional_metadata=additional_metadata,
    )
    for system, export_date, additional_metadata in (
        (System.BEEFLOW, "2023-03-14_18-02", "db.small"),
        (System.MWAA, "2023-03-14_18-13", "20workers{min==max}_10task{for worker}"),
    )
]

In [None]:
# The "run_before_loop" task is left out of the task-level plots.
box_plots_for(
    entries=entries,
    not_task_ids=["run_before_loop"],
    not_dag_run_ids=["scheduled__2023-03-14T16:50:00+00:00"],
    height=700,
    width=700,
    experiment_id="64task_parallel",
)

In [None]:
# Parallel DAG "30s_64t" (64 tasks, task_duration=30): Beeflow export vs. MWAA export.
entries = [
    BoxPlotsEntry(
        system=system,
        export_date=export_date,
        tasks_count=64,
        task_duration=30,
        dag_ids=["30s_64t"],
        additional_metadata=additional_metadata,
    )
    for system, export_date, additional_metadata in (
        (System.BEEFLOW, "2023-01-19_14-12", "db.small"),
        (System.MWAA, "2023-01-19_14-24", "20workers{min==max}_10task{for worker}"),
    )
]

In [None]:
# NOTE(review): no experiment_id is passed, so the thesis PDFs get an empty
# suffix and share their file names with every other call that omits it.
box_plots_for(
    entries=entries,
    not_task_ids=["run_before_loop"],
    height=700,
    width=1500,
)

### 125 tasks

In this case MWAA also started to throw some transient errors: 2 DAG runs did not finish in time and
a couple of tasks ran for ~5 minutes. These are excluded from the diagram for the sake of a proper comparison.

In [None]:
# Parallel DAG "10s_125t" (125 tasks): Beeflow export vs. MWAA export.
entries = [
    BoxPlotsEntry(
        system=system,
        export_date=export_date,
        tasks_count=125,
        task_duration=10,
        dag_ids=["10s_125t"],
        additional_metadata=additional_metadata,
    )
    for system, export_date, additional_metadata in (
        (System.BEEFLOW, "2023-03-14_19-31", "db.small"),
        (System.MWAA, "2023-03-14_19-42", "20workers{min==max}_10task{for worker}"),
    )
]

In [None]:
# Three scheduled runs are excluded from every plot of this experiment.
box_plots_for(
    entries=entries,
    not_task_ids=["run_before_loop"],
    not_dag_run_ids=[
        "scheduled__2023-03-14T18:20:00+00:00",
        "scheduled__2023-03-14T18:15:00+00:00",
        "scheduled__2023-03-14T18:25:00+00:00",
    ],
    height=700,
    width=700,
    experiment_id="125task_parallel",
)

In [None]:
# Task timeline for the MWAA export of the 125-task DAG.
entry = TasksChartEntry(
    system=System.MWAA,
    export_date="2023-01-17_18-19",
    dag_ids=["10s_125t"],
    task_duration=10,
    tasks_count=125,
)

tasks_plot(entry)

In [None]:
# Task timeline for the Beeflow export of the 125-task DAG.
entry = TasksChartEntry(
    system=System.BEEFLOW,
    export_date="2023-01-17_18-08",
    dag_ids=["10s_125t"],
    task_duration=10,
    tasks_count=125,
)

tasks_plot(entry)

In [None]:
# Parallel DAG "30s_125t" (125 tasks, task_duration=30): Beeflow export vs. MWAA export.
entries = [
    BoxPlotsEntry(
        system=system,
        export_date=export_date,
        tasks_count=125,
        task_duration=30,
        dag_ids=["30s_125t"],
        additional_metadata=additional_metadata,
    )
    for system, export_date, additional_metadata in (
        (System.BEEFLOW, "2023-01-19_15-42", "db.small"),
        (System.MWAA, "2023-01-19_15-53", "20workers{min==max}_10task{for worker}"),
    )
]

In [None]:
# NOTE(review): the excluded run ids are dated 2023-01-07 while the exports are
# from 2023-01-19 - confirm they still match runs in this data. No
# experiment_id is passed either, so the thesis PDFs get an empty suffix.
box_plots_for(
    entries=entries,
    not_task_ids=["run_before_loop"],
    not_dag_run_ids=[
        "scheduled__2023-01-07T21:45:00+00:00",
        "scheduled__2023-01-07T21:35:00+00:00",
    ],
    height=700,
    width=1500,
)

### Trends

In [None]:
# Parallel DAGs with 16, 32, 64 and 125 tasks, one Beeflow and one MWAA export each.
entries = [
    BoxPlotsEntry(
        system=system,
        export_date=export_date,
        tasks_count=tasks_count,
        task_duration=10,
        dag_ids=[dag_id],
    )
    for system, export_date, tasks_count, dag_id in (
        (System.BEEFLOW, "2023-01-17_13-40", 16, "10s_16t"),
        (System.MWAA, "2023-01-17_13-52", 16, "10s_16t"),
        (System.BEEFLOW, "2023-01-17_15-10", 32, "10s_32t"),
        (System.MWAA, "2023-01-17_15-21", 32, "10s_32t"),
        (System.BEEFLOW, "2023-01-17_16-39", 64, "10s_64t"),
        (System.MWAA, "2023-01-17_16-50", 64, "10s_64t"),
        (System.BEEFLOW, "2023-01-17_18-08", 125, "10s_125t"),
        (System.MWAA, "2023-01-17_18-19", 125, "10s_125t"),
    )
]

In [None]:
# Median duration / wait-time trends over the number of parallel tasks.
trends_for(
    entries=entries,
    not_task_ids=[],
)

## Beeflow's performance for cold starts rivals warm MWAA environment (parallel)

The setup in this experiment is exactly the same as in the previous one with warm starts.
The difference is that the DAG was running on a 30-minute cron instead of a 5-minute one, thus making
every lambda invocation a `cold` start for each subsystem in Beeflow.

### 16 tasks

In [None]:
# 16 parallel tasks: Beeflow (5-minute cron), Beeflow "30cron" DAG, and MWAA.
# NOTE(review): the labels here read "db.small.coldstart" and
# "25workers{min==max}_5task{for worker}", whereas the sibling cells use
# "db.small.coldstarts" and "20workers{min==max}_10task{for worker}" - confirm.
entries = [
    BoxPlotsEntry(
        system=system,
        export_date=export_date,
        tasks_count=16,
        task_duration=10,
        dag_ids=[dag_id],
        additional_metadata=additional_metadata,
    )
    for system, export_date, dag_id, additional_metadata in (
        (System.BEEFLOW, "2023-01-17_13-40", "10s_16t", "db.small"),
        (System.BEEFLOW, "2023-01-18_12-18", "10s_16t_30cron", "db.small.coldstart"),
        (System.MWAA, "2023-01-17_13-52", "10s_16t", "25workers{min==max}_5task{for worker}"),
    )
]

In [None]:
# NOTE(review): the excluded run id is dated 2023-01-08 while the exports are
# from 2023-01-17/18 - confirm it still matches a run. No experiment_id is
# passed, so the thesis PDFs get an empty suffix.
box_plots_for(
    entries=entries,
    not_task_ids=["run_before_loop"],
    not_dag_run_ids=["scheduled__2023-01-08T14:30:00+00:00"],
    height=700,
    width=1500,
    hide_x_axis=True,
)

### 32 tasks

In [None]:
# 32 parallel tasks: Beeflow (5-minute cron), Beeflow "30cron" DAG, and MWAA.
entries = [
    BoxPlotsEntry(
        system=system,
        export_date=export_date,
        tasks_count=32,
        task_duration=10,
        dag_ids=[dag_id],
        additional_metadata=additional_metadata,
    )
    for system, export_date, dag_id, additional_metadata in (
        (System.BEEFLOW, "2023-01-17_15-10", "10s_32t", "db.small"),
        (System.BEEFLOW, "2023-01-18_14-19", "10s_32t_30cron", "db.small.coldstarts"),
        (System.MWAA, "2023-01-17_15-21", "10s_32t", "20workers{min==max}_10task{for worker}"),
    )
]

In [None]:
# No runs are excluded here; the x axis is hidden because the identifiers are long.
box_plots_for(
    entries=entries,
    not_task_ids=["run_before_loop"],
    height=700,
    width=1500,
    hide_x_axis=True,
)

### 64 tasks

In [None]:
# 64 parallel tasks: Beeflow (5-minute cron), Beeflow "30cron" DAG, and MWAA.
entries = [
    BoxPlotsEntry(
        system=system,
        export_date=export_date,
        tasks_count=64,
        task_duration=10,
        dag_ids=[dag_id],
        additional_metadata=additional_metadata,
    )
    for system, export_date, dag_id, additional_metadata in (
        (System.BEEFLOW, "2023-01-17_16-39", "10s_64t", "db.small"),
        (System.BEEFLOW, "2023-01-18_16-20", "10s_64t_30cron", "db.small.coldstarts"),
        (System.MWAA, "2023-01-17_16-50", "10s_64t", "20workers{min==max}_10task{for worker}"),
    )
]

In [None]:
# NOTE(review): the excluded run id is dated 2023-01-08 while the exports are
# from 2023-01-17/18 - confirm it still matches a run.
box_plots_for(
    entries=entries,
    not_task_ids=["run_before_loop"],
    not_dag_run_ids=["scheduled__2023-01-08T18:30:00+00:00"],
    height=700,
    width=1500,
    hide_x_axis=True,
)

### 125 tasks

In [None]:
# 125 parallel tasks: Beeflow (5-minute cron), Beeflow "30cron" DAG, and MWAA.
entries = [
    BoxPlotsEntry(
        system=system,
        export_date=export_date,
        tasks_count=125,
        task_duration=10,
        dag_ids=[dag_id],
        additional_metadata=additional_metadata,
    )
    for system, export_date, dag_id, additional_metadata in (
        (System.BEEFLOW, "2023-01-17_18-08", "10s_125t", "db.small"),
        (System.BEEFLOW, "2023-01-18_18-21", "10s_125t_30cron", "db.small.coldstarts"),
        (System.MWAA, "2023-01-17_18-19", "10s_125t", "20workers{min==max}_10task{for worker}"),
    )
]

In [None]:
# Draw the box plots for the 125-task entries defined in the previous cell.
# NOTE(review): not_task_ids / not_dag_run_ids presumably exclude the listed
# tasks and DAG runs from the plotted data - confirm in box_plots_for.
box_plots_for(
    entries=entries,
    not_task_ids=["run_before_loop"],
    not_dag_run_ids=[
        "scheduled__2023-01-07T21:45:00+00:00",
        "scheduled__2023-01-07T21:35:00+00:00",
        "scheduled__2023-01-08T20:30:00+00:00",
    ],
    hide_x_axis=True,
    height=700,
    width=1500,
)

### Trends

In [None]:
# Runs used for the trend charts: for every task count (16, 32, 64, 125) there
# is a Beeflow run, a Beeflow "coldstarts" run and an MWAA run, in that order.
# Each row is (tasks count, system, export date, DAG id, optional extra kwargs);
# rows with an empty dict leave additional_metadata at the class default.
entries = [
    BoxPlotsEntry(
        system=system,
        export_date=export_date,
        tasks_count=tasks_count,
        task_duration=10,
        dag_ids=[dag_id],
        **extra,
    )
    for tasks_count, system, export_date, dag_id, extra in (
        (16, System.BEEFLOW, "2023-01-17_13-40", "10s_16t", {}),
        (16, System.BEEFLOW, "2023-01-18_12-18", "10s_16t_30cron", {"additional_metadata": "coldstarts"}),
        (16, System.MWAA, "2023-01-17_13-52", "10s_16t", {}),
        (32, System.BEEFLOW, "2023-01-17_15-10", "10s_32t", {}),
        (32, System.BEEFLOW, "2023-01-18_14-19", "10s_32t_30cron", {"additional_metadata": "coldstarts"}),
        (32, System.MWAA, "2023-01-17_15-21", "10s_32t", {}),
        (64, System.BEEFLOW, "2023-01-17_16-39", "10s_64t", {}),
        (64, System.BEEFLOW, "2023-01-18_16-20", "10s_64t_30cron", {"additional_metadata": "coldstarts"}),
        (64, System.MWAA, "2023-01-17_16-50", "10s_64t", {}),
        (125, System.BEEFLOW, "2023-01-17_18-08", "10s_125t", {}),
        (125, System.BEEFLOW, "2023-01-18_18-21", "10s_125t_30cron", {"additional_metadata": "coldstarts"}),
        (125, System.MWAA, "2023-01-17_18-19", "10s_125t", {}),
    )
]

In [None]:
# Draw the trend charts for the entries defined in the previous cell.
# No task ids are excluded here; the DAG runs listed below are passed as
# not_dag_run_ids (presumably runs to leave out - confirm in trends_for).
trends_for(
    entries,
    not_task_ids=[],
    not_dag_run_ids=[
        "scheduled__2023-01-07T21:45:00+00:00",
        "scheduled__2023-01-07T21:35:00+00:00",
        "scheduled__2023-01-08T20:30:00+00:00",
        "scheduled__2023-01-08T18:30:00+00:00",
        "scheduled__2023-01-08T16:30:00+00:00",
        "scheduled__2023-01-08T14:30:00+00:00",
    ],
)

## Beeflow's autoscaling is faster (parallel)

The MWAA environment used in this experiment has the same setup as before, but instead of keeping
20 workers up and running at all times, it is configured with a minimum of 1 worker and a maximum of 20 workers,
which enables autoscaling.

The problem with MWAA's autoscaling is that it takes a lot of time: 2-5 minutes for a single
machine to come up and start running.

As a slightly off-topic aside, [this is an interesting note about MWAA autoscaling](https://docs.aws.amazon.com/mwaa/latest/userguide/mwaa-autoscaling.html):
![image.png](attachment:a2eb3b63-2e41-4c4f-a251-a538575d0816.png)

^ In other words: when there is an issue, don't enable autoscaling and just pay for the workers... :D

### 16 tasks

In [None]:
# 16-task / 10 s-per-task runs of the "10s_16t_30cron" DAG on both systems.
# Each row below is (system, export date, free-form label for the run).
entries = [
    BoxPlotsEntry(
        system=system,
        export_date=export_date,
        tasks_count=16,
        task_duration=10,
        dag_ids=["10s_16t_30cron"],
        additional_metadata=label,
    )
    for system, export_date, label in (
        (System.BEEFLOW, "2023-01-18_12-18", "coldstart"),
        (System.MWAA, "2023-01-18_12-29", "1workermin_20workersmax_10task{for worker}"),
    )
]

In [None]:
# Draw the box plots for the 16-task entries defined in the previous cell.
# NOTE(review): not_task_ids presumably excludes the listed task ids from the
# plotted data - confirm in box_plots_for.
box_plots_for(
    entries=entries,
    not_task_ids=["run_before_loop"],
    width=1500,
    height=700,
)

### 32 tasks

In [None]:
# 32-task / 10 s-per-task runs of the "10s_32t_30cron" DAG on both systems.
# Each row below is (system, export date, free-form label for the run).
entries = [
    BoxPlotsEntry(
        system=system,
        export_date=export_date,
        tasks_count=32,
        task_duration=10,
        dag_ids=["10s_32t_30cron"],
        additional_metadata=label,
    )
    for system, export_date, label in (
        (System.BEEFLOW, "2023-01-18_14-19", "coldstarts"),
        (System.MWAA, "2023-01-18_14-30", "1workermin_20workersmax_10task{for worker}"),
    )
]

In [None]:
# Draw the box plots for the 32-task entries defined in the previous cell.
# NOTE(review): not_task_ids / not_dag_run_ids presumably exclude the listed
# tasks and DAG runs from the plotted data - confirm in box_plots_for.
box_plots_for(
    entries=entries,
    not_dag_run_ids=["scheduled__2023-01-08T18:00:00+00:00"],
    not_task_ids=["run_before_loop"],
    width=1500,
    height=700,
)

### 64 tasks

In [None]:
# 64-task / 10 s-per-task runs of the "10s_64t_30cron" DAG on both systems.
# Each row below is (system, export date, free-form label for the run).
entries = [
    BoxPlotsEntry(
        system=system,
        export_date=export_date,
        tasks_count=64,
        task_duration=10,
        dag_ids=["10s_64t_30cron"],
        additional_metadata=label,
    )
    for system, export_date, label in (
        (System.BEEFLOW, "2023-01-18_16-20", "coldstarts"),
        (System.MWAA, "2023-01-18_16-31", "1workermin_20workersmax_10task{for worker}"),
    )
]

In [None]:
# Draw the box plots for the 64-task entries defined in the previous cell.
# NOTE(review): not_task_ids / not_dag_run_ids presumably exclude the listed
# tasks and DAG runs from the plotted data - confirm in box_plots_for.
box_plots_for(
    entries=entries,
    not_dag_run_ids=["scheduled__2023-01-08T18:30:00+00:00"],
    not_task_ids=["run_before_loop"],
    width=1500,
    height=700,
)

### 125 tasks

In [None]:
# 125-task / 10 s-per-task runs of the "10s_125t_30cron" DAG on both systems,
# for the autoscaling comparison.
entries = [
    BoxPlotsEntry(
        system=System.BEEFLOW,
        export_date="2023-01-18_18-21",
        tasks_count=125,
        task_duration=10,
        dag_ids=["10s_125t_30cron"],
        additional_metadata="db.small.coldstarts",
    ),
    BoxPlotsEntry(
        system=System.MWAA,
        export_date="2023-01-18_18-32",
        tasks_count=125,
        task_duration=10,
        dag_ids=["10s_125t_30cron"],
        # This export belongs to the autoscaling experiment (1 worker min,
        # 20 workers max), so it carries the same label as the 16/32/64-task
        # MWAA entries of this section rather than the fixed-pool
        # "20workers{min==max}" label it was previously copy-pasted with.
        additional_metadata="1workermin_20workersmax_10task{for worker}",
    ),
]

In [None]:
# Pass the 125-task entries defined in the previous cell to print_tables.
# NOTE(review): print_tables is defined elsewhere in the notebook; judging by
# its name it prints the underlying data as tables - confirm there.
print_tables(entries)

In [None]:
# Draw the box plots for the 125-task entries defined above.
# NOTE(review): not_task_ids / not_dag_run_ids presumably exclude the listed
# tasks and DAG runs from the plotted data - confirm in box_plots_for.
box_plots_for(
    entries=entries,
    not_task_ids=["run_before_loop"],
    not_dag_run_ids=[
        "scheduled__2023-01-18T17:00:00+00:00",
        "scheduled__2023-01-18T16:30:00+00:00",
        "scheduled__2023-01-18T17:30:00+00:00",
    ],
    width=1500,
    height=700,
)

In the last graph above you can see the point at which new workers were added to MWAA:
towards the end, more tasks were executed in parallel.

### Trends

In [None]:
# Runs used for the trend charts of this section: for every task count there is
# a Beeflow run labelled "coldstarts" followed by an MWAA run of the same DAG.
# Each outer row is (tasks count, DAG id, Beeflow export date, MWAA export date).
# The MWAA entries get no additional_metadata, so the class default applies.
entries = [
    BoxPlotsEntry(
        system=system,
        export_date=export_date,
        tasks_count=tasks_count,
        task_duration=10,
        dag_ids=[dag_id],
        **extra,
    )
    for tasks_count, dag_id, beeflow_export, mwaa_export in (
        (16, "10s_16t_30cron", "2023-01-18_12-18", "2023-01-18_12-29"),
        (32, "10s_32t_30cron", "2023-01-18_14-19", "2023-01-18_14-30"),
        (64, "10s_64t_30cron", "2023-01-18_16-20", "2023-01-18_16-31"),
        (125, "10s_125t_30cron", "2023-01-18_18-21", "2023-01-18_18-32"),
    )
    for system, export_date, extra in (
        (System.BEEFLOW, beeflow_export, {"additional_metadata": "coldstarts"}),
        (System.MWAA, mwaa_export, {}),
    )
]

In [None]:
# Draw the trend charts for the entries defined in the previous cell.
# No task ids are excluded here; the DAG runs listed below are passed as
# not_dag_run_ids (presumably runs to leave out - confirm in trends_for).
trends_for(
    entries,
    not_task_ids=[],
    not_dag_run_ids=[
        "scheduled__2023-01-18T17:00:00+00:00",
        "scheduled__2023-01-18T16:30:00+00:00",
        "scheduled__2023-01-18T17:30:00+00:00",
    ],
)

In [None]:
## Tests

In [None]:
# Three Beeflow runs of the "30s_125t" DAG (125 tasks, 30 s each) that differ
# in export date and label. Each row below is (export date, free-form label).
entries = [
    BoxPlotsEntry(
        system=System.BEEFLOW,
        export_date=export_date,
        tasks_count=125,
        task_duration=30,
        dag_ids=["30s_125t"],
        additional_metadata=label,
    )
    for export_date, label in (
        ("2023-01-19_15-42", "db.small"),
        ("2023-01-21_16-27", "db.m5.large"),
        ("2023-01-21_17-42", "db.m4.xlarge"),
    )
]


In [None]:
# Draw the box plots for the three Beeflow entries defined in the previous cell.
# NOTE(review): not_task_ids presumably excludes the listed task ids from the
# plotted data - confirm in box_plots_for.
box_plots_for(
    entries=entries,
    not_task_ids=["run_before_loop"],
    width=1500,
    height=700,
)