In [None]:
# S3 bucket that every CSV export in this notebook is read from.
BUCKET_NAME = "beeflow-dev-metadata-dumps"

In [None]:
import pandas as pd
import boto3

In [None]:
# Shared S3 client used by get_s3_csv.
s3 = boto3.client("s3")

In [None]:
def get_s3_csv(file: str) -> pd.DataFrame:
    """Fetch a CSV object from ``BUCKET_NAME`` and parse it into a DataFrame.

    Args:
        file: Key of the object inside the bucket.

    Returns:
        The parsed contents of the CSV.

    Errors raised by ``s3.get_object`` or ``pd.read_csv`` propagate unchanged.
    """
    body = s3.get_object(Bucket=BUCKET_NAME, Key=file)["Body"]
    try:
        return pd.read_csv(body)
    finally:
        # pandas does not close caller-supplied streams; release the HTTP
        # response explicitly, including when parsing fails part-way.
        body.close()


In [None]:
from enum import Enum

class System(Enum):
    """Systems whose metadata exports are loaded in this notebook."""

    # Each value is the leading segment of the export key built in get_stats.
    MWAA = 'mwaa'
    BEEFLOW = 'serverless'
    
class Stats(Enum):
    """Kinds of exported metadata table that get_stats can load."""

    # Each value names both the export sub-directory and the CSV file stem.
    DAG_RUNS = 'dagrun'
    TASKS = 'taskinstance'

In [None]:
def get_stats(system: System, date: str, stats: Stats):
    """Load one exported metadata table as a DataFrame.

    The object key has the shape
    ``<system>/export/<stats>/dt=<date>/<stats>.csv`` and is built from the
    enum values.

    Args:
        system: Which system's export to read.
        date: Value of the ``dt=`` partition segment, e.g. ``"2022-12-07_21-57"``.
        stats: Which table to read.

    Returns:
        Whatever ``get_s3_csv`` returns for the computed key.
    """
    table = stats.value
    key = f"{system.value}/export/{table}/dt={date}/{table}.csv"
    return get_s3_csv(key)

In [None]:
def filter_on_dag_ids(df, dag_ids):
    """Return the rows of ``df`` whose ``dag_id`` is one of ``dag_ids``.

    Args:
        df: Frame with a ``dag_id`` column.
        dag_ids: List-like of DAG ids to keep. A single string is treated as
            one id rather than being rejected by ``Series.isin``.

    Returns:
        An independent copy of the matching rows with the original index
        labels preserved, so callers can add columns to the result without
        triggering ``SettingWithCopyWarning`` or touching ``df``.
    """
    if isinstance(dag_ids, str):
        dag_ids = [dag_ids]
    keep = df['dag_id'].isin(dag_ids)
    # Explicit copy: the result is mutated downstream (a column is added).
    return df.loc[keep].copy()

In [None]:
# Task-instance rows from the MWAA export dt=2022-12-07_21-57, limited to the
# "beeflow_experiment2" DAG.
mwaa_shard_10_tasks = filter_on_dag_ids(
    get_stats(system=System.MWAA, date="2022-12-07_21-57", stats=Stats.TASKS),
    dag_ids=["beeflow_experiment2"],
)

In [None]:
# Task-instance rows from the Beeflow export dt=2022-12-07_21-50, limited to
# the "beeflow_experiment2" DAG.
beeflow_shard_10_tasks = filter_on_dag_ids(
    get_stats(system=System.BEEFLOW, date="2022-12-07_21-50", stats=Stats.TASKS),
    dag_ids=["beeflow_experiment2"],
)

In [None]:
import plotly.express as px

In [None]:
def gantt_task(df, range_x=None, **kwargs):
    """Show a Gantt-style timeline of task instances, one row per ``task_id``.

    Bars run from ``start_date`` to ``end_date`` and are coloured by
    ``run_id``.

    Args:
        df: Frame with ``start_date``, ``end_date``, ``run_id`` and
            ``task_id`` columns.
        range_x: Optional ``(start, end)`` pair passed to ``px.timeline`` as
            the x-axis range, so several charts can share one time window.
        **kwargs: Extra keyword arguments forwarded to ``px.timeline``
            (for example ``title``). They may override the default ``color``
            and ``y`` mappings.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    options = {"color": "run_id", "y": "task_id", **kwargs}
    fig = px.timeline(
        df,
        x_start="start_date",
        x_end="end_date",
        range_x=range_x,
        **options,
    )
    # Reverse the y-axis direction.
    fig.update_yaxes(autorange="reversed")
    fig.show()

In [None]:
# (start, end) of the x-axis window passed as range_x to the timeline plots.
shard_10_tasks_x_range = (
    "2022-12-07 20:40:00.000000+00:00",
    "2022-12-07 21:55:00.000000+00:00",
)

In [None]:
# Task timeline for the MWAA data over the shared x-axis window.
gantt_task(df=mwaa_shard_10_tasks, range_x=shard_10_tasks_x_range)

In [None]:
# Task timeline for the Beeflow data over the shared x-axis window.
gantt_task(df=beeflow_shard_10_tasks, range_x=shard_10_tasks_x_range)

In [None]:
# DAG-run rows from the MWAA export dt=2022-12-07_21-57, limited to the
# "beeflow_experiment2" DAG.
mwaa_shard_10_dag_runs = filter_on_dag_ids(
    get_stats(system=System.MWAA, date="2022-12-07_21-57", stats=Stats.DAG_RUNS),
    dag_ids=["beeflow_experiment2"],
)

In [None]:
# DAG-run rows from the Beeflow export dt=2022-12-07_21-50, limited to the
# "beeflow_experiment2" DAG.
beeflow_shard_10_dag_runs = filter_on_dag_ids(
    get_stats(system=System.BEEFLOW, date="2022-12-07_21-50", stats=Stats.DAG_RUNS),
    dag_ids=["beeflow_experiment2"],
)

In [None]:
def gantt_dag_runs(df, **kwargs):
    """Show a timeline of DAG runs, coloured by ``run_id``.

    Bars run from ``start_date`` to ``end_date``.

    Args:
        df: Frame with ``start_date``, ``end_date`` and ``run_id`` columns.
        **kwargs: Extra keyword arguments forwarded to ``px.timeline``
            (for example ``range_x``).

    Returns:
        None. The figure is displayed with ``show()``.
    """
    timeline = px.timeline(
        df,
        x_start="start_date",
        x_end="end_date",
        color="run_id",
        **kwargs,
    )
    # Reverse the y-axis direction.
    timeline.update_yaxes(autorange="reversed")
    timeline.show()

In [None]:
# DAG-run timeline for the MWAA data over the shared x-axis window.
gantt_dag_runs(df=mwaa_shard_10_dag_runs, range_x=shard_10_tasks_x_range)

In [None]:
# DAG-run timeline for the Beeflow data over the shared x-axis window.
gantt_dag_runs(df=beeflow_shard_10_dag_runs, range_x=shard_10_tasks_x_range)

In [None]:
def average_dag_run_duration(df):
    """Return the mean of ``end_date - start_date`` over ``df``, in seconds.

    Side effect: the per-row durations (seconds) are stored on ``df`` itself
    in a ``duration`` column.

    Args:
        df: Frame with ``start_date`` and ``end_date`` columns that
            ``pd.to_datetime`` can parse.

    Returns:
        The mean of the ``duration`` column.
    """
    started = pd.to_datetime(df['start_date'])
    finished = pd.to_datetime(df['end_date'])
    elapsed = finished - started
    df['duration'] = elapsed.dt.total_seconds()
    return df['duration'].mean()

In [None]:
# Mean DAG-run duration in seconds for the MWAA data.
average_dag_run_duration(df=mwaa_shard_10_dag_runs)

In [None]:
# Mean DAG-run duration in seconds for the Beeflow data.
average_dag_run_duration(df=beeflow_shard_10_dag_runs)