In [None]:
import pandas as pd
import re

In [None]:
# Download the batch_task archive of the Alibaba 2018 cluster trace into ./data.
# wget flags: -c resumes a partially downloaded file, --tries=0 retries without limit
# (--retry-connrefused also retries on refused connections), --timeout=50 is in seconds.
!mkdir -p data && cd data && wget -c --retry-connrefused --tries=0 --timeout=50 http://aliopentrace.oss-cn-beijing.aliyuncs.com/v2018Traces/batch_task.tar.gz

In [None]:
# Unpack the downloaded archive inside ./data (x = extract, v = list files, z = gzip, f = archive file).
# The next cell expects this to produce data/batch_task.csv.
!cd data && tar -xvzf batch_task.tar.gz

In [None]:
# Column names come from
# https://github.com/alibaba/clusterdata/blob/master/cluster-trace-v2018/fetchData.sh
# (passing `names=` means the CSV is read without a header row).
df = pd.read_csv(
    'data/batch_task.csv',
    names=[
        'task_name', 'instance_num', 'job_name', 'task_type', 'status',
        'start_time', 'end_time', 'plan_cpu', 'plan_mem',
    ],
)
# Elapsed time per task, in the same units as start_time / end_time.
df['duration'] = df['end_time'].sub(df['start_time'])

In [None]:
# A task name is one of:
# a) a name that encodes dependencies (like 'J4_2_3' -> task 4 depends on tasks 2 and 3).
#    Note: sometimes the task name ends with a '_Stg<N>' suffix, which is matched but not captured.
#    The patterns are raw strings so that '\d' reaches the regex engine untouched (in a plain
#    literal it is an invalid escape sequence). 'deps' accepts digits and underscores only;
#    a literal '+' is rejected because it could not be parsed into integer dependency ids.
TASK_NAME_RE = re.compile(r'^[^_]*[A-Z](?P<task_id>\d+)(_(?P<deps>[\d_]+))?(_Stg\d+)?$')
# b) an independent task (like 'task_LTE4NjUxMjg5NDY5MDI4NjAzNzU=')
SINGLE_TASK_RE = re.compile(r'^task_[a-zA-Z0-9]+=*$')
# c) the literal name 'MergeTask'

# Sanity check: every task name is either 'MergeTask', a dependency-encoded name,
# or an independent 'task_*' name.
assert df['task_name'].map(
    lambda name: name == 'MergeTask'
    or TASK_NAME_RE.match(name) is not None
    or SINGLE_TASK_RE.match(name) is not None
).all()

In [None]:
# Check that 'MergeTask's are in fact independent tasks (i.e. the only task in a job)
def get_merge_task_stats(df):
    """Summarise, per job, how many tasks it has and how many are 'MergeTask'.

    Args:
        df: DataFrame with at least 'job_name' and 'task_name' columns.
            It is not modified.

    Returns:
        DataFrame indexed by 'job_name' with two columns: 'count' (number of
        non-null task names in the job) and 'mergeCount' (number of tasks
        named exactly 'MergeTask').
    """
    flagged = df[['job_name', 'task_name']].assign(
        is_merge_task=lambda frame: frame['task_name'] == 'MergeTask'
    )
    per_job = flagged.groupby('job_name')
    return per_job.agg(
        count=('task_name', 'count'),
        mergeCount=('is_merge_task', 'sum'),
    )

# No job may contain a 'MergeTask' alongside any other task.
assert get_merge_task_stats(df).query('mergeCount > 0 and count > 1').empty

In [None]:
# Extract dependency info
def get_task_index_and_deps(row):
    """Derive a task's index and its dependency list from its name.

    Args:
        row: object exposing a `task_name` attribute (e.g. a DataFrame row).

    Returns:
        Tuple `(task_index, deps)`. For names matching TASK_NAME_RE the index
        is the captured task id and `deps` the list of dependency ids as ints
        (empty when the name encodes none). Any other name yields `(1, [])`.
    """
    match = TASK_NAME_RE.match(row.task_name)
    if match is None:
        # Not a dependency-encoded name: treat it as a lone task with no dependencies.
        return 1, []

    raw_deps = match.group('deps')
    if raw_deps:
        # Skip empty fragments produced by consecutive or trailing underscores.
        deps = [int(token) for token in raw_deps.split('_') if token]
    else:
        deps = []
    return int(match.group('task_id')), deps

# Parse each task name row by row; result_type='expand' spreads the returned
# (task_index, deps) pair across the two new columns.
df[['task_index', 'task_deps']] = df[['task_name']].apply(
    get_task_index_and_deps,
    axis=1,
    result_type='expand',
)

In [None]:
# Preview the first rows of the parsed dependency columns.
df.loc[:, ['job_name', 'task_index', 'task_deps', 'duration', 'instance_num']].head()

In [None]:
# Sample job: all tasks of job "j_3", restricted to the dependency-related columns.
df.query('job_name == "j_3"')[['job_name', 'task_index', 'task_deps', 'duration', 'instance_num']]

In [None]:
# Keep only the rows of jobs that consist of at least 4 tasks.
jobs = df.groupby("job_name").filter(lambda tasks: tasks.shape[0] >= 4)