In [None]:
import pandas as pd
import plotly.express as px
from pathlib import Path

In [None]:
# Repository root, resolved from the notebook's working directory.
REPO_ROOT = Path("../../").resolve()
# Kept as a string ending in "/" so that file names can be appended directly.
DATA_DIR = f"{REPO_ROOT}/data/alibaba/cluster-trace-gpu-v2020/data/"

# Main table used throughout the notebook.
# (Alternative input considered earlier: "tasks_with_workload.csv".)
dfg = pd.read_csv(f"{DATA_DIR}workers_with_workload.csv")

## Model as a proxy for privacy mechanism (various parameters of DPSGD, Gaussian, Laplace)

In [None]:
# Row count for each value of the `group` column.
px.histogram(
    data_frame=dfg,
    title="Number of tasks per group (group = recurring task)",
    x="group",
)

In [None]:
# Row count for each value of the `workload` column.
px.histogram(
    data_frame=dfg,
    title="Number of tasks for each model architecture (workload)",
    x="workload",
)

Most jobs have only one task, but some have multiple tasks.
We might want to keep only the training tasks.

In [None]:
# Number of tasks (rows) per job.
# `size()` counts every row of a group, whereas `count()` on `start_time`
# would silently leave out rows whose start time is missing.
job_counts = dfg.groupby("job_name").size().reset_index(name="n_tasks")

In [None]:
# Empirical CDF of the per-job task count, on a log-scaled x axis.
px.ecdf(
    data_frame=job_counts,
    x="n_tasks",
    title="# tasks per job",
    log_x=True,
)

In [None]:
# Row count for each value of the `task_name` column.
px.histogram(
    data_frame=dfg,
    title="Number of tasks per task type",
    x="task_name",
)

In [None]:
# Number of distinct job names.
dfg["job_name"].nunique()

In [None]:
# Total number of rows in the table.
dfg.shape[0]

In [None]:
# Preview the first five rows.
dfg.head(n=5)

In [None]:
# Number of distinct users.
dfg["user"].nunique()

In [None]:
# Number of tasks (rows) per user.
# `size()` counts every row of a group, so the result does not depend on
# whether `job_name` happens to be populated for each row.
user_counts = dfg.groupby("user").size().reset_index(name="n_tasks")

In [None]:
# Users ordered by ascending task count; log-scaled y axis.
px.bar(
    data_frame=user_counts.sort_values(by="n_tasks"),
    x="user",
    y="n_tasks",
    log_y=True,
    title="Number of tasks per user",
)

In [None]:
# Summary statistics using pandas' default `describe` settings.
dfg.describe()

In [None]:
# Display the `task_name` column.
dfg["task_name"]

## Computational resources

inst_num: number of instances launched by the task.

plan_cpu: number of CPU cores requested in percentage (i.e., 600.0 is 6 vCPU cores).

plan_mem: GB of main memory requested.

plan_gpu: number of GPUs requested in percentage (i.e., 50.0 is 50% GPU).

gpu_type: type of GPUs assigned to this task. MISC is short for "miscellaneous", indicating GPUs of older generations, e.g., NVIDIA Tesla K40m, K80, M60.


### Number of instances (Docker containers) per task -> Proxy for privacy?

- Alternatively, use the requested CPU (`plan_cpu`) as the proxy.

- Rescale to fit epsilon between 0.1 and 10.
- how to set delta? *maybe according to the task type* 
- Given epsilon, delta and a mechanism type, compute the corresponding RDP curve for the actual task demand


In [None]:
# Empirical CDF of `inst_num`, one curve per workload, log-scaled x axis.
px.ecdf(
    data_frame=dfg,
    x="inst_num",
    title="# containers per task",
    color="workload",
    log_x=True,
)

### GPU type as a proxy for profit (or privacy)?

High-end GPUs (V100) are reserved for expensive tasks.
P100 and T4 are intermediary GPUs.
MISC are GPUs from older generations, quite cheap.


- [ ] Check if priority

This could give us a discrete mapping:
- V100M32: profit = 100
- V100: profit = 10
- P100: profit = 5
- T4: profit = 1
- MISC: profit = 0.1


In [None]:
# Row count per GPU type, coloured by workload.
px.histogram(
    data_frame=dfg,
    x="gpu_type",
    title="Type of GPU for each type of model",
    color="workload",
)

In [None]:
# Empirical CDF of `plan_cpu`, one curve per workload, log-scaled x axis.
px.ecdf(
    data_frame=dfg,
    x="plan_cpu",
    title="# cpu per task",
    color="workload",
    log_x=True,
)

In [None]:
# Empirical CDF of `plan_gpu`, one curve per workload, log-scaled x axis.
px.ecdf(
    data_frame=dfg,
    x="plan_gpu",
    title="# GPU per task",
    color="workload",
    log_x=True,
)

## Memory as a proxy for the number of blocks?

This metric might be close to the number of data points actually consumed.

Problem:
- There is not much variability: most tasks request between 1 GB and 100 GB of memory.
- RAM is not necessarily related to the size of the dataset, since we usually don't load it all at once in memory.
- Most instances of a given workload have the same memory request (e.g., 80% of CTR instances ask for 2 GB). This means a strong correlation between the curve type and the amplitude, which might actually be quite realistic.


In [None]:
# Empirical CDF of `plan_mem`, one curve per workload, log-scaled x axis.
px.ecdf(
    data_frame=dfg,
    x="plan_mem",
    title="RAM per task (in GB)",
    color="workload",
    log_x=True,
)

## Runtime

Not sure it is reasonable to use it as a proxy for the number of blocks or the privacy budget: it spans too many orders of magnitude.

In [None]:
# Empirical CDF of `runtime`, one curve per workload, log-scaled x axis.
px.ecdf(
    data_frame=dfg,
    x="runtime",
    title="Runtime per task (in seconds)",
    color="workload",
    log_x=True,
)

## The missing piece: what proxy for the block range?

Even if we have a proxy for the number of blocks (e.g. memory or runtime), we still need to know *which* blocks are requested to run a simulation.

- The $N$ latest blocks?
- $N$ random blocks chosen among the $2N$ latest blocks?
- Some sort of tumbling window?
- Any random $N$ blocks since the beginning of time?

We will likely need some TFX statistics to decide this.

## Arrival time

In [None]:
# Distribution of `start_date` over the whole table, in 100 bins.
px.histogram(
    data_frame=dfg,
    x="start_date",
    title="arrival time, starting at timestamp 0",
    nbins=100,
)

In [None]:
# Same distribution, restricted to rows with `start_date` before 1970-01-15
# and split by workload.
px.histogram(
    data_frame=dfg.loc[dfg["start_date"] < '1970-01-15'],
    x="start_date",
    title="arrival time for 1 day, per model type",
    color="workload",
)

# Recurring pipelines

(Not a priority now, but might be useful later)

In [None]:
# Row count for each value of the `group` column.
px.histogram(data_frame=dfg, x="group")

In [None]:
# Per-group non-null counts for every column, indexed by `group`.
groups = dfg.groupby("group").count()

In [None]:
# Empirical CDF of the per-group `job_name` count, on a log-scaled x axis.
px.ecdf(
    data_frame=groups,
    x="job_name",
    title="CDF of the number of instances for each group. <br> We can confirm the paper's claim that 65% of the instances repeat more than 5 times",
    log_x=True,
)

In [None]:
# Groups ordered from largest to smallest `job_name` count.
sg = groups.sort_values(by="job_name", ascending=False)

In [None]:
# Preview the five largest groups.
sg.head(n=5)

In [None]:
# Label of the largest group (first entry of the sorted index).
sg.index[0]

In [None]:
# Sample up to 10 group labels, taking every 10th entry of the sorted index
# (ranks 0, 10, ..., 90). Slicing, unlike positional `iloc` lookups, does not
# raise when there are fewer than 91 groups.
# NOTE: this rebinds `groups` from the per-group count table to a list of labels.
groups = list(sg.index[::10][:10])

In [None]:
# `@groups` makes pandas read the list from the notebook namespace instead of
# pasting its repr into the query text, which only works when every label's
# repr happens to be a valid query literal.
# The title reports the actual number of sampled groups (previously "top 3").
px.histogram(
    dfg.query("start_date < '1970-01-17' and group in @groups"),
    x="start_date",
    color="group",
    nbins=1000,
    title=f"Launch time for {len(groups)} sampled recurring groups during the first 3 days",
)