# Notebook to analyse the TC SA results

## Imports

In [78]:
from pyspark import SparkConf
from pyspark.sql import SparkSession, DataFrame, Window
from pyspark.sql import functions as F

## Create DataFrame from the cleaned Alibaba 2018 cluster trace

In [79]:
# Path of the gzip-compressed CSV analysed in this notebook.
# NOTE(review): absolute, machine-specific path — consider deriving it from a configurable base directory.
batch_tasks_clean_path = "/home/felix/TUB_Master_ISM/SoSe21/MA/acs-simulation/out/analysis/sa/eval/BinaryTaskfailClassifier-trace-Alibaba2018-size-0.1-mem-6000-testfrac-0.2-sampleseed-99-fittime-28800-cvK-5.csv.gz"
# "local[4]" runs Spark locally with 4 worker threads; a plain "local" master
# uses a single worker thread, which did not match the intended 4 cores.
master = "local[4]"
app_name = "analyse clean Alibaba 2018 cluster trace"
config = SparkConf().setAll([
    ('spark.driver.cores', '4'),
    ('spark.driver.memory', '12g'),
])
# getOrCreate() reuses an already running session if one exists in this kernel.
spark_session = (
    SparkSession.builder
    .master(master)
    .appName(app_name)
    .config(conf=config)
    .getOrCreate()
)

# header=True takes column names from the first row; inferSchema=True lets Spark
# derive the column types from the data.
batch_tasks_clean = spark_session.read.csv(path=batch_tasks_clean_path, header=True, inferSchema=True)

                                                                                

## Simple analysis

In [80]:
# Number of rows in the loaded DataFrame.
batch_tasks_clean.count()

52960

In [81]:
# List of (column name, Spark type) pairs of the loaded DataFrame.
batch_tasks_clean.dtypes

[('task_name', 'string'),
 ('task_id_num', 'double'),
 ('instance_num', 'double'),
 ('job_name', 'string'),
 ('task_type', 'int'),
 ('status', 'string'),
 ('start_time', 'int'),
 ('end_time', 'int'),
 ('plan_cpu', 'double'),
 ('plan_mem', 'double'),
 ('map_reduce', 'string'),
 ('earliest', 'int'),
 ('sched_intv', 'double'),
 ('job_exec', 'double'),
 ('logical_job_name', 'string'),
 ('latest', 'int'),
 ('task_duration', 'double'),
 ('tts_task', 'double'),
 ('mtts_task', 'double'),
 ('ttf_task', 'double'),
 ('ttr_task', 'double'),
 ('reduce_checkpoint', 'double'),
 ('second_quant_checkpoint', 'double'),
 ('third_quant_checkpoint', 'double'),
 ('labels', 'int'),
 ('predictions', 'int'),
 ('correct_predictions', 'boolean'),
 ('tc', 'string'),
 ('td', 'double'),
 ('reduce_co', 'string'),
 ('second_quant_co', 'string'),
 ('third_quant_co', 'string'),
 ('adaptive_co', 'string'),
 ('zip_tc_adaptive', 'string'),
 ('zip_tc', 'double'),
 ('zip_adaptive_co', 'double')]

In [82]:
# Number of columns in the loaded DataFrame.
len(batch_tasks_clean.columns)

36

In [83]:
# Names of all columns whose Spark type is exactly 'int'
# (columns typed 'double' are not included).
num_cols = [
    col_name
    for col_name, col_type in batch_tasks_clean.dtypes
    if col_type == 'int'
]
num_cols

['task_type',
 'start_time',
 'end_time',
 'earliest',
 'latest',
 'labels',
 'predictions']

## Analyse Adaptive CO results

In [92]:
# Show rows that have a non-null zip_adaptive_co value and predictions == 1,
# restricted to the columns relevant for the adaptive CO analysis.
(
    batch_tasks_clean
    .where(F.col("zip_adaptive_co").isNotNull() & (F.col("predictions") == 1))
    .select([
        "task_name",
        "task_id_num",
        "map_reduce",
        "task_duration",
        "zip_tc",
        "td",
        "labels",
        "predictions",
        "correct_predictions",
        "zip_adaptive_co",
    ])
    .show()
)

+--------------------+-----------+----------+-------------+------+----------------+------+-----------+-------------------+------------------+
|           task_name|task_id_num|map_reduce|task_duration|zip_tc|              td|labels|predictions|correct_predictions|   zip_adaptive_co|
+--------------------+-----------+----------+-------------+------+----------------+------+-----------+-------------------+------------------+
|task_LTg0MTUwNTA5...|        0.0|         m|          5.0|   0.0|260.435181565379|     0|          1|              false|260.47518156448496|
|task_LTg0MTUwNTA5...|        0.0|         m|          5.0|   5.3|260.435181565379|     0|          1|              false|265.77518156448497|
|task_LTg0MTUwNTA5...|        0.0|         m|          5.0|  11.0|260.435181565379|     0|          1|              false|271.47518156448496|
|task_LTg0MTUwNTA5...|        0.0|         m|          5.0|  16.0|260.435181565379|     0|          1|              false|276.47518156448496|
|task_

In [89]:
# Largest and smallest zip_adaptive_co value over the whole DataFrame.
batch_tasks_clean.agg(
    F.max("zip_adaptive_co"),
    F.min("zip_adaptive_co"),
).show()

+--------------------+--------------------+
|max(zip_adaptive_co)|min(zip_adaptive_co)|
+--------------------+--------------------+
|  279.47518156448496|  -536.8350000008941|
+--------------------+--------------------+



In [91]:
# Per zip_tc value: mean zip_adaptive_co divided by mean task_duration
# (a plain ratio, not multiplied by 100).
# The alias is applied to the whole ratio expression so the result column is
# actually named "avg_adaptive_co_perc"; aliasing only a sub-expression, or the
# DataFrame returned by agg(), leaves the auto-generated column name in place.
batch_tasks_clean.groupBy("zip_tc").agg(
    (
        F.mean(F.col("zip_adaptive_co")) / F.mean(F.col("task_duration"))
    ).alias("avg_adaptive_co_perc")
).show()

                                                                                

+------+----------------------------------------------------------------------------------------------------+
|zip_tc|(avg(zip_adaptive_co) AS `avg_adaptive_co` / avg(task_duration AS `mean_task_duration_adaptive_co`))|
+------+----------------------------------------------------------------------------------------------------+
|   0.0|                                                                                   2.634155841902389|
|  11.0|                                                                                   2.762268381079897|
|   5.3|                                                                                  2.6958827925970006|
|  16.0|                                                                                   2.820501353433309|
+------+----------------------------------------------------------------------------------------------------+



In [94]:
# Count rows with a non-null zip_adaptive_co value and predictions == 1,
# split by whether the prediction was correct.
(
    batch_tasks_clean
    .where(F.col("zip_adaptive_co").isNotNull() & (F.col("predictions") == 1))
    .groupBy("correct_predictions")
    .count()
    .show()
)

                                                                                

+-------------------+-----+
|correct_predictions|count|
+-------------------+-----+
|              false| 1088|
+-------------------+-----+



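## Conclusion: the adaptive checkpoint model shows a positive overhead because the failure prediction model is inaccurate — all 1088 rows with a predicted failure (`predictions == 1`) and an adaptive checkpoint overhead are incorrect predictions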