In [1]:
# Find out why this logical job is such an important feature

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession, DataFrame, Window
from pyspark.sql import functions as F
import matplotlib as mpl
from matplotlib import pyplot as plt

## Create the Spark session and load the evaluation CSV (Alibaba 2018 cluster trace)

In [3]:
# Evaluation CSV to analyse; the path is relative to the notebook's working directory.
test_path = "../../out/model/eval/final/GBT5CV3parall_priorFeat_tune_maxDepth_Iter_Bins_03inst777SeedVal/part-00000-fcf17f91-2e4e-4fb3-830d-921eb502fd93-c000.csv.gz"

# Spark runs locally with two worker threads and 3 GB of driver memory.
master = "local[2]"
app_name = "analyse prediction of Alibaba 2018 cluster trace"
config = SparkConf().setAll([('spark.driver.memory', '3g')])

# getOrCreate() reuses an already running session instead of starting a second one.
spark_session = (
    SparkSession.builder
    .master(master)
    .appName(app_name)
    .config(conf=config)
    .getOrCreate()
)

test = spark_session.read.csv(
    path=test_path,
    header=True,       # first row holds the column names
    inferSchema=True,  # let Spark derive the column types from the data
)

22/02/25 09:32:08 WARN Utils: Your hostname, felix-Surface-Book resolves to a loopback address: 127.0.1.1; using 192.168.0.4 instead (on interface wlp3s0)
22/02/25 09:32:08 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/25 09:32:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

### read the data and reproduce the datasets

In [4]:
# Load every cleaned batch-job CSV part (glob over *.csv.gz) into one DataFrame;
# column names come from the header row and column types are inferred.
data_name = "batch_jobs_clean_03inst_1task_00015S_1F"
data_path = f"../../out/clean/{data_name}/*.csv.gz"
data = spark_session.read.csv(data_path, header=True, inferSchema=True)
# Reproduce the training set: stratified sample on `labels`, taking a fraction of
# 0.50 from each class (0 and 1) with a fixed seed.
# NOTE(review): confirm that seed and fractions match the run that produced test_path.
seed = 61
fractions = {0: 0.50, 1: 0.50}
training = data.sampleBy(col="labels", fractions=fractions, seed=seed)
# Split the remaining rows into validation and test, again stratified on `labels`.
# The original (truncated) comment described a 50/25/25 split: 50% training, 25%
# validation (presumably for hyperparameter tuning), 25% test.
# NOTE(review): sampling 0.25 of the *remaining* ~50% yields roughly 12.5% validation
# and 37.5% test of the full data, not 25/25 - confirm which one is intended.
# NOTE(review): subtract() has EXCEPT DISTINCT semantics, so duplicate rows are
# collapsed and every row equal to a sampled row is removed - confirm this is acceptable.
rest = data.subtract(training)
fractions = {0: 0.25, 1: 0.25}
validation = rest.sampleBy(col = "labels", fractions = fractions, seed = seed)
test_raw = rest.subtract(validation)

                                                                                

In [23]:
# Inspect the column names and the Spark types inferred while reading the CSV files.
data.dtypes

[('task_name', 'string'),
 ('job_name', 'string'),
 ('task_id_num', 'int'),
 ('instance_num', 'int'),
 ('task_type', 'int'),
 ('status', 'string'),
 ('start_time', 'int'),
 ('end_time', 'int'),
 ('plan_cpu', 'double'),
 ('plan_mem', 'double'),
 ('map_reduce', 'string'),
 ('earliest', 'int'),
 ('sched_intv', 'int'),
 ('job_exec', 'int'),
 ('logical_job_name', 'string'),
 ('latest', 'int'),
 ('task_duration', 'int'),
 ('tts_task', 'int'),
 ('mtts_task', 'double'),
 ('ttf_task', 'int'),
 ('ttr_task', 'int'),
 ('reduce_checkpoint', 'int'),
 ('second_quant_checkpoint', 'int'),
 ('third_quant_checkpoint', 'int'),
 ('instance_name', 'string'),
 ('instance_task_type', 'int'),
 ('instance_status', 'string'),
 ('instance_start_time', 'int'),
 ('instance_end_time', 'int'),
 ('machine_id', 'string'),
 ('seq_no', 'int'),
 ('total_seq_no', 'int'),
 ('cpu_avg', 'double'),
 ('cpu_max', 'double'),
 ('mem_avg', 'double'),
 ('mem_max', 'double'),
 ('labels', 'int')]

In [12]:
# Peek at five rows to compare the logical job name with the raw job name.
data.select("logical_job_name", "job_name").limit(5).show()

+----------------+---------+
|logical_job_name| job_name|
+----------------+---------+
|            null|j_1514651|
|            null| j_469896|
|            null|j_1995375|
|            null|j_1995375|
|            null|j_1995375|
+----------------+---------+



### filter for the respective logical jobs and summarize

In [49]:
# Keep only the rows of the three most important logical jobs.
# contains() is a substring match, so any logical job name that merely includes
# one of these digit sequences passes the filter as well.
lj_most_imp = data.where(
    F.col("logical_job_name").contains("2678067")    # most important
    | F.col("logical_job_name").contains("3138253")  # second most important
    | F.col("logical_job_name").contains("2474299")  # third most important
)

In [50]:
# Number of rows that passed the logical-job filter.
lj_most_imp.count()

                                                                                

218030

In [51]:
# List the distinct logical job names among the filtered rows.
lj_most_imp.select("logical_job_name").distinct().show()

[Stage 73:>                                                         (0 + 2) / 2]

+----------------+
|logical_job_name|
+----------------+
|     L_j_3138253|
|     L_j_2474299|
|     L_j_2678067|
+----------------+





In [52]:
# Average planned CPU, planned memory, task type and label per logical job,
# collected to the driver as a pandas DataFrame.
# NOTE(review): this rebinds lj_most_imp from the Spark DataFrame to the pandas
# result, so re-running this cell without re-running the filter cell fails
# (the pandas frame has no groupBy method).
lj_most_imp = (
    lj_most_imp
    .groupBy("logical_job_name")
    .agg(*[F.avg(column) for column in ("plan_cpu", "plan_mem", "task_type", "labels")])
    .toPandas()
)

                                                                                

In [53]:
# Render the aggregated pandas frame as a LaTeX tabular string.
latex = lj_most_imp.to_latex()

In [54]:
# Print the raw string so it can be copied verbatim; a bare expression would
# display the escaped repr instead.
print(latex)

\begin{tabular}{llrrrr}
\toprule
{} & logical\_job\_name &  avg(plan\_cpu) &  avg(plan\_mem) &  avg(task\_type) &  avg(labels) \\
\midrule
0 &      L\_j\_3138253 &      89.407105 &       0.348577 &             1.0 &     0.599760 \\
1 &      L\_j\_2474299 &      87.582682 &       0.375619 &             1.0 &     0.048997 \\
2 &      L\_j\_2678067 &      98.829916 &       0.303328 &             1.0 &     0.914553 \\
\bottomrule
\end{tabular}

