# Running queries using JSONS produced by Scala code

Install and import all needed packages

In [1]:
%%bash
pip install numpy
pip install pandas
pip install pyspark













In [2]:
import json
import time
import numpy as np
import csv
import multiprocessing
import signal
import pandas as pd
import os
import threading
import random
from pyspark.sql import SparkSession
from py4j.protocol import Py4JJavaError, Py4JError
import psutil
import string

Function for running one query. This means
*  run the original query 5 times (after one initial run, which we do not use)
*  run the rewritten queries 5 times (after one initial run, which we do not use) and drop the created tables each time
*  take the runtimes and calculate mean, median and standard deviation of time for either the original or rewritten query
*  compare the runtimes between the original query, the rewritten query and the rewritten query + the rewriting time (how long the Scala took)
*  save everything in a csv output file

In [3]:
def create_spark():
    spark = SparkSession.builder \
        .appName("app") \
        .master(f'local[{SPARK_CORES}]') \
        .config("spark.driver.memory", f'{SPARK_MEMORY}g') \
        .config("spark.executor.memory", f'{SPARK_MEMORY}g') \
        .config("spark.memory.offHeap.enabled",False) \
        .config("spark.jars", "postgresql-42.3.3.jar") \
        .getOrCreate()
    return spark

def import_db(spark, dbname):

    if dbname == "JOB":
        dbname = "imdb"
    else:
        dbname = dbname.lower()
    
    username = dbname
    password = dbname
    dbname = dbname

    df_tables = spark.read.format("jdbc") \
    .option("url", f'jdbc:postgresql://postgres:5432/{dbname}') \
    .option("driver", "org.postgresql.Driver") \
    .option("dbtable", "information_schema.tables") \
    .option("user", username) \
    .option("password", password) \
    .load()

    for idx, row in df_tables.toPandas().iterrows():
        if row.table_schema == 'public':
            table_name = row.table_name
            df = spark.read.format("jdbc") \
                .option("url", f'jdbc:postgresql://{DBHOST}:5432/{dbname}') \
                .option("driver", "org.postgresql.Driver") \
                .option("dbtable", table_name) \
                .option("user", username) \
                .option("password", password) \
                .load()
    
            print(table_name)
            #print(df.show())
            df.createOrReplaceTempView(table_name)

In [4]:
def measure_resource_usage(resource_usage):
    t = threading.current_thread()
    secs = 0
    while getattr(t, "do_run", True):
        resource_usage.append(get_resource_usage(secs))
        #print("resource usage: " + str(resource_usage))
        secs += 1
        time.sleep(1)

def get_resource_usage(t):
    return {
        'time': t,
        'memory': psutil.virtual_memory(),
        'cpu': psutil.cpu_percent(interval=None, percpu=True),
        'cpu_total': psutil.cpu_percent(interval=None, percpu=False)
    }
    
def cancel_query(spark, seconds, group_id):
    time.sleep(seconds)
    print("cancelling jobs with id " + group_id)
    print(spark.sparkContext.cancelJobGroup(group_id))
    print("cancelled job")

def cancel_query_after(spark, seconds):
    group_id = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(16)) #random id
    spark.sparkContext.setJobGroup(group_id, group_id)
    threading.Thread(target=cancel_query, args=(spark, seconds, group_id,)).start()
    return group_id

In [5]:
def run_query(benchmark, query, spark):
    print(benchmark, query)
    file_path = f'rewritten/{benchmark}_{query}_output.json'
    with open(file_path, 'r') as file:
        json_data = json.load(file)
    
    original_query = json_data["original_query"]
    rewritten_query_list = json_data["rewritten_query"]
    rewriting_time = json_data["time"]

    rewritten_query_list_spark = [rewritten_query.replace(" UNLOGGED TABLE ", " VIEW ")
                                                 .replace("TIMESTAMP(0)", "TIMESTAMP")
                                                 .replace("$", "_")
                                                 .replace("CREATE VIEW", "CREATE TEMPORARY VIEW")
                                  for rewritten_query in rewritten_query_list]


    file_path_drop = f'rewritten/{benchmark}_{query}_drop.json'
    with open(file_path_drop, 'r') as file:
        json_drop = json.load(file)
    drop_query_list = json_drop["rewritten_query"]

    timeout_flag_orig = True
    timeout_flag_rewr = True
    
    try:
        spark.sparkContext._jvm.System.gc()

        start_time = time.time()
        resource_usage = []
        measure_thread = threading.Thread(target=measure_resource_usage, args=(resource_usage, ))
        measure_thread.start()
    
        group_id = cancel_query_after(spark, TIMEOUT)
        result = spark.sql(original_query)
        result.show()
        end_time = time.time()
        print(end_time-start_time)
    
        measure_thread.do_run = False
        timeout_flag_orig = False
    except Py4JError as e:
        print('timeout or error orig: ' + str(e))
        
    try:
        spark.sparkContext._jvm.System.gc()
        resource_usage = []
        measure_thread = threading.Thread(target=measure_resource_usage, args=(resource_usage, ))
        measure_thread.start()

        group_id = cancel_query_after(spark, TIMEOUT)
        for rewritten_query in rewritten_query_list_spark:
            if rewritten_query.startswith("SELECT"):
                result1 = spark.sql(rewritten_query)
                result1.show()
            else:
                result2 = spark.sql(rewritten_query)
                result2.show()

        measure_thread.do_run = False
        timeout_flag_rewr = False

        for drop_query in drop_query_list:
            drop = spark.sql(drop_query)
            drop.show()
        
    except Py4JError as e:
        print('timeout or error rewr: ' + str(e))

    print(timeout_flag_orig, timeout_flag_rewr)
    # original and rewritten query are TOs
    if timeout_flag_orig and timeout_flag_rewr:
        orig_mean = "TO"
        orig_med = "TO"
        orig_std = "-"
        list_original = ["-", "-", "-", "-", "-"]
        
        rewr_mean = "TO"
        rewr_med = "TO"
        rewr_std = "-"
        rewr_mean_plus_rewr = "TO"
        rewr_med_plus_rewr = "TO"
        list_rewritten = ["-", "-", "-", "-", "-"]
        
        orig_or_rewr_mean = "-"
        orig_or_rewr_or_equal = "-"
        orig_or_rewr_plus_rewr_mean = "-"

    # original query is a TO and the rewritten not
    elif timeout_flag_orig:
        orig_mean = "TO"
        orig_med = "TO"
        orig_std = "-"
        list_original = ["-", "-", "-", "-", "-"]

        list_rewritten = []
        print("rewritten")
        for i in range(5):
            print(i)
            # execute the rewritten query
            start_time_rewritten = time.time()
            for rewritten_query in rewritten_query_list_spark:
                exec = spark.sql(rewritten_query)
                exec.show()
            end_time_rewritten = time.time()
            rewritten_time = end_time_rewritten - start_time_rewritten
            list_rewritten.append(rewritten_time)
            # drop all created tables
            for drop_query in drop_query_list:
                drop = spark.sql(drop_query)
                drop.show()
        rewr_mean = np.mean(list_rewritten)
        rewr_med = np.median(list_rewritten)
        rewr_std = np.std(list_rewritten)
        rewr_mean_plus_rewr = rewr_mean + rewriting_time
        rewr_med_plus_rewr = rewr_med + rewriting_time

        orig_or_rewr_mean = "rewr"
        orig_or_rewr_or_equal = "rewr"
        orig_or_rewr_plus_rewr_mean = "rewr"

    # rewritten query is a TO and the original not
    elif timeout_flag_rewr:
        list_original = []
        print("orig")
        for i in range(5):
            # execute the original query
            start_time_original = time.time()
            exec = spark.sql(original_query)
            exec.show()
            end_time_original = time.time()
            original_time = end_time_original - start_time_original
            list_original.append(original_time)
        orig_mean = np.mean(list_original)
        orig_med = np.median(list_original)
        orig_std = np.std(list_original)
        
        rewr_mean = "TO"
        rewr_med = "TO"
        rewr_std = "-"
        rewr_mean_plus_rewr = "TO"
        rewr_med_plus_rewr = "TO"
        list_rewritten = ["-", "-", "-", "-", "-"]

        orig_or_rewr_mean = "orig"
        orig_or_rewr_or_equal = "orig"
        orig_or_rewr_plus_rewr_mean = "orig"

    # both queries are no TOs
    else:
        #print(result, result1)
        list_original = []
        list_rewritten = []
        # take times for 5 runs (run 2-6) for the original query and the rewritten query
        print("orig+rewr")
        for i in range(5):
            print(i)
            # execute the original query
            start_time_original = time.time()
            exec = spark.sql(original_query)
            exec.show()
            end_time_original = time.time()
            original_time = end_time_original - start_time_original
            list_original.append(original_time)
        
            # execute the rewritten query
            start_time_rewritten = time.time()
            for rewritten_query in rewritten_query_list_spark:
                exec = spark.sql(rewritten_query)
                exec.show()
            end_time_rewritten = time.time()
            rewritten_time = end_time_rewritten - start_time_rewritten
            list_rewritten.append(rewritten_time)
            
            # drop all created tables
            for drop_query in drop_query_list:
                drop = spark.sql(drop_query)
                drop.show()
            
        orig_mean = np.mean(list_original)
        orig_med = np.median(list_original)
        orig_std = np.std(list_original)
        rewr_mean = np.mean(list_rewritten)
        rewr_med = np.median(list_rewritten)
        rewr_std = np.std(list_rewritten)
        rewr_mean_plus_rewr = rewr_mean + rewriting_time
        rewr_med_plus_rewr = rewr_med + rewriting_time
        if orig_mean > rewr_mean:
            orig_or_rewr_mean = "rewr"
        else:
            orig_or_rewr_mean = "orig"
        if abs(rewr_mean-orig_mean) < 0.05:
            orig_or_rewr_or_equal = "equal 0.05"
        else:
            orig_or_rewr_or_equal = orig_or_rewr_mean
        if orig_mean > rewr_mean_plus_rewr:
            orig_or_rewr_plus_rewr_mean = "rewr"
        else:
            orig_or_rewr_plus_rewr_mean = "orig"
            
    list_output = [benchmark, query] + [orig_mean, rewr_mean, rewr_mean_plus_rewr, orig_or_rewr_mean, orig_or_rewr_or_equal, \
                                        orig_or_rewr_plus_rewr_mean, rewriting_time] + \
                    list_original + [orig_med, orig_std] + list_rewritten + [rewr_med, rewr_std, rewr_med_plus_rewr]
    #print(list_output)
    file_path = "results/SPA_Scala_comparison_TO_augment_server.csv"
    with open(file_path, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(list_output)

In [6]:
# Global configuration
SPARK_MEMORY = 500
SPARK_CORES = 4
DBHOST = 'postgres'
TIMEOUT = 100

Create the output csv with the header. We add the running times for each query then.

In [7]:
file_path = "results/SPA_Scala_comparison_TO_augment_server.csv"

names = ["bench", "query", "orig mean", "rewr mean", "rewr mean+rewr", "orig/rewr(mean)", "orig/rewr/equal", "orig/rewr+rewr(mean)", "rewriting", 
         "orig 1", "orig 2", "orig 3", "orig 4", "orig 5", "orig med", "orig_std", "rewr 1", "rewr 2", "rewr 3", "rewr 4", "rewr 5", "rewr med", 
         "rewr_std", "rewr med+rewr", ]

with open(file_path, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(names)

## STATS

In [7]:
spark = create_spark()
import_db(spark, "STATS")

24/05/06 07:33:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


users
posts
postlinks
posthistory
comments
votes
badges
tags


In [8]:
sc = spark.sparkContext
conf = sc.getConf()
num_cores = conf.get("spark.executor.cores", "1")
executor_memory = conf.get("spark.executor.memory", "1g")
print("Number of cores:", num_cores)
print("Executor memory:", executor_memory)

Number of cores: 1
Executor memory: 500g


In [9]:
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = [file for file in files if file.endswith('_output.json') and file.startswith('STATS')]

for file in output_files:
    file_split = file.split("_")
    run_query(file_split[0], file_split[1], spark)

STATS 001-014-augA1
+-------+
|min(id)|
+-------+
|  19294|
+-------+

2.0616631507873535
++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
| 19294|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

False False
orig+rewr
0
+-------+
|min(id)|
+-------+
|  19294|
+-------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
| 19294|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

1
+-------+
|min(id)|
+-------+
|  19294|
+-------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
| 19294|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

2
+-------+
|min(id)|
+-------+
|  19294|
+-------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
| 19294|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

3
+-------+
|min(id)|
+--

24/04/30 08:40:04 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


+------+
|EXPR_0|
+------+
|129735|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

2
+-------+
|min(id)|
+-------+
| 129735|
+-------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
|129735|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

3
+-------+
|min(id)|
+-------+
| 129735|
+-------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
|129735|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

4
+-------+
|min(id)|
+-------+
| 129735|
+-------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
|129735|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

STATS 001-014-augF1-augA1
+-------+
|min(id)|
+-------+
|  19294|
+-------+

0.29331040382385254
++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
| 19294|
+---

[Stage 1311:>                                                       (0 + 1) / 1]

cancelling jobs with id 7L4HG3LUEP6Z0XFG
None
cancelled job
cancelling jobs with id YP0AB1CUB59UA9RQ
None
cancelled job
cancelling jobs with id SA9TEFS1HMRN9VYH
None
cancelled job
cancelling jobs with id QSJQ9TFMPY7TPVK5
None
cancelled job
cancelling jobs with id 0JLNF6Y1IO805FJL
None
cancelled job
cancelling jobs with id AGHM6K7VUCJDVI6O
None
cancelled job
cancelling jobs with id D3IR099VQ840KNGC
None
cancelled job
cancelling jobs with id 7EFFQO8WQXSNKV13
None
cancelled job
cancelling jobs with id EFQG1QXEKVQXSV6I
None
cancelled job
cancelling jobs with id U9D8512IE4KW9G7W
None
cancelled job
cancelling jobs with id HEACR23JCE0PF4AR
None
cancelled job
cancelling jobs with id 7PRV45XR90W5XRAF
None
cancelled job
cancelling jobs with id LAZOIY75RE5YTUXT
None
cancelled job
cancelling jobs with id C5JJT4WMLIT2ZA6B
None
cancelled job
cancelling jobs with id 0F7AX3G7IJ11KVBY
None
cancelled job
cancelling jobs with id POFAHTI8HZ6DZYCK
None
cancelled job
cancelling jobs with id WU30WGWN0H69Y1MW

24/04/30 08:42:13 WARN TaskSetManager: Lost task 0.0 in stage 1311.0 (TID 766) (a873fa1f8418 executor driver): TaskKilled (Stage cancelled: Job 766 cancelled part of cancelled job group MDS782UTYX3OD20E)


++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++



                                                                                

+------+
|EXPR_0|
+------+
| 99864|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

True False
rewritten
0
++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
| 99864|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

1
++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
| 99864|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

2
++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
| 99864|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++


[Stage 1434:>                                                       (0 + 1) / 1]

cancelling jobs with id ZHQHN5ZU7T3U1K7H
None
cancelled job
cancelling jobs with id X1LYMWZHQZ64BV4H
None
cancelled job
timeout or error orig: An error occurred while calling o2422.showString.
: org.apache.spark.SparkException: Job 842 cancelled part of cancelled job group X1LYMWZHQZ64BV4H
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.handleJobCancellation(DAGScheduler.scala:2731)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleJobGroupCancelled$4(DAGScheduler.scala:1198)
	at scala.runtime.java8.JFunction1$mcVI$sp.apply(JFunction1$mcVI$sp.java:23)
	at scala.collection.mutable.HashSet.foreach(HashSet.scala:79)
	at org.apache.spark.scheduler.DAGScheduler.handleJobGroupCancelled(DAGScheduler.scala:1197)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3016)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scal

24/04/30 08:43:57 WARN TaskSetManager: Lost task 0.0 in stage 1434.0 (TID 842) (a873fa1f8418 executor driver): TaskKilled (Stage cancelled: Job 842 cancelled part of cancelled job group X1LYMWZHQZ64BV4H)


++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++



                                                                                

+------+
|EXPR_0|
+------+
| 22224|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

True False
rewritten
0
++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
| 22224|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

1
++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
| 22224|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

2
++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
| 22224|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++


[Stage 1557:>                                                       (0 + 1) / 1]

cancelling jobs with id VLBN4PI21017SB9I
None
cancelled job
cancelling jobs with id TIVGTPHBU08WWWCM
None
cancelled job
timeout or error orig: An error occurred while calling o2770.showString.
: org.apache.spark.SparkException: Job 918 cancelled part of cancelled job group TIVGTPHBU08WWWCM
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.handleJobCancellation(DAGScheduler.scala:2731)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleJobGroupCancelled$4(DAGScheduler.scala:1198)
	at scala.runtime.java8.JFunction1$mcVI$sp.apply(JFunction1$mcVI$sp.java:23)
	at scala.collection.mutable.HashSet.foreach(HashSet.scala:79)
	at org.apache.spark.scheduler.DAGScheduler.handleJobGroupCancelled(DAGScheduler.scala:1197)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3016)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scal

24/04/30 08:45:41 WARN TaskSetManager: Lost task 0.0 in stage 1557.0 (TID 918) (a873fa1f8418 executor driver): TaskKilled (Stage cancelled: Job 918 cancelled part of cancelled job group TIVGTPHBU08WWWCM)


++
||
++
++

++
||
++
++

++
||
++
++



                                                                                

+------+
|EXPR_0|
+------+
|186119|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

True False
rewritten
0
++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
|186119|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

1
++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
|186119|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

2
++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
|186119|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++


[Stage 1680:>                                                       (0 + 1) / 1]

cancelling jobs with id SPCM4NQBO1ODDX8Y
None
cancelled job
cancelling jobs with id WYXYTYHL75QKJU2S
None
cancelled job
timeout or error orig: An error occurred while calling o3118.showString.
: org.apache.spark.SparkException: Job 994 cancelled part of cancelled job group WYXYTYHL75QKJU2S
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.handleJobCancellation(DAGScheduler.scala:2731)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleJobGroupCancelled$4(DAGScheduler.scala:1198)
	at scala.runtime.java8.JFunction1$mcVI$sp.apply(JFunction1$mcVI$sp.java:23)
	at scala.collection.mutable.HashSet.foreach(HashSet.scala:79)
	at org.apache.spark.scheduler.DAGScheduler.handleJobGroupCancelled(DAGScheduler.scala:1197)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3016)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scal

24/04/30 08:47:25 WARN TaskSetManager: Lost task 0.0 in stage 1680.0 (TID 994) (a873fa1f8418 executor driver): TaskKilled (Stage cancelled: Job 994 cancelled part of cancelled job group WYXYTYHL75QKJU2S)


++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++



                                                                                

+------+
|EXPR_0|
+------+
| 12233|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

True False
rewritten
0
++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
| 12233|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

1
++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
| 12233|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

2
++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
| 12233|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++


[Stage 1803:>                                                       (0 + 1) / 1]

cancelling jobs with id W6MLALB64H5AOMT8
None
cancelled job
cancelling jobs with id 7BT3WIQU4DZNO1P7
None
cancelled job
timeout or error orig: An error occurred while calling o3466.showString.
: org.apache.spark.SparkException: Job 1070 cancelled part of cancelled job group 7BT3WIQU4DZNO1P7
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.handleJobCancellation(DAGScheduler.scala:2731)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleJobGroupCancelled$4(DAGScheduler.scala:1198)
	at scala.runtime.java8.JFunction1$mcVI$sp.apply(JFunction1$mcVI$sp.java:23)
	at scala.collection.mutable.HashSet.foreach(HashSet.scala:79)
	at org.apache.spark.scheduler.DAGScheduler.handleJobGroupCancelled(DAGScheduler.scala:1197)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3016)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.sca

24/04/30 08:49:08 WARN TaskSetManager: Lost task 0.0 in stage 1803.0 (TID 1070) (a873fa1f8418 executor driver): TaskKilled (Stage cancelled: Job 1070 cancelled part of cancelled job group 7BT3WIQU4DZNO1P7)


++
||
++
++

++
||
++
++



                                                                                

+------+
|EXPR_0|
+------+
|159588|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

True False
rewritten
0
++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
|159588|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

1
++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
|159588|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

2
++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
|159588|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++


[Stage 1926:>                                                       (0 + 1) / 1]

cancelling jobs with id 3W8GFW3PT0JD9LY7
None
cancelled job
cancelling jobs with id HXSKBWFX78WKPICR
None
cancelled job
timeout or error orig: An error occurred while calling o3814.showString.
: org.apache.spark.SparkException: Job 1146 cancelled part of cancelled job group HXSKBWFX78WKPICR
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.handleJobCancellation(DAGScheduler.scala:2731)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleJobGroupCancelled$4(DAGScheduler.scala:1198)
	at scala.runtime.java8.JFunction1$mcVI$sp.apply(JFunction1$mcVI$sp.java:23)
	at scala.collection.mutable.HashSet.foreach(HashSet.scala:79)
	at org.apache.spark.scheduler.DAGScheduler.handleJobGroupCancelled(DAGScheduler.scala:1197)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3016)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.sca

24/04/30 08:50:52 WARN TaskSetManager: Lost task 0.0 in stage 1926.0 (TID 1146) (a873fa1f8418 executor driver): TaskKilled (Stage cancelled: Job 1146 cancelled part of cancelled job group HXSKBWFX78WKPICR)
                                                                                

+------+
|EXPR_0|
+------+
| 24274|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

True False
rewritten
0
++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
| 24274|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

1
++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
| 24274|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

2
++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
| 24274|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++


[Stage 2049:>                                                       (0 + 1) / 1]

cancelling jobs with id FXFPYLCCC8PIE81P
None
cancelled job
cancelling jobs with id 8KOVTQRQ4N900VCP
None
cancelled job
timeout or error orig: An error occurred while calling o4162.showString.
: org.apache.spark.SparkException: Job 1222 cancelled part of cancelled job group 8KOVTQRQ4N900VCP
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.handleJobCancellation(DAGScheduler.scala:2731)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleJobGroupCancelled$4(DAGScheduler.scala:1198)
	at scala.runtime.java8.JFunction1$mcVI$sp.apply(JFunction1$mcVI$sp.java:23)
	at scala.collection.mutable.HashSet.foreach(HashSet.scala:79)
	at org.apache.spark.scheduler.DAGScheduler.handleJobGroupCancelled(DAGScheduler.scala:1197)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3016)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.sca

24/04/30 08:52:35 WARN TaskSetManager: Lost task 0.0 in stage 2049.0 (TID 1222) (a873fa1f8418 executor driver): TaskKilled (Stage cancelled: Job 1222 cancelled part of cancelled job group 8KOVTQRQ4N900VCP)
                                                                                

+------+
|EXPR_0|
+------+
|146852|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

True False
rewritten
0
++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
|146852|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

1
++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
|146852|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

2
++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
|146852|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++


[Stage 2172:>                                                       (0 + 1) / 1]

cancelling jobs with id 5W5ZB6QUPG1BELGM
None
cancelled job
cancelling jobs with id PH0V685F1GW5PPL5
None
cancelled job
timeout or error orig: An error occurred while calling o4510.showString.
: org.apache.spark.SparkException: Job 1298 cancelled part of cancelled job group PH0V685F1GW5PPL5
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.handleJobCancellation(DAGScheduler.scala:2731)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleJobGroupCancelled$4(DAGScheduler.scala:1198)
	at scala.runtime.java8.JFunction1$mcVI$sp.apply(JFunction1$mcVI$sp.java:23)
	at scala.collection.mutable.HashSet.foreach(HashSet.scala:79)
	at org.apache.spark.scheduler.DAGScheduler.handleJobGroupCancelled(DAGScheduler.scala:1197)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3016)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.sca

24/04/30 08:54:18 WARN TaskSetManager: Lost task 0.0 in stage 2172.0 (TID 1298) (a873fa1f8418 executor driver): TaskKilled (Stage cancelled: Job 1298 cancelled part of cancelled job group PH0V685F1GW5PPL5)
                                                                                

+------+
|EXPR_0|
+------+
| 13203|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

True False
rewritten
0
++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
| 13203|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

1
++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
| 13203|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

2
++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
| 13203|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++


[Stage 2295:>                                                       (0 + 1) / 1]

cancelling jobs with id RJJDP5FO36AX7J3N
None
cancelled job
cancelling jobs with id K40R37AIHAPAJ9Q7
None
cancelled job
timeout or error orig: An error occurred while calling o4858.showString.
: org.apache.spark.SparkException: Job 1374 cancelled part of cancelled job group K40R37AIHAPAJ9Q7
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.handleJobCancellation(DAGScheduler.scala:2731)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleJobGroupCancelled$4(DAGScheduler.scala:1198)
	at scala.runtime.java8.JFunction1$mcVI$sp.apply(JFunction1$mcVI$sp.java:23)
	at scala.collection.mutable.HashSet.foreach(HashSet.scala:79)
	at org.apache.spark.scheduler.DAGScheduler.handleJobGroupCancelled(DAGScheduler.scala:1197)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3016)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.sca

24/04/30 08:56:02 WARN TaskSetManager: Lost task 0.0 in stage 2295.0 (TID 1374) (a873fa1f8418 executor driver): TaskKilled (Stage cancelled: Job 1374 cancelled part of cancelled job group K40R37AIHAPAJ9Q7)
                                                                                

+------+
|EXPR_0|
+------+
| 72210|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

True False
rewritten
0
++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
| 72210|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

1
++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
| 72210|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

2
++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
| 72210|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++


                                                                                

+-------+
|min(id)|
+-------+
|   NULL|
+-------+

18.800420999526978
++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
|  NULL|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

False False
orig+rewr
0


                                                                                

+-------+
|min(id)|
+-------+
|   NULL|
+-------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
|  NULL|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

1


                                                                                

+-------+
|min(id)|
+-------+
|   NULL|
+-------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
|  NULL|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

2


                                                                                

+-------+
|min(id)|
+-------+
|   NULL|
+-------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
|  NULL|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

3


[Stage 2562:>                                                       (0 + 1) / 1]

cancelling jobs with id 4CHSF805JBYV2SVQ
None
cancelled job


                                                                                

+-------+
|min(id)|
+-------+
|   NULL|
+-------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
|  NULL|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

4


[Stage 2598:>                                                       (0 + 1) / 1]

cancelling jobs with id C72OYMN49XKSVXC4
None
cancelled job


                                                                                

+-------+
|min(id)|
+-------+
|   NULL|
+-------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
|  NULL|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

STATS 002-048-augF2-augA2


[Stage 2634:>                                                       (0 + 1) / 1]

cancelling jobs with id JNVOEVHHI2LL4AR4
None
cancelled job


                                                                                

+-------+
|min(id)|
+-------+
|   NULL|
+-------+

18.24640703201294
++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
|  NULL|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

False False
orig+rewr
0


                                                                                

+-------+
|min(id)|
+-------+
|   NULL|
+-------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
|  NULL|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

1


                                                                                

+-------+
|min(id)|
+-------+
|   NULL|
+-------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
|  NULL|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

2


                                                                                

+-------+
|min(id)|
+-------+
|   NULL|
+-------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
|  NULL|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

3


                                                                                

+-------+
|min(id)|
+-------+
|   NULL|
+-------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
|  NULL|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

4


[Stage 2799:>                                                       (0 + 1) / 1]

cancelling jobs with id 9K345TCBECXE7O50
None
cancelled job


                                                                                

+-------+
|min(id)|
+-------+
|   NULL|
+-------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
|  NULL|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

STATS 002-048-augF2-augA3


[Stage 2832:>                                                       (0 + 1) / 1]

cancelling jobs with id AL6QMSVL6GXH584U
None
cancelled job


                                                                                

+-------+
|min(id)|
+-------+
|   NULL|
+-------+

28.95664405822754
++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
|  NULL|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

False False
orig+rewr
0


                                                                                

+-------+
|min(id)|
+-------+
|   NULL|
+-------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
|  NULL|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

1


                                                                                

+-------+
|min(id)|
+-------+
|   NULL|
+-------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
|  NULL|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

2


                                                                                

+-------+
|min(id)|
+-------+
|   NULL|
+-------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
|  NULL|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

3


[Stage 2976:>                                                       (0 + 1) / 1]

cancelling jobs with id 6FYSVL184JL75KWQ
None
cancelled job


                                                                                

+-------+
|min(id)|
+-------+
|   NULL|
+-------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
|  NULL|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

4


[Stage 3012:>                                                       (0 + 1) / 1]

cancelling jobs with id ARPAVT3S0B6A5OD6
None
cancelled job


24/04/30 09:02:03 WARN TaskSetManager: Lost task 0.0 in stage 3012.0 (TID 1818) (a873fa1f8418 executor driver): TaskKilled (Stage cancelled: Job 1818 cancelled part of cancelled job group ARPAVT3S0B6A5OD6)


Py4JJavaError: An error occurred while calling o6232.showString.
: org.apache.spark.SparkException: Job 1818 cancelled part of cancelled job group ARPAVT3S0B6A5OD6
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.handleJobCancellation(DAGScheduler.scala:2731)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleJobGroupCancelled$4(DAGScheduler.scala:1198)
	at scala.runtime.java8.JFunction1$mcVI$sp.apply(JFunction1$mcVI$sp.java:23)
	at scala.collection.mutable.HashSet.foreach(HashSet.scala:79)
	at org.apache.spark.scheduler.DAGScheduler.handleJobGroupCancelled(DAGScheduler.scala:1197)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3016)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)


In [9]:
run_query("STATS", "002-048-augF2-augA3", spark)

STATS 002-048-augF2-augA3


24/05/06 07:33:31 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

+-------+
|min(id)|
+-------+
|   NULL|
+-------+

15.813504219055176
++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
|  NULL|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

False False
orig+rewr
0


                                                                                

+-------+
|min(id)|
+-------+
|   NULL|
+-------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
|  NULL|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

1


                                                                                

+-------+
|min(id)|
+-------+
|   NULL|
+-------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
|  NULL|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

2


                                                                                

+-------+
|min(id)|
+-------+
|   NULL|
+-------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

+------+
|EXPR_0|
+------+
|  NULL|
+------+

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

++
||
++
++

3


[Stage 159:>                                                        (0 + 1) / 1]

cancelling jobs with id TZP173JMAMLZT3YB
None
cancelled job
cancelling jobs with id NIHCEJD4TDAQ2LDF
None
cancelled job


24/05/06 07:35:14 WARN TaskSetManager: Lost task 0.0 in stage 159.0 (TID 98) (a873fa1f8418 executor driver): TaskKilled (Stage cancelled: Job 98 cancelled part of cancelled job group NIHCEJD4TDAQ2LDF)


Py4JJavaError: An error occurred while calling o399.showString.
: org.apache.spark.SparkException: Job 98 cancelled part of cancelled job group NIHCEJD4TDAQ2LDF
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.handleJobCancellation(DAGScheduler.scala:2731)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleJobGroupCancelled$4(DAGScheduler.scala:1198)
	at scala.runtime.java8.JFunction1$mcVI$sp.apply(JFunction1$mcVI$sp.java:23)
	at scala.collection.mutable.HashSet.foreach(HashSet.scala:79)
	at org.apache.spark.scheduler.DAGScheduler.handleJobGroupCancelled(DAGScheduler.scala:1197)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3016)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)


In [None]:
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = [file for file in files if file.endswith('_output.json') and file.startswith('STATS')][21:]

for file in output_files:
    file_split = file.split("_")
    run_query(file_split[0], file_split[1], spark)

### SNAP

In [None]:
spark = create_spark()
import_db(spark, "SNAP")

In [None]:
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = [file for file in files if file.endswith('_output.json') and file.startswith('SNAP')]

for file in output_files:
    file_split = file.split("_")
    run_query(file_split[0], file_split[1], spark)

### JOB

In [None]:
spark = create_spark()
import_db(spark, "JOB")

In [None]:
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = [file for file in files if file.endswith('_output.json') and file.startswith('JOB')]

for file in output_files:
    file_split = file.split("_")
    run_query(file_split[0], file_split[1], spark)

### LSQB

In [None]:
spark = create_spark()
import_db(spark, "LSQB")

In [None]:
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = [file for file in files if file.endswith('_output.json') and file.startswith('LSQB')]

for file in output_files:
    file_split = file.split("_")
    run_query(file_split[0], file_split[1], spark)

### HETIO

In [None]:
spark = create_spark()
import_db(spark, "HETIO")

In [None]:
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = [file for file in files if file.endswith('_output.json') and file.startswith('HETIO')]

for file in output_files:
    file_split = file.split("_")
    run_query(file_split[0], file_split[1], spark)