# Running queries using the JSON files produced by the Scala code

Install and import all needed packages

In [None]:
%%bash
# Third-party packages imported by this notebook.
# py4j is pulled in as a dependency of pyspark; psutil is imported below
# for the resource sampling and therefore has to be installed explicitly.
pip install numpy
pip install pandas
pip install pyspark
pip install psutil

In [None]:
import json
import time
import numpy as np
import csv
import multiprocessing
import signal
import pandas as pd
import os
import threading
import random
from pyspark.sql import SparkSession
from py4j.protocol import Py4JJavaError, Py4JError
import psutil
import string

Function for running one query. This means:
*  run the original query 5 times (after one initial warm-up run, which is not measured)
*  run the rewritten queries 5 times (after one initial warm-up run, which is not measured) and drop the created tables after each run
*  take the runtimes and calculate the mean, median and standard deviation for each of the original and the rewritten query
*  additionally save the runtimes of each stage
*  save everything in a CSV output file

In [None]:
# create a spark session
def create_spark():
    """Build (or fetch the already running) local Spark session.

    The number of local cores and the driver/executor memory are taken from
    the globals ``SPARK_CORES`` and ``SPARK_MEMORY``; the PostgreSQL JDBC
    driver jar is put on the classpath so that ``import_db`` can read tables.

    Returns:
        The ``SparkSession`` returned by ``getOrCreate()``.
    """
    builder = SparkSession.builder.appName("app").master(f'local[{SPARK_CORES}]')

    # Applied in this order, one ``config`` call per entry.
    settings = [
        ("spark.driver.memory", f'{SPARK_MEMORY}g'),
        ("spark.executor.memory", f'{SPARK_MEMORY}g'),
        ("spark.memory.offHeap.enabled", False),
        ("spark.jars", "postgresql-42.3.3.jar"),
    ]
    for key, value in settings:
        builder = builder.config(key, value)

    return builder.getOrCreate()

# import the database into Spark from PostgreSQL
def import_db(spark, dbname):
    """Register every public table of a PostgreSQL database as a Spark temp view.

    The table list is read from ``information_schema.tables``; each table whose
    schema is ``public`` is loaded through JDBC and registered under its own
    name with ``createOrReplaceTempView``.

    Args:
        spark: the Spark session used for reading and for registering views.
        dbname: benchmark name. ``"JOB"`` is mapped to the database ``imdb``;
            any other name is lower-cased. The database name is also used as
            the user name and as the password.
    """
    dbname = "imdb" if dbname == "JOB" else dbname.lower()
    username = dbname
    password = dbname

    # One URL for the catalogue lookup and for the table reads, so that a
    # change of DBHOST applies to both (the catalogue lookup used to be
    # hard-wired to the host "postgres").
    url = f'jdbc:postgresql://{DBHOST}:5432/{dbname}'

    def read_jdbc(table):
        # Load a single table (or catalogue relation) through the JDBC source.
        return spark.read.format("jdbc") \
            .option("url", url) \
            .option("driver", "org.postgresql.Driver") \
            .option("dbtable", table) \
            .option("user", username) \
            .option("password", password) \
            .load()

    df_tables = read_jdbc("information_schema.tables")

    for _, row in df_tables.toPandas().iterrows():
        if row.table_schema != 'public':
            continue
        table_name = row.table_name
        df = read_jdbc(table_name)

        print(table_name)
        df.createOrReplaceTempView(table_name)

In [None]:
# functions for handling TO and cancelling those queries in case of a TO
def measure_resource_usage(resource_usage):
    """Append one resource snapshot per second to ``resource_usage``.

    Meant to be used as a thread target. The loop keeps going until the
    attribute ``do_run`` of the executing thread is set to a falsy value
    (a thread without that attribute counts as "keep running").

    Args:
        resource_usage: list that receives the snapshots produced by
            ``get_resource_usage``; it is modified in place.
    """
    worker = threading.current_thread()
    elapsed = 0
    while True:
        if not getattr(worker, "do_run", True):
            break
        resource_usage.append(get_resource_usage(elapsed))
        elapsed += 1
        time.sleep(1)

def get_resource_usage(t):
    """Take one snapshot of memory and CPU utilisation via psutil.

    Args:
        t: value stored unchanged under the key ``'time'``.

    Returns:
        A dict with the keys ``'time'``, ``'memory'`` (result of
        ``psutil.virtual_memory()``), ``'cpu'`` (per-CPU percentages) and
        ``'cpu_total'`` (overall percentage).
    """
    snapshot = {'time': t}
    snapshot['memory'] = psutil.virtual_memory()
    # interval=None makes both calls non-blocking.
    snapshot['cpu'] = psutil.cpu_percent(interval=None, percpu=True)
    snapshot['cpu_total'] = psutil.cpu_percent(interval=None, percpu=False)
    return snapshot
    
def cancel_query(spark, seconds, group_id):
    """Wait ``seconds`` and then cancel the Spark job group ``group_id``.

    Args:
        spark: session whose ``sparkContext`` receives the cancel request.
        seconds: delay before the cancellation is issued.
        group_id: id of the job group to cancel.
    """
    time.sleep(seconds)
    print("cancelling jobs with id " + group_id)
    outcome = spark.sparkContext.cancelJobGroup(group_id)
    print(outcome)
    print("cancelled job")

def cancel_query_after(spark, seconds):
    """Put the following Spark jobs into a fresh job group and arm a timeout.

    A random 16-character id (upper-case letters and digits) is set as job
    group id and description, and a background thread running
    ``cancel_query`` is started that cancels that group after ``seconds``.

    Args:
        spark: session whose ``sparkContext`` gets the job group.
        seconds: delay handed to ``cancel_query``.

    Returns:
        The generated job group id.
    """
    alphabet = string.ascii_uppercase + string.digits
    # random id
    group_id = ''.join([random.choice(alphabet) for _ in range(16)])
    spark.sparkContext.setJobGroup(group_id, group_id)

    watchdog = threading.Thread(target=cancel_query, args=(spark, seconds, group_id))
    watchdog.start()
    return group_id

In [None]:
# function to run the query 6 times checking for TO and calculate and saving all values
_NO_VALUE = "-"           # placeholder written where no measurement exists
_TIMED_RUNS = 5           # measured runs per query (the warm-up run is extra)
_IDLE_JOB_GROUP = "idle"  # job group used whenever no timeout watchdog is armed


def _to_spark_sql(statements):
    """Return the statements adapted so that SparkSQL can execute them."""
    return [statement.replace(" UNLOGGED TABLE ", " VIEW ")
                     .replace("TIMESTAMP(0)", "TIMESTAMP")
                     .replace("$", "_")
                     .replace("CREATE VIEW", "CREATE TEMPORARY VIEW")
            for statement in statements]


def _run_statements(spark, statements):
    """Execute every statement with ``spark.sql`` and ``show()`` its result."""
    for statement in statements:
        spark.sql(statement).show()


def _warm_up(spark, label, action):
    """Run ``action()`` once under the TIMEOUT watchdog; return (timed_out, value)."""
    resource_usage = []
    sampler = threading.Thread(target=measure_resource_usage, args=(resource_usage,))
    sampler.start()
    started = time.time()
    try:
        spark.sparkContext._jvm.System.gc()
        cancel_query_after(spark, TIMEOUT)
        value = action()
        print(time.time() - started)
        return False, value
    except Py4JError as e:
        print(f'timeout or error {label}: ' + str(e))
        return True, None
    finally:
        # Stop the sampler on failure too; otherwise its thread never ends.
        sampler.do_run = False
        # Leave the watchdog's job group. Its cancel thread is still pending
        # and would otherwise cancel whichever timed run is active when it
        # fires.
        spark.sparkContext.setJobGroup(_IDLE_JOB_GROUP, _IDLE_JOB_GROUP)


def _time_original(spark, original_query):
    """Return the seconds one run of the original query took, or "-" on failure."""
    try:
        started = time.perf_counter()
        spark.sql(original_query).show()
        return time.perf_counter() - started
    except Exception as e:
        print('error in timed run (orig): ' + str(e))
        return _NO_VALUE


def _time_rewritten(spark, stages):
    """Return [stage0, stage1, stage2, stage3, total] seconds, or five "-" on failure."""
    try:
        durations = []
        for statements in stages:
            started = time.perf_counter()
            _run_statements(spark, statements)
            durations.append(time.perf_counter() - started)
        return durations + [sum(durations)]
    except Exception as e:
        print('error in timed run (rewr): ' + str(e))
        return [_NO_VALUE] * (len(stages) + 1)


def _summarise(times):
    """Return (mean, median, std) of the numeric entries; NaN if there are none."""
    numeric = [t for t in times if t != _NO_VALUE]
    if not numeric:
        nan = float("nan")
        return nan, nan, nan
    return np.mean(numeric), np.median(numeric), np.std(numeric)


def run_query(benchmark, query, spark):
    """Benchmark one query in its original and rewritten form and log the result.

    Reads ``output/{benchmark}_{query}_output.json`` (original query and list
    of rewritten statements) and ``output/{benchmark}_{query}_drop.json``
    (clean-up statements). Each variant gets one warm-up run guarded by the
    TIMEOUT watchdog; a variant whose warm-up fails is reported as "TO" and is
    not measured. The remaining variants are executed five times; the
    rewritten variant is timed per stage and its views are dropped after every
    run. One row with all timings, mean/median/std and the row count is
    appended to the results CSV.

    Args:
        benchmark: benchmark prefix of the JSON files ("IMDB" is written to
            the CSV as "JOB").
        query: query identifier used in the JSON file names.
        spark: Spark session with the benchmark tables registered as views.
    """
    print(benchmark, query)
    with open(f'output/{benchmark}_{query}_output.json', 'r') as file:
        json_data = json.load(file)

    # get the original and rewritten query
    original_query = json_data["original_query"]
    raw_rewritten = json_data["rewritten_query"]

    # The stages are told apart on the raw statements: after the SparkSQL
    # adaptation every "UNLOGGED TABLE" statement would contain "VIEW" too.
    # The last two statements always form stage 3.
    setup = raw_rewritten[:-2]
    stages = [
        _to_spark_sql([s for s in setup if "VIEW" in s]),
        _to_spark_sql([s for s in setup if "stage2" not in s and "VIEW" not in s]),
        _to_spark_sql([s for s in setup if "stage2" in s]),
        _to_spark_sql(raw_rewritten[-2:]),
    ]
    rewritten_query_list = _to_spark_sql(raw_rewritten)

    # get the drop queries
    with open(f'output/{benchmark}_{query}_drop.json', 'r') as file:
        json_drop = json.load(file)
    drop_query_list = [drop_query.lower()
                                 .replace("drop view", "drop view if exists")
                                 .replace("drop table", "drop table if exists")
                       for drop_query in json_drop["rewritten_query"]]
    if drop_query_list:
        drop_query_list[0] = drop_query_list[0].replace("table", "view")

    # start from a clean state
    _run_statements(spark, drop_query_list)

    # the first run is just a warm up run and to check for the time out
    timeout_flag_orig, rows_orig = _warm_up(
        spark, 'orig', lambda: spark.sql(original_query).count())

    def warm_up_rewritten():
        # Only SELECT statements yield the row count; the others create views.
        row_count = _NO_VALUE
        for statement in rewritten_query_list:
            result = spark.sql(statement)
            if statement.startswith("SELECT"):
                row_count = result.count()
            else:
                result.show()
        return row_count

    timeout_flag_rewr, rows_rewr = _warm_up(spark, 'rewr', warm_up_rewritten)
    if not timeout_flag_rewr:
        try:
            _run_statements(spark, drop_query_list)
        except Py4JError as e:
            print('timeout or error rewr: ' + str(e))

    print(timeout_flag_orig, timeout_flag_rewr)

    # take times for 5 runs (run 2-6); a variant that timed out is not run
    # again and only gets placeholders
    list_original_time = []
    rewritten_runs = []
    for i in range(_TIMED_RUNS):
        print(i)
        if timeout_flag_orig:
            list_original_time.append(_NO_VALUE)
        else:
            spark.sparkContext._jvm.System.gc()
            list_original_time.append(_time_original(spark, original_query))

        if timeout_flag_rewr:
            rewritten_runs.append([_NO_VALUE] * (len(stages) + 1))
        else:
            spark.sparkContext._jvm.System.gc()
            rewritten_runs.append(_time_rewritten(spark, stages))
            # drop all created tables
            _run_statements(spark, drop_query_list)

    # one list per stage plus one for the total, each with one entry per run
    columns = [list(column) for column in zip(*rewritten_runs)]
    stage_columns = columns[:-1]
    list_rewritten_time = columns[-1]

    if timeout_flag_orig:
        orig_mean, orig_med, orig_std = "TO", "TO", _NO_VALUE
    else:
        orig_mean, orig_med, orig_std = _summarise(list_original_time)

    if timeout_flag_rewr:
        stage_stats = [(_NO_VALUE, _NO_VALUE, _NO_VALUE)] * len(stage_columns)
        rewr_mean, rewr_med, rewr_std = "TO", "TO", _NO_VALUE
    else:
        stage_stats = [_summarise(column) for column in stage_columns]
        rewr_mean, rewr_med, rewr_std = _summarise(list_rewritten_time)

    # which variant is faster, and how many rows the query returns
    if timeout_flag_orig and timeout_flag_rewr:
        orig_rewr, rows = _NO_VALUE, _NO_VALUE
    elif timeout_flag_orig:
        orig_rewr, rows = "rewr", rows_rewr
    elif timeout_flag_rewr:
        orig_rewr, rows = "orig", rows_orig
    else:
        orig_rewr = "rewr" if orig_med > rewr_med else "orig"
        rows = rows_orig if rows_orig == rows_rewr else "not the same!"

    if benchmark == "IMDB":
        benchmark = "JOB"

    list_output = [benchmark, query, orig_rewr, orig_med, rewr_med]
    list_output += [median for _, median, _ in stage_stats]
    list_output.append(rows)
    list_output += list_original_time + [orig_mean, orig_std]
    list_output += list_rewritten_time + [rewr_mean, rewr_std]
    for column, (mean, _, std) in zip(stage_columns, stage_stats):
        list_output += column + [mean, std]

    file_path = "results/SPA_Scala_comparison_TO_augment_server_full_enum_infos.csv"
    with open(file_path, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(list_output)

In [None]:
# Global configuration
# Memory in GB, used for both spark.driver.memory and spark.executor.memory.
SPARK_MEMORY = 120
# Number of local cores, used as local[SPARK_CORES] for the Spark master.
SPARK_CORES = 4
# Host name of the PostgreSQL server used in the JDBC URL.
DBHOST = 'postgres'
# Seconds after which the warm-up run of a query is cancelled.
TIMEOUT = 100

Create the output CSV file with its header row. The running times of each query are appended to it later.

In [None]:
file_path = "results/SPA_Scala_comparison_TO_augment_server_full_enum_infos.csv"

# Summary columns, followed by one group per measured series. Each group
# holds the five individual runs plus mean and standard deviation.
names = ["bench", "query", "orig/rewr(med)", "orig(med)", "rewr(med)",
         "stage0(med)", "stage1(med)", "stage2(med)", "stage3(med)", "rows"]
for series in ["orig", "rewr", "stage0", "stage1", "stage2", "stage3"]:
    names += [f"{series} {run}" for run in range(1, 6)]
    names += [f"{series}(mean)", f"{series}(std)"]

# Create the results directory first, otherwise open() fails on a fresh checkout.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# 'w' starts a fresh file; existing results under this path are overwritten.
with open(file_path, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(names)

Connect to Spark for each dataset and execute all queries:

### STATS

In [None]:
# Get the Spark session and register the tables of the STATS database as views.
spark = create_spark()
import_db(spark, "STATS")

In [None]:
# Benchmark every STATS query for which a rewriting result exists.
folder_path = 'output/'
files = sorted(os.listdir(folder_path))
output_files = [name for name in files
                if name.startswith('STATS') and name.endswith('_output.json')]

for file in output_files:
    # file names look like <benchmark>_<query>_output.json
    file_split = file.split("_")
    benchmark_name, query_name = file_split[0], file_split[1]
    run_query(benchmark_name, query_name, spark)

### SNAP

In [None]:
# Get the Spark session and register the tables of the SNAP database as views.
spark = create_spark()
import_db(spark, "SNAP")

In [None]:
# Benchmark every SNAP query for which a rewriting result exists.
folder_path = 'output/'
files = sorted(os.listdir(folder_path))
output_files = [name for name in files
                if name.startswith('SNAP') and name.endswith('_output.json')]

for file in output_files:
    # file names look like <benchmark>_<query>_output.json
    file_split = file.split("_")
    benchmark_name, query_name = file_split[0], file_split[1]
    run_query(benchmark_name, query_name, spark)

### JOB

In [None]:
# Get the Spark session and register the JOB tables as views
# (import_db maps "JOB" to the database "imdb").
spark = create_spark()
import_db(spark, "JOB")

In [None]:
# Benchmark every JOB query for which a rewriting result exists
# (the result files of this benchmark carry the prefix IMDB).
folder_path = 'output/'
files = sorted(os.listdir(folder_path))
output_files = [name for name in files
                if name.startswith('IMDB') and name.endswith('_output.json')]

for file in output_files:
    # file names look like <benchmark>_<query>_output.json
    file_split = file.split("_")
    benchmark_name, query_name = file_split[0], file_split[1]
    run_query(benchmark_name, query_name, spark)

### LSQB

In [None]:
# Get the Spark session and register the tables of the LSQB database as views.
spark = create_spark()
import_db(spark, "LSQB")

In [None]:
# Benchmark every LSQB query for which a rewriting result exists.
folder_path = 'output/'
files = sorted(os.listdir(folder_path))
output_files = [name for name in files
                if name.startswith('LSQB') and name.endswith('_output.json')]

for file in output_files:
    # file names look like <benchmark>_<query>_output.json
    file_split = file.split("_")
    benchmark_name, query_name = file_split[0], file_split[1]
    run_query(benchmark_name, query_name, spark)

### HETIO

In [None]:
# Get the Spark session and register the tables of the HETIO database as views.
spark = create_spark()
import_db(spark, "HETIO")

In [None]:
# Benchmark every HETIO query for which a rewriting result exists.
folder_path = 'output/'
files = sorted(os.listdir(folder_path))
output_files = [name for name in files
                if name.startswith('HETIO') and name.endswith('_output.json')]

for file in output_files:
    # file names look like <benchmark>_<query>_output.json
    file_split = file.split("_")
    benchmark_name, query_name = file_split[0], file_split[1]
    run_query(benchmark_name, query_name, spark)