# Running queries using the JSON files produced by the Scala code

Install and import all needed packages

In [None]:
%%bash
# Install the PostgreSQL driver (psycopg2) plus numpy and pandas, which the
# import cell of this notebook loads.
pip install psycopg2-binary
pip install numpy
pip install pandas

In [None]:
import json
import time
import psycopg2
import numpy as np
import csv
import multiprocessing
import signal
import pandas as pd
import os

Function for running one query. This means:
*  run the original query 5 times (after one initial warm-up run, which we do not use)
*  run the rewritten queries 5 times (after one initial warm-up run, which we do not use) and drop the created tables each time
*  take the runtimes and calculate the mean, median and standard deviation of the time for the original and for the rewritten query
*  additionally save the runtimes for each stage
*  save everything in a CSV output file

In [None]:
# functions to handle the timeouts
def handler_orig(signum, frame):
    """Signal handler used while the original query is running.

    Records the timeout in the module-level ``timeout_flag_orig`` and then
    raises an ``Exception`` so the code that was executing is abandoned.
    ``signum`` and ``frame`` are the standard signal-handler arguments and
    are not used.
    """
    # Equivalent to a ``global`` declaration: write the flag into module scope.
    globals()["timeout_flag_orig"] = True
    message = "Query execution of the original query > 100s"
    raise Exception(message)

def handler_rewr(signum, frame):
    """Signal handler used while the rewritten queries are running.

    Records the timeout in the module-level ``timeout_flag_rewr`` and then
    raises an ``Exception`` so the code that was executing is abandoned.
    ``signum`` and ``frame`` are the standard signal-handler arguments and
    are not used.
    """
    # Equivalent to a ``global`` declaration: write the flag into module scope.
    globals()["timeout_flag_rewr"] = True
    message = "Query execution of the rewritten query > 100s"
    raise Exception(message)

# Alarm budget (seconds) for each warm-up run. The messages raised by
# handler_orig / handler_rewr mention "100s", so keep them in sync when changing it.
_TIMEOUT_SECONDS = 100
# Number of timed repetitions. The CSV header has exactly five per-run columns
# per measurement, so changing this requires changing the header as well.
_NUM_RUNS = 5
_RESULTS_CSV = "results/POS_Scala_comparison_TO_augment_server_full_enum_infos.csv"


def _load_queries(benchmark, query):
    """Read the original, rewritten and drop statements of one benchmark query.

    Returns ``(original_query, rewritten_query_list, stages, drop_query_list)``.
    ``stages`` is a list of four statement lists (stage 0 to stage 3).
    """
    with open(f'output/{benchmark}_{query}_output.json', 'r') as file:
        json_data = json.load(file)
    original_query = json_data["original_query"]
    rewritten_query_list = json_data["rewritten_query"]

    # The second-to-last statement creates a VIEW instead of an UNLOGGED TABLE.
    rewritten_query_list[-2] = rewritten_query_list[-2].replace("UNLOGGED TABLE", "VIEW")
    setup_queries = rewritten_query_list[:-2]
    stages = [
        # stage 0: every setup statement mentioning "VIEW"
        [q for q in setup_queries if "VIEW" in q],
        # stage 1: the setup statements that belong to neither stage 0 nor stage 2
        [q for q in setup_queries if "stage2" not in q and "VIEW" not in q],
        # stage 2: every setup statement mentioning "stage2"
        [q for q in setup_queries if "stage2" in q],
        # stage 3: the last two statements
        rewritten_query_list[-2:],
    ]

    with open(f'output/{benchmark}_{query}_drop.json', 'r') as file:
        json_drop = json.load(file)
    drop_query_list = json_drop["rewritten_query"]
    # Presumably the first drop statement removes the object that was turned
    # into a VIEW above, so it has to become a DROP VIEW -- TODO confirm.
    drop_query_list[0] = drop_query_list[0].replace("TABLE", "VIEW")
    return original_query, rewritten_query_list, stages, drop_query_list


def _time_statements(cur, statements):
    """Execute ``statements`` in order on ``cur`` and return the elapsed seconds."""
    start = time.time()
    for statement in statements:
        cur.execute(statement)
    return time.time() - start


def _summarize(times):
    """Return ``(mean, median, standard deviation)`` of a list of runtimes."""
    return np.mean(times), np.median(times), np.std(times)


def run_query(benchmark, query):
    """Benchmark one query in its original and rewritten form and log the result.

    Reads ``output/{benchmark}_{query}_output.json`` and
    ``output/{benchmark}_{query}_drop.json``, connects to PostgreSQL and

    * runs the original query once as a warm-up under a SIGALRM alarm,
    * runs the rewritten statements (and the drop statements) once as a
      warm-up under a second alarm,
    * runs every variant that was not flagged as timed out five more times,
      timing the original query, the four rewritten stages and their sum,
    * appends one row (medians, row count, per-run times, means and standard
      deviations) to the results CSV.

    A variant flagged as timed out is reported as "TO" / "-" instead of numbers.
    The row count is "-" when it could not be determined and "not the same!"
    when the original and rewritten result sizes differ.

    Args:
        benchmark: benchmark name as used in the file names; "JOB" connects to
            the "imdb" database, and "IMDB" is written to the CSV as "JOB".
        query: query name as used in the file names.

    Returns:
        None. The result is appended to the CSV file.

    NOTE(review): whether the alarm interrupts a statement that is still
    executing depends on how psycopg2 handles signals -- verify before relying
    on the timeout as a hard limit.
    """
    global timeout_flag_orig
    global timeout_flag_rewr

    print(benchmark, query)
    original_query, rewritten_query_list, stages, drop_query_list = _load_queries(benchmark, query)

    # connect to PostgreSQL
    database = "imdb" if benchmark == "JOB" else benchmark.lower()
    conn = psycopg2.connect(
        host="postgres",
        database=database,
        user=database,
        password=database
    )
    try:
        cur = conn.cursor()

        # The handlers set these flags when the alarm fires.
        timeout_flag_orig = False
        timeout_flag_rewr = False
        # Row counts stay None when the corresponding warm-up did not get as
        # far as fetching a result.
        rows_orig = None
        rows_rewr = None

        print("original1")
        # the first run is just a warm up run and to check for the time out
        signal.signal(signal.SIGALRM, handler_orig)
        signal.alarm(_TIMEOUT_SECONDS)
        try:
            try:
                cur.execute(original_query)
                rows_orig = len(cur.fetchall())
            finally:
                # Always cancel the alarm so it cannot fire in later code.
                signal.alarm(0)
        except Exception as exc:
            print(exc)
            # A failed statement aborts the transaction; roll back so that the
            # following statements can run.
            conn.rollback()

        # Remove leftovers of the rewritten query, if any.
        for drop_query in drop_query_list:
            cur.execute(drop_query.replace("DROP VIEW", "DROP VIEW IF EXISTS").replace("DROP TABLE", "DROP TABLE IF EXISTS"))

        print("rewritten1")
        signal.signal(signal.SIGALRM, handler_rewr)
        signal.alarm(_TIMEOUT_SECONDS)
        last_executed_query = None
        try:
            try:
                for rewritten_query in rewritten_query_list:
                    last_executed_query = rewritten_query
                    cur.execute(rewritten_query)
                    if rewritten_query.startswith("SELECT"):
                        rows_rewr = len(cur.fetchall())
                for drop_query in drop_query_list:
                    last_executed_query = drop_query
                    cur.execute(drop_query)
            finally:
                signal.alarm(0)
        except Exception as exc:
            print(exc)
            print(last_executed_query)
            # Also discards whatever the interrupted warm-up had created so far.
            conn.rollback()

        print(timeout_flag_orig, timeout_flag_rewr)
        run_orig = not timeout_flag_orig
        run_rewr = not timeout_flag_rewr

        list_original_time = []
        stage_times = [[] for _ in stages]
        list_rewritten_time = []
        if run_orig or run_rewr:
            for i in range(_NUM_RUNS):
                print(i)
                if run_orig:
                    # execute the original query
                    list_original_time.append(_time_statements(cur, [original_query]))
                if run_rewr:
                    # execute the rewritten query, stage by stage
                    elapsed = [_time_statements(cur, stage) for stage in stages]
                    for times, value in zip(stage_times, elapsed):
                        times.append(value)
                    list_rewritten_time.append(sum(elapsed))
                    # drop all created tables
                    for drop_query in drop_query_list:
                        cur.execute(drop_query)
    finally:
        # Closing without commit discards the transaction and releases the
        # connection even when a statement above raised.
        conn.close()

    placeholder_times = ["-"] * _NUM_RUNS
    if run_orig:
        orig_mean, orig_med, orig_std = _summarize(list_original_time)
    else:
        list_original_time = list(placeholder_times)
        orig_mean, orig_med, orig_std = "TO", "TO", "-"

    if run_rewr:
        rewr_mean, rewr_med, rewr_std = _summarize(list_rewritten_time)
        stage_stats = [_summarize(times) for times in stage_times]
    else:
        list_rewritten_time = list(placeholder_times)
        rewr_mean, rewr_med, rewr_std = "TO", "TO", "-"
        stage_times = [list(placeholder_times) for _ in stages]
        stage_stats = [("-", "-", "-") for _ in stages]

    if run_orig and run_rewr:
        # both queries are no TOs: the faster median wins
        orig_rewr = "rewr" if orig_med > rewr_med else "orig"
        if rows_orig is None or rows_rewr is None:
            rows = "-"
        elif rows_orig == rows_rewr:
            rows = rows_orig
        else:
            rows = "not the same!"
    elif run_rewr:
        # original query is a TO and the rewritten not
        orig_rewr = "rewr"
        rows = "-" if rows_rewr is None else rows_rewr
    elif run_orig:
        # rewritten query is a TO and the original not
        orig_rewr = "orig"
        rows = "-" if rows_orig is None else rows_orig
    else:
        # original and rewritten query are TOs
        orig_rewr = "-"
        rows = "-"

    if benchmark == "IMDB":
        benchmark = "JOB"

    # Row layout: summary columns first, then per-run times with mean and
    # standard deviation for the original, the rewritten total and each stage.
    list_output = [benchmark, query, orig_rewr, orig_med, rewr_med]
    list_output += [median for _mean, median, _std in stage_stats]
    list_output.append(rows)
    list_output += list_original_time + [orig_mean, orig_std]
    list_output += list_rewritten_time + [rewr_mean, rewr_std]
    for times, (mean, _median, std) in zip(stage_times, stage_stats):
        list_output += times + [mean, std]

    with open(_RESULTS_CSV, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(list_output)

In [None]:
# Create (or overwrite) the results CSV and write its header row.
file_path = "results/POS_Scala_comparison_TO_augment_server_full_enum_infos.csv"

# Column layout (it has to match the row that run_query appends): ten summary
# columns, then for the original query, the rewritten query and each of the
# four stages five per-run times followed by mean and standard deviation.
names = ["bench", "query", "orig/rewr(med)", "orig(med)", "rewr(med)",
         "stage0(med)", "stage1(med)", "stage2(med)", "stage3(med)", "rows"]
for label in ["orig", "rewr", "stage0", "stage1", "stage2", "stage3"]:
    names += [f"{label} {run}" for run in range(1, 6)]
    names += [f"{label}(mean)", f"{label}(std)"]

# Opening the file for writing fails if the directory is missing, so create it first.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

with open(file_path, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(names)

### STATS

In [None]:
# Run the STATS queries whose number is at most 50.
folder_path = 'output/'
files = sorted(os.listdir(folder_path))
output_files = [
    name for name in files
    if name.startswith('STATS') and name.endswith('_output.json')
]

for file in output_files:
    # File names look like "<benchmark>_<query>_output.json".
    benchmark_name, query_name = file.split("_")[:2]
    # The query name starts with its number, followed by "-".
    number = int(query_name.partition("-")[0])
    if number > 50:
        continue
    run_query(benchmark_name, query_name)

In [None]:
# Run the STATS queries whose number is between 51 and 100.
folder_path = 'output/'
files = sorted(os.listdir(folder_path))
output_files = [
    name for name in files
    if name.startswith('STATS') and name.endswith('_output.json')
]

for file in output_files:
    # File names look like "<benchmark>_<query>_output.json".
    benchmark_name, query_name = file.split("_")[:2]
    # The query name starts with its number, followed by "-".
    number = int(query_name.partition("-")[0])
    if not 50 < number <= 100:
        continue
    run_query(benchmark_name, query_name)

In [None]:
# Run the STATS queries whose number is greater than 100.
folder_path = 'output/'
files = sorted(os.listdir(folder_path))
output_files = [
    name for name in files
    if name.startswith('STATS') and name.endswith('_output.json')
]

for file in output_files:
    # File names look like "<benchmark>_<query>_output.json".
    benchmark_name, query_name = file.split("_")[:2]
    # The query name starts with its number, followed by "-".
    number = int(query_name.partition("-")[0])
    if number <= 100:
        continue
    run_query(benchmark_name, query_name)

### SNAP

In [None]:
# Run every SNAP query found in the output folder.
folder_path = 'output/'
files = sorted(os.listdir(folder_path))
output_files = [
    name for name in files
    if name.startswith('SNAP') and name.endswith('_output.json')
]

for file in output_files:
    # File names look like "<benchmark>_<query>_output.json".
    benchmark_name, query_name = file.split("_")[:2]
    run_query(benchmark_name, query_name)

### JOB

In [None]:
# Run every JOB query found in the output folder (their files are prefixed "IMDB").
folder_path = 'output/'
files = sorted(os.listdir(folder_path))
output_files = [
    name for name in files
    if name.startswith('IMDB') and name.endswith('_output.json')
]

for file in output_files:
    # File names look like "<benchmark>_<query>_output.json".
    benchmark_name, query_name = file.split("_")[:2]
    run_query(benchmark_name, query_name)

### LSQB

In [None]:
# Run every LSQB query found in the output folder.
folder_path = 'output/'
files = sorted(os.listdir(folder_path))
output_files = [
    name for name in files
    if name.startswith('LSQB') and name.endswith('_output.json')
]

for file in output_files:
    # File names look like "<benchmark>_<query>_output.json".
    benchmark_name, query_name = file.split("_")[:2]
    run_query(benchmark_name, query_name)

### HETIO

In [None]:
# Run every HETIO query found in the output folder.
folder_path = 'output/'
files = sorted(os.listdir(folder_path))
output_files = [
    name for name in files
    if name.startswith('HETIO') and name.endswith('_output.json')
]

for file in output_files:
    # File names look like "<benchmark>_<query>_output.json".
    benchmark_name, query_name = file.split("_")[:2]
    run_query(benchmark_name, query_name)