# Running queries using the JSON files produced by the Scala code

Install and import all needed packages

In [None]:
%%bash
# Install the third-party packages used by this notebook.
# Each package gets its own pip call, so one failing install does not stop the others.
pip install numpy
pip install pandas
pip install duckdb
pip install psycopg2-binary

In [None]:
import json
import time
import duckdb
import numpy as np
import csv
import multiprocessing
import signal
import pandas as pd
import os
import psycopg2

Function for running one query. This means:
*  run the original query 5 times (after one initial warm-up run, which we do not use)
*  run the rewritten queries 5 times (after one initial warm-up run, which we do not use) and drop the created tables each time
*  take the runtimes and calculate the mean, median and standard deviation for both the original and the rewritten query
*  compare the runtimes of the original query, the rewritten query and the rewritten query + the rewriting time (how long the Scala rewriting took)
*  save everything in a CSV output file

In [None]:
# functions to handle the timeouts
def handler_orig(signum, frame):
    """Signal handler used while the original query is being executed.

    Marks the original query as timed out by setting the module-level
    ``timeout_flag_orig`` to True and then raises an Exception, which
    propagates into the code that was running when the signal arrived.
    ``signum`` and ``frame`` are the standard handler arguments; neither is
    used.
    """
    global timeout_flag_orig
    timeout_message = "Query execution of the original query > 100s"
    timeout_flag_orig = True
    raise Exception(timeout_message)

def handler_rewr(signum, frame):
    """Signal handler used while the rewritten queries are being executed.

    Marks the rewritten query as timed out by setting the module-level
    ``timeout_flag_rewr`` to True and then raises an Exception, which
    propagates into the code that was running when the signal arrived.
    ``signum`` and ``frame`` are the standard handler arguments; neither is
    used.
    """
    global timeout_flag_rewr
    timeout_message = "Query execution of the rewritten query > 100s"
    timeout_flag_rewr = True
    raise Exception(timeout_message)

# function to run the query 6 times (1 warm-up run + 5 timed runs), checking for timeouts (TO), then calculating and saving all values
def run_query(benchmark, query):
    """Time the original and the rewritten form of one query in DuckDB.

    Inputs are read from ``rewritten/{benchmark}_{query}_output.json`` (keys
    ``original_query``, ``rewritten_query`` and ``time``) and from
    ``rewritten/{benchmark}_{query}_drop.json`` (key ``rewritten_query``, the
    statements that drop what the rewritten statements created).

    Each variant gets one warm-up run guarded by a 100 s SIGALRM alarm. A
    variant whose warm-up hit the alarm is reported as "TO" and is not run
    again; every other variant is timed 5 times. One row holding the
    individual times, their mean / median / standard deviation and the
    comparison verdicts is appended to
    ``results/DDB_Scala_comparison_TO_augment_server.csv``.

    Args:
        benchmark: Benchmark name as used in the file names. Its lower-cased
            form selects the database file ``<name>/<name>.duckdb`` and the
            ``USE <name>_DDB`` statement.
        query: Query identifier as used in the file names.

    Raises:
        Any error raised while opening the input files, connecting, or
        executing a statement outside the two guarded warm-up runs.

    Note:
        Relies on ``handler_orig`` / ``handler_rewr`` and on the module-level
        flags ``timeout_flag_orig`` / ``timeout_flag_rewr`` they set.
    """
    global timeout_flag_orig
    global timeout_flag_rewr

    print(benchmark, query)
    file_path = f'rewritten/{benchmark}_{query}_output.json'
    with open(file_path, 'r') as file:
        json_data = json.load(file)

    # get the original and rewritten query
    original_query = json_data["original_query"]
    rewritten_query_list = json_data["rewritten_query"]
    rewriting_time = json_data["time"]

    # change the queries such that they can be executed in DuckDB (without changing the output)
    rewritten_query_list_ddb = [rewritten_query.replace("TIMESTAMP(0)", "TIMESTAMP").lower()
                                for rewritten_query in rewritten_query_list]

    # get the drop queries
    file_path_drop = f'rewritten/{benchmark}_{query}_drop.json'
    with open(file_path_drop, 'r') as file:
        json_drop = json.load(file)
    drop_query_list_ddb = [drop_query.lower() for drop_query in json_drop["rewritten_query"]]

    def time_original_once():
        # seconds needed for one execution of the original query
        start_time = time.time()
        cur.execute(original_query)
        return time.time() - start_time

    def time_rewritten_once():
        # seconds needed for one execution of all rewritten statements;
        # the clean-up afterwards is deliberately not part of the measurement
        start_time = time.time()
        for rewritten_query in rewritten_query_list_ddb:
            cur.execute(rewritten_query)
        elapsed = time.time() - start_time
        for drop_query in drop_query_list_ddb:
            cur.execute(drop_query)
        return elapsed

    def drop_leftovers():
        # after an interrupted rewritten run only some objects exist,
        # so the drops must tolerate missing views/tables
        for drop_query in drop_query_list_ddb:
            cur.execute(drop_query.replace("drop view", "drop view if exists")
                                  .replace("drop table", "drop table if exists"))

    # connect to DuckDB
    database = benchmark.lower() + "/" + benchmark.lower() + ".duckdb"
    conn = duckdb.connect(database=database)
    cur = None
    try:
        # set up the cursor outside the guarded warm-up, so that a connection
        # problem is reported directly instead of as a NameError further down
        cur = conn.cursor()
        cur.execute("USE " + benchmark.lower() + "_DDB")

        # if the evaluation takes longer than 100sec then break it
        timeout_flag_orig = False
        timeout_flag_rewr = False
        # stay None when a warm-up fails, so the print below cannot raise NameError
        result = None
        result1 = None

        print("original1")
        # the first run is just a warm up run and to check for the time out
        signal.signal(signal.SIGALRM, handler_orig)
        signal.alarm(100)  # TO at 100 sec, can be changed
        try:
            cur.execute(original_query)
            result = cur.fetchall()
        except Exception as exc:
            print(exc)
        finally:
            # always cancel the pending alarm, whatever happened above
            signal.alarm(0)

        print("rewritten1")
        signal.signal(signal.SIGALRM, handler_rewr)
        signal.alarm(100)
        try:
            for rewritten_query in rewritten_query_list_ddb:
                cur.execute(rewritten_query)
                if rewritten_query.startswith("select"):
                    result1 = cur.fetchall()
            for drop_query in drop_query_list_ddb:
                cur.execute(drop_query)
        except Exception as exc:
            print(exc)
        finally:
            signal.alarm(0)

        print(timeout_flag_orig, timeout_flag_rewr)

        # --- timed runs (run 2-6): only for the variants that did not time out ---
        list_original = []
        list_rewritten = []
        if timeout_flag_orig and timeout_flag_rewr:
            pass
        elif timeout_flag_orig:
            print("rewritten")
            for i in range(5):
                print(i)
                list_rewritten.append(time_rewritten_once())
        elif timeout_flag_rewr:
            print("orig")
            for i in range(5):
                list_original.append(time_original_once())
        else:
            print(result, result1)
            print("orig+rewr")
            # original and rewritten runs are interleaved, one pair per iteration
            for i in range(5):
                print(i)
                list_original.append(time_original_once())
                list_rewritten.append(time_rewritten_once())

        if timeout_flag_rewr:
            drop_leftovers()
    finally:
        if cur is not None:
            cur.close()
        conn.close()

    # --- statistics for the original query ---
    if timeout_flag_orig:
        orig_mean = "TO"
        orig_med = "TO"
        orig_std = "-"
        list_original = ["-", "-", "-", "-", "-"]
    else:
        orig_mean = np.mean(list_original)
        orig_med = np.median(list_original)
        orig_std = np.std(list_original)

    # --- statistics for the rewritten query ---
    if timeout_flag_rewr:
        rewr_mean = "TO"
        rewr_med = "TO"
        rewr_std = "-"
        rewr_mean_plus_rewr = "TO"
        rewr_med_plus_rewr = "TO"
        list_rewritten = ["-", "-", "-", "-", "-"]
    else:
        rewr_mean = np.mean(list_rewritten)
        rewr_med = np.median(list_rewritten)
        rewr_std = np.std(list_rewritten)
        rewr_mean_plus_rewr = rewr_mean + rewriting_time
        rewr_med_plus_rewr = rewr_med + rewriting_time

    # --- which variant wins ---
    if timeout_flag_orig and timeout_flag_rewr:
        orig_or_rewr_mean = "-"
        orig_or_rewr_or_equal = "-"
        orig_or_rewr_plus_rewr_mean = "-"
    elif timeout_flag_orig:
        orig_or_rewr_mean = "rewr"
        orig_or_rewr_or_equal = "rewr"
        orig_or_rewr_plus_rewr_mean = "rewr"
    elif timeout_flag_rewr:
        orig_or_rewr_mean = "orig"
        orig_or_rewr_or_equal = "orig"
        orig_or_rewr_plus_rewr_mean = "orig"
    else:
        orig_or_rewr_mean = "rewr" if orig_mean > rewr_mean else "orig"
        # means closer than 0.05 s are reported as equal
        if abs(rewr_mean - orig_mean) < 0.05:
            orig_or_rewr_or_equal = "equal 0.05"
        else:
            orig_or_rewr_or_equal = orig_or_rewr_mean
        orig_or_rewr_plus_rewr_mean = "rewr" if orig_mean > rewr_mean_plus_rewr else "orig"

    list_output = (
        [benchmark, query]
        + [orig_mean, rewr_mean, rewr_mean_plus_rewr, orig_or_rewr_mean, orig_or_rewr_or_equal,
           orig_or_rewr_plus_rewr_mean, rewriting_time]
        + list_original + [orig_med, orig_std]
        + list_rewritten + [rewr_med, rewr_std, rewr_med_plus_rewr]
    )
    file_path = "results/DDB_Scala_comparison_TO_augment_server.csv"
    with open(file_path, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(list_output)

In [None]:
# if the original query is known to time out, only run the rewritten one
def run_query_rewritten(benchmark, query):
    """Time only the rewritten form of one query in DuckDB.

    Used when the original query is already known to time out: the original
    is never executed and is reported as "TO". Inputs are read from
    ``rewritten/{benchmark}_{query}_output.json`` (keys ``rewritten_query``
    and ``time``) and ``rewritten/{benchmark}_{query}_drop.json`` (key
    ``rewritten_query``, the statements that drop what the rewritten
    statements created).

    The rewritten statements get one warm-up run guarded by a 100 s SIGALRM
    alarm. If the alarm fires they are reported as "TO" as well; otherwise
    they are timed 5 times. One row in the same layout as ``run_query``
    writes is appended to
    ``results/DDB_Scala_comparison_TO_augment_server.csv``.

    Args:
        benchmark: Benchmark name as used in the file names. Its lower-cased
            form selects the database file ``<name>/<name>.duckdb`` and the
            ``USE <name>_DDB`` statement.
        query: Query identifier as used in the file names.

    Raises:
        Any error raised while opening the input files, connecting, or
        executing a statement outside the guarded warm-up run.

    Note:
        Relies on ``handler_rewr`` and on the module-level flag
        ``timeout_flag_rewr`` it sets.
    """
    global timeout_flag_rewr

    print(benchmark, query)
    file_path = f'rewritten/{benchmark}_{query}_output.json'
    with open(file_path, 'r') as file:
        json_data = json.load(file)

    rewritten_query_list = json_data["rewritten_query"]
    rewriting_time = json_data["time"]

    # change the queries such that they can be executed in DuckDB
    rewritten_query_list_ddb = [rewritten_query.replace("TIMESTAMP(0)", "TIMESTAMP").lower()
                                for rewritten_query in rewritten_query_list]

    file_path_drop = f'rewritten/{benchmark}_{query}_drop.json'
    with open(file_path_drop, 'r') as file:
        json_drop = json.load(file)
    drop_query_list_ddb = [drop_query.lower() for drop_query in json_drop["rewritten_query"]]

    # the original query is not executed here; it is always reported as a timeout
    orig_mean = "TO"
    orig_med = "TO"
    orig_std = "-"
    list_original = ["-", "-", "-", "-", "-"]

    database = benchmark.lower() + "/" + benchmark.lower() + ".duckdb"
    conn = duckdb.connect(database=database)
    cur = None
    try:
        # set up the cursor outside the guarded warm-up, so that a connection
        # problem is reported directly instead of as a NameError further down
        cur = conn.cursor()
        cur.execute("USE " + benchmark.lower() + "_DDB")

        # if the evaluation takes longer than 100sec then break it
        timeout_flag_rewr = False
        signal.signal(signal.SIGALRM, handler_rewr)
        signal.alarm(100)
        try:
            for rewritten_query in rewritten_query_list_ddb:
                cur.execute(rewritten_query)
                if rewritten_query.startswith("select"):
                    # fetch the rows as part of the warm-up; the rows themselves are not needed
                    cur.fetchall()
            for drop_query in drop_query_list_ddb:
                cur.execute(drop_query)
        except Exception as exc:
            print(exc)
        finally:
            # always cancel the pending alarm, whatever happened above
            signal.alarm(0)

        if timeout_flag_rewr:
            # rewritten query is a TO too
            rewr_mean = "TO"
            rewr_med = "TO"
            rewr_std = "-"
            rewr_mean_plus_rewr = "TO"
            rewr_med_plus_rewr = "TO"
            list_rewritten = ["-", "-", "-", "-", "-"]

            orig_or_rewr_mean = "-"
            orig_or_rewr_or_equal = "-"
            orig_or_rewr_plus_rewr_mean = "-"

            # after an interrupted run only some objects exist,
            # so the drops must tolerate missing views/tables
            for drop_query in drop_query_list_ddb:
                cur.execute(drop_query.replace("drop view", "drop view if exists")
                                      .replace("drop table", "drop table if exists"))
        else:
            # rewritten query is not a TO: take the times of 5 further runs
            list_rewritten = []
            for i in range(5):
                print(i)
                start_time_rewritten = time.time()
                for rewritten_query in rewritten_query_list_ddb:
                    cur.execute(rewritten_query)
                list_rewritten.append(time.time() - start_time_rewritten)
                # drop all created tables (not part of the measured time)
                for drop_query in drop_query_list_ddb:
                    cur.execute(drop_query)
            rewr_mean = np.mean(list_rewritten)
            rewr_med = np.median(list_rewritten)
            rewr_std = np.std(list_rewritten)
            rewr_mean_plus_rewr = rewr_mean + rewriting_time
            rewr_med_plus_rewr = rewr_med + rewriting_time

            orig_or_rewr_mean = "rewr"
            orig_or_rewr_or_equal = "rewr"
            orig_or_rewr_plus_rewr_mean = "rewr"
    finally:
        if cur is not None:
            cur.close()
        conn.close()

    list_output = (
        [benchmark, query]
        + [orig_mean, rewr_mean, rewr_mean_plus_rewr, orig_or_rewr_mean, orig_or_rewr_or_equal,
           orig_or_rewr_plus_rewr_mean, rewriting_time]
        + list_original + [orig_med, orig_std]
        + list_rewritten + [rewr_med, rewr_std, rewr_med_plus_rewr]
    )
    file_path = "results/DDB_Scala_comparison_TO_augment_server.csv"
    with open(file_path, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(list_output)

Create the output CSV file with the header row. The running times of each query are appended to it afterwards.

In [None]:
file_path = "results/DDB_Scala_comparison_TO_augment_server.csv"

# Header row of the result file, grouped by what the columns describe.
summary_columns = ["bench", "query", "orig mean", "rewr mean", "rewr mean+rewr", "orig/rewr(mean)",
                   "orig/rewr/equal", "orig/rewr+rewr(mean)", "rewriting"]
original_columns = ["orig 1", "orig 2", "orig 3", "orig 4", "orig 5", "orig med", "orig_std"]
rewritten_columns = ["rewr 1", "rewr 2", "rewr 3", "rewr 4", "rewr 5", "rewr med", "rewr_std",
                     "rewr med+rewr"]
names = summary_columns + original_columns + rewritten_columns

# mode 'w' starts a fresh file that contains nothing but the header
with open(file_path, 'w', newline='') as csvfile:
    csv.writer(csvfile).writerow(names)

Connect to DuckDB for each dataset and execute all queries:

### STATS

In [None]:
# Ask the PostgreSQL server which address and port this connection reached it on.
database = "stats"
conn = psycopg2.connect(
        host="postgres",
        database=database,
        user=database,
        password=database
    )
try:
    cur = conn.cursor()
    try:
        cur.execute("SELECT inet_server_addr(), inet_server_port()")
        host, port = cur.fetchone()
    finally:
        cur.close()
finally:
    # release the connection even if the probe query fails
    conn.close()

print("Host:", host)
print("Port:", port)

In [None]:
# Open the DuckDB file and attach the PostgreSQL database "stats" under the name stats_DDB.
con = duckdb.connect(database="stats/stats.duckdb")
for setup_statement in ("INSTALL postgres", "LOAD postgres"):
    con.execute(setup_statement)
postgres_dsn = "host=" + host + " port=5432 user=postgres password=postgres dbname=stats"
con.execute("ATTACH '" + postgres_dsn + "' AS stats_DDB (TYPE postgres)")
con.execute("USE stats_DDB")

In [None]:
# STATS: the first 1834 output files (sorted by name) are run with run_query.
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = [
    name for name in files
    if name.startswith('STATS') and name.endswith('_output.json')
][0:1834]

for file in output_files:
    # file names look like <benchmark>_<query>_output.json
    benchmark_name, query_name = file.split("_")[:2]
    run_query(benchmark_name, query_name)

In [None]:
# STATS: output files 1834-1836 (sorted by name) are run with run_query_rewritten,
# presumably because their original query is known to time out.
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
# This cell continues the STATS slices, so it must filter on the STATS prefix;
# it used to filter on 'SNAP', which selected the wrong files (or none).
output_files = [
    name for name in files
    if name.startswith('STATS') and name.endswith('_output.json')
][1834:1837]

for file in output_files:
    # file names look like <benchmark>_<query>_output.json
    benchmark_name, query_name = file.split("_")[:2]
    run_query_rewritten(benchmark_name, query_name)

In [None]:
# STATS: output files 1837-1844 (sorted by name) are run with run_query.
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
# This cell continues the STATS slices, so it must filter on the STATS prefix;
# it used to filter on 'SNAP', which selected the wrong files (or none).
output_files = [
    name for name in files
    if name.startswith('STATS') and name.endswith('_output.json')
][1837:1845]

for file in output_files:
    # file names look like <benchmark>_<query>_output.json
    benchmark_name, query_name = file.split("_")[:2]
    run_query(benchmark_name, query_name)

In [None]:
# STATS: output files 1845-1848 (sorted by name) are run with run_query_rewritten,
# presumably because their original query is known to time out.
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
# This cell continues the STATS slices, so it must filter on the STATS prefix;
# it used to filter on 'SNAP', which selected the wrong files (or none).
output_files = [
    name for name in files
    if name.startswith('STATS') and name.endswith('_output.json')
][1845:1849]

for file in output_files:
    # file names look like <benchmark>_<query>_output.json
    benchmark_name, query_name = file.split("_")[:2]
    run_query_rewritten(benchmark_name, query_name)

In [None]:
# STATS: output files 1849-1856 (sorted by name) are run with run_query.
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
# This cell continues the STATS slices, so it must filter on the STATS prefix;
# it used to filter on 'SNAP', which selected the wrong files (or none).
output_files = [
    name for name in files
    if name.startswith('STATS') and name.endswith('_output.json')
][1849:1857]

for file in output_files:
    # file names look like <benchmark>_<query>_output.json
    benchmark_name, query_name = file.split("_")[:2]
    run_query(benchmark_name, query_name)

In [None]:
# STATS: output files from index 1857 on (sorted by name) are run with run_query_rewritten,
# presumably because their original query is known to time out.
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
# This cell continues the STATS slices, so it must filter on the STATS prefix;
# it used to filter on 'SNAP', which selected the wrong files (or none).
output_files = [
    name for name in files
    if name.startswith('STATS') and name.endswith('_output.json')
][1857:]

for file in output_files:
    # file names look like <benchmark>_<query>_output.json
    benchmark_name, query_name = file.split("_")[:2]
    run_query_rewritten(benchmark_name, query_name)

### SNAP

In [None]:
# Ask the PostgreSQL server which address and port this connection reached it on.
database = "snap"
conn = psycopg2.connect(
        host="postgres",
        database=database,
        user=database,
        password=database
    )
try:
    cur = conn.cursor()
    try:
        cur.execute("SELECT inet_server_addr(), inet_server_port()")
        host, port = cur.fetchone()
    finally:
        cur.close()
finally:
    # release the connection even if the probe query fails
    conn.close()

print("Host:", host)
print("Port:", port)

In [None]:
# Open the DuckDB file and attach the PostgreSQL database "snap" under the name snap_DDB.
con = duckdb.connect(database="snap/snap.duckdb")
for setup_statement in ("INSTALL postgres", "LOAD postgres"):
    con.execute(setup_statement)
postgres_dsn = "host=" + host + " port=5432 user=postgres password=postgres dbname=snap"
con.execute("ATTACH '" + postgres_dsn + "' AS snap_DDB (TYPE postgres)")
con.execute("USE snap_DDB")

In [None]:
# SNAP: the first 7 output files (sorted by name) are run with run_query.
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = [
    name for name in files
    if name.startswith('SNAP') and name.endswith('_output.json')
][0:7]

for file in output_files:
    # file names look like <benchmark>_<query>_output.json
    benchmark_name, query_name = file.split("_")[:2]
    run_query(benchmark_name, query_name)

In [None]:
# SNAP: output files 7-60 (sorted by name) are run with run_query_rewritten.
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = [
    name for name in files
    if name.startswith('SNAP') and name.endswith('_output.json')
][7:61]

for file in output_files:
    # file names look like <benchmark>_<query>_output.json
    benchmark_name, query_name = file.split("_")[:2]
    run_query_rewritten(benchmark_name, query_name)

In [None]:
# SNAP: output files 61-63 (sorted by name) are run with run_query.
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = [
    name for name in files
    if name.startswith('SNAP') and name.endswith('_output.json')
][61:64]

for file in output_files:
    # file names look like <benchmark>_<query>_output.json
    benchmark_name, query_name = file.split("_")[:2]
    run_query(benchmark_name, query_name)

In [None]:
# SNAP: output files 64-121 (sorted by name) are run with run_query_rewritten.
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = [
    name for name in files
    if name.startswith('SNAP') and name.endswith('_output.json')
][64:122]

for file in output_files:
    # file names look like <benchmark>_<query>_output.json
    benchmark_name, query_name = file.split("_")[:2]
    run_query_rewritten(benchmark_name, query_name)

In [None]:
# SNAP: output files 122-128 (sorted by name) are run with run_query.
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = [
    name for name in files
    if name.startswith('SNAP') and name.endswith('_output.json')
][122:129]

for file in output_files:
    # file names look like <benchmark>_<query>_output.json
    benchmark_name, query_name = file.split("_")[:2]
    run_query(benchmark_name, query_name)

In [None]:
# SNAP: output files from index 129 on (sorted by name) are run with run_query_rewritten.
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = [
    name for name in files
    if name.startswith('SNAP') and name.endswith('_output.json')
][129:]

for file in output_files:
    # file names look like <benchmark>_<query>_output.json
    benchmark_name, query_name = file.split("_")[:2]
    run_query_rewritten(benchmark_name, query_name)

### JOB

In [None]:
# Ask the PostgreSQL server which address and port this connection reached it on.
database = "imdb"
conn = psycopg2.connect(
        host="postgres",
        database=database,
        user=database,
        password=database
    )
try:
    cur = conn.cursor()
    try:
        cur.execute("SELECT inet_server_addr(), inet_server_port()")
        host, port = cur.fetchone()
    finally:
        cur.close()
finally:
    # release the connection even if the probe query fails
    conn.close()

print("Host:", host)
print("Port:", port)

In [None]:
# Open the DuckDB file and attach the PostgreSQL database "imdb" under the name job_DDB.
con = duckdb.connect(database="job/job.duckdb")
for setup_statement in ("INSTALL postgres", "LOAD postgres"):
    con.execute(setup_statement)
postgres_dsn = "host=" + host + " port=5432 user=postgres password=postgres dbname=imdb"
con.execute("ATTACH '" + postgres_dsn + "' AS job_DDB (TYPE postgres)")
con.execute("USE job_DDB")

In [None]:
# JOB: every output file (sorted by name) is run with run_query.
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = [
    name for name in files
    if name.startswith('JOB') and name.endswith('_output.json')
]

for file in output_files:
    # file names look like <benchmark>_<query>_output.json
    benchmark_name, query_name = file.split("_")[:2]
    run_query(benchmark_name, query_name)

### LSQB

In [None]:
# Ask the PostgreSQL server which address and port this connection reached it on.
database = "lsqb"
conn = psycopg2.connect(
        host="postgres",
        database=database,
        user=database,
        password=database
    )
try:
    cur = conn.cursor()
    try:
        cur.execute("SELECT inet_server_addr(), inet_server_port()")
        host, port = cur.fetchone()
    finally:
        cur.close()
finally:
    # release the connection even if the probe query fails
    conn.close()

print("Host:", host)
print("Port:", port)

In [None]:
# Open the DuckDB file and attach the PostgreSQL database "lsqb" under the name lsqb_DDB.
con = duckdb.connect(database="lsqb/lsqb.duckdb")
for setup_statement in ("INSTALL postgres", "LOAD postgres"):
    con.execute(setup_statement)
postgres_dsn = "host=" + host + " port=5432 user=postgres password=postgres dbname=lsqb"
con.execute("ATTACH '" + postgres_dsn + "' AS lsqb_DDB (TYPE postgres)")
con.execute("USE lsqb_DDB")

In [None]:
# LSQB: every output file (sorted by name) is run with run_query.
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = [
    name for name in files
    if name.startswith('LSQB') and name.endswith('_output.json')
]

for file in output_files:
    # file names look like <benchmark>_<query>_output.json
    benchmark_name, query_name = file.split("_")[:2]
    run_query(benchmark_name, query_name)

### HETIO

In [None]:
# Ask the PostgreSQL server which address and port this connection reached it on.
database = "hetio"
conn = psycopg2.connect(
        host="postgres",
        database=database,
        user=database,
        password=database
    )
try:
    cur = conn.cursor()
    try:
        cur.execute("SELECT inet_server_addr(), inet_server_port()")
        host, port = cur.fetchone()
    finally:
        cur.close()
finally:
    # release the connection even if the probe query fails
    conn.close()

print("Host:", host)
print("Port:", port)

In [None]:
# Open the DuckDB file and attach the PostgreSQL database "hetio" under the name hetio_DDB.
con = duckdb.connect(database="hetio/hetio.duckdb")
for setup_statement in ("INSTALL postgres", "LOAD postgres"):
    con.execute(setup_statement)
postgres_dsn = "host=" + host + " port=5432 user=postgres password=postgres dbname=hetio"
con.execute("ATTACH '" + postgres_dsn + "' AS hetio_DDB (TYPE postgres)")
con.execute("USE hetio_DDB")

In [None]:
# HETIO: every output file (sorted by name) is run with run_query.
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = [
    name for name in files
    if name.startswith('HETIO') and name.endswith('_output.json')
]

for file in output_files:
    # file names look like <benchmark>_<query>_output.json
    benchmark_name, query_name = file.split("_")[:2]
    run_query(benchmark_name, query_name)