# Running queries using JSONS produced by Scala code

Install and import all needed packages

In [1]:
%%bash
pip install psycopg2-binary
pip install numpy

Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.0/3.0 MB 4.0 MB/s eta 0:00:00
Installing collected packages: psycopg2-binary
Successfully installed psycopg2-binary-2.9.9




Collecting numpy
  Downloading numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 18.2/18.2 MB 18.5 MB/s eta 0:00:00
Installing collected packages: numpy
Successfully installed numpy-1.26.4




In [2]:
import json
import time
import psycopg2
import numpy as np
import csv
import multiprocessing
import signal

Function for running one query. This means
*  run the original query 5 times (after one initial run, which we do not use)
*  run the rewritten queries 5 times (after one initial run, which we do not use) and drop the created tables each time
*  take the runtimes and calculate mean, median and standard deviation of time for either the original or rewritten query
*  compare the runtimes between the original query, the rewritten query and the rewritten query + the rewriting time (how long the Scala took)
*  save everything in a csv output file

In [3]:
def handler_orig(signum, frame):
    global timeout_flag_orig
    timeout_flag_orig = True
    raise Exception("Query execution of the original query > 1800s")

def handler_rewr(signum, frame):
    global timeout_flag_rewr
    timeout_flag_rewr = True
    raise Exception("Query execution of the rewritten query > 1800s")
    
def run_query(benchmark, query):
    file_path = f'rewritten/{benchmark}_{query}_output.json'
    with open(file_path, 'r') as file:
        json_data = json.load(file)
    
    original_query = json_data["original_query"]
    rewritten_query_list = json_data["rewritten_query"]
    rewriting_time = json_data["time"]

    file_path_drop = f'rewritten/{benchmark}_{query}_drop.json'
    with open(file_path_drop, 'r') as file:
        json_drop = json.load(file)
    drop_query_list = json_drop["rewritten_query"]

    if benchmark == "JOB":
        database = "imdb"
    else:
        database = benchmark.lower()
    
    conn = psycopg2.connect(
        host="postgres",
        database=database,
        user=database,
        password=database
    )

    # if the evaluation takes longer than 30min then break it
    global timeout_flag_orig
    global timeout_flag_rewr
    timeout_flag_orig = False
    timeout_flag_rewr = False

    print("original1")
    # the first run is just a warm up run and to check for the time out
    signal.signal(signal.SIGALRM, handler_orig) 
    signal.alarm(1800) 
    try:
        print("o1")
        cur = conn.cursor()
        print("o2")
        cur.execute(original_query)
        print("o3")
        result = cur.fetchall()
    except Exception as exc: 
        print(exc)
    signal.alarm(0) 

    print("rewritten1")
    signal.signal(signal.SIGALRM, handler_rewr) 
    signal.alarm(1800) 
    try:
        for rewritten_query in rewritten_query_list:
            cur.execute(rewritten_query)
            if rewritten_query.startswith("SELECT"):
                result1 = cur.fetchall()
        for drop_query in drop_query_list:
            cur.execute(drop_query)
    except Exception as exc: 
        print(exc)
    signal.alarm(0)

    print(timeout_flag_orig, timeout_flag_rewr)
    # original and rewritten query are TOs
    if timeout_flag_orig and timeout_flag_rewr:
        orig_mean = "TO"
        orig_med = "TO"
        orig_std = "-"
        list_original = ["-", "-", "-", "-", "-"]
        
        rewr_mean = "TO"
        rewr_med = "TO"
        rewr_std = "-"
        rewr_mean_plus_rewr = "TO"
        rewr_med_plus_rewr = "TO"
        list_rewritten = ["-", "-", "-", "-", "-"]
        
        orig_or_rewr_mean = "-"
        orig_or_rewr_plus_rewr_mean = "-"

    # original query is a TO and the rewritten not
    elif timeout_flag_orig:
        orig_mean = "TO"
        orig_med = "TO"
        orig_std = "-"
        list_original = ["-", "-", "-", "-", "-"]

        list_rewritten = []
        print("rewritten")
        for i in range(5):
            print(i)
            # execute the rewritten query
            start_time_rewritten = time.time()
            for rewritten_query in rewritten_query_list:
                cur.execute(rewritten_query)
            end_time_rewritten = time.time()
            rewritten_time = end_time_rewritten - start_time_rewritten
            list_rewritten.append(rewritten_time)
            # drop all created tables
            for drop_query in drop_query_list:
                cur.execute(drop_query)
        rewr_mean = np.mean(list_rewritten)
        rewr_med = np.median(list_rewritten)
        rewr_std = np.std(list_rewritten)
        rewr_mean_plus_rewr = rewr_mean + rewriting_time
        rewr_med_plus_rewr = rewr_med + rewriting_time

        orig_or_rewr_mean = "rewr"
        orig_or_rewr_plus_rewr_mean = "rewr"

    # rewritten query is a TO and the original not
    elif timeout_flag_rewr:
        list_original = []
        print("orig")
        for i in range(5):
            # execute the original query
            start_time_original = time.time()
            cur.execute(original_query)
            end_time_original = time.time()
            original_time = end_time_original - start_time_original
            list_original.append(original_time)
        orig_mean = np.mean(list_original)
        orig_med = np.median(list_original)
        orig_std = np.std(list_original)
        
        rewr_mean = "TO"
        rewr_med = "TO"
        rewr_std = "-"
        rewr_mean_plus_rewr = "TO"
        rewr_med_plus_rewr = "TO"
        list_rewritten = ["-", "-", "-", "-", "-"]

        orig_or_rewr_mean = "orig"
        orig_or_rewr_plus_rewr_mean = "orig"

    # both queries are no TOs
    else:
        print(result, result1)
        list_original = []
        list_rewritten = []
        # take times for 5 runs (run 2-6) for the original query and the rewritten query
        print("orig+rewr")
        for i in range(5):
            print(i)
            # execute the original query
            start_time_original = time.time()
            cur.execute(original_query)
            end_time_original = time.time()
            original_time = end_time_original - start_time_original
            list_original.append(original_time)
        
            # execute the rewritten query
            start_time_rewritten = time.time()
            for rewritten_query in rewritten_query_list:
                cur.execute(rewritten_query)
            end_time_rewritten = time.time()
            rewritten_time = end_time_rewritten - start_time_rewritten
            list_rewritten.append(rewritten_time)
            
            # drop all created tables
            for drop_query in drop_query_list:
                cur.execute(drop_query)
            
        orig_mean = np.mean(list_original)
        orig_med = np.median(list_original)
        orig_std = np.std(list_original)
        rewr_mean = np.mean(list_rewritten)
        rewr_med = np.median(list_rewritten)
        rewr_std = np.std(list_rewritten)
        rewr_mean_plus_rewr = rewr_mean + rewriting_time
        rewr_med_plus_rewr = rewr_med + rewriting_time
        if orig_mean > rewr_mean:
            orig_or_rewr_mean = "rewr"
        else:
            orig_or_rewr_mean = "orig"
        if orig_mean > rewr_mean_plus_rewr:
            orig_or_rewr_plus_rewr_mean = "rewr"
        else:
            orig_or_rewr_plus_rewr_mean = "orig"
            
    list_output = [benchmark, query] + [orig_mean, rewr_mean, rewr_mean_plus_rewr, orig_or_rewr_mean, orig_or_rewr_plus_rewr_mean, rewriting_time] + \
                    list_original + [orig_med, orig_std] + list_rewritten + [rewr_med, rewr_std, rewr_med_plus_rewr]
    #print(list_output)
    file_path = "results/POS_Scala_comparison_TO.csv"
    with open(file_path, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(list_output)

In [4]:
def run_query_rewritten(benchmark, query):
    file_path = f'rewritten/{benchmark}_{query}_output.json'
    with open(file_path, 'r') as file:
        json_data = json.load(file)

    print(query)

    rewritten_query_list = json_data["rewritten_query"]
    rewriting_time = json_data["time"]

    file_path_drop = f'rewritten/{benchmark}_{query}_drop.json'
    with open(file_path_drop, 'r') as file:
        json_drop = json.load(file)
    drop_query_list = json_drop["rewritten_query"]

    if benchmark == "JOB":
        database = "imdb"
    else:
        database = benchmark.lower()
    
    conn = psycopg2.connect(
        host="postgres",
        database=database,
        user=database,
        password=database
    )

    # if the evaluation takes longer than 30min then break it
    global timeout_flag_rewr
    timeout_flag_rewr = False

    signal.signal(signal.SIGALRM, handler_rewr) 
    signal.alarm(1800) 
    try:
        cur = conn.cursor()
        for rewritten_query in rewritten_query_list:
            cur.execute(rewritten_query)
            if rewritten_query.startswith("SELECT"):
                result1 = cur.fetchall()
        for drop_query in drop_query_list:
            cur.execute(drop_query)
    except Exception as exc: 
        print(exc)
    signal.alarm(0)

    # rewritten query is a TO and the original not
    if timeout_flag_rewr:
        rewr_mean = "TO"
        rewr_med = "TO"
        rewr_std = "-"
        rewr_mean_plus_rewr = "TO"
        rewr_med_plus_rewr = "TO"
        list_rewritten = ["-", "-", "-", "-", "-"]

        orig_or_rewr_mean = "-"
        orig_or_rewr_plus_rewr_mean = "-"

    # rewritten query is not a TO
    else:
        list_rewritten = []
        # take times for 5 runs (run 2-6) for the original query and the rewritten query
        for i in range(5):
            print(i)
            # execute the rewritten query
            start_time_rewritten = time.time()
            for rewritten_query in rewritten_query_list:
                cur.execute(rewritten_query)
            end_time_rewritten = time.time()
            rewritten_time = end_time_rewritten - start_time_rewritten
            list_rewritten.append(rewritten_time)
            
            # drop all created tables
            for drop_query in drop_query_list:
                cur.execute(drop_query)
            
        rewr_mean = np.mean(list_rewritten)
        rewr_med = np.median(list_rewritten)
        rewr_std = np.std(list_rewritten)
        rewr_mean_plus_rewr = rewr_mean + rewriting_time
        rewr_med_plus_rewr = rewr_med + rewriting_time
        orig_or_rewr_mean = "rewr"
        orig_or_rewr_plus_rewr_mean = "rewr"

    orig_mean = "TO"
    orig_med = "TO"
    orig_std = "-"
    list_original = ["-", "-", "-", "-", "-"]
        
    list_output = [benchmark, query] + [orig_mean, rewr_mean, rewr_mean_plus_rewr, orig_or_rewr_mean, orig_or_rewr_plus_rewr_mean, rewriting_time] + \
                    list_original + [orig_med, orig_std] + list_rewritten + [rewr_med, rewr_std, rewr_med_plus_rewr]

    file_path = "results/POS_Scala_comparison_TO.csv"
    with open(file_path, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(list_output)

Create the output csv with the header. We add the running times for each query then.

In [None]:
file_path = "results/POS_Scala_comparison_TO.csv"

names = ["bench", "query", "orig mean", "rewr mean", "rewr mean+rewr", "orig/rewr(mean)", "orig/rewr+rewr(mean)", "rewriting", "orig 1", "orig 2", 
         "orig 3", "orig 4", "orig 5", "orig med", "orig_std", "rewr 1", "rewr 2", "rewr 3", "rewr 4", "rewr 5", "rewr med", "rewr_std", "rewr med+rewr", ]

with open(file_path, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(names)

### STATS

In [None]:
run_query("STATS", "001-014")
run_query("STATS", "002-048")
run_query("STATS", "003-067")
run_query("STATS", "004-041")
run_query("STATS", "005-024")
run_query("STATS", "006-054")
run_query("STATS", "007-057")
run_query("STATS", "008-045")
run_query("STATS", "009-033")
run_query("STATS", "010-063")

In [None]:
run_query("STATS", "011-050")
run_query("STATS", "012-019")
run_query("STATS", "013-055")
run_query("STATS", "014-134")
run_query("STATS", "015-026")
run_query("STATS", "016-021")
run_query("STATS", "017-040")
run_query("STATS", "018-065")
run_query("STATS", "019-130")
run_query("STATS", "020-027")

In [None]:
run_query("STATS", "021-036")
run_query("STATS", "022-125")
run_query("STATS", "023-018")
run_query("STATS", "024-017")
run_query("STATS", "025-001")
run_query("STATS", "026-066")
run_query("STATS", "027-132")
run_query("STATS", "028-062")
run_query("STATS", "029-023")
run_query("STATS", "030-092")

In [None]:
run_query("STATS", "031-025")
run_query("STATS", "032-086")
run_query("STATS", "033-137")
run_query("STATS", "034-016")
run_query("STATS", "035-020")
run_query("STATS", "036-100")
run_query("STATS", "037-013")
run_query("STATS", "038-011")
run_query("STATS", "039-117")
run_query("STATS", "040-101")

In [None]:
run_query("STATS", "041-078")
run_query("STATS", "042-006")
run_query("STATS", "043-076")
run_query("STATS", "044-145")
run_query("STATS", "045-046")
run_query("STATS", "046-128")
run_query("STATS", "047-008")
run_query("STATS", "048-127")
run_query("STATS", "049-118")
run_query("STATS", "050-106")

In [None]:
run_query("STATS", "051-090")
run_query("STATS", "052-029")
run_query("STATS", "053-081")
run_query("STATS", "054-064")
run_query("STATS", "055-009")
run_query("STATS", "056-007")
run_query("STATS", "057-087")
run_query("STATS", "058-010")
run_query("STATS", "059-061")
run_query("STATS", "060-042")

In [None]:
run_query("STATS", "061-053")
run_query("STATS", "062-129")
run_query("STATS", "063-109")
run_query("STATS", "064-116")
run_query("STATS", "065-012")
run_query("STATS", "066-094")
run_query("STATS", "067-110")
run_query("STATS", "068-121")
run_query("STATS", "069-089")
run_query("STATS", "070-003")

In [None]:
run_query("STATS", "071-080")
run_query("STATS", "072-099")
run_query("STATS", "073-146")
run_query("STATS", "074-072")
run_query("STATS", "075-037")
run_query("STATS", "076-073")
run_query("STATS", "077-004")
run_query("STATS", "078-082")
run_query("STATS", "079-112")
run_query("STATS", "080-119")

In [None]:
run_query("STATS", "081-111")
run_query("STATS", "082-096")
run_query("STATS", "083-039")
run_query("STATS", "084-044")
run_query("STATS", "085-093")
run_query("STATS", "086-083")
run_query("STATS", "087-022")
run_query("STATS", "088-102")
run_query("STATS", "089-131")
run_query("STATS", "090-079")

In [None]:
run_query("STATS", "091-035")
run_query("STATS", "092-103")
run_query("STATS", "093-075")
run_query("STATS", "094-091")
run_query("STATS", "095-139")
run_query("STATS", "096-095")
run_query("STATS", "097-077")
run_query("STATS", "098-124")
run_query("STATS", "099-031")
run_query("STATS", "100-005")

In [None]:
run_query("STATS", "101-043")
run_query("STATS", "102-002")
run_query("STATS", "103-015")
run_query("STATS", "104-088")
run_query("STATS", "105-085")
run_query("STATS", "106-084")
run_query("STATS", "107-104")
run_query("STATS", "108-060")
run_query("STATS", "109-133")
run_query("STATS", "110-138")

In [None]:
run_query("STATS", "111-056")
run_query("STATS", "112-028")
run_query("STATS", "113-113")
run_query("STATS", "114-049")
run_query("STATS", "115-144")
run_query("STATS", "116-032")
run_query("STATS", "117-114")
run_query("STATS", "118-074")
run_query("STATS", "119-098")
run_query("STATS", "120-115")

In [None]:
run_query("STATS", "121-097")
run_query("STATS", "122-071")
run_query("STATS", "123-047")
run_query("STATS", "124-070")
run_query("STATS", "125-051")
run_query("STATS", "126-059")
run_query("STATS", "127-038")
run_query("STATS", "128-069")
run_query("STATS", "129-140")
run_query("STATS", "130-123")

In [None]:
run_query("STATS", "131-143")
run_query("STATS", "132-105")
run_query("STATS", "133-052")
run_query("STATS", "134-030")
run_query("STATS", "135-136")
run_query("STATS", "136-142")
run_query("STATS", "137-141")
run_query("STATS", "138-107")
run_query("STATS", "139-034")
run_query("STATS", "140-108")

In [None]:
run_query("STATS", "141-068")
run_query("STATS", "142-135")
run_query("STATS", "143-126")
run_query("STATS", "144-122")
run_query("STATS", "145-120")
run_query("STATS", "146-058")

### SNAP

In [None]:
#run_query("SNAP", "dblp-path02")
#run_query("SNAP", "dblp-path03")
#run_query("SNAP", "dblp-path04")
#run_query("SNAP", "dblp-path05")
#run_query_rewritten("SNAP", "dblp-path06")
#run_query_rewritten("SNAP", "dblp-path07")
#run_query_rewritten("SNAP", "dblp-path08")
run_query("SNAP", "dblp-tree01")
run_query("SNAP", "dblp-tree02")
run_query_rewritten("SNAP", "dblp-tree03")

original1
o1
o2
o3
rewritten1
False False
[(1,)] [(1,)]
orig+rewr
0
1
2


In [None]:
run_query("SNAP", "google-path02")
run_query("SNAP", "google-path03")
run_query("SNAP", "google-path04")
run_query_rewritten("SNAP", "google-path05")
run_query_rewritten("SNAP", "google-path06")
run_query_rewritten("SNAP", "google-path07")
run_query_rewritten("SNAP", "google-path08")
run_query("SNAP", "google-tree01")
run_query_rewritten("SNAP", "google-tree02")
run_query_rewritten("SNAP", "google-tree03")

In [None]:
run_query("SNAP", "patents-path02")
run_query("SNAP", "patents-path03")
run_query("SNAP", "patents-path04")
run_query("SNAP", "patents-path05")
run_query_rewritten("SNAP", "patents-path06")
run_query_rewritten("SNAP", "patents-path07")
run_query_rewritten("SNAP", "patents-path08")
run_query("SNAP", "patents-tree01")
run_query("SNAP", "patents-tree02")
run_query_rewritten("SNAP", "patents-tree03")

In [None]:
run_query_rewritten("SNAP", "wiki-path02")
run_query_rewritten("SNAP", "wiki-path03")
run_query_rewritten("SNAP", "wiki-path04")
run_query_rewritten("SNAP", "wiki-path05")
run_query_rewritten("SNAP", "wiki-path06")
run_query_rewritten("SNAP", "wiki-path07")
run_query_rewritten("SNAP", "wiki-path08")
run_query_rewritten("SNAP", "wiki-tree01")
run_query_rewritten("SNAP", "wiki-tree02")
run_query_rewritten("SNAP", "wiki-tree03")

### LSQB

In [4]:
run_query("LSQB", "q1")
run_query("LSQB", "q4")

[(0,)] [(0,)]
could not resize shared memory segment "/PostgreSQL.1660782094" to 226938880 bytes: No space left on device

current transaction is aborted, commands ignored until end of transaction block



UnboundLocalError: local variable 'result' referenced before assignment

### JOB

In [4]:
run_query("JOB", "2a")
run_query("JOB", "2b")
run_query("JOB", "2c")
run_query("JOB", "2d")
run_query("JOB", "3a")
run_query("JOB", "3b")
run_query("JOB", "3c")
run_query("JOB", "5a")
run_query("JOB", "5b")
run_query("JOB", "5c")
run_query("JOB", "17d")
run_query("JOB", "17e")
run_query("JOB", "17f")
run_query("JOB", "20a")
run_query("JOB", "20b")

[('002 agenti segretissimi',)] [('002 agenti segretissimi',)]
[('007 in Rio',)] [('007 in Rio',)]
[(None,)] [(None,)]
[('002 agenti segretissimi',)] [('002 agenti segretissimi',)]
[('2 Days in New York',)] [('2 Days in New York',)]
[('300: Rise of an Empire',)] [('300: Rise of an Empire',)]
[('00 Schneider - Jagd auf Nihil Baxter',)] [('00 Schneider - Jagd auf Nihil Baxter',)]
[(None,)] [(None,)]
[(None,)] [(None,)]
[('114 Days: The Race to Save a Dream',)] [('114 Days: The Race to Save a Dream',)]
[('Akaffou, Bertin',)] [('Akaffou, Bertin',)]
[('100 Proof, The',)] [('100 Proof, The',)]
[('2X, Benjamin',)] [('2X, Benjamin',)]
[(None,)] [(None,)]
[(None,)] [(None,)]


ToDo!
Compare CREATE TABLE/CREATE VIEW/CREATE UNLOGGED TABLE

In [None]:
# just running, without timeout
def run_query(benchmark, query):
    file_path = f'rewritten/{benchmark}_{query}_output.json'
    with open(file_path, 'r') as file:
        json_data = json.load(file)
    
    original_query = json_data["original_query"]
    rewritten_query_list = json_data["rewritten_query"]
    rewriting_time = json_data["time"]

    file_path_drop = f'rewritten/{benchmark}_{query}_drop.json'
    with open(file_path_drop, 'r') as file:
        json_drop = json.load(file)
    drop_query_list = json_drop["rewritten_query"]

    if benchmark == "JOB":
        database = "imdb"
    else:
        database = benchmark.lower()
    conn = psycopg2.connect(
        host="postgres",
        database=database,
        user=database,
        password=database
    )
    
    #check if results are equal and dont take the time for the first run
    cur = conn.cursor()
    cur.execute(original_query)
    result = cur.fetchall()
    for rewritten_query in rewritten_query_list:
        cur.execute(rewritten_query)
        if rewritten_query.startswith("SELECT"):
            result1 = cur.fetchall()
    for drop_query in drop_query_list:
        cur.execute(drop_query)
    print(result, result1)
    if result != result1:
        print("Outputs not equal!!")

    list_original = []
    list_rewritten = []
    # take times for 5 runs (run 2-6) for the original query and the rewritten query
    for i in range(5):
        # print(i)
        # execute the original query
        start_time_original = time.time()
        cur.execute(original_query)
        end_time_original = time.time()
        original_time = end_time_original - start_time_original
        list_original.append(original_time)
    
        # execute the rewritten query
        start_time_rewritten = time.time()
        for rewritten_query in rewritten_query_list:
            cur.execute(rewritten_query)
        end_time_rewritten = time.time()
        rewritten_time = end_time_rewritten - start_time_rewritten
        list_rewritten.append(rewritten_time)
        
        # drop all created tables
        for drop_query in drop_query_list:
            cur.execute(drop_query)

    orig_mean = np.mean(list_original)
    orig_med = np.median(list_original)
    orig_std = np.std(list_original)
    rewr_mean = np.mean(list_rewritten)
    rewr_med = np.median(list_rewritten)
    rewr_std = np.std(list_rewritten)
    rewr_mean_plus_rewr = rewr_mean + rewriting_time
    rewr_med_plus_rewr = rewr_med + rewriting_time
    if orig_mean > rewr_mean:
        orig_or_rewr_mean = "rewr"
    else:
        orig_or_rewr_mean = "orig"
    if orig_mean > rewr_mean_plus_rewr:
        orig_or_rewr_plus_rewr_mean = "rewr"
    else:
        orig_or_rewr_plus_rewr_mean = "orig"
    list_output = [benchmark, query] + [orig_mean, rewr_mean, rewr_mean_plus_rewr, orig_or_rewr_mean, orig_or_rewr_plus_rewr_mean, rewriting_time] + \
                    list_original + [orig_med, orig_std] + list_rewritten + [rewr_med, rewr_std, rewr_med_plus_rewr]

    file_path = "results/POS_Scala_comparison.csv"
    with open(file_path, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(list_output)