# Running queries using the JSON files produced by the Scala code

Install and import all needed packages

In [1]:
%%bash
pip install numpy
pip install pandas
pip install duckdb
pip install psycopg2-binary

















In [2]:
import json
import time
import duckdb
import numpy as np
import csv
import multiprocessing
import signal
import pandas as pd
import os
import psycopg2

Function for running one query. This means:
*  run the original query 5 times (after one initial warm-up run, which is not measured)
*  run the rewritten queries 5 times (after one initial warm-up run, which is not measured) and drop the created tables each time
*  take the runtimes and calculate the mean, median and standard deviation for both the original and the rewritten query
*  additionally save the runtimes for each stage
*  append everything to a CSV output file

In [3]:
# functions to handle the timeouts
def handler_orig(signum, frame):
    """SIGALRM callback used while the original query is running.

    Sets the module-level ``timeout_flag_orig`` to True and then raises an
    Exception so that the interrupted code unwinds. ``signum`` and ``frame``
    are the standard signal-handler arguments and are not used.
    """
    global timeout_flag_orig
    timeout_error = Exception("Query execution of the original query > 100s")
    timeout_flag_orig = True
    raise timeout_error

def handler_rewr(signum, frame):
    """SIGALRM callback used while the rewritten query is running.

    Sets the module-level ``timeout_flag_rewr`` to True and then raises an
    Exception so that the interrupted code unwinds. ``signum`` and ``frame``
    are the standard signal-handler arguments and are not used.
    """
    global timeout_flag_rewr
    timeout_error = Exception("Query execution of the rewritten query > 100s")
    timeout_flag_rewr = True
    raise timeout_error

def run_query(benchmark, query):
    """Benchmark one query in DuckDB and append one result row to the CSV file.

    Reads ``rewritten/{benchmark}_{query}_output.json`` (the original query and
    the list of rewritten statements) and
    ``rewritten/{benchmark}_{query}_drop.json`` (the drop statements).
    Each variant first gets one warm-up run guarded by a SIGALRM timeout; the
    original query is warmed up first, then the rewritten one. Every variant
    that did not time out is then executed five times and timed. The rewritten
    variant is timed per stage (statements containing "view" form stage 0, all
    other statements stage 1) and the objects it created are dropped after
    every run. Variants that timed out are reported with "TO"/"-" placeholders.

    Args:
        benchmark: benchmark name; used (lower-cased) for the database path and
            schema name. "IMDB" is written to the CSV as "JOB".
        query: query identifier used in the input file names.

    Raises:
        Exception: a warm-up failure that was not caused by the timeout is
            printed and re-raised instead of being silently ignored.
    """
    global timeout_flag_orig
    global timeout_flag_rewr

    runs = 5            # number of measured executions per variant
    timeout_sec = 100   # warm-up timeout in seconds, can be changed

    def summarize(times, measured, timeout_marker):
        """Return (times, mean, median, std), or CSV placeholders if not measured."""
        if not measured:
            return ["-"] * runs, timeout_marker, timeout_marker, "-"
        return times, np.mean(times), np.median(times), np.std(times)

    print(benchmark, query)
    with open(f'rewritten/{benchmark}_{query}_output.json', 'r') as file:
        json_data = json.load(file)
    original_query = json_data["original_query"]

    # change the queries such that they can be executed in DuckDB
    # NOTE(review): .lower() also lower-cases string literals inside the SQL --
    # confirm that no rewritten query relies on case-sensitive constants.
    rewritten_query_list = [rewritten_query.replace("TIMESTAMP(0)", "TIMESTAMP").lower()
                            for rewritten_query in json_data["rewritten_query"]]

    # the stage split has to happen before "unlogged table" is turned into "view",
    # otherwise that statement would move from stage 1 to stage 0
    rewritten_query_list_stage0 = [r for r in rewritten_query_list if "view" in r]
    rewritten_query_list_stage1 = [r for r in rewritten_query_list if "view" not in r]
    rewritten_query_list_stage1[-2] = rewritten_query_list_stage1[-2].replace("unlogged table", "view")
    rewritten_query_list[-2] = rewritten_query_list[-2].replace("unlogged table", "view")

    # get the drop queries
    with open(f'rewritten/{benchmark}_{query}_drop.json', 'r') as file:
        drop_query_list = json.load(file)["rewritten_query"]
    drop_query_list[0] = drop_query_list[0].replace("TABLE", "VIEW")
    drop_query_list = [drop_query.lower() for drop_query in drop_query_list]
    # tolerant variants, used to clean up leftovers of earlier runs
    drop_if_exists_list = [
        drop_query.replace("drop view", "drop view if exists").replace("drop table", "drop table if exists")
        for drop_query in drop_query_list
    ]

    timeout_flag_orig = False
    timeout_flag_rewr = False
    rows_orig = None
    rows_rewr = None
    list_original_time = []
    list_rewritten_time_stage0 = []
    list_rewritten_time_stage1 = []
    list_rewritten_time = []

    # connect to DuckDB
    database = benchmark.lower() + "/" + benchmark.lower() + ".duckdb"
    conn = duckdb.connect(database=database)
    cur = None
    try:
        cur = conn.cursor()
        cur.execute("USE " + benchmark.lower() + "_DDB")

        print("original1")
        # the first run is just a warm up run and to check for the time out
        signal.signal(signal.SIGALRM, handler_orig)
        signal.alarm(timeout_sec)
        try:
            cur.execute(original_query)
            rows_orig = len(cur.fetchall())
        except Exception as exc:
            print(exc)
            if not timeout_flag_orig:
                # a real query error, not the timeout: do not hide it
                raise
        finally:
            signal.alarm(0)

        for drop_query in drop_if_exists_list:
            cur.execute(drop_query)

        print("rewritten1")
        signal.signal(signal.SIGALRM, handler_rewr)
        signal.alarm(timeout_sec)
        try:
            for rewritten_query in rewritten_query_list:
                cur.execute(rewritten_query)
                if rewritten_query.startswith("select"):
                    rows_rewr = len(cur.fetchall())
            for drop_query in drop_query_list:
                cur.execute(drop_query)
        except Exception as exc:
            print(exc)
            if not timeout_flag_rewr:
                raise
        finally:
            signal.alarm(0)

        print(timeout_flag_orig, timeout_flag_rewr)
        measure_orig = not timeout_flag_orig
        measure_rewr = not timeout_flag_rewr

        if measure_orig or measure_rewr:
            for i in range(runs):
                print(i)
                if measure_orig:
                    # execute the original query
                    start = time.perf_counter()
                    cur.execute(original_query)
                    list_original_time.append(time.perf_counter() - start)

                if measure_rewr:
                    # execute the rewritten query, stage by stage
                    start = time.perf_counter()
                    for rewritten_query in rewritten_query_list_stage0:
                        cur.execute(rewritten_query)
                    rewritten_time_stage0 = time.perf_counter() - start

                    start = time.perf_counter()
                    for rewritten_query in rewritten_query_list_stage1:
                        cur.execute(rewritten_query)
                    rewritten_time_stage1 = time.perf_counter() - start

                    list_rewritten_time_stage0.append(rewritten_time_stage0)
                    list_rewritten_time_stage1.append(rewritten_time_stage1)
                    list_rewritten_time.append(rewritten_time_stage0 + rewritten_time_stage1)

                    # drop all created tables
                    for drop_query in drop_query_list:
                        cur.execute(drop_query)
    finally:
        if cur is not None:
            cur.close()
        conn.close()

    list_original_time, orig_mean, orig_med, orig_std = summarize(
        list_original_time, measure_orig, "TO")
    list_rewritten_time, rewr_mean, rewr_med, rewr_std = summarize(
        list_rewritten_time, measure_rewr, "TO")
    list_rewritten_time_stage0, rewr_stage0_mean, rewr_stage0_med, rewr_stage0_std = summarize(
        list_rewritten_time_stage0, measure_rewr, "-")
    list_rewritten_time_stage1, rewr_stage1_mean, rewr_stage1_med, rewr_stage1_std = summarize(
        list_rewritten_time_stage1, measure_rewr, "-")

    # which variant is faster, and how many rows the query returns
    if timeout_flag_orig and timeout_flag_rewr:
        orig_rewr, rows = "-", "-"
    elif timeout_flag_orig:
        orig_rewr, rows = "rewr", rows_rewr
    elif timeout_flag_rewr:
        orig_rewr, rows = "orig", rows_orig
    else:
        orig_rewr = "rewr" if orig_med > rewr_med else "orig"
        rows = rows_orig if rows_orig == rows_rewr else "not the same!"

    if benchmark == "IMDB":
        benchmark = "JOB"
    list_output = [benchmark, query, orig_rewr, orig_med, rewr_med, rewr_stage0_med, rewr_stage1_med, rows] + \
                    list_original_time + [orig_mean, orig_std] + list_rewritten_time + [rewr_mean, rewr_std] + list_rewritten_time_stage0 + \
                    [rewr_stage0_mean, rewr_stage0_std] + list_rewritten_time_stage1 + [rewr_stage1_mean, rewr_stage1_std]

    file_path = "results/DDB_Scala_comparison_TO_augment_server_infos.csv"
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, 'a', newline='') as csvfile:
        csv.writer(csvfile).writerow(list_output)

Same function as run_query, except that the warm-up run (which checks for a TO) executes the rewritten query first and then the original query.
(We expect that if there is a TO, it is caused by the rewritten query, so we do not have to wait as long to know whether the query has to be aborted.)

In [4]:
def run_query2(benchmark, query):
    """Benchmark one query in DuckDB, warming up the rewritten variant first.

    Reads ``rewritten/{benchmark}_{query}_output.json`` (the original query and
    the list of rewritten statements) and
    ``rewritten/{benchmark}_{query}_drop.json`` (the drop statements).
    Each variant first gets one warm-up run guarded by a SIGALRM timeout; here
    the rewritten statements are warmed up before the original query. Every
    variant that did not time out is then executed five times and timed. The
    rewritten variant is timed per stage (statements containing "view" form
    stage 0, all other statements stage 1) and the objects it created are
    dropped after every run. Variants that timed out are reported with
    "TO"/"-" placeholders. One row is appended to the results CSV.

    Args:
        benchmark: benchmark name; used (lower-cased) for the database path and
            schema name. "IMDB" is written to the CSV as "JOB".
        query: query identifier used in the input file names.

    Raises:
        Exception: a warm-up failure that was not caused by the timeout is
            printed and re-raised instead of being silently ignored.
    """
    global timeout_flag_orig
    global timeout_flag_rewr

    runs = 5            # number of measured executions per variant
    timeout_sec = 100   # warm-up timeout in seconds, can be changed

    def summarize(times, measured, timeout_marker):
        """Return (times, mean, median, std), or CSV placeholders if not measured."""
        if not measured:
            return ["-"] * runs, timeout_marker, timeout_marker, "-"
        return times, np.mean(times), np.median(times), np.std(times)

    print(benchmark, query)
    with open(f'rewritten/{benchmark}_{query}_output.json', 'r') as file:
        json_data = json.load(file)
    original_query = json_data["original_query"]

    # change the queries such that they can be executed in DuckDB
    # NOTE(review): .lower() also lower-cases string literals inside the SQL --
    # confirm that no rewritten query relies on case-sensitive constants.
    rewritten_query_list = [rewritten_query.replace("TIMESTAMP(0)", "TIMESTAMP").lower()
                            for rewritten_query in json_data["rewritten_query"]]

    # the stage split has to happen before "unlogged table" is turned into "view",
    # otherwise that statement would move from stage 1 to stage 0
    rewritten_query_list_stage0 = [r for r in rewritten_query_list if "view" in r]
    rewritten_query_list_stage1 = [r for r in rewritten_query_list if "view" not in r]
    rewritten_query_list_stage1[-2] = rewritten_query_list_stage1[-2].replace("unlogged table", "view")
    rewritten_query_list[-2] = rewritten_query_list[-2].replace("unlogged table", "view")

    # get the drop queries
    with open(f'rewritten/{benchmark}_{query}_drop.json', 'r') as file:
        drop_query_list = json.load(file)["rewritten_query"]
    drop_query_list[0] = drop_query_list[0].replace("TABLE", "VIEW")
    drop_query_list = [drop_query.lower() for drop_query in drop_query_list]
    # tolerant variants, used to clean up leftovers of earlier runs
    drop_if_exists_list = [
        drop_query.replace("drop view", "drop view if exists").replace("drop table", "drop table if exists")
        for drop_query in drop_query_list
    ]

    timeout_flag_orig = False
    timeout_flag_rewr = False
    rows_orig = None
    rows_rewr = None
    list_original_time = []
    list_rewritten_time_stage0 = []
    list_rewritten_time_stage1 = []
    list_rewritten_time = []

    # connect to DuckDB
    database = benchmark.lower() + "/" + benchmark.lower() + ".duckdb"
    conn = duckdb.connect(database=database)
    cur = None
    try:
        cur = conn.cursor()
        cur.execute("USE " + benchmark.lower() + "_DDB")

        for drop_query in drop_if_exists_list:
            cur.execute(drop_query)

        print("rewritten1")
        signal.signal(signal.SIGALRM, handler_rewr)
        signal.alarm(timeout_sec)
        try:
            for rewritten_query in rewritten_query_list:
                cur.execute(rewritten_query)
                if rewritten_query.startswith("select"):
                    rows_rewr = len(cur.fetchall())
            for drop_query in drop_query_list:
                cur.execute(drop_query)
        except Exception as exc:
            print(exc)
            if not timeout_flag_rewr:
                # a real query error, not the timeout: do not hide it
                raise
        finally:
            signal.alarm(0)

        print("original1")
        # the first run is just a warm up run and to check for the time out
        signal.signal(signal.SIGALRM, handler_orig)
        signal.alarm(timeout_sec)
        try:
            cur.execute(original_query)
            rows_orig = len(cur.fetchall())
        except Exception as exc:
            print(exc)
            if not timeout_flag_orig:
                raise
        finally:
            signal.alarm(0)

        print(timeout_flag_orig, timeout_flag_rewr)
        measure_orig = not timeout_flag_orig
        measure_rewr = not timeout_flag_rewr

        if measure_orig or measure_rewr:
            for i in range(runs):
                print(i)
                if measure_orig:
                    # execute the original query
                    start = time.perf_counter()
                    cur.execute(original_query)
                    list_original_time.append(time.perf_counter() - start)

                if measure_rewr:
                    # execute the rewritten query, stage by stage
                    start = time.perf_counter()
                    for rewritten_query in rewritten_query_list_stage0:
                        cur.execute(rewritten_query)
                    rewritten_time_stage0 = time.perf_counter() - start

                    start = time.perf_counter()
                    for rewritten_query in rewritten_query_list_stage1:
                        cur.execute(rewritten_query)
                    rewritten_time_stage1 = time.perf_counter() - start

                    list_rewritten_time_stage0.append(rewritten_time_stage0)
                    list_rewritten_time_stage1.append(rewritten_time_stage1)
                    list_rewritten_time.append(rewritten_time_stage0 + rewritten_time_stage1)

                    # drop all created tables
                    for drop_query in drop_query_list:
                        cur.execute(drop_query)
    finally:
        if cur is not None:
            cur.close()
        conn.close()

    list_original_time, orig_mean, orig_med, orig_std = summarize(
        list_original_time, measure_orig, "TO")
    list_rewritten_time, rewr_mean, rewr_med, rewr_std = summarize(
        list_rewritten_time, measure_rewr, "TO")
    list_rewritten_time_stage0, rewr_stage0_mean, rewr_stage0_med, rewr_stage0_std = summarize(
        list_rewritten_time_stage0, measure_rewr, "-")
    list_rewritten_time_stage1, rewr_stage1_mean, rewr_stage1_med, rewr_stage1_std = summarize(
        list_rewritten_time_stage1, measure_rewr, "-")

    # which variant is faster, and how many rows the query returns
    if timeout_flag_orig and timeout_flag_rewr:
        orig_rewr, rows = "-", "-"
    elif timeout_flag_orig:
        orig_rewr, rows = "rewr", rows_rewr
    elif timeout_flag_rewr:
        orig_rewr, rows = "orig", rows_orig
    else:
        orig_rewr = "rewr" if orig_med > rewr_med else "orig"
        rows = rows_orig if rows_orig == rows_rewr else "not the same!"

    if benchmark == "IMDB":
        benchmark = "JOB"
    list_output = [benchmark, query, orig_rewr, orig_med, rewr_med, rewr_stage0_med, rewr_stage1_med, rows] + \
                    list_original_time + [orig_mean, orig_std] + list_rewritten_time + [rewr_mean, rewr_std] + list_rewritten_time_stage0 + \
                    [rewr_stage0_mean, rewr_stage0_std] + list_rewritten_time_stage1 + [rewr_stage1_mean, rewr_stage1_std]

    file_path = "results/DDB_Scala_comparison_TO_augment_server_infos.csv"
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, 'a', newline='') as csvfile:
        csv.writer(csvfile).writerow(list_output)

Function for the case where we already know that both versions time out (we interrupted them by hand instead of waiting for the timeout handler, because that is faster). It only writes the placeholder row to the CSV file.

In [5]:
def run_query_TO(benchmark, query):
    """Append the "both variants timed out" placeholder row for one query.

    Nothing is executed: every timing column is filled with "-" and the
    mean/median columns of the original and of the complete rewritten query
    are filled with "TO". The ``results`` directory is created if it does not
    exist yet.

    Args:
        benchmark: benchmark name; "IMDB" is written to the CSV as "JOB".
        query: query identifier written to the CSV.
    """
    print(benchmark, query)

    no_times = ["-", "-", "-", "-", "-"]  # one placeholder per skipped run

    orig_rewr = "-"
    rows = "-"
    orig_mean = orig_med = "TO"
    rewr_mean = rewr_med = "TO"
    orig_std = rewr_std = "-"
    rewr_stage0_mean = rewr_stage0_med = rewr_stage0_std = "-"
    rewr_stage1_mean = rewr_stage1_med = rewr_stage1_std = "-"

    if benchmark == "IMDB":
        benchmark = "JOB"
    list_output = [benchmark, query, orig_rewr, orig_med, rewr_med, rewr_stage0_med, rewr_stage1_med, rows] + \
                    no_times + [orig_mean, orig_std] + no_times + [rewr_mean, rewr_std] + no_times + \
                    [rewr_stage0_mean, rewr_stage0_std] + no_times + [rewr_stage1_mean, rewr_stage1_std]

    file_path = "results/DDB_Scala_comparison_TO_augment_server_infos.csv"
    # opening in append mode does not create missing directories
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, 'a', newline='') as csvfile:
        csv.writer(csvfile).writerow(list_output)

Function for the case where we already know that the original version times out, so we only need to check and run the rewritten version.

In [6]:
def run_query_rewr(benchmark, query):
    """Benchmark only the rewritten variant of a query whose original times out.

    Reads ``rewritten/{benchmark}_{query}_output.json`` (list of rewritten
    statements) and ``rewritten/{benchmark}_{query}_drop.json`` (the drop
    statements). The original query is never executed: ``timeout_flag_orig``
    is set to True up front and its CSV columns are filled with "TO"/"-".
    The rewritten statements get one warm-up run guarded by a SIGALRM timeout;
    if that run does not time out they are executed five times and timed per
    stage (statements containing "view" form stage 0, all other statements
    stage 1), dropping the created objects after every run. One row is
    appended to the results CSV.

    Args:
        benchmark: benchmark name; used (lower-cased) for the database path and
            schema name. "IMDB" is written to the CSV as "JOB".
        query: query identifier used in the input file names.

    Raises:
        Exception: a warm-up failure that was not caused by the timeout is
            printed and re-raised instead of being silently ignored.
    """
    global timeout_flag_orig
    global timeout_flag_rewr

    runs = 5            # number of measured executions
    timeout_sec = 100   # warm-up timeout in seconds, can be changed

    def summarize(times, measured, timeout_marker):
        """Return (times, mean, median, std), or CSV placeholders if not measured."""
        if not measured:
            return ["-"] * runs, timeout_marker, timeout_marker, "-"
        return times, np.mean(times), np.median(times), np.std(times)

    print(benchmark, query)
    with open(f'rewritten/{benchmark}_{query}_output.json', 'r') as file:
        json_data = json.load(file)

    # change the queries such that they can be executed in DuckDB
    # NOTE(review): .lower() also lower-cases string literals inside the SQL --
    # confirm that no rewritten query relies on case-sensitive constants.
    rewritten_query_list = [rewritten_query.replace("TIMESTAMP(0)", "TIMESTAMP").lower()
                            for rewritten_query in json_data["rewritten_query"]]

    # the stage split has to happen before "unlogged table" is turned into "view",
    # otherwise that statement would move from stage 1 to stage 0
    rewritten_query_list_stage0 = [r for r in rewritten_query_list if "view" in r]
    rewritten_query_list_stage1 = [r for r in rewritten_query_list if "view" not in r]
    rewritten_query_list_stage1[-2] = rewritten_query_list_stage1[-2].replace("unlogged table", "view")
    rewritten_query_list[-2] = rewritten_query_list[-2].replace("unlogged table", "view")

    # get the drop queries
    with open(f'rewritten/{benchmark}_{query}_drop.json', 'r') as file:
        drop_query_list = json.load(file)["rewritten_query"]
    drop_query_list[0] = drop_query_list[0].replace("TABLE", "VIEW")
    drop_query_list = [drop_query.lower() for drop_query in drop_query_list]
    # tolerant variants, used to clean up leftovers of earlier runs
    drop_if_exists_list = [
        drop_query.replace("drop view", "drop view if exists").replace("drop table", "drop table if exists")
        for drop_query in drop_query_list
    ]

    # the original query is known to time out, so it is never run here
    timeout_flag_orig = True
    timeout_flag_rewr = False
    rows_rewr = None
    list_rewritten_time_stage0 = []
    list_rewritten_time_stage1 = []
    list_rewritten_time = []

    # connect to DuckDB
    database = benchmark.lower() + "/" + benchmark.lower() + ".duckdb"
    conn = duckdb.connect(database=database)
    cur = None
    try:
        cur = conn.cursor()
        cur.execute("USE " + benchmark.lower() + "_DDB")

        for drop_query in drop_if_exists_list:
            cur.execute(drop_query)

        print("rewritten1")
        # the first run is just a warm up run and to check for the time out
        signal.signal(signal.SIGALRM, handler_rewr)
        signal.alarm(timeout_sec)
        try:
            for rewritten_query in rewritten_query_list:
                cur.execute(rewritten_query)
                if rewritten_query.startswith("select"):
                    rows_rewr = len(cur.fetchall())
            for drop_query in drop_query_list:
                cur.execute(drop_query)
        except Exception as exc:
            print(exc)
            if not timeout_flag_rewr:
                # a real query error, not the timeout: do not hide it
                raise
        finally:
            signal.alarm(0)

        print(timeout_flag_orig, timeout_flag_rewr)
        measure_rewr = not timeout_flag_rewr

        if measure_rewr:
            for i in range(runs):
                print(i)
                # execute the rewritten query, stage by stage
                start = time.perf_counter()
                for rewritten_query in rewritten_query_list_stage0:
                    cur.execute(rewritten_query)
                rewritten_time_stage0 = time.perf_counter() - start

                start = time.perf_counter()
                for rewritten_query in rewritten_query_list_stage1:
                    cur.execute(rewritten_query)
                rewritten_time_stage1 = time.perf_counter() - start

                list_rewritten_time_stage0.append(rewritten_time_stage0)
                list_rewritten_time_stage1.append(rewritten_time_stage1)
                list_rewritten_time.append(rewritten_time_stage0 + rewritten_time_stage1)

                # drop all created tables
                for drop_query in drop_query_list:
                    cur.execute(drop_query)
    finally:
        if cur is not None:
            cur.close()
        conn.close()

    list_original_time, orig_mean, orig_med, orig_std = summarize([], False, "TO")
    list_rewritten_time, rewr_mean, rewr_med, rewr_std = summarize(
        list_rewritten_time, measure_rewr, "TO")
    list_rewritten_time_stage0, rewr_stage0_mean, rewr_stage0_med, rewr_stage0_std = summarize(
        list_rewritten_time_stage0, measure_rewr, "-")
    list_rewritten_time_stage1, rewr_stage1_mean, rewr_stage1_med, rewr_stage1_std = summarize(
        list_rewritten_time_stage1, measure_rewr, "-")

    if timeout_flag_rewr:
        # original and rewritten query are TOs
        orig_rewr, rows = "-", "-"
    else:
        orig_rewr, rows = "rewr", rows_rewr

    if benchmark == "IMDB":
        benchmark = "JOB"
    list_output = [benchmark, query, orig_rewr, orig_med, rewr_med, rewr_stage0_med, rewr_stage1_med, rows] + \
                    list_original_time + [orig_mean, orig_std] + list_rewritten_time + [rewr_mean, rewr_std] + list_rewritten_time_stage0 + \
                    [rewr_stage0_mean, rewr_stage0_std] + list_rewritten_time_stage1 + [rewr_stage1_mean, rewr_stage1_std]

    file_path = "results/DDB_Scala_comparison_TO_augment_server_infos.csv"
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, 'a', newline='') as csvfile:
        csv.writer(csvfile).writerow(list_output)

Function for the case where we already know that the rewritten version times out, so we only need to check and run the original version.

In [7]:
def run_query_orig(benchmark, query):
    """Time only the original query of one benchmark query and append a CSV row.

    The rewritten query is treated as a known timeout: ``timeout_flag_rewr`` is
    set to True and every rewritten/stage column is filled with a placeholder.
    The original query gets one warm-up run guarded by a 100 s SIGALRM. If the
    warm-up finishes, the query is executed 5 more times and the mean, median
    and standard deviation of the wall-clock times are recorded.

    Args:
        benchmark: benchmark name. It is used for the JSON file name, for the
            DuckDB file ``<benchmark>/<benchmark>.duckdb`` (lower case) and for
            the ``<benchmark>_DDB`` catalog. "IMDB" is written to the CSV as "JOB".
        query: query name, i.e. the second part of the JSON file name.

    Raises:
        RuntimeError: if the warm-up run fails for a reason other than the
            timeout (previously this surfaced later as an unrelated NameError).
    """
    global timeout_flag_orig
    global timeout_flag_rewr

    print(benchmark, query)
    file_path = f'rewritten/{benchmark}_{query}_output.json'
    with open(file_path, 'r') as file:
        json_data = json.load(file)

    # only the original query is needed here
    original_query = json_data["original_query"]

    # connect to DuckDB
    database = benchmark.lower() + "/" + benchmark.lower() + ".duckdb"
    conn = duckdb.connect(database=database)
    cur = None

    try:
        # if the evaluation takes longer than 100sec then break it;
        # the rewritten query is not run at all and counts as a timeout
        timeout_flag_orig = False
        timeout_flag_rewr = True

        print("original1")
        # the first run is just a warm up run and to check for the time out
        rows_orig = None
        warmup_error = None
        signal.signal(signal.SIGALRM, handler_orig)
        signal.alarm(100) #TO at 100 sec, can be changed
        try:
            cur = conn.cursor()
            cur.execute("USE " + benchmark.lower() + "_DDB")
            cur.execute(original_query)
            result = cur.fetchall()
            rows_orig = len(result)
        except Exception as exc:
            print(exc)
            warmup_error = exc
        finally:
            # always disarm the alarm so it cannot fire during the timed runs
            signal.alarm(0)

        print(timeout_flag_orig, timeout_flag_rewr)

        # a failure that is not the timeout cannot be benchmarked: report it
        # with its real cause instead of failing later on an unset variable
        if warmup_error is not None and not timeout_flag_orig:
            raise RuntimeError(
                f"Warm-up run of the original query failed for {benchmark} {query}"
            ) from warmup_error

        if timeout_flag_orig:
            # original and rewritten query are TOs
            list_original_time = ["-", "-", "-", "-", "-"]
            orig_mean = "TO"
            orig_med = "TO"
            orig_std = "-"

            orig_rewr = "-"
            rows = "-"
        else:
            # rewritten query is a TO and the original not
            list_original_time = []

            for i in range(5):
                print(i)
                # execute the original query
                start_time_original = time.time()
                cur.execute(original_query)
                end_time_original = time.time()
                list_original_time.append(end_time_original - start_time_original)

            orig_mean = np.mean(list_original_time)
            orig_med = np.median(list_original_time)
            orig_std = np.std(list_original_time)

            orig_rewr = "orig"
            rows = rows_orig
    finally:
        # release the cursor and the connection even if a run fails
        if cur is not None:
            cur.close()
        conn.close()

    # the rewritten query was not executed, so all of its columns are placeholders
    list_rewritten_time_stage0 = ["-", "-", "-", "-", "-"]
    rewr_stage0_mean = "-"
    rewr_stage0_med = "-"
    rewr_stage0_std = "-"
    list_rewritten_time_stage1 = ["-", "-", "-", "-", "-"]
    rewr_stage1_mean = "-"
    rewr_stage1_med = "-"
    rewr_stage1_std = "-"
    list_rewritten_time = ["-", "-", "-", "-", "-"]
    rewr_mean = "TO"
    rewr_med = "TO"
    rewr_std = "-"

    if benchmark == "IMDB":
        benchmark = "JOB"
    list_output = [benchmark, query, orig_rewr, orig_med, rewr_med, rewr_stage0_med, rewr_stage1_med, rows] + \
                    list_original_time + [orig_mean, orig_std] + list_rewritten_time + [rewr_mean, rewr_std] + list_rewritten_time_stage0 + \
                    [rewr_stage0_mean, rewr_stage0_std] + list_rewritten_time_stage1 + [rewr_stage1_mean, rewr_stage1_std]

    file_path = "results/DDB_Scala_comparison_TO_augment_server_infos.csv"
    with open(file_path, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(list_output)

In [10]:
# Create (or reset) the results file and write its header row.
# The 36 column names match the rows appended by the run_query* functions.
file_path = "results/DDB_Scala_comparison_TO_augment_server_infos.csv"

names = ["bench", "query", "orig/rewr(med)", "orig(med)", "rewr(med)", "stage0(med)", "stage1(med)", "rows",
        "orig 1", "orig 2", "orig 3", "orig 4", "orig 5", "orig(mean)", "orig(std)", 
        "rewr 1", "rewr 2", "rewr 3", "rewr 4", "rewr 5", "rewr(mean)", "rewr(std)",
        "stage0 1", "stage0 2", "stage0 3", "stage0 4", "stage0 5", "stage0(mean)", "stage0(std)",
        "stage1 1", "stage1 2", "stage1 3", "stage1 4", "stage1 5", "stage1(mean)", "stage1(std)"]

# make sure the output folder exists, otherwise open() fails on a fresh checkout
os.makedirs(os.path.dirname(file_path), exist_ok=True)

with open(file_path, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(names)

### STATS

In [8]:
# Ask the PostgreSQL server that hosts the "stats" database for its address and port.
database = "stats"
credentials = {"database": database, "user": database, "password": database}
conn = psycopg2.connect(host="postgres", **credentials)

# the cursor is closed when the with-block is left
with conn.cursor() as cur:
    cur.execute("SELECT inet_server_addr(), inet_server_port()")
    host, port = cur.fetchone()

print(f"Host: {host}")
print(f"Port: {port}")

conn.close()

Host: 192.168.64.2
Port: 5432


In [9]:
# Open the local DuckDB file and attach the PostgreSQL database "stats" as stats_DDB.
con = duckdb.connect(database="stats/stats.duckdb")
con.execute("INSTALL postgres")
con.execute("LOAD postgres")
# use the port reported by the server (fetched together with host) instead of a hard-coded 5432
con.execute(f"ATTACH 'host={host} port={port} user=postgres password=postgres dbname=stats' AS stats_DDB (TYPE postgres)")
con.execute("USE stats_DDB")

<duckdb.duckdb.DuckDBPyConnection at 0x7f003d74d970>

In [14]:
# Run all STATS queries whose leading number is at most 50.
folder_path = 'rewritten/'
output_files = [
    name for name in sorted(os.listdir(folder_path))
    if name.startswith('STATS') and name.endswith('_output.json')
]

for name in output_files:
    bench_name, query_name = name.split("_")[:2]
    # the query name starts with its number, followed by "-"
    if int(query_name.split("-")[0]) > 50:
        continue
    run_query(bench_name, query_name)

STATS 001-014-augF1-full1
original1
rewritten1
False False
0
1
2
3
4
STATS 001-014-augF1-full2
original1
rewritten1
False False
0
1
2
3
4
STATS 001-014-augF1-full3
original1
rewritten1
False False
0
1
2
3
4
STATS 001-014-augF2-full1
original1
rewritten1
False False
0
1
2
3
4
STATS 001-014-augF2-full2
original1
rewritten1
False False
0
1
2
3
4
STATS 001-014-augF2-full3
original1
rewritten1
False False
0
1
2
3
4
STATS 001-014-full1
original1
rewritten1
False False
0
1
2
3
4
STATS 001-014-full2
original1
rewritten1
False False
0
1
2
3
4
STATS 001-014-full3
original1
rewritten1
False False
0
1
2
3
4
STATS 002-048-augF1-full1
original1
rewritten1
False False
0
1
2
3
4
STATS 002-048-augF1-full2
original1
rewritten1
False False
0
1
2
3
4
STATS 002-048-augF1-full3
original1
rewritten1
False False
0
1
2
3
4
STATS 002-048-augF2-full1
original1
rewritten1
False False
0
1
2
3
4
STATS 002-048-augF2-full2
original1
rewritten1
False False
0
1
2
3
4
STATS 002-048-augF2-full3
original1
rewritten1
False

In [None]:
# Run all STATS queries whose leading number lies in 51..100.
folder_path = 'rewritten/'
output_files = [
    name for name in sorted(os.listdir(folder_path))
    if name.startswith('STATS') and name.endswith('_output.json')
]

for name in output_files:
    bench_name, query_name = name.split("_")[:2]
    # the query name starts with its number, followed by "-"
    if 50 < int(query_name.split("-")[0]) <= 100:
        run_query(bench_name, query_name)

STATS 051-090-augF1-full1
original1
rewritten1
False False
0
1
2
3
4
STATS 051-090-augF1-full2
original1
rewritten1
False False
0
1
2
3
4
STATS 051-090-augF1-full3
original1
rewritten1
False False
0
1
2
3
4
STATS 051-090-augF2-full1
original1
rewritten1
False False
0
1
2
3
4
STATS 051-090-augF2-full2
original1
rewritten1
False False
0
1
2
3
4
STATS 051-090-augF2-full3
original1
rewritten1
False False
0
1
2
3
4
STATS 051-090-full1
original1
rewritten1
False False
0
1
2
3
4
STATS 051-090-full2
original1
rewritten1
False False
0
1
2
3
4
STATS 051-090-full3
original1
rewritten1
False False
0
1
2
3
4
STATS 052-029-augF1-full1
original1
rewritten1
False False
0
1
2
3
4
STATS 052-029-augF1-full2
original1
rewritten1
False False
0
1
2
3
4
STATS 052-029-augF1-full3
original1
rewritten1
False False
0
1
2
3
4
STATS 052-029-augF2-full1
original1
rewritten1
False False
0
1
2
3
4
STATS 052-029-augF2-full2
original1
rewritten1
False False
0
1
2
3
4
STATS 052-029-augF2-full3
original1
rewritten1
False

In [None]:
# Run all STATS queries whose leading number is greater than 100.
folder_path = 'rewritten/'
output_files = [
    name for name in sorted(os.listdir(folder_path))
    if name.startswith('STATS') and name.endswith('_output.json')
]

for name in output_files:
    bench_name, query_name = name.split("_")[:2]
    # the query name starts with its number, followed by "-"
    if int(query_name.split("-")[0]) <= 100:
        continue
    run_query(bench_name, query_name)

### SNAP

In [8]:
# Ask the PostgreSQL server that hosts the "snap" database for its address and port.
database = "snap"
credentials = {"database": database, "user": database, "password": database}
conn = psycopg2.connect(host="postgres", **credentials)

# the cursor is closed when the with-block is left
with conn.cursor() as cur:
    cur.execute("SELECT inet_server_addr(), inet_server_port()")
    host, port = cur.fetchone()

print(f"Host: {host}")
print(f"Port: {port}")

conn.close()

Host: 192.168.64.2
Port: 5432


In [9]:
# Open the local DuckDB file and attach the PostgreSQL database "snap" as snap_DDB.
con = duckdb.connect(database="snap/snap.duckdb")
con.execute("INSTALL postgres")
con.execute("LOAD postgres")
# use the port reported by the server (fetched together with host) instead of a hard-coded 5432
con.execute(f"ATTACH 'host={host} port={port} user=postgres password=postgres dbname=snap' AS snap_DDB (TYPE postgres)")
con.execute("USE snap_DDB")

<duckdb.duckdb.DuckDBPyConnection at 0x7f52fd347a30>

In [14]:
# Show which SNAP result files fall into the slice [148:164] of the sorted list.
folder_path = 'rewritten/'
snap_files = [
    name for name in sorted(os.listdir(folder_path))
    if name.startswith('SNAP') and name.endswith('_output.json')
]
output_files = snap_files[148:164]
print(output_files)

['SNAP_patents-path07-augA2_output.json', 'SNAP_patents-path07-augA3_output.json', 'SNAP_patents-path07-augA4_output.json', 'SNAP_patents-path07-augA5_output.json', 'SNAP_patents-path07-augA6_output.json', 'SNAP_patents-path07-augA7_output.json', 'SNAP_patents-path07_output.json', 'SNAP_patents-path08-augA1_output.json', 'SNAP_patents-path08-augA2_output.json', 'SNAP_patents-path08-augA3_output.json', 'SNAP_patents-path08-augA4_output.json', 'SNAP_patents-path08-augA5_output.json', 'SNAP_patents-path08-augA6_output.json', 'SNAP_patents-path08-augA7_output.json', 'SNAP_patents-path08-augA8_output.json', 'SNAP_patents-path08_output.json']


In [16]:
# Process the SNAP files in slice [148:164] with run_query_TO.
folder_path = 'rewritten/'
snap_files = [
    name for name in sorted(os.listdir(folder_path))
    if name.startswith('SNAP') and name.endswith('_output.json')
]
output_files = snap_files[148:164]

for name in output_files:
    bench_name, query_name = name.split("_")[:2]
    run_query_TO(bench_name, query_name)

SNAP patents-path07-augA2
SNAP patents-path07-augA3
SNAP patents-path07-augA4
SNAP patents-path07-augA5
SNAP patents-path07-augA6
SNAP patents-path07-augA7
SNAP patents-path07
SNAP patents-path08-augA1
SNAP patents-path08-augA2
SNAP patents-path08-augA3
SNAP patents-path08-augA4
SNAP patents-path08-augA5
SNAP patents-path08-augA6
SNAP patents-path08-augA7
SNAP patents-path08-augA8
SNAP patents-path08


In [10]:
# Process the SNAP files in slice [164:175] with run_query_rewr.
folder_path = 'rewritten/'
snap_files = [
    name for name in sorted(os.listdir(folder_path))
    if name.startswith('SNAP') and name.endswith('_output.json')
]
output_files = snap_files[164:175]

for name in output_files:
    bench_name, query_name = name.split("_")[:2]
    run_query_rewr(bench_name, query_name)

SNAP patents-tree01-augA1
rewritten1
True False
0
1
2
3
4
SNAP patents-tree01-augA2
rewritten1
True False
0
1
2
3
4
SNAP patents-tree01-augA3
rewritten1
True False
0
1
2
3
4
SNAP patents-tree01-augA4
rewritten1
True False
0
1
2
3
4
SNAP patents-tree01
rewritten1
True False
0
1
2
3
4
SNAP patents-tree02-augA1
rewritten1
True False
0
1
2
3
4
SNAP patents-tree02-augA2
rewritten1
True False
0
1
2
3
4
SNAP patents-tree02-augA3
rewritten1
True False
0
1
2
3
4
SNAP patents-tree02-augA4
rewritten1
True False
0
1
2
3
4
SNAP patents-tree02-augA5
rewritten1
True False
0
1
2
3
4
SNAP patents-tree02
rewritten1
True False
0
1
2
3
4


In [11]:
# Process the remaining SNAP files (index 175 onwards) with run_query_TO.
folder_path = 'rewritten/'
snap_files = [
    name for name in sorted(os.listdir(folder_path))
    if name.startswith('SNAP') and name.endswith('_output.json')
]
output_files = snap_files[175:]

for name in output_files:
    bench_name, query_name = name.split("_")[:2]
    run_query_TO(bench_name, query_name)

SNAP patents-tree03-augA1
SNAP patents-tree03-augA2
SNAP patents-tree03-augA3
SNAP patents-tree03-augA4
SNAP patents-tree03-augA5
SNAP patents-tree03-augA6
SNAP patents-tree03-augA7
SNAP patents-tree03
SNAP wiki-path02-augA1
SNAP wiki-path02-augA2
SNAP wiki-path02
SNAP wiki-path03-augA1
SNAP wiki-path03-augA2
SNAP wiki-path03-augA3
SNAP wiki-path03
SNAP wiki-path04-augA1
SNAP wiki-path04-augA2
SNAP wiki-path04-augA3
SNAP wiki-path04-augA4
SNAP wiki-path04
SNAP wiki-path05-augA1
SNAP wiki-path05-augA2
SNAP wiki-path05-augA3
SNAP wiki-path05-augA4
SNAP wiki-path05-augA5
SNAP wiki-path05
SNAP wiki-path06-augA1
SNAP wiki-path06-augA2
SNAP wiki-path06-augA3
SNAP wiki-path06-augA4
SNAP wiki-path06-augA5
SNAP wiki-path06-augA6
SNAP wiki-path06
SNAP wiki-path07-augA1
SNAP wiki-path07-augA2
SNAP wiki-path07-augA3
SNAP wiki-path07-augA4
SNAP wiki-path07-augA5
SNAP wiki-path07-augA6
SNAP wiki-path07-augA7
SNAP wiki-path07
SNAP wiki-path08-augA1
SNAP wiki-path08-augA2
SNAP wiki-path08-augA3
SNAP w

In [None]:
# Run every SNAP query with run_query.
folder_path = 'rewritten/'
output_files = [
    name for name in sorted(os.listdir(folder_path))
    if name.startswith('SNAP') and name.endswith('_output.json')
]

for name in output_files:
    bench_name, query_name = name.split("_")[:2]
    run_query(bench_name, query_name)

### JOB

In [8]:
# Ask the PostgreSQL server that hosts the "imdb" database for its address and port.
database = "imdb"
credentials = {"database": database, "user": database, "password": database}
conn = psycopg2.connect(host="postgres", **credentials)

# the cursor is closed when the with-block is left
with conn.cursor() as cur:
    cur.execute("SELECT inet_server_addr(), inet_server_port()")
    host, port = cur.fetchone()

print(f"Host: {host}")
print(f"Port: {port}")

conn.close()

Host: 192.168.176.2
Port: 5432


In [9]:
# Open the local DuckDB file and attach the PostgreSQL database "imdb" as job_DDB.
con = duckdb.connect(database="job/job.duckdb")
con.execute("INSTALL postgres")
con.execute("LOAD postgres")
# use the port reported by the server (fetched together with host) instead of a hard-coded 5432
con.execute(f"ATTACH 'host={host} port={port} user=postgres password=postgres dbname=imdb' AS job_DDB (TYPE postgres)")
con.execute("USE job_DDB")

<duckdb.duckdb.DuckDBPyConnection at 0x7f551c215f70>

In [11]:
# Show which JOB result files fall into the slice [55:93] of the sorted list.
folder_path = 'rewritten/'
job_files = [
    name for name in sorted(os.listdir(folder_path))
    if name.startswith('JOB') and name.endswith('_output.json')
]
output_files = job_files[55:93]
print(output_files)

['JOB_17f-augF2-augA1_output.json', 'JOB_17f-augF2-augA2_output.json', 'JOB_17f-augF2-augA3_output.json', 'JOB_17f-augF2-augA4_output.json', 'JOB_17f-augF2-augA5_output.json', 'JOB_17f-augF2-augA6_output.json', 'JOB_17f-augF2_output.json', 'JOB_17f_output.json', 'JOB_20a-augA1_output.json', 'JOB_20a-augA2_output.json', 'JOB_20a-augA3_output.json', 'JOB_20a-augA4_output.json', 'JOB_20a-augA5_output.json', 'JOB_20a-augA6_output.json', 'JOB_20a-augA7_output.json', 'JOB_20a-augA8_output.json', 'JOB_20a-augA9_output.json', 'JOB_20a-augF1-augA1_output.json', 'JOB_20a-augF1-augA2_output.json', 'JOB_20a-augF1-augA3_output.json', 'JOB_20a-augF1-augA4_output.json', 'JOB_20a-augF1-augA5_output.json', 'JOB_20a-augF1-augA6_output.json', 'JOB_20a-augF1-augA7_output.json', 'JOB_20a-augF1-augA8_output.json', 'JOB_20a-augF1-augA9_output.json', 'JOB_20a-augF1_output.json', 'JOB_20a-augF2-augA1_output.json', 'JOB_20a-augF2-augA2_output.json', 'JOB_20a-augF2-augA3_output.json', 'JOB_20a-augF2-augA4_output

In [None]:
# Process the JOB files in slice [55:93] with run_query.
folder_path = 'rewritten/'
job_files = [
    name for name in sorted(os.listdir(folder_path))
    if name.startswith('JOB') and name.endswith('_output.json')
]
output_files = job_files[55:93]

for name in output_files:
    bench_name, query_name = name.split("_")[:2]
    run_query(bench_name, query_name)

JOB 17f-augF2-augA1
original1
rewritten1
False False
0
1
2
3
4
JOB 17f-augF2-augA2
original1
rewritten1
Query interrupted
False True
0
1
2
3
4
JOB 17f-augF2-augA3
original1
rewritten1
False False
0
1
2
3
4
JOB 17f-augF2-augA4
original1
rewritten1
Query interrupted
False True
0
1
2
3
4
JOB 17f-augF2-augA5
original1
rewritten1
False False
0
1
2
3
4
JOB 17f-augF2-augA6
original1
rewritten1
False False
0
1
2
3
4
JOB 17f-augF2
original1
rewritten1
False False
0
1
2
3
4
JOB 17f
original1
rewritten1
False False
0
1
2
3
4
JOB 20a-augA1
original1
rewritten1
False False
0
1
2
3
4
JOB 20a-augA2
original1
rewritten1
Query interrupted
False True
0
1
2
3
4
JOB 20a-augA3
original1
rewritten1
Query interrupted
False True
0
1
2
3
4
JOB 20a-augA4
original1
rewritten1
Query interrupted
False True
0
1
2
3
4
JOB 20a-augA5
original1
rewritten1
Query interrupted
False True
0
1
2
3
4
JOB 20a-augA6
original1
rewritten1
False False
0
1
2
3
4
JOB 20a-augA7
original1
rewritten1
Query interrupted
False True
0
1
2


In [None]:
# Process the JOB files in slice [55:93] with run_query2.
folder_path = 'rewritten/'
job_files = [
    name for name in sorted(os.listdir(folder_path))
    if name.startswith('JOB') and name.endswith('_output.json')
]
output_files = job_files[55:93]

for name in output_files:
    bench_name, query_name = name.split("_")[:2]
    run_query2(bench_name, query_name)

JOB 17d-augF2-augA2
rewritten1
original1
False False
0
1
2
3
4
JOB 17d-augF2-augA3
rewritten1
original1
False False
0
1
2
3
4
JOB 17d-augF2-augA4
rewritten1
original1
False False
0
1
2
3
4
JOB 17d-augF2-augA5
rewritten1
original1
False False
0
1


In [None]:
# Time only the original query of every JOB file with run_query_orig.
folder_path = 'rewritten/'
output_files = [
    name for name in sorted(os.listdir(folder_path))
    if name.startswith('JOB') and name.endswith('_output.json')
]

for name in output_files:
    bench_name, query_name = name.split("_")[:2]
    run_query_orig(bench_name, query_name)

In [14]:
# Show which JOB result files remain from index 93 onwards in the sorted list.
folder_path = 'rewritten/'
job_files = [
    name for name in sorted(os.listdir(folder_path))
    if name.startswith('JOB') and name.endswith('_output.json')
]
output_files = job_files[93:]
print(output_files)

['JOB_20b-augA1_output.json', 'JOB_20b-augA2_output.json', 'JOB_20b-augA3_output.json', 'JOB_20b-augA4_output.json', 'JOB_20b-augA5_output.json', 'JOB_20b-augA6_output.json', 'JOB_20b-augA7_output.json', 'JOB_20b-augA8_output.json', 'JOB_20b-augA9_output.json', 'JOB_20b-augF1-augA1_output.json', 'JOB_20b-augF1-augA2_output.json', 'JOB_20b-augF1-augA3_output.json', 'JOB_20b-augF1-augA4_output.json', 'JOB_20b-augF1-augA5_output.json', 'JOB_20b-augF1-augA6_output.json', 'JOB_20b-augF1-augA7_output.json', 'JOB_20b-augF1-augA8_output.json', 'JOB_20b-augF1-augA9_output.json', 'JOB_20b-augF1_output.json', 'JOB_20b-augF2-augA1_output.json', 'JOB_20b-augF2-augA2_output.json', 'JOB_20b-augF2-augA3_output.json', 'JOB_20b-augF2-augA4_output.json', 'JOB_20b-augF2-augA5_output.json', 'JOB_20b-augF2-augA6_output.json', 'JOB_20b-augF2-augA7_output.json', 'JOB_20b-augF2-augA8_output.json', 'JOB_20b-augF2-augA9_output.json', 'JOB_20b-augF2_output.json', 'JOB_20b_output.json', 'JOB_2a-augA1_output.json',

In [None]:
# Process the remaining JOB files (index 93 onwards) with run_query.
folder_path = 'rewritten/'
job_files = [
    name for name in sorted(os.listdir(folder_path))
    if name.startswith('JOB') and name.endswith('_output.json')
]
output_files = job_files[93:]

for name in output_files:
    bench_name, query_name = name.split("_")[:2]
    run_query(bench_name, query_name)

JOB 20b-augA1
original1
rewritten1
False False
0
1
2
3
4
JOB 20b-augA2
original1
rewritten1
False False
0
1
2
3
4
JOB 20b-augA3
original1
rewritten1
False False
0
1
2
3
4
JOB 20b-augA4
original1
rewritten1
False False
0
1
2
3
4
JOB 20b-augA5
original1
rewritten1
False False
0
1
2
3
4
JOB 20b-augA6
original1
rewritten1
False False
0
1
2
3
4
JOB 20b-augA7
original1
rewritten1
False False
0
1
2
3
4
JOB 20b-augA8
original1
rewritten1
False False
0
1
2
3
4
JOB 20b-augA9
original1
rewritten1
False False
0
1
2
3
4
JOB 20b-augF1-augA1
original1
rewritten1
False False
0
1
2
3
4
JOB 20b-augF1-augA2
original1
rewritten1
False False
0
1
2
3
4
JOB 20b-augF1-augA3
original1
rewritten1
False False
0
1
2
3
4
JOB 20b-augF1-augA4
original1
rewritten1
False False
0
1
2
3
4
JOB 20b-augF1-augA5
original1
rewritten1
False False
0
1
2
3
4
JOB 20b-augF1-augA6
original1
rewritten1
False False
0
1
2
3
4
JOB 20b-augF1-augA7
original1
rewritten1
False False
0
1
2
3
4
JOB 20b-augF1-augA8
original1
rewritten1
False

In [None]:
# Run every query whose result file name starts with "IMDB" with run_query.
folder_path = 'rewritten/'
output_files = [
    name for name in sorted(os.listdir(folder_path))
    if name.startswith('IMDB') and name.endswith('_output.json')
]

for name in output_files:
    bench_name, query_name = name.split("_")[:2]
    run_query(bench_name, query_name)

### LSQB

In [None]:
# Ask the PostgreSQL server that hosts the "lsqb" database for its address and port.
database = "lsqb"
credentials = {"database": database, "user": database, "password": database}
conn = psycopg2.connect(host="postgres", **credentials)

# the cursor is closed when the with-block is left
with conn.cursor() as cur:
    cur.execute("SELECT inet_server_addr(), inet_server_port()")
    host, port = cur.fetchone()

print(f"Host: {host}")
print(f"Port: {port}")

conn.close()

In [None]:
# Open the local DuckDB file and attach the PostgreSQL database "lsqb" as lsqb_DDB.
con = duckdb.connect(database="lsqb/lsqb.duckdb")
con.execute("INSTALL postgres")
con.execute("LOAD postgres")
# use the port reported by the server (fetched together with host) instead of a hard-coded 5432
con.execute(f"ATTACH 'host={host} port={port} user=postgres password=postgres dbname=lsqb' AS lsqb_DDB (TYPE postgres)")
con.execute("USE lsqb_DDB")

In [None]:
# Run every LSQB query with run_query.
folder_path = 'rewritten/'
output_files = [
    name for name in sorted(os.listdir(folder_path))
    if name.startswith('LSQB') and name.endswith('_output.json')
]

for name in output_files:
    bench_name, query_name = name.split("_")[:2]
    run_query(bench_name, query_name)

### HETIO

In [None]:
# Ask the PostgreSQL server that hosts the "hetio" database for its address and port.
database = "hetio"
credentials = {"database": database, "user": database, "password": database}
conn = psycopg2.connect(host="postgres", **credentials)

# the cursor is closed when the with-block is left
with conn.cursor() as cur:
    cur.execute("SELECT inet_server_addr(), inet_server_port()")
    host, port = cur.fetchone()

print(f"Host: {host}")
print(f"Port: {port}")

conn.close()

In [None]:
# Open the local DuckDB file and attach the PostgreSQL database "hetio" as hetio_DDB.
con = duckdb.connect(database="hetio/hetio.duckdb")
con.execute("INSTALL postgres")
con.execute("LOAD postgres")
# use the port reported by the server (fetched together with host) instead of a hard-coded 5432
con.execute(f"ATTACH 'host={host} port={port} user=postgres password=postgres dbname=hetio' AS hetio_DDB (TYPE postgres)")
con.execute("USE hetio_DDB")

In [None]:
# Run every HETIO query with run_query.
folder_path = 'rewritten/'
output_files = [
    name for name in sorted(os.listdir(folder_path))
    if name.startswith('HETIO') and name.endswith('_output.json')
]

for name in output_files:
    bench_name, query_name = name.split("_")[:2]
    run_query(bench_name, query_name)