# Running queries using the JSON files produced by the Scala code

Install and import all needed packages

In [1]:
%%bash
pip install psycopg2-binary
pip install numpy
pip install pandas













In [2]:
import json
import time
import psycopg2
import numpy as np
import csv
import multiprocessing
import signal
import pandas as pd
import os

Function for running one query. This means:
*  run the original query 5 times (after one initial warm-up run, which is not measured)
*  run the rewritten queries 5 times (after one initial warm-up run, which is not measured) and drop the created tables after each run
*  take the runtimes and calculate the mean, median and standard deviation separately for the original and the rewritten query
*  compare the runtimes of the original query, the rewritten query, and the rewritten query plus the rewriting time (how long the Scala rewriting took)
*  append everything as one row to the CSV output file

In [3]:
def handler_orig(signum, frame):
    """Signal handler used as the timeout hook for the original query.

    Marks the timeout in the module-level ``timeout_flag_orig`` and then
    raises an ``Exception`` so that the code that was running is aborted.
    ``signum`` and ``frame`` are the standard signal-handler arguments and
    are not inspected.
    """
    global timeout_flag_orig
    timeout_message = "Query execution of the original query > 100s"
    timeout_flag_orig = True
    raise Exception(timeout_message)

def handler_rewr(signum, frame):
    """Signal handler used as the timeout hook for the rewritten queries.

    Marks the timeout in the module-level ``timeout_flag_rewr`` and then
    raises an ``Exception`` so that the code that was running is aborted.
    ``signum`` and ``frame`` are the standard signal-handler arguments and
    are not inspected.
    """
    global timeout_flag_rewr
    timeout_message = "Query execution of the rewritten query > 100s"
    timeout_flag_rewr = True
    raise Exception(timeout_message)
    
def _execute_timed(cur, statements):
    """Execute every statement in ``statements`` on ``cur`` and return the elapsed seconds."""
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way a time.time() difference can.
    started = time.perf_counter()
    for statement in statements:
        cur.execute(statement)
    return time.perf_counter() - started


def _runtime_stats(runtimes):
    """Return ``(mean, median, standard deviation)`` of the measured runtimes."""
    return np.mean(runtimes), np.median(runtimes), np.std(runtimes)


def _rollback_after_timeout(conn):
    """Roll back ``conn`` so later statements do not run inside a failed transaction."""
    try:
        conn.rollback()
    except psycopg2.Error as exc:
        # Best effort only: report the problem and let the caller carry on.
        print(exc)


def run_query(benchmark, query):
    """Time the original and the rewritten version of one query and log the result.

    Reads ``rewritten/{benchmark}_{query}_output.json`` (keys
    ``original_query``, ``rewritten_query`` and ``time``) and
    ``rewritten/{benchmark}_{query}_drop.json`` (key ``rewritten_query``,
    the statements that drop the created tables). Both versions get one
    warm-up run guarded by a 100 second alarm. Every version that did not
    time out is then executed 5 times, and one row with the runtimes, their
    mean/median/standard deviation and the comparison verdicts is appended to
    ``results/POS_Scala_comparison_TO_augment.csv``. A version that timed out
    is recorded as ``"TO"``.

    Args:
        benchmark: Benchmark name used as file-name prefix. ``"JOB"`` is run
            against the ``imdb`` database; every other benchmark uses its
            lower-cased name as database, user and password.
        query: Query identifier used in the file names.

    Raises:
        Exception: Any warm-up failure that is not a timeout is re-raised
            right away instead of being masked by a follow-up error; errors
            from the timed runs propagate unchanged. No CSV row is written in
            either case.

    NOTE(review): Python-level signal handlers only run once the interpreter
    regains control. Confirm that psycopg2 hands control back during a
    long-running ``execute()``; otherwise the timeout is only noticed after
    the statement has finished.
    """
    global timeout_flag_orig
    global timeout_flag_rewr

    print(benchmark, query)
    file_path = f'rewritten/{benchmark}_{query}_output.json'
    with open(file_path, 'r') as file:
        json_data = json.load(file)

    original_query = json_data["original_query"]
    rewritten_query_list = json_data["rewritten_query"]
    rewriting_time = json_data["time"]

    file_path_drop = f'rewritten/{benchmark}_{query}_drop.json'
    with open(file_path_drop, 'r') as file:
        json_drop = json.load(file)
    drop_query_list = json_drop["rewritten_query"]

    if benchmark == "JOB":
        database = "imdb"
    else:
        database = benchmark.lower()

    conn = psycopg2.connect(
        host="postgres",
        database=database,
        user=database,
        password=database
    )
    try:
        cur = conn.cursor()

        # a warm-up run that takes longer than 100s counts as a timeout (TO)
        timeout_flag_orig = False
        timeout_flag_rewr = False

        print("original1")
        # the first run is just a warm up run and to check for the time out
        result = None
        signal.signal(signal.SIGALRM, handler_orig)
        signal.alarm(100)
        try:
            try:
                cur.execute(original_query)
                result = cur.fetchall()
            finally:
                # always disarm the alarm so it cannot fire outside this guarded region
                signal.alarm(0)
        except Exception as exc:
            print(exc)
            if not timeout_flag_orig:
                # a genuine failure, not a timeout: surface it right away
                raise
            _rollback_after_timeout(conn)

        print("rewritten1")
        # stays None when none of the rewritten statements is a SELECT
        result1 = None
        signal.signal(signal.SIGALRM, handler_rewr)
        signal.alarm(100)
        try:
            try:
                for rewritten_query in rewritten_query_list:
                    cur.execute(rewritten_query)
                    if rewritten_query.startswith("SELECT"):
                        result1 = cur.fetchall()
                for drop_query in drop_query_list:
                    cur.execute(drop_query)
            finally:
                signal.alarm(0)
        except Exception as exc:
            print(exc)
            if not timeout_flag_rewr:
                raise
            # discards the tables created before the timeout hit
            _rollback_after_timeout(conn)

        print(timeout_flag_orig, timeout_flag_rewr)
        # original and rewritten query are TOs
        if timeout_flag_orig and timeout_flag_rewr:
            orig_mean = "TO"
            orig_med = "TO"
            orig_std = "-"
            list_original = ["-"] * 5

            rewr_mean = "TO"
            rewr_med = "TO"
            rewr_std = "-"
            rewr_mean_plus_rewr = "TO"
            rewr_med_plus_rewr = "TO"
            list_rewritten = ["-"] * 5

            orig_or_rewr_mean = "-"
            orig_or_rewr_or_equal = "-"
            orig_or_rewr_plus_rewr_mean = "-"

        # original query is a TO and the rewritten not
        elif timeout_flag_orig:
            orig_mean = "TO"
            orig_med = "TO"
            orig_std = "-"
            list_original = ["-"] * 5

            list_rewritten = []
            print("rewritten")
            for i in range(5):
                print(i)
                list_rewritten.append(_execute_timed(cur, rewritten_query_list))
                # drop all created tables (not part of the measured time)
                for drop_query in drop_query_list:
                    cur.execute(drop_query)
            rewr_mean, rewr_med, rewr_std = _runtime_stats(list_rewritten)
            rewr_mean_plus_rewr = rewr_mean + rewriting_time
            rewr_med_plus_rewr = rewr_med + rewriting_time

            orig_or_rewr_mean = "rewr"
            orig_or_rewr_or_equal = "rewr"
            orig_or_rewr_plus_rewr_mean = "rewr"

        # rewritten query is a TO and the original not
        elif timeout_flag_rewr:
            list_original = []
            print("orig")
            for i in range(5):
                list_original.append(_execute_timed(cur, [original_query]))
            orig_mean, orig_med, orig_std = _runtime_stats(list_original)

            rewr_mean = "TO"
            rewr_med = "TO"
            rewr_std = "-"
            rewr_mean_plus_rewr = "TO"
            rewr_med_plus_rewr = "TO"
            list_rewritten = ["-"] * 5

            orig_or_rewr_mean = "orig"
            orig_or_rewr_or_equal = "orig"
            orig_or_rewr_plus_rewr_mean = "orig"

        # both queries are no TOs
        else:
            print(result, result1)
            list_original = []
            list_rewritten = []
            # take times for 5 runs (run 2-6) for the original query and the rewritten query
            print("orig+rewr")
            for i in range(5):
                print(i)
                list_original.append(_execute_timed(cur, [original_query]))
                list_rewritten.append(_execute_timed(cur, rewritten_query_list))
                # drop all created tables (not part of the measured time)
                for drop_query in drop_query_list:
                    cur.execute(drop_query)

            orig_mean, orig_med, orig_std = _runtime_stats(list_original)
            rewr_mean, rewr_med, rewr_std = _runtime_stats(list_rewritten)
            rewr_mean_plus_rewr = rewr_mean + rewriting_time
            rewr_med_plus_rewr = rewr_med + rewriting_time

            orig_or_rewr_mean = "rewr" if orig_mean > rewr_mean else "orig"
            # means closer than 0.05 are reported as a tie
            if abs(rewr_mean - orig_mean) < 0.05:
                orig_or_rewr_or_equal = "equal 0.05"
            else:
                orig_or_rewr_or_equal = orig_or_rewr_mean
            orig_or_rewr_plus_rewr_mean = "rewr" if orig_mean > rewr_mean_plus_rewr else "orig"
    finally:
        # release the database connection on success and on failure alike
        conn.close()

    list_output = [benchmark, query] + [orig_mean, rewr_mean, rewr_mean_plus_rewr, orig_or_rewr_mean, orig_or_rewr_or_equal, \
                                        orig_or_rewr_plus_rewr_mean, rewriting_time] + \
                    list_original + [orig_med, orig_std] + list_rewritten + [rewr_med, rewr_std, rewr_med_plus_rewr]
    file_path = "results/POS_Scala_comparison_TO_augment.csv"
    with open(file_path, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(list_output)

In [4]:
def run_query_rewritten(benchmark, query):
    """Time only the rewritten version of one query and log the result.

    Reads ``rewritten/{benchmark}_{query}_output.json`` (keys
    ``rewritten_query`` and ``time``) and
    ``rewritten/{benchmark}_{query}_drop.json`` (key ``rewritten_query``,
    the statements that drop the created tables). The rewritten statements
    get one warm-up run guarded by a 100 second alarm and, unless that run
    timed out, 5 measured runs. The original query is never executed here:
    its columns are always written as ``"TO"`` / ``"-"``. One row is appended
    to ``results/POS_Scala_comparison_TO_augment.csv``.

    Args:
        benchmark: Benchmark name used as file-name prefix. ``"JOB"`` is run
            against the ``imdb`` database; every other benchmark uses its
            lower-cased name as database, user and password.
        query: Query identifier used in the file names.

    Raises:
        Exception: A warm-up failure that is not a timeout is re-raised right
            away instead of resurfacing later as a failed-transaction error;
            errors from the measured runs propagate unchanged. No CSV row is
            written in either case.
    """
    global timeout_flag_rewr

    file_path = f'rewritten/{benchmark}_{query}_output.json'
    with open(file_path, 'r') as file:
        json_data = json.load(file)

    print(query)

    rewritten_query_list = json_data["rewritten_query"]
    rewriting_time = json_data["time"]

    file_path_drop = f'rewritten/{benchmark}_{query}_drop.json'
    with open(file_path_drop, 'r') as file:
        json_drop = json.load(file)
    drop_query_list = json_drop["rewritten_query"]

    if benchmark == "JOB":
        database = "imdb"
    else:
        database = benchmark.lower()

    conn = psycopg2.connect(
        host="postgres",
        database=database,
        user=database,
        password=database
    )
    try:
        cur = conn.cursor()

        # a warm-up run that takes longer than 100s counts as a timeout (TO)
        timeout_flag_rewr = False

        signal.signal(signal.SIGALRM, handler_rewr)
        signal.alarm(100)
        try:
            try:
                for rewritten_query in rewritten_query_list:
                    cur.execute(rewritten_query)
                    if rewritten_query.startswith("SELECT"):
                        # consume the result set; its content is not needed
                        cur.fetchall()
                for drop_query in drop_query_list:
                    cur.execute(drop_query)
            finally:
                # always disarm the alarm so it cannot fire outside this guarded region
                signal.alarm(0)
        except Exception as exc:
            print(exc)
            if not timeout_flag_rewr:
                # a genuine failure, not a timeout: surface it right away
                raise

        # rewritten query is a TO
        if timeout_flag_rewr:
            rewr_mean = "TO"
            rewr_med = "TO"
            rewr_std = "-"
            rewr_mean_plus_rewr = "TO"
            rewr_med_plus_rewr = "TO"
            list_rewritten = ["-"] * 5

            orig_or_rewr_mean = "-"
            orig_or_rewr_or_equal = "-"
            orig_or_rewr_plus_rewr_mean = "-"

        # rewritten query is not a TO
        else:
            list_rewritten = []
            # take times for 5 runs (run 2-6) of the rewritten query
            for i in range(5):
                print(i)
                # perf_counter() is monotonic, so the interval cannot be
                # skewed by system clock adjustments
                start_time_rewritten = time.perf_counter()
                for rewritten_query in rewritten_query_list:
                    cur.execute(rewritten_query)
                rewritten_time = time.perf_counter() - start_time_rewritten
                list_rewritten.append(rewritten_time)

                # drop all created tables (not part of the measured time)
                for drop_query in drop_query_list:
                    cur.execute(drop_query)

            rewr_mean = np.mean(list_rewritten)
            rewr_med = np.median(list_rewritten)
            rewr_std = np.std(list_rewritten)
            rewr_mean_plus_rewr = rewr_mean + rewriting_time
            rewr_med_plus_rewr = rewr_med + rewriting_time
            orig_or_rewr_mean = "rewr"
            orig_or_rewr_or_equal = "rewr"
            orig_or_rewr_plus_rewr_mean = "rewr"
    finally:
        # release the database connection on success and on failure alike
        conn.close()

    # the original query is not executed by this function; record it as a timeout
    orig_mean = "TO"
    orig_med = "TO"
    orig_std = "-"
    list_original = ["-"] * 5

    list_output = [benchmark, query] + [orig_mean, rewr_mean, rewr_mean_plus_rewr, orig_or_rewr_mean, orig_or_rewr_or_equal, \
                                        orig_or_rewr_plus_rewr_mean, rewriting_time] + \
                    list_original + [orig_med, orig_std] + list_rewritten + [rewr_med, rewr_std, rewr_med_plus_rewr]
    file_path = "results/POS_Scala_comparison_TO_augment.csv"
    with open(file_path, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(list_output)

Create the output CSV file with its header row. The runtimes of each query are appended to this file afterwards.

In [5]:
# Start a fresh results file that contains only the header row. run_query()
# and run_query_rewritten() append one row per query to this same path, with
# the values in the same order as the column names below.
file_path = "results/POS_Scala_comparison_TO_augment.csv"

names = ["bench", "query", "orig mean", "rewr mean", "rewr mean+rewr", "orig/rewr(mean)", "orig/rewr/equal", "orig/rewr+rewr(mean)", "rewriting", 
         "orig 1", "orig 2", "orig 3", "orig 4", "orig 5", "orig med", "orig_std", "rewr 1", "rewr 2", "rewr 3", "rewr 4", "rewr 5", "rewr med", 
         "rewr_std", "rewr med+rewr", ]

# Create the results directory if it is missing so that open() below does not
# fail with FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

with open(file_path, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(names)

### STATS

In [6]:
# STATS, first batch: benchmark every STATS query whose leading number is at most 50.
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = list(filter(
    lambda name: name.startswith('STATS') and name.endswith('_output.json'),
    files,
))

for file in output_files:
    # the parts before the first two underscores are the benchmark and the query id
    file_split = file.split("_")
    number = int(file_split[1].split("-")[0])
    if number > 50:
        continue
    run_query(file_split[0], file_split[1])

STATS 001-014-augA1
original1
rewritten1
False False
[(19294,)] [(19294,)]
orig+rewr
0
1
2
3
4
STATS 001-014-augA2
original1
rewritten1
False False
[(129735,)] [(129735,)]
orig+rewr
0
1
2
3
4
STATS 001-014-augF1-augA1
original1
rewritten1
False False
[(19294,)] [(19294,)]
orig+rewr
0
1
2
3
4
STATS 001-014-augF1-augA2
original1
rewritten1
False False
[(129735,)] [(129735,)]
orig+rewr
0
1
2
3
4
STATS 001-014-augF1
original1
rewritten1
False False
[(3934,)] [(3934,)]
orig+rewr
0
1
2
3
4
STATS 001-014-augF2-augA1
original1
rewritten1
False False
[(19294,)] [(19294,)]
orig+rewr
0
1
2
3
4
STATS 001-014-augF2-augA2
original1
rewritten1
False False
[(129735,)] [(129735,)]
orig+rewr
0
1
2
3
4
STATS 001-014-augF2
original1
rewritten1
False False
[(1841,)] [(1841,)]
orig+rewr
0
1
2
3
4
STATS 001-014
original1
rewritten1
False False
[(1841,)] [(1841,)]
orig+rewr
0
1
2
3
4
STATS 002-048-augA1
original1
rewritten1
False False
[(99864,)] [(99864,)]
orig+rewr
0
1
2
3
4
STATS 002-048-augA2
original1
re

In [7]:
# STATS, second batch: benchmark every STATS query whose leading number is in 51..100.
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = list(filter(
    lambda name: name.startswith('STATS') and name.endswith('_output.json'),
    files,
))

for file in output_files:
    # the parts before the first two underscores are the benchmark and the query id
    file_split = file.split("_")
    number = int(file_split[1].split("-")[0])
    if not (50 < number <= 100):
        continue
    run_query(file_split[0], file_split[1])

STATS 051-090-augA1
original1
rewritten1
False False
[(5011,)] [(5011,)]
orig+rewr
0
1
2
3
4
STATS 051-090-augA2
original1
rewritten1
False False
[(2170,)] [(2170,)]
orig+rewr
0
1
2
3
4
STATS 051-090-augF1-augA1
original1
rewritten1
False False
[(5011,)] [(5011,)]
orig+rewr
0
1
2
3
4
STATS 051-090-augF1-augA2
original1
rewritten1
False False
[(2170,)] [(2170,)]
orig+rewr
0
1
2
3
4
STATS 051-090-augF1
original1
rewritten1
False False
[(30877,)] [(30877,)]
orig+rewr
0
1
2
3
4
STATS 051-090-augF2-augA1
original1
rewritten1
False False
[(5011,)] [(5011,)]
orig+rewr
0
1
2
3
4
STATS 051-090-augF2-augA2
original1
rewritten1
False False
[(2170,)] [(2170,)]
orig+rewr
0
1
2
3
4
STATS 051-090-augF2
original1
rewritten1
False False
[(30877,)] [(30877,)]
orig+rewr
0
1
2
3
4
STATS 051-090
original1
rewritten1
False False
[(30877,)] [(30877,)]
orig+rewr
0
1
2
3
4
STATS 052-029-augA1
original1
rewritten1
False False
[(4972,)] [(4972,)]
orig+rewr
0
1
2
3
4
STATS 052-029-augA2
original1
rewritten1
False

In [8]:
# STATS, third batch: benchmark every STATS query whose leading number is above 100.
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = list(filter(
    lambda name: name.startswith('STATS') and name.endswith('_output.json'),
    files,
))

for file in output_files:
    # the parts before the first two underscores are the benchmark and the query id
    file_split = file.split("_")
    number = int(file_split[1].split("-")[0])
    if number <= 100:
        continue
    run_query(file_split[0], file_split[1])

STATS 101-043-augA1
original1
rewritten1
False False
[(2,)] [(2,)]
orig+rewr
0
1
2
3
4
STATS 101-043-augA2
original1
rewritten1
False False
[(6,)] [(6,)]
orig+rewr
0
1
2
3
4
STATS 101-043-augA3
original1
rewritten1
False False
[(2,)] [(2,)]
orig+rewr
0
1
2
3
4
STATS 101-043-augF1-augA1
original1
rewritten1
False False
[(2,)] [(2,)]
orig+rewr
0
1
2
3
4
STATS 101-043-augF1-augA2
original1
rewritten1
False False
[(6,)] [(6,)]
orig+rewr
0
1
2
3
4
STATS 101-043-augF1-augA3
original1
rewritten1
False False
[(2,)] [(2,)]
orig+rewr
0
1
2
3
4
STATS 101-043-augF1
original1
rewritten1
False False
[(6,)] [(6,)]
orig+rewr
0
1
2
3
4
STATS 101-043-augF2-augA1
original1
rewritten1
False False
[(2,)] [(2,)]
orig+rewr
0
1
2
3
4
STATS 101-043-augF2-augA2
original1
rewritten1
False False
[(6,)] [(6,)]
orig+rewr
0
1
2
3
4
STATS 101-043-augF2-augA3
original1
rewritten1
False False
[(2,)] [(2,)]
orig+rewr
0
1
2
3
4
STATS 101-043-augF2
original1
rewritten1
False False
[(6,)] [(6,)]
orig+rewr
0
1
2
3
4
STATS 1

### SNAP

In [None]:
# SNAP: run the full original-vs-rewritten comparison for the first 16 SNAP
# files (indices 0-15 of the sorted listing).
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = list(filter(
    lambda name: name.startswith('SNAP') and name.endswith('_output.json'),
    files,
))
output_files = output_files[0:16]

for file in output_files:
    file_split = file.split("_")
    # first part: benchmark name, second part: query id
    run_query(*file_split[:2])

SNAP dblp-path02-augA1
original1
rewritten1
False False
[(1,)] [(1,)]
orig+rewr
0
1
2
3
4
SNAP dblp-path02-augA2
original1
rewritten1
False False
[(2,)] [(2,)]
orig+rewr
0
1
2
3
4
SNAP dblp-path02
original1
rewritten1
False False
[(1,)] [(1,)]
orig+rewr
0
1
2
3
4
SNAP dblp-path03-augA1
original1
rewritten1
False False
[(1,)] [(1,)]
orig+rewr
0
1
2
3
4
SNAP dblp-path03-augA2
original1
rewritten1
False False
[(2,)] [(2,)]
orig+rewr
0
1
2
3
4
SNAP dblp-path03-augA3
original1
rewritten1
False False
[(320,)] [(320,)]
orig+rewr
0
1
2
3
4
SNAP dblp-path03
original1
rewritten1
False False
[(1,)] [(1,)]
orig+rewr
0
1
2
3
4
SNAP dblp-path04-augA1
original1
Query execution of the original query > 100s
rewritten1
True False
rewritten
0
1
2
3
4
SNAP dblp-path04-augA2
original1
Query execution of the original query > 100s
rewritten1
True False
rewritten
0
1
2
3
4
SNAP dblp-path04-augA3
original1
Query execution of the original query > 100s
rewritten1
True False
rewritten
0
1
2
3
4
SNAP dblp-path04-a

In [5]:
# SNAP files 17-41 of the sorted listing: only the rewritten queries are timed
# (run_query_rewritten records the original query as "TO" without running it).
# NOTE(review): the preceding SNAP cell stops before index 16 and this one
# starts at index 17, so the file at index 16 is not run by either cell.
# Confirm that this gap is intentional.
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = list(filter(
    lambda name: name.startswith('SNAP') and name.endswith('_output.json'),
    files,
))
output_files = output_files[17:42]

for file in output_files:
    file_split = file.split("_")
    # first part: benchmark name, second part: query id
    run_query_rewritten(*file_split[:2])

dblp-path05
0
1
2
3
4
dblp-path06-augA1
0
1
2
3
4
dblp-path06-augA2
0
1
2
3
4
dblp-path06-augA3
0
1
2
3
4
dblp-path06-augA4
0
1
2
3
4
dblp-path06-augA5
0
1
2
3
4
dblp-path06-augA6
0
1
2
3
4
dblp-path06
0
1
2
3
4
dblp-path07-augA1
0
1
2
3
4
dblp-path07-augA2
0
1
2
3
4
dblp-path07-augA3
0
1
2
3
4
dblp-path07-augA4
0
1
2
3
4
dblp-path07-augA5
0
1
2
3
4
dblp-path07-augA6
0
1
2
3
4
dblp-path07-augA7
0
1
2
3
4
dblp-path07
0
1
2
3
4
dblp-path08-augA1
0
1
2
3
4
dblp-path08-augA2
0
1
2
3
4
dblp-path08-augA3
0
1
2
3
4
dblp-path08-augA4
0
1
2
3
4
dblp-path08-augA5
0
1
2
3
4
dblp-path08-augA6
0
1
2
3
4
dblp-path08-augA7
0
1
2
3
4
dblp-path08-augA8
0
1
2
3
4
dblp-path08
0
1
2
3
4


In [5]:
# SNAP files 42-60 of the sorted listing: only the rewritten queries are timed
# (run_query_rewritten records the original query as "TO" without running it).
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = list(filter(
    lambda name: name.startswith('SNAP') and name.endswith('_output.json'),
    files,
))
output_files = output_files[42:61]

for file in output_files:
    file_split = file.split("_")
    # first part: benchmark name, second part: query id
    run_query_rewritten(*file_split[:2])

dblp-tree01-augA1
0
1
2
3
4
dblp-tree01-augA2
0
1
2
3
4
dblp-tree01-augA3
0
1
2
3
4
dblp-tree01-augA4
0
1
2
3
4
dblp-tree01
0
1
2
3
4
dblp-tree02-augA1
0
1
2
3
4
dblp-tree02-augA2
0
1
2
3
4
dblp-tree02-augA3
0
1
2
3
4
dblp-tree02-augA4
0
1
2
3
4
dblp-tree02-augA5
0
1
2
3
4
dblp-tree02
0
1
2
3
4
dblp-tree03-augA1
0
1
2
3
4
dblp-tree03-augA2
0
1
2
3
4
dblp-tree03-augA3
0
1
2
3
4
dblp-tree03-augA4
0
1
2
3
4
dblp-tree03-augA5
0
1
2
3
4
dblp-tree03-augA6
0
1
2
3
4
dblp-tree03-augA7
0
1
2
3
4
dblp-tree03
0
1
2
3
4


In [5]:
# SNAP files 61-63 of the sorted listing: full original-vs-rewritten comparison.
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = list(filter(
    lambda name: name.startswith('SNAP') and name.endswith('_output.json'),
    files,
))
output_files = output_files[61:64]

for file in output_files:
    file_split = file.split("_")
    # first part: benchmark name, second part: query id
    run_query(*file_split[:2])

SNAP google-path02-augA1
original1
rewritten1
False False
[(0,)] [(0,)]
orig+rewr
0
1
2
3
4
SNAP google-path02-augA2
original1
rewritten1
False False
[(0,)] [(0,)]
orig+rewr
0
1
2
3
4
SNAP google-path02
original1
rewritten1
False False
[(0,)] [(0,)]
orig+rewr
0
1
2
3
4


In [6]:
# SNAP files 64-121 of the sorted listing: only the rewritten queries are timed
# (run_query_rewritten records the original query as "TO" without running it).
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = list(filter(
    lambda name: name.startswith('SNAP') and name.endswith('_output.json'),
    files,
))
output_files = output_files[64:122]

for file in output_files:
    file_split = file.split("_")
    # first part: benchmark name, second part: query id
    run_query_rewritten(*file_split[:2])

google-path03-augA1
0
1
2
3
4
google-path03-augA2
0
1
2
3
4
google-path03-augA3
0
1
2
3
4
google-path03
0
1
2
3
4
google-path04-augA1
0
1
2
3
4
google-path04-augA2
0
1
2
3
4
google-path04-augA3
0
1
2
3
4
google-path04-augA4
0
1
2
3
4
google-path04
0
1
2
3
4
google-path05-augA1
0
1
2
3
4
google-path05-augA2
0
1
2
3
4
google-path05-augA3
0
1
2
3
4
google-path05-augA4
0
1
2
3
4
google-path05-augA5
0
1
2
3
4
google-path05
0
1
2
3
4
google-path06-augA1
0
1
2
3
4
google-path06-augA2
0
1
2
3
4
google-path06-augA3
0
1
2
3
4
google-path06-augA4
0
1
2
3
4
google-path06-augA5
0
1
2
3
4
google-path06-augA6
0
1
2
3
4
google-path06
0
1
2
3
4
google-path07-augA1
0
1
2
3
4
google-path07-augA2
0
1
2
3
4
google-path07-augA3
0
1
2
3
4
google-path07-augA4
0
1
2
3
4
google-path07-augA5
0
1
2
3
4
google-path07-augA6
0
1
2
3
4
google-path07-augA7
0
1
2
3
4
google-path07
0
1
2
3
4
google-path08-augA1
0
1
2
3
4
google-path08-augA2
0
1
2
3
4
google-path08-augA3
0
1
2
3
4
google-path08-augA4
0
1
2
3
4
google-pat

In [7]:
# SNAP files 122-128 of the sorted listing: full original-vs-rewritten comparison.
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = list(filter(
    lambda name: name.startswith('SNAP') and name.endswith('_output.json'),
    files,
))
output_files = output_files[122:129]

for file in output_files:
    file_split = file.split("_")
    # first part: benchmark name, second part: query id
    run_query(*file_split[:2])

SNAP patents-path02-augA1
original1
rewritten1
False False
[(3859263,)] [(3859263,)]
orig+rewr
0
1
2
3
4
SNAP patents-path02-augA2
original1
rewritten1
False False
[(3858242,)] [(3858242,)]
orig+rewr
0
1
2
3
4
SNAP patents-path02
original1
rewritten1
False False
[(3859263,)] [(3859263,)]
orig+rewr
0
1
2
3
4
SNAP patents-path03-augA1
original1
rewritten1
False False
[(3866689,)] [(3866689,)]
orig+rewr
0
1
2
3
4
SNAP patents-path03-augA2
original1
rewritten1
False False
[(3859263,)] [(3859263,)]
orig+rewr
0
1
2
3
4
SNAP patents-path03-augA3
original1
rewritten1
False False
[(3858242,)] [(3858242,)]
orig+rewr
0
1
2
3
4
SNAP patents-path03
original1
rewritten1
False False
[(3866689,)] [(3866689,)]
orig+rewr
0
1
2
3
4


In [8]:
# Remaining SNAP files (index 129 onwards): only the rewritten queries are timed
# (run_query_rewritten records the original query as "TO" without running it).
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = list(filter(
    lambda name: name.startswith('SNAP') and name.endswith('_output.json'),
    files,
))
output_files = output_files[129:]

for file in output_files:
    file_split = file.split("_")
    # first part: benchmark name, second part: query id
    run_query_rewritten(*file_split[:2])

patents-path04-augA1
0
1
2
3
4
patents-path04-augA2
0
1
2
3
4
patents-path04-augA3
0
1
2
3
4
patents-path04-augA4
0
1
2
3
4
patents-path04
0
1
2
3
4
patents-path05-augA1
0
1
2
3
4
patents-path05-augA2
0
1
2
3
4
patents-path05-augA3
0
1
2
3
4
patents-path05-augA4
0
1
2
3
4
patents-path05-augA5
0
1
2
3
4
patents-path05
0
1
2
3
4
patents-path06-augA1
0
1
2
3
4
patents-path06-augA2
0
1
2
3
4
patents-path06-augA3
0
1
2
3
4
patents-path06-augA4
0
1
2
3
4
patents-path06-augA5
0
1
2
3
4
patents-path06-augA6
0
1
2
3
4
patents-path06
0
1
2
3
4
patents-path07-augA1
0
1
2
3
4
patents-path07-augA2
0
1
2
3
4
patents-path07-augA3
0
1
2
3
4
patents-path07-augA4
0
1
2
3
4
patents-path07-augA5
0
1
2
3
4
patents-path07-augA6
0
1
2
3
4
patents-path07-augA7
0
1
2
3
4
patents-path07
0
1
2
3
4
patents-path08-augA1
0
1
2
3
4
patents-path08-augA2
0
1
2
3
4
patents-path08-augA3
0
1
2
3
4
patents-path08-augA4
0
1
2
3
4
patents-path08-augA5
0
1
2
3
4
patents-path08-augA6
0
1
2
3
4
patents-path08-augA7
0
1
2
3
4
p

### JOB

In [9]:
# JOB: full original-vs-rewritten comparison for every JOB query file.
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = list(filter(
    lambda name: name.startswith('JOB') and name.endswith('_output.json'),
    files,
))

for file in output_files:
    file_split = file.split("_")
    # first part: benchmark name, second part: query id
    run_query(*file_split[:2])

JOB 17d-augA1
original1
rewritten1
False False
[(2,)] [(2,)]
orig+rewr
0
1
2
3
4
JOB 17d-augA2
original1
rewritten1
False False
[(None,)] [(None,)]
orig+rewr
0
1
2
3
4
JOB 17d-augA3
original1
rewritten1
False False
[(299,)] [(299,)]
orig+rewr
0
1
2
3
4
JOB 17d-augA4
original1
rewritten1
False False
[(32605,)] [(32605,)]
orig+rewr
0
1
2
3
4
JOB 17d-augA5
original1
rewritten1
False False
[(87352,)] [(87352,)]
orig+rewr
0
1
2
3
4
JOB 17d-augA6
original1
rewritten1
False False
[(143842,)] [(143842,)]
orig+rewr
0
1
2
3
4
JOB 17d-augF1-augA1
original1
rewritten1
False False
[(2,)] [(2,)]
orig+rewr
0
1
2
3
4
JOB 17d-augF1-augA2
original1
rewritten1
False False
[(None,)] [(None,)]
orig+rewr
0
1
2
3
4
JOB 17d-augF1-augA3
original1
rewritten1
False False
[(299,)] [(299,)]
orig+rewr
0
1
2
3
4
JOB 17d-augF1-augA4
original1
rewritten1
False False
[(15010,)] [(15010,)]
orig+rewr
0
1
2
3
4
JOB 17d-augF1-augA5
original1
rewritten1
False False
[(41901,)] [(41901,)]
orig+rewr
0
1
2
3
4
JOB 17d-augF1-aug

### LSQB

In [10]:
# LSQB: full original-vs-rewritten comparison for the first 10 LSQB files of
# the sorted listing.
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = list(filter(
    lambda name: name.startswith('LSQB') and name.endswith('_output.json'),
    files,
))
output_files = output_files[:10]

for file in output_files:
    file_split = file.split("_")
    # first part: benchmark name, second part: query id
    run_query(*file_split[:2])

LSQB q1-augA1
original1
rewritten1
False False
[(111,)] [(111,)]
orig+rewr
0
1
2
3
4
LSQB q1-augA2
original1
rewritten1
False False
[(14,)] [(14,)]
orig+rewr
0
1
2
3
4
LSQB q1-augA3
original1
rewritten1
False False
[(36,)] [(36,)]
orig+rewr
0
1
2
3
4
LSQB q1-augA4
original1
rewritten1
False False
[(36,)] [(36,)]
orig+rewr
0
1
2
3
4
LSQB q1-augA5
original1
rewritten1
False False
[(30428,)] [(30428,)]
orig+rewr
0
1
2
3
4
LSQB q1-augA6
original1
rewritten1
False False
[(250019,)] [(250019,)]
orig+rewr
0
1
2
3
4
LSQB q1-augA7
original1
rewritten1
False False
[(250019,)] [(250019,)]
orig+rewr
0
1
2
3
4
LSQB q1-augA8
original1
rewritten1
False False
[(0,)] [(0,)]
orig+rewr
0
1
2
3
4
LSQB q1-augA9
original1
rewritten1
False False
[(0,)] [(0,)]
orig+rewr
0
1
2
3
4
LSQB q1
original1
rewritten1
False False
[(0,)] [(0,)]
orig+rewr
0
1
2
3
4


In [11]:
# Remaining LSQB files (index 10 onwards): only the rewritten queries are timed
# (run_query_rewritten records the original query as "TO" without running it).
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = list(filter(
    lambda name: name.startswith('LSQB') and name.endswith('_output.json'),
    files,
))
output_files = output_files[10:]

for file in output_files:
    file_split = file.split("_")
    # first part: benchmark name, second part: query id
    run_query_rewritten(*file_split[:2])

q4-augA1
0
1
2
3
4
q4-augA2
0
1
2
3
4
q4-augA3
0
1
2
3
4
q4
0
1
2
3
4


### HETIO

In [12]:
# HETIO: full original-vs-rewritten comparison for every HETIO query file.
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = list(filter(
    lambda name: name.startswith('HETIO') and name.endswith('_output.json'),
    files,
))

for file in output_files:
    file_split = file.split("_")
    # first part: benchmark name, second part: query id
    run_query(*file_split[:2])

HETIO 10-01-SpDdGpPW-augA1
original1
rewritten1
False False
[('DOID:0050156',)] [('DOID:0050156',)]
orig+rewr
0
1
2
3
4
HETIO 10-01-SpDdGpPW-augA2
original1
rewritten1
False False
[('DOID:0050156',)] [('DOID:0050156',)]
orig+rewr
0
1
2
3
4
HETIO 10-01-SpDdGpPW-augA3
original1
rewritten1
False False
[('DOID:0050156',)] [('DOID:0050156',)]
orig+rewr
0
1
2
3
4
HETIO 10-01-SpDdGpPW-augA4
original1
rewritten1
False False
[('10000',)] [('10000',)]
orig+rewr
0
1
2
3
4
HETIO 10-01-SpDdGpPW-augA5
original1
rewritten1
False False
[('10000',)] [('10000',)]
orig+rewr
0
1
2
3
4
HETIO 10-01-SpDdGpPW-augA6
original1
rewritten1
False False
[('PC7_10399',)] [('PC7_10399',)]
orig+rewr
0
1
2
3
4
HETIO 10-01-SpDdGpPW
original1
rewritten1
False False
[('D000006',)] [('D000006',)]
orig+rewr
0
1
2
3
4
HETIO 10-02-SpDuGpPW-augA1
original1
rewritten1
False False
[('DOID:0050156',)] [('DOID:0050156',)]
orig+rewr
0
1
2
3
4
HETIO 10-02-SpDuGpPW-augA2
original1
rewritten1
False False
[('DOID:0050156',)] [('DOID:00