# Running queries using the JSON files produced by the Scala code

Install and import all needed packages

In [1]:
%%bash
# Install every third-party package the notebook imports.
# psutil is imported further down but was missing from this list;
# pyspark is pinned to the version shown in the recorded install log (3.5.1).
pip install numpy
pip install pandas
pip install pyspark==3.5.1
pip install psutil









Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 317.0/317.0 MB 10.8 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting py4j==0.10.9.7
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 200.5/200.5 KB 20.4 MB/s eta 0:00:00
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py): started
  Building wheel for pyspark (setup.py): finished with status 'done'
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488513 sha256=08d81bfe9bdfe616efa83642abc135664b3d9fc1203b029fc7ae8b94e43f90bb
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.7 pyspark-3.5.1




In [2]:
import json
import time
import numpy as np
import csv
import multiprocessing
import signal
import pandas as pd
import os
import threading
import random
from pyspark.sql import SparkSession
from py4j.protocol import Py4JJavaError, Py4JError
import psutil
import string

Function for running one query. This means:
*  run the original query 5 times (after one initial run, which we do not use)
*  run the rewritten queries 5 times (after one initial run, which we do not use) and drop the created tables each time
*  take the runtimes and calculate the mean, median and standard deviation for both the original and the rewritten query
*  compare the runtimes of the original query, the rewritten query, and the rewritten query plus the rewriting time (how long the Scala rewriting took)
*  append everything to a CSV output file

In [3]:
def create_spark():
    """Return a local SparkSession configured from the notebook-level constants.

    Reads the globals SPARK_CORES (used in the ``local[N]`` master string) and
    SPARK_MEMORY (used, suffixed with ``g``, for driver and executor memory).
    Because the builder ends in ``getOrCreate()``, the call hands back whatever
    session that method yields rather than always building a new one.
    """
    heap_size = f'{SPARK_MEMORY}g'
    builder = SparkSession.builder.appName("app").master(f'local[{SPARK_CORES}]')

    # Applied in the same order as before; each config() call returns the builder.
    settings = [
        ("spark.driver.memory", heap_size),
        ("spark.executor.memory", heap_size),
        ("spark.memory.offHeap.enabled", False),
        ("spark.jars", "postgresql-42.3.3.jar"),
    ]
    for key, value in settings:
        builder = builder.config(key, value)

    return builder.getOrCreate()

def import_db(spark, dbname):
    """Register every table of a benchmark database's ``public`` schema as a Spark temp view.

    Parameters
    ----------
    spark : SparkSession
        Session used for the JDBC reads and for registering the views.
    dbname : str
        Benchmark name. ``"JOB"`` is mapped to the database ``imdb``; any other
        name is lower-cased. The resulting name is also used as JDBC user name
        and password.

    Side effects
    ------------
    For each table listed in ``information_schema.tables`` with schema
    ``public``: prints the table name, shows a preview of the table, and
    creates (or replaces) a temp view named after the table.
    """
    dbname = "imdb" if dbname == "JOB" else dbname.lower()
    username = password = dbname

    # One URL for all reads. The table listing previously hard-coded the host
    # "postgres" while the per-table reads used DBHOST, so changing DBHOST
    # would have split the reads across two hosts.
    url = f'jdbc:postgresql://{DBHOST}:5432/{dbname}'

    def read_table(table):
        # Shared JDBC reader so the listing and the table loads cannot drift apart.
        return (spark.read.format("jdbc")
                .option("url", url)
                .option("driver", "org.postgresql.Driver")
                .option("dbtable", table)
                .option("user", username)
                .option("password", password)
                .load())

    df_tables = read_table("information_schema.tables")
    public_tables = (df_tables
                     .filter(df_tables.table_schema == 'public')
                     .select("table_name")
                     .collect())

    for row in public_tables:
        table_name = row.table_name
        df = read_table(table_name)

        print(table_name)
        # show() prints the preview itself and returns None, so it is not wrapped in print().
        df.show()
        df.createOrReplaceTempView(table_name)

In [4]:
def measure_resource_usage(resource_usage):
    """Append one resource sample per loop iteration to ``resource_usage``.

    Intended as a thread target: the loop keeps running until the attribute
    ``do_run`` on the executing thread object is set to a falsy value (a
    missing attribute counts as "keep running"). Between samples the thread
    sleeps for one second, and the sample's ``time`` field is the iteration
    counter, not a wall-clock measurement.
    """
    worker = threading.current_thread()
    elapsed = 0
    while True:
        if not getattr(worker, "do_run", True):
            break
        resource_usage.append(get_resource_usage(elapsed))
        elapsed += 1
        time.sleep(1)

def get_resource_usage(t):
    """Take one psutil snapshot and label it with ``t``.

    Returns a dict with the keys ``time`` (the value passed in), ``memory``
    (result of ``psutil.virtual_memory()``), ``cpu`` (per-CPU
    ``psutil.cpu_percent``) and ``cpu_total`` (overall ``psutil.cpu_percent``).
    Both CPU readings use ``interval=None``.
    """
    snapshot = {}
    snapshot['time'] = t
    # Same call order as the dict literal it replaces: memory, per-CPU, total.
    snapshot['memory'] = psutil.virtual_memory()
    snapshot['cpu'] = psutil.cpu_percent(interval=None, percpu=True)
    snapshot['cpu_total'] = psutil.cpu_percent(interval=None, percpu=False)
    return snapshot
    
def cancel_query(spark, seconds, group_id):
    """Sleep for ``seconds``; the job-group cancellation itself is switched off.

    ``spark`` and ``group_id`` are accepted but unused while the cancellation
    call below stays commented out, so this function currently has no effect
    other than blocking the calling thread for the given duration.
    """
    time.sleep(seconds)
    # Disabled cancellation step, kept for reference:
    #   print("cancelling jobs with id " + group_id)
    #   print(spark.sparkContext.cancelJobGroup(group_id))
    #   print("cancelled job")

def cancel_query_after(spark, seconds):
    """Register a fresh random job group and start its watchdog thread.

    Parameters
    ----------
    spark : SparkSession
        Session whose ``sparkContext.setJobGroup`` is called with the new id
        (used both as group id and as description).
    seconds : int or float
        Delay handed to ``cancel_query`` in the watchdog thread.

    Returns
    -------
    str
        The 16-character id (upper-case letters and digits) of the job group.
    """
    alphabet = string.ascii_uppercase + string.digits
    group_id = ''.join(random.choices(alphabet, k=16))  # random id
    spark.sparkContext.setJobGroup(group_id, group_id)

    # Daemon thread: one watchdog is started per warm-up run and each one
    # sleeps for the full timeout, so non-daemon threads would accumulate and
    # keep the interpreter alive after the benchmark has finished.
    watchdog = threading.Thread(
        target=cancel_query,
        args=(spark, seconds, group_id),
        daemon=True,
    )
    watchdog.start()
    return group_id

In [5]:
# Number of timed repetitions; must match the "orig 1..5" / "rewr 1..5" CSV columns.
_TIMED_RUNS = 5


def _to_spark_sql(statement):
    """Apply the textual substitutions used before a rewritten statement goes to spark.sql()."""
    return (statement.replace(" UNLOGGED TABLE ", " VIEW ")
                     .replace("TIMESTAMP(0)", "TIMESTAMP")
                     .replace("$", "_")
                     .replace("CREATE VIEW", "CREATE TEMPORARY VIEW"))


def _load_query_files(benchmark, query):
    """Read the ``_output.json`` / ``_drop.json`` pair of one query from ``rewritten/``.

    Returns (original query, adapted rewritten statements, drop statements, rewriting time).
    """
    with open(f'rewritten/{benchmark}_{query}_output.json', 'r') as file:
        json_data = json.load(file)
    with open(f'rewritten/{benchmark}_{query}_drop.json', 'r') as file:
        json_drop = json.load(file)

    rewritten = [_to_spark_sql(statement) for statement in json_data["rewritten_query"]]
    return json_data["original_query"], rewritten, json_drop["rewritten_query"], json_data["time"]


def _warm_up(spark, label, statements, cleanup=()):
    """Run ``statements`` once, untimed, and report whether that pass failed.

    Returns True when a Py4JError was raised before all statements had been
    submitted, False otherwise. ``cleanup`` statements are run after a
    successful pass.

    NOTE(review): only Py4JError is caught, as before. SQL errors that PySpark
    re-raises as its own exception classes are presumably not covered and
    would propagate to the caller — confirm this is intended.
    """
    failed = True
    monitor = None
    try:
        spark.sparkContext._jvm.System.gc()
        resource_usage = []  # samples are collected but not persisted anywhere
        monitor = threading.Thread(target=measure_resource_usage, args=(resource_usage, ))
        monitor.start()

        cancel_query_after(spark, TIMEOUT)
        for statement in statements:
            spark.sql(statement)
        failed = False

        for drop_query in cleanup:
            spark.sql(drop_query)
    except Py4JError as e:
        print('timeout or error ' + label + ': ' + str(e))
    finally:
        # Always stop the sampler. It used to be stopped only on success, so
        # any exception left it running (and appending samples) indefinitely.
        if monitor is not None:
            monitor.do_run = False
    return failed


def _time_statements(spark, statements):
    """Return the seconds spent submitting ``statements`` through spark.sql().

    NOTE(review): spark.sql() returns a DataFrame and no action (show/collect/
    count) is called on it here, so for SELECT statements this may time only
    parsing and planning rather than execution — confirm this is intended.
    """
    start = time.perf_counter()  # monotonic clock, unaffected by system clock changes
    for statement in statements:
        spark.sql(statement)
    return time.perf_counter() - start


def _summarise(times, timed_out):
    """Return (mean, median, std, per-run cells) or the "TO" / "-" placeholders."""
    if timed_out:
        return "TO", "TO", "-", ["-"] * _TIMED_RUNS
    return np.mean(times), np.median(times), np.std(times), times


def run_query(benchmark, query, spark,
              results_path="results/SPA_Scala_comparison_TO_augment_server.csv"):
    """Benchmark one query in its original and rewritten form and append one CSV row.

    Steps:
      1. Load ``rewritten/{benchmark}_{query}_output.json`` and ``..._drop.json``.
      2. Do one untimed warm-up pass for the original query and one for the
         rewritten statements (followed by the drop statements).
      3. For every variant whose warm-up did not fail, take 5 timed runs; the
         drop statements are run after each rewritten run.
      4. Append a row to ``results_path``.

    Parameters
    ----------
    benchmark, query : str
        Identify the JSON files to load and are written as the first two CSV cells.
    spark : SparkSession
        Session on which all statements are submitted.
    results_path : str, optional
        CSV file to append to; defaults to the path used by the header cell.

    CSV row layout
    --------------
    bench, query, orig mean, rewr mean, rewr mean + rewriting time, winner by
    mean, winner or "equal 0.05", winner including rewriting time, rewriting
    time, 5 original timings, orig median, orig std, 5 rewritten timings, rewr
    median, rewr std, rewr median + rewriting time. A variant whose warm-up
    failed gets "TO" for mean/median and "-" for std and the per-run cells.
    """
    print(benchmark, query)
    original_query, rewritten_queries, drop_queries, rewriting_time = \
        _load_query_files(benchmark, query)

    # Warm-up passes; a flag is True when the corresponding pass failed.
    timeout_flag_orig = _warm_up(spark, 'orig', [original_query])
    timeout_flag_rewr = _warm_up(spark, 'rewr', rewritten_queries, cleanup=drop_queries)
    print(timeout_flag_orig, timeout_flag_rewr)

    # Timed runs, only for the variants that survived the warm-up.
    times_original, times_rewritten = [], []
    if not (timeout_flag_orig and timeout_flag_rewr):
        if timeout_flag_orig:
            print("rewritten")
        elif timeout_flag_rewr:
            print("orig")
        else:
            print("orig+rewr")
        for i in range(_TIMED_RUNS):
            print(i)
            if not timeout_flag_orig:
                times_original.append(_time_statements(spark, [original_query]))
            if not timeout_flag_rewr:
                times_rewritten.append(_time_statements(spark, rewritten_queries))
                # drop all created tables so the next run can create them again
                for drop_query in drop_queries:
                    spark.sql(drop_query)

    orig_mean, orig_med, orig_std, list_original = _summarise(times_original, timeout_flag_orig)
    rewr_mean, rewr_med, rewr_std, list_rewritten = _summarise(times_rewritten, timeout_flag_rewr)

    if timeout_flag_rewr:
        rewr_mean_plus_rewr = "TO"
        rewr_med_plus_rewr = "TO"
    else:
        rewr_mean_plus_rewr = rewr_mean + rewriting_time
        rewr_med_plus_rewr = rewr_med + rewriting_time

    # Which variant wins: by mean, by mean with a 0.05 s tie band, and by mean
    # once the rewriting time is charged to the rewritten variant.
    if timeout_flag_orig and timeout_flag_rewr:
        orig_or_rewr_mean = orig_or_rewr_or_equal = orig_or_rewr_plus_rewr_mean = "-"
    elif timeout_flag_orig:
        orig_or_rewr_mean = orig_or_rewr_or_equal = orig_or_rewr_plus_rewr_mean = "rewr"
    elif timeout_flag_rewr:
        orig_or_rewr_mean = orig_or_rewr_or_equal = orig_or_rewr_plus_rewr_mean = "orig"
    else:
        orig_or_rewr_mean = "rewr" if orig_mean > rewr_mean else "orig"
        if abs(rewr_mean - orig_mean) < 0.05:
            orig_or_rewr_or_equal = "equal 0.05"
        else:
            orig_or_rewr_or_equal = orig_or_rewr_mean
        orig_or_rewr_plus_rewr_mean = "rewr" if orig_mean > rewr_mean_plus_rewr else "orig"

    list_output = (
        [benchmark, query]
        + [orig_mean, rewr_mean, rewr_mean_plus_rewr, orig_or_rewr_mean,
           orig_or_rewr_or_equal, orig_or_rewr_plus_rewr_mean, rewriting_time]
        + list_original + [orig_med, orig_std]
        + list_rewritten + [rewr_med, rewr_std, rewr_med_plus_rewr]
    )
    with open(results_path, 'a', newline='') as csvfile:
        csv.writer(csvfile).writerow(list_output)

In [6]:
# Global configuration
DBHOST = 'postgres'   # host name placed in the PostgreSQL JDBC URL
TIMEOUT = 100         # seconds handed to cancel_query_after() for each warm-up pass
SPARK_CORES = 4       # N in the Spark master string local[N]
SPARK_MEMORY = 8      # driver/executor memory; the "g" suffix is appended in create_spark()

Create the output CSV file and write its header row. The running times of each query are appended to it afterwards.

In [7]:
# (Re)create the results file with only the header row; run_query() appends one row per query.
file_path = "results/SPA_Scala_comparison_TO_augment_server.csv"

names = ["bench", "query", "orig mean", "rewr mean", "rewr mean+rewr", "orig/rewr(mean)", "orig/rewr/equal", "orig/rewr+rewr(mean)", "rewriting",
         "orig 1", "orig 2", "orig 3", "orig 4", "orig 5", "orig med", "orig_std", "rewr 1", "rewr 2", "rewr 3", "rewr 4", "rewr 5", "rewr med",
         "rewr_std", "rewr med+rewr", ]

# Make sure the target directory exists so a fresh checkout does not fail on open().
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Mode 'w' truncates any previous results file.
with open(file_path, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(names)

### STATS

In [8]:
# Obtain the Spark session (create_spark() ends in getOrCreate()) and register the
# public-schema tables of the "stats" database as temp views.
spark = create_spark()
import_db(spark, "STATS")

24/04/25 14:17:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


users
+---+----------+-------------------+-----+-------+---------+
| id|reputation|       creationdate|views|upvotes|downvotes|
+---+----------+-------------------+-----+-------+---------+
| -1|         1|2010-07-19 06:55:26|    0|   5007|     1920|
|  2|       101|2010-07-19 14:01:36|   25|      3|        0|
|  3|       101|2010-07-19 15:34:50|   22|     19|        0|
|  4|       101|2010-07-19 19:03:27|   11|      0|        0|
|  5|      6792|2010-07-19 19:03:57| 1145|    662|        5|
|  6|       457|2010-07-19 19:04:07|  114|     47|        0|
|  7|       429|2010-07-19 19:04:37|   56|     20|        0|
|  8|      6764|2010-07-19 19:04:52| 1089|    604|       25|
| 10|       121|2010-07-19 19:05:40|   20|      2|        0|
| 11|       136|2010-07-19 19:06:02|   10|     10|        0|
| 12|       101|2010-07-19 19:06:34|   10|      5|        0|
| 13|       817|2010-07-19 19:06:49|  178|     44|        1|
| 15|        11|2010-07-19 19:07:32|    8|      0|        0|
| 16|       101|20

24/04/25 14:17:25 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [None]:
# Benchmark every STATS query that has a `STATS_<query>_output.json` file in rewritten/.
folder_path = 'rewritten/'
files = os.listdir(folder_path)
files.sort()
output_files = list(filter(
    lambda name: name.endswith('_output.json') and name.startswith('STATS'),
    files,
))

for file in output_files:
    # File names look like <benchmark>_<query>_output.json.
    file_split = file.split("_")
    benchmark_name, query_name = file_split[0], file_split[1]
    run_query(benchmark_name, query_name, spark)

STATS 001-014-augA1
False False
orig+rewr
0
1
2
3
4
STATS 001-014-augA2
False False
orig+rewr
0
1
2
3
4
STATS 001-014-augF1-augA1
False False
orig+rewr
0
1
2
3
4
STATS 001-014-augF1-augA2
False False
orig+rewr
0
1
2
3
4
STATS 001-014-augF1
False False
orig+rewr
0
1
2
3
4
STATS 001-014-augF2-augA1
False False
orig+rewr
0
1
2
3
4
STATS 001-014-augF2-augA2
False False
orig+rewr
0
1
2
3
4
STATS 001-014-augF2
False False
orig+rewr
0
1
2
3
4
STATS 001-014
False False
orig+rewr
0
1
2
3
4
STATS 002-048-augA1
False False
orig+rewr
0
1
2
3
4
STATS 002-048-augA2
False False
orig+rewr
0
1
2
3
4
STATS 002-048-augA3
False False
orig+rewr
0
1
2
3
4
STATS 002-048-augA4
False False
orig+rewr
0
1
2
3
4
STATS 002-048-augF1-augA1
False False
orig+rewr
0
1
2
3
4
STATS 002-048-augF1-augA2
False False
orig+rewr
0
1
2
3
4
STATS 002-048-augF1-augA3
False False
orig+rewr
0
1
2
3
4
STATS 002-048-augF1-augA4
False False
orig+rewr
0
1
2
3
4
STATS 002-048-augF1
False False
orig+rewr
0
1
2
3
4
STATS 002-048-augF2-au

### SNAP

In [None]:
# Obtain the Spark session (create_spark() ends in getOrCreate()) and register the
# public-schema tables of the "snap" database as temp views.
spark = create_spark()
import_db(spark, "SNAP")

In [None]:
# Benchmark every SNAP query that has a `SNAP_<query>_output.json` file in rewritten/.
folder_path = 'rewritten/'
files = os.listdir(folder_path)
files.sort()
output_files = list(filter(
    lambda name: name.endswith('_output.json') and name.startswith('SNAP'),
    files,
))

for file in output_files:
    # File names look like <benchmark>_<query>_output.json.
    file_split = file.split("_")
    benchmark_name, query_name = file_split[0], file_split[1]
    run_query(benchmark_name, query_name, spark)

SNAP dblp-path02-augA1
original1
rewritten1
False False
[(1,)] [(1,)]
orig+rewr
0
1
2
3
4
SNAP dblp-path02-augA2
original1
rewritten1
False False
[(2,)] [(2,)]
orig+rewr
0
1
2
3
4
SNAP dblp-path02
original1
rewritten1
False False
[(1,)] [(1,)]
orig+rewr
0
1
2
3
4
SNAP dblp-path03-augA1
original1
rewritten1
False False
[(1,)] [(1,)]
orig+rewr
0
1
2
3
4
SNAP dblp-path03-augA2
original1
rewritten1
False False
[(2,)] [(2,)]
orig+rewr
0
1
2
3
4
SNAP dblp-path03-augA3
original1
rewritten1
False False
[(320,)] [(320,)]
orig+rewr
0
1
2
3
4
SNAP dblp-path03
original1
rewritten1
False False
[(1,)] [(1,)]
orig+rewr
0
1
2
3
4
SNAP dblp-path04-augA1
original1
Query execution of the original query > 100s
rewritten1
True False
rewritten
0
1
2
3
4
SNAP dblp-path04-augA2
original1
Query execution of the original query > 100s
rewritten1
True False
rewritten
0
1
2
3
4
SNAP dblp-path04-augA3
original1
Query execution of the original query > 100s
rewritten1
True False
rewritten
0
1
2
3
4
SNAP dblp-path04-a

In [5]:
# NOTE(review): run_query_rewritten is not defined in the visible part of this notebook —
# confirm it exists before re-running this cell.
# The [17:42] slice is positional, so it presumably depends on the exact contents of rewritten/ — verify.
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = [file for file in files if file.endswith('_output.json') and file.startswith('SNAP')][17:42]

for file in output_files:
    file_split = file.split("_")
    run_query_rewritten(file_split[0], file_split[1])

dblp-path05
0
1
2
3
4
dblp-path06-augA1
0
1
2
3
4
dblp-path06-augA2
0
1
2
3
4
dblp-path06-augA3
0
1
2
3
4
dblp-path06-augA4
0
1
2
3
4
dblp-path06-augA5
0
1
2
3
4
dblp-path06-augA6
0
1
2
3
4
dblp-path06
0
1
2
3
4
dblp-path07-augA1
0
1
2
3
4
dblp-path07-augA2
0
1
2
3
4
dblp-path07-augA3
0
1
2
3
4
dblp-path07-augA4
0
1
2
3
4
dblp-path07-augA5
0
1
2
3
4
dblp-path07-augA6
0
1
2
3
4
dblp-path07-augA7
0
1
2
3
4
dblp-path07
0
1
2
3
4
dblp-path08-augA1
0
1
2
3
4
dblp-path08-augA2
0
1
2
3
4
dblp-path08-augA3
0
1
2
3
4
dblp-path08-augA4
0
1
2
3
4
dblp-path08-augA5
0
1
2
3
4
dblp-path08-augA6
0
1
2
3
4
dblp-path08-augA7
0
1
2
3
4
dblp-path08-augA8
0
1
2
3
4
dblp-path08
0
1
2
3
4


In [5]:
# NOTE(review): run_query_rewritten is not defined in the visible part of this notebook —
# confirm it exists before re-running this cell.
# The [42:61] slice is positional, so it presumably depends on the exact contents of rewritten/ — verify.
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = [file for file in files if file.endswith('_output.json') and file.startswith('SNAP')][42:61]

for file in output_files:
    file_split = file.split("_")
    run_query_rewritten(file_split[0], file_split[1])

dblp-tree01-augA1
0
1
2
3
4
dblp-tree01-augA2
0
1
2
3
4
dblp-tree01-augA3
0
1
2
3
4
dblp-tree01-augA4
0
1
2
3
4
dblp-tree01
0
1
2
3
4
dblp-tree02-augA1
0
1
2
3
4
dblp-tree02-augA2
0
1
2
3
4
dblp-tree02-augA3
0
1
2
3
4
dblp-tree02-augA4
0
1
2
3
4
dblp-tree02-augA5
0
1
2
3
4
dblp-tree02
0
1
2
3
4
dblp-tree03-augA1
0
1
2
3
4
dblp-tree03-augA2
0
1
2
3
4
dblp-tree03-augA3
0
1
2
3
4
dblp-tree03-augA4
0
1
2
3
4
dblp-tree03-augA5
0
1
2
3
4
dblp-tree03-augA6
0
1
2
3
4
dblp-tree03-augA7
0
1
2
3
4
dblp-tree03
0
1
2
3
4


In [5]:
# Benchmark the SNAP queries at positions 61-63 of the sorted `SNAP_*_output.json` listing.
# The slice is positional, so it depends on the exact contents of rewritten/.
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = [file for file in files if file.endswith('_output.json') and file.startswith('SNAP')][61:64]

for file in output_files:
    file_split = file.split("_")
    # run_query() requires the Spark session as third argument; the call used to omit it.
    run_query(file_split[0], file_split[1], spark)

SNAP google-path02-augA1
original1
rewritten1
False False
[(0,)] [(0,)]
orig+rewr
0
1
2
3
4
SNAP google-path02-augA2
original1
rewritten1
False False
[(0,)] [(0,)]
orig+rewr
0
1
2
3
4
SNAP google-path02
original1
rewritten1
False False
[(0,)] [(0,)]
orig+rewr
0
1
2
3
4


In [6]:
# NOTE(review): run_query_rewritten is not defined in the visible part of this notebook —
# confirm it exists before re-running this cell.
# The [64:122] slice is positional, so it presumably depends on the exact contents of rewritten/ — verify.
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = [file for file in files if file.endswith('_output.json') and file.startswith('SNAP')][64:122]

for file in output_files:
    file_split = file.split("_")
    run_query_rewritten(file_split[0], file_split[1])

google-path03-augA1
0
1
2
3
4
google-path03-augA2
0
1
2
3
4
google-path03-augA3
0
1
2
3
4
google-path03
0
1
2
3
4
google-path04-augA1
0
1
2
3
4
google-path04-augA2
0
1
2
3
4
google-path04-augA3
0
1
2
3
4
google-path04-augA4
0
1
2
3
4
google-path04
0
1
2
3
4
google-path05-augA1
0
1
2
3
4
google-path05-augA2
0
1
2
3
4
google-path05-augA3
0
1
2
3
4
google-path05-augA4
0
1
2
3
4
google-path05-augA5
0
1
2
3
4
google-path05
0
1
2
3
4
google-path06-augA1
0
1
2
3
4
google-path06-augA2
0
1
2
3
4
google-path06-augA3
0
1
2
3
4
google-path06-augA4
0
1
2
3
4
google-path06-augA5
0
1
2
3
4
google-path06-augA6
0
1
2
3
4
google-path06
0
1
2
3
4
google-path07-augA1
0
1
2
3
4
google-path07-augA2
0
1
2
3
4
google-path07-augA3
0
1
2
3
4
google-path07-augA4
0
1
2
3
4
google-path07-augA5
0
1
2
3
4
google-path07-augA6
0
1
2
3
4
google-path07-augA7
0
1
2
3
4
google-path07
0
1
2
3
4
google-path08-augA1
0
1
2
3
4
google-path08-augA2
0
1
2
3
4
google-path08-augA3
0
1
2
3
4
google-path08-augA4
0
1
2
3
4
google-pat

In [7]:
# Benchmark the SNAP queries at positions 122-128 of the sorted `SNAP_*_output.json` listing.
# The slice is positional, so it depends on the exact contents of rewritten/.
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = [file for file in files if file.endswith('_output.json') and file.startswith('SNAP')][122:129]

for file in output_files:
    file_split = file.split("_")
    # run_query() requires the Spark session as third argument; the call used to omit it.
    run_query(file_split[0], file_split[1], spark)

SNAP patents-path02-augA1
original1
rewritten1
False False
[(3859263,)] [(3859263,)]
orig+rewr
0
1
2
3
4
SNAP patents-path02-augA2
original1
rewritten1
False False
[(3858242,)] [(3858242,)]
orig+rewr
0
1
2
3
4
SNAP patents-path02
original1
rewritten1
False False
[(3859263,)] [(3859263,)]
orig+rewr
0
1
2
3
4
SNAP patents-path03-augA1
original1
rewritten1
False False
[(3866689,)] [(3866689,)]
orig+rewr
0
1
2
3
4
SNAP patents-path03-augA2
original1
rewritten1
False False
[(3859263,)] [(3859263,)]
orig+rewr
0
1
2
3
4
SNAP patents-path03-augA3
original1
rewritten1
False False
[(3858242,)] [(3858242,)]
orig+rewr
0
1
2
3
4
SNAP patents-path03
original1
rewritten1
False False
[(3866689,)] [(3866689,)]
orig+rewr
0
1
2
3
4


In [8]:
# NOTE(review): run_query_rewritten is not defined in the visible part of this notebook —
# confirm it exists before re-running this cell.
# The [129:] slice is positional, so it presumably depends on the exact contents of rewritten/ — verify.
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = [file for file in files if file.endswith('_output.json') and file.startswith('SNAP')][129:]

for file in output_files:
    file_split = file.split("_")
    run_query_rewritten(file_split[0], file_split[1])

patents-path04-augA1
0
1
2
3
4
patents-path04-augA2
0
1
2
3
4
patents-path04-augA3
0
1
2
3
4
patents-path04-augA4
0
1
2
3
4
patents-path04
0
1
2
3
4
patents-path05-augA1
0
1
2
3
4
patents-path05-augA2
0
1
2
3
4
patents-path05-augA3
0
1
2
3
4
patents-path05-augA4
0
1
2
3
4
patents-path05-augA5
0
1
2
3
4
patents-path05
0
1
2
3
4
patents-path06-augA1
0
1
2
3
4
patents-path06-augA2
0
1
2
3
4
patents-path06-augA3
0
1
2
3
4
patents-path06-augA4
0
1
2
3
4
patents-path06-augA5
0
1
2
3
4
patents-path06-augA6
0
1
2
3
4
patents-path06
0
1
2
3
4
patents-path07-augA1
0
1
2
3
4
patents-path07-augA2
0
1
2
3
4
patents-path07-augA3
0
1
2
3
4
patents-path07-augA4
0
1
2
3
4
patents-path07-augA5
0
1
2
3
4
patents-path07-augA6
0
1
2
3
4
patents-path07-augA7
0
1
2
3
4
patents-path07
0
1
2
3
4
patents-path08-augA1
0
1
2
3
4
patents-path08-augA2
0
1
2
3
4
patents-path08-augA3
0
1
2
3
4
patents-path08-augA4
0
1
2
3
4
patents-path08-augA5
0
1
2
3
4
patents-path08-augA6
0
1
2
3
4
patents-path08-augA7
0
1
2
3
4
p

### JOB

In [None]:
# Obtain the Spark session (create_spark() ends in getOrCreate()) and register the
# public-schema tables as temp views; import_db() maps "JOB" to the database "imdb".
spark = create_spark()
import_db(spark, "JOB")

In [9]:
# Benchmark every JOB query that has a `JOB_<query>_output.json` file in rewritten/.
folder_path = 'rewritten/'
files = os.listdir(folder_path)
files.sort()
output_files = list(filter(
    lambda name: name.endswith('_output.json') and name.startswith('JOB'),
    files,
))

for file in output_files:
    # File names look like <benchmark>_<query>_output.json.
    file_split = file.split("_")
    benchmark_name, query_name = file_split[0], file_split[1]
    run_query(benchmark_name, query_name, spark)

JOB 17d-augA1
original1
rewritten1
False False
[(2,)] [(2,)]
orig+rewr
0
1
2
3
4
JOB 17d-augA2
original1
rewritten1
False False
[(None,)] [(None,)]
orig+rewr
0
1
2
3
4
JOB 17d-augA3
original1
rewritten1
False False
[(299,)] [(299,)]
orig+rewr
0
1
2
3
4
JOB 17d-augA4
original1
rewritten1
False False
[(32605,)] [(32605,)]
orig+rewr
0
1
2
3
4
JOB 17d-augA5
original1
rewritten1
False False
[(87352,)] [(87352,)]
orig+rewr
0
1
2
3
4
JOB 17d-augA6
original1
rewritten1
False False
[(143842,)] [(143842,)]
orig+rewr
0
1
2
3
4
JOB 17d-augF1-augA1
original1
rewritten1
False False
[(2,)] [(2,)]
orig+rewr
0
1
2
3
4
JOB 17d-augF1-augA2
original1
rewritten1
False False
[(None,)] [(None,)]
orig+rewr
0
1
2
3
4
JOB 17d-augF1-augA3
original1
rewritten1
False False
[(299,)] [(299,)]
orig+rewr
0
1
2
3
4
JOB 17d-augF1-augA4
original1
rewritten1
False False
[(15010,)] [(15010,)]
orig+rewr
0
1
2
3
4
JOB 17d-augF1-augA5
original1
rewritten1
False False
[(41901,)] [(41901,)]
orig+rewr
0
1
2
3
4
JOB 17d-augF1-aug

### LSQB

In [None]:
# Obtain the Spark session (create_spark() ends in getOrCreate()) and register the
# public-schema tables of the "lsqb" database as temp views.
spark = create_spark()
import_db(spark, "LSQB")

In [10]:
# Benchmark every LSQB query that has an `LSQB_<query>_output.json` file in rewritten/.
folder_path = 'rewritten/'
files = os.listdir(folder_path)
files.sort()
output_files = list(filter(
    lambda name: name.endswith('_output.json') and name.startswith('LSQB'),
    files,
))

for file in output_files:
    # File names look like <benchmark>_<query>_output.json.
    file_split = file.split("_")
    benchmark_name, query_name = file_split[0], file_split[1]
    run_query(benchmark_name, query_name, spark)

LSQB q1-augA1
original1
rewritten1
False False
[(111,)] [(111,)]
orig+rewr
0
1
2
3
4
LSQB q1-augA2
original1
rewritten1
False False
[(14,)] [(14,)]
orig+rewr
0
1
2
3
4
LSQB q1-augA3
original1
rewritten1
False False
[(36,)] [(36,)]
orig+rewr
0
1
2
3
4
LSQB q1-augA4
original1
rewritten1
False False
[(36,)] [(36,)]
orig+rewr
0
1
2
3
4
LSQB q1-augA5
original1
rewritten1
False False
[(30428,)] [(30428,)]
orig+rewr
0
1
2
3
4
LSQB q1-augA6
original1
rewritten1
False False
[(250019,)] [(250019,)]
orig+rewr
0
1
2
3
4
LSQB q1-augA7
original1
rewritten1
False False
[(250019,)] [(250019,)]
orig+rewr
0
1
2
3
4
LSQB q1-augA8
original1
rewritten1
False False
[(0,)] [(0,)]
orig+rewr
0
1
2
3
4
LSQB q1-augA9
original1
rewritten1
False False
[(0,)] [(0,)]
orig+rewr
0
1
2
3
4
LSQB q1
original1
rewritten1
False False
[(0,)] [(0,)]
orig+rewr
0
1
2
3
4


In [11]:
# NOTE(review): run_query_rewritten is not defined in the visible part of this notebook —
# confirm it exists before re-running this cell.
# The [10:] slice is positional, so it presumably depends on the exact contents of rewritten/ — verify.
folder_path = 'rewritten/'
files = sorted(os.listdir(folder_path))
output_files = [file for file in files if file.endswith('_output.json') and file.startswith('LSQB')][10:]

for file in output_files:
    file_split = file.split("_")
    run_query_rewritten(file_split[0], file_split[1])

q4-augA1
0
1
2
3
4
q4-augA2
0
1
2
3
4
q4-augA3
0
1
2
3
4
q4
0
1
2
3
4


### HETIO

In [None]:
# Obtain the Spark session (create_spark() ends in getOrCreate()) and register the
# public-schema tables of the "hetio" database as temp views.
spark = create_spark()
import_db(spark, "HETIO")

In [12]:
# Benchmark every HETIO query that has a `HETIO_<query>_output.json` file in rewritten/.
folder_path = 'rewritten/'
files = os.listdir(folder_path)
files.sort()
output_files = list(filter(
    lambda name: name.endswith('_output.json') and name.startswith('HETIO'),
    files,
))

for file in output_files:
    # File names look like <benchmark>_<query>_output.json.
    file_split = file.split("_")
    benchmark_name, query_name = file_split[0], file_split[1]
    run_query(benchmark_name, query_name, spark)

HETIO 10-01-SpDdGpPW-augA1
original1
rewritten1
False False
[('DOID:0050156',)] [('DOID:0050156',)]
orig+rewr
0
1
2
3
4
HETIO 10-01-SpDdGpPW-augA2
original1
rewritten1
False False
[('DOID:0050156',)] [('DOID:0050156',)]
orig+rewr
0
1
2
3
4
HETIO 10-01-SpDdGpPW-augA3
original1
rewritten1
False False
[('DOID:0050156',)] [('DOID:0050156',)]
orig+rewr
0
1
2
3
4
HETIO 10-01-SpDdGpPW-augA4
original1
rewritten1
False False
[('10000',)] [('10000',)]
orig+rewr
0
1
2
3
4
HETIO 10-01-SpDdGpPW-augA5
original1
rewritten1
False False
[('10000',)] [('10000',)]
orig+rewr
0
1
2
3
4
HETIO 10-01-SpDdGpPW-augA6
original1
rewritten1
False False
[('PC7_10399',)] [('PC7_10399',)]
orig+rewr
0
1
2
3
4
HETIO 10-01-SpDdGpPW
original1
rewritten1
False False
[('D000006',)] [('D000006',)]
orig+rewr
0
1
2
3
4
HETIO 10-02-SpDuGpPW-augA1
original1
rewritten1
False False
[('DOID:0050156',)] [('DOID:0050156',)]
orig+rewr
0
1
2
3
4
HETIO 10-02-SpDuGpPW-augA2
original1
rewritten1
False False
[('DOID:0050156',)] [('DOID:00