# Run the benchmark queries
for the DBMS:
* DuckDB
* Postgres

for the benchmark datasets:
* LSQB
* SNAP
* JOB
* STATS

the queries are executed as:
* original query
* rewritten with Yannakakis

# Use Apache Calcite

In [1]:
import subprocess

### Run a jar file

In [2]:
!java -version

java version "21.0.2" 2024-01-16 LTS
Java(TM) SE Runtime Environment (build 21.0.2+13-LTS-58)
Java HotSpot(TM) 64-Bit Server VM (build 21.0.2+13-LTS-58, mixed mode, sharing)


In [3]:
!java -jar JavaTestWithVar.jar 5 4

9


In [4]:
result = subprocess.check_output(['java', '-jar', 'JavaTestWithVar.jar', '5', '4'], text=True)
result = int(result)
print(result)

9


In [5]:
result = subprocess.check_output(['java', '-jar', 'QueryPlan.jar', 'select * from trades.trade'], text=True)
result = int(result)
print(result)

Error: Could not find or load main class org.calcite.QueryPlan
Caused by: java.lang.ClassNotFoundException: org.calcite.QueryPlan


CalledProcessError: Command '['java', '-jar', 'QueryPlan.jar', 'select * from trades.trade']' returned non-zero exit status 1.

### Run the java file

#### Testing with artifical trade dataset

In [6]:
command = "cd ../../../calcite_java/QueryPlan && ./run org.calcite.QueryPlan 'select * from trades.trade natural join trades.trade2 where \"tradeid\" > 232312123'"
result = subprocess.check_output(command, shell=True, text=True)
print(result)

16:34:40.998 [main] DEBUG org.apache.calcite.sql.parser - Reduced `tradeid` > 232312123
16:34:41.301 [main] DEBUG org.apache.calcite.sql2rel - Plan after converting SqlNode to RelNode
LogicalProject(qty=[COALESCE($2, $5)], tradeid=[$0], productid=[$1], tradeid2=[$3], productid2=[$4])
  LogicalFilter(condition=[>($0, 232312123)])
    LogicalJoin(condition=[=($2, $5)], joinType=[inner])
      LogicalTableScan(table=[[TRADES, TRADE]])
      LogicalTableScan(table=[[TRADES, TRADE2]])

LogicalProject(qty=[COALESCE($2, $5)], tradeid=[$0], productid=[$1], tradeid2=[$3], productid2=[$4])
  LogicalFilter(condition=[>($0, 232312123)])
    LogicalJoin(condition=[=($2, $5)], joinType=[inner])
      LogicalTableScan(table=[[TRADES, TRADE]])
      LogicalTableScan(table=[[TRADES, TRADE2]])




In [7]:
print(result.strip().split('\n\n')[-1])

LogicalProject(qty=[COALESCE($2, $5)], tradeid=[$0], productid=[$1], tradeid2=[$3], productid2=[$4])
  LogicalFilter(condition=[>($0, 232312123)])
    LogicalJoin(condition=[=($2, $5)], joinType=[inner])
      LogicalTableScan(table=[[TRADES, TRADE]])
      LogicalTableScan(table=[[TRADES, TRADE2]])


#### STATS

In [8]:
command = "cd ../../../calcite_java/QueryPlan && ./run org.calcite.QueryPlan 'select * from stats.\"comments\"'"
result = subprocess.check_output(command, shell=True, text=True)
result = result.strip().split('\n\n')[-1]
print(result)

LogicalProject(Id=[$0], PostId=[$1], Score=[$2], CreationDate=[$3], UserId=[$4])
  LogicalTableScan(table=[[STATS, comments]])


In [9]:
query = 'select * from stats.\\"comments\\"'
command = f'cd ../../../calcite_java/QueryPlan && ./run org.calcite.QueryPlan "{query}"'
result = subprocess.check_output(command, shell=True, text=True)
result = result.strip().split('\n\n')[-1]
print(result)

LogicalProject(Id=[$0], PostId=[$1], Score=[$2], CreationDate=[$3], UserId=[$4])
  LogicalTableScan(table=[[STATS, comments]])


In [10]:
file_path = '../stats-queries/001-014.sql'
with open(file_path, 'r') as file:
    query = file.read()
print(query)

SELECT COUNT(*) FROM comments as c, votes as v, users as u WHERE u.Id = c.UserId AND u.Id = v.UserId AND c.CreationDate>=CAST('2010-10-01 20:45:26' AS TIMESTAMP) AND c.CreationDate<=CAST('2014-09-05 12:51:17' AS TIMESTAMP) AND v.BountyAmount<=100 AND u.UpVotes=0 AND u.CreationDate<=CAST('2014-09-12 03:25:34' AS TIMESTAMP);


In [11]:
query = '''SELECT COUNT(*) FROM stats.\\"comments\\" as c, stats.\\"votes\\" as v, stats.\\"users\\" as u 
            WHERE u.\\"Id\\" = c.\\"UserId\\" AND u.\\"Id\\" = v.\\"UserId\\" 
                AND c.\\"CreationDate\\">=CAST('2010-10-01 20:45:26' AS TIMESTAMP)
                AND c.\\"CreationDate\\"<=CAST('2014-09-05 12:51:17' AS TIMESTAMP) 
                AND v.\\"BountyAmount\\"<=100 AND u.\\"UpVotes\\"=0 
                AND u.\\"CreationDate\\"<=CAST('2014-09-12 03:25:34' AS TIMESTAMP)'''
command = f'cd ../../../calcite_java/QueryPlan && ./run org.calcite.QueryPlan "{query}"'
result = subprocess.check_output(command, shell=True, text=True)
result = result.strip().split('\n\n')[-1]
print(result)

LogicalAggregate(group=[{}], EXPR$0=[COUNT()])
  LogicalFilter(condition=[AND(=($11, $4), =($11, $9), >=(CAST($3):TIMESTAMP(0) NOT NULL, CAST('2010-10-01 20:45:26'):TIMESTAMP(0) NOT NULL), <=(CAST($3):TIMESTAMP(0) NOT NULL, CAST('2014-09-05 12:51:17'):TIMESTAMP(0) NOT NULL), <=(CAST($10):INTEGER NOT NULL, 100), =(CAST($15):INTEGER NOT NULL, 0), <=(CAST($13):TIMESTAMP(0) NOT NULL, CAST('2014-09-12 03:25:34'):TIMESTAMP(0) NOT NULL))])
    LogicalJoin(condition=[true], joinType=[inner])
      LogicalJoin(condition=[true], joinType=[inner])
        LogicalTableScan(table=[[STATS, comments]])
        LogicalTableScan(table=[[STATS, votes]])
      LogicalTableScan(table=[[STATS, users]])


In [12]:
query = '''SELECT COUNT(*) FROM stats.\\"comments\\" as c JOIN stats.\\"users\\" as u ON u.\\"Id\\" = c.\\"UserId\\" '''
command = f'cd ../../../calcite_java/QueryPlan && ./run org.calcite.QueryPlan "{query}"'
result = subprocess.check_output(command, shell=True, text=True)
result = result.strip().split('\n\n')[-1]
print(result)

LogicalAggregate(group=[{}], EXPR$0=[COUNT()])
  LogicalJoin(condition=[=($5, $4)], joinType=[inner])
    LogicalTableScan(table=[[STATS, comments]])
    LogicalTableScan(table=[[STATS, users]])


In [23]:
query2 = '''SELECT count(*)
FROM stats.\\"users\\"
JOIN stats.\\"votes\\"
  ON \\"users\\".\\"Id\\" = \\"votes\\".\\"UserId\\"
JOIN stats.\\"comments\\"
  ON \\"users\\".\\"Id\\" = \\"comments\\".\\"UserId\\"'''
command = f'cd ../../../calcite_java/QueryPlan && ./run org.calcite.QueryPlan "{query2}"'
result = subprocess.check_output(command, shell=True, text=True)
result = result.strip().split('\n\n')[-1]
print(result)

LogicalAggregate(group=[{}], EXPR$0=[COUNT()])
  LogicalJoin(condition=[=($0, $16)], joinType=[inner])
    LogicalJoin(condition=[=($0, $10)], joinType=[inner])
      LogicalTableScan(table=[[STATS, users]])
      LogicalTableScan(table=[[STATS, votes]])
    LogicalTableScan(table=[[STATS, comments]])


In [25]:
query2 = '''SELECT count(*)
FROM stats.\\"users\\" as u
JOIN stats.\\"votes\\" as v
  ON u.\\"Id\\" = v.\\"UserId\\"
JOIN stats.\\"comments\\" as c
  ON u.\\"Id\\" = c.\\"UserId\\"'''
command = f'cd ../../../calcite_java/QueryPlan && ./run org.calcite.QueryPlan "{query2}"'
result = subprocess.check_output(command, shell=True, text=True)
result = result.strip().split('\n\n')[-1]
print(result)

LogicalAggregate(group=[{}], EXPR$0=[COUNT()])
  LogicalJoin(condition=[=($0, $16)], joinType=[inner])
    LogicalJoin(condition=[=($0, $10)], joinType=[inner])
      LogicalTableScan(table=[[STATS, users]])
      LogicalTableScan(table=[[STATS, votes]])
    LogicalTableScan(table=[[STATS, comments]])


In [26]:
query3 = '''select count(*) from stats.\\"comments\\" c1, stats.\\"comments\\" c2, stats.\\"comments\\" c3 
where c1.\\"Id\\" = c2.\\"PostId\\" AND c2.\\"Id\\" = c3.\\"UserId\\"'''
command = f'cd ../../../calcite_java/QueryPlan && ./run org.calcite.QueryPlan "{query3}"'
result = subprocess.check_output(command, shell=True, text=True)
result = result.strip().split('\n\n')[-1]
print(result)

LogicalAggregate(group=[{}], EXPR$0=[COUNT()])
  LogicalFilter(condition=[AND(=($0, $6), =($5, $14))])
    LogicalJoin(condition=[true], joinType=[inner])
      LogicalJoin(condition=[true], joinType=[inner])
        LogicalTableScan(table=[[STATS, comments]])
        LogicalTableScan(table=[[STATS, comments]])
      LogicalTableScan(table=[[STATS, comments]])


#### LSQB

In [19]:
%%bash
cd ../lsqb/data/social-network-sf1-merged-fk/
ls

City.csv
City_isPartOf_Country.csv
Comment.csv
Comment_hasTag_Tag.csv
Company.csv
Company_isLocatedIn_Country.csv
Continent.csv
Country.csv
Country_isPartOf_Continent.csv
Forum.csv
Forum_hasMember_Person.csv
Forum_hasTag_Tag.csv
Person.csv
Person_hasInterest_Tag.csv
Person_knows_Person.csv
Person_likes_Comment.csv
Person_likes_Post.csv
Person_studyAt_University.csv
Person_workAt_Company.csv
Post.csv
Post_hasTag_Tag.csv
Tag.csv
TagClass.csv
University.csv
University_isLocatedIn_City.csv


In [20]:
import csv

# Replace 'your_file.csv' with the actual path to your CSV file
csv_file_path = '../lsqb/data/social-network-sf1-merged-fk/City.csv'

# Open the CSV file in read mode
with open(csv_file_path, 'r') as csv_file:
    # Create a CSV reader
    csv_reader = csv.reader(csv_file)

    # Read the header row
    header_row = next(csv_reader)

# Print the column names
print('Column Names:', header_row)


Column Names: ['id|ispartof_country']


In [21]:
import csv

# Replace 'your_file.csv' with the actual path to your CSV file
csv_file_path = '../stats/datasets/comments.csv'

# Open the CSV file in read mode
with open(csv_file_path, 'r') as csv_file:
    # Create a CSV reader
    csv_reader = csv.reader(csv_file)

    # Read the header row
    header_row = next(csv_reader)

# Print the column names
print('Column Names:', header_row)


Column Names: ['Id', 'PostId', 'Score', 'CreationDate', 'UserId']


In [None]:
query = '''SELECT count(*)
FROM lsqb.\\"Country\\"
JOIN lsqb.\\"City\\"
  ON \\"City\\".\\"ispartof_country\\" = \\"Country\\".\\"id\\"
JOIN lsqb.\\"Person\\"
  ON \\"Person\\".\\"islocatedin_city\\" = \\"City\\".\\"id\\"
JOIN lsqb.\\"Forum_hasMember_Person\\"
  ON \\"Forum_hasMember_Person\\".\\"id\\" = \\"Person\\".\\"id\\"'''
command = f'cd ../../../calcite_java/QueryPlan && ./run org.calcite.QueryPlan "{query}"'
result = subprocess.check_output(command, shell=True, text=True)
result = result.strip().split('\n\n')[-1]
print(result)

#### String manipulation

In [57]:
import re

def transform_columns(input_string):
    # Define a regular expression pattern to match the column specifications
    pattern = re.compile(r'\b(\w+)\s*(?:as\s*(\w+))?,?\s*')

    # Find all matches in the input string
    matches = pattern.findall(input_string)

    # Replace the matched patterns with the desired strings
    transformed_string = input_string
    for match in matches:
        original_column, alias = match
        alias = alias or original_column  # If no alias is provided, use the original column name
        replacement = f'status.{original_column} AS {alias}'
        transformed_string = re.sub(re.escape(match[0]), replacement, transformed_string)

    return transformed_string

# Example input strings
input_strings = [
    "comments as c, tag as t",
    "comments AS c, tag AS t",
    "comments c, tag t",
    "comments, tag"
]

# Apply the transformation to each input string
for input_str in input_strings:
    result = transform_columns(input_str)
    print(f'Original: {input_str}\nTransformed: {result}\n')


Original: comments as c, tag as t
Transformed: status.comments AS c as c, status.tag AS t as t

Original: comments AS c, tag AS t
Transformed: sstatus.t AS tastatus.t AS tus.sstatus.t AS tastatus.t AS tus.c sstatus.t AS tastatus.t AS tus.AS AS AS commenstatus.t AS ts sstatus.t AS tastatus.t AS tus.sstatus.t AS tastatus.t AS tus.AS AS AS sstatus.t AS tastatus.t AS tus.AS AS AS sstatus.t AS tastatus.t AS tus.AS AS AS sstatus.t AS tastatus.t AS tus.c sstatus.t AS tastatus.t AS tus.AS AS AS commenstatus.t AS ts sstatus.t AS tastatus.t AS tus.sstatus.t AS tastatus.t AS tus.AS AS AS sstatus.t AS tastatus.t AS tus.AS AS AS sstatus.t AS tastatus.t AS tus.AS AS AS sstatus.t AS tastatus.t AS tus.c sstatus.t AS tastatus.t AS tus.AS AS AS c, sstatus.t AS tastatus.t AS tus.status.t AS tag sstatus.t AS tastatus.t AS tus.AS AS AS status.t AS tag sstatus.t AS tastatus.t AS tus.sstatus.t AS tastatus.t AS tus.AS AS AS sstatus.t AS tastatus.t AS tus.AS AS AS sstatus.t AS tastatus.t AS tus.AS AS AS status

In [58]:
import re

def transform_columns(input_string):
    # Define a regular expression pattern to match the column specifications
    pattern = re.compile(r'\b(\w+)\s*(?:as\s*(\w+))?,?\s*')

    # Find all matches in the input string
    matches = pattern.findall(input_string)
    print(matches)

# Example input strings
input_strings = [
    "comments as c, tag as t",
    "comments AS c, tag AS t",
    "comments c, tag t",
    "comments, tag"
]

# Apply the transformation to each input string
for input_str in input_strings:
    result = transform_columns(input_str)
    print(f'Original: {input_str}\nTransformed: {result}\n')


[('comments', 'c'), ('tag', 't')]
Original: comments as c, tag as t
Transformed: None

[('comments', ''), ('AS', ''), ('c', ''), ('tag', ''), ('AS', ''), ('t', '')]
Original: comments AS c, tag AS t
Transformed: None

[('comments', ''), ('c', ''), ('tag', ''), ('t', '')]
Original: comments c, tag t
Transformed: None

[('comments', ''), ('tag', '')]
Original: comments, tag
Transformed: None



In [107]:
re.split(r'\bwhere\b', "a bb", flags=re.IGNORECASE)

['a bb']

In [35]:
import re

In [125]:
def query_string_manipulation(query, dataset):
    query = query.replace('\n', ' ')
    q = re.split(r'\bfrom\b', query, flags=re.IGNORECASE)
    result = q[0]

    tables = []
    if re.search(r'\bwhere\b', q[1], flags=re.IGNORECASE):
        q1 = re.split(r'\bwhere\b', q[1], flags=re.IGNORECASE)
        q10 = re.split(",", q1[0])
        for table in q10:
            table_split = table.strip().split(maxsplit = 1)
            result += dataset + '.\\\\"' + table_split[0] + '\\\\" ' + table_split[1] + ', '
            tables.append(table_split[0])
        conditions = re.split(r'\band\b', q1[1], flags=re.IGNORECASE)   
        for con in conditions:
            con_split = re.split(r'\s*([><]?=)\s*', con.strip())
            print(con_split)
            for i in con_split:
                x = i.split(".")
                print(x)
                if x[0] in tables:
                    print("yo")
                else:
                    print("ho")
    print(result)

In [126]:
query1 = '''SELECT COUNT(*) FROM comments as c, votes as v, users as u 
            WHERE u.Id = c.UserId AND u.Id = v.UserId
                AND comments.CreationDate>=CAST('2010-10-01 20:45:26' AS TIMESTAMP)
                AND c.CreationDate<=CAST('2014-09-05 12:51:17' AS TIMESTAMP) 
                AND v.BountyAmount<=100 AND u.UpVotes=0 
                AND u.CreationDate<=CAST('2014-09-12 03:25:34' AS TIMESTAMP)'''
query_string_manipulation(query1, "stats")

['u.Id', '=', 'c.UserId']
['u', 'Id']
ho
['=']
ho
['c', 'UserId']
ho
['u.Id', '=', 'v.UserId']
['u', 'Id']
ho
['=']
ho
['v', 'UserId']
ho
['comments.CreationDate', '>=', "CAST('2010-10-01 20:45:26' AS TIMESTAMP)"]
['comments', 'CreationDate']
yo
['>=']
ho
["CAST('2010-10-01 20:45:26' AS TIMESTAMP)"]
ho
['c.CreationDate', '<=', "CAST('2014-09-05 12:51:17' AS TIMESTAMP)"]
['c', 'CreationDate']
ho
['<=']
ho
["CAST('2014-09-05 12:51:17' AS TIMESTAMP)"]
ho
['v.BountyAmount', '<=', '100']
['v', 'BountyAmount']
ho
['<=']
ho
['100']
ho
['u.UpVotes', '=', '0']
['u', 'UpVotes']
ho
['=']
ho
['0']
ho
['u.CreationDate', '<=', "CAST('2014-09-12 03:25:34' AS TIMESTAMP)"]
['u', 'CreationDate']
ho
['<=']
ho
["CAST('2014-09-12 03:25:34' AS TIMESTAMP)"]
ho
SELECT COUNT(*) stats.\\"comments\\" as c, stats.\\"votes\\" as v, stats.\\"users\\" as u, 


In [95]:
query2 = '''SELECT count(*)
FROM users
JOIN votes
  ON users.Id = votes.UserId
JOIN comments
  ON users.Id = comments.UserId'''
query_string_manipulation(query2, "stats")

SELECT count(*) 


In [110]:
query3 = '''select count(*) from comments c1, comments c2, comments c3 where c1.Id = c2.PostId AND c2.Id = c3.UserId'''
query_string_manipulation(query3, "stats")

['c1.Id', '=', 'c2.PostId']
['c2.Id', '=', 'c3.UserId']
select count(*) stats.\\"comments\\" c1, stats.\\"comments\\" c2, stats.\\"comments\\" c3, 
['comments', 'comments', 'comments']


In [41]:
query1 = '''SELECT COUNT(*) FROM stats.\\"comments\\" as c, stats.\\"votes\\" as v, stats.\\"users\\" as u 
            WHERE u.\\"Id\\" = c.\\"UserId\\" AND u.\\"Id\\" = v.\\"UserId\\" 
                AND c.\\"CreationDate\\">=CAST('2010-10-01 20:45:26' AS TIMESTAMP)
                AND c.\\"CreationDate\\"<=CAST('2014-09-05 12:51:17' AS TIMESTAMP) 
                AND v.\\"BountyAmount\\"<=100 AND u.\\"UpVotes\\"=0 
                AND u.\\"CreationDate\\"<=CAST('2014-09-12 03:25:34' AS TIMESTAMP)'''

In [42]:
query2 = '''SELECT count(*)
FROM stats.\\"users\\"
JOIN stats.\\"votes\\"
  ON \\"users\\".\\"Id\\" = \\"votes\\".\\"UserId\\"
JOIN stats.\\"comments\\"
  ON \\"users\\".\\"Id\\" = \\"comments\\".\\"UserId\\"'''

In [43]:
query3 = '''select count(*) from stats.\\"comments\\" c1, stats.\\"comments\\" c2, stats.\\"comments\\" c3 
where c1.\\"Id\\" = c2.\\"PostId\\" AND c2.\\"Id\\" = c3.\\"UserId\\"'''

In [None]:
query2 = '''SELECT count(*)
FROM lsqb.\\"Country\\"
JOIN lsqb.\\"City\\"
  ON City.\\"isPartOf_CountryId\\" = Country.\\"CountryId\\"
JOIN lsqb.\\"Person\\"
  ON Person.\\"isLocatedIn_CityId\\" = City.\\"CityId\\"
JOIN lsqb.\\"Forum_hasMember_Person\\"
  ON Forum_hasMember_Person.\\"PersonId\\" = Person.\\"PersonId\\"'''
query_string_manipulation(query2)

In [33]:
import psycopg2

# Replace these with your PostgreSQL connection details
dbname = "lsqb"
user = "lsqb"
password = "lsqb"
host = "postgres"

# SQL statement representing your logical query plan
logical_query = result

try:
    # Connect to the PostgreSQL database
    connection = psycopg2.connect(dbname=dbname, user=user, password=password, host=host)

    # Create a cursor to interact with the database
    cursor = connection.cursor()

    # Execute the logical query plan
    cursor.execute(logical_query)

    # Fetch the results if needed
    results = cursor.fetchall()
    print(results)

    # Commit the transaction
    connection.commit()

except Exception as e:
    print("Error:", e)

finally:
    # Close the cursor and connection
    cursor.close()
    connection.close()


Error: could not translate host name "postgres" to address: Name or service not known



NameError: name 'cursor' is not defined

In [8]:
!pip install docopt
!pip install networkx
!pip install colorama
!pip install termcolor
!pip install duckdb
!pip install psycopg

[0m

In [2]:
!chmod +x sql2hg.jar

The queries here have to have the form SELECT * FROM ... WHERE ..., without additional count/min in SELECT or >=/<= in the WHERE or joins. Also aliases (badges AS b) does not work!   
The queries in the benchmarks do not have this form, but we simplify them (e.g. COUNT(*) -> *).  
The queries simplified to this pattern are saved in the "simple" folders.

### Create the execution plan for one query

In [3]:
%%bash
python3 main.py "lsqb/sql/simple/q1.sql"

/* Arguments 
 {'--agg': False,
 '--pg': False,
 '<query>': 'lsqb/sql/simple/q1.sql'}  */
TreeNode-Post
-- Time to compute JT: 0.2214040000581008 ms

--- stage1
create view Post_s1_14c8510065a68094 as select Post.PostId as v9,Post.Forum_containerOfId as v6 from Post;
create view Forum_hasMember_Person_s1_14c8510065a68094 as select Forum_hasMember_Person.PersonId as v4,Forum_hasMember_Person.ForumId as v6 from Forum_hasMember_Person;
create view Person_s1_14c8510065a68094 as select Person.isLocatedIn_CityId as v2,Person.PersonId as v4 from Person;
create view City_s1_14c8510065a68094 as select City.CityId as v2,City.isPartOf_CountryId as v0 from City;
create view Country_s2_14c8510065a68094 as select Country.CountryId as v0 from Country;
create view Forum_s2_14c8510065a68094 as select Forum.ForumId as v6 from Forum;
create view Comment_s1_14c8510065a68094 as select Comment.CommentId as v11,Comment.replyOf_PostId as v9 from Comment;
create view Comment_hasTag_Tag_s1_14c8510065a68094 as s

### Create the execution plan and execute it and stop the time

#### with DuckDB and LSQB

erstellen der der DuckDB database for LSQB

In [5]:
# use the scale factor, which you also used for downloading the data
SF = 1

In [6]:
import duckdb
import os

con = duckdb.connect(database=f"lsqb/lsqb{SF}.duckdb")

with open("lsqb/sql/schema.sql", "r") as f:
    schema_init_query = f.read()
    con.execute(schema_init_query)

data_dir = f"lsqb/data/social-network-sf{SF}-merged-fk"
with open("lsqb/sql/snb-load.sql", "r") as f:
    load_query = f.read().replace("PATHVAR", data_dir)
    con.execute(load_query)

with open("lsqb/sql/views.sql", "r") as f:
    view_init_query = f.read()
    con.execute(view_init_query)

con.close()

In [7]:
!python3 duckrunner.py "lsqb/sql/simple/test1.sql" "results/DDB_LSQB.csv" "lsqb/lsqb${SF}.duckdb"

{'--agg': False,
 '<csvout>': 'results/DDB_LSQB.csv',
 '<dbfile>': 'lsqb/lsqb.duckdb',
 '<glob>': 'lsqb/sql/simple/test1.sql'}
lsqb/sql/simple/test1.sql
TreeNode-City
Qsetup create temp table _star_select as SELECT Country.CountryId,City.CityId
FROM Country, City, Person
WHERE City.isPartOf_CountryId = Country.CountryId
AND Person.isLocatedIn_CityId = City.CityId; TreeNode-City [31m[22mCity[0m([33m[22mv2[0m,[33m[22mv0[0m)
[31m[22mCountry[0m([33m[22mv0[0m)
[31m[22mPerson[0m([33m[22mv2[0m)
 {'v0': ['Country.CountryId', 'City.isPartOf_CountryId'], 'v2': ['City.CityId', 'Person.isLocatedIn_CityId']}
here
  width 1
  running classic
  -- debug:  create temp table _star_select as SELECT Country.CountryId,City.CityId
FROM Country, City, Person
WHERE City.isPartOf_CountryId = Country.CountryId
AND Person.isLocatedIn_CityId = City.CityId;
Exitcode None
['test1.sql', 'baseline', '', 1, 0.01664342399999441, 'ok', '(184000)']
[0m------------
QUERIES stage1
	 create view City_

In [80]:
!python3 duckrunner.py "lsqb/sql/simple/test*.sql" "results/DDB_LSQB.csv" "lsqb/lsqb${SF}.duckdb"

{'--agg': False,
 '<csvout>': 'DDB_LSQB.csv',
 '<dbfile>': '/benchmark/lsqb/lsqb1.duckdb',
 '<glob>': '../lsqb/sql/simple/test*.sql'}
../lsqb/sql/simple/test1.sql
Query SELECT *
FROM Country, City, Person
WHERE City.isPartOf_CountryId = Country.CountryId
AND Person.isLocatedIn_CityId = City.CityId;
TreeNode-City
Qsetup create temp table _star_select as SELECT Country.CountryId,City.CityId
FROM Country, City, Person
WHERE City.isPartOf_CountryId = Country.CountryId
AND Person.isLocatedIn_CityId = City.CityId; TreeNode-City [31m[22mCity[0m([33m[22mv2[0m,[33m[22mv0[0m)
[31m[22mCountry[0m([33m[22mv0[0m)
[31m[22mPerson[0m([33m[22mv2[0m)
 {'v0': ['Country.CountryId', 'City.isPartOf_CountryId'], 'v2': ['City.CityId', 'Person.isLocatedIn_CityId']}
here
  width 1
  running classic
  -- debug:  create temp table _star_select as SELECT Country.CountryId,City.CityId
FROM Country, City, Person
WHERE City.isPartOf_CountryId = Country.CountryId
AND Person.isLocatedIn_CityId = Cit

In [81]:
!python3 duckrunner.py "lsqb/sql/simple/test*.sql" "results/DDB_LSQB_AGG.csv" "lsqb/lsqb1.duckdb" --agg

{'--agg': True,
 '<csvout>': 'DDB_LSQB_AGG.csv',
 '<dbfile>': '/benchmark/lsqb/lsqb1.duckdb',
 '<glob>': '../lsqb/sql/simple/test*.sql'}
../lsqb/sql/simple/test1.sql
Query SELECT *
FROM Country, City, Person
WHERE City.isPartOf_CountryId = Country.CountryId
AND Person.isLocatedIn_CityId = City.CityId;
TreeNode-City
Qsetup SELECT *
FROM Country, City, Person
WHERE City.isPartOf_CountryId = Country.CountryId
AND Person.isLocatedIn_CityId = City.CityId; TreeNode-City [31m[22mCity[0m([33m[22mv0[0m,[33m[22mv2[0m)
[31m[22mCountry[0m([33m[22mv0[0m)
[31m[22mPerson[0m([33m[22mv2[0m)
 {'v0': ['Country.CountryId', 'City.isPartOf_CountryId'], 'v2': ['City.CityId', 'Person.isLocatedIn_CityId']}
here
  width 1
  running classic
  -- debug:  SELECT *
FROM Country, City, Person
WHERE City.isPartOf_CountryId = Country.CountryId
AND Person.isLocatedIn_CityId = City.CityId;
Exitcode None
[0m['test1.sql', 'baseline', 'min(City.isPartOf_CountryId)', 1, 0.011251795000134734, 'ok', '(57

#### with PostgresSQL and LSQB

In [9]:
!chmod +x import-lsqb.sh

In [12]:
%%bash
export SF=1
./import-lsqb.sh

DROP VIEW
DROP VIEW
DROP VIEW
DROP VIEW
DROP VIEW
DROP VIEW
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE


psql:lsqb/sql/schema.sql:1: NOTICE:  table "company" does not exist, skipping


DROP TABLE
DROP TABLE


psql:lsqb/sql/schema.sql:2: NOTICE:  table "university" does not exist, skipping
psql:lsqb/sql/schema.sql:3: NOTICE:  table "continent" does not exist, skipping
psql:lsqb/sql/schema.sql:4: NOTICE:  table "country" does not exist, skipping
psql:lsqb/sql/schema.sql:5: NOTICE:  table "city" does not exist, skipping
psql:lsqb/sql/schema.sql:6: NOTICE:  table "tag" does not exist, skipping
psql:lsqb/sql/schema.sql:7: NOTICE:  table "tagclass" does not exist, skipping
psql:lsqb/sql/schema.sql:8: NOTICE:  table "forum" does not exist, skipping
psql:lsqb/sql/schema.sql:9: NOTICE:  table "comment" does not exist, skipping
psql:lsqb/sql/schema.sql:10: NOTICE:  table "post" does not exist, skipping
psql:lsqb/sql/schema.sql:11: NOTICE:  table "person" does not exist, skipping
psql:lsqb/sql/schema.sql:12: NOTICE:  table "comment_hastag_tag" does not exist, skipping
psql:lsqb/sql/schema.sql:13: NOTICE:  table "post_hastag_tag" does not exist, skipping
psql:lsqb/sql/schema.sql:14: NOTICE:  table "for

DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
COPY 1575
COPY 6380
COPY 6
COPY 111
COPY 1343
COPY 109617
COPY 2580332
COPY 1229275
COPY 11000
COPY 16080
COPY 71
COPY 3148317
COPY 815205
COPY 3268415
COPY 354213
COPY 255596
COPY 1668015
COPY 853145
COPY 8880
COPY 23600
COPY 226293
COPY 226293
CREATE VIEW
CREATE VIEW
CREATE VIEW
CREATE VIEW
CREATE VIEW
CREATE VIEW


In [18]:
!python3 pgrunner.py "lsqb/sql/simple/test1.sql" "results/POS_LSQB.csv" "host=postgres dbname=lsqb user=lsqb password=lsqb"

{'--agg': False,
 '--skip-classic': False,
 '<csvout>': 'results/POS_LSQB.csv',
 '<glob>': 'lsqb/sql/simple/test1.sql',
 '<pgconnectstr>': 'host=postgres dbname=lsqb user=lsqb password=lsqb'}
lsqb/sql/simple/test1.sql
TreeNode-City
  width 1
  running classic
  -- debug:  create temp table _star_select as SELECT Country.CountryId as Country___CountryId,City.CityId as City___CityId
FROM Country, City, Person
WHERE City.isPartOf_CountryId = Country.CountryId
AND Person.isLocatedIn_CityId = City.CityId;
['test1.sql', 'baseline', '', 1, 0.03149839300021995, 'ok', '-']
[0m------------
QUERIES stage1
	 create view City_s1_16e7e30065a681bd as select City.isPartOf_CountryId as v0,City.CityId as v2 from City;
	 create view Person_s2_16e7e30065a681bd as select Person.isLocatedIn_CityId as v2 from Person;
	 create view Country_s2_16e7e30065a681bd as select Country.CountryId as v0 from Country;
QUERIES stage2
	 create temp table City_s2_16e7e30065a681bd as select * from City_s1_16e7e30065a681bd w

In [19]:
!python3 pgrunner.py "lsqb/sql/simple/test1.sql" "results/POS_LSQB_AGG.csv" "host=postgres dbname=lsqb user=lsqb password=lsqb" --agg

{'--agg': True,
 '--skip-classic': False,
 '<csvout>': 'results/POS_LSQB_AGG.csv',
 '<glob>': 'lsqb/sql/simple/test1.sql',
 '<pgconnectstr>': 'host=postgres dbname=lsqb user=lsqb password=lsqb'}
lsqb/sql/simple/test1.sql
TreeNode-City
  width 1
  running classic
  -- debug:  SELECT *
FROM Country, City, Person
WHERE City.isPartOf_CountryId = Country.CountryId
AND Person.isLocatedIn_CityId = City.CityId;
['test1.sql', 'baseline', 'min(City.isPartOf_CountryId)', 1, 0.15473245200064412, 'ok', '(57, 1458, 874, 57, 26388279068012, 874)']
------------
QUERIES stage1
	 create view City_s1_1731508065a681c2 as select City.isPartOf_CountryId as v0,City.CityId as v2 from City;
	 create view Person_s2_1731508065a681c2 as select Person.isLocatedIn_CityId as v2 from Person;
	 create view Country_s2_1731508065a681c2 as select Country.CountryId as v0 from Country;
QUERIES stage2
	 create temp table City_s2_1731508065a681c2 as select * from City_s1_1731508065a681c2 where (v0) in (select v0 from Country

#### with DuckDB and SNAP

will not work because we have aliases

#### with PostgreSQL and SNAP

In [None]:
!chmod +x import-stats.sh

In [None]:
%%bash
./import-stats.sh

#### with DuckDB and STATS

In [21]:
import duckdb
import os

con = duckdb.connect(database="stats/stats.duckdb")

with open("stats/sql/schema.sql", "r") as f:
    schema_init_query = f.read()
    con.execute(schema_init_query)

data_dir = f"stats/datasets"
with open("stats/sql/snb-load.sql", "r") as f:
    load_query = f.read().replace("PATHVAR", data_dir)
    con.execute(load_query)

con.close()

In [24]:
!python3 duckrunner.py "stats-queries/simple/test1.sql" "results/DDB_STATS.csv" "stats/stats.duckdb" --agg

{'--agg': True,
 '<csvout>': 'results/DDB_STATS.csv',
 '<dbfile>': 'stats/stats.duckdb',
 '<glob>': 'stats-queries/simple/test1.sql'}
stats-queries/simple/test1.sql
TreeNode-users
Qsetup SELECT min(comments.UserId) FROM comments, votes, users WHERE users.Id = comments.UserId AND users.Id = votes.UserId; TreeNode-comments [31m[22mcomments[0m([33m[22mv0[0m)
[31m[22musers[0m([33m[22mv0[0m)
[31m[22mvotes[0m([33m[22mv0[0m)
 {'v0': ['votes.UserId', 'comments.UserId', 'users.Id']}
here
  width 1
  running classic
  -- debug:  SELECT min(comments.UserId) FROM comments, votes, users WHERE users.Id = comments.UserId AND users.Id = votes.UserId;
Exitcode None
[0m['test1.sql', 'baseline', 'min(comments.UserId)', 1, 0.4080308620004871, 'ok', '(5)']
------------
QUERIES stage1
	 create view comments_s1_f35500065a68203 as select comments.UserId as v0 from comments;
	 create view users_s1_f35500065a68203 as select users.Id as v0 from users;
	 create view votes_s2_f35500065a68203 as 

In [23]:
!python3 duckrunner.py "stats-queries/simple/test1.sql" "results/DDB_STATS_AGG.csv" "stats/stats.duckdb" --agg

{'--agg': True,
 '<csvout>': 'results/DDB_STATS_AGG.csv',
 '<dbfile>': 'stats/stats.duckdb',
 '<glob>': 'stats-queries/simple/test1.sql'}
stats-queries/simple/test1.sql
TreeNode-users
Qsetup SELECT min(comments.UserId) FROM comments, votes, users WHERE users.Id = comments.UserId AND users.Id = votes.UserId; TreeNode-comments [31m[22mcomments[0m([33m[22mv0[0m)
[31m[22musers[0m([33m[22mv0[0m)
[31m[22mvotes[0m([33m[22mv0[0m)
 {'v0': ['votes.UserId', 'comments.UserId', 'users.Id']}
here
  width 1
  running classic
  -- debug:  SELECT min(comments.UserId) FROM comments, votes, users WHERE users.Id = comments.UserId AND users.Id = votes.UserId;
Exitcode None
[0m['test1.sql', 'baseline', 'min(comments.UserId)', 1, 0.49434783600008814, 'ok', '(5)']
------------
QUERIES stage1
	 create view comments_s1_167fc00065a681f8 as select comments.UserId as v0 from comments;
	 create view users_s1_167fc00065a681f8 as select users.Id as v0 from users;
	 create view votes_s2_167fc00065a6

#### with PostgreSQL and STATS

In [25]:
!chmod +x import-stats.sh

In [26]:
%%bash
./import-stats.sh

DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
COPY 79851
COPY 174305
COPY 303187
COPY 11102
COPY 91976
COPY 1032
COPY 40325
COPY 328064


In [27]:
!python3 pgrunner.py "stats-queries/simple/test1.sql" "results/POS_STATS.csv" "host=postgres dbname=stats user=stats password=stats"

{'--agg': False,
 '--skip-classic': False,
 '<csvout>': 'results/POS_STATS.csv',
 '<glob>': 'stats-queries/simple/test1.sql',
 '<pgconnectstr>': 'host=postgres dbname=stats user=stats password=stats'}
stats-queries/simple/test1.sql
TreeNode-users
  width 1
  running classic
  -- debug:  create temp table _star_select as SELECT votes.UserId as votes___UserId FROM comments, votes, users WHERE users.Id = comments.UserId AND users.Id = votes.UserId;
['test1.sql', 'baseline', '', 1, 25.609564517999388, 'ok', '-']
------------
QUERIES stage1
	 create view users_s1_f48a0065a6838c as select users.Id as v0 from users;
	 create view votes_s2_f48a0065a6838c as select votes.UserId as v0 from votes;
	 create view comments_s2_f48a0065a6838c as select comments.UserId as v0 from comments;
QUERIES stage2
	 create temp table users_s2_f48a0065a6838c as select * from users_s1_f48a0065a6838c where (v0) in (select v0 from comments_s2_f48a0065a6838c) AND (v0) in (select v0 from votes_s2_f48a0065a6838c);
QUER

In [28]:
!python3 pgrunner.py "stats-queries/simple/test1.sql" "results/POS_STATS_AGG.csv" "host=postgres dbname=stats user=stats password=stats" --agg

{'--agg': True,
 '--skip-classic': False,
 '<csvout>': 'results/POS_STATS_AGG.csv',
 '<glob>': 'stats-queries/simple/test1.sql',
 '<pgconnectstr>': 'host=postgres dbname=stats user=stats password=stats'}
stats-queries/simple/test1.sql
TreeNode-users
  width 1
  running classic
  -- debug:  SELECT min(comments.UserId) FROM comments, votes, users WHERE users.Id = comments.UserId AND users.Id = votes.UserId;
[0m['test1.sql', 'baseline', 'min(comments.UserId)', 1, 0.6294853949993922, 'ok', '(5)']
------------
QUERIES stage1
	 create view comments_s1_1541b58065a683ec as select comments.UserId as v0 from comments;
	 create view users_s1_1541b58065a683ec as select users.Id as v0 from users;
	 create view votes_s2_1541b58065a683ec as select votes.UserId as v0 from votes;
QUERIES stage2
	 create temp table users_s2_1541b58065a683ec as select * from users_s1_1541b58065a683ec where (v0) in (select v0 from votes_s2_1541b58065a683ec);
	 create temp table comments_s2_1541b58065a683ec as select * fr

#### with DuckDB and JOB

#### with PostgreSQL and JOB