# Loading the benchmark data and queries

* LSQB
* SNAP
* JOB
* STATS
* HETIO

# PostgreSQL

## Create the DB users

In [25]:
%%bash

export PGPASSWORD=postgres

create_db() {
cat <<EOF
        create user $1 with password '$1';
        create database $1;
        grant all privileges on database $1 to $1;
        \c $1 postgres
        grant all on schema public to $1;
EOF
}

psql --host=postgres --username postgres --dbname postgres <<-EOSQL
        $(create_db lsqb)
        $(create_db snap)
        $(create_db imdb)
        $(create_db stats)
        $(create_db hetio)
EOSQL

ERROR:  role "lsqb" already exists
ERROR:  database "lsqb" already exists


GRANT
You are now connected to database "lsqb" as user "postgres".
GRANT


ERROR:  role "snap" already exists
ERROR:  database "snap" already exists


GRANT
You are now connected to database "snap" as user "postgres".
GRANT


ERROR:  role "imdb" already exists
ERROR:  database "imdb" already exists


GRANT
You are now connected to database "imdb" as user "postgres".
GRANT


ERROR:  role "stats" already exists
ERROR:  database "stats" already exists


GRANT
You are now connected to database "stats" as user "postgres".
GRANT


ERROR:  role "hetio" already exists
ERROR:  database "hetio" already exists


GRANT
You are now connected to database "hetio" as user "postgres".
GRANT


# LSQB

## Fetch LSQB data

You either can download one LSQB dataset, deciding which scale factor you want or a lot of different ones. Both methods are provided here.  

Only load one dataset with scale factor SF (here = 1)

In [2]:
!chmod +x lsqb/scripts/download-merged-fk-single-data-set.sh

In [3]:
%%bash
cd lsqb
export SF=1
./scripts/download-merged-fk-single-data-set.sh
cd ..

Downloading scale factor 1


Downloading the LSQB data for different scale factors: 0.1, 0.3, 1, 3, 10, 30, 100, 300, 1000

In [4]:
#!chmod +x lsqb/scripts/download-merged-fk-data-sets.sh

In [5]:
#%%bash
#cd lsqb
#export MAX_SF=300
#./scripts/download-merged-fk-data-sets.sh
#cd ..

## Import the LSQB benchmark data

Import the benchmark data for scale factor 1

In [6]:
!chmod +x import-lsqb.sh

In [7]:
%%bash
export SF=1
./import-lsqb.sh

psql:lsqb/sql/drop.sql:1: NOTICE:  view "message" does not exist, skipping


DROP VIEW


psql:lsqb/sql/drop.sql:2: NOTICE:  view "comment_replyof_message" does not exist, skipping
psql:lsqb/sql/drop.sql:3: NOTICE:  view "message_hascreator_person" does not exist, skipping
psql:lsqb/sql/drop.sql:4: NOTICE:  view "message_hastag_tag" does not exist, skipping
psql:lsqb/sql/drop.sql:5: NOTICE:  view "message_islocatedin_country" does not exist, skipping
psql:lsqb/sql/drop.sql:6: NOTICE:  view "person_likes_message" does not exist, skipping
psql:lsqb/sql/drop.sql:8: NOTICE:  table "company" does not exist, skipping
psql:lsqb/sql/drop.sql:9: NOTICE:  table "university" does not exist, skipping
psql:lsqb/sql/drop.sql:10: NOTICE:  table "continent" does not exist, skipping
psql:lsqb/sql/drop.sql:11: NOTICE:  table "country" does not exist, skipping
psql:lsqb/sql/drop.sql:12: NOTICE:  table "city" does not exist, skipping
psql:lsqb/sql/drop.sql:13: NOTICE:  table "tag" does not exist, skipping
psql:lsqb/sql/drop.sql:14: NOTICE:  table "tagclass" does not exist, skipping
psql:lsqb/s

DROP VIEW
DROP VIEW
DROP VIEW
DROP VIEW
DROP VIEW
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE


psql:lsqb/sql/drop.sql:23: NOTICE:  table "person_hasinterest_tag" does not exist, skipping
psql:lsqb/sql/drop.sql:24: NOTICE:  table "person_likes_comment" does not exist, skipping
psql:lsqb/sql/drop.sql:25: NOTICE:  table "person_likes_post" does not exist, skipping
psql:lsqb/sql/drop.sql:26: NOTICE:  table "person_studyat_university" does not exist, skipping
psql:lsqb/sql/drop.sql:27: NOTICE:  table "person_workat_company" does not exist, skipping
psql:lsqb/sql/drop.sql:28: NOTICE:  table "person_knows_person" does not exist, skipping


DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
DROP TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
COPY 1575
COPY 6380
COPY 6
COPY 111
COPY 1343
COPY 109617
COPY 2580332
COPY 1229275
COPY 11000
COPY 16080
COPY 71
COPY 3148317
COPY 815205
COPY 3268415
COPY 354213
COPY 255596
COPY 1668015
COPY 853145
COPY 8880
COPY 23600
COPY 226293
COPY 226293
CREATE VIEW
CREATE VIEW
CREATE VIEW
CREATE VIEW
CREATE VIEW
CREATE VIEW


# SNAP

## Fetch the SNAP data

In [8]:
!chmod +x snap/remove-header.sh

In [9]:
%%bash

datasets=("http://snap.stanford.edu/data/cit-Patents.txt.gz"
          "http://snap.stanford.edu/data/wiki-topcats.txt.gz"
          "http://snap.stanford.edu/data/web-Google.txt.gz"
          "http://snap.stanford.edu/data/bigdata/communities/com-dblp.ungraph.txt.gz")

cd snap

for d in ${datasets[*]}; do
    if [ ! -f $(basename $d) ]; then
        curl -O $d;
        gunzip $(basename $d)
    else
        echo "$(basename $d) exists"
    fi
done

./remove-header.sh

cd ..

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 81.1M  100 81.1M    0     0  5532k      0  0:00:15  0:00:15 --:--:-- 13.3M
gzip: cit-Patents.txt already exists;	not overwritten
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 95.1M  100 95.1M    0     0  5812k      0  0:00:16  0:00:16 --:--:-- 13.3M
gzip: wiki-topcats.txt already exists;	not overwritten
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 20.1M  100 20.1M    0     0  2116k      0  0:00:09  0:00:09 --:--:-- 4087k
gzip: web-Google.txt already exists;	not overwritten
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload 

## Import the SNAP data

In [10]:
%%writefile import-snap.sql

DROP TABLE IF EXISTS patents;
CREATE TABLE patents (fromNode integer, toNode integer);
\copy patents FROM 'snap/noheader/cit-Patents.txt' with (header false);

DROP TABLE IF EXISTS wiki;
CREATE TABLE wiki (fromNode integer, toNode integer);
\copy wiki FROM 'snap/noheader/wiki-topcats.txt' with (header false, delimiter ' ');

DROP TABLE IF EXISTS google;
CREATE TABLE google (fromNode integer, toNode integer);
\copy google FROM 'snap/noheader/web-Google.txt' with (header false);

DROP TABLE IF EXISTS dblp;
CREATE TABLE dblp (fromNode integer, toNode integer);
\copy dblp FROM 'snap/noheader/com-dblp.ungraph.txt' with (header false);

Overwriting import-snap.sql


In [11]:
%%bash

PGPASSWORD=snap psql -h postgres -U snap -d snap -f import-snap.sql

psql:import-snap.sql:2: NOTICE:  table "patents" does not exist, skipping


DROP TABLE
CREATE TABLE
COPY 16518948


psql:import-snap.sql:6: NOTICE:  table "wiki" does not exist, skipping


DROP TABLE
CREATE TABLE
COPY 28511807


psql:import-snap.sql:10: NOTICE:  table "google" does not exist, skipping


DROP TABLE
CREATE TABLE
COPY 5105039


psql:import-snap.sql:14: NOTICE:  table "dblp" does not exist, skipping


DROP TABLE
CREATE TABLE
COPY 1049866


## Generate SNAP queries

In [12]:
import glob
from pathlib import Path

snap_tables = ['patents', 'wiki', 'google', 'dblp']

Path(f'snap-queries/all').mkdir(parents=True, exist_ok=True)
for tablename in snap_tables:
    Path(f'snap-queries/{tablename}').mkdir(parents=True, exist_ok=True)
    # We use the patent queries as the base and replace the references to the patent relation
    base_query_files = glob.glob('snap-queries/patents/*.sql')
    for file in base_query_files:
        query = Path(file).read_text()
        basename = Path(file).name
        new_query = query.replace('patents', tablename)
        with open(f'snap-queries/{tablename}/{basename}', 'w') as new_file:
            new_file.write(new_query)
        with open(f'snap-queries/all/{tablename}-{basename}', 'w') as new_file:
            new_file.write(new_query)

# JOB

## Import the JOB/IMDB data

In [13]:
%%bash
cd job
tar -xzf export.sql.tar.gz

In [14]:
%%bash
tail -n 50 job/export.sql


CREATE INDEX title_idx_kindid ON public.title USING btree (kind_id);


--
-- Name: title_idx_md5; Type: INDEX; Schema: public; Owner: imdb
--

CREATE INDEX title_idx_md5 ON public.title USING btree (md5sum);


--
-- Name: title_idx_pcode; Type: INDEX; Schema: public; Owner: imdb
--

CREATE INDEX title_idx_pcode ON public.title USING btree (phonetic_code);


--
-- Name: title_idx_season_nr; Type: INDEX; Schema: public; Owner: imdb
--

CREATE INDEX title_idx_season_nr ON public.title USING btree (season_nr);


--
-- Name: title_idx_title; Type: INDEX; Schema: public; Owner: imdb
--

CREATE INDEX title_idx_title ON public.title USING btree (title);


--
-- Name: title_idx_year; Type: INDEX; Schema: public; Owner: imdb
--

CREATE INDEX title_idx_year ON public.title USING btree (production_year);


--
-- Name: SCHEMA public; Type: ACL; Schema: -; Owner: pg_database_owner
--

GRANT ALL ON SCHEMA public TO imdb;


--
-- PostgreSQL database dump complete
--



In [15]:
%%bash

cd job
PGPASSWORD=imdb psql -h postgres -U imdb -d imdb -f export.sql

SET
SET
SET
SET
SET
 set_config 
------------
 
(1 row)

SET
SET
SET
SET
SET
SET
CREATE TABLE
ALTER TABLE
CREATE SEQUENCE
ALTER SEQUENCE
ALTER SEQUENCE
CREATE TABLE
ALTER TABLE
CREATE SEQUENCE
ALTER SEQUENCE
ALTER SEQUENCE
CREATE TABLE
ALTER TABLE
CREATE SEQUENCE
ALTER SEQUENCE
ALTER SEQUENCE
CREATE TABLE
ALTER TABLE
CREATE SEQUENCE
ALTER SEQUENCE
ALTER SEQUENCE
CREATE TABLE
ALTER TABLE
CREATE SEQUENCE
ALTER SEQUENCE
ALTER SEQUENCE
CREATE TABLE
ALTER TABLE
CREATE SEQUENCE
ALTER SEQUENCE
ALTER SEQUENCE
CREATE TABLE
ALTER TABLE
CREATE SEQUENCE
ALTER SEQUENCE
ALTER SEQUENCE
CREATE TABLE
ALTER TABLE
CREATE SEQUENCE
ALTER SEQUENCE
ALTER SEQUENCE
CREATE TABLE
ALTER TABLE
CREATE SEQUENCE
ALTER SEQUENCE
ALTER SEQUENCE
CREATE TABLE
ALTER TABLE
CREATE SEQUENCE
ALTER SEQUENCE
ALTER SEQUENCE
CREATE TABLE
ALTER TABLE
CREATE SEQUENCE
ALTER SEQUENCE
ALTER SEQUENCE
CREATE TABLE
ALTER TABLE
CREATE SEQUENCE
ALTER SEQUENCE
ALTER SEQUENCE
CREATE TABLE
ALTER TABLE
CREATE SEQUENCE
ALTER SEQUENCE
ALTER SEQUE



GRANT


In [16]:
%%bash
PGPASSWORD=imdb psql -h postgres -U imdb -d imdb
\dt

            List of relations
 Schema |      Name       | Type  | Owner 
--------+-----------------+-------+-------
 public | aka_name        | table | imdb
 public | aka_title       | table | imdb
 public | cast_info       | table | imdb
 public | char_name       | table | imdb
 public | comp_cast_type  | table | imdb
 public | company_name    | table | imdb
 public | company_type    | table | imdb
 public | complete_cast   | table | imdb
 public | info_type       | table | imdb
 public | keyword         | table | imdb
 public | kind_type       | table | imdb
 public | link_type       | table | imdb
 public | movie_companies | table | imdb
 public | movie_info      | table | imdb
 public | movie_keyword   | table | imdb
 public | movie_link      | table | imdb
 public | name            | table | imdb
 public | person_info     | table | imdb
 public | role_type       | table | imdb
 public | title           | table | imdb
(20 rows)



# STATS

## Import the STATS dataset

In [17]:
%%bash

cd stats
PGPASSWORD=stats psql -h postgres -U stats -d stats -f datasets/stats.sql
PGPASSWORD=stats psql -h postgres -U stats -d stats -f scripts/stats_load.sql

CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
COPY 79851
COPY 174305
COPY 40325
COPY 1032
COPY 91976
COPY 328064
COPY 303187
COPY 11102


## Generate the STATS queries

Some small errors in the names of the foreign keys are fixed (from https://arxiv.org/pdf/2109.05877.pdf)

In [18]:
import re

query_file = Path('stats/workload/stats_CEB.sql').read_text()
queries = []
for i, row in enumerate(query_file.split('\n')):
    count = int(row.split('||')[0])
    query = row.split('||')[1]

    p = re.compile("('\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}')::timestamp")
    query_replaced = p.sub(lambda match: f'CAST({match.group(1)} AS TIMESTAMP)', query)
    queries.append((i, count, query_replaced))

queries.sort(key=lambda t: t[1])

hints = [('FK(t.ExcerptPostId, p.Id)', ['t', 'p']),
        ('FK(pl.PostId, p.Id)', ['pl', 'p']),
        ('FK(pl.RelatedPostId, p.Id)', ['pl', 'p']),
        ('FK(c.PostId, p.Id)', ['c', 'p']),
        ('FK(ph.PostId, p.Id)', ['ph', 'p']),
        ('FK(v.PostId, p.Id)', ['v', 'p']),
        ('FK(p.OwnerUserId, u.Id)', ['p', 'u']),
        ('FK(c.UserId, u.Id)', ['c', 'u']),
        ('FK(ph.UserId, u.Id)', ['ph', 'u']),
        ('FK(b.UserId, u.Id)', ['b', 'u']),
        ('FK(v.UserId, u.Id)', ['v', 'u'])]

for i, (idx, count, query_replaced) in enumerate(queries):
    with open(f'stats-queries/{i+1:03}-{idx+1:03}.sql', 'w') as f:
        f.write(query_replaced)
    with open(f'stats-queries/hints/{i+1:03}-{idx+1:03}-hint.sql', 'w') as f:
        applicable_hints = list(map(lambda h: h[0],
                               list(filter(lambda h: re.search('as {rname}( |,)'.format(rname=h[1][0]), query_replaced) is not None
                                           and re.search('as {rname}( |,)'.format(rname=h[1][1]), query_replaced) is not None, hints))))
        if (len(applicable_hints) > 0):
            query_hints = query_replaced.replace('SELECT', f'SELECT /*+ {",".join(applicable_hints)} */ ')
            f.write(query_hints)
        else:
            f.write(query_replaced)

# HETIONET

In [19]:
%%bash
rm hetionet-v1.0.json.bz2
wget https://github.com/hetio/hetionet/raw/main/hetnet/json/hetionet-v1.0.json.bz2

--2024-04-24 08:56:17--  https://github.com/hetio/hetionet/raw/main/hetnet/json/hetionet-v1.0.json.bz2
Resolving github.com (github.com)... 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://media.githubusercontent.com/media/hetio/hetionet/main/hetnet/json/hetionet-v1.0.json.bz2 [following]
--2024-04-24 08:56:18--  https://media.githubusercontent.com/media/hetio/hetionet/main/hetnet/json/hetionet-v1.0.json.bz2
Resolving media.githubusercontent.com (media.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to media.githubusercontent.com (media.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 16112094 (15M) [application/octet-stream]
Saving to: ‘hetionet-v1.0.json.bz2’

     0K .......... .......... .......... .......... ..........  0% 6.77M 2s
    50K .......... .......... .......... ..

In [20]:
import json
import bz2

with bz2.open("hetionet-v1.0.json.bz2", 'r') as f:
    hetiojson = json.load(f)

list(hetiojson.keys())

nodes = hetiojson['nodes']
edges = hetiojson['edges']

node_rels = {}

for n in nodes:
    rel = n['kind'].replace(' ', '_')
    id = n['identifier']
    name = n['name']

    if rel not in node_rels:
        node_rels[rel] = []
    
    node_rels[rel].append((id, name))

node_rels.keys()

edge_rels = {}

for e in edges:
    erel = e['kind']
    source = e['source_id'][1]
    target = e['target_id'][1]
    direction = e['direction']

    if erel not in edge_rels:
        edge_rels[erel] = []

    if direction == 'both':
        edge_rels[erel].append((source, target))
        edge_rels[erel].append((target, source))
    elif direction == 'forward':
        edge_rels[erel].append((source, target))
    else:
        print(f'unknown direction {direction}, canceling!')
        break
        
for k in edge_rels.keys():
    print(k, len(edge_rels[k]))

upregulates 248670
expresses 1052814
interacts 294328
participates 1629328
downregulates 261930
causes 277888
binds 23142
regulates 265672
associates 25246
covaries 123380
localizes 7204
resembles 14058
treats 1510
includes 2058
presents 6714
palliates 780


In [21]:
%%bash
pip3 install psycopg2-binary





In [22]:
import psycopg2 as pg
from psycopg2 import sql

conn = pg.connect("host=postgres dbname=hetio user=hetio password=hetio")
cur = conn.cursor()

In [23]:
try:
    for k in node_rels.keys():
        s = sql.SQL('CREATE TABLE IF NOT EXISTS {} (nid varchar(64) primary key, name text)').format(sql.Identifier(k.lower()))
        cur.execute(s)
        for v in node_rels[k]:
            cur.execute(sql.SQL('INSERT INTO {} VALUES (%s,%s);').format(sql.Identifier(k.lower())), v)
    conn.commit()
except Exception as e:
    print(e)
    print('Failed')
    conn.rollback()

In [24]:
try:
    for k in edge_rels.keys():
        print(k)
        s = sql.SQL('CREATE TABLE IF NOT EXISTS {} (sid varchar(64), tid varchar(64))').format(sql.Identifier(k))
        print(s)
        cur.execute(s)
        for e in edge_rels[k]:
            cur.execute(sql.SQL('INSERT INTO {} VALUES (%s,%s);').format(sql.Identifier(k)), e)
        conn.commit()
except Exception as e:
    print(e)
    print('Failed')
    conn.rollback()

upregulates
Composed([SQL('CREATE TABLE IF NOT EXISTS '), Identifier('upregulates'), SQL(' (sid varchar(64), tid varchar(64))')])
expresses
Composed([SQL('CREATE TABLE IF NOT EXISTS '), Identifier('expresses'), SQL(' (sid varchar(64), tid varchar(64))')])
interacts
Composed([SQL('CREATE TABLE IF NOT EXISTS '), Identifier('interacts'), SQL(' (sid varchar(64), tid varchar(64))')])
participates
Composed([SQL('CREATE TABLE IF NOT EXISTS '), Identifier('participates'), SQL(' (sid varchar(64), tid varchar(64))')])
downregulates
Composed([SQL('CREATE TABLE IF NOT EXISTS '), Identifier('downregulates'), SQL(' (sid varchar(64), tid varchar(64))')])
causes
Composed([SQL('CREATE TABLE IF NOT EXISTS '), Identifier('causes'), SQL(' (sid varchar(64), tid varchar(64))')])
binds
Composed([SQL('CREATE TABLE IF NOT EXISTS '), Identifier('binds'), SQL(' (sid varchar(64), tid varchar(64))')])
regulates
Composed([SQL('CREATE TABLE IF NOT EXISTS '), Identifier('regulates'), SQL(' (sid varchar(64), tid varch

# DuckDB

In [3]:
%%bash
pip install duckdb

Collecting duckdb
  Downloading duckdb-0.10.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 18.2/18.2 MB 93.4 MB/s eta 0:00:00
Installing collected packages: duckdb
Successfully installed duckdb-0.10.2




## STATS

In [103]:
import duckdb
import os

con = duckdb.connect(database="stats/stats.duckdb")
con.execute("INSTALL postgres")
con.execute("LOAD postgres")
con.execute("ATTACH 'host=172.20.0.2 port=5432 user=postgres password=postgres dbname=stats' AS stats_DDB (TYPE postgres)")
con.execute("USE stats_DDB")
con.close()

In [78]:
import duckdb

conn = duckdb.connect(database='stats/stats.duckdb')

cursor = conn.cursor()

# Example query
cursor.execute("USE stats_DDB")
cursor.execute("""SELECT *
FROM badges LIMIT 5

""")

# Fetch results
results = cursor.fetchall()
print(results)



[(1, 5, datetime.datetime(2010, 7, 19, 19, 39, 7)), (2, 6, datetime.datetime(2010, 7, 19, 19, 39, 7)), (3, 8, datetime.datetime(2010, 7, 19, 19, 39, 7)), (4, 23, datetime.datetime(2010, 7, 19, 19, 39, 7)), (5, 36, datetime.datetime(2010, 7, 19, 19, 39, 7))]


## SNAP

In [83]:
import duckdb
import os

con = duckdb.connect(database="snap/snap.duckdb")
con.execute("INSTALL postgres")
con.execute("LOAD postgres")
con.execute("ATTACH 'host=172.20.0.2 port=5432 user=postgres password=postgres dbname=snap' AS snap_DDB (TYPE postgres)")
con.execute("USE snap_DDB")
con.close()

In [84]:
import duckdb

conn = duckdb.connect(database='snap/snap.duckdb')

cursor = conn.cursor()

# Example query
cursor.execute("USE snap_DDB")
cursor.execute("""SELECT *
FROM dblp LIMIT 5

""")

# Fetch results
results = cursor.fetchall()
print(results)

conn.close()


[(0, 1), (0, 2), (0, 4519), (0, 23073), (0, 33043)]


## JOB

In [90]:
import duckdb
import os

con = duckdb.connect(database="job/job.duckdb")
con.execute("INSTALL postgres")
con.execute("LOAD postgres")
con.execute("ATTACH 'host=172.20.0.2 port=5432 user=postgres password=postgres dbname=imdb' AS job_DDB (TYPE postgres)")
con.execute("USE job_DDB")
con.close()

In [92]:
import duckdb

conn = duckdb.connect(database='job/job.duckdb')

cursor = conn.cursor()

# Example query
cursor.execute("USE job_DDB")
cursor.execute("""SELECT *
FROM keyword LIMIT 5

""")

# Fetch results
results = cursor.fetchall()
print(results)



[(1, 'number-in-title', 'N5165'), (2, 'web-series', 'W1262'), (3, 'friend', 'F653'), (4, 'heroin', 'H65'), (5, 'vlog', 'V42')]


## LSQB

In [97]:
import duckdb
import os

con = duckdb.connect(database="lsqb/lsqb.duckdb")
con.execute("INSTALL postgres")
con.execute("LOAD postgres")
con.execute("ATTACH 'host=172.20.0.2 port=5432 user=postgres password=postgres dbname=lsqb' AS lsqb_DDB (TYPE postgres)")
con.execute("USE lsqb_DDB")
con.close()

In [98]:
import duckdb

conn = duckdb.connect(database='lsqb/lsqb.duckdb')

cursor = conn.cursor()

# Example query
cursor.execute("USE lsqb_DDB")
cursor.execute("""SELECT *
FROM Comment LIMIT 5

""")

# Fetch results
results = cursor.fetchall()
print(results)


[(1649267441665, 19791209306158, 55, 1236950581248, None), (1924145348610, 10349, 51, 1236950581248, None), (1649267441668, 19791209306158, 55, 3, None), (1649267441669, 19791209306158, 55, 3, None), (1786706395489, 21990232566466, 58, 962072674656, None)]


## HETIO

In [101]:
import duckdb
import os

con = duckdb.connect(database="hetio/hetio.duckdb")
con.execute("INSTALL postgres")
con.execute("LOAD postgres")
con.execute("ATTACH 'host=172.20.0.2 port=5432 user=postgres password=postgres dbname=hetio' AS hetio_DDB (TYPE postgres)")
con.execute("USE hetio_DDB")
con.close()

In [102]:
import duckdb

conn = duckdb.connect(database='hetio/hetio.duckdb')

cursor = conn.cursor()

# Example query
cursor.execute("USE hetio_DDB")
cursor.execute("""SELECT *
FROM gene LIMIT 5

""")

# Fetch results
results = cursor.fetchall()
print(results)



[('5345', 'SERPINF2'), ('9409', 'PEX16'), ('10848', 'PPP1R13L'), ('121129', 'OR2AP1'), ('432', 'ASGR1')]
