# Load TPC-H dataset into DuckDB

The goal of this notebook is to load mockup data from the TPC-H dataset

## Setup environment

In [1]:
from IPython.display import HTML, display

import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [2]:
import duckdb

from libs.helpers.utils_db import query_duckdb as query_duckdb_direct

def query_duckdb(sql: str) -> pd.DataFrame:
    return query_duckdb_direct(sql, storage_path="/files/local1.db")

In [3]:
# Useful to kill a duckdb process in case of lock
# import os
# import signal
# os.kill(12032, signal.SIGTERM)

## Use-case with TPC-H

### Ingestion of data (equivalent of the Bronze layer)

In [4]:
sql = """
    SELECT * FROM duckdb_databases()
"""
query_duckdb(sql)

Unnamed: 0,database_name,database_oid,path,comment,tags,internal,type,readonly,encrypted,cipher
0,local1,649,/files/local1.db,,{'storage_version': 'v1.0.0+'},False,duckdb,False,False,
1,system,0,,,{},True,duckdb,False,False,
2,temp,2050,,,{},True,duckdb,False,False,


In [5]:
sql = "CREATE SCHEMA IF NOT EXISTS tpch"
query_duckdb(sql)

In [6]:
sql = "SELECT * FROM information_schema.schemata"
query_duckdb(sql)

Unnamed: 0,catalog_name,schema_name,schema_owner,default_character_set_catalog,default_character_set_schema,default_character_set_name,sql_path
0,local1,main,duckdb,,,,
1,local1,tpch,duckdb,,,,
2,system,information_schema,duckdb,,,,
3,system,main,duckdb,,,,
4,system,pg_catalog,duckdb,,,,
5,temp,main,duckdb,,,,


In [7]:
sql = """
    USE tpch;
    DROP TABLE IF EXISTS customer;
    DROP TABLE IF EXISTS lineitem;
    DROP TABLE IF EXISTS nation;
    DROP TABLE IF EXISTS orders;
    DROP TABLE IF EXISTS part;
    DROP TABLE IF EXISTS partsupp;
    DROP TABLE IF EXISTS region;
    DROP TABLE IF EXISTS supplier;
"""
query_duckdb(sql)

In [8]:
sql = """
INSTALL tpch;
LOAD tpch;
CALL dbgen(sf=0.1, schema="tpch");
"""
query_duckdb(sql)

Unnamed: 0,Success


In [9]:
sql = """
    SET search_path = 'tpch';

    PRAGMA tpch(7);

    -- FROM tpch_queries();
    -- FROM tpch_answers();
"""
query_duckdb(sql)

Unnamed: 0,supp_nation,cust_nation,l_year,revenue
0,FRANCE,GERMANY,1995,4637235.0
1,FRANCE,GERMANY,1996,5224780.0
2,GERMANY,FRANCE,1995,6232819.0
3,GERMANY,FRANCE,1996,5557312.0


### Preparation of data (equivalent of the Silver layer)

Following the Entity-Centered Modelling principles with OBT (One Big Table)

In [10]:
sql = """
    DROP TABLE IF EXISTS tpch.customer_history;
    
    CREATE TABLE IF NOT EXISTS tpch.customer_history AS
    SELECT 
        o_custkey as h_custkey,
        count(*) as h_numorders,
        ARRAY_AGG(ROW(o_orderkey, o_orderdate, o_totalprice))::ROW(o_orderkey BIGINT, o_orderdate DATE, o_totalprice DOUBLE)[] as h_listorders,
        MIN(o_orderdate) as h_datefirstorder,
        MAX(o_orderdate) as h_datelastorder,
        SUM(o_totalprice) as h_totalprice
    FROM tpch.orders 
    GROUP BY o_custkey 
    ORDER BY o_custkey;
    
    SELECT COUNT(1) FROM tpch.customer_history;
"""
query_duckdb(sql)

Unnamed: 0,count(1)
0,10000


### Visualisation of data

In [11]:
query_duckdb("DROP VIEW IF EXISTS main.revenue_by_cohort_data")

In [12]:
query_duckdb("SELECT table_schema, table_name FROM information_schema.tables WHERE table_schema = 'tpch'")

Unnamed: 0,table_schema,table_name
0,tpch,customer
1,tpch,customer_history
2,tpch,lineitem
3,tpch,nation
4,tpch,orders
5,tpch,part
6,tpch,partsupp
7,tpch,region
8,tpch,supplier


In [13]:
query_duckdb("SELECT * FROM tpch.customer LIMIT 10")

Unnamed: 0,c_custkey,c_name,c_address,c_nationkey,c_phone,c_acctbal,c_mktsegment,c_comment
0,1,Customer#000000001,j5JsirBM9PsCy0O1m,15,25-989-741-2988,711.56,BUILDING,y final requests wake slyly quickly special accounts. blithely
1,2,Customer#000000002,487LW1dovn6Q4dMVymKwwLE9OKf3QG,13,23-768-687-3665,121.65,AUTOMOBILE,y carefully regular foxes. slyly regular requests about the bli
2,3,Customer#000000003,fkRGN8nY4pkE,1,11-719-748-3364,7498.12,AUTOMOBILE,fully. carefully silent instructions sleep alongside of the slyly regular asymptotes. quickly regular
3,4,Customer#000000004,4u58h fqkyE,4,14-128-190-5944,2866.83,MACHINERY,sublate. fluffily even instructions are about th
4,5,Customer#000000005,hwBtxkoBF qSW4KrIk5U 2B1AU7H,3,13-750-942-6364,794.47,HOUSEHOLD,equests haggle furiously against the pending packa
5,6,Customer#000000006,"g1s,pzDenUEBW3O,2 pxu0f9n2g64rJrt5E",20,30-114-968-4951,7638.57,AUTOMOBILE,quickly silent asymptotes are slyly regular excuses. instructions wake furiously? quickly bold courts p
6,7,Customer#000000007,8OkMVLQ1dK6Mbu6WG9 w4pLGQ n7MQ,18,28-190-982-9759,9561.95,AUTOMOBILE,"ounts. ironic, regular accounts sleep. final requests haggle quickly after the"
7,8,Customer#000000008,"j,pZ,Qp,qtFEo0r0c 92qobZtlhSuOqbE4JGV",17,27-147-574-9335,6819.74,BUILDING,riously final excuses sublate quickly among the fluffily even foxes. quickly final packages haggle furiously furi
8,9,Customer#000000009,vgIql8H6zoyuLMFNdAMLyE7 H9,8,18-338-906-3675,8324.07,FURNITURE,ss pinto beans believe slyly quiet deposits-- doggedly bold packages boost. quickly ironic de
9,10,Customer#000000010,"Vf mQ6Ug9Ucf5OKGYq fsaX AtfsO7,rwY",5,15-741-346-9870,2753.54,HOUSEHOLD,g quickly after the evenly bold


In [14]:
query_duckdb("SELECT * FROM tpch.orders LIMIT 10")

Unnamed: 0,o_orderkey,o_custkey,o_orderstatus,o_totalprice,o_orderdate,o_orderpriority,o_clerk,o_shippriority,o_comment
0,1,3691,O,194029.55,1996-01-02,5-LOW,Clerk#000000951,0,ly express platelets. deposits acc
1,2,7801,O,60951.63,1996-12-01,1-URGENT,Clerk#000000880,0,ve the furiously fluffy dependencies. carefully regular
2,3,12332,F,247296.05,1993-10-14,5-LOW,Clerk#000000955,0,after the asymptotes. instructions cajole after the foxes. carefully unu
3,4,13678,O,53829.87,1995-10-11,5-LOW,Clerk#000000124,0,st the furiously bold pinto beans. furiously pending theodolites cajol
4,5,4450,F,139660.54,1994-07-30,5-LOW,Clerk#000000925,0,onic requests. carefully daring foxes among the carefu
5,6,5563,F,65843.52,1992-02-21,4-NOT SPECIFIED,Clerk#000000058,0,furiously ironic accounts haggle blithely carefully regular de
6,7,3914,O,231037.28,1996-01-10,2-HIGH,Clerk#000000470,0,", ironic packages wa"
7,32,13006,O,166802.63,1995-07-16,2-HIGH,Clerk#000000616,0,"ly about the carefully express theodolites. ironic, iron"
8,33,6697,F,118518.56,1993-10-27,3-MEDIUM,Clerk#000000409,0,"careful, regular courts. unusual"
9,34,6101,O,75662.77,1998-07-21,3-MEDIUM,Clerk#000000223,0,osits according to the ideas are furiously final requests? slyly pe


In [15]:
query_duckdb("SELECT * FROM tpch.customer_history ORDER BY h_custkey LIMIT 3")

Unnamed: 0,h_custkey,h_numorders,h_listorders,h_datefirstorder,h_datelastorder,h_totalprice
0,1,9,"[{'o_orderkey': 579908, 'o_orderdate': 1996-12-09, 'o_totalprice': 45744.09}, {'o_orderkey': 36422, 'o_orderdate': 1997-03-04, 'o_totalprice': 268835.44}, {'o_orderkey': 135943, 'o_orderdate': 1993-06-22, 'o_totalprice': 263247.54}, {'o_orderkey': 164711, 'o_orderdate': 1992-04-26, 'o_totalprice': 283261.47}, {'o_orderkey': 224167, 'o_orderdate': 1996-05-08, 'o_totalprice': 81485.84}, {'o_orderkey': 287619, 'o_orderdate': 1996-12-26, 'o_totalprice': 11925.85}, {'o_orderkey': 385825, 'o_orderdate': 1995-11-01, 'o_totalprice': 235155.22}, {'o_orderkey': 430243, 'o_orderdate': 1994-12-24, 'o_totalprice': 35523.05}, {'o_orderkey': 454791, 'o_orderdate': 1992-04-19, 'o_totalprice': 83779.26}]",1992-04-19,1997-03-04,1308957.76
1,2,11,"[{'o_orderkey': 491620, 'o_orderdate': 1998-05-22, 'o_totalprice': 122500.55}, {'o_orderkey': 9154, 'o_orderdate': 1997-06-23, 'o_totalprice': 299326.4}, {'o_orderkey': 52263, 'o_orderdate': 1994-05-08, 'o_totalprice': 36433.77}, {'o_orderkey': 90019, 'o_orderdate': 1993-10-28, 'o_totalprice': 96852.91}, {'o_orderkey': 100064, 'o_orderdate': 1996-04-10, 'o_totalprice': 51599.57}, {'o_orderkey': 120160, 'o_orderdate': 1995-04-09, 'o_totalprice': 209272.43}, {'o_orderkey': 212870, 'o_orderdate': 1996-10-30, 'o_totalprice': 168931.8}, {'o_orderkey': 269922, 'o_orderdate': 1996-03-19, 'o_totalprice': 108967.23}, {'o_orderkey': 306439, 'o_orderdate': 1997-05-17, 'o_totalprice': 222236.47}, {'o_orderkey': 360067, 'o_orderdate': 1992-12-07, 'o_totalprice': 195693.26}, {'o_orderkey': 374723, 'o_orderdate': 1996-11-20, 'o_totalprice': 233181.71}]",1992-12-07,1998-05-22,1744996.1
2,4,20,"[{'o_orderkey': 512195, 'o_orderdate': 1996-08-13, 'o_totalprice': 44791.42}, {'o_orderkey': 529350, 'o_orderdate': 1996-03-03, 'o_totalprice': 33563.9}, {'o_orderkey': 545218, 'o_orderdate': 1992-07-16, 'o_totalprice': 206615.3}, {'o_orderkey': 554115, 'o_orderdate': 1992-10-11, 'o_totalprice': 222977.75}, {'o_orderkey': 576263, 'o_orderdate': 1994-03-02, 'o_totalprice': 274992.65}, {'o_orderkey': 24322, 'o_orderdate': 1997-01-29, 'o_totalprice': 268534.86}, {'o_orderkey': 43879, 'o_orderdate': 1993-08-13, 'o_totalprice': 80130.69}, {'o_orderkey': 53283, 'o_orderdate': 1995-10-29, 'o_totalprice': 162955.31}, {'o_orderkey': 70819, 'o_orderdate': 1996-11-20, 'o_totalprice': 240814.11}, {'o_orderkey': 83684, 'o_orderdate': 1998-03-19, 'o_totalprice': 71483.64}, {'o_orderkey': 160516, 'o_orderdate': 1995-09-18, 'o_totalprice': 181789.2}, {'o_orderkey': 193030, 'o_orderdate': 1992-06-09, 'o_totalprice': 261208.46}, {'o_orderkey': 226818, 'o_orderdate': 1995-05-13, 'o_totalprice': 107127.51}, {'o_orderkey': 235779, 'o_orderdate': 1994-04-29, 'o_totalprice': 199636.4}, {'o_orderkey': 301350, 'o_orderdate': 1996-08-25, 'o_totalprice': 281282.72}, {'o_orderkey': 330404, 'o_orderdate': 1996-09-22, 'o_totalprice': 280809.61}, {'o_orderkey': 345858, 'o_orderdate': 1998-06-15, 'o_totalprice': 21012.53}, {'o_orderkey': 346693, 'o_orderdate': 1993-11-13, 'o_totalprice': 66417.98}, {'o_orderkey': 358886, 'o_orderdate': 1995-08-28, 'o_totalprice': 282207.37}, {'o_orderkey': 446499, 'o_orderdate': 1997-03-09, 'o_totalprice': 15671.83}]",1992-06-09,1998-06-15,3304023.24


In [16]:
# Date of today
query_duckdb("SELECT MAX(o_orderdate) as h_datelastorder FROM tpch.orders")

Unnamed: 0,h_datelastorder
0,1998-08-02
