# Demonstrating the true strength of FireDucks in query planning and optimization

[TPC-H](https://www.tpc.org/tpch/) is a decision-support benchmark that consists of a suite of business-oriented ad-hoc queries and concurrent data modifications. In this demonstration we will use [Query-3](https://www.tpc.org/TPC_Documents_Current_Versions/pdf/TPC-H_v3.0.1.pdf#page=33), which works on three large tables, namely `lineitem`, `customer`, and `orders`, and involves complex join, groupby, and sort operations.

##  Preparing Data (Scale Factor: 10) 

In [1]:
!pip install -q -U fireducks pydantic pydantic_settings linetimer

In [2]:
!git clone https://github.com/fireducks-dev/polars-tpch.git

fatal: destination path 'polars-tpch' already exists and is not an empty directory.


In [None]:
!mkdir -p polars-tpch/data/tables/scale-10.0
!cd polars-tpch/tpch-dbgen; make; ./dbgen -s 10; mv *.tbl ../data/tables/scale-10.0

make: Nothing to be done for 'all'.
TPC-H Population Generator (Version 2.17.2)
Copyright Transaction Processing Performance Council 1994 - 2010


In [None]:
!ls polars-tpch/data/tables/*

In [None]:
# generating parquet files with header from the above .tbl files
!cd polars-tpch; PATH_TABLES=data/tables SCALE_FACTOR=10.0 python -m scripts.prepare_data_pyarrow

In [None]:
!ls polars-tpch/data/tables/*

## Checking the Evaluation Environment

In [None]:
import platform
import psutil

# Print a banner, then one "<label>: <value>" line per host property.
banner_rule = "=" * 30
print(banner_rule, "Evaluation Environment Information", banner_rule)

# Each probe is called lazily, right before its own line is printed,
# so the query/print order is platform, architecture, processor, cpu.
for label, probe in (
    ("platform", platform.system),
    ("architecture", platform.machine),
    ("processor", platform.processor),
    ("cpu", psutil.cpu_count),
):
    print(f"{label}: {probe()}")

In [None]:
!nvidia-smi

##  Defining Query

In [None]:
import os
import datetime
from linetimer import CodeTimer

# REF: https://www.tpc.org/TPC_Documents_Current_Versions/pdf/TPC-H_v3.0.1.pdf#page=33
def q3(datapath):
    """Run the straightforward (unoptimized) form of TPC-H Query 3.

    The full ``customer``, ``orders`` and ``lineitem`` tables are read and
    joined first; the market-segment and date filters are applied only
    afterwards, on the joined frame.  Revenue is then summed per
    (l_orderkey, o_orderdate, o_shippriority), the rows are sorted by
    revenue (descending) and o_orderdate (ascending), and the first ten
    rows are written to ``<backend>_q3_result.parquet`` inside ``datapath``.

    The whole pipeline, including file reads and the final write, runs
    inside a ``CodeTimer`` block.

    Args:
        datapath: Directory containing ``customer.parquet``,
            ``orders.parquet`` and ``lineitem.parquet``; the result file is
            written to the same directory.

    Returns:
        None.

    Note:
        Uses the module-level name ``pd``; whichever pandas-compatible
        library is bound to it when the function is called is the one
        that gets exercised.
    """
    # Backend label: the loader's ``fast_lib`` attribute when it has one,
    # otherwise the module name; only the first dotted component is kept.
    loader = pd.__spec__.loader
    backend = getattr(loader, "fast_lib", pd.__name__).split(".")[0]

    # To avoid some incompatibility issues in cuDF pandas, the cutoff is
    # built as a datetime.datetime there and as a datetime.date elsewhere.
    if backend == "cudf":
        make_date = datetime.datetime
    else:
        make_date = datetime.date

    group_keys = ["l_orderkey", "o_orderdate", "o_shippriority"]
    output_cols = ["l_orderkey", "revenue", "o_orderdate", "o_shippriority"]

    def building_segment_only(frame):
        return frame[frame["c_mktsegment"] == "BUILDING"]

    def ordered_before_cutoff(frame):
        return frame[frame["o_orderdate"] < make_date(1995, 3, 15)]

    def shipped_after_cutoff(frame):
        return frame[frame["l_shipdate"] > make_date(1995, 3, 15)]

    def discounted_price(frame):
        return frame["l_extendedprice"] * (1 - frame["l_discount"])

    with CodeTimer(name=f"Overall execution of q3 using {backend}", unit="s"):
        (
            pd.read_parquet(os.path.join(datapath, "customer.parquet"))
            .merge(
                pd.read_parquet(os.path.join(datapath, "orders.parquet")),
                left_on="c_custkey",
                right_on="o_custkey",
            )
            .merge(
                pd.read_parquet(os.path.join(datapath, "lineitem.parquet")),
                left_on="o_orderkey",
                right_on="l_orderkey",
            )
            # Filters run on the already-joined frame (the naive plan).
            .pipe(building_segment_only)
            .pipe(ordered_before_cutoff)
            .pipe(shipped_after_cutoff)
            .assign(revenue=discounted_price)
            .groupby(group_keys, as_index=False)
            .agg({"revenue": "sum"})[output_cols]
            .sort_values(["revenue", "o_orderdate"], ascending=[False, True])
            .reset_index(drop=True)
            .head(10)
            .to_parquet(os.path.join(datapath, f"{backend}_q3_result.parquet"))
        )

In [None]:
import os
import datetime
from linetimer import CodeTimer

def optimized_q3(datapath):
    """Run a hand-optimized form of TPC-H Query 3.

    Compared with the straightforward form, two optimizations are applied
    manually before any join happens:

    * only the columns the query needs are read from each parquet file;
    * each table is filtered on its own (market segment, order date,
      ship date) so the joins operate on the reduced frames.

    Revenue is then summed per (l_orderkey, o_orderdate, o_shippriority),
    the rows are sorted by revenue (descending) and o_orderdate
    (ascending), and the first ten rows are written to
    ``<backend>_opt_q3_result.parquet`` inside ``datapath``.

    The whole pipeline, including file reads and the final write, runs
    inside a ``CodeTimer`` block.

    Args:
        datapath: Directory containing ``customer.parquet``,
            ``orders.parquet`` and ``lineitem.parquet``; the result file is
            written to the same directory.

    Returns:
        None.

    Note:
        Uses the module-level name ``pd``; whichever pandas-compatible
        library is bound to it when the function is called is the one
        that gets exercised.
    """
    # Backend label: the loader's ``fast_lib`` attribute when it has one,
    # otherwise the module name; only the first dotted component is kept.
    loader = pd.__spec__.loader
    backend = getattr(loader, "fast_lib", pd.__name__).split(".")[0]

    # To avoid some incompatibility issues in cuDF pandas, the cutoff is
    # built as a datetime.datetime there and as a datetime.date elsewhere.
    if backend == "cudf":
        make_date = datetime.datetime
    else:
        make_date = datetime.date

    def load(table_file, wanted_columns):
        return pd.read_parquet(os.path.join(datapath, table_file), columns=wanted_columns)

    with CodeTimer(name=f"Overall execution of optimized_q3 using {backend}", unit="s"):
        # Column projection: load only what the query touches.
        customer_cols = ["c_custkey", "c_mktsegment"]  # 2 of 8 columns
        lineitem_cols = ["l_orderkey", "l_shipdate", "l_extendedprice", "l_discount"]  # 4 of 16 columns
        orders_cols = ["o_custkey", "o_orderkey", "o_orderdate", "o_shippriority"]  # 4 of 9 columns
        customer_df = load("customer.parquet", customer_cols)
        lineitem_df = load("lineitem.parquet", lineitem_cols)
        orders_df = load("orders.parquet", orders_cols)

        # Early filters: shrink each table before it takes part in a join.
        building_customers = customer_df[customer_df["c_mktsegment"] == "BUILDING"]
        early_orders = orders_df[orders_df["o_orderdate"] < make_date(1995, 3, 15)]
        late_lineitems = lineitem_df[lineitem_df["l_shipdate"] > make_date(1995, 3, 15)]

        group_keys = ["l_orderkey", "o_orderdate", "o_shippriority"]
        output_cols = ["l_orderkey", "revenue", "o_orderdate", "o_shippriority"]

        def discounted_price(frame):
            return frame["l_extendedprice"] * (1 - frame["l_discount"])

        (
            building_customers
            .merge(early_orders, left_on="c_custkey", right_on="o_custkey")
            .merge(late_lineitems, left_on="o_orderkey", right_on="l_orderkey")
            .assign(revenue=discounted_price)
            .groupby(group_keys, as_index=False)
            .agg({"revenue": "sum"})[output_cols]
            .sort_values(["revenue", "o_orderdate"], ascending=[False, True])
            .reset_index(drop=True)
            .head(10)
            .to_parquet(os.path.join(datapath, f"{backend}_opt_q3_result.parquet"))
        )

In [None]:
# Directory that q3()/optimized_q3() read customer/orders/lineitem .parquet
# files from, and into which they write their *_result.parquet outputs.
datapath = "polars-tpch/data/tables/scale-10.0"

##  Native-pandas

In [None]:
import pandas as pd # native pandas

In [None]:
q3(datapath)

In [None]:
optimized_q3(datapath)

##  FireDucks-pandas

In [None]:
%load_ext fireducks.pandas  
import pandas as pd  # fireducks.pandas

In [None]:
q3(datapath)

In [None]:
optimized_q3(datapath)

In [None]:
%unload_ext fireducks.pandas

##  cuDF-pandas

In [None]:
%load_ext cudf.pandas
import pandas as pd # cudf pandas

In [None]:
q3(datapath)

In [None]:
optimized_q3(datapath)

##  Result-verification

In [None]:
# Load the result files saved by q3() and optimized_q3() under each backend
# (pandas, fireducks, cudf) so they can be displayed and compared below.
result_files = (
    "pandas_q3_result.parquet",
    "pandas_opt_q3_result.parquet",
    "fireducks_q3_result.parquet",
    "fireducks_opt_q3_result.parquet",
    "cudf_q3_result.parquet",
    "cudf_opt_q3_result.parquet",
)
r1, r2, r3, r4, r5, r6 = [
    pd.read_parquet(os.path.join(datapath, file_name)) for file_name in result_files
]

In [None]:
r1

In [None]:
r2

In [None]:
r3

In [None]:
r4

In [None]:
r5

In [None]:
r6

##  Cleanup

In [None]:
# !rm -rf polars-tpch/data/tables