In [1]:
import pandas as pd
import numpy as np
from datetime import date
from numpy import char
import time
pd.set_option('display.float_format', str)

## Data

In [2]:
l_columnnames = ["L_ORDERKEY", "L_PARTKEY", "L_SUPPKEY", "L_LINENUMBER", "L_QUANTITY", "L_EXTENDEDPRICE", "L_DISCOUNT", "L_TAX", "L_RETURNFLAG", "L_LINESTATUS", "L_SHIPDATE", "L_COMMITDATE", "L_RECEIPTDATE", "L_SHIPINSTRUCT","L_SHIPMODE", "L_COMMENT"]

for i in range(len(l_columnnames)):
    l_columnnames[i] = l_columnnames[i].lower()
    
l_data_types = {
    'l_orderkey': int,
    'l_partkey': int,
    'l_suppkey': int,
    'l_linenumber': int,
    'l_quantity': float,
    'l_extendedprice': float,
    'l_discount': float,
    'l_tax': float,
    'l_returnflag': str,
    'l_linestatus': str,
    'l_shipinstruct': str,
    'l_shipmode': str,
    'l_comment': str
}

l_parse_dates = ['l_shipdate', 'l_commitdate', 'l_receiptdate']

In [3]:
p_columnnames = ["P_PARTKEY", "P_NAME", "P_MFGR", "P_BRAND", "P_TYPE", "P_SIZE", "P_CONTAINER", "P_RETAILPRICE", "P_COMMENT"]

for i in range(len(p_columnnames)):
    p_columnnames[i] = p_columnnames[i].lower()
    
p_data_types = {
    'p_partkey': int, 
    'p_name': str,
    'p_mfgr': str,
    'p_brand': str,
    'p_type': str,
    'p_size': int,
    'p_container': str,
    'p_retailprice': float,
    'p_comment': str
}

p_parse_dates = []

### Lineitem DataFrame

In [4]:
# Don't set indexes, as we can't access them with Pandas selection!
lineitem = pd.read_table("../../tpch-pgsql-master/data/load/lineitem.tbl.csv", sep="|", names=l_columnnames, dtype=l_data_types, parse_dates=l_parse_dates)

### Part DataFrame

In [5]:
# Don't set indexes, as we can't access them with Pandas selection!
part = pd.read_table("../../tpch-pgsql-master/data/load/part.tbl.csv", sep="|", names=p_columnnames, dtype=p_data_types, parse_dates=p_parse_dates)

## Q19

In [14]:
start_time = time.time()

df_filter_1 = part[(part.p_size >= 1) & (((part.p_brand == 'Brand#12') & (part.p_container.isin(["SM CASE","SM BOX","SM PACK","SM PKG"])) & (part.p_size <= 5)) | ((part.p_brand == 'Brand#23') & (part.p_container.isin(["MED BAG","MED BOX","MED PKG","MED PACK"])) & (part.p_size <= 10)) | ((part.p_brand == 'Brand#34') & (part.p_container.isin(["LG CASE","LG BOX","LG PACK","LG PKG"])) & (part.p_size <= 15)))]
df_filter_1 = df_filter_1[['p_partkey', 'p_name', 'p_mfgr', 'p_brand', 'p_type', 'p_size', 'p_container', 'p_retailprice', 'p_comment']]
df_filter_2 = lineitem[(lineitem.l_shipmode.isin(["AIR","AIR REG"])) & (lineitem.l_shipinstruct == 'DELIVER IN PERSON') & (((lineitem.l_quantity >= 1) & (lineitem.l_quantity <= 11)) | ((lineitem.l_quantity >= 10) & (lineitem.l_quantity <= 20)) | ((lineitem.l_quantity >= 20) & (lineitem.l_quantity <= 30)))]
df_filter_2 = df_filter_2[['l_orderkey', 'l_partkey', 'l_suppkey', 'l_linenumber', 'l_quantity', 'l_extendedprice', 'l_discount', 'l_tax', 'l_returnflag', 'l_linestatus', 'l_shipdate', 'l_commitdate', 'l_receiptdate', 'l_shipinstruct', 'l_shipmode', 'l_comment']]
df_merge_1 = df_filter_1.merge(df_filter_2, left_on="p_partkey", right_on="l_partkey")
df_merge_1 = df_merge_1[((df_merge_1.p_brand == 'Brand#12') & (df_merge_1.p_container.isin(["SM CASE","SM BOX","SM PACK","SM PKG"])) & (df_merge_1.l_quantity >= 1) & (df_merge_1.l_quantity <= 11) & (df_merge_1.p_size <= 5)) | ((df_merge_1.p_brand == 'Brand#23') & (df_merge_1.p_container.isin(["MED BAG","MED BOX","MED PKG","MED PACK"])) & (df_merge_1.l_quantity >= 10) & (df_merge_1.l_quantity <= 20) & (df_merge_1.p_size <= 10)) | ((df_merge_1.p_brand == 'Brand#34') & (df_merge_1.p_container.isin(["LG CASE","LG BOX","LG PACK","LG PKG"])) & (df_merge_1.l_quantity >= 20) & (df_merge_1.l_quantity <= 30) & (df_merge_1.p_size <= 15))]
df_merge_1 = df_merge_1[['l_extendedprice', 'l_discount']]
df_aggr_1 = pd.DataFrame()
df_aggr_1['revenue'] = [(df_merge_1.l_extendedprice * ( 1 - df_merge_1.l_discount )).sum()]
df_aggr_1 = df_aggr_1[['revenue']]
df_limit_1 = df_aggr_1[['revenue']]
end_time = time.time()
print(df_limit_1.head(1))

print("--- %s seconds ---" % (end_time - start_time))

       revenue
0 3083843.0578
--- 0.7763514518737793 seconds ---


In [12]:
# Hesam Pandas Query
start_time = time.time()

pa_filt = part[
                ((part.p_brand == "Brand#12") & (part.p_container.isin(["SM CASE", "SM BOX", "SM PACK", "SM PKG"])) & (part.p_size >= 1) & (part.p_size <= 5)) |
                ((part.p_brand == "Brand#23") & (part.p_container.isin(["MED BAG", "MED BOX", "MED PKG", "MED PACK"])) & (part.p_size >= 1) & (part.p_size <= 10)) |
                ((part.p_brand == "Brand#34") & (part.p_container.isin(["LG CASE", "LG BOX", "LG PACK", "LG PKG"])) & (part.p_size >= 1) & (part.p_size <= 15))
            ]
pa_proj = pa_filt[["p_partkey", "p_brand", "p_size", "p_container"]]

li_filt = lineitem[(((lineitem.l_shipmode == "AIR") | (lineitem.l_shipmode == "AIR REG")) & (lineitem.l_shipinstruct == "DELIVER IN PERSON"))]
li_pa_join = pd.merge(pa_proj, li_filt, left_on = "p_partkey", right_on = "l_partkey", how = "inner")
li_pa_join_filt = li_pa_join[
                        (
                            ((li_pa_join.p_brand == "Brand#12") & ((li_pa_join.l_quantity >= 1) & (li_pa_join.l_quantity <= 11))) | 
                            ((li_pa_join.p_brand == "Brand#23") & ((li_pa_join.l_quantity >= 10) & (li_pa_join.l_quantity <= 20))) |
                            ((li_pa_join.p_brand == "Brand#34") & ((li_pa_join.l_quantity >= 20) & (li_pa_join.l_quantity <= 30)))
                        )
                    ]
li_pa_join_filt["revenue"] = li_pa_join_filt.l_extendedprice * (1 - li_pa_join_filt.l_discount)

result = li_pa_join_filt.revenue.sum()
end_time = time.time()
print(result)

print("--- %s seconds ---" % (end_time - start_time))

3083843.0578
--- 1.328519582748413 seconds ---


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  li_pa_join_filt["revenue"] = li_pa_join_filt.l_extendedprice * (1 - li_pa_join_filt.l_discount)


### Success!

Running the command in PSQL gives the output:

| revenue |
|--------|
| 3083843.0578 |

This is similar to Pandas, so all good.

Time information (all times in seconds, to 3 s.f.):
| | Run 1 | Run 2 | Run 3 | Average |
| --- | --- | --- | --- | --- |
| Pandas |  |  |  |  |
| PostgreSQL | 0.202 | 0.0728 | 0.0972 | 0.124 |
| Hesam Pandas | 1.50 | 1.27 | 1.33 | 1.37 |