In [2]:
import pandas as pd
import numpy as np
pd.set_option('display.float_format', str)

In [3]:
""" 
from tpch-pgsql-master/query_root/prep_query:
    create_tbl.sql

CREATE TABLE LINEITEM (
    L_ORDERKEY        INTEGER NOT NULL, -- references O_ORDERKEY
    L_PARTKEY        INTEGER NOT NULL, -- references P_PARTKEY (compound fk to PARTSUPP)
    L_SUPPKEY        INTEGER NOT NULL, -- references S_SUPPKEY (compound fk to PARTSUPP)
    L_LINENUMBER    INTEGER,
    L_QUANTITY        DECIMAL,
    L_EXTENDEDPRICE    DECIMAL,
    L_DISCOUNT        DECIMAL,
    L_TAX            DECIMAL,
    L_RETURNFLAG    CHAR(1),
    L_LINESTATUS    CHAR(1),
    L_SHIPDATE        DATE,
    L_COMMITDATE    DATE,
    L_RECEIPTDATE    DATE,
    L_SHIPINSTRUCT    CHAR(25),
    L_SHIPMODE        CHAR(10),
    L_COMMENT        VARCHAR(44)
);

and
    create_idx.sql
    
ALTER TABLE LINEITEM ADD PRIMARY KEY (L_ORDERKEY, L_LINENUMBER);
"""

from datetime import date
from numpy import char


columnnames = ["L_ORDERKEY", "L_PARTKEY", "L_SUPPKEY", "L_LINENUMBER", "L_QUANTITY", "L_EXTENDEDPRICE", "L_DISCOUNT", "L_TAX", "L_RETURNFLAG", "L_LINESTATUS", "L_SHIPDATE", "L_COMMITDATE", "L_RECEIPTDATE", "L_SHIPINSTRUCT","L_SHIPMODE", "L_COMMENT"]

for i in range(len(columnnames)):
    columnnames[i] = columnnames[i].lower()
    
data_types = {
    'l_orderkey': int,
    'l_partkey': int,
    'l_suppkey': int,
    'l_linenumber': int,
    'l_quantity': float,
    'l_extendedprice': float,
    'l_discount': float,
    'l_tax': float,
    'l_returnflag': str,
    'l_linestatus': str,
    'l_shipinstruct': str,
    'l_shipmode': str,
    'l_comment': str
}

In [4]:
# Don't set indexes, as we can't access them with Pandas selection!
df = pd.read_table("../tpch-pgsql-master/data/load/lineitem.tbl.csv", sep="|", names=columnnames, dtype=data_types, parse_dates=['l_shipdate', 'l_commitdate', 'l_receiptdate'])

In [5]:
import time
start_time = time.time()

df_intermediate = df[(df.l_shipdate >= pd.Timestamp('1993-01-01 00:00:00')) & (df.l_shipdate < pd.Timestamp('1994-01-01 00:00:00')) & (df.l_discount >= 0.07) & (df.l_discount <= 0.09) & (df.l_quantity < 25) ]
df_filter = df_intermediate[['l_orderkey', 'l_partkey', 'l_suppkey', 'l_linenumber', 'l_quantity', 'l_extendedprice', 'l_discount', 'l_tax', 'l_returnflag', 'l_linestatus', 'l_shipdate', 'l_commitdate', 'l_receiptdate', 'l_shipinstruct', 'l_shipmode', 'l_comment']]
df_aggr = pd.DataFrame()
df_aggr['revenue'] = [(df_filter.l_extendedprice * df_filter.l_discount).sum()]
print(df_aggr.head(1))

print("--- %s seconds ---" % (time.time() - start_time))

         revenue
0 179018379.9113
--- 0.2662086486816406 seconds ---


# Success!

Running the command in PSQL gives the output:
|     revenue      |
| ---------------- |
|  179018379.9113  |

This is the same number as Pandas, so good.

Time information (all times in seconds, to 3 s.f.):
| | Run 1 | Run 2 | Run 3 | Average |
| --- | --- | --- | --- | --- |
| Pandas | 0.136 | 0.143 | 0.155 | 0.145 |
| PostgreSQL | 0.926 | 0.442 | 0.364 | 0.577 |

## Q1

In [10]:
df_intermediate = df[df.l_shipdate <= pd.Timestamp('1998-08-11 00:00:00')]
df_filter = df_intermediate[['l_orderkey', 'l_partkey', 'l_suppkey', 'l_linenumber', 'l_quantity', 'l_extendedprice', 'l_discount', 'l_tax', 'l_returnflag', 'l_linestatus', 'l_shipdate', 'l_commitdate', 'l_receiptdate', 'l_shipinstruct', 'l_shipmode', 'l_comment']]
df_intermediate = df_filter.groupby(['l_returnflag', 'l_linestatus'])
df_intermediate = df_intermediate.apply(lambda s: pd.Series({
    "sum_qty": (s["l_quantity"]).sum(),
    "sum_base_price": (s["l_extendedprice"]).sum(),
    "sum_disc_price": (s["l_extendedprice"] * ( 1 - s["l_discount"] )).sum(),
    "sum_charge": (( s["l_extendedprice"] * ( 1 - s["l_discount"] )) * ( 1 + s["l_tax"] )).sum(),
    "avg_qty": (s["l_quantity"]).mean(),
    "avg_price": (s["l_extendedprice"]).mean(),
    "avg_disc": (s["l_discount"]).mean(),
    "count_order": len(s.index),
}))
df_group = df_intermediate[['sum_qty', 'sum_base_price', 'sum_disc_price', 'sum_charge', 'avg_qty', 'avg_price', 'avg_disc', 'count_order']]
df_intermediate = df_group.sort_values(by=['l_returnflag', 'l_linestatus'], ascending=[True, True])
df_sort = df_intermediate[['sum_qty', 'sum_base_price', 'sum_disc_price', 'sum_charge', 'avg_qty', 'avg_price', 'avg_disc', 'count_order']]
df_limit = df_sort[['sum_qty', 'sum_base_price', 'sum_disc_price', 'sum_charge', 'avg_qty', 'avg_price', 'avg_disc', 'count_order']]
print(df_limit.head(1))

                             sum_qty    sum_base_price    sum_disc_price  \
l_returnflag l_linestatus                                                  
A            F            37734107.0 56586554400.72996 53758257134.87001   

                                 sum_charge            avg_qty  \
l_returnflag l_linestatus                                        
A            F            55909065222.82771 25.522005853257337   

                                   avg_price             avg_disc  count_order  
l_returnflag l_linestatus                                                       
A            F            38273.129734621645 0.049985295838397614    1478493.0  


### Potential Issue

Is is going to be a problem for us.
The fact that the SQL plan has in the output section, all columns - including index ones
But in pandas we can't filter to return index ones as it gives an error

*TODO:* Work on Pandas output so to limit selecting by columns

### Further Issue

The rounding in Pandas and PostgreSQL might (and on initial inspection looks to be)

*TODO:* Investigate Pandas and PostgreSQL datatypes, potentially look at schema