In [6]:
import pandas as pd
import numpy as np
pd.set_option('display.float_format', str)
from datetime import date
from numpy import char

In [7]:
""" 
from tpch-pgsql-master/query_root/prep_query:
    create_tbl.sql

CREATE TABLE LINEITEM (
    L_ORDERKEY        INTEGER NOT NULL, -- references O_ORDERKEY
    L_PARTKEY        INTEGER NOT NULL, -- references P_PARTKEY (compound fk to PARTSUPP)
    L_SUPPKEY        INTEGER NOT NULL, -- references S_SUPPKEY (compound fk to PARTSUPP)
    L_LINENUMBER    INTEGER,
    L_QUANTITY        DECIMAL,
    L_EXTENDEDPRICE    DECIMAL,
    L_DISCOUNT        DECIMAL,
    L_TAX            DECIMAL,
    L_RETURNFLAG    CHAR(1),
    L_LINESTATUS    CHAR(1),
    L_SHIPDATE        DATE,
    L_COMMITDATE    DATE,
    L_RECEIPTDATE    DATE,
    L_SHIPINSTRUCT    CHAR(25),
    L_SHIPMODE        CHAR(10),
    L_COMMENT        VARCHAR(44)
);

and
    create_idx.sql
    
ALTER TABLE LINEITEM ADD PRIMARY KEY (L_ORDERKEY, L_LINENUMBER);
"""

columnnames = ["L_ORDERKEY", "L_PARTKEY", "L_SUPPKEY", "L_LINENUMBER", "L_QUANTITY", "L_EXTENDEDPRICE", "L_DISCOUNT", "L_TAX", "L_RETURNFLAG", "L_LINESTATUS", "L_SHIPDATE", "L_COMMITDATE", "L_RECEIPTDATE", "L_SHIPINSTRUCT","L_SHIPMODE", "L_COMMENT"]

for i in range(len(columnnames)):
    columnnames[i] = columnnames[i].lower()
    
data_types = {
    'l_orderkey': int,
    'l_partkey': int,
    'l_suppkey': int,
    'l_linenumber': int,
    'l_quantity': float,
    'l_extendedprice': float,
    'l_discount': float,
    'l_tax': float,
    'l_returnflag': str,
    'l_linestatus': str,
    'l_shipinstruct': str,
    'l_shipmode': str,
    'l_comment': str
}

In [8]:
# Don't set indexes, as we can't access them with Pandas selection!
lineitem = pd.read_table("../../tpch-pgsql-master/data/load/lineitem.tbl.csv", sep="|", names=columnnames, dtype=data_types, parse_dates=['l_shipdate', 'l_commitdate', 'l_receiptdate'])

## Q6

In [9]:
import time
start_time = time.time()

df_filter_1 = lineitem[(lineitem.l_shipdate >= pd.Timestamp('1994-01-01 00:00:00')) & (lineitem.l_shipdate < pd.Timestamp('1995-01-01 00:00:00')) & (lineitem.l_discount >= 0.05) & (lineitem.l_discount <= 0.07) & (lineitem.l_quantity < 24)]
df_filter_1 = df_filter_1[['l_orderkey', 'l_partkey', 'l_suppkey', 'l_linenumber', 'l_quantity', 'l_extendedprice', 'l_discount', 'l_tax', 'l_returnflag', 'l_linestatus', 'l_shipdate', 'l_commitdate', 'l_receiptdate', 'l_shipinstruct', 'l_shipmode', 'l_comment']]
df_aggr_1 = pd.DataFrame()
df_aggr_1['revenue'] = [(df_filter_1.l_extendedprice * df_filter_1.l_discount).sum()]
df_aggr_1 = df_aggr_1[['revenue']]
df_limit_1 = df_aggr_1[['revenue']]
end_time = time.time()
print(df_limit_1.head(1))

print("--- %s seconds ---" % (end_time - start_time))

             revenue
0 123141078.22829999
--- 0.19051766395568848 seconds ---


In [21]:
# Hesam / SDQL Pandas
start_time = time.time()

li_filt = lineitem[
    (lineitem.l_shipdate >= "1994-01-01") &
    (lineitem.l_shipdate < "1995-01-01") &
    (lineitem.l_discount >= 0.05) &
    (lineitem.l_discount <= 0.07) &
    (lineitem.l_quantity < 24)
]
li_filt.head()
result = pd.DataFrame()
result['revenue'] = [(li_filt.l_extendedprice * li_filt.l_discount).sum()]
end_time = time.time()
print(result.head(1))

print("--- %s seconds ---" % (end_time - start_time))

             revenue
0 123141078.22829999
--- 0.14022350311279297 seconds ---


### Success!

Running the command in PSQL gives the output:
|     revenue      |
| ---------------- |
|  123141078.2283  |

This is the same number as Pandas, so good.

Time information (all times in seconds, to 3 s.f.):
| | Run 1 | Run 2 | Run 3 | Average |
| --- | --- | --- | --- | --- |
| Pandas | 0.263 | 0.172 | 0.170 | 0.202 |
| PostgreSQL | 1.28 | 1.01 | 1.07 | 1.12 |
| Hesam Pandas | 0.148 | 0.157 | 0.152 | 0.152 |

## Q1

In [10]:
import time
start_time = time.time()

df_filter_1 = lineitem[lineitem.l_shipdate <= pd.Timestamp('1998-09-02 00:00:00')]
df_filter_1 = df_filter_1[['l_orderkey', 'l_partkey', 'l_suppkey', 'l_linenumber', 'l_quantity', 'l_extendedprice', 'l_discount', 'l_tax', 'l_returnflag', 'l_linestatus', 'l_shipdate', 'l_commitdate', 'l_receiptdate', 'l_shipinstruct', 'l_shipmode', 'l_comment']]
df_group_1 = df_filter_1.groupby(['l_returnflag', 'l_linestatus'])
df_group_1 = df_group_1.apply(lambda s: pd.Series({
    "sum_qty": (s["l_quantity"]).sum(),
    "sum_base_price": (s["l_extendedprice"]).sum(),
    "sum_disc_price": (s["l_extendedprice"] * ( 1 - s["l_discount"] )).sum(),
    "sum_charge": (( s["l_extendedprice"] * ( 1 - s["l_discount"] )) * ( 1 + s["l_tax"] )).sum(),
    "avg_qty": (s["l_quantity"]).mean(),
    "avg_price": (s["l_extendedprice"]).mean(),
    "avg_disc": (s["l_discount"]).mean(),
    "count_order": len(s.index),
}))
df_group_1 = df_group_1[['sum_qty', 'sum_base_price', 'sum_disc_price', 'sum_charge', 'avg_qty', 'avg_price', 'avg_disc', 'count_order']]
df_sort_1 = df_group_1.sort_values(by=['l_returnflag', 'l_linestatus'], ascending=[True, True])
df_sort_1 = df_sort_1[['sum_qty', 'sum_base_price', 'sum_disc_price', 'sum_charge', 'avg_qty', 'avg_price', 'avg_disc', 'count_order']]
df_limit_1 = df_sort_1.rename_axis(['l_returnflag', 'l_linestatus']).reset_index()
df_limit_1 = df_limit_1[['l_returnflag', 'l_linestatus', 'sum_qty', 'sum_base_price', 'sum_disc_price', 'sum_charge', 'avg_qty', 'avg_price', 'avg_disc', 'count_order']]
end_time = time.time()
print(df_limit_1.head(1))

print("--- %s seconds ---" % (end_time - start_time))

  l_returnflag l_linestatus    sum_qty    sum_base_price    sum_disc_price  \
0            A            F 37734107.0 56586554400.72996 53758257134.87001   

         sum_charge            avg_qty          avg_price  \
0 55909065222.82771 25.522005853257337 38273.129734621645   

              avg_disc  count_order  
0 0.049985295838397614    1478493.0  
--- 3.430532693862915 seconds ---


In [13]:
start_time = time.time()

# Query from SDQL.py
li_filt = lineitem[(lineitem.l_shipdate <= "1998-09-02")]
li_filt["disc_price"] = li_filt.l_extendedprice * (1 - li_filt.l_discount)
li_filt["charge"] = li_filt.l_extendedprice * (1 - li_filt.l_discount) * (1 + li_filt.l_tax)

result = li_filt \
    .groupby(["l_returnflag", "l_linestatus"]) \
    .agg(
        sum_qty=("l_quantity", "sum"),
        sum_base_price=("l_extendedprice", "sum"),
        sum_disc_price=("disc_price", "sum"),
        sum_charge=("charge", "sum"),
        count_order=("l_quantity", "count")
    )
    
df_sort = result.sort_values(by=['l_returnflag', 'l_linestatus'], ascending=[True, True])
end_time = time.time()
print(df_sort.head(1))

print("--- %s seconds ---" % (end_time - start_time))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  li_filt["disc_price"] = li_filt.l_extendedprice * (1 - li_filt.l_discount)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  li_filt["charge"] = li_filt.l_extendedprice * (1 - li_filt.l_discount) * (1 + li_filt.l_tax)


                             sum_qty  sum_base_price  sum_disc_price  \
l_returnflag l_linestatus                                              
A            F            37734107.0  56586554400.73  53758257134.87   

                                 sum_charge  count_order  
l_returnflag l_linestatus                                 
A            F            55909065222.82769      1478493  
--- 2.1302566528320312 seconds ---


### Success!

Running the command in PSQL gives the output:

| l_returnflag | l_linestatus | sum_qty | sum_base_price | sum_disc_price | sum_charge | avg_qty | avg_price | avg_disc | count_order |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| A | F | 37734107 | 56586554400.73 | 53758257134.8700 | 55909065222.827692 | 25.5220058532573370 | 38273.129734621672 | 0.04998529583839761162 | 1478493 |


This is **ROUGHLY** (See below) the same numbers as Pandas, so good.

Time information (all times in seconds, to 3 s.f.):
| | Run 1 | Run 2 | Run 3 | Average |
| --- | --- | --- | --- | --- |
| Pandas | 3.53 | 4.70 | 3.81 | 4.01 |
| PostgreSQL | 5.99 | 6.22 | 6.30 | 6.17 |
| Hesam Pandas | 2.06 | 2.67 | 2.13 | 2.29 |

### Further Issue

The rounding in Pandas and PostgreSQL might (and on initial inspection looks to be)

*TODO:* Investigate Pandas and PostgreSQL datatypes, potentially look at schema



Column Datatypes Comparison:

| Postgres Type | Specifications | Pandas Type | Specifications |
| --- | --- | --- | --- |
| INTEGER | -2147483648 to +2147483647 | int64 | -9223372036854775808 to 9223372036854775807 |
| DECIMAL | 131072 digits before the decimal point; 16383 after  | float64 | Double precision float: sign bit, 11 bits exponent, 52 bits mantissa |
| CHAR(1) | fixed-length, blank padded | str | arrays of bytes representing Unicode characters |
| DATE | 4 bytes, date (no time of day) | datetime64[ns] | it represents an offset from 1970-01-01T00:00:00 |
| VARCHAR | variable-length with limit | str | arrays of bytes representing Unicode characters |

Let's look at "sum_base_price", in Postgres this is: _56586554400.73_ but in Pandas this is: _56586554400.72996_

"sum_base_price" is created from summing *l_extendedprice* (DECIMAL) in Postgres, through: *sum(l_extendedprice)*
And in Pandas, it's created by again summing *l_extendedprice* (float), through: *(s["l_extendedprice"]).sum()*

Is this a difference in how they are displayed?
In pandas, we have already set the display format for floats to be a string, for readability

**CONCLUSION**
This is fine, we if they're at least up to 5 (or n) significant figures of accuracy then that's completely okay.