In [18]:
import time
from datetime import date

import numpy as np
import pandas as pd
from numpy import char
# Format floats with the built-in str, so printed frames show each value's
# full repr (e.g. 405838.69889999996) instead of pandas' default rounding.
pd.options.display.float_format = str

## Data

In [19]:
# Column names of the lineitem table, in the order they are passed as `names=`
# when the file is read. They are written in upper case and lower-cased here
# so that they match the keys of `l_data_types` and `l_parse_dates` below.
l_columnnames = [
    name.lower()
    for name in [
        "L_ORDERKEY", "L_PARTKEY", "L_SUPPKEY", "L_LINENUMBER",
        "L_QUANTITY", "L_EXTENDEDPRICE", "L_DISCOUNT", "L_TAX",
        "L_RETURNFLAG", "L_LINESTATUS",
        "L_SHIPDATE", "L_COMMITDATE", "L_RECEIPTDATE",
        "L_SHIPINSTRUCT", "L_SHIPMODE", "L_COMMENT",
    ]
]

# dtype of every non-date column. The three date columns are deliberately
# absent: they are listed in `l_parse_dates` instead.
l_data_types = {
    "l_orderkey": int,
    "l_partkey": int,
    "l_suppkey": int,
    "l_linenumber": int,
    "l_quantity": float,
    "l_extendedprice": float,
    "l_discount": float,
    "l_tax": float,
    "l_returnflag": str,
    "l_linestatus": str,
    "l_shipinstruct": str,
    "l_shipmode": str,
    "l_comment": str,
}

# Columns to be parsed as dates when the file is read.
l_parse_dates = ["l_shipdate", "l_commitdate", "l_receiptdate"]

In [20]:
# Column names of the orders table, in the order they are passed as `names=`
# when the file is read. They are written in upper case and lower-cased here
# so that they match the keys of `o_data_types` and `o_parse_dates` below.
o_columnnames = [
    name.lower()
    for name in [
        "O_ORDERKEY", "O_CUSTKEY", "O_ORDERSTATUS", "O_TOTALPRICE",
        "O_ORDERDATE", "O_ORDERPRIORITY", "O_CLERK", "O_SHIPPRIORITY",
        "O_COMMENT",
    ]
]

# dtype of every non-date column. `o_orderdate` is deliberately absent: it is
# listed in `o_parse_dates` instead.
o_data_types = {
    "o_orderkey": int,
    "o_custkey": int,
    "o_orderstatus": str,
    "o_totalprice": float,
    "o_orderpriority": str,
    "o_clerk": str,
    "o_shippriority": int,
    "o_comment": str,
}

# Columns to be parsed as dates when the file is read.
o_parse_dates = ["o_orderdate"]

In [21]:
# Column names of the customer table, in the order they are passed as `names=`
# when the file is read. They are written in upper case and lower-cased here
# so that they match the keys of `c_data_types` below.
c_columnnames = [
    name.lower()
    for name in [
        "C_CUSTKEY", "C_NAME", "C_ADDRESS", "C_NATIONKEY",
        "C_PHONE", "C_ACCTBAL", "C_MKTSEGMENT", "C_COMMENT",
    ]
]

# dtype of every customer column.
c_data_types = {
    "c_custkey": int,
    "c_name": str,
    "c_address": str,
    "c_nationkey": int,
    "c_phone": str,
    "c_acctbal": float,
    "c_mktsegment": str,
    "c_comment": str,
}

# The customer table has no date columns to parse.
c_parse_dates = []

### Lineitem DataFrame

In [22]:
# Every field is loaded as an ordinary column; no index is set, because
# columns moved into the index cannot be reached with Pandas column selection.
lineitem = pd.read_table(
    "../../tpch-pgsql-master/data/load/lineitem.tbl.csv",
    sep="|",
    names=l_columnnames,
    dtype=l_data_types,
    parse_dates=l_parse_dates,
)

### Orders DataFrame

In [23]:
# Every field is loaded as an ordinary column; no index is set, because
# columns moved into the index cannot be reached with Pandas column selection.
orders = pd.read_table(
    "../../tpch-pgsql-master/data/load/orders.tbl.csv",
    sep="|",
    names=o_columnnames,
    dtype=o_data_types,
    parse_dates=o_parse_dates,
)

### Customer DataFrame

In [24]:
# Every field is loaded as an ordinary column; no index is set, because
# columns moved into the index cannot be reached with Pandas column selection.
customer = pd.read_table(
    "../../tpch-pgsql-master/data/load/customer.tbl.csv",
    sep="|",
    names=c_columnnames,
    dtype=c_data_types,
    parse_dates=c_parse_dates,
)

## Q3

In [30]:
# Q3, baseline translation: revenue per (order key, order date, ship priority)
# for 'BUILDING' customers, restricted to orders placed before 1995-03-15 and
# line items shipped after it. The aggregation goes through groupby().apply()
# with a Python lambda per group; the statements are intentionally left as-is
# because this cell is the slow reference reported as "Pandas" in the timing
# table further down.
#
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() is the wall clock and can be adjusted while
# the query runs.
start_time = time.perf_counter()

# Orders placed before the cut-off date. The projection keeps all nine columns.
df_filter_1 = orders[orders.o_orderdate < pd.Timestamp('1995-03-15 00:00:00')]
df_filter_1 = df_filter_1[['o_orderkey', 'o_custkey', 'o_orderstatus', 'o_totalprice', 'o_orderdate', 'o_orderpriority', 'o_clerk', 'o_shippriority', 'o_comment']]
# Customers in the BUILDING market segment; only the join key is kept.
df_filter_2 = customer[customer.c_mktsegment == 'BUILDING']
df_filter_2 = df_filter_2[['c_custkey']]
# Orders joined to their customer (merge defaults to an inner join).
df_merge_1 = df_filter_1.merge(df_filter_2, left_on="o_custkey", right_on="c_custkey")
df_merge_1 = df_merge_1[['o_orderdate', 'o_shippriority', 'o_orderkey']]
# Line items shipped after the cut-off date. The projection keeps all sixteen columns.
df_filter_3 = lineitem[lineitem.l_shipdate > pd.Timestamp('1995-03-15 00:00:00')]
df_filter_3 = df_filter_3[['l_orderkey', 'l_partkey', 'l_suppkey', 'l_linenumber', 'l_quantity', 'l_extendedprice', 'l_discount', 'l_tax', 'l_returnflag', 'l_linestatus', 'l_shipdate', 'l_commitdate', 'l_receiptdate', 'l_shipinstruct', 'l_shipmode', 'l_comment']]
# Line items joined to the qualifying orders, reduced to the columns needed below.
df_merge_2 = df_merge_1.merge(df_filter_3, left_on="o_orderkey", right_on="l_orderkey")
df_merge_2 = df_merge_2[['l_orderkey', 'o_orderdate', 'o_shippriority', 'l_extendedprice', 'l_discount']]
# revenue = sum(l_extendedprice * (1 - l_discount)) per group, computed by
# calling the lambda once for every group.
df_group_1 = df_merge_2.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority'])
df_group_1 = df_group_1.apply(lambda s: pd.Series({
    "revenue": (s["l_extendedprice"] * ( 1 - s["l_discount"] )).sum(),
}))
df_group_1 = df_group_1[['revenue']]
# Highest revenue first, ties broken by earliest order date. 'o_orderdate' is
# an index level at this point; sort_values accepts index level names in `by`.
df_sort_1 = df_group_1.sort_values(by=['revenue', 'o_orderdate'], ascending=[False, True])
df_sort_1 = df_sort_1[['revenue']]
# Turn the group keys from index levels back into columns, then put the
# columns into the output order.
df_limit_1 = df_sort_1.rename_axis(['l_orderkey', 'o_orderdate', 'o_shippriority']).reset_index()
df_limit_1 = df_limit_1[['l_orderkey', 'revenue', 'o_orderdate', 'o_shippriority']]
# The clock is stopped before printing, so displaying the rows is not timed.
end_time = time.perf_counter()
print(df_limit_1.head(10))

print("--- %s seconds ---" % (end_time - start_time))

   l_orderkey            revenue o_orderdate  o_shippriority
0     2456423        406181.0111  1995-03-05               0
1     3459808 405838.69889999996  1995-03-04               0
2      492164         390324.061  1995-02-19               0
3     1188320        384537.9359  1995-03-09               0
4     2435712 378673.05580000003  1995-02-26               0
5     4878020        378376.7952  1995-03-12               0
6     5521732        375153.9215  1995-03-13               0
7     2628192 373133.30939999997  1995-02-22               0
8      993600        371407.4595  1995-03-05               0
9     2300070  367371.1452000001  1995-03-13               0
--- 9.829975605010986 seconds ---


In [33]:
# Hesam / SDQL Pandas
# Hand-written Q3: the same filters and joins as the generated queries, with
# revenue computed as a vectorised column before grouping and summed with a
# named aggregation. The result keeps the group keys as index levels.
#
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() is the wall clock and can be adjusted while
# the query runs.
start_time = time.perf_counter()

# Customers in the BUILDING market segment; only the join key is kept.
cu_filt = customer[customer.c_mktsegment == "BUILDING"]
cu_filt = cu_filt[["c_custkey"]]

# Orders placed before the cut-off date (the date column is compared against
# a date string), joined to the qualifying customers.
ord_filt = orders[orders.o_orderdate < "1995-03-15"]
ord_cu_join = pd.merge(cu_filt, ord_filt, left_on = "c_custkey", right_on = "o_custkey", how = "inner")
ord_cu_join = ord_cu_join[["o_orderkey", "o_orderdate", "o_shippriority"]]

# Line items shipped after the cut-off date, joined to the qualifying orders.
li_filt = lineitem[lineitem.l_shipdate > "1995-03-15"]
li_order_join = pd.merge(ord_cu_join, li_filt, left_on = "o_orderkey", right_on = "l_orderkey", how = "inner")
# Per-line revenue, computed once over the whole joined frame.
li_order_join["revenue"] = li_order_join.l_extendedprice * (1 - li_order_join.l_discount)

result = li_order_join \
    .groupby(["l_orderkey", "o_orderdate", "o_shippriority"]) \
    .agg(revenue=("revenue", "sum"))

# Highest revenue first, ties broken by earliest order date. 'o_orderdate' is
# an index level at this point; sort_values accepts index level names in `by`.
df_sort_1 = result.sort_values(by=['revenue', 'o_orderdate'], ascending=[False, True])
# The clock is stopped before printing, so displaying the rows is not timed.
end_time = time.perf_counter()
print(df_sort_1.head(10))

print("--- %s seconds ---" % (end_time - start_time))

                                                 revenue
l_orderkey o_orderdate o_shippriority                   
2456423    1995-03-05  0                     406181.0111
3459808    1995-03-04  0              405838.69889999996
492164     1995-02-19  0                      390324.061
1188320    1995-03-09  0                     384537.9359
2435712    1995-02-26  0                     378673.0558
4878020    1995-03-12  0                     378376.7952
5521732    1995-03-13  0                     375153.9215
2628192    1995-02-22  0                     373133.3094
993600     1995-03-05  0                     371407.4595
2300070    1995-03-13  0                     367371.1452
--- 1.6345527172088623 seconds ---


In [36]:
# New Callum Query
# Same filters, joins and projections as the baseline Q3 translation, but the
# revenue is computed as a vectorised column before grouping and summed with a
# named aggregation instead of groupby().apply() with a per-group lambda.
#
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() is the wall clock and can be adjusted while
# the query runs.
start_time = time.perf_counter()

# Orders placed before the cut-off date. The projection keeps all nine columns.
df_filter_1 = orders[orders.o_orderdate < pd.Timestamp('1995-03-15 00:00:00')]
df_filter_1 = df_filter_1[['o_orderkey', 'o_custkey', 'o_orderstatus', 'o_totalprice', 'o_orderdate', 'o_orderpriority', 'o_clerk', 'o_shippriority', 'o_comment']]
# Customers in the BUILDING market segment; only the join key is kept.
df_filter_2 = customer[customer.c_mktsegment == 'BUILDING']
df_filter_2 = df_filter_2[['c_custkey']]
# Orders joined to their customer (merge defaults to an inner join).
df_merge_1 = df_filter_1.merge(df_filter_2, left_on="o_custkey", right_on="c_custkey")
df_merge_1 = df_merge_1[['o_orderdate', 'o_shippriority', 'o_orderkey']]
# Line items shipped after the cut-off date. The projection keeps all sixteen columns.
df_filter_3 = lineitem[lineitem.l_shipdate > pd.Timestamp('1995-03-15 00:00:00')]
df_filter_3 = df_filter_3[['l_orderkey', 'l_partkey', 'l_suppkey', 'l_linenumber', 'l_quantity', 'l_extendedprice', 'l_discount', 'l_tax', 'l_returnflag', 'l_linestatus', 'l_shipdate', 'l_commitdate', 'l_receiptdate', 'l_shipinstruct', 'l_shipmode', 'l_comment']]
# Line items joined to the qualifying orders, reduced to the columns needed below.
df_merge_2 = df_merge_1.merge(df_filter_3, left_on="o_orderkey", right_on="l_orderkey")
df_merge_2 = df_merge_2[['l_orderkey', 'o_orderdate', 'o_shippriority', 'l_extendedprice', 'l_discount']]
# Per-line revenue, computed once over the whole joined frame.
df_merge_2['revenue'] = df_merge_2.l_extendedprice * ( 1 - df_merge_2.l_discount )
df_group_1 = df_merge_2 \
    .groupby(['l_orderkey', 'o_orderdate', 'o_shippriority']) \
    .agg(
        revenue=("revenue", "sum"),
    )
df_group_1 = df_group_1[['revenue']]
# Highest revenue first, ties broken by earliest order date. 'o_orderdate' is
# an index level at this point; sort_values accepts index level names in `by`.
df_sort_1 = df_group_1.sort_values(by=['revenue', 'o_orderdate'], ascending=[False, True])
df_sort_1 = df_sort_1[['revenue']]
# Turn the group keys from index levels back into columns, then put the
# columns into the output order.
df_limit_1 = df_sort_1.rename_axis(['l_orderkey', 'o_orderdate', 'o_shippriority']).reset_index()
df_limit_1 = df_limit_1[['l_orderkey', 'revenue', 'o_orderdate', 'o_shippriority']]
# The clock is stopped before printing, so displaying the rows is not timed.
end_time = time.perf_counter()
print(df_limit_1.head(10))

print("--- %s seconds ---" % (end_time - start_time))

   l_orderkey            revenue o_orderdate  o_shippriority
0     2456423        406181.0111  1995-03-05               0
1     3459808 405838.69889999996  1995-03-04               0
2      492164         390324.061  1995-02-19               0
3     1188320        384537.9359  1995-03-09               0
4     2435712        378673.0558  1995-02-26               0
5     4878020        378376.7952  1995-03-12               0
6     5521732        375153.9215  1995-03-13               0
7     2628192        373133.3094  1995-02-22               0
8      993600        371407.4595  1995-03-05               0
9     2300070        367371.1452  1995-03-13               0
--- 1.544177770614624 seconds ---


### Success!

Running the command in PSQL gives the output:

| l_orderkey | revenue | o_orderdate | o_shippriority |
|--------|-------------|-------------|----------------|
| 2456423 | 406181.0111 | 1995-03-05  |              0 |
| 3459808 | 405838.6989 | 1995-03-04  |              0 |
| 492164 | 390324.0610 | 1995-02-19  |              0 |
| 1188320 | 384537.9359 | 1995-03-09  |              0 |
| 2435712 | 378673.0558 | 1995-02-26  |              0 |
| 4878020 | 378376.7952 | 1995-03-12  |              0 |
| 5521732 | 375153.9215 | 1995-03-13  |              0 |
| 2628192 | 373133.3094 | 1995-02-22  |              0 |
| 993600 | 371407.4595 | 1995-03-05  |              0 |
| 2300070 | 367371.1452 | 1995-03-13  |              0 |

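These are **roughly** the same numbers as Pandas produces (the only differences are floating-point rounding in the trailing digits, e.g. 405838.69889999996 vs. 405838.6989), so the results match.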

Time information (all times in seconds, to 3 s.f.):

| | Run 1 | Run 2 | Run 3 | Average |
| --- | --- | --- | --- | --- |
| Pandas | 8.92 | 8.83 | 9.83 | 9.19 |
| New Pandas | 1.58 | 1.49 | 1.54 | 1.54|
| PostgreSQL | 2.38 | 1.25 | 1.13 | 1.59 |
| Hesam Pandas | 1.91 | 1.21 | 1.63 | 1.58 |

### Future Change Needed

Not super essential, but at the moment we have no way to make the output columns appear in the correct **order**. This is because some of the output columns are actually index levels, and there is no way to reorder index levels relative to ordinary columns.

What we could do is, in the final node (the "presentation node"/LIMIT), convert all the index levels to columns (using `rename_axis` and `reset_index`) and then reorder the columns there to match the expected output.

**DONE: 25.10.2022**