In [37]:
import pandas as pd
import numpy as np
from datetime import date
from numpy import char
pd.set_option('display.float_format', str)

## Data

In [38]:
l_columnnames = ["L_ORDERKEY", "L_PARTKEY", "L_SUPPKEY", "L_LINENUMBER", "L_QUANTITY", "L_EXTENDEDPRICE", "L_DISCOUNT", "L_TAX", "L_RETURNFLAG", "L_LINESTATUS", "L_SHIPDATE", "L_COMMITDATE", "L_RECEIPTDATE", "L_SHIPINSTRUCT","L_SHIPMODE", "L_COMMENT"]

for i in range(len(l_columnnames)):
    l_columnnames[i] = l_columnnames[i].lower()
    
l_data_types = {
    'l_orderkey': int,
    'l_partkey': int,
    'l_suppkey': int,
    'l_linenumber': int,
    'l_quantity': float,
    'l_extendedprice': float,
    'l_discount': float,
    'l_tax': float,
    'l_returnflag': str,
    'l_linestatus': str,
    'l_shipinstruct': str,
    'l_shipmode': str,
    'l_comment': str
}

l_parse_dates = ['l_shipdate', 'l_commitdate', 'l_receiptdate']

In [39]:
o_columnnames = ["O_ORDERKEY", "O_CUSTKEY", "O_ORDERSTATUS", "O_TOTALPRICE", "O_ORDERDATE", "O_ORDERPRIORITY", "O_CLERK", "O_SHIPPRIORITY", "O_COMMENT"]

for i in range(len(o_columnnames)):
    o_columnnames[i] = o_columnnames[i].lower()
    
o_data_types = {
    'o_orderkey': int,
    'o_custkey': int,
    'o_orderstatus': str,
    'o_totalprice': float,
    'o_orderpriority': str,
    'o_clerk': str,
    'o_shippriority': int,
    'o_comment': str
}

o_parse_dates = ['o_orderdate']

In [40]:
c_columnnames = ["C_CUSTKEY", "C_NAME", "C_ADDRESS", "C_NATIONKEY", "C_PHONE", "C_ACCTBAL", "C_MKTSEGMENT", "C_COMMENT"]

for i in range(len(c_columnnames)):
    c_columnnames[i] = c_columnnames[i].lower()
    
c_data_types = {
    'c_custkey': int,
    'c_name': str,
    'c_address': str,
    'c_nationkey': int,
    'c_phone': str,
    'c_acctbal': float,
    'c_mktsegment': str,
    'c_comment': str
}

c_parse_dates = []

### Lineitem DataFrame

In [41]:
# Don't set indexes, as we can't access them with Pandas selection!
lineitem = pd.read_table("../tpch-pgsql-master/data/load/lineitem.tbl.csv", sep="|", names=l_columnnames, dtype=l_data_types, parse_dates=l_parse_dates)

### Orders DataFrame

In [42]:
# Don't set indexes, as we can't access them with Pandas selection!
orders = pd.read_table("../tpch-pgsql-master/data/load/orders.tbl.csv", sep="|", names=o_columnnames, dtype=o_data_types, parse_dates=o_parse_dates)

### Customer DataFrame

In [43]:
# Don't set indexes, as we can't access them with Pandas selection!
customer = pd.read_table("../tpch-pgsql-master/data/load/customer.tbl.csv", sep="|", names=c_columnnames, dtype=c_data_types, parse_dates=c_parse_dates)

## Q3

In [50]:
import time
start_time = time.time()

df_intermediate = orders[orders.o_orderdate < pd.Timestamp('1995-03-27 00:00:00')]
df_filter_1 = df_intermediate[['o_orderkey', 'o_custkey', 'o_orderstatus', 'o_totalprice', 'o_orderdate', 'o_orderpriority', 'o_clerk', 'o_shippriority', 'o_comment']]
df_intermediate = customer[customer.c_mktsegment == 'MACHINERY']
df_filter_2 = df_intermediate[['c_custkey']]
df_intermediate = pd.DataFrame()
df_intermediate = df_filter_1.merge(df_filter_2, left_on="o_custkey", right_on="c_custkey")
df_merge_1 = df_intermediate[['o_orderdate', 'o_shippriority', 'o_orderkey']]
df_intermediate = lineitem[lineitem.l_shipdate > pd.Timestamp('1995-03-27 00:00:00')]
df_filter_3 = df_intermediate[['l_orderkey', 'l_partkey', 'l_suppkey', 'l_linenumber', 'l_quantity', 'l_extendedprice', 'l_discount', 'l_tax', 'l_returnflag', 'l_linestatus', 'l_shipdate', 'l_commitdate', 'l_receiptdate', 'l_shipinstruct', 'l_shipmode', 'l_comment']]
df_intermediate = pd.DataFrame()
df_intermediate = df_merge_1.merge(df_filter_3, left_on="o_orderkey", right_on="l_orderkey")
df_merge_2 = df_intermediate[['l_orderkey', 'o_orderdate', 'o_shippriority', 'l_extendedprice', 'l_discount']]
df_intermediate = df_merge_2.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority'])
df_intermediate = df_intermediate.apply(lambda s: pd.Series({
    "revenue": (s["l_extendedprice"] * ( 1 - s["l_discount"] )).sum(),
}))
df_group_1 = df_intermediate[['revenue']]
df_intermediate = df_group_1.sort_values(by=['revenue', 'o_orderdate'], ascending=[False, True])
df_sort_1 = df_intermediate[['revenue']]
df_limit_1 = df_sort_1[['revenue']]
print(df_limit_1.head(10))

print("--- %s seconds ---" % (time.time() - start_time))

                                                 revenue
l_orderkey o_orderdate o_shippriority                   
885895     1995-03-26  0                     428442.5359
4207744    1995-03-26  0              392122.02509999997
5660420    1995-03-08  0                     389343.1796
3837441    1995-02-14  0                     382568.0383
5006400    1995-03-18  0                     377083.4322
1983815    1995-03-25  0                     373983.2402
4837957    1995-02-21  0                     369585.3452
2759685    1995-02-27  0                     368851.5057
2500838    1995-03-18  0                      357224.291
2315457    1995-02-25  0                     354719.2327
--- 12.143398761749268 seconds ---


### Success!

Running the command in PSQL gives the output:

| l_orderkey | revenue | o_orderdate | o_shippriority |
|--------|-------------|-------------|----------------|
| 885895 | 428442.5359 | 1995-03-26 | 0 |
| 4207744 | 392122.0251 | 1995-03-26 | 0 |
| 5660420 | 389343.1796 | 1995-03-08 | 0 |
| 3837441 | 382568.0383 | 1995-02-14 | 0 |
| 5006400 | 377083.4322 | 1995-03-18 | 0 |
| 1983815 | 373983.2402 | 1995-03-25 | 0 |
| 4837957 | 369585.3452 | 1995-02-21 | 0 |
|2759685 | 368851.5057 | 1995-02-27 | 0 |
| 2500838 | 357224.2910 | 1995-03-18 | 0 |
| 2315457 | 354719.2327 | 1995-02-25 | 0 |

This is **ROUGHLY** the same numbers as Pandas, so good.

Time information (all times in seconds, to 3 s.f.):
| | Run 1 | Run 2 | Run 3 | Average |
| --- | --- | --- | --- | --- |
| Pandas | 10.0 | 11.8 | 12.1 | 11.3 |
| PostgreSQL | 1.81 | 1.03 | 1.06 | 1.3 |

### Future Change Needed

Not super essential, but at the moment we have no way to make the output occur in the correct **order**. This is because we have some columns that are actually indexes and there's no ways to change the order between indexes and columns. 

What we could do, is in the final node (the "presentation node"/LIMIT), we convert all the indexes to columns (using rename_axis and reset_index) and then we change change the order based on output there