In [24]:
import pandas as pd
import numpy as np

In [28]:
"""
CREATE TABLE LINEITEM (
    L_ORDERKEY        INTEGER NOT NULL, -- references O_ORDERKEY
    L_PARTKEY        INTEGER NOT NULL, -- references P_PARTKEY (compound fk to PARTSUPP)
    L_SUPPKEY        INTEGER NOT NULL, -- references S_SUPPKEY (compound fk to PARTSUPP)
    L_LINENUMBER    INTEGER,
    L_QUANTITY        DECIMAL,
    L_EXTENDEDPRICE    DECIMAL,
    L_DISCOUNT        DECIMAL,
    L_TAX            DECIMAL,
    L_RETURNFLAG    CHAR(1),
    L_LINESTATUS    CHAR(1),
    L_SHIPDATE        DATE,
    L_COMMITDATE    DATE,
    L_RECEIPTDATE    DATE,
    L_SHIPINSTRUCT    CHAR(25),
    L_SHIPMODE        CHAR(10),
    L_COMMENT        VARCHAR(44)
);
"""

from datetime import date
from numpy import char


# Column names in the order they appear in the LINEITEM DDL, lowercased.
columnnames = [
    name.lower()
    for name in (
        "L_ORDERKEY", "L_PARTKEY", "L_SUPPKEY", "L_LINENUMBER",
        "L_QUANTITY", "L_EXTENDEDPRICE", "L_DISCOUNT", "L_TAX",
        "L_RETURNFLAG", "L_LINESTATUS",
        "L_SHIPDATE", "L_COMMITDATE", "L_RECEIPTDATE",
        "L_SHIPINSTRUCT", "L_SHIPMODE", "L_COMMENT",
    )
]

# Explicit dtypes for every non-date column, grouped by target type.
# The three date columns (l_shipdate, l_commitdate, l_receiptdate) are
# deliberately absent from this mapping.
data_types = {
    **dict.fromkeys(("l_orderkey", "l_partkey", "l_suppkey", "l_linenumber"), int),
    **dict.fromkeys(("l_quantity", "l_extendedprice", "l_discount", "l_tax"), float),
    **dict.fromkeys(
        ("l_returnflag", "l_linestatus", "l_shipinstruct", "l_shipmode", "l_comment"),
        str,
    ),
}

In [32]:
# Load the pipe-delimited LINEITEM export into a DataFrame.
df = pd.read_table(
    "../tpch-pgsql-master/data/load/lineitem.tbl.csv",
    sep="|",
    # Column names are supplied explicitly rather than read from the file.
    names=columnnames,
    # Rows are indexed by (order key, line number).
    index_col=("l_orderkey", "l_linenumber"),
    dtype=data_types,
    # The date columns are parsed to datetimes instead of being given a dtype.
    parse_dates=['l_shipdate', 'l_commitdate', 'l_receiptdate'],
)

In [40]:
import time

# Time a filter-and-sum revenue query over the lineitem frame.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() follows the wall clock, which can be adjusted
# while the cell runs and may have coarse resolution.
start_time = time.perf_counter()

# Row predicates: shipped during 1993, discount within [0.07, 0.09],
# and quantity strictly below 25.
shipped_in_1993 = (
    (df.l_shipdate >= pd.Timestamp('1993-01-01 00:00:00'))
    & (df.l_shipdate < pd.Timestamp('1994-01-01 00:00:00'))
)
discount_in_range = (df.l_discount >= 0.07) & (df.l_discount <= 0.09)
quantity_below_limit = df.l_quantity < 25

df_filter = df[shipped_in_1993 & discount_in_range & quantity_below_limit]

# Revenue is the sum of extended price times discount over the matching rows.
revenue = (df_filter.l_extendedprice * df_filter.l_discount).sum()
print(revenue)

# The measured span still includes the print above, as in the original cell.
print("--- %s seconds ---" % (time.perf_counter() - start_time))

179018379.9113
--- 0.15473103523254395 seconds ---


# Success!

Running the equivalent SQL query in psql gives the output:
|     revenue      |
| ---------------- |
|  179018379.9113  |

This matches the pandas result exactly, so the two implementations agree.

Timing results (in seconds, to 3 s.f.):
| Technology | Run 1 (s) | Run 2 (s) | Run 3 (s) | Average (s) |
| --- | --- | --- | --- | --- |
| Pandas | 0.136 | 0.143 | 0.155 | 0.145 |
| PostgreSQL | 0.926 | 0.442 | 0.364 | 0.577 |