In [43]:
from numpy import char
from datetime import date
import pandas as pd
import numpy as np
import time
# Format floats for display with the built-in str() instead of pandas'
# default float formatter.
pd.set_option('display.float_format', str)

### Supplier

In [44]:
# Supplier table layout: (upper-case column name, dtype) pairs, listed in the
# order the names are assigned to the file's columns.
_s_schema = [
    ("S_SUPPKEY", int),
    ("S_NAME", str),
    ("S_ADDRESS", str),
    ("S_NATIONKEY", int),
    ("S_PHONE", str),
    ("S_ACCTBAL", float),
    ("S_COMMENT", str),
]

# Column labels used in the DataFrame are the lower-cased names.
s_columnnames = [column.lower() for column, _ in _s_schema]

# dtype for each lower-cased column label.
s_data_types = {column.lower(): dtype for column, dtype in _s_schema}

# No columns are parsed as dates for this table.
s_parse_dates = []

In [45]:
# No index column is set on purpose: index levels can't be reached through
# ordinary pandas column selection, so every field stays a regular column.
supplier = pd.read_table(
    "../tpch-pgsql-master/data/load/supplier.tbl.csv",
    sep="|",
    names=s_columnnames,
    dtype=s_data_types,
    parse_dates=s_parse_dates,
)

### Partsupp

In [46]:
# Partsupp table layout: (upper-case column name, dtype) pairs, listed in the
# order the names are assigned to the file's columns.
_ps_schema = [
    ("PS_PARTKEY", int),
    ("PS_SUPPKEY", int),
    ("PS_AVAILQTY", int),
    ("PS_SUPPLYCOST", float),
    ("PS_COMMENT", str),
]

# Column labels used in the DataFrame are the lower-cased names.
ps_columnnames = [column.lower() for column, _ in _ps_schema]

# dtype for each lower-cased column label.
ps_data_types = {column.lower(): dtype for column, dtype in _ps_schema}

# No columns are parsed as dates for this table.
ps_parse_dates = []

In [47]:
# No index column is set on purpose: index levels can't be reached through
# ordinary pandas column selection, so every field stays a regular column.
partsupp = pd.read_table(
    "../tpch-pgsql-master/data/load/partsupp.tbl.csv",
    sep="|",
    names=ps_columnnames,
    dtype=ps_data_types,
    parse_dates=ps_parse_dates,
)

### Part

In [48]:
# Part table layout: (upper-case column name, dtype) pairs, listed in the
# order the names are assigned to the file's columns.
_p_schema = [
    ("P_PARTKEY", int),
    ("P_NAME", str),
    ("P_MFGR", str),
    ("P_BRAND", str),
    ("P_TYPE", str),
    ("P_SIZE", int),
    ("P_CONTAINER", str),
    ("P_RETAILPRICE", float),
    ("P_COMMENT", str),
]

# Column labels used in the DataFrame are the lower-cased names.
p_columnnames = [column.lower() for column, _ in _p_schema]

# dtype for each lower-cased column label.
p_data_types = {column.lower(): dtype for column, dtype in _p_schema}

# No columns are parsed as dates for this table.
p_parse_dates = []

In [49]:
# No index column is set on purpose: index levels can't be reached through
# ordinary pandas column selection, so every field stays a regular column.
part = pd.read_table(
    "../tpch-pgsql-master/data/load/part.tbl.csv",
    sep="|",
    names=p_columnnames,
    dtype=p_data_types,
    parse_dates=p_parse_dates,
)

## Query

#### Hesam

In [50]:
start_t = time.time()

# Parts that are not Brand#45, whose type does not start with
# "MEDIUM POLISHED", and whose size is one of the eight wanted values.
# `== False` is kept (instead of `~`) so a missing p_type compares as False
# and the row is dropped rather than raising on a non-boolean mask.
pa_filt = part[
            (part.p_brand != "Brand#45") &
            (part.p_type.str.startswith("MEDIUM POLISHED") == False) &
            part.p_size.isin([49, 14, 23, 45, 19, 3, 36, 9])
        ]
pa_proj = pa_filt[["p_partkey", "p_brand", "p_type", "p_size"]]

# Suppliers whose comment contains "Customer" followed, somewhere later, by
# "Complaints".
# The previous one-liner relied on `a & b < c`, which Python parses as
# `(a & b) < c` because `&` binds tighter than `<`; the mask is now built from
# explicitly parenthesised parts.
# "Customer" occupies indices customer_pos .. customer_pos + 7, so a following
# "Complaints" must start after customer_pos + 7. rfind() is used so that any
# later "Complaints" counts, not just the first occurrence in the comment.
customer_pos = supplier.s_comment.str.find("Customer")
complaints_pos = supplier.s_comment.str.rfind("Complaints")
su_filt = supplier[(customer_pos >= 0) & ((customer_pos + 7) < complaints_pos)]
su_proj = su_filt[["s_suppkey"]]

# Drop every part/supplier pair that belongs to one of those suppliers.
ps_filt = partsupp[~partsupp.ps_suppkey.isin(su_proj["s_suppkey"])]

ps_pa_join = pd.merge(pa_proj, ps_filt, left_on = "p_partkey", right_on = "ps_partkey", how = "inner")

# Number of distinct remaining suppliers per (brand, type, size).
result = ps_pa_join \
    .groupby(["p_brand", "p_type", "p_size"]) \
    .agg(supplier_cnt=("ps_suppkey", "nunique"))

print(len(result))
print(result)

print("---" * 5)
print(time.time() - start_t)
print("---" * 5)

18314
                                         supplier_cnt
p_brand  p_type                  p_size              
Brand#11 ECONOMY ANODIZED BRASS  3                  4
                                 14                16
                                 19                 8
                                 23                16
                                 45                 4
...                                               ...
Brand#55 STANDARD POLISHED STEEL 45                 8
                                 49                 8
         STANDARD POLISHED TIN   9                  4
                                 19                 4
                                 36                 4

[18314 rows x 1 columns]
---------------
1.3878214359283447
---------------


In [51]:
start_t = time.time()

# Suppliers whose comment contains "Customer" followed, somewhere later, by
# "Complaints". The first "Customer" is compared against the LAST "Complaints"
# so that a comment such as "Complaints ... Customer ... Complaints" is also
# matched; comparing first occurrences only would miss it.
customer_pos = supplier.s_comment.str.find("Customer")
complaints_pos = supplier.s_comment.str.rfind("Complaints")
df_filter_1 = supplier.loc[(customer_pos >= 0) & (complaints_pos > customer_pos), ['s_suppkey']]

# Part/supplier pairs that do not belong to one of those suppliers.
df_filter_2 = partsupp.loc[~partsupp.ps_suppkey.isin(df_filter_1["s_suppkey"]), ['ps_partkey', 'ps_suppkey']]

# Parts that are not Brand#45, whose type does not start with
# "MEDIUM POLISHED", and whose size is one of the eight wanted values.
df_filter_3 = part.loc[
    (part.p_brand != 'Brand#45')
    & (part.p_type.str.startswith("MEDIUM POLISHED") == False)
    & (part.p_size.isin([49, 14, 23, 45, 19, 3, 36, 9])),
    ['p_brand', 'p_type', 'p_size', 'p_partkey'],
]

df_merge_1 = df_filter_2.merge(df_filter_3, left_on="ps_partkey", right_on="p_partkey")
df_merge_1 = df_merge_1[['p_brand', 'p_type', 'p_size', 'ps_suppkey']]

# groupby() already orders the group keys, so no explicit sort is needed
# before aggregating; "nunique" counts distinct suppliers per group.
df_group_1 = df_merge_1 \
    .groupby(['p_brand', 'p_type', 'p_size']) \
    .agg(supplier_cnt=("ps_suppkey", "nunique"))

# Highest supplier count first; ties broken by the group keys (index levels).
df_sort_2 = df_group_1.sort_values(by=['supplier_cnt', 'p_brand', 'p_type', 'p_size'], ascending=[False, True, True, True])
result = df_sort_2.head(1)

print(len(result))
print(result)

print("---" * 5)
print(time.time() - start_t)
print("---" * 5)

1
                                    supplier_cnt
p_brand  p_type             p_size              
Brand#41 MEDIUM BRUSHED TIN 3                 28
---------------
1.340609073638916
---------------


In [52]:
import pandas as pd
def query(partsupp, supplier, part):
    """Return the (brand, type, size) group with the most distinct suppliers.

    Parameters
    ----------
    partsupp : pandas.DataFrame
        Must provide the columns ``ps_partkey`` and ``ps_suppkey``.
    supplier : pandas.DataFrame
        Must provide the columns ``s_suppkey`` and ``s_comment``.
    part : pandas.DataFrame
        Must provide the columns ``p_partkey``, ``p_brand``, ``p_type`` and
        ``p_size``.

    Returns
    -------
    pandas.DataFrame
        At most one row, indexed by ``(p_brand, p_type, p_size)``, with the
        single column ``supplier_cnt``: the number of distinct suppliers of
        the qualifying parts in that group. Groups are ranked by
        ``supplier_cnt`` descending, then by brand, type and size ascending.

    Notes
    -----
    Suppliers are excluded when their comment contains "Customer" followed,
    anywhere later, by "Complaints". The first "Customer" is compared with the
    last "Complaints", so a comment like
    "Complaints ... Customer ... Complaints" is excluded too; comparing first
    occurrences only would let it through.
    """
    comment = supplier.s_comment
    customer_pos = comment.str.find("Customer")
    complaints_pos = comment.str.rfind("Complaints")
    complaint_suppliers = supplier.loc[
        (customer_pos >= 0) & (complaints_pos > customer_pos), "s_suppkey"
    ]

    # Part/supplier pairs that do not belong to an excluded supplier.
    eligible_partsupp = partsupp.loc[
        ~partsupp.ps_suppkey.isin(complaint_suppliers),
        ['ps_partkey', 'ps_suppkey'],
    ]

    # Parts that are not Brand#45, whose type does not start with
    # "MEDIUM POLISHED", and whose size is one of the eight wanted values.
    # `== False` is kept (instead of `~`) so a missing p_type compares as
    # False and the row is dropped.
    wanted_parts = part.loc[
        (part.p_brand != 'Brand#45')
        & (part.p_type.str.startswith("MEDIUM POLISHED") == False)
        & (part.p_size.isin([49, 14, 23, 45, 19, 3, 36, 9])),
        ['p_brand', 'p_type', 'p_size', 'p_partkey'],
    ]

    joined = eligible_partsupp.merge(
        wanted_parts, left_on="ps_partkey", right_on="p_partkey"
    )

    # groupby() already orders the group keys, so no explicit pre-sort is
    # needed; "nunique" counts distinct suppliers per group.
    supplier_counts = joined \
        .groupby(['p_brand', 'p_type', 'p_size']) \
        .agg(supplier_cnt=("ps_suppkey", "nunique"))

    # Highest count first; ties broken by the group keys (index levels).
    ranked = supplier_counts.sort_values(
        by=['supplier_cnt', 'p_brand', 'p_type', 'p_size'],
        ascending=[False, True, True, True],
    )
    return ranked.head(1)