# TPC-DS Pipeline  

In [86]:
import tpcds_setup, tpcds_bq, gcp_storage, config

In [83]:
from importlib import reload

#### I. Make Project Files

In [2]:
# Set up the project's working directories via tpcds_setup
# (the directory layout is decided inside that module — not shown here).
tpcds_setup.make_directories()

#### II. Download TPC-DS Binaries from GCS  

In [3]:
# Fetch the TPC-DS source archive from GCS; the value displayed below the cell
# is the local path of the downloaded zip.
tpcds_setup.download_zip()

Client created using default project: sada-colin-dietrich


'/home/colin/code/bq_snowflake_benchmark/download/tpc-ds_v2.11.0rc2.zip'

#### III. Unzip and Rename Source Files

In [4]:
# Unpack the downloaded archive. The zip path comes from config.fp_ds_zip; it was
# previously assembled by hand from config.fp_download and the archive file name
# ("tpc-ds_v2.11.0rc2.zip").
tpcds_setup.extract_tpcds_zip(zip_filepath=config.fp_ds_zip,
                              version=config.fp_ds_src_version)

#### IV. Make tpcds

In [5]:
# Build the TPC-DS tools from the extracted source
# (presumably runs `make`; confirm in tpcds_setup.make_tpcds).
tpcds_setup.make_tpcds()

#### V. Generate the table schema for BigQuery

In [6]:
# Read the ANSI SQL schema at filepath_in and write a BigQuery version to
# filepath_out for the dataset named in config.gcp_dataset (per the argument
# names; the conversion itself lives in tpcds_bq — not shown here).
tpcds_bq.schema(filepath_in=config.tpcds_schema_ansi_sql_filepath, 
                filepath_out=config.tpcds_schema_bq_filepath, 
                dataset_name=config.gcp_dataset)

In [7]:
# Generate the bash scripts used to run dsdgen in step VI
# (script contents are produced inside tpcds_setup — not shown here).
tpcds_setup.dsdgen_bash_scripts()

In [8]:
# Move the generated dsdgen bash scripts into place
# (NOTE(review): destination is chosen inside tpcds_setup — confirm there).
tpcds_setup.dsdgen_move_bash_scripts()

#### VI. Generate Data  

From the command line, execute the appropriate dsdgen bash script (generated in step V) for your CPU size and the desired data size.

#### VII. Create BQ Table Schema  

In [9]:
# Create the BigQuery dataset; the Dataset reference displayed below the cell
# is the call's return value.
tpcds_bq.create_dataset()

Dataset(DatasetReference('sada-colin-dietrich', 'gcprabbit'))

In [10]:
from google.cloud import bigquery

In [11]:
# BigQuery client authenticated with the service-account key file named in
# config.gcp_cred_file; reused by the query cell below.
client = bigquery.Client.from_service_account_json(config.gcp_cred_file)

In [12]:
# Load the BigQuery schema SQL (the file written by tpcds_bq.schema in step V)
# into a single string so it can be submitted as one query.
with open(config.tpcds_schema_bq_filepath, 'r') as f:
    query_txt = f.read()

In [15]:
# Submit the schema SQL to BigQuery, block until the job completes, then print
# the `name` attribute of every row the job returns.
query_job = client.query(query_txt)  # API request
rows = query_job.result()  # Waits for query to finish
for r in rows:
    print(r.name)

In [70]:
def extract_table_name(f_name):
    """Extract the table name target for a TPC-DS data file

    The directory part and the extension are stripped, then every
    underscore-separated part that parses as an integer is dropped, e.g.
    ``customer_address_1_8.dat`` -> ``customer_address``.

    Parameters
    ----------
    f_name : str, name (or full path) of file as generated with dsdgen

    Returns
    -------
    table_name : str, name of table the file's data should be
        loaded in
    """
    # Local import: this cell may be executed before the notebook's
    # `import os` cell on a fresh kernel.
    import os

    # Same basename logic the notebook uses when naming GCS blobs.
    base_name = os.path.basename(f_name)
    # Everything before the first "." is the stem (drops ".dat").
    stem = base_name.split(".")[0]

    name_parts = []
    for part in stem.split("_"):
        try:
            int(part)
        except ValueError:
            # Not an integer, so the part belongs to the table name.
            name_parts.append(part)
    return "_".join(name_parts)

In [79]:
# NOTE(review): `files` is only assigned in a later cell (the glob.glob call
# below); on a fresh kernel that cell must run before this one.
files[0]

'/home/colin/code/bq_snowflake_benchmark/ds/output/1GB/customer_address_1_8.dat'

**TODO:** load the data into GCS and include a CSV file that lists, for each
uploaded file, the blob name and the target table.

In [80]:
import os

In [88]:
from google.cloud import storage

In [40]:
# Glob pattern for the .dat files in the "1GB" output folder.
# NOTE(review): glob.glob is called without recursive=True below, so "**"
# behaves like a single "*" here and sub-directories are not searched.
folder = config.fp_ds_output + config.sep + "1GB" + config.sep + "**.dat"
folder

'/home/colin/code/bq_snowflake_benchmark/ds/output/1GB/**.dat'

In [31]:
import glob

In [60]:
# Expand the pattern into a list of matching file paths
# (glob does not guarantee any particular ordering of the results).
files = glob.glob(folder)

In [89]:
# File name without its directory — the same value is used as the GCS blob
# name in the upload cell below.
os.path.basename(files[0])

'customer_address_1_8.dat'

In [90]:
# Re-import config so edits made to config.py since the first import take
# effect in the running kernel.
reload(config)

<module 'config' from '/home/colin/code/bq_snowflake_benchmark/config.py'>

In [91]:
# Cloud Storage client authenticated with the same service-account key file
# used for the BigQuery client above.
gcs_client = storage.Client.from_service_account_json(config.gcp_cred_file)
bucket_name = config.gcs_1gb
# Only the first file found by the glob is handled here.
fp = files[0]


# Pair the local file with a blob named after the file's base name.
# NOTE(review): BlobSync is defined in gcp_storage (not shown) — presumably
# nothing is transferred until .upload() is called; confirm there.
gcs_sync = gcp_storage.BlobSync(client=gcs_client,
                                bucket_name=bucket_name,
                                blob_name=os.path.basename(fp),
                                local_filepath=fp)

In [92]:
# Run the upload for the file/blob pair configured on gcs_sync in the
# previous cell (behaviour is defined by gcp_storage.BlobSync — not shown).
gcs_sync.upload()