# TPC-DS Pipeline  

In [1]:
import tpcds_setup, tpcds_bq, gcp_storage, config

In [2]:
from importlib import reload

#### I. Make Project Files

In [3]:
# Presumably creates the working directories that the later steps read from and write to
# (locations come from config) — confirm in tpcds_setup.
tpcds_setup.make_directories()

#### II. Download TPC-DS Binaries from GCS  

In [4]:
# Fetch the TPC-DS zip from GCS using the default GCP project/credentials.
# The call returns the local path of the downloaded archive, shown as the cell output.
tpcds_setup.download_zip()

Client created using default project: sada-colin-dietrich


'/home/colin/code/bq_snowflake_benchmark/download/tpc-ds_v2.11.0rc2.zip'

#### III. Unzip and Rename Source Files

In [5]:
# Unpack the downloaded TPC-DS archive; both the zip path and the source
# version label are taken from config rather than being built by hand here.
tpcds_setup.extract_tpcds_zip(
    zip_filepath=config.fp_ds_zip,
    version=config.fp_ds_src_version,
)

#### IV. Make tpcds

In [6]:
# Build the TPC-DS tools from the extracted source — presumably runs `make`
# in the tools directory; confirm in tpcds_setup.
tpcds_setup.make_tpcds()

#### V. Generate the table schema for BigQuery

In [7]:
# Generate the BigQuery table schema: the ANSI SQL schema file is the input,
# the BigQuery schema file is the output, and the dataset name comes from config.
tpcds_bq.schema(
    filepath_in=config.tpcds_schema_ansi_sql_filepath,
    filepath_out=config.tpcds_schema_bq_filepath,
    dataset_name=config.gcp_dataset,
)

In [8]:
# Write the dsdgen bash scripts that are run from the command line in step VI
# (presumably one per CPU count / data size combination — confirm in tpcds_setup).
tpcds_setup.dsdgen_bash_scripts()

In [9]:
# Move the generated dsdgen scripts to the location they are run from;
# the destination is defined inside tpcds_setup — TODO confirm.
tpcds_setup.dsdgen_move_bash_scripts()

#### VI. Generate Data  

From the command line, execute the appropriate script for your machine's CPU count and the desired data size.

Note that `config.fp_ds_output` and `config.fp_ds_output_mnt` dictate where the data is stored.  

`config.fp_ds_output` = store within the source code directory, and thus on the local disk.  
`config.fp_ds_output_mnt` = store on another disk: an additional disk, a Google Compute Engine persistent disk, or FUSE-mounted Google Cloud Storage.


#### VII. Create BQ Table Schema  

In [10]:
# Create the BigQuery dataset (presumably config.gcp_dataset — confirm in tpcds_bq).
# The returned Dataset object is displayed as the cell output.
tpcds_bq.create_dataset()

Dataset(DatasetReference('sada-colin-dietrich', 'gcprabbit'))

In [11]:
# Create the tables in the BigQuery dataset — presumably from the schema file
# generated in step V; confirm in tpcds_bq.
tpcds_bq.create_schema()

#### VIII. Upload Data Files to Cloud Storage

In [12]:
# Glob pattern matching the generated .dat files in the 1GB output directory.
# Despite the name, `folder` is a pattern, not a directory; the name mirrors
# the keyword argument of upload_tpc_data used in the next cell.
folder = config.sep.join([config.fp_ds_output, "1GB", "**.dat"])
folder

'/home/colin/code/bq_snowflake_benchmark/ds/output/1GB/**.dat'

In [13]:
# Upload the matched .dat files to the 1GB bucket.
# NOTE(review): judging by the log output, files larger than `limit`
# (apparently in MB) are skipped rather than uploaded — confirm in tpcds_setup.
inventory_list = tpcds_setup.upload_tpc_data(
    folder=folder,
    bucket_name=config.gcs_1gb,
    limit=20,
    verbose=True,
)

Uploading customer_address_1_8.dat @ 5.451571 MB
...done!
Uploading web_page_1_8.dat @ 0.005657 MB
...done!
Uploading web_returns_1_8.dat @ 9.793638 MB
...done!
Skipping inventory_6_8.dat @ 28.084882 MB
Skipping web_sales_1_8.dat @ 146.197191 MB
Uploading household_demographics_1_8.dat @ 0.144453 MB
...done!
Uploading catalog_page_1_8.dat @ 1.619498 MB
...done!
Uploading customer_demographics_3_8.dat @ 9.73973 MB
...done!
Uploading customer_demographics_8_8.dat @ 9.98347 MB
...done!
Uploading customer_demographics_5_8.dat @ 9.940931 MB
...done!
Skipping catalog_sales_1_8.dat @ 294.323762 MB
Skipping catalog_returns_1_8.dat @ 21.191034 MB
Uploading ship_mode_1_8.dat @ 0.001062 MB
...done!
Uploading customer_1_8.dat @ 13.104567 MB
...done!
Skipping store_sales_1_8.dat @ 385.804932 MB
Uploading promotion_1_8.dat @ 0.036812 MB
...done!
Uploading call_center_1_8.dat @ 0.001828 MB
...done!
Skipping inventory_8_8.dat @ 28.084647 MB
Uploading income_band_1_8.dat @ 0.000308 MB
...done!
Skipping