## TPC Dataset Import
Import data for the TPC test from GCS into a Snowflake database.

### Set Configs

In [1]:
import config, sf, datetime

# In "Dry Run" mode we generate and print the SQL queries but do not run them.
# You can run them manually in the "workbench" if you want.
# NOTE(review): in this notebook only create_integration() and create_schema()
# are called with `is_dry_run=DRY_RUN`; the warehouse start and the data import
# calls do not receive the flag — confirm they behave as intended in dry-run mode.
DRY_RUN = False
TEST = sf.TEST_DS  # we want to run TPC-DS
SIZE = '1000GB'  # dataset size to use in test

`Note: configuration data is read from config.py`

### Start Snowflake WAREHOUSE

In [2]:
# Instantiate SnowflakeHelper with the selected test type and dataset size,
# passing the imported `config` module as its third argument.
sf_helper = sf.SnowflakeHelper(TEST, SIZE, config)

# Start the warehouse.
# NOTE(review): this cell's saved output includes a log line with the connection
# user, password and account — make sure secrets are never left in saved cell
# output (clear it before sharing, and preferably stop logging them at all).
sf_helper.warehouse_start()

preparing to open connection to Snowflake
using config: user:dauren, pass: [REDACTED], account: wja13212
connection opened
running query: USE ROLE ACCOUNTADMIN
result: Statement executed successfully.
running query: ALTER WAREHOUSE TEST1 RESUME;
warehouse start: Statement executed successfully.
running query: USE WAREHOUSE TEST1
result: Statement executed successfully.
running query: CREATE DATABASE IF NOT EXISTS ds_1000GB
result: Database DS_1000GB successfully created.
running query: USE DATABASE ds_1000GB
result: Statement executed successfully.


### Set up STAGE: link to the GCS data source and stage files for uploading

In [3]:
# Create the GCS integration only when sf_helper.is_integrated() is falsy.
# NOTE(review): `integration_id` is bound only inside this branch, yet later
# cells use it unconditionally. On a fresh kernel where is_integrated() already
# returns a truthy value, those cells would fail with NameError — confirm how
# the id of an existing integration should be obtained.
if not sf_helper.is_integrated():
    # integrate Snowflake with GCS.
    integration_id = sf_helper.create_integration(is_dry_run=DRY_RUN)
    print(f'integrated with gcs: {integration_id}')



--creating named file format: "@csv_file_format"
running query: create or replace file format csv_file_format
            type = csv
            field_delimiter = '|'
            skip_header = 1
            null_if = ('NULL', 'null')
            empty_field_as_null = true
            encoding = 'iso-8859-1' 
            compression = none;
result File format CSV_FILE_FORMAT successfully created.


--done creating named file format


--integrating "gcs_ds_1000GB_integration" ... 


--creating storage integration: "gcs_ds_1000GB_integration"
running query: CREATE STORAGE INTEGRATION gcs_ds_1000GB_integration TYPE=EXTERNAL_STAGE STORAGE_PROVIDER=GCS ENABLED=TRUE STORAGE_ALLOWED_LOCATIONS=('gcs://tpc-benchmark-5947/');
result Integration GCS_DS_1000GB_INTEGRATION successfully created.


--finished creating storage integration
running query: GRANT CREATE STAGE on schema public to ROLE ACCOUNTADMIN;
result Statement executed successfully.
running query: GRANT USAGE on INTEGRATION gcs_ds_10

### Test STAGE

In [4]:
# Ask the helper for the files staged under the integration; the result is a
# mapping that is iterated below as table name -> collection of file paths.
db_files = sf_helper.list_integration(integration_id)

for table_name, staged_files in db_files.items():
    print(f'{table_name}:')
    for staged_file in staged_files:
        print(f'\t{staged_file}')
    # Difference between the helper's gcs_file_range and the files found.
    missing_count = sf_helper.gcs_file_range - len(staged_files)
    print(f'\tmissing {missing_count} files\n\n')



--listing stage: "@gcs_ds_1000GB_integration_stage"
unknown table!!!! gcs://tpc-benchmark-5947/ds_1000GB_date_dim_1_96.dat
unknown table!!!! gcs://tpc-benchmark-5947/ds_1000GB_dbgen_version_1_96.dat
unknown table!!!! gcs://tpc-benchmark-5947/ds_1000GB_warehouse_1_96.dat


--done listing stage
call_center:
	gcs://tpc-benchmark-5947/ds_1000GB_call_center_1_96.dat
	missing 95 files


catalog_page:
	gcs://tpc-benchmark-5947/ds_1000GB_catalog_page_1_96.dat
	missing 95 files


catalog_returns:
	gcs://tpc-benchmark-5947/ds_1000GB_catalog_returns_10_96.dat
	gcs://tpc-benchmark-5947/ds_1000GB_catalog_returns_11_96.dat
	gcs://tpc-benchmark-5947/ds_1000GB_catalog_returns_12_96.dat
	gcs://tpc-benchmark-5947/ds_1000GB_catalog_returns_13_96.dat
	gcs://tpc-benchmark-5947/ds_1000GB_catalog_returns_14_96.dat
	gcs://tpc-benchmark-5947/ds_1000GB_catalog_returns_15_96.dat
	gcs://tpc-benchmark-5947/ds_1000GB_catalog_returns_16_96.dat
	gcs://tpc-benchmark-5947/ds_1000GB_catalog_returns_17_96.dat
	gcs://tp

### Create tables in Snowflake if needed

In [5]:
# Call create_schema() only when sf_helper.is_integrated() is falsy.
# NOTE(review): this is the same is_integrated() check that guards the
# integration step earlier in the notebook — confirm it is the intended
# condition for deciding whether the tables still need to be created.
if not sf_helper.is_integrated():
    sf_helper.create_schema(is_dry_run=DRY_RUN)



--pushing schema: "/home/vagrant/bq_snowflake_benchmark/h/2.18.0_rc2/dbgen/dss.ddl"
running query: -- 
-- Legal Notice 
-- 
-- This document and associated source code (the "Work") is a part of a 
-- benchmark specification maintained by the TPC. 
-- 
-- The TPC reserves all right, title, and interest to the Work as provided 
-- under U.S. and international laws, including without limitation all patent 
-- and trademark rights therein. 
-- 
-- No Warranty 
-- 
-- 1.1 TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, THE INFORMATION 
--     CONTAINED HEREIN IS PROVIDED "AS IS" AND WITH ALL FAULTS, AND THE 
--     AUTHORS AND DEVELOPERS OF THE WORK HEREBY DISCLAIM ALL OTHER 
--     WARRANTIES AND CONDITIONS, EITHER EXPRESS, IMPLIED OR STATUTORY, 
--     INCLUDING, BUT NOT LIMITED TO, ANY (IF ANY) IMPLIED WARRANTIES, 
--     DUTIES OR CONDITIONS OF MERCHANTABILITY, OF FITNESS FOR A PARTICULAR 
--     PURPOSE, OF ACCURACY OR COMPLETENESS OF RESPONSES, OF RESULTS, OF 
--     WORKMANLIKE 

### Import Data from STAGE to target table

In [6]:
# Import every staged file into its target table, one file at a time, in
# sorted (string) order of the file paths.
# NOTE(review): DRY_RUN is not passed to import_data() here — confirm whether
# the import should be skipped or only printed in dry-run mode.
for table_name, staged_files in db_files.items():
    print(f'Starting to import table: {table_name}')
    ordered_files = list(staged_files)
    ordered_files.sort()
    for staged_file in ordered_files:
        print(f'\timporting file: {staged_file}')
        sf_helper.import_data(table_name, staged_file, integration_id)
        # Wall-clock time at which this file's import call returned.
        finished_at = datetime.datetime.now().time()
        print(f'\tfinished @ {finished_at}')

Starting to import table: call_center
	importing file: gcs://tpc-benchmark-5947/ds_1000GB_call_center_1_96.dat
running query: copy into call_center from 'gcs://tpc-benchmark-5947/ds_1000GB_call_center_1_96.dat' storage_integration=gcs_ds_1000GB_integration file_format=(format_name=csv_file_format);
result gcs://tpc-benchmark-5947/ds_1000GB_call_center_1_96.dat
	finished @ 19:11:42.797362
Starting to import table: catalog_page
	importing file: gcs://tpc-benchmark-5947/ds_1000GB_catalog_page_1_96.dat
running query: copy into catalog_page from 'gcs://tpc-benchmark-5947/ds_1000GB_catalog_page_1_96.dat' storage_integration=gcs_ds_1000GB_integration file_format=(format_name=csv_file_format);
result gcs://tpc-benchmark-5947/ds_1000GB_catalog_page_1_96.dat
	finished @ 19:11:52.363649
Starting to import table: catalog_returns
	importing file: gcs://tpc-benchmark-5947/ds_1000GB_catalog_returns_10_96.dat
running query: copy into catalog_returns from 'gcs://tpc-benchmark-5947/ds_1000GB_catalog_ret

### Suspend WAREHOUSE

In [8]:
# Suspend the warehouse now that the import cells have run.
# NOTE(review): if an earlier cell raises, "Run All" stops before reaching this
# cell, so the warehouse presumably stays running — suspend it manually then.
sf_helper.warehouse_suspend()

warehouse suspend: Statement executed successfully.
