## TPC Dataset Import
Import data for the TPC test from GCS into a Snowflake database.

### Set Configs

In [1]:
import config, sf

# Which benchmark to run: TPC-H.
TEST = sf.TEST_H

# Size of the dataset to use in the test.
SIZE = '100GB'

# Dry-run switch, forwarded as `is_dry_run` to the helper's setup calls.
# In "Dry Run" mode the SQL queries are generated and printed but not executed;
# you can run them manually in the "workbench" if you want.
DRY_RUN = True

`Note: we use configuration data from config.py`

### Start Snowflake WAREHOUSE

In [2]:
# Initialise the SnowflakeHelper with the test type and dataset size chosen
# above, handing it the imported `config` module.
# NOTE(review): the log output of this cell includes the connection
# credentials (the "using config: ..." line); clear or redact the output
# before sharing the notebook.
sf_helper = sf.SnowflakeHelper(TEST, SIZE, config)

# Start the warehouse so the following cells can run queries.
sf_helper.warehouse_start()

preparing to open connection to Snowflake
using config: user:sadadauren, pass: ********, account: ed75261.us-central1.gcp
connection opened
running query: USE ROLE ACCOUNTADMIN
result: Statement executed successfully.
running query: ALTER WAREHOUSE TEST1 RESUME;
warehouse start: Statement executed successfully.
running query: USE WAREHOUSE TEST1
result: Statement executed successfully.
running query: USE DATABASE H_100GB
result: Statement executed successfully.


### Set up STAGE: link to the GCS data source and stage files for import

In [3]:
# Create the GCS integration only when sf_helper.is_integrated() is falsy.
# NOTE(review): `integration_id` is assigned only inside this branch, yet later
# cells pass it to list_integration() and import_data() unconditionally. On a
# fresh kernel where is_integrated() returns True, those cells would raise
# NameError -- confirm how the id should be obtained in that case.
if not sf_helper.is_integrated():
    # Integrate Snowflake with GCS (with DRY_RUN set, the SQL is only printed).
    integration_id = sf_helper.create_integration(is_dry_run=DRY_RUN)
    print(f'integrated with gcs: {integration_id}')



--creating named file format: "@csv_file_format"
create or replace file format csv_file_format
            type = csv
            field_delimiter = '|'
            skip_header = 1
            null_if = ('NULL', 'null')
            empty_field_as_null = true
            encoding = 'iso-8859-1' 
            compression = none;


--done creating named file format


--integrating "gcs_h_100GB_integration" ... 


--creating storage integration: "gcs_h_100GB_integration"
CREATE STORAGE INTEGRATION gcs_h_100GB_integration TYPE=EXTERNAL_STAGE STORAGE_PROVIDER=GCS ENABLED=TRUE STORAGE_ALLOWED_LOCATIONS=('gcs://tpc-benchmark-5947/');


--finished creating storage integration
GRANT CREATE STAGE on schema public to ROLE ACCOUNTADMIN;
GRANT USAGE on INTEGRATION gcs_h_100GB_integration to ROLE ACCOUNTADMIN;
CREATE STAGE gcs_h_100GB_integration_stage URL='gcs://tpc-benchmark-5947' STORAGE_INTEGRATION=gcs_h_100GB_integration FILE_FORMAT=csv_file_format;
list  @gcs_h_100GB_integration_stage;
--finish

### Test STAGE

In [4]:
# Ask the helper which staged files belong to each table; `db_files` is kept
# as a notebook-level name because the import cell iterates over it.
db_files = sf_helper.list_integration(integration_id)

# Per-table report: every staged file, then how far the count falls short of
# the helper's `gcs_file_range`.
for table_name, staged_files in db_files.items():
    print(f'{table_name}:')
    for staged_file in staged_files:
        print(f'\t{staged_file}')
    missing_count = sf_helper.gcs_file_range - len(staged_files)
    print(f'\tmissing {missing_count} files\n\n')



--listing stage: "@gcs_h_100GB_integration_stage"


--extracting table name: "gcs://tpc-benchmark-5947/_data_ds_1000GB_catalog_returns_15_96.dat"
_data_ds_1000GB_catalog_returns_15_96.dat
gcs://tpc-benchmark-5947/h_100GB_
ds_1000GB_catalog_returns_15_96.dat
gcs://tpc-benchmark-5947/_data_h_100GB_


--extracting table name: "gcs://tpc-benchmark-5947/_data_ds_1000GB_catalog_returns_19_96.dat"
_data_ds_1000GB_catalog_returns_19_96.dat
gcs://tpc-benchmark-5947/h_100GB_
ds_1000GB_catalog_returns_19_96.dat
gcs://tpc-benchmark-5947/_data_h_100GB_


--extracting table name: "gcs://tpc-benchmark-5947/_data_ds_1000GB_catalog_returns_23_96.dat"
_data_ds_1000GB_catalog_returns_23_96.dat
gcs://tpc-benchmark-5947/h_100GB_
ds_1000GB_catalog_returns_23_96.dat
gcs://tpc-benchmark-5947/_data_h_100GB_


--extracting table name: "gcs://tpc-benchmark-5947/_data_ds_1000GB_catalog_returns_2_96.dat"
_data_ds_1000GB_catalog_returns_2_96.dat
gcs://tpc-benchmark-5947/h_100GB_
ds_1000GB_catalog_returns_2_96.dat

ds_10000GB_store_returns_12_96.dat
gcs://tpc-benchmark-5947/h_100GB_


--extracting table name: "gcs://tpc-benchmark-5947/ds_10000GB_store_returns_13_96.dat"
ds_10000GB_store_returns_13_96.dat
gcs://tpc-benchmark-5947/h_100GB_
ds_10000GB_store_returns_13_96.dat
gcs://tpc-benchmark-5947/h_100GB_


--extracting table name: "gcs://tpc-benchmark-5947/ds_10000GB_store_returns_14_96.dat"
ds_10000GB_store_returns_14_96.dat
gcs://tpc-benchmark-5947/h_100GB_
ds_10000GB_store_returns_14_96.dat
gcs://tpc-benchmark-5947/h_100GB_


--extracting table name: "gcs://tpc-benchmark-5947/ds_10000GB_store_returns_15_96.dat"
ds_10000GB_store_returns_15_96.dat
gcs://tpc-benchmark-5947/h_100GB_
ds_10000GB_store_returns_15_96.dat
gcs://tpc-benchmark-5947/h_100GB_


--extracting table name: "gcs://tpc-benchmark-5947/ds_10000GB_store_returns_16_96.dat"
ds_10000GB_store_returns_16_96.dat
gcs://tpc-benchmark-5947/h_100GB_
ds_10000GB_store_returns_16_96.dat
gcs://tpc-benchmark-5947/h_100GB_


--extracting table na

gcs://tpc-benchmark-5947/h_100GB_
ds_1000GB_inventory_22_96.dat
gcs://tpc-benchmark-5947/h_100GB_


--extracting table name: "gcs://tpc-benchmark-5947/ds_1000GB_inventory_23_96.dat"
ds_1000GB_inventory_23_96.dat
gcs://tpc-benchmark-5947/h_100GB_
ds_1000GB_inventory_23_96.dat
gcs://tpc-benchmark-5947/h_100GB_


--extracting table name: "gcs://tpc-benchmark-5947/ds_1000GB_inventory_24_96.dat"
ds_1000GB_inventory_24_96.dat
gcs://tpc-benchmark-5947/h_100GB_
ds_1000GB_inventory_24_96.dat
gcs://tpc-benchmark-5947/h_100GB_


--extracting table name: "gcs://tpc-benchmark-5947/ds_1000GB_inventory_25_96.dat"
ds_1000GB_inventory_25_96.dat
gcs://tpc-benchmark-5947/h_100GB_
ds_1000GB_inventory_25_96.dat
gcs://tpc-benchmark-5947/h_100GB_


--extracting table name: "gcs://tpc-benchmark-5947/ds_1000GB_inventory_26_96.dat"
ds_1000GB_inventory_26_96.dat
gcs://tpc-benchmark-5947/h_100GB_
ds_1000GB_inventory_26_96.dat
gcs://tpc-benchmark-5947/h_100GB_


--extracting table name: "gcs://tpc-benchmark-5947/d

gcs://tpc-benchmark-5947/h_100GB_
ds_100GB_customer_demographics_44_96.dat
gcs://tpc-benchmark-5947/h_100GB_


--extracting table name: "gcs://tpc-benchmark-5947/ds_100GB_customer_demographics_45_96.dat"
ds_100GB_customer_demographics_45_96.dat
gcs://tpc-benchmark-5947/h_100GB_
ds_100GB_customer_demographics_45_96.dat
gcs://tpc-benchmark-5947/h_100GB_


--extracting table name: "gcs://tpc-benchmark-5947/ds_100GB_customer_demographics_46_96.dat"
ds_100GB_customer_demographics_46_96.dat
gcs://tpc-benchmark-5947/h_100GB_
ds_100GB_customer_demographics_46_96.dat
gcs://tpc-benchmark-5947/h_100GB_


--extracting table name: "gcs://tpc-benchmark-5947/ds_100GB_customer_demographics_47_96.dat"
ds_100GB_customer_demographics_47_96.dat
gcs://tpc-benchmark-5947/h_100GB_
ds_100GB_customer_demographics_47_96.dat
gcs://tpc-benchmark-5947/h_100GB_


--extracting table name: "gcs://tpc-benchmark-5947/ds_100GB_customer_demographics_48_96.dat"
ds_100GB_customer_demographics_48_96.dat
gcs://tpc-benchmark-

h_10000GB_lineitem.tbl.48
gcs://tpc-benchmark-5947/h_100GB_


--extracting table name: "gcs://tpc-benchmark-5947/h_10000GB_lineitem.tbl.49"
h_10000GB_lineitem.tbl.49
gcs://tpc-benchmark-5947/h_100GB_
h_10000GB_lineitem.tbl.49
gcs://tpc-benchmark-5947/h_100GB_


--extracting table name: "gcs://tpc-benchmark-5947/h_10000GB_lineitem.tbl.5"
h_10000GB_lineitem.tbl.5
gcs://tpc-benchmark-5947/h_100GB_
h_10000GB_lineitem.tbl.5
gcs://tpc-benchmark-5947/h_100GB_


--extracting table name: "gcs://tpc-benchmark-5947/h_10000GB_lineitem.tbl.50"
h_10000GB_lineitem.tbl.50
gcs://tpc-benchmark-5947/h_100GB_
h_10000GB_lineitem.tbl.50
gcs://tpc-benchmark-5947/h_100GB_


--extracting table name: "gcs://tpc-benchmark-5947/h_10000GB_lineitem.tbl.51"
h_10000GB_lineitem.tbl.51
gcs://tpc-benchmark-5947/h_100GB_
h_10000GB_lineitem.tbl.51
gcs://tpc-benchmark-5947/h_100GB_


--extracting table name: "gcs://tpc-benchmark-5947/h_10000GB_lineitem.tbl.52"
h_10000GB_lineitem.tbl.52
gcs://tpc-benchmark-5947/h_100GB_
h_1

h_1000GB_part.tbl.12
gcs://tpc-benchmark-5947/h_100GB_
h_1000GB_part.tbl.12
gcs://tpc-benchmark-5947/h_100GB_


--extracting table name: "gcs://tpc-benchmark-5947/h_1000GB_part.tbl.13"
h_1000GB_part.tbl.13
gcs://tpc-benchmark-5947/h_100GB_
h_1000GB_part.tbl.13
gcs://tpc-benchmark-5947/h_100GB_


--extracting table name: "gcs://tpc-benchmark-5947/h_1000GB_part.tbl.14"
h_1000GB_part.tbl.14
gcs://tpc-benchmark-5947/h_100GB_
h_1000GB_part.tbl.14
gcs://tpc-benchmark-5947/h_100GB_


--extracting table name: "gcs://tpc-benchmark-5947/h_1000GB_part.tbl.15"
h_1000GB_part.tbl.15
gcs://tpc-benchmark-5947/h_100GB_
h_1000GB_part.tbl.15
gcs://tpc-benchmark-5947/h_100GB_


--extracting table name: "gcs://tpc-benchmark-5947/h_1000GB_part.tbl.16"
h_1000GB_part.tbl.16
gcs://tpc-benchmark-5947/h_100GB_
h_1000GB_part.tbl.16
gcs://tpc-benchmark-5947/h_100GB_


--extracting table name: "gcs://tpc-benchmark-5947/h_1000GB_part.tbl.17"
h_1000GB_part.tbl.17
gcs://tpc-benchmark-5947/h_100GB_
h_1000GB_part.tbl.17

RuntimeError: No active exception to reraise

### Create tables in Snowflake if needed

In [None]:
# Create the schema only when sf_helper.is_integrated() is falsy; DRY_RUN is
# forwarded as `is_dry_run`.
# NOTE(review): this reuses the same is_integrated() check as the STAGE setup
# cell. Presumably it turns True once the integration has been created for
# real, which would skip create_schema() here -- confirm that is intended.
if not sf_helper.is_integrated():
    sf_helper.create_schema(is_dry_run=DRY_RUN)

### Import Data from STAGE to target table

In [None]:
# Load every staged file into its target table, one table at a time, with the
# files of each table processed in sorted order.
# NOTE(review): DRY_RUN is not passed to import_data(), so this cell appears to
# run the import even in dry-run mode -- confirm against the helper.
for table_name, staged_files in db_files.items():
    print(f'Starting to import table: {table_name}')
    ordered_files = sorted(staged_files)
    for staged_file in ordered_files:
        print(f'\timporting file: {staged_file}')
        sf_helper.import_data(table_name, staged_file, integration_id)

### Suspend WAREHOUSE

In [None]:
# Suspend the warehouse that was started at the top of the notebook.
# NOTE(review): this cell is only reached if every earlier cell succeeds; after
# a failed run it has to be executed manually.
sf_helper.warehouse_suspend()