## TPC-DS BQ Query Validation

In [1]:
from google.cloud import bigquery

In [2]:
import ds_setup, config, tools

In [3]:
from importlib import reload

In [4]:
# Job settings shared by the query cells that follow.
job_config = bigquery.QueryJobConfig()
job_config.default_dataset = "tpc-benchmarking-9432.ds_1GB_qual"
job_config.use_query_cache = False  # default is True; off so cached results are not reused
job_config.dry_run = False          # False runs the query; True would only approximate time and cost

# Client authenticated from the service-account JSON file named in config.
client = bigquery.Client.from_service_account_json(config.gcp_cred_file)

In [5]:
def ds_qgen_template(n, directory, scale=1, qual=False, verbose=False):
    """Generate DS query text for query number n

    Parameters
    ----------
    n : int, query number to generate BigQuery SQL
    directory : str, absolute path to directory of query templates
        to draw from for n.
    scale : int, scale factor of db being queried
    qual : bool, generate qualification queries in ascending order
    verbose : bool, when True print whatever ds_setup.dsqgen returned
        as its error output (if any). Defaults to False.

    Returns
    -------
    str : BigQuery SQL query
    """
    # dsqgen is handed a "Y"/"N" flag rather than a bool; use a separate
    # name so the `qual` argument keeps its original type.
    qualify_flag = "Y" if qual else "N"

    std_out, err_out = ds_setup.dsqgen(directory=directory,
                                       dialect="sqlserver_bq",
                                       scale=scale,
                                       template="query{}.tpl".format(n),
                                       filter="Y",  # write to std_out
                                       qualify=qualify_flag
                                      )

    # err_out used to be dropped unconditionally and `verbose` was ignored;
    # surface it on request so generation problems are not silent.
    if verbose and err_out:
        print(err_out)

    return std_out

In [6]:
# Query template number to validate.
n = 98

# Render the SQL text for template n, then submit it with the shared job settings.
query_text = ds_qgen_template(n=n,
                              directory=config.fp_ds_bq_template_dir,
                              scale=1,
                              qual=False)
query_job = client.query(query_text, job_config=job_config)

# Fetch the result and load it into a DataFrame.
result = query_job.result()
df = result.to_dataframe()

# Job timing and byte counts, read after the result has been fetched.
t0, t1 = query_job.started, query_job.ended
dt = t1 - t0
bytes_processed = query_job.total_bytes_processed
bytes_billed = query_job.total_bytes_billed

print(f"Total Time Elapsed: {dt}")
print(f"Bytes Processed: {bytes_processed}")
print(f"Bytes Billed: {bytes_billed}")
print("Head of Result:")
print(df.head())

Total Time Elapsed: 0:00:01.522000
Bytes Processed: 70982699
Bytes Billed: 71303168
Head of Result:
          i_item_id                                        i_item_desc  \
0  AAAAAAAAGFLDAAAA                                               None   
1  AAAAAAAAABEDAAAA  No longer soft cameras mean select, small poli...   
2  AAAAAAAAAFOAAAAA  National, upper principles fill relatives. Com...   
3  AAAAAAAAAGABAAAA  Badly difficult intervals should not get today...   
4  AAAAAAAAAKLCAAAA  Regulations go almost. Complex operations may ...   

  i_category i_class  i_current_price  itemrevenue  revenueratio  
0      Books    None            82.97      8398.00     66.798174  
1      Books    arts             1.52      5267.75      2.635621  
2      Books    arts             7.27      5731.18      2.867489  
3      Books    arts             0.29     10839.39      5.423287  
4      Books    arts            76.12      7404.76      3.704834  


In [7]:
break  # NOTE(review): SyntaxError outside a loop; presumably a deliberate stop so "Run All" halts here - confirm

SyntaxError: 'break' outside loop (<ipython-input-7-6aaf1f276005>, line 1)

In [None]:
def ds_qgen_stream(tpl_directory, out_directory, scale=1, streams=10):
    """Generate DS query streams from a directory of templates

    Parameters
    ----------
    tpl_directory : str, absolute path to directory of query templates;
        the "templates.lst" file inside it is passed to dsqgen as input
    out_directory : str, absolute path to directory to output query streams
    scale : int, scale factor of db being queried. Defaults to 1.
    streams : int, number of query streams to generate. Defaults to 10.

    Returns
    -------
    tuple : (std_out, err_out), the two values returned by ds_setup.dsqgen
    """
    # List file expected to sit inside the template directory itself.
    template_list = tpl_directory + config.sep + "templates.lst"

    captured_out, captured_err = ds_setup.dsqgen(
        directory=tpl_directory,  # the whole directory will be used
        input=template_list,
        output_dir=out_directory,
        dialect="sqlserver_bq",
        scale=scale,
        streams=streams,
    )
    return captured_out, captured_err

In [None]:
# Label for this run; used below as the sub-directory name under config.fp_query.
test_name = "bq_ds_1GB_01_default"

In [None]:
# Template directory taken from config; passed to ds_qgen_stream as tpl_directory below.
tpl_dir = config.fp_ds_bq_template_dir
tpl_dir  # bare expression so the notebook displays the value

In [None]:
# Output directory for this run: config.fp_query and test_name joined by config.sep.
out_dir = config.sep.join((config.fp_query, test_name))
tools.mkdir_safe(out_dir)
out_dir

In [None]:
# Generate 10 query streams at scale factor 1 from tpl_dir into out_dir.
std_out, err_out = ds_qgen_stream(tpl_dir, out_dir, scale=1, streams=10)

In [None]:
# Display the std_out value returned by ds_qgen_stream.
std_out

In [None]:
# Print the err_out value returned by ds_qgen_stream.
print(err_out)

In [None]:
# Fresh job settings for running the generated stream files; only the
# default dataset is set here, every other option keeps its library default.
job_config = bigquery.QueryJobConfig()
job_config.default_dataset = "tpc-benchmarking-9432.ds_1GB_qual"

# Client authenticated from the service-account JSON file named in config.
client = bigquery.Client.from_service_account_json(config.gcp_cred_file)

In [None]:
# NOTE(review): machine-specific absolute path; it looks like out_dir + "/query_0.sql" - confirm before generalizing.
fp = '/home/colin/code/bq_snowflake_benchmark/q/bq_ds_1GB_01_default/query_0.sql'

# Read the generated stream file with a context manager so the handle is
# closed (the original open(...).read() left it to the garbage collector).
with open(fp, 'r') as sql_file:
    text = sql_file.read()

query_job = client.query(text, job_config=job_config)

# Fetch the result once; the original called result() twice back to back
# and discarded the first iterator.
result = query_job.result()

# Materialize all rows so the notebook displays them.
list(result)