# ibm_sql_query

Run arbitrary SQL-based data transformation jobs on CSV, PARQUET, JSON, AVRO and ORC data stored on Cloud Object Storage using IBM SQL Query. Transformation results are written back to Cloud Object Storage.

In [None]:
import os
create_image = bool(os.environ.get('create_image',False))
if (create_image):
    docker_file="""
    FROM registry.access.redhat.com/ubi8/python-39
    RUN pip install ipython nbformat numpy ibm-cos-sdk-core ibm-cos-sdk ibm-watson-machine-learning ibm-watson-studio-pipelines ibmcloudsql pyyaml
    ADD ibm-sql-query-cpd.ipynb .
    """
    with open("Dockerfile", "w") as text_file:
        text_file.write(docker_file)

    !docker build -t ibm_sql_query_cpd .
    exit()
else:
    !pip install nbformat numpy ibm-cos-sdk-core ibm-cos-sdk ibm-watson-machine-learning ibm-watson-studio-pipelines ibmcloudsql pyyaml
    None

In [None]:
import glob
import logging
import ibmcloudsql
from ibmcloudsql import SQLQuery
import os
import shutil
import sys
import re
from ibm_watson_machine_learning import APIClient
from ibm_watson_studio_pipelines import WSPipelines
from ibm_watson_studio_pipelines.cpd_paths import CpdScope, CpdPath
import yaml

In [None]:
def parse_bool(value, default=False):
    """Interpret an environment-style flag value as a boolean.

    Environment variables are strings, and ``bool("False")`` is ``True``, so
    the common "off" spellings are recognised explicitly. Any other non-empty
    value counts as ``True``.

    :param value: raw value (``None``, a ``bool`` or a string)
    :param default: result when ``value`` is ``None``
    :return: the interpreted boolean
    """
    if value is None:
        return default
    if isinstance(value, bool):
        return value
    return str(value).strip().lower() not in ('', '0', 'false', 'no', 'off')


# IBM Cloud API key
api_key = os.environ.get('api_key')

# COS URL where the results of the SQL job are to be stored
target_dir_path = os.environ.get('target_dir_path')

# Asset name to register for the results written by the SQL job
target_asset_name = os.environ.get('target_asset_name')

# SQL statement to execute
sql = os.environ.get('sql')

# (unique) Cloud Resource Name (CRN) of the IBM SQL Query service instance
sql_query_crn = os.environ.get('sql_query_crn')

# default: CSV - used to generate the STORED AS <format> clause of the INTO clause
# NOTE: the name shadows the built-in format(); it is kept because it is the
# externally visible parameter name.
format = os.environ.get('format' , 'CSV')

# optional, list of columns to use for partitioning the results of the SQL job;
# used to generate the PARTITIONED BY (<columns>) clause of the INTO clause
partition_columns = os.environ.get('partition_columns')

# optional, number of objects to store the results of the SQL job in;
# used to generate the PARTITIONED INTO <num> OBJECTS clause of the INTO clause.
# "or 0" lets an empty environment variable behave like an unset one.
number_of_objects = int(os.environ.get('number_of_objects') or 0)

# optional, number of rows to be stored in each result object of the SQL job;
# used to generate the PARTITIONED EVERY <num> ROWS clause of the INTO clause
rows_per_object = int(os.environ.get('rows_per_object') or 0)

# default: False, only valid when none of the above partitioning options is
# specified; produces exactly one object with the name specified in
# target_dir_path by calling sqlClient.rename_exact_result(jobid) after the
# SQL job has run.
exact_name = parse_bool(os.environ.get('exact_name'))

# default: False - when set, JOBPREFIX NONE is generated into the INTO clause.
# This causes results of previous runs with the same output URI to be
# overwritten, because no unique sub folder is created for the result.
no_jobid_folder = parse_bool(os.environ.get('no_jobid_folder'))

In [None]:
"""
api_key = '<your-ibm-cloud-api-key>'
#token = '<your-iam-bearer-token>'
target_dir_path='cos://eu-de/claimed-test/data.parquet/sql_results'
sql='SELECT * FROM cos://eu-de/claimed-test/data.parquet stored as parquet'
sql_query_crn='crn:v1:bluemix:public:sql-query:us-south:a/9b13b857a32341b7167255de717172f5:cd3e149d-e183-4162-a392-50bab6c64f47::'
target_asset_name='target_asset_name'
"""

In [None]:
# Root logger: pass everything from DEBUG upwards to a stdout handler.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
sh = logging.StreamHandler(sys.stdout)
logger.addHandler(sh)

# Names containing one of these fragments are treated as credentials and are
# never written to the log in clear text.
_SENSITIVE_MARKERS = ('key', 'token', 'secret', 'password')

# A runtime parameter is a command line argument of the form <identifier>=<value>.
_PARAMETER_PATTERN = re.compile(r'[A-Za-z_][A-Za-z0-9_]*=')


def _redact(name, value):
    """Return value, or a mask when name looks like a credential."""
    if any(marker in name.lower() for marker in _SENSITIVE_MARKERS):
        return '***'
    return value


def _as_bool(value):
    """Interpret a flag that may be a bool, None or a string such as "False"."""
    if isinstance(value, bool):
        return value
    if value is None:
        return False
    return str(value).strip().lower() not in ('', '0', 'false', 'no', 'off')


# Log the command line and the environment for troubleshooting, masking the
# values of anything that looks like a credential.
for arg in sys.argv:
    arg_name, arg_sep, arg_value = arg.partition('=')
    if arg_sep:
        logger.debug(arg_name + arg_sep + _redact(arg_name, arg_value))
    else:
        logger.debug(arg)

for env_name, env_value in os.environ.items():
    logger.debug(env_name + ':' + _redact(env_name, env_value))

# Collect <name>=<value> arguments. Only the first '=' separates name and
# value, so values may themselves contain '='.
_parameter_pairs = [
    arg.split('=', 1) for arg in sys.argv if _PARAMETER_PATTERN.match(arg)
]
# Kept in the historical name="value" text form.
parameters = ['{}="{}"'.format(p_name, p_value) for p_name, p_value in _parameter_pairs]

# SECURITY: the values come from the command line. They are bound directly as
# module-level string variables instead of being passed through exec(), so a
# crafted value can no longer execute code. Values are taken literally (no
# escape-sequence processing).
for param_name, param_value in _parameter_pairs:
    logging.warning('Parameter: ' + param_name + '="' + _redact(param_name, param_value) + '"')
    globals()[param_name] = param_value

# Normalise types only AFTER the overrides were applied: command line values
# arrive as strings, and the code further down expects real bools and ints.
exact_name = _as_bool(exact_name)
no_jobid_folder = _as_bool(no_jobid_folder)
number_of_objects = int(number_of_objects or 0)
rows_per_object = int(rows_per_object or 0)

In [None]:
# Record which service instance is targeted. For the API key only its
# presence is logged: it is a credential and must not appear in log output.
logging.debug('api_key provided: %s', api_key is not None)
logging.debug(sql_query_crn)


def exists(var):
    """Tell whether a module-level variable is defined and holds a value.

    :param var: name of the variable to look up, given as a string
    :return: True if a global with that name exists and its value is not
        None, False otherwise
    """
    # Check the value stored under the name; the name itself is a string and
    # therefore can never be None.
    return globals().get(var) is not None

# Create the SQL Query client. The API key is preferred; a bearer token
# (only available when supplied as a runtime parameter) is the fallback.
# api_key is always bound by the parameter cell, so its value is checked
# explicitly rather than just its existence.
if exists('api_key') and api_key is not None:
    logging.debug(sql_query_crn)
    sqlClient = SQLQuery(api_key=api_key, instance_crn=sql_query_crn)
elif exists('token'):
    sqlClient = SQLQuery(api_key=None, token=token, instance_crn=sql_query_crn)
else:
    raise ValueError('Either api_key or token must be provided')

# Append the result location to the user supplied statement.
# NOTE(review): the clauses are emitted in the order
#   INTO <uri> [PARTITIONED ...] [JOBPREFIX NONE] STORED AS <format>
# which is the order this notebook has always produced - confirm it against
# the INTO clause grammar in the SQL Query reference.
sql = sql + ' INTO {}'.format(target_dir_path)

# Gather the requested partitioning options; the PARTITIONED keyword is
# emitted once, in front of the first option.
partition_clauses = []

if partition_columns:
    partition_clauses.append(' BY ({})'.format(partition_columns))

if number_of_objects is not None and number_of_objects > 0:
    partition_clauses.append(' INTO {} OBJECTS'.format(number_of_objects))

if rows_per_object is not None and rows_per_object > 0:
    partition_clauses.append(' EVERY {} ROWS'.format(rows_per_object))

partitioned_by = bool(partition_clauses)
if partitioned_by:
    sql = sql + ' PARTITIONED' + ''.join(partition_clauses)

if no_jobid_folder:
    sql = sql + ' JOBPREFIX NONE'

sql = sql + ' STORED AS {}'.format(format)

if exact_name:
    # Submit, wait for completion, then rename the result via the client so
    # that it carries exactly the requested name.
    job_id = sqlClient.submit_sql(sql)
    job_status = sqlClient.wait_for_job(job_id)
    print("Job " + job_id + " terminated with status: " + job_status)
    sqlClient.rename_exact_result(job_id)
else:
    sqlClient.run_sql(sql)

In [None]:
# Closing log entries: first the SQL statement held in `sql`, then a
# completion marker.
for closing_message in (sql, 'done'):
    logging.info(closing_message)