# SparkSQL

Execute arbitrary SQL queries against CSV and PARQUET files

In [None]:
%%bash
# Install the PySpark stack that matches the running Python interpreter.
#
# Python 3.6/3.7 get pyspark 2.4.8, Python 3.8/3.9 get pyspark 3.1.2; any
# other interpreter version aborts the cell with a non-zero exit status.
# pip output is collected in install.log and only shown if the install fails.

# Major and minor version digits glued together, e.g. "3.8.10" -> "38".
version=$(python --version | awk '{print $2}' | awk -F"." '{print $1$2}')
export version

echo "$version"

# Quoting the selector keeps an empty detection result from breaking the
# comparison; it simply falls through to the "unsupported" arm.
case "$version" in
    36|37)
        pyspark_version='2.4.8'
        ;;
    38|39)
        pyspark_version='3.1.2'
        ;;
    *)
        echo 'Currently only python 3.6, 3.7 , 3.8 and 3.9 are supported, in case you need a different version please open an issue at https://github.com/IBM/claimed/issues'
        exit 1
        ;;
esac

echo 'Starting installation...'
# 2>&1 makes stderr share stdout's file descriptor. Redirecting both streams
# to the same file name separately opens it twice, and the two writers then
# overwrite each other's output.
if pip3 install "pyspark==${pyspark_version}" wget==3.2 pyspark2pmml==0.5.1 > install.log 2>&1; then
    echo 'Please <<RESTART YOUR KERNEL>> (Kernel->Restart Kernel and Clear All Outputs)'
else
    echo 'Installation failed, please check log:'
    cat install.log
fi

In [None]:
import glob
import logging
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import os
import shutil
import sys
import re

In [None]:
# Each setting below is read from an environment variable of the same name.

# name of resulting file (default: data_result.csv)
output_result_file = os.environ.get('output_result_file', 'data_result.csv')

# file name for CSV or PARQUET file - must end with .csv or .parquet (default: data.csv)
data_file = os.environ.get('data_file', 'data.csv')

# master url of spark master (default: local mode)
master = os.environ.get('master', "local[*]")

# data_dir temporary data storage for local execution (default: ../../data/)
data_dir = os.environ.get('data_dir', '../../data/')

# sql statement to execute, table name == df, example: select * from df (no default: None when unset)
sql = os.environ.get('sql')

In [None]:
def parse_parameters(argv):
    """Extract ``name=value`` overrides from a list of command-line arguments.

    An argument qualifies when it starts with a valid Python identifier
    followed by ``=``. Everything after the first ``=`` is the value and is
    taken literally, so it may itself contain ``=``, quotes, spaces or
    backslashes. Arguments that do not qualify (options such as ``--flag``,
    empty names, names starting with a digit) are skipped.

    Args:
        argv: iterable of argument strings, typically ``sys.argv``.

    Returns:
        List of ``(name, value)`` string tuples in argument order.
    """
    assignment = re.compile(r'([A-Za-z_][A-Za-z0-9_]*)=(.*)', re.DOTALL)
    candidates = (assignment.fullmatch(argument) for argument in argv)
    return [(found.group(1), found.group(2)) for found in candidates if found]


parameter_values = parse_parameters(sys.argv)

# Same name="value" rendering as before; it is now used only for logging.
parameters = ['{}="{}"'.format(_key, _value) for _key, _value in parameter_values]

for (_key, _value), parameter in zip(parameter_values, parameters):
    logging.warning('Parameter: ' + parameter)
    # Assign the override directly instead of exec()-ing a generated statement:
    # values containing '=' or '"' no longer produce invalid Python, and
    # command-line text is never executed as code.
    globals()[_key] = _value

In [None]:
# Configure the master URL, then obtain (or create) the SparkContext for it.
spark_conf = SparkConf()
spark_conf.setMaster(master)
sc = SparkContext.getOrCreate(spark_conf)

# Obtain (or create) the SparkSession used for reading data and running SQL.
spark = SparkSession.builder.getOrCreate()

In [None]:
# Load the input file into the DataFrame `df`; the reader is selected by
# which extension appears in data_file.
if '.csv' in data_file:
    # The first row is the header; column types are inferred from the data.
    df = spark.read.option('header', 'true').option("inferSchema", "true").csv(data_dir + data_file)
elif '.parquet' in data_file:
    # parquet() is the loader itself; chaining .csv onto it raised
    # AttributeError before any data was read.
    df = spark.read.parquet(data_dir + data_file)
else:
    raise Exception('Neither .csv nor .parquet in file name')

In [None]:
# `sql` has no default and is None when not supplied; stop here with a clear
# message instead of passing None or an empty string on to Spark.
if not sql:
    raise ValueError("No SQL statement given: set the 'sql' parameter, example: select * from df")

# Expose the loaded data under the table name `df`, then replace `df` with
# the result of the query.
df.createOrReplaceTempView('df')
df = spark.sql(sql)

In [None]:
# Write the query result and collapse Spark's output directory into a single
# file located at data_dir + output_result_file.
output_path = data_dir + output_result_file

# Remove any previous result. A run interrupted between the write and the
# final rename leaves a directory at this path, which os.remove cannot delete.
if os.path.isdir(output_path) and not os.path.islink(output_path):
    shutil.rmtree(output_path)
elif os.path.exists(output_path):
    os.remove(output_path)

# NOTE(review): the output format follows the *input* file name (data_file),
# not output_result_file - confirm this is intended.
if '.csv' in data_file:
    df.coalesce(1).write.option("header", "true").csv(output_path)
elif '.parquet' in data_file:
    df.coalesce(1).write.parquet(output_path)
else:
    raise Exception('Neither .csv nor .parquet in file name')

# The write above creates a directory at output_path; pick the part file in it.
part_files = glob.glob(output_path + '/part-*')
if not part_files:
    raise Exception('No part-* file found in ' + output_path)

# Move the part file aside, drop the directory, then give the part file the
# requested name.
temporary_path = output_path + '.tmp'
shutil.move(part_files[0], temporary_path)
shutil.rmtree(output_path)
shutil.move(temporary_path, output_path)