In [1]:
import findspark
findspark.init() # this must be executed before the below import

In [2]:
from pyspark.sql import SparkSession
# Obtain the session via getOrCreate() under a fixed application name;
# later cells use this module-level `spark` to run SQL.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL Execution")
    .getOrCreate()
)

In [13]:
import numpy as np
import time

In [3]:
def transform_query_to_sql(query, used_dims, column_name_dict, hdfs_path):
    """Translate one range query into a Spark SQL statement over a parquet path.

    Args:
        query: Indexable of ``(lower, upper)`` pairs, positionally aligned
            with ``used_dims`` (``query[i]`` constrains ``used_dims[i]``).
            A bound equal to ``-1`` means "no bound on that side".
        used_dims: Dimension keys to look up in ``column_name_dict``.
        column_name_dict: Mapping from dimension key to column name.
        hdfs_path: Path interpolated between backticks after ``parquet.``.

    Returns:
        A ``SELECT * FROM parquet.`<hdfs_path>``` statement. Each present
        bound contributes an inclusive ``>=`` / ``<=`` predicate; predicates
        are joined with ``and``. If no bound is present the ``WHERE`` clause
        is omitted entirely, so the statement stays valid.
    """
    predicates = []
    for i, dim in enumerate(used_dims):
        lower, upper = query[i][0], query[i][1]
        # The column name is only looked up when a bound actually needs it.
        if lower != -1:
            predicates.append(column_name_dict[dim] + '>=' + str(lower))
        if upper != -1:
            predicates.append(column_name_dict[dim] + '<=' + str(upper))

    sql = "SELECT * FROM parquet.`" + hdfs_path + "`"
    if predicates:
        # Joining avoids slicing off a trailing 'and ' and never leaves a
        # dangling WHERE when the query is completely unbounded.
        sql += " WHERE " + " and ".join(predicates)

    return sql

In [4]:
def query_with_parquets(query, used_dims, column_name_dict, hdfs_path, print_execution_time=False):
    """Run one range query through Spark SQL and time its two phases.

    Args:
        query, used_dims, column_name_dict, hdfs_path: Passed unchanged to
            ``transform_query_to_sql`` to build the SQL text.
        print_execution_time: When true, print both measured durations.

    Returns:
        A tuple ``(query_result, query_translation_time, query_execution_time)``
        where ``query_result`` is whatever ``spark.sql(sql).collect()``
        returned and both times are elapsed seconds.
    """
    # perf_counter() is monotonic, so the measured intervals cannot be
    # distorted by wall-clock adjustments the way time.time() deltas can.
    translation_start = time.perf_counter()

    sql = transform_query_to_sql(query, used_dims, column_name_dict, hdfs_path)

    execution_start = time.perf_counter()

    # collect() brings the full result to the driver, so the execution time
    # below includes materialising every matching row.
    query_result = spark.sql(sql).collect()

    execution_end = time.perf_counter()

    query_translation_time = execution_start - translation_start
    query_execution_time = execution_end - execution_start

    if print_execution_time:
        print('query translation time: ', query_translation_time)
        print('query execution time: ', query_execution_time)

    return (query_result, query_translation_time, query_execution_time)

In [17]:
def load_query(path):
    """Load a space-delimited query file into a 3-D array of bound pairs.

    Each line of the file is one query; its values are grouped into
    consecutive pairs.

    Args:
        path: Anything ``np.genfromtxt`` accepts as its input source.

    Returns:
        Array of shape ``(num_queries, values_per_line // 2, 2)``.

    Raises:
        ValueError: If the file yields no values, or a line's value count
            cannot be grouped into pairs.
    """
    query_set = np.genfromtxt(path, delimiter=' ')
    if query_set.size == 0:
        raise ValueError('no queries found in {!r}'.format(path))
    # genfromtxt returns a 1-D array for a single-line file, in which case
    # len() would count values instead of queries. Promote to 2-D first so a
    # one-query file becomes a (1, k, 2) array.
    query_set = np.atleast_2d(query_set)
    return query_set.reshape(len(query_set), -1, 2)

def batch_query(queryset, used_dims, column_name_dict, hdfs_path):
    """Run every query in ``queryset`` and print total and average latency.

    Args:
        queryset: Sized iterable of queries; each item is passed to
            ``query_with_parquets`` as its ``query`` argument.
        used_dims, column_name_dict, hdfs_path: Forwarded unchanged to
            ``query_with_parquets`` for every query.

    Returns:
        None. The timings are printed, not returned.
    """
    # perf_counter() is monotonic, which makes it the appropriate clock for
    # measuring an elapsed interval.
    start_time = time.perf_counter()

    for query in queryset:
        # Per-query results are not used afterwards, so they are not retained;
        # keeping them would hold every collected row in memory for the run.
        query_with_parquets(query, used_dims, column_name_dict, hdfs_path)

    total_time = time.perf_counter() - start_time

    num_queries = len(queryset)
    # An empty query set has no meaningful average; report NaN instead of
    # failing with ZeroDivisionError.
    average_time = total_time / num_queries if num_queries else float('nan')

    print('total query response time: ', total_time)
    print('average query response time: ', average_time)

In [18]:
# = = = Configuration = = =

# --- query workload files ---
scale_factor = 2
query_base_path = '/home/cloudray/NORA_Query/'

distribution_path = '{}distribution_{}.csv'.format(query_base_path, scale_factor)
random_path = '{}random_{}.csv'.format(query_base_path, scale_factor)

distribution_query = load_query(distribution_path)
random_query = load_query(random_path)

# --- train / test split ---
# The leading fraction of each workload goes to training, the rest to testing.
training_set_percentage = 0.5
Td = int(len(distribution_query) * training_set_percentage)
Tr = int(len(random_query) * training_set_percentage)

training_set = np.concatenate([distribution_query[:Td], random_query[:Tr]])
testing_set = np.concatenate([distribution_query[Td:], random_query[Tr:]])

# --- column mapping ---
# Dimension index i maps to the column name '_c<i>'.
used_dims = [1, 2]
num_dims = 16
column_names = ['_c{}'.format(dim) for dim in range(num_dims)]
column_name_dict = dict(enumerate(column_names))

# --- parquet locations queried by the benchmark cells ---
hdfs_path_nora = 'hdfs://localhost:9000/user/cloudray/NORA/merged/'
hdfs_path_qdtree = 'hdfs://localhost:9000/user/cloudray/QdTree/merged/'

In [19]:
# NORA: time the testing queries against the parquet data at hdfs_path_nora.
batch_query(
    queryset=testing_set,
    used_dims=used_dims,
    column_name_dict=column_name_dict,
    hdfs_path=hdfs_path_nora,
)

total query response time:  110.49015402793884
average query response time:  2.209803080558777


In [20]:
# Qd-Tree: time the same testing queries against the parquet data at hdfs_path_qdtree.
batch_query(
    queryset=testing_set,
    used_dims=used_dims,
    column_name_dict=column_name_dict,
    hdfs_path=hdfs_path_qdtree,
)

total query response time:  19.17433786392212
average query response time:  0.3834867572784424
