In [1]:
import findspark
findspark.init() # this must be executed before the below import

In [2]:
from pyspark.sql import SparkSession
# Build (or reuse) the notebook-wide Spark session. Executor memory, driver
# memory and the off-heap pool are each configured to 8g.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL Execution")
    .config("spark.executor.memory", "8g")
    .config("spark.driver.memory", "8g")
    .config("spark.memory.offHeap.enabled", True)
    .config("spark.memory.offHeap.size", "8g")
    .getOrCreate()
)

In [3]:
import numpy as np
import time
import rtree
from rtree import index

In [4]:
def find_overlap_parquets(query, partition_index):
    '''
    Return the ids of all partitions that overlap the query box.

    query: sequence of per-dimension [lower, upper] pairs.
    partition_index: object whose intersection() takes the flattened border
        (l1, ..., ln, u1, ..., un) and yields the ids of the overlapping
        entries (presumably an rtree index.Index - confirm against caller).
    '''
    # Non-interleaved border: every lower bound first, then every upper bound.
    border = tuple(bounds[side] for side in (0, 1) for bounds in query)
    return list(partition_index.intersection(border))

In [5]:
def transform_query_to_sql(query, used_dims, column_name_dict, hdfs_path, pids=None):
    '''
    Translate a range query into a Spark SQL statement over parquet files.

    query: per-dimension [lower, upper] pairs, positionally aligned with
        used_dims; a bound equal to -1 means "unbounded" and is skipped.
    used_dims: dimension ids the query constrains.
    column_name_dict: maps a dimension id to its parquet column name.
    hdfs_path: directory holding the parquet files (with trailing slash).
    pids: optional iterable of partition ids. When given and non-empty, only
        the files partition_{id1,id2,...}.parquet are read. When None or
        empty, the whole directory is read; an empty list used to produce the
        invalid path "partition_set().parquet".

    Returns the SQL string. The WHERE clause is omitted when every bound is
    unbounded, instead of emitting a dangling "WHERE".
    '''
    predicates = []
    for i, dim in enumerate(used_dims):
        column = column_name_dict[dim]
        lower, upper = query[i][0], query[i][1]
        if lower != -1:
            predicates.append(column + '>=' + str(lower))
        if upper != -1:
            predicates.append(column + '<=' + str(upper))

    # De-duplicate and sort so the generated path is deterministic.
    unique_pids = sorted(set(pids)) if pids is not None else []
    if unique_pids:
        # The brace glob must not contain whitespace between the ids.
        glob = '{' + ','.join(str(pid) for pid in unique_pids) + '}'
        source = hdfs_path + 'partition_' + glob + '.parquet'
    else:
        source = hdfs_path

    # Replace the aggregate with `*` to fetch the raw rows instead.
    sql = "SELECT variance(_c0) FROM parquet.`" + source + "`"
    if predicates:
        sql += " WHERE " + ' and '.join(predicates)
    return sql

In [6]:
def query_with_parquets(query, used_dims, column_name_dict, hdfs_path, rtree_idx=None, print_execution_time=False):
    '''
    Run one range query through Spark SQL and measure how long it takes.

    query, used_dims, column_name_dict, hdfs_path: forwarded to
        transform_query_to_sql.
    rtree_idx: optional partition index. When given, find_overlap_parquets is
        used to restrict the query to the overlapping partition files;
        when None the whole hdfs_path is read.
    print_execution_time: print both timings when True.

    Returns (query_translation_time, query_execution_time) in seconds.
    The query result itself is not returned.
    '''
    # perf_counter is monotonic, so the intervals are not affected by system
    # clock adjustments the way time.time() differences are.
    start_time = time.perf_counter()

    if rtree_idx is None:
        sql = transform_query_to_sql(query, used_dims, column_name_dict, hdfs_path)
    else:
        pids = find_overlap_parquets(query, rtree_idx)
        sql = transform_query_to_sql(query, used_dims, column_name_dict, hdfs_path, pids)

    translated_time = time.perf_counter()

    # collect() forces the query to execute; the rows are dropped on purpose
    # because keeping them takes too much memory.
    spark.sql(sql).collect()

    executed_time = time.perf_counter()

    query_translation_time = translated_time - start_time
    query_execution_time = executed_time - translated_time

    if print_execution_time:
        print('query translation time: ', query_translation_time)
        print('query execution time: ', query_execution_time)

    return (query_translation_time, query_execution_time)

In [7]:
def load_query(path):
    '''
    Load a query set from a space-delimited text file.

    Each line is one query written as lower/upper pairs:
    l1 u1 l2 u2 ... ln un.

    Returns an array of shape (num_queries, num_dims, 2).
    Raises ValueError when the file contains no queries.
    '''
    raw = np.genfromtxt(path, delimiter=' ')
    if raw.size == 0:
        raise ValueError('no queries found in ' + str(path))
    # genfromtxt returns a 1-D array for a single-line file; promote it to one
    # row so len() counts queries rather than values.
    rows = np.atleast_2d(raw)
    return rows.reshape(len(rows), -1, 2)

def batch_query(queryset, used_dims, column_name_dict, hdfs_path, rtree_idx=None):
    '''
    Run every query in queryset via query_with_parquets and report timings.

    queryset: iterable of queries; the remaining arguments are forwarded
        unchanged to query_with_parquets for each query.

    Prints a progress line per query, the total response time and, when at
    least one query ran, the average response time.

    Returns the list of per-query results from query_with_parquets, in query
    order (previously these were collected and then discarded).
    '''
    start_time = time.perf_counter()

    results = []
    for count, query in enumerate(queryset):
        results.append(query_with_parquets(query, used_dims, column_name_dict, hdfs_path, rtree_idx))
        print('finish query', count)

    elapsed = time.perf_counter() - start_time

    print('total query response time: ', elapsed)
    # An empty query set has no meaningful average; skipping it also avoids a
    # division by zero.
    if results:
        print('average query response time: ', elapsed / len(results))

    return results

In [8]:
# # = = = Configuration (COMP Cloud Ubuntu) = = =

# scale_factor = 100
# # query_base_path = '/home/cloudray/NORA_Query/'
# query_base_path = '/home/ubuntu/Queryset/'

# distribution_path = query_base_path + 'distribution_' + str(scale_factor) + '.csv'
# random_path = query_base_path + 'random_' + str(scale_factor) + '.csv'

# distribution_query = load_query(distribution_path)
# random_query = load_query(random_path)

# training_set_percentage = 0.5
# Td = int(len(distribution_query) * training_set_percentage)
# Tr = int(len(random_query) * training_set_percentage)

# training_set = np.concatenate((distribution_query[0:Td], random_query[0:Tr]), axis=0)
# testing_set = np.concatenate((distribution_query[Td:], random_query[Tr:]), axis=0)

# used_dims = [1,2]
# num_dims = 16
# column_names = ['_c'+str(i) for i in range(num_dims)]
# column_name_dict = {}
# for i in range(num_dims):
#     column_name_dict[i] = column_names[i]

# # hdfs_path_nora = 'hdfs://localhost:9000/user/cloudray/NORA/merged/'
# # hdfs_path_qdtree = 'hdfs://localhost:9000/user/cloudray/QdTree/merged/'

# hdfs_path_nora = 'hdfs://10.88.88.103:9000/user/cloudray/NORA/scale100/merged/'
# hdfs_path_qdtree = 'hdfs://10.88.88.103:9000/user/cloudray/QdTree/scale100/merged/'
# hdfs_path_kdtree = 'hdfs://10.88.88.103:9000/user/cloudray/KDTree/scale100/merged/'

# # partition_base_path = '/home/ubuntu/PartitionLayout/'
# # nora_partition = partition_base_path + 'nora_partitions_' + str(scale_factor)

In [11]:
# = = = Configuration (UBDA Cloud Centos) = = =

scale_factor = 100
# query_base_path = '/home/cloudray/NORA_Query/'
query_base_path = '/home/centos/Queryset/'

# Query workload files for the selected scale factor.
distribution_path = query_base_path + 'distribution_' + str(scale_factor) + '.csv'
random_path = query_base_path + 'random_' + str(scale_factor) + '.csv'

distribution_query, random_query = load_query(distribution_path), load_query(random_path)

# Split each workload: the leading fraction is for training, the rest for testing.
training_set_percentage = 0.5
Td = int(len(distribution_query) * training_set_percentage)
Tr = int(len(random_query) * training_set_percentage)

training_set = np.concatenate((distribution_query[:Td], random_query[:Tr]), axis=0)
testing_set = np.concatenate((distribution_query[Td:], random_query[Tr:]), axis=0)

# Dimensions constrained by the queries, and the dimension -> column name map
# ('_c0' ... '_c15').
used_dims = [1, 2]
num_dims = 16
column_names = ['_c' + str(i) for i in range(num_dims)]
column_name_dict = dict(enumerate(column_names))

# hdfs_path_nora = 'hdfs://localhost:9000/user/cloudray/NORA/merged/'
# hdfs_path_qdtree = 'hdfs://localhost:9000/user/cloudray/QdTree/merged/'

# One merged parquet directory per partitioning layout.
hdfs_path_nora = 'hdfs://192.168.6.62:9000/user/cloudray/NORA/scale100/merged/'
hdfs_path_qdtree = 'hdfs://192.168.6.62:9000/user/cloudray/QdTree/scale100/merged/'
hdfs_path_kdtree = 'hdfs://192.168.6.62:9000/user/cloudray/KDTree/scale100/merged/'

# partition_base_path = '/home/ubuntu/PartitionLayout/'
# nora_partition = partition_base_path + 'nora_partitions_' + str(scale_factor)

In [26]:
# test query
# note: there must not be any whitespace between any two pids inside the braces
# sql = 'SELECT * FROM parquet.`hdfs://10.88.88.103:9000/user/cloudray/NORA/scale100/merged/partition_{164,165}.parquet`'
# sql = 'SELECT variance(_c0) FROM parquet.`hdfs://10.88.88.103:9000/user/cloudray/NORA/scale100/merged/partition_{164,165}.parquet`'
# result = spark.sql(sql).collect()

In [None]:
# len(result) # 0 and 1: 3124568
# len(result) # 0: 1556604
# len(result) # 1: 1567964

In [16]:
# def kdnode_2_border(kdnode):
#     lower = [domain[0] for domain in kdnode[0]]
#     upper = [domain[1] for domain in kdnode[0]]
#     border = tuple(lower + upper) # non interleave
#     return border

# def load_partitions_from_file(path):
#     '''
#     the loaded stretched_kdnodes: [num_dims, l1,l2,...,ln, u1,u2,...,un, size, id, pid, left_child_id, right_child_id]
#     '''
#     stretched_kdnodes = np.genfromtxt(path, delimiter=',')
#     num_dims = int(stretched_kdnodes[0,0])
#     kdnodes = []
#     for i in range(len(stretched_kdnodes)):
#         domains = [ [stretched_kdnodes[i,k+1],stretched_kdnodes[i,1+num_dims+k]] for k in range(num_dims) ]
#         row = [domains]
#         row.append(stretched_kdnodes[i,2*num_dims+1])
#         # to be compatible with qd-tree's partition, that do not have the last 4 attributes
#         if len(stretched_kdnodes[i]) > 2*num_dims+2:
#             row.append(stretched_kdnodes[i,-4])
#             row.append(stretched_kdnodes[i,-3])
#             row.append(stretched_kdnodes[i,-2])
#             row.append(stretched_kdnodes[i,-1])
#         kdnodes.append(row)
#     return kdnodes

In [17]:
# partitions = load_partitions_from_file(nora_partition)
    
# p = index.Property()
# p.leaf_capacity = 32
# p.index_capacity = 32
# p.NearMinimumOverlaoFactor = 16
# p.fill_factor = 0.8
# p.overwrite = True
# pidx = index.Index(properties = p)
    
# partition_index = index.Index(properties = p)
# for i in range(len(partitions)):
#     partition_index.insert(i, kdnode_2_border(partitions[i]))

In [33]:
# NORA
# batch_query(testing_set, used_dims, column_name_dict, hdfs_path_nora, partition_index)

# SELECT *
# total query response time:  861.4990439414978
# average query response time:  17.229980878829956

# SELECT COUNT(*) # the advantage is more obvious when the I/O of the query result does not dominate the query time
# total query response time:  24.595819234848022
# average query response time:  0.4919163846969605

# SELECT variance(_c0)
# total query response time:  32.315288066864014
# average query response time:  0.6463057613372802

In [32]:
# Qd-Tree
# batch_query(testing_set, used_dims, column_name_dict, hdfs_path_qdtree)

# SELECT *
# total query response time:  1169.1192693710327
# average query response time:  23.382385387420655

# SELECT COUNT(*)
# total query response time:  85.07339429855347
# average query response time:  1.7014678859710692

# SELECT variance(_c0)
# total query response time:  102.03884530067444
# average query response time:  2.040776906013489

In [10]:
# NORA
# No rtree_idx is passed, so the generated SQL reads the whole merged NORA
# directory and filters with the WHERE clause only (no partition pruning).
batch_query(testing_set, used_dims, column_name_dict, hdfs_path_nora)

# NOTE(review): the figures below do not match the output recorded under this
# cell; presumably they come from an earlier run with a different SELECT - confirm.
# total query response time:  863.7078773975372
# average query response time:  17.274157547950743

finish query 0
finish query 1
finish query 2
finish query 3
finish query 4
finish query 5
finish query 6
finish query 7
finish query 8
finish query 9
finish query 10
finish query 11
finish query 12
finish query 13
finish query 14
finish query 15
finish query 16
finish query 17
finish query 18
finish query 19
finish query 20
finish query 21
finish query 22
finish query 23
finish query 24
finish query 25
finish query 26
finish query 27
finish query 28
finish query 29
finish query 30
finish query 31
finish query 32
finish query 33
finish query 34
finish query 35
finish query 36
finish query 37
finish query 38
finish query 39
finish query 40
finish query 41
finish query 42
finish query 43
finish query 44
finish query 45
finish query 46
finish query 47
finish query 48
finish query 49
total query response time:  97.01754927635193
average query response time:  1.9403509855270387


In [11]:
# Qd-Tree
# No rtree_idx is passed, so the generated SQL reads the whole merged Qd-Tree
# directory and filters with the WHERE clause only.
batch_query(testing_set, used_dims, column_name_dict, hdfs_path_qdtree)

# NOTE(review): the figures below do not match the output recorded under this
# cell; presumably they come from an earlier run with a different SELECT - confirm.
# total query response time:  1192.0776464939117
# average query response time:  23.841552929878233

finish query 0
finish query 1
finish query 2
finish query 3
finish query 4
finish query 5
finish query 6
finish query 7
finish query 8
finish query 9
finish query 10
finish query 11
finish query 12
finish query 13
finish query 14
finish query 15
finish query 16
finish query 17
finish query 18
finish query 19
finish query 20
finish query 21
finish query 22
finish query 23
finish query 24
finish query 25
finish query 26
finish query 27
finish query 28
finish query 29
finish query 30
finish query 31
finish query 32
finish query 33
finish query 34
finish query 35
finish query 36
finish query 37
finish query 38
finish query 39
finish query 40
finish query 41
finish query 42
finish query 43
finish query 44
finish query 45
finish query 46
finish query 47
finish query 48
finish query 49
total query response time:  115.19208145141602
average query response time:  2.3038416290283203


In [12]:
# KDTree
# No rtree_idx is passed, so the generated SQL reads the whole merged KDTree
# directory and filters with the WHERE clause only.
batch_query(testing_set, used_dims, column_name_dict, hdfs_path_kdtree)

# NOTE(review): the figures below are digit-for-digit identical to the ones in
# the Qd-Tree cell and do not match the output recorded under this cell; they
# look copied rather than measured for KDTree - confirm or re-measure.
# total query response time:  1192.0776464939117
# average query response time:  23.841552929878233

finish query 0
finish query 1
finish query 2
finish query 3
finish query 4
finish query 5
finish query 6
finish query 7
finish query 8
finish query 9
finish query 10
finish query 11
finish query 12
finish query 13
finish query 14
finish query 15
finish query 16
finish query 17
finish query 18
finish query 19
finish query 20
finish query 21
finish query 22
finish query 23
finish query 24
finish query 25
finish query 26
finish query 27
finish query 28
finish query 29
finish query 30
finish query 31
finish query 32
finish query 33
finish query 34
finish query 35
finish query 36
finish query 37
finish query 38
finish query 39
finish query 40
finish query 41
finish query 42
finish query 43
finish query 44
finish query 45
finish query 46
finish query 47
finish query 48
finish query 49
total query response time:  116.32540726661682
average query response time:  2.3265081453323364
