In [None]:
import findspark
findspark.init() # this must be executed before the below import

In [None]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark import SparkFiles

In [None]:
import time

In [None]:
# Reuse the running SparkContext if this cell is executed again in the same
# kernel; calling SparkContext() a second time raises because only one
# context may be active at a time.
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

In [None]:
# Use an R-tree to index the partitions; the id stored in the index is the partition id.
# Option A: keep a dictionary {partition_id: parquet_id} and compose the parquet file
# address from the parquet_id.
# Option B: use the partition id directly as the parquet id.

In [None]:
def find_overlap_parquets(query, partition_index):
    '''
    Return the ids of all indexed partitions whose box intersects the query.

    parameters:
    @query: sequence of [lower, upper] pairs, one pair per dimension
    @partition_index: object exposing intersection(border); the border is
        passed non-interleaved, i.e. (all lowers..., all uppers...)

    returns: list of the ids yielded by partition_index.intersection

    NOTE(review): bounds are forwarded as-is, so a -1 "unbounded" sentinel
    would reach the index unchanged - confirm callers only pass real bounds.
    '''
    lows, highs = [], []
    for bounds in query:
        lows.append(bounds[0])
        highs.append(bounds[1])

    box = tuple(lows + highs)
    return list(partition_index.intersection(box))

In [None]:
def get_parquet_file_paths(partition_ids, hdfs_path=None):
    '''
    Build the parquet file path for every partition id.

    parameters:
    @partition_ids: iterable of partition ids; each becomes
        'partition_<id>.parquet'
    @hdfs_path: directory prefix for the files; when None the default
        'hdfs://localhost:9000/user/cloudray/NORA/' is used

    returns: list of path strings, in the same order as partition_ids
    '''
    if hdfs_path is None:
        hdfs_path = 'hdfs://localhost:9000/user/cloudray/NORA/'
    elif hdfs_path and not hdfs_path.endswith('/'):
        # Without the separator the file name would be glued onto the last
        # directory component (e.g. '.../NORApartition_0.parquet').
        hdfs_path += '/'

    return [hdfs_path + 'partition_' + str(pid) + '.parquet'
            for pid in partition_ids]

In [6]:
def transform_query_to_sql(query, used_dims, column_name_dict):
    '''
    Translate a range query into a SQL predicate string.

    parameters:
    @query: sequence of [lower, upper] pairs; a bound equal to -1 means
        "no constraint on this side"
    @used_dims: used_dims[i] is the dimension id that query[i] refers to
    @column_name_dict: maps a dimension id to its column name

    returns: the conditions joined by ' and ' with no trailing separator or
        whitespace, or '' when every bound is -1 (or query is empty)
    '''
    conditions = []
    for i, bounds in enumerate(query):
        lower, upper = bounds[0], bounds[1]
        if lower != -1:
            conditions.append(column_name_dict[used_dims[i]] + '>=' + str(lower))
        if upper != -1:
            conditions.append(column_name_dict[used_dims[i]] + '<=' + str(upper))
    # join() places the separator only between conditions, so nothing has to
    # be sliced off the end afterwards.
    return ' and '.join(conditions)

In [None]:
def query_with_parquets(query, partition_index, used_dims, column_name_dict,
                        hdfs_path=None, print_execution_time=False):
    '''
    Answer one range query by reading only the parquet files of the
    partitions that overlap it.

    parameters:
    @query: should contain the same dimensions that partition_index holds
    @partition_index: index passed to find_overlap_parquets
    @used_dims, @column_name_dict: passed to transform_query_to_sql to build
        the filter predicate
    @hdfs_path: directory prefix passed to get_parquet_file_paths; None
        selects that function's default
    @print_execution_time: when True, print the elapsed wall-clock time

    first, find the overlapped parquet ids
    second, load these parquets
    third, query from the loaded parquets

    returns: the collected rows, or [] when no partition overlaps the query
    '''
    start_time = time.time()

    pids = find_overlap_parquets(query, partition_index)
    if pids:
        paths = get_parquet_file_paths(pids, hdfs_path)
        dfs = sqlContext.read.parquet(*paths)
        sql = transform_query_to_sql(query, used_dims, column_name_dict)
        # A query with no bounded side produces an empty predicate, which
        # filter() cannot parse; in that case every loaded row qualifies.
        if sql.strip():
            dfs = dfs.filter(sql)
        query_result = dfs.collect()
    else:
        # Reading parquet with an empty path list fails, and with no
        # overlapping partition there is nothing to return anyway.
        query_result = []

    query_response_time = time.time() - start_time

    if print_execution_time:
        print('query response time: ', query_response_time)

    return query_result

In [9]:
# # = = = Unit Test = = =

# column_name_dict = {0:'_c0', 1:'_c1', 2:'_c2', 3:'_c3'}
# used_dims = [0, 1, 2]
# query = [[-1,-1],[10,20],[10,-1]]

# sql = transform_query_to_sql(query, used_dims, column_name_dict)
# print(sql)

In [None]:
def kdnode_2_border(kdnode):
    '''
    Flatten the per-dimension domains of a kd-node into one border tuple.

    kdnode[0] is read as a sequence of [lower, upper] pairs; the result is
    non-interleaved: (lower_0, ..., lower_k, upper_0, ..., upper_k).
    '''
    mins, maxs = [], []
    for domain in kdnode[0]:
        mins.append(domain[0])
        maxs.append(domain[1])
    return tuple(mins + maxs)

def batch_query(queryset, partition_path, used_dims, column_name_dict, hdfs_path):
    '''
    Run every query in queryset against the partitioned parquet files and
    print the total and average response time.

    parameters:
    @queryset: sequence of queries, each passed to query_with_parquets
    @partition_path: file from which the partitions are loaded
    @used_dims, @column_name_dict, @hdfs_path: forwarded unchanged to
        query_with_parquets

    The measured time includes loading the partitions and building the index.

    NOTE(review): `index` and `load_partitions_from_file` are not defined in
    the visible cells - presumably rtree's index module and a project helper;
    confirm they are imported/defined before this cell runs.
    '''
    start_time = time.time()

    partitions = load_partitions_from_file(partition_path)

    p = index.Property()
    p.leaf_capacity = 100 # cannot be less than 100, indicate the maximum capacity
    p.fill_factor = 0.5
    p.overwrite = True

    partition_index = index.Index(properties = p)
    for i, partition in enumerate(partitions):
        # The position in the partition list is used as the index id.
        # qd-tree partitions do not carry an explicit id, so this is not used:
        #partition_index.insert(int(partitions[i][-4]), kdnode_2_border(partitions[i]))
        partition_index.insert(i, kdnode_2_border(partition))

    # add statistics result
    for query in queryset:
        query_with_parquets(query, partition_index, used_dims, column_name_dict, hdfs_path)

    end_time = time.time()

    query_response_time = end_time - start_time
    # An empty queryset would otherwise divide by zero.
    query_count = len(queryset)
    avg_query_response_time = query_response_time / query_count if query_count else 0.0

    print('total query response time: ', query_response_time)
    print('average query response time: ', avg_query_response_time)

In [None]:
# = = = Execution = = =
partition_path = '/home/cloudray/NORA_Partitions/nora_partitions'
used_dims = [1,2]
column_names = ['_c0', '_c1', '_c2', '_c3', '_c4', '_c5', '_c6', '_c7']
# batch_query expects a {dimension id: column name} mapping, so derive it
# from the ordered column list (0 -> '_c0', 1 -> '_c1', ...).
column_name_dict = dict(enumerate(column_names))
hdfs_path = 'hdfs://localhost:9000/user/cloudray/NORA/'

# NOTE(review): `queryset` is not assigned in any visible cell - it must be
# defined (a list of [[lower, upper], ...] queries) before this call runs.
batch_query(queryset, partition_path, used_dims, column_name_dict, hdfs_path)