In [1]:
import statistics
# === Performance Evaluation ===
#
# Evaluate the blocks of data to be fetched when the physical data of the kdnodes is stored separately.
# @queries: a collection of queries contains the lower and upper value in all dimensions; numpy object
# @kdnodes: the kdnodes generated above; array object

def Query(queries, kdnodes, alpha = 0):
    """
    Print the cost of running every query against the given kdnodes.

    The queries are assumed to be ordered with the distribution queries first
    and the random queries second, where ``alpha`` is the fraction of random
    queries. Query ``i`` is counted as a distribution query when
    ``i < len(queries) * (1 - alpha)`` and as a random query otherwise.

    Args:
        queries: indexable collection of queries; ``queries[i][k]`` holds the
            (lower, upper) values of query ``i`` in dimension ``k``.
        kdnodes: indexable collection of partitions; ``kdnodes[j][0][k]`` holds
            the (lower, upper) values of partition ``j`` in dimension ``k`` and
            ``kdnodes[j][1]`` is the number of records in that partition.
        alpha: fraction of the queries (taken from the end) that are random.

    Returns:
        None. All results are written to standard output: one line per query,
        then the overall averages and the per-group average costs. A group
        that contains no queries reports an average cost of 0.0.
    """
    num_queries = len(queries)
    if num_queries == 0:
        # len() is used instead of truthiness so numpy arrays work as well;
        # with no queries there is neither a dimensionality nor a mean
        print("no queries to evaluate")
        return

    # number of dimensions
    dims = int(len(queries[0]))

    # queries with an index below this (possibly fractional) value are
    # distribution queries, the remaining ones are random queries
    dis_rand_split_index = num_queries * (1 - alpha)

    counts_partitions = []
    counts_records = []
    counts_total = 0

    distribution_query_cost = 0
    random_query_cost = 0
    # the groups are counted explicitly: the split index may be fractional,
    # and either group may be empty (e.g. no random queries when alpha == 0)
    distribution_query_count = 0
    random_query_count = 0

    for i, query in enumerate(queries):

        intersected_partitions = []
        count_single_query_records = 0

        for j, kdnode in enumerate(kdnodes):

            boundary = kdnode[0]

            # the query misses this kdnode as soon as the ranges do not
            # overlap in one dimension (touching borders count as a miss)
            disjoint = any(
                query[k][0] >= boundary[k][1] or query[k][1] <= boundary[k][0]
                for k in range(dims)
            )

            if not disjoint:
                intersected_partitions.append(j)
                count_single_query_records += kdnode[1]  # number of records in this partition

        if i < dis_rand_split_index:
            distribution_query_cost += count_single_query_records
            distribution_query_count += 1
        else:
            random_query_cost += count_single_query_records
            random_query_count += 1

        print("Query", i, " Cost: ", count_single_query_records, " Intersected Partitions:", intersected_partitions)

        counts_partitions.append(len(intersected_partitions))
        counts_records.append(count_single_query_records)
        counts_total += count_single_query_records

    print("average partitions each query overlap(average): ", statistics.mean(counts_partitions))
    print("average records each query retrieve(average): ", statistics.mean(counts_records))
    print("total records that all the queries retrieve: ", counts_total)

    # divide by the real number of queries in each group and guard the empty
    # group, which would otherwise raise ZeroDivisionError
    avg_dist_cost = distribution_query_cost / distribution_query_count if distribution_query_count else 0.0
    print("average distribution query cost:", avg_dist_cost)
    avg_rand_cost = random_query_cost / random_query_count if random_query_count else 0.0
    print("average random query cost:", avg_rand_cost)

# # === Performance Evaluation (dense) ===
# #
# # evaluate the blocks of data to be fetched, when physical data of kdnodes are dense !!!
# # @queries: a collection of queries contains the lower and upper value in all dimensions; numpy object
# # @kdnodes: the kdnodes generated above; array object
# #
# def QueryDense(queries, dense_kdnodes):
    
#     counts = []
#     count_single_query = 0;
    
#     # number of dimensions
#     dims = int(len(queries[0]))
    
#     # for each query
#     for i in range(len(queries)):
        
#         pages = []
        
#         # check for intersection for each kdnode
#         for j in range(len(dense_kdnodes)):
            
#             # for each dimension
#             intersection_tag = True
#             for k in range(dims):
                
#                 # an intersection holds if it intersecs in all dimensions
#                 if queries[i][k][0] >= dense_kdnodes[j][0][k][1] or queries[i][k][1] <= dense_kdnodes[j][0][k][0]:
#                     intersection_tag = False
#                     break
                
#             # if the query intersect with this kdnode
#             if intersection_tag:
#                 pages.extend(dense_kdnodes[j][1]) # remember to remove repeated
        
#         counts.append(len(set(pages)))
    
#     #print("blocks IO: ", counts)
#     print("blocks IO(average): ", statistics.mean(counts))    

# # === Physical Storage ===
# #
# # Reform the kdnodes from separate physical storage to dense, continuous physical storage
# #
# # @kdnodes: the previous generated kdnodes, already in order; array
# # @threshold: the maximum page size 
# #
# # return @dense_kdnodes: array

# def DenseKDNodes(kdnodes, threshold):
    
#     dense_kdnodes = []
#     previous_records = 0
#     current_records = 0
#     page_count = 0
    
#     for i in range(len(kdnodes)):
#         previous_records = current_records
#         current_records += kdnodes[i][1]
#         if current_records > threshold:
#             # determine how many pages exceeds
#             remaining = kdnodes[i][1] - (threshold - previous_records)
#             num_pages = math.ceil(remaining / threshold) # num of new pages required
#             pages = [i+page_count for i in range(num_pages+1)]
#             page_count += num_pages
#             current_records = remaining % threshold
#             dense_kdn = [kdnodes[i][0],pages]
#             dense_kdnodes.append(dense_kdn)
#         else:
#             dense_kdn = [kdnodes[i][0],[page_count]]
#             dense_kdnodes.append(dense_kdn)
                         
#     return dense_kdnodes

In [None]:
# Example of usage (Performance Evaluation Learned)
#Query(testing_set, kdnodes)

# # Example of usage (Physical Storage)
# dense_kdnodes = DenseKDNodes(kdnodes, block_size)

# # Example of usage (Performance Evaluation (dense))
# QueryDense(testing_set, dense_kdnodes)

In [None]:
# this is designed for incremental query?
def QueryWithKdnodeIndex(single_query, kdnode_index, kdnode_dict):
    """
    Placeholder without an implementation: ignores its arguments and returns None.

    NOTE(review): presumably meant for incremental queries -- confirm the
    intended behaviour before implementing.
    """
    return None