In [None]:
import numpy as np

In [None]:
# === Assistant Functions ===
#
# this works for one dimension only !!! An implementation of query bounding.

def getoverlap(al, au, bl, bu):
    """Return the length shared by the 1-D intervals [al, au] and [bl, bu].

    The result is 0 when the intervals are disjoint or only touch at an
    endpoint.
    """
    # the shared part runs from the larger lower bound to the smaller upper bound
    shared_upper = bu if bu < au else au
    shared_lower = bl if bl > al else al
    shared_length = shared_upper - shared_lower
    return shared_length if shared_length > 0 else 0

# used by FastLearnedResuriveDivide to bound the projected queries of one dimension.
def bounding_union(query_collection):
    """Merge overlapping 1-D intervals into a list of disjoint bounding intervals.

    @query_collection: array-like of shape (n, 2); each row is [lower, upper].

    return: list of [lower, upper] pairs ordered by lower bound. Intervals that
    overlap by a positive length are merged into one. Intervals that merely
    touch at an endpoint are kept separate. A zero-width interval that lies
    inside the interval currently being grown is absorbed by it. Empty input
    gives an empty list.
    """
    intervals = np.asarray(query_collection)
    if intervals.size == 0:
        return []

    # Sorting by the lower bound lets a single sweep do all the merging: once a
    # row starts at or after the end of the current interval, so does every
    # later row. The stable sort keeps ties in input order, which makes the
    # output deterministic.
    ordered = intervals[intervals[:, 0].argsort(kind="stable")]

    current = [ordered[0][0], ordered[0][1]]
    bounded_intervals = [current]

    for row in ordered[1:]:
        lower, upper = row[0], row[1]
        if lower < current[1]:
            # starts strictly inside the current interval: extend it if needed
            if upper > current[1]:
                current[1] = upper
        else:
            # disjoint (or only touching): start a new bounding interval
            current = [lower, upper]
            bounded_intervals.append(current)

    return bounded_intervals

In [None]:
# === Learned KD-Tree Split (Fast Version) ===
#
# assumption: the query boundings will not overlap. divide the KD-Tree recursively
#
# @dataset: contains the data only in this subnode, ordered in original load order;
# @query: contains all the queries (bounded or original); ordered in original generated order;
# @domains: the current domain of the node of every dimension [first lower, second upper],[]...; array object
# @threshold: maximum page size
# @level: the current tree depth
#
# return @kdnodes: contains the domain of each node and the corresponding number of records; note that the
# domains are ordered by the original load order of the dataset
def _split_candidate_for_dim(dataset, query_bound, dim, domain_low, domain_up):
    """Choose a split value in one dimension that does not cut a query bound.

    Returns (split_distance, split_value). split_distance is the number of
    records between the median and the chosen split value: 0 when the median
    itself is used because it cuts no bound.
    """
    column = dataset[:, dim]
    median = np.median(column)

    for bound in query_bound:
        bound_low, bound_up = bound[0], bound[1]
        if not (bound_low < median < bound_up):
            continue

        # The bound covers the whole domain, so no cut-free position exists in
        # this dimension: keep the median and report the largest distance.
        if bound_low <= domain_low and bound_up >= domain_up:
            return int(len(dataset) / 2), median

        # Candidate positions are the edges of the bound, clipped to the domain.
        candidate_low = bound_low if bound_low > domain_low else domain_low
        candidate_up = bound_up if bound_up < domain_up else domain_up

        # Distance is measured in records, not in coordinate units.
        records_below = np.count_nonzero((column >= candidate_low) & (column < median))
        records_above = np.count_nonzero((column <= candidate_up) & (column > median))

        # The bounds are disjoint, so at most one of them can strictly contain
        # the median; the first hit decides.
        if records_below <= records_above:
            return records_below, candidate_low
        return records_above, candidate_up

    return 0, median


def FastLearnedResuriveDivide(dataset, query, domains, threshold, level, current_dim = 0):
    """Recursively split a dataset into KD-tree leaves, steering splits away from queries.

    @dataset: 2-D numpy array holding the records of this node (rows are
        records, columns are dimensions).
    @query: array-like indexed as [query, dimension, 0=lower / 1=upper]; may
        be empty (an empty list is accepted).
    @domains: per-dimension [lower, upper] of this node.
    @threshold: a node with at most this many records becomes a leaf.
    @level: current tree depth; it is only incremented and passed down.
    @current_dim: dimension used by the parent split; the round-robin fallback
        continues from the next one.

    return: list of [domain, record count] pairs, one per leaf, lower side of
    every split first. Child domains are float arrays so that a fractional
    split value is stored exactly even when the given domains are integers.
    A node is also returned as a leaf when the chosen split would leave either
    side with fewer than @threshold records.
    """
    total_size = len(dataset)
    if total_size <= threshold:
        return [[domains, total_size]]

    # Normalise once so the per-dimension slicing and the sub-query filtering
    # below work for any array-like input, including an empty list.
    query = np.asarray(query)
    has_queries = len(query) != 0

    # For each dimension: how far (in records) the nearest cut-free split is
    # from the median, and where that split is.
    split_distance_each_dim = []
    split_value_each_dim = []
    for dim in range(len(dataset[0])):
        query_bound = bounding_union(query[:, dim]) if has_queries else []
        distance, value = _split_candidate_for_dim(
            dataset, query_bound, dim, domains[dim][0], domains[dim][1])
        split_distance_each_dim.append(distance)
        split_value_each_dim.append(value)

    split_distance_each_dim = np.asarray(split_distance_each_dim)

    # Degradation mechanism: if every dimension is (almost) as bad as a blind
    # median split, or no dimension needs to move away from its median by more
    # than a few records, fall back to a round-robin median split.
    if min(split_distance_each_dim) >= int((total_size / 2)-10) or max(split_distance_each_dim) <= 10:
        split_dimension = (current_dim + 1) % len(domains)
        split_value = np.median(dataset[:, split_dimension])
    else:
        split_dimension = int(np.argmin(split_distance_each_dim))
        split_value = split_value_each_dim[split_dimension]

    # split the dataset according to the split position
    sub_dataset1 = dataset[dataset[:, split_dimension] <= split_value]
    sub_dataset2 = dataset[dataset[:, split_dimension] > split_value]

    # refuse splits that would create an undersized child
    if len(sub_dataset1) < threshold or len(sub_dataset2) < threshold:
        return [[domains, total_size]]

    # Child domains: copy as float so an integer-typed domain does not
    # truncate a fractional split value.
    sub_domains1 = np.array(domains, dtype=float)
    sub_domains1[split_dimension][1] = split_value
    sub_domains2 = np.array(domains, dtype=float)
    sub_domains2[split_dimension][0] = split_value

    # keep, for each child, only the queries that reach into its side
    if has_queries:
        sub_query1 = query[query[:, split_dimension, 0] < split_value]
        sub_query2 = query[query[:, split_dimension, 1] > split_value]
    else:
        sub_query1 = sub_query2 = query

    level += 1

    kdnodes = []
    kdnodes.extend(FastLearnedResuriveDivide(sub_dataset1, sub_query1, sub_domains1, threshold, level, split_dimension))
    kdnodes.extend(FastLearnedResuriveDivide(sub_dataset2, sub_query2, sub_domains2, threshold, level, split_dimension))

    return kdnodes

# start_time = time.time()
# fast_kdnodes = FastLearnedResuriveDivide(dataset, training_set, domains_, block_size, 0)
# end_time = time.time()
# print("training time for fast Learned KD-Tree(s): ", end_time-start_time)
# print('fast learned KD-Tree leaf nodes: ',len(fast_kdnodes))