# The required packages.

In [1]:
import import_ipynb
import copy
import time
import random
import numpy as np
import pandas as pd # for batch data loading, in generating sampled dataset
from rtree import index # this package is only used for constructing Rtree filter
from numpy import genfromtxt
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from mpl_toolkits.mplot3d import Axes3D

# The Utilities Classes

In [2]:
class SplitContext:
    '''
    to maintain the split information
    split_type:
    0 = candidate cut
    1 = bounding split
    2 = dual-bounding split
    3 = new bounding split
    '''
    def __init__(self, split_type, split_dimension, split_value):
        self.split_type = split_type
        self.split_dimension = split_dimension
        self.split_value = split_value
        
        self.sub_partitions = []
        self.split_gain = 0

In [3]:
class QueryMBR:
    '''
    the MBR that bound overlapped queries
    '''
    def __init__(self, boundary, added_as_fist_query = True):
        self.num_dims = len(boundary) / 2
        self.boundary = boundary
        self.num_query = 1
        self.queries = []
        self.bound_size = None # number of records this MBR overlaps
        self.total_query_result_size = None # total query results size of all the queries inside this MBR
        self.query_result_size = [] # record each query's result size
        self.is_extended = False
        self.ill_extended = False
        if added_as_fist_query:
            self.queries = [copy.deepcopy(boundary)] # OR the change of boundary will affect the queries!
        
    def check_condition3(self, data_threshold):
        '''
        check whether this MBR satisfy the new bounding split condition 3:
        1. every query size > BP - b
        2. total_query_result_size + b > bound_size * num_query
        '''
        for size in self.query_result_size:
            if size <= self.bound_size - data_threshold:
                return False
        
        if self.total_query_result_size + data_threshold <= self.bound_size * self.num_query:
            return False
        
        return True

# The PartitionNode Class.
It maintains the meta infos of a partition.
It also provides support functions related to split and query overlap checking.

In [37]:
class PartitionNode:
    '''
    A partition node, including both the internal and leaf nodes in the partition tree
    '''
    def __init__(self, num_dims = 0, boundary = [], nid = None, pid = None, is_irregular_shape_parent = False,
                 is_irregular_shape = False, num_children = 0, children_ids = [], is_leaf = True, node_size = 0):
        
        # print("Initialize PartitionTree Root: num_dims",num_dims,"boundary:",boundary,"children_ids:",children_ids)
        self.num_dims = num_dims # number of dimensions
        # the domain, [l1,l2,..,ln, u1,u2,..,un,], for irregular shape partition, one need to exempt its siblings
        self.boundary = boundary # I think the lower side should be inclusive and the upper side should be exclusive?
        self.nid = nid # node id
        self.pid = pid # parent id
        self.is_irregular_shape_parent = is_irregular_shape_parent # whether the [last] child is an irregular shape partition
        self.is_irregular_shape = is_irregular_shape # an irregular shape partition cannot be further split, and it must be a leaf node
        self.num_children = num_children # number of children, should be 0, 2, or 3
        self.children_ids = children_ids # if it's the irregular shape parent, then the last child should be the irregular partition
        self.is_leaf = is_leaf
        self.node_size = node_size # number of records in this partition
        
        # the following attributes will not be serialized
        self.dataset = None # only used in partition algorithms, temporary, should consist records that within this partition
        self.queryset = None # only used in partition algorithms, temporary, should consist queries that overlap this partition
        self.partitionable = True # only used in partition algorithms
        self.query_MBRs = None # only used in partition algorithms, temporary
        self.split_type = None # only used in partition algorithms
        
        # Rtree filters
        self.rtree_filters = None # a collection of MBRs, in the shape of boundary, used to indicate the data distribution
        
        # beam search
        self.depth = 0 # only used in beam search, root node depth is 0
        self.no_valid_partition = False # use to stop, an variant of partitionable
        
    def is_overlap(self, query):
        '''
        query is in plain form, i.e., [l1,l2,...,ln, u1,u2,...,un]
        !query dimension should match the partition dimensions! i.e., all projected or all not projected
        return 0 if no overlap
        return 1 if overlap
        return 2 if inside
        '''
        if len(query) != 2 * self.num_dims:
            return -1 # error
        
        overlap_flag = True
        inside_flag = True
        
        for i in range(self.num_dims):
            if query[i] >= self.boundary[self.num_dims + i] or query[self.num_dims + i] <= self.boundary[i]:
                overlap_flag = False
                inside_flag = False
                return 0
            elif query[i] < self.boundary[i] or query[self.num_dims + i] > self.boundary[self.num_dims + i]:
                inside_flag = False
                
        if inside_flag:
            return 2
        elif overlap_flag:
            return 1
        else:
            return 0
    
    def is_overlap_np(self, query):
        '''
        the numpy version of the is_overlap function
        the query here and boundary class attribute should in the form of numpy array
        '''
        if all((boundary[0:self.num_dims] > query[self.num_dims:]) | (boundary[self.num_dims:] <= query[0:self.num_dims])):
            return 0 # no overlap
        elif all((boundary[0:self.num_dims] >= query[0:self.num_dims]) & (boundary[self.num_dims:] <= query[self.num_dims:])):
            return 2 # inside
        else:
            return 1 # overlap
    
    def is_contain(self, point):
        '''
        used to determine wheter a data point is contained in this node
        point: [dim1_value, dim2_value,...], should has the same dimensions as this node
        '''
        for i in range(self.num_dims):
            if point[i] > self.boundary[self.num_dims + i] or point[i] < self.boundary[i]:
                return False
        return True
    
    def get_candidate_cuts(self, extended = False):
        '''
        get the candidate cut positions
        if extended is set to True, also add medians from all dimensions
        '''
        candidate_cut_pos = []
        for query in self.queryset:
            for dim in range(self.num_dims):
                # check if the cut position is inside the partition, as the queryset are queries overlap this partition
                if query[dim] >= self.boundary[dim] and query[dim] <= self.boundary[self.num_dims+dim]:
                    candidate_cut_pos.append((dim, query[dim]))
                if query[self.num_dims+dim] >= self.boundary[dim] and query[self.num_dims+dim] <= self.boundary[self.num_dims+dim]:
                    candidate_cut_pos.append((dim, query[self.num_dims+dim]))
        
        if extended:
            for dim in range(self.num_dims):
                split_value = np.median(self.dataset[:,dim])
                candidate_cut_pos.append((dim, split_value))
        
        return candidate_cut_pos
    
    def if_split(self, split_dim, split_value, data_threshold, test = False): # rename: if_split_get_gain
        '''
        return the skip gain and children partition size if split a node from a given split dimension and split value
        '''
        #print("current_node.nid:", current_node.nid)
        #print("current_node.is_leaf:", current_node.is_leaf)
        #print("current_node.dataset is None:", current_node.dataset is None)
        sub_dataset1_size = np.count_nonzero(self.dataset[:,split_dim] < split_value) # process time: 0.007
        sub_dataset2_size = self.node_size - sub_dataset1_size

        if sub_dataset1_size < data_threshold or sub_dataset2_size < data_threshold:
            return False, 0, sub_dataset1_size, sub_dataset2_size
        
        left_part, right_part, mid_part = self.split_queryset(split_dim, split_value)
        num_overlap_child1 = len(left_part) + len(mid_part)
        num_overlap_child2 = len(right_part) + len(mid_part)
        
        if test:
            print("num left part:",len(left_part), "num right part:",len(right_part), "num mid part:",len(mid_part))
            print("left part:", left_part, "right part:", right_part, "mid part:",mid_part)
        
        # temp_child_node1, temp_child_node2 = self.__if_split_get_child(split_dim, split_value)
        skip_gain = len(self.queryset)*self.node_size - num_overlap_child1*sub_dataset1_size - num_overlap_child2*sub_dataset2_size
        return True, skip_gain, sub_dataset1_size, sub_dataset2_size
    
    def if_bounding_split(self, data_threshold, approximate = False, force_extend = False):
        '''
        # the split node is assumed to be >= 2b
        approximate: whether use approximation (even distribution) to find the number of records within a partition
        force_extend: whether extend the bounding partition to make its size greater than data_threshold, if possible
        return availability, skip gain, and the (possible extended) bound
        '''
        max_bound = self.__max_bound(self.queryset)
        bound_size = self.query_result_size(max_bound, approximate)
        if bound_size is None:
            return False, None, None
        
        extended_bound = copy.deepcopy(max_bound)
        if bound_size < data_threshold: # assume the partition is >= 2b, then we must be able to find the valid extension
            if force_extend:
                side = 0
                for dim in range(self.num_dims):
                    valid, extended_bound, bound_size = self.__try_extend(extended_bound, dim, 0, data_threshold) # lower side
                    if valid:
                        break
                    valid, extended_bound, bound_size = self.__try_extend(extended_bound, dim, 1, data_threshold) # upper side
                    if valid:
                        break
            else:
                return False, None, None   
        
        remaining_size = self.node_size - bound_size
        if remaining_size < data_threshold:
            return False, None, None
        cost_before_split = len(self.queryset) * self.node_size
        cost_bound_split = len(self.queryset) * bound_size
        skip_gain = cost_before_split - cost_bound_split
        
        if force_extend:
            return True, skip_gain, extended_bound
        else:
            return True, skip_gain, max_bound # TODO: should we also return the extended bound? 

    def if_new_bounding_split(self, data_threshold, approximate = False, force_extend = True):
        '''
        Also known as var-bounding split or multi-group split
        In this version, we try to generate a collection of MBR partitions if every MBR satisfy:
        1. its size <= b; or
        2. it contains only 1 query; or
        3. |Q|*Core + b > its size * |Q|
        
        OR (if the above failed) a single bounding partition and an irregular shape partition as the old version
        '''
        if self.query_MBRs is None or len(self.query_MBRs) == 0:
            return False
        
        check_valid = True
        extended_flag = False
        
        # simple pruning
        if len(self.query_MBRs) * data_threshold > self.node_size:
            check_valid = False
        else:
            for MBR in self.query_MBRs:
                if MBR.bound_size <= data_threshold or MBR.num_query == 1 or MBR.check_condition3(data_threshold):
                    pass
                else:
                    check_valid = False
                    break
        
        if check_valid:
            # try extend the MBRs to satisfy b, and check whether the extended MBRs overlap with others
            for MBR in self.query_MBRs:
                if MBR.bound_size < data_threshold:
                    MBR.boundary, MBR.bound_size = self.extend_bound(MBR.boundary, data_threshold)
                    MBR.is_extended = True
                    if MBR.bound_size > 2 * data_threshold:
                        MBR.ill_extended = True # if there are too many same key records
                if MBR.is_extended:
                    extended_flag = True # also for historical extended MBRs !!!
                    
            
        # check if the extended MBRs overlaps each other
        if extended_flag and len(self.query_MBRs) > 1:
            for i in range(len(self.query_MBRs) - 1):
                for j in range(i+1, len(self.query_MBRs)):
                    if self.query_MBRs[i].ill_extended or self.query_MBRs[j].ill_extended or self.__is_overlap(self.query_MBRs[i].boundary, self.query_MBRs[j].boundary):
                        #print("partition",self.nid,"found overlap of extended MBRs:", self.query_MBRs[i].boundary, self.query_MBRs[j].boundary)
                        check_valid = False
                        break
                if not check_valid:
                    break
        
        if len(self.query_MBRs) == 1 and self.query_MBRs[0].ill_extended: # in case there is only 1 MBR
            check_valid = False
        
        # check the remaining partition size, if it's not greater than b, return false
        remaining_size = self.node_size
        for MBR in self.query_MBRs:
            remaining_size -= MBR.bound_size
        if remaining_size < data_threshold:
            check_valid = False
        
        # if the above failed
        if check_valid:
            return True # since this is the optimal, we don't need to return skip
        else:
            # do we need to restore the MBRs?
            # NO, when split cross a MBR, it will be rebuilt on both side
            # In other cases, the extended MBR doesn't matter
            return False  
    
    def if_general_group_split(self, data_threshold):
        '''
        the general group split in PAW (this one use merge and doesn't handle overlap)
        '''
        if self.query_MBRs is None or len(self.query_MBRs) == 0:
            #print("PAW: no MBRs")
            return False
        
        def check_MBR_sizes():
            for MBR in self.query_MBRs:
                if MBR.bound_size < data_threshold:
                    return False
            return True
            
        while not check_MBR_sizes() and len(self.query_MBRs)>= 2: # what if only 1 MBR and its size is less than b
            # merge MBRs
            merged_records = [] # (cost, MBR index1, MBR_index2)
            for i in range(len(self.query_MBRs) - 1):
                for j in range(i+1, len(self.query_MBRs)):
                    merged_MBR = self.__if_merge_2MBRs(self.query_MBRs[i], self.query_MBRs[j])
                    cost = merged_MBR.num_query * merged_MBR.bound_size
                    merged_records.append((cost, i, j, merged_MBR))
            merged_records.sort()
            merged_MBR = merged_records[0][-1]
            i, j = merged_records[0][1], merged_records[0][2]
            del self.query_MBRs[j]
            del self.query_MBRs[i] # i < j
            self.query_MBRs.append(merged_MBR)
            #print("merged MBR size:", merged_MBR.bound_size, "boundary:",merged_MBR.boundary)
        
        # check if every partition size is greater than b
        remaining_size = self.node_size
        for MBR in self.query_MBRs:
            remaining_size -= MBR.bound_size
            if MBR.bound_size < data_threshold:
                #print("PAW: MBR size < b, len(MBRs):",len(self.query_MBRs))
                return False
        if remaining_size < data_threshold:
            #print("PAW: remaining size < b")
            return False
        
        # get the cost
        cost = 0
        for MBR in self.query_MBRs:
            cost += MBR.num_query * MBR.bound_size
        # return cost
        skip = len(self.queryset) * self.node_size - cost
        #print("PAW: skip", skip)
        return skip
    
    def if_general_group_split_2(self, data_threshold):
        '''
        this one use extend() + handle overlap
        '''
        if self.query_MBRs is None or len(self.query_MBRs) == 0:
            #print("PAW: no MBRs")
            return False
        
        def check_MBR_overlap(MBRs):
            '''
            return True if overlaps
            '''
            if len(MBRs) >= 2:
                for i in range(len(MBRs)-1):
                    for j in range(i+1, len(MBRs)):
                        if self.__is_overlap(MBRs[i].boundary, MBRs[j].boundary):
                            return True
            return False
        
        # extend
        for MBR in self.query_MBRs:
            if MBR.bound_size < data_threshold:
                #print("extend MBR", MBR.boundary, "before extend, size:", MBR.bound_size)
                MBR.boundary, MBR.bound_size = self.extend_bound(MBR.boundary, data_threshold)
                #print("MBR", MBR.boundary, "after extend, size:", MBR.bound_size)
                MBR.is_extended = True
            if MBR.bound_size > 2 * data_threshold:
                MBR.ill_extended = True # if there are too many same key records
                
        # check overlap:
        if check_MBR_overlap(self.query_MBRs):
            #print("PAW: overlap exists in (extended) MBRs")
            return False
        else:
            # check remaining size
            cost = 0
            remaining_size = self.node_size
            for MBR in self.query_MBRs:
                if MBR.bound_size < data_threshold:
                    #print("PAW: MBR size < b")
                    return False
                remaining_size -= MBR.bound_size
                cost += MBR.bound_size * MBR.num_query
            if remaining_size < data_threshold:
                #print("PAW: remaining size < b")
                return False
            
            # return skip
            skip = len(self.queryset) * self.node_size - cost
            return skip
    
    
    def if_dual_bounding_split(self, split_dim, split_value, data_threshold, approximate = False):
        '''
        check whether it's available to perform dual bounding split
        return availability and skip gain
        '''
        # split queriese first
        left_part, right_part, mid_part = self.split_queryset(split_dim, split_value)
        max_bound_left = self.__max_bound(left_part)
        max_bound_right = self.__max_bound(right_part)
        
        # Should we only consider the case when left and right cannot be further split? i.e., [b,2b)
        # this check logic is given in the PartitionAlgorithm, not here, as the split action should be general
        naive_left_size = np.count_nonzero(self.dataset[:,split_dim] < split_value)
        naive_right_size = self.node_size - naive_left_size
        
        # get (irregular-shape) sub-partition size
        left_size = self.query_result_size(max_bound_left, approximate)
        if left_size is None: # there is no query within the left 
            left_size = naive_left_size # use the whole left part as its size
        if left_size < data_threshold:
            return False, None
        right_size = self.query_result_size(max_bound_right, approximate)
        if right_size is None: # there is no query within the right
            right_size = naive_right_size # use the whole right part as its size
        if right_size < data_threshold:
            return False, None
        remaining_size = self.node_size - left_size - right_size
        if remaining_size < data_threshold:
            return False, None
        
        # check cost
        cost_before_split = len(self.queryset) * self.node_size
        cost_dual_split = len(left_part) * left_size + len(right_part) * right_size + len(mid_part) * remaining_size
        for query in mid_part:
            # if it overlap left bounding box
            if max_bound_left is None or self.__is_overlap(max_bound_left, query) > 0:
                cost_dual_split += left_size
            # if it overlap right bounding box
            if max_bound_right is None or self.__is_overlap(max_bound_right, query) > 0:
                cost_dual_split += right_size
        skip_gain = cost_before_split - cost_dual_split
        return True, skip_gain
        
    def num_query_crossed(self, split_dim, split_value):
        '''
        similar to the split_queryset function, but just return how many queries the intended split will cross
        '''
        count = 0
        if self.queryset is not None:
            for query in self.queryset:
                if query[split_dim] < split_value and query[self.num_dims + split_dim] > split_value:
                    count += 1
            return count
        return None
    
    def split_queryset(self, split_dim, split_value):
        '''
        split the queryset into 3 parts:
        the left part, the right part, and those cross the split value
        '''
        if self.queryset is not None:
            left_part = []
            right_part = []
            mid_part = []
            for query in self.queryset:
                #print("[Split Queryset] query:",query, "split dim:", split_dim, "split value", split_value, "query[split dim]:",query[split_dim])
                if query[split_dim] >= split_value:
                    #print("[Split Queryset] query is right part")
                    right_part.append(query)
                elif query[self.num_dims + split_dim] <= split_value:
                    #print("[Split Queryset] query is left part")
                    left_part.append(query)
                elif query[split_dim] < split_value and query[self.num_dims + split_dim] > split_value:
                    #print("[Split Queryset] query is mid part")
                    mid_part.append(query)
                else:
                    #print("[Split Queryset] query is nothing")
                    pass
            #print("[Split Queryset] left part:",len(left_part), "right part:",len(right_part),"mid part:",len(mid_part))
            return left_part, right_part, mid_part
    
    def query_result_size(self, query, approximate = False):
        '''
        get the query result's size on this node
        the approximate parameter is set to True, the use even distribution to approximate
        '''
        if query is None:
            return None
        
        result_size = 0
        if approximate:
            query_volume = 1
            volume = 1
            for d in range(self.num_dims):
                query_volume *= query[self.num_dims + d] - query[d]
                volume *= self.boundary[self.num_dims + d] - self.boundary[d]

            result_size = int(query_volume / volume * self.node_size)
        else:
            constraints = []
            for d in range(self.num_dims):
                constraint_L = self.dataset[:,d] >= query[d]
                constraint_U = self.dataset[:,d] <= query[self.num_dims + d]
                constraints.append(constraint_L)
                constraints.append(constraint_U)
            constraint = np.all(constraints, axis=0)
            result_size = np.count_nonzero(constraint)
        return result_size
    
    def split_query_MBRs(self, split_dim, split_value):
        if self.query_MBRs is not None:
            
            leaf_boundary = copy.deepcopy(self.boundary)
            right_boundary = copy.deepcopy(self.boundary)
            leaf_boundary[self.num_dims + split_dim] = split_value
            right_boundary[split_dim] = split_value
            
            left_part = [] # totally in left
            right_part = [] # totally in right
            mid_part = []
            for MBR in self.query_MBRs:
                if MBR.boundary[split_dim] >= split_value:
                    right_part.append(MBR)
                elif MBR.boundary[self.num_dims + split_dim] <= split_value:
                    left_part.append(MBR)
                elif MBR.boundary[split_dim] < split_value and MBR.boundary[self.num_dims + split_dim] > split_value:
                    mid_part.append(MBR)
                    
            # process each mid_part MBR
            overlap_left_part_queries = []
            overlap_right_part_queries = []
            for MBR in mid_part:
                for query in MBR.queries:
                    if query[split_dim] < split_value:
                        overlap_left_part_queries.append(query.copy())
                    if query[self.num_dims + split_dim] > split_value:
                        overlap_right_part_queries.append(query.copy())
                
            # generate MBRs for both part. Notice we cannot simply adjust the shape using original MBRs
            # originally, this step doest not bound it by the left part!!! 
            mid_part_left_MBRs = self.generate_query_MBRs(overlap_left_part_queries, leaf_boundary)
            mid_part_right_MBRs = self.generate_query_MBRs(overlap_right_part_queries, right_boundary)
            
            left_part += mid_part_left_MBRs
            right_part += mid_part_right_MBRs
            
            return left_part, right_part
    
    def generate_query_MBRs(self, queryset = None, parent_boundary = None):
        '''
        bound the overlapped queries in this partition into MBRs
        the MBRs will only contains the part inside this partition
        '''
        if queryset is None:
            queryset = self.queryset
        
        if len(queryset) == 0:
            return []
        
        query_MBRs = []
        for query in queryset:
            query_MBRs.append(QueryMBR(query, True))
            
        #print("before merged, number of query MBRs:", len(query_MBRs))
        
        while len(query_MBRs) >= 2:
            
            new_query_MBRs = []
            merged_qids = []

            for i in range(len(query_MBRs)-1):
                new_MBR = copy.deepcopy(query_MBRs[i])
                
                if i in merged_qids:
                    continue
                
                for j in range(i+1, len(query_MBRs)):
                    if j in merged_qids:
                        continue
                    if self.__is_overlap(query_MBRs[i].boundary, query_MBRs[j].boundary):
                        #print("merge:",i,j,"i boundary:",query_MBRs[i].boundary, "j boundary:",query_MBRs[j].boundary,"i queries:",query_MBRs[i].queries, "j queries:", query_MBRs[j].queries)
                        #print("before merge, new MBR queries:", new_MBR.queries)
                        new_MBR = self.__merge_2MBRs(new_MBR, query_MBRs[j])
                        #print("after merge",i,j, "new boundary:",new_MBR.boundary,"new queries:",new_MBR.queries)
                        merged_qids.append(j)
                
                new_query_MBRs.append(new_MBR)
                #print("for iteration",i, "current new_query_MBRs size:",len(new_query_MBRs))
            
            if len(query_MBRs)-1 not in merged_qids: # the last one
                new_query_MBRs.append(query_MBRs[-1])
            
            if len(query_MBRs) == len(new_query_MBRs):
                break
            else:
                query_MBRs = copy.deepcopy(new_query_MBRs)
        
        #print("after merged, number of query MBRs:", len(query_MBRs))
        
        # bound each query MBRs by its partition boundary, and calculate the result size
        for MBR in query_MBRs:
            if parent_boundary is None:
                MBR.boundary = self.__max_bound_single(MBR.boundary)
            else:
                MBR.boundary = self.__max_bound_single(MBR.boundary, parent_boundary)
            MBR.bound_size = self.query_result_size(MBR.boundary) # extend when necessry, not now
            #print("Initial MBR", MBR.boundary," bound size:",MBR.bound_size)
            for query in MBR.queries:
                MBR.query_result_size.append(self.query_result_size(query))
            MBR.total_query_result_size = sum(MBR.query_result_size)
        
        if parent_boundary is None:
            self.query_MBRs = query_MBRs
        
        return query_MBRs
    
    def extend_bound(self, bound, data_threshold, print_info = False, algorithm = 2):
        '''
        extend a bound to be at least b, assume the bound is within the partition boundary
        algorithm == 1: binary search on each dimension
        algorithm == 2: Ken's extend bound method
        '''
        # safe guard
        current_size = self.query_result_size(bound, approximate = False)
        if current_size >= data_threshold:
            return bound, current_size
        
        if algorithm == 1:
            side = 0
            for dim in range(self.num_dims): # or it cannot adapted to other dataset ! #[2,0,1,4,3,5,6]: reranged by distinct values
                if dim+1 > self.num_dims:
                    continue
                valid, bound, bound_size = self.__try_extend(bound, dim, 0, data_threshold, print_info) # lower side
                if print_info:
                    print("dim:",dim,"current bound:",bound,valid,bound_size)
                if valid:
                    break
                valid, bound, bound_size = self.__try_extend(bound, dim, 1, data_threshold, print_info) # upper side
                if print_info:
                    print("dim:",dim,"current bound:",bound,valid,bound_size)
                if valid:
                    break
            return bound, bound_size
        
        elif algorithm == 2:
            center = [(bound[i] + bound[i + self.num_dims]) / 2 for i in range(self.num_dims)]
            radius = [(bound[i + self.num_dims] - bound[i]) / 2 for i in range(self.num_dims)]
            f_records = []
            for point in self.dataset:
                dist_ratio = [abs(point[i] - center[i]) / radius[i] for i in range(self.num_dims)]
                max_dist_ratio = max(dist_ratio)
                f_records.append(max_dist_ratio)
            f_records.sort()
            threshold_ratio = f_records[data_threshold]
            extend_bound_lower = [center[i] - threshold_ratio*radius[i] for i in range(self.num_dims)]
            extend_bound_upper = [center[i] + threshold_ratio*radius[i] for i in range(self.num_dims)]
            extended_bound = extend_bound_lower + extend_bound_upper
            extended_bound = self.__max_bound_single(extended_bound)
            bound_size = self.query_result_size(extended_bound, approximate = False)
            return extended_bound, bound_size
        
    
    # = = = = = internal functions = = = = =
    
    def __try_extend(self, current_bound, try_dim, side, data_threshold, print_info = False):
        '''
        side = 0: lower side
        side = 1: upper side
        return whether this extend has made bound greater than b, current extended bound, and the size
        '''
        # first try the extreme case
        dim = try_dim
        if side == 1:
            dim += self.num_dims
            
        extended_bound = copy.deepcopy(current_bound)
        extended_bound[dim] = self.boundary[dim]
        
        bound_size = self.query_result_size(extended_bound, approximate = False)
        if bound_size < data_threshold:
            return False, extended_bound, bound_size
        
        # binary search in this extend direction
        L, U = None, None
        if side == 0:
            L, U = self.boundary[dim], current_bound[dim]
        else:
            L, U = current_bound[dim], self.boundary[dim]
        
        if print_info:
            print("L,U:",L,U)
        
        loop_count = 0
        while L < U and loop_count < 30:
            mid = (L+U)/2
            extended_bound[dim] = mid
            bound_size = self.query_result_size(extended_bound, approximate = False)
            if bound_size < data_threshold:
                L = mid
            elif bound_size > data_threshold:
                U = mid
                if U - L < 0.00001:
                    break
            else:
                break
            if print_info:
                print("loop,L:",L,"U:",U,"mid:",mid,"extended_bound:",extended_bound,"size:",bound_size)
            loop_count += 1
            
        return bound_size >= data_threshold, extended_bound, bound_size
        
    
    def __is_overlap(self, boundary, query):
        '''
        the difference between this function and the public is_overlap function lies in the boundary parameter
        '''
        if len(query) != 2 * self.num_dims:
            return -1 # error
        
        overlap_flag = True
        inside_flag = True
        
        for i in range(self.num_dims):
            if query[i] >= boundary[self.num_dims + i] or query[self.num_dims + i] <= boundary[i]:
                overlap_flag = False
                inside_flag = False
                return 0
            elif query[i] < boundary[i] or query[self.num_dims + i] > boundary[self.num_dims + i]:
                inside_flag = False
                
        if inside_flag:
            return 2
        elif overlap_flag:
            return 1
        else:
            return 0
        
    def __merge_2MBRs(self, MBR1, MBR2):
        '''
        merge 2 MBRs into 1 (the first one)
        in this step we do not consider whether the merged MBR exceeds the current partition
        '''
        #print("inside merge funtion, before merge, MBR1 queries:", MBR1.queries, " MBR2 queries:", MBR2.queries)
        for i in range(self.num_dims):
            MBR1.boundary[i] = min(MBR1.boundary[i], MBR2.boundary[i])
            MBR1.boundary[self.num_dims + i] = max(MBR1.boundary[self.num_dims + i], MBR2.boundary[self.num_dims + i])
        
        MBR1.queries += MBR2.queries
        MBR1.num_query += MBR2.num_query
        #print("inside merge funtion, after merge, MBR1 queries:", MBR1.queries, " MBR2 queries:", MBR2.queries)
        return MBR1
    
    def __if_merge_2MBRs(self, MBR1, MBR2):
        merged_MBR = copy.deepcopy(MBR1)
        for i in range(self.num_dims):
            merged_MBR.boundary[i] = min(merged_MBR.boundary[i], MBR2.boundary[i])
            merged_MBR.boundary[self.num_dims + i] = max(merged_MBR.boundary[self.num_dims + i], MBR2.boundary[self.num_dims + i])
        merged_MBR.queries += MBR2.queries
        merged_MBR.num_query += MBR2.num_query
        merged_MBR.bound_size = self.query_result_size(merged_MBR.boundary)
        return merged_MBR
    
    def __max_bound(self, queryset):
        '''
        bound the queries by their maximum bounding rectangle !NOTE it is for a collection of queries!!!
        then constraint the MBR by the node's boundary!
        
        the return bound is in the same form as boundary
        '''
        if len(queryset) == 0:
            return None
        #if len(queryset) == 1:
        #    pass, I don't think there will be shape issue here
        
        max_bound_L = np.amin(np.array(queryset)[:,0:self.num_dims],axis=0).tolist()
        # bound the lower side with the boundary's lower side
        max_bound_L = np.amax(np.array([max_bound_L, self.boundary[0:self.num_dims]]),axis=0).tolist()
        
        max_bound_U = np.amax(np.array(queryset)[:,self.num_dims:],axis=0).tolist()
        # bound the upper side with the boundary's upper side
        max_bound_U = np.amin(np.array([max_bound_U, self.boundary[self.num_dims:]]),axis=0).tolist()
        
        max_bound = max_bound_L + max_bound_U # concat
        return max_bound
    
    def __max_bound_single(self, query, parent_boundary = None):
        '''
        bound anything in the shape of query by the current partition boundary
        '''
        if parent_boundary is None:
            for i in range(self.num_dims):
                query[i] = max(query[i], self.boundary[i])
                query[self.num_dims + i] = min(query[self.num_dims + i], self.boundary[self.num_dims + i])
            return query
        else:
            for i in range(self.num_dims):
                query[i] = max(query[i], parent_boundary[i])
                query[self.num_dims + i] = min(query[self.num_dims + i], parent_boundary[self.num_dims + i])
            return query
        
    
    def __if_split_get_child(self, split_dim, split_value): # should I rename this to if_split_get_child
        '''
        return 2 child nodes if a split take place on given dimension with given value
        This function is only used to simplify the skip calculation process, it does not really split the node
        '''
        boundary1 = self.boundary.copy()
        boundary1[split_dim + self.num_dims] = split_value
        boundary2 = self.boundary.copy()
        boundary2[split_dim] = split_value
        child_node1 = PartitionNode(self.num_dims, boundary1)
        child_node2 = PartitionNode(self.num_dims, boundary2)
        return child_node1, child_node2

# The PartitionTree Class.
It maintain the structure of the whole partition process, including intermediate nodes and the final leaf partitions.
It also provides functions to maintain, serialize, and query the tree.

In [307]:
 class PartitionTree:
        '''
        The data structure that represent the partition layout, which also maintain the parent, children relation info
        Designed to provide efficient online query and serialized ability
        
        The node data structure could be checked from the PartitionNode class
        
        '''   
        def __init__(self, num_dims = 0, boundary = []):
            
            # the node id of root should be 0, its pid should be -1
            # note this initialization does not need dataset and does not set node size!

            self.pt_root = PartitionNode(num_dims, boundary, nid = 0, pid = -1, is_irregular_shape_parent = False, 
                                         is_irregular_shape = False, num_children = 0, children_ids = [], is_leaf = True, node_size = 0)
            self.nid_node_dict = {0: self.pt_root} # node id to node dictionary
            self.node_count = 1 # the root node
        
        # = = = = = public functions (API) = = = = =
        
        def save_tree(self, path):
            node_list = self.__generate_node_list(self.pt_root) # do we really need this step?
            serialized_node_list = self.__serialize(node_list)
            #print(serialized_node_list)
            np.savetxt(path, serialized_node_list, delimiter=',')
            return serialized_node_list
            
        def load_tree(self, path):
            serialized_node_list = genfromtxt(path, delimiter=',')
            self.__build_tree_from_serialized_node_list(serialized_node_list)
        
        def query_single(self, query, using_rtree_filter = False, print_info = False, redundant_partitions = None):
            '''
            query is in plain form, i.e., [l1,l2,...,ln, u1,u2,...,un]
            return the overlapped leaf partitions ids!
            redundant_partition: [(boundary, size)...]
            '''
            # used only when redundant_partition is given
            def check_inside(query, partition_boundary):
                num_dims = len(query)//2
                for i in range(num_dims):
                    if query[i] >= partition_boundary[i] and query[num_dims + i] <= partition_boundary[num_dims + i]:
                        pass
                    else:
                        return False
                return True
            
            if redundant_partitions is None:
                partition_ids = self.__find_overlapped_partition(self.pt_root, query, using_rtree_filter, print_info)
                return partition_ids
            else:
                # first check if a query is inside any redundant partition, find the smallest one
                costs = []
                for rp in redundant_partitions:
                    if check_inside(query, rp[0]):
                        costs.append(rp[1])
                if len(costs) == 0:
                    # if not, use regular approach
                    partition_ids = self.__find_overlapped_partition(self.pt_root, query, using_rtree_filter, print_info)
                    return partition_ids
                else:
                    # return the smallest rp size
                    return [-min(costs)] # the minus sign used to differentiate from partition ides
        
        def query_batch(self, queries):
            '''
            to be implemented
            '''
            pass
        
        def get_queryset_cost(self, queries):
            '''
            return the cost array directly
            '''
            costs = []
            for query in queries:
                overlapped_leaf_ids = self.query_single(query)
                cost = 0
                for nid in overlapped_leaf_ids:
                    cost += self.nid_node_dict[nid].node_size
                costs.append(cost)
            return costs
        
        def evaluate_query_cost(self, queries, print_result = False, using_rtree_filter = False, redundant_partitions = None):
            '''
            get the logical IOs of the queris
            return the average query cost
            '''
            total_cost = 0
            case = 0
            total_overlap_ids = {}
            case_cost = {}
            
            for query in queries:
                cost = 0
                overlapped_leaf_ids = self.query_single(query, using_rtree_filter, False, redundant_partitions)
                total_overlap_ids[case] = overlapped_leaf_ids
                for nid in overlapped_leaf_ids:
                    if nid >= 0:
                        cost += self.nid_node_dict[nid].node_size
                    else:
                        cost += (-nid) # redundant partition cost
                total_cost += cost
                case_cost[case] = cost
                case += 1
            
            if print_result:
                print("Total logical IOs:", total_cost)
                print("Average logical IOs:", total_cost // len(queries))
                for case, ids in total_overlap_ids.items():
                    print("query",case, ids, "cost:", case_cost[case])
            
            return total_cost // len(queries)
        
        def get_pid_for_data_point(self, point):
            '''
            get the corresponding leaf partition nid for a data point
            point: [dim1_value, dim2_value...], contains the same dimenions as the partition tree
            '''
            return self.__find_resided_partition(self.pt_root, point)
        
        def add_node(self, parent_id, child_node):
            child_node.nid = self.node_count
            self.node_count += 1
            
            child_node.pid = parent_id
            self.nid_node_dict[child_node.nid] = child_node
            
            child_node.depth = self.nid_node_dict[parent_id].depth + 1
            
            self.nid_node_dict[parent_id].children_ids.append(child_node.nid)
            self.nid_node_dict[parent_id].num_children += 1
            self.nid_node_dict[parent_id].is_leaf = False
        
        
        def apply_split(self, parent_nid, split_dim, split_value, split_type = 0, extended_bound = None, approximate = False,
                        pretend = False):
            '''
            split_type = 0: split a node into 2 sub-nodes by a given dimension and value, distribute dataset
            split_type = 1: split a node by bounding split (will create an irregular shape partition)
            split_type = 2: split a node by daul-bounding split (will create an irregular shape partition)
            split_type = 3: split a node by var-bounding split (multi MBRs), distribute dataset
            extended_bound is only used in split type 1
            approximate: used for measure query result size
            pretend: if pretend is True, return the split result, but do not apply this split
            '''
            parent_node = self.nid_node_dict[parent_nid]
            if pretend:
                parent_node = copy.deepcopy(self.nid_node_dict[parent_nid])
            
            child_node1, child_node2 = None, None
            
            if split_type == 0:
                
                #print("[Apply Split] Before split node", parent_nid, "node queryset:", parent_node.queryset, "MBRs:", parent_node.query_MBRs)
            
                # create sub nodes
                child_node1 = copy.deepcopy(parent_node)
                child_node1.boundary[split_dim + child_node1.num_dims] = split_value
                child_node1.children_ids = []

                child_node2 = copy.deepcopy(parent_node)
                child_node2.boundary[split_dim] = split_value
                child_node2.children_ids = []
                
                if parent_node.query_MBRs is not None:
                    MBRs1, MBRs2 = parent_node.split_query_MBRs(split_dim, split_value)
                    child_node1.query_MBRs = MBRs1
                    child_node2.query_MBRs = MBRs2
                    
                # if parent_node.dataset != None: # The truth value of an array with more than one element is ambiguous.
                # https://stackoverflow.com/questions/36783921/valueerror-when-checking-if-variable-is-none-or-numpy-array
                if parent_node.dataset is not None:
                    child_node1.dataset = parent_node.dataset[parent_node.dataset[:,split_dim] < split_value]
                    child_node1.node_size = len(child_node1.dataset)
                    child_node2.dataset = parent_node.dataset[parent_node.dataset[:,split_dim] >= split_value]
                    child_node2.node_size = len(child_node2.dataset)

                if parent_node.queryset is not None:
                    left_part, right_part, mid_part = parent_node.split_queryset(split_dim, split_value)
                    child_node1.queryset = left_part + mid_part
                    child_node2.queryset = right_part + mid_part
                    
                #print("[Apply Split] After split node", parent_nid, "left child queryset:", child_node1.queryset, "MBRs:", child_node1.query_MBRs)
                #print("[Apply Split] After split node", parent_nid, "right child queryset:", child_node2.queryset, "MBRs:", child_node2.query_MBRs)

                # update current node
                if not pretend:
                    self.add_node(parent_nid, child_node1)
                    self.add_node(parent_nid, child_node2)
                    self.nid_node_dict[parent_nid].split_type = "candidate cut"
            
            elif split_type == 1: # must reach leaf node, hence no need to maintain dataset and queryset any more
                
                child_node1 = copy.deepcopy(parent_node) # the bounding partition
                child_node2 = copy.deepcopy(parent_node) # the remaining partition, i.e., irregular shape
                
                child_node1.is_leaf = True
                child_node2.is_leaf = True
                
                child_node1.children_ids = []
                child_node2.children_ids = []
                
                max_bound = None
                if extended_bound is not None:
                    max_bound = extended_bound
                else:
                    max_bound = parent_node._PartitionNode__max_bound(parent_node.queryset)
                child_node1.boundary = max_bound
                child_node2.is_irregular_shape = True
                
                bound_size = parent_node.query_result_size(max_bound, approximate = False)
                remaining_size = parent_node.node_size - bound_size           
                child_node1.node_size = bound_size
                child_node2.node_size = remaining_size
                
                child_node1.partitionable = False
                child_node2.partitionable = False
                
                if not pretend:
                    self.add_node(parent_nid, child_node1)
                    self.add_node(parent_nid, child_node2)
                    self.nid_node_dict[parent_nid].is_irregular_shape_parent = True
                    self.nid_node_dict[parent_nid].split_type = "sole-bounding split"
            
            elif split_type == 2: # must reach leaf node, hence no need to maintain dataset and queryset any more
                
                child_node1 = copy.deepcopy(parent_node) # the bounding partition 1
                child_node2 = copy.deepcopy(parent_node) # the bounding partition 2
                child_node3 = copy.deepcopy(parent_node) # the remaining partition, i.e., irregular shape
                
                child_node1.is_leaf = True
                child_node2.is_leaf = True
                child_node3.is_leaf = True
                
                child_node1.children_ids = []
                child_node2.children_ids = []
                child_node3.children_ids = []
                
                left_part, right_part, mid_part = parent_node.split_queryset(split_dim, split_value)
                max_bound_1 = parent_node._PartitionNode__max_bound(left_part)
                max_bound_2 = parent_node._PartitionNode__max_bound(right_part)
                
                child_node1.boundary = max_bound_1
                child_node2.boundary = max_bound_2
                child_node3.is_irregular_shape = True          
                
                # Should we only consider the case when left and right cannot be further split? i.e., [b,2b)
                # this check logic is given in the PartitionAlgorithm, not here, as the split action should be general
                naive_left_size = np.count_nonzero(parent_node.dataset[:,split_dim] < split_value)
                naive_right_size = parent_node.node_size - naive_left_size

                # get (irregular-shape) sub-partition size
                bound_size_1 = parent_node.query_result_size(max_bound_1, approximate)
                if bound_size_1 is None: # there is no query within the left 
                    bound_size_1 = naive_left_size # use the whole left part as its size
               
                bound_size_2 = parent_node.query_result_size(max_bound_2, approximate)
                if bound_size_2 is None: # there is no query within the right
                    bound_size_2 = naive_right_size # use the whole right part as its size
               
                remaining_size = parent_node.node_size - bound_size_1 - bound_size_2
                
                child_node1.node_size = bound_size_1
                child_node2.node_size = bound_size_2
                child_node3.node_size = remaining_size
                
                child_node1.partitionable = False
                child_node2.partitionable = False
                child_node3.partitionable = False
                
                if not pretend:
                    self.add_node(parent_nid, child_node1)
                    self.add_node(parent_nid, child_node2)
                    self.add_node(parent_nid, child_node3)
                    self.nid_node_dict[parent_nid].is_irregular_shape_parent = True
                    self.nid_node_dict[parent_nid].split_type = "dual-bounding split"
            
            elif split_type == 3: # new bounding split, create a collection of MBR partitions
                
                remaining_size = parent_node.node_size
                for MBR in parent_node.query_MBRs:
                    child_node = copy.deepcopy(parent_node)
                    child_node.is_leaf = True
                    child_node.children_ids = []
                    child_node.boundary = MBR.boundary
                    child_node.node_size = MBR.bound_size
                    child_node.partitionable = False
                    remaining_size -= child_node.node_size
                    child_node.dataset = self.__extract_sub_dataset(parent_node.dataset, child_node.boundary)
                    child_node.queryset = MBR.queries # no other queries could overlap this MBR, or it's invalid
                    child_node.query_MBRs = [MBR]
                        
                    if not pretend:
                        self.add_node(parent_nid, child_node)
                
                # the last irregular shape partition, we do not need to consider its dataset
                child_node = copy.deepcopy(parent_node)
                child_node.is_leaf = True
                child_node.children_ids = []
                child_node.is_irregular_shape = True
                child_node.node_size = remaining_size
                child_node.partitionable = False
                child_node.dataset = None
                child_node.queryset = None
                child_node.query_MBRs = None
                
                if not pretend:
                    self.add_node(parent_nid, child_node)
                    self.nid_node_dict[parent_nid].is_irregular_shape_parent = True
                    self.nid_node_dict[parent_nid].split_type = "var-bounding split"
            
            else:
                print("Invalid Split Type!")
            
            if not pretend:
                del self.nid_node_dict[parent_nid].dataset
                del self.nid_node_dict[parent_nid].queryset
                #del self.nid_node_dict[parent_nid].query_MBRs
                #self.nid_node_dict[parent_nid] = parent_node
                
            return child_node1, child_node2
        
        def get_leaves(self, use_partitionable = False):
            nodes = []
            if use_partitionable:
                for nid, node in self.nid_node_dict.items():
                    if node.is_leaf and node.partitionable:
                        nodes.append(node)
            else:
                for nid, node in self.nid_node_dict.items():
                    if node.is_leaf:
                        nodes.append(node)
            return nodes
        
        def visualize(self, dims = [0, 1], queries = [], path = None, focus_region = None, add_text = True, use_sci = False):
            '''
            visualize the partition tree's leaf nodes
            focus_region: in the shape of boundary
            '''
            if len(dims) == 2:
                self.__visualize_2d(dims, queries, path, focus_region, add_text, use_sci)
            else:
                self.__visualize_3d(dims[0:3], queries, path, focus_region)
            
        
        # = = = = = internal functions = = = = =
        
        def __extract_sub_dataset(self, dataset, query):
            constraints = []
            num_dims = self.pt_root.num_dims
            for d in range(num_dims):
                constraint_L = dataset[:,d] >= query[d]
                constraint_U = dataset[:,d] <= query[num_dims + d]
                constraints.append(constraint_L)
                constraints.append(constraint_U)
            constraint = np.all(constraints, axis=0)
            sub_dataset = dataset[constraint]
            return sub_dataset
        
        def __generate_node_list(self, node):
            '''
            recursively add childrens into the list
            '''
            node_list = [node]
            for nid in node.children_ids:
                node_list += self.__generate_node_list(self.nid_node_dict[nid])
            return node_list
        
        def __serialize(self, node_list):
            '''
            convert object to attributes to save
            '''
            serialized_node_list = []
            for node in node_list:
                # follow the same order of attributes in partition class
                attributes = [node.num_dims]
                #attributes += node.boundary
                if isinstance(node.boundary, list):
                    attributes += node.boundary
                else:
                    attributes += node.boundary.tolist()
                attributes.append(node.nid) # node id = its ow id
                attributes.append(node.pid) # parent id
                attributes.append(1 if node.is_irregular_shape_parent else 0)
                attributes.append(1 if node.is_irregular_shape else 0)
                attributes.append(node.num_children) # number of children
                #attributes += node.children_ids
                attributes.append(1 if node.is_leaf else 0)
                attributes.append(node.node_size)
                
                serialized_node_list.append(attributes)
            return serialized_node_list
        
        def __build_tree_from_serialized_node_list(self, serialized_node_list):
            
            self.pt_root = None
            self.nid_node_dict.clear()
            pid_children_ids_dict = {}
            
            for serialized_node in serialized_node_list:
                num_dims = int(serialized_node[0])
                boundary = serialized_node[1: 1+2*num_dims]
                nid = int(serialized_node[1+2*num_dims]) # node id
                pid = int(serialized_node[2+2*num_dims]) # parent id
                is_irregular_shape_parent = False if serialized_node[3+2*num_dims] == 0 else True
                is_irregular_shape = False if serialized_node[4+2*num_dims] == 0 else True
                num_children = int(serialized_node[5+2*num_dims])
#                 children_ids = []
#                 if num_children != 0:
#                     children_ids = serialized_node[1+5+2*num_dims: 1+num_children+1+5+2*num_dims] # +1 for the end exclusive
#                 is_leaf = False if serialized_node[1+num_children+5+2*num_dims] == 0 else True
#                 node_size = serialized_node[2+num_children+5+2*num_dims] # don't use -1 in case of match error
                is_leaf = False if serialized_node[6+2*num_dims] == 0 else True
                node_size = int(serialized_node[7+2*num_dims])
                
                node = PartitionNode(num_dims, boundary, nid, pid, is_irregular_shape_parent, 
                                     is_irregular_shape, num_children, [], is_leaf, node_size) # let the children_ids empty
                self.nid_node_dict[nid] = node # update dict
                
                if node.pid in pid_children_ids_dict:
                    pid_children_ids_dict[node.pid].append(node.nid)
                else:
                    pid_children_ids_dict[node.pid] = [node.nid]
            
            # make sure the irregular shape partition is placed at the end of the child list
            for pid, children_ids in pid_children_ids_dict.items():
                if pid == -1:
                    continue
                if self.nid_node_dict[pid].is_irregular_shape_parent and not self.nid_node_dict[children_ids[-1]].is_irregular_shape:
                    # search for the irregular shape partition
                    new_children_ids = []
                    irregular_shape_id = None
                    for nid in children_ids:
                        if self.nid_node_dict[nid].is_irregular_shape:
                            irregular_shape_id = nid
                        else:
                            new_children_ids.append(nid)
                    new_children_ids.append(irregular_shape_id)
                    self.nid_node_dict[pid].children_ids = new_children_ids
                else:
                    self.nid_node_dict[pid].children_ids = children_ids
            
            self.pt_root = self.nid_node_dict[0]
        
        def __bound_query_by_boundary(self, query, boundary):
            '''
            bound the query by a node's boundary
            '''
            bounded_query = query.copy()
            num_dims = self.pt_root.num_dims
            for dim in range(num_dims):
                bounded_query[dim] = max(query[dim], boundary[dim])
                bounded_query[num_dims+dim] = min(query[num_dims+dim], boundary[num_dims+dim])
            return bounded_query
        
        def __find_resided_partition(self, node, point):
            '''
            for data point only
            '''
            #print("enter function!")
            if node.is_leaf:
                #print("within leaf",node.nid)
                if node.is_contain(point):
                    return node.nid
            
            for nid in node.children_ids:
                if self.nid_node_dict[nid].is_contain(point):
                    #print("within child", nid, "of parent",node.nid)
                    return self.__find_resided_partition(self.nid_node_dict[nid], point)
            
            #print("no children of node",node.nid,"contains point")
            return -1
        
        def __find_overlapped_partition(self, node, query, using_rtree_filter = False, print_info = False):
            
            if print_info:
                print("Enter node", node.nid)
                
            if node.is_leaf:
                if print_info:
                    print("node", node.nid, "is leaf")
                
                if using_rtree_filter and node.rtree_filters is not None:
                    for mbr in node.rtree_filters:
                        if node._PartitionNode__is_overlap(mbr, query) > 0:
                            return [node.nid]
                    return []
                else:
                    if print_info and node.is_overlap(query) > 0:
                        print("node", node.nid, "is added as result")
                    return [node.nid] if node.is_overlap(query) > 0 else []
            
            node_id_list = []
            if node.is_overlap(query) <= 0:
                if print_info:
                    print("node", node.nid, "is not overlap with the query")
                pass
            elif node.is_irregular_shape_parent: # special process for irregular shape partitions!
                if print_info:
                    print("node", node.nid, "is_irregular_shape_parent")
                    
                # bound the query with parent partition's boundary, that's for the inside case determination
                bounded_query = self.__bound_query_by_boundary(query, node.boundary)
                
                overlap_irregular_shape_node_flag = True
                for nid in node.children_ids[0: -1]: # except the last one, should be the irregular shape partition
                    overlap_case = self.nid_node_dict[nid].is_overlap(bounded_query)
                    if overlap_case == 2: # inside
                        node_id_list += self.__find_overlapped_partition(self.nid_node_dict[nid], query, using_rtree_filter, print_info)
                        overlap_irregular_shape_node_flag = False
                        if print_info:
                            print("query within children node", nid, "irregular shape neighbors")
                        break
                    if overlap_case == 1: # overlap
                        node_id_list += self.__find_overlapped_partition(self.nid_node_dict[nid], query, using_rtree_filter, print_info)
                        overlap_irregular_shape_node_flag = True
                        if print_info:
                            print("query overlap children node", nid, "irregular shape neighbors")
                if overlap_irregular_shape_node_flag:
                    if print_info:
                        print("query overlap irregular shape child node", node.children_ids[-1])
                    node_id_list.append(node.children_ids[-1])
            else:
                if print_info:
                    print("searching childrens for node", node.nid)
                for nid in node.children_ids:
                    node_id_list += self.__find_overlapped_partition(self.nid_node_dict[nid], query, using_rtree_filter, print_info)
            return node_id_list
        
        def __visualize_2d(self, dims, queries = [], path = None, focus_region = None, add_text = True, use_sci = False):
            '''
            focus_region: in the shape of boundary
            '''
            fig, ax = plt.subplots(1)
            num_dims = self.pt_root.num_dims
            
            plt.xlim(self.pt_root.boundary[dims[0]], self.pt_root.boundary[dims[0]+num_dims])
            plt.ylim(self.pt_root.boundary[dims[1]], self.pt_root.boundary[dims[1]+num_dims])
            
            
            leaves = self.get_leaves()
            for leaf in leaves: 
                lower1 = leaf.boundary[dims[0]]
                lower2 = leaf.boundary[dims[1]]             
                upper1 = leaf.boundary[dims[0]+num_dims]
                upper2 = leaf.boundary[dims[1]+num_dims]

                rect = Rectangle((lower1,lower2),upper1-lower1,upper2-lower2,fill=False,edgecolor='g',linewidth=1)
                if add_text:
                    ax.text(lower1, lower2, leaf.nid, fontsize=7)
                ax.add_patch(rect)
            
            case = 0
            for query in queries:
                lower1 = query[dims[0]]
                lower2 = query[dims[1]]  
                upper1 = query[dims[0]+num_dims]
                upper2 = query[dims[1]+num_dims]    

                rect = Rectangle((lower1,lower2),upper1-lower1,upper2-lower2,fill=False,edgecolor='r',linewidth=1)
                if add_text:
                    ax.text(upper1, upper2, case, color='b',fontsize=7)
                case += 1
                ax.add_patch(rect)

            ax.set_xlabel('dimension 1', fontsize=15)
            ax.set_ylabel('dimension 2', fontsize=15)
            if use_sci:
                plt.ticklabel_format(axis="both", style="sci", scilimits=(0,0))
            #plt.xticks(np.arange(0, 400001, 100000), fontsize=10)
            #plt.yticks(np.arange(0, 20001, 5000), fontsize=10)

            plt.tight_layout() # preventing clipping the labels when save to pdf
            if focus_region is not None:
                
                # reform focus region into interleaf format
                formated_focus_region = []
                for i in range(2):
                    formated_focus_region.append(focus_region[i])
                    formated_focus_region.append(focus_region[2+i])
                
                plt.axis(formated_focus_region)

            if path is not None:
                fig.savefig(path)

            plt.show()
        
        %matplotlib notebook
        def __visualize_3d(self, dims, queries = [], path = None, focus_region = None):
            fig = plt.figure()
            ax = Axes3D(fig)
            
            num_dims = self.pt_root.num_dims
            plt.xlim(self.pt_root.boundary[dims[0]], self.pt_root.boundary[dims[0]+num_dims])
            plt.ylim(self.pt_root.boundary[dims[1]], self.pt_root.boundary[dims[1]+num_dims])
            ax.set_zlim(self.pt_root.boundary[dims[2]], self.pt_root.boundary[dims[2]+num_dims])
            
            leaves = self.get_leaves()
            for leaf in leaves:
                
                L1 = leaf.boundary[dims[0]]
                L2 = leaf.boundary[dims[1]]
                L3 = leaf.boundary[dims[2]]      
                U1 = leaf.boundary[dims[0]+num_dims]
                U2 = leaf.boundary[dims[1]+num_dims]
                U3 = leaf.boundary[dims[2]+num_dims]
                
                # the 12 lines to form a rectangle
                x = [L1, U1]
                y = [L2, L2]
                z = [L3, L3]
                ax.plot3D(x,y,z,color="g")
                y = [U2, U2]
                ax.plot3D(x,y,z,color="g")
                z = [U3, U3]
                ax.plot3D(x,y,z,color="g")
                y = [L2, L2]
                ax.plot3D(x,y,z,color="g")

                x = [L1, L1]
                y = [L2, U2]
                z = [L3, L3]
                ax.plot3D(x,y,z,color="g")
                x = [U1, U1]
                ax.plot3D(x,y,z,color="g")
                z = [U3, U3]
                ax.plot3D(x,y,z,color="g")
                x = [L1, L1]
                ax.plot3D(x,y,z,color="g")

                x = [L1, L1]
                y = [L2, L2]
                z = [L3, U3]
                ax.plot3D(x,y,z,color="g")
                x = [U1, U1]
                ax.plot3D(x,y,z,color="g")
                y = [U2, U2]
                ax.plot3D(x,y,z,color="g")
                x = [L1, L1]
                ax.plot3D(x,y,z,color="g")
            
            for query in queries:

                L1 = query[dims[0]]
                L2 = query[dims[1]]
                L3 = query[dims[2]]
                U1 = query[dims[0]+num_dims]
                U2 = query[dims[1]+num_dims]
                U3 = query[dims[2]+num_dims]

                # the 12 lines to form a rectangle
                x = [L1, U1]
                y = [L2, L2]
                z = [L3, L3]
                ax.plot3D(x,y,z,color="r")
                y = [U2, U2]
                ax.plot3D(x,y,z,color="r")
                z = [U3, U3]
                ax.plot3D(x,y,z,color="r")
                y = [L2, L2]
                ax.plot3D(x,y,z,color="r")

                x = [L1, L1]
                y = [L2, U2]
                z = [L3, L3]
                ax.plot3D(x,y,z,color="r")
                x = [U1, U1]
                ax.plot3D(x,y,z,color="r")
                z = [U3, U3]
                ax.plot3D(x,y,z,color="r")
                x = [L1, L1]
                ax.plot3D(x,y,z,color="r")

                x = [L1, L1]
                y = [L2, L2]
                z = [L3, U3]
                ax.plot3D(x,y,z,color="r")
                x = [U1, U1]
                ax.plot3D(x,y,z,color="r")
                y = [U2, U2]
                ax.plot3D(x,y,z,color="r")
                x = [L1, L1]
                ax.plot3D(x,y,z,color="r")

            if path is not None:
                fig.savefig(path)

            plt.show()

# The PartitionAlgorithm Class.
It provides the algorithms to build a PartitionTree.

In [270]:
class PartitionAlgorithm:
    '''
    The partition algorithms, inlcuding NORA, QdTree and kd-tree.
    '''
    def __init__(self, data_threshold = 10000):
        self.partition_tree = None
        self.data_threshold = data_threshold
    
    
    # = = = = = public functions (API) = = = = =
    
    def InitializeWithPAW(self, queries, num_dims, boundary, dataset, data_threshold, max_active_ratio = 3, strategy = 1,
                          using_beam_search = False, candidate_size = 1, candidate_depth = 1, beam_search_mode = 0):
        self.partition_tree = PartitionTree(num_dims, boundary)
        self.partition_tree.pt_root.node_size = len(dataset)
        self.partition_tree.pt_root.dataset = dataset
        self.partition_tree.pt_root.queryset = queries # assume all queries overlap with the boundary
        self.partition_tree.pt_root.generate_query_MBRs()
        start_time = time.time()
        if using_beam_search:
            self.__PAW_Beam_Search(data_threshold, queries, max_active_ratio, strategy, candidate_size, candidate_depth, 
                                   None, beam_search_mode)
        else:
            self.__PAW(data_threshold, max_active_ratio, strategy)
        end_time = time.time()        
        print("Build Time (s):", end_time-start_time)
    
    def InitializeWithNORA(self, queries, num_dims, boundary, dataset, data_threshold, using_1_by_1 = False, using_kd = False, 
                           depth_limit = None, return_query_cost = False, using_beam_search = False, candidate_size = 2, 
                           candidate_depth = 2):
        '''
        using_1_BY_1: using some optimizations including new bounding split and bounding split in internal node
        using_kd: split leaf node by kd if still greater than b
        '''
        self.partition_tree = PartitionTree(num_dims, boundary)
        self.partition_tree.pt_root.node_size = len(dataset)
        self.partition_tree.pt_root.dataset = dataset
        self.partition_tree.pt_root.queryset = queries # assume all queries overlap with the boundary
        start_time = time.time()
        if using_1_by_1:
            self.partition_tree.pt_root.generate_query_MBRs()
            if using_beam_search:
                self.__NORA_Beam_Search(data_threshold, candidate_size, candidate_depth)
            else:
                self.__NORA_1_BY_1(data_threshold, depth_limit)
        else:
            self.__NORA(data_threshold, depth_limit)
        if using_kd:
            for leaf in self.partition_tree.get_leaves():
                if leaf.is_irregular_shape or leaf.is_irregular_shape_parent:
                    continue
                self.__KDT(0, data_threshold, leaf)
        end_time = time.time()
        
        if return_query_cost:
            return self.partition_tree.evaluate_query_cost(training_set, False)
        
        print("Build Time (s):", end_time-start_time)
    
    def InitializeWithQDT(self, queries, num_dims, boundary, dataset, data_threshold):
        '''
        # should I also store the candidate cut positions in Partition Node ?
        The dimension of queries should match the dimension of boundary and dataset!
        '''
        self.partition_tree = PartitionTree(num_dims, boundary)
        self.partition_tree.pt_root.node_size = len(dataset)
        self.partition_tree.pt_root.dataset = dataset
        self.partition_tree.pt_root.queryset = queries # assume all queries overlap with the boundary
        start_time = time.time()
        self.__QDT(data_threshold)
        end_time = time.time()
        #print("Build Time (s):", end_time-start_time)
     
    def InitializeWithKDT(self, num_dims, boundary, dataset, data_threshold):
        '''
        num_dims denotes the (first) number of dimension to split, usually it should correspond with the boundary
        rewrite the KDT using PartitionTree data structure
        call the recursive __KDT methods
        '''
        self.partition_tree = PartitionTree(num_dims, boundary)
        self.partition_tree.pt_root.node_size = len(dataset)
        self.partition_tree.pt_root.dataset = dataset
        # start from the first dimension
        start_time = time.time()
        self.__KDT(0, data_threshold, self.partition_tree.pt_root)
        end_time = time.time()
        #print("Build Time (s):", end_time-start_time)
    
    def ContinuePartitionWithKDT(self, existing_partition_tree, data_threshold):
        '''
        pass in a PartitionTree instance
        then keep partition its leaf nodes with KDT, if available
        '''
        self.partition_tree = existing_partition_tree
        leaves = existing_partition_tree.get_leaves()
        for leaf in leaves:
            self.__KDT(0, data_threshold, leaf)
    
    def CreateRtreeFilter(self, data_threshold, capacity_ratio = 0.5):
        '''
        create Rtree MBRs for leaf nodes as a filter layer for skew dataset
        '''
        for leaf in self.partition_tree.get_leaves():
            if leaf.is_irregular_shape:
                continue
            else:
                MBRs = self.__CreateRtreeMBRs(leaf.dataset, data_threshold, capacity_ratio)   
                leaf.rtree_filters = MBRs
    
    def RedundantPartitions(self, redundant_space, queries, dataset, data_threshold, weight = None):
        '''
        create redundant partitions to maximize the cost deduction, the extra space is limited by the redundant space
        this is a typical dynamic programming problem
        '''
        old_costs = self.partition_tree.get_queryset_cost(queries)
        spaces = self.__real_result_size(dataset, queries)
        spaces = [max(s, data_threshold) for s in spaces]
        gains = [old_costs[i]-spaces[i] for i in range(len(queries))]
        
        #print("old cost:",old_costs)
        #print("spaces:", spaces)
        #print("gains:", gains)
        
        if weight is not None: # the expected query amount
            gains = [gains[i]*weight[i] for i in range(len(queries))]
        
        max_total_gain, materialized_queries = self.__RPDP(0, gains, spaces, redundant_space, {})
        
        query_size = len(queries) if weight is None else sum(weight)
        old_query_cost = sum(old_costs)
        old_average_query_cost = old_query_cost / query_size
        new_query_cost = old_query_cost - max_total_gain
        new_average_query_cost = new_query_cost / query_size
        
        #print("max total gain:", max_total_gain)
        #print("old_query_cost:", old_query_cost, "new_query_cost:", new_query_cost)
        #print("old_average_query_cost:", old_average_query_cost, "new_average_query_cost:", new_average_query_cost)
        
        return max_total_gain, materialized_queries
    
    # = = = = = internal functions = = = = =
    
    
    def __CreateRtreeMBRs(self, dataset, data_threshold, capacity_ratio = 0.5):
    
        def DatasetGenerator(dataset):
            for i in range(len(dataset)):
                yield(i, tuple(dataset[i].tolist()+dataset[i].tolist()), dataset[i])
            return

        p = index.Property()
        p.dimension = dataset.shape[1]
        p.leaf_capacity = int(capacity_ratio * data_threshold) # cannot be less than 100, indicate the maximum capacity
        p.fill_factor = 0.9
        p.overwrite = True
        p.interleaved = False

        rtree_idx = index.Index(DatasetGenerator(dataset), properties = p)
    #     rtree_idx = index.Index(properties = p) # Rtree index for queries
    #     for i in range(len(dataset)):
    #         rtree_idx.insert(i, tuple(dataset[i].tolist()+dataset[i].tolist())) # Check whether this operation is correct !!!

        leaves = rtree_idx.leaves()
        #print(leaves)

        MBRs = [] # check whether the False interleaved property will make the mbr not interleaved? -> Result: Yes
        for leaf in leaves:
            MBRs.append(leaf[2]) # [0]: id?; [1]: [records...]; [2]: boundary

        return MBRs
    
    
    def __RPDP(self, i, gains, spaces, total_space, i_space_dict):
        '''
        i: the current query id to be considered
        total_space: the remaining redundant space
        '''
        key = (i, total_space)
        if key in i_space_dict:
            return i_space_dict[key]
        
        if i >= len(gains): # end
            return (0, [])
        
        gain, Q = None, None
        if total_space > spaces[i]:
            # create RP for this query
            (gain1, Q1) = self.__RPDP(i+1, gains, spaces, total_space-spaces[i], i_space_dict)
            (gain1, Q1) = (gains[i] + gain1, [i] + Q1)
            # do not create RP for this query
            (gain2, Q2) = self.__RPDP(i+1, gains, spaces, total_space, i_space_dict) 
            (gain, Q) = (gain1, Q1) if gain1 >= gain2 else (gain2, Q2)
        else:
            # do not create RP for this query
            (gain, Q) = self.__RPDP(i+1, gains, spaces, total_space, i_space_dict)
        
        i_space_dict[key] = (gain, Q)
        return (gain, Q)
        
    
    def __real_result_size(self, dataset, queries):
        num_dims = dataset.shape[1]
        results = []
        for query in queries:
            constraints = []
            for d in range(num_dims):
                constraint_L = dataset[:,d] >= query[d]
                constraint_U = dataset[:,d] <= query[num_dims + d]
                constraints.append(constraint_L)
                constraints.append(constraint_U)
            constraint = np.all(constraints, axis=0)
            result_size = np.count_nonzero(constraint)
            results.append(result_size)
        return results
    
    def __max_bound(self, num_dims, queryset):
        '''
        bound the queries by their maximum bounding rectangle
        '''
        max_bound_L = np.amin(np.array(queryset)[:,0:num_dims],axis=0).tolist()
        max_bound_U = np.amax(np.array(queryset)[:,num_dims:],axis=0).tolist()
        max_bound = max_bound_L + max_bound_U # concat
        return max_bound
    
    
    def __PAW_Beam_Search(self, data_threshold, queries, max_active_ratio = 3, strategy = 1, candidate_size = 1, 
                          candidate_depth = 1, depth_limit = None, beam_search_mode = 0):
        
        '''
        strategy: related to group split
        beam_search_mode: 0 = subsequent calls also use beam search; 1 = subsequent calls only use the best
        '''
        
        if depth_limit is not None and depth_limit < candidate_depth:
            print("-> Enter beam search for leaf", self.partition_tree.pt_root.nid, "depth_limit:", depth_limit,"/", candidate_depth)
        
        def len_DQ(someset):
            if someset is not None:
                return len(someset)
            return 0
        
        CanSplit = True
        while CanSplit:
            CanSplit = False           
            
            # DO NOT CONSIDER Partitionable!
            leaves = self.partition_tree.get_leaves()
            #print("# number of leaf nodes:",len(leaves))
            
            whole_partition_cost = 0
            
            for leaf in leaves:
                
                if leaf.node_size < 2 * data_threshold or leaf.queryset is None or leaf.is_irregular_shape or leaf.no_valid_partition:
                    whole_partition_cost += len_DQ(leaf.queryset) * leaf.node_size
                    print("X IGNORE leaf", leaf.nid, "leaf node dataset", len_DQ(leaf.dataset), "queryset size:",len_DQ(leaf.queryset))
                    print("[ignore leaf] current whole partition cost:", whole_partition_cost, " + step:", len_DQ(leaf.queryset) * leaf.node_size)
                    continue
                print("O CONSIDER leaf node id:",leaf.nid, "leaf node dataset", len_DQ(leaf.dataset), "queryset size:",len_DQ(leaf.queryset))
                    
                split_candidates = []
                skip = 0
                
                # always extend the candidate cut with medians
                candidate_cuts = leaf.get_candidate_cuts(True)
                
                # try general group split
                if leaf.node_size <= max_active_ratio * data_threshold:
                    if strategy == 0:
                        skip = leaf.if_general_group_split(data_threshold)
                    elif strategy == 1:
                        skip = leaf.if_general_group_split_2(data_threshold)
                    split_candidates.append((skip, None, None, 3))
                
                for split_dim, split_value in candidate_cuts:

                    # first try normal split
                    valid, skip, left_size, right_size = leaf.if_split(split_dim, split_value, data_threshold)
                    if valid:
                        split_candidates.append((skip, split_dim, split_value, 0))
                        
                split_candidates.sort(key=lambda item: item[0], reverse=True) # from the most skip gain to least
                
                best_split = None
                min_cost = float('inf')
                
                if depth_limit is None:
                    depth_limit = candidate_depth
                if depth_limit == 0:
                    # return the cost of max skip split directly       
                    leaf_cost = len(leaf.queryset) * leaf.node_size - split_candidates[0][0] # original cost - skip = after split cost
                    whole_partition_cost += leaf_cost
                    print("[depth limit 0] current whole partition cost:", whole_partition_cost, "+ step:",leaf_cost,"skip:",split_candidates[0][0])
                    if candidate_depth != 0:
                        continue
                    else:
                        best_split = split_candidates[0]
                        min_cost = leaf_cost
                    #return self.partition_tree.evaluate_query_cost(training_set, False)
                
                if candidate_depth != 0:
                    print("START beam search for node", leaf.nid)
                    if beam_search_mode == 0 or depth_limit == candidate_depth:
                    
                        for i in range(min(candidate_size, len(split_candidates))):

                            # I need to copy myself for the split
                            temp_node = copy.deepcopy(leaf)
                            temp_queries = copy.deepcopy(leaf.queryset)

                            # apply this split on temp node
                            temp_tree = PartitionTree(temp_node.num_dims, temp_node.boundary)
                            temp_tree.pt_root = temp_node
                            temp_tree.nid_node_dict = {temp_node.nid:  temp_node}
                            temp_tree.node_count = temp_node.nid + 1
                            temp_tree.apply_split(temp_node.nid, split_candidates[i][1], split_candidates[i][2], split_candidates[i][3])

                            temp_algo = PartitionAlgorithm()
                            temp_algo.partition_tree = temp_tree

                            child_cost = temp_algo._PartitionAlgorithm__PAW_Beam_Search(data_threshold, temp_queries, max_active_ratio, 
                                                                                        strategy, candidate_size, candidate_depth, 
                                                                                        depth_limit-1, beam_search_mode)
                            print("<- beam search result cost for leaf", leaf.nid, "cut",i, "type:",split_candidates[i][3],"cost:",child_cost)
                            if child_cost < min_cost:
                                best_split = split_candidates[i]
                                min_cost = child_cost
                                
                    elif beam_search_mode == 1:
                        temp_node = copy.deepcopy(leaf)
                        temp_queries = copy.deepcopy(leaf.queryset)

                        # apply this split on temp node
                        temp_tree = PartitionTree(temp_node.num_dims, temp_node.boundary)
                        temp_tree.pt_root = temp_node
                        temp_tree.nid_node_dict = {temp_node.nid:  temp_node}
                        temp_tree.node_count = temp_node.nid + 1
                        temp_tree.apply_split(temp_node.nid, split_candidates[0][1], split_candidates[0][2], split_candidates[0][3])

                        temp_algo = PartitionAlgorithm()
                        temp_algo.partition_tree = temp_tree

                        child_cost = temp_algo._PartitionAlgorithm__PAW_Beam_Search(data_threshold, temp_queries, max_active_ratio, 
                                                                                    strategy, candidate_size, candidate_depth, 
                                                                                    depth_limit-1, beam_search_mode)
                        min_cost = child_cost

                    if depth_limit != candidate_depth:
                        whole_partition_cost += min_cost
                        print("[after beam search] current whole partition cost:", whole_partition_cost, "+step:",min_cost)
                        continue
                        #return min_cost
                
                # if this is the current level (i.e., after the beam search returned), apply the search result
                print("= = = AFTER beam search for node", leaf.nid)
                if min_cost < len(leaf.queryset) * leaf.node_size and best_split is not None and best_split[0] > 0:
                    # if the cost become smaller, apply the cut
                    self.partition_tree.apply_split(leaf.nid, best_split[1], best_split[2], best_split[3])
                    print("= = = SPLIT on node id:", leaf.nid, "split type:", best_split[3])
                    CanSplit = True
                else:
                    print("leaf node id:",leaf.nid,"does not have valid split")
                    leaf.no_valid_partition = True
            
            if depth_limit != candidate_depth and candidate_depth != 0:
                return whole_partition_cost
            
        # if it already met the bottom of the tree, (even it hasn't met the depth limit)
        return int(self.partition_tree.evaluate_query_cost(queries) * len(queries))
    
    def __PAW(self, data_threshold, max_active_ratio = 3, strategy = 1):
        '''
        using Ken's Algorithm 1: iteratively bound 2 MBRs if they are less than b
        max_active_ratio: when the partition size is <= ratio * b: consider group split
        strategy: 0 = using merge without considering overlap; 1 = using extend, aborted when overlapped
        ''' 
        CanSplit = True
        while CanSplit:
            CanSplit = False           
            
            # DO NOT CONSIDER Partitionable!
            leaves = self.partition_tree.get_leaves()
            #print("# number of leaf nodes:",len(leaves))
            for leaf in leaves:
                
                #print("current leaf node id:",leaf.nid, "leaf node dataset size:", leaf.node_size)
                if leaf.node_size < 2 * data_threshold or leaf.queryset is None or leaf.is_irregular_shape or leaf.no_valid_partition:
                    continue
                    
                # get best candidate cut position
                skip, max_skip, max_skip_split_dim, max_skip_split_value, max_skip_split_type = 0, 0, 0, 0, 0
                
                # always extend the candidate cut with medians
                candidate_cuts = leaf.get_candidate_cuts(True)
                
                # try general group split
                if leaf.node_size <= max_active_ratio * data_threshold:
                    if strategy == 0:
                        skip = leaf.if_general_group_split(data_threshold)
                    elif strategy == 1:
                        skip = leaf.if_general_group_split_2(data_threshold)
                        
                    #print("PAW: general group split is tried")
                    if skip > max_skip:
                        max_skip, max_skip_split_dim, max_skip_split_value, max_skip_split_type = skip, None, None, 3
                
                for split_dim, split_value in candidate_cuts:

                    # first try normal split
                    valid, skip, left_size, right_size = leaf.if_split(split_dim, split_value, data_threshold)
                    if valid and skip > max_skip:
                        max_skip, max_skip_split_dim, max_skip_split_value, max_skip_split_type = skip, split_dim, split_value, 0

                if max_skip > 0:
                    # if the cost become smaller, apply the cut
                    child_node1, child_node2 = self.partition_tree.apply_split(leaf.nid, max_skip_split_dim, max_skip_split_value, max_skip_split_type)
                    #print(" Split on node id:", leaf.nid, "split type:", max_skip_split_type, "split gain:", max_skip, "split value:", max_skip_split_value, "split dim:",max_skip_split_dim)
                    #print("after split, child node 1 queryset size:",len(child_node1.queryset), "child node 2 queryset size:", len(child_node2.queryset))
                    CanSplit = True
                else:
                    #print("leaf node id:",leaf.nid,"does not have valid split")
                    leaf.no_valid_partition = True
                    
    
    def __NORA_Beam_Search(self, data_threshold, candidate_size = 1, candidate_depth = 2):
        '''
        using beam search to improve the search space
        candidate_size: how many candidate splits we maintain in a layer
        candidate_depth: how many layers we keep during the search
        '''
        '''
        the NORA algorithm that optimized for 1 by 1 mapping scenario
        '''
        
        CanSplit = True
        while CanSplit:
            CanSplit = False           
            
            # for leaf in self.partition_tree.get_leaves():
            leaves = self.partition_tree.get_leaves(use_partitionable = True) # there could be large irregular shape partitions
            #print("# number of leaf nodes:",len(leaves))
            for leaf in leaves:
                
                # print("current leaf node id:",leaf.nid, "leaf node dataset size:",len(leaf.dataset))
                if leaf.node_size < 2 * data_threshold or leaf.queryset is None:
                    continue
                
#                 if len(leaf.queryset) == 1:
#                     # try bounding split here
#                     valid, skip, bound = leaf.if_bounding_split(data_threshold, approximate = False)
                    
#                     if valid:
#                         # apply this split
#                         #print(" = = = Apply Old Bounding Split = = = ")
#                         self.partition_tree.apply_split(leaf.nid, None, None, 1, bound)
#                         #print("!!!Split From Internal Bounding Split!!!")
#                         continue    
#                         # design an variant of bounding split, which will handle node size problem # done
#                         # if valid, mark the children as  don't consider anymore" -> partitionable = False
                   
#                 if leaf.if_new_bounding_split(data_threshold):
#                     #print(" = = = Apply New Bounding Split = = = ")
#                     # create new bounding split sub-partitions
#                     self.partition_tree.apply_split(leaf.nid, None, None, 3)
#                     continue
                
                # get best candidate cut position
                skip, max_skip, max_skip_split_dim, max_skip_split_value, max_skip_split_type = 0, -1, 0, 0, 0
                # extend the candidate cut with medians when it reach the bottom
                #candidate_cuts = leaf.get_candidate_cuts(True) if leaf.node_size < 4 * data_threshold else leaf.get_candidate_cuts()     
                candidate_cuts = leaf.get_candidate_cuts(True) # try extends it always
                
                split_candidates = []
                for split_dim, split_value in candidate_cuts:

                    # first try normal split
                    valid, skip, left_size, right_size = leaf.if_split(split_dim, split_value, data_threshold)
                    #if valid and skip > max_skip:
                    #    max_skip, max_skip_split_dim, max_skip_split_value, max_skip_split_type = skip, split_dim, split_value, 0
                    if valid:
                        num_query_crossed = leaf.num_query_crossed(split_dim, split_value)
                        split_candidates.append((skip, split_dim, split_value, 0, valid, num_query_crossed))
                    
                    # Should we remove the leaf node case here in 1 BY 1?
                    # the following cases here are applied only for leaf nodes
                    # if it's available for bounding split, try it
                    if leaf.node_size < 3 * data_threshold:
                        # try bounding split
                        valid, skip,_ = leaf.if_bounding_split(data_threshold, approximate = False)
                        if valid and skip > max_skip:
                            split_candidates.append((skip, split_dim, split_value, 1, valid, 0))
                            #max_skip, max_skip_split_dim, max_skip_split_value, max_skip_split_type = skip, split_dim, split_value, 1

                    # if it's availble for dual-bounding split, try it
                    elif leaf.node_size < 4 * data_threshold and left_size < 2 * data_threshold and right_size < 2 * data_threshold:
                        # try dual-bounding split              
                        valid, skip = leaf.if_dual_bounding_split(split_dim, split_value, data_threshold, approximate = False)
                        if valid and skip > max_skip:
                            split_candidates.append((skip, split_dim, split_value, 1, valid, 2))
                            #max_skip, max_skip_split_dim, max_skip_split_value, max_skip_split_type = skip, split_dim, split_value, 2
                
                if len(split_candidates) == 0:
                    continue
                
                split_candidates.sort(key=lambda item: item[0], reverse=True) # from the most skip gain to least
                found_available_split = False
                split_context = None
                split_context = split_candidates[0]
                
                # Beam search on the first few candidates
                min_cost = leaf.node_size * len(leaf.queryset)
                for i in range(min(candidate_size, len(split_candidates))):
                    # explored_cost = pass
                    # first create a partition tree using the current partition
                    # then split it with depth constraint!
                    # third, analyze the cost, maybe I just used the minimum total (avg) cost for comparison
                    
                    fake_split = split_candidates[i]
                    left_child, right_child = self.partition_tree.apply_split(leaf.nid, fake_split[1], fake_split[2], fake_split[3],
                                                                             pretend = True) # do not actually apply this split
                    
                    temp_left_partition = PartitionAlgorithm()
                    temp_right_partition = PartitionAlgorithm()
                    
                    # WAIT! , it should be the left cost + the right cost instead of directly using the current leaf!
                    left_cost = temp_left_partition.InitializeWithNORA(left_child.queryset, leaf.num_dims, left_child.boundary, 
                                                                       left_child.dataset, data_threshold, using_1_by_1 = False, 
                                                                       using_kd = False, depth_limit = candidate_depth, 
                                                                       return_query_cost = True)
                    right_cost = temp_right_partition.InitializeWithNORA(right_child.queryset, leaf.num_dims, right_child.boundary, 
                                                                         right_child.dataset, data_threshold, using_1_by_1 = False, 
                                                                         using_kd = False, depth_limit = candidate_depth, 
                                                                         return_query_cost = True)
                    explored_cost = left_cost + right_cost
                    if explored_cost < min_cost:
                        min_cost = explored_cost
                        split_context = split_candidates[i]          
                
                # apply split context
                if split_context[0] > 0:
                    self.partition_tree.apply_split(leaf.nid, split_context[1], split_context[2], split_context[3])
                    #print(" Split on node id:", leaf.nid)
                    #print("split_context:",split_context)
                    CanSplit = True

    
    
    def __NORA_1_BY_1(self, data_threshold, depth_limit = None):
        '''
        the NORA algorithm that optimized for 1 by 1 mapping scenario
        '''
        CanSplit = True
        while CanSplit:
            CanSplit = False           
            
            # for leaf in self.partition_tree.get_leaves():
            leaves = self.partition_tree.get_leaves(use_partitionable = True) # there could be large irregular shape partitions
            #print("# number of leaf nodes:",len(leaves))
            for leaf in leaves:
                
                # print("current leaf node id:",leaf.nid, "leaf node dataset size:",len(leaf.dataset))
                if leaf.node_size < 2 * data_threshold or leaf.queryset is None or (depth_limit is not None and leaf.depth >= depth_limit):
                    continue
                
                if len(leaf.queryset) == 1:
                    # try bounding split here
                    valid, skip, bound = leaf.if_bounding_split(data_threshold, approximate = False)
                    
                    if valid:
                        # apply this split
                        #print(" = = = Apply Old Bounding Split = = = ")
                        self.partition_tree.apply_split(leaf.nid, None, None, 1, bound)
                        #print("!!!Split From Internal Bounding Split!!!")
                        continue    
                        # design an variant of bounding split, which will handle node size problem # done
                        # if valid, mark the children as  don't consider anymore" -> partitionable = False
                   
                if leaf.if_new_bounding_split(data_threshold):
                    #print(" = = = Apply New Bounding Split for node:", leaf.nid)
                    # create new bounding split sub-partitions
                    self.partition_tree.apply_split(leaf.nid, None, None, 3)
                    continue
                
                # get best candidate cut position
                skip, max_skip, max_skip_split_dim, max_skip_split_value, max_skip_split_type = 0, -1, 0, 0, 0
                # extend the candidate cut with medians when it reach the bottom
                #candidate_cuts = leaf.get_candidate_cuts(True) if leaf.node_size < 4 * data_threshold else leaf.get_candidate_cuts()     
                candidate_cuts = leaf.get_candidate_cuts(True) # try extends it always
                
                split_candidates = []
                for split_dim, split_value in candidate_cuts:

                    # first try normal split
                    valid, skip, left_size, right_size = leaf.if_split(split_dim, split_value, data_threshold)
                    #if valid and skip > max_skip:
                    #    max_skip, max_skip_split_dim, max_skip_split_value, max_skip_split_type = skip, split_dim, split_value, 0
                    if valid:
                        num_query_crossed = leaf.num_query_crossed(split_dim, split_value)
                        split_candidates.append((skip, split_dim, split_value, 0, valid, num_query_crossed))
                    
                    # Should we remove the leaf node case here in 1 BY 1?
                    # the following cases here are applied only for leaf nodes
                    # if it's available for bounding split, try it
                    if leaf.node_size < 3 * data_threshold:
                        # try bounding split
                        valid, skip,_ = leaf.if_bounding_split(data_threshold, approximate = False)
                        if valid and skip > max_skip:
                            split_candidates.append((skip, split_dim, split_value, 1, valid, 0))
                            #max_skip, max_skip_split_dim, max_skip_split_value, max_skip_split_type = skip, split_dim, split_value, 1

                    # if it's availble for dual-bounding split, try it
                    elif leaf.node_size < 4 * data_threshold and left_size < 2 * data_threshold and right_size < 2 * data_threshold:
                        # try dual-bounding split              
                        valid, skip = leaf.if_dual_bounding_split(split_dim, split_value, data_threshold, approximate = False)
                        if valid and skip > max_skip:
                            split_candidates.append((skip, split_dim, split_value, 1, valid, 2))
                            #max_skip, max_skip_split_dim, max_skip_split_value, max_skip_split_type = skip, split_dim, split_value, 2
                
                if len(split_candidates) == 0:
                    continue
                
                split_candidates.sort(key=lambda item: item[0], reverse=True) # from the most skip gain to least
                found_available_split = False
                split_context = None
                
                # only apply QAVP on internal nodes
#                 if leaf.node_size > 4 * data_threshold:
#                     for split in split_candidates:
#                         if split[-1] == 0: # num_query_cross = 0 and valid = True
#                             # apply this split
#                             found_available_split = True
#                             split_context = split

#                     if not found_available_split:
#                         # idea 1: compare the cost of apply bounding and split the rest by general NORA

#                         # idea 2: use the least crossed 
#                         #split_candidates.sort(key=lambda item: (item[-1], -item[0]))
#                         #split_context = split_candidates[0] # use the first one for split

#                         # idea 3: just use the maximum skip one, currently I think this one could be easier
#                         split_context = split_candidates[0]        
#                 else:
#                     split_context = split_candidates[0]

                split_context = split_candidates[0]
    
                # apply split context
                if split_context[0] > 0:
                    self.partition_tree.apply_split(leaf.nid, split_context[1], split_context[2], split_context[3])
                    #print(" Split on node id:", leaf.nid)
                    #print("split_context:",split_context)
                    CanSplit = True
                
#                 if max_skip > 0:
#                     # if the cost become smaller, apply the cut
#                     child_node1, child_node2 = self.partition_tree.apply_split(leaf.nid, max_skip_split_dim, max_skip_split_value, max_skip_split_type)
#                     print(" Split on node id:", leaf.nid)
#                     CanSplit = True
    
    def __NORA(self, data_threshold, depth_limit = None):
        '''
        the general NORA algorithm, which utilize bounding split, daul-bounding split and extend candidate cuts with medians
        '''
        CanSplit = True
        while CanSplit:
            CanSplit = False           
            
            # for leaf in self.partition_tree.get_leaves():
            leaves = self.partition_tree.get_leaves()
            #print("# number of leaf nodes:",len(leaves))
            for leaf in leaves:
                
                # print("current leaf node id:",leaf.nid, "leaf node dataset size:",len(leaf.dataset))
                if leaf.node_size < 2 * data_threshold or leaf.queryset is None or (depth_limit is not None and leaf.depth >= depth_limit):
                    continue
                    
                # get best candidate cut position
                skip, max_skip, max_skip_split_dim, max_skip_split_value, max_skip_split_type = 0, -1, 0, 0, 0
                # extend the candidate cut with medians when it reach the bottom
                candidate_cuts = leaf.get_candidate_cuts(True) if leaf.node_size < 4 * data_threshold else leaf.get_candidate_cuts()     
                             
                for split_dim, split_value in candidate_cuts:

                    # first try normal split
                    valid, skip, left_size, right_size = leaf.if_split(split_dim, split_value, data_threshold)
                    if valid and skip > max_skip:
                        max_skip, max_skip_split_dim, max_skip_split_value, max_skip_split_type = skip, split_dim, split_value, 0

                    # if it's available for bounding split, try it
                    if leaf.node_size < 3 * data_threshold:
                        # try bounding split
                        valid, skip,_ = leaf.if_bounding_split(data_threshold, approximate = False)
                        if valid and skip > max_skip:
                            max_skip, max_skip_split_dim, max_skip_split_value, max_skip_split_type = skip, split_dim, split_value, 1

                    # if it's availble for dual-bounding split, try it
                    elif leaf.node_size < 4 * data_threshold and left_size < 2 * data_threshold and right_size < 2 * data_threshold:
                        # try dual-bounding split              
                        valid, skip = leaf.if_dual_bounding_split(split_dim, split_value, data_threshold, approximate = False)
                        if valid and skip > max_skip:
                            max_skip, max_skip_split_dim, max_skip_split_value, max_skip_split_type = skip, split_dim, split_value, 2

                if max_skip > 0:
                    # if the cost become smaller, apply the cut
                    child_node1, child_node2 = self.partition_tree.apply_split(leaf.nid, max_skip_split_dim, max_skip_split_value, max_skip_split_type)
                    #print(" Split on node id:", leaf.nid)
                    CanSplit = True
                    
    
    def __QDT(self, data_threshold):
        '''
        the QdTree partition algorithm
        '''
        CanSplit = True
        while CanSplit:
            CanSplit = False           
            
            # for leaf in self.partition_tree.get_leaves():
            leaves = self.partition_tree.get_leaves()
            #print("# number of leaf nodes:",len(leaves))
            for leaf in leaves:
                     
                # print("current leaf node id:",leaf.nid, "leaf node dataset size:",len(leaf.dataset))
                if leaf.node_size < 2 * data_threshold:
                    continue
                
                candidate_cuts = leaf.get_candidate_cuts()
                
                # get best candidate cut position
                skip, max_skip, max_skip_split_dim, max_skip_split_value = 0, -1, 0, 0
                for split_dim, split_value in candidate_cuts:

                    valid,skip,_,_ = leaf.if_split(split_dim, split_value, data_threshold)
                    if valid and skip > max_skip:
                        max_skip = skip
                        max_skip_split_dim = split_dim
                        max_skip_split_value = split_value

                if max_skip > 0:
                    # if the cost become smaller, apply the cut
                    child_node1, child_node2 = self.partition_tree.apply_split(leaf.nid, max_skip_split_dim, max_skip_split_value)
                    # print(" Split on node id:", leaf.nid)
                    CanSplit = True
            
    
    def __KDT(self, current_dim, data_threshold, current_node):
        '''
        Store the dataset in PartitionNode: we can keep it, but only as a tempoary attribute
        '''
        # cannot be further split
        if current_node.node_size < 2 * data_threshold:
            return   
        
        # split the node into equal halves by its current split dimension
        median = np.median(current_node.dataset[:,current_dim])
        
        sub_dataset1_size = np.count_nonzero(current_node.dataset[:,current_dim] < median)
        sub_dataset2_size = len(current_node.dataset) - sub_dataset1_size
        
        if sub_dataset1_size < data_threshold or sub_dataset2_size < data_threshold:
            pass
        else:
            child_node1, child_node2 = self.partition_tree.apply_split(current_node.nid, current_dim, median)
            
            # update next split dimension
            current_dim += 1
            if current_dim >= current_node.num_dims:
                current_dim %= current_node.num_dims
    
            # recursive call on sub nodes
            self.__KDT(current_dim, data_threshold, child_node1)
            self.__KDT(current_dim, data_threshold, child_node2)

# The DatasetAndQuerysetHelper Class.
It provides support functions to create, save, load, and modify the dataset and queryset.

In [60]:
class DatasetAndQuerysetHelper:
    '''
    naming:
    dataset: [base_path]/dataset/lineitem_[scale_factor]_[prob_threshold].csv
    domain: [base_path]/dataset/lineitem_[scale_factor]_[prob_threshold]_domains.csv
    queryset: [base_path]/queryset/[prob]/[vary_item]/[vary_val]_[used_dimensions]_[distribution/random].csv
    '''    
    def __init__(self, used_dimensions = None, scale_factor = 100, base_path = 'C:/Users/Cloud/iCloudDrive/NORA_experiments',
                prob_id = 1, vary_id = 0, vary_val = 0, train_percent = 0.5, random_percent = 0):
        
        self.used_dimensions = used_dimensions # i.e., [1,2,3,4]
        self.total_dims = 16 # the dimensions of lineitem table
        self.domain_dims = 8 # the dimensions we used for split and maintain min max for
        
        self.scale_factor = scale_factor
        self.prob_threshold = 1 / self.scale_factor # the probability of an original record being sampled into this dataset
        self.block_size = 1000000 // self.scale_factor # in original file, 1M rows take approximately 128MB
        
        self.base_path = base_path
        self.save_path_data = base_path + '/dataset/lineitem_' + str(scale_factor) + '_' + str(self.prob_threshold) + '.csv'
        self.save_path_domain = base_path + '/dataset/lineitem_' + str(scale_factor) + '_' + str(self.prob_threshold) + '_domains.csv'
        
        self.vary_items = ['default', 'alpha', 'num_dims', 'prob_dims', 'num_X']
        self.vary_id = vary_id
        self.vary_val = vary_val
        
        self.prob_id = prob_id
        self.query_base_path = self.base_path + '/queryset/prob' + str(self.prob_id) + '/' + self.vary_items[vary_id] + '/'
        self.query_file_name = str(vary_val) + '_' + str(self.used_dimensions) # dependent on used_dimensions, so change dim first
        
        self.query_distribution_path = self.query_base_path + self.query_file_name + '_distribution.csv'
        self.query_random_path = self.query_base_path + self.query_file_name + '_random.csv'
        
        self.train_percent = train_percent
        
        # the following are default query generation settings
        self.random_percent = random_percent # usef for query generation
        self.cluster_center_amount = 10
        self.maximum_range_percent = 0.1 # 10% of the corresponding domain
        self.sigma_percent = 0.2 # control the differences in a cluster
        
        self.QDistThreshold_percent = 0.01 # distance threshold, 1% of the corresponding domain
        self.maximum_X = 5 # used for the 1-X train test case
        
    
    # = = = = = public functions (API) = = = = =
    
    def set_config(self, scale_factor = 100, base_path = 'C:/Users/Cloud/iCloudDrive/NORA_experiments', 
                   used_dimensions = None, prob_id = 1, vary_id = 0, vary_val = 0):
        '''
        As many attributes are related to each other, this is used to refresh the whole settings.
        '''
        self.used_dimensions = used_dimensions
        self.scale_factor = scale_factor
        self.prob_threshold = 1 / self.scale_factor
        self.block_size = 1000000 // self.scale_factor # in original file, 1M rows take approximately 128MB
        self.base_path = base_path
        self.save_path_data = base_path + '/dataset/lineitem_' + str(scale_factor) + '_' + str(self.prob_threshold) + '.csv'
        self.save_path_domain = base_path + '/dataset/lineitem_' + str(scale_factor) + '_' + str(self.prob_threshold) + '_domains.csv'
        self.vary_id = vary_id
        self.vary_val = vary_val
        self.prob_id = prob_id
        self.query_base_path = self.base_path + '/queryset/prob' + str(self.prob_id) + '/' + self.vary_items[vary_id] + '/'
        self.query_file_name = str(vary_val) + '_' + str(self.used_dimensions)
        self.query_distribution_path = self.query_base_path + self.query_file_name + '_distribution.csv'
        self.query_random_path = self.query_base_path + self.query_file_name + '_random.csv'    
    
    def load_dataset(self, used_dimensions = []):
        '''
        the priority of the used_dimensions argument in the function is higher than the saved attribute version
        domains: [[L1, U1], [L2, U2],...]
        return the dataset projected on selected dimensions
        '''
        dataset = np.genfromtxt(self.save_path_data, delimiter=',') # the sampled subset
        domains = np.genfromtxt(self.save_path_domain, delimiter=',') # the domain of that scale
        if used_dimensions != []:
            dataset = dataset[:,used_dimensions]
            domains = domains[used_dimensions]
        elif self.used_dimensions is not None:
            dataset = dataset[:,self.used_dimensions]
            domains = domains[self.used_dimensions]
        return dataset, domains
    
    def load_queryset(self, return_train_test = True, query_distribution_path = None, query_random_path = None):
        '''
        query is in plain form, i.e., [l1,l2,...,ln, u1,u2,...,un]
        how about the used dimension?
        return the saved queryset, should be projected on selected dimensions.
        '''
        # embed used_dimension info into query file's name
        # when load, will auto matically check whether used_dimension is matched!!! or load will failed
        
        distribution_query, random_query = None, None
        
        if query_distribution_path is not None and query_random_path is not None:
            distribution_query = np.genfromtxt(query_distribution_path, delimiter=',')
            random_query = np.genfromtxt(query_random_path, delimiter=',')
        else:
            distribution_query = np.genfromtxt(self.query_distribution_path, delimiter=',')
            random_query = np.genfromtxt(self.query_random_path, delimiter=',')
        
        if return_train_test:
            training_set, testing_set = self.__convert_to_train_test(distribution_query, random_query)
            return training_set, testing_set
        else:
            return distribution_query, random_query
    
    def generate_dataset_and_save(self, original_table_path, chunk_size = 100000):
        '''
        refer to TPCH tools to generate the original dataset (.tbl)
        this function is used to process the .tbl file with given sampling rate to generate a .csv file
        consider the possible table size, this function is implemented in a batch processing manner
        '''
        sampled_subset = []
        domains = [[float('Infinity'), float('-Infinity')] for i in range(self.domain_dims)] # indicate min, max
        
        col_names = ['_c'+str(i) for i in range(self.total_dims)]
        cols = [i for i in range(self.total_dims)]

        start_time = time.time()
        
        batch_count = 0
        for chunk in pd.read_table(original_table_path, delimiter='|', usecols=cols, names=col_names, chunksize=chunk_size):
            print('current chunk: ', batch_count)
            chunk.apply(lambda row: self.__process_chunk_sampling(row, domains, sampled_subset), axis=1)
            batch_count += 1

        end_time = time.time()
        print('total processing time: ', end_time - start_time)
        
        sampled_subset = np.array(sampled_subset)
        domains = np.array(domains)
        np.savetxt(self.save_path_data, sampled_subset, delimiter=',')
        np.savetxt(self.save_path_domain, domains, delimiter=',')
    
    def generate_queryset_and_save(self, query_amount, queryset_type = 0, dim_prob = [], prob_id = 1, vary_id = 0, vary_val = 0, 
                                   return_train_test = True):
        '''
        generate queryset for given dimensions.
        query_amount: total query amount, including distribution queries and random queries, or training set and testing set
        
        queryset_type = 0: typical (old) queryset generator, generate distribution and random queries
        queryset_type = 1: new NORA queryset generator, generate 1-1 train test which satisfy given distance threshold
        queryset_type = 2: new NORA queryset generator, generate 1-X train test which satisfy given distance threshold
        queryset_type = 3: hybrid workload, based on type 2, but random percent are random queries
        queryset_type = 4: Mixture Gaussian, but satisfy distance threshold
        
        dim_prob: the probability of using a given dimension (in used_dimensions) in a query
        other configurations are stored in class attributes
        REMEMBER to change the used_dimensions first if not using the previous one !!!
        
        the returned queries are not numpy object by default
        '''
        
        domains = np.genfromtxt(self.save_path_domain, delimiter=',')[self.used_dimensions]
        if dim_prob == []: # by default, use all the selected dimensions
            dim_prob = [1 for i in range(len(self.used_dimensions))]  
        maximum_range = [(domains[i,1] - domains[i,0]) * self.maximum_range_percent for i in range(len(domains))]
            
        if queryset_type == 0:
        
            num_random_query = int(query_amount * self.random_percent)
            num_distribution_query = query_amount - num_random_query

            distribution_query = self.__generate_distribution_query(num_distribution_query, dim_prob, domains, maximum_range)
            random_query = self.__generate_random_query(num_random_query, dim_prob, domains, maximum_range)

            # refresh query related class attributes
            self.vary_id = vary_id
            self.vary_val = vary_val      
            self.query_base_path = self.base_path + '/queryset/prob' + str(prob_id) + '/' + self.vary_items[vary_id] + '/'
            self.query_file_name = str(vary_val) + '_' + str(self.used_dimensions)
            self.query_distribution_path = self.query_base_path + self.query_file_name + '_distribution.csv'
            self.query_random_path = self.query_base_path + self.query_file_name + '_random.csv'

            # save
            np.savetxt(self.query_distribution_path, distribution_query, delimiter=',')
            np.savetxt(self.query_random_path, random_query, delimiter=',')

            # print(" = = = distribution query = = = ")
            # print(distribution_query)

            # print(" = = = random query = = = ")
            # print(random_query)

            if return_train_test:
                training_set, testing_set = self.__convert_to_train_test(distribution_query, random_query)
                return training_set, testing_set
            else:
                return distribution_query, random_query
            
        elif queryset_type == 1:
            num_training_query = query_amount // 2
            num_testing_query = num_training_query
            training_set = self.__generate_new_training_set(num_training_query, domains, maximum_range)
            testing_set = self.__generate_new_testing_set(training_set, domains, maximum_X = 1)
            # TODO save it
            return training_set, testing_set
            
        elif queryset_type == 2: # in this case, the total query amount are not fixed
            num_training_query = query_amount // 2
            num_testing_query = num_training_query
            training_set = self.__generate_new_training_set(num_training_query, domains, maximum_range)
            testing_set = self.__generate_new_testing_set(training_set, domains, maximum_X = self.maximum_X)
            # TODO save it
            return training_set, testing_set
        
        elif queryset_type == 3: # hybrid workload, include some random queries based on type 2
            num_random_query = int(query_amount * self.random_percent)
            num_distribution_query = query_amount - num_random_query
            
            num_training_query = num_distribution_query // 2
            num_testing_query = num_training_query
            
            random_query = self.__generate_random_query(num_random_query, dim_prob, domains, maximum_range)
            training_set = self.__generate_new_training_set(num_training_query, domains, maximum_range)
            testing_set = self.__generate_new_testing_set(training_set, domains, maximum_X = self.maximum_X)
            
            train_random = random_query[0:int(self.train_percent*len(random_query))]
            test_random = random_query[int(self.train_percent*len(random_query)):]
            
            training_set = training_set + train_random
            testing_set = testing_set + test_random
            
            return training_set, testing_set
        
        elif queryset_type == 4:
            num_training_query = query_amount // 2
            training_set = self.__generate_distribution_query(num_training_query, dim_prob, domains, maximum_range)
            testing_set = self.__generate_new_testing_set(training_set, domains, maximum_X = self.maximum_X)
            return training_set, testing_set
            
        else:
            print("No supported queryset type!")
            return None, None
        
    def extend_queryset(self, queries, QDistThreshold_percent = None, domains = None):
        '''
        extend the provided queryset with the previous provided query distance threshold
        '''
        extended_queries = []
        
        if QDistThreshold_percent is None:
            QDistThreshold_percent = self.QDistThreshold_percent
        
        if domains is None:
            domains = np.genfromtxt(self.save_path_domain, delimiter=',')[self.used_dimensions]
            
        num_dims = len(domains)
            
        extended_values = [(domain[1]-domain[0]) * QDistThreshold_percent for domain in domains]
        EV = np.array(extended_values)
        BL = [domain[0] for domain in domains]
        BU = [domain[1] for domain in domains]
         
        for query in queries:
            
            QL = np.array(query[0:num_dims])
            QU = np.array(query[num_dims:])
            
            QL -= EV # extended_values do not need to be converted to numpy
            QL = np.amax(np.array([QL.tolist(), BL]),axis=0).tolist()# bound it by the domain
            
            QU += EV
            QU = np.amin(np.array([QU.tolist(), BU]),axis=0).tolist() # bound it by the domain
            
            extended_query = QL + QU
            extended_queries.append(extended_query)
               
        return extended_queries
    
    def visualize_queryset_and_dataset(self, dims, training_set = None, testing_set = None, dataset = None, path = None):
        '''
        the dims are not the original dims, it's with regarded to the training set's dims
        the dimensions of training set, testing set and dataset should be corresponding to self.used_dimensions
        2D only!
        '''
        fig, ax = plt.subplots(1)
        
        domains = np.genfromtxt(self.save_path_domain, delimiter=',')[self.used_dimensions]
        num_dims = len(self.used_dimensions)
        
        plt.xlim(domains[dims[0]][0], domains[dims[0]][1])
        plt.ylim(domains[dims[1]][0], domains[dims[1]][1])
        
        if training_set is not None:
            case = 0
            for query in training_set:

                lower1 = query[dims[0]]
                lower2 = query[dims[1]]  
                upper1 = query[dims[0]+num_dims]
                upper2 = query[dims[1]+num_dims]    

                rect = Rectangle((lower1,lower2),upper1-lower1,upper2-lower2,fill=False,edgecolor='r',linewidth=1)
                ax.text(upper1, upper2, case, color='b',fontsize=7)
                case += 1
                ax.add_patch(rect)
        
        if testing_set is not None:
            case = 0
            for query in testing_set:

                lower1 = query[dims[0]]
                lower2 = query[dims[1]]  
                upper1 = query[dims[0]+num_dims]
                upper2 = query[dims[1]+num_dims]    

                rect = Rectangle((lower1,lower2),upper1-lower1,upper2-lower2,fill=False,edgecolor='g',linewidth=1)
                ax.text(upper1, upper2, case, color='b',fontsize=7)
                case += 1
                ax.add_patch(rect)
                
        if dataset is not None:
            plt.scatter(dataset[:,dims[0]], dataset[:,dims[0]],color='blue')

        ax.set_xlabel('dim 1', fontsize=15)
        ax.set_ylabel('dim 2', fontsize=15)
        #plt.xticks(np.arange(0, 400001, 100000), fontsize=10)
        #plt.yticks(np.arange(0, 20001, 5000), fontsize=10)

        plt.tight_layout() # preventing clipping the labels when save to pdf

        if path is not None:
            fig.savefig(path)

        plt.show()
    
    def real_result_size(self, dataset, queries):
        num_dims = int(len(self.used_dimensions))
        results = []
        for query in queries:
            constraints = []
            for d in range(num_dims):
                constraint_L = dataset[:,d] >= query[d]
                constraint_U = dataset[:,d] <= query[num_dims + d]
                constraints.append(constraint_L)
                constraints.append(constraint_U)
            constraint = np.all(constraints, axis=0)
            result_size = np.count_nonzero(constraint)
            results.append(result_size)
        return results
    
    def queryset_distance(self, queryset1, queryset2):
        '''
        estimate the single direction Hausdorff distance
        '''
        domains = np.genfromtxt(self.save_path_domain, delimiter=',')[self.used_dimensions]
        num_dims = len(self.used_dimensions)
        if len(queryset1[0]) == len(queryset2[0]):
            num_dims = len(queryset1[0]) // 2
            
        max_dist = 0
        for query1 in queryset1:
            min_dist = float('inf')
            for query2 in queryset2:
                dist = 0
                for k in range(num_dims):
                    dist_L = abs(query1[k]-query2[k])
                    dist_U = abs(query1[k+num_dims]-query2[k+num_dims])
                    dist_L_ratio = dist_L / (domains[k][1] - domains[k][0])
                    dist_U_ratio = dist_U / (domains[k][1] - domains[k][0])
                    dist = max(dist, dist_L_ratio, dist_U_ratio)
                if dist < min_dist:
                    min_dist = dist
            if min_dist > max_dist:
                max_dist = min_dist
        
        return max_dist
    
    # = = = = = internal functions = = = = =
    
    def __process_chunk_sampling(self, row, domains, sampled_subset):
        prob = random.uniform(0, 1)
        row_numpy = row.to_numpy()  
        for i in range(len(domains)):
            if row_numpy[i] > domains[i][1]:
                domains[i][1] = row_numpy[i]
            if row_numpy[i] < domains[i][0]:
                domains[i][0] = row_numpy[i]
        if prob <= self.prob_threshold:    
            sampled_subset.append(row_numpy[0:self.domain_dims].tolist())
    
    def __generate_new_testing_set(self, training_set, domains, maximum_X = 1):
        '''
        generate the testing set for 1-1 or 1-X train test scenarios (i.e., new NORA) which satisfy distance threshold
        
        the maximum_X denotes the X for 1-X, if it is 1, it indicate the 1-1 case, else it indicate the 1-X case
        '''
        extended_values = [(domain[1]-domain[0]) * self.QDistThreshold_percent for domain in domains]
        EV = np.array(extended_values)
        
        testing_set = []
        for query in training_set:
            num_X = 1 if maximum_X == 1 else random.randint(0, maximum_X) # inclusive for both end
            for i in range(num_X):
                query_lower, query_upper = [], []
                for k in range(len(domains)):
                    QL = query[k] - random.uniform(0, EV[k])
                    QU = query[len(domains)+k] + random.uniform(0, EV[k])
                    if QL <= domains[k][0]:
                        QL = domains[k][0]
                    if QU >= domains[k][1]:
                        QU = domains[k][1]
                    if QL > QU:
                        QL, QU = QU, QL
                    query_lower.append(QL)
                    query_upper.append(QU)
                testing_set.append(query_lower+query_upper)
        return testing_set
    
    def __generate_new_training_set(self, query_amount, domains, maximum_range):
        '''
        generate the training set for 1-1 or 1-X train test scenarios (i.e., new NORA)
        query_amount: number of training queries to generate
        '''
        # first, generate the centers of each query
        centers = []
        for i in range(query_amount):
            center = [] # [D1, D2,..., Dk]
            for k in range(len(domains)):
                ck = random.uniform(domains[k][0], domains[k][1])
                center.append(ck)
            centers.append(center)
        
        # second, generate expected range for each dimension for each center (of each query)
        centers_ranges = []
        for i in range(query_amount):
            ranges = [] # the range in all dimensions for a given center
            for k in range(len(domains)):
                ran = random.uniform(0, maximum_range[k])
                ranges.append(ran)
            centers_ranges.append(ranges)
            
        # third, build the queries and bound them by the domain
        generated_queries = []
        for i in range(query_amount):
            center = centers[i]
            query_lower, query_upper = [], []
            for k in range(len(domains)):
                query_range = centers_ranges[i][k]
                L = center[k] - query_range/2
                U = center[k] + query_range/2
                if L <= domains[k][0]:
                    L = domains[k][0]
                if U >= domains[k][1]:
                    U = domains[k][1]
                if L > U:
                    L, U = U, L
                query_lower.append(L)
                query_upper.append(U)
            generated_queries.append(query_lower+query_upper)
            
        return generated_queries
    
    def __convert_to_train_test(self, distribution_query, random_query):
        train_distribution = distribution_query[0:int(self.train_percent*len(distribution_query))]
        test_distribution = distribution_query[int(self.train_percent*len(distribution_query)):]
        train_random = random_query[0:int(self.train_percent*len(random_query))]
        test_random = random_query[int(self.train_percent*len(random_query)):]
        
        # to deal with the shape issue, 0 items cannot be concated
        if len(distribution_query) == 0 and len(random_query) == 0:
            return [], []
        elif len(distribution_query) == 0:
            return train_random, test_random
        elif len(random_query) == 0:
            return train_distribution, test_distribution
        else:
            training_set = np.concatenate((train_distribution, train_random), axis=0)
            testing_set = np.concatenate((test_distribution, test_random), axis=0)
            return training_set, testing_set
    
    def __generate_distribution_query(self, query_amount, dim_prob, domains, maximum_range):
        '''
        generate clusters of queries
        '''
        # first, generate cluster centers
        centers = []
        for i in range(self.cluster_center_amount):
            center = [] # [D1, D2,..., Dk]
            for k in range(len(domains)):
                ck = random.uniform(domains[k][0], domains[k][1])
                center.append(ck)
            centers.append(center)

        # second, generate expected range for each dimension for each center
        centers_ranges = []
        for i in range(self.cluster_center_amount):
            ranges = [] # the range in all dimensions for a given center
            for k in range(len(domains)):
                ran = random.uniform(0, maximum_range[k])
                ranges.append(ran)
            centers_ranges.append(ranges)

        # third, generate sigma for each dimension for each center
        centers_sigmas = []
        for i in range(self.cluster_center_amount):
            sigmas = []
            for k in range(len(domains)):
                sigma = random.uniform(0, maximum_range[k] * self.sigma_percent)
                sigmas.append(sigma)
            centers_sigmas.append(sigmas)

        # fourth, generate queries
        distribution_query = [] = []
        for i in range(query_amount):
            # choose a center
            center_index = random.randint(0, self.cluster_center_amount-1) # this is inclusive            
            query_lower, query_upper = [], []
            for k in range(len(domains)):
                # consider whether or not to use this dimension
                L, U = None, None
                prob = random.uniform(0, 1)
                if prob > dim_prob[k]:
                    L = domains[k][0]
                    U = domains[k][1]
                else:
                    center = centers[center_index]
                    query_range = centers_ranges[center_index][k]
                    L = center[k] - query_range/2
                    U = center[k] + query_range/2
                    L = random.gauss(L, centers_sigmas[center_index][k])
                    U = random.gauss(U, centers_sigmas[center_index][k])
                    if L <= domains[k][0]:
                        L = domains[k][0]
                    if U >= domains[k][1]:
                        U = domains[k][1]
                    if L > U:
                        L, U = U, L
                query_lower.append(L)
                query_upper.append(U)
            distribution_query.append(query_lower + query_upper)
        return distribution_query
    
    def __generate_random_query(self, query_amount, dim_prob, domains, maximum_range):
        random_query = []
        for i in range(query_amount):
            query_lower, query_upper = [], []
            for k in range(len(domains)):     
                # consider whether or not to use this dimension
                L, U = None, None
                prob = random.uniform(0, 1)
                if prob > dim_prob[k]:
                    L = domains[k][0]
                    U = domains[k][1]
                else:
                    center = random.uniform(domains[k][0], domains[k][1])
                    query_range = random.uniform(0, maximum_range[k])
                    L = center - query_range/2
                    U = center + query_range/2
                    if L <= domains[k][0]:
                        L = domains[k][0]
                    if U >= domains[k][1]:
                        U = domains[k][1]
                query_lower.append(L)
                query_upper.append(U)
            random_query.append(query_lower + query_upper)
        return random_query

# Performance Comparisom
In this following comparison, we compare the logical IO for k-d tree, QdTree, and NORA on training and testing set 
generated with strict 1 by 1 mapping (within a distance threshold).

In [272]:
used_dims = [1,2]
# used_dims = [1,2,3,4]
block_size = 10000

### Setting the base path here!!!

In [273]:
# = = = Test Dataset and Queryset Loader = = = 
# helper = DatasetAndQuerysetHelper(base_path = 'where NORA_experiments folder is placed', used_dimensions = [1,2]) # replace the base_path!
helper = DatasetAndQuerysetHelper(base_path = 'C:/Users/Cloud/iCloudDrive/NORA_experiments', used_dimensions = used_dims) # EXAMPLE

### Load the dataset and its corresponding domains.

In [274]:
dataset, domains = helper.load_dataset(used_dims)
boundary = [interval[0] for interval in domains]+[interval[1] for interval in domains]
# len(dataset) # 6001309
print(domains)

[[1.e+00 2.e+07]
 [1.e+00 1.e+06]]


### Generate the training set and testing set queries.

In [290]:
# helper.maximum_range_percent = 0.3
# helper.QDistThreshold_percent = 0.015

training_set, testing_set = helper.generate_queryset_and_save(60, queryset_type=1) # 1-1
# training_set, testing_set = helper.generate_queryset_and_save(100, queryset_type=2) # 1-X (X is 5 at most by default)
# training_set, testing_set = helper.generate_queryset_and_save(100, queryset_type=4) # mixture Gaussian
extended_training_set = helper.extend_queryset(training_set)
helper.visualize_queryset_and_dataset([0,1], training_set, testing_set)

# distance = helper.queryset_distance(training_set[0:25], training_set[25:])
# print(distance)

<IPython.core.display.Javascript object>

In [None]:
# # query distribution analysis
# hist = np.zeros((100,100))

# for i in range(100):
#     training_set, testing_set = helper.generate_queryset_and_save(100, queryset_type=2) # 1-X (X is 5 at most by default)
#     for query in training_set:
#         indexes = ()
#         for k in range(len(used_dims)):
#             query_index_L = int((query[k] - boundary[k]) / (boundary[len(used_dims) + k] - boundary[k])  * 100)
#             query_index_U = int((query[len(used_dims) + k] - boundary[k]) / (boundary[len(used_dims) + k] - boundary[k]) * 100)
#             #print(query_index_L, query_index_U)
#             indexes += (slice(query_index_L, query_index_U, 1),)
#         #print(indexes)
#         hist[indexes] += 1
        
# # heat map
# fig = plt.figure(figsize=(6, 3.2))
# plt.imshow(hist, interpolation='none')
# plt.show()

In [None]:
# save the special queryset
# np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob1/uniform_queries_train.csv", training_set, delimiter=',')
# np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob1/uniform_queries_test.csv", testing_set, delimiter=',')
# np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob1/skewed_queries_train.csv", training_set, delimiter=',')
# np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob1/skewed_queries_test.csv", testing_set, delimiter=',')

In [None]:
# LB-Cost
results = helper.real_result_size(dataset, training_set)
# results = helper.real_result_size(dataset, testing_set)
max(sum(results)/len(results), block_size)

### Create k-d tree.

In [308]:
# = = = = = Test PartitionAlgorithm (KDT) = = = = = 
pa1 = PartitionAlgorithm()
pa1.InitializeWithKDT(len(boundary)//2, boundary, dataset, data_threshold = block_size) # 447

### Test the logical IO of k-d tree on both training set and testing set (with visualization)

In [305]:
# pa1.partition_tree.visualize(queries = training_set, path="C:/Users/Cloud/iCloudDrive/NORA_experiments/plots/experiment_explanation/kdtree_train.png")
# pa1.partition_tree.visualize(queries = testing_set, path="C:/Users/Cloud/iCloudDrive/NORA_experiments/plots/experiment_explanation/kdtree_test.png")
pa1.partition_tree.visualize(queries = training_set, focus_region = [10000000, 500000, 20000000, 1000000],add_text = False, use_sci = True)
# pa1.partition_tree.evaluate_query_cost(training_set, True)
pa1.partition_tree.visualize(queries = testing_set, focus_region = [10000000, 500000, 20000000, 1000000],add_text = False, use_sci = True)
# pa1.partition_tree.evaluate_query_cost(testing_set, True)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Test the logical IO of QdTree on both training set and testing set (with visualization)

In [310]:
# = = = = = Test PartitionAlgorithm (QDT) = = = = = 
pa2 = PartitionAlgorithm()
pa2.InitializeWithQDT(training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size)

In [311]:
# pa2.partition_tree.visualize(queries = training_set, path="C:/Users/Cloud/iCloudDrive/NORA_experiments/plots/experiment_explanation/qdtree_train.png")
pa2.partition_tree.visualize(queries = training_set, focus_region = [10000000, 500000, 20000000, 1000000],add_text = False, use_sci = True)
# pa2.partition_tree.evaluate_query_cost(training_set, True) # Average logical IOs: 196487
# pa2.partition_tree.visualize(queries = testing_set, path="C:/Users/Cloud/iCloudDrive/NORA_experiments/plots/experiment_explanation/qdtree_test.png")
pa2.partition_tree.visualize(queries = testing_set, focus_region = [10000000, 500000, 20000000, 1000000],add_text = False, use_sci = True)
# pa2.partition_tree.evaluate_query_cost(testing_set, True) # Average logical IOs: 196487

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [220]:
# pa2.partition_tree.visualize(queries = training_set, focus_region = [10000000, 500000, 20000000, 1000000])
# pa2.partition_tree.visualize(queries = testing_set, focus_region = [10000000, 500000, 20000000, 1000000])

### Qd-tree trained by extended queries

In [238]:
# # QdTree trained with extended queries
# pa2_2 = PartitionAlgorithm()
# pa2_2.InitializeWithQDT(extended_training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size)

# pa2_2.partition_tree.visualize(queries = training_set)
# pa2_2.partition_tree.evaluate_query_cost(training_set, True) # Average logical IOs: 196487
# pa2_2.partition_tree.visualize(queries = testing_set)
# pa2_2.partition_tree.evaluate_query_cost(testing_set, True) # Average logical IOs: 196487

### Test the logical IO of NORA on both training set and testing set (with visualization)

In [548]:
# = = = = = Test PartitionAlgorithm (NORA) = = = = = 
pa3 = PartitionAlgorithm()
pa3.InitializeWithNORA(extended_training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size, 
                       using_1_by_1 = True, using_kd = False)

Build Time (s): 183.23359608650208


In [549]:
# pa3.partition_tree.visualize(queries = training_set, path="C:/Users/Cloud/iCloudDrive/NORA_experiments/plots/experiment_explanation/nora_train.png")
# pa3.partition_tree.visualize(queries = extended_training_set, path="C:/Users/Cloud/iCloudDrive/NORA_experiments/plots/experiment_explanation/nora_train.png")
pa3.partition_tree.visualize(queries = extended_training_set)
pa3.partition_tree.evaluate_query_cost(extended_training_set, True)
pa3.partition_tree.visualize(queries = training_set)
pa3.partition_tree.evaluate_query_cost(training_set, True)
# pa3.partition_tree.visualize(queries = testing_set, path="C:/Users/Cloud/iCloudDrive/NORA_experiments/plots/experiment_explanation/nora_test.png")
pa3.partition_tree.visualize(queries = testing_set)
pa3.partition_tree.evaluate_query_cost(testing_set, True)

<IPython.core.display.Javascript object>

Total logical IOs: 435613
Average logical IOs: 17424
query 0 [76] cost: 14324
query 1 [93] cost: 0
query 2 [111] cost: 10000
query 3 [85, 62] cost: 27690
query 4 [53] cost: 12957
query 5 [125, 126] cost: 21588
query 6 [31, 126, 105] cost: 33354
query 7 [83] cost: 10123
query 8 [119] cost: 12153
query 9 [84] cost: 10124
query 10 [77] cost: 10000
query 11 [102, 84] cost: 23346
query 12 [107, 108, 45] cost: 28993
query 13 [95, 115, 116, 117] cost: 40247
query 14 [116] cost: 10071
query 15 [101] cost: 15296
query 16 [65] cost: 10000
query 17 [52] cost: 14448
query 18 [117, 118] cost: 20274
query 19 [120] cost: 19907
query 20 [44] cost: 18271
query 21 [109] cost: 10000
query 22 [87, 32] cost: 23118
query 23 [123, 108] cost: 29329
query 24 [89] cost: 10000


<IPython.core.display.Javascript object>

Total logical IOs: 435613
Average logical IOs: 17424
query 0 [76] cost: 14324
query 1 [93] cost: 0
query 2 [111] cost: 10000
query 3 [85, 62] cost: 27690
query 4 [53] cost: 12957
query 5 [125, 126] cost: 21588
query 6 [31, 126, 105] cost: 33354
query 7 [83] cost: 10123
query 8 [119] cost: 12153
query 9 [84] cost: 10124
query 10 [77] cost: 10000
query 11 [102, 84] cost: 23346
query 12 [107, 108, 45] cost: 28993
query 13 [95, 115, 116, 117] cost: 40247
query 14 [116] cost: 10071
query 15 [101] cost: 15296
query 16 [65] cost: 10000
query 17 [52] cost: 14448
query 18 [117, 118] cost: 20274
query 19 [120] cost: 19907
query 20 [44] cost: 18271
query 21 [109] cost: 10000
query 22 [87, 32] cost: 23118
query 23 [123, 108] cost: 29329
query 24 [89] cost: 10000


<IPython.core.display.Javascript object>

Total logical IOs: 731286
Average logical IOs: 14924
query 0 [76] cost: 14324
query 1 [76] cost: 14324
query 2 [76] cost: 14324
query 3 [93] cost: 0
query 4 [93] cost: 0
query 5 [111] cost: 10000
query 6 [111] cost: 10000
query 7 [125, 126] cost: 21588
query 8 [31, 126, 105] cost: 33354
query 9 [83] cost: 10123
query 10 [83] cost: 10123
query 11 [119] cost: 12153
query 12 [119] cost: 12153
query 13 [119] cost: 12153
query 14 [119] cost: 12153
query 15 [119] cost: 12153
query 16 [77] cost: 10000
query 17 [107, 108, 45] cost: 28993
query 18 [107, 108, 45] cost: 28993
query 19 [116] cost: 10071
query 20 [116] cost: 10071
query 21 [116] cost: 10071
query 22 [116] cost: 10071
query 23 [101] cost: 15296
query 24 [101] cost: 15296
query 25 [65] cost: 10000
query 26 [65] cost: 10000
query 27 [65] cost: 10000
query 28 [65] cost: 10000
query 29 [52] cost: 14448
query 30 [52] cost: 14448
query 31 [52] cost: 14448
query 32 [52] cost: 14448
query 33 [117, 118] cost: 20274
query 34 [117, 118] cost: 

14924

In [429]:
# = = = = = Test Distance Threshold Estimation = = = = =
pa3e = PartitionAlgorithm()
estimated_extended_training_set = helper.extend_queryset(training_set, distance)
pa3e.InitializeWithNORA(estimated_extended_training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size, 
                        using_1_by_1 = True, using_kd = True)
pa3e.partition_tree.visualize(queries = testing_set)
pa3e.partition_tree.evaluate_query_cost(testing_set, True)

Build Time (s): 1689.4446411132812


<IPython.core.display.Javascript object>

Total logical IOs: 4661195
Average logical IOs: 42763
query 0 [388, 595] cost: 31941
query 1 [388, 595] cost: 31941
query 2 [388, 595] cost: 31941
query 3 [810, 737] cost: 29599
query 4 [810, 737] cost: 29599
query 5 [585, 743, 385] cost: 42286
query 6 [585, 743, 385] cost: 42286
query 7 [475, 682] cost: 25516
query 8 [818, 752] cost: 37309
query 9 [818, 752] cost: 37309
query 10 [818, 752] cost: 37309
query 11 [818, 752] cost: 37309
query 12 [818, 752] cost: 37309
query 13 [807, 730, 731] cost: 43440
query 14 [807, 730, 731] cost: 43440
query 15 [804] cost: 11782
query 16 [804] cost: 11782
query 17 [380, 738] cost: 30469
query 18 [738] cost: 11604
query 19 [380, 738] cost: 30469
query 20 [380, 738] cost: 30469
query 21 [838] cost: 13356
query 22 [745, 385, 387] cost: 47722
query 23 [762, 420, 646, 469] cost: 54882
query 24 [710, 368, 370] cost: 50561
query 25 [530, 531, 710, 569, 368, 370] cost: 89916
query 26 [530, 531, 710, 569, 368, 370] cost: 89916
query 27 [530, 531, 710, 569, 36

42763

In [365]:
# test save and load the partition layouts
# serialized_node_list = pa3.partition_tree.save_tree('C:/Users/Cloud/iCloudDrive/NORA_experiments/partition_layout/prob2_nora')
# pa3_assembled = PartitionTree()
# pa3_assembled.load_tree('C:/Users/Cloud/iCloudDrive/NORA_experiments/partition_layout/prob2_nora')
# pa3_assembled.visualize(queries = training_set)
# pa3_assembled.evaluate_query_cost(extended_training_set, True)
# pa3_assembled.visualize(queries = testing_set)
# pa3_assembled.evaluate_query_cost(testing_set, True)

In [101]:
# # = = = = Test Nora 1 BY 1 = = = =
# pa4 = PartitionAlgorithm()
# pa4.InitializeWithNORA(extended_training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size, using_1_by_1 = True)
# pa4.partition_tree.visualize(queries = extended_training_set)
# pa4.partition_tree.evaluate_query_cost(extended_training_set)
# pa4.partition_tree.visualize(queries = testing_set)
# pa4.partition_tree.evaluate_query_cost(testing_set)

### NORA without extended queries

In [16]:
# = = = = = NORA without extended queries = = = = = 
pa3_2 = PartitionAlgorithm()
pa3_2.InitializeWithNORA(training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size)

Build Time (s): 15.613705158233643


In [17]:
pa3_2.partition_tree.visualize(queries = training_set)
pa3_2.partition_tree.evaluate_query_cost(training_set, True)
pa3_2.partition_tree.visualize(queries = testing_set)
pa3_2.partition_tree.evaluate_query_cost(testing_set, True)

<IPython.core.display.Javascript object>

Total logical IOs: 1089591
Average logical IOs: 21791
query 0 [120] cost: 46417
query 1 [203] cost: 27097
query 2 [139, 245] cost: 30474
query 3 [161] cost: 15441
query 4 [71] cost: 41623
query 5 [211] cost: 14607
query 6 [250, 214] cost: 41336
query 7 [74] cost: 11993
query 8 [211, 249, 250] cost: 42083
query 9 [121, 195] cost: 37801
query 10 [233] cost: 15943
query 11 [139, 243, 245, 246] cost: 57146
query 12 [101] cost: 21766
query 13 [148] cost: 13323
query 14 [131] cost: 12652
query 15 [257, 258] cost: 25765
query 16 [193] cost: 15852
query 17 [141] cost: 16703
query 18 [222] cost: 13969
query 19 [121] cost: 20800
query 20 [209, 248] cost: 30849
query 21 [200] cost: 16098
query 22 [157] cost: 11878
query 23 [135] cost: 11164
query 24 [152] cost: 18902
query 25 [172] cost: 15498
query 26 [236] cost: 22118
query 27 [209] cost: 18136
query 28 [150] cost: 17007
query 29 [255] cost: 12440
query 30 [189] cost: 22669
query 31 [68] cost: 10020
query 32 [240] cost: 11898
query 33 [182] cos

<IPython.core.display.Javascript object>

Total logical IOs: 29262194
Average logical IOs: 232239
query 0 [119, 120, 64, 67, 69] cost: 418331
query 1 [119, 120, 64, 67, 69] cost: 418331
query 2 [205, 206, 139, 243, 245, 246] cost: 104268
query 3 [205, 206, 139, 243, 245, 246] cost: 104268
query 4 [205, 206, 139, 243, 245, 246] cost: 104268
query 5 [205, 206, 139, 243, 245, 246] cost: 104268
query 6 [205, 206, 139, 243, 245, 246] cost: 104268
query 7 [216, 160, 161, 162, 165, 169] cost: 210270
query 8 [160, 161, 162, 165, 169] cost: 199224
query 9 [216, 160, 161, 162, 165, 169] cost: 210270
query 10 [216, 160, 161, 162, 165, 169] cost: 210270
query 11 [216, 160, 161, 162, 165, 169] cost: 210270
query 12 [211, 249] cost: 31256
query 13 [211, 249] cost: 31256
query 14 [211, 249] cost: 31256
query 15 [94, 249, 250, 214, 157, 158, 99] cost: 408986
query 16 [94, 249, 250, 214, 157, 158, 99] cost: 408986
query 17 [94, 249, 250, 214, 157, 158] cost: 216172
query 18 [94, 249, 250, 214, 157, 158, 99] cost: 408986
query 19 [73, 74, 241, 

232239

In [18]:
# = = = = = NORA without extended queries (bounding split in inner node) = = = = = 
pa3_3 = PartitionAlgorithm()
pa3_3.InitializeWithNORA(training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size, using_1_by_1 = True)
# pa3_3.partition_tree.nid_node_dict[22].__dict__ # show all the attributes

Build Time (s): 79.73106741905212


In [19]:
pa3_3.partition_tree.visualize(queries = training_set)
pa3_3.partition_tree.evaluate_query_cost(training_set, True)
pa3_3.partition_tree.visualize(queries = testing_set)
pa3_3.partition_tree.evaluate_query_cost(testing_set, True)

<IPython.core.display.Javascript object>

Total logical IOs: 994761
Average logical IOs: 19895
query 0 [68] cost: 39433
query 1 [80] cost: 27097
query 2 [136, 209] cost: 41082
query 3 [154] cost: 9999
query 4 [127] cost: 32423
query 5 [184] cost: 14607
query 6 [206, 187] cost: 41336
query 7 [78] cost: 10000
query 8 [184, 205, 206] cost: 42083
query 9 [120, 172] cost: 60853
query 10 [164] cost: 15680
query 11 [136, 199, 209, 202] cost: 67754
query 12 [52] cost: 15325
query 13 [143] cost: 10000
query 14 [81] cost: 10000
query 15 [211, 212] cost: 25765
query 16 [117] cost: 10000
query 17 [87] cost: 10000
query 18 [193] cost: 7729
query 19 [120] cost: 20800
query 20 [180, 203] cost: 30849
query 21 [37] cost: 10000
query 22 [150] cost: 11878
query 23 [132] cost: 10000
query 24 [95] cost: 10446
query 25 [162] cost: 17865
query 26 [195] cost: 21768
query 27 [180] cost: 18136
query 28 [146] cost: 17007
query 29 [165] cost: 10000
query 30 [170] cost: 10000
query 31 [73] cost: 10020
query 32 [118] cost: 10000
query 33 [59] cost: 11108
q

<IPython.core.display.Javascript object>

Total logical IOs: 18636605
Average logical IOs: 147909
query 0 [68, 69] cost: 112961
query 1 [68, 69] cost: 112961
query 2 [134, 135, 136, 199, 209, 202] cost: 166249
query 3 [134, 135, 136, 199, 209, 202] cost: 166249
query 4 [134, 135, 136, 199, 209, 202] cost: 166249
query 5 [134, 135, 136, 199, 209, 202] cost: 166249
query 6 [134, 135, 136, 199, 209, 202] cost: 166249
query 7 [154, 155] cost: 25721
query 8 [154, 155] cost: 25721
query 9 [154, 155] cost: 25721
query 10 [154, 155] cost: 25721
query 11 [154, 155] cost: 25721
query 12 [184, 205] cost: 31256
query 13 [184, 205] cost: 31256
query 14 [184, 205] cost: 31256
query 15 [205, 206, 187, 150] cost: 69863
query 16 [205, 206, 187, 150] cost: 69863
query 17 [205, 206, 187, 150] cost: 69863
query 18 [205, 206, 187, 150] cost: 69863
query 19 [78, 79] cost: 124601
query 20 [184, 185, 205, 206, 187] cost: 83222
query 21 [184, 185, 205, 206, 187] cost: 83222
query 22 [184, 185, 205, 206, 187] cost: 83222
query 23 [184, 185, 205, 206, 1

147909

# PAW (General Group Split)

In [542]:
# # = = = = = Test PartitionAlgorithm (PAW: using merge without handling overlapped partitions) = = = = = 
# pa4 = PartitionAlgorithm()
# pa4.InitializeWithPAW(extended_training_set, len(boundary)//2, boundary, dataset, block_size, 10, strategy = 0)

In [543]:
# pa4.partition_tree.visualize(queries = training_set)
# pa4.partition_tree.evaluate_query_cost(training_set, True)
# pa4.partition_tree.visualize(queries = testing_set)
# pa4.partition_tree.evaluate_query_cost(testing_set, True)

In [312]:
# = = = = = Test PartitionAlgorithm (PAW: extend + abourt when overlap. Now it's the default) = = = = = 
pa4_2 = PartitionAlgorithm()
pa4_2.InitializeWithPAW(extended_training_set, len(boundary)//2, boundary, dataset, block_size, 1000, strategy = 1)

Build Time (s): 67.97295999526978


In [313]:
# pa4_2.partition_tree.visualize(queries = extended_training_set)
# pa4_2.partition_tree.evaluate_query_cost(extended_training_set, True)
pa4_2.partition_tree.visualize(queries = training_set, focus_region = [10000000, 500000, 20000000, 1000000],add_text = False, use_sci = True)
# pa4_2.partition_tree.evaluate_query_cost(training_set, True)
pa4_2.partition_tree.visualize(queries = testing_set, focus_region = [10000000, 500000, 20000000, 1000000],add_text = False, use_sci = True)
# pa4_2.partition_tree.evaluate_query_cost(testing_set, True)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [223]:
# pa4_2.partition_tree.visualize(queries = training_set, focus_region = [10000000, 500000, 20000000, 1000000])
# pa4_2.partition_tree.visualize(queries = testing_set, focus_region = [10000000, 500000, 20000000, 1000000])

# Different between unifrom and skew workload in OSM

In [358]:
block_size = 10000
used_dims = [0,1]
helper = DatasetAndQuerysetHelper(base_path = 'C:/Users/Cloud/iCloudDrive/NORA_experiments', used_dimensions = [0, 1])
dataset = np.genfromtxt('C:/Users/Cloud/iCloudDrive/NORA_experiments/dataset/OSM_6M.csv', delimiter=',')
boundary = [-180, -90, 180, 90]

In [362]:
# training_set = helper._DatasetAndQuerysetHelper__generate_random_query(50, [1,1], [[-180,180],[-90,90]], [36,18]) # uniform
training_set = helper._DatasetAndQuerysetHelper__generate_distribution_query(50, [1,1], [[-180,180],[-90,90]], [36,18]) # skew
testing_set = helper.extend_queryset(training_set, domains = [[-180.0,180.0],[-90.0,90.0]])
extended_training_set = helper.extend_queryset(training_set, domains = [[-180.0,180.0],[-90.0,90.0]])

In [363]:
pa_kd = PartitionAlgorithm()
pa_kd.InitializeWithKDT(len(boundary)//2, boundary, dataset, data_threshold = block_size)
pa_qd.partition_tree.visualize(queries = testing_set)
pa_kd.partition_tree.evaluate_query_cost(testing_set, True)
print("")

pa_qd = PartitionAlgorithm()
pa_qd.InitializeWithQDT(training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size)
pa_qd.partition_tree.visualize(queries = testing_set)
pa_qd.partition_tree.evaluate_query_cost(testing_set, True)
print("")

pa_paw = PartitionAlgorithm()
pa_paw.InitializeWithPAW(extended_training_set, len(boundary)//2, boundary, dataset, block_size, 1000, strategy = 1)
pa_paw.partition_tree.visualize(queries = testing_set)
pa_paw.partition_tree.evaluate_query_cost(testing_set, True)
print("")

result_sizes = helper.real_result_size(dataset, testing_set)
cost_real = sum(result_sizes)/len(result_sizes)
min_cost = max(cost_real, block_size)
print("LB-Cost:", cost_real)



<IPython.core.display.Javascript object>

Total logical IOs: 4980359
Average logical IOs: 99607
query 0 [1021] cost: 11719
query 1 [1021] cost: 11719
query 2 [151, 152, 164, 166, 189, 190, 242, 243, 255, 257, 307, 308, 310, 322, 397, 398, 399, 400, 403, 404, 405, 411, 412, 417] cost: 281243
query 3 [180] cost: 11719
query 4 [145, 146, 151, 152, 157, 158, 160, 163, 164, 165, 166, 189, 211, 212, 213, 214, 220, 222, 225, 226, 227, 228, 235, 236, 237, 238, 241, 242, 243, 249, 250, 251, 252, 255, 257, 307, 308, 309, 310, 397, 398, 403, 404] cost: 503894
query 5 [350, 355, 356, 379, 380, 385, 386] cost: 82030
query 6 [1021] cost: 11719
query 7 [17] cost: 11718
query 8 [180] cost: 11719
query 9 [54, 56, 62, 64, 67, 68, 69, 70, 109, 110, 111, 112, 115, 116, 117, 118, 130, 131, 132, 145, 146, 151, 152, 157, 158, 159, 160, 163, 164, 165, 166, 205, 206, 207, 208, 211, 212, 213, 214, 219, 220, 221, 222, 225, 226, 227, 228, 235, 236, 237, 238, 241, 242, 243, 249, 250, 251, 252, 255, 257, 278, 280, 301, 302, 307, 308, 309, 310] cost: 796860

<IPython.core.display.Javascript object>

Total logical IOs: 37169111
Average logical IOs: 743382
query 0 [34, 49, 50, 51, 52, 24] cost: 1049014
query 1 [49, 50, 51, 52, 24] cost: 211663
query 2 [59, 78, 80, 81, 82, 20, 21] cost: 1908086
query 3 [7, 26, 9] cost: 318982
query 4 [85, 86, 74, 87, 88, 76, 59, 77, 78, 79, 80, 81, 82, 21] cost: 1996373
query 5 [44, 63, 64, 65, 66, 20] cost: 699805
query 6 [34, 49, 50, 51, 52, 24] cost: 1049014
query 7 [7] cost: 22957
query 8 [7, 26, 9] cost: 318982
query 9 [69, 70, 71, 72, 85, 86, 74, 87, 88, 76, 59, 77, 78, 79, 80, 81] cost: 738876
query 10 [9, 10, 34] cost: 1153609
query 11 [7] cost: 22957
query 12 [7] cost: 22957
query 13 [9, 10, 34] cost: 1153609
query 14 [43, 44, 63, 64, 65, 66, 20] cost: 709940
query 15 [21, 47, 83, 84, 68, 34] cost: 3245616
query 16 [63, 64, 65] cost: 312939
query 17 [34, 49, 50, 51, 52] cost: 938760
query 18 [7, 37, 53, 54, 26] cost: 106551
query 19 [34, 49, 50, 51, 52] cost: 938760
query 20 [9, 10, 34] cost: 1153609
query 21 [17, 69, 70, 71, 72, 85, 86, 74,

<IPython.core.display.Javascript object>

Total logical IOs: 3114722
Average logical IOs: 62294
query 0 [5] cost: 10001
query 1 [5] cost: 10001
query 2 [32, 33, 58, 60, 61, 35] cost: 160847
query 3 [51] cost: 10517
query 4 [54, 55, 56, 57, 32, 33, 46, 58, 60] cost: 379735
query 5 [7] cost: 10001
query 6 [5] cost: 10001
query 7 [16] cost: 10339
query 8 [51] cost: 10517
query 9 [41, 42, 54, 55, 56, 57, 32, 33, 46, 58] cost: 661180
query 10 [12] cost: 10001
query 11 [16] cost: 10339
query 12 [16] cost: 10339
query 13 [12] cost: 10001
query 14 [7] cost: 10001
query 15 [8] cost: 10001
query 16 [7] cost: 10001
query 17 [5] cost: 10001
query 18 [48, 49, 50, 51, 38, 39] cost: 90390
query 19 [5] cost: 10001
query 20 [12] cost: 10001
query 21 [52, 53, 41, 55, 56, 32, 33] cost: 559226
query 22 [26] cost: 10001
query 23 [32] cost: 15291
query 24 [53, 41, 54, 55, 32] cost: 531916
query 25 [17] cost: 10339
query 26 [48, 49, 50, 51, 38, 39] cost: 90390
query 27 [8] cost: 10001
query 28 [7] cost: 10001
query 29 [8] cost: 10001
query 30 [48, 4

# Test Beam Search

In [33]:
# pa_beam_nora = PartitionAlgorithm()
# pa_beam_nora.InitializeWithNORA(training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size, using_1_by_1 = True, 
#                                 using_kd = False, depth_limit = None, return_query_cost = False, using_beam_search = True, 
#                                 candidate_size = 10, candidate_depth = 3)
# pa_beam_nora.partition_tree.evaluate_query_cost(training_set)

Build Time (s): 396.50184655189514


21472

In [1]:
paw_beam = PartitionAlgorithm()
paw_beam.InitializeWithPAW(extended_training_set, len(boundary)//2, boundary, dataset, block_size, max_active_ratio = 1000,
                           strategy = 1, using_beam_search = True, candidate_size = 2, candidate_depth = 1)

In [659]:
paw_beam.partition_tree.visualize(queries = extended_training_set)
paw_beam.partition_tree.evaluate_query_cost(extended_training_set, True)
paw_beam.partition_tree.visualize(queries = training_set)
paw_beam.partition_tree.evaluate_query_cost(training_set, True)
# paw_beam.partition_tree.visualize(queries = testing_set)
# paw_beam.partition_tree.evaluate_query_cost(testing_set, True)

<IPython.core.display.Javascript object>

Total logical IOs: 849119
Average logical IOs: 33964
query 0 [1] cost: 64014
query 1 [2] cost: 15045
query 2 [3] cost: 49428
query 3 [28, 34] cost: 36561
query 4 [5] cost: 35031
query 5 [36, 40, 41, 31] cost: 46995
query 6 [6] cost: 37408
query 7 [7] cost: 13848
query 8 [8] cost: 27539
query 9 [28, 36] cost: 32664
query 10 [9] cost: 33152
query 11 [10] cost: 20868
query 12 [11] cost: 15094
query 13 [12] cost: 41529
query 14 [13] cost: 44278
query 15 [14] cost: 19765
query 16 [39] cost: 10073
query 17 [16] cost: 21533
query 18 [17] cost: 34808
query 19 [18] cost: 33202
query 20 [19] cost: 14479
query 21 [20] cost: 54895
query 22 [21] cost: 50747
query 23 [22] cost: 18941
query 24 [28, 34, 35, 36, 40] cost: 77222


<IPython.core.display.Javascript object>

Total logical IOs: 829648
Average logical IOs: 33185
query 0 [1] cost: 64014
query 1 [2] cost: 15045
query 2 [3] cost: 49428
query 3 [34] cost: 17090
query 4 [5] cost: 35031
query 5 [36, 40, 41, 31] cost: 46995
query 6 [6] cost: 37408
query 7 [7] cost: 13848
query 8 [8] cost: 27539
query 9 [28, 36] cost: 32664
query 10 [9] cost: 33152
query 11 [10] cost: 20868
query 12 [11] cost: 15094
query 13 [12] cost: 41529
query 14 [13] cost: 44278
query 15 [14] cost: 19765
query 16 [39] cost: 10073
query 17 [16] cost: 21533
query 18 [17] cost: 34808
query 19 [18] cost: 33202
query 20 [19] cost: 14479
query 21 [20] cost: 54895
query 22 [21] cost: 50747
query 23 [22] cost: 18941
query 24 [28, 34, 35, 36, 40] cost: 77222


33185

In [906]:
# Beam Search with Skewed Dataset (TWEET)

# Using Tweet Dataset
TWEET = np.genfromtxt('C:/Users/Cloud/Desktop/LearnIndex/data/SortedSingleDimPOIs2.csv', delimiter=',')
cols = [1, 0] # change the shape to: first lon, then lat
TWEET_dataset = TWEET[:,cols]
TWEET_boundary = [-180, -90, 180, 90]
TWEET_block_size = 2000 # 1.1M compared to 6M


In [907]:
plt.scatter(TWEET_dataset[:,0], TWEET_dataset[:,1])
plt.show()

In [908]:
# generate a collection of queries:
Tweet_queries = helper._DatasetAndQuerysetHelper__generate_random_query(50, [1,1], [[-180,180],[-90,90]], [36,18])
paw_beam_tree = PartitionTree(2, TWEET_boundary)
paw_beam_tree.visualize(queries = Tweet_queries)

<IPython.core.display.Javascript object>

In [757]:
# def general_size(query):
#     constraints = []
#     for d in range(2):
#         constraint_L = TWEET_dataset[:,d] >= query[d]
#         constraint_U = TWEET_dataset[:,d] <= query[2 + d]
#         constraints.append(constraint_L)
#         constraints.append(constraint_U)
#     constraint = np.all(constraints, axis=0)
#     result_size = np.count_nonzero(constraint)
#     return result_size

# # general_size([-36.30802167175495,-11.723267110779023,38.41879126458899,25.417782988613432])
# general_size([31.13035744375027,4.486078852074243,38.41879126458899,20.86197115648723])

In [711]:
paw_beam_tweet.partition_tree.nid_node_dict[0].__dict__
# len(paw_beam_tweet.partition_tree.nid_node_dict[0].dataset)
paw_beam_tweet.partition_tree.nid_node_dict[0].query_result_size([42.79028732443038, 67.45514960891715, 64.22553896233364, 72.97931477311631])

0

In [2]:
paw_beam_tweet = PartitionAlgorithm()
paw_beam_tweet.InitializeWithPAW(Tweet_queries, len(TWEET_boundary)//2, TWEET_boundary, TWEET_dataset, TWEET_block_size, 
                                 max_active_ratio = 1000, strategy = 1, using_beam_search = True, candidate_size = 5, 
                                 candidate_depth = 2, beam_search_mode = 1)

In [976]:
# paw_beam_tweet.partition_tree.nid_node_dict[0].__dict__

In [983]:
# MBRs = []
# for MBR in paw_beam_tweet.partition_tree.nid_node_dict[0].query_MBRs:
#     MBRs.append(MBR.boundary)
#     print(MBR.boundary)

In [891]:
# paw_beam_tweet.partition_tree.query_single(Tweet_queries[12], print_info = True)

In [982]:
# paw_beam_tweet.partition_tree.visualize(queries = MBRs)

In [1007]:
paw_beam_tweet.partition_tree.visualize(queries = Tweet_queries)
paw_beam_tweet.partition_tree.evaluate_query_cost(Tweet_queries, True)

<IPython.core.display.Javascript object>

Total logical IOs: 468587
Average logical IOs: 9371
query 0 [75, 80] cost: 4764
query 1 [15] cost: 2203
query 2 [15] cost: 2203
query 3 [29] cost: 10400
query 4 [21] cost: 2003
query 5 [15] cost: 2203
query 6 [15] cost: 2203
query 7 [16] cost: 3083
query 8 [15] cost: 2203
query 9 [84, 73, 42] cost: 19043
query 10 [15, 16, 58, 32, 60] cost: 26426
query 11 [67, 81, 83, 84] cost: 27195
query 12 [44] cost: 2453
query 13 [15] cost: 2203
query 14 [15] cost: 2203
query 15 [84] cost: 10133
query 16 [73] cost: 6910
query 17 [15] cost: 2203
query 18 [15] cost: 2203
query 19 [59, 65] cost: 7040
query 20 [74, 29] cost: 12401
query 21 [48] cost: 2002
query 22 [15] cost: 2203
query 23 [58, 59, 32] cost: 20155
query 24 [29] cost: 10400
query 25 [45] cost: 2001
query 26 [15] cost: 2203
query 27 [81, 85, 83, 84, 73] cost: 30337
query 28 [63] cost: 2099
query 29 [80] cost: 2764
query 30 [29, 77] cost: 12402
query 31 [47, 51] cost: 94571
query 32 [50] cost: 10873
query 33 [59, 32, 61] cost: 21079
query 3

9371

In [923]:
# Beam Search with Skewed Dataset (OSM)

# Using OSM Dataset
OSM_dataset = np.genfromtxt('C:/Users/Cloud/iCloudDrive/NORA_experiments/dataset/OSM_6M.csv', delimiter=',')
OSM_boundary = [-180, -90, 180, 90]
OSM_block_size = 10000 # 1.1M compared to 6M

In [730]:
plt.scatter(OSM_dataset[:,0], OSM_dataset[:,1])
plt.show()

In [948]:
OSM_queries = helper._DatasetAndQuerysetHelper__generate_random_query(50, [1,1], [[-180,180],[-90,90]], [36,18])
paw_beam_tree = PartitionTree(2, OSM_boundary)
paw_beam_tree.visualize(queries = OSM_queries)

<IPython.core.display.Javascript object>

In [3]:
paw_beam_osm = PartitionAlgorithm()
paw_beam_osm.InitializeWithPAW(OSM_queries, len(OSM_boundary)//2, OSM_boundary, OSM_dataset, OSM_block_size, 
                               max_active_ratio = 1000, strategy = 1, using_beam_search = True, candidate_size = 5, 
                               candidate_depth = 2, beam_search_mode = 1)

In [972]:
# paw_beam_osm.partition_tree.nid_node_dict[23].query_MBRs[0].__dict__

In [1009]:
paw_beam_osm.partition_tree.visualize(queries = OSM_queries)
paw_beam_osm.partition_tree.evaluate_query_cost(OSM_queries, True)

<IPython.core.display.Javascript object>

Total logical IOs: 1903343
Average logical IOs: 38066
query 0 [21] cost: 10348
query 1 [21] cost: 10348
query 2 [64] cost: 13168
query 3 [21] cost: 10348
query 4 [11] cost: 10001
query 5 [63] cost: 37675
query 6 [42] cost: 10001
query 7 [21] cost: 10348
query 8 [21, 38] cost: 31748
query 9 [64] cost: 13168
query 10 [49] cost: 10001
query 11 [43] cost: 10001
query 12 [21, 38] cost: 31748
query 13 [21] cost: 10348
query 14 [26] cost: 10001
query 15 [58] cost: 22298
query 16 [47] cost: 10001
query 17 [38] cost: 21400
query 18 [65, 40, 41] cost: 46561
query 19 [45] cost: 10000
query 20 [12] cost: 56225
query 21 [66, 28] cost: 56546
query 22 [38] cost: 21400
query 23 [65, 41] cost: 28947
query 24 [14, 60] cost: 878228
query 25 [58] cost: 22298
query 26 [58] cost: 22298
query 27 [13, 64] cost: 23169
query 28 [48] cost: 44645
query 29 [21] cost: 10348
query 30 [23] cost: 10000
query 31 [21] cost: 10348
query 32 [50] cost: 10001
query 33 [21] cost: 10348
query 34 [21] cost: 10348
query 35 [53]

38066

# Test Rtree Filter

In [14]:
for capacity_ratio in [0.5, 0.2, 0.1, 0.05, 0.02, 0.01]:
    
    print("= = = = = Current Capacity Ratio:", capacity_ratio, "(", int(block_size*capacity_ratio),") = = = = =")

    print("= = = NORA = = =")
    pa_rf_nora = PartitionAlgorithm() # we don't need to optimize it?
    pa_rf_nora.InitializeWithNORA(training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size, using_1_by_1 = False)
    pa_rf_nora.CreateRtreeFilter(block_size, capacity_ratio)
    pa_rf_nora.partition_tree.evaluate_query_cost(training_set, print_result = True, using_rtree_filter = False)
    print("")
    pa_rf_nora.partition_tree.evaluate_query_cost(training_set, print_result = True, using_rtree_filter = True)
    print("")

    print("= = = kd-Tree = = =")
    pa_rf_kd = PartitionAlgorithm() # we don't need to optimize it?
    pa_rf_kd.InitializeWithKDT(len(boundary)//2, boundary, dataset, data_threshold = block_size)
    pa_rf_kd.CreateRtreeFilter(block_size, capacity_ratio)
    pa_rf_kd.partition_tree.evaluate_query_cost(training_set, print_result = True, using_rtree_filter = False)
    print("")
    pa_rf_kd.partition_tree.evaluate_query_cost(training_set, print_result = True, using_rtree_filter = True)
    print("")

    print("= = = Qd-Tree = = =")
    pa_rf_qd = PartitionAlgorithm() # we don't need to optimize it?
    pa_rf_qd.InitializeWithQDT(training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size)
    pa_rf_qd.CreateRtreeFilter(block_size, capacity_ratio)
    pa_rf_qd.partition_tree.evaluate_query_cost(training_set, print_result = True, using_rtree_filter = False)
    print("")
    pa_rf_qd.partition_tree.evaluate_query_cost(training_set, print_result = True, using_rtree_filter = True)
    print("")

= = = = = Current Capacity Ratio: 0.5 ( 5000 ) = = = = =
= = = NORA = = =
Build Time (s): 15.475178241729736
Total logical IOs: 1089591
Average logical IOs: 21791
query 0 [120] cost: 46417
query 1 [203] cost: 27097
query 2 [139, 245] cost: 30474
query 3 [161] cost: 15441
query 4 [71] cost: 41623
query 5 [211] cost: 14607
query 6 [250, 214] cost: 41336
query 7 [74] cost: 11993
query 8 [211, 249, 250] cost: 42083
query 9 [121, 195] cost: 37801
query 10 [233] cost: 15943
query 11 [139, 243, 245, 246] cost: 57146
query 12 [101] cost: 21766
query 13 [148] cost: 13323
query 14 [131] cost: 12652
query 15 [257, 258] cost: 25765
query 16 [193] cost: 15852
query 17 [141] cost: 16703
query 18 [222] cost: 13969
query 19 [121] cost: 20800
query 20 [209, 248] cost: 30849
query 21 [200] cost: 16098
query 22 [157] cost: 11878
query 23 [135] cost: 11164
query 24 [152] cost: 18902
query 25 [172] cost: 15498
query 26 [236] cost: 22118
query 27 [209] cost: 18136
query 28 [150] cost: 17007
query 29 [255] c

Build Time (s): 15.721000671386719
Total logical IOs: 1089591
Average logical IOs: 21791
query 0 [120] cost: 46417
query 1 [203] cost: 27097
query 2 [139, 245] cost: 30474
query 3 [161] cost: 15441
query 4 [71] cost: 41623
query 5 [211] cost: 14607
query 6 [250, 214] cost: 41336
query 7 [74] cost: 11993
query 8 [211, 249, 250] cost: 42083
query 9 [121, 195] cost: 37801
query 10 [233] cost: 15943
query 11 [139, 243, 245, 246] cost: 57146
query 12 [101] cost: 21766
query 13 [148] cost: 13323
query 14 [131] cost: 12652
query 15 [257, 258] cost: 25765
query 16 [193] cost: 15852
query 17 [141] cost: 16703
query 18 [222] cost: 13969
query 19 [121] cost: 20800
query 20 [209, 248] cost: 30849
query 21 [200] cost: 16098
query 22 [157] cost: 11878
query 23 [135] cost: 11164
query 24 [152] cost: 18902
query 25 [172] cost: 15498
query 26 [236] cost: 22118
query 27 [209] cost: 18136
query 28 [150] cost: 17007
query 29 [255] cost: 12440
query 30 [189] cost: 22669
query 31 [68] cost: 10020
query 32 [

Build Time (s): 15.333976984024048
Total logical IOs: 1089591
Average logical IOs: 21791
query 0 [120] cost: 46417
query 1 [203] cost: 27097
query 2 [139, 245] cost: 30474
query 3 [161] cost: 15441
query 4 [71] cost: 41623
query 5 [211] cost: 14607
query 6 [250, 214] cost: 41336
query 7 [74] cost: 11993
query 8 [211, 249, 250] cost: 42083
query 9 [121, 195] cost: 37801
query 10 [233] cost: 15943
query 11 [139, 243, 245, 246] cost: 57146
query 12 [101] cost: 21766
query 13 [148] cost: 13323
query 14 [131] cost: 12652
query 15 [257, 258] cost: 25765
query 16 [193] cost: 15852
query 17 [141] cost: 16703
query 18 [222] cost: 13969
query 19 [121] cost: 20800
query 20 [209, 248] cost: 30849
query 21 [200] cost: 16098
query 22 [157] cost: 11878
query 23 [135] cost: 11164
query 24 [152] cost: 18902
query 25 [172] cost: 15498
query 26 [236] cost: 22118
query 27 [209] cost: 18136
query 28 [150] cost: 17007
query 29 [255] cost: 12440
query 30 [189] cost: 22669
query 31 [68] cost: 10020
query 32 [

Build Time (s): 15.352037191390991
Total logical IOs: 1089591
Average logical IOs: 21791
query 0 [120] cost: 46417
query 1 [203] cost: 27097
query 2 [139, 245] cost: 30474
query 3 [161] cost: 15441
query 4 [71] cost: 41623
query 5 [211] cost: 14607
query 6 [250, 214] cost: 41336
query 7 [74] cost: 11993
query 8 [211, 249, 250] cost: 42083
query 9 [121, 195] cost: 37801
query 10 [233] cost: 15943
query 11 [139, 243, 245, 246] cost: 57146
query 12 [101] cost: 21766
query 13 [148] cost: 13323
query 14 [131] cost: 12652
query 15 [257, 258] cost: 25765
query 16 [193] cost: 15852
query 17 [141] cost: 16703
query 18 [222] cost: 13969
query 19 [121] cost: 20800
query 20 [209, 248] cost: 30849
query 21 [200] cost: 16098
query 22 [157] cost: 11878
query 23 [135] cost: 11164
query 24 [152] cost: 18902
query 25 [172] cost: 15498
query 26 [236] cost: 22118
query 27 [209] cost: 18136
query 28 [150] cost: 17007
query 29 [255] cost: 12440
query 30 [189] cost: 22669
query 31 [68] cost: 10020
query 32 [

Build Time (s): 15.299104690551758
Total logical IOs: 1089591
Average logical IOs: 21791
query 0 [120] cost: 46417
query 1 [203] cost: 27097
query 2 [139, 245] cost: 30474
query 3 [161] cost: 15441
query 4 [71] cost: 41623
query 5 [211] cost: 14607
query 6 [250, 214] cost: 41336
query 7 [74] cost: 11993
query 8 [211, 249, 250] cost: 42083
query 9 [121, 195] cost: 37801
query 10 [233] cost: 15943
query 11 [139, 243, 245, 246] cost: 57146
query 12 [101] cost: 21766
query 13 [148] cost: 13323
query 14 [131] cost: 12652
query 15 [257, 258] cost: 25765
query 16 [193] cost: 15852
query 17 [141] cost: 16703
query 18 [222] cost: 13969
query 19 [121] cost: 20800
query 20 [209, 248] cost: 30849
query 21 [200] cost: 16098
query 22 [157] cost: 11878
query 23 [135] cost: 11164
query 24 [152] cost: 18902
query 25 [172] cost: 15498
query 26 [236] cost: 22118
query 27 [209] cost: 18136
query 28 [150] cost: 17007
query 29 [255] cost: 12440
query 30 [189] cost: 22669
query 31 [68] cost: 10020
query 32 [

Build Time (s): 15.887999534606934
Total logical IOs: 1089591
Average logical IOs: 21791
query 0 [120] cost: 46417
query 1 [203] cost: 27097
query 2 [139, 245] cost: 30474
query 3 [161] cost: 15441
query 4 [71] cost: 41623
query 5 [211] cost: 14607
query 6 [250, 214] cost: 41336
query 7 [74] cost: 11993
query 8 [211, 249, 250] cost: 42083
query 9 [121, 195] cost: 37801
query 10 [233] cost: 15943
query 11 [139, 243, 245, 246] cost: 57146
query 12 [101] cost: 21766
query 13 [148] cost: 13323
query 14 [131] cost: 12652
query 15 [257, 258] cost: 25765
query 16 [193] cost: 15852
query 17 [141] cost: 16703
query 18 [222] cost: 13969
query 19 [121] cost: 20800
query 20 [209, 248] cost: 30849
query 21 [200] cost: 16098
query 22 [157] cost: 11878
query 23 [135] cost: 11164
query 24 [152] cost: 18902
query 25 [172] cost: 15498
query 26 [236] cost: 22118
query 27 [209] cost: 18136
query 28 [150] cost: 17007
query 29 [255] cost: 12440
query 30 [189] cost: 22669
query 31 [68] cost: 10020
query 32 [

In [26]:
def total_MBR_volume(partition_tree, num_dims):
    leaves = partition_tree.get_leaves()
    MBRs = []
    for leaf in leaves:
        if leaf.rtree_filters is not None:
            MBRs += leaf.rtree_filters
    total_volume = 0
    for MBR in MBRs:
        volume = 1
        for k in range(num_dims):
            volume *= MBR[k + num_dims] - MBR[k]
        total_volume += volume
    print("!!! Total Volume:",total_volume)

In [47]:
# Using Tweet Dataset
TWEET = np.genfromtxt('C:/Users/Cloud/Desktop/LearnIndex/data/SortedSingleDimPOIs2.csv', delimiter=',')
cols = [1, 0] # change the shape to: first lon, then lat
TWEET_dataset = TWEET[:,cols]

# generate a collection of queries:
Tweet_queries = helper._DatasetAndQuerysetHelper__generate_random_query(50, [1,1], [[-180,180],[-90,90]], [36,18])
TWEET_boundary = [-180, -90, 180, 90]
TWEET_block_size = 2000 # 1.1M compared to 6M

In [50]:
for capacity_ratio in [0.5, 0.2, 0.1, 0.05, 0.02, 0.01]:
    
    print("= = = = = Current Capacity Ratio:", capacity_ratio, "(", int(TWEET_block_size*capacity_ratio),") = = = = =")
    
    print("= = = NORA = = =")
    pa_rf_nora = PartitionAlgorithm() # we don't need to optimize it?
    pa_rf_nora.InitializeWithNORA(Tweet_queries, 2, TWEET_boundary, TWEET_dataset, data_threshold = TWEET_block_size, using_1_by_1 = True, using_kd = True)
    pa_rf_nora.CreateRtreeFilter(TWEET_block_size, capacity_ratio)
    pa_rf_nora.partition_tree.evaluate_query_cost(Tweet_queries, print_result = True, using_rtree_filter = False)
    print("")
    pa_rf_nora.partition_tree.evaluate_query_cost(Tweet_queries, print_result = True, using_rtree_filter = True)
    print("")

    print("= = = kd-Tree = = =")
    pa_rf_kd = PartitionAlgorithm() # we don't need to optimize it?
    pa_rf_kd.InitializeWithKDT(2, TWEET_boundary, TWEET_dataset, data_threshold = TWEET_block_size)
    pa_rf_kd.CreateRtreeFilter(TWEET_block_size, capacity_ratio)
    pa_rf_kd.partition_tree.evaluate_query_cost(Tweet_queries, print_result = True, using_rtree_filter = False)
    print("")
    pa_rf_kd.partition_tree.evaluate_query_cost(Tweet_queries, print_result = True, using_rtree_filter = True)
    print("")

    print("= = = Qd-Tree = = =")
    pa_rf_qd = PartitionAlgorithm() # we don't need to optimize it?
    pa_rf_qd.InitializeWithQDT(Tweet_queries, 2, TWEET_boundary, TWEET_dataset, data_threshold = TWEET_block_size)
    pa_rf_qd.CreateRtreeFilter(TWEET_block_size, capacity_ratio)
    pa_rf_qd.partition_tree.evaluate_query_cost(Tweet_queries, print_result = True, using_rtree_filter = False)
    print("")
    pa_rf_qd.partition_tree.evaluate_query_cost(Tweet_queries, print_result = True, using_rtree_filter = True)
    print("")
    
    total_MBR_volume(pa_rf_nora.partition_tree, 2)
    print("")

= = = = = Current Capacity Ratio: 0.5 ( 1000 ) = = = = =
= = = NORA = = =
Build Time (s): 52.45396113395691
Total logical IOs: 129618
Average logical IOs: 2592
query 0 [40, 91] cost: 3319
query 1 [7] cost: 2711
query 2 [34] cost: 2108
query 3 [7] cost: 2711
query 4 [28] cost: 2210
query 5 [7] cost: 2711
query 6 [58] cost: 2974
query 7 [7] cost: 2711
query 8 [75] cost: 0
query 9 [30] cost: 3792
query 10 [86] cost: 0
query 11 [70] cost: 3548
query 12 [28] cost: 2210
query 13 [84] cost: 0
query 14 [71, 80] cost: 2915
query 15 [7] cost: 2711
query 16 [7] cost: 2711
query 17 [34] cost: 2108
query 18 [91] cost: 3314
query 19 [70, 28] cost: 5758
query 20 [71] cost: 2915
query 21 [28] cost: 2210
query 22 [55] cost: 0
query 23 [67, 28] cost: 2210
query 24 [50] cost: 0
query 25 [7, 15] cost: 6179
query 26 [7] cost: 2711
query 27 [71, 72, 73] cost: 5896
query 28 [7] cost: 2711
query 29 [82] cost: 0
query 30 [7] cost: 2711
query 31 [30] cost: 3792
query 32 [28] cost: 2210
query 33 [34] cost: 2108


Build Time (s): 51.1211678981781
Total logical IOs: 129618
Average logical IOs: 2592
query 0 [40, 91] cost: 3319
query 1 [7] cost: 2711
query 2 [34] cost: 2108
query 3 [7] cost: 2711
query 4 [28] cost: 2210
query 5 [7] cost: 2711
query 6 [58] cost: 2974
query 7 [7] cost: 2711
query 8 [75] cost: 0
query 9 [30] cost: 3792
query 10 [86] cost: 0
query 11 [70] cost: 3548
query 12 [28] cost: 2210
query 13 [84] cost: 0
query 14 [71, 80] cost: 2915
query 15 [7] cost: 2711
query 16 [7] cost: 2711
query 17 [34] cost: 2108
query 18 [91] cost: 3314
query 19 [70, 28] cost: 5758
query 20 [71] cost: 2915
query 21 [28] cost: 2210
query 22 [55] cost: 0
query 23 [67, 28] cost: 2210
query 24 [50] cost: 0
query 25 [7, 15] cost: 6179
query 26 [7] cost: 2711
query 27 [71, 72, 73] cost: 5896
query 28 [7] cost: 2711
query 29 [82] cost: 0
query 30 [7] cost: 2711
query 31 [30] cost: 3792
query 32 [28] cost: 2210
query 33 [34] cost: 2108
query 34 [91] cost: 3314
query 35 [7] cost: 2711
query 36 [7] cost: 2711
qu

Build Time (s): 51.166011333465576
Total logical IOs: 129618
Average logical IOs: 2592
query 0 [40, 91] cost: 3319
query 1 [7] cost: 2711
query 2 [34] cost: 2108
query 3 [7] cost: 2711
query 4 [28] cost: 2210
query 5 [7] cost: 2711
query 6 [58] cost: 2974
query 7 [7] cost: 2711
query 8 [75] cost: 0
query 9 [30] cost: 3792
query 10 [86] cost: 0
query 11 [70] cost: 3548
query 12 [28] cost: 2210
query 13 [84] cost: 0
query 14 [71, 80] cost: 2915
query 15 [7] cost: 2711
query 16 [7] cost: 2711
query 17 [34] cost: 2108
query 18 [91] cost: 3314
query 19 [70, 28] cost: 5758
query 20 [71] cost: 2915
query 21 [28] cost: 2210
query 22 [55] cost: 0
query 23 [67, 28] cost: 2210
query 24 [50] cost: 0
query 25 [7, 15] cost: 6179
query 26 [7] cost: 2711
query 27 [71, 72, 73] cost: 5896
query 28 [7] cost: 2711
query 29 [82] cost: 0
query 30 [7] cost: 2711
query 31 [30] cost: 3792
query 32 [28] cost: 2210
query 33 [34] cost: 2108
query 34 [91] cost: 3314
query 35 [7] cost: 2711
query 36 [7] cost: 2711


Build Time (s): 51.189000368118286
Total logical IOs: 129618
Average logical IOs: 2592
query 0 [40, 91] cost: 3319
query 1 [7] cost: 2711
query 2 [34] cost: 2108
query 3 [7] cost: 2711
query 4 [28] cost: 2210
query 5 [7] cost: 2711
query 6 [58] cost: 2974
query 7 [7] cost: 2711
query 8 [75] cost: 0
query 9 [30] cost: 3792
query 10 [86] cost: 0
query 11 [70] cost: 3548
query 12 [28] cost: 2210
query 13 [84] cost: 0
query 14 [71, 80] cost: 2915
query 15 [7] cost: 2711
query 16 [7] cost: 2711
query 17 [34] cost: 2108
query 18 [91] cost: 3314
query 19 [70, 28] cost: 5758
query 20 [71] cost: 2915
query 21 [28] cost: 2210
query 22 [55] cost: 0
query 23 [67, 28] cost: 2210
query 24 [50] cost: 0
query 25 [7, 15] cost: 6179
query 26 [7] cost: 2711
query 27 [71, 72, 73] cost: 5896
query 28 [7] cost: 2711
query 29 [82] cost: 0
query 30 [7] cost: 2711
query 31 [30] cost: 3792
query 32 [28] cost: 2210
query 33 [34] cost: 2108
query 34 [91] cost: 3314
query 35 [7] cost: 2711
query 36 [7] cost: 2711


Build Time (s): 51.30700087547302
Total logical IOs: 129618
Average logical IOs: 2592
query 0 [40, 91] cost: 3319
query 1 [7] cost: 2711
query 2 [34] cost: 2108
query 3 [7] cost: 2711
query 4 [28] cost: 2210
query 5 [7] cost: 2711
query 6 [58] cost: 2974
query 7 [7] cost: 2711
query 8 [75] cost: 0
query 9 [30] cost: 3792
query 10 [86] cost: 0
query 11 [70] cost: 3548
query 12 [28] cost: 2210
query 13 [84] cost: 0
query 14 [71, 80] cost: 2915
query 15 [7] cost: 2711
query 16 [7] cost: 2711
query 17 [34] cost: 2108
query 18 [91] cost: 3314
query 19 [70, 28] cost: 5758
query 20 [71] cost: 2915
query 21 [28] cost: 2210
query 22 [55] cost: 0
query 23 [67, 28] cost: 2210
query 24 [50] cost: 0
query 25 [7, 15] cost: 6179
query 26 [7] cost: 2711
query 27 [71, 72, 73] cost: 5896
query 28 [7] cost: 2711
query 29 [82] cost: 0
query 30 [7] cost: 2711
query 31 [30] cost: 3792
query 32 [28] cost: 2210
query 33 [34] cost: 2108
query 34 [91] cost: 3314
query 35 [7] cost: 2711
query 36 [7] cost: 2711
q

Build Time (s): 51.17596220970154
Total logical IOs: 129618
Average logical IOs: 2592
query 0 [40, 91] cost: 3319
query 1 [7] cost: 2711
query 2 [34] cost: 2108
query 3 [7] cost: 2711
query 4 [28] cost: 2210
query 5 [7] cost: 2711
query 6 [58] cost: 2974
query 7 [7] cost: 2711
query 8 [75] cost: 0
query 9 [30] cost: 3792
query 10 [86] cost: 0
query 11 [70] cost: 3548
query 12 [28] cost: 2210
query 13 [84] cost: 0
query 14 [71, 80] cost: 2915
query 15 [7] cost: 2711
query 16 [7] cost: 2711
query 17 [34] cost: 2108
query 18 [91] cost: 3314
query 19 [70, 28] cost: 5758
query 20 [71] cost: 2915
query 21 [28] cost: 2210
query 22 [55] cost: 0
query 23 [67, 28] cost: 2210
query 24 [50] cost: 0
query 25 [7, 15] cost: 6179
query 26 [7] cost: 2711
query 27 [71, 72, 73] cost: 5896
query 28 [7] cost: 2711
query 29 [82] cost: 0
query 30 [7] cost: 2711
query 31 [30] cost: 3792
query 32 [28] cost: 2210
query 33 [34] cost: 2108
query 34 [91] cost: 3314
query 35 [7] cost: 2711
query 36 [7] cost: 2711
q

In [12]:
# Using OSM Dataset + default distance threshold

OSM_dataset = np.genfromtxt('C:/Users/Cloud/iCloudDrive/NORA_experiments/dataset/OSM_6M.csv', delimiter=',')
OSM_boundary = [-180, -90, 180, 90]
OSM_block_size = 10000 # 1.1M compared to 6M

# helper = DatasetAndQuerysetHelper(base_path = 'C:/Users/Cloud/iCloudDrive/NORA_experiments')
# OSM_queries = helper._DatasetAndQuerysetHelper__generate_random_query(50, [1,1], [[-180,180],[-90,90]], [36,18])
# paw_beam_tree = PartitionTree(2, OSM_boundary)
# paw_beam_tree.visualize(queries = OSM_queries)

# extended_OSM_queries = helper.extend_queryset(OSM_queries, QDistThreshold_percent = None, domains = [[-180,180],[-90,90]])
# paw_beam_tree.visualize(queries = extended_OSM_queries)

In [16]:
plt.scatter(OSM_dataset[:,0], OSM_dataset[:,1])
plt.show()

In [61]:
# helper = DatasetAndQuerysetHelper(base_path = 'C:/Users/Cloud/iCloudDrive/NORA_experiments')

In [13]:
helper = DatasetAndQuerysetHelper(base_path = 'C:/Users/Cloud/iCloudDrive/NORA_experiments')
OSM_queries = helper._DatasetAndQuerysetHelper__generate_random_query(50, [1,1], [[-180,180],[-90,90]], [36,18])
paw_beam_tree = PartitionTree(2, OSM_boundary)
paw_beam_tree.visualize(queries = OSM_queries)

extended_OSM_queries = helper.extend_queryset(OSM_queries, QDistThreshold_percent = None, domains = [[-180,180],[-90,90]])
paw_beam_tree.visualize(queries = extended_OSM_queries)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [42]:
# [0.5, 0.2, 0.1, 0.05, 0.02, 0.01]
for capacity_ratio in [0.2, 0.1, 0.05, 0.02, 0.01]:
    
    print("= = = = = Current Capacity Ratio:", capacity_ratio, "(", int(OSM_block_size*capacity_ratio),") = = = = =")
    
    print("= = = PAW = = =")
    pa_rf_paw = PartitionAlgorithm() # we don't need to optimize it? 
    pa_rf_paw.InitializeWithPAW(extended_OSM_queries, len(OSM_boundary)//2, OSM_boundary, OSM_dataset, OSM_block_size, 1000, strategy = 1)
    pa_rf_paw.CreateRtreeFilter(OSM_block_size, capacity_ratio)
    pa_rf_paw.partition_tree.evaluate_query_cost(extended_OSM_queries, print_result = True, using_rtree_filter = False)
    print("")
    pa_rf_paw.partition_tree.evaluate_query_cost(extended_OSM_queries, print_result = True, using_rtree_filter = True)
    print("")

    print("= = = kd-Tree = = =")
    pa_rf_kd = PartitionAlgorithm() # we don't need to optimize it?
    pa_rf_kd.InitializeWithKDT(2, OSM_boundary, OSM_dataset, data_threshold = OSM_block_size)
    pa_rf_kd.CreateRtreeFilter(OSM_block_size, capacity_ratio)
    pa_rf_kd.partition_tree.evaluate_query_cost(extended_OSM_queries, print_result = True, using_rtree_filter = False)
    print("")
    pa_rf_kd.partition_tree.evaluate_query_cost(extended_OSM_queries, print_result = True, using_rtree_filter = True)
    print("")

    print("= = = Qd-Tree = = =")
    pa_rf_qd = PartitionAlgorithm() # we don't need to optimize it?
    pa_rf_qd.InitializeWithQDT(OSM_queries, 2, OSM_boundary, OSM_dataset, data_threshold = OSM_block_size)
    pa_rf_qd.CreateRtreeFilter(OSM_block_size, capacity_ratio)
    pa_rf_qd.partition_tree.evaluate_query_cost(extended_OSM_queries, print_result = True, using_rtree_filter = False)
    print("")
    pa_rf_qd.partition_tree.evaluate_query_cost(extended_OSM_queries, print_result = True, using_rtree_filter = True)
    print("")
    
    #total_MBR_volume(pa_rf_paw.partition_tree, 2)
    #print("")

= = = = = Current Capacity Ratio: 0.2 ( 2000 ) = = = = =
= = = PAW = = =
Build Time (s): 285.72472953796387
Total logical IOs: 3350748
Average logical IOs: 67014
query 0 [14, 32] cost: 33351
query 1 [74, 90, 96, 92] cost: 247628
query 2 [37] cost: 17858
query 3 [54, 37] cost: 27859
query 4 [37] cost: 17858
query 5 [86, 87] cost: 39933
query 6 [38] cost: 10338
query 7 [37] cost: 17858
query 8 [96, 97, 95] cost: 536657
query 9 [82] cost: 10532
query 10 [65, 66] cost: 68917
query 11 [84, 63, 89, 73] cost: 100622
query 12 [28] cost: 10001
query 13 [88, 89] cost: 31027
query 14 [56] cost: 10001
query 15 [37] cost: 17858
query 16 [15, 33] cost: 532550
query 17 [46] cost: 17786
query 18 [26] cost: 10001
query 19 [37] cost: 17858
query 20 [38] cost: 10338
query 21 [82, 46] cost: 28318
query 22 [65] cost: 11370
query 23 [62] cost: 14774
query 24 [62] cost: 14774
query 25 [34] cost: 10001
query 26 [37] cost: 17858
query 27 [46, 27] cost: 27787
query 28 [92] cost: 10331
query 29 [11] cost: 10001


Total logical IOs: 16485368
Average logical IOs: 329707
query 0 [90, 73, 74, 91, 92, 93, 77] cost: 545800
query 1 [67, 95, 96, 86] cost: 367726
query 2 [29, 30] cost: 39396
query 3 [94, 29] cost: 35646
query 4 [29] cost: 24538
query 5 [38, 83, 41] cost: 96756
query 6 [15] cost: 17053
query 7 [30] cost: 14858
query 8 [67, 96, 86, 87, 101, 102, 100, 98, 89, 90] cost: 1232734
query 9 [79, 32] cost: 66970
query 10 [57, 58, 81, 82] cost: 97642
query 11 [32, 55, 56, 34, 84, 62, 40] cost: 333117
query 12 [65, 66, 44, 50] cost: 296175
query 13 [83, 84, 62, 41, 65] cost: 161898
query 14 [79, 80] cost: 53533
query 15 [29] cost: 24538
query 16 [25, 77, 78, 28] cost: 2837840
query 17 [37, 38] cost: 42109
query 18 [63, 64, 66, 44, 25] cost: 2325451
query 19 [29] cost: 24538
query 20 [15] cost: 17053
query 21 [79, 32, 37] cost: 85194
query 22 [57] cost: 10716
query 23 [55, 34] cost: 107758
query 24 [34] cost: 19955
query 25 [91, 93, 94] cost: 71102
query 26 [29] cost: 24538
query 27 [41, 63] cost: 2

Total logical IOs: 16485368
Average logical IOs: 329707
query 0 [90, 73, 74, 91, 92, 93, 77] cost: 545800
query 1 [67, 95, 96, 86] cost: 367726
query 2 [29, 30] cost: 39396
query 3 [94, 29] cost: 35646
query 4 [29] cost: 24538
query 5 [38, 83, 41] cost: 96756
query 6 [15] cost: 17053
query 7 [30] cost: 14858
query 8 [67, 96, 86, 87, 101, 102, 100, 98, 89, 90] cost: 1232734
query 9 [79, 32] cost: 66970
query 10 [57, 58, 81, 82] cost: 97642
query 11 [32, 55, 56, 34, 84, 62, 40] cost: 333117
query 12 [65, 66, 44, 50] cost: 296175
query 13 [83, 84, 62, 41, 65] cost: 161898
query 14 [79, 80] cost: 53533
query 15 [29] cost: 24538
query 16 [25, 77, 78, 28] cost: 2837840
query 17 [37, 38] cost: 42109
query 18 [63, 64, 66, 44, 25] cost: 2325451
query 19 [29] cost: 24538
query 20 [15] cost: 17053
query 21 [79, 32, 37] cost: 85194
query 22 [57] cost: 10716
query 23 [55, 34] cost: 107758
query 24 [34] cost: 19955
query 25 [91, 93, 94] cost: 71102
query 26 [29] cost: 24538
query 27 [41, 63] cost: 2

Total logical IOs: 16485368
Average logical IOs: 329707
query 0 [90, 73, 74, 91, 92, 93, 77] cost: 545800
query 1 [67, 95, 96, 86] cost: 367726
query 2 [29, 30] cost: 39396
query 3 [94, 29] cost: 35646
query 4 [29] cost: 24538
query 5 [38, 83, 41] cost: 96756
query 6 [15] cost: 17053
query 7 [30] cost: 14858
query 8 [67, 96, 86, 87, 101, 102, 100, 98, 89, 90] cost: 1232734
query 9 [79, 32] cost: 66970
query 10 [57, 58, 81, 82] cost: 97642
query 11 [32, 55, 56, 34, 84, 62, 40] cost: 333117
query 12 [65, 66, 44, 50] cost: 296175
query 13 [83, 84, 62, 41, 65] cost: 161898
query 14 [79, 80] cost: 53533
query 15 [29] cost: 24538
query 16 [25, 77, 78, 28] cost: 2837840
query 17 [37, 38] cost: 42109
query 18 [63, 64, 66, 44, 25] cost: 2325451
query 19 [29] cost: 24538
query 20 [15] cost: 17053
query 21 [79, 32, 37] cost: 85194
query 22 [57] cost: 10716
query 23 [55, 34] cost: 107758
query 24 [34] cost: 19955
query 25 [91, 93, 94] cost: 71102
query 26 [29] cost: 24538
query 27 [41, 63] cost: 2

Total logical IOs: 16485368
Average logical IOs: 329707
query 0 [90, 73, 74, 91, 92, 93, 77] cost: 545800
query 1 [67, 95, 96, 86] cost: 367726
query 2 [29, 30] cost: 39396
query 3 [94, 29] cost: 35646
query 4 [29] cost: 24538
query 5 [38, 83, 41] cost: 96756
query 6 [15] cost: 17053
query 7 [30] cost: 14858
query 8 [67, 96, 86, 87, 101, 102, 100, 98, 89, 90] cost: 1232734
query 9 [79, 32] cost: 66970
query 10 [57, 58, 81, 82] cost: 97642
query 11 [32, 55, 56, 34, 84, 62, 40] cost: 333117
query 12 [65, 66, 44, 50] cost: 296175
query 13 [83, 84, 62, 41, 65] cost: 161898
query 14 [79, 80] cost: 53533
query 15 [29] cost: 24538
query 16 [25, 77, 78, 28] cost: 2837840
query 17 [37, 38] cost: 42109
query 18 [63, 64, 66, 44, 25] cost: 2325451
query 19 [29] cost: 24538
query 20 [15] cost: 17053
query 21 [79, 32, 37] cost: 85194
query 22 [57] cost: 10716
query 23 [55, 34] cost: 107758
query 24 [34] cost: 19955
query 25 [91, 93, 94] cost: 71102
query 26 [29] cost: 24538
query 27 [41, 63] cost: 2

Total logical IOs: 16485368
Average logical IOs: 329707
query 0 [90, 73, 74, 91, 92, 93, 77] cost: 545800
query 1 [67, 95, 96, 86] cost: 367726
query 2 [29, 30] cost: 39396
query 3 [94, 29] cost: 35646
query 4 [29] cost: 24538
query 5 [38, 83, 41] cost: 96756
query 6 [15] cost: 17053
query 7 [30] cost: 14858
query 8 [67, 96, 86, 87, 101, 102, 100, 98, 89, 90] cost: 1232734
query 9 [79, 32] cost: 66970
query 10 [57, 58, 81, 82] cost: 97642
query 11 [32, 55, 56, 34, 84, 62, 40] cost: 333117
query 12 [65, 66, 44, 50] cost: 296175
query 13 [83, 84, 62, 41, 65] cost: 161898
query 14 [79, 80] cost: 53533
query 15 [29] cost: 24538
query 16 [25, 77, 78, 28] cost: 2837840
query 17 [37, 38] cost: 42109
query 18 [63, 64, 66, 44, 25] cost: 2325451
query 19 [29] cost: 24538
query 20 [15] cost: 17053
query 21 [79, 32, 37] cost: 85194
query 22 [57] cost: 10716
query 23 [55, 34] cost: 107758
query 24 [34] cost: 19955
query 25 [91, 93, 94] cost: 71102
query 26 [29] cost: 24538
query 27 [41, 63] cost: 2

In [126]:
# Using OSM Dataset + distance threshold = 0

for capacity_ratio in [0.5, 0.2, 0.1, 0.05, 0.02, 0.01]:
    
    print("= = = = = Current Capacity Ratio:", capacity_ratio, "(", int(OSM_block_size*capacity_ratio),") = = = = =")
    
    print("= = = PAW = = =")
    pa_rf_paw = PartitionAlgorithm() # we don't need to optimize it? 
    pa_rf_paw.InitializeWithPAW(OSM_queries, len(OSM_boundary)//2, OSM_boundary, OSM_dataset, OSM_block_size, 1000, strategy = 1)
    pa_rf_paw.CreateRtreeFilter(OSM_block_size, capacity_ratio)
    pa_rf_paw.partition_tree.evaluate_query_cost(OSM_queries, print_result = True, using_rtree_filter = False)
    print("")
    pa_rf_paw.partition_tree.evaluate_query_cost(OSM_queries, print_result = True, using_rtree_filter = True)
    print("")

    print("= = = kd-Tree = = =")
    pa_rf_kd = PartitionAlgorithm() # we don't need to optimize it?
    pa_rf_kd.InitializeWithKDT(2, OSM_boundary, OSM_dataset, data_threshold = OSM_block_size)
    pa_rf_kd.CreateRtreeFilter(OSM_block_size, capacity_ratio)
    pa_rf_kd.partition_tree.evaluate_query_cost(OSM_queries, print_result = True, using_rtree_filter = False)
    print("")
    pa_rf_kd.partition_tree.evaluate_query_cost(OSM_queries, print_result = True, using_rtree_filter = True)
    print("")

    print("= = = Qd-Tree = = =")
    pa_rf_qd = PartitionAlgorithm() # we don't need to optimize it?
    pa_rf_qd.InitializeWithQDT(OSM_queries, 2, OSM_boundary, OSM_dataset, data_threshold = OSM_block_size)
    pa_rf_qd.CreateRtreeFilter(OSM_block_size, capacity_ratio)
    pa_rf_qd.partition_tree.evaluate_query_cost(OSM_queries, print_result = True, using_rtree_filter = False)
    print("")
    pa_rf_qd.partition_tree.evaluate_query_cost(OSM_queries, print_result = True, using_rtree_filter = True)
    print("")
    
    #total_MBR_volume(pa_rf_paw.partition_tree, 2)
    #print("")

= = = = = Current Capacity Ratio: 0.5 ( 5000 ) = = = = =
= = = PAW = = =
Build Time (s): 451.24836564064026
Total logical IOs: 2142667
Average logical IOs: 42853
query 0 [71, 31] cost: 20464
query 1 [47] cost: 166286
query 2 [35] cost: 19698
query 3 [34] cost: 19698
query 4 [34] cost: 19698
query 5 [65] cost: 10001
query 6 [15] cost: 17053
query 7 [35] cost: 19698
query 8 [86] cost: 176853
query 9 [73] cost: 10001
query 10 [60, 61] cost: 64189
query 11 [76, 58, 39, 44] cost: 90025
query 12 [24] cost: 10001
query 13 [45] cost: 18675
query 14 [53] cost: 10001
query 15 [34] cost: 19698
query 16 [30] cost: 137116
query 17 [42] cost: 18224
query 18 [23] cost: 10000
query 19 [34] cost: 19698
query 20 [15] cost: 17053
query 21 [73] cost: 10001
query 22 [60] cost: 10716
query 23 [39] cost: 19955
query 24 [39] cost: 19955
query 25 [32] cost: 10001
query 26 [34] cost: 19698
query 27 [21] cost: 10001
query 28 [79] cost: 10001
query 29 [28] cost: 10001
query 30 [49] cost: 10001
query 31 [15] cost:

Build Time (s): 455.539671421051
Total logical IOs: 2142667
Average logical IOs: 42853
query 0 [71, 31] cost: 20464
query 1 [47] cost: 166286
query 2 [35] cost: 19698
query 3 [34] cost: 19698
query 4 [34] cost: 19698
query 5 [65] cost: 10001
query 6 [15] cost: 17053
query 7 [35] cost: 19698
query 8 [86] cost: 176853
query 9 [73] cost: 10001
query 10 [60, 61] cost: 64189
query 11 [76, 58, 39, 44] cost: 90025
query 12 [24] cost: 10001
query 13 [45] cost: 18675
query 14 [53] cost: 10001
query 15 [34] cost: 19698
query 16 [30] cost: 137116
query 17 [42] cost: 18224
query 18 [23] cost: 10000
query 19 [34] cost: 19698
query 20 [15] cost: 17053
query 21 [73] cost: 10001
query 22 [60] cost: 10716
query 23 [39] cost: 19955
query 24 [39] cost: 19955
query 25 [32] cost: 10001
query 26 [34] cost: 19698
query 27 [21] cost: 10001
query 28 [79] cost: 10001
query 29 [28] cost: 10001
query 30 [49] cost: 10001
query 31 [15] cost: 17053
query 32 [15] cost: 17053
query 33 [34] cost: 19698
query 34 [54] co

Build Time (s): 451.07303071022034
Total logical IOs: 2142667
Average logical IOs: 42853
query 0 [71, 31] cost: 20464
query 1 [47] cost: 166286
query 2 [35] cost: 19698
query 3 [34] cost: 19698
query 4 [34] cost: 19698
query 5 [65] cost: 10001
query 6 [15] cost: 17053
query 7 [35] cost: 19698
query 8 [86] cost: 176853
query 9 [73] cost: 10001
query 10 [60, 61] cost: 64189
query 11 [76, 58, 39, 44] cost: 90025
query 12 [24] cost: 10001
query 13 [45] cost: 18675
query 14 [53] cost: 10001
query 15 [34] cost: 19698
query 16 [30] cost: 137116
query 17 [42] cost: 18224
query 18 [23] cost: 10000
query 19 [34] cost: 19698
query 20 [15] cost: 17053
query 21 [73] cost: 10001
query 22 [60] cost: 10716
query 23 [39] cost: 19955
query 24 [39] cost: 19955
query 25 [32] cost: 10001
query 26 [34] cost: 19698
query 27 [21] cost: 10001
query 28 [79] cost: 10001
query 29 [28] cost: 10001
query 30 [49] cost: 10001
query 31 [15] cost: 17053
query 32 [15] cost: 17053
query 33 [34] cost: 19698
query 34 [54] 

Build Time (s): 454.9750266075134
Total logical IOs: 2142667
Average logical IOs: 42853
query 0 [71, 31] cost: 20464
query 1 [47] cost: 166286
query 2 [35] cost: 19698
query 3 [34] cost: 19698
query 4 [34] cost: 19698
query 5 [65] cost: 10001
query 6 [15] cost: 17053
query 7 [35] cost: 19698
query 8 [86] cost: 176853
query 9 [73] cost: 10001
query 10 [60, 61] cost: 64189
query 11 [76, 58, 39, 44] cost: 90025
query 12 [24] cost: 10001
query 13 [45] cost: 18675
query 14 [53] cost: 10001
query 15 [34] cost: 19698
query 16 [30] cost: 137116
query 17 [42] cost: 18224
query 18 [23] cost: 10000
query 19 [34] cost: 19698
query 20 [15] cost: 17053
query 21 [73] cost: 10001
query 22 [60] cost: 10716
query 23 [39] cost: 19955
query 24 [39] cost: 19955
query 25 [32] cost: 10001
query 26 [34] cost: 19698
query 27 [21] cost: 10001
query 28 [79] cost: 10001
query 29 [28] cost: 10001
query 30 [49] cost: 10001
query 31 [15] cost: 17053
query 32 [15] cost: 17053
query 33 [34] cost: 19698
query 34 [54] c

Build Time (s): 457.1760549545288
Total logical IOs: 2142667
Average logical IOs: 42853
query 0 [71, 31] cost: 20464
query 1 [47] cost: 166286
query 2 [35] cost: 19698
query 3 [34] cost: 19698
query 4 [34] cost: 19698
query 5 [65] cost: 10001
query 6 [15] cost: 17053
query 7 [35] cost: 19698
query 8 [86] cost: 176853
query 9 [73] cost: 10001
query 10 [60, 61] cost: 64189
query 11 [76, 58, 39, 44] cost: 90025
query 12 [24] cost: 10001
query 13 [45] cost: 18675
query 14 [53] cost: 10001
query 15 [34] cost: 19698
query 16 [30] cost: 137116
query 17 [42] cost: 18224
query 18 [23] cost: 10000
query 19 [34] cost: 19698
query 20 [15] cost: 17053
query 21 [73] cost: 10001
query 22 [60] cost: 10716
query 23 [39] cost: 19955
query 24 [39] cost: 19955
query 25 [32] cost: 10001
query 26 [34] cost: 19698
query 27 [21] cost: 10001
query 28 [79] cost: 10001
query 29 [28] cost: 10001
query 30 [49] cost: 10001
query 31 [15] cost: 17053
query 32 [15] cost: 17053
query 33 [34] cost: 19698
query 34 [54] c

Build Time (s): 454.96396470069885
Total logical IOs: 2142667
Average logical IOs: 42853
query 0 [71, 31] cost: 20464
query 1 [47] cost: 166286
query 2 [35] cost: 19698
query 3 [34] cost: 19698
query 4 [34] cost: 19698
query 5 [65] cost: 10001
query 6 [15] cost: 17053
query 7 [35] cost: 19698
query 8 [86] cost: 176853
query 9 [73] cost: 10001
query 10 [60, 61] cost: 64189
query 11 [76, 58, 39, 44] cost: 90025
query 12 [24] cost: 10001
query 13 [45] cost: 18675
query 14 [53] cost: 10001
query 15 [34] cost: 19698
query 16 [30] cost: 137116
query 17 [42] cost: 18224
query 18 [23] cost: 10000
query 19 [34] cost: 19698
query 20 [15] cost: 17053
query 21 [73] cost: 10001
query 22 [60] cost: 10716
query 23 [39] cost: 19955
query 24 [39] cost: 19955
query 25 [32] cost: 10001
query 26 [34] cost: 19698
query 27 [21] cost: 10001
query 28 [79] cost: 10001
query 29 [28] cost: 10001
query 30 [49] cost: 10001
query 31 [15] cost: 17053
query 32 [15] cost: 17053
query 33 [34] cost: 19698
query 34 [54] 

In [68]:
# leaves = pa_rf_paw.partition_tree.get_leaves()
# MBRs = []
# for leaf in leaves:
#     if leaf.rtree_filters is not None:
#         MBRs += leaf.rtree_filters

In [69]:
# pa_rf_paw.partition_tree.visualize(queries = Tweet_queries)
# pa_rf_paw.partition_tree.visualize(queries = MBRs)

# Test Redundant Partition

In [109]:
# Using OSM Dataset + default distance threshold
# OSM_dataset = np.genfromtxt('C:/Users/Cloud/iCloudDrive/NORA_experiments/dataset/OSM_6M.csv', delimiter=',')
# OSM_boundary = [-180, -90, 180, 90]
# OSM_block_size = 10000 # 1.1M compared to 6M

# helper = DatasetAndQuerysetHelper(base_path = 'C:/Users/Cloud/iCloudDrive/NORA_experiments')
# OSM_queries = helper._DatasetAndQuerysetHelper__generate_random_query(50, [1,1], [[-180,180],[-90,90]], [36,18])
# paw_beam_tree = PartitionTree(2, OSM_boundary)
# paw_beam_tree.visualize(queries = OSM_queries)

# extended_OSM_queries = helper.extend_queryset(OSM_queries, QDistThreshold_percent = None, domains = [[-180,180],[-90,90]])

In [110]:
def real_result_size(dataset, query, num_dims):
    constraints = []
    for d in range(num_dims):
        constraint_L = dataset[:,d] >= query[d]
        constraint_U = dataset[:,d] <= query[num_dims + d]
        constraints.append(constraint_L)
        constraints.append(constraint_U)
    constraint = np.all(constraints, axis=0)
    result_size = np.count_nonzero(constraint)
    return result_size

def create_rp_format(selected_indexes, queries, dataset, data_threshold):
    redundant_partition = []
    for i in selected_indexes:
        redundant_partition.append((queries[i], max(real_result_size(dataset, queries[i], len(queries[0])//2), data_threshold)))
    return redundant_partition

In [122]:
pa_rp_kd = PartitionAlgorithm()
pa_rp_kd.InitializeWithKDT(len(OSM_boundary)//2, OSM_boundary, OSM_dataset, data_threshold = OSM_block_size) # 447

pa_rp_qd = PartitionAlgorithm()
pa_rp_qd.InitializeWithQDT(OSM_queries, len(OSM_boundary)//2, OSM_boundary, OSM_dataset, data_threshold = OSM_block_size)

pa_rp_paw = PartitionAlgorithm() # we don't need to optimize it?
pa_rp_paw.InitializeWithPAW(extended_OSM_queries, len(OSM_boundary)//2, OSM_boundary, OSM_dataset, OSM_block_size, 1000, strategy = 1)

Build Time (s): 291.22503423690796


In [124]:
results = helper.real_result_size(OSM_dataset, extended_OSM_queries)
print("LB-Cost:", max(sum(results)/len(results), OSM_block_size))

LB-Cost: 55268.76


In [123]:
for redundant_ratio in [0.01, 0.02, 0.05, 0.1, 0.2, 0.5]:
    print("= = = Result For Redundant Ratio", redundant_ratio, "= = =")
    print("!!!KDT:")
    rp_kd_gain, rp_kd_Q = pa_rp_kd.RedundantPartitions(int(redundant_ratio*len(OSM_dataset)), OSM_queries, OSM_dataset, OSM_block_size)
    print("KDT RPQ:", rp_kd_Q)
    kd_rps = create_rp_format(rp_kd_Q, OSM_queries, OSM_dataset, OSM_block_size)
    pa_rp_kd.partition_tree.evaluate_query_cost(extended_OSM_queries, print_result = True, redundant_partitions = kd_rps)
    print("")
    print("!!!QDT:")
    rp_qd_gain, rp_qd_Q = pa_rp_qd.RedundantPartitions(int(redundant_ratio*len(OSM_dataset)), OSM_queries, OSM_dataset, OSM_block_size)
    print("QDT RPQ:", rp_qd_Q)
    qd_rps = create_rp_format(rp_qd_Q, OSM_queries, OSM_dataset, OSM_block_size)
    pa_rp_qd.partition_tree.evaluate_query_cost(extended_OSM_queries, print_result = True, redundant_partitions = qd_rps)
    print("")
    print("!!!PAW:")
    rp_paw_gain, rp_paw_Q = pa_rp_paw.RedundantPartitions(int(redundant_ratio*len(OSM_dataset)), extended_OSM_queries, OSM_dataset, OSM_block_size)
    print("PAW RPQ:", rp_paw_Q)
    paw_rps = create_rp_format(rp_paw_Q, extended_OSM_queries, OSM_dataset, OSM_block_size)
    pa_rp_paw.partition_tree.evaluate_query_cost(extended_OSM_queries, print_result = True, redundant_partitions = paw_rps)
    print("")

= = = Result For Redundant Ratio 0.01 = = =
!!!KDT:
KDT RPQ: [6, 18, 35, 38, 42]
Total logical IOs: 5343615
Average logical IOs: 106872
query 0 [414, 417, 419, 459, 461, 473, 475] cost: 82027
query 1 [81, 82, 87, 88, 93, 94, 95, 96, 99, 100, 101, 102, 271, 272, 273, 274, 277, 279, 285, 286, 287, 288, 291, 292, 293, 333, 334, 339, 340] cost: 339834
query 2 [992, 1015, 1016] cost: 35156
query 3 [349] cost: 11718
query 4 [350, 355, 356, 379] cost: 46874
query 5 [143, 144, 146, 151, 152, 187, 188, 189, 190] cost: 105466
query 6 [17, 18, 23, 24, 47, 48, 53, 54, 143, 144, 149] cost: 128901
query 7 [1016, 1021] cost: 23438
query 8 [86, 88, 99, 100, 102, 109, 110, 111, 112, 115, 116, 117, 118, 123, 124, 125, 126, 129, 130, 131, 132, 207, 208, 213, 214, 219, 220, 221, 222, 225, 226, 227, 228, 235, 236, 237, 238, 249, 251, 277, 278, 279, 280, 292, 293, 294, 301, 302, 303, 304, 307, 309, 315, 316, 317, 318, 321] cost: 667954
query 9 [17] cost: 11718
query 10 [659, 660, 661, 662, 697, 698, 699, 70

QDT RPQ: [9, 12, 14, 18, 21, 27, 28, 30, 40, 41, 46]
Total logical IOs: 16485368
Average logical IOs: 329707
query 0 [90, 73, 74, 91, 92, 93, 77] cost: 545800
query 1 [67, 95, 96, 86] cost: 367726
query 2 [29, 30] cost: 39396
query 3 [94, 29] cost: 35646
query 4 [29] cost: 24538
query 5 [38, 83, 41] cost: 96756
query 6 [15] cost: 17053
query 7 [30] cost: 14858
query 8 [67, 96, 86, 87, 101, 102, 100, 98, 89, 90] cost: 1232734
query 9 [79, 32] cost: 66970
query 10 [57, 58, 81, 82] cost: 97642
query 11 [32, 55, 56, 34, 84, 62, 40] cost: 333117
query 12 [65, 66, 44, 50] cost: 296175
query 13 [83, 84, 62, 41, 65] cost: 161898
query 14 [79, 80] cost: 53533
query 15 [29] cost: 24538
query 16 [25, 77, 78, 28] cost: 2837840
query 17 [37, 38] cost: 42109
query 18 [63, 64, 66, 44, 25] cost: 2325451
query 19 [29] cost: 24538
query 20 [15] cost: 17053
query 21 [79, 32, 37] cost: 85194
query 22 [57] cost: 10716
query 23 [55, 34] cost: 107758
query 24 [34] cost: 19955
query 25 [91, 93, 94] cost: 7110

PAW RPQ: [0, 2, 3, 4, 5, 7, 10, 15, 17, 19, 21, 23, 24, 26, 27, 33, 34, 35, 45, 46, 48]
Total logical IOs: 3146137
Average logical IOs: 62922
query 0 [-26829] cost: 26829
query 1 [74, 90, 96, 92] cost: 247628
query 2 [-10000] cost: 10000
query 3 [-10000] cost: 10000
query 4 [-10000] cost: 10000
query 5 [-19870] cost: 19870
query 6 [38] cost: 10338
query 7 [-10000] cost: 10000
query 8 [96, 97, 95] cost: 536657
query 9 [82] cost: 10532
query 10 [-54014] cost: 54014
query 11 [84, 63, 89, 73] cost: 100622
query 12 [28] cost: 10001
query 13 [88, 89] cost: 31027
query 14 [56] cost: 10001
query 15 [-10000] cost: 10000
query 16 [15, 33] cost: 532550
query 17 [-10000] cost: 10000
query 18 [26] cost: 10001
query 19 [-10000] cost: 10000
query 20 [38] cost: 10338
query 21 [-10000] cost: 10000
query 22 [65] cost: 11370
query 23 [-10000] cost: 10000
query 24 [-10000] cost: 10000
query 25 [34] cost: 10001
query 26 [-10000] cost: 10000
query 27 [-14088] cost: 14088
query 28 [92] cost: 10331
query 29 [

KDT RPQ: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49]
Total logical IOs: 5343615
Average logical IOs: 106872
query 0 [414, 417, 419, 459, 461, 473, 475] cost: 82027
query 1 [81, 82, 87, 88, 93, 94, 95, 96, 99, 100, 101, 102, 271, 272, 273, 274, 277, 279, 285, 286, 287, 288, 291, 292, 293, 333, 334, 339, 340] cost: 339834
query 2 [992, 1015, 1016] cost: 35156
query 3 [349] cost: 11718
query 4 [350, 355, 356, 379] cost: 46874
query 5 [143, 144, 146, 151, 152, 187, 188, 189, 190] cost: 105466
query 6 [17, 18, 23, 24, 47, 48, 53, 54, 143, 144, 149] cost: 128901
query 7 [1016, 1021] cost: 23438
query 8 [86, 88, 99, 100, 102, 109, 110, 111, 112, 115, 116, 117, 118, 123, 124, 125, 126, 129, 130, 131, 132, 207, 208, 213, 214, 219, 220, 221, 222, 225, 226, 227, 228, 235, 236, 237, 238, 249, 251, 277, 278, 279, 280, 292, 293, 294, 301, 302, 303, 304, 307, 

QDT RPQ: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
Total logical IOs: 16485368
Average logical IOs: 329707
query 0 [90, 73, 74, 91, 92, 93, 77] cost: 545800
query 1 [67, 95, 96, 86] cost: 367726
query 2 [29, 30] cost: 39396
query 3 [94, 29] cost: 35646
query 4 [29] cost: 24538
query 5 [38, 83, 41] cost: 96756
query 6 [15] cost: 17053
query 7 [30] cost: 14858
query 8 [67, 96, 86, 87, 101, 102, 100, 98, 89, 90] cost: 1232734
query 9 [79, 32] cost: 66970
query 10 [57, 58, 81, 82] cost: 97642
query 11 [32, 55, 56, 34, 84, 62, 40] cost: 333117
query 12 [65, 66, 44, 50] cost: 296175
query 13 [83, 84, 62, 41, 65] cost: 161898
query 14 [79, 80] cost: 53533
query 15 [29] cost: 24538
query 16 [25, 77, 78, 28] cost: 2837840
query 17 [37, 38] cost: 42109
query 18 [63, 64, 66, 44, 25] cost: 2325451
query 19 [29] cost: 24538
query 20 [15] cost: 17053
qu

In [125]:
# using OSM when delta = 0
pa_rp_kd = PartitionAlgorithm()
pa_rp_kd.InitializeWithKDT(len(OSM_boundary)//2, OSM_boundary, OSM_dataset, data_threshold = OSM_block_size) # 447

pa_rp_qd = PartitionAlgorithm()
pa_rp_qd.InitializeWithQDT(OSM_queries, len(OSM_boundary)//2, OSM_boundary, OSM_dataset, data_threshold = OSM_block_size)

pa_rp_paw = PartitionAlgorithm() # we don't need to optimize it?
pa_rp_paw.InitializeWithPAW(OSM_queries, len(OSM_boundary)//2, OSM_boundary, OSM_dataset, OSM_block_size, 1000, strategy = 1)

results = helper.real_result_size(OSM_dataset, OSM_queries)
print("LB-Cost:", max(sum(results)/len(results), OSM_block_size))

for redundant_ratio in [0.01, 0.02, 0.05, 0.1, 0.2, 0.5]:
    print("= = = Result For Redundant Ratio", redundant_ratio, "= = =")
    print("!!!KDT:")
    rp_kd_gain, rp_kd_Q = pa_rp_kd.RedundantPartitions(int(redundant_ratio*len(OSM_dataset)), OSM_queries, OSM_dataset, OSM_block_size)
    print("KDT RPQ:", rp_kd_Q)
    kd_rps = create_rp_format(rp_kd_Q, OSM_queries, OSM_dataset, OSM_block_size)
    pa_rp_kd.partition_tree.evaluate_query_cost(OSM_queries, print_result = True, redundant_partitions = kd_rps)
    print("")
    print("!!!QDT:")
    rp_qd_gain, rp_qd_Q = pa_rp_qd.RedundantPartitions(int(redundant_ratio*len(OSM_dataset)), OSM_queries, OSM_dataset, OSM_block_size)
    print("QDT RPQ:", rp_qd_Q)
    qd_rps = create_rp_format(rp_qd_Q, OSM_queries, OSM_dataset, OSM_block_size)
    pa_rp_qd.partition_tree.evaluate_query_cost(OSM_queries, print_result = True, redundant_partitions = qd_rps)
    print("")
    print("!!!PAW:")
    rp_paw_gain, rp_paw_Q = pa_rp_paw.RedundantPartitions(int(redundant_ratio*len(OSM_dataset)), OSM_queries, OSM_dataset, OSM_block_size)
    print("PAW RPQ:", rp_paw_Q)
    paw_rps = create_rp_format(rp_paw_Q, OSM_queries, OSM_dataset, OSM_block_size)
    pa_rp_paw.partition_tree.evaluate_query_cost(OSM_queries, print_result = True, redundant_partitions = paw_rps)
    print("")

Build Time (s): 463.92549109458923
LB-Cost: 30285.28
= = = Result For Redundant Ratio 0.01 = = =
!!!KDT:
KDT RPQ: [6, 18, 35, 38, 42]
Total logical IOs: 3123850
Average logical IOs: 62477
query 0 [417, 419, 459, 461, 473, 475] cost: 70308
query 1 [81, 82, 87, 88, 93, 94, 95, 96, 99, 101, 102, 271, 272, 273, 274, 277, 279, 285, 286, 287, 288, 291, 292, 293] cost: 281241
query 2 [1016] cost: 11719
query 3 [349] cost: 11718
query 4 [355, 356, 379] cost: 35155
query 5 [151, 152, 189, 190] cost: 46874
query 6 [-10000] cost: 10000
query 7 [1016, 1021] cost: 23438
query 8 [123, 124, 125, 126, 129, 130, 131, 132, 221, 222, 227, 277, 278, 279, 280, 301, 302, 303, 304, 307, 309] cost: 246087
query 9 [17] cost: 11718
query 10 [659, 660, 661, 662, 697, 698, 699] cost: 82029
query 11 [528, 533, 534, 559, 560, 565, 566, 571, 572, 577, 578, 655, 656, 661, 667, 669] cost: 187494
query 12 [752] cost: 11719
query 13 [194, 196, 527] cost: 35156
query 14 [181, 182] cost: 23437
query 15 [475, 476] cost: 23

PAW RPQ: [2, 3, 4, 7, 15, 19, 23, 24, 26, 33, 45]
Total logical IOs: 2035475
Average logical IOs: 40709
query 0 [71, 31] cost: 20464
query 1 [47] cost: 166286
query 2 [-10000] cost: 10000
query 3 [-10000] cost: 10000
query 4 [-10000] cost: 10000
query 5 [65] cost: 10001
query 6 [15] cost: 17053
query 7 [-10000] cost: 10000
query 8 [86] cost: 176853
query 9 [73] cost: 10001
query 10 [60, 61] cost: 64189
query 11 [76, 58, 39, 44] cost: 90025
query 12 [24] cost: 10001
query 13 [45] cost: 18675
query 14 [53] cost: 10001
query 15 [-10000] cost: 10000
query 16 [30] cost: 137116
query 17 [42] cost: 18224
query 18 [23] cost: 10000
query 19 [-10000] cost: 10000
query 20 [15] cost: 17053
query 21 [73] cost: 10001
query 22 [60] cost: 10716
query 23 [-10000] cost: 10000
query 24 [-10000] cost: 10000
query 25 [32] cost: 10001
query 26 [-10000] cost: 10000
query 27 [21] cost: 10001
query 28 [79] cost: 10001
query 29 [28] cost: 10001
query 30 [49] cost: 10001
query 31 [15] cost: 17053
query 32 [15] c

QDT RPQ: [0, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49]
Total logical IOs: 1908768
Average logical IOs: 38175
query 0 [-20099] cost: 20099
query 1 [95] cost: 167268
query 2 [-10000] cost: 10000
query 3 [-10000] cost: 10000
query 4 [-10000] cost: 10000
query 5 [-10000] cost: 10000
query 6 [-10000] cost: 10000
query 7 [-10000] cost: 10000
query 8 [102] cost: 176853
query 9 [-10000] cost: 10000
query 10 [-45657] cost: 45657
query 11 [-64440] cost: 64440
query 12 [-10000] cost: 10000
query 13 [-18675] cost: 18675
query 14 [-10000] cost: 10000
query 15 [-10000] cost: 10000
query 16 [78] cost: 138013
query 17 [-10000] cost: 10000
query 18 [-10000] cost: 10000
query 19 [-10000] cost: 10000
query 20 [-10000] cost: 10000
query 21 [-10000] cost: 10000
query 22 [-10000] cost: 10000
query 23 [-10000] cost: 10000
query 24 [-10000] cost: 10000
query 25 [-10000] cost: 1000

KDT RPQ: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
Total logical IOs: 1903082
Average logical IOs: 38061
query 0 [-20099] cost: 20099
query 1 [-166286] cost: 166286
query 2 [-10000] cost: 10000
query 3 [-10000] cost: 10000
query 4 [-10000] cost: 10000
query 5 [-10000] cost: 10000
query 6 [-10000] cost: 10000
query 7 [-10000] cost: 10000
query 8 [-176853] cost: 176853
query 9 [-10000] cost: 10000
query 10 [-45657] cost: 45657
query 11 [-64440] cost: 64440
query 12 [-10000] cost: 10000
query 13 [-18675] cost: 18675
query 14 [-10000] cost: 10000
query 15 [-10000] cost: 10000
query 16 [-137116] cost: 137116
query 17 [-10000] cost: 10000
query 18 [-10000] cost: 10000
query 19 [-10000] cost: 10000
query 20 [-10000] cost: 10000
query 21 [-10000] cost: 10000
query 22 [-10000] cost: 10000
query 23 [-10000] cost: 10000
query 24 [-10000] cost: 10000


In [120]:
def check_inside(query, partition_boundary):
    num_dims = len(query)//2
    for i in range(num_dims):
        if query[i] >= partition_boundary[i] and query[num_dims + i] <= partition_boundary[num_dims + i]:
            pass
        else:
            return False
    return True

check_inside(extended_OSM_queries[0], paw_rps[0][0])

False

In [None]:
# # Using TPCH
# # First create a normal partition

# pa_rp_kd = PartitionAlgorithm()
# pa_rp_kd.InitializeWithKDT(len(boundary)//2, boundary, dataset, data_threshold = block_size) # 447

# pa_rp_qd = PartitionAlgorithm()
# pa_rp_qd.InitializeWithQDT(training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size)

# pa_rp_nora = PartitionAlgorithm() # we don't need to optimize it?
# pa_rp_nora.InitializeWithNORA(training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size, 
#                               using_1_by_1 = True, using_kd = True)

# results = helper.real_result_size(dataset, training_set)
# # results = helper.real_result_size(dataset, testing_set)
# print("LB-Cost:", max(sum(results)/len(results), block_size))

# for redundant_ratio in [0.01, 0.02, 0.05, 0.1, 0.2, 0.5]:
#     print("= = = Result For Redundant Ratio", redundant_ratio, "= = =")
#     print("!!!KDT:")
#     pa_rp_kd.RedundantPartitions(int(redundant_ratio*len(dataset)), training_set, dataset, block_size)
#     print("!!!QDT:")
#     pa_rp_qd.RedundantPartitions(int(redundant_ratio*len(dataset)), training_set, dataset, block_size)
#     print("!!!NORA:")
#     pa_rp_nora.RedundantPartitions(int(redundant_ratio*len(dataset)), training_set, dataset, block_size)
#     print("")

# Distance Threshold Estimation

In [437]:
# uniform workload
# training_set, testing_set = helper.generate_queryset_and_save(100, queryset_type=2) # 1-X (X is 5 at most by default)

for num_Q in [100, 200, 500]:
    
    training_set, testing_set = helper.generate_queryset_and_save(num_Q, queryset_type=2) # uniform
    extended_training_set = helper.extend_queryset(training_set)

    estimated_threshold = helper.queryset_distance(training_set[0:int(num_Q/4)], training_set[int(num_Q/4):])
    estimated_extended_training_set = helper.extend_queryset(training_set, estimated_threshold)

    pa_dte_r = PartitionAlgorithm()
    pa_dte_r.InitializeWithNORA(extended_training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size, 
                            using_1_by_1 = True, using_kd = True)
    pa_dte_r.partition_tree.visualize(queries = testing_set)
    cost_real = pa_dte_r.partition_tree.evaluate_query_cost(testing_set)

    pa_dte_e = PartitionAlgorithm()
    pa_dte_e.InitializeWithNORA(estimated_extended_training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size, 
                            using_1_by_1 = True, using_kd = True)
    pa_dte_e.partition_tree.visualize(queries = testing_set)
    cost_estimate = pa_dte_e.partition_tree.evaluate_query_cost(testing_set)
    
    results = helper.real_result_size(dataset, testing_set)
    cost_lb = max(sum(results)/len(results), block_size)
    
    print("= = = Num_Query = ", num_Q, "= = =")
    print("estimated threshold:", estimated_threshold)
    print("NORA T Cost:", cost_real)
    print("NORA T' Cost:", cost_estimate)
    print("LB-Cost:", cost_lb)

Build Time (s): 712.9992589950562


<IPython.core.display.Javascript object>

Build Time (s): 2419.4959304332733


<IPython.core.display.Javascript object>

= = = Num_Query =  100 = = =
estimated threshold: 0.5091454453293583
NORA T Cost: 14561
NORA T' Cost: 35505
LB-Cost: 10000
Build Time (s): 1344.4021365642548


<IPython.core.display.Javascript object>

Build Time (s): 2437.270729780197


<IPython.core.display.Javascript object>

= = = Num_Query =  200 = = =
estimated threshold: 0.3281210742570579
NORA T Cost: 16925
NORA T' Cost: 32975
LB-Cost: 10000
Build Time (s): 3223.2319672107697


<IPython.core.display.Javascript object>

Build Time (s): 5431.084000110626


<IPython.core.display.Javascript object>

= = = Num_Query =  500 = = =
estimated threshold: 0.30300217167937626
NORA T Cost: 17145
NORA T' Cost: 30473
LB-Cost: 10000


In [438]:
# skew workload

for num_Q in [100, 200, 500]:
    
    training_set, testing_set = helper.generate_queryset_and_save(num_Q, queryset_type=4) # mixture Gaussian
    extended_training_set = helper.extend_queryset(training_set)

    estimated_threshold = helper.queryset_distance(training_set[0:int(num_Q/4)], training_set[int(num_Q/4):])
    estimated_extended_training_set = helper.extend_queryset(training_set, estimated_threshold)

    pa_dte_r = PartitionAlgorithm()
    pa_dte_r.InitializeWithNORA(extended_training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size, 
                            using_1_by_1 = True, using_kd = True)
    pa_dte_r.partition_tree.visualize(queries = testing_set)
    cost_real = pa_dte_r.partition_tree.evaluate_query_cost(testing_set)

    pa_dte_e = PartitionAlgorithm()
    pa_dte_e.InitializeWithNORA(estimated_extended_training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size, 
                            using_1_by_1 = True, using_kd = True)
    pa_dte_e.partition_tree.visualize(queries = testing_set)
    cost_estimate = pa_dte_e.partition_tree.evaluate_query_cost(testing_set)
    
    results = helper.real_result_size(dataset, testing_set)
    cost_lb = max(sum(results)/len(results), block_size)
    
    print("= = = Num_Query = ", num_Q, "= = =")
    print("estimated threshold:", estimated_threshold)
    print("NORA T Cost:", cost_real)
    print("NORA T' Cost:", cost_estimate)
    print("LB-Cost:", cost_lb)

Build Time (s): 114.88499975204468


<IPython.core.display.Javascript object>

Build Time (s): 118.25800013542175


<IPython.core.display.Javascript object>

= = = Num_Query =  100 = = =
estimated threshold: 0.047369059235582545
NORA T Cost: 14246
NORA T' Cost: 10639
LB-Cost: 10000
Build Time (s): 139.5759916305542


<IPython.core.display.Javascript object>

Build Time (s): 116.41403436660767


<IPython.core.display.Javascript object>

= = = Num_Query =  200 = = =
estimated threshold: 0.04573109812172161
NORA T Cost: 10447
NORA T' Cost: 12347
LB-Cost: 10000
Build Time (s): 197.54996538162231


<IPython.core.display.Javascript object>

Build Time (s): 211.46996331214905


<IPython.core.display.Javascript object>

= = = Num_Query =  500 = = =
estimated threshold: 0.038551776917340685
NORA T Cost: 13317
NORA T' Cost: 12648
LB-Cost: 10000


In [439]:
for num_center in [10, 20, 50, 100]:
    
    helper.cluster_center_amount = num_center
    
    training_set, testing_set = helper.generate_queryset_and_save(500, queryset_type=4) # mixture Gaussian
    extended_training_set = helper.extend_queryset(training_set)

    estimated_threshold = helper.queryset_distance(training_set[0:int(num_Q/4)], training_set[int(num_Q/4):])
    estimated_extended_training_set = helper.extend_queryset(training_set, estimated_threshold)

    pa_dte_r = PartitionAlgorithm()
    pa_dte_r.InitializeWithNORA(extended_training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size, 
                            using_1_by_1 = True, using_kd = True)
    pa_dte_r.partition_tree.visualize(queries = testing_set)
    cost_real = pa_dte_r.partition_tree.evaluate_query_cost(testing_set)

    pa_dte_e = PartitionAlgorithm()
    pa_dte_e.InitializeWithNORA(estimated_extended_training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size, 
                            using_1_by_1 = True, using_kd = True)
    pa_dte_e.partition_tree.visualize(queries = testing_set)
    cost_estimate = pa_dte_e.partition_tree.evaluate_query_cost(testing_set)
    
    results = helper.real_result_size(dataset, testing_set)
    cost_lb = max(sum(results)/len(results), block_size)
    
    print("= = = Num_Centers = ", num_center, "= = =")
    print("estimated threshold:", estimated_threshold)
    print("NORA T Cost:", cost_real)
    print("NORA T' Cost:", cost_estimate)
    print("LB-Cost:", cost_lb)

Build Time (s): 275.7484619617462


<IPython.core.display.Javascript object>

Build Time (s): 199.19714426994324




<IPython.core.display.Javascript object>

= = = Num_Centers =  10 = = =
estimated threshold: 0.03472621437131893
NORA T Cost: 12275
NORA T' Cost: 11859
LB-Cost: 10000
Build Time (s): 367.724999666214


<IPython.core.display.Javascript object>

Build Time (s): 261.79703664779663


<IPython.core.display.Javascript object>

= = = Num_Centers =  20 = = =
estimated threshold: 0.04812516435394635
NORA T Cost: 12540
NORA T' Cost: 14288
LB-Cost: 10000
Build Time (s): 647.6460411548615


<IPython.core.display.Javascript object>

Build Time (s): 4566.934130430222


<IPython.core.display.Javascript object>

= = = Num_Centers =  50 = = =
estimated threshold: 0.2972071296846262
NORA T Cost: 12743
NORA T' Cost: 29311
LB-Cost: 10000
Build Time (s): 1171.766999721527


<IPython.core.display.Javascript object>

Build Time (s): 4717.417000293732


<IPython.core.display.Javascript object>

= = = Num_Centers =  100 = = =
estimated threshold: 0.2786367803889104
NORA T Cost: 14816
NORA T' Cost: 36120
LB-Cost: 10000


In [611]:
training_set = genfromtxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/dims/train_"+str(4)+"_"+str(0)+".csv", delimiter=',')
testing_set = genfromtxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/dims/test_"+str(4)+"_"+str(0)+".csv", delimiter=',')

In [615]:
pa1_assembled = PartitionTree()
pa1_assembled.load_tree('C:/Users/Cloud/iCloudDrive/NORA_experiments/partition_layout/prob2_kdtree')
pa2_assembled = PartitionTree()
pa2_assembled.load_tree('C:/Users/Cloud/iCloudDrive/NORA_experiments/partition_layout/prob2_qdtree')
pa3_assembled = PartitionTree()
pa3_assembled.load_tree('C:/Users/Cloud/iCloudDrive/NORA_experiments/partition_layout/prob2_nora')

# Experiments: Robustness to Different Query Settings

In [333]:
# test_cases = 10
test_cases = 1

### Problem 1

In [218]:
# # improvement search loop

# max_ratio = 1.4

# for i in range(200):

#     training_set, testing_set = helper.generate_queryset_and_save(100, queryset_type=2) # 1-X (X is 5 at most by default)
    
#     pa_qd = PartitionAlgorithm()
#     pa_qd.InitializeWithQDT(training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size)
#     # pa_qd.partition_tree.visualize(queries = training_set)
#     cost_qd = pa_qd.partition_tree.evaluate_query_cost(training_set) # Average logical IOs: 196487

#     pa_nora = PartitionAlgorithm()
#     pa_nora.InitializeWithNORA(training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size, using_1_by_1 = True)
#     # pa_nora.partition_tree.visualize(queries = training_set)
#     cost_nora = pa_nora.partition_tree.evaluate_query_cost(training_set) # Average logical IOs: 196487

#     if cost_qd / cost_nora > max_ratio:
#         max_ratio = cost_qd / cost_nora
#         # save the queryset
#         print("!!!!! = = = = = Current Improvement:", int(max_ratio*100),"X = = = = = !!!!!")
#         np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob1/improvement_train_"+str(int(max_ratio*100))+"_Q100.csv", training_set, delimiter=',')
#         np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob1/improvement_test_"+str(int(max_ratio*100))+"_Q100.csv", testing_set, delimiter=',')

In [531]:
# search dimensionality

for num_dims in range(2,8):
# for num_dims in range(4,5):
    used_dims = [i for i in range(1, num_dims+1)]
    helper = DatasetAndQuerysetHelper(base_path = 'C:/Users/Cloud/iCloudDrive/NORA_experiments', used_dimensions = used_dims) # EXAMPLE
    dataset, domains = helper.load_dataset(used_dims)
    boundary = [interval[0] for interval in domains]+[interval[1] for interval in domains]
    
    print("!!!!! = = = = = Current Dimension:", used_dims," = = = = = !!!!!")
    
    total_cost_qd = 0
    total_cost_kd = 0
    total_cost_nora = 0
    total_cost_paw = 0
    total_cost_real_result = 0
    total_minimum_cost = 0
    
    # query irrelevant, only calculated once
    pa_kd = PartitionAlgorithm()
    pa_kd.InitializeWithKDT(len(boundary)//2, boundary, dataset, data_threshold = block_size) # 447
    # pa_qd.partition_tree.visualize(queries = training_set)
    
    for case in range(test_cases):
    
        #training_set, testing_set = helper.generate_queryset_and_save(100, queryset_type=2) # 1-X (X is 5 at most by default)
        training_set = genfromtxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob1/dims/train_"+str(num_dims)+"_"+str(case)+".csv", delimiter=',')
        
        pa_qd = PartitionAlgorithm()
        pa_qd.InitializeWithQDT(training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size)
        #pa_qd.partition_tree.save_tree('C:/Users/Cloud/iCloudDrive/NORA_experiments/partition_layout/prob1_qdtree')
        # pa_qd.partition_tree.visualize(queries = training_set)
        cost_qd = pa_qd.partition_tree.evaluate_query_cost(training_set)
        total_cost_qd += cost_qd
        
        #pa_kd.partition_tree.save_tree('C:/Users/Cloud/iCloudDrive/NORA_experiments/partition_layout/prob1_kdtree')
#         cost_kd = pa_kd.partition_tree.evaluate_query_cost(training_set)
#         total_cost_kd += cost_kd

#         pa_nora = PartitionAlgorithm()
#         pa_nora.InitializeWithNORA(training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size, 
#                                    using_1_by_1 = True, using_kd = True)
#         #pa_nora.partition_tree.save_tree('C:/Users/Cloud/iCloudDrive/NORA_experiments/partition_layout/prob1_nora')
#         pa_nora.partition_tree.visualize(queries = training_set)
#         cost_nora = pa_nora.partition_tree.evaluate_query_cost(training_set)
#         total_cost_nora += cost_nora
        
        pa_paw = PartitionAlgorithm()
        pa_paw.InitializeWithPAW(training_set, len(boundary)//2, boundary, dataset, block_size, 1000, strategy = 1)
        cost_paw = pa_paw.partition_tree.evaluate_query_cost(training_set)
        total_cost_paw += cost_paw
        
        result_sizes = helper.real_result_size(dataset, training_set)
        cost_real = sum(result_sizes)/len(result_sizes)
        total_cost_real_result += cost_real
        
        min_cost = max(cost_real, block_size)
        total_minimum_cost += min_cost
        
        #np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob1/dims/train_"+str(num_dims)+"_"+str(case)+".csv", training_set, delimiter=',')
        #np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob1/dims/test_"+str(num_dims)+"_"+str(case)+".csv", testing_set, delimiter=',')
        
        #print("= = Cost Ratio:",cost_qd / cost_nora, " Qd-tree average cost:", cost_qd, " Nora average cost:", cost_nora, "= =")
    
#     print("= = = = Total Ratio(Qd-tree/NORA):",total_cost_qd/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(kd-tree/NORA):",total_cost_kd/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(MinCost/NORA):",total_minimum_cost/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(NORA/MinCost):",total_cost_nora/total_minimum_cost," = = = =")
    
    print("= = = = Total Ratio(Qd-tree/PAW):",total_cost_qd/total_cost_paw," = = = =")
    print("= = = = Total PAW Cost:",total_cost_paw/test_cases," = = = =")
    
#     print("= = = = Total Qd-tree Cost :",total_cost_qd/test_cases," = = = =")
#     print("= = = = Total kd-tree Cost :",total_cost_kd/test_cases," = = = =")
#     print("= = = = Total NORA Cost:",total_cost_nora/test_cases," = = = =")
#     print("= = = = Total Real Cost:",total_cost_real_result/test_cases," = = = =")
#     print("= = = = Total Minimum Cost:",total_minimum_cost/test_cases," = = = =")
    print("")
    
#         if cost_qd / cost_nora > max_ratio:
#             max_ratio = cost_qd / cost_nora
#             # save the queryset
#             print("!!!!! = = = = = Current Improvement:", int(max_ratio*100),"X = = = = = !!!!!")
#             np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob1/improvement_train_"+str(int(max_ratio*100))+"_Q100.csv", training_set, delimiter=',')
#             np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob1/improvement_test_"+str(int(max_ratio*100))+"_Q100.csv", testing_set, delimiter=',')


!!!!! = = = = = Current Dimension: [1, 2]  = = = = = !!!!!
Build Time (s): 69.54799771308899
= = = = Total Ratio(Qd-tree/PAW): 1.2835214446952596  = = = =
= = = = Total PAW Cost: 24365.0  = = = =

!!!!! = = = = = Current Dimension: [1, 2, 3]  = = = = = !!!!!
Build Time (s): 420.2231819629669
= = = = Total Ratio(Qd-tree/PAW): 1.2376186621532121  = = = =
= = = = Total PAW Cost: 13589.0  = = = =

!!!!! = = = = = Current Dimension: [1, 2, 3, 4]  = = = = = !!!!!
Build Time (s): 530.3059639930725
= = = = Total Ratio(Qd-tree/PAW): 1.229304347826087  = = = =
= = = = Total PAW Cost: 11500.0  = = = =

!!!!! = = = = = Current Dimension: [1, 2, 3, 4, 5]  = = = = = !!!!!
Build Time (s): 658.2548162937164
= = = = Total Ratio(Qd-tree/PAW): 1.0491169977924946  = = = =
= = = = Total PAW Cost: 14496.0  = = = =

!!!!! = = = = = Current Dimension: [1, 2, 3, 4, 5, 6]  = = = = = !!!!!
Build Time (s): 783.5945041179657
= = = = Total Ratio(Qd-tree/PAW): 1.0425301577482216  = = = =
= = = = Total PAW Cost: 1293

In [None]:
# # pa_nora.partition_tree.nid_node_dict[0].num_dims

# serialized_node_list = []
# for nid, node in pa_nora.partition_tree.nid_node_dict.items():
#     # follow the same order of attributes in partition class
#     print(nid)
#     attributes = [node.num_dims]
#     if isinstance(node.boundary, list):
#         attributes += node.boundary
#     else:
#         attributes += node.boundary.tolist()
#     attributes.append(node.nid) # node id = its ow id
#     attributes.append(node.pid) # parent id
#     attributes.append(1 if node.is_irregular_shape_parent else 0)
#     attributes.append(1 if node.is_irregular_shape else 0)
#     attributes.append(node.num_children) # number of children
#     #attributes += node.children_ids
#     attributes.append(1 if node.is_leaf else 0)
#     attributes.append(node.node_size)

#     serialized_node_list.append(attributes)

In [533]:
# search query range 

# used_dims = [0,1]
used_dims = [0,1,2,3]
helper = DatasetAndQuerysetHelper(base_path = 'C:/Users/Cloud/iCloudDrive/NORA_experiments', used_dimensions = used_dims) # EXAMPLE

dataset, domains = helper.load_dataset(used_dims)
boundary = [interval[0] for interval in domains]+[interval[1] for interval in domains]

pa_kd = PartitionAlgorithm()
pa_kd.InitializeWithKDT(len(boundary)//2, boundary, dataset, data_threshold = block_size)

for query_range in [0.01, 0.02, 0.05, 0.1, 0.2, 0.5]: # maximum range
    
    helper.maximum_range_percent = query_range    
     
    print("!!!!! = = = = = Current Query Range:", query_range," = = = = = !!!!!")
    
    total_cost_qd = 0
    total_cost_kd = 0
    total_cost_nora = 0
    total_cost_paw = 0
    total_cost_real_result = 0
    total_minimum_cost = 0
    
    for case in range(test_cases):
    
        #training_set, testing_set = helper.generate_queryset_and_save(100, queryset_type=2) # 1-X (X is 5 at most by default)
        training_set = genfromtxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob1/query_range/train_"+str(int(query_range*100))+"_"+str(case)+".csv", delimiter=',')
        
        pa_qd = PartitionAlgorithm()
        pa_qd.InitializeWithQDT(training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size)
        # pa_qd.partition_tree.visualize(queries = training_set)
        cost_qd = pa_qd.partition_tree.evaluate_query_cost(training_set)
        total_cost_qd += cost_qd
        
#         cost_kd = pa_kd.partition_tree.evaluate_query_cost(training_set)
#         total_cost_kd += cost_kd
        
#         pa_nora = PartitionAlgorithm()
#         pa_nora.InitializeWithNORA(training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size, 
#                                    using_1_by_1 = True, using_kd = True)
#         # pa_nora.partition_tree.visualize(queries = training_set)
#         cost_nora = pa_nora.partition_tree.evaluate_query_cost(training_set)
#         total_cost_nora += cost_nora
        
        pa_paw = PartitionAlgorithm()
        pa_paw.InitializeWithPAW(training_set, len(boundary)//2, boundary, dataset, block_size, 1000, strategy = 1)
        cost_paw = pa_paw.partition_tree.evaluate_query_cost(training_set)
        total_cost_paw += cost_paw
        
        result_sizes = helper.real_result_size(dataset, training_set)
        cost_real = sum(result_sizes)/len(result_sizes)
        total_cost_real_result += cost_real
        
        min_cost = max(cost_real, block_size)
        total_minimum_cost += min_cost
        
        #np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob1/query_range/train_"+str(int(query_range*100))+"_"+str(case)+".csv", training_set, delimiter=',')
        #np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob1/query_range/test_"+str(int(query_range*100))+"_"+str(case)+".csv", testing_set, delimiter=',')

        #print("= = Cost Ratio Qd/Nora:",cost_qd / cost_nora, " Qd-tree average cost:", cost_qd, " Nora average cost:", cost_nora, "= =")
    
#     print("= = = = Total Ratio(Qd-tree/NORA):",total_cost_qd/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(kd-tree/NORA):",total_cost_kd/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(MinCost/NORA):",total_minimum_cost/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(NORA/MinCost):",total_cost_nora/total_minimum_cost," = = = =")
    
    print("= = = = Total Ratio(Qd-tree/PAW):",total_cost_qd/total_cost_paw," = = = =")
    print("= = = = Total PAW Cost:",total_cost_paw/test_cases," = = = =")
    
#     print("= = = = Total Qd-tree Cost :",total_cost_qd/test_cases," = = = =")
#     print("= = = = Total kd-tree Cost :",total_cost_kd/test_cases," = = = =")
#     print("= = = = Total NORA Cost:",total_cost_nora/test_cases," = = = =")
#     print("= = = = Total Real Cost:",total_cost_real_result/test_cases," = = = =")
#     print("= = = = Total Minimum Cost:",total_minimum_cost/test_cases," = = = =")
    print("")

!!!!! = = = = = Current Query Range: 0.01  = = = = = !!!!!
Build Time (s): 612.4418275356293
= = = = Total Ratio(Qd-tree/PAW): 1.2016655100624567  = = = =
= = = = Total PAW Cost: 14410.0  = = = =

!!!!! = = = = = Current Query Range: 0.02  = = = = = !!!!!
Build Time (s): 619.6626553535461
= = = = Total Ratio(Qd-tree/PAW): 1.17403175971833  = = = =
= = = = Total PAW Cost: 13917.0  = = = =

!!!!! = = = = = Current Query Range: 0.05  = = = = = !!!!!
Build Time (s): 755.0542342662811
= = = = Total Ratio(Qd-tree/PAW): 1.102078556106221  = = = =
= = = = Total PAW Cost: 13519.0  = = = =

!!!!! = = = = = Current Query Range: 0.1  = = = = = !!!!!
Build Time (s): 669.7473125457764
= = = = Total Ratio(Qd-tree/PAW): 1.1850172877449097  = = = =
= = = = Total PAW Cost: 13015.0  = = = =

!!!!! = = = = = Current Query Range: 0.2  = = = = = !!!!!
Build Time (s): 620.176465511322
= = = = Total Ratio(Qd-tree/PAW): 1.109578098903493  = = = =
= = = = Total PAW Cost: 13771.0  = = = =

!!!!! = = = = = Curren

In [534]:
# search number of queries

# used_dims = [0,1]
used_dims = [0,1,2,3]
helper = DatasetAndQuerysetHelper(base_path = 'C:/Users/Cloud/iCloudDrive/NORA_experiments', used_dimensions = used_dims) # EXAMPLE

dataset, domains = helper.load_dataset(used_dims)
boundary = [interval[0] for interval in domains]+[interval[1] for interval in domains]

pa_kd = PartitionAlgorithm()
pa_kd.InitializeWithKDT(len(boundary)//2, boundary, dataset, data_threshold = block_size)

for num_Q in [20, 50, 100, 200, 500, 1000]: # maximum range
     
    print("!!!!! = = = = = Current Numer of Queries:", num_Q," = = = = = !!!!!")
    
    total_cost_qd = 0
    total_cost_kd = 0
    total_cost_nora = 0
    total_cost_paw = 0
    total_cost_real_result = 0
    total_minimum_cost = 0
    
    for case in range(test_cases):
    
        #training_set, testing_set = helper.generate_queryset_and_save(num_Q, queryset_type=2) # 1-X (X is 5 at most by default)
        training_set = genfromtxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob1/num_query/train_"+str(num_Q)+"_"+str(case)+".csv", delimiter=',')
        
        
        pa_qd = PartitionAlgorithm()
        pa_qd.InitializeWithQDT(training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size)
        # pa_qd.partition_tree.visualize(queries = training_set)
        cost_qd = pa_qd.partition_tree.evaluate_query_cost(training_set)
        total_cost_qd += cost_qd
        
#         cost_kd = pa_kd.partition_tree.evaluate_query_cost(training_set)
#         total_cost_kd += cost_kd

#         pa_nora = PartitionAlgorithm()
#         pa_nora.InitializeWithNORA(training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size, 
#                                    using_1_by_1 = True, using_kd = True)
#         # pa_nora.partition_tree.visualize(queries = training_set)
#         cost_nora = pa_nora.partition_tree.evaluate_query_cost(training_set)
#         total_cost_nora += cost_nora
        
        pa_paw = PartitionAlgorithm()
        pa_paw.InitializeWithPAW(training_set, len(boundary)//2, boundary, dataset, block_size, 1000, strategy = 1)
        cost_paw = pa_paw.partition_tree.evaluate_query_cost(training_set)
        total_cost_paw += cost_paw
        
        result_sizes = helper.real_result_size(dataset, training_set)
        cost_real = sum(result_sizes)/len(result_sizes)
        total_cost_real_result += cost_real
        
        min_cost = max(cost_real, block_size)
        total_minimum_cost += min_cost
        
        #np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob1/num_query/train_"+str(num_Q)+"_"+str(case)+".csv", training_set, delimiter=',')
        #np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob1/num_query/test_"+str(num_Q)+"_"+str(case)+".csv", testing_set, delimiter=',')

        # print("= = Cost Ratio Qd/Nora:",cost_qd / cost_nora, " Qd-tree average cost:", cost_qd, " Nora average cost:", cost_nora, "= =")
    
#     print("= = = = Total Ratio(Qd-tree/NORA):",total_cost_qd/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(kd-tree/NORA):",total_cost_kd/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(MinCost/NORA):",total_minimum_cost/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(NORA/MinCost):",total_cost_nora/total_minimum_cost," = = = =")
    
    print("= = = = Total Ratio(Qd-tree/PAW):",total_cost_qd/total_cost_paw," = = = =")
    print("= = = = Total PAW Cost:",total_cost_paw/test_cases," = = = =")
    
#     print("= = = = Total Qd-tree Cost :",total_cost_qd/test_cases," = = = =")
#     print("= = = = Total kd-tree Cost :",total_cost_kd/test_cases," = = = =")
#     print("= = = = Total NORA Cost:",total_cost_nora/test_cases," = = = =")
#     print("= = = = Total Real Cost:",total_cost_real_result/test_cases," = = = =")
#     print("= = = = Total Minimum Cost:",total_minimum_cost/test_cases," = = = =")
    print("")

!!!!! = = = = = Current Numer of Queries: 20  = = = = = !!!!!
Build Time (s): 117.69800329208374
= = = = Total Ratio(Qd-tree/PAW): 1.218660214728565  = = = =
= = = = Total PAW Cost: 13226.0  = = = =

!!!!! = = = = = Current Numer of Queries: 50  = = = = = !!!!!
Build Time (s): 360.0420091152191
= = = = Total Ratio(Qd-tree/PAW): 1.1106626047220107  = = = =
= = = = Total PAW Cost: 13130.0  = = = =

!!!!! = = = = = Current Numer of Queries: 100  = = = = = !!!!!
Build Time (s): 706.584234714508
= = = = Total Ratio(Qd-tree/PAW): 1.1040289097339688  = = = =
= = = = Total PAW Cost: 13006.0  = = = =

!!!!! = = = = = Current Numer of Queries: 200  = = = = = !!!!!
Build Time (s): 1602.0557816028595
= = = = Total Ratio(Qd-tree/PAW): 1.0593126793730223  = = = =
= = = = Total PAW Cost: 13589.0  = = = =

!!!!! = = = = = Current Numer of Queries: 500  = = = = = !!!!!
Build Time (s): 4288.647853851318
= = = = Total Ratio(Qd-tree/PAW): 1.0730604916104265  = = = =
= = = = Total PAW Cost: 16151.0  = = = 

In [222]:
# search distance threshold -> not necessary
# search random query -> not necessary

### Problem 2

In [336]:
# search dimensionality
block_size = 10000
# for num_dims in range(2,8):
for num_dims in range(4,5):
    used_dims = [i for i in range(1, num_dims+1)]
    helper = DatasetAndQuerysetHelper(base_path = 'C:/Users/Cloud/iCloudDrive/NORA_experiments', used_dimensions = used_dims) # EXAMPLE
    dataset, domains = helper.load_dataset(used_dims)
    boundary = [interval[0] for interval in domains]+[interval[1] for interval in domains]
    
    print("!!!!! = = = = = Current Dimension:", used_dims," = = = = = !!!!!")
    
    total_cost_qd = 0
    total_cost_kd = 0
    total_cost_nora = 0
    total_cost_nora_disable_bounding = 0
    total_cost_real_result = 0
    total_minimum_cost = 0
    
    # query irrelevant, only calculated once
    pa_kd = PartitionAlgorithm()
    pa_kd.InitializeWithKDT(len(boundary)//2, boundary, dataset, data_threshold = block_size) # 447
    # pa_qd.partition_tree.visualize(queries = training_set)
    
    
    for case in range(test_cases):
    
        #training_set, testing_set = helper.generate_queryset_and_save(100, queryset_type=2) # 1-X (X is 5 at most by default)
        #np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/dims/train_"+str(num_dims)+"_"+str(case)+".csv", training_set, delimiter=',')
        #np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/dims/test_"+str(num_dims)+"_"+str(case)+".csv", testing_set, delimiter=',')

        training_set = genfromtxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/dims/train_"+str(num_dims)+"_"+str(case)+".csv", delimiter=',')
        testing_set = genfromtxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/dims/test_"+str(num_dims)+"_"+str(case)+".csv", delimiter=',')
        
        extended_training_set = helper.extend_queryset(training_set)
        
        pa_qd = PartitionAlgorithm()
        pa_qd.InitializeWithQDT(training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size)
        pa_qd.partition_tree.save_tree('C:/Users/Cloud/iCloudDrive/NORA_experiments/partition_layout/prob2_qdtree')
        # pa_qd.partition_tree.visualize(queries = training_set)
        cost_qd = pa_qd.partition_tree.evaluate_query_cost(testing_set)
        total_cost_qd += cost_qd
        
        pa_kd.partition_tree.save_tree('C:/Users/Cloud/iCloudDrive/NORA_experiments/partition_layout/prob2_kdtree')
        cost_kd = pa_kd.partition_tree.evaluate_query_cost(testing_set)
        total_cost_kd += cost_kd

        pa_nora = PartitionAlgorithm()
        pa_nora.InitializeWithNORA(extended_training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size, 
                                   using_1_by_1 = True)
        pa_nora.partition_tree.save_tree('C:/Users/Cloud/iCloudDrive/NORA_experiments/partition_layout/prob2_nora')
        # pa_nora.partition_tree.visualize(queries = training_set)
        cost_nora = pa_nora.partition_tree.evaluate_query_cost(testing_set)
        total_cost_nora += cost_nora
        
        pa_nora_disable_bounding = PartitionAlgorithm()
        pa_nora_disable_bounding.InitializeWithQDT(extended_training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size)
        cost_nora_disable_bounding = pa_nora_disable_bounding.partition_tree.evaluate_query_cost(testing_set)
        total_cost_nora_disable_bounding += cost_nora_disable_bounding
            
        result_sizes = helper.real_result_size(dataset, testing_set)
        cost_real = sum(result_sizes)/len(result_sizes)
        total_cost_real_result += cost_real
        
        min_cost = max(cost_real, block_size)
        total_minimum_cost += min_cost
        
        # print("= = Cost Ratio:",cost_qd / cost_nora, " Qd-tree average cost:", cost_qd, " Nora average cost:", cost_nora, "= =")
    
    print("= = = = Total Ratio(Qd-tree/NORA):",total_cost_qd/total_cost_nora," = = = =")
    print("= = = = Total Ratio(kd-tree/NORA):",total_cost_kd/total_cost_nora," = = = =")
    print("= = = = Total Ratio(NORA_disable/NORA):",total_cost_nora_disable_bounding/total_cost_nora," = = = =")
    print("= = = = Total Ratio(Real/NORA):",total_cost_real_result/total_cost_nora," = = = =")
    print("= = = = Total Ratio(MinCost/NORA):",total_minimum_cost/total_cost_nora," = = = =")
    print("= = = = Total Ratio(NORA/MinCost):",total_cost_nora/total_minimum_cost," = = = =")
    
    print("= = = = Total Qd-tree Cost :",total_cost_qd/test_cases," = = = =")
    print("= = = = Total kd-tree Cost :",total_cost_kd/test_cases," = = = =")
    print("= = = = Total NORA Cost:",total_cost_nora/test_cases," = = = =")
    print("= = = = Total NORA disable bounding Cost:",total_cost_nora_disable_bounding/test_cases," = = = =")
    print("= = = = Total Real Cost:",total_cost_real_result/test_cases," = = = =")
    print("= = = = Total Minimum Cost:",total_minimum_cost/test_cases," = = = =")
    print("")
    
#         if cost_qd / cost_nora > max_ratio:
#             max_ratio = cost_qd / cost_nora
#             # save the queryset
#             print("!!!!! = = = = = Current Improvement:", int(max_ratio*100),"X = = = = = !!!!!")
#             np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob1/improvement_train_"+str(int(max_ratio*100))+"_Q100.csv", training_set, delimiter=',')
#             np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob1/improvement_test_"+str(int(max_ratio*100))+"_Q100.csv", testing_set, delimiter=',')


!!!!! = = = = = Current Dimension: [1, 2, 3, 4]  = = = = = !!!!!
Build Time (s): 468.9440016746521
= = = = Total Ratio(Qd-tree/NORA): 22.233072713057076  = = = =
= = = = Total Ratio(kd-tree/NORA): 8.801954652071931  = = = =
= = = = Total Ratio(NORA_disable/NORA): 1.2687255668491009  = = = =
= = = = Total Ratio(Real/NORA): 0.005624679841470977  = = = =
= = = = Total Ratio(MinCost/NORA): 0.7818608287724785  = = = =
= = = = Total Ratio(NORA/MinCost): 1.279  = = = =
= = = = Total Qd-tree Cost : 284361.0  = = = =
= = = = Total kd-tree Cost : 112577.0  = = = =
= = = = Total NORA Cost: 12790.0  = = = =
= = = = Total NORA disable bounding Cost: 16227.0  = = = =
= = = = Total Real Cost: 71.9396551724138  = = = =
= = = = Total Minimum Cost: 10000.0  = = = =



In [None]:
# (Option: using OSM)
# Using OSM Dataset + default distance threshold

OSM_dataset = np.genfromtxt('C:/Users/Cloud/iCloudDrive/NORA_experiments/dataset/OSM_6M.csv', delimiter=',')
OSM_boundary = [-180, -90, 180, 90]
OSM_block_size = 10000 # 1.1M compared to 6M

helper = DatasetAndQuerysetHelper(base_path = 'C:/Users/Cloud/iCloudDrive/NORA_experiments')
OSM_queries = helper._DatasetAndQuerysetHelper__generate_random_query(50, [1,1], [[-180,180],[-90,90]], [36,18])
paw_beam_tree = PartitionTree(2, OSM_boundary)
paw_beam_tree.visualize(queries = OSM_queries)

extended_OSM_queries = helper.extend_queryset(OSM_queries, QDistThreshold_percent = None, domains = [[-180,180],[-90,90]])
paw_beam_tree.visualize(queries = extended_OSM_queries)

In [352]:
# search query range 
block_size = 10000
# TPCH
# used_dims = [0,1,2,3]
# helper = DatasetAndQuerysetHelper(base_path = 'C:/Users/Cloud/iCloudDrive/NORA_experiments', used_dimensions = used_dims) # EXAMPLE
# dataset, domains = helper.load_dataset(used_dims)
# boundary = [interval[0] for interval in domains]+[interval[1] for interval in domains]

# OSM
used_dims = [0,1]
helper = DatasetAndQuerysetHelper(base_path = 'C:/Users/Cloud/iCloudDrive/NORA_experiments', used_dimensions = [0, 1])
dataset = np.genfromtxt('C:/Users/Cloud/iCloudDrive/NORA_experiments/dataset/OSM_6M.csv', delimiter=',')
boundary = [-180, -90, 180, 90]

pa_kd = PartitionAlgorithm()
pa_kd.InitializeWithKDT(len(boundary)//2, boundary, dataset, data_threshold = block_size)

# for OSM, using the same base training set!
helper.maximum_range_percent = 0.01
base_set = helper._DatasetAndQuerysetHelper__generate_random_query(50, [1,1], [[-180,180],[-90,90]], [36,18])
for query_range in [0.01, 0.02, 0.05, 0.1, 0.2, 0.5]: # maximum range
    
    # for OSM, comment this
    #helper.maximum_range_percent = query_range    
     
    print("!!!!! = = = = = Current Query Range:", query_range," = = = = = !!!!!")
    
    total_cost_qd = 0
    total_cost_kd = 0
    total_cost_nora = 0
    total_cost_nora_disable_bounding = 0
    total_cost_paw = 0
    total_cost_real_result = 0
    total_minimum_cost = 0
    
    for case in range(test_cases):
        
        # OSM
        training_set = helper.extend_queryset(base_set, QDistThreshold_percent= query_range-0.01,domains = [[-180.0,180.0],[-90.0,90.0]])
        pa_kd.partition_tree.visualize(queries = training_set)
        testing_set = helper.extend_queryset(training_set, domains = [[-180.0,180.0],[-90.0,90.0]])
        extended_training_set = helper.extend_queryset(training_set, domains = [[-180.0,180.0],[-90.0,90.0]])
        
#         training_set, testing_set = helper.generate_queryset_and_save(100, queryset_type=2) # 1-X (X is 5 at most by default)
#         np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2_2dims/query_range/train_"+str(int(query_range*100))+"_"+str(case)+".csv", training_set, delimiter=',')
#         np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2_2dims/query_range/test_"+str(int(query_range*100))+"_"+str(case)+".csv", testing_set, delimiter=',')

#         np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/query_range/train_"+str(int(query_range*100))+"_"+str(case)+".csv", training_set, delimiter=',')
#         np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/query_range/test_"+str(int(query_range*100))+"_"+str(case)+".csv", testing_set, delimiter=',')
        
#         training_set = genfromtxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/query_range/train_"+str(int(query_range*100))+"_"+str(case)+".csv", delimiter=',')
#         testing_set = genfromtxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/query_range/test_"+str(int(query_range*100))+"_"+str(case)+".csv", delimiter=',')
        
#         extended_training_set = helper.extend_queryset(training_set)
        
        pa_qd = PartitionAlgorithm()
        pa_qd.InitializeWithQDT(training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size)
        # pa_qd.partition_tree.visualize(queries = training_set)
        cost_qd = pa_qd.partition_tree.evaluate_query_cost(testing_set)
        total_cost_qd += cost_qd
        
        cost_kd = pa_kd.partition_tree.evaluate_query_cost(testing_set)
        total_cost_kd += cost_kd
        
#         pa_nora = PartitionAlgorithm()
#         pa_nora.InitializeWithNORA(extended_training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size, 
#                                    using_1_by_1 = True)
#         # pa_nora.partition_tree.visualize(queries = training_set)
#         cost_nora = pa_nora.partition_tree.evaluate_query_cost(testing_set)
#         total_cost_nora += cost_nora
        
#         pa_nora_disable_bounding = PartitionAlgorithm()
#         pa_nora_disable_bounding.InitializeWithQDT(extended_training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size)
#         cost_nora_disable_bounding = pa_nora_disable_bounding.partition_tree.evaluate_query_cost(testing_set)
#         total_cost_nora_disable_bounding += cost_nora_disable_bounding

        pa_paw = PartitionAlgorithm()
        pa_paw.InitializeWithPAW(extended_training_set, len(boundary)//2, boundary, dataset, block_size, 1000, strategy = 1)
        cost_paw = pa_paw.partition_tree.evaluate_query_cost(testing_set)
        total_cost_paw += cost_paw
        
        result_sizes = helper.real_result_size(dataset, testing_set)
        cost_real = sum(result_sizes)/len(result_sizes)
        total_cost_real_result += cost_real
        
        min_cost = max(cost_real, block_size)
        total_minimum_cost += min_cost

        #print("= = Cost Ratio Qd/Nora:",cost_qd / cost_nora, " Qd-tree average cost:", cost_qd, " Nora average cost:", cost_nora, "= =")
    
#     print("= = = = Total Ratio(Qd-tree/NORA):",total_cost_qd/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(kd-tree/NORA):",total_cost_kd/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(NORA_disable/NORA):",total_cost_nora_disable_bounding/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(Real/NORA):",total_cost_real_result/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(MinCost/NORA):",total_minimum_cost/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(NORA/MinCost):",total_cost_nora/total_minimum_cost," = = = =")
    
    print("= = = = Total Ratio(Qd-tree/PAW):",total_cost_qd/total_cost_paw," = = = =")
    print("= = = = Total PAW Cost:",total_cost_paw/test_cases," = = = =")
    
    print("= = = = Total Qd-tree Cost :",total_cost_qd/test_cases," = = = =")
    print("= = = = Total kd-tree Cost :",total_cost_kd/test_cases," = = = =")
#     print("= = = = Total NORA Cost:",total_cost_nora/test_cases," = = = =")
#     print("= = = = Total NORA disable bounding Cost:",total_cost_nora_disable_bounding/test_cases," = = = =")
#     print("= = = = Total Real Cost:",total_cost_real_result/test_cases," = = = =")
    print("= = = = Total Minimum Cost:",total_minimum_cost/test_cases," = = = =")
    print("")
    

!!!!! = = = = = Current Query Range: 0.01  = = = = = !!!!!


<IPython.core.display.Javascript object>

Build Time (s): 50.15003275871277
= = = = Total Ratio(Qd-tree/PAW): 11.847267696564446  = = = =
= = = = Total PAW Cost: 21685.0  = = = =
= = = = Total Qd-tree Cost : 256908.0  = = = =
= = = = Total kd-tree Cost : 51561.0  = = = =
= = = = Total Minimum Cost: 10000.0  = = = =

!!!!! = = = = = Current Query Range: 0.02  = = = = = !!!!!


<IPython.core.display.Javascript object>

Build Time (s): 14.296000957489014
= = = = Total Ratio(Qd-tree/PAW): 13.202391967376032  = = = =
= = = = Total PAW Cost: 29181.0  = = = =
= = = = Total Qd-tree Cost : 385259.0  = = = =
= = = = Total kd-tree Cost : 67029.0  = = = =
= = = = Total Minimum Cost: 15900.72  = = = =

!!!!! = = = = = Current Query Range: 0.05  = = = = = !!!!!


<IPython.core.display.Javascript object>

Build Time (s): 12.823008060455322
= = = = Total Ratio(Qd-tree/PAW): 4.190587233727537  = = = =
= = = = Total PAW Cost: 108066.0  = = = =
= = = = Total Qd-tree Cost : 452860.0  = = = =
= = = = Total kd-tree Cost : 163120.0  = = = =
= = = = Total Minimum Cost: 86881.52  = = = =

!!!!! = = = = = Current Query Range: 0.1  = = = = = !!!!!


<IPython.core.display.Javascript object>

Build Time (s): 13.316996812820435
= = = = Total Ratio(Qd-tree/PAW): 1.70040741378321  = = = =
= = = = Total PAW Cost: 365967.0  = = = =
= = = = Total Qd-tree Cost : 622293.0  = = = =
= = = = Total kd-tree Cost : 449051.0  = = = =
= = = = Total Minimum Cost: 336136.4  = = = =

!!!!! = = = = = Current Query Range: 0.2  = = = = = !!!!!


<IPython.core.display.Javascript object>

Build Time (s): 17.880999326705933
= = = = Total Ratio(Qd-tree/PAW): 1.1205647874718039  = = = =
= = = = Total PAW Cost: 1350809.0  = = = =
= = = = Total Qd-tree Cost : 1513669.0  = = = =
= = = = Total kd-tree Cost : 1460590.0  = = = =
= = = = Total Minimum Cost: 1312545.76  = = = =

!!!!! = = = = = Current Query Range: 0.5  = = = = = !!!!!




<IPython.core.display.Javascript object>

Build Time (s): 32.579994201660156
= = = = Total Ratio(Qd-tree/PAW): 1.0340751823452403  = = = =
= = = = Total PAW Cost: 4491480.0  = = = =
= = = = Total Qd-tree Cost : 4644528.0  = = = =
= = = = Total kd-tree Cost : 4573254.0  = = = =
= = = = Total Minimum Cost: 4467698.88  = = = =



In [353]:
# search distance threshold
block_size = 10000
# TPCH
# used_dims = [0,1,2,3]
# helper = DatasetAndQuerysetHelper(base_path = 'C:/Users/Cloud/iCloudDrive/NORA_experiments', used_dimensions = used_dims) # EXAMPLE
# dataset, domains = helper.load_dataset(used_dims)
# boundary = [interval[0] for interval in domains]+[interval[1] for interval in domains]

# OSM
used_dims = [0,1]
helper = DatasetAndQuerysetHelper(base_path = 'C:/Users/Cloud/iCloudDrive/NORA_experiments', used_dimensions = [0, 1])
dataset = np.genfromtxt('C:/Users/Cloud/iCloudDrive/NORA_experiments/dataset/OSM_6M.csv', delimiter=',')
boundary = [-180, -90, 180, 90]

pa_kd = PartitionAlgorithm()
pa_kd.InitializeWithKDT(len(boundary)//2, boundary, dataset, data_threshold = block_size)


# for OSM, using the same base training set!
training_set = helper._DatasetAndQuerysetHelper__generate_random_query(50, [1,1], [[-180,180],[-90,90]], [36,18])
for dist_T in [0.001, 0.005, 0.01, 0.05, 0.1]: # maximum range
    
    helper.QDistThreshold_percent = dist_T
     
    print("!!!!! = = = = = Current Distance Threshold:", dist_T," = = = = = !!!!!")
    
    total_cost_qd = 0
    total_cost_kd = 0
    total_cost_nora = 0
    total_cost_nora_disable_bounding = 0
    total_cost_paw = 0
    total_cost_real_result = 0
    total_minimum_cost = 0
    
    for case in range(test_cases):
        
        # OSM
        #training_set = helper._DatasetAndQuerysetHelper__generate_random_query(50, [1,1], [[-180,180],[-90,90]], [36,18])
        testing_set = helper.extend_queryset(training_set, domains = [[-180.0,180.0],[-90.0,90.0]])
        extended_training_set = helper.extend_queryset(training_set, domains = [[-180.0,180.0],[-90.0,90.0]])
        pa_kd.partition_tree.visualize(queries = testing_set)
    
#         training_set, testing_set = helper.generate_queryset_and_save(100, queryset_type=2) # 1-X (X is 5 at most by default)
#         np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2_2dims/dist_threshold/train_"+str(int(dist_T*1000))+"_"+str(case)+".csv", training_set, delimiter=',')
#         np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2_2dims/dist_threshold/test_"+str(int(dist_T*1000))+"_"+str(case)+".csv", testing_set, delimiter=',')
        
#         np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/dist_threshold/train_"+str(int(dist_T*1000))+"_"+str(case)+".csv", training_set, delimiter=',')
#         np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/dist_threshold/test_"+str(int(dist_T*1000))+"_"+str(case)+".csv", testing_set, delimiter=',')
        
#         training_set = genfromtxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/dist_threshold/train_"+str(int(dist_T*1000))+"_"+str(case)+".csv", delimiter=',')
#         testing_set = genfromtxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/dist_threshold/test_"+str(int(dist_T*1000))+"_"+str(case)+".csv", delimiter=',')
        
#         extended_training_set = helper.extend_queryset(training_set)
        
        pa_qd = PartitionAlgorithm()
        pa_qd.InitializeWithQDT(training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size)
        # pa_qd.partition_tree.visualize(queries = training_set)
        cost_qd = pa_qd.partition_tree.evaluate_query_cost(testing_set)
        total_cost_qd += cost_qd
        
        cost_kd = pa_kd.partition_tree.evaluate_query_cost(testing_set)
        total_cost_kd += cost_kd
        
#         pa_nora = PartitionAlgorithm()
#         pa_nora.InitializeWithNORA(extended_training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size, 
#                                    using_1_by_1 = True)
#         # pa_nora.partition_tree.visualize(queries = training_set)
#         cost_nora = pa_nora.partition_tree.evaluate_query_cost(testing_set)
#         total_cost_nora += cost_nora
        
#         pa_nora_disable_bounding = PartitionAlgorithm()
#         pa_nora_disable_bounding.InitializeWithQDT(extended_training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size)
#         cost_nora_disable_bounding = pa_nora_disable_bounding.partition_tree.evaluate_query_cost(testing_set)
#         total_cost_nora_disable_bounding += cost_nora_disable_bounding
        
        pa_paw = PartitionAlgorithm()
        pa_paw.InitializeWithPAW(extended_training_set, len(boundary)//2, boundary, dataset, block_size, 1000, strategy = 1)
        cost_paw = pa_paw.partition_tree.evaluate_query_cost(testing_set)
        total_cost_paw += cost_paw
        
        result_sizes = helper.real_result_size(dataset, testing_set)
        cost_real = sum(result_sizes)/len(result_sizes)
        total_cost_real_result += cost_real
        
        min_cost = max(cost_real, block_size)
        total_minimum_cost += min_cost
        
        # print("= = Cost Ratio Qd/Nora:",cost_qd / cost_nora, " Qd-tree average cost:", cost_qd, " Nora average cost:", cost_nora, "= =")
    
#     print("= = = = Total Ratio(Qd-tree/NORA):",total_cost_qd/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(kd-tree/NORA):",total_cost_kd/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(NORA_disable/NORA):",total_cost_nora_disable_bounding/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(Real/NORA):",total_cost_real_result/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(MinCost/NORA):",total_minimum_cost/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(NORA/MinCost):",total_cost_nora/total_minimum_cost," = = = =")

    print("= = = = Total Ratio(Qd-tree/PAW):",total_cost_qd/total_cost_paw," = = = =")
    print("= = = = Total PAW Cost:",total_cost_paw/test_cases," = = = =")
    
    print("= = = = Total Qd-tree Cost :",total_cost_qd/test_cases," = = = =")
    print("= = = = Total kd-tree Cost :",total_cost_kd/test_cases," = = = =")
#     print("= = = = Total NORA Cost:",total_cost_nora/test_cases," = = = =")
#     print("= = = = Total NORA disable bounding Cost:",total_cost_nora_disable_bounding/test_cases," = = = =")
#     print("= = = = Total Real Cost:",total_cost_real_result/test_cases," = = = =")
    print("= = = = Total Minimum Cost:",total_minimum_cost/test_cases," = = = =")
    print("")

!!!!! = = = = = Current Distance Threshold: 0.001  = = = = = !!!!!




<IPython.core.display.Javascript object>

Build Time (s): 341.25179862976074
= = = = Total Ratio(Qd-tree/PAW): 4.359499003272276  = = = =
= = = = Total PAW Cost: 53174.0  = = = =
= = = = Total Qd-tree Cost : 231812.0  = = = =
= = = = Total kd-tree Cost : 88591.0  = = = =
= = = = Total Minimum Cost: 40371.58  = = = =

!!!!! = = = = = Current Distance Threshold: 0.005  = = = = = !!!!!


<IPython.core.display.Javascript object>

Build Time (s): 276.78489208221436
= = = = Total Ratio(Qd-tree/PAW): 3.7551039564462383  = = = =
= = = = Total PAW Cost: 63921.0  = = = =
= = = = Total Qd-tree Cost : 240030.0  = = = =
= = = = Total kd-tree Cost : 102888.0  = = = =
= = = = Total Minimum Cost: 50353.12  = = = =

!!!!! = = = = = Current Distance Threshold: 0.01  = = = = = !!!!!


<IPython.core.display.Javascript object>

Build Time (s): 208.95043301582336
= = = = Total Ratio(Qd-tree/PAW): 3.9349592402270708  = = = =
= = = = Total PAW Cost: 74338.0  = = = =
= = = = Total Qd-tree Cost : 292517.0  = = = =
= = = = Total kd-tree Cost : 119059.0  = = = =
= = = = Total Minimum Cost: 61123.76  = = = =

!!!!! = = = = = Current Distance Threshold: 0.05  = = = = = !!!!!


<IPython.core.display.Javascript object>

Build Time (s): 12.714963436126709
= = = = Total Ratio(Qd-tree/PAW): 2.4978714224075937  = = = =
= = = = Total PAW Cost: 208590.0  = = = =
= = = = Total Qd-tree Cost : 521031.0  = = = =
= = = = Total kd-tree Cost : 276086.0  = = = =
= = = = Total Minimum Cost: 182392.84  = = = =

!!!!! = = = = = Current Distance Threshold: 0.1  = = = = = !!!!!


<IPython.core.display.Javascript object>

Build Time (s): 15.174826860427856
= = = = Total Ratio(Qd-tree/PAW): 1.7058704753406166  = = = =
= = = = Total PAW Cost: 614107.0  = = = =
= = = = Total Qd-tree Cost : 1047587.0  = = = =
= = = = Total kd-tree Cost : 707795.0  = = = =
= = = = Total Minimum Cost: 581302.28  = = = =



In [357]:
# search number of queries
block_size = 10000
# TPCH
# used_dims = [0,1,2,3]
# helper = DatasetAndQuerysetHelper(base_path = 'C:/Users/Cloud/iCloudDrive/NORA_experiments', used_dimensions = used_dims) # EXAMPLE
# dataset, domains = helper.load_dataset(used_dims)
# boundary = [interval[0] for interval in domains]+[interval[1] for interval in domains]

# OSM
used_dims = [0,1]
helper = DatasetAndQuerysetHelper(base_path = 'C:/Users/Cloud/iCloudDrive/NORA_experiments', used_dimensions = [0, 1])
dataset = np.genfromtxt('C:/Users/Cloud/iCloudDrive/NORA_experiments/dataset/OSM_6M.csv', delimiter=',')
boundary = [-180, -90, 180, 90]

pa_kd = PartitionAlgorithm()
pa_kd.InitializeWithKDT(len(boundary)//2, boundary, dataset, data_threshold = block_size)
# [20, 50, 100, 200, 500, 1000]
for num_Q in [500]: # maximum range
     
    print("!!!!! = = = = = Current Numer of Queries:", num_Q," = = = = = !!!!!")
    
    total_cost_qd = 0
    total_cost_kd = 0
    total_cost_nora = 0
    total_cost_nora_disable_bounding = 0
    total_cost_paw = 0
    total_cost_real_result = 0
    total_minimum_cost = 0
    
    for case in range(test_cases):
        
        training_set = helper._DatasetAndQuerysetHelper__generate_random_query(50, [1,1], [[-180,180],[-90,90]], [36,18])
        testing_set = helper.extend_queryset(training_set, domains = [[-180.0,180.0],[-90.0,90.0]])
        extended_training_set = helper.extend_queryset(training_set, domains = [[-180.0,180.0],[-90.0,90.0]])
    
#         training_set, testing_set = helper.generate_queryset_and_save(num_Q, queryset_type=2) # 1-X (X is 5 at most by default)
#         np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2_2dims/num_query/train_"+str(num_Q)+"_"+str(case)+".csv", training_set, delimiter=',')
#         np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2_2dims/num_query/test_"+str(num_Q)+"_"+str(case)+".csv", testing_set, delimiter=',')
        
#         np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/num_query/train_"+str(num_Q)+"_"+str(case)+".csv", training_set, delimiter=',')
#         np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/num_query/test_"+str(num_Q)+"_"+str(case)+".csv", testing_set, delimiter=',')

        #training_set = genfromtxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/num_query/train_"+str(num_Q)+"_"+str(case)+".csv", delimiter=',')
        #testing_set = genfromtxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/num_query/test_"+str(num_Q)+"_"+str(case)+".csv", delimiter=',')
        
        #extended_training_set = helper.extend_queryset(training_set)
        
        pa_qd = PartitionAlgorithm()
        pa_qd.InitializeWithQDT(training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size)
        # pa_qd.partition_tree.visualize(queries = training_set)
        cost_qd = pa_qd.partition_tree.evaluate_query_cost(testing_set)
        total_cost_qd += cost_qd
        
        cost_kd = pa_kd.partition_tree.evaluate_query_cost(testing_set)
        total_cost_kd += cost_kd

#         pa_nora = PartitionAlgorithm()
#         pa_nora.InitializeWithNORA(extended_training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size, using_1_by_1 = True)
#         # pa_nora.partition_tree.visualize(queries = training_set)
#         cost_nora = pa_nora.partition_tree.evaluate_query_cost(testing_set)
#         total_cost_nora += cost_nora
        
#         pa_nora_disable_bounding = PartitionAlgorithm()
#         pa_nora_disable_bounding.InitializeWithQDT(extended_training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size)
#         cost_nora_disable_bounding = pa_nora_disable_bounding.partition_tree.evaluate_query_cost(testing_set)
#         total_cost_nora_disable_bounding += cost_nora_disable_bounding

        pa_paw = PartitionAlgorithm()
        pa_paw.InitializeWithPAW(extended_training_set, len(boundary)//2, boundary, dataset, block_size, 1000, strategy = 1)
        cost_paw = pa_paw.partition_tree.evaluate_query_cost(testing_set)
        total_cost_paw += cost_paw
        
        result_sizes = helper.real_result_size(dataset, testing_set)
        cost_real = sum(result_sizes)/len(result_sizes)
        total_cost_real_result += cost_real
        
        min_cost = max(cost_real, block_size)
        total_minimum_cost += min_cost
        
        # print("= = Cost Ratio Qd/Nora:",cost_qd / cost_nora, " Qd-tree average cost:", cost_qd, " Nora average cost:", cost_nora, "= =")
    
#     print("= = = = Total Ratio(Qd-tree/NORA):",total_cost_qd/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(kd-tree/NORA):",total_cost_kd/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(NORA_disable/NORA):",total_cost_nora_disable_bounding/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(Real/NORA):",total_cost_real_result/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(MinCost/NORA):",total_minimum_cost/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(NORA/MinCost):",total_cost_nora/total_minimum_cost," = = = =")
    
    print("= = = = Total Ratio(Qd-tree/PAW):",total_cost_qd/total_cost_paw," = = = =")
    print("= = = = Total PAW Cost:",total_cost_paw/test_cases," = = = =")
    
    print("= = = = Total Qd-tree Cost :",total_cost_qd/test_cases," = = = =")
    print("= = = = Total kd-tree Cost :",total_cost_kd/test_cases," = = = =")
#     print("= = = = Total NORA Cost:",total_cost_nora/test_cases," = = = =")
#     print("= = = = Total NORA disable bounding Cost:",total_cost_nora_disable_bounding/test_cases," = = = =")
#     print("= = = = Total Real Cost:",total_cost_real_result/test_cases," = = = =")
    print("= = = = Total Minimum Cost:",total_minimum_cost/test_cases," = = = =")
    print("")

!!!!! = = = = = Current Numer of Queries: 500  = = = = = !!!!!
Build Time (s): 328.39465045928955
= = = = Total Ratio(Qd-tree/PAW): 8.541856813386328  = = = =
= = = = Total PAW Cost: 40758.0  = = = =
= = = = Total Qd-tree Cost : 348149.0  = = = =
= = = = Total kd-tree Cost : 79451.0  = = = =
= = = = Total Minimum Cost: 29555.64  = = = =



In [227]:
# search random query percentage
block_size = 10000
# TPCH
used_dims = [0,1,2,3]
helper = DatasetAndQuerysetHelper(base_path = 'C:/Users/Cloud/iCloudDrive/NORA_experiments', used_dimensions = used_dims) # EXAMPLE
dataset, domains = helper.load_dataset(used_dims)
boundary = [interval[0] for interval in domains]+[interval[1] for interval in domains]

# OSM
used_dims = [0,1]
helper = DatasetAndQuerysetHelper(base_path = 'C:/Users/Cloud/iCloudDrive/NORA_experiments', used_dimensions = [0, 1])
dataset = np.genfromtxt('C:/Users/Cloud/iCloudDrive/NORA_experiments/dataset/OSM_6M.csv', delimiter=',')
boundary = [-180, -90, 180, 90]

pa_kd = PartitionAlgorithm()
pa_kd.InitializeWithKDT(len(boundary)//2, boundary, dataset, data_threshold = block_size)

for random_percent in [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0]: # maximum range
     
    print("!!!!! = = = = = Current Random Query Percent:", random_percent," = = = = = !!!!!")
    
    helper.random_percent = random_percent
    
    total_cost_qd = 0
    total_cost_kd = 0
    total_cost_nora = 0
    total_cost_nora_disable_bounding = 0
    total_cost_paw = 0
    total_cost_real_result = 0
    total_minimum_cost = 0
    
    for case in range(test_cases):
        
        training_set = helper._DatasetAndQuerysetHelper__generate_random_query(50, [1,1], [[-180,180],[-90,90]], [36,18])
        testing_set = helper.extend_queryset(training_set, domains = [[-180,180],[-90,90]])
        extended_training_set = helper.extend_queryset(training_set, domains = [[-180,180],[-90,90]])
    
#         training_set, testing_set = helper.generate_queryset_and_save(100, queryset_type=3) # 1-X (X is 5 at most by default)
#         np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2_2dims/random/train_"+str(int(random_percent*100))+"_"+str(case)+".csv", training_set, delimiter=',')
#         np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2_2dims/random/test_"+str(int(random_percent*100))+"_"+str(case)+".csv", testing_set, delimiter=',')
        
#         np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/random/train_"+str(int(random_percent*100))+"_"+str(case)+".csv", training_set, delimiter=',')
#         np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/random/test_"+str(int(random_percent*100))+"_"+str(case)+".csv", testing_set, delimiter=',')
        
        training_set = genfromtxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/random/train_"+str(int(random_percent*100))+"_"+str(case)+".csv", delimiter=',')
        testing_set = genfromtxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/random/test_"+str(int(random_percent*100))+"_"+str(case)+".csv", delimiter=',')
        extended_training_set = helper.extend_queryset(training_set)
        
        pa_qd = PartitionAlgorithm()
        pa_qd.InitializeWithQDT(training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size)
        # pa_qd.partition_tree.visualize(queries = training_set)
        cost_qd = pa_qd.partition_tree.evaluate_query_cost(testing_set)
        total_cost_qd += cost_qd
        
        cost_kd = pa_kd.partition_tree.evaluate_query_cost(testing_set)
        total_cost_kd += cost_kd

#         pa_nora = PartitionAlgorithm()
#         pa_nora.InitializeWithNORA(extended_training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size, 
#                                    using_1_by_1 = True, using_kd = True)
#         # pa_nora.partition_tree.visualize(queries = training_set)
#         cost_nora = pa_nora.partition_tree.evaluate_query_cost(testing_set)
#         total_cost_nora += cost_nora
        
#         pa_nora_disable_bounding = PartitionAlgorithm()
#         pa_nora_disable_bounding.InitializeWithQDT(extended_training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size)
#         cost_nora_disable_bounding = pa_nora_disable_bounding.partition_tree.evaluate_query_cost(testing_set)
#         total_cost_nora_disable_bounding += cost_nora_disable_bounding
        
        pa_paw = PartitionAlgorithm()
        pa_paw.InitializeWithPAW(training_set, len(boundary)//2, boundary, dataset, block_size, 1000, strategy = 1)
        cost_paw = pa_paw.partition_tree.evaluate_query_cost(training_set)
        total_cost_paw += cost_paw
    
        result_sizes = helper.real_result_size(dataset, testing_set)
        cost_real = sum(result_sizes)/len(result_sizes)
        total_cost_real_result += cost_real
        
        min_cost = max(cost_real, block_size)
        total_minimum_cost += min_cost
        
        # print("= = Cost Ratio Qd/Nora:",cost_qd / cost_nora, " Qd-tree average cost:", cost_qd, " Nora average cost:", cost_nora, "= =")
    
#     print("= = = = Total Ratio(Qd-tree/NORA):",total_cost_qd/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(kd-tree/NORA):",total_cost_kd/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(NORA_disable/NORA):",total_cost_nora_disable_bounding/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(Real/NORA):",total_cost_real_result/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(MinCost/NORA):",total_minimum_cost/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(NORA/MinCost):",total_cost_nora/total_minimum_cost," = = = =")

    print("= = = = Total Ratio(Qd-tree/PAW):",total_cost_qd/total_cost_paw," = = = =")
    print("= = = = Total PAW Cost:",total_cost_paw/test_cases," = = = =")
    
    print("= = = = Total Qd-tree Cost :",total_cost_qd/test_cases," = = = =")
    print("= = = = Total kd-tree Cost :",total_cost_kd/test_cases," = = = =")
#     print("= = = = Total NORA Cost:",total_cost_nora/test_cases," = = = =")
#     print("= = = = Total NORA disable bounding Cost:",total_cost_nora_disable_bounding/test_cases," = = = =")
#     print("= = = = Total Real Cost:",total_cost_real_result/test_cases," = = = =")
    print("= = = = Total Minimum Cost:",total_minimum_cost/test_cases," = = = =")
    print("")

!!!!! = = = = = Current Random Query Percent: 0  = = = = = !!!!!
 = = = Apply New Bounding Split for node: 15
 = = = Apply New Bounding Split for node: 20
 = = = Apply New Bounding Split for node: 33
 = = = Apply New Bounding Split for node: 36
 = = = Apply New Bounding Split for node: 38
 = = = Apply New Bounding Split for node: 44
 = = = Apply New Bounding Split for node: 46
 = = = Apply New Bounding Split for node: 52
 = = = Apply New Bounding Split for node: 55
 = = = Apply New Bounding Split for node: 56
 = = = Apply New Bounding Split for node: 59
 = = = Apply New Bounding Split for node: 60
 = = = Apply New Bounding Split for node: 62
 = = = Apply New Bounding Split for node: 63
 = = = Apply New Bounding Split for node: 67
 = = = Apply New Bounding Split for node: 68
 = = = Apply New Bounding Split for node: 77
 = = = Apply New Bounding Split for node: 78
 = = = Apply New Bounding Split for node: 81
 = = = Apply New Bounding Split for node: 90
 = = = Apply New Bounding Split for

 = = = Apply New Bounding Split for node: 55
 = = = Apply New Bounding Split for node: 59
 = = = Apply New Bounding Split for node: 68
 = = = Apply New Bounding Split for node: 74
 = = = Apply New Bounding Split for node: 75
 = = = Apply New Bounding Split for node: 78
 = = = Apply New Bounding Split for node: 80
 = = = Apply New Bounding Split for node: 82
 = = = Apply New Bounding Split for node: 84
 = = = Apply New Bounding Split for node: 86
 = = = Apply New Bounding Split for node: 93
 = = = Apply New Bounding Split for node: 94
 = = = Apply New Bounding Split for node: 95
 = = = Apply New Bounding Split for node: 98
 = = = Apply New Bounding Split for node: 99
 = = = Apply New Bounding Split for node: 100
 = = = Apply New Bounding Split for node: 104
 = = = Apply New Bounding Split for node: 107
 = = = Apply New Bounding Split for node: 108
 = = = Apply New Bounding Split for node: 110
 = = = Apply New Bounding Split for node: 112
 = = = Apply New Bounding Split for node: 116
 = 

 = = = Apply New Bounding Split for node: 88
 = = = Apply New Bounding Split for node: 89
 = = = Apply New Bounding Split for node: 94
 = = = Apply New Bounding Split for node: 96
 = = = Apply New Bounding Split for node: 97
 = = = Apply New Bounding Split for node: 99
 = = = Apply New Bounding Split for node: 102
 = = = Apply New Bounding Split for node: 103
 = = = Apply New Bounding Split for node: 104
 = = = Apply New Bounding Split for node: 105
 = = = Apply New Bounding Split for node: 109
 = = = Apply New Bounding Split for node: 110
 = = = Apply New Bounding Split for node: 113
 = = = Apply New Bounding Split for node: 115
 = = = Apply New Bounding Split for node: 122
 = = = Apply New Bounding Split for node: 130
 = = = Apply New Bounding Split for node: 131
 = = = Apply New Bounding Split for node: 132
 = = = Apply New Bounding Split for node: 143
 = = = Apply New Bounding Split for node: 144
 = = = Apply New Bounding Split for node: 146
 = = = Apply New Bounding Split for node

# Vary Skew Workload Parameters

In [315]:
# Using skew workload

# Vary Number of Cluster Centers

# search cluster center amount
block_size = 10000
# used_dims = [0,1]
used_dims = [0,1,2,3]
helper = DatasetAndQuerysetHelper(base_path = 'C:/Users/Cloud/iCloudDrive/NORA_experiments', used_dimensions = used_dims) # EXAMPLE

dataset, domains = helper.load_dataset(used_dims)
boundary = [interval[0] for interval in domains]+[interval[1] for interval in domains]

pa_kd = PartitionAlgorithm()
pa_kd.InitializeWithKDT(len(boundary)//2, boundary, dataset, data_threshold = block_size)

for num_center in [5, 10, 20, 50]: # maximum range
     
    print("!!!!! = = = = = Current Number of Centers:", num_center," = = = = = !!!!!")
    
    total_cost_qd = 0
    total_cost_kd = 0
    total_cost_nora = 0
    total_cost_nora_disable_bounding = 0
    total_cost_paw = 0
    total_cost_real_result = 0
    total_minimum_cost = 0
    
    for case in range(test_cases):
        
        helper.cluster_center_amount = num_center
        training_set, testing_set = helper.generate_queryset_and_save(100, queryset_type=4) # mixture Gaussian
        helper.visualize_queryset_and_dataset([0,1], training_set)
    
#         np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2_2dims/num_center/train_"+str(num_center)+"_"+str(case)+".csv", training_set, delimiter=',')
#         np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2_2dims/num_center/test_"+str(num_center)+"_"+str(case)+".csv", testing_set, delimiter=',')
        
        np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/num_center/train_"+str(num_center)+"_"+str(case)+".csv", training_set, delimiter=',')
        np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/num_center/test_"+str(num_center)+"_"+str(case)+".csv", testing_set, delimiter=',')
        
        #training_set = genfromtxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/num_center/train_"+str(num_center)+"_"+str(case)+".csv", delimiter=',')
        #testing_set = genfromtxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/num_center/test_"+str(num_center)+"_"+str(case)+".csv", delimiter=',')
        extended_training_set = helper.extend_queryset(training_set)
        
        pa_qd = PartitionAlgorithm()
        pa_qd.InitializeWithQDT(training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size)
        # pa_qd.partition_tree.visualize(queries = training_set)
        cost_qd = pa_qd.partition_tree.evaluate_query_cost(testing_set)
        total_cost_qd += cost_qd
        
        cost_kd = pa_kd.partition_tree.evaluate_query_cost(testing_set)
        total_cost_kd += cost_kd
        
        pa_paw = PartitionAlgorithm() # we don't need to optimize it?
        pa_paw.InitializeWithPAW(extended_training_set, len(boundary)//2, boundary, dataset, block_size, 1000, strategy = 1)
        cost_paw = pa_paw.partition_tree.evaluate_query_cost(testing_set)
        total_cost_paw += cost_paw
        
        result_sizes = helper.real_result_size(dataset, testing_set)
        cost_real = sum(result_sizes)/len(result_sizes)
        total_cost_real_result += cost_real
        
        min_cost = max(cost_real, block_size)
        total_minimum_cost += min_cost
        
        # print("= = Cost Ratio Qd/Nora:",cost_qd / cost_nora, " Qd-tree average cost:", cost_qd, " Nora average cost:", cost_nora, "= =")
    
#     print("= = = = Total Ratio(Qd-tree/NORA):",total_cost_qd/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(kd-tree/NORA):",total_cost_kd/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(NORA_disable/NORA):",total_cost_nora_disable_bounding/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(Real/NORA):",total_cost_real_result/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(MinCost/NORA):",total_minimum_cost/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(NORA/MinCost):",total_cost_nora/total_minimum_cost," = = = =")

    print("= = = = Total Ratio(Qd-tree/PAW):",total_cost_qd/total_cost_paw," = = = =")
    print("= = = = Total Ratio(kd-tree/PAW):",total_cost_kd/total_cost_paw," = = = =")
    print("= = = = Total Ratio(MinCost/PAW):",total_minimum_cost/total_cost_paw," = = = =")
    print("= = = = Total Ratio(PAW/MinCost):",total_cost_paw/total_minimum_cost," = = = =")
    
    print("= = = = Total Qd-tree Cost :",total_cost_qd/test_cases," = = = =")
    print("= = = = Total kd-tree Cost :",total_cost_kd/test_cases," = = = =")
    #print("= = = = Total NORA Cost:",total_cost_nora/test_cases," = = = =")
    #print("= = = = Total NORA disable bounding Cost:",total_cost_nora_disable_bounding/test_cases," = = = =")
    #print("= = = = Total Real Cost:",total_cost_real_result/test_cases," = = = =")
    print("= = = = Total PAW Cost:",total_cost_paw/test_cases," = = = =")
    print("= = = = Total Minimum Cost:",total_minimum_cost/test_cases," = = = =")
    print("")

!!!!! = = = = = Current Number of Centers: 5  = = = = = !!!!!


<IPython.core.display.Javascript object>

Build Time (s): 130.91899991035461
= = = = Total Ratio(Qd-tree/PAW): 72.61871199557767  = = = =
= = = = Total Ratio(kd-tree/PAW): 3.3466694306246545  = = = =
= = = = Total Ratio(MinCost/PAW): 0.6909894969596462  = = = =
= = = = Total Ratio(PAW/MinCost): 1.4472  = = = =
= = = = Total Qd-tree Cost : 1050938.0  = = = =
= = = = Total kd-tree Cost : 48433.0  = = = =
= = = = Total PAW Cost: 14472.0  = = = =
= = = = Total Minimum Cost: 10000.0  = = = =

!!!!! = = = = = Current Number of Centers: 10  = = = = = !!!!!


<IPython.core.display.Javascript object>

Build Time (s): 241.70131063461304
= = = = Total Ratio(Qd-tree/PAW): 63.753258674499726  = = = =
= = = = Total Ratio(kd-tree/PAW): 4.058564347347164  = = = =
= = = = Total Ratio(MinCost/PAW): 0.9179364787956673  = = = =
= = = = Total Ratio(PAW/MinCost): 1.0894  = = = =
= = = = Total Qd-tree Cost : 694528.0  = = = =
= = = = Total kd-tree Cost : 44214.0  = = = =
= = = = Total PAW Cost: 10894.0  = = = =
= = = = Total Minimum Cost: 10000.0  = = = =

!!!!! = = = = = Current Number of Centers: 20  = = = = = !!!!!


<IPython.core.display.Javascript object>

Build Time (s): 379.20997953414917
= = = = Total Ratio(Qd-tree/PAW): 33.30923584387026  = = = =
= = = = Total Ratio(kd-tree/PAW): 4.384093824445666  = = = =
= = = = Total Ratio(MinCost/PAW): 0.916254352208173  = = = =
= = = = Total Ratio(PAW/MinCost): 1.0914  = = = =
= = = = Total Qd-tree Cost : 363537.0  = = = =
= = = = Total kd-tree Cost : 47848.0  = = = =
= = = = Total PAW Cost: 10914.0  = = = =
= = = = Total Minimum Cost: 10000.0  = = = =

!!!!! = = = = = Current Number of Centers: 50  = = = = = !!!!!


<IPython.core.display.Javascript object>

Build Time (s): 680.4601354598999
= = = = Total Ratio(Qd-tree/PAW): 31.684286400768862  = = = =
= = = = Total Ratio(kd-tree/PAW): 5.03668108281275  = = = =
= = = = Total Ratio(MinCost/PAW): 0.8008970046452026  = = = =
= = = = Total Ratio(PAW/MinCost): 1.2486  = = = =
= = = = Total Qd-tree Cost : 395610.0  = = = =
= = = = Total kd-tree Cost : 62888.0  = = = =
= = = = Total PAW Cost: 12486.0  = = = =
= = = = Total Minimum Cost: 10000.0  = = = =



In [316]:
# Using skew workload

# Vary sigma

# search cluster center amount
block_size = 10000
# used_dims = [0,1]
used_dims = [0,1,2,3]
helper = DatasetAndQuerysetHelper(base_path = 'C:/Users/Cloud/iCloudDrive/NORA_experiments', used_dimensions = used_dims) # EXAMPLE

dataset, domains = helper.load_dataset(used_dims)
boundary = [interval[0] for interval in domains]+[interval[1] for interval in domains]

pa_kd = PartitionAlgorithm()
pa_kd.InitializeWithKDT(len(boundary)//2, boundary, dataset, data_threshold = block_size)

for sigma in [0.1, 0.2, 0.5, 1.0]: # maximum range
     
    print("!!!!! = = = = = Current Sigma:", sigma," = = = = = !!!!!")
    
    total_cost_qd = 0
    total_cost_kd = 0
    total_cost_nora = 0
    total_cost_nora_disable_bounding = 0
    total_cost_paw = 0
    total_cost_real_result = 0
    total_minimum_cost = 0
    
    for case in range(test_cases):
        
        helper.sigma_percent = sigma
        training_set, testing_set = helper.generate_queryset_and_save(100, queryset_type=4) # mixture Gaussian
        helper.visualize_queryset_and_dataset([0,1], training_set)
    
#         np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2_2dims/sigma/train_"+str(num_center)+"_"+str(case)+".csv", training_set, delimiter=',')
#         np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2_2dims/sigma/test_"+str(num_center)+"_"+str(case)+".csv", testing_set, delimiter=',')
        
        np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/sigma/train_"+str(int(sigma*100))+"_"+str(case)+".csv", training_set, delimiter=',')
        np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/sigma/test_"+str(int(sigma*100))+"_"+str(case)+".csv", testing_set, delimiter=',')
        
        #training_set = genfromtxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/sigma/train_"+str(int(sigma*100))+"_"+str(case)+".csv", delimiter=',')
        #testing_set = genfromtxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/sigma/test_"+str(int(sigma*100))+"_"+str(case)+".csv", delimiter=',')
        extended_training_set = helper.extend_queryset(training_set)
        
        pa_qd = PartitionAlgorithm()
        pa_qd.InitializeWithQDT(training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size)
        # pa_qd.partition_tree.visualize(queries = training_set)
        cost_qd = pa_qd.partition_tree.evaluate_query_cost(testing_set)
        total_cost_qd += cost_qd
        
        cost_kd = pa_kd.partition_tree.evaluate_query_cost(testing_set)
        total_cost_kd += cost_kd
        
        pa_paw = PartitionAlgorithm() # we don't need to optimize it?
        pa_paw.InitializeWithPAW(extended_training_set, len(boundary)//2, boundary, dataset, block_size, 1000, strategy = 1)
        cost_paw = pa_paw.partition_tree.evaluate_query_cost(testing_set)
        total_cost_paw += cost_paw
        
        result_sizes = helper.real_result_size(dataset, testing_set)
        cost_real = sum(result_sizes)/len(result_sizes)
        total_cost_real_result += cost_real
        
        min_cost = max(cost_real, block_size)
        total_minimum_cost += min_cost
        
        # print("= = Cost Ratio Qd/Nora:",cost_qd / cost_nora, " Qd-tree average cost:", cost_qd, " Nora average cost:", cost_nora, "= =")
    
#     print("= = = = Total Ratio(Qd-tree/NORA):",total_cost_qd/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(kd-tree/NORA):",total_cost_kd/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(NORA_disable/NORA):",total_cost_nora_disable_bounding/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(Real/NORA):",total_cost_real_result/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(MinCost/NORA):",total_minimum_cost/total_cost_nora," = = = =")
#     print("= = = = Total Ratio(NORA/MinCost):",total_cost_nora/total_minimum_cost," = = = =")

    print("= = = = Total Ratio(Qd-tree/PAW):",total_cost_qd/total_cost_paw," = = = =")
    print("= = = = Total Ratio(kd-tree/PAW):",total_cost_kd/total_cost_paw," = = = =")
    print("= = = = Total Ratio(MinCost/PAW):",total_minimum_cost/total_cost_paw," = = = =")
    print("= = = = Total Ratio(PAW/MinCost):",total_cost_paw/total_minimum_cost," = = = =")
    
    print("= = = = Total Qd-tree Cost :",total_cost_qd/test_cases," = = = =")
    print("= = = = Total kd-tree Cost :",total_cost_kd/test_cases," = = = =")
    #print("= = = = Total NORA Cost:",total_cost_nora/test_cases," = = = =")
    #print("= = = = Total NORA disable bounding Cost:",total_cost_nora_disable_bounding/test_cases," = = = =")
    #print("= = = = Total Real Cost:",total_cost_real_result/test_cases," = = = =")
    print("= = = = Total PAW Cost:",total_cost_paw/test_cases," = = = =")
    print("= = = = Total Minimum Cost:",total_minimum_cost/test_cases," = = = =")
    print("")

!!!!! = = = = = Current Sigma: 0.1  = = = = = !!!!!


<IPython.core.display.Javascript object>

Build Time (s): 262.0166790485382
= = = = Total Ratio(Qd-tree/PAW): 43.28763191219924  = = = =
= = = = Total Ratio(kd-tree/PAW): 2.8544533558463487  = = = =
= = = = Total Ratio(MinCost/PAW): 0.8442380751371887  = = = =
= = = = Total Ratio(PAW/MinCost): 1.1845  = = = =
= = = = Total Qd-tree Cost : 512742.0  = = = =
= = = = Total kd-tree Cost : 33811.0  = = = =
= = = = Total PAW Cost: 11845.0  = = = =
= = = = Total Minimum Cost: 10000.0  = = = =

!!!!! = = = = = Current Sigma: 0.2  = = = = = !!!!!


<IPython.core.display.Javascript object>

Build Time (s): 211.8584587574005
= = = = Total Ratio(Qd-tree/PAW): 38.06989102044493  = = = =
= = = = Total Ratio(kd-tree/PAW): 3.8545438169864  = = = =
= = = = Total Ratio(MinCost/PAW): 0.900657479960371  = = = =
= = = = Total Ratio(PAW/MinCost): 1.1103  = = = =
= = = = Total Qd-tree Cost : 422690.0  = = = =
= = = = Total kd-tree Cost : 42797.0  = = = =
= = = = Total PAW Cost: 11103.0  = = = =
= = = = Total Minimum Cost: 10000.0  = = = =

!!!!! = = = = = Current Sigma: 0.5  = = = = = !!!!!


<IPython.core.display.Javascript object>

Build Time (s): 215.1129744052887
= = = = Total Ratio(Qd-tree/PAW): 34.78486633439058  = = = =
= = = = Total Ratio(kd-tree/PAW): 3.925056637970095  = = = =
= = = = Total Ratio(MinCost/PAW): 0.9062075215224287  = = = =
= = = = Total Ratio(PAW/MinCost): 1.1035  = = = =
= = = = Total Qd-tree Cost : 383851.0  = = = =
= = = = Total kd-tree Cost : 43313.0  = = = =
= = = = Total PAW Cost: 11035.0  = = = =
= = = = Total Minimum Cost: 10000.0  = = = =

!!!!! = = = = = Current Sigma: 1.0  = = = = = !!!!!


<IPython.core.display.Javascript object>

Build Time (s): 196.83605027198792
= = = = Total Ratio(Qd-tree/PAW): 13.523913043478261  = = = =
= = = = Total Ratio(kd-tree/PAW): 4.7033288043478265  = = = =
= = = = Total Ratio(MinCost/PAW): 0.6793478260869565  = = = =
= = = = Total Ratio(PAW/MinCost): 1.472  = = = =
= = = = Total Qd-tree Cost : 199072.0  = = = =
= = = = Total kd-tree Cost : 69233.0  = = = =
= = = = Total PAW Cost: 14720.0  = = = =
= = = = Total Minimum Cost: 10000.0  = = = =



# Vary Dataset Size

In [341]:
# consider the scale factor 10 TPC-H dataset

# used_dims = [1,2]
used_dims = [1,2,3,4]
block_size = 10000 * 10 # this is important! block_size (here) * (1/sampling rate) = 128MB( = 1,000,000 records in Spark cluster)

# = = = Test Dataset and Queryset Loader = = = 
helper = DatasetAndQuerysetHelper(base_path = 'C:/Users/Cloud/iCloudDrive/NORA_experiments', 
                                  used_dimensions = used_dims, scale_factor = 10) # change scale factor to 10!

dataset, domains = helper.load_dataset(used_dims)
boundary = [interval[0] for interval in domains]+[interval[1] for interval in domains]
len(dataset) # 6004799
print(domains)

[[1.e+00 2.e+06]
 [1.e+00 1.e+05]
 [1.e+00 7.e+00]
 [1.e+00 5.e+01]]


In [342]:
num_dims = 4

# training_set, testing_set = helper.generate_queryset_and_save(100, queryset_type=2)
# np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/scale10/train.csv", training_set, delimiter=',')
# np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/scale10/test.csv", testing_set, delimiter=',')

training_set = genfromtxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/scale10/train.csv", delimiter=',')
testing_set = genfromtxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/scale10/test.csv", delimiter=',')
extended_training_set = helper.extend_queryset(training_set)

In [343]:
pa_kd = PartitionAlgorithm()
pa_kd.InitializeWithKDT(len(boundary)//2, boundary, dataset, data_threshold = block_size) # 447
pa_kd.partition_tree.visualize(queries = training_set)
# pa_kd.partition_tree.save_tree('C:/Users/Cloud/iCloudDrive/NORA_experiments/partition_layout/prob2_kdtree_scale10')\
pa_kd.partition_tree.evaluate_query_cost(testing_set, True)
    
pa_qd = PartitionAlgorithm()
pa_qd.InitializeWithQDT(training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size)
pa_qd.partition_tree.visualize(queries = training_set)
# pa_qd.partition_tree.save_tree('C:/Users/Cloud/iCloudDrive/NORA_experiments/partition_layout/prob2_qdtree_scale10')
pa_qd.partition_tree.evaluate_query_cost(testing_set, True)

pa_paw = PartitionAlgorithm()
pa_paw.InitializeWithPAW(extended_training_set, len(boundary)//2, boundary, dataset, block_size, 1000, strategy = 1) 
pa_paw.partition_tree.visualize(queries = training_set)
# pa_paw.partition_tree.save_tree('C:/Users/Cloud/iCloudDrive/NORA_experiments/partition_layout/prob2_nora_scale10')
pa_paw.partition_tree.evaluate_query_cost(testing_set, True)

<IPython.core.display.Javascript object>

Total logical IOs: 26018163
Average logical IOs: 216818
query 0 [50] cost: 180953
query 1 [50] cost: 180953
query 2 [50] cost: 180953
query 3 [50] cost: 180953
query 4 [79] cost: 100083
query 5 [79] cost: 100083
query 6 [79] cost: 100083
query 7 [79] cost: 100083
query 8 [56] cost: 100541
query 9 [56] cost: 100541
query 10 [56] cost: 100541
query 11 [56] cost: 100541
query 12 [56] cost: 100541
query 13 [12, 49] cost: 362232
query 14 [12, 49] cost: 362232
query 15 [12, 49] cost: 362232
query 16 [12, 49] cost: 362232
query 17 [11] cost: 181279
query 18 [64] cost: 100570
query 19 [64] cost: 100570
query 20 [64] cost: 100570
query 21 [69] cost: 174424
query 22 [69] cost: 174424
query 23 [69] cost: 174424
query 24 [69] cost: 174424
query 25 [69] cost: 174424
query 26 [83] cost: 100633
query 27 [83] cost: 100633
query 28 [70] cost: 174425
query 29 [70] cost: 174425
query 30 [70] cost: 174425
query 31 [70] cost: 174425
query 32 [49, 50] cost: 361905
query 33 [49, 50] cost: 361905
query 34 [56

<IPython.core.display.Javascript object>

Total logical IOs: 49996725
Average logical IOs: 416639
query 0 [55, 56, 40] cost: 736432
query 1 [55, 56, 40] cost: 736432
query 2 [55, 56, 40] cost: 736432
query 3 [55, 56, 40] cost: 736432
query 4 [47, 48] cost: 480867
query 5 [47, 48] cost: 480867
query 6 [47, 48] cost: 480867
query 7 [47, 48] cost: 480867
query 8 [33, 18, 38, 25] cost: 887986
query 9 [33, 18, 38, 25] cost: 887986
query 10 [33, 18, 38, 25] cost: 887986
query 11 [33, 18, 38, 25] cost: 887986
query 12 [33, 18, 38, 25] cost: 887986
query 13 [15, 53, 54] cost: 1338115
query 14 [15, 53, 54] cost: 1338115
query 15 [15, 53, 54] cost: 1338115
query 16 [15, 53, 54] cost: 1338115
query 17 [54, 32] cost: 447846
query 18 [45, 46] cost: 276995
query 19 [45, 46] cost: 276995
query 20 [45, 46] cost: 276995
query 21 [15, 35] cost: 1178135
query 22 [15, 35] cost: 1178135
query 23 [15, 35] cost: 1178135
query 24 [35] cost: 116731
query 25 [35] cost: 116731
query 26 [30] cost: 116366
query 27 [30] cost: 116366
query 28 [36] cost: 115

<IPython.core.display.Javascript object>

Total logical IOs: 17424813
Average logical IOs: 145206
query 0 [41] cost: 100002
query 1 [41] cost: 100002
query 2 [41] cost: 100002
query 3 [41] cost: 100002
query 4 [52] cost: 121594
query 5 [52] cost: 121594
query 6 [52] cost: 121594
query 7 [52] cost: 121594
query 8 [26] cost: 156346
query 9 [26] cost: 156346
query 10 [26] cost: 156346
query 11 [26] cost: 156346
query 12 [26] cost: 156346
query 13 [15] cost: 100008
query 14 [15] cost: 100008
query 15 [15] cost: 100008
query 16 [15] cost: 100008
query 17 [16] cost: 101984
query 18 [49] cost: 139097
query 19 [49] cost: 139097
query 20 [49] cost: 139097
query 21 [37] cost: 137752
query 22 [37] cost: 137752
query 23 [37] cost: 137752
query 24 [37] cost: 137752
query 25 [37] cost: 137752
query 26 [31] cost: 164593
query 27 [31] cost: 164593
query 28 [38] cost: 112425
query 29 [38] cost: 112425
query 30 [38] cost: 112425
query 31 [38] cost: 112425
query 32 [42] cost: 100003
query 33 [42] cost: 100003
query 34 [48] cost: 166456
query 35 

145206

In [252]:
# pa_qd.partition_tree.visualize(queries = training_set)
# pa_paw.partition_tree.visualize(queries = extended_training_set)

In [320]:
# consider the scale factor 50 TPC-H dataset

# used_dims = [1,2]
used_dims = [1,2,3,4]
block_size = 10000 * 2 # this is important! block_size (here) * (1/sampling rate) = 128MB( = 1,000,000 records in Spark cluster)

# = = = Test Dataset and Queryset Loader = = = 
helper = DatasetAndQuerysetHelper(base_path = 'C:/Users/Cloud/iCloudDrive/NORA_experiments', 
                                  used_dimensions = used_dims, scale_factor = 50) # change scale factor to 10!

dataset, domains = helper.load_dataset(used_dims)
boundary = [interval[0] for interval in domains]+[interval[1] for interval in domains]
len(dataset) # 5996516
print(domains)

[[1.e+00 1.e+07]
 [1.e+00 5.e+05]
 [1.e+00 7.e+00]
 [1.e+00 5.e+01]]


In [327]:
num_dims = 4

# training_set, testing_set = helper.generate_queryset_and_save(100, queryset_type=2)
# np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/scale50/train.csv", training_set, delimiter=',')
# np.savetxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/scale50/test.csv", testing_set, delimiter=',')

training_set = genfromtxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/scale50/train.csv", delimiter=',')
testing_set = genfromtxt("C:/Users/Cloud/iCloudDrive/NORA_experiments/queryset/prob2/scale50/test.csv", delimiter=',')
extended_training_set = helper.extend_queryset(training_set)

In [329]:
helper.visualize_queryset_and_dataset([0,1],training_set)

<IPython.core.display.Javascript object>

In [330]:
pa_kd = PartitionAlgorithm()
pa_kd.InitializeWithKDT(len(boundary)//2, boundary, dataset, data_threshold = block_size) # 447
pa_kd.partition_tree.visualize(queries = training_set)
# pa_kd.partition_tree.save_tree('C:/Users/Cloud/iCloudDrive/NORA_experiments/partition_layout/prob2_kdtree_scale50')   
pa_kd.partition_tree.evaluate_query_cost(testing_set, True)
print("")

pa_qd = PartitionAlgorithm()
pa_qd.InitializeWithQDT(training_set, len(boundary)//2, boundary, dataset, data_threshold = block_size)
pa_qd.partition_tree.visualize(queries = training_set)
# pa_qd.partition_tree.save_tree('C:/Users/Cloud/iCloudDrive/NORA_experiments/partition_layout/prob2_qdtree_scale50')
pa_qd.partition_tree.evaluate_query_cost(testing_set, True)
print("")

pa_paw = PartitionAlgorithm()
pa_paw.InitializeWithPAW(extended_training_set, len(boundary)//2, boundary, dataset, block_size, 1000, strategy = 1) 
pa_paw.partition_tree.visualize(queries = training_set)
# pa_paw.partition_tree.save_tree('C:/Users/Cloud/iCloudDrive/NORA_experiments/partition_layout/prob2_nora_scale50')
pa_paw.partition_tree.evaluate_query_cost(testing_set, True)
print("")

<IPython.core.display.Javascript object>

Total logical IOs: 10325599
Average logical IOs: 89787
query 0 [102, 218] cost: 69614
query 1 [102, 218] cost: 69614
query 2 [102, 218] cost: 69614
query 3 [102, 218] cost: 69614
query 4 [102, 218] cost: 69614
query 5 [34, 44] cost: 70000
query 6 [227] cost: 32109
query 7 [227] cost: 32109
query 8 [157] cost: 31861
query 9 [157] cost: 31861
query 10 [157] cost: 31861
query 11 [157] cost: 31861
query 12 [11, 13] cost: 173792
query 13 [19, 20, 59, 63, 143, 144, 175, 179] cost: 484453
query 14 [59, 63, 175, 179] cost: 135931
query 15 [59, 63, 175, 179] cost: 135931
query 16 [19, 20, 59, 63, 143, 144, 175, 179] cost: 484453
query 17 [34] cost: 34856
query 18 [34] cost: 34856
query 19 [34] cost: 34856
query 20 [34] cost: 34856
query 21 [121, 123] cost: 65344
query 22 [121, 123] cost: 65344
query 23 [121, 123] cost: 65344
query 24 [121, 123] cost: 65344
query 25 [121, 123] cost: 65344
query 26 [61] cost: 32209
query 27 [146] cost: 87188
query 28 [146] cost: 87188
query 29 [146] cost: 87188
q

<IPython.core.display.Javascript object>

Total logical IOs: 26798625
Average logical IOs: 233031
query 0 [97, 98] cost: 129757
query 1 [97, 98] cost: 129757
query 2 [97, 98] cost: 129757
query 3 [97, 98] cost: 129757
query 4 [97, 98] cost: 129757
query 5 [93] cost: 32051
query 6 [129, 132] cost: 43779
query 7 [129, 132] cost: 43779
query 8 [91, 129, 130] cost: 71518
query 9 [35, 91, 129, 130] cost: 401327
query 10 [35, 91, 129, 130] cost: 401327
query 11 [35, 91, 129, 130] cost: 401327
query 12 [109, 110, 33] cost: 415491
query 13 [113, 114, 115] cost: 117628
query 14 [113, 114, 115] cost: 117628
query 15 [113, 114, 115, 74] cost: 299459
query 16 [113, 114, 115] cost: 117628
query 17 [93] cost: 32051
query 18 [93] cost: 32051
query 19 [93] cost: 32051
query 20 [93] cost: 32051
query 21 [59, 61, 106] cost: 645101
query 22 [59, 61, 106] cost: 645101
query 23 [59, 61, 106] cost: 645101
query 24 [59, 61, 106] cost: 645101
query 25 [59, 61, 106] cost: 645101
query 26 [87, 88] cost: 59703
query 27 [121, 122] cost: 76327
query 28 [1

<IPython.core.display.Javascript object>

Total logical IOs: 3311249
Average logical IOs: 28793
query 0 [95] cost: 20922
query 1 [95] cost: 20922
query 2 [95] cost: 20922
query 3 [95] cost: 20922
query 4 [95] cost: 20922
query 5 [134] cost: 34045
query 6 [140] cost: 30086
query 7 [140] cost: 30086
query 8 [90] cost: 32916
query 9 [90] cost: 32916
query 10 [90] cost: 32916
query 11 [90] cost: 32916
query 12 [108] cost: 22785
query 13 [35, 113, 123] cost: 64229
query 14 [35, 113, 123] cost: 64229
query 15 [35, 113, 123] cost: 64229
query 16 [35, 113, 123] cost: 64229
query 17 [134] cost: 34045
query 18 [134] cost: 34045
query 19 [134] cost: 34045
query 20 [134] cost: 34045
query 21 [103] cost: 21911
query 22 [103] cost: 21911
query 23 [103] cost: 21911
query 24 [103] cost: 21911
query 25 [103] cost: 21911
query 26 [86] cost: 23489
query 27 [68] cost: 20001
query 28 [68] cost: 20001
query 29 [68] cost: 20001
query 30 [68] cost: 20001
query 31 [19] cost: 20903
query 32 [19] cost: 20903
query 33 [44] cost: 20001
query 34 [70] cost:

In [323]:
# pa_paw.partition_tree.visualize(queries = training_set)