In [71]:
import numpy as np
import copy
from copy import deepcopy

In [106]:
class QueryTreeNode:
    
    def __init__(self, id, lower = 0, upper = 0, level = 0):
        self.id = id # should be identical at least in current QueryTree's current dimension
        self.lower = lower # lower border
        self.upper = upper # upper border
        
#         self.border = [] # all the dimensions
#         self.is_primary = False
#         self.primary_index = -1
#         self.status = 0 # 0: valid, 1: invalid
#         self.level = level # tree level, start from root - 0
        
        self.pid = [] # parent id, may be multiple
        self.cid = [] # child id, may be multiple
        self.previd = -1 # the previous node at the same level, by default -1, i.e., none
        self.nextid = -1 # the next node at the same level, by default -1, i.e., none

# queryset should have been pruned to

class QueryTree:
    
    def __init__(self, dimension=0):
        self.queryset = []
        self.dimension = dimension
        self.size = -1
        
        self.root = QueryTreeNode(-1)
        self.node_dict = {} # a dictionary mapping node id to node object
        self.primary_nodes = [] # a array store the primary nodes (i.e., the node just below the root)
        self.node_count = 0
    
    def loadQuerySetFromQueries(self, query):
        '''
        queries: [i,k,0/1] numpy object  i: ith query, k: kth dimension, 0/1: lower/upper
        '''
        self.queryset = query[:,self.dimension,:]    # [i, 2], each row has L U value
        if len(self.queryset) == 0:
            self.size = 0
            return
        self.queryset = np.unique(self.queryset,axis=0) # remove duplicated queries
        self.queryset = self.queryset[np.lexsort((-self.queryset[:,1],self.queryset[:,0]))] # sort by L first, then by U (reverse)
        
        # add an extra column for id at the end
        ids = np.array([[i] for i in range(len(self.queryset))]) # [i]: as the concat should has the same dimensions
        self.queryset = np.concatenate((self.queryset, ids), axis=1)
    
    def loadQuerySetFromFile(self, filename):
        
        '''
        Load the query form file separator by white space containing L U in each query dimension (interleaved),
          extract the target dimension, sort according to L first then -U. Append id at the last column.
        
        The queryset format in file:  L1 U1 L2 U2 L3 U3 ... (seperate by ' ')
        
        '''
        
        self.queryset = np.genfromtxt(filename, delimiter=' ')
        self.queryset = self.queryset.reshape(len(self.queryset),-1,2)
        self.queryset = self.queryset[:,self.dimension,:]    # [i, 2], each row has L U value
        self.queryset = np.unique(self.queryset,axis=0) # remove duplicated queries
        self.queryset = self.queryset[np.lexsort((-self.queryset[:,1],self.queryset[:,0]))] # sort by L first, then by U (reverse)
        
        # add an extra column for id at the end
        ids = np.array([[i] for i in range(len(self.queryset))]) # [i]: as the concat should has the same dimensions
        self.queryset = np.concatenate((self.queryset, ids), axis=1)
        
    def buildQueryTree(self):
        
        '''
        Should call loadQuerySetFromFile() first before calling this function.
        
        '''
        
        prev_node_id = 0
        
        for i in range(len(self.queryset)):
            
            current_query_lower = self.queryset[i,0]
            current_query_upper = self.queryset[i,1]
            
            # create a QueryNode for this query
            qNode = QueryTreeNode(i, current_query_lower, current_query_upper)
            
            # looking for queries that contains it, those queries are already sorted by L then -U. Notice this is a copy (no reference)
            container_queryset = self.queryset[ (self.queryset[:,0]<= current_query_lower) & (self.queryset[:,1]>=current_query_upper) ]
            
            if len(container_queryset) <= 1: # itself
                # add this node under root directly
                self.root.cid.append(i)
                qNode.pid.append(self.root.id) # append root id into its parent
                
                self.primary_nodes.append(qNode)
                self.node_dict.update({i:qNode})  # add this QueryNode to the dictionary, do it here as the following lines required
                
                if i == 0:
                    qNode.previd = -1
                    continue
                
                # handling prev node and next node
                qNode.previd = prev_node_id
                self.node_dict[prev_node_id].nextid = i
                prev_node_id = i
                
            else:
                # add this nodes under the previous queries, from reverse order!
                for j in reversed(range(len(container_queryset))):
                    container_id = int(container_queryset[j,-1]) # the last column is the id
                    if container_id == i: # itself
                        continue    
                    if container_id == -1: # ruled out
                        continue
                    
                    self.node_dict[container_id].cid.append(i)
                    qNode.pid.append(container_id)
                    
                    # remove the nodes contain this container_id from container_queryset !!!
                    container_lower = container_queryset[j,0]
                    container_upper = container_queryset[j,1]
                    # mark the id as -1, denoting these nodes are invalid
                    container_queryset[(container_queryset[:,0]<= container_lower)&(container_queryset[:,1]>=container_upper),-1] = -1  
                
                # add this QueryNode to the dictionary
                self.node_dict.update({i:qNode})
    
    def queryValue(self, mid_value):
        '''
        Query if an position (in domain, should be the median position) contains some queries.
        If no, it's good.
        If yes, find our the left and right border of that query(queries). 
        '''
        # using binary search to find the node that overlap the position
        lower_index = 0
        upper_index = len(self.primary_nodes) - 1
        mid_index = int((lower_index + upper_index) / 2) # the index of the array
        
        overlap_flag = False # false for no_overlap
        last_visit_node_id = -1
        last_visit_node_index = -1
        
        while True:
            current_lower = self.primary_nodes[mid_index].lower
            current_upper = self.primary_nodes[mid_index].upper
            last_visit_node_id = self.primary_nodes[mid_index].id
            last_visit_node_index = mid_index
            
            if mid_value >= current_lower and mid_value <= current_upper:
                overlap_flag = True
                break
                
            elif mid_value > current_upper:
                lower_index = mid_index + 1
                
            elif mid_value < current_lower:
                upper_index = mid_index - 1
            
            if upper_index < lower_index:
                break # the overlap flag should be False here
            
            mid_index = int((lower_index + upper_index) / 2)
    
        if overlap_flag:
            lower_node_id, lower_node_index, upper_node_id, upper_node_index = -1, -1, -1, -1
            lower_value = self.primary_nodes[last_visit_node_index].lower
            upper_value = self.primary_nodes[last_visit_node_index].upper
            # find the borders of the overlap node (all the way till non-overlap)
            for i in range(last_visit_node_index, len(self.primary_nodes)-1):
                if self.primary_nodes[i+1].lower < self.primary_nodes[i].upper:
                    upper_node_id = self.primary_nodes[i+1].id
                    upper_node_index = i + 1
                    upper_value = self.primary_nodes[i+1].upper
                else:
                    break
            
            for i in range(last_visit_node_index, 0, -1):
                if self.primary_nodes[i-1].upper > self.primary_nodes[i].lower:
                    lower_node_id = self.primary_nodes[i-1].id
                    lower_node_index = i - 1
                    lower_value = self.primary_nodes[i-1].lower
                else:
                    break  
                
            return True, (last_visit_node_index, lower_value, upper_value, lower_node_id, lower_node_index, upper_node_id, upper_node_index)
        else:
            return False, ()
    
    def diveIn(self, node_index, dive_position):
        '''
        This function dive in the query tree under the dive position. Then any primary node cover this dive position should 
        be cancelled.
        
        Currently, it's assumed to be called after queryValue(), such that the first overlap node is known
        
        node_index: the dive position's corresponding primary query node's index in self.primary_nodes 
            i.e., first output from cache, the last_visit_node_index
        dive_position: a value in this domain
        
        '''
        # find the overlaped primary nodes
        overlapped_upper_node_index = node_index
        overlapped_lower_node_index = node_index
        
        for i in range(node_index, len(self.primary_nodes)):
            if self.primary_nodes[i].lower <= dive_position:
                overlapped_upper_node_index = i
            else:
                break
        
        for i in reversed(range(node_index)):
            if self.primary_nodes[i].upper >= dive_position:
                overlapped_lower_node_index = i
            else:
                break
        
        # reorganize the query tree
        processed_count = 0
        current_index = overlapped_lower_node_index
        
        prev_node_id = self.primary_nodes[overlapped_lower_node_index].previd
        next_node_id = self.primary_nodes[overlapped_upper_node_index].nextid
        
        new_prim_count = 0
        new_prim_nodes_all = []
        
        while processed_count < overlapped_upper_node_index - overlapped_lower_node_index + 1 :
            
            processed_node_id = self.primary_nodes[current_index].id
            child_nodes_id = self.primary_nodes[current_index].cid
                
            self.root.cid.remove(processed_node_id)
            del self.primary_nodes[current_index]
            del self.node_dict[processed_node_id]
            
            # check the child nodes
            for i in range(len(child_nodes_id)):
                nid = child_nodes_id[i]
                ref = self.node_dict[nid]
                ref.pid.remove(processed_node_id)
                
                # should be added to primary node
                if len(ref.pid) == 0:
                    # add it to primary node !
                    ref.previd = prev_node_id
                    if prev_node_id != -1:
                        self.node_dict[prev_node_id].nextid = nid
                    prev_node_id = nid
                    
                    ref.pid.append(self.root.id)
                    self.root.cid.insert(overlapped_upper_node_index + new_prim_count + 1, nid)
                    new_prim_count += 1
                    
                    # add this node to the primary node cache
                    new_prim_nodes_all.append(ref)
            
            processed_count += 1
        
        # for the last (to be added) primary node
        if len(new_prim_nodes_all) == 0 and prev_node_id != -1:
            self.node_dict[prev_node_id].nextid = next_node_id
        if len(new_prim_nodes_all) >= 1:
            new_prim_nodes_all[len(new_prim_nodes_all)-1].nextid = next_node_id
        if next_node_id != -1:
            self.node_dict[next_node_id].previd = prev_node_id
        
        # add these nodes to the primary node
        for i in range(len(new_prim_nodes_all)):
            self.primary_nodes.insert(overlapped_lower_node_index + i, new_prim_nodes_all[i])
        
        return overlapped_upper_node_index - overlapped_lower_node_index + 1 # removed queries        

In [None]:
def CreateQueryTree(queryset_filepath, dims):
    query_trees = []
    for i in range(dims):
        qtree = QueryTree(i)
        qtree.loadQuerySetFromFile(queryset_filepath)
        qtree.buildQueryTree()
        query_trees.append(qtree)
    return query_trees

In [27]:
# # Unit Test
# qTree = QueryTree()
# qTree.loadQuerySetInNumpy('C:/Users/Cloud/iCloudDrive/HUAWEI_LKD/Dataset/Legacy/query/training.csv')

[[1.00000000e+00 4.25563989e+03]
 [1.02341336e+02 4.85909190e+03]
 [1.52009967e+02 4.36377676e+03]
 [1.70180435e+02 4.63882393e+03]]


In [109]:
# # === unit test ===
# print("= = = loadQuerySetInNumpy() = = =")
# qTree = QueryTree()
# qTree.loadQuerySetInNumpy('./TestQuery.csv')
# print(qTree.queryset)

# print("= = = buildQueryTree() = = =")
# qTree.buildQueryTree()
# for key in qTree.node_dict:
#     print(qTree.node_dict[key].__dict__)

# print("= = = primary_nodes = = =")
# for node in qTree.primary_nodes:
#     print(node.__dict__)

# print("= = = queryValue() = = =")
# is_overlap, cache = qTree.queryValue(3.0)
# print(is_overlap, cache)

# print("= = = diveIn() = = =")
# qTree.diveIn(cache[0], 3.0)
# for node in qTree.primary_nodes:
#     print(node.__dict__)
# print("= = = node_dict = = =")
# for key in qTree.node_dict:
#     print(qTree.node_dict[key].__dict__)
    
# # after dive in, test query again
# # print("= = = queryValue() = = =")
# # is_overlap, cache = qTree.queryValue(3.0)
# # print(is_overlap, cache)

# # print("= = = diveIn() = = =")
# # qTree.diveIn(cache[0], 3.0)
# # for node in qTree.primary_nodes:
# #     print(node.__dict__)
# # print("= = = node_dict = = =")
# # for key in qTree.node_dict:
# #     print(qTree.node_dict[key].__dict__)
    
# # # after dive in, test query again
# # print("= = = queryValue() = = =")
# # is_overlap, cache = qTree.queryValue(3.0)
# # print(is_overlap, cache)

= = = loadQuerySetInNumpy() = = =
[[0.   3.   0.  ]
 [0.5  2.5  1.  ]
 [1.   4.   2.  ]
 [1.25 2.25 3.  ]
 [2.   3.75 4.  ]
 [3.25 5.25 5.  ]]
= = = buildQueryTree() = = =
{'id': 0, 'lower': 0.0, 'upper': 3.0, 'level': 0, 'pid': [-1], 'cid': [1], 'previd': -1, 'nextid': 2}
{'id': 1, 'lower': 0.5, 'upper': 2.5, 'level': 0, 'pid': [0], 'cid': [3], 'previd': -1, 'nextid': -1}
{'id': 2, 'lower': 1.0, 'upper': 4.0, 'level': 0, 'pid': [-1], 'cid': [3, 4], 'previd': 0, 'nextid': 5}
{'id': 3, 'lower': 1.25, 'upper': 2.25, 'level': 0, 'pid': [2, 1], 'cid': [], 'previd': -1, 'nextid': -1}
{'id': 4, 'lower': 2.0, 'upper': 3.75, 'level': 0, 'pid': [2], 'cid': [], 'previd': -1, 'nextid': -1}
{'id': 5, 'lower': 3.25, 'upper': 5.25, 'level': 0, 'pid': [-1], 'cid': [], 'previd': 2, 'nextid': -1}
= = = primary_nodes = = =
{'id': 0, 'lower': 0.0, 'upper': 3.0, 'level': 0, 'pid': [-1], 'cid': [1], 'previd': -1, 'nextid': 2}
{'id': 2, 'lower': 1.0, 'upper': 4.0, 'level': 0, 'pid': [-1], 'cid': [3, 4], 'pr

In [76]:
node = QueryTreeNode
test_array = []
test_array.append(node)
test_dict = {1:node}
# node.previd = 10
test_dict[1].previd=20
print(test_array[0].previd)
# del test_array[0]
print(test_dict[1])

copy_node = deepcopy(node)
# copy_node = copy.copy(node)
copy_node.previd = 2000
print(test_array[0].previd)
print(node.previd)


20
<class '__main__.QueryTreeNode'>
2000
2000
True


In [29]:
selected = qTree.queryset[(qTree.queryset[:,0]<200) & (qTree.queryset[:,1]<5000)]
print(selected)

[[1.00000000e+00 4.25563989e+03]
 [1.02341336e+02 4.85909190e+03]
 [1.52009967e+02 4.36377676e+03]
 [1.70180435e+02 4.63882393e+03]]


In [31]:
ids = np.array([[i] for i in range(len(selected))])
selected = np.concatenate((selected, ids), axis=1)
print(selected)

[[1.00000000e+00 4.25563989e+03 0.00000000e+00]
 [1.02341336e+02 4.85909190e+03 1.00000000e+00]
 [1.52009967e+02 4.36377676e+03 2.00000000e+00]
 [1.70180435e+02 4.63882393e+03 3.00000000e+00]]


In [2]:
# a = 'str'
# b = a
# b += 'lalala'
# print(a)

str


In [4]:
# a = [1,2,3]
# b = a.copy()
# b.append(4)
# print(a)

[1, 2, 3]


In [12]:
# a = np.array([[2,30,200], [1,30,200], [2,10,200]])
# print(a)

[[  2  30 200]
 [  1  30 200]
 [  2  10 200]]


In [14]:
# print(a[np.lexsort((-a[:,0],a[:,1]))])

[[  2  10 200]
 [  2  30 200]
 [  1  30 200]]


In [13]:
# for j in range(10,0,-1):
#     print(j)

10
9
8
7
6
5
4
3
2
1


In [37]:
# for j in reversed(range(10)):
#     print(j)

9
8
7
6
5
4
3
2
1
0


In [38]:
# dicts = {1:21, 2:22, 3:33}
# dicts[1]+=100
# print(dicts[1])

121


In [41]:
# test_array = np.array([[1,2,3],[4,5,6],[7,8,9]])
# print(test_array)

[[1 2 3]
 [4 5 6]
 [7 8 9]]


In [47]:
# test_array[test_array[:,2] <= 6, -1] = 100
# print(test_array)

[[  1   2 100]
 [  4   5 100]
 [  7   8   9]]


In [44]:
# selected[0,-1] = 100
# print(selected)

[[  1   2 100]
 [  4   5   6]]


In [45]:
# print(test_array)

[[1 2 3]
 [4 5 6]
 [7 8 9]]


In [58]:
# a = [1,2,3,4,5,6,7,8,9,10]

# del a[3]
# print(a[3])
# print(a)

5
[1, 2, 3, 5, 6, 7, 8, 9, 10]
