In [1]:
import numpy as np
import rtree
from rtree import index
import import_ipynb

In [23]:
# - dataset: numpy array

def DatasetGenerator(dataset):
    """Stream the rows of ``dataset`` in the form expected by ``index.Index``.

    Each yielded item is ``(row_id, bounding_box, payload)``:

    * ``row_id``       -- position of the row in ``dataset``;
    * ``bounding_box`` -- the degenerate box ``(x, y, x, y)`` built from the
      first two columns of the row (min corner == max corner, i.e. a point);
    * ``payload``      -- the row itself (``dataset[row_id]``).

    ``dataset`` must support ``len()`` and two-axis indexing
    (``dataset[row, col]``), as a 2-D numpy array does.
    """
    for row_id in range(len(dataset)):
        x = dataset[row_id, 0]
        y = dataset[row_id, 1]
        point_box = (x, y, x, y)
        yield (row_id, point_box, dataset[row_id])

# Generate Rtree MBRs using Bulk Loading (STR version)
# Hilbert R-tree: An improved R-tree using fractals. VLDB 1994
# STR: A Simple and Efficient Algorithm for R-Tree Packing. ICDE 1997
def GenerateRtreePartitionUsingBulkLoading(dataset, maximum_capacity = 10000):
    """Build an R-tree from ``dataset`` in one pass and return its leaf MBRs.

    The index is constructed directly from the ``DatasetGenerator`` stream
    instead of by repeated ``insert`` calls.

    Parameters
    ----------
    dataset : array passed to ``DatasetGenerator``.
    maximum_capacity : value assigned to the index's ``leaf_capacity``.

    Returns
    -------
    numpy.ndarray with one row per leaf, laid out as
    ``[L1, U1, L2, U2, ..., Ln, Un, count]`` where ``count`` is the number of
    entries held by that leaf.
    """
    props = index.Property()
    props.leaf_capacity = maximum_capacity  # maximum leaf capacity (original note: cannot be less than 100)
    props.fill_factor = 0.9
    props.overwrite = True

    tree = index.Index(DatasetGenerator(dataset), properties = props)
    leaf_nodes = tree.leaves()

    # Each leaf entry is indexed as [1] -> child ids, [2] -> bounds. The bounds
    # list is treated as "all lower values, then all upper values".
    dims = len(leaf_nodes[0][2]) // 2

    rows = []
    for leaf in leaf_nodes:
        bounds = leaf[2]
        row = []
        # Re-order (L1..Ln, U1..Un) into (L1, U1, ..., Ln, Un).
        for lower, upper in zip(bounds[:dims], bounds[dims:2 * dims]):
            row.extend((lower, upper))
        row.append(len(leaf[1]))
        rows.append(row)

    return np.array(rows)

In [9]:
# # debug
# p = index.Property()
# p.leaf_capacity = 10000
# p.fill_factor = 0.5
# p.overwrite = True

# # idx = index.Index(properties = p)
# idx = index.Index(DatasetGenerator(dataset),properties = p)

# leaves = idx.leaves()
# print(len(leaves)) # 171400
# print(len(leaves[0][1])) # 70

In [8]:
# reform the partition representation of RTree's MBR into kdnodes
# MBR[i, ...]: each row is L1, U1, L2, U2, ..., Ln, Un, Count
# kdnodes[i][0/1][k][0/1]: i-th partition; 0/1 = boundaries/count; k = dimension; 0/1 = min/max
#
def PartitionFormalization(RTreePartition):
    """Convert R-tree MBR rows into the kd-node list representation.

    Parameters
    ----------
    RTreePartition : array-like
        Rows laid out as ``[L1, U1, L2, U2, ..., Ln, Un, count]``. A single
        1-D row (what ``np.genfromtxt`` returns for a one-line file) is
        treated as one partition.

    Returns
    -------
    list
        One ``[domains, count]`` entry per row, where ``domains`` is
        ``[[L1, U1], ..., [Ln, Un]]`` and ``count`` is the row's last value.
        An empty input yields an empty list.
    """
    partitions = np.asarray(RTreePartition)

    # Nothing to convert; the original indexed row 0 and raised IndexError here.
    if partitions.size == 0:
        return []

    # Promote a single 1-D row to shape (1, width) so row/column indexing works.
    partitions = np.atleast_2d(partitions)

    # The trailing column is the count, so integer division drops it.
    dims = partitions.shape[1] // 2

    kdnodes = []
    for row in partitions:
        domains = [[row[2 * k], row[2 * k + 1]] for k in range(dims)]
        kdnodes.append([domains, row[-1]])
    return kdnodes

# rtree_partitions = genfromtxt('C:/Users/Cloud/iCloudDrive/HUAWEI_LKD/Dataset/Legacy/data/temp/RTreePartitions_M5K_Dim12.csv', delimiter=',')
# rtree_kdnodes = PartitionFormalization(rtree_partitions)

In [11]:
# # Test Rtree Bulk Loading

# tiny_kdnodes = [ [[[0,10],[0,4]],13], [[[0,10],[4,10]],14] ]

# def DatasetGenerator(kdnodes):
#     for i in range(len(kdnodes)):
#         lower = [domain[0] for domain in kdnodes[i][0]]
#         upper = [domain[1] for domain in kdnodes[i][0]]
#         border = tuple(lower + upper) # non interleave
#         yield(i, border, kdnodes[i][1])
#     return

# p = index.Property()
# p.leaf_capacity = 100 # cannot be less than 100, indicate the maximum capacity
# p.fill_factor = 0.5
# p.overwrite = True

# idx = index.Index(DatasetGenerator(tiny_kdnodes), properties = p)

In [9]:
# print(idx.leaves())
# print(list(idx.intersection((1.0, 1.0, 2.0, 2.0))))
# print(list(idx.intersection((1.0, 5.0, 2.0, 7.0))))

[(0, [0, 1], [0.0, 0.0, 10.0, 10.0])]
[0]
[1]


In [7]:
# import copy
# copy_index = copy.deepcopy(idx)
# print(list(idx.intersection((1.0, 5.0, 2.0, 7.0))))
# copy_index.delete(1, (0,4,10,10))
# print(list(idx.intersection((1.0, 5.0, 2.0, 7.0))))
# print(list(copy_index.intersection((1.0, 5.0, 2.0, 7.0))))

In [16]:
# = = = Global Configuration = = =
# This cell sets the experiment constants, loads the sampled dataset and its
# domains from CSV, restricts both to the dimensions in use, and loads the
# previously generated query workloads into training / testing sets.
import numpy as np

scale_factor = 100
prob_threshold = 1 / scale_factor # also embedded in the CSV file names built below
total_dims = 16 # the dimensions of lineitem table
domain_dims = 8 # the dimensions we use and maintain min/max for
chunk_size = 100000 # 0.1M

# base_table_path = 'C:/Users/Cloud/iCloudDrive/HUAWEI_LKD/9a84f6cd-727f-4f10-ae95-10a0214e10a4-tpc-h-tool/2.18.0_rc2/dbgen/lineitem_'
# table_path = base_table_path + str(scale_factor) + '.tbl'

# NOTE(review): absolute, machine-specific path; the notebook will not run elsewhere without editing it.
base_save_path = 'C:/Users/Cloud/iCloudDrive/HUAWEI_LKD/Dataset/Robust/dataset/lineitem_'
# File names follow the pattern lineitem_<scale_factor>_<prob_threshold>[_domains].csv
save_path_data = base_save_path + str(scale_factor) + '_' + str(prob_threshold) + '.csv'
save_path_domain = base_save_path + str(scale_factor) + '_' + str(prob_threshold) + '_domains.csv'

# By default the sampled size always equals 6M rows (i.e. scale factor 1), so for a
# higher scale factor the block size is divided by the scale factor.
block_size = 1000000 // scale_factor # in original file, 1M rows take approximately 128MB

# = = = Data Loading = = =
dataset = np.genfromtxt(save_path_data, delimiter=',') # the sampled subset
domains = np.genfromtxt(save_path_domain, delimiter=',') # the domain of that scale

# Configuration
used_dimensions = [1,2] # column indices 1 and 2, i.e. the second and third dimensions

# Keep only the selected columns of the data and the matching rows of the domains.
# Note: this rebinds `dataset` / `domains`, so re-running the cell reloads them first.
dataset = dataset[:,used_dimensions]
domains = domains[used_dimensions]

# = = = Query Loading = = =
# Wildcard import; load_query (used below) is presumably provided by this module -- verify.
from QueryGeneration import *

# = = = Query Generation = = =

# Configuration
total_queries = 100
random_percentage = 0.20
# 10% of each dimension's extent (domains[i,1] - domains[i,0]), truncated to int
maximum_random_range = [int((domains[i,1] - domains[i,0])*0.1) for i in range(len(domains))]
cluster_amount = 8
# same 10%-of-extent rule as maximum_random_range
maximum_range_dis = [int((domains[i,1] - domains[i,0])*0.1) for i in range(len(domains))]
sigmas_percentage = [0.2,0.2]
random_shift = False
return_seperate = True

# COMMENT THIS IF NOT GENERATING QUERIES
# mixed_queris = generate_query_with_random(total_queries, random_percentage, domains, maximum_random_range, 
#                                           cluster_amount, maximum_range_dis, sigmas_percentage, random_shift, return_seperate)
# plot_queries_2d_distribution_and_random(mixed_queris[1], mixed_queris[0], domains)
# random_query = mixed_queris[0]
# distribution_query = mixed_queris[1]

# = = = Query Saving / Loading = = =

# Configuration
# NOTE(review): absolute, machine-specific path.
query_base_path = 'C:/Users/Cloud/iCloudDrive/HUAWEI_LKD/Dataset/NORA/query/'

# Query files are named after the random percentage, e.g. alpha_20_distribution.csv
distribution_path = query_base_path + 'alpha_' + str(int(random_percentage*100)) +'_distribution.csv'
random_path = query_base_path + 'alpha_' + str(int(random_percentage*100)) +'_random.csv'

# Split indices: half of each group's nominal size (total_queries * share / 2).
# Entries before the index go to the training set, the rest to the testing set.
random_segmentation = int(total_queries * random_percentage / 2)
distribution_segmentation = int(total_queries * (1 - random_percentage) / 2)

### SELECT THIS ONE ###  save the generated queries
# save_query(mixed_queris[0], random_path)
# save_query(mixed_queris[1], distribution_path)
# training_set = np.concatenate((random_query[0:random_segmentation], distribution_query[0:distribution_segmentation]), axis=0)
# testing_set = np.concatenate((random_query[random_segmentation:], distribution_query[distribution_segmentation:]), axis = 0)

### OR THIS ONE ###  load the generated queries (if they were generated for another domain, they must be regenerated!)
distribution_query = load_query(distribution_path)
random_query = load_query(random_path)
# Training set = leading part of each group; testing set = the remainder of each group.
training_set = np.concatenate((distribution_query[0:distribution_segmentation], random_query[0:random_segmentation]), axis=0)
testing_set = np.concatenate((distribution_query[distribution_segmentation:], random_query[random_segmentation:]), axis = 0)

In [17]:
dataset.shape

(6001309, 2)

In [None]:
# Build a second R-tree by inserting the points one at a time (no stream /
# bulk loading), using the same leaf capacity as the bulk-loaded variant.

# `data_threshold` was previously assigned only inside a commented-out cell,
# so this cell raised NameError on a fresh kernel. Define it here from the
# configured block size.
data_threshold = block_size

p2 = index.Property()
p2.leaf_capacity = data_threshold # cannot be less than 100, indicate the maximum capacity
p2.overwrite = True

idx2 = index.Index(properties = p2)
for i in range(len(dataset)):
    # Each entry is the 2-D point (first column, second column) with the row number as its id.
    idx2.insert(i, (dataset[i,0], dataset[i,1]))

In [None]:
# Bulk-load the dataset into an R-tree and collect its leaf MBRs.
# This cell must run: the following cell reads `BulkLoadMBRs`, which was
# undefined on a fresh kernel while these lines were commented out.
data_threshold = block_size
BulkLoadMBRs = GenerateRtreePartitionUsingBulkLoading(dataset, maximum_capacity = data_threshold)
print(BulkLoadMBRs)

In [9]:
# Convert the leaf-MBR rows into kd-nodes: one [[ [min, max] per dimension ], count] entry per row.
rtree_kdnodes = PartitionFormalization(BulkLoadMBRs)

In [12]:
# Show the first two kd-nodes, one per line, as a sanity check of the conversion.
print(rtree_kdnodes[0], rtree_kdnodes[1], sep="\n")

[[[3499995.0, 8036770.0], [2.0, 36955.0]], 9000.0]
[[[45.0, 19787150.0], [3.0, 37262.0]], 9000.0]


In [10]:
# = = = Visualization = = =
# Wildcard import from the Utils notebook; visualize_kdnodes_and_query_2 is
# presumably defined there -- verify.
from Utils import *
# Draw only the first two kd-nodes within `domains`. The two empty lists fill the
# remaining positional arguments (presumably query sets -- confirm in Utils).
visualize_kdnodes_and_query_2(rtree_kdnodes[0:2], [], [], domains)

importing Jupyter notebook from Utils.ipynb


<IPython.core.display.Javascript object>

In [None]:
# Number of kd-nodes, i.e. one per R-tree leaf MBR row that was converted.
len(rtree_kdnodes)

In [12]:
# Wildcard import from the Query notebook; the `Query` callable used below comes from it.
from Query import *
# Run the testing queries against the R-tree partitions. Per the recorded output,
# this prints a cost and the list of intersected partition ids for each query.
Query(testing_set, rtree_kdnodes, random_percentage)

importing Jupyter notebook from Query.ipynb
Query 0  Cost:  234000.0  Intersected Partitions: [220, 221, 224, 226, 229, 233, 239, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505]
Query 1  Cost:  243000.0  Intersected Partitions: [14, 17, 19, 20, 21, 27, 28, 38, 374, 376, 379, 381, 383, 385, 386, 392, 393, 394, 544, 546, 547, 549, 551, 553, 555, 557, 558]
Query 2  Cost:  234000.0  Intersected Partitions: [220, 221, 224, 226, 229, 233, 239, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505]
Query 3  Cost:  234000.0  Intersected Partitions: [190, 191, 192, 193, 194, 195, 198, 199, 200, 201, 202, 205, 456, 457, 458, 459, 464, 465, 466, 468, 469, 470, 471, 474, 477, 479]
Query 4  Cost:  90000.0  Intersected Partitions: [130, 131, 134, 136, 301, 302, 303, 306, 307, 309]
Query 5  Cost:  234000.0  Intersected Partitions: [220, 221, 224, 226, 229, 233, 239, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496,