In [16]:
import pandas as pd
import numpy as np
import random
import time

In [22]:
def process_chunk_sampling(row, prob_threshold, domains, sampled_subset):
    """Fold one row into the running domains and possibly keep it as a sample.

    Parameters
    ----------
    row : pandas.Series
        One table row; only its first ``len(domains)`` values are used.
    prob_threshold : float
        The row is kept when a ``random.uniform(0, 1)`` draw is <= this value.
    domains : list of [min, max]
        Running bounds, one pair per tracked column. Updated in place.
    sampled_subset : list
        Receives the first ``len(domains)`` values of the row (as a list)
        when the row is kept. Updated in place.
    """
    # The draw is made for every row, before any bound is touched.
    draw = random.uniform(0, 1)
    values = row.to_numpy()
    tracked = len(domains)

    for dim, bounds in enumerate(domains):
        value = values[dim]
        # bounds is [min, max]; widen whichever side the value exceeds.
        if value > bounds[1]:
            bounds[1] = value
        if value < bounds[0]:
            bounds[0] = value

    if draw <= prob_threshold:
        sampled_subset.append(values[:tracked].tolist())

def batchSampling(table_path, prob_threshold, total_dims, domain_dims, chunk_size):
    """Stream a '|'-delimited table in chunks, sample its rows and track column ranges.

    The first ``total_dims`` columns of the file are read ``chunk_size`` rows at
    a time. For the first ``domain_dims`` of those columns the running minimum
    and maximum over all rows are maintained, and every row is kept when a
    ``random.uniform(0, 1)`` draw is <= ``prob_threshold`` (one draw per row,
    in file order, so seeding ``random`` makes the sample reproducible).

    Parameters
    ----------
    table_path : str
        Path of the '|'-delimited input file (no header row is expected;
        column names are generated as ``_c0``, ``_c1``, ...).
    prob_threshold : float
        Per-row keep threshold compared against a uniform draw from [0, 1).
    total_dims : int
        Number of leading columns to read from the file.
    domain_dims : int
        Number of leading columns whose [min, max] is tracked and whose values
        are stored for each sampled row.
    chunk_size : int
        Number of rows read per chunk.

    Returns
    -------
    (sampled_subset, domains)
        ``sampled_subset`` is a list with one entry per kept row, each a list
        of that row's first ``domain_dims`` values. ``domains`` is a list of
        ``[min, max]`` pairs, one per tracked column; a pair stays
        ``[inf, -inf]`` if no comparable value was seen for that column.
    """
    domains = [[float('Infinity'), float('-Infinity')] for _ in range(domain_dims)]  # [min, max]
    sampled_subset = []
    tracked = len(domains)

    col_names = ['_c' + str(i) for i in range(total_dims)]
    cols = list(range(total_dims))

    start_time = time.time()

    reader = pd.read_table(table_path, delimiter='|', usecols=cols, names=col_names, chunksize=chunk_size)
    for batch_count, chunk in enumerate(reader):
        print('current chunk: ', batch_count)

        # Domain maintenance: reduce each tracked column once per chunk instead
        # of running a side-effecting callback through DataFrame.apply(axis=1),
        # which builds a Series per row and ties the result to how often apply
        # chooses to invoke the callback.
        for i in range(tracked):
            column = chunk.iloc[:, i]
            col_min, col_max = column.min(), column.max()
            # Comparisons (rather than min()/max()) leave the bound untouched
            # when the chunk reduction is NaN, e.g. for an all-missing column.
            if col_max > domains[i][1]:
                domains[i][1] = col_max
            if col_min < domains[i][0]:
                domains[i][0] = col_min

        # Sampling: one draw per row, in row order, from the same `random`
        # stream the row-wise implementation used.
        keep = np.array(
            [random.uniform(0, 1) <= prob_threshold for _ in range(len(chunk))],
            dtype=bool,
        )
        if keep.any():
            sampled_subset.extend(chunk.to_numpy()[keep, :tracked].tolist())

    end_time = time.time()
    print('total processing time: ', end_time - start_time)

    return sampled_subset, domains

In [27]:
# = = = Configuration = = =
# scale_factor selects the input file (lineitem_<scale_factor>.tbl); the
# sampling threshold is its reciprocal.
scale_factor = 10
prob_threshold = 1 / scale_factor
total_dims = 16    # leading columns read from the table
domain_dims = 8    # leading columns tracked for [min, max] and kept in samples
chunk_size = 100000

base_table_path = 'C:/Users/Cloud/iCloudDrive/HUAWEI_LKD/9a84f6cd-727f-4f10-ae95-10a0214e10a4-tpc-h-tool/2.18.0_rc2/dbgen/lineitem_'
table_path = '{}{}.tbl'.format(base_table_path, scale_factor)
base_save_path = 'C:/Users/Cloud/iCloudDrive/HUAWEI_LKD/Dataset/Robust/dataset/lineitem_'
save_path_data = '{}{}_{}.csv'.format(base_save_path, scale_factor, prob_threshold)
save_path_domain = '{}{}_{}_domains.csv'.format(base_save_path, scale_factor, prob_threshold)

# = = = Execution = = =
sampled_subset, domains = batchSampling(
    table_path,
    prob_threshold,
    total_dims,
    domain_dims,
    chunk_size,
)

# Write the sampled rows and the per-column [min, max] domains as CSV files.
sampled_subset = np.array(sampled_subset)
np.savetxt(save_path_data, sampled_subset, delimiter=',')
domains = np.array(domains)
np.savetxt(save_path_domain, domains, delimiter=',')

current chunk:  0
current chunk:  1
current chunk:  2
current chunk:  3
current chunk:  4
current chunk:  5
current chunk:  6
current chunk:  7
current chunk:  8
current chunk:  9
current chunk:  10
current chunk:  11
current chunk:  12
current chunk:  13
current chunk:  14
current chunk:  15
current chunk:  16
current chunk:  17
current chunk:  18
current chunk:  19
current chunk:  20
current chunk:  21
current chunk:  22
current chunk:  23
current chunk:  24
current chunk:  25
current chunk:  26
current chunk:  27
current chunk:  28
current chunk:  29
current chunk:  30
current chunk:  31
current chunk:  32
current chunk:  33
current chunk:  34
current chunk:  35
current chunk:  36
current chunk:  37
current chunk:  38
current chunk:  39
current chunk:  40
current chunk:  41
current chunk:  42
current chunk:  43
current chunk:  44
current chunk:  45
current chunk:  46
current chunk:  47
current chunk:  48
current chunk:  49
current chunk:  50
current chunk:  51
current chunk:  52
cur

current chunk:  416
current chunk:  417
current chunk:  418
current chunk:  419
current chunk:  420
current chunk:  421
current chunk:  422
current chunk:  423
current chunk:  424
current chunk:  425
current chunk:  426
current chunk:  427
current chunk:  428
current chunk:  429
current chunk:  430
current chunk:  431
current chunk:  432
current chunk:  433
current chunk:  434
current chunk:  435
current chunk:  436
current chunk:  437
current chunk:  438
current chunk:  439
current chunk:  440
current chunk:  441
current chunk:  442
current chunk:  443
current chunk:  444
current chunk:  445
current chunk:  446
current chunk:  447
current chunk:  448
current chunk:  449
current chunk:  450
current chunk:  451
current chunk:  452
current chunk:  453
current chunk:  454
current chunk:  455
current chunk:  456
current chunk:  457
current chunk:  458
current chunk:  459
current chunk:  460
current chunk:  461
current chunk:  462
current chunk:  463
current chunk:  464
current chunk:  465


In [30]:
# Observed domains for scale factor 10 (one [min, max] row per tracked dimension)
# array([[1.000000e+00, 6.000000e+07],
#        [1.000000e+00, 2.000000e+06],
#        [1.000000e+00, 1.000000e+05],
#        [1.000000e+00, 7.000000e+00],
#        [1.000000e+00, 5.000000e+01],
#        [9.009100e+02, 1.049495e+05],
#        [0.000000e+00, 1.000000e-01],
#        [0.000000e+00, 8.000000e-02]])

In [29]:
# Observed domains for scale factor 100 (one [min, max] row per tracked dimension)
# array([[1.000000e+00, 6.000000e+08],
#        [1.000000e+00, 2.000000e+07],
#        [1.000000e+00, 1.000000e+06],
#        [1.000000e+00, 7.000000e+00],
#        [1.000000e+00, 5.000000e+01],
#        [9.000500e+02, 1.049485e+05],
#        [0.000000e+00, 1.000000e-01],
#        [0.000000e+00, 8.000000e-02]])

In [None]:
# Block size for later computation: one million, integer-divided by the
# scale factor configured above.
block_size = 10 ** 6 // scale_factor