In [1]:
import findspark
findspark.init() # this must be executed before the below import

In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark import SparkFiles

In [3]:
import pandas as pd
import time
import rtree
from rtree import index
import numpy as np
from numpy import genfromtxt
from multiprocessing import Pool
import threading

In [4]:
from DRProcess import *
from DDProcess import *

In [5]:
# Spark settings: executor, driver and off-heap memory are each set to "8g",
# and off-heap memory is switched on.
conf = SparkConf().setAll([("spark.executor.memory", "8g"),("spark.driver.memory","8g"),
                           ("spark.memory.offHeap.enabled",True),("spark.memory.offHeap.size","8g")])

# `sc` and `sqlContext` are notebook-level globals; `sqlContext` is read by
# DumpThread.run and dump_dict_data_2_hdfs when they build Spark DataFrames.
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

In [6]:
# Display the configuration of the running SparkContext as (key, value) pairs.
sc.getConf().getAll()

[('spark.memory.offHeap.size', '8g'),
 ('spark.executor.id', 'driver'),
 ('spark.driver.port', '43195'),
 ('spark.app.id', 'local-1603182118708'),
 ('spark.app.name', 'pyspark-shell'),
 ('spark.rdd.compress', 'True'),
 ('spark.driver.host', '10.88.88.103'),
 ('spark.driver.memory', '8g'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[*]'),
 ('spark.executor.memory', '8g'),
 ('spark.submit.pyFiles', ''),
 ('spark.submit.deployMode', 'client'),
 ('spark.memory.offHeap.enabled', 'True'),
 ('spark.ui.showConsoleProgress', 'true')]

In [7]:
class DumpThread(threading.Thread):
    """Thread that appends one slice of the buffered partitions to parquet files.

    ``parameters`` is unpacked in :meth:`run` as
    ``(start_index, end_index, pids, pid_data_dict, hdfs_path, column_names)``.
    """

    def __init__(self, thread_id, name, parameters):
        super().__init__()
        self.thread_id = thread_id
        self.name = name
        self.parameters = parameters

    def run(self):
        """Write every partition id in ``pids[start_index:end_index]`` and reset its buffer."""
        print('start dumping thread: ', self.thread_id, self.name)
        first, last, pids, pid_data_dict, hdfs_path, column_names = self.parameters
        assigned_pids = pids[first:last]
        for pid in assigned_pids:
            target = hdfs_path + 'partition_' + str(pid)+'.parquet'
            # rows -> pandas frame -> Spark DataFrame (via the global sqlContext)
            frame = pd.DataFrame(pid_data_dict[pid], columns=column_names)
            spark_frame = sqlContext.createDataFrame(frame)
            spark_frame.write.mode('append').parquet(target)
            # the rows are persisted now, so drop them from the shared buffer
            pid_data_dict[pid] = []
        print('exit dumping thread: ', self.thread_id, self.name)
        
def dump_dict_data_2_hdfs(pid_data_dicts, column_names, hdfs_path, num_threads = 8):
    """Merge per-process buffers and append each partition's rows to parquet.

    Args:
        pid_data_dicts: list of dicts mapping partition id -> list of rows.
            The list is mutated: everything is merged into the first dict,
            the other dicts are cleared, and each written partition is reset
            to an empty list.
        column_names: column names used when building the pandas DataFrame.
        hdfs_path: path prefix; each partition is written to
            ``hdfs_path + 'partition_<pid>.parquet'`` in append mode.
        num_threads: 1 writes from the calling thread; any larger value
            splits the partition ids into disjoint slices, one DumpThread each.

    Raises:
        ValueError: if ``num_threads`` is smaller than 1 (previously 0 raised
            ZeroDivisionError and negative values silently wrote nothing).
    """
    if num_threads < 1:
        raise ValueError('num_threads must be >= 1, got ' + str(num_threads))
    if not pid_data_dicts:
        # nothing buffered at all; the original indexed [0] and raised IndexError
        return

    # first merge all the dicts
    base_dict = pid_data_dicts[0]
    for other in pid_data_dicts[1:]:
        for key, val in other.items():
            if key in base_dict:
                base_dict[key] += val
            else:
                base_dict[key] = val
        other.clear()

    pids = list(base_dict.keys())

    if num_threads == 1:
        print('start dumping single thread (main)')
        for pid in pids:
            path = hdfs_path + 'partition_' + str(pid)+'.parquet'
            pdf = pd.DataFrame(base_dict[pid], columns=column_names)
            df = sqlContext.createDataFrame(pdf)
            df.write.mode('append').parquet(path)
            base_dict[pid] = []
        print('finish dumping single thread (main)')

    else:
        # apply multi-threading to save; slices are disjoint, so every
        # partition file is written by exactly one thread
        step = int(len(pids) / num_threads) + 1
        threads = []
        for i in range(num_threads):
            start_index = i * step
            if start_index >= len(pids):
                # check BEFORE starting, so no thread is spawned for an empty slice
                break
            end_index = (i+1) * step
            parameters = [start_index, end_index, pids, base_dict, hdfs_path, column_names]
            thread = DumpThread(i, 'dump_thread_'+str(i), parameters)
            thread.start()
            threads.append(thread)
        for t in threads:
            t.join()

# used for multi-process writing
def merge_dicts(pid_data_dicts, num_process):
    """Merge per-process buffers and re-split them into non-overlapping dicts.

    All dicts are merged into the first one (values of a shared key are
    combined with ``+=``) and the remaining input dicts are cleared. The
    merged entries are then distributed over ``num_process`` new dicts so
    that each partition id appears in exactly one of them.

    The bucket is ``key % num_process``. It depends only on the id itself,
    so it is always a valid index (the previous ``key // step`` raised
    IndexError for sparse or large ids) and a given id maps to the same
    bucket on every call, regardless of which other ids are present.

    Args:
        pid_data_dicts: list of dicts mapping integer partition id -> rows.
        num_process: number of output dicts.

    Returns:
        A list of ``num_process`` dicts with pairwise disjoint key sets.

    Raises:
        ValueError: if ``num_process`` is smaller than 1.
    """
    if num_process < 1:
        raise ValueError('num_process must be >= 1, got ' + str(num_process))

    non_overlap_dicts = [{} for _ in range(num_process)]
    if not pid_data_dicts:
        return non_overlap_dicts

    base_dict = pid_data_dicts[0]
    for other in pid_data_dicts[1:]:
        for key, val in other.items():
            if key in base_dict:
                base_dict[key] += val
            else:
                base_dict[key] = val
        other.clear()

    # re allocate to non-overlap dicts
    for key, val in base_dict.items():
        non_overlap_dicts[key % num_process][key] = val

    return non_overlap_dicts

In [8]:
def batch_data_parallel(table_path, partition_path, chunk_size, used_dims, hdfs_path,
                        num_dims, dump_threshold = 1000000, num_process = 8):
    """Route a '|'-delimited table to partitions in parallel and persist them.

    The table is read in chunks of ``chunk_size`` rows. Every ``num_process``
    chunks are handed to ``process_chunk`` through a process pool, one chunk
    and one buffer dict per worker. Once at least ``dump_threshold`` rows
    have been processed since the last dump, the buffers are made
    non-overlapping with ``merge_dicts`` and written with ``dump_data``.
    After the input is exhausted the remaining chunks and buffers are
    flushed and ``merge_parquets`` is run once per worker.

    Args:
        table_path: path of the input table.
        partition_path: passed through to ``process_chunk``.
        chunk_size: rows per chunk.
        used_dims: passed through to ``process_chunk``.
        hdfs_path: passed through to ``dump_data`` and ``merge_parquets``.
        num_dims: number of leading columns to read; they are named
            ``_c0 .. _c{num_dims-1}``.
        dump_threshold: processed-row count that triggers a dump.
        num_process: pool size and number of chunks per processing round.
    """
    begin_time = time.time()

    col_names = ['_c'+str(i) for i in range(num_dims)]
    cols = list(range(num_dims))

    pid_data_dicts = [{} for _ in range(num_process)]
    chunks = []

    count = 0
    epochs = 0
    processed_data = 0
    pool = Pool(processes = num_process) # the pool should be reused, or incur memory leak!
    pids_each_process = [set() for _ in range(num_process)] # used for final merge

    try:
        for chunk in pd.read_table(table_path, delimiter='|', usecols=cols, names=col_names, chunksize=chunk_size):
        #for chunk in pd.read_csv(table_path, usecols=cols, names=col_names, chunksize=chunk_size):
            print('current chunk: ', count)
            chunks.append(chunk)
            if count % num_process == num_process - 1:
                paras = [[chunks[k], used_dims, partition_path, pid_data_dicts[k]] for k in range(num_process)]
                pid_data_dicts = pool.map(process_chunk, paras)
                print('===================================================')
                chunks = []
                processed_data += chunk_size * num_process

                # dump data to file
                if processed_data >= dump_threshold:
                    # parquet write is not thread safe, avoid concurrent write
                    pid_data_dicts = merge_dicts(pid_data_dicts, num_process) # make it non-overlap
                    for k in range(num_process):
                        pids_each_process[k].update(pid_data_dicts[k].keys())
                    paras = [[epochs, pid_data_dicts[k], col_names, hdfs_path] for k in range(num_process)]
                    pool.map(dump_data, paras)
                    #dump_dict_data_2_hdfs(pid_data_dicts, col_names, hdfs_path) # multi-thread
                    processed_data = 0
                    epochs += 1
                    for i in range(num_process):
                        pid_data_dicts[i].clear()
            count += 1

        dict_size = [len(pid_data_dicts[i]) for i in range(num_process)]
        print('after exit, chunks size: ', len(chunks))
        print('after exit, each dict size: ', dict_size)
        # process the last batch (fewer than num_process chunks are left over)
        if len(chunks) != 0:
            paras = [[chunks[k], used_dims, partition_path, pid_data_dicts[k]] for k in range(len(chunks))]
            pid_data_dicts[0:len(chunks)] = pool.map(process_chunk, paras)

        dict_size = [len(pid_data_dicts[i]) for i in range(num_process)]
        print('after last chunk, each dict size: ', dict_size)

        # Flush if ANY buffer still holds rows; checking only the first
        # buffer could silently drop rows held by the others.
        if any(len(d) != 0 for d in pid_data_dicts):
            pid_data_dicts = merge_dicts(pid_data_dicts, num_process) # make it non-overlap
            paras = [[epochs, pid_data_dicts[k], col_names, hdfs_path] for k in range(num_process)]
            pool.map(dump_data, paras)
            #dump_dict_data_2_hdfs(pid_data_dicts, col_names, hdfs_path)
            for k in range(num_process):
                pids_each_process[k].update(pid_data_dicts[k].keys())

        pid_data_dicts.clear() # release memory

        # final merge
        # NOTE(review): epochs is incremented even when no final flush
        # happened; presumably merge_parquets tolerates a missing last
        # epoch - confirm against its implementation.
        epochs += 1
        paras = [[epochs, pids_each_process[k], hdfs_path] for k in range(num_process)]
        pool.map(merge_parquets, paras)

        pool.close()
    except BaseException:
        # do not leave worker processes behind when anything fails
        pool.terminate()
        raise
    finally:
        pool.join()

    finish_time = time.time()
    print('total data routing and persisting time: ', finish_time - begin_time)

In [9]:
# = = = Configuration = = =
# Every path below is derived from scale_factor, so changing it here is enough.
scale_factor = 100

table_base_path = '/home/ubuntu/TPCH/dbgen/'
table_path = table_base_path + 'lineitem_' + str(scale_factor) + '.tbl'

num_process = 12
chunk_size = 200000
dump_threshold = 12000000 # 12M rows per dump (6M rows = about 1GB raw data)

num_dims = 16
used_dims = [1,2]

# base path of HDFS
hdfs_base_path = 'hdfs://10.88.88.103:9000/user/cloudray/'

# use scale_factor (not a literal 100) so the HDFS targets follow the partition layouts
nora_hdfs = hdfs_base_path + 'NORA/scale' + str(scale_factor) + '/'
qdtree_hdfs = hdfs_base_path + 'QdTree/scale' + str(scale_factor) + '/'
kdtree_hdfs = hdfs_base_path + 'KDTree/scale' + str(scale_factor) + '/'

# base path of Partition
partition_base_path = '/home/ubuntu/PartitionLayout/'

nora_partition = partition_base_path + 'nora_partitions_' + str(scale_factor)
qdtree_partition = partition_base_path + 'qdtree_partitions_' + str(scale_factor)
kdtree_partition = partition_base_path + 'kdt_partitions_' + str(scale_factor)

# Legacy
# table_path = '/home/cloudray/Downloads/TPCH_12M_8Field.csv'
# table_path = '/home/cloudray/TPCH/2.18.0_rc2/dbgen/lineitem.tbl'

# partition_path = '/home/cloudray/NORA_Partitions/nora_partitions'
# partition_path = '/home/cloudray/NORA_Partitions/qd_tree_partitions'

# hdfs_path = 'hdfs://localhost:9000/user/cloudray/NORA/'
# hdfs_path = 'hdfs://localhost:9000/user/cloudray/QdTree/'

# partition_path = '/home/cloudray/NORA_Partitions/nora_test'
# partition_path = '/home/cloudray/NORA_Partitions/qd_tree_test'

# hdfs_path = 'hdfs://localhost:9000/user/cloudray/NORA_Test/'
# hdfs_path = 'hdfs://localhost:9000/user/cloudray/QdTree_Test/'

In [10]:
# = = = Execution = = =
# Only the KD-tree layout is routed in this cell; the NORA and QdTree calls
# below are kept commented out and take the same arguments apart from the
# partition file and HDFS target.
# batch_data_parallel(table_path, nora_partition, chunk_size, used_dims, nora_hdfs, num_dims, dump_threshold, num_process)
# print('finish nora data routing..')
# batch_data_parallel(table_path, qdtree_partition, chunk_size, used_dims, qdtree_hdfs, num_dims, dump_threshold, num_process)
# print('finish qdtree data routing..')
batch_data_parallel(table_path, kdtree_partition, chunk_size, used_dims, kdtree_hdfs, num_dims, dump_threshold, num_process)
print('finish kdtree data routing..')

current chunk:  0
current chunk:  1
current chunk:  2
current chunk:  3
current chunk:  4
current chunk:  5
current chunk:  6
current chunk:  7
current chunk:  8
current chunk:  9
current chunk:  10
current chunk:  11
current chunk:  12
current chunk:  13
current chunk:  14
current chunk:  15
current chunk:  16
current chunk:  17
current chunk:  18
current chunk:  19
current chunk:  20
current chunk:  21
current chunk:  22
current chunk:  23
current chunk:  24
current chunk:  25
current chunk:  26
current chunk:  27
current chunk:  28
current chunk:  29
current chunk:  30
current chunk:  31
current chunk:  32
current chunk:  33
current chunk:  34
current chunk:  35
current chunk:  36
current chunk:  37
current chunk:  38
current chunk:  39
current chunk:  40
current chunk:  41
current chunk:  42
current chunk:  43
current chunk:  44
current chunk:  45
current chunk:  46
current chunk:  47
current chunk:  48
current chunk:  49
current chunk:  50
current chunk:  51
current chunk:  52
cur

exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
current chunk:  300
current chunk:  301
current chunk:  302
current chunk:  303
current chunk:  304
current chunk:  305
current chunk:  306
current chunk:  307
current chunk:  308
current chunk:  309
current chunk:  310
current chunk:  311
current chunk:  312
current chunk:  313
current chunk:  314
current chunk:  315
current chunk:  316
current chunk:  317
current chunk:  318
current chunk:  319
current chunk:  320
current chunk:  321
current chunk:  322
current chunk:  323
current chunk:  324
current chunk:  325
current chunk:  326
current chunk:  327
current chunk:  328
current chunk:  329
current chunk:  330
current chunk:  331
current chunk:  332
current chunk:  333
current chunk:  334
current chunk:  335
current chunk:  336
current 

current chunk:  587
current chunk:  588
current chunk:  589
current chunk:  590
current chunk:  591
current chunk:  592
current chunk:  593
current chunk:  594
current chunk:  595
current chunk:  596
current chunk:  597
current chunk:  598
current chunk:  599
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
current chunk:  600
current chunk:  601
current chunk:  602
current chunk:  603
current chunk:  604
current chunk:  605
current chunk:  606
current chunk:  607
current chunk:  608
current chunk:  609
current chunk:  610
current chunk:  611
current chunk:  612
current chunk:  613
current chunk:  614
current chunk:  615
current chunk:  616
current chunk:  617
current chunk:  618
current chunk:  619
current chunk:  620
current chunk:  621
current chunk:  622
current chunk:  623
current 

current chunk:  872
current chunk:  873
current chunk:  874
current chunk:  875
current chunk:  876
current chunk:  877
current chunk:  878
current chunk:  879
current chunk:  880
current chunk:  881
current chunk:  882
current chunk:  883
current chunk:  884
current chunk:  885
current chunk:  886
current chunk:  887
current chunk:  888
current chunk:  889
current chunk:  890
current chunk:  891
current chunk:  892
current chunk:  893
current chunk:  894
current chunk:  895
current chunk:  896
current chunk:  897
current chunk:  898
current chunk:  899
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
current chunk:  900
current chunk:  901
current chunk:  902
current chunk:  903
current chunk:  904
current chunk:  905
current chunk:  906
current chunk:  907
current chunk:  908
current 

current chunk:  1152
current chunk:  1153
current chunk:  1154
current chunk:  1155
current chunk:  1156
current chunk:  1157
current chunk:  1158
current chunk:  1159
current chunk:  1160
current chunk:  1161
current chunk:  1162
current chunk:  1163
current chunk:  1164
current chunk:  1165
current chunk:  1166
current chunk:  1167
current chunk:  1168
current chunk:  1169
current chunk:  1170
current chunk:  1171
current chunk:  1172
current chunk:  1173
current chunk:  1174
current chunk:  1175
current chunk:  1176
current chunk:  1177
current chunk:  1178
current chunk:  1179
current chunk:  1180
current chunk:  1181
current chunk:  1182
current chunk:  1183
current chunk:  1184
current chunk:  1185
current chunk:  1186
current chunk:  1187
current chunk:  1188
current chunk:  1189
current chunk:  1190
current chunk:  1191
current chunk:  1192
current chunk:  1193
current chunk:  1194
current chunk:  1195
current chunk:  1196
current chunk:  1197
current chunk:  1198
current chunk

current chunk:  1435
current chunk:  1436
current chunk:  1437
current chunk:  1438
current chunk:  1439
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
current chunk:  1440
current chunk:  1441
current chunk:  1442
current chunk:  1443
current chunk:  1444
current chunk:  1445
current chunk:  1446
current chunk:  1447
current chunk:  1448
current chunk:  1449
current chunk:  1450
current chunk:  1451
current chunk:  1452
current chunk:  1453
current chunk:  1454
current chunk:  1455
current chunk:  1456
current chunk:  1457
current chunk:  1458
current chunk:  1459
current chunk:  1460
current chunk:  1461
current chunk:  1462
current chunk:  1463
current chunk:  1464
current chunk:  1465
current chunk:  1466
current chunk:  1467
current chunk:  1468
current chunk:  1469
current chunk

current chunk:  1709
current chunk:  1710
current chunk:  1711
current chunk:  1712
current chunk:  1713
current chunk:  1714
current chunk:  1715
current chunk:  1716
current chunk:  1717
current chunk:  1718
current chunk:  1719
current chunk:  1720
current chunk:  1721
current chunk:  1722
current chunk:  1723
current chunk:  1724
current chunk:  1725
current chunk:  1726
current chunk:  1727
current chunk:  1728
current chunk:  1729
current chunk:  1730
current chunk:  1731
current chunk:  1732
current chunk:  1733
current chunk:  1734
current chunk:  1735
current chunk:  1736
current chunk:  1737
current chunk:  1738
current chunk:  1739
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
current chunk:  1740
current chunk:  1741
current chunk:  1742
current chunk:  1743
current chunk

current chunk:  1983
current chunk:  1984
current chunk:  1985
current chunk:  1986
current chunk:  1987
current chunk:  1988
current chunk:  1989
current chunk:  1990
current chunk:  1991
current chunk:  1992
current chunk:  1993
current chunk:  1994
current chunk:  1995
current chunk:  1996
current chunk:  1997
current chunk:  1998
current chunk:  1999
current chunk:  2000
current chunk:  2001
current chunk:  2002
current chunk:  2003
current chunk:  2004
current chunk:  2005
current chunk:  2006
current chunk:  2007
current chunk:  2008
current chunk:  2009
current chunk:  2010
current chunk:  2011
current chunk:  2012
current chunk:  2013
current chunk:  2014
current chunk:  2015
current chunk:  2016
current chunk:  2017
current chunk:  2018
current chunk:  2019
current chunk:  2020
current chunk:  2021
current chunk:  2022
current chunk:  2023
current chunk:  2024
current chunk:  2025
current chunk:  2026
current chunk:  2027
current chunk:  2028
current chunk:  2029
current chunk

current chunk:  2268
current chunk:  2269
current chunk:  2270
current chunk:  2271
current chunk:  2272
current chunk:  2273
current chunk:  2274
current chunk:  2275
current chunk:  2276
current chunk:  2277
current chunk:  2278
current chunk:  2279
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
current chunk:  2280
current chunk:  2281
current chunk:  2282
current chunk:  2283
current chunk:  2284
current chunk:  2285
current chunk:  2286
current chunk:  2287
current chunk:  2288
current chunk:  2289
current chunk:  2290
current chunk:  2291
current chunk:  2292
current chunk:  2293
current chunk:  2294
current chunk:  2295
current chunk:  2296
current chunk:  2297
current chunk:  2298
current chunk:  2299
current chunk:  2300
current chunk:  2301
current chunk:  2302
current chunk

current chunk:  2544
current chunk:  2545
current chunk:  2546
current chunk:  2547
current chunk:  2548
current chunk:  2549
current chunk:  2550
current chunk:  2551
current chunk:  2552
current chunk:  2553
current chunk:  2554
current chunk:  2555
current chunk:  2556
current chunk:  2557
current chunk:  2558
current chunk:  2559
current chunk:  2560
current chunk:  2561
current chunk:  2562
current chunk:  2563
current chunk:  2564
current chunk:  2565
current chunk:  2566
current chunk:  2567
current chunk:  2568
current chunk:  2569
current chunk:  2570
current chunk:  2571
current chunk:  2572
current chunk:  2573
current chunk:  2574
current chunk:  2575
current chunk:  2576
current chunk:  2577
current chunk:  2578
current chunk:  2579
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping 

exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
current chunk:  2820
current chunk:  2821
current chunk:  2822
current chunk:  2823
current chunk:  2824
current chunk:  2825
current chunk:  2826
current chunk:  2827
current chunk:  2828
current chunk:  2829
current chunk:  2830
current chunk:  2831
current chunk:  2832
current chunk:  2833
current chunk:  2834
current chunk:  2835
current chunk:  2836
current chunk:  2837
current chunk:  2838
current chunk:  2839
current chunk:  2840
current chunk:  2841
current chunk:  2842
current chunk:  2843
current chunk:  2844
current chunk:  2845
current chunk:  2846
current chunk:  2847
current chunk:  2848
current chunk:  2849
current chunk:  2850
current chunk:  2851
current chunk:  2852
current chunk:  2853
current chunk:  2854
current chunk:  2855
current chunk:  2856
current chunk:  2857
current chunk:  2858
current chunk:  2859
current chunk:  2860
current chunk

In [11]:
# The merge step for the QdTree partitions appears to hang, so we try to regenerate the merged data here.
# However, the code below does not work either: it also hangs at some point, and I have not found out why.

# pool = Pool(processes = 3)

# # totally 68 partitions for QdTree
# # pids = [[k * 8 + i for i in range(8)] for k in range(num_process)]
# # pids[-1] += [64, 65, 66, 67]

# pids = [i for i in range(68)] # 0 - 67

# batch = 0
# while batch < 3:

#     pids_each_process = [set(pids[batch*24+k*8: batch*24+(k+1)*8]) for k in range(3)]
#     # totally 94 epochs
#     paras = [[94, pids_each_process[k], qdtree_hdfs] for k in range(3)]
#     pool.map(merge_parquets, [para for para in paras])
#     batch += 1
    
# pool.close()
# pool.join()

In [12]:
# import pandas as pd
# import pyarrow as pa
# import pyarrow.parquet as pq
# import numpy as np

# pids = [i for i in range(68)] # 0 - 67
# batches = 94
# hdfs_path = qdtree_hdfs

# start_time = time.time()

# # using single process to handle data merge
# fs = pa.hdfs.connect()
# for pid in pids:
#     parquets = []
#     print('= = = process pid: ', pid, '= = =')
#     for batch in range(batches):
#         path = hdfs_path + str(batch) + '/partition_' + str(pid)+'.parquet'
#         print(batch)
#         try:
#             par = pq.read_table(path)
#             parquets.append(par)
#         except:
#             continue
#     merged_parquet = pa.concat_tables(parquets)
#     merge_path = hdfs_path + 'merged/partition_' + str(pid)+'.parquet'
#     fw = fs.open(merge_path, 'wb')
#     pq.write_table(merged_parquet, fw)
#     fw.close()
# print('exit merge process')

# end_time = time.time()
# print('time usage: ', end_time - start_time) # 2347s