In [1]:
import findspark
findspark.init() # this must be executed before the below import

In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark import SparkFiles

In [3]:
import pandas as pd
import time
import rtree
from rtree import index
import numpy as np
from numpy import genfromtxt
from multiprocessing import Pool
import threading

In [4]:
from DRProcess import *
from DDProcess import *

In [5]:
# Spark memory settings: 8g executor heap, 8g driver heap, and 8g of off-heap memory.
conf = (
    SparkConf()
    .set("spark.executor.memory", "8g")
    .set("spark.driver.memory", "8g")
    .set("spark.memory.offHeap.enabled", True)
    .set("spark.memory.offHeap.size", "8g")
)

# One SparkContext for the notebook; sqlContext is the module-level handle the
# dumping helpers below use to turn pandas frames into Spark DataFrames.
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

In [7]:
# Display the effective Spark settings as (key, value) pairs, to confirm that the
# memory options configured above were applied to the running context.
sc.getConf().getAll()

[('spark.memory.offHeap.size', '8g'),
 ('spark.executor.id', 'driver'),
 ('spark.driver.port', '33265'),
 ('spark.app.name', 'pyspark-shell'),
 ('spark.app.id', 'local-1597299371205'),
 ('spark.rdd.compress', 'True'),
 ('spark.driver.host', '10.88.88.103'),
 ('spark.driver.memory', '8g'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[*]'),
 ('spark.executor.memory', '8g'),
 ('spark.submit.pyFiles', ''),
 ('spark.submit.deployMode', 'client'),
 ('spark.memory.offHeap.enabled', 'True'),
 ('spark.ui.showConsoleProgress', 'true')]

In [7]:
class DumpThread(threading.Thread):
    """Worker thread that appends one slice of buffered partitions to parquet.

    `parameters` is a 6-item sequence:
    (start_index, end_index, pids, pid_data_dict, hdfs_path, column_names).
    The thread handles `pids[start_index:end_index]` and relies on the
    module-level `sqlContext` to build Spark DataFrames.
    """

    def __init__(self, thread_id, name, parameters):
        super().__init__()
        self.thread_id = thread_id
        self.name = name
        self.parameters = parameters

    def run(self):
        """Write every partition of this thread's slice, then empty its buffer."""
        print('start dumping thread: ', self.thread_id, self.name)
        (first, last, all_pids, buffers, base_path, columns) = self.parameters
        for partition_id in all_pids[first:last]:
            target = base_path + 'partition_' + str(partition_id)+'.parquet'
            frame = pd.DataFrame(buffers[partition_id], columns=columns)
            # append: a partition file may already hold rows from earlier dumps
            sqlContext.createDataFrame(frame).write.mode('append').parquet(target)
            # drop the rows that were just written, keep the key
            buffers[partition_id] = []
        print('exit dumping thread: ', self.thread_id, self.name)
        
def dump_dict_data_2_hdfs(pid_data_dicts, column_names, hdfs_path, num_threads = 8):
    """Merge per-worker partition buffers and append them to parquet files.

    Parameters
    ----------
    pid_data_dicts : list of dict
        One dict per worker, mapping partition id -> buffered rows. The dicts
        are modified in place: everything is merged into `pid_data_dicts[0]`,
        the other dicts are cleared, and each written partition's buffer is
        reset to an empty list.
    column_names : list
        Column names used when building the pandas DataFrame of each partition.
    hdfs_path : str
        Path prefix; partition `pid` goes to `<hdfs_path>partition_<pid>.parquet`.
    num_threads : int
        1 writes everything in the calling thread; larger values split the
        partition ids across that many `DumpThread` workers (at most).

    Raises
    ------
    ValueError
        If `num_threads` is smaller than 1.
    """
    if num_threads < 1:
        raise ValueError('num_threads must be >= 1')
    if not pid_data_dicts:
        return  # nothing was buffered at all

    # first merge all the dicts into the first one
    base_dict = pid_data_dicts[0]
    for other in pid_data_dicts[1:]:
        for key, val in other.items():
            if key in base_dict:
                base_dict[key] += val
            else:
                base_dict[key] = val
        other.clear()

    pids = list(base_dict.keys())

    if num_threads == 1:
        print('start dumping single thread (main)')
        for pid in pids:
            path = hdfs_path + 'partition_' + str(pid)+'.parquet'
            pdf = pd.DataFrame(base_dict[pid], columns=column_names)
            df = sqlContext.createDataFrame(pdf)
            df.write.mode('append').parquet(path)
            base_dict[pid] = []
        print('finish dumping single thread (main)')
        return

    if not pids:
        return  # no partitions to write, so do not spawn idle threads

    # apply multi-threading to save: ceil division gives at most num_threads
    # slices, and iterating over real start offsets never creates an empty one
    step = -(-len(pids) // num_threads)
    threads = []
    for i, start_index in enumerate(range(0, len(pids), step)):
        parameters = [start_index, start_index + step, pids, base_dict, hdfs_path, column_names]
        thread = DumpThread(i, 'dump_thread_'+str(i), parameters)
        thread.start()
        threads.append(thread)
    for t in threads:
        t.join()

# used for multi-process writing
def merge_dicts(pid_data_dicts, num_process):
    """Merge per-worker buffers and re-split them into non-overlapping dicts.

    All dicts in `pid_data_dicts` are merged into `pid_data_dicts[0]` (values
    of a shared key are combined with `+=`), and the other dicts are cleared.
    The merged entries are then distributed over `num_process` new dicts so
    that every partition id appears in exactly one of them.

    A partition id `key` is placed in bucket `key % num_process`. This keeps
    the bucket index in range for any integer id (the former `key // step`
    rule raised IndexError when the ids present in a batch were sparse), and
    it maps a given id to the same bucket on every call, independent of which
    other ids are present.

    Parameters
    ----------
    pid_data_dicts : list of dict
        Buffers keyed by integer partition id; modified in place as described.
    num_process : int
        Number of output dicts.

    Returns
    -------
    list of dict
        `num_process` dicts with pairwise-disjoint key sets.

    Raises
    ------
    ValueError
        If `num_process` is smaller than 1.
    """
    if num_process < 1:
        raise ValueError('num_process must be >= 1')

    non_overlap_dicts = [{} for _ in range(num_process)]
    if not pid_data_dicts:
        return non_overlap_dicts

    base_dict = pid_data_dicts[0]
    for other in pid_data_dicts[1:]:
        for key, val in other.items():
            if key in base_dict:
                base_dict[key] += val
            else:
                base_dict[key] = val
        other.clear()

    # re allocate to non-overlap dicts
    for key, val in base_dict.items():
        non_overlap_dicts[key % num_process][key] = val

    return non_overlap_dicts

In [8]:
def batch_data_parallel(table_path, partition_path, chunk_size, used_dims, hdfs_path, 
                        num_dims, dump_threshold = 1000000, num_process = 8):
    """Route a '|'-delimited table to partitions and persist it with a process pool.

    The table is read in chunks of `chunk_size` rows. Every `num_process`
    chunks are handed to `process_chunk` in parallel, each together with one
    of `num_process` buffers (dict: partition id -> rows). Once roughly
    `dump_threshold` rows have been processed, the buffers are merged into
    non-overlapping dicts (`merge_dicts`) and written by `dump_data` in
    parallel. After the input is exhausted, remaining rows are flushed and
    `merge_parquets` is run per worker over the partition ids it wrote.

    NOTE(review): `process_chunk`, `dump_data` and `merge_parquets` are not
    defined in this part of the notebook (presumably they come from the
    DRProcess / DDProcess star imports - confirm).

    Parameters
    ----------
    table_path : str
        Path of the input table file.
    partition_path : str
        Passed through to `process_chunk`.
    chunk_size : int
        Rows per chunk read from the table.
    used_dims : list
        Passed through to `process_chunk`.
    hdfs_path : str
        Passed through to `dump_data` and `merge_parquets`.
    num_dims : int
        Number of leading columns to read; they are named `_c0` .. `_c<n-1>`.
    dump_threshold : int
        Row count (estimated as full chunks) that triggers an intermediate dump.
    num_process : int
        Pool size and number of buffers.
    """
    begin_time = time.time()
    
    col_names = ['_c'+str(i) for i in range(num_dims)]
    cols = list(range(num_dims))
    
    pid_data_dicts = [{} for _ in range(num_process)]
    pids_each_process = [set() for _ in range(num_process)] # used for final merge
    
    chunks = []
    count = 0
    epochs = 0
    processed_data = 0
    
    pool = Pool(processes = num_process) # the pool should be reused, or incur memory leak!
    
    def dump_buffers(buffers, epoch):
        """Merge buffers into non-overlapping dicts, record their pids, dump them."""
        # parquet write is not thread safe, avoid concurrent write
        buffers = merge_dicts(buffers, num_process) # make it non-overlap
        for k in range(num_process):
            pids_each_process[k].update(buffers[k].keys())
        pool.map(dump_data, [[epoch, buffers[k], col_names, hdfs_path] for k in range(num_process)])
        #dump_dict_data_2_hdfs(buffers, col_names, hdfs_path) # multi-thread
        return buffers
    
    try:
        for chunk in pd.read_table(table_path, delimiter='|', usecols=cols, names=col_names, chunksize=chunk_size):
        #for chunk in pd.read_csv(table_path, usecols=cols, names=col_names, chunksize=chunk_size):
            print('current chunk: ', count)
            chunks.append(chunk)
            if count % num_process == num_process - 1:
                paras = [[chunks[k], used_dims, partition_path, pid_data_dicts[k]] for k in range(num_process)]
                pid_data_dicts = pool.map(process_chunk, paras)
                print('===================================================')
                chunks = []
                processed_data += chunk_size * num_process
                
                # dump data to file
                if processed_data >= dump_threshold:
                    pid_data_dicts = dump_buffers(pid_data_dicts, epochs)
                    processed_data = 0
                    epochs += 1
                    for buffered in pid_data_dicts:
                        buffered.clear()
            count += 1
            
        dict_size = [len(buffered) for buffered in pid_data_dicts]
        print('after exit, chunks size: ', len(chunks))
        print('after exit, each dict size: ', dict_size)
        # process the last batch (fewer than num_process chunks)
        if len(chunks) != 0:
            paras = [[chunks[k], used_dims, partition_path, pid_data_dicts[k]] for k in range(len(chunks))]
            pid_data_dicts[0:len(chunks)] = pool.map(process_chunk, paras)
        
        dict_size = [len(buffered) for buffered in pid_data_dicts]
        print('after last chunk, each dict size: ', dict_size)
        
        # flush if ANY buffer still holds rows; checking only the first buffer
        # would drop rows held exclusively by the others
        if any(pid_data_dicts):
            pid_data_dicts = dump_buffers(pid_data_dicts, epochs)
        
        pid_data_dicts.clear() # release memory
        
        # final merge
        epochs += 1
        paras = [[epochs, pids_each_process[k], hdfs_path] for k in range(num_process)]
        pool.map(merge_parquets, paras)
        
        pool.close()
    except BaseException:
        # do not leave worker processes behind when any step fails
        pool.terminate()
        raise
    finally:
        pool.join()
    
    finish_time = time.time()
    print('total data routing and persisting time: ', finish_time - begin_time)

In [7]:
# = = = Configuration = = =
# TPC-H scale factor; every input and output path below is derived from it.
scale_factor = 100

# Input table (lineitem, '|'-delimited .tbl file).
table_base_path = '/home/ubuntu/TPCH/dbgen/'
table_path = table_base_path + 'lineitem_' + str(scale_factor) + '.tbl'

num_process = 8
chunk_size = 100000 
dump_threshold = 6000000 # 6M rows, about 1GB

num_dims = 16
used_dims = [1,2]

# Output locations on HDFS, one directory per layout and scale factor.
# They follow scale_factor (previously hard-coded to 100), so changing the
# scale factor can no longer write into another scale's directory.
hdfs_base_path = 'hdfs://10.88.88.103:9000/user/cloudray/'
nora_hdfs = hdfs_base_path + 'NORA/scale' + str(scale_factor) + '/'
qdtree_hdfs = hdfs_base_path + 'QdTree/scale' + str(scale_factor) + '/'

# Partition layout files, passed to batch_data_parallel as partition_path.
partition_base_path = '/home/ubuntu/PartitionLayout/'
nora_partition = partition_base_path + 'nora_partitions_' + str(scale_factor)
qdtree_partition = partition_base_path + 'qdtree_partitions_' + str(scale_factor)

# Legacy
# table_path = '/home/cloudray/Downloads/TPCH_12M_8Field.csv'
# table_path = '/home/cloudray/TPCH/2.18.0_rc2/dbgen/lineitem.tbl'

# partition_path = '/home/cloudray/NORA_Partitions/nora_partitions'
# partition_path = '/home/cloudray/NORA_Partitions/qd_tree_partitions'

# hdfs_path = 'hdfs://localhost:9000/user/cloudray/NORA/'
# hdfs_path = 'hdfs://localhost:9000/user/cloudray/QdTree/'

# partition_path = '/home/cloudray/NORA_Partitions/nora_test'
# partition_path = '/home/cloudray/NORA_Partitions/qd_tree_test'

# hdfs_path = 'hdfs://localhost:9000/user/cloudray/NORA_Test/'
# hdfs_path = 'hdfs://localhost:9000/user/cloudray/QdTree_Test/'

In [10]:
# = = = Execution = = =
# Route and persist the same table twice: once per partition layout.
batch_data_parallel(
    table_path=table_path,
    partition_path=nora_partition,
    chunk_size=chunk_size,
    used_dims=used_dims,
    hdfs_path=nora_hdfs,
    num_dims=num_dims,
    dump_threshold=dump_threshold,
    num_process=num_process,
)
print('finish nora data routing..')

batch_data_parallel(
    table_path=table_path,
    partition_path=qdtree_partition,
    chunk_size=chunk_size,
    used_dims=used_dims,
    hdfs_path=qdtree_hdfs,
    num_dims=num_dims,
    dump_threshold=dump_threshold,
    num_process=num_process,
)
print('finish qdtree data routing..')

current chunk:  0
current chunk:  1
current chunk:  2
current chunk:  3
current chunk:  4
current chunk:  5
current chunk:  6
current chunk:  7
current chunk:  8
current chunk:  9
current chunk:  10
current chunk:  11
current chunk:  12
current chunk:  13
current chunk:  14
current chunk:  15
current chunk:  16
current chunk:  17
current chunk:  18
current chunk:  19
current chunk:  20
current chunk:  21
current chunk:  22
current chunk:  23
current chunk:  24
current chunk:  25
current chunk:  26
current chunk:  27
current chunk:  28
current chunk:  29
current chunk:  30
current chunk:  31
current chunk:  32
current chunk:  33
current chunk:  34
current chunk:  35
current chunk:  36
current chunk:  37
current chunk:  38
current chunk:  39
current chunk:  40
current chunk:  41
current chunk:  42
current chunk:  43
current chunk:  44
current chunk:  45
current chunk:  46
current chunk:  47
current chunk:  48
current chunk:  49
current chunk:  50
current chunk:  51
current chunk:  52
cur

current chunk:  288
current chunk:  289
current chunk:  290
current chunk:  291
current chunk:  292
current chunk:  293
current chunk:  294
current chunk:  295
current chunk:  296
current chunk:  297
current chunk:  298
current chunk:  299
current chunk:  300
current chunk:  301
current chunk:  302
current chunk:  303
current chunk:  304
current chunk:  305
current chunk:  306
current chunk:  307
current chunk:  308
current chunk:  309
current chunk:  310
current chunk:  311
current chunk:  312
current chunk:  313
current chunk:  314
current chunk:  315
current chunk:  316
current chunk:  317
current chunk:  318
current chunk:  319
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
current chunk:  320
current chunk:  321
current chunk:  322
current chunk:  323
current chunk:  324
current chunk:  325
current chunk:  326
current chunk:  327
current chunk:  328
current chun

current chunk:  574
current chunk:  575
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
current chunk:  576
current chunk:  577
current chunk:  578
current chunk:  579
current chunk:  580
current chunk:  581
current chunk:  582
current chunk:  583
current chunk:  584
current chunk:  585
current chunk:  586
current chunk:  587
current chunk:  588
current chunk:  589
current chunk:  590
current chunk:  591
current chunk:  592
current chunk:  593
current chunk:  594
current chunk:  595
current chunk:  596
current chunk:  597
current chunk:  598
current chunk:  599
current chunk:  600
current chunk:  601
current chunk:  602
current chunk:  603
current chunk:  604
current chunk:  605
current chunk:  606
current chunk:  607
current chunk:  608
current chunk:  609
current chunk:  610
current chunk:  611
current chunk:  612
current chunk:  613
current chunk:  614
current chun

current chunk:  851
current chunk:  852
current chunk:  853
current chunk:  854
current chunk:  855
current chunk:  856
current chunk:  857
current chunk:  858
current chunk:  859
current chunk:  860
current chunk:  861
current chunk:  862
current chunk:  863
current chunk:  864
current chunk:  865
current chunk:  866
current chunk:  867
current chunk:  868
current chunk:  869
current chunk:  870
current chunk:  871
current chunk:  872
current chunk:  873
current chunk:  874
current chunk:  875
current chunk:  876
current chunk:  877
current chunk:  878
current chunk:  879
current chunk:  880
current chunk:  881
current chunk:  882
current chunk:  883
current chunk:  884
current chunk:  885
current chunk:  886
current chunk:  887
current chunk:  888
current chunk:  889
current chunk:  890
current chunk:  891
current chunk:  892
current chunk:  893
current chunk:  894
current chunk:  895
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping pro

current chunk:  1130
current chunk:  1131
current chunk:  1132
current chunk:  1133
current chunk:  1134
current chunk:  1135
current chunk:  1136
current chunk:  1137
current chunk:  1138
current chunk:  1139
current chunk:  1140
current chunk:  1141
current chunk:  1142
current chunk:  1143
current chunk:  1144
current chunk:  1145
current chunk:  1146
current chunk:  1147
current chunk:  1148
current chunk:  1149
current chunk:  1150
current chunk:  1151
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
current chunk:  1152
current chunk:  1153
current chunk:  1154
current chunk:  1155
current chunk:  1156
current chunk:  1157
current chunk:  1158
current chunk:  1159
current chunk:  1160
current chunk:  1161
current chunk:  1162
current chunk:  1163
current chunk:  1164
current chunk:  1165
current chunk:  1166
current chunk:  1167
current chunk:  1168
current chunk

current chunk:  1404
current chunk:  1405
current chunk:  1406
current chunk:  1407
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
current chunk:  1408
current chunk:  1409
current chunk:  1410
current chunk:  1411
current chunk:  1412
current chunk:  1413
current chunk:  1414
current chunk:  1415
current chunk:  1416
current chunk:  1417
current chunk:  1418
current chunk:  1419
current chunk:  1420
current chunk:  1421
current chunk:  1422
current chunk:  1423
current chunk:  1424
current chunk:  1425
current chunk:  1426
current chunk:  1427
current chunk:  1428
current chunk:  1429
current chunk:  1430
current chunk:  1431
current chunk:  1432
current chunk:  1433
current chunk:  1434
current chunk:  1435
current chunk:  1436
current chunk:  1437
current chunk:  1438
current chunk:  1439
current chunk:  1440
current chunk:  1441
current chunk:  1442
current chunk

current chunk:  1672
current chunk:  1673
current chunk:  1674
current chunk:  1675
current chunk:  1676
current chunk:  1677
current chunk:  1678
current chunk:  1679
current chunk:  1680
current chunk:  1681
current chunk:  1682
current chunk:  1683
current chunk:  1684
current chunk:  1685
current chunk:  1686
current chunk:  1687
current chunk:  1688
current chunk:  1689
current chunk:  1690
current chunk:  1691
current chunk:  1692
current chunk:  1693
current chunk:  1694
current chunk:  1695
current chunk:  1696
current chunk:  1697
current chunk:  1698
current chunk:  1699
current chunk:  1700
current chunk:  1701
current chunk:  1702
current chunk:  1703
current chunk:  1704
current chunk:  1705
current chunk:  1706
current chunk:  1707
current chunk:  1708
current chunk:  1709
current chunk:  1710
current chunk:  1711
current chunk:  1712
current chunk:  1713
current chunk:  1714
current chunk:  1715
current chunk:  1716
current chunk:  1717
current chunk:  1718
current chunk

current chunk:  1946
current chunk:  1947
current chunk:  1948
current chunk:  1949
current chunk:  1950
current chunk:  1951
current chunk:  1952
current chunk:  1953
current chunk:  1954
current chunk:  1955
current chunk:  1956
current chunk:  1957
current chunk:  1958
current chunk:  1959
current chunk:  1960
current chunk:  1961
current chunk:  1962
current chunk:  1963
current chunk:  1964
current chunk:  1965
current chunk:  1966
current chunk:  1967
current chunk:  1968
current chunk:  1969
current chunk:  1970
current chunk:  1971
current chunk:  1972
current chunk:  1973
current chunk:  1974
current chunk:  1975
current chunk:  1976
current chunk:  1977
current chunk:  1978
current chunk:  1979
current chunk:  1980
current chunk:  1981
current chunk:  1982
current chunk:  1983
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
current chunk:  1984
current chunk

current chunk:  2220
current chunk:  2221
current chunk:  2222
current chunk:  2223
current chunk:  2224
current chunk:  2225
current chunk:  2226
current chunk:  2227
current chunk:  2228
current chunk:  2229
current chunk:  2230
current chunk:  2231
current chunk:  2232
current chunk:  2233
current chunk:  2234
current chunk:  2235
current chunk:  2236
current chunk:  2237
current chunk:  2238
current chunk:  2239
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
current chunk:  2240
current chunk:  2241
current chunk:  2242
current chunk:  2243
current chunk:  2244
current chunk:  2245
current chunk:  2246
current chunk:  2247
current chunk:  2248
current chunk:  2249
current chunk:  2250
current chunk:  2251
current chunk:  2252
current chunk:  2253
current chunk:  2254
current chunk:  2255
current chunk:  2256
current chunk:  2257
current chunk:  2258
current chunk

current chunk:  2494
current chunk:  2495
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
current chunk:  2496
current chunk:  2497
current chunk:  2498
current chunk:  2499
current chunk:  2500
current chunk:  2501
current chunk:  2502
current chunk:  2503
current chunk:  2504
current chunk:  2505
current chunk:  2506
current chunk:  2507
current chunk:  2508
current chunk:  2509
current chunk:  2510
current chunk:  2511
current chunk:  2512
current chunk:  2513
current chunk:  2514
current chunk:  2515
current chunk:  2516
current chunk:  2517
current chunk:  2518
current chunk:  2519
current chunk:  2520
current chunk:  2521
current chunk:  2522
current chunk:  2523
current chunk:  2524
current chunk:  2525
current chunk:  2526
current chunk:  2527
current chunk:  2528
current chunk:  2529
current chunk:  2530
current chunk:  2531
current chunk:  2532
current chunk

current chunk:  2760
current chunk:  2761
current chunk:  2762
current chunk:  2763
current chunk:  2764
current chunk:  2765
current chunk:  2766
current chunk:  2767
current chunk:  2768
current chunk:  2769
current chunk:  2770
current chunk:  2771
current chunk:  2772
current chunk:  2773
current chunk:  2774
current chunk:  2775
current chunk:  2776
current chunk:  2777
current chunk:  2778
current chunk:  2779
current chunk:  2780
current chunk:  2781
current chunk:  2782
current chunk:  2783
current chunk:  2784
current chunk:  2785
current chunk:  2786
current chunk:  2787
current chunk:  2788
current chunk:  2789
current chunk:  2790
current chunk:  2791
current chunk:  2792
current chunk:  2793
current chunk:  2794
current chunk:  2795
current chunk:  2796
current chunk:  2797
current chunk:  2798
current chunk:  2799
current chunk:  2800
current chunk:  2801
current chunk:  2802
current chunk:  2803
current chunk:  2804
current chunk:  2805
current chunk:  2806
current chunk

current chunk:  3034
current chunk:  3035
current chunk:  3036
current chunk:  3037
current chunk:  3038
current chunk:  3039
current chunk:  3040
current chunk:  3041
current chunk:  3042
current chunk:  3043
current chunk:  3044
current chunk:  3045
current chunk:  3046
current chunk:  3047
current chunk:  3048
current chunk:  3049
current chunk:  3050
current chunk:  3051
current chunk:  3052
current chunk:  3053
current chunk:  3054
current chunk:  3055
current chunk:  3056
current chunk:  3057
current chunk:  3058
current chunk:  3059
current chunk:  3060
current chunk:  3061
current chunk:  3062
current chunk:  3063
current chunk:  3064
current chunk:  3065
current chunk:  3066
current chunk:  3067
current chunk:  3068
current chunk:  3069
current chunk:  3070
current chunk:  3071
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
current chunk:  3072
current chunk

current chunk:  3308
current chunk:  3309
current chunk:  3310
current chunk:  3311
current chunk:  3312
current chunk:  3313
current chunk:  3314
current chunk:  3315
current chunk:  3316
current chunk:  3317
current chunk:  3318
current chunk:  3319
current chunk:  3320
current chunk:  3321
current chunk:  3322
current chunk:  3323
current chunk:  3324
current chunk:  3325
current chunk:  3326
current chunk:  3327
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
current chunk:  3328
current chunk:  3329
current chunk:  3330
current chunk:  3331
current chunk:  3332
current chunk:  3333
current chunk:  3334
current chunk:  3335
current chunk:  3336
current chunk:  3337
current chunk:  3338
current chunk:  3339
current chunk:  3340
current chunk:  3341
current chunk:  3342
current chunk:  3343
current chunk:  3344
current chunk:  3345
current chunk:  3346
current chunk

current chunk:  3582
current chunk:  3583
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
current chunk:  3584
current chunk:  3585
current chunk:  3586
current chunk:  3587
current chunk:  3588
current chunk:  3589
current chunk:  3590
current chunk:  3591
current chunk:  3592
current chunk:  3593
current chunk:  3594
current chunk:  3595
current chunk:  3596
current chunk:  3597
current chunk:  3598
current chunk:  3599
current chunk:  3600
current chunk:  3601
current chunk:  3602
current chunk:  3603
current chunk:  3604
current chunk:  3605
current chunk:  3606
current chunk:  3607
current chunk:  3608
current chunk:  3609
current chunk:  3610
current chunk:  3611
current chunk:  3612
current chunk:  3613
current chunk:  3614
current chunk:  3615
current chunk:  3616
current chunk:  3617
current chunk:  3618
current chunk:  3619
current chunk:  3620
current chunk

current chunk:  3848
current chunk:  3849
current chunk:  3850
current chunk:  3851
current chunk:  3852
current chunk:  3853
current chunk:  3854
current chunk:  3855
current chunk:  3856
current chunk:  3857
current chunk:  3858
current chunk:  3859
current chunk:  3860
current chunk:  3861
current chunk:  3862
current chunk:  3863
current chunk:  3864
current chunk:  3865
current chunk:  3866
current chunk:  3867
current chunk:  3868
current chunk:  3869
current chunk:  3870
current chunk:  3871
current chunk:  3872
current chunk:  3873
current chunk:  3874
current chunk:  3875
current chunk:  3876
current chunk:  3877
current chunk:  3878
current chunk:  3879
current chunk:  3880
current chunk:  3881
current chunk:  3882
current chunk:  3883
current chunk:  3884
current chunk:  3885
current chunk:  3886
current chunk:  3887
current chunk:  3888
current chunk:  3889
current chunk:  3890
current chunk:  3891
current chunk:  3892
current chunk:  3893
current chunk:  3894
current chunk

current chunk:  4122
current chunk:  4123
current chunk:  4124
current chunk:  4125
current chunk:  4126
current chunk:  4127
current chunk:  4128
current chunk:  4129
current chunk:  4130
current chunk:  4131
current chunk:  4132
current chunk:  4133
current chunk:  4134
current chunk:  4135
current chunk:  4136
current chunk:  4137
current chunk:  4138
current chunk:  4139
current chunk:  4140
current chunk:  4141
current chunk:  4142
current chunk:  4143
current chunk:  4144
current chunk:  4145
current chunk:  4146
current chunk:  4147
current chunk:  4148
current chunk:  4149
current chunk:  4150
current chunk:  4151
current chunk:  4152
current chunk:  4153
current chunk:  4154
current chunk:  4155
current chunk:  4156
current chunk:  4157
current chunk:  4158
current chunk:  4159
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
current chunk:  4160
current chunk

current chunk:  4396
current chunk:  4397
current chunk:  4398
current chunk:  4399
current chunk:  4400
current chunk:  4401
current chunk:  4402
current chunk:  4403
current chunk:  4404
current chunk:  4405
current chunk:  4406
current chunk:  4407
current chunk:  4408
current chunk:  4409
current chunk:  4410
current chunk:  4411
current chunk:  4412
current chunk:  4413
current chunk:  4414
current chunk:  4415
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
current chunk:  4416
current chunk:  4417
current chunk:  4418
current chunk:  4419
current chunk:  4420
current chunk:  4421
current chunk:  4422
current chunk:  4423
current chunk:  4424
current chunk:  4425
current chunk:  4426
current chunk:  4427
current chunk:  4428
current chunk:  4429
current chunk:  4430
current chunk:  4431
current chunk:  4432
current chunk:  4433
current chunk:  4434
current chunk

current chunk:  4670
current chunk:  4671
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
current chunk:  4672
current chunk:  4673
current chunk:  4674
current chunk:  4675
current chunk:  4676
current chunk:  4677
current chunk:  4678
current chunk:  4679
current chunk:  4680
current chunk:  4681
current chunk:  4682
current chunk:  4683
current chunk:  4684
current chunk:  4685
current chunk:  4686
current chunk:  4687
current chunk:  4688
current chunk:  4689
current chunk:  4690
current chunk:  4691
current chunk:  4692
current chunk:  4693
current chunk:  4694
current chunk:  4695
current chunk:  4696
current chunk:  4697
current chunk:  4698
current chunk:  4699
current chunk:  4700
current chunk:  4701
current chunk:  4702
current chunk:  4703
current chunk:  4704
current chunk:  4705
current chunk:  4706
current chunk:  4707
current chunk:  4708
current chunk

current chunk:  4936
current chunk:  4937
current chunk:  4938
current chunk:  4939
current chunk:  4940
current chunk:  4941
current chunk:  4942
current chunk:  4943
current chunk:  4944
current chunk:  4945
current chunk:  4946
current chunk:  4947
current chunk:  4948
current chunk:  4949
current chunk:  4950
current chunk:  4951
current chunk:  4952
current chunk:  4953
current chunk:  4954
current chunk:  4955
current chunk:  4956
current chunk:  4957
current chunk:  4958
current chunk:  4959
current chunk:  4960
current chunk:  4961
current chunk:  4962
current chunk:  4963
current chunk:  4964
current chunk:  4965
current chunk:  4966
current chunk:  4967
current chunk:  4968
current chunk:  4969
current chunk:  4970
current chunk:  4971
current chunk:  4972
current chunk:  4973
current chunk:  4974
current chunk:  4975
current chunk:  4976
current chunk:  4977
current chunk:  4978
current chunk:  4979
current chunk:  4980
current chunk:  4981
current chunk:  4982
current chunk

current chunk:  5210
current chunk:  5211
current chunk:  5212
current chunk:  5213
current chunk:  5214
current chunk:  5215
current chunk:  5216
current chunk:  5217
current chunk:  5218
current chunk:  5219
current chunk:  5220
current chunk:  5221
current chunk:  5222
current chunk:  5223
current chunk:  5224
current chunk:  5225
current chunk:  5226
current chunk:  5227
current chunk:  5228
current chunk:  5229
current chunk:  5230
current chunk:  5231
current chunk:  5232
current chunk:  5233
current chunk:  5234
current chunk:  5235
current chunk:  5236
current chunk:  5237
current chunk:  5238
current chunk:  5239
current chunk:  5240
current chunk:  5241
current chunk:  5242
current chunk:  5243
current chunk:  5244
current chunk:  5245
current chunk:  5246
current chunk:  5247
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
current chunk:  5248
current chunk

current chunk:  5484
current chunk:  5485
current chunk:  5486
current chunk:  5487
current chunk:  5488
current chunk:  5489
current chunk:  5490
current chunk:  5491
current chunk:  5492
current chunk:  5493
current chunk:  5494
current chunk:  5495
current chunk:  5496
current chunk:  5497
current chunk:  5498
current chunk:  5499
current chunk:  5500
current chunk:  5501
current chunk:  5502
current chunk:  5503
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
current chunk:  5504
current chunk:  5505
current chunk:  5506
current chunk:  5507
current chunk:  5508
current chunk:  5509
current chunk:  5510
current chunk:  5511
current chunk:  5512
current chunk:  5513
current chunk:  5514
current chunk:  5515
current chunk:  5516
current chunk:  5517
current chunk:  5518
current chunk:  5519
current chunk:  5520
current chunk:  5521
current chunk:  5522
current chunk

current chunk:  5758
current chunk:  5759
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
current chunk:  5760
current chunk:  5761
current chunk:  5762
current chunk:  5763
current chunk:  5764
current chunk:  5765
current chunk:  5766
current chunk:  5767
current chunk:  5768
current chunk:  5769
current chunk:  5770
current chunk:  5771
current chunk:  5772
current chunk:  5773
current chunk:  5774
current chunk:  5775
current chunk:  5776
current chunk:  5777
current chunk:  5778
current chunk:  5779
current chunk:  5780
current chunk:  5781
current chunk:  5782
current chunk:  5783
current chunk:  5784
current chunk:  5785
current chunk:  5786
current chunk:  5787
current chunk:  5788
current chunk:  5789
current chunk:  5790
current chunk:  5791
current chunk:  5792
current chunk:  5793
current chunk:  5794
current chunk:  5795
current chunk:  5796
current chunk

current chunk:  10
current chunk:  11
current chunk:  12
current chunk:  13
current chunk:  14
current chunk:  15
current chunk:  16
current chunk:  17
current chunk:  18
current chunk:  19
current chunk:  20
current chunk:  21
current chunk:  22
current chunk:  23
current chunk:  24
current chunk:  25
current chunk:  26
current chunk:  27
current chunk:  28
current chunk:  29
current chunk:  30
current chunk:  31
current chunk:  32
current chunk:  33
current chunk:  34
current chunk:  35
current chunk:  36
current chunk:  37
current chunk:  38
current chunk:  39
current chunk:  40
current chunk:  41
current chunk:  42
current chunk:  43
current chunk:  44
current chunk:  45
current chunk:  46
current chunk:  47
current chunk:  48
current chunk:  49
current chunk:  50
current chunk:  51
current chunk:  52
current chunk:  53
current chunk:  54
current chunk:  55
current chunk:  56
current chunk:  57
current chunk:  58
current chunk:  59
current chunk:  60
current chunk:  61
current chun

current chunk:  297
current chunk:  298
current chunk:  299
current chunk:  300
current chunk:  301
current chunk:  302
current chunk:  303
current chunk:  304
current chunk:  305
current chunk:  306
current chunk:  307
current chunk:  308
current chunk:  309
current chunk:  310
current chunk:  311
current chunk:  312
current chunk:  313
current chunk:  314
current chunk:  315
current chunk:  316
current chunk:  317
current chunk:  318
current chunk:  319
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
current chunk:  320
current chunk:  321
current chunk:  322
current chunk:  323
current chunk:  324
current chunk:  325
current chunk:  326
current chunk:  327
current chunk:  328
current chunk:  329
current chunk:  330
current chunk:  331
current chunk:  332
current chunk:  333
current chunk:  334
current chunk:  335
current chunk:  336
current chunk:  337
current chun

current chunk:  858
current chunk:  859
current chunk:  860
current chunk:  861
current chunk:  862
current chunk:  863
current chunk:  864
current chunk:  865
current chunk:  866
current chunk:  867
current chunk:  868
current chunk:  869
current chunk:  870
current chunk:  871
current chunk:  872
current chunk:  873
current chunk:  874
current chunk:  875
current chunk:  876
current chunk:  877
current chunk:  878
current chunk:  879
current chunk:  880
current chunk:  881
current chunk:  882
current chunk:  883
current chunk:  884
current chunk:  885
current chunk:  886
current chunk:  887
current chunk:  888
current chunk:  889
current chunk:  890
current chunk:  891
current chunk:  892
current chunk:  893
current chunk:  894
current chunk:  895
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
current chunk:  896
current chunk:  897
current chunk:  898
current chun

current chunk:  1137
current chunk:  1138
current chunk:  1139
current chunk:  1140
current chunk:  1141
current chunk:  1142
current chunk:  1143
current chunk:  1144
current chunk:  1145
current chunk:  1146
current chunk:  1147
current chunk:  1148
current chunk:  1149
current chunk:  1150
current chunk:  1151
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
current chunk:  1152
current chunk:  1153
current chunk:  1154
current chunk:  1155
current chunk:  1156
current chunk:  1157
current chunk:  1158
current chunk:  1159
current chunk:  1160
current chunk:  1161
current chunk:  1162
current chunk:  1163
current chunk:  1164
current chunk:  1165
current chunk:  1166
current chunk:  1167
current chunk:  1168
current chunk:  1169
current chunk:  1170
current chunk:  1171
current chunk:  1172
current chunk:  1173
current chunk:  1174
current chunk:  1175
current chunk

current chunk:  1679
current chunk:  1680
current chunk:  1681
current chunk:  1682
current chunk:  1683
current chunk:  1684
current chunk:  1685
current chunk:  1686
current chunk:  1687
current chunk:  1688
current chunk:  1689
current chunk:  1690
current chunk:  1691
current chunk:  1692
current chunk:  1693
current chunk:  1694
current chunk:  1695
current chunk:  1696
current chunk:  1697
current chunk:  1698
current chunk:  1699
current chunk:  1700
current chunk:  1701
current chunk:  1702
current chunk:  1703
current chunk:  1704
current chunk:  1705
current chunk:  1706
current chunk:  1707
current chunk:  1708
current chunk:  1709
current chunk:  1710
current chunk:  1711
current chunk:  1712
current chunk:  1713
current chunk:  1714
current chunk:  1715
current chunk:  1716
current chunk:  1717
current chunk:  1718
current chunk:  1719
current chunk:  1720
current chunk:  1721
current chunk:  1722
current chunk:  1723
current chunk:  1724
current chunk:  1725
current chunk

current chunk:  1952
current chunk:  1953
current chunk:  1954
current chunk:  1955
current chunk:  1956
current chunk:  1957
current chunk:  1958
current chunk:  1959
current chunk:  1960
current chunk:  1961
current chunk:  1962
current chunk:  1963
current chunk:  1964
current chunk:  1965
current chunk:  1966
current chunk:  1967
current chunk:  1968
current chunk:  1969
current chunk:  1970
current chunk:  1971
current chunk:  1972
current chunk:  1973
current chunk:  1974
current chunk:  1975
current chunk:  1976
current chunk:  1977
current chunk:  1978
current chunk:  1979
current chunk:  1980
current chunk:  1981
current chunk:  1982
current chunk:  1983
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
current chunk:  1984
current chunk:  1985
current chunk:  1986
current chunk:  1987
current chunk:  1988
current chunk:  1989
current chunk:  1990
current chunk

current chunk:  2226
current chunk:  2227
current chunk:  2228
current chunk:  2229
current chunk:  2230
current chunk:  2231
current chunk:  2232
current chunk:  2233
current chunk:  2234
current chunk:  2235
current chunk:  2236
current chunk:  2237
current chunk:  2238
current chunk:  2239
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
current chunk:  2240
current chunk:  2241
current chunk:  2242
current chunk:  2243
current chunk:  2244
current chunk:  2245
current chunk:  2246
current chunk:  2247
current chunk:  2248
current chunk:  2249
current chunk:  2250
current chunk:  2251
current chunk:  2252
current chunk:  2253
current chunk:  2254
current chunk:  2255
current chunk:  2256
current chunk:  2257
current chunk:  2258
current chunk:  2259
current chunk:  2260
current chunk:  2261
current chunk:  2262
current chunk:  2263
current chunk:  2264
current chunk

current chunk:  2768
current chunk:  2769
current chunk:  2770
current chunk:  2771
current chunk:  2772
current chunk:  2773
current chunk:  2774
current chunk:  2775
current chunk:  2776
current chunk:  2777
current chunk:  2778
current chunk:  2779
current chunk:  2780
current chunk:  2781
current chunk:  2782
current chunk:  2783
current chunk:  2784
current chunk:  2785
current chunk:  2786
current chunk:  2787
current chunk:  2788
current chunk:  2789
current chunk:  2790
current chunk:  2791
current chunk:  2792
current chunk:  2793
current chunk:  2794
current chunk:  2795
current chunk:  2796
current chunk:  2797
current chunk:  2798
current chunk:  2799
current chunk:  2800
current chunk:  2801
current chunk:  2802
current chunk:  2803
current chunk:  2804
current chunk:  2805
current chunk:  2806
current chunk:  2807
current chunk:  2808
current chunk:  2809
current chunk:  2810
current chunk:  2811
current chunk:  2812
current chunk:  2813
current chunk:  2814
current chunk

current chunk:  3040
current chunk:  3041
current chunk:  3042
current chunk:  3043
current chunk:  3044
current chunk:  3045
current chunk:  3046
current chunk:  3047
current chunk:  3048
current chunk:  3049
current chunk:  3050
current chunk:  3051
current chunk:  3052
current chunk:  3053
current chunk:  3054
current chunk:  3055
current chunk:  3056
current chunk:  3057
current chunk:  3058
current chunk:  3059
current chunk:  3060
current chunk:  3061
current chunk:  3062
current chunk:  3063
current chunk:  3064
current chunk:  3065
current chunk:  3066
current chunk:  3067
current chunk:  3068
current chunk:  3069
current chunk:  3070
current chunk:  3071
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
current chunk:  3072
current chunk:  3073
current chunk:  3074
current chunk:  3075
current chunk:  3076
current chunk:  3077
current chunk:  3078
current chunk

current chunk:  3314
current chunk:  3315
current chunk:  3316
current chunk:  3317
current chunk:  3318
current chunk:  3319
current chunk:  3320
current chunk:  3321
current chunk:  3322
current chunk:  3323
current chunk:  3324
current chunk:  3325
current chunk:  3326
current chunk:  3327
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
current chunk:  3328
current chunk:  3329
current chunk:  3330
current chunk:  3331
current chunk:  3332
current chunk:  3333
current chunk:  3334
current chunk:  3335
current chunk:  3336
current chunk:  3337
current chunk:  3338
current chunk:  3339
current chunk:  3340
current chunk:  3341
current chunk:  3342
current chunk:  3343
current chunk:  3344
current chunk:  3345
current chunk:  3346
current chunk:  3347
current chunk:  3348
current chunk:  3349
current chunk:  3350
current chunk:  3351
current chunk:  3352
current chunk

current chunk:  3856
current chunk:  3857
current chunk:  3858
current chunk:  3859
current chunk:  3860
current chunk:  3861
current chunk:  3862
current chunk:  3863
current chunk:  3864
current chunk:  3865
current chunk:  3866
current chunk:  3867
current chunk:  3868
current chunk:  3869
current chunk:  3870
current chunk:  3871
current chunk:  3872
current chunk:  3873
current chunk:  3874
current chunk:  3875
current chunk:  3876
current chunk:  3877
current chunk:  3878
current chunk:  3879
current chunk:  3880
current chunk:  3881
current chunk:  3882
current chunk:  3883
current chunk:  3884
current chunk:  3885
current chunk:  3886
current chunk:  3887
current chunk:  3888
current chunk:  3889
current chunk:  3890
current chunk:  3891
current chunk:  3892
current chunk:  3893
current chunk:  3894
current chunk:  3895
current chunk:  3896
current chunk:  3897
current chunk:  3898
current chunk:  3899
current chunk:  3900
current chunk:  3901
current chunk:  3902
current chunk

current chunk:  4128
current chunk:  4129
current chunk:  4130
current chunk:  4131
current chunk:  4132
current chunk:  4133
current chunk:  4134
current chunk:  4135
current chunk:  4136
current chunk:  4137
current chunk:  4138
current chunk:  4139
current chunk:  4140
current chunk:  4141
current chunk:  4142
current chunk:  4143
current chunk:  4144
current chunk:  4145
current chunk:  4146
current chunk:  4147
current chunk:  4148
current chunk:  4149
current chunk:  4150
current chunk:  4151
current chunk:  4152
current chunk:  4153
current chunk:  4154
current chunk:  4155
current chunk:  4156
current chunk:  4157
current chunk:  4158
current chunk:  4159
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
current chunk:  4160
current chunk:  4161
current chunk:  4162
current chunk:  4163
current chunk:  4164
current chunk:  4165
current chunk:  4166
current chunk

current chunk:  4402
current chunk:  4403
current chunk:  4404
current chunk:  4405
current chunk:  4406
current chunk:  4407
current chunk:  4408
current chunk:  4409
current chunk:  4410
current chunk:  4411
current chunk:  4412
current chunk:  4413
current chunk:  4414
current chunk:  4415
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
current chunk:  4416
current chunk:  4417
current chunk:  4418
current chunk:  4419
current chunk:  4420
current chunk:  4421
current chunk:  4422
current chunk:  4423
current chunk:  4424
current chunk:  4425
current chunk:  4426
current chunk:  4427
current chunk:  4428
current chunk:  4429
current chunk:  4430
current chunk:  4431
current chunk:  4432
current chunk:  4433
current chunk:  4434
current chunk:  4435
current chunk:  4436
current chunk:  4437
current chunk:  4438
current chunk:  4439
current chunk:  4440
current chunk

current chunk:  4944
current chunk:  4945
current chunk:  4946
current chunk:  4947
current chunk:  4948
current chunk:  4949
current chunk:  4950
current chunk:  4951
current chunk:  4952
current chunk:  4953
current chunk:  4954
current chunk:  4955
current chunk:  4956
current chunk:  4957
current chunk:  4958
current chunk:  4959
current chunk:  4960
current chunk:  4961
current chunk:  4962
current chunk:  4963
current chunk:  4964
current chunk:  4965
current chunk:  4966
current chunk:  4967
current chunk:  4968
current chunk:  4969
current chunk:  4970
current chunk:  4971
current chunk:  4972
current chunk:  4973
current chunk:  4974
current chunk:  4975
current chunk:  4976
current chunk:  4977
current chunk:  4978
current chunk:  4979
current chunk:  4980
current chunk:  4981
current chunk:  4982
current chunk:  4983
current chunk:  4984
current chunk:  4985
current chunk:  4986
current chunk:  4987
current chunk:  4988
current chunk:  4989
current chunk:  4990
current chunk

current chunk:  5216
current chunk:  5217
current chunk:  5218
current chunk:  5219
current chunk:  5220
current chunk:  5221
current chunk:  5222
current chunk:  5223
current chunk:  5224
current chunk:  5225
current chunk:  5226
current chunk:  5227
current chunk:  5228
current chunk:  5229
current chunk:  5230
current chunk:  5231
current chunk:  5232
current chunk:  5233
current chunk:  5234
current chunk:  5235
current chunk:  5236
current chunk:  5237
current chunk:  5238
current chunk:  5239
current chunk:  5240
current chunk:  5241
current chunk:  5242
current chunk:  5243
current chunk:  5244
current chunk:  5245
current chunk:  5246
current chunk:  5247
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
current chunk:  5248
current chunk:  5249
current chunk:  5250
current chunk:  5251
current chunk:  5252
current chunk:  5253
current chunk:  5254
current chunk

current chunk:  5490
current chunk:  5491
current chunk:  5492
current chunk:  5493
current chunk:  5494
current chunk:  5495
current chunk:  5496
current chunk:  5497
current chunk:  5498
current chunk:  5499
current chunk:  5500
current chunk:  5501
current chunk:  5502
current chunk:  5503
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
exit dumping process
current chunk:  5504
current chunk:  5505
current chunk:  5506
current chunk:  5507
current chunk:  5508
current chunk:  5509
current chunk:  5510
current chunk:  5511
current chunk:  5512
current chunk:  5513
current chunk:  5514
current chunk:  5515
current chunk:  5516
current chunk:  5517
current chunk:  5518
current chunk:  5519
current chunk:  5520
current chunk:  5521
current chunk:  5522
current chunk:  5523
current chunk:  5524
current chunk:  5525
current chunk:  5526
current chunk:  5527
current chunk:  5528
current chunk

Process ForkPoolWorker-21:
Process ForkPoolWorker-18:
Process ForkPoolWorker-17:
Process ForkPoolWorker-20:
Process ForkPoolWorker-19:
Process ForkPoolWorker-22:
Traceback (most recent call last):


In [1]:
# The process of merging the QdTree partitions appears to get stuck, so we regenerate the merged data here.
# However, the multiprocessing approach below does not work either: it also hangs at some point, and the cause is still unknown.

# pool = Pool(processes = 3)

# # 68 partitions in total for QdTree
# # pids = [[k * 8 + i for i in range(8)] for k in range(num_process)]
# # pids[-1] += [64, 65, 66, 67]

# pids = [i for i in range(68)] # 0 - 67

# batch = 0
# while batch < 3:

#     pids_each_process = [set(pids[batch*24+k*8: batch*24+(k+1)*8]) for k in range(3)]
#     # 94 epochs in total
#     paras = [[94, pids_each_process[k], qdtree_hdfs] for k in range(3)]
#     pool.map(merge_parquets, [para for para in paras])
#     batch += 1
    
# pool.close()
# pool.join()

In [8]:
# import pandas as pd
# import pyarrow as pa
# import pyarrow.parquet as pq
# import numpy as np

# pids = [i for i in range(68)] # 0 - 67
# batches = 94
# hdfs_path = qdtree_hdfs

# start_time = time.time()

# # use a single process to merge the data
# fs = pa.hdfs.connect()
# for pid in pids:
#     parquets = []
#     print('= = = process pid: ', pid, '= = =')
#     for batch in range(batches):
#         path = hdfs_path + str(batch) + '/partition_' + str(pid)+'.parquet'
#         print(batch)
#         try:
#             par = pq.read_table(path)
#             parquets.append(par)
#         except:
#             continue
#     merged_parquet = pa.concat_tables(parquets)
#     merge_path = hdfs_path + 'merged/partition_' + str(pid)+'.parquet'
#     fw = fs.open(merge_path, 'wb')
#     pq.write_table(merged_parquet, fw)
#     fw.close()
# print('exit merge process')

# end_time = time.time()
# print('time usage: ', end_time - start_time) # 2347s

= = = process pid:  0 = = =
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
= = = process pid:  1 = = =
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
= = = process pid:  2 = = =
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
= = = process pid:  3 = = =
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
2

20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
= = = process pid:  28 = = =
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
= = = process pid:  29 = = =
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
= = = process pid:  30 = = =
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
5

42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
= = = process pid:  55 = = =
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
= = = process pid:  56 = = =
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
= = = process pid:  57 = = =
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
7