In [1]:
import findspark
findspark.init() # this must be executed before the below import

In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark import SparkFiles

In [3]:
import ray
import time
import rtree
from rtree import index
import pandas as pd
import numpy as np
from numpy import genfromtxt
import threading
import pyarrow as pa
import pyarrow.parquet as pq

In [4]:
conf = SparkConf().setAll([("spark.executor.memory", "8g"),("spark.driver.memory","8g"),
                           ("spark.memory.offHeap.enabled",True),("spark.memory.offHeap.size","8g")])

sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

In [5]:
def merge_dicts_simple(pid_data_dicts, num_process):
    base_dict = pid_data_dicts[0]
    for k in range(1, len(pid_data_dicts)):
        for key, val in pid_data_dicts[k].items():
            if key in base_dict:
                base_dict[key] += val
            else:
                base_dict.update({key:val})
        pid_data_dicts[k].clear()

class DumpThread(threading.Thread):
    def __init__(self, thread_id, parameters):
        threading.Thread.__init__(self)
        self.thread_id = thread_id
        self.parameters = parameters
        
    def run(self):
        print('start dumping thread', self.thread_id)
        start_index, end_index, pids, merged_dict, pq_writers, fs, hdfs_path, column_names = self.parameters
        for pid in pids[start_index: end_index]:
            path = hdfs_path + 'partition_' + str(pid)+'.parquet'
            pdf = pd.DataFrame(merged_dict[pid], columns=column_names)
            adf = pa.Table.from_pandas(pdf)
            if pid in pq_writers:
                pq_writers[pid].write_table(table=adf)
                #print('existing pid')
            else:
                writer = pq.ParquetWriter(path, adf.schema, fs)
                pq_writers[pid] = writer
                writer.write_table(table=adf)
                #print('new pid', pid, 'from thread',self.thread_id)
            merged_dict[pid] = []
        print('exit dumping thread', self.thread_id)
        
def dump_dict_2_hdfs(merged_dict, pq_writers, fs, column_names, hdfs_path, num_threads):
                                         
    if num_threads == 1:
        print('start dumping with single thread in main process..')
        pids = list(merged_dict.keys())
        for pid in pids:
            #print("writing to pid:",pid)
            path = hdfs_path + 'partition_' + str(pid)+'.parquet'
            pdf = pd.DataFrame(merged_dict[pid], columns=column_names)
            adf = pa.Table.from_pandas(pdf)
            if pid in pq_writers:
                pq_writers[pid].write_table(table=adf)
                #print('existing pid')
            else:
                writer = pq.ParquetWriter(path, adf.schema, fs)
                pq_writers[pid] = writer
                writer.write_table(table=adf)
                #print('new pid')
            merged_dict[pid] = []  
    else:
        print('start dumping with', num_threads, 'threads in main process..')
        pids = list(merged_dict.keys())
        step = int(len(pids) / num_threads) + 1
        threads = []
        for i in range(num_threads):
            start_index = i * step
            end_index = (i+1) * step
            parameters = [start_index, end_index, pids, merged_dict, pq_writers, fs, hdfs_path, column_names]
            thread = DumpThread(i, parameters)
            thread.start()
            threads.append(thread)
            if start_index >= len(pids):
                break   
        for t in threads:
            t.join()
    print('finish dumping.')

In [6]:
def kdnode_2_border(kdnode):
    lower = [domain[0] for domain in kdnode[0]]
    upper = [domain[1] for domain in kdnode[0]]
    border = tuple(lower + upper) # non interleave
    return border

def load_partitions_from_file(path):
    stretched_kdnodes = genfromtxt(path, delimiter=',')
    num_dims = int(stretched_kdnodes[0,0])
    kdnodes = []
    for i in range(len(stretched_kdnodes)):
        domains = [ [stretched_kdnodes[i,k+1],stretched_kdnodes[i,1+num_dims+k]] for k in range(num_dims) ]
        row = [domains]
        row.append(stretched_kdnodes[i,2*num_dims+1])
        # to be compatible with qd-tree's partition, that do not have the last 4 attributes
        if len(stretched_kdnodes[i]) > 2*num_dims+2:
            row.append(stretched_kdnodes[i,-4])
            row.append(stretched_kdnodes[i,-3])
            row.append(stretched_kdnodes[i,-2])
            row.append(stretched_kdnodes[i,-1])
        kdnodes.append(row)
    return kdnodes

def process_chunk_row(row, used_dims, pidx, pid_data_dict, count, k):
    if count[0] % 100000 == 0:
        print('proces',k,'has routed',count[0],'rows')
    count[0] += 1
    row_numpy = row.to_numpy()
    row_used_dims_list = row_numpy[used_dims].tolist()
    row_border = tuple(row_used_dims_list+row_used_dims_list)
    try:
        pid = list(pidx.intersection(row_border))[0]
    except:
        print(row_border)
    if pid in pid_data_dict:
        pid_data_dict[pid]+=[row_numpy.tolist()]
        #print('update dict..')
    else:
        pid_data_dict[pid]=[row_numpy.tolist()]
        #print('initialize dict..')

@ray.remote
def process_chunk(chunk, used_dims, partition_path, k):
    print("enter data routing process", k, '..')    
    pid_data_dict = {}
    partitions = load_partitions_from_file(partition_path)
    p = index.Property()
    p.leaf_capacity = 32
    p.index_capacity = 32
    p.NearMinimumOverlaoFactor = 16
    p.fill_factor = 0.8
    p.overwrite = True
    pidx = index.Index(properties = p)
    for i in range(len(partitions)):
        pidx.insert(i, kdnode_2_border(partitions[i]))
    count = [0]
    chunk.apply(lambda row: process_chunk_row(row, used_dims, pidx, pid_data_dict, count, k), axis=1)
    dict_id = ray.put(pid_data_dict)
    print("exit data routing process", k, ".")
    return dict_id

In [7]:
def batch_data_parallel(table_path, partition_path, chunk_size, used_dims, hdfs_path, num_dims, num_process, hdfs_private_ip):
    
    begin_time = time.time()
    
    ray.init(num_cpus=num_process)
    
    # pyarrow parquent append
    pq_writers = {}
    fs = pa.fs.HadoopFileSystem(hdfs_private_ip, port=9000, user='hdfs', replication=1)
    
    # column names for pandas dataframe
    col_names = ['_c'+str(i) for i in range(num_dims)]
    cols = [i for i in range(num_dims)]
    
    # chunks
    chunks = []
    chunk_count = 0
    
    for chunk in pd.read_table(table_path, delimiter='|', usecols=cols, names=col_names, chunksize=chunk_size):
        print('reading chunk: ', chunk_count)
        chunks.append(chunk)
        
        # after all process allocated a chunk, process and dump the data
        if chunk_count % num_process == num_process - 1:
        
            chunk_ids = [ray.put(chunks[k]) for k in range(num_process)]
            # result id is the function id, not the return from function
            result_ids = [process_chunk.remote(chunk_ids[k], used_dims, partition_path, k) for k in range(num_process)]
            # this is the return from function
            results = ray.get(result_ids)
            
            chunks.clear() # clear up the memeory
            del chunk_ids # clear up the references
            
            base_dict = ray.get(results[0])
            results[0] = 0 # del the object reference
            for k in range(1, len(results)):
                for key, val in ray.get(results[k]).items():
                    if key in base_dict:
                        base_dict[key] += val
                    else:
                        base_dict.update({key:val})
                results[k] = 0
            del result_ids # clear up the references
            del results    # clear up the references
            
            print("= = = dumping = = =")
            dump_dict_2_hdfs(base_dict, pq_writers, fs, col_names, hdfs_path, num_process)
            del base_dict # clear up the memeory
            current_time = time.time()
            time_elapsed = current_time - begin_time
            print("= = = TOTAL PROCESSED SO FAR:", chunk_count * chunk_size,"ROWS. TIME SPENT:", time_elapsed, "SECONDS = = =")
                
        chunk_count += 1
        
    print('after exit, chunks size: ', len(chunks))
    # process the last batch
    if len(chunks) != 0:
        chunk_ids = [ray.put(chunks[k]) for k in range(len(chunks))]
        result_ids = [process_chunk.remote(chunk_ids[k], used_dims, partition_path, k) for k in range(len(chunks))]
        results = ray.get(result_ids)
        
        base_dict = ray.get(results[0])
        del results[0]
        for k in range(1, len(results)):
            for key, val in ray.get(results[k]).items():
                if key in base_dict:
                    base_dict[key] += val
                else:
                    base_dict.update({key:val})
            del results[k]
        
        print("= = = last epoch dumping = = =")
        dump_dict_2_hdfs(base_dict, pq_writers, fs, col_names, hdfs_path, 1)
    
    for writer in pq_writers:
        writer.close()
    
    ray.shutdown()
    
    finish_time = time.time()
    print('total data routing and persisting time: ', finish_time - begin_time)

In [8]:
# = = = Configuration (UBDA Cloud Centos) = = =
scale_factor = 100

table_base_path = '/media/datadrive1/TPCH/dbgen/'
table_path = table_base_path + 'lineitem_' + str(scale_factor) + '.tbl'

num_process = 8
chunk_size = 2000000
# 6M rows = about 1GB raw data

num_dims = 16
used_dims = [1,2]

# base path of HDFS
hdfs_private_ip = '192.168.6.62'
hdfs_base_path = 'hdfs://192.168.6.62:9000/user/cloudray/'

nora_hdfs = hdfs_base_path + 'NORA/scale' + str(scale_factor) + '/'
qdtree_hdfs = hdfs_base_path + 'QdTree/scale' + str(scale_factor) + '/'
kdtree_hdfs = hdfs_base_path + 'KDTree/scale' + str(scale_factor) + '/'

# base path of Partition
partition_base_path = '/home/centos/PartitionLayout/'

nora_partition = partition_base_path + 'nora_partitions_' + str(scale_factor)
qdtree_partition = partition_base_path + 'qdtree_partitions_' + str(scale_factor)
kdtree_partition = partition_base_path + 'kdtree_partitions_' + str(scale_factor)

In [None]:
# = = = Execution = = =
if __name__ == '__main__':
    # batch_data_parallel(table_path, nora_partition, chunk_size, used_dims, nora_hdfs, num_dims, num_process, hdfs_private_ip)
    # print('finish nora data routing..')
    # batch_data_parallel(table_path, qdtree_partition, chunk_size, used_dims, qdtree_hdfs, num_dims, num_process, hdfs_private_ip)
    # print('finish qdtree data routing..')
    batch_data_parallel(table_path, kdtree_partition, chunk_size, used_dims, kdtree_hdfs, num_dims, num_process, hdfs_private_ip)
    print('finish kdtree data routing..')

2020-10-27 16:26:21,990	INFO services.py:1164 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


reading chunk:  0
reading chunk:  1
reading chunk:  2
reading chunk:  3
reading chunk:  4
reading chunk:  5
reading chunk:  6
reading chunk:  7
[2m[36m(pid=1260)[0m enter data routing process 6 ..
[2m[36m(pid=1267)[0m enter data routing process 5 ..
[2m[36m(pid=1266)[0m enter data routing process 3 ..
[2m[36m(pid=1265)[0m enter data routing process 0 ..
[2m[36m(pid=1259)[0m enter data routing process 7 ..
[2m[36m(pid=1258)[0m enter data routing process 1 ..
[2m[36m(pid=1257)[0m enter data routing process 4 ..
[2m[36m(pid=1256)[0m enter data routing process 2 ..
[2m[36m(pid=1265)[0m proces 0 has routed 0 rows
[2m[36m(pid=1266)[0m proces 3 has routed 0 rows
[2m[36m(pid=1259)[0m proces 7 has routed 0 rows
[2m[36m(pid=1257)[0m proces 4 has routed 0 rows
[2m[36m(pid=1260)[0m proces 6 has routed 0 rows
[2m[36m(pid=1267)[0m proces 5 has routed 0 rows
[2m[36m(pid=1258)[0m proces 1 has routed 0 rows
[2m[36m(pid=1256)[0m proces 2 has routed 0 rows


[2m[36m(pid=1259)[0m proces 7 has routed 1700000 rows
[2m[36m(pid=1267)[0m proces 5 has routed 1700000 rows
[2m[36m(pid=1257)[0m proces 4 has routed 1700000 rows
[2m[36m(pid=1265)[0m proces 0 has routed 1700000 rows
[2m[36m(pid=1266)[0m proces 3 has routed 1700000 rows
[2m[36m(pid=1258)[0m proces 1 has routed 1700000 rows
[2m[36m(pid=1256)[0m proces 2 has routed 1700000 rows
[2m[36m(pid=1260)[0m proces 6 has routed 1700000 rows
[2m[36m(pid=1259)[0m proces 7 has routed 1800000 rows
[2m[36m(pid=1257)[0m proces 4 has routed 1800000 rows
[2m[36m(pid=1267)[0m proces 5 has routed 1800000 rows
[2m[36m(pid=1266)[0m proces 3 has routed 1800000 rows
[2m[36m(pid=1265)[0m proces 0 has routed 1800000 rows
[2m[36m(pid=1258)[0m proces 1 has routed 1800000 rows
[2m[36m(pid=1256)[0m proces 2 has routed 1800000 rows
[2m[36m(pid=1260)[0m proces 6 has routed 1800000 rows
[2m[36m(pid=1259)[0m proces 7 has routed 1900000 rows
[2m[36m(pid=1267)[0m proces 

[2m[36m(pid=1260)[0m proces 1 has routed 1100000 rows
[2m[36m(pid=1256)[0m proces 0 has routed 1200000 rows
[2m[36m(pid=1267)[0m proces 2 has routed 1200000 rows
[2m[36m(pid=1259)[0m proces 4 has routed 1200000 rows
[2m[36m(pid=1265)[0m proces 5 has routed 1200000 rows
[2m[36m(pid=1258)[0m proces 3 has routed 1200000 rows
[2m[36m(pid=1257)[0m proces 7 has routed 1200000 rows
[2m[36m(pid=1266)[0m proces 6 has routed 1200000 rows
[2m[36m(pid=1260)[0m proces 1 has routed 1200000 rows
[2m[36m(pid=1256)[0m proces 0 has routed 1300000 rows
[2m[36m(pid=1267)[0m proces 2 has routed 1300000 rows
[2m[36m(pid=1259)[0m proces 4 has routed 1300000 rows
[2m[36m(pid=1265)[0m proces 5 has routed 1300000 rows
[2m[36m(pid=1258)[0m proces 3 has routed 1300000 rows
[2m[36m(pid=1257)[0m proces 7 has routed 1300000 rows
[2m[36m(pid=1266)[0m proces 6 has routed 1300000 rows
[2m[36m(pid=1260)[0m proces 1 has routed 1300000 rows
[2m[36m(pid=1256)[0m proces 

[2m[36m(pid=1265)[0m proces 0 has routed 600000 rows
[2m[36m(pid=1260)[0m proces 5 has routed 600000 rows
[2m[36m(pid=1256)[0m proces 3 has routed 700000 rows
[2m[36m(pid=1259)[0m proces 7 has routed 700000 rows
[2m[36m(pid=1258)[0m proces 4 has routed 700000 rows
[2m[36m(pid=1257)[0m proces 1 has routed 700000 rows
[2m[36m(pid=1266)[0m proces 2 has routed 700000 rows
[2m[36m(pid=1267)[0m proces 6 has routed 700000 rows
[2m[36m(pid=1265)[0m proces 0 has routed 700000 rows
[2m[36m(pid=1260)[0m proces 5 has routed 700000 rows
[2m[36m(pid=1256)[0m proces 3 has routed 800000 rows
[2m[36m(pid=1258)[0m proces 4 has routed 800000 rows
[2m[36m(pid=1259)[0m proces 7 has routed 800000 rows
[2m[36m(pid=1257)[0m proces 1 has routed 800000 rows
[2m[36m(pid=1267)[0m proces 6 has routed 800000 rows
[2m[36m(pid=1266)[0m proces 2 has routed 800000 rows
[2m[36m(pid=1265)[0m proces 0 has routed 800000 rows
[2m[36m(pid=1260)[0m proces 5 has routed 8000

[2m[36m(pid=1266)[0m proces 6 has routed 100000 rows
[2m[36m(pid=1258)[0m proces 0 has routed 100000 rows
[2m[36m(pid=1260)[0m proces 2 has routed 100000 rows
[2m[36m(pid=1259)[0m proces 5 has routed 100000 rows
[2m[36m(pid=1267)[0m proces 3 has routed 200000 rows
[2m[36m(pid=1256)[0m proces 4 has routed 200000 rows
[2m[36m(pid=1257)[0m proces 7 has routed 200000 rows
[2m[36m(pid=1265)[0m proces 1 has routed 200000 rows
[2m[36m(pid=1258)[0m proces 0 has routed 200000 rows
[2m[36m(pid=1260)[0m proces 2 has routed 200000 rows
[2m[36m(pid=1266)[0m proces 6 has routed 200000 rows
[2m[36m(pid=1259)[0m proces 5 has routed 200000 rows
[2m[36m(pid=1267)[0m proces 3 has routed 300000 rows
[2m[36m(pid=1257)[0m proces 7 has routed 300000 rows
[2m[36m(pid=1256)[0m proces 4 has routed 300000 rows
[2m[36m(pid=1265)[0m proces 1 has routed 300000 rows
[2m[36m(pid=1258)[0m proces 0 has routed 300000 rows
[2m[36m(pid=1266)[0m proces 6 has routed 3000

[2m[36m(pid=1266)[0m proces 6 has routed 1900000 rows
[2m[36m(pid=1267)[0m exit data routing process 3 .
[2m[36m(pid=1258)[0m exit data routing process 0 .
[2m[36m(pid=1265)[0m exit data routing process 1 .
[2m[36m(pid=1266)[0m exit data routing process 6 .
[2m[36m(pid=1257)[0m exit data routing process 7 .
[2m[36m(pid=1256)[0m exit data routing process 4 .
[2m[36m(pid=1259)[0m exit data routing process 5 .
[2m[36m(pid=1260)[0m exit data routing process 2 .
= = = dumping = = =
start dumping with 8 threads in main process..
start dumping thread 0
start dumping thread 1
start dumping thread start dumping thread 3
2
start dumping thread 4
start dumping threadstart dumping thread 6
start dumping thread 5
 7
exit dumping thread 7
exit dumping thread 0
exit dumping thread 1
exit dumping thread 6
exit dumping thread 4
exit dumping thread 5
exit dumping thread 3
exit dumping thread 2
finish dumping.
= = = TOTAL PROCESSED SO FAR: 62000000 ROWS. TIME SPENT: 1248.616785

[2m[36m(pid=1260)[0m proces 6 has routed 1400000 rows
[2m[36m(pid=1259)[0m proces 0 has routed 1500000 rows
[2m[36m(pid=1267)[0m proces 7 has routed 1500000 rows
[2m[36m(pid=1257)[0m proces 2 has routed 1500000 rows
[2m[36m(pid=1265)[0m proces 4 has routed 1500000 rows
[2m[36m(pid=1256)[0m proces 1 has routed 1500000 rows
[2m[36m(pid=1258)[0m proces 5 has routed 1500000 rows
[2m[36m(pid=1266)[0m proces 3 has routed 1500000 rows
[2m[36m(pid=1260)[0m proces 6 has routed 1500000 rows
[2m[36m(pid=1259)[0m proces 0 has routed 1600000 rows
[2m[36m(pid=1267)[0m proces 7 has routed 1600000 rows
[2m[36m(pid=1265)[0m proces 4 has routed 1600000 rows
[2m[36m(pid=1257)[0m proces 2 has routed 1600000 rows
[2m[36m(pid=1256)[0m proces 1 has routed 1600000 rows
[2m[36m(pid=1258)[0m proces 5 has routed 1600000 rows
[2m[36m(pid=1266)[0m proces 3 has routed 1600000 rows
[2m[36m(pid=1260)[0m proces 6 has routed 1600000 rows
[2m[36m(pid=1258)[0m proces 

[2m[36m(pid=1256)[0m proces 5 has routed 900000 rows
[2m[36m(pid=1260)[0m proces 6 has routed 900000 rows
[2m[36m(pid=1265)[0m proces 3 has routed 1000000 rows
[2m[36m(pid=1267)[0m proces 7 has routed 1000000 rows
[2m[36m(pid=1258)[0m proces 4 has routed 1000000 rows
[2m[36m(pid=1259)[0m proces 2 has routed 1000000 rows
[2m[36m(pid=1266)[0m proces 1 has routed 1000000 rows
[2m[36m(pid=1257)[0m proces 0 has routed 1000000 rows
[2m[36m(pid=1256)[0m proces 5 has routed 1000000 rows
[2m[36m(pid=1260)[0m proces 6 has routed 1000000 rows
[2m[36m(pid=1265)[0m proces 3 has routed 1100000 rows
[2m[36m(pid=1267)[0m proces 7 has routed 1100000 rows
[2m[36m(pid=1258)[0m proces 4 has routed 1100000 rows
[2m[36m(pid=1259)[0m proces 2 has routed 1100000 rows
[2m[36m(pid=1266)[0m proces 1 has routed 1100000 rows
[2m[36m(pid=1257)[0m proces 0 has routed 1100000 rows
[2m[36m(pid=1256)[0m proces 5 has routed 1100000 rows
[2m[36m(pid=1260)[0m proces 6 

[2m[36m(pid=1265)[0m proces 7 has routed 400000 rows
[2m[36m(pid=1258)[0m proces 5 has routed 400000 rows
[2m[36m(pid=1266)[0m proces 1 has routed 400000 rows
[2m[36m(pid=1260)[0m proces 3 has routed 400000 rows
[2m[36m(pid=1267)[0m proces 4 has routed 500000 rows
[2m[36m(pid=1257)[0m proces 2 has routed 500000 rows
[2m[36m(pid=1259)[0m proces 6 has routed 500000 rows
[2m[36m(pid=1256)[0m proces 0 has routed 500000 rows
[2m[36m(pid=1265)[0m proces 7 has routed 500000 rows
[2m[36m(pid=1258)[0m proces 5 has routed 500000 rows
[2m[36m(pid=1266)[0m proces 1 has routed 500000 rows
[2m[36m(pid=1260)[0m proces 3 has routed 500000 rows
[2m[36m(pid=1267)[0m proces 4 has routed 600000 rows
[2m[36m(pid=1257)[0m proces 2 has routed 600000 rows
[2m[36m(pid=1259)[0m proces 6 has routed 600000 rows
[2m[36m(pid=1256)[0m proces 0 has routed 600000 rows
[2m[36m(pid=1265)[0m proces 7 has routed 600000 rows
[2m[36m(pid=1258)[0m proces 5 has routed 6000

[2m[36m(pid=1260)[0m proces 6 has routed 0 rows
[2m[36m(pid=1258)[0m proces 5 has routed 0 rows
[2m[36m(pid=1265)[0m proces 0 has routed 0 rows
[2m[36m(pid=1257)[0m proces 4 has routed 0 rows
[2m[36m(pid=1267)[0m proces 7 has routed 0 rows
[2m[36m(pid=1259)[0m proces 2 has routed 0 rows
[2m[36m(pid=1256)[0m proces 1 has routed 0 rows
[2m[36m(pid=1266)[0m proces 3 has routed 0 rows
[2m[36m(pid=1258)[0m proces 5 has routed 100000 rows
[2m[36m(pid=1260)[0m proces 6 has routed 100000 rows
[2m[36m(pid=1267)[0m proces 7 has routed 100000 rows
[2m[36m(pid=1259)[0m proces 2 has routed 100000 rows
[2m[36m(pid=1257)[0m proces 4 has routed 100000 rows
[2m[36m(pid=1265)[0m proces 0 has routed 100000 rows
[2m[36m(pid=1256)[0m proces 1 has routed 100000 rows
[2m[36m(pid=1266)[0m proces 3 has routed 100000 rows
[2m[36m(pid=1259)[0m proces 2 has routed 200000 rows
[2m[36m(pid=1257)[0m proces 4 has routed 200000 rows
[2m[36m(pid=1267)[0m proces 7

[2m[36m(pid=1256)[0m proces 1 has routed 1800000 rows
[2m[36m(pid=1258)[0m proces 5 has routed 1800000 rows
[2m[36m(pid=1259)[0m proces 2 has routed 1800000 rows
[2m[36m(pid=1266)[0m proces 3 has routed 1800000 rows
[2m[36m(pid=1260)[0m proces 6 has routed 1800000 rows
[2m[36m(pid=1257)[0m proces 4 has routed 1900000 rows
[2m[36m(pid=1265)[0m proces 0 has routed 1900000 rows
[2m[36m(pid=1267)[0m proces 7 has routed 1900000 rows
[2m[36m(pid=1256)[0m proces 1 has routed 1900000 rows
[2m[36m(pid=1259)[0m proces 2 has routed 1900000 rows
[2m[36m(pid=1258)[0m proces 5 has routed 1900000 rows
[2m[36m(pid=1266)[0m proces 3 has routed 1900000 rows
[2m[36m(pid=1260)[0m proces 6 has routed 1900000 rows
[2m[36m(pid=1257)[0m exit data routing process 4 .
[2m[36m(pid=1267)[0m exit data routing process 7 .
[2m[36m(pid=1265)[0m exit data routing process 0 .
[2m[36m(pid=1259)[0m exit data routing process 2 .
[2m[36m(pid=1258)[0m exit data routing p

[2m[36m(pid=1257)[0m proces 7 has routed 1300000 rows
[2m[36m(pid=1267)[0m proces 5 has routed 1300000 rows
[2m[36m(pid=1258)[0m proces 1 has routed 1300000 rows
[2m[36m(pid=1259)[0m proces 3 has routed 1300000 rows
[2m[36m(pid=1260)[0m proces 4 has routed 1300000 rows
[2m[36m(pid=1258)[0m proces 1 has routed 1400000 rows
[2m[36m(pid=1259)[0m proces 3 has routed 1400000 rows
[2m[36m(pid=1265)[0m proces 2 has routed 1400000 rows
[2m[36m(pid=1266)[0m proces 6 has routed 1400000 rows
[2m[36m(pid=1256)[0m proces 0 has routed 1400000 rows
[2m[36m(pid=1257)[0m proces 7 has routed 1400000 rows
[2m[36m(pid=1267)[0m proces 5 has routed 1400000 rows
[2m[36m(pid=1260)[0m proces 4 has routed 1400000 rows
[2m[36m(pid=1265)[0m proces 2 has routed 1500000 rows
[2m[36m(pid=1259)[0m proces 3 has routed 1500000 rows
[2m[36m(pid=1258)[0m proces 1 has routed 1500000 rows
[2m[36m(pid=1256)[0m proces 0 has routed 1500000 rows
[2m[36m(pid=1257)[0m proces 

[2m[36m(pid=1265)[0m proces 7 has routed 800000 rows
[2m[36m(pid=1256)[0m proces 3 has routed 800000 rows
[2m[36m(pid=1260)[0m proces 1 has routed 800000 rows
[2m[36m(pid=1257)[0m proces 5 has routed 800000 rows
[2m[36m(pid=1259)[0m proces 4 has routed 800000 rows
[2m[36m(pid=1266)[0m proces 6 has routed 800000 rows
[2m[36m(pid=1267)[0m proces 2 has routed 900000 rows
[2m[36m(pid=1258)[0m proces 0 has routed 900000 rows
[2m[36m(pid=1265)[0m proces 7 has routed 900000 rows
[2m[36m(pid=1257)[0m proces 5 has routed 900000 rows
[2m[36m(pid=1259)[0m proces 4 has routed 900000 rows
[2m[36m(pid=1256)[0m proces 3 has routed 900000 rows
[2m[36m(pid=1260)[0m proces 1 has routed 900000 rows
[2m[36m(pid=1266)[0m proces 6 has routed 900000 rows
[2m[36m(pid=1258)[0m proces 0 has routed 1000000 rows
[2m[36m(pid=1267)[0m proces 2 has routed 1000000 rows
[2m[36m(pid=1265)[0m proces 7 has routed 1000000 rows
[2m[36m(pid=1256)[0m proces 3 has routed 1

[2m[36m(pid=1259)[0m proces 2 has routed 300000 rows
[2m[36m(pid=1256)[0m proces 4 has routed 300000 rows
[2m[36m(pid=1265)[0m proces 3 has routed 300000 rows
[2m[36m(pid=1267)[0m proces 7 has routed 300000 rows
[2m[36m(pid=1266)[0m proces 1 has routed 300000 rows
[2m[36m(pid=1258)[0m proces 5 has routed 300000 rows
[2m[36m(pid=1257)[0m proces 0 has routed 300000 rows
[2m[36m(pid=1260)[0m proces 6 has routed 300000 rows
[2m[36m(pid=1259)[0m proces 2 has routed 400000 rows
[2m[36m(pid=1256)[0m proces 4 has routed 400000 rows
[2m[36m(pid=1266)[0m proces 1 has routed 400000 rows
[2m[36m(pid=1267)[0m proces 7 has routed 400000 rows
[2m[36m(pid=1265)[0m proces 3 has routed 400000 rows
[2m[36m(pid=1258)[0m proces 5 has routed 400000 rows
[2m[36m(pid=1257)[0m proces 0 has routed 400000 rows
[2m[36m(pid=1260)[0m proces 6 has routed 400000 rows
[2m[36m(pid=1259)[0m proces 2 has routed 500000 rows
[2m[36m(pid=1258)[0m proces 5 has routed 5000

start dumping thread 6
start dumping thread 7
exit dumping thread 7
exit dumping thread 5
exit dumping thread 0
exit dumping thread 6
exit dumping thread 1
exit dumping thread 3
exit dumping thread 4
exit dumping thread 2
finish dumping.
= = = TOTAL PROCESSED SO FAR: 174000000 ROWS. TIME SPENT: 3448.2533485889435 SECONDS = = =
reading chunk:  88
reading chunk:  89
reading chunk:  90
reading chunk:  91
reading chunk:  92
reading chunk:  93
reading chunk:  94
reading chunk:  95
[2m[36m(pid=1260)[0m enter data routing process 1 ..
[2m[36m(pid=1267)[0m enter data routing process 2 ..
[2m[36m(pid=1265)[0m enter data routing process 0 ..
[2m[36m(pid=1259)[0m enter data routing process 7 ..
[2m[36m(pid=1258)[0m enter data routing process 5 ..
[2m[36m(pid=1266)[0m enter data routing process 4 ..
[2m[36m(pid=1256)[0m enter data routing process 6 ..
[2m[36m(pid=1257)[0m enter data routing process 3 ..
[2m[36m(pid=1260)[0m proces 1 has routed 0 rows
[2m[36m(pid=1265)

[2m[36m(pid=1256)[0m proces 6 has routed 1600000 rows
[2m[36m(pid=1266)[0m proces 4 has routed 1600000 rows
[2m[36m(pid=1265)[0m proces 0 has routed 1600000 rows
[2m[36m(pid=1258)[0m proces 5 has routed 1600000 rows
[2m[36m(pid=1259)[0m proces 7 has routed 1600000 rows
[2m[36m(pid=1260)[0m proces 1 has routed 1600000 rows
[2m[36m(pid=1265)[0m proces 0 has routed 1700000 rows
[2m[36m(pid=1267)[0m proces 2 has routed 1700000 rows
[2m[36m(pid=1257)[0m proces 3 has routed 1700000 rows
[2m[36m(pid=1258)[0m proces 5 has routed 1700000 rows
[2m[36m(pid=1256)[0m proces 6 has routed 1700000 rows
[2m[36m(pid=1266)[0m proces 4 has routed 1700000 rows
[2m[36m(pid=1259)[0m proces 7 has routed 1700000 rows
[2m[36m(pid=1260)[0m proces 1 has routed 1700000 rows
[2m[36m(pid=1265)[0m proces 0 has routed 1800000 rows
[2m[36m(pid=1267)[0m proces 2 has routed 1800000 rows
[2m[36m(pid=1257)[0m proces 3 has routed 1800000 rows
[2m[36m(pid=1258)[0m proces 

[2m[36m(pid=1259)[0m proces 1 has routed 1100000 rows
[2m[36m(pid=1265)[0m proces 0 has routed 1100000 rows
[2m[36m(pid=1267)[0m proces 7 has routed 1100000 rows
[2m[36m(pid=1257)[0m proces 4 has routed 1100000 rows
[2m[36m(pid=1258)[0m proces 2 has routed 1100000 rows
[2m[36m(pid=1256)[0m proces 5 has routed 1100000 rows
[2m[36m(pid=1260)[0m proces 6 has routed 1100000 rows
[2m[36m(pid=1266)[0m proces 3 has routed 1100000 rows
[2m[36m(pid=1265)[0m proces 0 has routed 1200000 rows
[2m[36m(pid=1259)[0m proces 1 has routed 1200000 rows
[2m[36m(pid=1256)[0m proces 5 has routed 1200000 rows
[2m[36m(pid=1257)[0m proces 4 has routed 1200000 rows
[2m[36m(pid=1267)[0m proces 7 has routed 1200000 rows
[2m[36m(pid=1258)[0m proces 2 has routed 1200000 rows
[2m[36m(pid=1266)[0m proces 3 has routed 1200000 rows
[2m[36m(pid=1260)[0m proces 6 has routed 1200000 rows
[2m[36m(pid=1265)[0m proces 0 has routed 1300000 rows
[2m[36m(pid=1259)[0m proces 

Exception in thread Thread-106:
Traceback (most recent call last):
  File "/home/centos/anaconda3/lib/python3.8/threading.py", line 932, in _bootstrap_inner
Exception in thread Thread-102:
Traceback (most recent call last):
  File "/home/centos/anaconda3/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    Exception in thread Thread-103:
Traceback (most recent call last):
  File "/home/centos/anaconda3/lib/python3.8/threading.py", line 932, in _bootstrap_inner
self.run()
  File "<ipython-input-5-ea839bf84398>", line 25, in run
    self.run()
  File "<ipython-input-5-ea839bf84398>", line 25, in run
    self.run()
  File "<ipython-input-5-ea839bf84398>", line 25, in run
  File "/home/centos/anaconda3/lib/python3.8/site-packages/pyarrow/parquet.py", line 591, in write_table
  File "/home/centos/anaconda3/lib/python3.8/site-packages/pyarrow/parquet.py", line 591, in write_table
    self.writer.write_table(table, row_group_size=row_group_size)
  File "pyarrow/_parquet.pyx", line 1

exit dumping thread 0
finish dumping.
= = = TOTAL PROCESSED SO FAR: 206000000 ROWS. TIME SPENT: 4064.738250732422 SECONDS = = =
reading chunk:  104
reading chunk:  105


In [None]:
# ray.shutdown()