In [1]:
import findspark
findspark.init() # this must be executed before the below import

In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark import SparkFiles

In [3]:
import pandas as pd
import time
import rtree
from rtree import index
import numpy as np
from numpy import genfromtxt
from multiprocessing import Pool
import threading

In [4]:
from DRProcess import *
from DDProcess import *

In [5]:
# Spark settings used by this notebook: 8g for the executor, 8g for the driver,
# and off-heap memory enabled with an 8g budget.
conf = SparkConf()
conf.setAll([
    ("spark.executor.memory", "8g"),
    ("spark.driver.memory", "8g"),
    ("spark.memory.offHeap.enabled", True),
    ("spark.memory.offHeap.size", "8g"),
])

# The SQLContext is the module-level handle the dump helpers use to build DataFrames.
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

In [6]:
# Display the effective Spark configuration as a list of (key, value) pairs.
sc.getConf().getAll()

[('spark.driver.port', '37662'),
 ('spark.memory.offHeap.size', '8g'),
 ('spark.executor.id', 'driver'),
 ('spark.app.id', 'local-1603181345955'),
 ('spark.app.name', 'pyspark-shell'),
 ('spark.rdd.compress', 'True'),
 ('spark.driver.host', '10.88.88.103'),
 ('spark.driver.memory', '8g'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[*]'),
 ('spark.executor.memory', '8g'),
 ('spark.submit.pyFiles', ''),
 ('spark.submit.deployMode', 'client'),
 ('spark.memory.offHeap.enabled', 'True'),
 ('spark.ui.showConsoleProgress', 'true')]

In [7]:
class DumpThread(threading.Thread):
    """Worker thread that appends one slice of buffered partitions to parquet files.

    ``parameters`` is a 6-item sequence:
    ``(start_index, end_index, pids, pid_data_dict, hdfs_path, column_names)``.
    The thread handles ``pids[start_index:end_index]`` and reads the rows of each
    pid from ``pid_data_dict``. Writing goes through the module-level ``sqlContext``.
    """

    def __init__(self, thread_id, name, parameters):
        super().__init__()
        self.thread_id = thread_id
        self.name = name
        self.parameters = parameters

    def run(self):
        """Write every partition of this thread's slice, then empty its buffer."""
        print('start dumping thread: ', self.thread_id, self.name)
        first, last, all_pids, rows_by_pid, base_path, headers = self.parameters
        for partition_id in all_pids[first:last]:
            target = base_path + 'partition_' + str(partition_id)+'.parquet'
            frame = pd.DataFrame(rows_by_pid[partition_id], columns=headers)
            spark_frame = sqlContext.createDataFrame(frame)
            spark_frame.write.mode('append').parquet(target)
            # the rows are persisted now; replace the buffer with an empty list
            rows_by_pid[partition_id] = []
        print('exit dumping thread: ', self.thread_id, self.name)
        
def dump_dict_data_2_hdfs(pid_data_dicts, column_names, hdfs_path, num_threads = 8):
    """Merge per-process buffers and append every partition to its parquet file.

    Parameters
    ----------
    pid_data_dicts : list of dict
        One ``{pid: rows}`` dict per producer. They are merged in place into
        ``pid_data_dicts[0]``; the other dicts are cleared, and every written
        buffer is replaced by an empty list.
    column_names : list of str
        Column names used to build the DataFrame of each partition.
    hdfs_path : str
        Path prefix; partition ``pid`` goes to ``hdfs_path + 'partition_<pid>.parquet'``.
    num_threads : int
        Number of ``DumpThread`` workers. A value of 1 or less writes everything
        from the calling thread.

    Returns
    -------
    None
    """
    # Nothing buffered at all: the original indexed pid_data_dicts[0] and crashed here.
    if not pid_data_dicts:
        return

    # first merge all the dicts
    base_dict = pid_data_dicts[0]
    for other in pid_data_dicts[1:]:
        for key, val in other.items():
            if key in base_dict:
                base_dict[key] += val
            else:
                base_dict[key] = val
        other.clear()

    pids = list(base_dict.keys())

    if num_threads <= 1:
        print('start dumping single thread (main)')
        for pid in pids:
            path = hdfs_path + 'partition_' + str(pid)+'.parquet'
            pdf = pd.DataFrame(base_dict[pid], columns=column_names)
            df = sqlContext.createDataFrame(pdf)
            df.write.mode('append').parquet(path)
            base_dict[pid] = []
        print('finish dumping single thread (main)')
        return

    # apply multi-threading to save: each thread owns a disjoint slice of pids,
    # so no two threads write to the same parquet path
    step = len(pids) // num_threads + 1
    threads = []
    for i in range(num_threads):
        start_index = i * step
        # check BEFORE starting, so no thread is spawned for an empty slice
        if start_index >= len(pids):
            break
        end_index = start_index + step
        parameters = [start_index, end_index, pids, base_dict, hdfs_path, column_names]
        thread = DumpThread(i, 'dump_thread_'+str(i), parameters)
        thread.start()
        threads.append(thread)
    for t in threads:
        t.join()

# used for multi-process writing
def merge_dicts(pid_data_dicts, num_process):
    """Merge per-process ``{pid: rows}`` buffers and split them into disjoint dicts.

    All input dicts are merged in place into ``pid_data_dicts[0]`` (rows of the
    same pid are concatenated with ``+=``) and the other input dicts are cleared.
    The merged entries are then distributed over ``num_process`` new dicts so
    that every pid appears in exactly one of them.

    The target index is ``pid % num_process``. It depends only on the pid and on
    ``num_process``, so a pid lands at the same index on every call, and the
    index is always in range even when pids are sparse or large.

    Parameters
    ----------
    pid_data_dicts : list of dict
        Buffers keyed by integer partition id.
    num_process : int
        Number of output dicts; must be at least 1.

    Returns
    -------
    list of dict
        ``num_process`` dicts with pairwise disjoint key sets. The values are
        the same list objects held by the merged input dict.

    Raises
    ------
    ValueError
        If ``num_process`` is smaller than 1.
    """
    if num_process < 1:
        raise ValueError('num_process must be at least 1')

    non_overlap_dicts = [{} for _ in range(num_process)]
    if not pid_data_dicts:
        return non_overlap_dicts

    base_dict = pid_data_dicts[0]
    for other in pid_data_dicts[1:]:
        for key, val in other.items():
            if key in base_dict:
                base_dict[key] += val
            else:
                base_dict[key] = val
        other.clear()

    # re allocate to non-overlap dicts
    for key, val in base_dict.items():
        non_overlap_dicts[key % num_process][key] = val

    return non_overlap_dicts

In [8]:
def batch_data_parallel(table_path, partition_path, chunk_size, used_dims, hdfs_path, 
                        num_dims, dump_threshold = 1000000, num_process = 8):
    """Route a '|'-delimited table in parallel and persist the result.

    The table is read in chunks of ``chunk_size`` rows. Each group of
    ``num_process`` chunks is handed to a worker pool running ``process_chunk``,
    one chunk and one buffer dict per worker. Once roughly ``dump_threshold``
    rows have been processed, the buffers are regrouped with ``merge_dicts`` and
    written by ``dump_data`` as one "epoch". After the table is exhausted the
    remaining chunks and buffers are flushed, and ``merge_parquets`` is called
    once per worker with the set of pids that worker has dumped.

    Parameters
    ----------
    table_path : str
        Path of the '|'-delimited input table.
    partition_path : str
        Forwarded to ``process_chunk`` (presumably the partition layout file -
        TODO confirm against DRProcess/DDProcess).
    chunk_size : int
        Number of rows per chunk read by pandas.
    used_dims : list of int
        Forwarded to ``process_chunk``.
    hdfs_path : str
        Forwarded to ``dump_data`` and ``merge_parquets``.
    num_dims : int
        Number of leading columns to read; they are named ``_c0 .. _c<n-1>``.
    dump_threshold : int
        Number of processed rows after which the buffers are dumped.
    num_process : int
        Size of the worker pool and number of buffers.

    Returns
    -------
    None
        Progress and the total elapsed time are printed.
    """
    begin_time = time.time()

    col_names = ['_c'+str(i) for i in range(num_dims)]
    cols = list(range(num_dims))

    pid_data_dicts = [{} for _ in range(num_process)]
    pids_each_process = [set() for _ in range(num_process)] # used for final merge

    chunks = []
    count = 0
    epochs = 0
    processed_data = 0

    pool = Pool(processes = num_process) # the pool should be reused, or incur memory leak!
    try:
        for chunk in pd.read_table(table_path, delimiter='|', usecols=cols, names=col_names, chunksize=chunk_size):
            print('current chunk: ', count)
            chunks.append(chunk)
            if count % num_process == num_process - 1:
                paras = [[chunks[k], used_dims, partition_path, pid_data_dicts[k]] for k in range(num_process)]
                pid_data_dicts = pool.map(process_chunk, paras)
                print('===================================================')
                chunks = []
                processed_data += chunk_size * num_process

                # dump data to file
                if processed_data >= dump_threshold:
                    pid_data_dicts = _dump_epoch(pool, pid_data_dicts, pids_each_process,
                                                 epochs, col_names, hdfs_path, num_process)
                    processed_data = 0
                    epochs += 1
                    for buffered in pid_data_dicts:
                        buffered.clear()
            count += 1

        dict_size = [len(pid_data_dicts[i]) for i in range(num_process)]
        print('after exit, chunks size: ', len(chunks))
        print('after exit, each dict size: ', dict_size)
        # process the last batch (fewer than num_process chunks)
        if len(chunks) != 0:
            paras = [[chunks[k], used_dims, partition_path, pid_data_dicts[k]] for k in range(len(chunks))]
            pid_data_dicts[0:len(chunks)] = pool.map(process_chunk, paras)

        dict_size = [len(pid_data_dicts[i]) for i in range(num_process)]
        print('after last chunk, each dict size: ', dict_size)

        # Flush when ANY buffer still holds data; looking only at buffer 0 would
        # silently drop rows that happen to sit in the other buffers.
        if any(len(buffered) != 0 for buffered in pid_data_dicts):
            pid_data_dicts = _dump_epoch(pool, pid_data_dicts, pids_each_process,
                                         epochs, col_names, hdfs_path, num_process)

        pid_data_dicts.clear() # release memory

        # final merge
        # NOTE(review): merge_parquets receives epochs + 1, presumably the number
        # of epochs to scan - confirm against its implementation.
        epochs += 1
        paras = [[epochs, pids_each_process[k], hdfs_path] for k in range(num_process)]
        pool.map(merge_parquets, paras)

        pool.close()
    except BaseException:
        # includes KeyboardInterrupt: stop the workers instead of leaking them
        pool.terminate()
        raise
    finally:
        # join() is valid here because close() or terminate() has been called
        pool.join()

    finish_time = time.time()
    print('total data routing and persisting time: ', finish_time - begin_time)


def _dump_epoch(pool, pid_data_dicts, pids_each_process, epoch, col_names, hdfs_path, num_process):
    """Regroup the buffers, record which worker dumps which pid, and write one epoch."""
    # parquet write is not thread safe, avoid concurrent write: make the dicts non-overlap
    non_overlap = merge_dicts(pid_data_dicts, num_process)
    for k in range(num_process):
        pids_each_process[k].update(non_overlap[k].keys())
    paras = [[epoch, non_overlap[k], col_names, hdfs_path] for k in range(num_process)]
    pool.map(dump_data, paras)
    return non_overlap

In [9]:
# = = = Configuration = = =
# scale_factor selects the input table, the HDFS output folder and the
# partition layout file, so changing it here switches all of them together.
scale_factor = 100

table_base_path = '/home/ubuntu/TPCH/dbgen/'
table_path = table_base_path + 'lineitem_' + str(scale_factor) + '.tbl'

num_process = 12
chunk_size = 200000
dump_threshold = 12000000 # 12M rows (6M rows = about 1GB raw data)

num_dims = 16      # number of leading table columns that are read
used_dims = [1,2]  # forwarded to process_chunk

# base path of HDFS
hdfs_base_path = 'hdfs://10.88.88.103:9000/user/cloudray/'

# use scale_factor (not a literal 100) so these stay in sync with the paths above
nora_hdfs = hdfs_base_path + 'NORA/scale' + str(scale_factor) + '/'
qdtree_hdfs = hdfs_base_path + 'QdTree/scale' + str(scale_factor) + '/'
kdtree_hdfs = hdfs_base_path + 'KDTree/scale' + str(scale_factor) + '/'

# base path of Partition
partition_base_path = '/home/ubuntu/PartitionLayout/'

nora_partition = partition_base_path + 'nora_partitions_' + str(scale_factor)
qdtree_partition = partition_base_path + 'qdtree_partitions_' + str(scale_factor)
kdtree_partition = partition_base_path + 'kdt_partitions_' + str(scale_factor)

# Legacy
# table_path = '/home/cloudray/Downloads/TPCH_12M_8Field.csv'
# table_path = '/home/cloudray/TPCH/2.18.0_rc2/dbgen/lineitem.tbl'

# partition_path = '/home/cloudray/NORA_Partitions/nora_partitions'
# partition_path = '/home/cloudray/NORA_Partitions/qd_tree_partitions'

# hdfs_path = 'hdfs://localhost:9000/user/cloudray/NORA/'
# hdfs_path = 'hdfs://localhost:9000/user/cloudray/QdTree/'

# partition_path = '/home/cloudray/NORA_Partitions/nora_test'
# partition_path = '/home/cloudray/NORA_Partitions/qd_tree_test'

# hdfs_path = 'hdfs://localhost:9000/user/cloudray/NORA_Test/'
# hdfs_path = 'hdfs://localhost:9000/user/cloudray/QdTree_Test/'

In [10]:
# = = = Execution = = =
# Only the KD-tree layout is routed in this run; the NORA and QdTree runs are
# kept below, disabled.
# batch_data_parallel(table_path, nora_partition, chunk_size, used_dims, nora_hdfs, num_dims, dump_threshold, num_process)
# print('finish nora data routing..')
# batch_data_parallel(table_path, qdtree_partition, chunk_size, used_dims, qdtree_hdfs, num_dims, dump_threshold, num_process)
# print('finish qdtree data routing..')
batch_data_parallel(
    table_path=table_path,
    partition_path=kdtree_partition,
    chunk_size=chunk_size,
    used_dims=used_dims,
    hdfs_path=kdtree_hdfs,
    num_dims=num_dims,
    dump_threshold=dump_threshold,
    num_process=num_process,
)
print('finish kdtree data routing..')

current chunk:  0
current chunk:  1
current chunk:  2
current chunk:  3
current chunk:  4
current chunk:  5
current chunk:  6
current chunk:  7
current chunk:  8
current chunk:  9
current chunk:  10
current chunk:  11
current chunk:  12
current chunk:  13
current chunk:  14
current chunk:  15
current chunk:  16
current chunk:  17
current chunk:  18
current chunk:  19
current chunk:  20
current chunk:  21
current chunk:  22
current chunk:  23
current chunk:  24
current chunk:  25
current chunk:  26
current chunk:  27
current chunk:  28
current chunk:  29
current chunk:  30
current chunk:  31
current chunk:  32
current chunk:  33
current chunk:  34
current chunk:  35
current chunk:  36
current chunk:  37
current chunk:  38
current chunk:  39
current chunk:  40
current chunk:  41
current chunk:  42
current chunk:  43
current chunk:  44
current chunk:  45
current chunk:  46
current chunk:  47
current chunk:  48
current chunk:  49
current chunk:  50
current chunk:  51
current chunk:  52
cur

KeyboardInterrupt: 

In [None]:
# It seems the process that merges the QdTree partitions gets stuck, so we try to regenerate the merged data.
# However, the code below does not work either: it also gets stuck at some point, and I could not find out why.

# pool = Pool(processes = 3)

# # totally 68 partitions for QdTree
# # pids = [[k * 8 + i for i in range(8)] for k in range(num_process)]
# # pids[-1] += [64, 65, 66, 67]

# pids = [i for i in range(68)] # 0 - 67

# batch = 0
# while batch < 3:

#     pids_each_process = [set(pids[batch*24+k*8: batch*24+(k+1)*8]) for k in range(3)]
#     # totally 94 epochs
#     paras = [[94, pids_each_process[k], qdtree_hdfs] for k in range(3)]
#     pool.map(merge_parquets, [para for para in paras])
#     batch += 1
    
# pool.close()
# pool.join()

In [None]:
# import pandas as pd
# import pyarrow as pa
# import pyarrow.parquet as pq
# import numpy as np

# pids = [i for i in range(68)] # 0 - 67
# batches = 94
# hdfs_path = qdtree_hdfs

# start_time = time.time()

# # using single process to handle data merge
# fs = pa.hdfs.connect()
# for pid in pids:
#     parquets = []
#     print('= = = process pid: ', pid, '= = =')
#     for batch in range(batches):
#         path = hdfs_path + str(batch) + '/partition_' + str(pid)+'.parquet'
#         print(batch)
#         try:
#             par = pq.read_table(path)
#             parquets.append(par)
#         except:
#             continue
#     merged_parquet = pa.concat_tables(parquets)
#     merge_path = hdfs_path + 'merged/partition_' + str(pid)+'.parquet'
#     fw = fs.open(merge_path, 'wb')
#     pq.write_table(merged_parquet, fw)
#     fw.close()
# print('exit merge process')

# end_time = time.time()
# print('time usage: ', end_time - start_time) # 2347s