In [1]:
import findspark
findspark.init() # this must be executed before the below import

In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark import SparkFiles

In [3]:
import pandas as pd
import time
import rtree
from rtree import index
import numpy as np
from numpy import genfromtxt
import copy

In [4]:
# Reuse the context that is already running when this cell is executed again:
# a second SparkContext() in the same kernel raises
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

In [5]:
# consider using a partition_rtree_index to find the corresponding partition
# this is single thread
def record_2_border(row, used_dims):
    '''
    Build the border of a single record for a lookup in the rtree index.

    row: a one-row pandas dataframe, i.e., a point
    used_dims: positional indices of the columns that form the coordinates
    returns: a tuple holding the selected coordinates twice in a row, which is
             the border layout required by the rtree index
    '''
    coords = row.iloc[:, used_dims].values[0].tolist()
    return tuple(coords) + tuple(coords)
   
def route_data_2_partition(dataset, used_dims, partition_index, hdfs_path, lock_dict, print_execution_time=False):
    '''
    Route every record of a chunk to its partition and append each group to HDFS.

    parameters:
    @dataset: the chunk to route, should be in the form of pandas dataframe here
    @used_dims: positional indices of the columns used as point coordinates
    @partition_index: the index of partitions; its intersection() is queried with
                      the border of each point (coordinates followed by themselves)
    @hdfs_path: path prefix, 'partition_<pid>.parquet' is appended to it
    @lock_dict: pid -> lock, held while appending to the parquet file of that pid
    @print_execution_time: when True, print the routed groups and both timings

    A record that overlaps no partition is reported and skipped, so a single bad
    record does not abort the whole chunk with an IndexError.
    '''
    start_time = time.time()

    # extract all coordinates at once instead of slicing a one-row frame per record
    points = dataset.iloc[:, used_dims].values.tolist()

    # pid -> row positions; the frames are built once per partition afterwards,
    # because growing a frame with pd.concat row by row is quadratic
    pid_positions = {}
    for position, coords in enumerate(points):
        point_border = tuple(coords + coords)
        overlap_pids = list(partition_index.intersection(point_border)) # should only contain 1
        if len(overlap_pids) == 0:
            print('error point_border: ',point_border)
            print('error partition index size: ', partition_index.get_size())
            continue
        pid_positions.setdefault(overlap_pids[0], []).append(position)

    # same rows, order and index labels as concatenating the one-row slices
    pid_pdf_dict = {pid: dataset.iloc[positions] for pid, positions in pid_positions.items()}

    routing_time = time.time()

    # persist them in HDFS
    for pid, pdf in pid_pdf_dict.items():
        path = hdfs_path + 'partition_' + str(pid) + '.parquet'
        df = sqlContext.createDataFrame(pdf)
        with lock_dict[pid]: # released even when the write raises
            df.write.mode('append').parquet(path)

    persist_time = time.time()

    if print_execution_time:
        print(pid_pdf_dict)
        print('data routing time: ', routing_time-start_time)
        print('data persist time: ', persist_time-routing_time)

In [6]:
# def batch_data(raw_data_path, chunk_size, used_dims, partition_index, column_names, hdfs_path):
#     begin_time = time.time()
    
#     chunk_count = 0
#     for chunk in pd.read_csv(raw_data_path, chunksize=chunk_size):
#         print('current chunk: ', chunk_count)
#         chunk.columns = column_names
#         route_data_2_partition(chunk, used_dims, partition_index, hdfs_path)
#         chunk_count += 1
    
#     finish_time = time.time()
#     print('total data routing and persisting time: ', finish_time - begin_time)

In [7]:
# try multi-thread
import threading

class myThread(threading.Thread):
    '''Worker thread that routes one chunk through route_data_2_partition.'''

    def __init__(self, thread_id, name, counter, parameters, lock_dict):
        '''
        thread_id: id printed in the start/exit messages
        name: thread name, also printed in the start/exit messages
        counter: stored on the instance as given
        parameters: [chunk, used_dims, partition_index, hdfs_path]
        lock_dict: handed to route_data_2_partition together with the parameters
        '''
        super().__init__()
        self.thread_id, self.name, self.counter = thread_id, name, counter
        self.parameters, self.lock_dict = parameters, lock_dict

    def run(self):
        '''Unpack the stored parameters, route the chunk and report start and exit.'''
        label = (self.thread_id, self.name)
        print('start thread: ', *label)
        records, dims, pidx, target_path = self.parameters
        route_data_2_partition(records, dims, pidx, target_path, self.lock_dict)
        print('exit thread: ', *label)


def batch_data_parallel(raw_data_path, chunk_size, used_dims, partition_index, column_names, 
                        hdfs_path, lock_dict, max_threads = 8):
    '''
    Read the raw csv chunk by chunk and route the chunks with worker threads,
    at most max_threads of them at a time.

    parameters:
    @raw_data_path: csv file, read with pd.read_csv
    @chunk_size: number of rows per chunk
    @used_dims: positional indices of the columns used as point coordinates
    @partition_index: a list of partition indexes, one per worker slot
    @column_names: names assigned to the columns of every chunk
    @hdfs_path: handed to the workers
    @lock_dict: handed to the workers
    @max_threads: upper bound of concurrently running workers; capped at
                  len(partition_index) because every slot needs its own index

    raises ValueError when there is no partition index or max_threads < 1
    '''
    if len(partition_index) == 0:
        raise ValueError('partition_index must contain at least one index')
    if max_threads < 1:
        raise ValueError('max_threads must be at least 1')
    # a slot id beyond the list would raise IndexError on partition_index[tid]
    max_threads = min(max_threads, len(partition_index))

    begin_time = time.time()

    threads = []
    for count, chunk in enumerate(pd.read_csv(raw_data_path, chunksize=chunk_size)):
        print('current chunk: ', count)
        tid = count % max_threads
        chunk.columns = column_names
        parameters = [chunk, used_dims, partition_index[tid], hdfs_path]
        thread = myThread(tid, 'thread_'+str(tid)+'_'+str(count), count, parameters, lock_dict)
        thread.start()
        threads.append(thread)

        # all slots are busy: wait for the whole batch before reusing the indexes
        if tid == max_threads-1:
            for t in threads:
                t.join()
            threads = []
            print('===================================================')

    # the last batch is usually only partially filled; without this join the
    # function would return (and report its time) while workers are still writing
    for t in threads:
        t.join()

    finish_time = time.time()
    print('total data routing and persisting time: ', finish_time - begin_time)

In [8]:
def kdnode_2_border(kdnode):
    '''
    Convert the domains of a kdnode into a border tuple.

    kdnode[0] holds one [lower, upper] pair per dimension; the result lists all
    lower bounds first and all upper bounds after them (non interleaved).
    '''
    lows, highs = [], []
    for bounds in kdnode[0]:
        lows.append(bounds[0])
        highs.append(bounds[1])
    return tuple(lows) + tuple(highs)

def load_partitions_from_file(path):
    '''
    Load the partitions (kdnodes) stored as comma separated rows.

    the loaded stretched_kdnodes: [num_dims, l1,l2,...,ln, u1,u2,...,un, size, id, pid, left_child_id, right_child_id]
    (qd-tree partitions do not have the last 4 attributes)

    returns: a list of kdnodes, each being [domains, size] or
             [domains, size, id, pid, left_child_id, right_child_id] where
             domains = [[l1,u1], ..., [ln,un]]; the values are the floats parsed
             by genfromtxt. num_dims is taken from the first row only.
             An empty input gives an empty list.
    '''
    # genfromtxt returns a 1-d array for a file with a single line; force the
    # (rows, columns) layout that the indexing below relies on
    stretched_kdnodes = np.atleast_2d(genfromtxt(path, delimiter=','))
    if stretched_kdnodes.size == 0:
        return []

    num_dims = int(stretched_kdnodes[0,0])
    kdnodes = []

    for node in stretched_kdnodes:
        domains = [ [node[k+1], node[1+num_dims+k]] for k in range(num_dims) ]
        row = [domains, node[2*num_dims+1]]

        # to be compatible with qd-tree's partition, that do not have the last 4 attributes
        if len(node) > 2*num_dims+2:
            row.extend(node[-4:])

        kdnodes.append(row)

    return kdnodes

In [9]:
def data_routing(data_path, partition_path, num_threads,
                chunk_size, used_dims, column_names, hdfs_path):
    '''
    Load the partitions, build the partition indexes and locks, then route the
    whole raw data file with batch_data_parallel.

    parameters:
    @data_path: csv file with the raw data
    @partition_path: file read by load_partitions_from_file
    @num_threads: number of worker slots; one rtree index is built per slot and
                  the same number is used as max_threads of batch_data_parallel
    @chunk_size: number of rows per chunk
    @used_dims: positional indices of the columns used as point coordinates
    @column_names: names assigned to the columns of every chunk
    @hdfs_path: handed to batch_data_parallel

    raises ValueError when num_threads < 1
    '''
    if num_threads < 1:
        raise ValueError('num_threads must be at least 1')

    partitions = load_partitions_from_file(partition_path)
    borders = [kdnode_2_border(partition) for partition in partitions]

    # one lock per partition id, shared by all threads
    lock_dict = {pid: threading.Lock() for pid in range(len(partitions))}

    p = index.Property()
    p.leaf_capacity = 100 # cannot be less than 100, indicate the maximum capacity
    p.fill_factor = 0.5
    p.overwrite = True

    pidxs = [] # the rtree index has problem in multi-threading, create an index for each thread
    for _ in range(num_threads):
        partition_index = index.Index(properties = p)
        for pid, border in enumerate(borders):
            # qd-tree do not have an id attribute, so the position is used as id
            partition_index.insert(pid, border)
        pidxs.append(partition_index)

    # max_threads must match the number of indexes: with the default of 8 slots
    # any smaller num_threads ends in an IndexError on the index list
    batch_data_parallel(data_path, chunk_size, used_dims, pidxs, column_names, hdfs_path, lock_dict,
                        max_threads = num_threads)

In [10]:
# = = = Execution = = =
# Run configuration for the routing of the whole raw data file.
# NOTE(review): the local paths and the HDFS URL below are machine specific.
data_path = '/home/cloudray/Downloads/TPCH_12M_8Field.csv' # raw csv, read chunk by chunk
partition_path = '/home/cloudray/NORA_Partitions/nora_partitions' # partition file for load_partitions_from_file
num_threads = 8 # data_routing builds one rtree index per thread
chunk_size = 100000 # rows per csv chunk
used_dims = [1,2] # positional column indices used as the point coordinates
column_names = ['_c0', '_c1', '_c2', '_c3', '_c4', '_c5', '_c6', '_c7'] # assigned to every chunk
hdfs_path = 'hdfs://localhost:9000/user/cloudray/NORA/' # prefix of the partition_<pid>.parquet outputs

# routes and persists the whole file; progress is printed per chunk and per thread
data_routing(data_path, partition_path, num_threads, chunk_size, used_dims, column_names, hdfs_path)

current chunk:  0
start thread:  0 thread_0_0
current chunk:  1
start thread:  1 thread_1_1
current chunk:  2
start thread:  2 thread_2_2
current chunk:  3
start thread:  3 thread_3_3
current chunk:  4
start thread:  4 thread_4_4
current chunk:  5
start thread:  5 thread_5_5
current chunk:  6
start thread:  6 thread_6_6
current chunk:  7
start thread:  7 thread_7_7
exit thread:  7 thread_7_7
exit thread:  2 thread_2_2
exit thread:  4 thread_4_4
exit thread:  6 thread_6_6
exit thread:  0 thread_0_0
exit thread:  5 thread_5_5
exit thread:  1 thread_1_1
exit thread:  3 thread_3_3
current chunk:  8
start thread:  0 thread_0_8
current chunk:  9
start thread:  1 thread_1_9
current chunk:  10
start thread:  2 thread_2_10
current chunk:  11
start thread:  3 thread_3_11
current chunk:  12
start thread:  4 thread_4_12
current chunk:  13
start thread:  5 thread_5_13
current chunk:  14
start thread:  6 thread_6_14
current chunk:  15
start thread:  7 thread_7_15
exit thread:  1 thread_1_9
exit thre

current chunk:  103
start thread:  7 thread_7_103
exit thread:  2 thread_2_98
exit thread:  4 thread_4_100
exit thread:  6 thread_6_102
exit thread:  1 thread_1_97
exit thread:  7 thread_7_103
exit thread:  0 thread_0_96
exit thread:  3 thread_3_99
exit thread:  5 thread_5_101
current chunk:  104
start thread:  0 thread_0_104
current chunk:  105
start thread:  1 thread_1_105
current chunk:  106
start thread:  2 thread_2_106
current chunk:  107
start thread:  3 thread_3_107
current chunk:  108
start thread:  4 thread_4_108
current chunk:  109
start thread:  5 thread_5_109
current chunk:  110
start thread:  6 thread_6_110
current chunk:  111
start thread:  7 thread_7_111
exit thread:  0 thread_0_104
exit thread:  3 thread_3_107
exit thread:  2 thread_2_106
exit thread:  6 thread_6_110
exit thread:  5 thread_5_109
exit thread:  1 thread_1_105
exit thread:  7 thread_7_111
exit thread:  4 thread_4_108
current chunk:  112
start thread:  0 thread_0_112
current chunk:  113
start thread:  1 thr

In [11]:
# = = = Unit Test = = =
# # path = '/home/cloudray/NORA_Partitions/nora_partitions'
# path = '/home/cloudray/NORA_Partitions/qd_tree_partitions'

# partitions = load_partitions_from_file(path)

# for i in range(len(partitions)):
#     if partitions[i][0][0][0] > partitions[i][0][0][1] or partitions[i][0][1][0] > partitions[i][0][1][1]:
#         print('found invalid position: ',i, partitions[i])

# p = index.Property()
# p.leaf_capacity = 100 # cannot be less than 100, indicate the maximum capacity
# p.fill_factor = 0.5
# p.overwrite = True

# # partition_index = index.Index(properties = p)
# lock_dict = {}

# # the rtree index has some problem in multi-threading.
# # that's why we need to create multiple index here, each for 1 thread
# pidxs = []
# for k in range(8):
#     partition_index = index.Index(properties = p)
#     for i in range(len(partitions)):
#         #partition_index.insert(int(partitions[i][-4]), kdnode_2_border(partitions[i])) # qd-tree do not have this
#         partition_index.insert(i, kdnode_2_border(partitions[i]))
#         lock_dict.update({i:threading.Lock()})
#     pidxs.append(partition_index)
    
# raw_data_path = '/home/cloudray/Downloads/TPCH_12M_8Field.csv'
# chunk_size = 10000
# column_names = ['_c0', '_c1', '_c2', '_c3', '_c4', '_c5', '_c6', '_c7'] # handle this
# hdfs_path = 'hdfs://localhost:9000/user/cloudray/NORA/'
# partition_and_query_dims = [1,2]

# # batch_data(raw_data_path, chunk_size, partition_and_query_dims, partition_index, column_names, hdfs_path)
# print(partition_index.get_size())

# raw_data_path = '/home/cloudray/Downloads/TPCH_12M_8Field.csv'
# chunk_size = 10000
# column_names = ['_c0', '_c1', '_c2', '_c3', '_c4', '_c5', '_c6', '_c7'] # handle this
# hdfs_path = 'hdfs://localhost:9000/user/cloudray/NORA/'
# partition_and_query_dims = [1,2]

In [12]:
# = = = Unit Test = = =
# # batch_data_parallel(raw_data_path, chunk_size, partition_and_query_dims, 
# #                     partition_index, column_names, hdfs_path, lock_dict)

# batch_data_parallel(raw_data_path, chunk_size, partition_and_query_dims, 
#                     pidxs, column_names, hdfs_path, lock_dict)

In [13]:
# # = = = check correctness = = =
# df = sqlContext.read.parquet('hdfs://localhost:9000/user/cloudray/NORA/partition_19.parquet')
# print(df)
# df.head(10)

In [14]:
# # = = = check correctness = = =
# # check correctness for Rtree index, where the pos is reported no partition overlap
# overlap_pids = list(partition_index.intersection((118449.0, 8460.0, 118449.0, 8460.0)))
# print(overlap_pids)
# print(partition_index.leaves())