In [None]:
import findspark
findspark.init() # this must be executed before the below import

In [None]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark import SparkFiles

In [None]:
import pandas as pd
import time

In [6]:
# Consider using a partition_rtree_index to find the partition each record belongs to.
# This version is single-threaded.
def record_2_border(row, used_dims):
    '''
    Turn a single point into the border tuple used to query the rtree index.

    row is a one-row pandas DataFrame (a point); only the columns at the
    positions listed in used_dims are kept. The selected coordinates are
    repeated twice and returned as one flat tuple, e.g. (x, y) -> (x, y, x, y).
    '''
    # .values is 2-D for a DataFrame slice; its first (only) row is the point.
    coordinates = row.iloc[:, used_dims].values[0].tolist()
    return tuple(coordinates * 2)
   
def route_data_2_partition(dataset, used_dims, partition_index, column_names, hdfs_path, 
                           print_execution_time=False):
    '''
    Route every record of a pandas chunk to its partition and append each
    partition's records to a parquet location under hdfs_path.

    parameters:
    @dataset: should be in the form of pandas dataframe here
    @used_dims: positions of the columns that form the point passed to the index
    @partition_index: the index of partitions; must expose intersection(border)
    @column_names: a list of the column name str, like ['_c0','_c1','_c2'];
                   assigned to every partition frame, so it needs one name per column
    @hdfs_path: prefix the partition file name is appended to as-is
                (so it is expected to end with '/')
    @print_execution_time: when True, print the routed frames and both timings

    raises:
    IndexError if a record does not intersect any partition of the index.

    Relies on the notebook-level names record_2_border and sqlContext being
    defined before this function is called.
    '''
    start_time = time.time()

    # Collect the row positions of each partition first and materialise every
    # partition frame once afterwards; growing a DataFrame with pd.concat per
    # record is quadratic in the number of records of a partition.
    pid_positions = {}
    for position in range(len(dataset)):
        record = dataset.iloc[position:position + 1] # row shape = (1, n_dims)
        point_border = record_2_border(record, used_dims)
        overlap_pids = list(partition_index.intersection(point_border)) # should only contain 1
        if not overlap_pids:
            raise IndexError(
                'record at position {} does not intersect any partition'.format(position))
        # if several partitions overlap, the first one reported by the index wins
        pid_positions.setdefault(overlap_pids[0], []).append(position)

    # Rows keep their original order and index labels inside each partition.
    pid_pdf_dict = {pid: dataset.iloc[positions]
                    for pid, positions in pid_positions.items()}

    routing_time = time.time()

    # persist them in HDFS
    for pid, pdf in pid_pdf_dict.items():
        partition_name = 'partition_' + str(pid)+'.parquet'
        path = hdfs_path + partition_name
        pdf.columns = column_names
        df = sqlContext.createDataFrame(pdf)
        # append, so records routed from later chunks accumulate in the same location
        df.write.mode('append').parquet(path)

    persist_time = time.time()

    if print_execution_time:
        print(pid_pdf_dict)
        print('data routing time: ', routing_time-start_time)
        print('data persist time: ', persist_time-routing_time)

In [None]:
def batch_data(raw_data_path, chunk_size, used_dims, partition_index, column_names, hdfs_path):
    '''
    Read a CSV file chunk by chunk and route/persist every chunk sequentially.

    parameters (same order as batch_data_parallel):
    @raw_data_path: path of the CSV file to read
    @chunk_size: number of rows per pandas chunk
    @used_dims: positions of the columns used to locate a record's partition
    @partition_index: the index of partitions
    @column_names: a list of the column name str, like ['_c0','_c1','_c2']
    @hdfs_path: prefix under which the partition parquet data is written

    used_dims is required by route_data_2_partition, so it has to be accepted
    here and forwarded; without it every routing call fails with a TypeError.
    Prints the total elapsed wall-clock time when done.
    '''
    begin_time = time.time()
    for chunk in pd.read_csv(raw_data_path, chunksize=chunk_size):
        # process this chunk of data
        route_data_2_partition(chunk, used_dims, partition_index, column_names, hdfs_path)

    finish_time = time.time()
    print('total data routing and persisting time: ', finish_time - begin_time)

In [1]:
# try multi-thread

import threading

class myThread(threading.Thread):
    '''
    Worker thread that routes and persists one chunk of records.

    parameters:
    @thread_id: slot number of this worker, only used in the log lines
    @name: thread name, stored through threading.Thread's name attribute
    @counter: running number of the chunk handled by this worker
    @parameters: the 5-tuple (chunk, used_dims, partition_index, column_names,
                 hdfs_path) forwarded to route_data_2_partition
    '''
    def __init__(self, thread_id, name, counter, parameters):
        threading.Thread.__init__(self)
        self.thread_id = thread_id
        self.name = name
        self.counter = counter
        self.parameters = parameters

    def run(self):
        # thread_id and name live on the instance; the bare names are not
        # defined inside run() and would raise NameError when the thread starts
        print('start thread: ', self.thread_id, self.name)
        chunk, used_dims, partition_index, column_names, hdfs_path = self.parameters
        route_data_2_partition(chunk, used_dims, partition_index, column_names, hdfs_path)
        print('exit thread: ', self.thread_id, self.name)

# Size of one batch of worker threads: batch_data_parallel numbers the workers
# modulo this value and joins the started threads once the last slot is used.
max_threads = 8

def batch_data_parallel(raw_data_path, chunk_size, used_dims, partition_index, column_names, hdfs_path):
    '''
    Read a CSV file chunk by chunk and route/persist the chunks with worker
    threads, at most max_threads of them per batch.

    parameters:
    @raw_data_path: path of the CSV file to read
    @chunk_size: number of rows per pandas chunk
    @used_dims: positions of the columns used to locate a record's partition
    @partition_index: the index of partitions
    @column_names: a list of the column name str, like ['_c0','_c1','_c2']
    @hdfs_path: prefix under which the partition parquet data is written

    Every started thread is joined before the elapsed time is printed,
    including the threads of a last, incomplete batch.
    '''
    begin_time = time.time()

    count = 0
    threads = []
    for chunk in pd.read_csv(raw_data_path, chunksize=chunk_size):
        # process this chunk of data
        tid = count % max_threads
        # the worker unpacks exactly these five values in run()
        parameters = (chunk, used_dims, partition_index, column_names, hdfs_path)
        thread = myThread(tid, 'thread_'+str(tid)+'_'+str(count), count, parameters)
        thread.start()
        threads.append(thread)
        count += 1

        # a full batch is running: wait for it before reading further chunks
        if len(threads) >= max_threads:
            for t in threads:
                t.join()
            threads = []

    # the number of chunks is rarely a multiple of max_threads; wait for the
    # remaining workers so the reported time covers all of the work
    for t in threads:
        t.join()

    finish_time = time.time()
    print('total data routing and persisting time: ', finish_time - begin_time)

In [None]:
# Driver cell: configure one routing run and execute it sequentially.
# NOTE(review): the input path is absolute and machine-specific.
raw_data_path = '/home/cloudray/Downloads/TPCH_12M_8Field.csv'
# number of CSV rows read and routed per pandas chunk
chunk_size = 10000
# route_data_2_partition assigns this list to every partition frame's columns,
# so it needs one name per CSV column -- TODO confirm; the file name suggests
# more than two fields.
column_names = ['_c0', '_c1'] # handle this
# prefix the partition file names are appended to
hdfs_path = 'hdfs://localhost:9000/user/cloudray/NORA/'
# column positions used to build the point that is looked up in the index
partition_and_query_dims = [1,2]

# partition_index is not assigned in this cell; it must already exist in the kernel.
# Arguments are positional: path, chunk size, used dims, index, column names, HDFS prefix.
batch_data(raw_data_path, chunk_size, partition_and_query_dims, partition_index, column_names, hdfs_path)

In [5]:
# we do not need to persist the rtree index, but only the partition layout
# we can then generate the index at run time, which is not costly

TypeError: list indices must be integers or slices, not list