In [None]:
'''
D = []
def case_generator(n):
    for i in range(0,n):
        if i <100:
            D.append(randint(1,100))
        else:
            break
case_generator(30)
'''

# 1. Data Partitioning
## 1.1 Round-Robin Partitioning

In [None]:
from random import *
D = [55,30,68,39,1,
     4,49,90,34,76, 
     82,56,31,25,78, 
     56,38,32,88,9, 
     44,98,11,70,66, 
     89,99,22,23,26]

In [None]:
def rr_partition(data, n):
    """
    Round-robin data partitioning ("equal-partitioning").

    Records are dealt out one at a time to the processors in turn, so the
    partition sizes differ by at most one record.

    Arguments:
    data -- an iterable of records
    n -- the number of processors (partitions)
    Return:
    a list of n lists; record number i ends up in partition i % n
    """
    # One empty bucket per processor.
    partitions = [[] for _ in range(n)]
    # The position modulo n cycles 0, 1, ..., n-1, 0, 1, ...
    for position, record in enumerate(data):
        partitions[position % n].append(record)
    return partitions

rr_partition(D,3)

## 1.2 Hash Partitioning

In [None]:
'''
Hash data partition:
    Partitioning based on a particular attribute using 
    a hash function. all records within a partition have
    the same hash value.
'''
def hash_func(x, n):
    """Return the remainder of x divided by n, used as the hash value of x."""
    remainder = x % n
    return remainder

def h_partition(data, n):
    """
    Hash data partitioning backed by sets.

    Arguments:
    data -- an iterable of hashable records
    n -- the number of processors, passed to hash_func as the modulus
    Return:
    a dict mapping each hash value that occurs to the set of records
    with that hash value; duplicated records are stored only once
    """
    buckets = {}
    for record in data:
        # setdefault creates the bucket the first time a hash value is seen,
        # so no explicit "key already present?" branch is needed.
        buckets.setdefault(hash_func(record, n), set()).add(record)
    return buckets
print(D)
results = h_partition(D, 3)
# Show the records of each hash bucket, one bucket per line.
for bucket in results.values():
    print(bucket)

In [None]:
def h_partition_listVersion(data, n):
    """
    Hash data partitioning backed by lists.

    Unlike the set-based h_partition, duplicated records are kept and the
    records of a bucket stay in their input order.

    Arguments:
    data -- an iterable of records
    n -- the number of processors, passed to hash_func as the modulus
    Return:
    a dict mapping each hash value that occurs to the list of records
    with that hash value
    """
    buckets = {}
    for record in data:
        # setdefault creates the bucket the first time a hash value is seen.
        buckets.setdefault(hash_func(record, n), []).append(record)
    return buckets
print(D)
h_partition_listVersion(D, 3)

## 1.3 Range Partitioning

In [None]:
def range_partition(data, range_indices):
    """
    Range data partitioning.

    The data is sorted and cut at every boundary in range_indices. With
    boundaries [40, 60] the partitions hold the records in
    (-inf, 40), [40, 60) and [60, +inf) respectively.

    Arguments:
    data -- an iterable of mutually comparable records (left unmodified)
    range_indices -- the list of boundaries; each boundary is the exclusive
                     upper bound of one partition
    Return:
    a list of len(range_indices) + 1 sorted lists. With no boundaries the
    result is a single partition holding all of the sorted data.
    """
    # Function-scope import so this cell runs on its own.
    from bisect import bisect_left

    remaining = sorted(data)  # sorted copy; the caller's data is untouched
    result = []
    for upper_bound in range_indices:
        # In a sorted list, bisect_left returns how many records are
        # strictly below the boundary. Cutting by count (rather than by
        # looking up the last value) stays correct when duplicated records
        # sit exactly on a boundary.
        cut = bisect_left(remaining, upper_bound)
        result.append(remaining[:cut])
        remaining = remaining[cut:]
    # Whatever is left is >= the last boundary (or is everything when no
    # boundary was given) and forms the final partition.
    result.append(remaining)
    return result

In [None]:
range_partition(D,[40,70])

## 1.4 Random-Unequal Partitioning

In [None]:
def ru_partition(data, n):
    '''
    Perform random-unequal data partitioning on data.

    A record is assigned to the partition numbered
    (sum of the decimal digits of the record) % n, so partition sizes are
    generally unequal.

    Arguments:
    data -- an input dataset which is a list
    n -- the number of processors
    Return:
    result -- the partitioned subsets of data, as a list of n lists
    '''
    result = [[] for _ in range(n)]
    for element in data:
        # Partition rule: sum of the decimal digits. Non-digit characters
        # such as a minus sign or a decimal point are skipped so negative
        # numbers and floats do not raise ValueError.
        digit_sum = sum(int(ch) for ch in str(element) if ch.isdecimal())
        result[digit_sum % n].append(element)
    return result

In [None]:
def ru_partition_setVersion(data, n):
    '''
    Perform random-unequal data partitioning on data, storing each
    partition as a set.

    A record is assigned to the partition numbered
    (sum of the decimal digits of the record) % n. Duplicated records are
    stored once, and only partitions that received a record appear in the
    result.

    Arguments:
    data -- an input dataset which is a list of hashable records
    n -- the number of processors
    Return:
    result -- a dict mapping partition number to the set of its records
    '''
    result = {}
    for element in data:
        # Partition rule: sum of the decimal digits. Non-digit characters
        # such as a minus sign or a decimal point are skipped so negative
        # numbers and floats do not raise ValueError.
        digit_sum = sum(int(ch) for ch in str(element) if ch.isdecimal())
        result.setdefault(digit_sum % n, set()).add(element)
    return result

In [None]:
rr_partition(D,3)

In [None]:
ru_partition(D,3)

In [None]:
ru_partition_setVersion(D,3)
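# Note:
# 1. A set holds no duplicate records.
# 2. A set is unordered: small integers may happen to be displayed in
#    ascending order, but that is an implementation detail and must not
#    be relied upon.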

# 2. Search Algorithms
## 2.1 Linear Search

In [1]:
def linear_search(data, key):
    '''
    Linear search (exhaustive search).

    Arguments:
    data -- an input dataset: a list, a numpy array or any other iterable
    key -- a query record
    Return:
    (position, matched_record) -- the position of the first record equal
    to key together with that record, or (-1, 'Not Found') when no record
    matches
    '''
    # enumerate yields the position together with the record, so the data
    # is scanned only once and does not need an .index() method.
    for position, record in enumerate(data):
        if record == key:
            return (position, record)
    return (-1, 'Not Found')

In [2]:
D = [1,2,4,2,5,1]
linear_search(D, 1)

(0, 1)

## 2.2 Binary Search

In [None]:
def binarySearch(alist, record):
    """
    Perform binary search on data for the given key.

    The search runs on a sorted copy of alist, but the position that is
    reported refers to the original, unsorted alist.

    Arguments:
    alist -- an input dataset which is a list
    record -- a query record
    Return:
    (position, record) -- the position of the first occurrence of record
    in alist together with the record, or (-1, 'Not Found') when it is
    absent. An empty alist also yields (-1, 'Not Found'), so callers can
    always unpack or index the result.
    """
    if not alist:
        print("Input Error")
        return -1, 'Not Found'
    print(alist)
    data = sorted(alist)
    # Narrow the inclusive window [low, high] instead of copying a slice of
    # the data on every step.
    low, high = 0, len(data) - 1
    while low <= high:
        mid = (low + high) // 2
        if data[mid] == record:
            return alist.index(record), record
        if data[mid] < record:
            # go right: drop the left half including mid
            low = mid + 1
        else:
            # go left: drop the right half including mid
            high = mid - 1
    return -1, 'Not Found'

In [None]:
binarySearch(D,31)

# 3. Parallel Search Algorithms
## 3.1 Parallel Searching for Exact Match

In [None]:
from multiprocessing import Pool

In [None]:
def parallel_search_exact(data, query, n_processor, m_partition, m_search,
                          range_indices=None):
    """
    Perform parallel search for exact match on data for the given key.

    Arguments:
    data -- an input dataset which is a list
    query -- a query record
    n_processor -- the number of parallel processors
    m_partition -- a data partitioning method
    m_search -- a search method
    range_indices -- boundaries handed to range_partition; only used when
                     m_partition is range_partition, defaults to [40, 80]
    Return:
    results -- the matched record information: a list with one m_search
    result per searched partition. It is empty when range or hash
    partitioning shows that no partition can contain the query.
    """
    results = []
    print("data partitioning:" + str(m_partition.__name__))
    print("searching method:" + str(m_search.__name__))
    print("Query: " + str(query))
    # Pool enables parallel processing with at most n_processor worker
    # processes. The with-block shuts the workers down on exit, including
    # when a search raises, so no processes are leaked.
    with Pool(processes=n_processor) as pool:
        if m_partition == range_partition:
            # Range partitioning: only the range holding the query needs
            # to be searched.
            if range_indices is None:
                range_indices = [40, 80]  # two boundaries -> three ranges
            DD = m_partition(data, range_indices)
            for d in DD:  # Find the range that may contain the query
                if query in d:
                    print("Found corresponding range:")
                    print(d)
                    results.append(pool.apply(m_search, [list(d), query]))
                    break
        elif m_partition == h_partition:
            # Hash partitioning: the hash value of the query names the
            # only partition that can hold it.
            DD = m_partition(data, n_processor)
            print(DD)
            query_hash = hash_func(query, n_processor)
            print("Query Hash value: " + str(query_hash))
            # A hash value no record maps to has no partition at all; the
            # query cannot be in the data then, so nothing is searched.
            if query_hash in DD:
                d = list(DD[query_hash])
                print("Found corresponding partition:")
                print(d)
                results.append(pool.apply(m_search, [d, query]))
        else:
            # Round-robin or random-unequal partitioning: the query may be
            # anywhere, so every partition is searched. pool.apply() would
            # block until each search finishes; apply_async() submits all
            # searches first so they can run concurrently, and get() then
            # collects the results in partition order.
            DD = m_partition(data, n_processor)
            pending = [pool.apply_async(m_search, [d, query]) for d in DD]
            results = [task.get() for task in pending]
    return results

In [None]:
data = sorted(D)

In [None]:
# round-robin partition, linear_search
parallel_search_exact(data, 31, 3, rr_partition, linear_search)

In [None]:
# round-robin partition, binary_search
parallel_search_exact(data, 31, 3, rr_partition, binarySearch)

In [None]:
# random-unequal partition, linear_search
parallel_search_exact(data, 31, 3, ru_partition, linear_search)

In [None]:
# random-unequal partition, binary_search 
parallel_search_exact(data, 31, 3, ru_partition, binarySearch)

In [None]:
# Hash partition, linear_search 
parallel_search_exact(data, 31, 3, h_partition, linear_search)

In [None]:
# Hash partition, binary_search 
parallel_search_exact(data, 31, 3, h_partition, binarySearch)

In [None]:
# Range partition, linear_search 
parallel_search_exact(data, 31, 3, range_partition,linear_search)

In [None]:
# Range partition, binary_search
parallel_search_exact(data, 31, 3, range_partition,binarySearch)

## 3.2 Parallel Searching for Range Selection

In [None]:
'''
Build a parallel search algorithm that uses the linear search
algorithm (i.e. linear_search()) and is able to work with the
hash partitioning method (i.e. h_partition()).
'''
from multiprocessing import Pool

In [None]:
def parallel_search_range(data, query_range, n_processor):
    """
    Perform a parallel range search over hash-partitioned data.

    Every integer in the inclusive range is looked up with linear_search
    in the hash partition that can contain it.

    Arguments:
    data -- an input dataset which is a list
    query_range -- [low, high], the inclusive integer bounds of the range
    n_processor -- the number of parallel processors (and hash partitions)
    Return:
    results -- a list with the (position, record) tuple of every match,
    ordered by hash partition and then by record value; position is the
    index inside that partition
    Raise:
    ValueError -- if low is greater than high
    """
    low, high = query_range[0], query_range[1]
    # Validate before any worker process is started.
    if low > high:
        raise ValueError("Input Error")
    DD = h_partition(data, n_processor)
    results = []
    # The with-block shuts the worker processes down on exit.
    with Pool(processes=n_processor) as pool:
        pending = []
        for i in range(n_processor):
            bucket = DD.get(i)
            if not bucket:
                # No record hashes to i, so this partition does not exist.
                continue
            records = list(bucket)
            for query in range(low, high + 1):
                # A value can only be stored in the partition named by its
                # own hash value; searching any other partition is wasted.
                if hash_func(query, n_processor) != i:
                    continue
                task = pool.apply_async(linear_search, [records, query])
                pending.append((i, task))
        # All searches are submitted by now; collect them in order.
        for i, task in pending:
            result = task.get()
            if result[0] != -1:
                print("Found " + str(result[1])
                      + " from hash: " + str(i))
                results.append(result)
    return results

results = parallel_search_range(data, [30, 40], 3) 
print(results)