In [None]:
def decorator_1(func):
    """Wrap ``func`` so that 'dec1' is printed before and after each call.

    The wrapper forwards any positional/keyword arguments to ``func`` and
    returns whatever ``func`` returns. If ``func`` raises, the trailing
    'dec1' is not printed and the exception propagates.
    """
    import functools  # local import keeps this cell self-contained

    @functools.wraps(func)  # keep func's __name__/__doc__ on the wrapper
    def func_wrapper(*args, **kwargs):
        print('dec1')
        result = func(*args, **kwargs)
        print('dec1')
        return result
    return func_wrapper

def decorator_2(func):
    """Wrap ``func`` so that 'dec2' is printed before and after each call.

    The wrapper forwards any positional/keyword arguments to ``func`` and
    returns whatever ``func`` returns. If ``func`` raises, the trailing
    'dec2' is not printed and the exception propagates.
    """
    import functools  # local import keeps this cell self-contained

    @functools.wraps(func)  # keep func's __name__/__doc__ on the wrapper
    def func_wrapper(*args, **kwargs):
        print('dec2')
        result = func(*args, **kwargs)
        print('dec2')
        return result
    return func_wrapper

# Decorators apply bottom-up: decorator_1 wraps function_a first, then
# decorator_2 wraps that result, so decorator_2 is the outermost layer.
@decorator_2
@decorator_1
def function_a(x):
    """Print ``x``."""
    print(x)

# Prints: dec2, dec1, 1, dec1, dec2 (one per line).
function_a(1)

In [None]:
def temp_f(name1):
    """Print the greeting ``hello <name1>``."""
    greeting = 'hello {}'.format(name1)
    print(greeting)

def mapper_f(kwargs):
    """Call ``temp_f`` with the entries of the dict ``kwargs`` expanded as keyword arguments."""
    outcome = temp_f(**kwargs)
    return outcome

# Both calls print "hello lee": the named helper and the inline lambda
# unpack the dict into keyword arguments in the same way.
mapper_f({'name1': 'lee'})
(lambda kwargs: temp_f(**kwargs))({'name1': 'lee'})

In [1]:
import time
import os
import ipyparallel as ipp
from collections import defaultdict
from tqdm import tqdm
import itertools
from datetime import datetime
import copy

import logging # logging can create duplicate entries if you don't reload logging
try:
    from importlib import reload # Python 3
except: # Python 2 reload is a builtin
    pass

class MultipleClusterEngine(object):
    """Spread a list of input files over several ipyparallel clusters.

    One ``ipcluster`` is started per entry of ``n_cpus_list``; its profile name
    is ``cluster_job_name`` followed by the cluster id. Each input file is sent
    to the next idle cluster, where ``function_to_process`` is mapped over one
    kwargs dict per engine. Every kwargs dict holds the entries of
    ``function_kwargs_dict`` plus ``input_file_name``, ``cluster_output_dir``,
    ``cluster_id``, ``n_cpus`` and ``cpu_id``.

    Cluster 0 takes files from the front of ``input_file_names``; all other
    clusters take files from the back.
    NOTE(review): this presumably expects the list to be ordered from largest
    to smallest file and cluster 0 to be the largest cluster -- confirm.

    Progress is written to ``status.log`` and processing errors to
    ``failure.log`` inside a freshly created ``<cluster_job_name><index>``
    directory under ``output_parent_dir``.

    Typical use: ``MultipleClusterEngine(...).main()``.
    """

    # Timing knobs; override on a subclass or an instance to tune them.
    CONNECT_ATTEMPTS = 3      # how many times start_cluster tries to connect
    CONNECT_RETRY_DELAY = 10  # seconds to wait before each connection attempt
    POST_KILL_DELAY = 5       # seconds to wait after stopping a cluster
    POLL_INTERVAL = 1         # seconds between checks for an idle cluster

    def __init__(self, cluster_job_name, n_cpus_list, input_file_names, output_parent_dir, function_to_process, function_kwargs_dict): # always put it in as a dictionary
        """Store the job configuration and validate it.

        Args:
            cluster_job_name: prefix for ipcluster profile names and for the
                output directory name.
            n_cpus_list: number of engines per cluster; the index in this list
                is the cluster id.
            input_file_names: files to process (must be non-empty).
            output_parent_dir: existing directory that receives the run's
                output directory.
            function_to_process: callable invoked with keyword arguments only.
            function_kwargs_dict: extra keyword arguments for
                ``function_to_process``.

        Raises:
            AssertionError: if a name, CPU list, or input file list is empty,
                or ``output_parent_dir`` is not an existing directory.
        """
        reload(logging) # start from a clean logging state to avoid duplicate log entries
        self.cluster_job_name = cluster_job_name
        self.n_cpus_list = n_cpus_list
        self.output_parent_dir = output_parent_dir
        self.input_file_names = input_file_names
        # map_async passes a single item per task, so adapt dict -> keyword arguments
        self.function_to_process = lambda kwargs: function_to_process(**kwargs)
        self.function_kwargs_dict = function_kwargs_dict

        assert cluster_job_name, "Needs cluster name"
        assert len(n_cpus_list) > 0, "Needs the number of CPUs per cluster"
        assert os.path.isdir(self.output_parent_dir), "Output directory doesn't exist"
        assert len(self.input_file_names) > 0, "Need input files"

        # used by engine
        self.client_dict = {}
        self.load_balanced_view_dict = {}
        self.async_results_dict = defaultdict(list) # collects all the async_results
        self.file_to_cluster_order_dict = defaultdict(list) # remembers which file is sent to which cluster
        self.cluster_indexes = None
        self.logger_status = None
        self.logger_failure = None
        self.start_time = None
        self.end_time = None
        self.cluster_output_dir = None

    @staticmethod
    def _run_ipcluster(*arguments):
        """Run ``ipcluster`` with the given arguments and return its exit status."""
        import subprocess # local import: only this helper needs it
        # An argument list (no shell) keeps the job name from being shell-interpreted.
        return subprocess.call(['ipcluster'] + list(arguments))

    def create_cluster_output_dir(self):
        """Create ``<output_parent_dir>/<cluster_job_name><index>`` and remember it.

        ``index`` is one more than the highest index already used by a
        directory named ``<cluster_job_name><digits>``, or 0 if there is none.
        The path is stored in ``self.cluster_output_dir``.
        """
        prefix = self.cluster_job_name
        used_indexes = []
        for name in os.listdir(self.output_parent_dir):
            if not os.path.isdir(os.path.join(self.output_parent_dir, name)):
                continue
            if not name.startswith(prefix):
                continue # belongs to some other job
            suffix = name[len(prefix):]
            # Only plain ASCII digits count as a run index.
            if suffix and all(ch in '0123456789' for ch in suffix):
                used_indexes.append(int(suffix))
        dir_index = max(used_indexes) + 1 if used_indexes else 0
        self.cluster_output_dir = os.path.join(self.output_parent_dir, prefix + str(dir_index))
        os.makedirs(self.cluster_output_dir)

    def create_logger(self, logger_name, log_file):
        """Make the logger ``logger_name`` write INFO-level records to ``log_file`` only."""
        named_logger = logging.getLogger(logger_name)
        # Drop handlers left over from an earlier activation so records are
        # neither duplicated nor written to a previous run's log file.
        for stale_handler in list(named_logger.handlers):
            named_logger.removeHandler(stale_handler)
            stale_handler.close()
        named_logger.addHandler(logging.FileHandler(log_file))
        named_logger.setLevel(logging.INFO)

    def activate_logger(self):
        """Set up the 'status' and 'failure' loggers inside ``self.cluster_output_dir``."""
        self.create_logger('status', os.path.join(self.cluster_output_dir, "status.log"))
        self.create_logger('failure', os.path.join(self.cluster_output_dir, "failure.log"))
        self.logger_status = logging.getLogger('status')
        self.logger_status.propagate = False
        self.logger_failure = logging.getLogger('failure')
        self.logger_failure.propagate = False

    def start_cluster(self, n_cpus, cluster_id):
        """Start one ipcluster and return a connected ``ipp.Client``.

        Args:
            n_cpus: number of engines to start.
            cluster_id: id appended to ``cluster_job_name`` to form the profile name.

        Raises:
            RuntimeError: if every connection attempt times out.
            Any other error raised by ``ipp.Client`` propagates unchanged.
        """
        profile = '{}{}'.format(self.cluster_job_name, cluster_id)
        self.logger_status.info("\tAttempting to start cluster job {}'s {}th cluster with {} CPUs".format(self.cluster_job_name, cluster_id, n_cpus))
        self._run_ipcluster('start', '--n={}'.format(n_cpus), '--profile={}'.format(profile), '--daemonize')

        for _ in range(self.CONNECT_ATTEMPTS):
            time.sleep(self.CONNECT_RETRY_DELAY) # give the engines time to come up
            try:
                client = ipp.Client(profile=profile)
            except ipp.error.TimeoutError:
                continue
            self.logger_status.info('\t\tCPU processes ready for action: {}'.format(client[:].apply_async(os.getpid).get()))
            return client
        raise RuntimeError('Could not connect to cluster {} after {} attempts'.format(
            profile, self.CONNECT_ATTEMPTS))

    def start_all_clusters(self):
        """Activate logging, start every cluster and record the start time."""
        self.activate_logger()
        self.logger_status.info('Starting Multiple Cluster Engine')
        for cluster_id, n_cpus in enumerate(self.n_cpus_list):
            self.client_dict[cluster_id] = self.start_cluster(n_cpus, cluster_id)
            self.load_balanced_view_dict[cluster_id] = self.client_dict[cluster_id].load_balanced_view()
        self.start_time = datetime.now()
        self.logger_status.info('All clusters started at {}'.format(self.start_time))
        # Endless round-robin over the cluster ids, consumed by run_clusters.
        self.cluster_indexes = itertools.cycle(sorted(self.load_balanced_view_dict))

    def kill_cluster(self, cluster_id): # use better arguments
        """Close the client of ``cluster_id`` and stop its ipcluster.

        The cluster is removed from ``client_dict`` and
        ``load_balanced_view_dict``.
        NOTE: ``self.cluster_indexes`` is not updated, so the id is still
        yielded by the round-robin iterator afterwards.

        Raises:
            KeyError: if ``cluster_id`` is not a currently tracked cluster.
        """
        client = self.client_dict[cluster_id]
        self.logger_status.info('\tAttempting to kill {}{} with CPU processes: {}'.format(
            self.cluster_job_name, cluster_id, client[:].apply_async(os.getpid).get()))
        self.load_balanced_view_dict.pop(cluster_id, None)
        client.close()
        # Forget the closed client so a repeated kill_all_clusters() skips it.
        self.client_dict.pop(cluster_id, None)
        self._run_ipcluster('stop', '--profile={}{}'.format(self.cluster_job_name, cluster_id))
        self.logger_status.info('\t\tCluster successfully killed')
        time.sleep(self.POST_KILL_DELAY)

    def kill_all_clusters(self):
        """Stop every tracked cluster, log the elapsed time and shut logging down.

        Safe to call again after the clusters are already gone: there is then
        nothing left to kill and only the summary is logged.
        """
        self.end_time = datetime.now()
        self.logger_status.info('Killing all clusters')
        for cluster_id in list(self.client_dict): # copy: kill_cluster removes entries
            self.kill_cluster(cluster_id)
        self.logger_status.info('All clusters have been killed')
        self.logger_status.info('Multiple Cluster Engine shut down at {}'.format(self.end_time))
        if self.start_time is not None: # clusters may never have been started
            # total_seconds() also counts whole days, unlike the .seconds field
            self.logger_status.info('Processed {} files in {} minutes'.format(
                len(self.input_file_names), (self.end_time - self.start_time).total_seconds() / 60.0))
        logging.shutdown()

    def create_kwargs_dict_list(self, input_file_name, cluster_id, n_cpus):
        """Build one independent kwargs dict per engine for a single input file.

        Returns:
            A list of ``n_cpus`` dicts. Each is a deep copy of
            ``self.function_kwargs_dict`` extended with ``input_file_name``,
            ``cluster_output_dir``, ``cluster_id``, ``n_cpus`` and a distinct
            ``cpu_id`` in ``range(n_cpus)``.
        """
        shared_entries = {'input_file_name': input_file_name,
                          'cluster_output_dir': self.cluster_output_dir,
                          'cluster_id': cluster_id,
                          'n_cpus': n_cpus}
        function_kwargs_dict_list = []
        for cpu_id in range(n_cpus):
            per_cpu_kwargs = copy.deepcopy(self.function_kwargs_dict)
            per_cpu_kwargs.update(shared_entries)
            per_cpu_kwargs['cpu_id'] = cpu_id
            function_kwargs_dict_list.append(per_cpu_kwargs)
        return function_kwargs_dict_list

    def check_if_function_in_cluster_failured(self, jth_cluster):
        """Log to the failure log if the latest job on ``jth_cluster`` raised.

        Does nothing when no file has been sent to that cluster yet.
        """
        results = self.async_results_dict[jth_cluster]
        if not results: # cluster just started, nothing was sent to it yet
            return
        exception = results[-1].exception()
        if exception is None:
            return
        # Fall back to the exception itself when it carries no arguments.
        detail = exception.args[0] if exception.args else exception
        self.logger_failure.info('{}th cluster has error {} on file {}'.format(
            jth_cluster, detail, self.file_to_cluster_order_dict[jth_cluster][-1]))

    def run_clusters(self):
        """Send every input file to the next idle cluster and wait for completion.

        Cluster 0 consumes ``input_file_names`` from the front, all other
        clusters consume it from the back. Before a cluster receives a new
        file, and once more after all work is finished, its latest job is
        checked for failure.
        """
        small_file_ctr = 1 # determine if you want to have queue or differently ordered queue
        big_file_ctr = 0

        for ith_file in tqdm(range(len(self.input_file_names))):
            for jth_cluster in self.cluster_indexes: # endless round-robin; left via break
                time.sleep(self.POLL_INTERVAL) # want to do expected log time lag / number of clusters
                ### insert code here to kill cluster if RAM usage too great, if possible log which file it was processing;
                ### it has to do a global search of all clusters' RAM usage
                ### would need a dictionary here to remember which cluster has which file; write to disk
                ### profiler would also write to disk CPU usage what level

                previous_results = self.async_results_dict[jth_cluster]
                if previous_results and not previous_results[-1].done():
                    continue # cluster is still busy, try the next one

                # if necessary, recreate engine here if cluster shut down
                # clear cluster memory
                self.check_if_function_in_cluster_failured(jth_cluster) # check if previous file failed to process

                if jth_cluster == 0: # Send large files to large cluster (ALWAYS has id == 0)
                    index = big_file_ctr
                    big_file_ctr += 1
                else: # Send small files to small clusters (ALWAYS have id > 0)
                    index = -small_file_ctr
                    small_file_ctr += 1

                input_file_name = self.input_file_names[index]
                kwargs_dict_list = self.create_kwargs_dict_list(
                    input_file_name,
                    jth_cluster,
                    len(self.client_dict[jth_cluster].ids))

                ### insert code to write results to file--it will only have start times, no end times
                async_result = self.load_balanced_view_dict[jth_cluster].map_async(
                    self.function_to_process, kwargs_dict_list)
                self.async_results_dict[jth_cluster].append(async_result)
                self.file_to_cluster_order_dict[jth_cluster].append(input_file_name)
                self.logger_status.info("{} is the {}th file and is sent to {}th cluster for processing".format(
                    input_file_name, ith_file, jth_cluster))
                break # file dispatched; move on to the next file

        # wait for all clusters to finish
        while not all(results[-1].done() for results in self.async_results_dict.values() if results):
            time.sleep(self.POLL_INTERVAL)
        # The in-loop check only looks at a cluster's previous file, so the
        # last file of every cluster still has to be checked here.
        for jth_cluster in sorted(self.async_results_dict):
            self.check_if_function_in_cluster_failured(jth_cluster)
        # async_results_dict; save to disk for later inspection?

    def main(self):
        """Run the whole pipeline: create output dir, start clusters, process files, stop clusters."""
        self.create_cluster_output_dir()
        self.start_all_clusters()
        self.run_clusters()
        self.kill_all_clusters()

In [2]:
# Create ten small placeholder input files, 0.tmp ... 9.tmp, in the current
# working directory; file i.tmp contains the number i + 10.
# NOTE: the `!` line is IPython shell magic and only runs inside IPython/Jupyter.
for i in range(10):
    !echo {i + 10} > {i}.tmp        
        

def silly_func(
    string_saved_to_file,
    # The five arguments below are mandatory: the function may ignore them,
    # but it has to accept them.
    input_file_name,
    cluster_output_dir,
    cluster_id,
    n_cpus,
    cpu_id
):
    """Write a small report about the calling engine into ``cluster_output_dir``.

    The input file is read (its content is not used), then a file named
    ``cluster<cluster_id>_cpu<cpu_id>_<input file base name>`` is written with
    the cluster id, the CPU count, the CPU id and ``string_saved_to_file``.
    """
    import os # function has to import all the libraries it uses

    with open(input_file_name, 'r') as source:
        source.read() # read only to exercise the input file; content is unused

    base_name = input_file_name.split('/')[-1]
    report_path = os.path.join(
        cluster_output_dir,
        'cluster%s_cpu%s_%s' % (cluster_id, cpu_id, base_name))
    report_lines = [
        'My cluster_id is {}\n'.format(cluster_id),
        'The number of CPUs in this cluster is {}\n'.format(n_cpus),
        'My CPU_id is {}\n'.format(cpu_id),
        'My string is: {}\n'.format(string_saved_to_file),
    ]
    with open(report_path, 'w') as sink:
        sink.writelines(report_lines)

# Constructor arguments for the demo run; the keys match the parameter names
# of MultipleClusterEngine.__init__.
mce_args = {
    'cluster_job_name': 'write_to_file', # no spaces, as it becomes part of a directory name
    'n_cpus_list': [4, 3, 2], # 1st cluster is always the largest or equal to the other clusters
    'input_file_names': ['{}.tmp'.format(i) for i in range(10)], # absolute paths preferred; relative ones used here for brevity
    'output_parent_dir': '/home/ubuntu/cluster_results', # use an absolute path since it's safer; the directory has to already exist
    'function_to_process': silly_func,
    'function_kwargs_dict': {'string_saved_to_file': 'pee-a-boo!'} # extra keyword arguments passed to function_to_process
    }

mce = MultipleClusterEngine(**mce_args)
# main() runs the full pipeline; the individual steps can also be run by hand:
# mce.start_all_clusters()
# mce.run_clusters()
mce.main()
# mce.kill_all_clusters() if your function_to_process failed while the clusters are still alive

100%|██████████| 10/10 [00:10<00:00,  1.00s/it]


In [None]:
# Manual cleanup: run this if function_to_process failed while the clusters were still alive.
mce.kill_all_clusters()
# You will probably still have to inspect the running processes and kill leftover clusters manually.

In [3]:
def error_func1(
    # Mandatory arguments: the function may ignore them, but it has to accept them.
    input_file_name,
    cluster_output_dir,
    cluster_id,
    n_cpus,
    cpu_id
):
    """Always raise ``ZeroDivisionError``; used to demonstrate failure logging."""
    1 / 0 # deliberate failure
    
# Same configuration as the demo run, but with a function that always raises,
# to exercise the failure log.
mce_args = {
    'cluster_job_name': 'write_to_file', # no spaces, as it becomes part of a directory name
    'n_cpus_list': [4, 3, 2], # 1st cluster is always the largest or equal to the other clusters
    'input_file_names': ['{}.tmp'.format(i) for i in range(10)],
    'output_parent_dir': '/home/ubuntu/cluster_results', # use an absolute path since it's safer; the directory has to already exist
    'function_to_process': error_func1,
    'function_kwargs_dict': {} # error_func1 takes no additional arguments
    }

mce1 = MultipleClusterEngine(**mce_args)
mce1.main()

100%|██████████| 10/10 [00:10<00:00,  1.00s/it]


In [4]:
# Manual cleanup for the error_func1 run, in case its clusters are still alive.
mce1.kill_all_clusters()

In [4]:
def error_func2(
    # Mandatory arguments: the function may ignore them, but it has to accept them.
    input_file_name,
    cluster_output_dir,
    cluster_id,
    n_cpus,
    cpu_id
):
    """Always raise ``TypeError``; used to demonstrate failure logging."""
    '1' + 2 # deliberate failure: str + int
    
# Same configuration again, this time with a function that raises a TypeError.
mce_args = {
    'cluster_job_name': 'write_to_file', # no spaces, as it becomes part of a directory name
    'n_cpus_list': [4, 3, 2], # 1st cluster is always the largest or equal to the other clusters
    'input_file_names': ['{}.tmp'.format(i) for i in range(10)],
    'output_parent_dir': '/home/ubuntu/cluster_results', # use an absolute path since it's safer; the directory has to already exist
    'function_to_process': error_func2,
    'function_kwargs_dict': {} # error_func2 takes no additional arguments
    }

mce2 = MultipleClusterEngine(**mce_args)
mce2.main()

100%|██████████| 10/10 [00:10<00:00,  1.00s/it]


In [None]:
class MultipleClusterEnginePrototype(object):
    """Sketch of planned MultipleClusterEngine features; every method is still a stub."""

    def __init__(self, **kwargs):
        """Store the optional ``RAM_limit_in_GB`` keyword argument (``None`` if not given)."""
        # NOTE(review): presumably the per-cluster RAM ceiling for early_kill -- confirm.
        self.RAM_limit_in_GB = kwargs.get('RAM_limit_in_GB')

    def memory_profiler(self):
        """Planned: monitor RAM usage of the clusters. Not implemented yet."""
        pass # if all clusters are dead, then raise Error with a message

    def early_kill(self):
        """Planned: kill a cluster on RAM overload. Not implemented yet."""
        pass # write file to failure disk, maybe also cluster i and num_cpus # RAM overload

    def cluster_release_memory(self):
        """Planned: free memory on the engines. Not implemented yet."""
        # after each map/reducer step, use gc.collect()
        pass

In [None]:
# If a cluster is killed, the jth_cluster index can no longer be trusted -- instead of load_balanced_list, use load_balanced_dict

# check if all engines killed
# RAM logger
# figure out queue vs deque; deque is better
# write a lot of documentation
# write shell script for configuration and installation
# probably no async or threading required

In [None]:
# weakref
# unittest with a mapper/reducer? after each map/reducer step, use gc.collect()

# MCE works on files. Hence, if you don't have any datafiles, then just create some empty files
# SSD for parallel reading (not HDD); determine if you are IO constrained
# RAM usage is heavier in Python 3 than Python 2; though Python 3 memory management is better
# during function failure: benefit (the error type is saved to failure.log) and weakness (it doesn't say which line of code
#     failed, so you have to debug your function outside of the MCE instance, as if you were just calling
#     the function by itself on some data)

In [None]:
https://github.com/donnemartin/data-science-ipython-notebooks/tree/master/mapreduce