In [None]:
def decorator_1(func):
    def func_wrapper(x):
        print('dec1')
        func(x)
        print('dec1')
    return func_wrapper

def decorator_2(func):
    def func_wrapper(x):
        print('dec2')
        func(x)
        print('dec2')
    return func_wrapper

@decorator_2
@decorator_1
def function_a(x):
    print(x)

function_a(1)

In [None]:
def temp_f(name1):
    print('hello {}'.format(name1))

def mapper_f(kwargs):
    return temp_f(**kwargs)

mapper_f({'name1': 'lee'})
(lambda kwargs: temp_f(**kwargs))({'name1': 'lee'})

In [16]:
# create some fake files
for i in range(10):
    !touch {i}.tmp

In [1]:
import time
import os
import ipyparallel as ipp
from collections import defaultdict
from tqdm import tqdm
import itertools
from datetime import datetime
import copy
import logging



class MultipleClusterEnginePrototype(object):
    def __init__(self, cluster_job_name, n_cpus_list, output_dir, file_names, function_to_process, function_kwargs_dict): # always put it in as a dictionary
        self.cluster_job_name = cluster_job_name
        self.n_cpus_list = n_cpus_list
        self.output_dir = output_dir if output_dir[-1] == '/' else output_dir + '/'
        self.file_names = file_names
        self.function_to_process = lambda kwargs: function_to_process(**kwargs)
        self.function_kwargs_dict = function_kwargs_dict
        
        assert cluster_job_name, "Needs cluster name"
        assert len(n_cpus_list) > 0, "Needs the number of CPUs per cluster"
        assert os.path.isdir(self.output_dir), "Output directory doesn't exist"
        assert len(file_names) > 0, "Need input files"
        
        
        

        # used by engine
        self.client_dict = {}
        self.load_balanced_view_dict = {}
        self.async_results_dict = defaultdict(list) # collects all the async_results
        self.file_to_cluster_order_dict = defaultdict(list) # remembers which files are sent to which clusters
        self.cluster_indexes = None
        logging.basicConfig(filename='logger.log', level=logging.INFO)
        self.logger = logging.getLogger('Willie says') # Easter egg
        
    def start_cluster(self, n_cpus, cluster_id):
        self.logger.info('\tAttempting to start {}{} with {} CPUs'.format(self.cluster_job_name, cluster_id, n_cpus))
        os.system("ipcluster start --n={} --profile={}{} --daemonize".format(
            n_cpus, self.cluster_job_name, cluster_id)) # should deprecate to use a safer bash call

        attempt_ctr = 0 
        while attempt_ctr < 3: # Attempt to connect to client 3 times
            time.sleep(10) # hard coded
            try:
                client = ipp.Client(profile='{}{}'.format(self.cluster_job_name, cluster_id))
            except ipp.error.TimeoutError:
                attempt_ctr += 1
            else:
                self.logger.info('\t\tCPU processes ready for action: {}'.format(client[:].apply_async(os.getpid).get()))
                return client
            # if there is any other error other than TimeoutError, then the error will be raised
    
    def start_all_clusters(self):
        self.logger.info('Attempting to start all clusters')
        for cluster_id, n_cpus in enumerate(self.n_cpus_list):
            self.client_dict[cluster_id] = self.start_cluster(n_cpus, cluster_id)
            self.load_balanced_view_dict[cluster_id] = self.client_dict[cluster_id].load_balanced_view()            
        self.logger.info('All clusters started')
        self.cluster_indexes = itertools.cycle(sorted(self.load_balanced_view_dict))
        
    def kill_cluster(self, cluster_id): # use better arguments
        # client = client_list[cluster_id]
        self.logger.info('\tAttempting to kill {}{} with CPU processes: {}'.format(
            self.cluster_job_name, cluster_id, self.client_dict[cluster_id][:].apply_async(os.getpid).get()))
        self.load_balanced_view_dict.pop(cluster_id)
        # client.purge_everything()
        self.client_dict[cluster_id].close()
        os.system('ipcluster stop --profile={}{}'.format(self.cluster_job_name, cluster_id))
        self.logger.info('\t\tCluster successfully killed')
        time.sleep(5) # hard-coded
        # have to mutate cluster_indexes
        
    def kill_all_clusters(self):
        self.logger.info('Attempting to kill all clusters')
        for cluster_id in self.client_dict:
            self.kill_cluster(cluster_id)
        self.logger.info('All clusters have been killed')
        
    def run_clusters(self):        
        small_file_ctr = 1 # determine if you want to have queue or differently ordered queue
        big_file_ctr = 0
        

        for ith_file in tqdm(range(len(self.file_names))):
            for jth_cluster in self.cluster_indexes: # infinite loop
                time.sleep(1) # hard coded delay time; want to do expected log time lag / number of clusters
                ### insert code here to kill cluster if RAM usage too great, if possible log which file it was processing;
                ### it has to do a global search of all clusters' RAM usage
                ### would need a dictionary here to remember which cluster has which file; write to disk
                ### profiler would also write to disk CPU usage what level

                if (not self.async_results_dict[jth_cluster][-1:] 
                    or self.async_results_dict[jth_cluster][-1].done()): # check if cluster i is available                       
                    # if necessary, recreate engine here
                    if jth_cluster == 0: # Send large files to large cluster (ALWAYS has id == 0)
                        index = big_file_ctr
                        big_file_ctr += 1
                    else: # Send small files to small clusters (ALWAYS have id > 0)
                        index = -small_file_ctr
                        small_file_ctr += 1
                                              
                                     
                    # clear cluster memory 

                    # package_arguments
                    function_kwargs_dict = copy.deepcopy(self.function_kwargs_dict)
                    function_kwargs_dict.update({'file_name': self.file_names[index]}) 
                    
                    ### insert code to write results to file--it will only have start times, no end times
                    async_result = self.load_balanced_view_dict[jth_cluster].map_async(
                            self.function_to_process, # function name
#                            [self.file_names[index]] * len(self.client_dict[jth_cluster].ids), # file name, assumes first argument is always file name
                            # [len(client_list[i].ids)] * len(client_list[i].ids), # number of CPUs, assumes second argument is always number of CPUs
                            #  client_list[i].ids # CPU ids, assumes third argument is always CPU id; actually turn into kwargs
                             # [output_folder_name] * len(client_list[i].ids) # assumes fourth argument is output directory
#                            [self.function_kwargs_dict] * len(self.client_dict[jth_cluster].ids)                    
                            [function_kwargs_dict] * len(self.client_dict[jth_cluster].ids)
                            )                                              
                    self.async_results_dict[jth_cluster].append(async_result)
                    self.file_to_cluster_order_dict[jth_cluster].append(self.file_names[index])
                    self.logger.info("{} is the {}th file and is sent to {}{} for processing".format(self.file_names[index], ith_file, self.cluster_job_name, jth_cluster))
                    break # break out of inner loop to determine if other clusters are available
        # async_results_dict; save to disk for later inspection?
        
    def main(self):
        start_time = datetime.now()
        self.logger.info('Starting Multiple Cluster Engine at {}'.format(start_time))
        self.start_all_clusters()
        self.run_clusters()
        self.kill_all_clusters()
        end_time = datetime.now()
        self.logger.info('Multiple Cluster Engine shut down at {}'.format(end_time))
        self.logger.info('Total run time is {} minutes'.format((end_time - start_time).seconds / 60))
        self.logger.close()

In [2]:
def fun_func(file_name, save_string_to_file):
    with open(file_name, 'a') as f:
        f.write(save_string_to_file)

mce_args = {
    'cluster_job_name': 'write_to_file', # no spaces
    'n_cpus_list': [4, 3, 2], # 1st cluster is always the largest or equal to the other clusters
    'file_names': ['{}.tmp'.format(i) for i in range(10)],
    'function_to_process': fun_func,
    'function_kwargs_dict': {'save_string_to_file': 'pee-a-boo!'},
    'output_dir': '/home/ubuntu/cluster_results/' # use absolute path since it's safer
    }

mce = MultipleClusterEnginePrototype(**mce_args)
# mce.start_all_clusters()
# mce.run_clusters()
mce.main()

100%|██████████| 10/10 [00:10<00:00,  1.00s/it]


AttributeError: 'Logger' object has no attribute 'close'

In [3]:
import logging

logging.basicConfig(filename='logger.log', level=logging.INFO)
logger = logging.getLogger('Willie says') # Easter egg

In [4]:
logger.handle

logging.Logger

In [3]:
mce.kill_all_clusters()

In [26]:
!cat 9.tmp

pee-a-boo!pee-a-boo!pee-a-boo!

In [87]:
temp = mce.async_results_dict[0][-1]

In [30]:
mce.kill_all_clusters()
#!ipcluster stop --profile=mycluster2


Attempting to kill all clusters
	Attempting to kill mycluster0 with CPU processes: [2065, 2066, 2068, 2072]
		Cluster successfully killed
	Attempting to kill mycluster1 with CPU processes: [2136, 2137, 2139]
		Cluster successfully killed
	Attempting to kill mycluster2 with CPU processes: [2196, 2198]
		Cluster successfully killed
All clusters have been killed



In [None]:
class MultipleClusterEngine(object):
    def __init__(self, **kwargs):
        self.functions_to_run = functions_to_run
        self.file_names = file_names
        self.RAM_limit_in_GB = pass
        # how to deal with other args/objects that functions might need
        self.mce_job_name = None
        pass
   

    def memory_profiler():
        pass # if all clusters are dead, then raise Error with a message
    
    
    def early_kill():
        pass # write file to failure disk, maybe also cluster i and num_cpus
    
    def cluster_release_memory():
        # after each map/reducer step, use gc.collect()
        pass
    
    to_write = 'Processed ' + str(num_files) + ' files in ' \
            + str((datetime.today() - now).total_seconds() / 60) + ' minutes\n'
    to_write += 'Used ' + str(cpu_list[0]) + ' cpus to process ' + str(big_file_ctr) + \
                    ' big files and ' 
    # If there are multiple clusters, specify number of small files processed by small cluster
    if len(cpu_list) > 1: 
        to_write += str(cpu_list[1]) + ' cpus to process ' + \
                    str(small_file_ctr) + ' small files.\n'


In [None]:
if cluster is killed, then cannot trust jth_cluster index--instead of load_balanced_list, use load_balanced_dict

# RAM writer and also progress/log writer and failre writer; use logging library
# figure out queue vs deque; deque is better
# # write a crap load of documentation

In [None]:
# probably no async or threading required

In [None]:
# weakref
# unittest with a mapper/reducer? after each map/reducer step, use gc.collect()

# MCE works on files. Hence, if you don't have any datafiles, then just create some empty files
# SSD for parallel reading (not HDD); determine if you are IO constrained
# RAM usage is heavier in Python 3 than Python 2; though Python 3 memory management is better

In [None]:
# check if all engines killed

In [None]:
# check if function fails

In [None]:
https://github.com/donnemartin/data-science-ipython-notebooks/tree/master/mapreduce

In [47]:
tail -n 50 -f temp.py