In [1]:
import numpy as np
import pandas as pd
import time
import swifter
import multiprocessing as mp
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the sample dataset from a pickle file into a DataFrame.
# NOTE(review): unpickling can execute arbitrary code - only read pickle
# files that come from a trusted source.
input_file = 'sample_data.pkl'
df = pd.read_pickle(input_file)
# Last expression of the cell: shows (n_rows, n_columns) as the cell output.
df.shape

(10000, 4)

In [3]:
# Preview the first rows of the loaded frame to check its columns.
df.head()

Unnamed: 0,Submit,Start,End,CPUs
0,2020-10-16 03:58:19,2020-10-16 03:58:20,2020-10-16 03:58:22,28
1,2020-06-08 20:42:58,2020-06-09 14:50:51,2020-06-09 14:50:51,20
2,2020-03-19 17:14:21,2020-03-19 17:14:32,2020-03-19 17:15:05,1
3,2020-07-23 14:31:16,2020-07-24 02:01:33,2020-07-24 06:03:26,1
4,2020-02-08 00:50:17,2020-02-08 00:50:52,2020-02-08 00:51:33,24


### Converting desired columns of the dataframe to numpy arrays

In [4]:
# Extract the four columns used below as NumPy arrays, in the order
# Submit, Start, End, CPUs.
submit, start, end, cpus = (
    df[column].to_numpy() for column in ("Submit", "Start", "End", "CPUs")
)

#### Using pandas `apply` to run a calculation for every row: for each row's Submit time, sum the CPUs of all rows whose Start–End interval strictly contains that time (i.e. the number of CPUs in use at that moment)

In [5]:
### Pandas apply ###
def calculate_cpus_utilized(rowsubmit, start, end, cpus):
    """Sum the CPUs of every entry whose interval strictly contains ``rowsubmit``.

    Parameters
    ----------
    rowsubmit : scalar comparable with the elements of ``start`` and ``end``
        The reference point being tested.
    start, end : array-like
        Interval bounds, compared element-wise against ``rowsubmit``.
    cpus : numpy.ndarray
        Values to add up, indexed by the positions that match.

    Returns
    -------
    The sum of ``cpus`` over positions where ``start < rowsubmit < end``
    (both bounds exclusive); 0 when no position matches.
    """
    is_active = (rowsubmit > start) & (rowsubmit < end)
    # Turn the boolean mask into integer positions before indexing.
    active_positions = np.nonzero(is_active)[0]
    return cpus[active_positions].sum()

# Time the row-wise pandas apply.
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; time.time() is wall-clock time and can jump if the
# system clock is adjusted while the cell runs.
start_time = time.perf_counter()
df['cpus_utilized_pandas'] = df[['Submit']].apply(
    lambda row: calculate_cpus_utilized(row['Submit'], start, end, cpus),
    axis=1,
)
elapsed_pandas = time.perf_counter() - start_time

#### Utilizing the Swifter package to efficiently execute the apply function to carry out the above calculation

In [6]:
## Swifter ## 
def calculate_cpus_utilized(rowsubmit, start, end, cpus):
    """Sum ``cpus`` over the positions where ``start < rowsubmit < end``.

    NOTE(review): this re-defines the function of the same name from the
    pandas-apply cell with the same behaviour; the duplicate could be
    dropped and the earlier definition reused.

    Both bounds are exclusive. Returns 0 when no position matches.
    """
    is_active = (rowsubmit > start) & (rowsubmit < end)
    # Turn the boolean mask into integer positions before indexing.
    active_positions = np.nonzero(is_active)[0]
    return cpus[active_positions].sum()

# Time the same row-wise calculation dispatched through swifter's apply.
# time.perf_counter() is used because it is a monotonic, high-resolution
# clock; time.time() can jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()
df['cpus_utilized_swifter'] = df[['Submit']].swifter.apply(
    lambda row: calculate_cpus_utilized(row['Submit'], start, end, cpus),
    axis=1,
)
elapsed_swifter = time.perf_counter() - start_time

Pandas Apply: 100%|██████████| 10000/10000 [00:00<00:00, 29696.16it/s]


#### Doing the same operation as above, but with a Python loop (a list comprehension) over the NumPy array of submit times instead of `apply`

In [7]:
## NumPy ##
# Time a plain list comprehension over the submit array: for each submit
# value, keep `cpus` where start < sub < end (0 elsewhere) and add it up.
# time.perf_counter() is a monotonic, high-resolution interval clock;
# time.time() can jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()
cpu_ct_np = [
    np.sum(np.where((sub > start) & (sub < end), cpus, 0))
    for sub in submit
]
elapsed_numpy = time.perf_counter() - start_time
# Column assignment is kept outside the timed region, as before.
df['cpus_utilized_numpy'] = cpu_ct_np

### Process-based Parallelism

#### Getting the number of CPUs allocated to this job using Python's os.sched_getaffinity

In [8]:
# Report three views of the CPU count: what multiprocessing and os report,
# and the size of this process's CPU affinity set.
n_cpus_mp = mp.cpu_count()
print("Number of CPUs in the node using multiprocessing library  : ", n_cpus_mp)

n_cpus_os = os.cpu_count()
print("Number of CPUs in the node using OS library : ", n_cpus_os)

# sched_getaffinity(0) returns the set of CPUs the current process may run on.
n_cpus_usable = len(os.sched_getaffinity(0))
print("Actual number of Usable CPUs for multiprocessing: ", n_cpus_usable)

Number of CPUs in the node using multiprocessing library  :  112
Number of CPUs in the node using OS library :  112
Actual number of Usable CPUs for multiprocessing:  52


#### Doing the same calculation as with `apply` and the loop, but using Python's multiprocessing library to distribute the work across the usable CPUs

In [9]:
## Number of process = len(os.sched_getaffinity(0)) ##
def countCPUS(sub):
    """Return the sum of ``cpus`` over positions where ``start < sub < end``.

    Reads the notebook-level arrays ``start``, ``end`` and ``cpus``; takes a
    single argument so it can be handed directly to ``Pool.map``.
    Both bounds are exclusive.
    """
    in_window = (sub > start) & (sub < end)
    # Keep cpus where the window matches and 0 elsewhere, then add it all up.
    contributing = np.where(in_window, cpus, 0)
    return np.sum(contributing)

# Time the same per-submit calculation spread over a pool of worker
# processes, one per CPU in this process's affinity set.
# The timed region includes creating the pool and leaving the `with` block.
# time.perf_counter() is a monotonic, high-resolution interval clock;
# time.time() can jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()
with mp.Pool(len(os.sched_getaffinity(0))) as pool:
    # map() returns results in the same order as `submit`.
    cpu_ct_mt_proc_cpu = pool.map(countCPUS, submit)
elapsed_mt_proc_cpu_task = time.perf_counter() - start_time
df['cpus_utilized_multiproc'] = cpu_ct_mt_proc_cpu

In [10]:
### Performance analysis ###
# Print the elapsed time (in seconds) of each approach, one per line.
timing_report = (
    ("Panda apply                     : ", elapsed_pandas),
    ("Panda swifter apply             : ", elapsed_swifter),
    ("Pure Numpy                      : ", elapsed_numpy),
    ("Multiprocess                    : ", elapsed_mt_proc_cpu_task),
)
for label, seconds in timing_report:
    print(label, seconds)

Panda apply                     :  0.3298323154449463
Panda swifter apply             :  0.38505053520202637
Pure Numpy                      :  0.31360602378845215
Multiprocess                    :  0.21578145027160645


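#### In this run, multiprocessing over the NumPy arrays is the fastest of the four approaches (about 0.22 s versus 0.31–0.39 s for pandas apply, swifter and the NumPy loop), but the speed-up is modest (roughly 1.5×) despite 52 usable CPUs; larger, heavier calculations would presumably benefit more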

In [11]:
# Preview the frame again: the four cpus_utilized_* columns added above
# should agree row by row.
df.head()

Unnamed: 0,Submit,Start,End,CPUs,cpus_utilized_pandas,cpus_utilized_swifter,cpus_utilized_numpy,cpus_utilized_multiproc
0,2020-10-16 03:58:19,2020-10-16 03:58:20,2020-10-16 03:58:22,28,2,2,2,2
1,2020-06-08 20:42:58,2020-06-09 14:50:51,2020-06-09 14:50:51,20,90,90,90,90
2,2020-03-19 17:14:21,2020-03-19 17:14:32,2020-03-19 17:15:05,1,43,43,43,43
3,2020-07-23 14:31:16,2020-07-24 02:01:33,2020-07-24 06:03:26,1,0,0,0,0
4,2020-02-08 00:50:17,2020-02-08 00:50:52,2020-02-08 00:51:33,24,30,30,30,30
