By AppliedAICourse.com

In [1]:
# Number of CPUs and Threads in Linux.
# The lscpu report lists, among other fields, the socket count, cores per socket and
# threads per core; logical CPUs = sockets x cores per socket x threads per core.
# Refer: https://linux.die.net/man/1/lscpu
!lscpu


Architecture:          x86_64
CPU op-mode(s):        32-bit, 64-bit
Byte Order:            Little Endian
CPU(s):                8
On-line CPU(s) list:   0-7
Thread(s) per core:    2
Core(s) per socket:    4
Socket(s):             1
NUMA node(s):          1
Vendor ID:             GenuineIntel
CPU family:            6
Model:                 79
Model name:            Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz
Stepping:              1
CPU MHz:               2702.869
CPU max MHz:           3000.0000
CPU min MHz:           1200.0000
BogoMIPS:              4600.01
Hypervisor vendor:     Xen
Virtualization type:   full
L1d cache:             32K
L1i cache:             32K
L2 cache:              256K
L3 cache:              46080K
NUMA node0 CPU(s):     0-7
Flags:                 fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid aperfmperf

In [5]:
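# macOS equivalent of lscpu (uncomment the next line to run it; the output shown below
# was captured on a MacBook Pro). Note that this report includes the machine's serial
# number and hardware UUID, so review the output before sharing the notebook.
# !system_profiler SPHardwareDataType; 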

Hardware:

    Hardware Overview:

      Model Name: MacBook Pro
      Model Identifier: MacBookPro15,1
      Processor Name: 6-Core Intel Core i7
      Processor Speed: 2.6 GHz
      Number of Processors: 1
      Total Number of Cores: 6
      L2 Cache (per Core): 256 KB
      L3 Cache: 9 MB
      Hyper-Threading Technology: Enabled
      Memory: 16 GB
      Boot ROM Version: 1037.147.1.0.0 (iBridge: 17.16.16065.0.0,0)
      Serial Number (system): <redacted>
      Hardware UUID: <redacted>
      Activation Lock Status: Enabled



## Mean of 100 Million observations

In [2]:
# Generate 100MM (100 million) random data points, uniform in [0, 1).
import numpy as np

# Fixed seed: without it every kernel restart produces different data, so the means
# printed by the cells below could never be reproduced or compared between runs.
SEED = 42
np.random.seed(SEED)

n = 10 ** 8  # number of observations (100 million)
d = np.random.rand(n)
print(d.shape)

(100000000,)


In [7]:
import time
def mean(data=None):
  """Return the arithmetic mean of a 1-D NumPy array using a plain Python loop.

  The loop is deliberately naive: it is the single-threaded baseline that the
  multi-process / multi-thread versions are timed against. (NumPy's built-in
  sum/mean would be much faster.)

  Parameters:
    data: array to average. Defaults to the notebook-level array `d`, which keeps
          the original zero-argument call `mean()` working unchanged.

  Returns:
    The sum of all elements divided by `data.size`.

  Raises:
    ZeroDivisionError: if the array is empty.
  """
  if data is None:
    data = d  # notebook-level array created in the data-generation cell

  # Sum using a for loop. `total` (not `sum`) so the builtin is not shadowed.
  total = 0
  count = data.size
  for i in range(count):
    total += data[i]

  # Mean
  return total / count


# Time the execution.
# perf_counter() is a monotonic, high-resolution clock intended for measuring elapsed
# time; time.time() is wall-clock time and can jump if the system clock is adjusted.
start_time = time.perf_counter()
m = mean() # compute mean of 100MM numbers.
end_time = time.perf_counter()
print (end_time-start_time)  # elapsed seconds
print(m)

20.90457510948181
0.49994777164597376


### Multi-Processing Code

In [13]:
#Refer: https://docs.python.org/3/library/multiprocessing.html
from multiprocessing import Process, Queue
import math

def mean_MP(s, e, q, data=None):
  """Compute the mean of data[s..e] (both ends inclusive) and put it on queue `q`.

  Intended as the target of a multiprocessing.Process: the result is handed back
  to the parent through `q` rather than through the return value.

  Parameters:
    s: index of the first element of the chunk.
    e: index of the last element of the chunk (inclusive).
    q: any object with a `put(item)` method; receives exactly one item, the mean.
    data: array to read from. Defaults to the notebook-level array `d`, which keeps
          the original three-argument call working unchanged.

  Returns:
    None.

  Raises:
    ZeroDivisionError: if the chunk is empty (e == s - 1); nothing is put on `q`
    in that case.
  """
  if data is None:
    data = d  # notebook-level array created in the data-generation cell

  # Sum using a for loop (NumPy's built-in sum would be faster; the loop is the
  # workload being benchmarked). `total` (not `sum`) so the builtin is not shadowed.
  total = 0
  for i in range(s, e + 1):
    total += data[i]

  # Mean of the (e - s + 1) elements in the chunk
  q.put(total / (e - s + 1))

# Split d into two inclusive index ranges: [0, n1] and [n1+1, n_total-1].
# The length is read from d itself because a later cell rebinds the global `n`.
n_total = d.size
n1 = n_total // 2
assert n_total >= 3, "need at least 3 observations so that both chunks are non-empty"

# One queue per worker, so the parent knows which chunk each mean belongs to.
# Queues are thread and process safe. For communicating between processes and threads.
q1 = Queue()
q2 = Queue()

# NOTE(review): the workers rely on seeing the notebook-level `d` and `mean_MP`. That
# holds with the 'fork' start method; with 'spawn' the workers may fail -- confirm on
# your platform. A failed worker is reported by the exit-code check below.
p1 = Process(target=mean_MP, args=(0, n1, q1))
p2 = Process(target=mean_MP, args=(n1 + 1, n_total - 1, q2))


# Time the execution (perf_counter is the monotonic clock meant for timing).
start_time = time.perf_counter()

p1.start()
p2.start()

# Joining before reading the queues is safe here only because each worker puts a
# single small number on its queue. For large payloads, drain the queue first
# (see "Joining processes that use queues" in the multiprocessing docs).
p1.join() # Wait till p1 finishes
p2.join()

# Fail loudly if a worker crashed instead of silently reporting a wrong mean.
for p in (p1, p2):
    if p.exitcode != 0:
        raise RuntimeError("worker %s exited with code %s" % (p.name, p.exitcode))

mean1 = q1.get()
mean2 = q2.get()

# The chunks differ in size (n1+1 vs. n_total-n1-1 elements), so the overall mean is
# the size-weighted average of the chunk means, not their plain average.
count1 = n1 + 1
count2 = n_total - count1
m = (mean1 * count1 + mean2 * count2) / n_total

end_time = time.perf_counter()
print (end_time-start_time)  # elapsed seconds
print(m)


11.001178979873657
0.4999477716457993


### Multi-Threaded Code

In [17]:
#Refer: https://docs.python.org/3/library/threading.html
from threading import Thread


means = [0, 0]  # one result slot per thread; shared by the worker threads and the main thread

def mean_MT(s, e, threadNum, data=None):
  """Compute the mean of data[s..e] (both ends inclusive) and store it in means[threadNum].

  Intended as the target of a threading.Thread: the result is handed back through
  the shared list `means` rather than through the return value. Each thread writes
  only its own slot.

  Parameters:
    s: index of the first element of the chunk.
    e: index of the last element of the chunk (inclusive).
    threadNum: index of the slot in `means` that receives the result.
    data: array to read from. Defaults to the notebook-level array `d`, which keeps
          the original three-argument call working unchanged.

  Returns:
    None.

  Raises:
    ZeroDivisionError: if the chunk is empty (e == s - 1).
  """
  if data is None:
    data = d  # notebook-level array created in the data-generation cell

  # Sum using a for loop (NumPy's built-in sum would be faster; the loop is the
  # workload being benchmarked). `total` (not `sum`) so the builtin is not shadowed.
  total = 0
  for i in range(s, e + 1):
    total += data[i]

  # Mean of the (e - s + 1) elements in the chunk
  means[threadNum] = total / (e - s + 1)

# Split d into two inclusive index ranges: [0, n1] and [n1+1, n_total-1].
# The length is read from d itself because a later cell rebinds the global `n`.
n_total = d.size
n1 = n_total // 2
assert n_total >= 3, "need at least 3 observations so that both chunks are non-empty"

t1 = Thread(target=mean_MT, args=(0, n1, 0))  # Third param is the thread number
t2 = Thread(target=mean_MT, args=(n1 + 1, n_total - 1, 1))

# Time the execution (perf_counter is the monotonic clock meant for timing).
start_time = time.perf_counter()

t1.start()
t2.start()

t1.join() # Wait till t1 finishes
t2.join()

# The chunks differ in size (n1+1 vs. n_total-n1-1 elements), so the overall mean is
# the size-weighted average of the chunk means, not their plain average.
count1 = n1 + 1
count2 = n_total - count1
m = (means[0] * count1 + means[1] * count2) / n_total

end_time = time.perf_counter()
print (end_time-start_time)  # elapsed seconds
print(m)



19.538660049438477
0.4999477716457993


## Joblib


### Caching of function output values

In [19]:

#Transparent and fast disk-caching of output value
# Refer: https://joblib.readthedocs.io/en/latest/
from joblib import Memory
cachedir = './'          # cache files are written below the current working directory
mem = Memory(cachedir)

import numpy as np
# np.float64 instead of np.float: the `np.float` alias (plain Python float, i.e.
# float64) was deprecated in NumPy 1.20 and removed in 1.24, where it raises
# AttributeError.
a = np.vander(np.arange(3)).astype(np.float64)
square = mem.cache(np.square)  # disk-cached wrapper around np.square
b = square(a)                  # first call: evaluated (see the [Memory] log) and stored

________________________________________________________________________________
[Memory] Calling square...
square(array([[0., 0., 1.],
       [1., 1., 1.],
       [4., 2., 1.]]))
___________________________________________________________square - 0.0s, 0.0min


In [20]:
# Second call with the same argument: unlike the first call, no
# "[Memory] Calling square..." log is printed for this cell.
c = square(a)
# The above call did not trigger an evaluation

### Simple Parallel programming for Loops

In [23]:
# Refer: https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html

import time
from math import sqrt # inbuilt fucntion

def f(i, x=10000):
    """Burn CPU time, then return sqrt(i ** 2) (i.e. abs(i) as a float).

    This is a stand-in for an expensive, CPU-bound task used to compare sequential,
    multi-process and multi-threaded execution.

    Parameters:
        i: the number to process.
        x: size of the artificial workload; the nested loops perform about
           x * (x - 1) / 2 multiplications. The default reproduces the original
           hard-coded workload; pass a small value for a quick run.

    Returns:
        sqrt(i ** 2) as a float.
    """
    # Some computations that take time. The value of p is irrelevant: k starts at 0,
    # so p is 0 from the first multiplication on. The loops exist only to keep the
    # CPU busy.
    p = 1
    for j in range(x):
        for k in range(j):
            p *= k

    return sqrt(i ** 2)

# Sequential baseline: call f on the first n numbers, one after the other.
# NOTE: this rebinds the global `n`, which the first section set to the length of `d`;
# re-run the data-generation cell before re-running any cell that reads that `n`.
n = 10

# perf_counter() is the monotonic, high-resolution clock intended for timing.
start_time = time.perf_counter()

for i in range(n):
    f(i)

end_time = time.perf_counter()
print (end_time-start_time)  # elapsed seconds

17.116321802139282


In [24]:
from joblib import Parallel, delayed

# perf_counter() is the monotonic, high-resolution clock intended for timing.
start_time = time.perf_counter()

# Run the same n tasks as the sequential loop, spread over 2 workers.
a = Parallel(n_jobs=2)(delayed(f)(i) for i in range(n))

# Why we need delayed(): https://stackoverflow.com/questions/42220458/what-does-the-delayed-function-do-when-used-with-joblib-in-python

end_time = time.perf_counter()
print (end_time-start_time)  # elapsed seconds

9.58085012435913


In [25]:
# Multi threading: GIL is an issue. f is pure-Python, CPU-bound work, so two threads
# do not run it in parallel (compare the elapsed time with the sequential run).
# perf_counter() is the monotonic, high-resolution clock intended for timing.
start_time = time.perf_counter()

# f(i), not f(i ** 2): the same tasks as the sequential baseline, so that the
# timings compare identical work.
a = Parallel(n_jobs=2,prefer="threads")(delayed(f)(i) for i in range(n))

end_time = time.perf_counter()
print (end_time-start_time)  # elapsed seconds

17.342177867889404


In [26]:

# 6 jobs

from joblib import Parallel, delayed

# perf_counter() is the monotonic, high-resolution clock intended for timing.
start_time = time.perf_counter()

# 6 workers -- presumably chosen to match the 6 physical cores of the MacBook Pro
# shown at the top of the notebook; adjust to your machine.
# f(i), not f(i ** 2): the same tasks as the sequential baseline, so that the
# timings compare identical work.
a = Parallel(n_jobs=6)(delayed(f)(i) for i in range(n))

# Why we need delayed(): https://stackoverflow.com/questions/42220458/what-does-the-delayed-function-do-when-used-with-joblib-in-python

end_time = time.perf_counter()
print (end_time-start_time)  # elapsed seconds

4.557589054107666
