# Efficient Python Code

In this lecture we will study ways to time, profile, and speed up Python code.

## Timing and profiling code

Progress bars with `tqdm`:

In [None]:
from time import sleep

from tqdm import tqdm, trange, tqdm_notebook
from tqdm.notebook import tqdm as notebook_tqdm

In [None]:
# Wrap an iterable in tqdm() to get a progress bar while looping over it.
for i in tqdm(range(10)):
    
    sleep(.1)  # stand-in for 0.1 s of work per iteration

In [None]:
def generator():
    """Yield the integers 0 through 9, one at a time."""
    yield from range(10)

In [None]:
# A generator has no len(), so tqdm is given no expected number of
# iterations here.
for i in tqdm(generator()):
    
    sleep(.1)

In [None]:
# Passing `total` tells tqdm how many iterations to expect.
for i in tqdm(generator(), total=10):
    
    sleep(.1)

In [None]:
# trange(n) is tqdm's shorthand for tqdm(range(n)).
for i in trange(10):
    
    sleep(.1)

In [None]:
# Notebook-widget progress bar.  `tqdm.tqdm_notebook` is deprecated (calling
# it emits a TqdmDeprecationWarning); `tqdm.notebook.tqdm`, imported here as
# `notebook_tqdm`, is the supported replacement.
for i in notebook_tqdm(range(10)):
    
    sleep(.1)  # stand-in for 0.1 s of work per iteration

In [None]:
import pandas as pd

In [None]:
# Register tqdm with pandas; this is what provides the `progress_apply`
# method called on the DataFrame further down.
tqdm.pandas()

In [None]:
# The CSV is decoded as Latin-1.
amazon_fires = pd.read_csv('data/amazon.csv', encoding='latin1')

In [None]:
# Display the first rows of the table.
amazon_fires.head()

In [None]:
# Build one "<Estado> <Mês>" string per row, showing a progress bar meanwhile.
amazon_fires.progress_apply(
    lambda row: '{} {}'.format(row['Estado'], row['Mês']),
    axis='columns',
)

## Timing code

Magic commands or magic functions are one of the important enhancements that IPython offers compared to the standard Python shell. These magic commands are intended to solve common problems in data analysis using Python. In fact, they control the behaviour of IPython itself.

In [None]:
def good_practice(rand_array):
    """Return the elements of ``rand_array`` greater than 50 as a list.

    ``rand_array`` must support elementwise comparison and boolean-mask
    indexing (e.g. a NumPy array); the selection uses no explicit Python
    loop.
    """
    above_fifty = rand_array > 50  # boolean mask, one entry per element
    return list(rand_array[above_fifty])

def bad_practice(rand_list):
    """Return the elements of ``rand_list`` greater than 50, in order.

    Deliberately written as an index-based Python loop: it is the
    loop-and-append counterpart of ``good_practice``.
    """
    selected = []
    for position in range(len(rand_list)):
        value = rand_list[position]
        if value > 50:
            selected.append(value)
    return selected

In [None]:
import numpy as np

# 1000 random integers drawn from [0, 100), held as a NumPy array ...
rand_array = np.random.randint(0, 100, size=1000)
# ... and the same values, element by element, in a plain Python list.
rand_list = list(rand_array)

The magic commands `%time` and `%timeit`:

In [None]:
# %time: run the statement once and report how long it took.
%time a = bad_practice(rand_list)

In [None]:
# %timeit: run the statement repeatedly and report timing statistics.
%timeit a = bad_practice(rand_list)

In [None]:
%time a = good_practice(rand_array)

In [None]:
# -n 100: execute the statement 100 times per timing loop.
%timeit -n 100 a = good_practice(rand_array)

In [None]:
%%time

# Cell-magic form (%%): the whole cell body is timed, not a single statement.
a=bad_practice(rand_list)
b=good_practice(rand_array)

In [None]:
%%timeit

a=bad_practice(rand_list)
b=good_practice(rand_array)

In [None]:
%%timeit -n 100

a=bad_practice(rand_list)
b=good_practice(rand_array)

In [None]:
print(good_practice)

In [None]:
# line_profiler provides the %lprun magic used in the cells below.
# conda install -c anaconda line_profiler 

In [None]:
# %prun: profile the call at function level.
%prun bad_practice(rand_list)

In [None]:
%prun good_practice(rand_array)

In [None]:
%load_ext line_profiler

In [None]:
# NOTE(review): no `-f <function>` is given here, so no function is selected
# for line-by-line profiling; presumably shown for contrast with the cells
# that follow, which pass `-f` — confirm.
%lprun good_practice(rand_array)

In [None]:
 %lprun -f good_practice good_practice(rand_array)

In [None]:
 %lprun -f bad_practice bad_practice(rand_list)

## Memory profiling

In [None]:
# memory_profiler provides the %memit and %mprun magics used in the cells below.
# conda install -c anaconda memory_profiler 

In [None]:
%load_ext memory_profiler

In [None]:
# %memit: measure the memory use of a single statement.
%memit good_practice(rand_array)

In [None]:
%memit bad_practice(rand_list)

In [None]:
%%memit

a=bad_practice(rand_list)
b=good_practice(rand_array)

In [None]:
# NOTE(review): %mprun needs the profiled function to be defined in a file on
# disk, so this call on the notebook-defined function is expected to fail;
# that appears to be why the functions are written to practices.py in the
# next cell — confirm.
%mprun -f good_practice good_practice(rand_array)

In [None]:
%%file practices.py

def good_practice(rand_array):
    """Return the elements of ``rand_array`` greater than 50 as a list.

    ``rand_array`` must support elementwise comparison and boolean-mask
    indexing (e.g. a NumPy array); the selection uses no explicit Python
    loop.
    """
    above_fifty = rand_array > 50  # boolean mask, one entry per element
    return list(rand_array[above_fifty])

def bad_practice(rand_list):
    """Return the elements of ``rand_list`` greater than 50, in order.

    Deliberately written as an index-based Python loop: it is the
    loop-and-append counterpart of ``good_practice``.
    """
    selected = []
    for position in range(len(rand_list)):
        value = rand_list[position]
        if value > 50:
            selected.append(value)
    return selected

In [None]:
from practices import good_practice, bad_practice

In [None]:
# The functions are now the ones imported from practices.py.
# -f <function>: report line-by-line memory usage for that function.
%mprun -f good_practice good_practice(rand_array)

In [None]:
%mprun -f bad_practice bad_practice(rand_list)

How to optimize? AVOID LOOPS AND CONDITIONS!

1. Prefer numpy arrays, pandas apply, itertools and collections.
2. Try list comprehensions.
3. Write better loops.

## Numba

Numba translates Python functions to optimized machine code at runtime using the industry-standard LLVM compiler library. Numba-compiled numerical algorithms in Python can approach the speeds of C or FORTRAN.

You don't need to replace the Python interpreter, run a separate compilation step, or even have a C/C++ compiler installed. Just apply one of the Numba decorators to your Python function, and Numba does the rest. 

In [None]:
from numba import jit

import numpy as np

import random
import numpy.random as rd

In [None]:
@jit(nopython=True)
def numba_monte_carlo_pi(nsamples):
    """Estimate pi by Monte Carlo sampling in a Numba-compiled loop.

    Draws ``nsamples`` points uniformly in the unit square and returns
    four times the fraction that lands inside the unit quarter-circle.
    """
    hits = 0
    for _ in range(nsamples):
        px = random.random()
        py = random.random()
        if px ** 2 + py ** 2 < 1.0:
            hits += 1
    return 4.0 * hits / nsamples

In [None]:
def numpy_monte_carlo_pi(nsamples):
    """Estimate pi by Monte Carlo sampling with vectorized NumPy calls.

    Draws ``nsamples`` x and y coordinates uniformly in [0, 1) and returns
    four times the fraction of points inside the unit quarter-circle.
    """
    xs = rd.uniform(size=nsamples)
    ys = rd.uniform(size=nsamples)

    inside_circle = (xs ** 2 + ys ** 2) < 1.0  # boolean mask, one entry per point
    hits = inside_circle.sum()

    return 4.0 * hits / nsamples

In [None]:
%%time

# First call of the Numba version: the measured time includes JIT compilation.
numba_monte_carlo_pi(100000)

In [None]:
%%time

numpy_monte_carlo_pi(100000)

In [None]:
%%time

# Second call of the Numba version: already compiled, so only execution is timed.
numba_monte_carlo_pi(100000)

## High Performance Computing

In [None]:
import os
# Ask OpenMP-based libraries to use a single thread.
# NOTE(review): NumPy is already imported by earlier cells; libraries that read
# this variable at import time may not pick the change up — confirm it takes
# effect here.
os.environ['OMP_NUM_THREADS'] = '1'

With Numba:

In [None]:
SQRT_2PI = np.sqrt(2 * np.pi)

@jit(nopython=True, parallel=True)
def gaussians(x, means, widths):
    """Evaluate a set of Gaussian kernels at a single location.

    x - location of evaluation
    means - array of kernel means
    widths - array of kernel widths

    Returns one value per kernel: the normal density with that kernel's
    mean and width evaluated at ``x``, divided by the number of kernels.
    """
    n_kernels = means.shape[0]
    z = (x - means) / widths  # standardized distance from each kernel mean
    density = np.exp(-0.5 * z**2) / widths
    return density / SQRT_2PI / n_kernels

In [None]:
# One million kernels: means drawn from [-1, 1), widths from [0.1, 0.3).
means = np.random.uniform(-1, 1, size=1000000)
widths = np.random.uniform(0.1, 0.3, size=1000000)

# First call of the jitted function (presumably also serves to trigger compilation).
gaussians(0.4, means, widths)

In [None]:
# Compile the undecorated Python function (py_func) again, this time without
# parallel=True, to compare against the parallel build.
gaussians_nothread = jit(nopython=True)(gaussians.py_func)

%timeit gaussians_nothread(0.4, means, widths)
%timeit gaussians(0.4, means, widths)
%timeit gaussians.py_func(0.4, means, widths) # compare to pure NumPy

In [None]:
from numba import prange

In [None]:
# Serial version
@jit(nopython=True)
def monte_carlo_pi_serial(nsamples):
    """Estimate pi from ``nsamples`` random points using a plain ``range`` loop.

    Returns four times the fraction of points, drawn uniformly in the unit
    square, that fall inside the unit quarter-circle.
    """
    hits = 0
    for _ in range(nsamples):
        px = random.random()
        py = random.random()
        if px**2 + py**2 < 1.0:
            hits += 1
    return 4.0 * hits / nsamples

# Parallel version
@jit(nopython=True, parallel=True)
def monte_carlo_pi_parallel(nsamples):
    """Estimate pi from ``nsamples`` random points using a ``prange`` loop.

    Same computation as the serial version; the loop iterates over
    ``prange`` and the function is compiled with ``parallel=True``.
    """
    hits = 0
    # prange instead of range is the only difference from the serial loop
    for _ in prange(nsamples):
        px = random.random()
        py = random.random()
        if px**2 + py**2 < 1.0:
            hits += 1
    return 4.0 * hits / nsamples

In [None]:
# Time both versions on 4e8 samples each.
# NOTE(review): these look like the first calls of both functions, so the
# timings presumably include JIT compilation — confirm.
%time monte_carlo_pi_serial(int(4e8))
%time monte_carlo_pi_parallel(int(4e8))

### External Multithreading

Sometimes your threading system is external to Numba entirely. You might be using concurrent.futures to run functions in multiple threads, or a parallel framework like Dask. For these situations, you do not want to use ParallelAccelerator, but do want to allow the Numba-compiled function to run concurrently in different threads.

To do this, you want the Numba function to release the Global Interpreter Lock (GIL) during execution. This can be done using the nogil=True option to @jit.

Let's do our Monte Carlo example again, but with Dask. Note that Numba will still handle initializing separate random number generator seeds on each thread, as it did with ParallelAccelerator.

In [None]:
import dask
import dask.delayed

@jit(nopython=True, nogil=True)
def monte_carlo_pi(nsamples):
    """Estimate pi from ``nsamples`` random points; compiled with ``nogil=True``.

    Returns four times the fraction of points, drawn uniformly in the unit
    square, that fall inside the unit quarter-circle.
    """
    hits = 0
    for _ in range(nsamples):
        px = random.random()
        py = random.random()
        if px**2 + py**2 < 1.0:
            hits += 1
    return 4.0 * hits / nsamples

# Quick check with 1e6 samples.
print(monte_carlo_pi(int(1e6)))

# Wrap the function with dask.delayed so that calling it builds a lazy task.
delayed_monte_carlo_pi = dask.delayed(monte_carlo_pi)

In [None]:
%%time
# Four tasks of 4e8 samples each, computed together by Dask.
futures = [delayed_monte_carlo_pi(int(4e8)) for i in range(4)]
results = dask.compute(futures)[0]

# Average of the four estimates.
np.sum(results)/4

In [None]:
%%time
# Same four tasks, but with num_workers=1, for comparison with the cell above.
futures = [delayed_monte_carlo_pi(int(4e8)) for i in range(4)]
results = dask.compute(futures, num_workers=1)[0]

np.sum(results)/4

### Pymp

In [None]:
import pymp

In [None]:
a = []  # plain Python list

# Run the loop inside a pymp parallel region of 4; p.range(40) supplies the indices.
# NOTE(review): pymp workers are presumably forked processes, so appends to a
# plain list would not be shared between them — compare with the
# pymp.shared.list() cell further down; confirm.
with pymp.Parallel(4) as p:
    
    for i in p.range(40):
        
        a.append(i**4)

In [None]:
a

In [None]:
r = []  # plain Python list again

# Same pattern, iterating over the elements of `a` with p.iterate.
with pymp.Parallel(4) as p:
    
    for i in p.iterate(a):
        
        r.append(i**4)

In [None]:
r

In [None]:
# This time the results go into a pymp shared list instead of a plain list.
r = pymp.shared.list()

with pymp.Parallel(4) as p:
    
    for i in p.iterate(a):
        
        r.append(i**4)

In [None]:
r

In [None]:
# Convert the shared list to a regular list for display.
list(r)