In [1]:
import numpy as np # numerical library
import scipy as sp # scientific library (statistics, least-square solver, etc)
import pandas as pd # simplifies handling with datasets, handling is similar to matlab or R
import matplotlib as mpl # plot library
import matplotlib.pyplot as plt # plot commands
import seaborn as sns # helper library for more advanced charts

%matplotlib inline
%config InlineBackend.figure_formats = {'retina', 'png'}

# Additional smaller topics

## Speed up repeatedly used functions

In [2]:
# joblib.Memory memoizes function results. Only deterministic functions
# (same arguments -> same result) should be cached this way.
import joblib

memory = joblib.Memory(
    './cachedir',
    compress=0,
    verbose=1,
)

In [3]:
import time

def long_running(x):
    """Return ``x`` squared after an artificial two-second pause.

    The pause stands in for an expensive computation so that the effect of
    caching can be measured.
    """
    time.sleep(2)  # simulated workload
    squared = pow(x, 2)
    return squared

@memory.cache
def cached_long_running(x):
    """Return ``x`` squared after a two-second pause, memoized via ``memory``.

    The body matches ``long_running``; the ``memory.cache`` decorator lets
    repeated calls with the same ``x`` be answered from the joblib cache
    instead of running the body again.
    """
    time.sleep(2)  # simulated workload
    squared = pow(x, 2)
    return squared

In [4]:
# Uncomment the next line to time the uncached version for comparison
# (every call sleeps for 2 seconds, so the measurement takes a while).
#%timeit long_running(12)
# Time the memoized version with the same argument.
%timeit cached_long_running(12)

476 µs ± 79.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## Simple multiprocess computation

In [5]:
import multiprocessing as mp

# Sample input: every third integer in [0, 50).
data = [value for value in range(0, 50, 3)]
print(data)

def processor(x):
    """Return the cube of ``x``; this is the function mapped over the pool."""
    cubed = pow(x, 3)
    return cubed

# Apply `processor` to every element of `data` using a pool of worker
# processes (created without an explicit worker count); the pool is shut
# down when the `with` block is left.
# Note: `data` is rebound here from the input list to the list of results.
# NOTE(review): on platforms where multiprocessing starts workers with
# "spawn", functions defined inside a notebook may not be importable by the
# workers -- confirm this cell runs on the target platform.
with mp.Pool() as pool:
    data = pool.map(processor, data)

print(data)

[0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48]
[0, 27, 216, 729, 1728, 3375, 5832, 9261, 13824, 19683, 27000, 35937, 46656, 59319, 74088, 91125, 110592]


## Computation on a server / cluster

In [6]:
import ipyparallel as ipp

In [7]:
# Create a client for the ipyparallel cluster.
# NOTE(review): presumably needs a cluster that is already running (e.g.
# started with `ipcluster start`) -- confirm the setup for this environment.
rc = ipp.Client()

In [8]:
# Show the ids of the engines this client can see.
rc.ids

[0, 1, 2, 3, 4, 5, 6, 7]

In [9]:
# Slice the client to obtain a view (`dview`) over all of its engines.
dview = rc[:]

In [10]:
# Reference result: compute x**10 for x in 0..31 serially in the local kernel.
serial_result = [x**10 for x in range(32)]

In [11]:
# Run the same computation on the "cluster": map_sync applies the function to
# the inputs via the engines behind `dview` and returns the collected results.
parallel_result = dview.map_sync(lambda x: x**10, range(32))

In [12]:
# Sanity check: the result from the engines should equal the locally computed
# list (same values in the same order).
serial_result == parallel_result

True