# Batch Processing with `joblib`

In [23]:
from math import ceil
import random
import datetime
from joblib import Parallel, delayed
from typing import Callable, Dict, List, Union
from multiprocessing import Queue

import tqdm as tqdm_
from tqdm.auto import tqdm

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm_batch import batch_process

sns.set_context('poster')

![img](https://res.cloudinary.com/hevo/images/f_auto,q_auto/v1649315584/hevo-learn/Batch-Processing-Batch-Processing-vs-Stream-Processing/Batch-Processing-Batch-Processing-vs-Stream-Processing.png?_i=AA)

(Source: https://hevodata.com/learn/batch-processing/).

Batch processing is to be contrasted with serial or *stream* processing. Stream processing is critical when you need real-time updating of data reports or analyses. But if you are processing large chunks of data, it can be better to process it in batches.

Batch processing works in an **automated** way based on a **scheduler**.

## Some Advantages of `joblib`:

- Disk Caching of Functions & Lazy Re-Evaluation

Separate flow-execution logic from algorithmic logic and **memoize** to speed up computations. That is, cache the results of expensive function calls for later use.

- Parallel Computing

Simple and easy to debug.

- Fast Storage / Compression

Better than `pickle` for large objects.

(Source: https://hevodata.com/learn/python-batch-processing/.)

## `tqdm`

In [24]:
num = 0
for j in tqdm_.tqdm(range(10000000)):
    num += j
    if not j % 1000000:
        print(num)

  6%|███▏                                                   | 584478/10000000 [00:00<00:03, 3036507.66it/s]

0


 18%|█████████▌                                            | 1762833/10000000 [00:00<00:02, 3750532.22it/s]

500000500000


 25%|█████████████▋                                        | 2534880/10000000 [00:00<00:01, 3742813.29it/s]

2000001000000


 36%|███████████████████▋                                  | 3638471/10000000 [00:01<00:01, 3572114.70it/s]

4500001500000


 47%|█████████████████████████▌                            | 4743415/10000000 [00:01<00:01, 3661200.73it/s]

8000002000000


 55%|█████████████████████████████▌                        | 5476357/10000000 [00:01<00:01, 3414897.42it/s]

12500002500000


 67%|████████████████████████████████████                  | 6670520/10000000 [00:01<00:00, 3797222.62it/s]

18000003000000


 79%|██████████████████████████████████████████▋           | 7898452/10000000 [00:02<00:00, 3987113.62it/s]

24500003500000


 87%|███████████████████████████████████████████████       | 8709940/10000000 [00:02<00:00, 4023286.34it/s]

32000004000000


 99%|█████████████████████████████████████████████████████▋| 9940042/10000000 [00:02<00:00, 4080300.02it/s]

40500004500000


100%|█████████████████████████████████████████████████████| 10000000/10000000 [00:02<00:00, 3756096.72it/s]


## Contrasting Serial and Batch Processing

The function below is based on the following mathematical theorem:

$\large\frac{\pi}{4} = 1 - \frac{1}{3} + \frac{1}{5} - \frac{1}{7} + \frac{1}{9} - ... = lim_{n\rightarrow\infty}\sum^n_{j=0}\frac{(-1)^j}{2j+1}$

In [25]:
def batch_process_function(row, order, payload):
    """
    Simulate process function
    
    Row and payload are ignored.
    
    Approximate pi
    """
    k, pi = 1, 0
    for i in range(10**order):
        if i % 2 == 0: # even
            pi += 4 / k
        else:  # odd 
            pi -= 4 / k 
        k += 2
    return pi

In [26]:
# Settings
order=6
N = 1_000
items = range(N)

### Serial

In [15]:
result = [batch_process_function(row, order, None) for row in items]

In [16]:
result[0]

3.1415916535897743

In [10]:
%%time

# Serial run
result = [batch_process_function(row, order, None) for row in items]

CPU times: user 1min 57s, sys: 558 ms, total: 1min 57s
Wall time: 1min 59s


### Batch

In [17]:
result = joblib.Parallel(n_jobs=8)(
    joblib.delayed(batch_process_function)
    (row, order, None)
    for row in tqdm_.tqdm(items)
)

100%|██████████████████████████████████████████████████████████████████| 1000/1000 [00:23<00:00, 42.85it/s]


In [27]:
result[0]

3.1415916535897743

In [18]:
%%time

# Parallel using joblib and a progress bar using tqdm
result = joblib.Parallel(n_jobs=8)(
    joblib.delayed(batch_process_function)
    (row, order, None) 
    for row in tqdm_.tqdm(items)
)

100%|██████████████████████████████████████████████████████████████████| 1000/1000 [00:22<00:00, 45.03it/s]


CPU times: user 1.1 s, sys: 179 ms, total: 1.28 s
Wall time: 22.5 s


## Things to Be Aware of

- Batch Triggers
- Scheduling
- Exception Alerts

## Serialize per Batch

(Source: https://towardsdatascience.com/parallel-batch-processing-in-python-8dcce607d226)

In [27]:
n_workers = 8

# Create a batch function
def proc_batch(batch, order, matrix):
    return [
        batch_process_function(row, order, matrix)
        for row in batch
    ]

# Divide data in batches
batch_size = np.ceil(len(items) / n_workers)
batches = [
    items[ix:ix+int(batch_size)] for ix in range(0, len(items), int(batch_size))
]

# divide the work
result = joblib.Parallel(n_jobs=n_workers)(
    joblib.delayed(proc_batch)
    (batch, order, None) 
    for batch in tqdm.tqdm(batches)
)



100%|██████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 2200.29it/s][A[A


In [16]:
def progress_bar(
    totals: Union[int, List[int]],
    queue : Queue,
) -> None:
    if isinstance(totals, list):
        splitted = True
        pbars = [
            tqdm(
                desc=f'Worker {pid + 1}',
                total=total,
                position=pid,
            )
            for pid, total in enumerate(totals)
        ]
    else:
        splitted = False
        pbars = [
            tqdm(total=totals)
        ]

    while True:
        try:
            message = queue.get()
            if message.startswith('update'):
                if splitted:
                    pid = int(message[6:])
                    pbars[pid].update(1)
                else:
                    pbars[0].update(1)
            elif message == 'done':
                break
        except:
            pass
    for pbar in pbars:
        pbar.close()

        
def task_wrapper(pid, function, batch, queue, *args, **kwargs):
    result = []
    for example in batch:
        result.append(function(example, *args, **kwargs))
        queue.put(f'update{pid}')
    return result

        
def batch_process(
    items: list,
    function: Callable,
    n_workers: int=8,
    sep_progress: bool=False,
    *args,
    **kwargs,
    ) -> List[Dict[str, Union[str, List[str]]]]:
    # Divide data in batches
    batch_size = ceil(len(items) / n_workers)
    batches = [
        items[ix:ix+batch_size]
        for ix in range(0, len(items), batch_size)
    ]

    # Check single or multiple progress bars
    if sep_progress:
        totals = [len(batch) for batch in batches]
    else:
        totals = len(items)

    # Start progress bar in separate thread
    manager = Manager()
    queue = manager.Queue()
    progproc = Thread(target=progress_bar, args=(totals, queue))
    progproc.start()

    # Parallel process the batches
    result = Parallel(n_jobs=n_workers)(
        delayed(task_wrapper)
        (pid, function, batch, queue, *args, **kwargs)
        for pid, batch in enumerate(batches)
    )

    # Stop the progress bar thread
    queue.put('done')
    progproc.join()

    # Flatten result
    flattened = [item for sublist in result for item in sublist]

    return flattened

In [27]:
result = batch_process(items,
                       batch_process_function,
                       order=6,
                       n_workers=8,
                       payload=matrix,
                       sep_progress=True
                      )

NameError: name 'matrix' is not defined