# Lab: threading

Write a program to fetch photos from the site <https://jsonplaceholder.typicode.com/>. 

Try fetching with different levels of parallelism (number of threads).

Use the following to get started:

In [1]:
import requests

def get_photos():
    """Return the ``url`` value of every entry in the JSONPlaceholder photo listing.

    Performs a single GET of ``https://jsonplaceholder.typicode.com/photos``,
    decodes the JSON body and collects the ``'url'`` key of each entry, in the
    order the entries appear in the response.

    Returns:
        list: the ``'url'`` value of each entry.

    Raises:
        requests.HTTPError: if the listing request returns a 4xx/5xx status.
    """
    resp = requests.get('https://jsonplaceholder.typicode.com/photos')
    # Fail with a clear HTTP error (as fetch_photo does) instead of a confusing
    # JSON-decoding / KeyError / TypeError when the server sends an error body.
    resp.raise_for_status()
    return [obj['url'] for obj in resp.json()]

def fetch_photo(url, timeout=30):
    """Download a single photo and return the HTTP response.

    Args:
        url: address to GET.
        timeout: value forwarded to ``requests.get(timeout=...)``, in seconds.
            Pass ``None`` to wait indefinitely, which was the behaviour before
            this parameter existed.

    Returns:
        The ``requests`` response object of a successful request.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not respond within ``timeout``.
    """
    # Without a timeout a stalled connection blocks the calling thread/process
    # forever, and anything waiting for its result never finishes.
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    return resp

In [2]:
%%time
# Serial version (1 thread)

photos = get_photos()
# Baseline: download the first 12 photos (indices 0-11) one after another.
for i, url in enumerate(photos[:12]):
    r = fetch_photo(url)


CPU times: user 466 ms, sys: 62.1 ms, total: 528 ms
Wall time: 2.3 s


Write a version to fetch with a specified number of threads

In [3]:
import threading
import queue

def worker(qjob, qresult):
    """Thread body: fetch URLs taken from ``qjob`` until a ``None`` sentinel arrives.

    For every job taken from ``qjob`` exactly one tuple is put on ``qresult``:
    ``(response, None)`` when ``fetch_photo`` succeeds, or ``(None, exception)``
    when it raises. Receiving ``None`` from ``qjob`` ends the loop.

    Args:
        qjob: queue of URLs to fetch, terminated by ``None``.
        qresult: queue that receives one ``(response, error)`` tuple per job.
    """
    url = qjob.get()
    while url is not None:
        try:
            outcome = (fetch_photo(url), None)
        except Exception as exc:
            # Report the failure instead of killing the thread, so the
            # consumer still receives one result per submitted job.
            outcome = (None, exc)
        qresult.put(outcome)
        url = qjob.get()

In [4]:
def thread_version(photos, nthread):
    """Fetch every URL in ``photos`` using ``nthread`` worker threads.

    Starts ``nthread`` threads running ``worker``, hands them the URLs through
    a job queue, waits for one ``(response, error)`` result per URL, then shuts
    the threads down with one ``None`` sentinel each. Prints a summary line
    with the number of URLs and the number of failed fetches.

    Args:
        photos: sized collection of URLs (``len(photos)`` is used).
        nthread: number of worker threads to start; must be at least 1.

    Raises:
        ValueError: if ``nthread`` is smaller than 1. With no workers nothing
            would ever arrive on the result queue and the call would block
            forever.
    """
    if nthread < 1:
        raise ValueError(f'nthread must be at least 1, got {nthread}')

    qjob = queue.Queue()
    qresult = queue.Queue()
    threads = [
        threading.Thread(target=worker, args=(qjob, qresult))
        for _ in range(nthread)
    ]
    started = []  # only threads that actually started may be signalled/joined
    num_err = 0
    try:
        for t in threads:
            t.start()
            started.append(t)
        for url in photos:
            qjob.put(url)
        for _ in range(len(photos)):
            _resp, err = qresult.get()
            if err is not None:
                num_err += 1
    except BaseException:
        # Interrupted (e.g. KeyboardInterrupt) or a thread failed to start:
        # drop the jobs nobody has picked up yet so the workers reach their
        # sentinel promptly instead of downloading the whole backlog first.
        while True:
            try:
                qjob.get_nowait()
            except queue.Empty:
                break
        raise
    finally:
        # One sentinel per running worker, then wait for all of them, so no
        # thread is left blocked on the job queue after this call returns.
        for _ in started:
            qjob.put(None)
        for t in started:
            t.join()
    print(f'Fetched {len(photos)} with {num_err} errors')

Use the `%%time` Jupyter magic to see how long it takes to fetch 500 photos with 10, 100, and 500 threads (with only 500 photos, any threads beyond 500 would have nothing to fetch).

In [5]:
%%time
# Queue-based version: 10 worker threads share the first 500 photo URLs.
thread_version(photos[:500], 10)

Fetched 500 with 0 errors
CPU times: user 18.8 s, sys: 2.13 s, total: 20.9 s
Wall time: 34.7 s


In [6]:
%%time
# Queue-based version: 100 worker threads share the first 500 photo URLs.
thread_version(photos[:500], 100)

Fetched 500 with 0 errors
CPU times: user 7.05 s, sys: 1.79 s, total: 8.85 s
Wall time: 34.2 s


In [7]:
%%time
# Queue-based version: 500 worker threads for the first 500 photo URLs.
thread_version(photos[:500], 500)

Fetched 500 with 0 errors
CPU times: user 13.1 s, sys: 6.13 s, total: 19.2 s
Wall time: 8.3 s


Alternatively, use `multiprocessing.pool.ThreadPool`

In [6]:
import multiprocessing.pool

In [7]:
%%time
# Standard-library thread pool with 10 worker threads over the first 500 URLs.
with multiprocessing.pool.ThreadPool(processes=10) as pool:
    # imap_unordered hands back each response as soon as it is ready; the
    # responses are only consumed here, not stored.
    for r in pool.imap_unordered(fetch_photo, photos[:500]):
        pass

CPU times: user 19.9 s, sys: 2.45 s, total: 22.3 s
Wall time: 17.5 s


In [8]:
%%time
# Standard-library thread pool with 100 worker threads over the first 500 URLs.
with multiprocessing.pool.ThreadPool(processes=100) as pool:
    # Responses are consumed in completion order and then discarded.
    for r in pool.imap_unordered(fetch_photo, photos[:500]):
        pass

CPU times: user 17.8 s, sys: 3.44 s, total: 21.3 s
Wall time: 34.5 s


In [9]:
%%time
# Standard-library thread pool with 500 worker threads over the first 500 URLs.
with multiprocessing.pool.ThreadPool(processes=500) as pool:
    # Responses are consumed in completion order and then discarded.
    for r in pool.imap_unordered(fetch_photo, photos[:500]):
        pass

CPU times: user 19.2 s, sys: 3.85 s, total: 23 s
Wall time: 18.2 s


# Lab: multiprocessing

Using a `multiprocessing.Pool`, repeat the exercise above and see if it became faster or slower. Any hypotheses why?

In [10]:
%%time
# Process-based pool: 10 worker processes run fetch_photo over the first 500 URLs.
# NOTE(review): fetch_photo is defined in the notebook itself; worker processes
# can presumably only resolve it under the 'fork' start method - confirm on
# macOS/Windows.
with multiprocessing.Pool(processes=10) as pool:
    # map blocks until every call has finished; the returned list is discarded.
    pool.map(fetch_photo, photos[:500])

CPU times: user 372 ms, sys: 174 ms, total: 546 ms
Wall time: 35.8 s


In [11]:
%%time
# Process-based pool: 20 worker processes over the first 500 URLs.
with multiprocessing.Pool(processes=20) as pool:
    # map blocks until every call has finished; the returned list is discarded.
    pool.map(fetch_photo, photos[:500])

CPU times: user 500 ms, sys: 222 ms, total: 722 ms
Wall time: 34.1 s


In [12]:
%%time
# Process-based pool: 50 worker processes over the first 500 URLs.
with multiprocessing.Pool(processes=50) as pool:
    # map blocks until every call has finished; the returned list is discarded.
    pool.map(fetch_photo, photos[:500])

CPU times: user 238 ms, sys: 276 ms, total: 514 ms
Wall time: 6.39 s


In [13]:
%%time
# Process-based pool: 500 worker processes over the first 500 URLs.
# In the recorded run this many simultaneous connections ended with the server
# resetting a connection. The error is part of the result, so it is caught and
# displayed rather than left to abort a Restart & Run All.
try:
    with multiprocessing.Pool(processes=500) as pool:
        # map re-raises in the parent the first exception raised by a worker.
        pool.map(fetch_photo, photos[:500])
except requests.ConnectionError as err:
    print(f'{type(err).__name__}: {err}')

ConnectionError: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))