In [1]:
import time

# regular (no threads)

def f(name, n):
    """Print a countdown for `name` from n down to 1, one line per second.

    Each line shows `name` followed by the remaining count; the function
    sleeps for one second after every line it prints.
    """
    for remaining in range(n, 0, -1):
        print(name, remaining)
        time.sleep(1)

# run the two countdowns back to back: B only starts once A has finished
f(name="A", n=3)
f(name="B", n=5)

A 3
A 2
A 1
B 5
B 4
B 3
B 2
B 1


In [5]:
# with threads

from threading import Thread

# one thread per countdown; the keyword arguments are forwarded to f
t1 = Thread(target=f, kwargs={"name": "A", "n": 3})
t2 = Thread(target=f, kwargs={"name": "B", "n": 5})

# start every thread before waiting on any of them so the countdowns overlap
workers = [t1, t2]
for worker in workers:
    worker.start()
for worker in workers:
    worker.join()

print("done")

A 3
B 5
A 2
B 4
A 1
B 3
B 2
B 1
done


In [7]:
def double(x):
    """Return x multiplied by 2."""
    doubled = x * 2
    return doubled

# double each value one at a time and print it as soon as it is produced
for result in (double(value) for value in [7, 8, 9]):
    print(result)

14
16
18


In [8]:
# collect all of the doubled values into a list at once
[double(value) for value in [7, 8, 9]]

[14, 16, 18]

In [10]:
from multiprocessing import Pool

In [11]:
# NOTE: this is probably a bad idea -- it is much slower to create a process
# than it is to multiply a number by 2
with Pool() as pool:
    results = pool.map(double, [7, 8, 9])
results

[14, 16, 18]

In [13]:
import requests

def get_page(page_num, timeout=10):
    """Download one practice page and return its HTML as a string.

    Parameters:
        page_num: number of the page to fetch; it is substituted into the URL.
        timeout: seconds to wait for the server before giving up (passed
            straight through to requests.get).

    Returns:
        The body of the response as text.

    Raises:
        requests.HTTPError: if the server answers with an error status
            (for example 404 for a page number that does not exist).
        requests.Timeout: if the server does not respond within `timeout`.
    """
    url = f"https://tyler.caraza-harter.com/cs320/crawl/practice7/{page_num}.html"
    # without a timeout, requests waits indefinitely, so one stalled request
    # would hang the caller (or a whole Pool worker) forever
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    return r.text

# quick check: fetch a single page and show its raw HTML
get_page(page_num=1)

'<html><body>\n<h1>Node: 1</h1>\n<h3>Children:</h3>\n<ul>\n<li><a href="0.html">0</a><li><a href="2.html">2</a></ul>\n</body></html>\n'

In [17]:
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed time; time.time() can jump if the system clock is adjusted
t0 = time.perf_counter()
with Pool() as p:  # no argument: pool size is chosen from the machine's CPU count
    results = p.map(get_page, range(18))
t1 = time.perf_counter()
print((t1-t0) * 1000) # in ms, how long did it take? (includes pool startup/shutdown)

1252.6209354400635


In [19]:
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed time; time.time() can jump if the system clock is adjusted
t0 = time.perf_counter()
with Pool(1) as p:  # a single worker: the 18 pages are fetched one after another
    results = p.map(get_page, range(18))
t1 = time.perf_counter()
print((t1-t0) * 1000) # in ms, how long did it take? (includes pool startup/shutdown)

2301.382541656494


In [20]:
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed time; time.time() can jump if the system clock is adjusted
t0 = time.perf_counter()
with Pool(8) as p:  # 8 workers share the 18 page downloads
    results = p.map(get_page, range(18))
t1 = time.perf_counter()
print((t1-t0) * 1000) # in ms, how long did it take? (includes pool startup/shutdown)

974.8213291168213


In [21]:
# bug 1: if there is an exception in one of our processes, p.map re-raises it
# here in the parent process and `results` is not assigned.
# The error is caught and displayed so that this demonstration does not stop
# a "Restart & Run All" of the notebook.
try:
    with Pool(8) as p:
        results = p.map(get_page, range(20)) # we don't actually have this many pages
except requests.HTTPError as e:
    print(f"HTTPError: {e}")

HTTPError: 404 Client Error: Not Found for url: https://tyler.caraza-harter.com/cs320/crawl/practice7/18.html

In [26]:
import os
# PID of the process that executes this cell
os.getpid() # the unique process ID for the current process

83200

In [27]:
# module-level list that get_page appends each response's status code to
status_codes = []

def get_page(page_num, timeout=10):
    """Download one practice page, recording its status code in status_codes.

    This version exists to demonstrate "bug 2": it appends to the global
    status_codes list and prints the current process ID next to that list,
    so the output shows which process saw which appends.

    Parameters:
        page_num: number of the page to fetch; it is substituted into the URL.
        timeout: seconds to wait for the server before giving up (passed
            straight through to requests.get).

    Returns:
        The body of the response as text.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not respond within `timeout`.
    """
    url = f"https://tyler.caraza-harter.com/cs320/crawl/practice7/{page_num}.html"
    # without a timeout, a stalled server would block this worker forever
    r = requests.get(url, timeout=timeout)
    # bug 2: this append only changes the status_codes list that belongs to the
    # process running this call; the list in the parent process is not updated
    status_codes.append(r.status_code)
    print(os.getpid(), status_codes)
    r.raise_for_status()
    # ADVICE: return everything you need when using a process in a pool
    # (global variables aren't shared between processes)
    return r.text

In [28]:
# run the instrumented get_page in 8 worker processes; every call prints the
# worker's PID together with that worker's own status_codes list
with Pool(processes=8) as pool:
    results = pool.map(get_page, range(18))

83758 [200]
83764 [200]83757
837608376183759 [200] 83762  [200][200]
 83763
[200]
[200] 
[200]

83758 [200, 200]
8376283757 8375983764 [200, 200] [200, 200]83760
83761[200, 200]
 83763  [200, 200][200, 200][200, 200]

 
[200, 200]

83758 [200, 200, 200]
83762 [200, 200, 200]


In [29]:
# back in the parent process: status_codes is still empty here, because the
# appends happened inside the worker processes and globals are not shared
print(os.getpid(), status_codes)

83200 []
