# Threads

In [5]:
import time
from threading import Thread

def f(name, n, delay=1):
    """Count down from ``n`` to 1, printing ``name`` next to each value.

    Args:
        name: Label printed in front of every countdown value.
        n: Number of steps; the values n, n-1, ..., 1 are printed in turn.
        delay: Seconds to sleep after each print. Defaults to 1, the pause
            that used to be hard-coded.
    """
    for i in range(n):
        print(name, n-i)
        # Pause between steps; while this thread sleeps, other threads can run.
        time.sleep(delay)

# Sequential alternative, kept for comparison (the calls run one after the other):
# f("A", 3)
# f("B", 5)

# Concurrent version: one thread per countdown.
t1 = Thread(target=f, args=("A", 3))
t2 = Thread(target=f, args=("B", 5))
for worker in (t1, t2):
    worker.start()
for worker in (t1, t2):
    worker.join()  # block until that countdown has finished

A 3
Bhi
 5
A 2
B 4
A 1
B 3
B 2
B 1


# Processes

How can we apply the same computation to every item in a list?

## 1. For Loop, Append

In [6]:
# Explicit accumulation: start with an empty list, append one doubled value per item.
nums = [1, 2, 3]
doubles = []
for num in nums:
    doubled = num * 2
    doubles.append(doubled)
doubles

[2, 4, 6]

## 2. List Comprehension

In [7]:
# Doubling every item of nums in a single list comprehension.
[num * 2 for num in nums]

[2, 4, 6]

## 3. Series.apply with lambda

In [9]:
import pandas as pd
# Wrap the list in a Series so the lambda can be applied to each element.
s = pd.Series(nums)
s.apply(lambda value: value * 2)

0    2
1    4
2    6
dtype: int64

## 4. map with lambda

In [11]:
# map() returns a lazy iterator, so list() is needed to see the doubled values.
list(map(lambda value: value * 2, nums))

[2, 4, 6]

## 5. map with named function

In [20]:
from time import time

#my_func = lambda my_param: my_param*2

def my_func(my_param):
    """Return ``my_param`` multiplied by 2."""
    doubled = my_param * 2
    return doubled

# Time one pass of the built-in map over nums.
t0 = time()
list(map(my_func, nums))
t1 = time()

1000 * (t1 - t0)  # elapsed wall-clock time in milliseconds

0.04506111145019531

## 6. pool map

In [22]:
import os
# Number of logical CPUs available on this machine.
os.cpu_count()

2

In [27]:
from multiprocessing import Pool

# Same doubling as before, but the calls are handed to a pool of worker processes.
# NOTE(review): 100 workers is far more than the CPU count reported by
# os.cpu_count() -- confirm the size is intentional.
with Pool(processes=100) as pool:
    t0 = time()
    results = pool.map(my_func, nums)  # Pool.map already returns a list
    t1 = time()

print(results)

1000 * (t1 - t0)  # elapsed time in milliseconds (pool start-up not included)

[2, 4, 6]


16.08872413635254

# Parallel Download

In [29]:
import requests
# Fetch one page of the practice site to see what the HTML looks like.
url = "https://tyler.caraza-harter.com/cs320/crawl/practice7/0.html"
# Always pass a timeout: without one, requests can wait forever on a stalled server.
r = requests.get(url, timeout=10)
r.raise_for_status()  # raise an HTTPError for 4xx/5xx responses
r.text

'<html><body>\n<h1>Node: 0</h1>\n<h3>Children:</h3>\n<ul>\n<li><a href="16.html">16</a><li><a href="17.html">17</a><li><a href="6.html">6</a><li><a href="12.html">12</a></ul>\n</body></html>\n'

In [31]:
def get_page(page_num, timeout=10):
    """Download one page of the practice7 crawl site and return its HTML.

    Args:
        page_num: Page number; substituted into the ``{page_num}.html`` URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller (and, when
            used with ``Pool.map``, its worker process) indefinitely.

    Returns:
        The response body as text.

    Raises:
        requests.HTTPError: if the server answers with an error status,
            e.g. 404 for a page that does not exist.
    """
    url = f"https://tyler.caraza-harter.com/cs320/crawl/practice7/{page_num}.html"
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    return r.text

# Quick check of the helper on a single page.
get_page(1)

'<html><body>\n<h1>Node: 1</h1>\n<h3>Children:</h3>\n<ul>\n<li><a href="0.html">0</a><li><a href="2.html">2</a></ul>\n</body></html>\n'

In [35]:
# Fetch pages 0-17 with ten worker processes and time the whole batch.
with Pool(processes=10) as p:
    t0 = time()
    results = p.map(get_page, range(18))
    t1 = time()

print(t1 - t0)  # elapsed time in seconds
results

0.25068235397338867


['<html><body>\n<h1>Node: 0</h1>\n<h3>Children:</h3>\n<ul>\n<li><a href="16.html">16</a><li><a href="17.html">17</a><li><a href="6.html">6</a><li><a href="12.html">12</a></ul>\n</body></html>\n',
 '<html><body>\n<h1>Node: 1</h1>\n<h3>Children:</h3>\n<ul>\n<li><a href="0.html">0</a><li><a href="2.html">2</a></ul>\n</body></html>\n',
 '<html><body>\n<h1>Node: 2</h1>\n<h3>Children:</h3>\n<ul>\n<li><a href="0.html">0</a><li><a href="1.html">1</a></ul>\n</body></html>\n',
 '<html><body>\n<h1>Node: 3</h1>\n<h3>Children:</h3>\n<ul>\n<li><a href="2.html">2</a><li><a href="4.html">4</a></ul>\n</body></html>\n',
 '<html><body>\n<h1>Node: 4</h1>\n<h3>Children:</h3>\n<ul>\n<li><a href="2.html">2</a><li><a href="3.html">3</a></ul>\n</body></html>\n',
 '<html><body>\n<h1>Node: 5</h1>\n<h3>Children:</h3>\n<ul>\n<li><a href="4.html">4</a><li><a href="6.html">6</a></ul>\n</body></html>\n',
 '<html><body>\n<h1>Node: 6</h1>\n<h3>Children:</h3>\n<ul>\n<li><a href="4.html">4</a><li><a href="5.html">5</a><l

# Bug 1: Child Crashes -- What Happens to the Parent?

In [36]:
# The range now includes a page that does not exist (19, per the recorded
# output), so get_page raises HTTPError inside a worker process. Pool.map
# re-raises that exception here in the parent. It is caught and printed so the
# demonstration is visible without aborting a Restart & Run All.
results = None  # avoid showing stale results from the previous cell on failure
try:
    with Pool(10) as p:
        t0 = time()
        results = p.map(get_page, range(0, 20))
        t1 = time()
except requests.HTTPError as err:
    print(f"{type(err).__name__}: {err}")
else:
    print(t1-t0) # time in seconds
results

HTTPError: 404 Client Error: Not Found for url: https://tyler.caraza-harter.com/cs320/crawl/practice7/19.html

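# Bug 2: Global Variables

"Global" variables are only global within a single process: each worker process gets its own copy, so changes made in a child are not visible in the parent.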

In [46]:
os.getpid() # process ID of the process running this cell

1322

In [47]:
all_pages = {} # page number => HTML for that page

def get_page_v2(page_num, timeout=10):
    """Download one page and record it in the module-level ``all_pages`` dict.

    This deliberately keeps the design this section demonstrates: the result
    is stored in a global instead of being returned. When the function runs in
    Pool worker processes, each worker updates its own copy of ``all_pages``,
    so the dict in the parent process stays unchanged.

    Args:
        page_num: Page number; substituted into the ``{page_num}.html`` URL.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block the worker indefinitely.

    Returns:
        None. The page text is written to ``all_pages[page_num]``.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    url = f"https://tyler.caraza-harter.com/cs320/crawl/practice7/{page_num}.html"
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    all_pages[page_num] = r.text
    # Print the PID next to the keys to show which process's copy was updated.
    print(os.getpid(), list(all_pages))

# Calling the function directly in this process does fill the dict:
# get_page_v2(1)
# get_page_v2(2)

all_pages

{}

In [52]:
# Two workers each fill their own copy of all_pages; the printed PIDs show which.
with Pool(processes=2) as p:
    t0 = time()
    p.map(get_page_v2, range(18))
    t1 = time()

# Displayed from the parent process.
all_pages

1519715196  [3][0]

1519715196 [0, 1]
 [3, 4]
15196 [0, 1, 2]15197 [3, 4, 5]

15197 [3, 4, 5, 6]
15196 [0, 1, 2, 9]
15197 [3, 4, 5, 6, 7]
15196 [0, 1, 2, 9, 10]
15197 [3, 4, 5, 6, 7, 8]
15196 [0, 1, 2, 9, 10, 11]
15197 [3, 4, 5, 6, 7, 8, 12]
15196 [0, 1, 2, 9, 10, 11, 15]
15197 [3, 4, 5, 6, 7, 8, 12, 13]
15196 [0, 1, 2, 9, 10, 11, 15, 16]
15197 [3, 4, 5, 6, 7, 8, 12, 13, 14]
15196 [0, 1, 2, 9, 10, 11, 15, 16, 17]


{}

In [49]:
# Process ID of the process running this cell, for comparison with the PIDs printed by the workers.
os.getpid()

1322