# Threads

In [16]:
import time
from threading import Thread

def f(name, n):
    """Print a countdown from ``n`` down to 1, labelled with ``name``.

    One line (``name`` followed by the remaining count) is printed per
    step, and the function sleeps for one second after each line.
    """
    for remaining in range(n, 0, -1):
        print(name, remaining)
        time.sleep(1)

# Sequential version, for comparison: "B" would not start until "A" had finished.
# f("A", 3)
# f("B", 5)

# Run both countdowns at the same time, one thread per countdown.
t1, t2 = (Thread(target=f, args=spec) for spec in (("A", 3), ("B", 5)))
for worker in (t1, t2):
    worker.start()
# Block until both threads have finished.
for worker in (t1, t2):
    worker.join()

A 3
B 5
A 2
B 4
A 1
B 3
B 2
B 1


# Processes

How can we apply the same computation to every item in a list?

## 1. For Loop, Append

In [2]:
nums = [1, 2, 3]
# Start with an empty list and grow it by one doubled value per iteration.
doubles = []
for num in nums:
    doubled = num * 2
    doubles.append(doubled)
doubles

[2, 4, 6]

## 2. List Comprehension

In [3]:
# Same result as the loop above, built in a single expression.
[num * 2 for num in nums]

[2, 4, 6]

## 3. Series.apply with lambda

In [5]:
import pandas as pd
# Wrap the list in a Series, then call the anonymous function on each element.
nums_series = pd.Series(nums)
nums_series.apply(lambda value: value * 2)

0    2
1    4
2    6
dtype: int64

## 4. map with lambda

In [7]:
# map() returns an iterator; list() collects its results.
list(map(lambda value: value * 2, nums))

[2, 4, 6]

## 5. map with named function

In [17]:
def double(x):
    """Return ``x`` multiplied by two."""
    doubled = 2 * x
    return doubled

# perf_counter() is a monotonic, high-resolution clock intended for timing
# short intervals; time.time() is a wall clock that may be too coarse (or be
# adjusted) while measuring work that takes a fraction of a millisecond.
t0 = time.perf_counter()
list(map(double, nums))
t1 = time.perf_counter()
(t1-t0) * 1000 # milliseconds

0.08940696716308594

## 6. pool map

In [18]:
from multiprocessing import Pool


# Time the same doubling, this time spread over 4 worker processes.
# Creating and shutting down the pool happens inside the timed region, so
# that cost is part of the measurement.
# perf_counter() is the clock intended for measuring elapsed intervals.
t0 = time.perf_counter()

with Pool(4) as p:
    # Pool.map already returns a list, so no extra list() call is needed.
    result = p.map(double, nums)

t1 = time.perf_counter()
(t1-t0) * 1000 # milliseconds

50.85563659667969

# Parallel Download

In [20]:
import requests
url = "https://tyler.caraza-harter.com/cs320/crawl/practice7/0.html"
# Without a timeout, requests waits indefinitely on an unresponsive server.
r = requests.get(url, timeout=10)
r.raise_for_status() # raise HTTPError for 4xx/5xx responses
r.text

'<html><body>\n<h1>Node: 0</h1>\n<h3>Children:</h3>\n<ul>\n<li><a href="16.html">16</a><li><a href="17.html">17</a><li><a href="6.html">6</a><li><a href="12.html">12</a></ul>\n</body></html>\n'

In [23]:
def download(page_num, timeout=10):
    """Fetch one page of the practice7 crawl site and return its HTML text.

    page_num: number substituted into the URL as ``{page_num}.html``.
    timeout: seconds to wait for the server before giving up. Without a
        timeout a stalled request would block its caller forever -- and,
        when run through Pool.map, the whole map call with it.

    Raises requests.HTTPError when the server answers with a 4xx/5xx status
    (for example a page number that does not exist).
    """
    url = f"https://tyler.caraza-harter.com/cs320/crawl/practice7/{page_num}.html"
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    return r.text

download(2)

'<html><body>\n<h1>Node: 2</h1>\n<h3>Children:</h3>\n<ul>\n<li><a href="0.html">0</a><li><a href="1.html">1</a></ul>\n</body></html>\n'

In [33]:
# Download pages 0-17 using 8 worker processes and time the whole batch.
# perf_counter() is the clock intended for measuring elapsed intervals.
t0 = time.perf_counter()

with Pool(8) as p:
    # Pool.map already returns a list (in input order); no list() needed.
    result = p.map(download, range(18))

t1 = time.perf_counter()
t1-t0 # seconds

0.4284837245941162

In [34]:
# Page 18 does not exist, so the worker handling it raises HTTPError.
# Pool.map re-raises that error here in the parent, so `result` is not
# reassigned even though the other 18 downloads could succeed.
# The error is caught and printed so the notebook still runs top to bottom.
t0 = time.perf_counter()

try:
    with Pool(8) as p:
        result = p.map(download, range(19)) # last page doesn't exist
except requests.HTTPError as e:
    print(f"HTTPError: {e}")

t1 = time.perf_counter()
t1-t0 # seconds

HTTPError: 404 Client Error: Not Found for url: https://tyler.caraza-harter.com/cs320/crawl/practice7/18.html

# Bad Version 2

In [41]:
import os
os.getpid() # PID => Process ID (here: the process running this notebook's code)

2042

In [43]:
# Each process has its own global variables; they're not shared!
# Deliberately broken example: every worker process fills in its own copy of
# `pages`, so the copy in this (parent) process is never modified.
pages = {}

def download_v2(page_num):
    """Download one page and store its HTML in the global ``pages`` dict.

    Also prints the current process ID together with the keys stored so far
    in that process's ``pages``. Nothing is returned, so the downloaded text
    only exists in the ``pages`` dict of the process that ran this call.

    Raises requests.HTTPError (via raise_for_status) on a 4xx/5xx response.
    """
    global pages
    url = f"https://tyler.caraza-harter.com/cs320/crawl/practice7/{page_num}.html"
    r = requests.get(url)
    r.raise_for_status()
    pages[page_num] = r.text  # mutates only the calling process's dict
    print(os.getpid(), pages.keys())
    
# Spread the 18 downloads over 2 worker processes; return values are ignored.
with Pool(2) as p:
    p.map(download_v2, range(18))
    
# Back in the parent process: show its PID and its own `pages` dict.
print(os.getpid(), pages)

2735 dict_keys([3])
2734 dict_keys([0])
2735 dict_keys([3, 4])
2734 dict_keys([0, 1])
2735 dict_keys([3, 4, 5])
2734 dict_keys([0, 1, 2])
2735 dict_keys([3, 4, 5, 6])
2734 dict_keys([0, 1, 2, 9])
2735 dict_keys([3, 4, 5, 6, 7])
2734 dict_keys([0, 1, 2, 9, 10])
2735 dict_keys([3, 4, 5, 6, 7, 8])
2734 dict_keys([0, 1, 2, 9, 10, 11])
2735 dict_keys([3, 4, 5, 6, 7, 8, 12])
2735 dict_keys([3, 4, 5, 6, 7, 8, 12, 13])
2734 dict_keys([0, 1, 2, 9, 10, 11, 15])
2735 dict_keys([3, 4, 5, 6, 7, 8, 12, 13, 14])
2734 dict_keys([0, 1, 2, 9, 10, 11, 15, 16])
2734 dict_keys([0, 1, 2, 9, 10, 11, 15, 16, 17])
2042 {}
