# Threads

In [1]:
import time
from threading import Thread

def f(name, n, delay=1):
    """Print a countdown from ``n`` down to 1, one line per step.

    Each line shows ``name`` followed by the remaining count, and the function
    pauses ``delay`` seconds after every line.

    Args:
        name: label printed at the start of every line.
        n: number of lines to print; nothing is printed when ``n`` is 0.
        delay: seconds to sleep after each line (defaults to 1).
    """
    # Imported locally on purpose: a later cell runs `from time import time`,
    # which rebinds the global name `time` to a function. Looking up
    # `time.sleep` through that global would then fail if this cell is re-run.
    from time import sleep

    for i in range(n):
        print(name, n-i)
        sleep(delay)

# Sequential version, kept for comparison ("B" would only start once "A" is done):
# f("A", 3)
# f("B", 5)

# Threaded version: create both threads first, then start them, then wait for both.
t1, t2 = (Thread(target=f, args=job) for job in (("A", 3), ("B", 5)))
for worker in (t1, t2):
    worker.start()
for worker in (t1, t2):
    worker.join()  # block until this thread's countdown has finished

A 3
B 5
A 2
B 4
A 1
B 3
B 2
B 1


# Processes

How can we apply the same computation to every item in a list?

## 1. For Loop, Append

In [3]:
nums = [1, 2, 3]

# Build the result by hand: start with an empty list and append each doubled value.
doubles = []
for num in nums:
    doubles.append(num * 2)

doubles

[2, 4, 6]

## 2. List Comprehension

In [4]:
# Same result as the loop version, written as a single expression.
[num * 2 for num in nums]

[2, 4, 6]

## 3. Series.apply with lambda

In [7]:
import pandas as pd

# Wrap the list in a Series, then let apply() call the lambda once per element.
nums_series = pd.Series(nums)
nums_series.apply(lambda my_parameter: my_parameter * 2)

0    2
1    4
2    6
dtype: int64

## 4. map with lambda

In [9]:
# map() returns a lazy iterator, so list() is needed to see the values.
doubled = map(lambda my_parameter: my_parameter * 2, nums)
list(doubled)

[2, 4, 6]

## 5. map with named function

In [19]:
from time import time

def my_func(my_parameter):
    """Return ``my_parameter`` multiplied by 2."""
    doubled = my_parameter * 2
    return doubled

# single process version: time one plain map() call made in this process

t0 = time()
list(map(my_func, nums))
t1 = time()
elapsed_ms = 1000 * (t1 - t0)  # time() counts seconds; convert to milliseconds
elapsed_ms

0.08368492126464844

## 6. pool map

In [12]:
from multiprocessing import Pool

In [29]:
# Same computation, but handed to a pool of 3 worker processes.
# Only the map() call is timed, not the creation of the pool.
with Pool(3) as pool:
    t0 = time()
    results = pool.map(my_func, nums)
    t1 = time()
print(results)
elapsed_ms = 1000 * (t1 - t0)  # seconds -> milliseconds
elapsed_ms

[2, 4, 6]


1.9137859344482422

In [16]:
import os
# Number of CPUs the OS reports for this machine.
os.cpu_count()

2

# Parallel Download

In [31]:
import requests
# Fetch page 0 of the practice crawl site to see what one page looks like.
url = "https://tyler.caraza-harter.com/cs320/crawl/practice7/0.html"
r = requests.get(url, timeout=10)  # without a timeout a stalled server blocks forever
r.raise_for_status()  # raise HTTPError on a 4xx/5xx response instead of returning an error page
r.text

'<html><body>\n<h1>Node: 0</h1>\n<h3>Children:</h3>\n<ul>\n<li><a href="16.html">16</a><li><a href="17.html">17</a><li><a href="6.html">6</a><li><a href="12.html">12</a></ul>\n</body></html>\n'

In [33]:
def get_page(page_num, timeout=10):
    """Download one page of the practice7 crawl site and return its HTML.

    Args:
        page_num: number of the page; it is substituted into the URL as
            ``{page_num}.html``.
        timeout: seconds to wait for the server before giving up, so that a
            stalled connection cannot block a Pool worker (and with it the
            whole ``map`` call) forever.

    Returns:
        The body of the response as text.

    Raises:
        requests.HTTPError: if the server answers with an error status
            (for example 404 for a page that does not exist).
        requests.Timeout: if the server does not respond within ``timeout``.
    """
    url = f"https://tyler.caraza-harter.com/cs320/crawl/practice7/{page_num}.html"
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    return r.text

get_page(2)

'<html><body>\n<h1>Node: 2</h1>\n<h3>Children:</h3>\n<ul>\n<li><a href="0.html">0</a><li><a href="1.html">1</a></ul>\n</body></html>\n'

In [48]:
# 20 workers for 18 pages, so every download can be in flight at once.
# Only the map() call is timed, not the creation of the pool.
with Pool(20) as pool:
    t0 = time()
    results = pool.map(get_page, range(18))
    t1 = time()

t1 - t0 # seconds

0.2566366195678711

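# Bug 1

If any task raises an exception inside a worker process, `Pool.map` re-raises that exception in the parent and none of the results are returned.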

In [49]:
# Asking for pages 0-19 fails: page 18 does not exist (404), so get_page's
# raise_for_status() raises HTTPError inside a worker and p.map re-raises it
# here in the parent. Catch it and show the message, so the error is still
# demonstrated but the cells after this one can run.
try:
    with Pool(20) as p:
        t0 = time()
        results = p.map(get_page, range(0,20))
        t1 = time()
except requests.HTTPError as e:
    # `results` is not assigned in this case; it keeps its previous value.
    print(f"{type(e).__name__}: {e}")
else:
    print(t1-t0, "seconds")

HTTPError: 404 Client Error: Not Found for url: https://tyler.caraza-harter.com/cs320/crawl/practice7/18.html

# Bug 2

Global variables are not shared across processes: each worker process has its own copy, so changes made in a worker are not visible in the parent.

In [59]:
# PID of the process running this cell (the parent of any Pool workers started from here).
os.getpid() # get process ID

6702

In [60]:
all_pages = {} # page num => HTML of that page

def get_page_v2(page_num, timeout=10):
    """Download one page and store its HTML in the global ``all_pages`` dict.

    The dict that gets updated is the ``all_pages`` of whichever process runs
    this function. The PID is printed next to the dict's keys so you can see
    which process did the update. Nothing is returned.

    Args:
        page_num: number of the page; used both in the URL and as the dict key.
        timeout: seconds to wait for the server before giving up, so that a
            stalled connection cannot block a Pool worker forever.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not respond within ``timeout``.
    """
    url = f"https://tyler.caraza-harter.com/cs320/crawl/practice7/{page_num}.html"
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    all_pages[page_num] = r.text
    print(os.getpid(), all_pages.keys())

# Calling the function directly here (in the parent process) would fill all_pages:
# get_page_v2(2)
# get_page_v2(3)
all_pages

{}

In [61]:
# Run get_page_v2 for pages 0-17 in 2 worker processes; each call prints the
# worker's PID and the keys of that worker's all_pages.
with Pool(2) as pool:
    t0 = time()
    pool.map(get_page_v2, range(18))
    t1 = time()

# Now look at all_pages in this (parent) process.
all_pages

13527 dict_keys([3])
13526 dict_keys([0])
13527 13526dict_keys([3, 4])
 dict_keys([0, 1])
13527 13526 dict_keys([3, 4, 5])dict_keys([0, 1, 2])

1352713526 dict_keys([0, 1, 2, 6])
 dict_keys([3, 4, 5, 9])
13526 dict_keys([0, 1, 2, 6, 7])
13527 dict_keys([3, 4, 5, 9, 10])
13526 dict_keys([0, 1, 2, 6, 7, 8])
13527 dict_keys([3, 4, 5, 9, 10, 11])
13526 dict_keys([0, 1, 2, 6, 7, 8, 12])
13527 dict_keys([3, 4, 5, 9, 10, 11, 15])
13526 dict_keys([0, 1, 2, 6, 7, 8, 12, 13])
13527 dict_keys([3, 4, 5, 9, 10, 11, 15, 16])
13526 dict_keys([0, 1, 2, 6, 7, 8, 12, 13, 14])
13527 dict_keys([3, 4, 5, 9, 10, 11, 15, 16, 17])


{}

In [62]:
# PID of the process running this cell, to compare with the PIDs printed by the workers.
os.getpid()

6702