# Threads

In [1]:
import time
from threading import Thread

def f(name, n, delay=1):
    """Print a countdown from ``n`` down to 1, pausing after each number.

    Args:
        name: Label printed in front of every number (e.g. ``"A"``).
        n: How many numbers to print; the countdown runs n, n-1, ..., 1.
            Nothing is printed when ``n`` is 0.
        delay: Seconds to sleep after each printed number. Defaults to 1,
            the pause that was previously hard-coded.
    """
    for i in range(n):
        print(name, n-i)
        # While this thread sleeps, other threads get to run, which is why
        # concurrent countdowns interleave their output.
        time.sleep(delay)

# f("A", 3)
# f("B", 5)
        
# One thread per countdown: "A" counts down from 3, "B" from 5.
t1 = Thread(target=f, args=("A", 3))
t2 = Thread(target=f, args=("B", 5))
workers = (t1, t2)

# Start every thread before waiting on any of them, so the countdowns
# run at the same time instead of one after the other.
for worker in workers:
    worker.start()

# join() blocks until that thread's call to f has returned.
for worker in workers:
    worker.join()

AB 5
 3
BA 2
 4
A 1
B 3
B 2
B 1


# Processes

How can we apply the same computation to every item in a list?

## 1. For Loop, Append

In [2]:
nums = [1, 2, 3]

# Start from an empty list and grow it by one doubled value per iteration.
doubles = []
for num in nums:
    doubles.append(2 * num)

doubles  # last expression in the cell, so the notebook displays it

[2, 4, 6]

## 2. List Comprehension

In [3]:
# Build the doubled list in a single expression; no explicit append needed.
[2 * n for n in nums]

[2, 4, 6]

## 3. Series.apply with lambda

In [4]:
import pandas as pd
# Wrap the list in a Series, then call the lambda once per element.
pd.Series(nums).apply(lambda value: value * 2)

0    2
1    4
2    6
dtype: int64

## 4. map with lambda

In [5]:
# map() returns a lazy iterator; list() pulls every doubled value out of it.
list(map(lambda n: n * 2, nums))

[2, 4, 6]

## 5. map with named function

In [6]:
def double(x):
    """Return ``x`` multiplied by two."""
    doubled = 2 * x
    return doubled

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() is wall-clock time and can jump if the system
# clock is adjusted.
t0 = time.perf_counter()
list(map(double, nums))
t1 = time.perf_counter()
(t1 - t0) * 1000  # elapsed milliseconds

0.08368492126464844

## 6. pool map

In [7]:
from multiprocessing import Pool

# perf_counter() is the clock intended for measuring elapsed intervals.
t0 = time.perf_counter()
# Four worker processes; with no argument, Pool() starts one per CPU core.
with Pool(4) as p:
    result = p.map(double, nums)  # Pool.map already returns a list
t1 = time.perf_counter()  # stop the clock before printing so I/O is not timed
print(result)
(t1 - t0) * 1000  # elapsed milliseconds

[2, 4, 6]


67.43073463439941

# Parallel Download

In [8]:
import requests
url = "https://tyler.caraza-harter.com/cs320/crawl/practice7/0.html"
# requests has no default timeout; without one, a stalled server would hang
# this cell indefinitely.
r = requests.get(url, timeout=10)
r.raise_for_status()  # raise instead of silently continuing on an HTTP error
r.text

'<html><body>\n<h1>Node: 0</h1>\n<h3>Children:</h3>\n<ul>\n<li><a href="16.html">16</a><li><a href="17.html">17</a><li><a href="6.html">6</a><li><a href="12.html">12</a></ul>\n</body></html>\n'

In [9]:
def download(page_num, timeout=10):
    """Fetch one practice page and return its HTML.

    Args:
        page_num: Page number; it is substituted into the URL as
            ``{page_num}.html``.
        timeout: Seconds to wait on the server before giving up. requests
            has no default timeout, so without this a stalled connection
            would block forever.

    Returns:
        The body of the response as text.

    Raises:
        requests.exceptions.HTTPError: If the server returns an error status
            (raised by ``raise_for_status``).
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.
    """
    url = f"https://tyler.caraza-harter.com/cs320/crawl/practice7/{page_num}.html"
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    return r.text

download(1)

'<html><body>\n<h1>Node: 1</h1>\n<h3>Children:</h3>\n<ul>\n<li><a href="0.html">0</a><li><a href="2.html">2</a></ul>\n</body></html>\n'

In [13]:
# perf_counter() is the clock intended for measuring elapsed intervals.
t0 = time.perf_counter()

# Ten worker processes download concurrently; with no argument, Pool() starts
# one worker per CPU core.
with Pool(10) as p:
    # Pool.map already returns a list, ordered like its input.
    result = p.map(download, range(18))

t1 = time.perf_counter()
t1 - t0  # elapsed seconds

0.3741741180419922

# Bad version 2: collecting results in a global variable

In [20]:
import os
# ID of the process running this cell; the Pool workers in the next cell
# print their own, different PIDs.
os.getpid() # PID => Process ID

1219

In [24]:
# oops!  global variables are per process!
# Each Pool worker is a separate process with its own copy of `pages`, so the
# entries stored below never show up in the dict owned by this notebook's
# process. This cell deliberately demonstrates that pitfall.
pages = {}

def download_to_global(page_num, timeout=10):
    """Download one practice page and store its HTML in the global ``pages``.

    The page text is saved under ``pages[page_num]`` in whichever process
    runs this function; nothing is returned. After storing, it prints the
    current process ID and the keys collected so far in that process.

    Args:
        page_num: Page number; it is substituted into the URL as
            ``{page_num}.html``.
        timeout: Seconds to wait on the server before giving up. requests
            has no default timeout, so without this a stalled connection
            would block the worker forever.

    Raises:
        requests.exceptions.HTTPError: If the server returns an error status
            (raised by ``raise_for_status``).
    """
    # Not strictly required for item assignment, but makes it explicit that
    # the module-level dict is the one being written to.
    global pages
    url = f"https://tyler.caraza-harter.com/cs320/crawl/practice7/{page_num}.html"
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    pages[page_num] = r.text
    # Show which process handled this page and what it has accumulated.
    print(os.getpid(), pages.keys())
    
# Two worker processes split the 18 downloads between them.
with Pool(2) as pool:
    pool.map(download_to_global, range(18))

# Printed from the parent process: its own `pages` dict is still empty,
# because only the workers' copies were filled in.
print(os.getpid(), pages)

1842 1841 dict_keys([0])
dict_keys([3])
1841 dict_keys([0, 1])
1842 dict_keys([3, 4])
1841 dict_keys([0, 1, 2])
1842 dict_keys([3, 4, 5])
1841 dict_keys([0, 1, 2, 6])
1842 dict_keys([3, 4, 5, 9])
1841 dict_keys([0, 1, 2, 6, 7])
1842 dict_keys([3, 4, 5, 9, 10])
1841 dict_keys([0, 1, 2, 6, 7, 8])
1842 dict_keys([3, 4, 5, 9, 10, 11])
1841 dict_keys([0, 1, 2, 6, 7, 8, 12])
1842 dict_keys([3, 4, 5, 9, 10, 11, 15])
1841 dict_keys([0, 1, 2, 6, 7, 8, 12, 13])
1842 dict_keys([3, 4, 5, 9, 10, 11, 15, 16])
1841 dict_keys([0, 1, 2, 6, 7, 8, 12, 13, 14])
1842 dict_keys([3, 4, 5, 9, 10, 11, 15, 16, 17])
1219 {}
