# Async in Python

Types of Intensive Tasks:
- CPU - calculations, searching etc..
- IO - socket calls (files, calling and api)

## Threads

In [4]:
import requests

In [3]:
def get_page(url):
    resp = requests.get(url)
    res = resp.text[:42]

    print(url, '->', res, end='\n\n')
    return res
get_page('https://google.com')

https://google.com -> <!doctype html><html dir="rtl" itemscope="



'<!doctype html><html dir="rtl" itemscope="'

In [24]:
urls = [
    'https://google.com',
    'https://yahoo.com',
    'https://bing.com'
]

for url in urls:
    get_page(url)

<!doctype html><html dir="rtl" itemscope="

<!doctype html><html id=atomic class="ltr 

<!doctype html><html lang="he" dir="rtl"><



Multiple threads example

In [34]:
import threading

threads = []
for url in urls:
    t = threading.Thread(target=get_page, args=(url, ), daemon=True)
    t.start()
    threads.append(t)

print('started')
for t in threads:
    t.join()  # join is a blocking action, therefore only run it once all threads have been defined
print('done')

started
https://bing.com -> <!doctype html><html lang="he" dir="rtl"><

https://google.com -> <!doctype html><html dir="rtl" itemscope="

https://yahoo.com -> <!doctype html><html id=atomic class="ltr 

done


Problem - the functions print the result but the return value is not stored in the main thread.  
Solution - use a Queue (or similar DS)

### Queue

In [44]:
from queue import Queue
import threading

urls = [
    'https://google.com',
    'https://yahoo.com',
    'https://bing.com'
]

def get_page_store(url, q):
    resp = requests.get(url)
    res = resp.text[:42]
    q.put(res)

q = Queue()
fetch_threads = []

for url in urls:
    t = threading.Thread(
        target=get_page_store,
        args=(url, q),
        daemon=True)
    t.start()
    fetch_threads.append(t)

for t in fetch_threads:
    t.join()

while not q.empty():
    print(q.get())

<!doctype html><html lang="he" dir="rtl"><
<!doctype html><html dir="rtl" itemscope="
<!doctype html><html id=atomic class="ltr 


### Thread Object

In [45]:
class Fetcher(threading.Thread):
    def __init__(self, url, q):
        super().__init__()
        self.url = url
        self.q = q
    
    def run(self):  # define a run method
        resp = requests.get(self.url)
        result = resp.text[:42]
        self.q.put(result)

urls = [
    'https://google.com',
    'https://yahoo.com',
    'https://bing.com'
]

q = Queue()
fetch_threads = []

for url in urls:
    t = Fetcher(url, q)
    t.start()
    fetch_threads.append(t)

for t in fetch_threads:
    t.join()

while not q.empty():
    print(q.get())

<!doctype html><html lang="he" dir="rtl"><
<!doctype html><html dir="rtl" itemscope="
<!doctype html><html id=atomic class="ltr 


### Producer-Consumer Pattern with Threads

In [46]:
# Create a Producer which fetches an HTML page and push to Q the links
# Create a Consumer to get pushed link from the Q and get the page and get the first few bytes from the page

In [5]:
# TODO: Fix the Bug
import requests
import threading
from queue import Queue
from bs4 import BeautifulSoup


def generate_links(url):
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text)
    for link in soup.findAll('a'):
        l = link.get('href')
        if l is not None and l.startswith('http'):
            yield l

class FetchProducer(threading.Thread):
    
    def __init__(self, name, url, q):
        super().__init__()
        self.url = url
        self.q = q

    def run(self):  # define a run method
        for link in generate_links(self.url):
            print(f'Producer {self.name} putting...{l[:40]}')
            self.q.put(link)  
        print(f'Producer {self.name} Done')

class FetchConsumer(threading.Thread):
    def __init__(self, name, qin, qout):
        super().__init__()
        self.url = url
        self.qin = qout
        self.name = name

    def run(self):

        while True:
            link = self.qin.get()
            print(f'getting...{link}')
            if link is None:
                print('consumer done')
                break
            resp = requests.get(link)
            print(f'{self.name} consuming...')
            self.qout.put(resp.text[:42])
        

qin = Queue()
qout = Queue()

urls = [
    'https://google.com',
    'https://yahoo.com',
    'https://bing.com'
]

_NUM_OF_CONSUMERS = 5
producers = []
consumers = []


for i, url in enumerate(urls):
    p = FetchProducer(f'p{i}',url, qin)
    p.start()
    producers.append(p)

for i, _ in enumerate(range(_NUM_OF_CONSUMERS)):
    c = FetchConsumer(f'c{i}', qin, qout)
    c.start()
    consumers.append(c)

print('producing')
for p in producers:
    p.join()

print('producing None')
for i in range(_NUM_OF_CONSUMERS):
    qin.put(None)

print('consuming')
for c in consumers:
    c.join()

print('printing qout')
while not qout.empty():
    print(qout.get())

print('done')



producing
Producer Thread-13 Done


Exception in thread Thread-11:
Traceback (most recent call last):
  File "C:\Python\Python310\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "C:\Users\jbt\AppData\Local\Temp\ipykernel_10380\3527240679.py", line 24, in run
NameError: name 'l' is not defined
Exception in thread Thread-12:
Traceback (most recent call last):
  File "C:\Python\Python310\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "C:\Users\jbt\AppData\Local\Temp\ipykernel_10380\3527240679.py", line 24, in run
NameError: name 'l' is not defined


producing None
consuming


In [1]:
import threading
import requests
from queue import Queue
from bs4 import BeautifulSoup

def get_links_from_url(url):
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, "html.parser")
    links = soup.find_all("a")
    for link in links:
        if link.has_attr("href"):
            if "http" in link["href"]:  
                yield link["href"]
            elif link["href"].startswith("//"):
                yield "https:" + link["href"]

len(list(get_links_from_url("https://www.google.com")))



import time


class Producer(threading.Thread):
    def __init__(self, name, url, q):
        super().__init__()
        self.name = name
        self.url = url
        self.q = q

    def run(self):
        links = get_links_from_url(self.url)
        for link in links:
            print(f"{self.name} adding {link[:42]} to queue")
            self.q.put(link)
        print(f"{self.name} Producer done")

class Consumer(threading.Thread):
    def __init__(self, name, qin, qout):
        super().__init__()
        self.name = name
        self.qin = qin
        self.qout = qout

    def run(self):
        start_time = time.time()
        while True: 
            url = self.qin.get()
            if not url:
                print(f"{self.name} Consumer done")
                break
            res = requests.get(url).text[:42]
            print(f"{self.name} {time.time()-start_time} {repr(res)}")
            self.qout.put(res)





qin = Queue()
qout = Queue()

urls = [
    "https://www.google.com",
    "https://www.yahoo.com",
    "https://www.bing.com"
]
producers = []
consumers = []
num_of_consumers = 5

for i, url in enumerate(urls):
    p = Producer("p"+str(i+1), url, qin)
    p.start()
    producers.append(p)

for i in range(num_of_consumers):
    c = Consumer("c"+str(i+1), qin, qout)
    c.start()
    consumers.append(c)

for p in producers:
    p.join()

for i in range(num_of_consumers):
    qin.put(None)

for c in consumers:
    c.join()
    
while not qout.empty():
    print("final result", repr(qout.get()))
print("done")


p3 adding https://www.msn.com/?ocid=BHEA000 to queue
p3 adding https://www.takelessons.com/?utm_source=bi to queue
p3 adding https://www.office.com?WT.mc_id=O16_BingHP to queue
p3 adding https://outlook.com/?WT.mc_id=O16_BingHP to queue
p3 adding https://office.live.com/start/Word.aspx?WT to queue
p3 adding https://office.live.com/start/Excel.aspx?W to queue
p3 adding https://office.live.com/start/PowerPoint.a to queue
p3 adding https://www.onenote.com/notebooks?WT.mc_id to queue
p3 adding https://sway.office.com?WT.mc_id=O16_BingH to queue
p3 adding https://onedrive.live.com/?gologin=1&WT.mc to queue
p3 adding https://calendar.live.com/?WT.mc_id=O16_Bi to queue
p3 adding https://outlook.live.com/owa/?path=/people to queue
p3 Producer done
p1 adding https://www.google.co.il/imghp?hl=iw&tab=w to queue
p1 adding https://maps.google.co.il/maps?hl=iw&tab=w to queue
p1 adding https://play.google.com/?hl=iw&tab=w8 to queue
p1 adding https://www.youtube.com/?tab=w1 to queue
p1 adding https://

### Threadpool Executor

In [6]:
from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=5) as executor:
    for url in urls:
        executor.submit(get_page, url)
        

https://www.bing.com -> <!doctype html><html lang="he" dir="rtl"><

https://www.google.com -> <!doctype html><html dir="rtl" itemscope="

https://www.yahoo.com -> <!doctype html><html id=atomic class="ltr 

https://www.bing.com -> <!doctype html><html lang="he" dir="rtl"><

https://www.google.com -> <!doctype html><html dir="rtl" itemscope="

<!doctype html><html dir="rtl" itemscope="
https://www.yahoo.com -> <!doctype html><html id=atomic class="ltr 

<!doctype html><html id=atomic class="ltr 
<!doctype html><html lang="he" dir="rtl"><


In [8]:
from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=5) as executor:
    results = executor.map(get_page, urls)
    for url in results:
        print(url)

https://www.bing.com -> <!doctype html><html lang="he" dir="rtl"><

https://www.google.com -> <!doctype html><html dir="rtl" itemscope="

<!doctype html><html dir="rtl" itemscope="
https://www.yahoo.com -> <!doctype html><html id=atomic class="ltr 

<!doctype html><html id=atomic class="ltr 
<!doctype html><html lang="he" dir="rtl"><


### Multiprocessing

In [9]:
import multiprocessing as mp
import utils

with mp.Pool(5) as pool:
    print(pool.map(utils.f, range(10)))

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]


### Asyncio

In [66]:
import asyncio
import aiohttp
from bs4 import BeautifulSoup

urls = [
    "https://www.google.com",
    "https://www.yahoo.com",
    "https://www.bing.com"
]

async def get_page(url, session):
    async with session.get(url) as resp:
        result = await resp.text()
        return result[:42]


async def get_links(url, session):
    
    links = []
    async with session.get(url) as resp:
        soup = BeautifulSoup(await resp.text())
        for link in soup.findAll('a'):
            l = link.get('href')
            if l is not None and l.startswith('http'):
                links.append(l)
    return links

async def generate_links(url, session):
    
    async with session.get(url) as resp:
        soup = BeautifulSoup(await resp.text())
        for link in soup.findAll('a'):
            l = link.get('href')
            if l is not None and l.startswith('http'):
                yield l

async def generate_page(url, session):
    async with session.get(url) as resp:
        result = await resp.text()
        yield result[:42]

async with aiohttp.ClientSession() as session:
    for result in await asyncio.gather(*(get_links(url, session) for url in urls)):
        print(await asyncio.gather(*(get_page(url, session) for url in result)))


['<!doctype html><html dir="rtl" itemscope="', '<!DOCTYPE html><html dir="rtl" itemscope="', '<!doctype html><html lang="iw" dir="rtl"><', '<!DOCTYPE html><html style="font-size: 10p', '<!doctype html><html lang="en-GB" dir="ltr', '<!doctype html><html lang="en" dir="ltr"><', '<!doctype html><html lang="en" dir="ltr"><', '\n\n<!DOCTYPE html>\n\n\n<html lang="en" dir="l', '<!doctype html><html lang="iw" dir="rtl"><', '<!doctype html><html lang="iw" dir="rtl"><', '<!doctype html><html dir="rtl" itemscope="', '<!doctype html><html itemscope="" itemtype', '<!DOCTYPE html>\n<html class="google mmfb" ', '<!doctype html><html dir="rtl" itemscope="']
['<!doctype html><html id=atomic class="ltr ', '<!DOCTYPE html>\n<html id="Stencil" class="', '<!DOCTYPE html><html id="atomic" class="No', '<!DOCTYPE html><html data-color-theme="lig', '<!DOCTYPE html><html id="atomic" class="No', '<!doctype html><html id=atomic class="Fz(6', '<!doctype html><html id=atomic class="Fz(6', '<!doctype html><html lan

#### Teacher Example

### Flask Example