# 异步加载 Asyncio

In [1]:
import time

def job(t):
    """Simulate a blocking job that lasts ``t`` seconds.

    Prints a start message, blocks the calling thread with ``time.sleep``,
    then prints a completion message.

    Args:
        t: Duration of the simulated job, in seconds.
    """
    print('start',t)
    # time.sleep blocks the whole thread, so successive jobs run back to back.
    time.sleep(t)
    # Completion message; matches the output recorded for this cell.
    print('job ',t,'takes ',t,'  s')

def main():
    """Run the blocking jobs for t = 1 and t = 2, one after the other."""
    for seconds in (1, 2):
        job(seconds)
    
# Wall-clock timing of the sequential version.
t1 = time.time()
main()
elapsed = time.time() - t1
print('normal: ', elapsed)

start 1
job  1 takes  1   s
start 2
job  2 takes  2   s
normal:  3.000446081161499


In [3]:
import asyncio
import time
import nest_asyncio

# nest_asyncio patches asyncio so an event loop can be re-entered; presumably
# needed because the notebook kernel already has a loop running - TODO confirm.
nest_asyncio.apply()

async def job(t):
    """Coroutine that simulates a job lasting ``t`` seconds.

    Args:
        t: Duration of the simulated job, in seconds.
    """
    print('start', t)
    # Awaiting asyncio.sleep suspends only this coroutine, so other
    # scheduled jobs can make progress in the meantime.
    await asyncio.sleep(t)
    done_message = ('job ', t, 'takes ', t, '  s')
    print(*done_message)
    
async def main(loop):
    """Schedule the jobs for t = 1 and t = 2 on ``loop`` and wait for both.

    Args:
        loop: Event loop used to create the tasks.
    """
    scheduled = []
    for seconds in range(1, 3):
        # create_task only schedules the coroutine; it does not run it yet.
        scheduled.append(loop.create_task(job(seconds)))
    # Suspend here until every scheduled task has finished.
    await asyncio.wait(scheduled)
    
# Wall-clock timing of the concurrent version.
t1 = time.time()
loop = asyncio.get_event_loop()
loop.run_until_complete(main(loop))
# NOTE(review): loop.close() was left disabled here; presumably the loop is
# shared with the notebook kernel - confirm before re-enabling.
#loop.close()
elapsed = time.time() - t1

print('asyncio time: ', elapsed)

start 1
start 2
job  1 takes  1   s
job  2 takes  2   s
asyncio time:  2.001598596572876


# Normal vs. asyncio crawling

In [4]:
#normal
import requests

# Page requested by both the blocking and the asyncio crawling examples.
URL='https://morvanzhou.github.io/'

def normal():
    """Fetch ``URL`` twice, one request after the other, printing each response URL."""
    for _ in range(2):
        # requests.get blocks until the response is back, so the two
        # requests cannot overlap.
        print(requests.get(URL).url)

# Wall-clock timing of the blocking crawl (relies on `time` imported in an earlier cell).
t1 = time.time()
normal()
elapsed = time.time() - t1
print('normal way crawling: ', elapsed)

https://morvanzhou.github.io/
https://morvanzhou.github.io/
normal way crawling:  1.6484651565551758


In [10]:
import aiohttp
import nest_asyncio

# nest_asyncio patches asyncio so an event loop can be re-entered; presumably
# needed because the notebook kernel already has a loop running - TODO confirm.
nest_asyncio.apply()

async def job(session):
    """Request ``URL`` through ``session`` and return the final response URL.

    Args:
        session: aiohttp client session used to issue the GET request.

    Returns:
        The response's URL converted to ``str``.
    """
    # Using the request as an async context manager releases the response
    # (and its connection) on exit instead of leaving it open until the
    # session is closed.
    async with session.get(URL) as response:
        return str(response.url)

async def main(loop):
    """Request the page twice concurrently and print the list of response URLs.

    Args:
        loop: Event loop used to create the request tasks.
    """
    async with aiohttp.ClientSession() as session:
        tasks = []
        for _ in range(2):
            tasks.append(loop.create_task(job(session)))
        # asyncio.wait returns (done, pending); with default arguments it
        # returns once every task has completed.
        done, _pending = await asyncio.wait(tasks)
        all_results = [task.result() for task in done]

        print(all_results)

# Wall-clock timing of the aiohttp version.
t1 = time.time()
loop = asyncio.get_event_loop()
loop.run_until_complete(main(loop))
elapsed = time.time() - t1

print('async total time: ', elapsed)

['https://morvanzhou.github.io/', 'https://morvanzhou.github.io/']
async total time:  0.5569491386413574


# async method (with multiprocessing)

In [None]:
import aiohttp
import asyncio
import time
from bs4 import BeautifulSoup
from urllib.request import urljoin
import re
import multiprocessing as mp
import nest_asyncio

# nest_asyncio patches asyncio so an event loop can be re-entered; presumably
# needed because the notebook kernel already has a loop running - TODO confirm.
nest_asyncio.apply()

# Starting page of the crawl; parse() also uses it to resolve relative links.
base_url = 'https://morvanzhou.github.io/'

# URLs that have already been crawled.
seen = set()
# URLs queued for the next crawl round, seeded with the starting page.
unseen = set([base_url])


def parse(html):
    """Extract the title, internal links and declared URL from one page.

    Args:
        html: HTML source of the page.

    Returns:
        A ``(title, page_urls, url)`` tuple: the stripped text of the first
        ``<h1>``, the set of absolute URLs built from the matching links,
        and the ``content`` attribute of the ``og:url`` meta tag.
    """
    soup = BeautifulSoup(html, 'lxml')
    # Only hrefs that start and end with a slash are followed.
    link_pattern = re.compile('^/.+?/$')
    anchors = soup.find_all('a', {'href': link_pattern})
    title = soup.find('h1').get_text().strip()
    page_urls = {urljoin(base_url, anchor['href']) for anchor in anchors}
    og_meta = soup.find('meta', {'property': 'og:url'})
    return title, page_urls, og_meta['content']


async def crawl(url, session):
    """Download ``url`` through ``session`` and return the body as text.

    Args:
        url: Address of the page to fetch.
        session: aiohttp client session shared by the crawl tasks.

    Returns:
        The response body decoded to ``str``.
    """
    # The async context manager releases the response (and its connection)
    # as soon as the body has been read, instead of leaving it open.
    async with session.get(url) as r:
        html = await r.text()
    # Short pause after each request (kept from the original; presumably to
    # go easy on the server).
    await asyncio.sleep(0.1)
    return html


async def main(loop):
    """Crawl the site in rounds: async downloads, then multiprocess parsing.

    Each round downloads every URL currently in ``unseen`` concurrently,
    parses the pages in a process pool, moves the crawled URLs into
    ``seen`` and queues any newly discovered links. The crawl stops when
    no URLs are left or once more than 20 URLs have been seen.

    Args:
        loop: Event loop used to create the download tasks.
    """
    # Managing the pool with `with` shuts the worker processes down when the
    # crawl ends or fails; previously they were never cleaned up.
    with mp.Pool(8) as pool:
        async with aiohttp.ClientSession() as session:
            count = 1
            while unseen:
                if len(seen) > 20:
                    break

                print('\nAsync Crawling...')
                tasks = [loop.create_task(crawl(url, session)) for url in unseen]
                finished, unfinished = await asyncio.wait(tasks)
                htmls = [f.result() for f in finished]

                print('\nDistributed Parsing...')
                parse_jobs = [pool.apply_async(parse, args=(html,)) for html in htmls]
                # j.get() blocks until the worker is done; all download
                # tasks of this round have already completed by now.
                results = [j.get() for j in parse_jobs]

                print('\nAnalysing...')
                seen.update(unseen)
                unseen.clear()

                for title, page_urls, url in results:
                    print(count, title, url)
                    # Queue only links that have not been crawled yet.
                    unseen.update(page_urls - seen)
                    count += 1

if __name__=='__main__':
    # Run the crawler and report the elapsed wall-clock time.
    t1 = time.time()
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main(loop))
    # close() raises RuntimeError on a loop that is still running, which is
    # the case when the loop belongs to the notebook kernel; only close a
    # loop that has actually stopped (e.g. when run as a plain script).
    if not loop.is_running():
        loop.close()
    print('total time: ', time.time() - t1)


Async Crawling...

Distributed Parsing...
