In [1]:
import multiprocessing as mp
import time
from urllib.request import urlopen,urljoin
from bs4 import BeautifulSoup
import re

In [2]:
# Root address of the site to crawl; links found on pages are resolved against it.
base_url = "https://morvanzhou.github.io/"

# True for every target except the loopback address http://127.0.0.1:4000/.
# The crawl loops in this notebook stop once more than 20 pages have been seen
# while this flag is set.
restricted_crawl = base_url != "http://127.0.0.1:4000/"

In [3]:
def crawl(url):
    """Download ``url`` and return the response body as text.

    Parameters
    ----------
    url : str
        Address passed straight to ``urllib.request.urlopen``.

    Returns
    -------
    str
        The full response body decoded with ``bytes.decode()``'s default
        codec (UTF-8).

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urlopen`` when the request fails.
    UnicodeDecodeError
        If the body is not valid UTF-8.
    """
    # The context manager closes the response (and its underlying connection)
    # even when read()/decode() raises, so no handle leaks per page.
    with urlopen(url) as response:
        # Fixed 0.1 s pause per request -- presumably to throttle the crawl.
        time.sleep(0.1)
        return response.read().decode()

In [6]:
def parse(html):
    """Extract the title, outgoing links and declared URL from one page.

    Parameters
    ----------
    html : str
        Markup of a single page.

    Returns
    -------
    tuple
        ``(title, page_urls, url)`` where

        * ``title`` is the stripped text of the first ``<h1>``, or ``''``
          when the page has no ``<h1>``;
        * ``page_urls`` is a set of absolute URLs, built by joining
          ``base_url`` with the ``href`` of every ``<a>`` whose ``href``
          matches ``^/.+?/$``;
        * ``url`` is the ``content`` attribute of
          ``<meta property="og:url">``, or ``''`` when that tag or attribute
          is missing.
    """
    soup = BeautifulSoup(html, 'lxml')

    # Keep only hrefs that begin and end with a slash.
    links = soup.find_all('a', {'href': re.compile('^/.+?/$')})
    page_urls = {urljoin(base_url, link['href']) for link in links}

    # find() returns None when nothing matches; fall back to an empty string
    # so one page without a heading does not abort the whole crawl.
    heading = soup.find('h1')
    title = heading.get_text().strip() if heading is not None else ''

    # Same guard for the Open Graph URL tag and its content attribute.
    og_url = soup.find('meta', {'property': 'og:url'})
    url = og_url.get('content', '') if og_url is not None else ''

    return title, page_urls, url

In [7]:
# Single-process crawl: each round fetches every URL currently in `unseen`,
# parses the pages, then queues the links that have not been visited yet.
unseen={base_url}
seen=set()

count=1            # running number printed in front of each page
t1=time.time()     # start of the timed section

# Continue while there is something left to visit; under restricted_crawl the
# loop also ends as soon as more than 20 URLs have been seen.
while unseen and not (restricted_crawl and len(seen)>20):
    print('\nDistributed Crawling...')
    htmls=list(map(crawl,unseen))

    print('\nDistributed Parsing...')
    results=list(map(parse,htmls))

    print('\nAnalysing...')
    seen|=unseen       # everything fetched in this round is now visited
    unseen.clear()

    for title,page_urls,url in results:
        print(count,title,url)
        count+=1
        unseen|=page_urls-seen   # queue only links not visited before
print('Total time:%.1f s'%(time.time()-t1))


Distributed Crawling...

Distributed Parsing...

Analysing...
1 教程 https://morvanzhou.github.io/

Distributed Crawling...

Distributed Parsing...

Analysing...
2 高级爬虫: 让 Selenium 控制你的浏览器帮你爬 https://morvanzhou.github.io/tutorials/data-manipulation/scraping/5-01-selenium/
3 迁移学习 Transfer Learning https://morvanzhou.github.io/tutorials/machine-learning/ML-intro/2-9-transfer-learning/
4 机器学习系列 https://morvanzhou.github.io/tutorials/machine-learning/
5 Why? https://morvanzhou.github.io/tutorials/data-manipulation/scraping/1-00-why/
6 高级爬虫: 高效无忧的 Scrapy 爬虫库 https://morvanzhou.github.io/tutorials/data-manipulation/scraping/5-02-scrapy/
7 关于莫烦 https://morvanzhou.github.io/about/
8 近期更新 https://morvanzhou.github.io/recent-posts/
9 Keras 教程系列 https://morvanzhou.github.io/tutorials/machine-learning/keras/
10 计算机视觉 教程系列 https://morvanzhou.github.io/tutorials/machine-learning/computer-vision/
11 迁移学习 Transfer Learning https://morvanzhou.github.io/tutorials/machine-learning/tensorflow/5-16-transfer

In [None]:
# Multi-process crawl: same round-based loop as the serial version, but the
# fetching and parsing of each round are spread over a pool of 4 workers.
unseen=set([base_url,])
seen=set()

# NOTE(review): with the "spawn" start method (default on Windows/macOS) worker
# processes may be unable to import functions defined in notebook cells --
# confirm on the target platform.
# The with-block shuts the worker processes down when the crawl finishes or
# raises, so re-running this cell does not accumulate idle workers.
with mp.Pool(4) as pool:
    count,t1=1,time.time()
    while len(unseen)!=0:
        # Under restricted_crawl, stop once more than 20 URLs have been seen.
        if restricted_crawl and len(seen)>20:
            break
        print('\nDistributed Crawling...')
        # Submit every fetch first, then collect, so the requests overlap.
        crawl_jobs=[pool.apply_async(crawl,args=(url,)) for url in unseen]
        htmls=[j.get() for j in crawl_jobs]

        print('\nDistributed Parsing...')
        parse_jobs=[pool.apply_async(parse,args=(html,)) for html in htmls]
        results=[j.get() for j in parse_jobs]

        print('\nAnalysing...')
        seen.update(unseen)      # everything fetched in this round is visited
        unseen.clear()

        for title,page_urls,url in results:
            print(count,title,url)
            count+=1
            unseen.update(page_urls-seen)   # queue only unvisited links
    # Report before leaving the with-block so pool shutdown is not timed.
    print('Total time:%.1f s'%(time.time()-t1))

In [None]:
# Multi-process crawl with a pool of 4 workers: each round fetches every URL in
# `unseen` in parallel, parses the pages in parallel, then queues new links.
unseen = set([base_url,])
seen = set()

# NOTE(review): with the "spawn" start method (default on Windows/macOS) worker
# processes may be unable to import functions defined in notebook cells --
# confirm on the target platform.
# The with-block shuts the worker processes down when the crawl finishes or
# raises, so re-running this cell does not accumulate idle workers.
with mp.Pool(4) as pool:
    count, t1 = 1, time.time()
    while len(unseen) != 0:                 # still get some url to visit
        # Under restricted_crawl, stop once more than 20 URLs have been seen.
        if restricted_crawl and len(seen) > 20:
            break
        print('\nDistributed Crawling...')
        # Submit every fetch first, then collect, so the requests overlap.
        crawl_jobs = [pool.apply_async(crawl, args=(url,)) for url in unseen]
        htmls = [j.get() for j in crawl_jobs]                                   # request connection

        print('\nDistributed Parsing...')
        parse_jobs = [pool.apply_async(parse, args=(html,)) for html in htmls]
        results = [j.get() for j in parse_jobs]                                 # parse html

        print('\nAnalysing...')
        seen.update(unseen)         # seen the crawled
        unseen.clear()              # nothing unseen

        for title, page_urls, url in results:
            print(count, title, url)
            count += 1
            unseen.update(page_urls - seen)     # get new url to crawl
    # Report before leaving the with-block so pool shutdown is not timed.
    print('Total time: %.1f s' % (time.time()-t1, ))    # original author's note: 16 s !!!