In [9]:
import requests
import re

class MyCrawler:
    """Small regex-based crawler: download a page, extract matches, append them to a file.

    Args:
        filename: Path of the UTF-8 text file that ``save`` appends to.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` argument,
            so a stalled connection raises instead of blocking forever.
    """

    def __init__(self, filename, timeout=10):
        self.filename = filename
        self.timeout = timeout
        # Browser-like User-Agent sent with every request made by this instance.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
        }

    def download(self, url):
        """Fetch ``url`` with the instance headers and return the response body as text."""
        r = requests.get(url, headers=self.headers, timeout=self.timeout)
        return r.text

    def extract(self, content, pattern):
        """Return ``re.findall(pattern, content)``.

        The result is a list of strings when ``pattern`` has zero or one
        group, and a list of tuples when it has several groups.
        """
        return re.findall(pattern, content)

    def save(self, info):
        """Append one line per item of ``info`` to ``self.filename``.

        Tuple/list items are written with their fields separated by ``|||``.
        A plain string item is written as a single field.
        """
        with open(self.filename, 'a', encoding='utf-8') as f:
            for item in info:
                # Joining a bare str would insert the separator between its
                # characters, so wrap it into a one-field record first.
                fields = (item,) if isinstance(item, str) else item
                f.write('|||'.join(fields) + '\n')

    def crawl(self, url, pattern, headers=None):
        """Download ``url``, extract ``pattern`` matches and append them to the file.

        Args:
            url: Page to download.
            pattern: Regular expression handed to ``extract``.
            headers: Optional extra request headers. They are merged into
                ``self.headers`` and therefore stay in effect for later calls.
        """
        if headers:
            self.headers.update(headers)
        content = self.download(url)
        info = self.extract(content, pattern)
        self.save(info)

In [8]:
# Fetch the Douban tag index page and collect the tag names from its table cells.
# The import and the crawler instance are set up here so that this cell does not
# depend on names that are only created by a later cell.
from lxml import html

douban_crawler = MyCrawler('douban.txt')

url = 'https://book.douban.com/tag/?view=type'
content = douban_crawler.download(url)
tree = html.fromstring(content)
tags = tree.xpath("//td/a/text()")

In [11]:
# Percent-encode the first tag so it can be placed in a URL path.
# Imported here so the cell does not rely on an import made by a later cell.
import urllib.parse

urllib.parse.quote(tags[0])

'%E5%B0%8F%E8%AF%B4'

In [13]:
import re
import time
import requests
from lxml import html
import urllib.parse

# Crawl settings. The defaults reproduce the original run: five tags, one page each.
MAX_TAGS = 5             # number of tags taken from the tag index
PAGE_SIZE = 20           # step of the `start` query parameter between pages
MAX_PAGES_PER_TAG = 1    # raise this to walk further into each tag's listing
REQUEST_DELAY = 1        # seconds to wait between two page requests

douban_crawler = MyCrawler('douban.txt')

# Read the tag names from the tag index page.
tag_list_url = 'https://book.douban.com/tag/?view=type'
tag_content = douban_crawler.download(tag_list_url)
tag_tree = html.fromstring(tag_content)
tags = tag_tree.xpath("//td/a/text()")
for tag in tags[:MAX_TAGS]:
    print('Current tag:', tag)
    tag = urllib.parse.quote(tag)
    page_id = 1
    last_start = 0
    while True:
        start_id = PAGE_SIZE * (page_id - 1)
        url = 'https://book.douban.com/tag/{}?start={}&type=T'.format(tag, start_id)
        print(url)
        content = douban_crawler.download(url)
        tree = html.fromstring(content)
        if page_id == 1:
            # The last paginator link carries the `start` offset of the final page.
            page_links = tree.xpath("//div[@class='paginator']/a[last()]/@href")
            if page_links:
                start_match = re.search(r'start=(\d+)', page_links[0])
                if start_match:
                    last_start = int(start_match.group(1))
                    print('Last Start ID: ', last_start)
        book_infos = tree.xpath("//li[@class='subject-item']")
        for book_info in book_infos:
            book_name_elem = book_info.xpath('.//h2/a')[0]
            # Drop newlines and runs of whitespace inside the title.
            book_name = re.sub(r'\s{2,}', '', book_name_elem.text_content().replace('\n', ''))
            book_url = book_name_elem.attrib['href']
            # An element's .text is None when it has no leading text, so guard
            # before calling .strip().
            book_pub_info = 'N/A'
            book_pub_elem = book_info.xpath(".//div[@class='pub']")
            if book_pub_elem and book_pub_elem[0].text:
                book_pub_info = book_pub_elem[0].text.strip()
            book_intro = 'N/A'
            book_intro_elem = book_info.xpath(".//div[@class='info']/p")
            if book_intro_elem and book_intro_elem[0].text:
                book_intro = book_intro_elem[0].text.strip()
            print(book_name)
        page_id += 1
        # >= rather than == so that an offset which is not an exact multiple of
        # PAGE_SIZE still ends the loop.
        if start_id >= last_start:
            break
        print('------------------------------------')
        # Stop at the page cap before sleeping so no delay is spent after the
        # last requested page; with a larger cap the delay runs between pages.
        if page_id > MAX_PAGES_PER_TAG:
            break
        time.sleep(REQUEST_DELAY)

Current tag: 小说
https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=0&type=T
Last Start ID:  7600
活着
房思琪的初恋乐园
白夜行
解忧杂货店
红楼梦
追风筝的人
百年孤独
小王子
围城
平凡的世界（全三部）
嫌疑人X的献身
霍乱时期的爱情
1984
飘
月亮与六便士
三体: “地球往事”三部曲之一
三体全集: 地球往事三部曲
局外人
杀死一只知更鸟
骆驼祥子
------------------------------------
Current tag: 外国文学
https://book.douban.com/tag/%E5%A4%96%E5%9B%BD%E6%96%87%E5%AD%A6?start=0&type=T
Last Start ID:  7640
小王子
追风筝的人
百年孤独
飘
1984
霍乱时期的爱情
月亮与六便士
月亮和六便士
杀死一只知更鸟
傲慢与偏见
局外人
动物农场
安徒生童话故事集
简爱（英文全本）
老人与海
基督山伯爵
哈利•波特
一个陌生女人的来信
牧羊少年奇幻之旅
肖申克的救赎
------------------------------------
Current tag: 文学
https://book.douban.com/tag/%E6%96%87%E5%AD%A6?start=0&type=T
Last Start ID:  7640
你当像鸟飞往你的山
房思琪的初恋乐园
小王子
红楼梦
百年孤独
追风筝的人
围城
活着
平凡的世界（全三部）
解忧杂货店
撒哈拉的故事
霍乱时期的爱情
月亮和六便士
1984
边城
局外人
许三观卖血记
白鹿原: 20周年精装典藏版
沉默的大多数: 王小波杂文随笔全编
云边有个小卖部
------------------------------------
Current tag: 经典
https://book.douban.com/tag/%E7%BB%8F%E5%85%B8?start=0&type=T
Last Start ID:  7820
活着
小王子
红楼梦
百年孤独
围城
飘
平凡的世界（全三部）
三体全集: 地球往事三部曲
骆驼祥子
月亮与六便士
哈利•波特
杀死一只知更

In [4]:
# Listing URLs for the first ten pages of one tag: `start` runs 0, 20, ..., 180.
urls = list(
    map(
        lambda start_id: f'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start={start_id}&type=T',
        range(0, 200, 20),
    )
)

In [5]:
# Display the generated list so the URLs can be checked by eye.
urls

['https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=0&type=T',
 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=20&type=T',
 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=40&type=T',
 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=60&type=T',
 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=80&type=T',
 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=100&type=T',
 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=120&type=T',
 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=140&type=T',
 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=160&type=T',
 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=180&type=T']

In [11]:
%%time

import concurrent.futures
import requests

# URLS = ['http://www.163.com/',
#         'http://www.sina.com.cn/',
#         'http://baidu.com/',
#         'http://youdao.com/',
#         'http://bing.com/']

# Crawler instance used by load_url below; 'douban.txt' is the filename handed to MyCrawler.
douban_crawler = MyCrawler('douban.txt')

def load_url(url):
    """Download ``url`` with the notebook-level ``douban_crawler`` and return the result.

    Kept as a plain function so it can be passed to ``executor.submit``.
    """
    # Only reading the module-level name, so no ``global`` declaration is needed.
    return douban_crawler.download(url)

# Download every URL on a pool of five worker threads. Leaving the `with`
# block waits for the workers and shuts the pool down.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # Submit one download per URL and remember which URL each future belongs to.
    future_to_url = dict(
        (executor.submit(load_url, target), target) for target in urls
    )
    # Report results in completion order, which need not match submission order.
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            data = future.result()
        except Exception as exc:
            print('%r generated an exception: %s' % (url, exc))
            continue
        print('%r page is %d bytes' % (url, len(data)))

'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=40&type=T' page is 54058 bytes
'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=80&type=T' page is 52984 bytes
'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=20&type=T' page is 52973 bytes
'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=0&type=T' page is 52753 bytes
'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=60&type=T' page is 52622 bytes
'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=120&type=T' page is 53638 bytes
'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=100&type=T' page is 52683 bytes
'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=140&type=T' page is 54098 bytes
'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=180&type=T' page is 53970 bytes
'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=160&type=T' page is 53460 bytes
Wall time: 1.11 s


In [13]:
%%time

import concurrent.futures

# URLS = ['http://www.163.com/',
#         'http://www.sina.com.cn/',
#         'http://baidu.com/',
#         'http://youdao.com/',
#         'http://bing.com/']

# Sequential version: download the URLs one after another and print each page's
# length. The cell is timed with %%time, presumably to compare against the
# thread-pool cell. Relies on `urls` and `douban_crawler` from other cells.
for url in urls:
    data = douban_crawler.download(url)
    print('%r page is %d bytes' % (url, len(data)))

'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=0&type=T' page is 52753 bytes
'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=20&type=T' page is 52973 bytes
'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=40&type=T' page is 54058 bytes
'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=60&type=T' page is 52622 bytes
'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=80&type=T' page is 52984 bytes
'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=100&type=T' page is 52683 bytes
'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=120&type=T' page is 53638 bytes
'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=140&type=T' page is 54098 bytes
'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=160&type=T' page is 53460 bytes
'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=180&type=T' page is 53970 bytes
Wall time: 2.69 s


In [19]:
%%time

import concurrent.futures
import time

from threading import Semaphore

# Semaphore with the default count of 1, used as a mutex around print().
my_semaphore = Semaphore()

def do_it(tid):
    """Simulate a four-stage task and print its log in one piece.

    Each stage sleeps for one second and records a line. The collected lines
    are printed under ``my_semaphore`` so that output from concurrently
    running tasks is not interleaved.

    Args:
        tid: Task identifier used in the log lines.

    Returns:
        Always 0.
    """
    stages = ('step 1', 'step 2', 'step 3', 'completed.')
    result = []
    for stage in stages:
        time.sleep(1)
        result.append(f'task {tid} {stage}\n')
    # `with` releases the semaphore even if print() raises, which a manual
    # acquire()/release() pair would not.
    with my_semaphore:
        print(''.join(result))
    return 0

# Run ten do_it tasks on a pool of five worker threads and report each return
# value as soon as its future finishes.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # Submit the tasks and remember which task id each future belongs to.
    future_to_tid = dict(
        (executor.submit(do_it, task_id), task_id) for task_id in range(10)
    )
    for future in concurrent.futures.as_completed(future_to_tid):
        tid = future_to_tid[future]
        try:
            data = future.result()
        except Exception as exc:
            print('%r generated an exception: %s.\n' % (tid, exc), end='')
            continue
        print('task %d return %d.\n' % (tid, data), end='')

task 1 step 1
task 1 step 2
task 1 step 3
task 1 completed.
task 1 return 0.
task 0 step 1
task 0 step 2
task 0 step 3
task 0 completed.
task 0 return 0.
task 5 step 1
task 5 step 2
task 5 step 3
task 5 completed.
task 5 return 0.
task 6 step 1
task 6 step 2
task 6 step 3
task 6 completed.
task 6 return 0.
task 7 step 1
task 7 step 2
task 7 step 3
task 7 completed.
task 7 return 0.
task 8 step 1
task 8 step 2
task 8 step 3
task 8 completed.
task 8 return 0.
task 9 step 1
task 9 step 2
task 9 step 3
task 9 completed.
task 9 return 0.
task 4 step 1
task 4 step 2
task 4 step 3
task 4 completed.
task 4 return 0.
task 2 step 1
task 2 step 2
task 2 step 3
task 2 completed.
task 2 return 0.
task 3 step 1
task 3 step 2
task 3 step 3
task 3 completed.
task 3 return 0.
Wall time: 20 s


In [4]:
print?

Docstring:
print(value, ..., sep=' ', end='\n', file=sys.stdout, flush=False)

Prints the values to a stream, or to sys.stdout by default.
Optional keyword arguments:
file:  a file-like object (stream); defaults to the current sys.stdout.
sep:   string inserted between values, default a space.
end:   string appended after the last value, default a newline.
flush: whether to forcibly flush the stream.
Type:      builtin_function_or_method


In [10]:
# sep='\t' puts a tab between the values and end='' suppresses the trailing
# newline, so the three calls print back to back on a single line.
print(1,2,3,sep='\t',end='')
print(1,2,3,sep='\t',end='')
print(1,2,3,sep='\t',end='')

1	2	31	2	31	2	3