In [None]:
# default_exp proxy

In [None]:
%reload_ext autoreload
%autoreload 2

In [None]:
# hide
!nbdev_build_lib --fname 11_Proxy_Request.ipynb

Converted 11_Proxy_Request.ipynb.


# TODO
- [x] 2020-03-15 抽象成Class

# 代理爬虫
> 很多网站都有反爬虫机制，一个IP频繁访问一个网站，就会出现访问被拒绝的情况，所以换IP可以解决这个问题。（运用技术请克制，避免过度浪费服务器资源）

## 测试代理
> 有专用的付费代理IP稳定可靠，也有免费的代理IP可能会随时失效

In [9]:
# export
import requests,json,re,random,sys,time,os
from bs4 import BeautifulSoup,Tag,NavigableString

from crawler_from_scratch.utils import *

from concurrent.futures import ThreadPoolExecutor
import pandas as pd


先从 https://www.freeip.top/ 随便拿个ip来测试

In [10]:
# hide
# Smoke-test a single free proxy against Baidu.
url = 'https://www.baidu.com/'
headers={'user-agent':'Mozilla/5.0'}
proxies = {'https': 'https://64.227.1.188:8080'}
# Free proxies go offline all the time, so a failure here is expected. Catch and
# display it instead of letting the exception abort a "Restart & Run All".
try:
    res = requests.get(url,proxies=proxies,headers=headers,timeout=5)
except requests.exceptions.RequestException as err:
    res = None
    print(f'{type(err).__name__}: {err}')
res

ConnectTimeout: HTTPSConnectionPool(host='www.baidu.com', port=443): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc1b00158d0>, 'Connection to 64.227.1.188 timed out. (connect timeout=5)'))

## 制作代理池
> 网上有专门整理的[代理池](https://github.com/jhao104/proxy_pool)，但需要配置数据库，所以不在这里演示，而是自己写个爬虫，通过[这个网站的API](https://github.com/jiangxianli/ProxyIpLib)获取IP

In [3]:
# export
class Proxy():
    """A proxy pool: crawls free proxy IPs and fetches pages through them.

    Every proxy record carries a `health` score. A request answered with
    HTTP 200 adds 1 to it; any other outcome halves it. Requests are sent
    through a random pick among the healthiest proxies.
    """
    def __init__(self, path='./data/11_Proxy.json', debug=True):
        """Load the pool from `path`, or crawl a fresh one when the file is missing.

        path: JSON file in which the pool (`self.db`) is persisted.
        debug: when true, progress and error details are printed.
        """
        self.db = {}
        self.path = path
        self.debug = debug

        if os.path.exists(self.path):
            with open(self.path, 'r') as f:
                self.db = json.load(f)
                print('加载成功',len(self.db.keys()))
        else:
            # `update` is a bound method, so it must be called without passing `self` again.
            self.update()

    def _by_protocol(self, protocol):
        """Return the proxy records in `self.db` whose 'protocol' equals `protocol`."""
        return [item for item in self.db.values() if item['protocol'] == protocol]

    def update(self, max_retries=3):
        """Re-crawl the proxy list, reset every health to 50, store it in `self.db` and `self.path`.

        max_retries: number of consecutive non-200 responses tolerated for one
            page before paging stops (this used to retry forever).

        Network errors raised by `requests` propagate to the caller. When nothing
        could be downloaded the current pool is left untouched.
        """
        data = []
        next_page_url = 'https://www.freeip.top/api/proxy_ips?page=1'
        failures = 0
        while next_page_url:
            if self.debug: print('start:',next_page_url)
            # A timeout keeps a stalled API server from hanging the crawl indefinitely.
            res = requests.get(next_page_url, timeout=10)
            if res.status_code == 200:
                page = res.json()['data']
                data += page['data']
                next_page_url = page['next_page_url']
                failures = 0
            else:
                failures += 1
                if failures >= max_retries:
                    if self.debug: print('error:',next_page_url,res.status_code)
                    break
            time.sleep(1)

        if not data:
            # Nothing fetched: keep the existing pool rather than overwriting it with an empty one.
            return

        # Turn the list of records into a dict keyed by unique_id and add the health score.
        self.db = {}
        for d in data:
            d['health'] = 50
            self.db[d['unique_id']] = d

        self.save()

        self.validate('http://www.baidu.com/')
        self.validate('https://www.baidu.com/')

    def save(self):
        """Write `self.db` to `self.path` as JSON, creating the parent directory if needed."""
        folder = os.path.dirname(self.path)
        if folder:
            os.makedirs(folder, exist_ok=True)
        with open(self.path, 'w') as f:
            json.dump(self.db,f)
            print('更新成功',len(self.db.keys()))

    def validate(self,url,max_workers=50):
        """Request `url` through every stored proxy of the matching protocol, in parallel.

        The protocol is taken from the scheme of `url`. Each request updates the
        health of the proxy it used (see `_get`).
        """
        protocol = url.split(':')[0]
        candidates = self._by_protocol(protocol)

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            executor.map(lambda ip_obj : self._get(url,ip_obj),candidates)

    def choose_healthy_ip(self,protocol,top_n=10):
        """Return a random proxy record among the `top_n` healthiest for `protocol`.

        Raises IndexError when the pool holds no proxy for that protocol.
        """
        candidates = self._by_protocol(protocol)
        if not candidates:
            # Same exception type random.choice([]) raised before, now with a usable message.
            raise IndexError(f'no proxy available for protocol {protocol!r}')
        ranked = sorted(candidates,
                        key = lambda item : item['health'],
                        reverse=True)
        return random.choice(ranked[:top_n])

    def update_ip_health(self,res,obj):
        """Adjust `obj['health']` from the response: +1 on HTTP 200, halved otherwise."""
        if res.status_code == 200:
            obj['health'] += 1
        else:
            obj['health'] = int(obj['health']/2)
        if self.debug: print(obj['ip'],'健康值变为：',obj['health'])

    def _get(self,url,ip_obj=None):
        """Fetch `url` once through one proxy and update that proxy's health.

        ip_obj: proxy record to use; when omitted (or empty) one of the
            healthiest proxies for the URL's protocol is chosen.

        Returns the `requests.Response`. If the request raised, an empty
        `Response` (whose `status_code` is None) is returned instead.
        """
        protocol = url.split(':')[0]
        if not ip_obj: ip_obj = self.choose_healthy_ip(protocol)
        ip = f"{ip_obj['protocol']}://{ip_obj['ip']}:{ip_obj['port']}"

        try:
            res = requests.get(url,
                               proxies={protocol: ip},
                               headers={'user-agent':'Mozilla/5.0'},
                               timeout=5)
        except Exception:
            # Best effort: a dead proxy must not abort the caller. `Exception` (not a bare
            # except) still lets KeyboardInterrupt / SystemExit through.
            if self.debug: print(f'error: {ip}\n{sys.exc_info()}\n')
            res = requests.Response()

        self.update_ip_health(res,ip_obj)

        return res

    def get(self,url,max_tries=10):
        """Fetch `url`, switching to another proxy after each failure.

        max_tries: maximum number of attempts (at least one is always made).

        Returns the first response with status 200, or the last failed
        response once all attempts are used up.
        """
        for try_times in range(1, max(1, max_tries) + 1):
            if self.debug : print('\n',try_times,url)
            res = self._get(url)
            if res.status_code == 200:
                print('访问成功：',url)
                return res
        print('访问失败：',url)
        return res
    


In [None]:
# hide
# Build the pool object, then force a fresh crawl of the proxy list;
# `update` also re-validates the new proxies against Baidu over http and https.
px = Proxy()
px.update()

加载成功 365
start: https://www.freeip.top/api/proxy_ips?page=1
start: https://www.freeip.top/api/proxy_ips?page=2
start: https://www.freeip.top/api/proxy_ips?page=3
start: https://www.freeip.top/api/proxy_ips?page=4
start: https://www.freeip.top/api/proxy_ips?page=5
start: https://www.freeip.top/api/proxy_ips?page=6
start: https://www.freeip.top/api/proxy_ips?page=7
start: https://www.freeip.top/api/proxy_ips?page=8
start: https://www.freeip.top/api/proxy_ips?page=9
start: https://www.freeip.top/api/proxy_ips?page=10
start: https://www.freeip.top/api/proxy_ips?page=11
start: https://www.freeip.top/api/proxy_ips?page=12
start: https://www.freeip.top/api/proxy_ips?page=13
start: https://www.freeip.top/api/proxy_ips?page=14
start: https://www.freeip.top/api/proxy_ips?page=15
start: https://www.freeip.top/api/proxy_ips?page=16
start: https://www.freeip.top/api/proxy_ips?page=17
start: https://www.freeip.top/api/proxy_ips?page=18
start: https://www.freeip.top/api/proxy_ips?page=19
start: https

183.232.232.69 健康值变为： 51
163.172.146.119 健康值变为： 51
183.56.161.62 健康值变为： 51
167.71.197.226 健康值变为： 51
159.203.166.41 健康值变为： 51
error: http://128.199.245.21:44344
(<class 'requests.exceptions.ConnectionError'>, ConnectionError(ReadTimeoutError("HTTPConnectionPool(host='128.199.245.21', port=44344): Read timed out.")), <traceback object at 0x120a81370>)

128.199.245.21 健康值变为： 25
178.128.87.98 健康值变为： 51
178.128.126.135 健康值变为： 51
128.199.177.120 健康值变为： 51
185.80.128.166 健康值变为： 51
157.230.241.171 健康值变为： 51
159.203.164.91 健康值变为： 51
176.53.40.222 健康值变为： 51
138.197.133.199 健康值变为： 51
151.253.165.70 健康值变为： 51
178.128.16.115 健康值变为： 51
103.235.46.121 健康值变为： 51
183.146.213.198 健康值变为： 51
error: http://121.67.3.3:8080
(<class 'requests.exceptions.ConnectionError'>, ConnectionError(ReadTimeoutError("HTTPConnectionPool(host='121.67.3.3', port=8080): Read timed out.")), <traceback object at 0x10ddee050>)

121.67.3.3 健康值变为： 25
128.199.237.185 健康值变为： 51
218.75.158.153 健康值变为： 51
error: http://167.99.185.216:

51.79.85.125 健康值变为： 51
68.183.178.107 健康值变为： 51
error: http://60.51.170.27:80
(<class 'requests.exceptions.ConnectionError'>, ConnectionError(ReadTimeoutError("HTTPConnectionPool(host='60.51.170.27', port=80): Read timed out.")), <traceback object at 0x10de3a5a0>)

60.51.170.27 健康值变为： 25
85.95.220.32 健康值变为： 51
error: http://190.103.178.14:8080
(<class 'requests.exceptions.ConnectionError'>, ConnectionError(ReadTimeoutError("HTTPConnectionPool(host='190.103.178.14', port=8080): Read timed out.")), <traceback object at 0x120cecf00>)

190.103.178.14 健康值变为： 25
68.183.237.110 健康值变为： 51
84.17.47.187 健康值变为： 51
84.17.47.183 健康值变为： 51
84.17.47.190 健康值变为： 51
error: http://84.17.47.191:80
(<class 'requests.exceptions.ConnectionError'>, ConnectionError(ReadTimeoutError("HTTPConnectionPool(host='84.17.47.191', port=80): Read timed out.")), <traceback object at 0x120cd8d70>)

84.17.47.191 健康值变为： 25
200.89.178.210 健康值变为： 51
84.17.47.184 健康值变为： 51
78.166.75.176 健康值变为： 51
84.17.47.182 健康值变为： 51
84.17.4

178.128.31.220 健康值变为： 51
58.240.97.154 健康值变为： 51
209.97.183.194 健康值变为： 51
52.80.58.248 健康值变为： 51
178.128.221.73 健康值变为： 51
5.44.107.147 健康值变为： 51
51.158.114.177 健康值变为： 51
51.158.108.135 健康值变为： 51
error: https://206.189.154.176:8080
(<class 'requests.exceptions.ProxyError'>, ProxyError(MaxRetryError("HTTPSConnectionPool(host='www.baidu.com', port=443): Max retries exceeded with url: / (Caused by ProxyError('Cannot connect to proxy.', timeout('timed out')))")), <traceback object at 0x1209a19b0>)

206.189.154.176 健康值变为： 25
52.140.242.103 健康值变为： 51
176.53.40.222 健康值变为： 51
178.128.87.184 健康值变为： 51
128.199.66.13 健康值变为： 51
200.73.128.63 健康值变为： 51
24.113.36.247 健康值变为： 51
51.158.99.51 健康值变为： 51
51.158.165.18 健康值变为： 51
51.158.123.250 健康值变为： 51
error: https://46.28.95.11:3128
(<class 'requests.exceptions.ConnectTimeout'>, ConnectTimeout(MaxRetryError("HTTPSConnectionPool(host='www.baidu.com', port=443): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.VerifiedHTT

## 自动切换代理
> 优先选择健康值高的ip，一次请求，成功健康值+1，失败则减半

In [4]:
# hide
# No proxy is passed, so `_get` picks one at random among the healthiest https proxies.
px = Proxy()
px._get('https://www.baidu.com/')

加载成功 364
error: https://206.189.37.101:44344
(<class 'requests.exceptions.ProxyError'>, ProxyError(MaxRetryError("HTTPSConnectionPool(host='www.baidu.com', port=443): Max retries exceeded with url: / (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fc1b0015f10>: Failed to establish a new connection: [Errno 61] Connection refused')))")), <traceback object at 0x7fc1b001b1e0>)

206.189.37.101 健康值变为： 13


<Response [None]>

## 校验&更替代理
> 用百度批量测试代理IP的有效性

In [5]:
# hide
# Send one request to Baidu through every stored http proxy (in parallel);
# each outcome raises or halves the health of the proxy that was used.
px = Proxy()
px.validate('http://www.baidu.com/')

加载成功 364
112.80.255.29 健康值变为： 1
112.80.248.18 健康值变为： 58
112.80.255.51 健康值变为： 1
112.80.255.77 健康值变为： 1
112.80.248.95 健康值变为： 1
103.235.46.121103.235.46.154 健康值变为： 1
 健康值变为： 1
error: http://103.105.49.53:80
(<class 'requests.exceptions.ProxyError'>, ProxyError(MaxRetryError("HTTPConnectionPool(host='103.105.49.53', port=80): Max retries exceeded with url: http://www.baidu.com/ (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fc202b9d990>: Failed to establish a new connection: [Errno 61] Connection refused')))")), <traceback object at 0x7fc202c2f960>)

103.105.49.53 健康值变为： 0
error: http://104.244.77.254:8080
(<class 'requests.exceptions.ProxyError'>, ProxyError(MaxRetryError("HTTPConnectionPool(host='104.244.77.254', port=8080): Max retries exceeded with url: http://www.baidu.com/ (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fc202bccbd0>: Failed to e

error: http://139.59.101.223:8080
(<class 'requests.exceptions.ProxyError'>, ProxyError(MaxRetryError("HTTPConnectionPool(host='139.59.101.223', port=8080): Max retries exceeded with url: http://www.baidu.com/ (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fc202c42590>: Failed to establish a new connection: [Errno 61] Connection refused')))")), <traceback object at 0x7fc1c0144870>)

139.59.101.223 健康值变为： 0
error: http://139.59.53.107:8080
(<class 'requests.exceptions.ProxyError'>, ProxyError(MaxRetryError("HTTPConnectionPool(host='139.59.53.107', port=8080): Max retries exceeded with url: http://www.baidu.com/ (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fc202c4ac90>: Failed to establish a new connection: [Errno 61] Connection refused')))")), <traceback object at 0x7fc1c014a500>)

139.59.53.107 健康值变为： 0
error: http://157.230.246.208:44344
(<cla

error: http://116.114.19.211:443
(<class 'requests.exceptions.ReadTimeout'>, ReadTimeout(ReadTimeoutError("HTTPConnectionPool(host='116.114.19.211', port=443): Read timed out. (read timeout=5)")), <traceback object at 0x7fc1e03e53c0>)

116.114.19.211 健康值变为： 28
error: http://116.114.19.204:443
(<class 'requests.exceptions.ReadTimeout'>, ReadTimeout(ReadTimeoutError("HTTPConnectionPool(host='116.114.19.204', port=443): Read timed out. (read timeout=5)")), <traceback object at 0x7fc1e03e5a50>)

116.114.19.204 健康值变为： 28
error: http://128.199.172.214:8080
(<class 'requests.exceptions.ConnectTimeout'>, ConnectTimeout(MaxRetryError("HTTPConnectionPool(host='128.199.172.214', port=8080): Max retries exceeded with url: http://www.baidu.com/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7fc202c25e10>, 'Connection to 128.199.172.214 timed out. (connect timeout=5)'))")), <traceback object at 0x7fc1e03e1f00>)

128.199.172.214 健康值变为： 1
error: http://128.199.177.120:80

error: http://178.128.28.124:44344
(<class 'requests.exceptions.ProxyError'>, ProxyError(MaxRetryError("HTTPConnectionPool(host='178.128.28.124', port=44344): Max retries exceeded with url: http://www.baidu.com/ (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fc1e03e0350>: Failed to establish a new connection: [Errno 61] Connection refused')))")), <traceback object at 0x7fc202bb9e10>)

178.128.28.124 健康值变为： 15
180.97.104.97 健康值变为： 1
180.97.33.144 健康值变为： 1
180.97.33.218 健康值变为： 1
180.97.33.78 健康值变为： 1
180.97.33.249 健康值变为： 1
180.97.33.66 健康值变为： 1
180.97.104.72 健康值变为： 1
180.97.33.93 健康值变为： 1
180.97.33.94 健康值变为： 1
error: http://178.128.87.184:44344
(<class 'requests.exceptions.ProxyError'>, ProxyError(MaxRetryError("HTTPConnectionPool(host='178.128.87.184', port=44344): Max retries exceeded with url: http://www.baidu.com/ (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.HTTPConnect

error: http://150.109.32.166:80
(<class 'requests.exceptions.ConnectionError'>, ConnectionError(ReadTimeoutError("HTTPConnectionPool(host='150.109.32.166', port=80): Read timed out.")), <traceback object at 0x7fc202c6f820>)

150.109.32.166 健康值变为： 0
error: http://157.230.249.183:44344
(<class 'requests.exceptions.ConnectTimeout'>, ConnectTimeout(MaxRetryError("HTTPConnectionPool(host='157.230.249.183', port=44344): Max retries exceeded with url: http://www.baidu.com/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7fc1c0154d10>, 'Connection to 157.230.249.183 timed out. (connect timeout=5)'))")), <traceback object at 0x7fc1c01533c0>)

157.230.249.183 健康值变为： 2
error: http://222.173.10.82:8888
(<class 'requests.exceptions.ProxyError'>, ProxyError(MaxRetryError("HTTPConnectionPool(host='222.173.10.82', port=8888): Max retries exceeded with url: http://www.baidu.com/ (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.HTTPC

error: http://178.128.85.90:44344
(<class 'requests.exceptions.ConnectTimeout'>, ConnectTimeout(MaxRetryError("HTTPConnectionPool(host='178.128.85.90', port=44344): Max retries exceeded with url: http://www.baidu.com/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7fc1e03cd110>, 'Connection to 178.128.85.90 timed out. (connect timeout=5)'))")), <traceback object at 0x7fc2102cdaf0>)

178.128.85.90 健康值变为： 0
error: http://178.128.87.98:44344
(<class 'requests.exceptions.ConnectTimeout'>, ConnectTimeout(MaxRetryError("HTTPConnectionPool(host='178.128.87.98', port=44344): Max retries exceeded with url: http://www.baidu.com/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7fc1e03bf890>, 'Connection to 178.128.87.98 timed out. (connect timeout=5)'))")), <traceback object at 0x7fc2102bce60>)

178.128.87.98 健康值变为： 3
error: http://178.128.96.174:44344
(<class 'requests.exceptions.ConnectTimeout'>, ConnectTimeout(MaxRetryError("HTTPConn

error: http://200.89.159.240:80
(<class 'requests.exceptions.ConnectTimeout'>, ConnectTimeout(MaxRetryError("HTTPConnectionPool(host='200.89.159.240', port=80): Max retries exceeded with url: http://www.baidu.com/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7fc1e02476d0>, 'Connection to 200.89.159.240 timed out. (connect timeout=5)'))")), <traceback object at 0x7fc1e023afa0>)

200.89.159.240 健康值变为： 28
error: http://51.158.186.242:8811
(<class 'requests.exceptions.ProxyError'>, ProxyError(MaxRetryError("HTTPConnectionPool(host='51.158.186.242', port=8811): Max retries exceeded with url: http://www.baidu.com/ (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fc2102b1cd0>: Failed to establish a new connection: [Errno 61] Connection refused')))")), <traceback object at 0x7fc1e023deb0>)

51.158.186.242 健康值变为： 28
error: http://203.202.245.62:80
(<class 'requests.exceptions.ConnectTimeout'>, 

error: http://37.120.159.63:80
(<class 'requests.exceptions.ConnectTimeout'>, ConnectTimeout(MaxRetryError("HTTPConnectionPool(host='37.120.159.63', port=80): Max retries exceeded with url: http://www.baidu.com/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7fc1c014d250>, 'Connection to 37.120.159.63 timed out. (connect timeout=5)'))")), <traceback object at 0x7fc1e024bc80>)

37.120.159.63 健康值变为： 28
error: http://37.120.159.64:80
(<class 'requests.exceptions.ConnectTimeout'>, ConnectTimeout(MaxRetryError("HTTPConnectionPool(host='37.120.159.64', port=80): Max retries exceeded with url: http://www.baidu.com/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7fc2102c1f50>, 'Connection to 37.120.159.64 timed out. (connect timeout=5)'))")), <traceback object at 0x7fc1e028e7d0>)

37.120.159.64 健康值变为： 28
error: http://37.120.159.68:80
(<class 'requests.exceptions.ConnectTimeout'>, ConnectTimeout(MaxRetryError("HTTPConnectionPool(hos

error: http://68.183.237.110:8080
(<class 'requests.exceptions.ProxyError'>, ProxyError(MaxRetryError("HTTPConnectionPool(host='68.183.237.110', port=8080): Max retries exceeded with url: http://www.baidu.com/ (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fc2102e4d50>: Failed to establish a new connection: [Errno 61] Connection refused')))")), <traceback object at 0x7fc1e03e9af0>)

68.183.237.110 健康值变为： 2
error: http://69.55.55.214:3128
(<class 'requests.exceptions.ProxyError'>, ProxyError(MaxRetryError("HTTPConnectionPool(host='69.55.55.214', port=3128): Max retries exceeded with url: http://www.baidu.com/ (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fc1c0145e90>: Failed to establish a new connection: [Errno 61] Connection refused')))")), <traceback object at 0x7fc1e03d1820>)

69.55.55.214 健康值变为： 3
error: http://80.187.140.26:80
(<class 'requ

error: http://91.196.92.24:8080
(<class 'requests.exceptions.ProxyError'>, ProxyError(MaxRetryError("HTTPConnectionPool(host='91.196.92.24', port=8080): Max retries exceeded with url: http://www.baidu.com/ (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fc1e040be10>: Failed to establish a new connection: [Errno 61] Connection refused')))")), <traceback object at 0x7fc1e0230460>)

91.196.92.24 健康值变为： 28
error: http://61.135.185.176:80
(<class 'requests.exceptions.ConnectTimeout'>, ConnectTimeout(MaxRetryError("HTTPConnectionPool(host='61.135.185.176', port=80): Max retries exceeded with url: http://www.baidu.com/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7fc1e0247110>, 'Connection to 61.135.185.176 timed out. (connect timeout=5)'))")), <traceback object at 0x7fc1e0427730>)

61.135.185.176 健康值变为： 0
error: http://61.135.185.38:80
(<class 'requests.exceptions.ConnectTimeout'>, ConnectT

error: http://84.17.47.194:80
(<class 'requests.exceptions.ConnectTimeout'>, ConnectTimeout(MaxRetryError("HTTPConnectionPool(host='84.17.47.194', port=80): Max retries exceeded with url: http://www.baidu.com/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7fc2102fa9d0>, 'Connection to 84.17.47.194 timed out. (connect timeout=5)'))")), <traceback object at 0x7fc1e024f3c0>)

84.17.47.194 健康值变为： 28
error: http://84.17.47.195:80
(<class 'requests.exceptions.ConnectTimeout'>, ConnectTimeout(MaxRetryError("HTTPConnectionPool(host='84.17.47.195', port=80): Max retries exceeded with url: http://www.baidu.com/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7fc21030ec50>, 'Connection to 84.17.47.195 timed out. (connect timeout=5)'))")), <traceback object at 0x7fc210313a50>)

84.17.47.195 健康值变为： 28
error: http://84.17.47.196:80
(<class 'requests.exceptions.ConnectTimeout'>, ConnectTimeout(MaxRetryError("HTTPConnectionPool(host='84.17.

## 用代理爬取豆瓣页面
> 昨天用豆瓣页面测试爬虫功能的时候，就发现了访问频率过高会被拒绝的问题，今天就用代理爬虫来抓取整个“互联网”类目下的图书信息

In [6]:
# One listing URL per page offset: start = 0, 20, ..., 980.
url_list = [
    f'https://book.douban.com/tag/%E4%BA%92%E8%81%94%E7%BD%91?start={offset}&type=T'
    for offset in range(0,1000,20)
]
len(url_list)

50

In [7]:
def get_douban_data(url,px,data):
    """Fetch one Douban tag page through the proxy pool and collect its books into `data`.

    url: listing page to download.
    px: object whose `get(url)` returns a response exposing `status_code` and `text`.
    data: dict filled in place, keyed by the numeric id found in each item's 'a_nbg_url'.

    Failed downloads and pages without a book list are reported with `print`;
    nothing is raised for them, so one bad page cannot kill a worker thread silently.
    """
    res = px.get(url)

    if res.status_code != 200:
        print(res,res.text)
        return

    # Name the parser explicitly: otherwise bs4 picks whichever parser happens to be
    # installed (and warns), so the parse tree could differ between machines.
    soup = BeautifulSoup(res.text,'html.parser')
    # A 200 answer is not necessarily a listing page, so the list may be absent.
    main_content = soup.body.find('ul',class_='subject-list') if soup.body else None
    if main_content is None:
        print('未找到图书列表：',url)
        return

    for c in get_children(main_content):
        item_data = get_data(c)
        # assumes get_data returns a dict — TODO confirm against crawler_from_scratch.utils
        found = re.search(r'/(\d+)/',item_data.get('a_nbg_url') or '')
        if found is None:
            # Entry without a subject link/id: skip it instead of aborting the whole page.
            continue
        # Store the item under its id.
        data[found.group(1)] = item_data

In [8]:
# hide
# Fresh pool with verbose logging, plus the dict that will receive the scraped books.
px = Proxy()
px.debug = True
data = {}

# Probe Douban three times in a row so each proxy's health reflects repeated
# outcomes (+1 per success, halved per failure), then persist the scores.
for attempt in range(3):
    px.validate('https://book.douban.com')

px.save()


加载成功 364
error: https://172.105.23.17:8080
(<class 'requests.exceptions.ProxyError'>, ProxyError(MaxRetryError("HTTPSConnectionPool(host='book.douban.com', port=443): Max retries exceeded with url: / (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fc1e0233850>: Failed to establish a new connection: [Errno 61] Connection refused')))")), <traceback object at 0x7fc201565690>)
error: https://138.197.135.237:8080
(<class 'requests.exceptions.ProxyError'>, ProxyError(MaxRetryError("HTTPSConnectionPool(host='book.douban.com', port=443): Max retries exceeded with url: / (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fc1c0128ed0>: Failed to establish a new connection: [Errno 61] Connection refused')))")), <traceback object at 0x7fc1c007db40>)

138.197.135.237 健康值变为： 3

172.105.23.17 健康值变为： 26
error: https://157.245.224.29:8080
(<class 'requests.exception

error: https://198.100.154.150:3128
(<class 'requests.exceptions.ProxyError'>, ProxyError(MaxRetryError("HTTPSConnectionPool(host='book.douban.com', port=443): Max retries exceeded with url: / (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fc202b9d950>: Failed to establish a new connection: [Errno 61] Connection refused')))")), <traceback object at 0x7fc1e027b4b0>)

198.100.154.150 健康值变为： 3
error: https://178.128.28.124:44344
(<class 'requests.exceptions.ProxyError'>, ProxyError(MaxRetryError("HTTPSConnectionPool(host='book.douban.com', port=443): Max retries exceeded with url: / (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fc1f019f510>: Failed to establish a new connection: [Errno 61] Connection refused')))")), <traceback object at 0x7fc1e042afa0>)

178.128.28.124 健康值变为： 3
error: https://178.128.31.220:44344
(<class 'requests.exceptions.Prox

error: https://80.187.140.26:8080
(<class 'requests.exceptions.ProxyError'>, ProxyError(MaxRetryError("HTTPSConnectionPool(host='book.douban.com', port=443): Max retries exceeded with url: / (Caused by ProxyError('Cannot connect to proxy.', ConnectionResetError(54, 'Connection reset by peer')))")), <traceback object at 0x7fc202c5ab90>)

error: https://178.128.18.144:44344
(<class 'requests.exceptions.ConnectTimeout'>, ConnectTimeout(MaxRetryError("HTTPSConnectionPool(host='book.douban.com', port=443): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc1e0247bd0>, 'Connection to 178.128.18.144 timed out. (connect timeout=5)'))")), <traceback object at 0x7fc1c00f25f0>)
error: https://178.128.16.115:44344
(<class 'requests.exceptions.ConnectTimeout'>, ConnectTimeout(MaxRetryError("HTTPSConnectionPool(host='book.douban.com', port=443): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSCon

80.187.140.26 健康值变为： 3
error: https://14.140.131.82:3128
(<class 'requests.exceptions.ProxyError'>, ProxyError(MaxRetryError("HTTPSConnectionPool(host='book.douban.com', port=443): Max retries exceeded with url: / (Caused by ProxyError('Cannot connect to proxy.', timeout('timed out')))")), <traceback object at 0x7fc1c013f460>)

14.140.131.82 健康值变为： 3
error: https://209.97.164.211:47503
(<class 'requests.exceptions.ConnectTimeout'>, ConnectTimeout(MaxRetryError("HTTPSConnectionPool(host='book.douban.com', port=443): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc1e0221a90>, 'Connection to 209.97.164.211 timed out. (connect timeout=5)'))")), <traceback object at 0x7fc1e04235a0>)

209.97.164.211 健康值变为： 3
error: https://221.126.249.100:8080
(<class 'requests.exceptions.ConnectTimeout'>, ConnectTimeout(MaxRetryError("HTTPSConnectionPool(host='book.douban.com', port=443): Max retries exceeded with url: / (Caused by ConnectTi

error: https://91.229.192.93:80
(<class 'requests.exceptions.ConnectTimeout'>, ConnectTimeout(MaxRetryError("HTTPSConnectionPool(host='book.douban.com', port=443): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc1e0247890>, 'Connection to 91.229.192.93 timed out. (connect timeout=5)'))")), <traceback object at 0x7fc1e0409320>)

91.229.192.93 健康值变为： 26
error: https://88.99.10.254:1080
(<class 'requests.exceptions.ProxyError'>, ProxyError(MaxRetryError("HTTPSConnectionPool(host='book.douban.com', port=443): Max retries exceeded with url: / (Caused by ProxyError('Cannot connect to proxy.', timeout('_ssl.c:1074: The handshake operation timed out')))")), <traceback object at 0x7fc1c005d780>)

88.99.10.254 健康值变为： 3
error: https://138.197.135.237:8080
(<class 'requests.exceptions.ProxyError'>, ProxyError(MaxRetryError("HTTPSConnectionPool(host='book.douban.com', port=443): Max retries exceeded with url: / (Caused by ProxyError

error: https://198.100.154.150:3128
(<class 'requests.exceptions.ProxyError'>, ProxyError(MaxRetryError("HTTPSConnectionPool(host='book.douban.com', port=443): Max retries exceeded with url: / (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fc1e03bf0d0>: Failed to establish a new connection: [Errno 61] Connection refused')))")), <traceback object at 0x7fc1c005bfa0>)

198.100.154.150 健康值变为： 1
error: https://178.128.31.220:44344
(<class 'requests.exceptions.ProxyError'>, ProxyError(MaxRetryError("HTTPSConnectionPool(host='book.douban.com', port=443): Max retries exceeded with url: / (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fc1e0258910>: Failed to establish a new connection: [Errno 61] Connection refused')))")), <traceback object at 0x7fc202bab9b0>)
error: https://178.128.28.124:44344
(<class 'requests.exceptions.ProxyError'>, ProxyError(MaxR

error: https://80.187.140.26:8080
(<class 'requests.exceptions.ProxyError'>, ProxyError(MaxRetryError("HTTPSConnectionPool(host='book.douban.com', port=443): Max retries exceeded with url: / (Caused by ProxyError('Cannot connect to proxy.', ConnectionResetError(54, 'Connection reset by peer')))")), <traceback object at 0x7fc1c006a780>)

error: https://178.128.16.115:44344
(<class 'requests.exceptions.ConnectTimeout'>, ConnectTimeout(MaxRetryError("HTTPSConnectionPool(host='book.douban.com', port=443): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc202bf5310>, 'Connection to 178.128.16.115 timed out. (connect timeout=5)'))")), <traceback object at 0x7fc1c006a690>)

178.128.16.115 健康值变为： 1
error: https://69.55.55.214:3128
(<class 'requests.exceptions.ProxyError'>, ProxyError(MaxRetryError("HTTPSConnectionPool(host='book.douban.com', port=443): Max retries exceeded with url: / (Caused by ProxyError('Cannot connect to prox

error: https://14.140.131.82:3128
(<class 'requests.exceptions.ProxyError'>, ProxyError(MaxRetryError("HTTPSConnectionPool(host='book.douban.com', port=443): Max retries exceeded with url: / (Caused by ProxyError('Cannot connect to proxy.', timeout('timed out')))")), <traceback object at 0x7fc1e043eaa0>)

14.140.131.82 健康值变为： 1
error: https://209.97.164.211:47503
(<class 'requests.exceptions.ConnectTimeout'>, ConnectTimeout(MaxRetryError("HTTPSConnectionPool(host='book.douban.com', port=443): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc1e0467b90>, 'Connection to 209.97.164.211 timed out. (connect timeout=5)'))")), <traceback object at 0x7fc202b10d20>)

209.97.164.211 健康值变为： 1
error: https://221.126.249.100:8080
(<class 'requests.exceptions.ConnectTimeout'>, ConnectTimeout(MaxRetryError("HTTPSConnectionPool(host='book.douban.com', port=443): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.con

error: https://88.99.10.254:1080
(<class 'requests.exceptions.ProxyError'>, ProxyError(MaxRetryError("HTTPSConnectionPool(host='book.douban.com', port=443): Max retries exceeded with url: / (Caused by ProxyError('Cannot connect to proxy.', timeout('_ssl.c:1074: The handshake operation timed out')))")), <traceback object at 0x7fc1c012c870>)

88.99.10.254 健康值变为： 1
error: https://91.229.192.93:80
(<class 'requests.exceptions.ConnectTimeout'>, ConnectTimeout(MaxRetryError("HTTPSConnectionPool(host='book.douban.com', port=443): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc21030ce10>, 'Connection to 91.229.192.93 timed out. (connect timeout=5)'))")), <traceback object at 0x7fc2102df910>)

91.229.192.93 健康值变为： 13
error: https://138.197.135.237:8080
(<class 'requests.exceptions.ProxyError'>, ProxyError(MaxRetryError("HTTPSConnectionPool(host='book.douban.com', port=443): Max retries exceeded with url: / (Caused by ProxyError

error: https://198.100.154.150:3128
(<class 'requests.exceptions.ProxyError'>, ProxyError(MaxRetryError("HTTPSConnectionPool(host='book.douban.com', port=443): Max retries exceeded with url: / (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fc1c01207d0>: Failed to establish a new connection: [Errno 61] Connection refused')))")), <traceback object at 0x7fc1b0016730>)

198.100.154.150 健康值变为： 0
error: https://178.128.31.220:44344
(<class 'requests.exceptions.ProxyError'>, ProxyError(MaxRetryError("HTTPSConnectionPool(host='book.douban.com', port=443): Max retries exceeded with url: / (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fc1e026c1d0>: Failed to establish a new connection: [Errno 61] Connection refused')))")), <traceback object at 0x7fc1e025ec80>)

178.128.31.220 健康值变为： 0
error: https://178.128.87.184:44344
(<class 'requests.exceptions.Prox

error: https://80.187.140.26:8080
(<class 'requests.exceptions.ProxyError'>, ProxyError(MaxRetryError("HTTPSConnectionPool(host='book.douban.com', port=443): Max retries exceeded with url: / (Caused by ProxyError('Cannot connect to proxy.', ConnectionResetError(54, 'Connection reset by peer')))")), <traceback object at 0x7fc1e04185f0>)

error: https://178.128.16.115:44344
(<class 'requests.exceptions.ConnectTimeout'>, ConnectTimeout(MaxRetryError("HTTPSConnectionPool(host='book.douban.com', port=443): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc1c0091b90>, 'Connection to 178.128.16.115 timed out. (connect timeout=5)'))")), <traceback object at 0x7fc1e0418780>)

178.128.16.115 健康值变为： 0
error: https://69.55.55.214:3128
(<class 'requests.exceptions.ProxyError'>, ProxyError(MaxRetryError("HTTPSConnectionPool(host='book.douban.com', port=443): Max retries exceeded with url: / (Caused by ProxyError('Cannot connect to prox

error: https://189.240.124.61:8080
(<class 'requests.exceptions.ConnectTimeout'>, ConnectTimeout(MaxRetryError("HTTPSConnectionPool(host='book.douban.com', port=443): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc1e041fb50>, 'Connection to 189.240.124.61 timed out. (connect timeout=5)'))")), <traceback object at 0x7fc1c0058280>)

189.240.124.61 健康值变为： 0
error: https://200.73.128.63:80
(<class 'requests.exceptions.ConnectTimeout'>, ConnectTimeout(MaxRetryError("HTTPSConnectionPool(host='book.douban.com', port=443): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc2102b1bd0>, 'Connection to 200.73.128.63 timed out. (connect timeout=5)'))")), <traceback object at 0x7fc1b0018230>)
error: https://206.189.154.176:8080
(<class 'requests.exceptions.ConnectTimeout'>, ConnectTimeout(MaxRetryError("HTTPSConnectionPool(host='book.douban.com', port=443): Max retries

error: https://91.229.192.93:80
(<class 'requests.exceptions.ConnectTimeout'>, ConnectTimeout(MaxRetryError("HTTPSConnectionPool(host='book.douban.com', port=443): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fc1e03bf810>, 'Connection to 91.229.192.93 timed out. (connect timeout=5)'))")), <traceback object at 0x7fc202bd6cd0>)

91.229.192.93 健康值变为： 6
error: https://88.99.10.254:1080
(<class 'requests.exceptions.ProxyError'>, ProxyError(MaxRetryError("HTTPSConnectionPool(host='book.douban.com', port=443): Max retries exceeded with url: / (Caused by ProxyError('Cannot connect to proxy.', timeout('_ssl.c:1074: The handshake operation timed out')))")), <traceback object at 0x7fc2102f3960>)

88.99.10.254 健康值变为： 0
更新成功 364


In [None]:
#hide
# Crawl every URL in `url_list` concurrently, 10 worker threads at a time.
# `px` is presumably the Proxy instance created earlier in the notebook;
# clearing its `debug` flag is meant to quiet per-request logging for this run.
px.debug = False
with ThreadPoolExecutor(max_workers=10) as executor:
    # Use submit() and keep the futures: the iterator returned by
    # executor.map() re-raises worker exceptions only when it is consumed,
    # so a discarded map() call hides every failure.
    jobs = [(url, executor.submit(get_douban_data, url, px, data))
            for url in url_list]

# Leaving the `with` block waits for all workers, so every future is finished.
for url, job in jobs:
    error = job.exception()
    if error is not None:
        # Report the failure instead of raising, keeping the crawl best-effort.
        print('访问失败：', url, repr(error))


访问成功： https://book.douban.com/tag/%E4%BA%92%E8%81%94%E7%BD%91?start=180&type=T
访问成功： https://book.douban.com/tag/%E4%BA%92%E8%81%94%E7%BD%91?start=120&type=T
访问成功： https://book.douban.com/tag/%E4%BA%92%E8%81%94%E7%BD%91?start=40&type=T
访问成功： https://book.douban.com/tag/%E4%BA%92%E8%81%94%E7%BD%91?start=20&type=T
访问成功： https://book.douban.com/tag/%E4%BA%92%E8%81%94%E7%BD%91?start=260&type=T
访问成功： https://book.douban.com/tag/%E4%BA%92%E8%81%94%E7%BD%91?start=0&type=T
访问成功： https://book.douban.com/tag/%E4%BA%92%E8%81%94%E7%BD%91?start=160&type=T
访问成功： https://book.douban.com/tag/%E4%BA%92%E8%81%94%E7%BD%91?start=60&type=T
访问成功： https://book.douban.com/tag/%E4%BA%92%E8%81%94%E7%BD%91?start=300&type=T
访问成功： https://book.douban.com/tag/%E4%BA%92%E8%81%94%E7%BD%91?start=200&type=T
访问成功： https://book.douban.com/tag/%E4%BA%92%E8%81%94%E7%BD%91?start=220&type=T
访问成功： https://book.douban.com/tag/%E4%BA%92%E8%81%94%E7%BD%91?start=80&type=T
访问成功： https://book.douban.com/tag/%E4%BA%92%E8%81%94%E7%BD

In [None]:
# hide
# Turn the scraped `data` mapping into a table: with orient='index' each key
# becomes a row label and each inner field a column. Then preview the top rows.
dataframe = pd.DataFrame.from_dict(orient='index', data=data)
dataframe.head(n=5)

Unnamed: 0,a_nbg_url,img_no_class_src,a_no_class_url,a_no_class_title,a_no_class_text,span_no_class_text,div_pub_text,span_rating_nums_text,span_pl_text,p_no_class_text
3191237,https://book.douban.com/subject/3191237/,https://img3.doubanio.com/view/subject/s/publi...,https://book.douban.com/subject/3191237/buylinks,众声喧哗,纸质版 18.60 元起,: 网络时代的个人表达与公共讨论,胡泳 / 广西师范大学出版社 / 2008-9 / 35.00元,8.0,(907人评价),本书触及了网络政治学中的一个重大话题——网络空间中的私域与公域。随着科技的进步，在信息时代的...
30364484,https://book.douban.com/subject/30364484/,https://img1.doubanio.com/view/subject/s/publi...,https://read.douban.com/ebook/60171085/?dcs=ta...,创投42章经,去看电子版,: 互联网商业逻辑与投资进阶指南,曲凯 / 中信出版集团 / 2018-10-20 / 58.00,8.2,(212人评价),《创投42章经》是拥有百万粉丝的微信公众号“42章经”的精选文章合集，全书共分为心法、内功、...
25843241,https://book.douban.com/subject/25843241/,https://img3.doubanio.com/view/subject/s/publi...,https://book.douban.com/subject/25843241/buylinks,互联网思维独孤九剑,纸质版 37.40 元起,: 移动互联时代的思维革命,赵大伟 / 机械工业出版社 / 2014-3-20 / 49,7.3,(842人评价),《互联网思维独孤九剑》是国内第一部系统阐述互联网思维的著作，用9大互联网思维：用户思维、简约...
26400900,https://book.douban.com/subject/26400900/,https://img3.doubanio.com/view/subject/s/publi...,https://read.douban.com/ebook/25462377/?dcs=ta...,创京东,去看电子版,: 刘强东亲述创业之路,李志刚 / 中信出版社 / 2015-5-1 / CNY 49.80,7.1,(2242人评价),1998年，刘强东创业，在中关村经销光磁产品。2004年，因为非典，京东偶然之下转向线上销售...
20388034,https://book.douban.com/subject/20388034/,https://img3.doubanio.com/view/subject/s/publi...,https://book.douban.com/subject/20388034/buylinks,大连接,纸质版 44.90 元起,: 社会网络是如何形成的以及对人类现实行为的影响,[美] 尼古拉斯•克里斯塔基斯（Nicholas A. Christakis）、[美] 詹姆...,7.2,(561人评价),[内容简介]\n1. 本书是继《六度分隔》之后，社会科学领域最重要的作品。作者发现：相距三度...


得到这样结构规范的数据之后，就可以进行很多数据分析工作，比如评价数分布、评分分布、热门作者等等。

In [None]:
# hide
# Summary statistics for every scraped column. The recorded output reports
# count / unique / top / freq for each column.
dataframe.describe()

Unnamed: 0,a_nbg_url,img_no_class_src,a_no_class_url,a_no_class_title,a_no_class_text,span_no_class_text,div_pub_text,span_rating_nums_text,span_pl_text,p_no_class_text
count,1000,1000,1000,999,1000,573,997,921.0,1000,945
unique,1000,999,1000,957,379,561,997,52.0,572,938
top,https://book.douban.com/subject/26177913/,https://img9.doubanio.com/f/shire/5522dd1f5b74...,https://book.douban.com/subject/24875857/buylinks,引爆点,去看电子版,: 无组织的组织力量,李彦 / 清华大学 / 2005-7 / 66.00元,8.0,(少于10人评价),计算广告是一项新兴的研究课题，它涉及大规模搜索和文本分析、信息获取、统计模型、机器学习、分类...
freq,1,2,1,3,312,3,1,51.0,66,2
