In [17]:
# coding: utf-8

import json
import logging
import os
import random
import time

import emoji
import pandas as pd
import requests
from bs4 import BeautifulSoup

from base import _Sql_Base
from config import GROUP_DICT, MAX_PAGE, SQL_DICT, HEADERS, PROXY_POOL_URL, MAX_GET_RETRY, OUTPUT_PATH

class HTTPError(Exception):
    """Raised when an HTTP response carries a status code other than 200."""

    def __init__(self, status_code, url):
        # Keep both pieces so handlers can inspect them individually.
        self.url = url
        self.status_code = status_code

    def __str__(self):
        # Rendered as "<url> HTTP <status>".
        parts = (self.url, self.status_code)
        return "%s HTTP %s" % parts
    
def get_logger(name):
    """Return the logger called ``name``, set up for DEBUG output on a stream.

    ``logging.getLogger`` hands back the same object for the same name, so
    the stream handler is attached only when the logger has no handler yet.
    Without that guard every call (e.g. re-running a notebook cell or
    creating a second spider) would add one more handler and each record
    would be printed multiple times.

    Args:
        name: Name passed to ``logging.getLogger``.

    Returns:
        The ``logging.Logger`` instance with level DEBUG.
    """
    default_logger = logging.getLogger(name)
    default_logger.setLevel(logging.DEBUG)
    if not default_logger.handlers:
        stream = logging.StreamHandler()
        stream.setLevel(logging.DEBUG)
        formatter = logging.Formatter("[%(levelname)s] %(asctime)s - %(message)s")
        stream.setFormatter(formatter)
        default_logger.addHandler(stream)
    return default_logger
    

class Douban_corpus_spider(_Sql_Base):
    """Crawl Douban group discussions and store them as JSON and as tables.

    For each group in ``GROUP_DICT`` the spider reads ``MAX_PAGE`` listing
    pages, fetches every linked topic, checkpoints the collected data through
    ``json_write`` and finally passes a DataFrame to ``table_save``.

    ``create_engine``, ``json_write`` and ``table_save`` are called on ``self``
    but are not defined in this class; presumably they come from
    ``_Sql_Base`` -- verify against that module.
    """

    # Seconds to wait for any single HTTP request before treating it as failed.
    REQUEST_TIMEOUT = 10
    # Step of the ``start`` query parameter between consecutive listing pages.
    TOPICS_PER_PAGE = 25

    def __init__(self, is_proxy = False):
        """Set up configuration, the SQL engine, the logger and the proxy.

        Args:
            is_proxy: When true, requests are routed through an address
                obtained from ``PROXY_POOL_URL``.
        """
        self.GROUP_DICT = GROUP_DICT
        self.MAX_PAGE = MAX_PAGE
        self.sql_engine = self.create_engine(SQL_DICT)
        self.is_proxy = is_proxy
        self.logger = get_logger("douban_spider")
        # Always define proxyIP so code paths without a proxy can read it.
        self.proxyIP = self.get_proxy() if is_proxy else None

    def request_douban(self, url):
        """Fetch ``url`` and return the response body as text.

        Up to ``MAX_GET_RETRY`` attempts are made. After a failed attempt in
        proxy mode a new proxy address is requested before retrying.

        Args:
            url: Address to download.

        Returns:
            The text of the first response whose status code is 200.

        Raises:
            HTTPError: The last attempt returned a status code other than 200.
            Exception: Whatever the last failed attempt raised.
            RuntimeError: ``MAX_GET_RETRY`` allowed no attempt at all.
        """
        headers = {
            'User-Agent': HEADERS
        }
        last_error = None
        for attempt in range(MAX_GET_RETRY):
            proxies = None
            if self.is_proxy and self.proxyIP:
                proxies = {
                    'http' : self.proxyIP,
                    'https': self.proxyIP
                }
            try:
                response = requests.get(url, proxies=proxies, headers=headers,
                                        timeout=self.REQUEST_TIMEOUT)
                if response.status_code != 200:
                    raise HTTPError(response.status_code, url)
            except Exception as exc:
                # Broad on purpose: any failure of an attempt leads to a retry.
                last_error = exc
                self.logger.warning("%s %d failed!\n%s", url, attempt, str(exc))
                if self.is_proxy:
                    self.proxyIP = self.get_proxy()
                continue
            # Reported outside the try block so a problem here can never be
            # mistaken for a failed download and trigger a pointless retry.
            print('proxy: %s sucessfully get data from %s' %(self.proxyIP, url))
            return response.text
        if last_error is None:
            raise RuntimeError("no request attempted for %s (MAX_GET_RETRY=%r)" % (url, MAX_GET_RETRY))
        raise last_error

    def get_proxy(self):
        """Ask the proxy pool at ``PROXY_POOL_URL`` for one proxy address.

        Which address is handed out is decided by the pool service.

        Returns:
            ``"http://<address>"``, or ``None`` when the pool cannot be
            reached, answers with a status other than 200, or sends an empty
            body.
        """
        try:
            response = requests.get(PROXY_POOL_URL, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException:
            # requests' ConnectionError does not derive from the builtin
            # ConnectionError, so the requests base class must be caught.
            return None
        if response.status_code != 200:
            return None
        address = response.text.strip()
        if not address:
            return None
        print('proxy: %s' %address)
        return "http://%s" %address

    def spider_links(self, group, page):
        """Collect topic links and titles from one listing page of a group.

        Args:
            group: Key into ``GROUP_DICT`` whose value is the group base URL.
            page: Zero-based listing page index.

        Returns:
            ``(page_link, page_title)``: two lists of equal length holding
            the ``href`` and ``title`` attributes of the first anchor of each
            table row. Both are empty when the page has no ``olt`` table.
        """
        url = '{}discussion?start={}'.format(self.GROUP_DICT[group], str(page*self.TOPICS_PER_PAGE))
        html = self.request_douban(url)
        soup = BeautifulSoup(html, 'lxml')
        page_link = []
        page_title = []
        table = soup.find(class_='olt')
        if table is None:
            self.logger.warning("no topic table found in %s", url)
            return page_link, page_title
        for row in table.find_all('tr'):
            anchor = row.find('a')
            if anchor is None:
                continue  # row without a link
            href = anchor.get('href')
            if not href:
                continue
            page_link.append(href)
            page_title.append(anchor.get('title'))
        return page_link, page_title

    def spider_page(self, url):
        """Extract the opening post text and the replies of one topic page.

        Args:
            url: Address of the topic page.

        Returns:
            ``(page_author_diag, page_comments)``. The first item is the
            ``'text'`` field of the page's ``application/ld+json`` element,
            or ``''`` when that element is missing or cannot be parsed. The
            second is a list with the first content node of the first ``<p>``
            of every reply item, converted to ``str``.
        """
        html = self.request_douban(url)
        soup = BeautifulSoup(html, 'lxml')
        page_author_diag = ''
        metadata = soup.find(type="application/ld+json")
        if metadata is not None:
            for node in metadata:
                try:
                    page_author_diag = json.loads(node)['text']
                except (TypeError, ValueError, KeyError):
                    # Not a string node, invalid JSON, or no 'text' field.
                    page_author_diag = ''
                else:
                    break
        page_comments = []
        for reply in soup.find_all(class_='clearfix comment-item reply-item'):
            paragraph = reply.find('p')
            if paragraph is None or not paragraph.contents:
                continue
            # str() detaches the value from the parse tree and keeps the
            # result serialisable and concatenable with plain strings.
            page_comments.append(str(paragraph.contents[0]))
        return page_author_diag, page_comments

    def spider_group(self, group):
        """Crawl all configured listing pages of ``group`` and their topics.

        After each topic the whole result is written through ``json_write``
        to ``OUTPUT_PATH/<group>.json``. A topic whose write fails is removed
        again. A listing page or topic that cannot be fetched is logged and
        skipped so one bad page does not discard the rest of the crawl.

        Args:
            group: Key into ``GROUP_DICT``.

        Returns:
            Dict mapping topic link to a dict with the keys ``'title'``,
            ``'author_diag'`` and ``'comments'``.
        """
        spider_outputs = {}
        link_list = []
        title_list = []
        for page in range(self.MAX_PAGE):
            try:
                link_list_page, title_list_page = self.spider_links(group, page)
            except Exception as exc:
                self.logger.warning("listing page %d of %s skipped: %s", page, group, str(exc))
                continue
            link_list.extend(link_list_page)
            title_list.extend(title_list_page)
        output_file = os.path.join(OUTPUT_PATH, '{}.json'.format(group))
        # zip pairs each link with its own title without an O(n) index lookup.
        for link, title in zip(link_list, title_list):
            if link in spider_outputs:
                continue  # same topic listed on more than one page
            try:
                author_diag, comments = self.spider_page(link)
            except Exception as exc:
                self.logger.warning("topic %s skipped: %s", link, str(exc))
                continue
            spider_outputs[link] = {
                'title': title,
                'author_diag': author_diag,
                'comments': comments,
            }
            try:
                self.json_write(spider_outputs, output_file)
            except Exception as exc:
                self.logger.warning("writing %s failed, dropping %s: %s", output_file, link, str(exc))
                del spider_outputs[link]
        return spider_outputs

    def group_dict_transfer(self, output_dict):
        """Turn the dict produced by ``spider_group`` into a DataFrame.

        Comments are joined newest-list-position first, each followed by
        ``'|'``, and emoji in the text columns are replaced by their
        ``emoji.demojize`` form.

        Args:
            output_dict: Mapping of link to a dict with ``'title'``,
                ``'author_diag'`` and ``'comments'``.

        Returns:
            DataFrame with the columns ``link``, ``title``, ``author_diag``
            and ``comments``; empty (columns only) for an empty input.
        """
        columns = ['link','title','author_diag','comments']
        if not output_dict:
            return pd.DataFrame(columns=columns)
        data = pd.DataFrame(output_dict).T
        data['link'] = data.index
        data = data.reset_index(drop = True)[columns]

        def comments_sub(a):
            # Same layout as prepending "item|" one at a time: reversed
            # order, every item followed by the separator.
            return ''.join(str(item) + '|' for item in reversed(list(a)))

        def demojize_text(value):
            # Titles may be missing (None); only real strings are converted.
            return emoji.demojize(value) if isinstance(value, str) else value

        data['comments'] = data['comments'].apply(comments_sub)
        for col in ['title','author_diag','comments']:
            data[col] = data[col].apply(demojize_text)
        return data

    def run(self):
        """Crawl every group in ``GROUP_DICT`` and save one table per group."""
        for group in self.GROUP_DICT.keys():
            output_dict = self.spider_group(group)
            if not output_dict:
                self.logger.warning("nothing collected for %s, table not saved", group)
                continue
            output_table = self.group_dict_transfer(output_dict)
            self.table_save(output_table, group)

In [None]:
# Build the spider with proxy use enabled, then crawl every group in GROUP_DICT.
dcs = Douban_corpus_spider(is_proxy = True)

dcs.run()

proxy: 61.183.176.122:57210
proxy: http://61.183.176.122:57210 sucessfully get data from https://www.douban.com/group/kuakua/discussion?start=0
proxy: http://61.183.176.122:57210 sucessfully get data from https://www.douban.com/group/kuakua/discussion?start=25
proxy: http://61.183.176.122:57210 sucessfully get data from https://www.douban.com/group/kuakua/discussion?start=50


HTTPSConnectionPool(host='www.douban.com', port=443): Max retries exceeded with url: /group/kuakua/discussion?start=75 (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x00000233CE1039E8>: Failed to establish a new connection: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应，连接尝试失败。',)))


proxy: 31.43.18.182:8080


HTTPSConnectionPool(host='www.douban.com', port=443): Max retries exceeded with url: /group/kuakua/discussion?start=75 (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x00000233CE103E10>: Failed to establish a new connection: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应，连接尝试失败。',)))


proxy: 49.83.59.250:53281
proxy: http://49.83.59.250:53281 sucessfully get data from https://www.douban.com/group/kuakua/discussion?start=75
proxy: http://49.83.59.250:53281 sucessfully get data from https://www.douban.com/group/kuakua/discussion?start=100


HTTPSConnectionPool(host='www.douban.com', port=443): Max retries exceeded with url: /group/kuakua/discussion?start=125 (Caused by ProxyError('Cannot connect to proxy.', ConnectionResetError(10054, '远程主机强迫关闭了一个现有的连接。', None, 10054, None)))


proxy: 117.254.219.177:8080
proxy: http://117.254.219.177:8080 sucessfully get data from https://www.douban.com/group/kuakua/discussion?start=125


HTTPSConnectionPool(host='www.douban.com', port=443): Max retries exceeded with url: /group/kuakua/discussion?start=150 (Caused by ProxyError('Cannot connect to proxy.', RemoteDisconnected('Remote end closed connection without response',)))


proxy: 49.83.59.250:53281
proxy: http://49.83.59.250:53281 sucessfully get data from https://www.douban.com/group/kuakua/discussion?start=150
proxy: http://49.83.59.250:53281 sucessfully get data from https://www.douban.com/group/kuakua/discussion?start=175
proxy: http://49.83.59.250:53281 sucessfully get data from https://www.douban.com/group/kuakua/discussion?start=200
proxy: http://49.83.59.250:53281 sucessfully get data from https://www.douban.com/group/kuakua/discussion?start=225
proxy: http://49.83.59.250:53281 sucessfully get data from https://www.douban.com/group/kuakua/discussion?start=250
proxy: http://49.83.59.250:53281 sucessfully get data from https://www.douban.com/group/kuakua/discussion?start=275
proxy: http://49.83.59.250:53281 sucessfully get data from https://www.douban.com/group/kuakua/discussion?start=300
proxy: http://49.83.59.250:53281 sucessfully get data from https://www.douban.com/group/kuakua/discussion?start=325
proxy: http://49.83.59.250:53281 sucessfully g

HTTPSConnectionPool(host='www.douban.com', port=443): Max retries exceeded with url: /group/kuakua/discussion?start=475 (Caused by ProxyError('Cannot connect to proxy.', RemoteDisconnected('Remote end closed connection without response',)))


proxy: 117.241.97.251:60375


HTTPSConnectionPool(host='www.douban.com', port=443): Max retries exceeded with url: /group/kuakua/discussion?start=475 (Caused by ProxyError('Cannot connect to proxy.', RemoteDisconnected('Remote end closed connection without response',)))


proxy: 36.89.99.90:8080
proxy: http://36.89.99.90:8080 sucessfully get data from https://www.douban.com/group/kuakua/discussion?start=475
proxy: http://36.89.99.90:8080 sucessfully get data from https://www.douban.com/group/topic/141890698/
proxy: http://36.89.99.90:8080 sucessfully get data from https://www.douban.com/group/topic/141903603/
proxy: http://36.89.99.90:8080 sucessfully get data from https://www.douban.com/group/topic/141902457/
proxy: http://36.89.99.90:8080 sucessfully get data from https://www.douban.com/group/topic/141898605/
proxy: http://36.89.99.90:8080 sucessfully get data from https://www.douban.com/group/topic/141892860/
proxy: http://36.89.99.90:8080 sucessfully get data from https://www.douban.com/group/topic/141895952/
proxy: http://36.89.99.90:8080 sucessfully get data from https://www.douban.com/group/topic/141864306/
proxy: http://36.89.99.90:8080 sucessfully get data from https://www.douban.com/group/topic/141846999/
proxy: http://36.89.99.90:8080 sucessf

HTTPSConnectionPool(host='www.douban.com', port=443): Max retries exceeded with url: /group/topic/141562214/ (Caused by ProxyError('Cannot connect to proxy.', RemoteDisconnected('Remote end closed connection without response',)))


proxy: 182.253.175.208:8080


HTTPSConnectionPool(host='www.douban.com', port=443): Max retries exceeded with url: /group/topic/141562214/ (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x00000233CE815E80>: Failed to establish a new connection: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应，连接尝试失败。',)))


proxy: 138.197.65.120:80


HTTPSConnectionPool(host='www.douban.com', port=443): Max retries exceeded with url: /group/topic/141562214/ (Caused by ProxyError('Cannot connect to proxy.', OSError('Tunnel connection failed: 500 Internal Server Error',)))


proxy: 36.89.99.90:8080


HTTPSConnectionPool(host='www.douban.com', port=443): Max retries exceeded with url: /group/topic/141562214/ (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x00000233CE8346A0>: Failed to establish a new connection: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应，连接尝试失败。',)))


proxy: 5.45.125.243:3128
proxy: http://5.45.125.243:3128 sucessfully get data from https://www.douban.com/group/topic/141562214/
proxy: http://5.45.125.243:3128 sucessfully get data from https://www.douban.com/group/topic/141861358/
proxy: http://5.45.125.243:3128 sucessfully get data from https://www.douban.com/group/topic/141824716/
proxy: http://5.45.125.243:3128 sucessfully get data from https://www.douban.com/group/topic/141853547/
proxy: http://5.45.125.243:3128 sucessfully get data from https://www.douban.com/group/topic/141746144/
proxy: http://5.45.125.243:3128 sucessfully get data from https://www.douban.com/group/topic/141718930/
proxy: http://5.45.125.243:3128 sucessfully get data from https://www.douban.com/group/topic/141761564/
proxy: http://5.45.125.243:3128 sucessfully get data from https://www.douban.com/group/topic/141760634/
proxy: http://5.45.125.243:3128 sucessfully get data from https://www.douban.com/group/topic/141568604/
proxy: http://5.45.125.243:3128 sucessf

HTTPSConnectionPool(host='www.douban.com', port=443): Max retries exceeded with url: /group/topic/141857325/ (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x00000233D08EB898>: Failed to establish a new connection: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应，连接尝试失败。',)))


proxy: 46.201.249.7:51137


HTTPSConnectionPool(host='www.douban.com', port=443): Max retries exceeded with url: /group/topic/141857325/ (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x00000233D08F6CF8>: Failed to establish a new connection: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应，连接尝试失败。',)))


proxy: 188.32.20.79:8197


('Connection aborted.', BadStatusLine('<html>\r\n',))


proxy: 60.216.101.46:32868


https://www.douban.com/group/topic/141857325/ HTTP 403


proxy: 165.22.135.225:8080


HTTPSConnectionPool(host='www.douban.com', port=443): Max retries exceeded with url: /group/topic/141857325/ (Caused by ProxyError('Cannot connect to proxy.', OSError('Tunnel connection failed: 403 Forbidden',)))


proxy: 178.46.157.208:8080


HTTPSConnectionPool(host='www.douban.com', port=443): Max retries exceeded with url: /group/topic/141857325/ (Caused by ProxyError('Cannot connect to proxy.', RemoteDisconnected('Remote end closed connection without response',)))


proxy: 165.22.135.225:8080


HTTPSConnectionPool(host='www.douban.com', port=443): Max retries exceeded with url: /group/topic/141857325/ (Caused by ProxyError('Cannot connect to proxy.', OSError('Tunnel connection failed: 403 Forbidden',)))


proxy: 168.228.229.234:8080
proxy: http://168.228.229.234:8080 sucessfully get data from https://www.douban.com/group/topic/141857325/
proxy: http://168.228.229.234:8080 sucessfully get data from https://www.douban.com/group/topic/141884089/
proxy: http://168.228.229.234:8080 sucessfully get data from https://www.douban.com/group/topic/141834552/


HTTPSConnectionPool(host='www.douban.com', port=443): Max retries exceeded with url: /group/topic/141804222/ (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x00000233D0D25978>: Failed to establish a new connection: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应，连接尝试失败。',)))


proxy: 41.60.239.93:8080
proxy: http://41.60.239.93:8080 sucessfully get data from https://www.douban.com/group/topic/141804222/
proxy: http://41.60.239.93:8080 sucessfully get data from https://www.douban.com/group/topic/141853749/
proxy: http://41.60.239.93:8080 sucessfully get data from https://www.douban.com/group/topic/141898081/
proxy: http://41.60.239.93:8080 sucessfully get data from https://www.douban.com/group/topic/141827189/
proxy: http://41.60.239.93:8080 sucessfully get data from https://www.douban.com/group/topic/141897644/


HTTPSConnectionPool(host='www.douban.com', port=443): Max retries exceeded with url: /group/topic/141862367/ (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x00000233D0FA3B70>: Failed to establish a new connection: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应，连接尝试失败。',)))


proxy: 106.10.56.138:80


https://www.douban.com/group/topic/141862367/ HTTP 403


proxy: 188.32.20.79:8197


('Connection aborted.', BadStatusLine('<html>\r\n',))


proxy: 117.254.219.177:8080
proxy: http://117.254.219.177:8080 sucessfully get data from https://www.douban.com/group/topic/141862367/
proxy: http://117.254.219.177:8080 sucessfully get data from https://www.douban.com/group/topic/141896720/
proxy: http://117.254.219.177:8080 sucessfully get data from https://www.douban.com/group/topic/135915701/
proxy: http://117.254.219.177:8080 sucessfully get data from https://www.douban.com/group/topic/141685645/
proxy: http://117.254.219.177:8080 sucessfully get data from https://www.douban.com/group/topic/141182313/
proxy: http://117.254.219.177:8080 sucessfully get data from https://www.douban.com/group/topic/141888890/
proxy: http://117.254.219.177:8080 sucessfully get data from https://www.douban.com/group/topic/141875780/
proxy: http://117.254.219.177:8080 sucessfully get data from https://www.douban.com/group/topic/141865407/
proxy: http://117.254.219.177:8080 sucessfully get data from https://www.douban.com/group/topic/141462907/
proxy: ht