In [242]:
import requests
import json
import pymongo
from lxml import etree
from retrying import retry
import re
import time

In [254]:
class DoubanTop250():
    """Scrape the Douban Top 250 movie list and store each movie in MongoDB.

    The list is fetched page by page (25 movies per page); every movie is
    parsed into a dict and inserted into the configured collection.
    """

    def __init__(self):
        # Values substituted into the ``start`` query parameter: 0, 25, ..., 225.
        self.records = range(0, 226, 25)
        self.url = 'https://movie.douban.com/top250?start=%d&filter='
        # MongoDB connection settings and target database / collection names.
        self.host = 'localhost'
        self.port = 27017
        self.db = 'spider'
        self.coll = 'douban_top250'

    def conn_mongo(self):
        """Connect to MongoDB and return ``(client, collection)``.

        A unique index on ``movieId`` is created on the collection so that
        inserting the same movie twice is rejected by the database.
        """
        client = pymongo.MongoClient(self.host, self.port)
        db = client[self.db]
        coll = db[self.coll]
        coll.create_index([('movieId', 1)], unique=True)
        return client, coll

    @retry(stop_max_attempt_number=3)  # retry at most three times on failure
    def page_info(self, page):
        """Download one list page and return its HTML source as text.

        Args:
            page: value substituted into the ``start`` parameter of ``self.url``.

        Raises:
            requests.RequestException: re-raised after being printed, so the
                ``@retry`` decorator sees the real failure and can retry.
        """
        try:
            res = requests.get(self.url % page, timeout=(10, 10))
        except requests.RequestException as e:
            print(e)
            # Re-raise instead of falling through to an unbound ``res``.
            raise
        return res.text

    def parse_page(self, page):
        """Fetch one list page and yield a dict for every movie on it.

        Yielded keys: movieId, movieName, foreignName, ranking, link, alias,
        poster, isPlayable, director, actors, releaseYear, releaseCountry,
        Type, score, evalOfNum, hotReview.

        ``foreignName``, ``isPlayable``, ``director`` and ``hotReview`` are
        ``None`` when the page has no such data; ``actors`` and ``alias``
        are empty lists in that case.
        """
        tree = etree.HTML(self.page_info(page))
        for item in tree.xpath('//ol/li'):
            # All queries are relative to ``item`` (leading ``.``) so they
            # cannot match nodes that belong to another list entry.
            names = item.xpath('.//div[@class="hd"]//span[@class="title"]/text()')
            if len(names) > 1:
                foreignName = names[1].strip().replace('/\xa0', '')
            else:
                foreignName = None

            play = item.xpath('.//div[@class="hd"]//span[@class="playable"]/text()')
            if play:
                isPlayable = play[0].strip().replace('[', '').replace(']', '')
            else:
                isPlayable = None

            # Non-blank text lines of the description paragraphs: info[0] is
            # searched for director / cast, info[1] for year / country.
            info = [
                text.strip()
                for text in item.xpath('.//div[@class="bd"]/p/text()')
                if text.strip()
            ]

            # A missing line (IndexError) or a non-matching pattern
            # (AttributeError on ``None.group``) means the field is absent.
            try:
                cast = re.search('主演: (.*)', info[0]).group(1) + '/'
                actors = [
                    name.strip()
                    for name in cast.replace('...', '').split('/')
                    if name.strip()
                ]
            except (AttributeError, IndexError):
                actors = []
            try:
                director = re.match('导演: (.*?)\xa0', info[0]).group(1)
            except (AttributeError, IndexError):
                director = None

            quotes = item.xpath('.//div[@class="bd"]//p[@class="quote"]/span/text()')
            hotReview = quotes[0] if quotes else None

            others = item.xpath('.//div[@class="hd"]//span[@class="other"]/text()')
            if others:
                alias = [
                    name.strip()
                    for name in others[0].strip().replace('/\xa0', '').split('/')
                ]
            else:
                alias = []

            link = item.xpath('.//div[@class="pic"]/a/@href')[0]
            votes = item.xpath(
                './/div[@class="bd"]//div[@class="star"]/span[last()]/text()'
            )[0]

            yield {
                # First run of digits in the detail-page link.
                'movieId': re.search(r'(\d+)', link).group(1),
                'movieName': names[0],
                'foreignName': foreignName,
                'ranking': int(item.xpath('.//div[@class="pic"]/em/text()')[0]),
                'link': link,
                'alias': alias,
                'poster': item.xpath('.//div[@class="pic"]//img/@src')[0],
                'isPlayable': isPlayable,
                'director': director,
                'actors': actors,
                'releaseYear': int(re.match(r'(\d+)', info[1]).group(1)),
                'releaseCountry': re.search('\xa0/\xa0(.*?)\xa0/\xa0', info[1]).group(1).split(),
                # NOTE(review): stores the whole info[1] line, not only the
                # genre part; key name kept for compatibility with stored data.
                'Type': info[1],
                'score': float(item.xpath(
                    './/div[@class="bd"]//div[@class="star"]/span[@class="rating_num"]/text()'
                )[0]),
                # Drop the trailing "人评价" suffix before converting to int.
                'evalOfNum': int(votes.replace('人评价', '')),
                'hotReview': hotReview
            }

    def main(self):
        """Scrape every page in ``self.records`` and insert the movies.

        A failed insert is printed together with the offending record and
        skipped. The MongoDB client is closed even when scraping aborts
        with an exception.
        """
        client, coll = self.conn_mongo()
        try:
            for page in self.records:
                print('正在采集第%d条...' % page)
                for result in self.parse_page(page):
                    try:
                        coll.insert_one(result)
                    except Exception as e:
                        # Exceptions have no ``center``; format their text.
                        print(str(e).center(50, '-'))
                        print(result)
                # Pause between pages to avoid hammering the site.
                time.sleep(5)
        finally:
            client.close()
        print('finished')

In [255]:
# Run the scrape only when this file is executed directly, not when imported.
if __name__ == '__main__':
    douban = DoubanTop250()
    douban.main()  # fetches every list page and inserts the parsed movies into MongoDB

正在采集第0条...
正在采集第25条...
正在采集第50条...
正在采集第75条...
正在采集第100条...
正在采集第125条...
正在采集第150条...
正在采集第175条...
正在采集第200条...
正在采集第225条...
finished
