In [21]:
from lxml import etree
import requests
import urllib.parse
import re
import pandas as pd
import numpy as np
import csv
import time
import datetime
import pymongo
import json
import os
import sys
from progressbar import ProgressBar
from fake_useragent import UserAgent
from retrying import retry
from selenium import webdriver  # 解决动态加载问题
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import pyecharts
from pyecharts.charts import Bar

In [22]:
# --- Keyword search configuration -------------------------------------------
# Search for books by keyword.
keyword = 'spark'
keyword_collection = 'JDBook_keyword'  # MongoDB collection for keyword results
wtype = 1  # JD self-operated items
# A keyword result page initially loads 30 items and lazy-loads 30 more.
# Odd page numbers are the initial load; even ones are loaded on scroll.
search_page_num = 30
index_name = 'skuid'  # field used for the unique index in MongoDB

search_params = {'keyword': keyword, 'wtype': wtype, 'enc': 'utf8'}
# The leading %s is filled with the page number at request time.
search_url = 'https://search.jd.com/Search?page=%s&' + urllib.parse.urlencode(search_params)
search_headers = {
    'User-Agent': UserAgent().Chrome,
    'X-Requested-With': 'XMLHttpRequest',
    'Referer': 'https://search.jd.com/Search?enc=utf8&keyword=' + urllib.parse.quote(keyword)
}

# --- Category search configuration ------------------------------------------
# Search for books by book category.
category_collection = 'JDBook'
cat_url = 'https://book.jd.com/booksort.html'  # URL listing all categories
cat_filename = '京东图书分类.json'
# The %s is filled with the category id extracted from each sub-category link.
subcat_url = 'https://list.jd.com/%s.html?delivery=1'
cat_page_num = 60  # page size used to turn a result count into a page count
cat_headers = {'User-Agent': UserAgent().Chrome}

In [26]:
@retry(stop_max_attempt_number=5, wait_fixed=2000)
def page_content(url, headers, encoding=None, timeout=10):
    """Fetch a page and return its body as text.

    The call is retried up to 5 times, 2 seconds apart, whenever it raises.

    Args:
        url: Address to request.
        headers: Request headers passed to ``requests.get`` (may be None).
        encoding: If given, overrides the encoding used to decode the body
            (works around garbled Chinese text).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block forever and the retry
            decorator would never get a chance to run.

    Returns:
        The decoded response body.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    if encoding:
        response.encoding = encoding  # avoid mis-decoded Chinese text
    return response.text


def pages(url, headers, encoding, page_num, cat):
    """Compute how many result pages a query has.

    Reads the module-level ``search_type`` to decide where the result count
    lives: in the ``description`` meta tag for a keyword search, otherwise in
    the ``st-ext`` block of a category listing.

    Args:
        url: Address of the first result page.
        headers: Request headers forwarded to ``page_content``.
        encoding: Encoding forwarded to ``page_content``.
        page_num: Number of items per page.
        cat: Label of the query, used only in the printed banner.

    Returns:
        The number of pages; 0 when the page carries no result count.
    """
    html = etree.HTML(page_content(url, headers, encoding))
    records = 0
    if html is not None:
        if search_type == 'keyword':
            description = html.xpath('//meta[@name="description"]/@content')
            # Raw string: '\d' in a normal literal is an invalid escape.
            found = re.search(r'(\d+)件', description[0]) if description else None
            if found:
                records = int(found.group(1))
        else:
            counts = html.xpath('//div[@class="st-ext"]/span/text()')
            if counts:
                records = int(counts[0])
    total_pages = int(np.ceil(records / page_num))
    print('查询%s:结果有%d件，共%d页'.center(50, '*') % (cat, records, total_pages))
    return total_pages


def book_detail(content, save_book_only, search_type, cat):
    """Yield one record per JD self-operated item found in a result page.

    Each yielded dict contains:
        skuid: item number
        discount_price: discounted price as a float, or None if unavailable
        href: item link
        name: item title
        isjd: always 1 (items that are not self-operated are skipped)
        isbook: 1 when the item has a ``p-bookdetails`` block, else 0
        publish, author, publish_date: only for books
        shop: only for non-book items
        keyword: for keyword searches (taken from ``cat``)
        top_catname, sub_catname: for category searches (``cat[0]``, ``cat[1]``)

    Args:
        content: HTML source of the result page.
        save_book_only: When true, non-book items are skipped.
        search_type: 'keyword' selects the keyword-page layout; any other
            value selects the category-page layout.
        cat: The search keyword for keyword searches, otherwise a
            ``(top_category, sub_category)`` pair.

    Yields:
        A dict per matching item. Skipped items produce nothing.
    """
    html = etree.HTML(content)
    if html is None:
        return

    # Query every field relative to its own <li>. Indexing from the document
    # root with a positional predicate re-scans the tree for each field and
    # picks the wrong node when the items do not all share one parent.
    for item in html.xpath('//li[@class="gl-item"]'):
        if search_type == 'keyword':
            sku = item.xpath('./@data-sku')[0]
            jd_marker = item.xpath('.//div[@class="p-icons"]/i[contains(@class, "goods-icons")]/text()')
        else:
            sku = item.xpath('.//div[@class="p-operate"]/a/@data-sku')[0]
            jd_marker = item.xpath('.//div[@class="p-shopnum"]//text()')
        if '自营' not in ''.join(jd_marker):
            continue  # only self-operated items are recorded

        try:
            discount_price = float(item.xpath('.//div[@class="p-price"]//i/text()')[0])
        except (IndexError, ValueError):
            discount_price = None  # price missing or not numeric

        href = 'https:' + item.xpath('.//div[contains(@class, "p-name")]/a/@href')[0]
        name_nodes = item.xpath('.//div[contains(@class, "p-name")]')
        name = ''.join(name_nodes[0].xpath('.//em//text()')).strip()

        # Some items have no p-bookdetails block; those are treated as non-books.
        if item.xpath('.//div[@class="p-bookdetails"]'):
            authors = item.xpath('.//div[@class="p-bookdetails"]/span[@class="p-bi-name"]//a/text()')
            publish = ''.join(item.xpath('.//div[@class="p-bookdetails"]/span[@class="p-bi-store"]//a/text()'))
            publish_date = ''.join(item.xpath('.//div[@class="p-bookdetails"]/span[@class="p-bi-date"]/text()')).strip()
            bookinfo = {'skuid': sku, 'discount_price': discount_price, 'href': href, 'publish': publish,
                        # dict.fromkeys de-duplicates while keeping page order
                        'author': list(dict.fromkeys(a.strip() for a in authors)),
                        'publish_date': publish_date,
                        'name': name, 'isjd': 1, 'isbook': 1}
        elif save_book_only:
            # Skip outright; falling through would re-emit the previous record
            # (or fail on an unbound name for the first item on the page).
            continue
        else:
            shop = ''.join(item.xpath('.//div[@class="p-shop"]//a/text()'))
            bookinfo = {'skuid': sku, 'discount_price': discount_price, 'href': href, 'shop': shop,
                        'name': name, 'isjd': 1, 'isbook': 0}

        if search_type == 'keyword':
            bookinfo['keyword'] = cat
        else:
            bookinfo['top_catname'] = cat[0]
            bookinfo['sub_catname'] = cat[1]
        yield bookinfo


def load_ajax_page(page):
    """Load a keyword result page in headless Chrome and return its HTML.

    The page is scrolled to the pager element so that the lazily loaded
    second half of the results is requested before the source is read.

    Args:
        page: Page number substituted into the module-level ``search_url``.

    Returns:
        The page source after waiting for the content to load.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    browser = webdriver.Chrome(options=chrome_options)
    try:
        browser.get(search_url % page)
        try:
            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'm-list')))
            # find_element(By...) replaces find_element_by_class_name, which
            # was removed in Selenium 4.
            end = browser.find_element(By.CLASS_NAME, 'page')
            browser.execute_script('arguments[0].scrollIntoView(true);', end)  # scroll to the bottom
        except Exception:
            # Best effort: give a slow page extra time instead of failing.
            # Catching Exception (not a bare except) keeps Ctrl-C working.
            time.sleep(30)
        time.sleep(5)  # let the lazily loaded items render
        return browser.page_source
    finally:
        # quit() ends the whole driver session; close() only closes the
        # window and would leave one chromedriver process behind per page.
        browser.quit()


def conn_mongo(collection, index_name, remove=False):
    """Open the local MongoDB and prepare a collection of the 'spider' database.

    Args:
        collection: Name of the collection to use.
        index_name: Field that gets an ascending unique index if the
            collection does not have one yet.
        remove: When true, every document in the collection is deleted first.

    Returns:
        A ``(client, database, collection)`` tuple.
    """
    mongo_client = pymongo.MongoClient('127.0.0.1', 27017)
    database = mongo_client['spider']
    target = database[collection]

    if remove:
        target.delete_many({})

    wanted_index = '%s_1' % index_name
    already_indexed = wanted_index in target.index_information()
    if not already_indexed:
        target.create_index([(index_name, 1)], unique=True)

    return mongo_client, database, target


def save_to_mongo(coll, total_pages, search_type, save_book_only, cat, step=1, url=None):
    """Crawl every result page and store the extracted records in MongoDB.

    Args:
        coll: Target MongoDB collection.
        total_pages: Number of result pages to walk.
        search_type: 'keyword' loads pages through headless Chrome; any other
            value fetches ``url`` with a ``page`` query parameter.
        save_book_only: Forwarded to ``book_detail``.
        cat: Forwarded to ``book_detail``.
        step: Page increment. Forced to 2 for keyword searches, because one
            browser load already covers an odd page and the following even one.
        url: Base listing URL for category searches.
    """
    if search_type == 'keyword':
        step = 2
    pbar = ProgressBar(maxval=100)
    pbar.start()
    for page in range(1, total_pages + 1, step):
        if search_type == 'keyword':
            content = load_ajax_page(page)
        else:
            content = page_content(url + '&page=%d' % page, cat_headers, 'utf8')
        for book in book_detail(content, save_book_only, search_type, cat):
            if not book:
                continue
            try:
                coll.insert_one(book)
            except pymongo.errors.DuplicateKeyError:
                # Expected: the unique index rejects items already stored.
                pass
            except Exception as exc:
                # Stay best-effort, but make unexpected failures visible
                # instead of silently dropping the record.
                print('insert failed for %r: %r' % (book.get('skuid'), exc), file=sys.stderr)
        pbar.update(page * 100 // total_pages)
        time.sleep(0.5)  # throttle requests
    pbar.finish()


def book_category(save=True):
    """Collect every book category and optionally write them to a JSON file.

    Args:
        save: When true, the mapping is dumped to ``cat_filename``.

    Returns:
        A dict mapping each top-level category name to a dict of
        ``{sub-category name: absolute link}``.
    """
    # The page header declares gb2312. utf8 does not render the Chinese text
    # correctly and gb2312 cannot show traditional characters, so gb18030 is used.
    source = page_content(cat_url, headers=None, encoding='gb18030')
    tree = etree.HTML(source)

    cat_dict = {}
    top_names = tree.xpath('//dt/a/text()')  # top-level book categories
    for position, top_catname in enumerate(top_names, 1):
        links = tree.xpath(f'//div[@class="mc"]/dl/dd[{position}]//a')
        for link in links:
            href = link.xpath('./@href')[0]
            sub_catname = link.xpath('./text()')[0]
            cat_dict.setdefault(top_catname, {})[sub_catname] = 'https:' + href

    if save:
        with open(cat_filename, 'w', encoding='utf8') as f:
            json.dump(cat_dict, f)
    return cat_dict


def category_detail(category):
    """Yield ``(top category, sub category, listing URL)`` for each sub-category.

    Args:
        category: Mapping of top-level category name to a dict of
            ``{sub-category name: link}``.

    Yields:
        A tuple whose URL is ``subcat_url`` filled with the ``N-N-N`` id found
        in the link. Links that carry no such id are skipped instead of
        raising AttributeError.
    """
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    cat_id = re.compile(r'(\d+-\d+-\d+)')
    for top_cat, sub_cats in category.items():
        for sub_cat, href in sub_cats.items():
            found = cat_id.search(href)
            if found is None:
                continue
            yield (top_cat, sub_cat, subcat_url % found.group(1))

In [None]:
if __name__ == "__main__":
    # 'keyword' runs the keyword search; any other value runs the category crawl.
    search_type = 'category'
    save_book_only = True
    remove = True  # empty the target collection before crawling
    save_category = False
    # When the cell is re-run in the same session, release the previous client.
    if 'client' in vars():
        client.close()
    if search_type == 'keyword':
        total_pages = pages(search_url % 1, search_headers, 'utf8', search_page_num, keyword)
        client, db, coll = conn_mongo(keyword_collection, index_name, remove)
        try:
            save_to_mongo(coll, total_pages, search_type, save_book_only, cat=keyword)
        finally:
            # Close the connection even when crawling fails part-way.
            client.close()
    else:
        client, db, coll = conn_mongo(category_collection, index_name, remove)
        try:
            category = book_category(save_category)
            for top_cat, sub_cat, href in category_detail(category):
                total_pages = pages(href, None, 'utf8', cat_page_num, f'[{top_cat}-{sub_cat}]')
                if total_pages > 0:
                    save_to_mongo(coll, total_pages, search_type, save_book_only, url=href, cat=(top_cat, sub_cat))
        finally:
            # Close the connection even when crawling fails part-way.
            client.close()

  0% |                                                                        |

*****************查询小说-中国当代小说:结果有4684件，共79页*****************


100% |########################################################################|
  0% |                                                                        |

*****************查询小说-中国近现代小说:结果有992件，共17页*****************


100% |########################################################################|
  0% |                                                                        |

*****************查询小说-中国古典小说:结果有1432件，共24页*****************


100% |########################################################################|
  0% |                                                                        |

*****************查询小说-四大名著:结果有665件，共12页*****************


100% |########################################################################|
  0% |                                                                        |

*****************查询小说-港澳台小说:结果有61件，共2页*****************


100% |########################################################################|
  0% |                                                                        |

*****************查询小说-穿越/重生/架空:结果有86件，共2页*****************


100% |########################################################################|
  0% |                                                                        |

*****************查询小说-外国小说:结果有5310件，共89页*****************


100% |########################################################################|
  0% |                                                                        |

*****************查询小说-侦探/悬疑/推理:结果有3150件，共53页*****************


100% |########################################################################|
  0% |                                                                        |

*****************查询小说-惊悚/恐怖:结果有394件，共7页*****************


100% |########################################################################|
  0% |                                                                        |

*****************查询小说-科幻小说:结果有1239件，共21页*****************


100% |########################################################################|
  0% |                                                                        |

*****************查询小说-魔幻/奇幻/玄幻:结果有919件，共16页*****************


100% |########################################################################|
  0% |                                                                        |

*****************查询小说-武侠:结果有427件，共8页*****************


100% |########################################################################|
  0% |                                                                        |

*****************查询小说-军事:结果有449件，共8页*****************


100% |########################################################################|
  0% |                                                                        |

*****************查询小说-情感/家庭/婚姻:结果有2677件，共45页*****************


100% |########################################################################|
  0% |                                                                        |

*****************查询小说-宫廷:结果有72件，共2页*****************


100% |########################################################################|
  0% |                                                                        |

*****************查询小说-社会:结果有4364件，共73页*****************


100% |########################################################################|
  0% |                                                                        |

*****************查询小说-都市:结果有333件，共6页*****************


100% |########################################################################|
  0% |                                                                        |

*****************查询小说-乡土:结果有164件，共3页*****************


100% |########################################################################|
  0% |                                                                        |

*****************查询小说-职场:结果有165件，共3页*****************


100% |########################################################################|
  0% |                                                                        |

*****************查询小说-财经:结果有136件，共3页*****************


100% |########################################################################|
  0% |                                                                        |

*****************查询小说-官场:结果有290件，共5页*****************


100% |########################################################################|
  0% |                                                                        |

*****************查询小说-历史:结果有1553件，共26页*****************


100% |########################################################################|
  0% |                                                                        |

*****************查询小说-影视小说:结果有334件，共6页*****************


100% |########################################################################|
  0% |                                                                        |

*****************查询小说-作品集:结果有3652件，共61页*****************


100% |########################################################################|
  0% |                                                                        |

*****************查询小说-世界名著:结果有3206件，共54页*****************


100% |########################################################################|
  0% |                                                                        |

*****************查询小说-期刊杂志:结果有10件，共1页*****************


100% |########################################################################|
  0% |                                                                        |

*****************查询文学-散文/随笔/书信:结果有6104件，共102页*****************


 41% |#############################                                           |