In [85]:
import csv
import datetime
import json
import os
import re
import sys
import time
import urllib.parse

import numpy as np
import pandas as pd
import pyecharts
import pymongo
import requests
from fake_useragent import UserAgent
from lxml import etree
from progressbar import ProgressBar
from pyecharts.charts import Bar
from selenium import webdriver  # used to render dynamically loaded pages
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from tqdm import tqdm

In [98]:
# Configuration constants
keyword = 'spark'  # search keyword
collection = 'JdBook_spark'  # MongoDB collection that receives the scraped items
wtype = 1  # 1 = restrict the search to JD self-operated items
# A result page initially loads 30 items and lazily loads 30 more on scroll.
# Odd `page` values are the initial load, even ones the scroll-triggered load.
page_num = 30

base_url = 'https://search.jd.com/Search?page=%s'
url = 'https://search.jd.com/Search?' + \
    urllib.parse.urlencode({'keyword': keyword, 'enc': 'utf8', 'wtype': wtype})
headers = {
    'User-Agent': UserAgent().Chrome,
    'X-Requested-With': 'XMLHttpRequest',
    'Referer': 'https://search.jd.com/Search?enc=utf8&keyword=' + urllib.parse.quote(keyword)
}
params = {
    'wtype': wtype,  # self-operated filter; kept in sync with `url` via the shared constant
    'keyword': keyword,  # passed raw, i.e. without a manual urllib.parse.quote()
    'enc': 'utf8'
}

In [103]:
def page_content(page, timeout=10):
    """Download the HTML source of one search-result page.

    Args:
        page: Page number substituted into ``base_url``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The response body decoded as UTF-8, or ``None`` when the request
        fails (the error is printed).
    """
    try:
        response = requests.get(base_url % page, headers=headers, params=params, timeout=timeout)
    except requests.RequestException as e:
        print(e)
        return None
    response.encoding = 'utf8'  # force UTF-8 so Chinese text is not garbled
    return response.text


def pages():
    """Compute how many result pages the keyword search produces.

    The item count is read from the ``<meta name="description">`` tag of the
    first result page and divided by ``page_num``.

    Returns:
        The total number of pages as an int.

    Raises:
        RuntimeError: If the first page cannot be downloaded or the item
            count cannot be found in it.
    """
    content = page_content(1)
    if not content:
        raise RuntimeError('could not download the first search-result page')
    html = etree.HTML(content)
    description = html.xpath('//meta[@name="description"]/@content') if html is not None else []
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    match = re.search(r'(\d+)件', description[0]) if description else None
    if match is None:
        raise RuntimeError('could not find the item count in the search-result page')
    records = int(match.group(1))
    total_pages = int(np.ceil(records / page_num))
    print('根据关键词"%s"查询结果有%d件，共%d页'.center(50, '*') % (keyword, records, total_pages))
    return total_pages


def _joined_text(node, path):
    """Concatenate every text node selected by the relative XPath ``path`` under ``node``."""
    return ''.join(node.xpath(path))


def book_detail(content):
    """Yield one record per JD self-operated item found in a result page.

    Args:
        content: HTML source of a search-result page.

    Yields:
        dict with the keys
            sku: item number (``data-sku`` attribute)
            discount_price: price as float, or None when absent/unparseable
            href: item link
            name: item title
            isjd: 1 (only self-operated items are yielded)
            keyword: the module-level search keyword
            isbook: 1 when the item has a ``p-bookdetails`` block, else 0
        plus ``publish``, ``author`` and ``publish_date`` for books, or
        ``shop`` for non-book items.
    """
    if not content:
        return
    html = etree.HTML(content)
    if html is None:
        return
    # Walk the item elements once and query relative to each one, instead of
    # re-scanning the whole document by positional index for every field.
    for item in html.xpath('//li[@class="gl-item"][@data-sku]'):
        icons = _joined_text(item, './/div[@class="p-icons"]/i[contains(@class, "goods-icons")]/text()')
        if '自营' not in icons:
            continue  # not self-operated: never reported
        name_nodes = item.xpath('.//div[contains(@class, "p-name")]')
        links = item.xpath('.//div[contains(@class, "p-name")]/a/@href')
        if not name_nodes or not links:
            continue  # malformed entry without title or link
        name = ''.join(name_nodes[0].xpath('.//em//text()'))
        href = str(links[0])
        if not href.startswith('http'):
            # Links are normally protocol-relative ("//item.jd.com/...");
            # an already-absolute link must not get a second scheme.
            href = 'https:' + href
        try:
            discount_price = float(item.xpath('.//div[@class="p-price"]//i/text()')[0])
        except (IndexError, ValueError):
            discount_price = None

        record = {'sku': item.get('data-sku'), 'discount_price': discount_price, 'href': href}
        # Some items carry no p-bookdetails block; those are treated as non-books.
        isbook = 1 if item.xpath('.//div[@class="p-bookdetails"]') else 0
        if isbook:
            record['publish'] = _joined_text(
                item, './/div[@class="p-bookdetails"]/span[@class="p-bi-store"]/a/text()')
            record['author'] = _joined_text(
                item, './/div[@class="p-bookdetails"]/span[@class="p-bi-name"]/a/text()')
            record['publish_date'] = _joined_text(
                item, './/div[@class="p-bookdetails"]/span[@class="p-bi-date"]/text()')
        else:
            record['shop'] = _joined_text(item, './/div[@class="p-shop"]//a/text()')
        record.update({'name': name, 'isjd': 1, 'keyword': keyword, 'isbook': isbook})
        yield record


def load_ajax_page(page):
    """Render one search-result page in headless Chrome and return its HTML.

    The page is opened, then the pager element is scrolled into view so the
    lazily loaded second half of the results gets requested.

    Args:
        page: Page number appended to the module-level ``url``.

    Returns:
        The rendered page source as a string.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    browser = webdriver.Chrome(options=chrome_options)
    try:
        browser.get(url + '&page=%d' % page)
        wait = WebDriverWait(browser, 10)
        try:
            wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'm-list')))
            end = browser.find_element(By.CLASS_NAME, 'page')
            # Scroll to the pager at the bottom of the list.
            browser.execute_script('arguments[0].scrollIntoView(true);', end)
        except Exception:
            # Best effort: if the list or pager never shows up, wait longer
            # and return whatever has been rendered by then.
            time.sleep(30)
        time.sleep(5)  # give the scroll-triggered items time to load
        return browser.page_source
    finally:
        # quit() ends the whole driver session; close() would only close the
        # window and leave a chromedriver process behind on every call.
        browser.quit()


def conn_mongo(remove=False):
    """Connect to the local MongoDB and prepare the target collection.

    Args:
        remove: When true, delete every document already in the collection.

    Returns:
        A ``(client, db, coll)`` tuple: the MongoClient, the ``spider``
        database and the collection named by the module-level ``collection``.
        A unique index on ``sku`` is created if it does not exist yet.
    """
    mongo_client = pymongo.MongoClient('127.0.0.1', 27017)
    database = mongo_client['spider']
    target = database[collection]
    if remove:
        target.delete_many({})
    # Create the unique sku index only when it is not already present.
    if 'sku_1' not in target.index_information():
        target.create_index([('sku', 1)], unique=True)
    return mongo_client, database, target

In [104]:
if __name__ == "__main__":
    # Close a client left over from a previous run of this cell, if any.
    try:
        client.close()
    except Exception:
        pass
    client, db, coll = conn_mongo(remove=True)
    try:
        total_pages = pages()
        # Scrape the search results for the keyword and store them in MongoDB.
        pbar = ProgressBar(maxval=100)
        pbar.start()
        # Step 2: each rendered odd page also triggers the scroll-loaded
        # (even-numbered) half, per the page_num note in the config cell.
        for page in range(1, total_pages + 1, 2):
            content = load_ajax_page(page)
            books = book_detail(content)
            for book in books:
                try:
                    coll.insert_one(book)
                except pymongo.errors.DuplicateKeyError:
                    continue  # sku already stored (unique index on sku)
                except pymongo.errors.PyMongoError as e:
                    # Keep scraping, but do not hide real database failures.
                    print('insert failed for sku %s: %s' % (book.get('sku'), e))
            pbar.update(page * 100 // total_pages)
            time.sleep(0.5)
        pbar.finish()
    finally:
        client.close()

**************根据关键词"spark"查询结果有1850件，共62页**************


100% |########################################################################|


In [100]:
# Scrape all self-operated books, step 1: collect the category tree.
cat_url = 'https://book.jd.com/booksort.html'  # URL of the full category listing
res = requests.get(cat_url, timeout=10)
# The page header declares gb2312. utf8 garbles the Chinese text and gb2312
# cannot represent traditional characters, so decode as gb18030.
res.encoding = 'gb18030'
html = etree.HTML(res.text)

# Maps top-level category name -> {sub-category name: absolute URL}.
cat_dict = {}
top_category = html.xpath('//dt/a/text()')  # top-level category names
for i, top_catname in enumerate(top_category, 1):
    sub_category = html.xpath(f'//div[@class="mc"]/dl/dd[{i}]//a')
    for sub in sub_category:
        href = sub.xpath('./@href')[0]
        sub_catname = sub.xpath('./text()')[0]
        cat_dict.setdefault(top_catname, {})[sub_catname] = 'https:' + href
with open('京东图书分类.json', 'w', encoding='utf8') as f:
    json.dump(cat_dict, f)

In [79]:
import json

# Print every sub-category whose first listing page (delivery=1) contains no
# p-bookdetails entries.
# Reads the file written by the category-scraping cell, from the same relative path.
with open('京东图书分类.json', encoding='utf8') as f:
    json_dict = json.load(f)
for sub_cats in json_dict.values():
    for cat_href in sub_cats.values():
        match = re.search(r'(\d+-\d+-\d+)', cat_href)
        if match is None:
            continue  # link without a three-part category id
        cat = match.group(1)
        # Named list_url so the module-level `url` used by load_ajax_page
        # is not overwritten.
        list_url = 'https://list.jd.com/%s.html?delivery=1&page=1' % cat
        res = requests.get(list_url, timeout=10).text
        html = etree.HTML(res)
        results = html.xpath('//div[@class="p-bookdetails"]') if html is not None else []
        if len(results) == 0:
            print(cat, list_url, len(results))

1713-3263-3402 https://list.jd.com/1713-3263-3402.html?delivery=1&page=1 0
1713-3263-3405 https://list.jd.com/1713-3263-3405.html?delivery=1&page=1 0
1713-3267-17593 https://list.jd.com/1713-3267-17593.html?delivery=1&page=1 0
1713-3266-17585 https://list.jd.com/1713-3266-17585.html?delivery=1&page=1 0
1713-3266-17586 https://list.jd.com/1713-3266-17586.html?delivery=1&page=1 0
1713-3264-17591 https://list.jd.com/1713-3264-17591.html?delivery=1&page=1 0
1713-3289-17583 https://list.jd.com/1713-3289-17583.html?delivery=1&page=1 0
1713-3290-17579 https://list.jd.com/1713-3290-17579.html?delivery=1&page=1 0
1713-3291-17581 https://list.jd.com/1713-3291-17581.html?delivery=1&page=1 0
1713-14669-14681 https://list.jd.com/1713-14669-14681.html?delivery=1&page=1 0
1713-14669-14684 https://list.jd.com/1713-14669-14684.html?delivery=1&page=1 0
1713-14669-14687 https://list.jd.com/1713-14669-14687.html?delivery=1&page=1 0
1713-11745-11751 https://list.jd.com/1713-11745-11751.html?delivery=1&page