## 导入相关包

In [1]:
import pandas as pd

import requests  

import re
import math
import random
import time
import sys, urllib

## 搜索结果爬虫

In [2]:
def search(key, status=0, mode='isfuzzy', order=0, page=1, **kwargs):
    '''
        Query the kongfz.com product search endpoint for one page of results.
        (The original note on this function says the site serves at most the
        first 5000 results of a search.)
        @param key: search keywords, str
        @param status: product status, int, default 0; 0=on sale, 1=sold
        @param mode: search mode, str, default 'isfuzzy'; plain=normal, isfuzzy=fuzzy, exact=exact, perfect=full match
        @param order: sorting method, int, default 0; 0=default ranking, 1=price ascending, 2=price descending, 3=publication date descending, 4=publication date ascending, 6=newest listing, 7=shop rating, 100=shipping cost ascending
        @param page: page num, int, default=1
        @param kwargs: other conditions, Dict; each entry becomes one extra query parameter
        @return result: search results, Dict (the decoded JSON response body)
        @raise requests.exceptions.Timeout: the server did not answer in time
        @raise requests.exceptions.HTTPError: the server answered with an error status
    '''
    # Import the submodule explicitly: a bare `import urllib` does not
    # guarantee that `urllib.parse` has been loaded.
    from urllib.parse import quote

    encode_key = quote(key)

    # Build the query string shared by the request URL and the Referer header.
    search_conditions = 'select=0&' + 'key=' + encode_key + '&status=' + str(status)
    for name, value in kwargs.items():
        # Non-ASCII characters are sent as 'k' + hex code point: take Python's
        # unicode_escape form and swap the backslash-u prefix for 'k'.
        # str() lets callers pass numbers as well as strings.
        escaped = str(value).encode('unicode_escape').decode()
        encode_value = escaped.replace('\\u', 'k')
        # Author and press values carry an extra 'h' prefix in the query.
        separator = '=h' if name in ('author', 'press') else '='
        search_conditions += '&' + name + separator + encode_value
    search_conditions += '&' + mode + '=1' + '&order=' + str(order)
    if page != 1:
        search_conditions += '&pagenum=' + str(page)

    # The trailing '_' parameter is the current time in milliseconds.
    request_url = ('http://search.kongfz.com/product_result/?' + search_conditions
                   + '&type=1' + '&ajaxdata=1' + '&_=' + str(round(time.time() * 1000)))

    my_headers = {
        'Host': 'search.kongfz.com',
        'Referer': 'http://search.kongfz.com/product_result/?' + search_conditions,
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }

    # requests never times out on its own; without a timeout one stalled
    # connection would block a multi-page crawl forever.
    res = requests.post(url=request_url, headers=my_headers, timeout=30)
    res.raise_for_status()
    res.encoding = 'utf-8'

    # Decode the JSON body into a dict holding the search results.
    result = res.json()
    return result

In [3]:
# Field names looked up on every search-result item; the order here is also
# the column order of the exported table.
columns = [
    'itemid', 'itemname',
    'shopid', 'shopname',
    'area', 'areaname',
    'author', 'press', 'price',
    'binding', 'bindingname',
    'pubdate',
    'quality', 'qualityname',
    'bigimgurl', 'smallimgurl',
    'biztype', 'catid', 'class',
    'importantdesc', 'isrelatedisbn',
    'nickname',
    'addtime', 'newaddtime', 'updatetime',
    'years', 'years2', 'yearsgroup',
]

In [4]:
def getPageInfo(product_list, fields=None):
    '''
        Turn the product dicts of one result page into a list of rows.
        Each row holds the value of every requested field (None when the
        item lacks that field) followed by the item's detail-page URL.
        @param product_list: product list, List of dict
        @param fields: field names to extract, List of str; defaults to the module-level `columns`
        @return page_info_list: result list, List of rows (each row is a List)
    '''
    if fields is None:
        fields = columns

    page_info_list = []

    for item in product_list:
        # dict.get yields None for absent fields without a catch-all except
        # that would also hide unrelated errors.
        product_info = [item.get(col) for col in fields]

        item_id = item.get('itemid')
        shop_id = item.get('shopid')
        if item_id is None or shop_id is None:
            # One incomplete item should not abort the whole crawl.
            detail_url = None
        else:
            # format() also accepts ids that arrive as numbers.
            detail_url = 'http://book.kongfz.com/{0}/{1}'.format(item_id, shop_id)
        product_info.append(detail_url)
        page_info_list.append(product_info)

    return page_info_list

In [5]:
def kongfzSearchSpider(key, status=0, mode='isfuzzy', order=0, page=1, **kwargs):
    '''
        Crawl every result page of a kongfz.com product search and save the
        rows to a CSV file inside the 'output' directory. Returns nothing.
        @param key: search keywords, str
        @param status: product status, int, default 0; 0=on sale, 1=sold
        @param mode: search mode, str, default 'isfuzzy'; plain=normal, isfuzzy=fuzzy, exact=exact, perfect=full match
        @param order: sorting method, int, default 0; 0=default ranking, 1=price ascending, 2=price descending, 3=publication date descending, 4=publication date ascending, 6=newest listing, 7=shop rating, 100=shipping cost ascending
        @param page: page num used for the initial probing request, int, default=1
        @param kwargs: other conditions, Dict
    '''
    import os

    # Probe request: read the total record count and the total page count.
    result_dict = search(key, status, mode, order, page, **kwargs)
    total_found = result_dict['other']['total_found']
    # NOTE(review): page_count printed as an integer in past runs; int() only
    # guards against the API returning it as a numeric string.
    page_num = int(result_dict['other']['page_count'])
    print('商品总数：{0}，总页数：{1}'.format(total_found, page_num))
    time.sleep(30)

    # Fetch each page's JSON and collect its rows.
    total_info = []
    for n in range(1, page_num + 1):
        result_dict = search(key, status, mode, order, page=n, **kwargs)
        item_list = result_dict['data']['itemList']
        total_info += getPageInfo(item_list)
        print('已经抓取第 {0}/{1} 页'.format(n, page_num))
        # Pause a random interval between requests; there is nothing left
        # to wait for once the last page has been fetched.
        if n < page_num:
            time.sleep(random.randint(20, 40))

    # Build a new header list instead of appending to the module-level
    # `columns`: mutating it made every later call add another
    # 'detail_url' column.
    output_columns = columns + ['detail_url']
    df = pd.DataFrame(data=total_info, columns=output_columns)

    # os.path.join picks the right separator on every platform, and the
    # directory is created on first use so to_csv cannot fail on a missing
    # folder.
    output_dir = 'output'
    os.makedirs(output_dir, exist_ok=True)
    data_output = os.path.join(output_dir, key + '_' + str(round(time.time())) + '.csv')
    df.to_csv(data_output, index=False)
    print('已保存为 CSV 文件')

In [6]:
# Example run: exact-mode search for one title, ordered by newest listing
# (order=6) and restricted to a single press.
kongfzSearchSpider(
    '他改变了中国',
    mode='exact',
    order=6,
    press='上海译文出版社',
)

商品总数：721，总页数：15
已经抓取第 1/15 页
已经抓取第 2/15 页
已经抓取第 3/15 页
已经抓取第 4/15 页
已经抓取第 5/15 页
已经抓取第 6/15 页
已经抓取第 7/15 页
已经抓取第 8/15 页
已经抓取第 9/15 页
已经抓取第 10/15 页
已经抓取第 11/15 页
已经抓取第 12/15 页
已经抓取第 13/15 页
已经抓取第 14/15 页
已经抓取第 15/15 页
已保存为 CSV 文件


## 图书条目爬虫