## 导入相关包

In [11]:
import numpy as np
import pandas as pd

import requests  

import re
import math  
import time
import sys, urllib

## 搜索结果爬虫

In [58]:
def search(key, status=0, mode='isfuzzy', order=0, page=1, **kwargs):
    '''
        Search all products on kongfz.com (Kongfuzi used-book site) by keyword.
        Per the original author's note, a search returns at most the first 5000 results.

        @param key: search keywords, str
        @param status: product status, int, default 0; 0=on sale, 1=sold
        @param mode: search mode, str, default 'isfuzzy'; plain=normal, isfuzzy=fuzzy, exact=exact, perfect=full match
        @param order: sorting method, int, default 0; 0=overall, 1=price ascending, 2=price descending, 3=publication date descending, 4=publication date ascending, 6=newest listing, 7=shop rating, 100=shipping fee ascending
        @param page: page num, int, default=1 (a numeric string is accepted too)
        @param kwargs: other conditions, Dict; e.g. author='...'. Each value is sent as
                       'h' followed by its unicode_escape form in which every
                       backslash-u prefix is replaced by the letter 'k'
        @return result: search results (the decoded JSON body), Dict
        @raise requests.HTTPError: when the server answers with an error status
    '''
    # Imported here so the ``parse`` submodule is guaranteed to be loaded;
    # a bare ``import urllib`` does not import it by itself.
    from urllib.parse import quote

    # Search conditions, joined with '&' below. The order of the pieces is
    # kept exactly as the site's own search page emits them.
    conditions = ['select=0', 'key=' + quote(key), 'status=' + str(status)]
    for name, value in kwargs.items():
        # str() lets callers pass non-string values (e.g. an int) as well.
        escaped_value = str(value).encode('unicode_escape').decode()
        conditions.append(name + '=h' + escaped_value.replace('\\u', 'k'))
    conditions.append(mode + '=1')
    conditions.append('order=' + str(order))
    if page != 1:
        # page is documented as an int, so it must be converted before joining.
        conditions.append('pagenum=' + str(page))
    search_conditions = '&'.join(conditions)

    # Request URL; the trailing '_' parameter is the current time in milliseconds.
    request_url = ('http://search.kongfz.com/product_result/?' + search_conditions
                   + '&type=1' + '&ajaxdata=1' + '&_=' + str(round(time.time() * 1000)))

    # Request headers
    my_headers = {
        'Host': 'search.kongfz.com',
        'Referer': 'http://search.kongfz.com/product_result/?' + search_conditions,
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }

    # Without a timeout, requests waits forever on an unresponsive server.
    res = requests.post(url=request_url, headers=my_headers, timeout=30)
    res.raise_for_status()
    res.encoding = 'utf-8'

    # Decode the JSON body into the dictionary holding the search results.
    result = res.json()
    return result

In [59]:
# Smoke-test the crawler: a keyword search plus one extra condition (author)
# that is forwarded through **kwargs.
test = search('毛泽东', author='威尔逊')

In [57]:
# Inspect the type of the object returned by search().
type(test)

dict

In [62]:
# Display the raw response; the output below shows a 'status' field and the
# matching items under data -> itemList.
test

{'status': 1,
 'data': {'itemList': [{'area': '1005000000',
    'isrelatedisbn': '1',
    'imgurl': 'G06/M00/5E/F4/p4YBAFtYCIuAMLkjAAHpSghb3Z4353.jpg',
    'quality': '90',
    'catid': '1000000000000000',
    'yearsgroup': '1',
    'years': '2000年代 （2000-2009）',
    'years2': '1002000000',
    'pubdate': '2008-08',
    'price': '20.30',
    'press': '中央文献出版社',
    'author': '威尔逊',
    'importantdesc': '9787507308259',
    'itemname': '毛泽东',
    'userid': '5351007',
    'class': '4',
    'binding': '2',
    'shopid': '225831',
    'shopname': '书式的生活',
    'nickname': '书式的生活',
    'biztype': '1',
    'itemid': '951073148',
    'params': {'ems': '',
     'paper': '',
     'series': '',
     'zcatId': '',
     'binding': '2',
     'edition': '',
     'express': '',
     'pageNum': '',
     'wordNum': '',
     'language': '',
     'pageSize': '165f00',
     'logistics': '',
     'printTimes': '',
     'foreignName': '',
     'noLogistics': '',
     'printingNum': '',
     'publishedIn': ''