In [None]:
# default_exp utils

# 进阶的爬虫
> 1. 更复杂的请求
    * 无headers
    * 有headers
    * 有cookies
* JSON 存储数据
* 自动识别列表
* 自动识别下一页
* 自动识别列表上的内容

* [Requests文档](https://requests.readthedocs.io/en/master/)
* [BeautifulSoup文档](https://beautiful-soup-4.readthedocs.io/en/latest/index.html)


In [2]:
# export
import requests,json,re
from bs4 import BeautifulSoup,Tag,NavigableString
from collections import Counter

In [3]:
urls = '''
https://www.zhihu.com/topic/19554834/hot
https://book.douban.com/tag/%E5%A4%96%E5%9B%BD%E6%96%87%E5%AD%A6
https://search.bilibili.com/all?keyword=%E5%A4%A7%E7%88%B7&from_source=banner_search
https://github.com/trending
https://search.jd.com/Search?keyword=%E7%94%B5%E8%84%91&enc=utf-8&wq=%E7%94%B5%E8%84%91&pvid=254ed16260384a18a34c1c452b6db180
https://bj.lianjia.com/ershoufang/
http://www.rrys2019.com/html/top/week_fav_list.html
https://weixin.sogou.com/weixin?type=2&ie=utf8&s_from=hotnews&query=%E9%9B%B7%E7%A5%9E%E5%B1%B1%E6%9C%BA%E5%99%A8%E4%BA%BA%E4%B8%8A%E5%B2%97
'''
url_list = urls.strip().split('\n')
soup_list = []

for url in url_list:
    print('start:',url)
    res = requests.get(url,headers={'user-agent':'Mozilla/5.0'})
    if res.status_code == 200:
        soup = BeautifulSoup(res.text,'lxml')
        soup_list.append(soup)
        print('Done:%s'%res.status_code)
    else:
        print(res,res.text)

start: https://www.zhihu.com/topic/19554834/hot
Done:200
start: https://book.douban.com/tag/%E5%A4%96%E5%9B%BD%E6%96%87%E5%AD%A6
Done:200
start: https://search.bilibili.com/all?keyword=%E5%A4%A7%E7%88%B7&from_source=banner_search
Done:200
start: https://github.com/trending
Done:200
start: https://search.jd.com/Search?keyword=%E7%94%B5%E8%84%91&enc=utf-8&wq=%E7%94%B5%E8%84%91&pvid=254ed16260384a18a34c1c452b6db180
Done:200
start: https://bj.lianjia.com/ershoufang/
Done:200
start: http://www.rrys2019.com/html/top/week_fav_list.html
<Response [404]> <h1>404: Not Found</h1>

start: https://weixin.sogou.com/weixin?type=2&ie=utf8&s_from=hotnews&query=%E9%9B%B7%E7%A5%9E%E5%B1%B1%E6%9C%BA%E5%99%A8%E4%BA%BA%E4%B8%8A%E5%B2%97
Done:200


In [4]:
len(soup_list) == len(url_list)

False

## 更复杂的请求
> 以豆瓣网为例

一开始没有header的请求会被拒绝，因为服务器会把这次的请求当成是机器人，status_code是418

In [6]:
# hide
search_query = '1234'
url = f'https://www.douban.com/search?q={search_query}'
res = requests.get(url)
res,res.text

(<Response [418]>, '')

加了header之后，服务器就把它当成了Mozilla浏览器，就有response的数据了

In [7]:
# hide
headers = {'user-agent':'Mozilla/5.0'}
res = requests.get(url,headers=headers)
res

<Response [200]>

想要发布一个动态就需要登录状态，而cookie就是登录状态的载体，cookie就是用帐号密码登录后获取的凭证

In [8]:
# hide
url = 'https://www.douban.com/'
data = {'comment':'hello world','ck':'zV8Z','privacy_and_reply_limit':'P,'}

In [9]:
# export
def format_cookie_str(cookie_str):
    '把chrome拷贝过来的cookie字符转化成dict'
    cookies = {}
    for item in cookie_str.split('; '):
        k,v = item.split('=',1)
        cookies[k] = v
    return cookies

In [10]:
cookie_str = 'bid=1i8YWHFPDwI; gr_user_id=5b798ccf-0dc3-41f7-9358-ab221ae5c248; __utmc=30149280; __utmz=30149280.1582040380.5.5.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); ll="118124"; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1582185091%2C%22https%3A%2F%2Fwww.google.com%2F%22%5D; _pk_ses.100001.8cb4=*; __utma=30149280.682236232.1580713449.1582121697.1582185095.7; ap_v=0,6.0; viewed="19672873_30243169_4233221"; gr_cs1_6bb1b2b8-0a3e-4e02-9e3c-4f359d514576=user_id%3A0; __utmt_douban=1; dbcl2="140014301:Td6zJ+yn5sA"; ck=zV8Z; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03=7a596b44-fe53-45f0-90fa-2c24b2faa365; gr_cs1_7a596b44-fe53-45f0-90fa-2c24b2faa365=user_id%3A1; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03_7a596b44-fe53-45f0-90fa-2c24b2faa365=true; push_noty_num=0; push_doumail_num=0; __utmv=30149280.14001; __utmt=1; ps=y; _pk_id.100001.8cb4=7bc8021c269d7e50.1580713448.6.1582185748.1582121848.; __utmb=30149280.18.10.1582185095'
cookies = format_cookie_str(cookie_str)
cookies

{'bid': '1i8YWHFPDwI',
 'gr_user_id': '5b798ccf-0dc3-41f7-9358-ab221ae5c248',
 '__utmc': '30149280',
 '__utmz': '30149280.1582040380.5.5.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided)',
 'll': '"118124"',
 '_pk_ref.100001.8cb4': '%5B%22%22%2C%22%22%2C1582185091%2C%22https%3A%2F%2Fwww.google.com%2F%22%5D',
 '_pk_ses.100001.8cb4': '*',
 '__utma': '30149280.682236232.1580713449.1582121697.1582185095.7',
 'ap_v': '0,6.0',
 'viewed': '"19672873_30243169_4233221"',
 'gr_cs1_6bb1b2b8-0a3e-4e02-9e3c-4f359d514576': 'user_id%3A0',
 '__utmt_douban': '1',
 'dbcl2': '"140014301:Td6zJ+yn5sA"',
 'ck': 'zV8Z',
 'gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03': '7a596b44-fe53-45f0-90fa-2c24b2faa365',
 'gr_cs1_7a596b44-fe53-45f0-90fa-2c24b2faa365': 'user_id%3A1',
 'gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03_7a596b44-fe53-45f0-90fa-2c24b2faa365': 'true',
 'push_noty_num': '0',
 'push_doumail_num': '0',
 '__utmv': '30149280.14001',
 '__utmt': '1',
 'ps': 'y',
 '_pk_id.100001.8cb

In [11]:
# hide
res = requests.post(url,headers=headers,data=data,cookies=cookies)
res.text

'\n\n\n<!DOCTYPE HTML>\n<html lang="zh-cmn-Hans" class="">\n<head>\n<meta charset="UTF-8">\n<meta name="google-site-verification" content="ok0wCgT20tBBgo9_zat2iAcimtN4Ftf5ccsh092Xeyw" />\n<meta name="description" content="提供图书、电影、音乐唱片的推荐、评论和价格比较，以及城市独特的文化生活。">\n<meta name="keywords" content="豆瓣,小组,电影,同城,豆品,广播,登录豆瓣">\n<meta property="qc:admins" content="2554215131764752166375" />\n<meta property="wb:webmaster" content="375d4a17a4fa24c2" />\n<meta name="mobile-agent" content="format=html5; url=https://m.douban.com">\n<title>豆瓣</title>\n<script>\nfunction set_cookie(t,e,o,n){var i,a,r=new Date;r.setTime(r.getTime()+24*(e||30)*60*60*1e3),i="; expires="+r.toGMTString();for(a in t)document.cookie=a+"="+t[a]+i+"; domain="+(o||"douban.com")+"; path="+(n||"/")}function get_cookie(t){var e,o,n=t+"=",i=document.cookie.split(";");for(e=0;e<i.length;e++){for(o=i[e];" "==o.charAt(0);)o=o.substring(1,o.length);if(0===o.indexOf(n))return o.substring(n.length,o.length).replace(/\\"/g,"")}return null}wi

In [13]:
# hide
# 查看最近发布的5条动态
soup = BeautifulSoup(res.text)
items = soup.find_all(class_='new-status')
for item in items[:5]:
    print(item.find('p').text)

In [16]:
# hide
type(items[0]),items[0].attrs

IndexError: list index out of range

顺便看看tag的其他属性，上下左右查到其他的tag
* item.parent
* item.parents

* item.contents
* item.children

* item.next_sibling
* item.previous_sibling

## JSON 存储
> 这个格式本质上就是个dict，在MongoDB和Redis中也是这样存储，所以这里就开始学习下

比如这次要把谁在什么时间，说了什么话记下来

In [17]:
# hide
print(items[5]['data-sid'])
print(items[0].find(class_='text').a.text)
print(items[0].find(class_='created_at')['title'])
print(items[4].find(class_='bd').contents[1].text.replace('\n',''))

IndexError: list index out of range

In [None]:
# hide
contents = {}
for item in items:
    _id = item['data-sid']
    name = item.find(class_='text').a.text    
    created_at = item.find(class_='created_at')['title']
    content = item.find(class_='bd').contents[1].text.replace('\n','')  
    contents[_id] = {}
    contents[_id]['name'] = name
    contents[_id]['created_at'] = created_at  
    contents[_id]['content'] = content
contents

{'2858892068': {'name': 'Hawk',
  'created_at': '2020-03-10 00:01:38',
  'content': 'hello world'},
 '2858887533': {'name': 'Hawk',
  'created_at': '2020-03-09 23:59:24',
  'content': 'hello world'},
 '2858886319': {'name': 'Hawk',
  'created_at': '2020-03-09 23:58:48',
  'content': 'hello world'},
 '2858885049': {'name': 'Hawk',
  'created_at': '2020-03-09 23:58:15',
  'content': 'hello world'},
 '2858884667': {'name': 'Hawk',
  'created_at': '2020-03-09 23:58:04',
  'content': 'hello world'},
 '2858883902': {'name': 'Hawk',
  'created_at': '2020-03-09 23:57:44',
  'content': 'hello world'},
 '2858883414': {'name': 'Hawk',
  'created_at': '2020-03-09 23:57:30',
  'content': 'hello world'},
 '2858882120': {'name': 'Hawk',
  'created_at': '2020-03-09 23:56:53',
  'content': 'hello world'},
 '2858881201': {'name': 'Hawk',
  'created_at': '2020-03-09 23:56:25',
  'content': 'hello world'},
 '2858880617': {'name': 'Hawk',
  'created_at': '2020-03-09 23:56:09',
  'content': 'hello world'},


### 保存和读取json

In [None]:
# hide
with open('./data/01_douban.json', 'w') as f:
    json.dump(contents,f)

In [18]:
# hide
with open('./data/01_douban.json', 'r') as f:
    contents = json.loads(f.read())
contents['2726525573']

{'name': 'Malorie',
 'created_at': '2019-12-14 10:18:01',
 'content': "21 身份危机 |Model MinoritySomeone like me can be a real nightmare, completely aware But I'd rather be a real nightmare than die unaware So save me your prayers 一直很想写这篇日记，却因为太贴近真实的自我..."}

## 自动识别列表
> 思路就是看谁有最多的相同类的children（目前只支持静态页面,文字列表）

常见问题：容易识别成目录，特征为多层级列表，而正文内容则不会再嵌套列表

In [19]:
#export
def get_children(soup): return [c for c in soup.children if isinstance(c,Tag)]

def get_class(soup):
    '获取单个tag的class，没有则显示no_class'
    if 'class' in soup.attrs and soup.attrs['class']:
        return soup.attrs['class']
    else:
        return ['no_class']
    
def get_all_class(soup):
    '收集每个tag的class，没有则显示no_class'
    class_list = []
    for s in soup.find_all(True):
        class_list += get_class(s)
    return class_list

def get_class_count(soup):return len(list(set(get_all_class(soup))))

def is_content_list(soup): 
    '判断是否为内容列表，条件为：子集是重复的相同的tag，重复次数大于5'
    children = get_children(soup)
    if len(children) > 1 and len(soup.text.strip()) > 0:
        tag_counter = Counter([c.name for c in children])
        max_tag_name,max_tag_count = tag_counter.most_common(1)[0]
       
        tag_similarity = max_tag_count/len(children)
        
        class_similarity = 1
        if get_all_class(soup):
            max_class_name,max_class_count = Counter(get_all_class(soup)).most_common(1)[0]
            class_similarity = max_class_count/len(children)
            
        if max_tag_count >= 5 and tag_similarity > 0.9 and class_similarity > 0.9:
            return True
    return False

def find_main_list(soup):
    '特征：文字内容多；一条数据的样式&嵌套丰富；区别于目录，都是很短的词'
    score = 0
    main_list = soup
    
    items = soup.find_all(True) 
    for item in items:
        if is_content_list(item):
            text_count = len(item.text)
            class_count = get_class_count(item)
            text_max_lenth = max([len(i) for i in item.stripped_strings])
#             print('候选：\n',soup.name,get_class(soup),text_count,text_max_lenth,class_count)
            
            new_score = text_count*class_count*text_max_lenth
            if new_score > score:
                score = new_score
                main_list = item
    first_child = get_children(main_list)[0]
    print('终选：\n',main_list.name,get_class(main_list),first_child.name,get_class(first_child),'\n')
    return main_list

In [22]:
main_content_list = []
for url,soup in zip(url_list,soup_list):
    print(url)
    soup_test = soup.body
    main_content_list.append(find_main_list(soup.body))

https://www.zhihu.com/topic/19554834/hot
终选：
 div ['no_class'] div ['List-item', 'TopicFeedItem'] 

https://book.douban.com/tag/%E5%A4%96%E5%9B%BD%E6%96%87%E5%AD%A6
终选：
 ul ['subject-list'] li ['subject-item'] 

https://search.bilibili.com/all?keyword=%E5%A4%A7%E7%88%B7&from_source=banner_search
终选：
 div ['flow-loader'] div ['filter-wrap'] 

https://github.com/trending
终选：
 div ['no_class'] article ['Box-row'] 

https://search.jd.com/Search?keyword=%E7%94%B5%E8%84%91&enc=utf-8&wq=%E7%94%B5%E8%84%91&pvid=254ed16260384a18a34c1c452b6db180
终选：
 div ['selector'] div ['J_selectorLine', 's-brand'] 

https://bj.lianjia.com/ershoufang/
终选：
 ul ['sellListContent'] li ['clear', 'LOGVIEWDATA', 'LOGCLICKDATA'] 

http://www.rrys2019.com/html/top/week_fav_list.html
终选：
 ul ['news-list'] li ['no_class'] 



## 自动识别下一页
> 根据文字识别，暂不支持滚动加载，大部分未识别出来，因为下一页大多用js实现

In [23]:
#export
def is_next_page(tag): return tag.name == 'a' and re.search(r'[后|下]一*页',tag.text)
    
def get_next_page_url(soup):
    next_page = soup.find(is_next_page)
    if next_page and 'href' in next_page.attrs:
        print('下一页：',next_page['href'])
        return next_page['href']
    else:
        print('未识别出下一页')
        return None 

In [24]:
for url,soup in zip(url_list,soup_list):
#     if 'douban' in url:
        print('\n',url)
        soup_test = soup.body
#         print(soup_test)
        get_next_page_url(soup_test)


 https://www.zhihu.com/topic/19554834/hot
未识别出下一页

 https://book.douban.com/tag/%E5%A4%96%E5%9B%BD%E6%96%87%E5%AD%A6
下一页： /tag/外国文学?start=20&type=T

 https://search.bilibili.com/all?keyword=%E5%A4%A7%E7%88%B7&from_source=banner_search
未识别出下一页

 https://github.com/trending
未识别出下一页

 https://search.jd.com/Search?keyword=%E7%94%B5%E8%84%91&enc=utf-8&wq=%E7%94%B5%E8%84%91&pvid=254ed16260384a18a34c1c452b6db180
未识别出下一页

 https://bj.lianjia.com/ershoufang/
未识别出下一页

 http://www.rrys2019.com/html/top/week_fav_list.html
下一页： ?query=%E9%9B%B7%E7%A5%9E%E5%B1%B1%E6%9C%BA%E5%99%A8%E4%BA%BA%E4%B8%8A%E5%B2%97&s_from=hotnews&type=2&page=2&ie=utf8


## 自动识别列表内容
> 思路就是样式一样的，是一类的内容

目前支持的几种内容类型：
1. img link
* a link
* text

In [25]:
# export
def get_child_navigablestring(soup):
    navstr = []
    for c in soup.contents:
        if isinstance(c,NavigableString) and c.strip():
            navstr.append(c.strip())
    navstr = '&&'.join(navstr)
    return navstr

def get_data_name(soup): 
    class_name = '_'.join(get_class(soup))
    return f'{soup.name}_{class_name}'

def get_data(soup):
    '列出每一项，如果为a或img标签，就获取链接和内容；如果子集含navstr，就显示文字'
    data = {}
    for c in soup.find_all(True):
        if c.name == 'a':
            if 'href' in c.attrs and c['href']:
                data[get_data_name(c)+'_url'] = c['href']
            if 'title' in c.attrs and c['title']:
                data[get_data_name(c)+'_title'] = c['title']
            
        if c.name == 'img':
            if 'src' in c.attrs and c['src']:
                data[get_data_name(c)+'_src'] = c['src']
                 
        navstr = get_child_navigablestring(c)
        if navstr:
            data[get_data_name(c)+'_text'] = navstr
    return data


In [48]:
for url,soup in zip(url_list,main_content_list):
    if 'douban' in url:
        print('\n',url)
        data = get_data(get_children(soup)[0])
        print(data)


 https://book.douban.com/tag/%E5%A4%96%E5%9B%BD%E6%96%87%E5%AD%A6
{'a_nbg_url': 'https://book.douban.com/subject/35283396/', 'img_no_class_src': 'https://img9.doubanio.com/view/subject/s/public/s33841163.jpg', 'a_no_class_url': 'https://read.douban.com/ebook/174769730/?dcs=tag-buylink&dcm=douban&dct=35283396', 'a_no_class_title': '暗夜与黎明', 'a_no_class_text': '去看电子版', 'div_pub_text': '[英] 肯·福莱特 / 邓若虚、汪洋 / 江苏凤凰文艺出版社 / 2021-3-6 / 118.00元', 'span_rating_nums_text': '8.1', 'span_pl_text': '(2824人评价)', 'p_no_class_text': '《巨人的陨落》作者肯•福莱特重磅新作！\n全球读者平均2个通宵读完！\n----------------\n- 2020美国亚马逊年度图书\n- Goodreads...'}


In [58]:
for k,v in data.items():
    print(k,':',v)

a_nbg_url : https://book.douban.com/subject/35283396/
img_no_class_src : https://img9.doubanio.com/view/subject/s/public/s33841163.jpg
a_no_class_url : https://read.douban.com/ebook/174769730/?dcs=tag-buylink&dcm=douban&dct=35283396
a_no_class_title : 暗夜与黎明
a_no_class_text : 去看电子版
div_pub_text : [英] 肯·福莱特 / 邓若虚、汪洋 / 江苏凤凰文艺出版社 / 2021-3-6 / 118.00元
span_rating_nums_text : 8.1
span_pl_text : (2824人评价)
p_no_class_text : 《巨人的陨落》作者肯•福莱特重磅新作！
全球读者平均2个通宵读完！
----------------
- 2020美国亚马逊年度图书
- Goodreads...
