In [4]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import requests
from bs4 import BeautifulSoup as BS
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [5]:
# 测试 - 冰冷之海的主页
url = 'http://space.bilibili.com/742470/#!/video?keyword=&order=senddate&page=1&tid=0'

In [6]:
# 成功连接
requests.get(url).status_code

200

In [7]:
headers = {'Accept-Language':'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3', 
           'User-Agent':'Mozilla/5.0 (Windows NT 6.1; rv:28.0) Gecko/20100101 Firefox/28.0',
           'Connection':'keep-alive'}

接下来就像一般的爬数据，不过这次练习用不同的function来实现。需要得到的数据：
- 视频页列表
- 视频页
    - 日期
    - 播放量、弹幕量、收藏量、分享量、评论量、硬币量、时间、充电量
    - 标签

声明：此文本有大量用于探索和练习的代码，并保留大量错误信息用于回顾。我的目标只有一个 - 得到我想要的所有内容，所有优化代码的内容均不在此文本中

## 探索

### 非function测试

In [8]:
# get url list
# 1st - get pager
response = requests.get(url, headers=headers)
list_soup = BS(response.content, 'html.parser')
list_soup

<!DOCTYPE html>

<html>
<head>
<meta charset="utf-8">
<meta content="webkit|ie-comp|ie-stand" name="renderer">
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible">
<title>冰冷之海的个人空间 - 哔哩哔哩 ( ゜- ゜)つロ 乾杯~ Bilibili</title>
<meta content="bilibili是国内知名的视频弹幕网站，这里有最及时的动漫新番，最棒的ACG氛围，最有创意的Up主。大家可以在这里找到许多欢乐。" name="description">
<meta content="B站,弹幕,字幕,AMV,MAD,MTV,ANIME,动漫,动漫音乐,游戏,游戏解说,ACG,galgame,动画,番组,新番,初音,洛天依,vocaloid" name="keywords">
<link href="//static.hdslb.com/css/core-v5/base.css" rel="stylesheet">
<style type="text/css">
  .space-seo {
    display: none;
  }
  
  .wrapper {
    width: 1100px;
    margin: 0 auto;
    position: relative;
  }
  
  #browser-version-tip {
    position: absolute;
    display: none;
    top: 42px;
    left: 0;
    z-index: 100;
    width: 100%;
    height: 40px;
    line-height: 40px;
    background-color: #e40c0c;
    text-align: center;
    font-size: 14px;
    color: #fff;
  }
  
  #browser-version-tip a {
    margin: 0 2px;
    text-decoration

我天真了，事实证明，根本不需要这样生爬。bilibili有很好的metadata，只需要找到相应的地址就可以得到完整的json file。

经过搜索与实验，得出以下步骤：
- 进入移动版的其视频页，本例为：http://space.bilibili.com/742470/mobile/video
- 开发者工具 -> Network -> XHR -> 得到获得视频信息的URL为：http://space.bilibili.com/ajax/member/getSubmitVideos?pagesize=8&page=2&mid=742470&pid=NaN&_=1489370576492
- _（取消md的italic）经过测试，我们可以删掉一些不需要的字段，仅仅需要page和mid：http://space.bilibili.com/ajax/member/getSubmitVideos?page=1&mid=742470
- 然而我们发现created并没有具体时间，我们还需要访问具体页面爬到时间
- ok，那首先得到所有aid，放入一个list，并以aid为名所dict

In [9]:
import json

In [10]:
# 都是bytes，decode('utf-8')没用，头疼..
url = 'http://space.bilibili.com/ajax/member/getSubmitVideos?page=1&mid=742470'
requests.get(url).content

b'{"status":true,"data":{"vlist":[{"aid":9137695,"copyright":"Original","typeid":65,"title":"\\u51b0\\u51b7\\u89e3\\u8bf4\\uff1a\\u201c\\u8352\\u7af9\\u53cd\\u51fb\\u6d41\\u201d\\u5355\\u4eba\\u6597\\u6280\\u5b9e\\u51b5\\uff08\\u6392\\u540d50-20\\uff09","subtitle":"","play":21441,"review":228,"video_review":819,"favorites":223,"mid":742470,"author":"","description":"\\u7af9\\u5b50\\u914d\\u8352\\uff0c\\u5929\\u4e0b\\u65e0\\u53cc\\uff01\\n6\\u661f\\u8352\\u5fa1\\u9b42\\u5c5e\\u6027\\u65b9\\u9762\\u4ee5\\u540e\\u4f1a\\u4f18\\u5316\\uff0c\\u4e0d\\u6392\\u9664\\u4ee5\\u540e\\u6362\\u9b4d\\u9b49\\u4e4b\\u7c7b\\u63a7\\u5236\\u5fa1\\u9b42\\u7684\\u53ef\\u80fd\\uff0c\\u4f46\\u662f\\u8fd8\\u9700\\u8981\\u8fdb\\u4e00\\u6b65\\u6d4b\\u8bd5\\u3002\\n\\u53e6\\u5916\\u7af9\\u5b50\\u3001\\u6912\\u56fe\\u3001\\u96e8\\u5973\\u90fd\\u9700\\u8981\\u53476\\u661f\\u3002\\u8be5\\u9635\\u5bb9\\u5982\\u679c\\u9700\\u8981\\u673a\\u52a8\\u6838\\u5fc3\\uff0c\\u9570\\u9f2c\\u4e5f\\u662f\\u4e2a\\u4e0d\\u9519\\u7684

In [11]:
# 好吧，不需要解码
"\u51b0\u51b7\u89e3\u8bf4\uff1a\u9634\u9633\u5e08\u5168\u5f0f\u795e\u4ecb\u7ecd\u4e4b\u96ea\u5973"

'冰冷解说：阴阳师全式神介绍之雪女'

In [12]:
# 看来需要用re了
requests.get(url).content[80:240]

b'5,"title":"\\u51b0\\u51b7\\u89e3\\u8bf4\\uff1a\\u201c\\u8352\\u7af9\\u53cd\\u51fb\\u6d41\\u201d\\u5355\\u4eba\\u6597\\u6280\\u5b9e\\u51b5\\uff08\\u6392\\u540d50-20\\uff09","subtitle"'

In [13]:
"\u51b0\\u51b7\\u89e3\\u8bf4\\uff1a\\u201c\\u8352\\u7af9\\u53cd\\u51fb\\u6d41\\u201d\\u5355\\u4eba\\u6597\\u6280\\u5b9e\\u51b5\\uff08\\u6392\\u540d50-20\\uff09"

'冰\\u51b7\\u89e3\\u8bf4\\uff1a\\u201c\\u8352\\u7af9\\u53cd\\u51fb\\u6d41\\u201d\\u5355\\u4eba\\u6597\\u6280\\u5b9e\\u51b5\\uff08\\u6392\\u540d50-20\\uff09'

In [14]:
str(requests.get(url).content).decode('string_escape')

AttributeError: 'str' object has no attribute 'decode'

In [15]:
# 尝试codecs
import codecs
codecs.decode('\\u51b7\\u89e3\\u8bf4\\uff1a\\u201c\\u8352\\u7af9\\u53cd\\u51fb\\u6d41\\u201d\\u5355\\u4eba\\u6597\\u6280\\u5b9e\\u51b5\\uff08\\u6392\\u540d50-20\\uff09', 'unicode_escape')

'冷解说：“荒竹反击流”单人斗技实况（排名50-20）'

In [16]:
type('\\u51b7\\u89e3\\u8bf4\\uff1a\\u201c\\u8352\\u7af9\\u53cd\\u51fb\\u6d41\\u201d\\u5355\\u4eba\\u6597\\u6280\\u5b9e\\u51b5\\uff08\\u6392\\u540d50-20\\uff09')

str

In [20]:
# 终于... 通过使用codecs的 .decode(xxx, 'unicode_escape')
testre = codecs.decode(requests.get(url).content, 'unicode_escape')
testre

'{"status":true,"data":{"vlist":[{"aid":9137695,"copyright":"Original","typeid":65,"title":"冰冷解说：“荒竹反击流”单人斗技实况（排名50-20）","subtitle":"","play":21441,"review":228,"video_review":819,"favorites":223,"mid":742470,"author":"","description":"竹子配荒，天下无双！\n6星荒御魂属性方面以后会优化，不排除以后换魍魉之类控制御魂的可能，但是还需要进一步测试。\n另外竹子、椒图、雨女都需要升6星。该阵容如果需要机动核心，镰鼬也是个不错的选择。\n新浪微博：@冰冷之海解说\n粉丝群：\n1群：121951577\n2群：143191644\n3群：10825535\n4群：49057308\n5群：175953348\n6群：583401101\n（虽然6群马上满了，但是暂时不打算开新群，因为1-5群现在有很多空位） \u200b\u200b\u200b\u200b","created":1489375048,"pic":"http:\\/\\/i0.hdslb.com\\/bfs\\/archive\\/46ac899c49fb7e7ede627db415c112df1a4fe89f.jpg","comment":819,"length":"64:48","hide_click":false},{"aid":9122374,"copyright":"Original","typeid":65,"title":"冰冷解说：“荒竹反击流”协同斗技首秀","subtitle":"","play":12854,"review":119,"video_review":451,"favorites":106,"mid":742470,"author":"","description":"我们的口号是“竹子配荒，天下无双！”\n然后协同各种被吊打，打不带控的速控还凑合……\n而协同斗技效果不好，太怕返魂香和兵勇了。而且没有办法用晴明和妖琴师。\n晚上我斗技再试试，我自己的椒图也5星了。o(∩_∩)o \n直播地址：http:\\/\\/cc.163.com\\/

In [18]:
import re

In [42]:
m = re.search(r'title', testre)
m.groups()

()

In [32]:
len(testre)

9640

In [68]:
# 好吧 要用findall
print(re.findall('新浪',testre))

['新浪', '新浪', '新浪', '新浪', '新浪', '新浪', '新浪', '新浪', '新浪', '新浪', '新浪', '新浪', '新浪', '新浪']


In [124]:
re.findall('title":"(.*)","sub', testre)

['冰冷解说：“荒竹反击流”单人斗技实况（排名50-20）',
 '冰冷解说：“荒竹反击流”协同斗技首秀',
 '【阴阳师】针女荒VS石踞\\/年兽+万年竹全传记！',
 '【体验服快报】第005期：荒\\/万年竹\\/新版荒川\\/樱花妖等',
 '冰冷解说：妇女节特辑！全女式神斗技实况（孟婆、烟烟罗等）',
 '冰冷解说：阴阳师视频攻略031 椒图反击流之酒吞队',
 '冰冷解说：阴阳师黑科技005 万箭穿心流',
 '冰冷解说：体验服六星河童斗技首秀o(∩_∩)o',
 '【体验服快报】第004期：河畔童谣副本通关实况',
 '冰冷解说：“三琴余音流”斗技实况教学',
 '冰冷解说：阴阳师全式神介绍之孟婆',
 '冰冷解说：阴阳师全式神介绍之雨女',
 '【体验服快报】第003期：卖御魂\\/古笼火加强\\/雨女加强\\/魅妖加强',
 '【FGO】萌新100抽测试一下血统！","subtitle":"","play":27149,"review":591,"video_review":690,"favorites":42,"mid":742470,"author":"","description":"有没有老司机给我讲讲，我抽的这些东西厉不厉害？","created":1487231913,"pic":"http:\\/\\/i0.hdslb.com\\/bfs\\/archive\\/36426d548f42d692b1bc914b64d22839db43335c.jpg","comment":690,"length":"07:37","hide_click":false},{"aid":8627224,"copyright":"Original","typeid":65,"title":"冰冷解说：阴阳师全式神介绍之雪女',
 '冰冷解说：阴阳师全式神介绍之妖琴师',
 '冰冷解说：后手肉控流大战般若速控队（六星管狐+烟烟罗）',
 '冰冷解说：阴阳师全式神介绍之烟烟罗',
 '冰冷解说：阴阳师视频攻略030 鸡肋御魂变废为宝',
 '冰冷解说：阴阳师全式神介绍之黑童子']

In [135]:
len(re.findall('title":"(.*)","sub', testre))

19

In [85]:
"（）" == "()"

False

In [86]:
"o(∩_∩)o"

'o(∩_∩)o'

In [87]:
"∩" == "n"

False

In [97]:
'-' == '-'

True

In [126]:
# 终于，设一个范围
re.findall('title":"(.{1,50})","sub', testre)

['冰冷解说：“荒竹反击流”单人斗技实况（排名50-20）',
 '冰冷解说：“荒竹反击流”协同斗技首秀',
 '【阴阳师】针女荒VS石踞\\/年兽+万年竹全传记！',
 '【体验服快报】第005期：荒\\/万年竹\\/新版荒川\\/樱花妖等',
 '冰冷解说：妇女节特辑！全女式神斗技实况（孟婆、烟烟罗等）',
 '冰冷解说：阴阳师视频攻略031 椒图反击流之酒吞队',
 '冰冷解说：阴阳师黑科技005 万箭穿心流',
 '冰冷解说：体验服六星河童斗技首秀o(∩_∩)o',
 '【体验服快报】第004期：河畔童谣副本通关实况',
 '冰冷解说：“三琴余音流”斗技实况教学',
 '冰冷解说：阴阳师全式神介绍之孟婆',
 '冰冷解说：阴阳师全式神介绍之雨女',
 '【体验服快报】第003期：卖御魂\\/古笼火加强\\/雨女加强\\/魅妖加强',
 '【FGO】萌新100抽测试一下血统！',
 '冰冷解说：阴阳师全式神介绍之雪女',
 '冰冷解说：阴阳师全式神介绍之妖琴师',
 '冰冷解说：后手肉控流大战般若速控队（六星管狐+烟烟罗）',
 '冰冷解说：阴阳师全式神介绍之烟烟罗',
 '冰冷解说：阴阳师视频攻略030 鸡肋御魂变废为宝',
 '冰冷解说：阴阳师全式神介绍之黑童子']

In [134]:
# 如果在 findall 里加入两个或两个以上的()，返回的是一个list里的多个不可更改的 tuple
print(re.findall('title":"(.{1,50})","(sub)', testre),'\n')
print(re.findall('title":"(.{1,50})","(sub)', testre)[0],'\n')
print(re.findall('title":"(.{1,50})","(sub)', testre)[0][1])

[('冰冷解说：“荒竹反击流”单人斗技实况（排名50-20）', 'sub'), ('冰冷解说：“荒竹反击流”协同斗技首秀', 'sub'), ('【阴阳师】针女荒VS石踞\\/年兽+万年竹全传记！', 'sub'), ('【体验服快报】第005期：荒\\/万年竹\\/新版荒川\\/樱花妖等', 'sub'), ('冰冷解说：妇女节特辑！全女式神斗技实况（孟婆、烟烟罗等）', 'sub'), ('冰冷解说：阴阳师视频攻略031 椒图反击流之酒吞队', 'sub'), ('冰冷解说：阴阳师黑科技005 万箭穿心流', 'sub'), ('冰冷解说：体验服六星河童斗技首秀o(∩_∩)o', 'sub'), ('【体验服快报】第004期：河畔童谣副本通关实况', 'sub'), ('冰冷解说：“三琴余音流”斗技实况教学', 'sub'), ('冰冷解说：阴阳师全式神介绍之孟婆', 'sub'), ('冰冷解说：阴阳师全式神介绍之雨女', 'sub'), ('【体验服快报】第003期：卖御魂\\/古笼火加强\\/雨女加强\\/魅妖加强', 'sub'), ('【FGO】萌新100抽测试一下血统！', 'sub'), ('冰冷解说：阴阳师全式神介绍之雪女', 'sub'), ('冰冷解说：阴阳师全式神介绍之妖琴师', 'sub'), ('冰冷解说：后手肉控流大战般若速控队（六星管狐+烟烟罗）', 'sub'), ('冰冷解说：阴阳师全式神介绍之烟烟罗', 'sub'), ('冰冷解说：阴阳师视频攻略030 鸡肋御魂变废为宝', 'sub'), ('冰冷解说：阴阳师全式神介绍之黑童子', 'sub')] 

('冰冷解说：“荒竹反击流”单人斗技实况（排名50-20）', 'sub') 

sub


## 新三个爬虫设计

我们争取一次爬取一个 aid 的所有需要信息。

爬取信息（可在以上地址得到的）:
    - aid
    - title
    - play = view
    - review = reply
    - video_review = danmaku - 弹幕
    - favorites
    - comment
    - length
爬取信息（在http://api.bilibili.com/archive_stat/stat?aid=? 只需添加aid得到的）：
    - coin
爬取信息（在视频页上得到的）：
    - 视频地址
    - 发布时间

### 爬取信息一：spider1

In [153]:
# 最长的 description 大概231
len( "竹子配荒，天下无双！\n6星荒御魂属性方面以后会优化，不排除以后换魍魉之类控制御魂的可能，但是还需要进一步测试。\n另外竹子、椒图、雨女都需要升6星。该阵容如果需要机动核心，镰鼬也是个不错的选择。\n新浪微博：@冰冷之海解说\n粉丝群：\n1群：121951577\n2群：143191644\n3群：10825535\n4群：49057308\n5群：175953348\n6群：583401101\n（虽然6群马上满了，但是暂时不打算开新群，因为1-5群现在有很多空位） ​​​​")

231

In [160]:
# description 太长，干扰我们爬取信息，这里分成两部分：description之前和之后
# description 之前
re.findall('aid":(\d+).{1,50}title":"(.{1,50})","sub.{1,50}play":(\d+),"review":(\d+),"video_review":(\d+),"favorites":(\d+)', testre)

[('9137695', '冰冷解说：“荒竹反击流”单人斗技实况（排名50-20）', '21441', '228', '819', '223'),
 ('9122374', '冰冷解说：“荒竹反击流”协同斗技首秀', '12854', '119', '451', '106'),
 ('9109711', '【阴阳师】针女荒VS石踞\\/年兽+万年竹全传记！', '92013', '737', '804', '955'),
 ('9071355',
  '【体验服快报】第005期：荒\\/万年竹\\/新版荒川\\/樱花妖等',
  '55906',
  '846',
  '3019',
  '482'),
 ('9047855', '冰冷解说：妇女节特辑！全女式神斗技实况（孟婆、烟烟罗等）', '22153', '148', '603', '143'),
 ('9014983', '冰冷解说：阴阳师视频攻略031 椒图反击流之酒吞队', '27721', '308', '914', '425'),
 ('8941842', '冰冷解说：阴阳师黑科技005 万箭穿心流', '51900', '271', '1042', '451'),
 ('8850760', '冰冷解说：体验服六星河童斗技首秀o(∩_∩)o', '56255', '281', '1167', '230'),
 ('8803411', '【体验服快报】第004期：河畔童谣副本通关实况', '39754', '336', '1699', '206'),
 ('8778401', '冰冷解说：“三琴余音流”斗技实况教学', '45858', '429', '1649', '504'),
 ('8749188', '冰冷解说：阴阳师全式神介绍之孟婆', '15719', '250', '241', '124'),
 ('8688869', '冰冷解说：阴阳师全式神介绍之雨女', '21417', '308', '439', '167'),
 ('8665552',
  '【体验服快报】第003期：卖御魂\\/古笼火加强\\/雨女加强\\/魅妖加强',
  '51831',
  '379',
  '1665',
  '258'),
 ('8648632', '【FGO】萌新100抽测试一下血统！', '271

In [161]:
# description 之后
re.findall('comment":(\d+),"length":"([\d:]+)', testre)

[('819', '64:48'),
 ('451', '92:58'),
 ('804', '07:21'),
 ('3019', '41:55'),
 ('603', '70:00'),
 ('914', '109:19'),
 ('1042', '60:26'),
 ('1167', '71:15'),
 ('1699', '54:57'),
 ('1649', '89:02'),
 ('241', '16:08'),
 ('439', '20:06'),
 ('1665', '45:02'),
 ('690', '07:37'),
 ('715', '31:03'),
 ('795', '17:17'),
 ('1266', '73:27'),
 ('554', '15:23'),
 ('3020', '67:20'),
 ('1054', '14:59')]

In [172]:
# url_list 用于 得到所有的 url
url

'http://space.bilibili.com/ajax/member/getSubmitVideos?page=1&mid=742470'

In [174]:
no_page = int(re.findall('pages":(\d+)', str(requests.get(url).content))[0])

In [175]:
url_list = []
for i in range(1, no_page+1):
    url_list.append('http://space.bilibili.com/ajax/member/getSubmitVideos?page=%d&mid=742470' % i)

In [176]:
url_list

['http://space.bilibili.com/ajax/member/getSubmitVideos?page=1&mid=742470',
 'http://space.bilibili.com/ajax/member/getSubmitVideos?page=2&mid=742470',
 'http://space.bilibili.com/ajax/member/getSubmitVideos?page=3&mid=742470',
 'http://space.bilibili.com/ajax/member/getSubmitVideos?page=4&mid=742470',
 'http://space.bilibili.com/ajax/member/getSubmitVideos?page=5&mid=742470',
 'http://space.bilibili.com/ajax/member/getSubmitVideos?page=6&mid=742470',
 'http://space.bilibili.com/ajax/member/getSubmitVideos?page=7&mid=742470',
 'http://space.bilibili.com/ajax/member/getSubmitVideos?page=8&mid=742470']

In [228]:
# 做一个 list，里面包含诸多 dict，每一个 dict内包含所有的其他属性
videos = []
index = 0
for url in url_list:
    url_page = codecs.decode(requests.get(url).content, 'unicode_escape')
    spider1_1 = re.findall('aid":(\d+).{1,50}title":"(.{1,50})","sub.{1,50}play":(\d+),"review":(\d+),"video_review":(\d+),"favorites":(\d+)', url_page)
    spider1_2 = re.findall('comment":(\d+),"length":"([\d:]+)', url_page)    
    spider1_2_index = 0 # 为 spider_1_2做 index，每次大循环后重置
    for tuples in spider1_1:
        videos.append(dict())
        videos[index]['aid'] = tuples[0]
        videos[index]['title'] = tuples[1]
        videos[index]['play'] = tuples[2]
        videos[index]['review'] = tuples[3]
        videos[index]['video_review'] = tuples[4]
        videos[index]['favorites'] = tuples[5]
        videos[index]['comment'] = spider1_2[spider1_2_index][0] 
        videos[index]['length'] = spider1_2[spider1_2_index][1]
        index += 1
        spider1_2_index += 1
videos

[{'aid': '9137695',
  'comment': '881',
  'favorites': '239',
  'length': '64:48',
  'play': '23795',
  'review': '243',
  'title': '冰冷解说：“荒竹反击流”单人斗技实况（排名50-20）',
  'video_review': '881'},
 {'aid': '9122374',
  'comment': '512',
  'favorites': '117',
  'length': '92:58',
  'play': '14183',
  'review': '122',
  'title': '冰冷解说：“荒竹反击流”协同斗技首秀',
  'video_review': '512'},
 {'aid': '9109711',
  'comment': '822',
  'favorites': '986',
  'length': '07:21',
  'play': '96186',
  'review': '743',
  'title': '【阴阳师】针女荒VS石踞\\/年兽+万年竹全传记！',
  'video_review': '822'},
 {'aid': '9071355',
  'comment': '3027',
  'favorites': '482',
  'length': '41:55',
  'play': '56359',
  'review': '846',
  'title': '【体验服快报】第005期：荒\\/万年竹\\/新版荒川\\/樱花妖等',
  'video_review': '3027'},
 {'aid': '9047855',
  'comment': '605',
  'favorites': '144',
  'length': '70:00',
  'play': '22233',
  'review': '149',
  'title': '冰冷解说：妇女节特辑！全女式神斗技实况（孟婆、烟烟罗等）',
  'video_review': '605'},
 {'aid': '9014983',
  'comment': '915',
  'favorites': '

In [230]:
# 长度正确
len(videos)

146

In [244]:
# 清理 title里的'\\'
for i in videos:
    i['title'] = i['title'].replace('\\/','/')

In [245]:
# spider1成功
videos

[{'aid': '9137695',
  'comment': '881',
  'favorites': '239',
  'length': '64:48',
  'play': '23795',
  'review': '243',
  'title': '冰冷解说：“荒竹反击流”单人斗技实况（排名50-20）',
  'video_review': '881'},
 {'aid': '9122374',
  'comment': '512',
  'favorites': '117',
  'length': '92:58',
  'play': '14183',
  'review': '122',
  'title': '冰冷解说：“荒竹反击流”协同斗技首秀',
  'video_review': '512'},
 {'aid': '9109711',
  'comment': '822',
  'favorites': '986',
  'length': '07:21',
  'play': '96186',
  'review': '743',
  'title': '【阴阳师】针女荒VS石踞/年兽+万年竹全传记！',
  'video_review': '822'},
 {'aid': '9071355',
  'comment': '3027',
  'favorites': '482',
  'length': '41:55',
  'play': '56359',
  'review': '846',
  'title': '【体验服快报】第005期：荒/万年竹/新版荒川/樱花妖等',
  'video_review': '3027'},
 {'aid': '9047855',
  'comment': '605',
  'favorites': '144',
  'length': '70:00',
  'play': '22233',
  'review': '149',
  'title': '冰冷解说：妇女节特辑！全女式神斗技实况（孟婆、烟烟罗等）',
  'video_review': '605'},
 {'aid': '9014983',
  'comment': '915',
  'favorites': '426',
  

In [246]:
# 要注意 dict里所有数字都是 str
type(videos[0]['aid'])

str

### 爬取信息二：spider2

In [247]:
# 提取 videos里的信息，得到 aid
# 之后进入新网站提取 coin
# 添加 coin到相应的 dict
spider2_url = 'http://api.bilibili.com/archive_stat/stat?aid='

In [250]:
# 查看新网站结构，以 aid=9109711 测试
print(requests.get(spider2_url + str(9109711)).content)
print(type(requests.get(spider2_url + str(9109711)).content))

b'{"code":0,"data":{"view":96228,"danmaku":822,"reply":744,"favorite":986,"coin":681,"share":887,"now_rank":0,"his_rank":0},"message":""}'
<class 'bytes'>


In [254]:
# 得到 coin信息
print(re.findall('coin":(\d+)', str(requests.get(spider2_url + str(9109711)).content))[0])
print(type(re.findall('coin":(\d+)', str(requests.get(spider2_url + str(9109711)).content))[0]))

681
<class 'str'>


In [255]:
# 构建 spider2
for i in videos:
    aid = i['aid']
    url = spider2_url + aid # 因为 aid是 str，所以不需要 convert
    coin = re.findall('coin":(\d+)', str(requests.get(url).content))[0]
    i['coin'] = coin
videos[:2]

[{'aid': '9137695',
  'coin': '461',
  'comment': '881',
  'favorites': '239',
  'length': '64:48',
  'play': '23795',
  'review': '243',
  'title': '冰冷解说：“荒竹反击流”单人斗技实况（排名50-20）',
  'video_review': '881'},
 {'aid': '9122374',
  'coin': '199',
  'comment': '512',
  'favorites': '117',
  'length': '92:58',
  'play': '14183',
  'review': '122',
  'title': '冰冷解说：“荒竹反击流”协同斗技首秀',
  'video_review': '512'}]

In [256]:
# spider2成功
len(videos)

146

### 爬取信息三：spider3

In [257]:
# 同样要使用 aid信息
# 进入 url
# 用 BS得到日期和时间（考虑到以后可能会用到具体时间）
# 添加到 videos
spider3_url = 'http://www.bilibili.com/video/av'

In [260]:
# 以 aid=9109711 测试
test_time = BS(requests.get(spider3_url + str(9109711)).content, 'html.parser').find_all('time')[0]

In [264]:
print(test_time.get_text())
print(type(test_time.get_text()))
print(test_time.get_text().split(' '))

2017-03-12 03:41
<class 'str'>
['2017-03-12', '03:41']


In [271]:
# 测试成功，开始
for i in videos:
    aid = i['aid']
    url = spider3_url + aid
    v_d_t = BS(requests.get(url).content, 'html.parser').find('time').get_text().split(' ')
    i['date'] = v_d_t[0]
    i['time'] = v_d_t[1]
    i['url'] = url
videos[:2]      

[{'aid': '9137695',
  'coin': '461',
  'comment': '881',
  'date': '2017-03-13',
  'favorites': '239',
  'length': '64:48',
  'play': '23795',
  'review': '243',
  'time': '11:17',
  'title': '冰冷解说：“荒竹反击流”单人斗技实况（排名50-20）',
  'url': 'http://www.bilibili.com/video/av9137695',
  'video_review': '881'},
 {'aid': '9122374',
  'coin': '199',
  'comment': '512',
  'date': '2017-03-13',
  'favorites': '117',
  'length': '92:58',
  'play': '14183',
  'review': '122',
  'time': '10:57',
  'title': '冰冷解说：“荒竹反击流”协同斗技首秀',
  'url': 'http://www.bilibili.com/video/av9122374',
  'video_review': '512'}]

In [272]:
# spider3成功
len(videos)

146

## 用Pandas制表

### 制作与清理表格

In [277]:
# to DataFrame
df_videos = pd.DataFrame(videos)
df_videos.head()

Unnamed: 0,aid,coin,comment,date,favorites,length,play,review,time,title,url,video_review
0,9137695,461,881,2017-03-13,239,64:48,23795,243,11:17,冰冷解说：“荒竹反击流”单人斗技实况（排名50-20）,http://www.bilibili.com/video/av9137695,881
1,9122374,199,512,2017-03-13,117,92:58,14183,122,10:57,冰冷解说：“荒竹反击流”协同斗技首秀,http://www.bilibili.com/video/av9122374,512
2,9109711,681,822,2017-03-12,986,07:21,96186,743,03:41,【阴阳师】针女荒VS石踞/年兽+万年竹全传记！,http://www.bilibili.com/video/av9109711,822
3,9071355,3763,3027,2017-03-10,482,41:55,56359,846,14:21,【体验服快报】第005期：荒/万年竹/新版荒川/樱花妖等,http://www.bilibili.com/video/av9071355,3027
4,9047855,295,605,2017-03-09,144,70:00,22233,149,02:12,冰冷解说：妇女节特辑！全女式神斗技实况（孟婆、烟烟罗等）,http://www.bilibili.com/video/av9047855,605


In [282]:
# 改变 columns顺序
df_videos = df_videos[['aid','title', 'url', 'date','time','length', 'play', 'video_review', 'review', 'comment', 'favorites', 'coin']]
df_videos.head()

Unnamed: 0,aid,title,url,date,time,length,play,video_review,review,comment,favorites,coin
0,9137695,冰冷解说：“荒竹反击流”单人斗技实况（排名50-20）,http://www.bilibili.com/video/av9137695,2017-03-13,11:17,64:48,23795,881,243,881,239,461
1,9122374,冰冷解说：“荒竹反击流”协同斗技首秀,http://www.bilibili.com/video/av9122374,2017-03-13,10:57,92:58,14183,512,122,512,117,199
2,9109711,【阴阳师】针女荒VS石踞/年兽+万年竹全传记！,http://www.bilibili.com/video/av9109711,2017-03-12,03:41,07:21,96186,822,743,822,986,681
3,9071355,【体验服快报】第005期：荒/万年竹/新版荒川/樱花妖等,http://www.bilibili.com/video/av9071355,2017-03-10,14:21,41:55,56359,3027,846,3027,482,3763
4,9047855,冰冷解说：妇女节特辑！全女式神斗技实况（孟婆、烟烟罗等）,http://www.bilibili.com/video/av9047855,2017-03-09,02:12,70:00,22233,605,149,605,144,295


In [284]:
# 等下， review和 comment什么关系？
# 好吧.. video_review和 comment是一个东西，那 b站为什么把它们放到一个 api页面里
# rename 'video_review' to 'danmaku'
df_videos.rename(columns={'video_review':'danmaku'}, inplace=True)

In [286]:
# drop 'comment'
df_videos.drop('comment', axis=1, inplace=True)
df_videos.head()

Unnamed: 0,aid,title,url,date,time,length,play,danmaku,review,favorites,coin
0,9137695,冰冷解说：“荒竹反击流”单人斗技实况（排名50-20）,http://www.bilibili.com/video/av9137695,2017-03-13,11:17,64:48,23795,881,243,239,461
1,9122374,冰冷解说：“荒竹反击流”协同斗技首秀,http://www.bilibili.com/video/av9122374,2017-03-13,10:57,92:58,14183,512,122,117,199
2,9109711,【阴阳师】针女荒VS石踞/年兽+万年竹全传记！,http://www.bilibili.com/video/av9109711,2017-03-12,03:41,07:21,96186,822,743,986,681
3,9071355,【体验服快报】第005期：荒/万年竹/新版荒川/樱花妖等,http://www.bilibili.com/video/av9071355,2017-03-10,14:21,41:55,56359,3027,846,482,3763
4,9047855,冰冷解说：妇女节特辑！全女式神斗技实况（孟婆、烟烟罗等）,http://www.bilibili.com/video/av9047855,2017-03-09,02:12,70:00,22233,605,149,144,295


### 导出表格

In [283]:
df_videos.to_csv('冰冷之海数据.csv')