# <center>bilibili spider - clean version</center>
This spider is an upgraded version of the \_explore notebook. I have tried to simplify the code and everything else in this version. 

The structure of this notebook is as follows:  
1. Define Functions
2. Test
3. Run All  

You can run this program by calling the functions separately, as I did in "Test", or you can run them all with a single input - the user id. After approximately 2:30 minutes, a csv file containing all video information for the specified user id should appear in your current directory. It should look like this:  
![sample image](https://raw.githubusercontent.com/estepona/Python_Spiders/master/1%20bilibili/sample_image.png)  
__Notice__: if the 'title' column doesn't look like this, Windows users should open the file in Notepad first, save it, close it, and then open it in Excel; for Mac users, TextWrangler works fine. I made sure the .csv is encoded as 'utf-8', but Excel does not always display it correctly. Please tell me if you know how to fix this so that I can improve this spider!  

___

## Define Functions

Needed functions:
- get url list
- get info
    - spider1 + spider2 + spider3
- to pandas
    - clean it
- export to csv

In [1]:
import requests
from bs4 import BeautifulSoup as BS
import re
import codecs
import pandas as pd
from datetime import datetime

In [2]:
# get url_list
def get_url_list(mid):
    """Build the URL of every listing page of a user's submitted videos.

    The first listing page is requested once to read the total number of
    pages; one URL per page is then generated.

    Parameters
    ----------
    mid : str or int
        The bilibili user id. It is interpolated into the URL with ``%s``.

    Returns
    -------
    list of str
        Listing-page URLs for pages 1..N, in page order.

    Raises
    ------
    IndexError
        If the response contains no ``pages":<number>`` field.
    """
    base_url = 'http://space.bilibili.com/ajax/member/getSubmitVideos?page='
    first_page = requests.get(base_url + '1&mid=%s' % mid).text
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    page_counts = re.findall(r'pages":(\d+)', first_page)
    if not page_counts:
        raise IndexError('no "pages" field in the listing response for mid=%s' % mid)
    # As before, the last match wins if the field occurs more than once.
    no_page = int(page_counts[-1])
    return [base_url + '%d&mid=%s' % (i, mid) for i in range(1, no_page + 1)]

In [3]:
# get info
def get_info(url_list):
    """Collect statistics for every video found on the given listing pages.

    Three passes ("spiders") are run in order:

    1. each listing page in ``url_list`` is fetched and aid, title, play,
       review, danmaku, favorites and length are extracted with regexes;
    2. the coin count of each video is read from the archive_stat API;
    3. the upload date and time are read from the ``<time>`` element of each
       video's web page.

    Progress is printed after every video and every page.

    Parameters
    ----------
    url_list : list of str
        Listing-page URLs, e.g. the return value of ``get_url_list``. The
        first URL is also used to read the total video count shown in the
        progress messages, so the function does not depend on any global.

    Returns
    -------
    list of dict
        One dict per video with the keys 'aid', 'title', 'play', 'review',
        'danmaku', 'favorites', 'length', 'coin', 'date', 'time' and 'url'.
        All values are strings; 'date' and 'time' are '' when they could not
        be read from the video page.

    Raises
    ------
    IndexError
        If a listing page yields fewer ``length`` fields than videos, or if
        the archive_stat response of a video contains no ``coin`` field.
    """
    ## spider 1/3
    print('initiating spider 1/3...')
    videos = []
    # The total is only used in progress messages. It is read from the first
    # listing page rather than from a URL rebuilt out of a global ``mid``.
    if url_list:
        counts = re.findall(r'count":(\d+)', requests.get(url_list[0]).text)
        total_videos = counts[-1] if counts else '?'
    else:
        total_videos = '0'
    for page_no, url in enumerate(url_list, start=1):
        # The listing escapes non-ASCII characters as \uXXXX; decode them.
        url_page = codecs.decode(requests.get(url).content, 'unicode_escape')
        spider1_1 = re.findall(r'aid":(\d+).{1,50}title":"(.{1,80})","sub.{1,50}play":(\d+),"review":(\d+),"video_review":(\d+),"favorites":(\d+)', url_page)
        spider1_2 = re.findall(r'length":"([\d:]+)', url_page)
        # spider1_2 is matched separately; pair the two result lists by position.
        for position, fields in enumerate(spider1_1):
            aid, title, play, review, danmaku, favorites = fields
            videos.append({
                'aid': aid,
                # '/' arrives escaped as '\/'; undo that
                'title': title.replace('\\/', '/'),
                'play': play,
                'review': review,
                'danmaku': danmaku,
                'favorites': favorites,
                'length': spider1_2[position],
            })
            print(str(len(videos)) + '/' + total_videos + ' information collected from ' + url)
        print('page' + str(page_no) + 'completed')
    print('spider 1/3 finished')
    ## spider 2/3
    print('initiating spider 2/3...')
    spider2_url = 'http://api.bilibili.com/archive_stat/stat?aid='
    for number, video in enumerate(videos, start=1):
        url = spider2_url + video['aid']  # aid is already a string
        video['coin'] = re.findall(r'coin":(\d+)', requests.get(url).text)[0]
        print(str(number) + '/' + total_videos + ' information collected from ' + url)
    print('spider 2/3 finished')
    ## spider 3/3
    print('initiating spider 3/3...')
    spider3_url = 'http://www.bilibili.com/video/av'
    for number, video in enumerate(videos, start=1):
        url = spider3_url + video['aid']
        video['url'] = url
        try:
            v_d_t = BS(requests.get(url).content, 'html.parser').find('time').get_text().split(' ')
            video['date'], video['time'] = v_d_t[0], v_d_t[1]
        except (AttributeError, IndexError):
            # AttributeError: the page has no <time> element (find() gave None).
            # IndexError: the element's text has no space-separated time part.
            print('an error has occured here')
            video['date'] = ''
            video['time'] = ''
        print(str(number) + '/' + total_videos + ' information collected from ' + url)
    print('spider 3/3 finished')
    ## finish
    return videos

In [4]:
# put into pandas
def into_pandas(videos):
    """Turn the list of video dicts into a typed, column-ordered DataFrame.

    Parameters
    ----------
    videos : list of dict
        Records as returned by ``get_info``; every record must provide the
        keys listed in ``column_order`` below.

    Returns
    -------
    pandas.DataFrame
        Columns in the order aid, title, url, date, time, length, play,
        danmaku, review, favorites, coin. The five counter columns (play,
        danmaku, review, favorites, coin) are converted to int; the others
        are left as they came in. An empty ``videos`` list gives an empty
        frame with the same columns.

    Raises
    ------
    KeyError
        If ``videos`` is non-empty and one of the expected keys is missing
        from all records.
    ValueError
        If a counter value cannot be converted to int.
    """
    column_order = ['aid', 'title', 'url', 'date', 'time', 'length',
                    'play', 'danmaku', 'review', 'favorites', 'coin']
    counter_columns = ['play', 'danmaku', 'review', 'favorites', 'coin']
    if videos:
        # change columns order
        df_videos = pd.DataFrame(videos)[column_order]
    else:
        # pd.DataFrame([]) has no columns at all, so name them explicitly
        df_videos = pd.DataFrame(columns=column_order)
    # change object type to int; a single astype() returns a new frame, which
    # avoids assigning column by column into a column-subset frame
    df_videos = df_videos.astype({name: int for name in counter_columns})
    print('Success putting data into Pandas')
    return df_videos

In [5]:
# export to csv file
# notice: the file is written as UTF-8 with a byte-order mark ('utf-8-sig') so
# that Excel recognises the encoding and shows non-ASCII titles correctly
def export_csv(df, user_id=None):
    """Write ``df`` to ``stat_<user id>_<timestamp>.csv`` in the current directory.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to export (its index is written as the first column).
    user_id : str or int, optional
        The id used in the file name. When omitted, the module-level ``mid``
        is used, which keeps existing ``export_csv(df)`` calls working.

    Raises
    ------
    NameError
        If ``user_id`` is omitted and no module-level ``mid`` is defined.
    """
    if user_id is None:
        user_id = mid
    # Sample the clock once so the date and time parts cannot disagree.
    # The format is YYYY-MM-DD_HH'MM'SS, apostrophes included.
    timestamp = datetime.now().strftime("%Y-%m-%d_%H'%M'%S")
    # export
    filename = 'stat_%s_%s.csv' % (user_id, timestamp)
    df.to_csv(filename, encoding='utf-8-sig')

## Test
Press Shift + Enter to run each cell yourself

In [6]:
# The only input you need to define: the bilibili user id, given as a string.
# export_csv() reads this module-level name when it builds the output file name.
# This example id belongs to the uploader "Virgoo Team".
mid = '16693558'

In [7]:
# Build one listing-page URL per page of this user's submitted videos and show them.
url_list = get_url_list(mid)
print(url_list)

['http://space.bilibili.com/ajax/member/getSubmitVideos?page=1&mid=16693558', 'http://space.bilibili.com/ajax/member/getSubmitVideos?page=2&mid=16693558', 'http://space.bilibili.com/ajax/member/getSubmitVideos?page=3&mid=16693558', 'http://space.bilibili.com/ajax/member/getSubmitVideos?page=4&mid=16693558', 'http://space.bilibili.com/ajax/member/getSubmitVideos?page=5&mid=16693558']


In [8]:
# Run the three spiders over every listing page. This issues HTTP requests for
# each page and for each video, and prints progress as it goes.
videos = get_info(url_list)
# Sanity check: how many videos were collected, and what one record looks like.
print('list length: ', len(videos))
print(videos[0])

initiating spider 1/3...




1/88 information collected from http://space.bilibili.com/ajax/member/getSubmitVideos?page=1&mid=16693558
2/88 information collected from http://space.bilibili.com/ajax/member/getSubmitVideos?page=1&mid=16693558
3/88 information collected from http://space.bilibili.com/ajax/member/getSubmitVideos?page=1&mid=16693558
4/88 information collected from http://space.bilibili.com/ajax/member/getSubmitVideos?page=1&mid=16693558
5/88 information collected from http://space.bilibili.com/ajax/member/getSubmitVideos?page=1&mid=16693558
6/88 information collected from http://space.bilibili.com/ajax/member/getSubmitVideos?page=1&mid=16693558
7/88 information collected from http://space.bilibili.com/ajax/member/getSubmitVideos?page=1&mid=16693558
8/88 information collected from http://space.bilibili.com/ajax/member/getSubmitVideos?page=1&mid=16693558
9/88 information collected from http://space.bilibili.com/ajax/member/getSubmitVideos?page=1&mid=16693558
10/88 information collected from http://space.

Test of get_info():

>initiating spider1...
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\_\_main\_\_.py:8: DeprecationWarning: invalid escape sequence '\/'  
spider1 finished  
initiating spider2...  
spider2 finished  
initiating spider3...  
spider3 finished  

Total time used: 1:20 on my PC

In [9]:
# Convert the collected records into a DataFrame and preview the first rows.
df = into_pandas(videos)
df.head()

Success putting data into Pandas


Unnamed: 0,aid,title,url,date,time,length,play,danmaku,review,favorites,coin
0,9117019,[喂狗组]《仁王》二周目全BOSS应对详解-中国篇,http://www.bilibili.com/video/av9117019,2017-03-13,11:30,22:00,6589,104,39,52,270
1,9065805,[喂狗组]《仁王》二周目全BOSS应对详解-九州篇,http://www.bilibili.com/video/av9065805,2017-03-10,03:14,31:13,7801,198,55,111,470
2,8930354,[喂狗组]《仁王》十一种实用技巧详解,http://www.bilibili.com/video/av8930354,2017-03-03,01:47,11:18,62215,372,229,1688,1242
3,7999599,[喂狗组] 女神异闻录5 全中文剧情解说-Part33 游戏进度12月24,http://www.bilibili.com/video/av7999599,2017-01-14,10:14,194:13,10006,455,111,50,332
4,7935132,[喂狗组] 女神异闻录5 全中文剧情解说-Part32 游戏进度12月份19~24日 End,http://www.bilibili.com/video/av7935132,2017-01-10,06:54,108:15,6726,383,92,60,300


In [10]:
# Write the DataFrame to stat_<mid>_<timestamp>.csv in the current working directory.
export_csv(df)

## Run All

Try to get the result with only one call

In [11]:
# get mid from input: one regular prompt followed by a single retry
for prompt in ("Enter user's id: ", "Last chance, please enter a valid ID: "):
    mid = input(prompt)
    # a valid id consists of digits only and has at most 9 of them
    if mid.isdigit() and len(mid) <= 9:
        print('User ID:', mid)
        break
else:
    # both attempts were rejected; 0 marks "no valid id"
    print('Application Terminated. Please run again.')
    mid = 0
# call functions
if mid != 0:
    # run all functions: url list -> video info -> DataFrame -> csv file
    print('Application Running...')
    export_csv(into_pandas(get_info(get_url_list(mid))))
    print('\nA csv file containing information of all videos of the specified user id has been created in your current directory, please check!')

Enter user's id: 16693558
User ID: 16693558
Application Running...
initiating spider 1/3...




1/88 information collected from http://space.bilibili.com/ajax/member/getSubmitVideos?page=1&mid=16693558
2/88 information collected from http://space.bilibili.com/ajax/member/getSubmitVideos?page=1&mid=16693558
3/88 information collected from http://space.bilibili.com/ajax/member/getSubmitVideos?page=1&mid=16693558
4/88 information collected from http://space.bilibili.com/ajax/member/getSubmitVideos?page=1&mid=16693558
5/88 information collected from http://space.bilibili.com/ajax/member/getSubmitVideos?page=1&mid=16693558
6/88 information collected from http://space.bilibili.com/ajax/member/getSubmitVideos?page=1&mid=16693558
7/88 information collected from http://space.bilibili.com/ajax/member/getSubmitVideos?page=1&mid=16693558
8/88 information collected from http://space.bilibili.com/ajax/member/getSubmitVideos?page=1&mid=16693558
9/88 information collected from http://space.bilibili.com/ajax/member/getSubmitVideos?page=1&mid=16693558
10/88 information collected from http://space.