## Imports

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style='white', color_codes=True)

## Describing Data

The dataset was retrieved on April 10, 2017 from [bilibili.com](http://www.bilibili.com/). It describes all videos of [Virgoo Team](http://space.bilibili.com/16693558/#!/video), with the following fields: 
- aid
- title
- url
- date
- time
- length
- play
- danmaku (a number)
- review
- favorites
- coin

Let's begin

In [4]:
# Load the scraped video table; index_col=0 uses the first CSV column as the
# row index instead of keeping it as a data column.
d = pd.read_csv('VirgooTeam_Videos.csv', index_col=0)
# Preview the first rows to check that the columns were parsed as expected.
d.head()

Unnamed: 0,aid,title,url,date,time,length,play,danmaku,review,favorites,coin
0,9719223,[喂狗组]《尼尔:机械纪元》VH难度 手撕亚当,http://www.bilibili.com/video/av9719223,2017-04-09,02:07:59,09:46,15834,244,130,180,330
1,9709649,[喂狗组]拳皇14DLC人物 洛克霍华德 连续技,http://www.bilibili.com/video/av9709649,2017-04-08,15:32:10,01:26,9308,52,79,31,208
2,9667348,[喂狗组] 黑暗之魂3 DLC2 黑龙邪道100%成功方法教学,http://www.bilibili.com/video/av9667348,2017-04-07,02:44:02,05:43,20618,216,191,180,785
3,9564681,[喂狗组]《尼尔:机械纪元》最高难度通关攻略2,http://www.bilibili.com/video/av9564681,2017-04-02,13:17:01,37:44,10801,187,120,139,300
4,9477430,[喂狗组]黑魂3环之都 手撕老贼,http://www.bilibili.com/video/av9477430,2017-03-30,01:31:35,12:27,19963,245,117,71,513


In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
d.describe()

Unnamed: 0,aid,play,danmaku,review,favorites,coin
count,104.0,104.0,104.0,104.0,104.0,104.0
mean,6709055.0,27985.153846,693.413462,114.769231,402.826923,651.923077
std,1681868.0,63858.34049,982.39916,218.822455,1447.540763,1412.906092
min,3895109.0,1699.0,12.0,5.0,14.0,11.0
25%,5201167.0,7995.75,149.5,44.5,54.75,167.5
50%,6656832.0,13887.5,462.5,72.5,93.0,415.5
75%,7677858.0,26129.5,932.5,117.75,236.0,637.25
max,9719223.0,602971.0,7634.0,2127.0,13598.0,13063.0


In [6]:
# Inspect column dtypes and non-null counts to check for missing values.
d.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 104 entries, 0 to 103
Data columns (total 11 columns):
aid          104 non-null int64
title        104 non-null object
url          104 non-null object
date         104 non-null object
time         104 non-null object
length       104 non-null object
play         104 non-null int64
danmaku      104 non-null int64
review       104 non-null int64
favorites    104 non-null int64
coin         104 non-null int64
dtypes: int64(6), object(5)
memory usage: 9.8+ KB


*No missing values.*

### Data Cleaning

#### Combine date & time

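Then shift the time from US Eastern time to UTC+08:00 (China Standard Time), so that the datetime reflects the actual local time at which Virgoo Team released each video. A single datetime column is also easier to do calculations with. Note that the code below adds 12 hours, which corresponds to Eastern Daylight Time (UTC−04:00); strictly speaking, EST is UTC−05:00, which would require 13 hours.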

In [10]:
# Join the date and time strings with a single space into one
# 'YYYY-MM-DD HH:MM:SS' string column (parsed to a real datetime later).
d['datetime'] = d['date'].str.cat(d['time'], sep=' ')
d.head()

Unnamed: 0,aid,title,url,date,time,length,play,danmaku,review,favorites,coin,datetime
0,9719223,[喂狗组]《尼尔:机械纪元》VH难度 手撕亚当,http://www.bilibili.com/video/av9719223,2017-04-09,02:07:59,09:46,15834,244,130,180,330,2017-04-09 02:07:59
1,9709649,[喂狗组]拳皇14DLC人物 洛克霍华德 连续技,http://www.bilibili.com/video/av9709649,2017-04-08,15:32:10,01:26,9308,52,79,31,208,2017-04-08 15:32:10
2,9667348,[喂狗组] 黑暗之魂3 DLC2 黑龙邪道100%成功方法教学,http://www.bilibili.com/video/av9667348,2017-04-07,02:44:02,05:43,20618,216,191,180,785,2017-04-07 02:44:02
3,9564681,[喂狗组]《尼尔:机械纪元》最高难度通关攻略2,http://www.bilibili.com/video/av9564681,2017-04-02,13:17:01,37:44,10801,187,120,139,300,2017-04-02 13:17:01
4,9477430,[喂狗组]黑魂3环之都 手撕老贼,http://www.bilibili.com/video/av9477430,2017-03-30,01:31:35,12:27,19963,245,117,71,513,2017-03-30 01:31:35


In [11]:
# The separate date and time columns are now redundant with `datetime`, so
# drop them. `columns=` replaces the positional axis argument (`, 1`), which
# pandas deprecated in 1.3 and no longer accepts from 2.0 onwards.
d = d.drop(columns=['date', 'time'])
d.head()

Unnamed: 0,aid,title,url,length,play,danmaku,review,favorites,coin,datetime
0,9719223,[喂狗组]《尼尔:机械纪元》VH难度 手撕亚当,http://www.bilibili.com/video/av9719223,09:46,15834,244,130,180,330,2017-04-09 02:07:59
1,9709649,[喂狗组]拳皇14DLC人物 洛克霍华德 连续技,http://www.bilibili.com/video/av9709649,01:26,9308,52,79,31,208,2017-04-08 15:32:10
2,9667348,[喂狗组] 黑暗之魂3 DLC2 黑龙邪道100%成功方法教学,http://www.bilibili.com/video/av9667348,05:43,20618,216,191,180,785,2017-04-07 02:44:02
3,9564681,[喂狗组]《尼尔:机械纪元》最高难度通关攻略2,http://www.bilibili.com/video/av9564681,37:44,10801,187,120,139,300,2017-04-02 13:17:01
4,9477430,[喂狗组]黑魂3环之都 手撕老贼,http://www.bilibili.com/video/av9477430,12:27,19963,245,117,71,513,2017-03-30 01:31:35


In [12]:
# Parse the combined date/time strings into real timestamps; the column keeps
# its name and position, only its dtype changes.
d = d.assign(datetime=pd.to_datetime(d['datetime']))
d.head()

Unnamed: 0,aid,title,url,length,play,danmaku,review,favorites,coin,datetime
0,9719223,[喂狗组]《尼尔:机械纪元》VH难度 手撕亚当,http://www.bilibili.com/video/av9719223,09:46,15834,244,130,180,330,2017-04-09 02:07:59
1,9709649,[喂狗组]拳皇14DLC人物 洛克霍华德 连续技,http://www.bilibili.com/video/av9709649,01:26,9308,52,79,31,208,2017-04-08 15:32:10
2,9667348,[喂狗组] 黑暗之魂3 DLC2 黑龙邪道100%成功方法教学,http://www.bilibili.com/video/av9667348,05:43,20618,216,191,180,785,2017-04-07 02:44:02
3,9564681,[喂狗组]《尼尔:机械纪元》最高难度通关攻略2,http://www.bilibili.com/video/av9564681,37:44,10801,187,120,139,300,2017-04-02 13:17:01
4,9477430,[喂狗组]黑魂3环之都 手撕老贼,http://www.bilibili.com/video/av9477430,12:27,19963,245,117,71,513,2017-03-30 01:31:35


In [14]:
# Re-check dtypes: `datetime` should now be datetime64[ns] rather than object.
d.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 104 entries, 0 to 103
Data columns (total 10 columns):
aid          104 non-null int64
title        104 non-null object
url          104 non-null object
length       104 non-null object
play         104 non-null int64
danmaku      104 non-null int64
review       104 non-null int64
favorites    104 non-null int64
coin         104 non-null int64
datetime     104 non-null datetime64[ns]
dtypes: datetime64[ns](1), int64(6), object(3)
memory usage: 8.9+ KB


In [15]:
# Preview the fixed 12-hour offset that will be added to the timestamps.
pd.Timedelta(hours=12)

Timedelta('0 days 12:00:00')

In [16]:
# Shift every timestamp forward by a fixed 12 hours.
# NOTE(review): the accompanying text describes this as EST -> UTC+08:00, but
# EST (UTC-05:00) would need 13 hours; 12 hours matches EDT (UTC-04:00). A
# fixed offset also ignores daylight-saving changes across the date range.
# Confirm which timezone the scraped times are actually in.
# NOTE(review): this cell reads and overwrites the same column, so running it
# twice shifts the timestamps by 24 hours.
d['datetime'] = d['datetime'] + pd.Timedelta(hours=12)
d.head()

Unnamed: 0,aid,title,url,length,play,danmaku,review,favorites,coin,datetime
0,9719223,[喂狗组]《尼尔:机械纪元》VH难度 手撕亚当,http://www.bilibili.com/video/av9719223,09:46,15834,244,130,180,330,2017-04-09 14:07:59
1,9709649,[喂狗组]拳皇14DLC人物 洛克霍华德 连续技,http://www.bilibili.com/video/av9709649,01:26,9308,52,79,31,208,2017-04-09 03:32:10
2,9667348,[喂狗组] 黑暗之魂3 DLC2 黑龙邪道100%成功方法教学,http://www.bilibili.com/video/av9667348,05:43,20618,216,191,180,785,2017-04-07 14:44:02
3,9564681,[喂狗组]《尼尔:机械纪元》最高难度通关攻略2,http://www.bilibili.com/video/av9564681,37:44,10801,187,120,139,300,2017-04-03 01:17:01
4,9477430,[喂狗组]黑魂3环之都 手撕老贼,http://www.bilibili.com/video/av9477430,12:27,19963,245,117,71,513,2017-03-30 13:31:35


*This is the exact time we want*

#### Convert length to seconds (s)

In [19]:
# Sample 'MM:SS' length string used to prototype the conversion to seconds.
aa = '09:46'

In [20]:
# Split the length string on ':' into its separate fields.
aa.split(':')

['09', '46']

In [21]:
# The last field is the seconds part.
aa.split(':')[-1]

'46'

In [22]:
# The fields are still strings, so they need int() before any arithmetic.
type(aa.split(':')[-1])

str

In [23]:
# Convert a colon-separated duration string to total seconds. The string is
# split once and the fields are weighted by powers of 60 from the right, so
# both 'MM:SS' and 'H:MM:SS' are handled (the earlier form split the string
# twice and silently ignored any hours field).
sum(int(field) * 60 ** power
    for power, field in enumerate(reversed(aa.split(':'))))

586