In [1]:
import matplotlib.font_manager as font_manager
from matplotlib.lines import Line2D
import matplotlib as mpl
import matplotlib.ticker as mtick
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
# Global matplotlib styling: size 12 for labels, ticks, legend and body text,
# size 14 for axes titles, and LaTeX text rendering switched off.
params = dict.fromkeys(
    (
        "axes.labelsize",
        "font.size",
        "legend.fontsize",
        "xtick.labelsize",
        "ytick.labelsize",
    ),
    12,
)
params["axes.titlesize"] = 14
params["text.usetex"] = False

mpl.rcParams.update(params)


import warnings

# Blanket filter: every warning is ignored from this point on.
warnings.filterwarnings(action="ignore")

In [2]:
# Both TSV dumps are read with the same options (tab-separated, compression
# inferred by pandas).
tsv_read_options = {"compression": "infer", "sep": "\t"}

# Channel time series: parse the timestamp column right after loading.
df_time_series = pd.read_csv("./data/df_timeseries_en.tsv.gz", **tsv_read_options)
df_time_series["datetime"] = pd.to_datetime(df_time_series["datetime"])

# Channel-level metadata: parse the join date right after loading.
df_channels = pd.read_csv("./data/df_channels_en.tsv.gz", **tsv_read_options)
df_channels["join_date"] = pd.to_datetime(df_channels["join_date"])

# Per-video metadata, restricted to the four columns listed here.
video_metadatas = pd.read_feather(
    "./data/yt_metadata_helper.feather",
    columns=["duration", "like_count", "view_count", "channel_id"],
)
# Constant column of ones.
# NOTE(review): presumably a counting helper for later aggregations; the
# triple-"m" spelling is kept as-is because other cells may reference it.
video_metadatas["dummmy"] = 1

In [3]:
# Peek at 7 random videos. The seed is fixed so the displayed rows are the same
# on every Restart & Run All instead of changing with each execution.
video_metadatas.sample(7, random_state=0)

Unnamed: 0,duration,like_count,view_count,channel_id,dummmy
21183074,224,0.0,29.0,UCauYUIWIWeA9W7CQthogYlg,1
11073436,52,2.0,393.0,UCybF_bgvjVTAPIm8HT-TNdQ,1
37104111,90,95.0,31711.0,UCP6HGa63sBC7-KHtkme-p-g,1
51169064,928,180.0,69720.0,UCEcryINEWViBdhxRzAPiZCg,1
42012656,755,4615.0,112197.0,UCL-019jclyQT5ngcf_RoALw,1
29947171,2112,5.0,155.0,UCV6-CUNsfe2-STYfYkd7bBQ,1
32225277,1431,42.0,919.0,UCT3FrMtOQD9qH_tpYt29gDA,1


In [4]:
# Summary statistics (count, mean, std, quartiles, extremes) of video duration.
# NOTE(review): duration is presumably expressed in seconds -- confirm.
video_durations = video_metadatas["duration"]
video_durations.describe()

count    7.292479e+07
mean     7.576761e+02
std      1.820111e+03
min      1.000000e+00
25%      1.340000e+02
50%      2.840000e+02
75%      6.980000e+02
max      3.601570e+05
Name: duration, dtype: float64

In [5]:
# Distinct category labels in the time series, followed by the length of the
# column (i.e. the number of rows, not the number of distinct categories),
# then a preview of the first 10 rows.
time_series_categories = df_time_series["category"]
print(time_series_categories.unique())
print(len(time_series_categories))
df_time_series.head(10)

['Film and Animation' 'Music' 'Comedy' 'Gaming' 'Science & Technology'
 'Sports' 'Entertainment' 'Education' 'Nonprofits & Activism'
 'People & Blogs' 'Howto & Style' 'News & Politics' 'Travel & Events'
 'Autos & Vehicles' 'Pets & Animals' nan]
18872499


Unnamed: 0,channel,category,datetime,views,delta_views,subs,delta_subs,videos,delta_videos,activity
0,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-03,202494.6,0.0,650.222222,0.0,5,0,3
1,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-10,394085.7,191591.111111,1046.0,395.777778,6,1,1
2,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-17,835393.8,441308.083333,1501.5,455.5,6,0,1
3,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-24,1104577.0,269183.25,1750.0,248.5,6,0,0
4,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-31,1284406.0,179828.6,2008.3,258.3,6,0,0
5,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-08-07,1493380.0,208974.2,2270.2,261.9,6,0,0
6,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-08-14,1721383.0,228003.2,2531.7,261.5,6,0,0
7,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-08-21,1932405.0,211022.0,2774.6,242.9,6,0,0
8,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-08-28,2221636.0,289230.75,3220.25,445.65,6,0,0
9,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-09-04,2632813.0,411177.535714,4114.285714,894.035714,6,0,0


In [6]:
# Distinct category labels among the channels, followed by the length of the
# column (i.e. the number of rows, not the number of distinct categories),
# then a preview of the first 7 rows.
channel_categories = df_channels["category_cc"]
print(channel_categories.unique())
print(len(channel_categories))
df_channels.head(7)

['Gaming' 'Education' 'Entertainment' 'Howto & Style' 'Sports' 'Music'
 'Film and Animation' 'Comedy' 'Nonprofits & Activism' 'People & Blogs'
 'News & Politics' 'Science & Technology' 'Pets & Animals'
 'Autos & Vehicles' 'Travel & Events' nan]
136470


Unnamed: 0,category_cc,join_date,channel,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,Gaming,2010-04-29,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,101000000,3956,3.0,2.087
1,Education,2006-09-01,UCbCmjCuTUZos6Inko4u57UQ,Cocomelon - Nursery ...,60100000,458,7.0,2.087
2,Entertainment,2006-09-20,UCpEhnqL0y41EpW2TvWAHD7Q,SET India,56018869,32661,8.0,2.087
3,Howto & Style,2016-11-15,UC295-Dw_tDNtZXFeAPAW6Aw,5-Minute Crafts,60600000,3591,9.0,2.087
4,Sports,2007-05-11,UCJ5v_MCY6GNUBTO8-D3XoAg,WWE,48400000,43421,11.0,2.087
5,Entertainment,2007-01-15,UCIwFjwMjI0y7PDBVEO9-bkQ,Justin Bieber,46574085,134,12.0,2.087
6,Music,2014-03-12,UCFFbwnve3yF62-tVXkTyHqg,Zee Music Company,43451109,4241,13.0,2.087


In [7]:
# Calendar year of each channel's join date, extracted with the vectorized
# datetime accessor instead of a per-row Python lambda. Missing join dates
# come out as NaN.
channel_year = df_channels["join_date"].dt.year
# Optional (currently disabled): bar chart of the number of channels created
# every year.
# channel_year.value_counts().sort_values().plot(kind='bar')

In [8]:
# Keep only channels whose subscriber rank is strictly below 100; rows with a
# missing rank compare as False and are therefore dropped.
has_top_rank = df_channels["subscriber_rank_sb"] < 100
top_ranked_channels = df_channels[has_top_rank]
top_ranked_channels.head()

Unnamed: 0,category_cc,join_date,channel,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,Gaming,2010-04-29,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,101000000,3956,3.0,2.087
1,Education,2006-09-01,UCbCmjCuTUZos6Inko4u57UQ,Cocomelon - Nursery ...,60100000,458,7.0,2.087
2,Entertainment,2006-09-20,UCpEhnqL0y41EpW2TvWAHD7Q,SET India,56018869,32661,8.0,2.087
3,Howto & Style,2016-11-15,UC295-Dw_tDNtZXFeAPAW6Aw,5-Minute Crafts,60600000,3591,9.0,2.087
4,Sports,2007-05-11,UCJ5v_MCY6GNUBTO8-D3XoAg,WWE,48400000,43421,11.0,2.087


In [9]:
# Average join year of the top-ranked channels. The year is taken with the
# vectorized `.dt.year` accessor rather than a per-row lambda, and
# `Series.mean` skips missing values by default.
date = top_ranked_channels["join_date"].dt.year
date.mean()

2010.3396226415093

In [10]:
# Five earliest join years across all channels, using the vectorized
# `.dt.year` accessor instead of a per-row lambda.
# NOTE: despite the variable name, these values are channel join years taken
# from `join_date`, not video upload dates.
oldest_video = df_channels["join_date"].dt.year
oldest_video.nsmallest(5)

59578    2004.0
16       2005.0
40       2005.0
53       2005.0
54       2005.0
Name: join_date, dtype: float64

In [11]:
# Four earliest years present in the time series. `.dt.year` is vectorized,
# which avoids calling a Python lambda once per row of this very large frame.
oldest_from_series = df_time_series["datetime"].dt.year
oldest_from_series.nsmallest(4)

79948     2015
96781     2015
96782     2015
643772    2015
Name: datetime, dtype: int64

In [12]:
# The 53 channels with the highest subscriber counts.
# NOTE(review): 53 is hard-coded; it presumably mirrors the number of channels
# with subscriber_rank_sb < 100 -- confirm, and consider deriving it.
channels_with_largest_subscribers = df_channels.nlargest(53, "subscribers_cc")
# Fixed seed so the 7 displayed rows are the same on every Restart & Run All.
channels_with_largest_subscribers.sample(7, random_state=0)

Unnamed: 0,category_cc,join_date,channel,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
15,Entertainment,2008-06-01,UCYvmuw-JtVrTZQ-7Y4kd63Q,Katy Perry,35028528,101,30.0,2.087
25,Music,2010-12-03,UCb2HGwORFBo94DmRx4oLzow,One Direction,29939074,157,47.0,2.087
44,Music,2011-01-19,UCAvCL8hyXjSUHKEGuUPr1BA,Shawn Mendes,23479003,87,89.0,2.087
33,Music,2009-05-12,UCANLZYMidaCbLQFWXBC95Jg,TaylorSwiftVEVO,27200000,102,69.0,2.087
37,Music,2009-12-13,UC-8Q-hLdECwQmaWNwXitYDw,KatyPerryVEVO,26000000,132,79.0,2.087
32,Entertainment,2007-08-04,UC6-F5tO8uklgE9Zy8IvbdFw,SAB TV,23700000,25390,65.0,2.087
20,Music,2012-08-25,UCJrOtniJ0-NWz37R30urifQ,Alan Walker,29588818,198,39.0,2.087


In [13]:
# Peek at 7 random time-series rows. The seed is fixed so the displayed rows
# are the same on every Restart & Run All instead of changing with each run.
df_time_series.sample(7, random_state=0)

Unnamed: 0,channel,category,datetime,views,delta_views,subs,delta_subs,videos,delta_videos,activity
14481107,UCaIc6SgS90ud_RgMSC6hW_w,Entertainment,2017-12-03 23:00:00,160430900.0,2908315.0,174516.911458,4569.182292,8594,29,65
10736942,UC5dbmHMwLmGZTBqrVC162Gg,Gaming,2017-01-08 23:00:00,1511347.0,22336.12,40051.203125,133.875,160,2,2
459263,UCJwcgUVJcTK3_kpVVrAmz1g,Film and Animation,2016-06-27 00:00:00,309228.1,1795.625,304.875,10.375,100,2,6
8154691,UCpqVqKP7iiGkKetA9qhyuNw,Music,2016-12-11 23:00:00,50207400.0,303912.4,56445.078125,203.729167,420,1,7
10931551,UCBBw4zAbSmUnq_-ECy8vDaQ,Music,2019-05-20 00:00:00,29960010.0,339308.5,57482.818182,627.090909,260,0,0
3500980,UCwDDFvFFZ1X9ivgS_oBYQ8A,News & Politics,2017-05-08 00:00:00,757799.0,5270.125,9347.0,14.0,512,5,13
8338002,UCuO9PYAcBHAN2RSDkxq2kmQ,Gaming,2018-07-09 01:00:00,1199155.0,11887.02,8211.25,38.28125,420,0,0


In [14]:
# First rows of the time series for a single channel; this id is the one
# listed as PewDiePie in df_channels.
pewdiepie_channel_id = 'UC-lHJZR3Gqxm24_Vd_AJ5Yw'
is_pewdiepie = df_time_series["channel"] == pewdiepie_channel_id
df_time_series[is_pewdiepie].head()

Unnamed: 0,channel,category,datetime,views,delta_views,subs,delta_subs,videos,delta_videos,activity
14565937,UC-lHJZR3Gqxm24_Vd_AJ5Yw,Gaming,2016-10-17 00:00:00,13577610000.0,39508020.0,48617960.0,0.0,2906,0,14
14565938,UC-lHJZR3Gqxm24_Vd_AJ5Yw,Gaming,2016-10-24 00:00:00,13633850000.0,56244030.0,48809300.0,191335.375,2913,7,14
14565939,UC-lHJZR3Gqxm24_Vd_AJ5Yw,Gaming,2016-10-30 23:00:00,13689590000.0,55739590.0,48971820.0,162521.305699,2919,6,15
14565940,UC-lHJZR3Gqxm24_Vd_AJ5Yw,Gaming,2016-11-06 23:00:00,13735940000.0,46351590.0,49093970.0,122152.626592,2926,7,15
14565941,UC-lHJZR3Gqxm24_Vd_AJ5Yw,Gaming,2016-11-13 23:00:00,13761950000.0,26009020.0,49176380.0,82404.010417,2932,6,14
