In [None]:
import matplotlib.font_manager as font_manager
from matplotlib.lines import Line2D
import matplotlib as mpl
import matplotlib.ticker as mtick
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os

# Matplotlib rc overrides: a slightly larger title, one common size for all
# other text elements, and LaTeX rendering switched off.
params = {"axes.titlesize": 14}
params.update(
    dict.fromkeys(
        (
            "axes.labelsize",
            "font.size",
            "legend.fontsize",
            "xtick.labelsize",
            "ytick.labelsize",
        ),
        12,
    )
)
params["text.usetex"] = False

# Apply the `params` overrides to matplotlib's global rcParams.
mpl.rcParams.update(params)


import warnings

# NOTE(review): with no category or module given, this suppresses every warning
# raised from here on (deprecations included) — consider narrowing the filter.
warnings.filterwarnings("ignore")


In [None]:
# Channel time series: load, then parse the timestamp column.
df_time_series = pd.read_csv(
    "./data/df_timeseries_en.tsv.gz", sep="\t", compression="infer"
)
df_time_series["datetime"] = pd.to_datetime(df_time_series["datetime"])

# Channel-level table: load, then parse the join date.
df_channels = pd.read_csv(
    "./data/df_channels_en.tsv.gz", sep="\t", compression="infer"
)
df_channels["join_date"] = pd.to_datetime(df_channels["join_date"])

# Video metadata: read only the columns used in this notebook.
video_metadatas = pd.read_feather(
    "./data/yt_metadata_helper.feather",
    columns=[
        "categories",
        "upload_date",
        "duration",
        "like_count",
        "view_count",
        "channel_id",
    ],
)

In [None]:
# A subscriber is a whole person, so round the counts to the nearest integer value.
df_time_series["subs"] = df_time_series["subs"].round()

# Check length of videos

One idea is to use `pd.cut` to bin video durations into discrete intervals.

In [None]:
# Disabled because it takes a long time to compute.

# bucket_durations = pd.cut(video_metadatas['duration'], bins=100)
# print(type(bucket_durations))


---

# Check frequency of videos

---

# How long to reach 1M subscribers 

It could be interesting to take all the channels with 10K, 20K, 30K, ... subscribers and look at how they evolve to reach 100K, for example, and then to compare different growth stages (from 50K to 100K, from 500K to 600K, etc.): which one is the fastest or easiest?

In [None]:
# Observations (rows) where the channel is below 10K / above 1M subscribers.
under_10K = df_time_series.loc[df_time_series["subs"].lt(10_000)]
more_1M = df_time_series.loc[df_time_series["subs"].gt(1_000_000)]


In [None]:
# Keep every row of the channels that appear both below 10K and above 1M
# subscribers at some point in the time series.
_was_under_10K = df_time_series["channel"].isin(under_10K["channel"])
_was_over_1M = df_time_series["channel"].isin(more_1M["channel"])
channel_10K_to_1M = df_time_series[_was_under_10K & _was_over_1M]

print(
    "We have {} channels that begins with 10K subs and reach at least 1M".format(
        channel_10K_to_1M["channel"].nunique()
    )
)


In [None]:
# Sanity check on one channel: plot its subscriber curve and compare the number
# of rows (times 7) with the time span between its first and last row.
first_channel = channel_10K_to_1M.iloc[0]
test = channel_10K_to_1M.loc[
    channel_10K_to_1M["channel"] == first_channel["channel"]
]
# test = test[(test['datetime'] > np.datetime64('2018-07-20')) & (test['datetime'] < np.datetime64('2019-01-20'))]
test.plot(x="datetime", y="subs")

_n_rows = len(test)
_span = test["datetime"].iloc[-1] - test["datetime"].iloc[0]
print("There are {} weeks for this channel".format(_n_rows))
print(
    "We should have approximately the same value : \n{} and {}".format(
        7 * _n_rows, _span
    )
)


In [None]:
# Time each channel takes to go from its last observation below 10K subscribers
# to its first observation above 1M subscribers.
#
# The earliest/latest timestamps are taken with min()/max() rather than by row
# position, so the result does not depend on the rows of a channel being in
# chronological order. The grouped reductions also avoid a Python-level call
# per channel.
_first_over_1M = (
    channel_10K_to_1M.loc[channel_10K_to_1M["subs"] > 1_000_000]
    .groupby("channel")["datetime"]
    .min()
)
_last_under_10K = (
    channel_10K_to_1M.loc[channel_10K_to_1M["subs"] < 10_000]
    .groupby("channel")["datetime"]
    .max()
)

# Both series are indexed by channel, so the subtraction aligns per channel.
time_to_reach_1M = (_first_over_1M - _last_under_10K).to_frame(
    "from_zero_to_hero_duration"
)
time_to_reach_1M.head()

In [None]:
# Average duration across all channels that went from <10K to >1M subscribers.
mean_time_to_1M = time_to_reach_1M["from_zero_to_hero_duration"].mean()
print(
    "In average YTbers take {} to reach 1M of subscribers".format(mean_time_to_1M)
)

It would be nice to bootstrap this result to obtain a confidence interval for the time YouTubers take to reach 1M subscribers.

---

# Check the number of views (likes and dislikes could be added later) for a given category and upload date

For this task, let's first analyze and plot the data to build some intuition, and then use logistic regression to 'predict' the number of views given the category and the date.

In [14]:
# Preview the first rows of the video metadata table.
video_metadatas.head()

Unnamed: 0,categories,upload_date,duration,like_count,view_count,channel_id
0,Film & Animation,2016-09-28,1159,8.0,1057.0,UCzWrhkg9eK5I8Bm3HfV-unA
1,Film & Animation,2016-09-28,2681,23.0,12894.0,UCzWrhkg9eK5I8Bm3HfV-unA
2,Film & Animation,2016-09-28,1394,1607.0,1800602.0,UCzWrhkg9eK5I8Bm3HfV-unA
3,Film & Animation,2016-09-28,5064,227.0,57640.0,UCzWrhkg9eK5I8Bm3HfV-unA
4,Film & Animation,2016-09-28,3554,105.0,86368.0,UCzWrhkg9eK5I8Bm3HfV-unA


In [18]:
# One-hot encode the category of each video and attach the indicator columns
# to the metadata (index-aligned join). The original `categories` column is kept.
encoded_cat = pd.get_dummies(video_metadatas["categories"])
video_metadatas_encoded = video_metadatas.join(encoded_cat, how="left")
# video_metadatas_encoded.drop('categories', axis=1, inplace=True)

In [19]:
# Add the upload year of each video; the `.dt` accessor requires `upload_date`
# to be a datetime-like column.
video_metadatas_encoded['year'] = video_metadatas_encoded['upload_date'].dt.year

In [20]:
# Preview the encoded table, including the one-hot columns and `year`.
video_metadatas_encoded.head()

Unnamed: 0,categories,upload_date,duration,like_count,view_count,channel_id,Unnamed: 7,Autos & Vehicles,Comedy,Education,...,Music,News & Politics,Nonprofits & Activism,People & Blogs,Pets & Animals,Science & Technology,Shows,Sports,Travel & Events,year
0,Film & Animation,2016-09-28,1159,8.0,1057.0,UCzWrhkg9eK5I8Bm3HfV-unA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2016
1,Film & Animation,2016-09-28,2681,23.0,12894.0,UCzWrhkg9eK5I8Bm3HfV-unA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2016
2,Film & Animation,2016-09-28,1394,1607.0,1800602.0,UCzWrhkg9eK5I8Bm3HfV-unA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2016
3,Film & Animation,2016-09-28,5064,227.0,57640.0,UCzWrhkg9eK5I8Bm3HfV-unA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2016
4,Film & Animation,2016-09-28,3554,105.0,86368.0,UCzWrhkg9eK5I8Bm3HfV-unA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2016


In [21]:
# Mean view count for every (upload year, category) pair, as a one-column frame
# indexed by (year, categories).
#
# The built-in grouped mean replaces a Python-level `apply` that built one
# Series per group. `observed=True` keeps only the pairs that actually occur
# in the data (it has no effect unless a grouping column is categorical).
mean_views = (
    video_metadatas_encoded.groupby(["year", "categories"], observed=True)[
        "view_count"
    ]
    .mean()
    .to_frame("mean_view")
)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_view
year,categories,Unnamed: 2_level_1
2005,Autos & Vehicles,7358.667
2005,Comedy,2261696.0
2005,Education,11598.14
2005,Entertainment,178513.3
2005,Film & Animation,3186.667


In [22]:
# Show the first 30 (year, category) rows of the mean-view table.
# Rows follow the group order (year, then category); they are not ranked by views.
mean_views.iloc[:30]

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_view
year,categories,Unnamed: 2_level_1
2005,Autos & Vehicles,7358.667
2005,Comedy,2261696.0
2005,Education,11598.14
2005,Entertainment,178513.3
2005,Film & Animation,3186.667
2005,Howto & Style,114844.3
2005,Music,9104648.0
2005,News & Politics,7239.0
2005,Pets & Animals,21997.27
2005,Science & Technology,5572.0


---

In [None]:
# Channels whose subscriber rank is strictly below 100.
_is_top_ranked = df_channels["subscriber_rank_sb"] < 100
top_ranked_channels = df_channels.loc[_is_top_ranked]
top_ranked_channels.head()


In [None]:
# Average join year of the top-ranked channels.
# `.dt.year` is the vectorized accessor (already used for `upload_date` in this
# notebook) and replaces a per-element Python conversion.
date = top_ranked_channels["join_date"].dt.year
date.mean()


In [None]:
# Five earliest join years over all channels. Despite the variable name, these
# are channel join years (from `join_date`), not video upload dates.
# `.dt.year` replaces a per-element Python conversion.
oldest_video = df_channels["join_date"].dt.year
oldest_video.nsmallest(5)


In [None]:
# Take the 53 channels with the highest `subscribers_cc` value, then display
# a random sample of 7 of them (unseeded, so the rows shown vary between runs).
channels_with_largest_subscribers = df_channels.nlargest(
    n=53, columns="subscribers_cc"
)
channels_with_largest_subscribers.sample(n=7)
