In [14]:
### IMPORTS ###
# import used libraries
import pandas as pd                        # pandas for data analysis
pd.options.mode.chained_assignment = None  # default='warn'
import matplotlib.pyplot as plt            # matplotlib for data visualisation

In [15]:
### PATHS ###

DIR = "../data/"

#read
TIMESERIES_PATH = DIR + "original_timeseries.tsv.gz" 
CHANNELS_PATH   = DIR + "original_channels.tsv.gz"

#write
PROCESSED_TIMESERIES_PATH = DIR + "processed_timeseries.tsv.zip"
SCORED_CHANNELS_PATH   = DIR + "scored_channels.tsv.zip" 

In [16]:
### READS ###

timeseries = pd.read_csv(TIMESERIES_PATH, sep='\t')
channels   = pd.read_csv(CHANNELS_PATH, sep='\t')

#set dates format
timeseries['datetime'] = pd.to_datetime(timeseries['datetime'])
channels['join_date'] = pd.to_datetime(channels['join_date'])

***
### Trying to see if there is any channels that started in 2015 with less than 10 videos

In [51]:
channels_2016 = timeseries.drop_duplicates('channel')[((timeseries['datetime'].dt.year == 2016) | 
                                                       (timeseries['datetime'].dt.year == 2015)) &
                                                      (timeseries['videos'] <= 10)].channel

#compute the number of subs at the start of the period for each youtuber 
channels = pd.merge(channels,
                    timeseries.sort_values(by='datetime').drop_duplicates('channel')[['channel','subs']].rename(columns={"subs":"initial_subs"}),
                    on = 'channel')

ent_channels = pd.merge(channels[channels.category_cc == 'Entertainment'], channels_2016)
ent_timeseries = pd.merge(timeseries, ent_channels.drop_duplicates('channel').channel)




  channels_2016 = timeseries.drop_duplicates('channel')[((timeseries['datetime'].dt.year == 2016) |


In [48]:
ent_channels
ent_timeseries.drop_duplicates('channel')

Unnamed: 0,channel,category,datetime,views,delta_views,subs,delta_subs,videos,delta_videos,activity
0,UC4mvN14nN07FKzBRRAngqvA,Entertainment,2016-03-21,2.843375e+03,984.375,359.125,0.0,0,0,0
184,UC2V1ul4ul85kac0Y81wXTqQ,Entertainment,2016-08-15,4.025000e+02,0.000,186.250,0.0,0,0,0
347,UCm3jfzwwDCMb33AOsbLgnng,Entertainment,2016-02-22,4.336350e+05,557.000,316.250,0.0,1,0,0
535,UCeVWCgg-6XlDIjWJlrRnqYQ,Entertainment,2016-08-01,4.774485e+05,2282.500,4813.250,0.0,9,0,1
700,UCxMvjumCHLs5bZv0ISmClPg,Entertainment,2016-08-15,1.251590e+05,0.000,669.000,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
487353,UClRCyBAc8e-1bjE7jP620sg,Entertainment,2016-12-05,2.645000e+04,0.000,541.000,0.0,9,0,5
487501,UCEJpGxMVHCltUzIAMb8J4vQ,Entertainment,2016-09-19,1.461949e+05,146194.875,17489.000,0.0,0,0,0
487598,UCNLReGxOCGDExjb7pjPuPcA,Entertainment,2016-10-10,1.314013e+08,0.000,130447.000,0.0,0,0,0
487741,UCnFpajlezRfiOMdw8Tt3z5A,Entertainment,2016-10-10,1.003384e+08,0.000,116982.000,0.0,0,0,0


In [4]:
### CHOOSE FOCUS CHANNELS ###

#compute the number of subs at the start of the period for each youtuber 
channels = pd.merge(channels,
                    timeseries.sort_values(by='datetime').drop_duplicates('channel')[['channel','subs']].rename(columns={"subs":"initial_subs"}),
                    on = 'channel')

#keep only channels that started in 2015 and are in the entertainment category (because timeseries starts in 2015) --> not succesful at all
ent_channels = channels[(channels['join_date'].dt.year == 2015) &
                        (channels['category_cc'] == 'Entertainment')]

#keep the intersection between the choosen channels and 
channel_ids = ent_channels[['channel']].drop_duplicates()
ent_timeseries = pd.merge(timeseries, channel_ids)
channel_ids = ent_timeseries[['channel']].drop_duplicates()
ent_channels = pd.merge(ent_channels, channel_ids)

***

In [None]:
### CHOOSE FOCUS CHANNELS ###

#compute the number of subs at the start of the period for each youtuber 
channels = pd.merge(channels,
                    timeseries.sort_values(by='datetime').drop_duplicates('channel')[['channel','subs']].rename(columns={"subs":"initial_subs"}),
                    on = 'channel')

#keep only channels that started between 10k and 15k from the entertaining category
ent_channels = channels[(channels['initial_subs'] > 10e3) &
                        (channels['initial_subs'] < 15e3) &
                        (channels['category_cc'] == 'Entertainment')]

#keep the intersection between the choosen channels and 
channel_ids = ent_channels[['channel']].drop_duplicates()
ent_timeseries = pd.merge(timeseries, channel_ids)
channel_ids = ent_timeseries[['channel']].drop_duplicates()
ent_channels = pd.merge(ent_channels, channel_ids)

In [53]:
### SCORING CHANNELS ###

#compute the weekly growth for each channel in the timeseries
ent_timeseries['weekly_growth'] = ent_timeseries['delta_subs']/ent_timeseries['subs']

#compute the average growth score per channel
growth_score = ent_timeseries.groupby('channel').mean()['weekly_growth'].rename('growth_score')
ent_channels = pd.merge(ent_channels, growth_score, on='channel')

#keeps only 25% top and 25% bottom channels
top_channels = ent_channels.nlargest(int(len(ent_channels)*0.25), 'growth_score')[['channel','growth_score']]
bottom_channels = ent_channels.nsmallest(int(len(ent_channels)*0.25), 'growth_score')[['channel','growth_score']]
evo_channels = pd.concat([top_channels, bottom_channels]).sort_values('growth_score', ascending=False)
evo_channels['has_buzzed'] = 0 + 1 * (evo_channels['growth_score'] > bottom_channels['growth_score'].max())

ent_channels = pd.merge(ent_channels, evo_channels.drop(['growth_score'], axis=1), on='channel')
ent_timeseries = pd.merge(ent_timeseries, ent_channels[['channel', 'has_buzzed']], on='channel')

In [6]:
### SAVE PROCESSED DATA ###

ent_channels.to_csv(SCORED_CHANNELS_PATH, index=False, compression={'method':'zip'})
ent_timeseries.to_csv(PROCESSED_TIMESERIES_PATH, index=False, compression={'method':'zip'})

In [54]:
#sort ent_channels by growth_score
ent_channels = ent_channels.sort_values('growth_score', ascending=False)
ent_channels 

Unnamed: 0,category_cc,join_date,channel,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights,initial_subs,growth_score,has_buzzed
441,Entertainment,2016-08-12,UCpeX7ds2tTpiRMwga0jUq8g,The LEGO Group,244000,66,73829.0,3.7770,161.750,0.174242,1
1300,Entertainment,2016-02-02,UCHOVHGRniakvAx0AaziVCGA,HollywoodHereICome!,19233,84,609995.0,9.1900,3907.250,0.139361,1
901,Entertainment,2016-10-08,UCOxBUBKIrtxrUEWgl9KQluA,Crypto PM,58400,223,246868.0,7.1165,10.000,0.126646,1
142,Entertainment,2016-07-29,UClguIWOD4hs4vHYBLRZJiVA,Music For Kids,1070000,152,15647.0,2.8240,10.500,0.099742,1
1127,Entertainment,2016-10-03,UCsfYsJT_LvMZnJux8nSlIgg,Kids Toys Tube,33900,43,397565.0,6.6785,81.500,0.090263,1
...,...,...,...,...,...,...,...,...,...,...,...
1430,Entertainment,2014-08-17,UCEYgGr0ry9I5dYTqyibURnw,TopRamenOnly,13400,22,793763.0,10.3330,15326.000,0.000020,0
511,Entertainment,2015-05-18,UCmn56iouEYh1XDCgAR5VKGg,Skye Crew,201015,49,91704.0,3.8570,214083.875,0.000019,0
391,Entertainment,2011-10-04,UCdSr4xliU8yDyS1aGnCUMTA,Sam Pepper Live,316218,11,58088.0,3.5535,316133.500,0.000014,0
835,Entertainment,2012-07-13,UCOOQeCTjRQXvP-5s2U7v5Gg,Satisfying Pill,73434,28,206624.0,6.1515,93179.750,0.000000,0


In [55]:
#display timeseries from UCZy8KS02Jzl9kly5dQZ-fPA
ent_timeseries[ent_timeseries['channel'] == 'UCHOVHGRniakvAx0AaziVCGA']

Unnamed: 0,channel,category,datetime,views,delta_views,subs,delta_subs,videos,delta_videos,activity,weekly_growth,has_buzzed
138417,UCHOVHGRniakvAx0AaziVCGA,Entertainment,2016-07-18,3077045.0,174753.75,3907.25,0.0,2,0,2,0.0,1
138418,UCHOVHGRniakvAx0AaziVCGA,Entertainment,2016-07-25,4300321.0,1223276.25,5295.0,1387.75,2,0,2,0.262087,1
138419,UCHOVHGRniakvAx0AaziVCGA,Entertainment,2016-08-01,6402899.0,2102577.75,7132.5,1837.5,2,0,0,0.257624,1
138420,UCHOVHGRniakvAx0AaziVCGA,Entertainment,2016-08-08,8074860.0,1671961.5,8814.0,1681.5,2,0,0,0.190776,1
138421,UCHOVHGRniakvAx0AaziVCGA,Entertainment,2016-08-15,10035310.0,1960446.5,10468.25,1654.25,2,0,0,0.158025,1
138422,UCHOVHGRniakvAx0AaziVCGA,Entertainment,2016-08-22,11992100.0,1956794.75,12026.5,1558.25,2,0,0,0.129568,1
138423,UCHOVHGRniakvAx0AaziVCGA,Entertainment,2016-08-29,14107060.0,2114955.5,13732.5,1706.0,2,0,0,0.124231,1
138424,UCHOVHGRniakvAx0AaziVCGA,Entertainment,2016-09-05,16781870.0,2674816.5,15833.5,2101.0,2,0,0,0.132693,1
138425,UCHOVHGRniakvAx0AaziVCGA,Entertainment,2016-09-12,19856930.0,3075053.625,18089.5,2256.0,3,1,1,0.124713,1
138426,UCHOVHGRniakvAx0AaziVCGA,Entertainment,2016-09-19,22315010.0,2458078.875,19738.0,1648.5,3,0,1,0.083519,1
