In [14]:
### IMPORTS ###
# import used libraries
import pandas as pd                        # pandas for data analysis
pd.options.mode.chained_assignment = None  # default='warn'
import matplotlib.pyplot as plt            # matplotlib for data visualisation

In [15]:
### PATHS ###

# Folder holding every data file, relative to the notebook
DIR = "../data/"

# Input files (read by the READS cell)
TIMESERIES_PATH = f"{DIR}original_timeseries.tsv.gz"
CHANNELS_PATH = f"{DIR}original_channels.tsv.gz"

# Output files (written once the channels have been scored)
PROCESSED_TIMESERIES_PATH = f"{DIR}processed_timeseries.tsv.zip"
SCORED_CHANNELS_PATH = f"{DIR}scored_channels.tsv.zip"

In [16]:
### READS ###

# Load the two tab-separated input tables
timeseries = pd.read_csv(TIMESERIES_PATH, sep='\t')
channels = pd.read_csv(CHANNELS_PATH, sep='\t')

# Parse the date columns so that the `.dt` accessor can be used on them later
for frame, date_column in ((timeseries, 'datetime'), (channels, 'join_date')):
    frame[date_column] = pd.to_datetime(frame[date_column])

***
### Trying to see if there are any Entertainment channels that started in 2015 with 10 videos or fewer

In [25]:
# Entertainment rows recorded in 2015 with at most 10 videos,
# reduced to the first matching row of each channel
few_videos = timeseries['videos'] <= 10
is_entertainment = timeseries['category'] == 'Entertainment'
recorded_in_2015 = timeseries['datetime'].dt.year == 2015
timeseries[few_videos & is_entertainment & recorded_in_2015].drop_duplicates('channel')

Unnamed: 0,channel,category,datetime,views,delta_views,subs,delta_subs,videos,delta_videos,activity
79948,UC0-3LtEFq7Hd_ydneEM8ErA,Entertainment,2015-12-28,11645.0,0.0,71.75,0.0,2,0,0
2898365,UCG5GX8MHxDCZdVWwEWm8Ijw,Entertainment,2015-07-13,481723.375,5631.375,3426.5,0.0,7,0,1
4077861,UCp7aooRhiGrdzfurH46aQnQ,Entertainment,2015-04-13,3574356.625,26739.625,12348.75,0.0,0,0,0
4680681,UCn42HkmbUavCnHhwnvwpl_g,Entertainment,2015-08-31,499510.9,70617.9,96.4,0.0,0,0,0
12554736,UCxCMFuh1ie8_T7MKy_RqNmg,Entertainment,2015-12-07,396917.8,4001.8,4809.0,0.0,6,0,0
16910059,UC70zE3gJ2z8IW3Qtmj5TblA,Entertainment,2015-09-14,16912.75,45.75,82.0,0.0,1,0,0
17894937,UCQ9EMzXNjpHfWwp_bBzfGAg,Entertainment,2015-12-21,84026.75,4.75,32.0,0.0,3,0,0


In [4]:
### CHOOSE FOCUS CHANNELS ###

#compute the number of subs at the start of the period for each youtuber:
#the earliest timeseries row of every channel provides its 'initial_subs'
initial_subs = (timeseries.sort_values(by='datetime')
                          .drop_duplicates('channel')[['channel', 'subs']]
                          .rename(columns={"subs": "initial_subs"}))

#drop any 'initial_subs' column left by an earlier run of this merge: merging a
#second time would produce 'initial_subs_x' / 'initial_subs_y' instead of a
#single 'initial_subs' column, which breaks every later lookup of that column
channels = pd.merge(channels.drop(columns='initial_subs', errors='ignore'),
                    initial_subs,
                    on='channel')

#keep only channels that started in 2015 and are in the entertainment category
#(because timeseries starts in 2015) --> not successful at all
ent_channels = channels[(channels['join_date'].dt.year == 2015) &
                        (channels['category_cc'] == 'Entertainment')]

#keep the intersection between the chosen channels and the timeseries:
#restrict the timeseries to the chosen channels, then restrict the chosen
#channels to those that actually have timeseries rows
channel_ids = ent_channels[['channel']].drop_duplicates()
ent_timeseries = pd.merge(timeseries, channel_ids, on='channel')
channel_ids = ent_timeseries[['channel']].drop_duplicates()
ent_channels = pd.merge(ent_channels, channel_ids, on='channel')

***

In [None]:
### CHOOSE FOCUS CHANNELS ###

#compute the number of subs at the start of the period for each youtuber:
#the earliest timeseries row of every channel provides its 'initial_subs'
initial_subs = (timeseries.sort_values(by='datetime')
                          .drop_duplicates('channel')[['channel', 'subs']]
                          .rename(columns={"subs": "initial_subs"}))

#drop any 'initial_subs' column left by an earlier run of this merge: merging a
#second time would produce 'initial_subs_x' / 'initial_subs_y', and the filter
#on channels['initial_subs'] below would then fail with a KeyError
channels = pd.merge(channels.drop(columns='initial_subs', errors='ignore'),
                    initial_subs,
                    on='channel')

#keep only Entertainment channels whose initial subs are strictly between 10k and 15k
ent_channels = channels[(channels['initial_subs'] > 10e3) &
                        (channels['initial_subs'] < 15e3) &
                        (channels['category_cc'] == 'Entertainment')]

#keep the intersection between the chosen channels and the timeseries:
#restrict the timeseries to the chosen channels, then restrict the chosen
#channels to those that actually have timeseries rows
channel_ids = ent_channels[['channel']].drop_duplicates()
ent_timeseries = pd.merge(timeseries, channel_ids, on='channel')
channel_ids = ent_timeseries[['channel']].drop_duplicates()
ent_channels = pd.merge(ent_channels, channel_ids, on='channel')

In [5]:
### SCORING CHANNELS ###

#compute the weekly growth for each channel in the timeseries:
#subscribers gained in a row divided by the subscriber count of that same row
ent_timeseries['weekly_growth'] = ent_timeseries['delta_subs'] / ent_timeseries['subs']
#NOTE(review): a row with subs == 0 yields NaN or inf here -- confirm no such row
#reaches the per-channel mean below

#compute the average growth score per channel
#(the column is selected BEFORE aggregating: averaging the whole frame would also
# try to average the text 'category' column, which pandas >= 2.0 rejects)
growth_score = ent_timeseries.groupby('channel')['weekly_growth'].mean().rename('growth_score')
ent_channels = pd.merge(ent_channels, growth_score, on='channel')

#keeps only 25% top and 25% bottom channels
quartile_size = int(len(ent_channels) * 0.25)
top_channels = ent_channels.nlargest(quartile_size, 'growth_score')[['channel', 'growth_score']]
bottom_channels = ent_channels.nsmallest(quartile_size, 'growth_score')[['channel', 'growth_score']]
evo_channels = pd.concat([top_channels, bottom_channels]).sort_values('growth_score', ascending=False)

#has_buzzed is 1 when the score is strictly above every bottom-group score, else 0
#(0 + 1 * <boolean Series> turns True/False into 1/0)
evo_channels['has_buzzed'] = 0 + 1 * (evo_channels['growth_score'] > bottom_channels['growth_score'].max())

#attach the flag to both tables; these inner merges also discard every channel
#that is in neither the top nor the bottom group
ent_channels = pd.merge(ent_channels, evo_channels.drop(['growth_score'], axis=1), on='channel')
ent_timeseries = pd.merge(ent_timeseries, ent_channels[['channel', 'has_buzzed']], on='channel')

In [6]:
### SAVE PROCESSED DATA ###

# Write each processed table to its zip-compressed output file, without the index.
# NOTE(review): the output paths end in ".tsv.zip" but to_csv is called without
# `sep`, so the content is comma-separated -- confirm this is what readers expect.
for frame, destination in ((ent_channels, SCORED_CHANNELS_PATH),
                           (ent_timeseries, PROCESSED_TIMESERIES_PATH)):
    frame.to_csv(destination, index=False, compression={'method':'zip'})

In [7]:
# Order the selected channels from highest to lowest growth_score, then display them
ent_channels = ent_channels.sort_values(by=['growth_score'], ascending=[False])
ent_channels

Unnamed: 0,category_cc,join_date,channel,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights,initial_subs,growth_score,has_buzzed
556,Entertainment,2015-05-20,UCZy8KS02Jzl9kly5dQZ-fPA,Tobey in the MCU,103570,35,149536.0,4.8220,5.308333e+02,0.250602,1
719,Entertainment,2015-01-24,UCYUunpzH7_WtYwvss2Ja9GQ,Storytime With Reddi...,55600,348,247326.0,7.1410,1.318182e+02,0.203761,1
665,Entertainment,2015-07-29,UCypdeKGOdPA6mma3z2Dzo9w,J4CKSON7,52557,15,211760.0,5.2955,1.585000e+02,0.171179,1
1188,Entertainment,2015-01-14,UC5Qa0ksDc4cE-RKHLcw5VDg,The Watcher,13100,198,757649.0,12.0190,1.300000e+01,0.163399,1
657,Entertainment,2015-08-09,UCCgHTlvBWzrMaL-15HjJX4A,RANDOMLINK GH,70600,26,204594.0,5.8875,2.106278e+04,0.151052,1
...,...,...,...,...,...,...,...,...,...,...,...
59,Entertainment,2015-06-18,UC_17vC75UDfHgTJjX5Ravgw,Jumbo,1730000,340,8418.0,2.5360,1.759949e+06,0.000145,0
1094,Entertainment,2015-07-21,UCH6PcEuNFkEUkbVTAdvWmFQ,BLORANGETIGER,12800,123,626098.0,11.1550,1.237500e+02,0.000052,0
380,Entertainment,2015-05-18,UCmn56iouEYh1XDCgAR5VKGg,Skye Crew,201015,49,91704.0,3.8570,2.140839e+05,0.000019,0
911,Entertainment,2015-01-14,UCyHKikViYPiuniJCHUXKg5g,Love Love,29952,32,404168.0,8.4425,3.076900e+04,0.000000,0


In [13]:
#display the timeseries of channel UCYUunpzH7_WtYwvss2Ja9GQ
ent_timeseries[ent_timeseries['channel'] == 'UCYUunpzH7_WtYwvss2Ja9GQ']

Unnamed: 0,channel,category,datetime,views,delta_views,subs,delta_subs,videos,delta_videos,activity,weekly_growth,has_buzzed
167903,UCYUunpzH7_WtYwvss2Ja9GQ,Entertainment,2019-05-13,1972.455,630.4545,131.818182,0.0,3,0,0,0.0,1
167904,UCYUunpzH7_WtYwvss2Ja9GQ,Entertainment,2019-05-20,2836.375,863.9205,135.25,3.431818,3,0,0,0.025374,1
167905,UCYUunpzH7_WtYwvss2Ja9GQ,Entertainment,2019-05-27,3588.0,751.625,144.0,8.75,3,0,0,0.060764,1
167906,UCYUunpzH7_WtYwvss2Ja9GQ,Entertainment,2019-06-03,7078.5,3490.5,154.5,10.5,6,3,3,0.067961,1
167907,UCYUunpzH7_WtYwvss2Ja9GQ,Entertainment,2019-06-10,10569.0,3490.5,165.0,10.5,10,4,7,0.063636,1
167908,UCYUunpzH7_WtYwvss2Ja9GQ,Entertainment,2019-06-17,27539.1,16970.1,223.8,58.8,18,8,13,0.262735,1
167909,UCYUunpzH7_WtYwvss2Ja9GQ,Entertainment,2019-06-24,344913.0,317373.9,952.0,728.2,35,17,26,0.764916,1
167910,UCYUunpzH7_WtYwvss2Ja9GQ,Entertainment,2019-07-01,1593042.0,1248129.0,3407.75,2455.75,65,30,51,0.720637,1
167911,UCYUunpzH7_WtYwvss2Ja9GQ,Entertainment,2019-07-08,4096587.0,2503545.0,8279.75,4872.0,93,28,65,0.588424,1
167912,UCYUunpzH7_WtYwvss2Ja9GQ,Entertainment,2019-07-15,7095072.0,2998485.0,14771.5,6491.75,111,18,55,0.439478,1
