In [1]:
### IMPORTS ###
# Third-party libraries used by the notebook.
import pandas as pd                        # pandas for data analysis
# Turn off pandas' chained-assignment check (SettingWithCopyWarning); the library default is 'warn'.
pd.options.mode.chained_assignment = None  # default='warn'
import matplotlib.pyplot as plt            # matplotlib for data visualisation

In [2]:
### PATHS ###

# Folder that holds every file this notebook reads or writes.
DIR = "../data/"

# Inputs (read)
TIMESERIES_PATH = f"{DIR}original_timeseries.tsv.gz"
CHANNELS_PATH = f"{DIR}original_channels.tsv.gz"

# Outputs (written)
PROCESSED_TIMESERIES_PATH = f"{DIR}processed_timeseries.tsv.zip"
SCORED_CHANNELS_PATH = f"{DIR}scored_channels.tsv.zip"

In [4]:
### READS ###

# Time series table: tab-separated file, with 'datetime' converted to datetime values.
timeseries = pd.read_csv(TIMESERIES_PATH, sep='\t')
timeseries['datetime'] = pd.to_datetime(timeseries['datetime'])

# Channel table: tab-separated file, with 'join_date' converted to datetime values.
channels = pd.read_csv(CHANNELS_PATH, sep='\t')
channels['join_date'] = pd.to_datetime(channels['join_date'])

In [5]:
### CHOOSE FOCUS CHANNELS ###

# Subscriber count of each channel at its earliest time-series row:
# sort by date, then keep the first row seen for every channel.
initial_subs_per_channel = (
    timeseries.sort_values(by='datetime')
              .drop_duplicates('channel')[['channel', 'subs']]
              .rename(columns={'subs': 'initial_subs'})
)

# Attach the initial subscriber count to the channel table (inner join on 'channel').
# Any 'initial_subs' column left over from a previous run of this cell is dropped first;
# otherwise a re-run would produce 'initial_subs_x'/'initial_subs_y' and the filter
# below would fail with a KeyError.
channels = pd.merge(channels.drop(columns='initial_subs', errors='ignore'),
                    initial_subs_per_channel,
                    on='channel')

# Keep only 'Entertainment' channels whose initial subscriber count is strictly
# between 10k and 15k (both bounds are excluded).
ent_channels = channels[(channels['initial_subs'] > 10e3) &
                        (channels['initial_subs'] < 15e3) &
                        (channels['category_cc'] == 'Entertainment')]

# Keep the intersection between the chosen channels and the time series:
# first restrict the time series to the chosen channels, then restrict the chosen
# channels to those that actually have time-series rows.
channel_ids = ent_channels[['channel']].drop_duplicates()
ent_timeseries = pd.merge(timeseries, channel_ids, on='channel')
channel_ids = ent_timeseries[['channel']].drop_duplicates()
ent_channels = pd.merge(ent_channels, channel_ids, on='channel')

In [6]:
### SCORING CHANNELS ###

# NOTE: this cell overwrites 'ent_channels' and 'ent_timeseries' with merged/filtered
# versions of themselves, so it cannot be re-run on its own; re-run the
# "CHOOSE FOCUS CHANNELS" cell first to rebuild both frames.

# Relative subscriber growth of every time-series row: delta_subs / subs.
ent_timeseries['weekly_growth'] = ent_timeseries['delta_subs'] / ent_timeseries['subs']

# Average growth per channel. The column is selected *before* aggregating so that only
# 'weekly_growth' is averaged: averaging the whole frame is wasted work and raises a
# TypeError on pandas >= 2.0 as soon as the frame holds a non-numeric column.
growth_score = (
    ent_timeseries.groupby('channel')['weekly_growth']
                  .mean()
                  .rename('growth_score')
)
ent_channels = pd.merge(ent_channels, growth_score, on='channel')

# Keep only the 25% best and the 25% worst channels by growth score.
quartile_size = int(len(ent_channels) * 0.25)
top_channels = ent_channels.nlargest(quartile_size, 'growth_score')[['channel', 'growth_score']]
bottom_channels = ent_channels.nsmallest(quartile_size, 'growth_score')[['channel', 'growth_score']]
evo_channels = pd.concat([top_channels, bottom_channels]).sort_values('growth_score', ascending=False)

# Label: 1 when the score is strictly above the best score of the bottom group, else 0.
best_bottom_score = bottom_channels['growth_score'].max()
evo_channels['has_buzzed'] = (evo_channels['growth_score'] > best_bottom_score).astype('int64')

# Propagate the label: the inner joins drop every channel outside the two groups,
# both from the channel table and from the time series.
ent_channels = pd.merge(ent_channels, evo_channels.drop(columns='growth_score'), on='channel')
ent_timeseries = pd.merge(ent_timeseries, ent_channels[['channel', 'has_buzzed']], on='channel')

In [7]:
### SAVE PROCESSED DATA ###

# Both tables are written without their index, as zip-compressed files.
# NOTE(review): to_csv is called with its default separator although the output
# paths end in ".tsv.zip" -- confirm what downstream readers expect before changing it.
save_options = dict(index=False, compression={'method':'zip'})

ent_channels.to_csv(SCORED_CHANNELS_PATH, **save_options)
ent_timeseries.to_csv(PROCESSED_TIMESERIES_PATH, **save_options)