In [1]:
### IMPORTS ###
# Third-party libraries used by this notebook.
import pandas as pd                        # pandas for data analysis
# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole
# notebook; pandas' own default for this option is 'warn'.
pd.options.mode.chained_assignment = None  # default='warn'
import matplotlib.pyplot as plt            # matplotlib for data visualisation (no plotting call appears in the cells visible here)

In [2]:
### PATHS ###

# Relative folder that holds both the raw inputs and the processed outputs.
DIR = "../data/"

# Inputs: the original timeseries and channel tables (.tsv.gz files).
READ_TIMESERIES_PATH = f"{DIR}original_timeseries.tsv.gz"
READ_CHANNELS_PATH = f"{DIR}original_channels.tsv.gz"

# Outputs: the Entertainment-only subsets (.tsv.zip files).
WRITE_TIMESERIES_PATH = f"{DIR}ent_timeseries.tsv.zip"
WRITE_CHANNELS_PATH = f"{DIR}ent_channels.tsv.zip"

In [3]:
### READS ###

# Timeseries table: tab-separated file, with 'datetime' parsed into pandas datetimes.
timeseries = pd.read_csv(READ_TIMESERIES_PATH, sep='\t')
timeseries['datetime'] = pd.to_datetime(timeseries['datetime'])

# Channel table: tab-separated file, with 'join_date' parsed into pandas datetimes.
channels = pd.read_csv(READ_CHANNELS_PATH, sep='\t')
channels['join_date'] = pd.to_datetime(channels['join_date'])

In [4]:
### CHOOSE FOCUS CHANNELS ###

# Restrict the channel table to the Entertainment category.
is_entertainment = channels['category_cc'] == 'Entertainment'
ent_channels = channels.loc[is_entertainment]

# Keep only the intersection between the chosen channels and the timeseries.
# Step 1: narrow the timeseries to rows whose channel id is an Entertainment channel
# (inner merge on the shared 'channel' column).
channel_ids = ent_channels[['channel']].drop_duplicates()
ent_timeseries = pd.merge(timeseries, channel_ids)

# Step 2: narrow the channels to the ids that actually appear in the timeseries.
channel_ids = ent_timeseries[['channel']].drop_duplicates()
ent_channels = pd.merge(ent_channels, channel_ids)

In [5]:
### SCORING CHANNELS ###
# Scores every channel by subscribers gained per day of activity, keeps the top
# and bottom fraction `p` of channels, and flags the top group with has_buzzed=1.
#
# NOTE: this cell reassigns ent_channels and ent_timeseries (it adds columns and
# drops rows), so it is not idempotent: rebuild both frames before re-running it.
p = 0.25  # fraction of channels kept at each end (0.25 = quartile); keep <= 0.5 so the groups cannot overlap

# Take the most recent timeseries row of each channel: after sorting newest
# first, drop_duplicates keeps the first (i.e. latest) row per channel.
ent_channels_last_data = (
    ent_timeseries
    .sort_values(by=['datetime'], ascending=False)
    .drop_duplicates('channel')
    .rename(columns={'datetime': 'last_date', 'subs': 'last_subs'})
)

# Attach the last observation date and subscriber count to each channel.
ent_channels = pd.merge(
    ent_channels,
    ent_channels_last_data[['channel', 'last_date', 'last_subs']],
    on='channel',
)

# Activity period of each channel (a timedelta; its whole days are used below).
ent_channels['period'] = ent_channels.last_date - ent_channels.join_date

# Growth score = last subscriber count per day of activity.
# A rate is only defined over a positive number of days: a 0-day period would
# yield +/-inf (which survives notna() and would pollute the top group) and a
# negative period a meaningless negative rate, so both are turned into NaN here
# and removed by the NaN filter below.
period_days = ent_channels.period.dt.days
ent_channels['growth_score'] = ent_channels.last_subs / period_days.where(period_days > 0)

# Sort channels by growth score (best first) and drop undefined scores.
ent_channels = ent_channels.sort_values(by=['growth_score'], ascending=False)
ent_channels = ent_channels[ent_channels['growth_score'].notna()]

# Keep only the top and the bottom fraction `p` of channels.
n_selected = int(len(ent_channels) * p)
score_columns = ['channel', 'growth_score']
top_channels = ent_channels.nlargest(n_selected, 'growth_score')[score_columns]
bottom_channels = ent_channels.nsmallest(n_selected, 'growth_score')[score_columns]

# Label by group membership (1 = top group, 0 = bottom group) rather than by
# comparing scores with the bottom group's maximum, which mislabels top-group
# channels whose score ties with that maximum.
evo_channels = pd.concat([
    top_channels.assign(has_buzzed=1),
    bottom_channels.assign(has_buzzed=0),
]).sort_values('growth_score', ascending=False)

# Propagate the label; the inner merges also drop every channel (and its
# timeseries rows) that is in neither group.
ent_channels = pd.merge(ent_channels, evo_channels[['channel', 'has_buzzed']], on='channel')
ent_timeseries = pd.merge(ent_timeseries, ent_channels[['channel', 'has_buzzed']], on='channel')

In [6]:
### SAVE PROCESSED DATA ###

# Write both processed tables as zip-compressed files, without the index column.
# NOTE(review): the targets are named *.tsv.zip but to_csv is called with its
# default separator (a comma), while the inputs were read with sep='\t' --
# confirm which separator downstream readers expect.
zip_compression = {'method':'zip'}
for frame, destination in (
    (ent_channels, WRITE_CHANNELS_PATH),
    (ent_timeseries, WRITE_TIMESERIES_PATH),
):
    frame.to_csv(destination, index=False, compression=zip_compression)