In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as scs
import scipy.integrate as integrate
import swifter

DATA_PATH = "../data/"

# Initial preprocessing: sliding windows

In this notebook we perform the initial preprocessing of the dataset, specifically of the `df_timeseries_en.tsv.gz` file.  

As a first step, we filtered out all the channels with 52 or fewer weekly observations (approximately one year), so that every remaining channel has enough observations to study its progress and strategy.  

Then, we implemented a sliding-window technique to define a metric called "growth_rate", which is used later to identify the channels with the fastest growth and the date at which they gained fame. This metric captures the rate at which a channel's views grew (or decreased) over the 5 weeks covered by the window: the higher "growth_rate" is, the faster the channel's views grew in that window, and the more negative it is, the more quickly the views dropped. Since we want to analyse the three months before and after a channel's rise to fame with a uniform methodology, we did not compute this metric for the first 12 and the last 14 data points of each channel.  
For its calculation, we compared the views curve over the window time period to a linear interpolation between the ends of the window by computing the area between the two curves. To be able to make comparisons between the growth rates of different channels, the metric should be independent of the views' order of magnitude. Therefore, we applied a normalization by using a coefficient that takes into account both the views scale within the window and within the whole timeseries.  



In [2]:
# Load the gzip-compressed, tab-separated channel time series; the "datetime" column
# is parsed into timestamps while reading.
timeseries_path = f"{DATA_PATH}df_timeseries_en.tsv.gz"
timeseries_df = pd.read_csv(
    timeseries_path,
    sep="\t",
    compression="gzip",
    parse_dates=["datetime"],
)
timeseries_df.head(2)

Unnamed: 0,channel,category,datetime,views,delta_views,subs,delta_subs,videos,delta_videos,activity
0,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-03,202494.555556,0.0,650.222222,0.0,5,0,3
1,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-10,394085.666667,191591.111111,1046.0,395.777778,6,1,1


In [3]:
# Load the gzip-compressed, tab-separated channel metadata table.
channels_path = f"{DATA_PATH}df_channels_en.tsv.gz"
channel_df = pd.read_csv(
    channels_path,
    sep="\t",
    compression="gzip",
)
channel_df.head(2)

Unnamed: 0,category_cc,join_date,channel,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,Gaming,2010-04-29,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,101000000,3956,3.0,2.087
1,Education,2006-09-01,UCbCmjCuTUZos6Inko4u57UQ,Cocomelon - Nursery ...,60100000,458,7.0,2.087


In [4]:
# Keep only the channels that have more than MIN_OBSERVED_WEEKS rows in the time series.
# NOTE(review): the comparison is strict, so a channel with exactly 52 observations is
# dropped as well -- confirm that this is the intended threshold.
MIN_OBSERVED_WEEKS = 52

# "size" uses pandas' built-in group-size kernel (one row count per channel, broadcast
# back to every row) instead of calling Python's len() once per channel.
idx = timeseries_df.groupby("channel")["datetime"].transform("size") > MIN_OBSERVED_WEEKS
new_ts_df = timeseries_df[idx]

In [5]:
# Drop the first 12 and the last 14 rows of every channel, keeping the original row
# labels as the index (named "index") of the result.
views_by_week = new_ts_df[["channel", "datetime", "views"]].reset_index()
trimmed_by_channel = (
    views_by_week
        .swifter
        .groupby("channel")
        .apply(lambda channel_rows: channel_rows.iloc[12:-14])
)
datetimes = trimmed_by_channel.set_index("index")

  0%|          | 0/16 [00:00<?, ?it/s]

2022-12-23 02:26:00,309	INFO worker.py:1528 -- Started a local Ray instance.


In [6]:
WINDOW_SIZE = 5

def calc_window(views):
    """Score how sharply one window of view counts departs from a straight line.

    The window is compared with the straight line joining its first and last
    values. The score is the Simpson-rule area between that line and the
    observed values, divided by the area under the line and multiplied by the
    change in views across the window. Dividing by the channel-wide views
    range is left to the caller.

    Parameters
    ----------
    views : pandas.Series or array-like of numbers
        Consecutive view counts of one window. The window length is taken from
        ``len(views)``, so the function is not tied to ``WINDOW_SIZE``.

    Returns
    -------
    pandas.Series
        One-element Series holding the score. The score is 0 when the first and
        last values are equal, when the second value lies above the line, or
        when the signed area between the line and the values is negative.
    """
    values = np.asarray(views, dtype=float)
    n_points = len(values)
    b, e = values[0], values[-1]

    if e == b:
        # avoid calculating the integral if the increase is zero
        return pd.Series([0])

    # straight line from the first to the last value of the window
    line = (e - b) * np.arange(n_points) / (n_points - 1) + b

    # the sudden change is measured as the area between the line and the actual values,
    # so the line has to be above the actual values at least at the beginning of the window
    if values[1] > line[1]:
        return pd.Series([0])

    # computed once and reused for both the signed and the absolute area
    gap = line - values

    # if the values are, on balance, above the line, we don't count the window
    if integrate.simpson(gap) < 0:
        return pd.Series([0])

    # steepness is weighted by the variation in views across the window; the division by
    # the channel-wide maximum variation is not available here and is done at a later step
    window_delta = line[-1] - line[0]
    coefficient = window_delta
    growth_steepness = integrate.simpson(np.abs(gap)) / integrate.simpson(line) * coefficient
    return pd.Series([growth_steepness])

In [7]:
# Apply calc_window over a rolling window of WINDOW_SIZE rows within each channel and
# store the result in a column named "growth_rate", indexed by the original row labels.
#
# Slice arithmetic (WINDOW_SIZE = 5):
#   * the first slice keeps positions [8:-9] of each channel;
#   * the second slice removes the first WINDOW_SIZE - 1 = 4 and the last WINDOW_SIZE = 5
#     of those rows;
#   * what remains are positions [12:-14] of each channel, i.e. the same rows that are
#     kept by an iloc[12:-14] slice.
ts = (
    new_ts_df[["channel", "views"]]
        # expose the original row labels as a regular "index" column
        .reset_index()
        .swifter
        .groupby("channel")
        # keep extra leading rows so that the first retained row has a full window behind it
        .apply(lambda group: group.iloc[13 - WINDOW_SIZE : -14 + WINDOW_SIZE])
        .set_index("index")
        .groupby("channel")
        # windows of WINDOW_SIZE consecutive rows per channel; the first WINDOW_SIZE - 1
        # rows of a channel have no full window (presumably NaN under pandas' default
        # min_periods -- confirm)
        .rolling(WINDOW_SIZE)
        .apply(calc_window)
        .reset_index()
        .groupby("channel")
        # drop the first WINDOW_SIZE - 1 and the last WINDOW_SIZE rows of each channel
        .apply(lambda group: group.iloc[WINDOW_SIZE-1:-WINDOW_SIZE])
        .set_index("index")
        # the rolled "views" column now holds the window score, not view counts
        .rename(columns={"views": "growth_rate"})
)

  0%|          | 0/16 [00:00<?, ?it/s]

  File "python\ray\_raylet.pyx", line 1142, in ray._raylet.spill_objects_handler
  File "python\ray\_raylet.pyx", line 1145, in ray._raylet.spill_objects_handler
  File "C:\Users\tomma\AppData\Roaming\Python\Python39\site-packages\ray\_private\external_storage.py", line 668, in spill_objects
    return _external_storage.spill_objects(object_refs, owner_addresses)
  File "C:\Users\tomma\AppData\Roaming\Python\Python39\site-packages\ray\_private\external_storage.py", line 305, in spill_objects
    return self._write_multiple_objects(f, object_refs, owner_addresses, url)
  File "C:\Users\tomma\AppData\Roaming\Python\Python39\site-packages\ray\_private\external_storage.py", line 149, in _write_multiple_objects
    written_bytes = f.write(payload)
OSError: [Errno 28] No space left on device
An unexpected internal error occurred while the IO worker was spilling objects: [Errno 28] No space left on device


In [8]:
# Per-channel range of views (maximum minus minimum) over the rows kept in `datetimes`,
# returned as a Series named "max_views_difference".
views_grouped = datetimes.groupby("channel")["views"]
max_views_difference = (
    views_grouped.max()
        .sub(views_grouped.min())
        .rename("max_views_difference")
)

In [9]:
# Attach to every kept row its raw window score and its channel's views range, then
# divide the score by that range so that channels of different sizes are comparable.
new_ts = (
    datetimes
        .reset_index()
        .merge(ts, on=["channel", "index"])
        .merge(max_views_difference, on="channel")
        .set_index("index")
        .assign(growth_rate=lambda frame: frame["growth_rate"] / frame["max_views_difference"])
)
new_ts.head(2)

Unnamed: 0_level_0,channel,datetime,views,growth_rate,max_views_difference
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1928804,UC--24Q3_ZQeFmgJE-Um5QZQ,2016-10-24 00:00:00,94149.0,0.0,4005293.0
1928805,UC--24Q3_ZQeFmgJE-Um5QZQ,2016-10-30 23:00:00,96850.927461,0.0,4005293.0


In [10]:
# Write the intermediate table as a gzip-compressed TSV; the index is not written.
sudden_growth_path = f"{DATA_PATH}df_timeseries_with_sudden_growth.tsv.gz"
new_ts.to_csv(
    sudden_growth_path,
    sep="\t",
    compression="gzip",
    index=False,
)

In [11]:
# Attach the computed metrics to the filtered time series. The left join keeps every row
# of new_ts_df; rows that have no computed metric get NaN in the new columns.
# validate="many_to_one" makes pandas raise a MergeError if the key columns are not unique
# in new_ts, so the join cannot silently duplicate rows of new_ts_df.
x = new_ts_df.merge(
    new_ts,
    on=["channel", "datetime", "views"],
    how="left",
    validate="many_to_one",
)
x.head(20)

Unnamed: 0,channel,category,datetime,views,delta_views,subs,delta_subs,videos,delta_videos,activity,growth_rate,max_views_difference
0,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-03 00:00:00,202494.6,0.0,650.222222,0.0,5,0,3,,
1,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-10 00:00:00,394085.7,191591.1,1046.0,395.777778,6,1,1,,
2,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-17 00:00:00,835393.8,441308.1,1501.5,455.5,6,0,1,,
3,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-24 00:00:00,1104577.0,269183.2,1750.0,248.5,6,0,0,,
4,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-31 00:00:00,1284406.0,179828.6,2008.3,258.3,6,0,0,,
5,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-08-07 00:00:00,1493380.0,208974.2,2270.2,261.9,6,0,0,,
6,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-08-14 00:00:00,1721383.0,228003.2,2531.7,261.5,6,0,0,,
7,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-08-21 00:00:00,1932405.0,211022.0,2774.6,242.9,6,0,0,,
8,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-08-28 00:00:00,2221636.0,289230.8,3220.25,445.65,6,0,0,,
9,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-09-04 00:00:00,2632813.0,411177.5,4114.285714,894.035714,6,0,0,,


In [12]:
# Sanity check: the left merge must not have added or removed rows.
len(new_ts_df) == len(x)

True

In [13]:
# Write the final merged table as a gzip-compressed TSV; the index is not written.
merged_path = f"{DATA_PATH}df_timeseries_with_sudden_growth_merged.tsv.gz"
x.to_csv(
    merged_path,
    sep="\t",
    compression="gzip",
    index=False,
)

The final dataframe is the same as the original one without the channels that had less than one year of observations and with two new columns:
- `growth_rate` : the new metric that describes the rate of views growth for each data point of each channel. Each row holds the growth rate of the 5-week window ending at that row (that week and the 4 weeks before it); the value is NaN for the data points in the first 12 and last 14 weeks of observation of each channel
- `max_views_difference` : the difference between the maximum and the minimum number of views during the time of observation of each channel (without the first 12 and last 14 weeks of observation)