In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as scs
import scipy.integrate as integrate
import swifter
import matplotlib.animation as animation

# Path prefix for the input data files. The trailing slash is required because
# the file names are appended to it directly inside f-strings when loading.
DATA_PATH = "../data/"

In [2]:
# Read the gzipped, tab-separated time-series table from DATA_PATH and parse the
# `datetime` column into timestamps while loading.
timeseries_df = pd.read_csv(
    f"{DATA_PATH}df_timeseries_en.tsv.gz",
    compression="gzip",
    parse_dates=["datetime"],
    sep='\t',
)

# Show the first two rows as a quick check of the loaded columns.
timeseries_df.head(2)

Unnamed: 0,channel,category,datetime,views,delta_views,subs,delta_subs,videos,delta_videos,activity
0,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-03,202494.555556,0.0,650.222222,0.0,5,0,3
1,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-10,394085.666667,191591.111111,1046.0,395.777778,6,1,1


In [3]:
# Read the gzipped, tab-separated channel table from DATA_PATH and parse the
# `join_date` column into timestamps while loading.
channel_df = pd.read_csv(
    f"{DATA_PATH}df_channels_en.tsv.gz",
    compression='gzip',
    parse_dates=["join_date"],
    sep='\t',
)

# Show the first two rows as a quick check of the loaded columns.
channel_df.head(2)

Unnamed: 0,category_cc,join_date,channel,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,Gaming,2010-04-29,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,101000000,3956,3.0,2.087
1,Education,2006-09-01,UCbCmjCuTUZos6Inko4u57UQ,Cocomelon - Nursery ...,60100000,458,7.0,2.087


In [26]:
WINDOW_SIZE = 5

def calc_window(views):
    """Score how sharply `views` rises inside a single window.

    The observed values are compared with the straight line that joins the
    first and the last value of the window. A window only counts as a "sudden
    increase" when the observed curve stays below that line at the second
    sample and the signed area between line and curve is non-negative.

    The baseline has as many points as `views`, so the function works for any
    window length (callers in this notebook pass `WINDOW_SIZE` samples).

    Parameters
    ----------
    views : 1-D array-like of numbers (pandas Series, list, ndarray, ...)
        Values inside the window, in chronological order, unit spacing assumed
        for the integration.

    Returns
    -------
    (line, is_valid, growth_steepness)
        * `(None, None, 0)` when the window is empty or its first and last
          values are equal (nothing to integrate).
        * `(line, False, 0)` when the window is rejected by one of the two
          checks described above; `line` is the baseline as an ndarray.
        * `(line, True, score)` otherwise, where `score` is the area between
          baseline and curve, relative to the area under the baseline, times
          the rise over the window. The caller is expected to divide the score
          by the channel's overall view range afterwards.
    """
    n_points = len(views)
    if n_points == 0:
        # nothing to measure in an empty window
        return None, None, 0

    # Work on a plain float array so the arithmetic is purely positional and
    # any array-like input is accepted.
    observed = np.asarray(views, dtype=float)
    b, e = observed[0], observed[-1]

    if e == b:
        # avoid calculating the integral if the increase is zero
        # (this also covers single-sample windows, so n_points - 1 below is never 0)
        return None, None, 0

    # straight line from the first to the last value, one point per sample
    line = (e - b) * np.arange(n_points) / (n_points - 1) + b

    # The steepness is measured as the area by which the curve stays *below*
    # the linear increase, so the curve must be under the line right after the
    # start of the window.
    if observed[1] > line[1]:
        return line, False, 0

    # if the linear increase is, overall, below the actual increase, we don't count it
    gap = line - observed
    if integrate.simpson(gap) < 0:
        return line, False, 0

    window_delta = line[-1] - line[0]
    # should be divided by max_delta, but that is not known here; the caller does it
    coefficient = window_delta
    growth_steepness = integrate.simpson(np.abs(gap)) / integrate.simpson(line) * coefficient

    return line, True, growth_steepness

In [32]:
from IPython.display import HTML

# Channel whose view curve is animated, and its rows from the time-series table.
current_channel = 'UCj-R_ePoJvWGiLOD6aDgMSg'
current_ts = timeseries_df[timeseries_df["channel"] == current_channel]

# Overall view range of the channel; used to normalise the per-window score
# and to place the score label at the top of the plot.
views_max = current_ts["views"].max()
views_min = current_ts["views"].min()
max_delta = views_max - views_min

# One wide axis, titled with the channel's display name looked up in channel_df.
fig, ax = plt.subplots(figsize=(15, 4))
fig.suptitle(f"Growth rate visualization - Channel name: \"{channel_df.loc[channel_df['channel'] == current_channel, 'name_cc'].iloc[0]}\"")

def plot(t0):
    """Draw one animation frame for the window that starts at row `t0`.

    Redraws the channel's full view curve on the shared `ax`, marks the first
    and last sample of the window with vertical lines, prints the normalised
    growth score above the middle of the window and, when `calc_window`
    returns a baseline, draws it (green if the window is valid, faded red
    otherwise). For valid windows the area between baseline and curve is
    shaded.

    Parameters
    ----------
    t0 : int
        Positional index into `current_ts` of the first row of the window.
    """
    ax.clear()
    current_ts.plot(x="datetime", y="views", ax=ax, fontsize=12)

    # Slice the window once and reuse it for every artist of this frame.
    window = current_ts.iloc[t0:t0 + WINDOW_SIZE]
    window_dates = window["datetime"]
    window_views = window["views"]

    ax.axvline(x=window_dates.iloc[0], color='k')
    ax.axvline(x=window_dates.iloc[-1], color='k')

    line, isvalid, value = calc_window(window_views)
    # Normalise by the channel's overall view range; a channel whose views
    # never change has a zero range, in which case the score is reported as 0.
    score = value / max_delta if max_delta else 0.0

    colour = 'g' if isvalid else 'r'
    opacity = 1 if isvalid else 0.5

    # Centre the label on the middle sample of the window, whatever its size.
    ax.text(window_dates.iloc[len(window) // 2], views_max, f"{score:.2f}", fontsize=12, color=colour, alpha=opacity, horizontalalignment="center", verticalalignment="center")

    if line is not None:
        ax.plot(window_dates, line, color=colour, alpha=opacity)
        if isvalid:
            ax.fill_between(window_dates, line, window_views, color='g', alpha=0.1)

# One frame per window start. The last valid start is
# len(current_ts) - WINDOW_SIZE (that window ends on the final row), so the
# exclusive upper bound of the range needs the +1; without it the final window
# is never rendered.
anim = animation.FuncAnimation(fig, plot, frames=range(0, len(current_ts) - WINDOW_SIZE + 1), interval=500)

# Close this figure explicitly (presumably so that only the rendered video is
# shown as the cell output, not an additional static figure).
plt.close(fig)
HTML(anim.to_html5_video())