In [1]:
import numpy as np
import pandas as pd

import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.stats import diagnostic

from scipy import stats
import scipy.fft as sf

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots

pd.options.plotting.backend = "plotly" 

In [2]:
def ft(df, metric):
    N = df.shape[0]

    # Fourier transform
    fft = np.abs(sf.rfft(df[metric].to_numpy()))/N
    freqs = sf.rfftfreq(N, d=0.1)[::-1]

    return freqs, fft

# ***Channels DF***

In [3]:
channels_df = pd.read_csv('data/df_channels_en.tsv.gz', sep='\t', compression='infer')
channels_df.head()

Unnamed: 0,category_cc,join_date,channel,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,Gaming,2010-04-29,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,101000000,3956,3.0,2.087
1,Education,2006-09-01,UCbCmjCuTUZos6Inko4u57UQ,Cocomelon - Nursery ...,60100000,458,7.0,2.087
2,Entertainment,2006-09-20,UCpEhnqL0y41EpW2TvWAHD7Q,SET India,56018869,32661,8.0,2.087
3,Howto & Style,2016-11-15,UC295-Dw_tDNtZXFeAPAW6Aw,5-Minute Crafts,60600000,3591,9.0,2.087
4,Sports,2007-05-11,UCJ5v_MCY6GNUBTO8-D3XoAg,WWE,48400000,43421,11.0,2.087


In [4]:
channels_df.groupby('category_cc').mean()

Unnamed: 0_level_0,subscribers_cc,videos_cc,subscriber_rank_sb,weights
category_cc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Autos & Vehicles,111158.375169,682.188664,396373.648043,7.680089
Comedy,432108.482347,287.109371,299991.221927,6.782554
Education,268202.63514,552.903883,327824.047033,6.868359
Entertainment,351383.375321,645.804148,323417.997255,6.903652
Film and Animation,228242.992727,325.526109,371171.466182,7.608686
Gaming,202022.438068,750.962568,375404.958,7.745133
Howto & Style,233022.904842,392.432337,322855.754779,6.834346
Music,292134.635948,666.219518,366227.229524,7.420399
News & Politics,294673.487406,7844.407424,318111.243482,6.788842
Nonprofits & Activism,94647.703818,902.767802,436178.927761,8.362881


In [5]:
px.scatter(channels_df.sample(1000), x='subscribers_cc', y='weights', log_x=True)

### *Interesting Subgroups*

In [6]:
top_10_channels = channels_df.sort_values(by='subscribers_cc', ascending=False).iloc[:10]
top_10_channels

Unnamed: 0,category_cc,join_date,channel,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
55,Music,2006-03-13,UCq-Fj5jknLsUf-MWSy4_brA,T-Series,112139463,13839,102.0,2.087
0,Gaming,2010-04-29,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,101000000,3956,3.0,2.087
3,Howto & Style,2016-11-15,UC295-Dw_tDNtZXFeAPAW6Aw,5-Minute Crafts,60600000,3591,9.0,2.087
1,Education,2006-09-01,UCbCmjCuTUZos6Inko4u57UQ,Cocomelon - Nursery ...,60100000,458,7.0,2.087
2,Entertainment,2006-09-20,UCpEhnqL0y41EpW2TvWAHD7Q,SET India,56018869,32661,8.0,2.087
4,Sports,2007-05-11,UCJ5v_MCY6GNUBTO8-D3XoAg,WWE,48400000,43421,11.0,2.087
5,Entertainment,2007-01-15,UCIwFjwMjI0y7PDBVEO9-bkQ,Justin Bieber,46574085,134,12.0,2.087
7,Sports,2009-03-17,UCRijo3ddMTht_IHyNSNXpNQ,Dude Perfect,45800000,213,14.0,2.087
6,Music,2014-03-12,UCFFbwnve3yF62-tVXkTyHqg,Zee Music Company,43451109,4241,13.0,2.087
9,Music,2006-08-08,UC0C-w0YjGpqDXGB8IHb662A,Ed Sheeran,42000000,154,17.0,2.087


# ***Timeseries***

In [31]:
timeseries_df = pd.read_csv('data/df_timeseries_en.tsv.gz', sep='\t', compression='infer')
timeseries_df['datetime'] = pd.to_datetime(timeseries_df['datetime'])
timeseries_df.head()

Unnamed: 0,channel,category,datetime,views,delta_views,subs,delta_subs,videos,delta_videos,activity
0,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-03,202494.6,0.0,650.222222,0.0,5,0,3
1,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-10,394085.7,191591.111111,1046.0,395.777778,6,1,1
2,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-17,835393.8,441308.083333,1501.5,455.5,6,0,1
3,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-24,1104577.0,269183.25,1750.0,248.5,6,0,0
4,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-31,1284406.0,179828.6,2008.3,258.3,6,0,0


In [32]:
mean_timeseries = timeseries_df.groupby('datetime').mean().reset_index()
mean_timeseries.head()

Unnamed: 0,datetime,views,delta_views,subs,delta_subs,videos,delta_videos,activity
0,2015-01-05,14455690.0,0.0,43823.5,0.0,710.0,0.0,10.0
1,2015-01-12,79990700.0,159754.659722,115951.47963,126.204167,307.0,0.833333,4.166667
2,2015-01-19,68884110.0,320650.937992,99979.8794,524.611146,263.428571,0.285714,2.0
3,2015-01-26,69265670.0,433905.663195,100398.323796,445.512698,264.285714,0.857143,1.285714
4,2015-02-02,69619670.0,406342.946529,100821.772006,450.519048,265.428571,1.142857,2.142857


In [33]:
mean_timeseries.plot.line(x='datetime', y='delta_subs', log_y=True)

In [34]:
stat = 'subs'

N = mean_timeseries.shape[0]

# Fourier transform
fft = sf.rfft(mean_timeseries[stat].to_numpy())
freqs = sf.rfftfreq(N, d=0.1)

px.bar(x=freqs[::-1], y=abs(fft)/N, title=f'Mean {stat} FT')

### *Global Analysis*

##### ***Per Category***

In [35]:
# Parameters 
metric = 'delta_views'
categories_nb = timeseries_df.category.unique().shape[0]

In [36]:
fig = make_subplots(rows=categories_nb, cols=2, subplot_titles=('Timeseries', 'Fourier Transform'))

for i, (category, category_df) in enumerate(timeseries_df.groupby(['category', 'datetime']).mean().reset_index().groupby('category')):
    # Timeseries Plot
    fig.add_trace(
        go.Scatter(x=pd.to_datetime(category_df.datetime), y=category_df[metric]),
        row=i+1, col=1
    )
    
    # Computing Fourier Transform
    freqs, fft = ft(category_df, metric)

    # FT Plot
    fig.add_trace(
        go.Bar(x=freqs, y=fft),
        row=i+1, col=2
    )

    fig.update_yaxes(title_text=f'{category}', row=i+1, col=1)


fig.update_layout(height=2500, width=1000, title_text=f"{metric} timeseries and its FT", showlegend=False)
fig.show()

##### ***Per Channel***

In [45]:
gaming_channels = channels_df[channels_df['category_cc'] == 'Gaming']

In [65]:
# Parameters 
sample_channels = gaming_channels.sort_values(by='subscriber_rank_sb').iloc[0:10].channel.unique()
sample_df = timeseries_df[timeseries_df['channel'].isin(sample_channels)]
sample_df = sample_df[(sample_df['datetime'] >= '2017-01-01') & (sample_df['datetime'] <= '2020-01-01')]
sample_df.shape

(1435, 10)

In [66]:
N = sample_channels.shape[0]
fig = make_subplots(rows=N, cols=2, shared_xaxes=True, subplot_titles=('Delta Subs', 'Delta Views'))

for i, (channel, channel_df) in enumerate(sample_df.groupby('channel')):
    # Timeseries Plot
    fig.add_trace(
        go.Scatter(x=pd.to_datetime(channel_df.datetime), y=channel_df['delta_subs']),
        row=i+1, col=1
    )

    # Timeseries Plot
    fig.add_trace(
        go.Scatter(x=pd.to_datetime(channel_df.datetime), y=channel_df['delta_views']),
        row=i+1, col=2
    )

    channel_name = channels_df[channels_df.channel == channel]['name_cc'].values[0]
    fig.update_yaxes(title_text=f'{channel_name}', row=i+1, col=1)


fig.update_layout(height=2000, width=1000, title_text=f"{metric} timeseries and its FT", showlegend=False)
fig.show()

In [70]:
delta_views_corr = np.zeros(shape=(N,N))
delta_subs_corr = np.zeros(shape=(N,N))

for i, (channel_a, channel_a_df) in enumerate(sample_df.groupby('channel')):
    for j, (channel_b, channel_b_df) in enumerate(sample_df.groupby('channel')):   
        channel_name_a = channels_df[channels_df.channel == channel_a]['name_cc'].values[0]
        channel_name_b = channels_df[channels_df.channel == channel_b]['name_cc'].values[0]
        print('-'*30)
        print(f"{channel_name_a}-{channel_name_b}")
        try:
            dv_corr = stats.pearsonr(channel_a_df['delta_views'], channel_b_df['delta_views'])
            delta_views_corr[i,j] = round(dv_corr[0], 2)
            print(f"Delta views Corr. : {dv_corr}")

            ds_corr = stats.pearsonr(channel_a_df['delta_subs'], channel_b_df['delta_subs'])
            delta_subs_corr[i,j] = round(ds_corr[0], 2)
            print(f"Delta subs Corr. : {ds_corr}")
        except:
            pass

------------------------------
PewDiePie-PewDiePie
Delta views Corr. : (1.0, 0.0)
Delta subs Corr. : (1.0, 0.0)
------------------------------
PewDiePie-Jelly
Delta views Corr. : (0.7054988417881914, 5.497576006157716e-23)
Delta subs Corr. : (0.6928028560627637, 6.589993024088139e-22)
------------------------------
PewDiePie-Markiplier
------------------------------
PewDiePie-Ninja
Delta views Corr. : (-0.1317479235075975, 0.11547229443338754)
Delta subs Corr. : (-0.10268468866614383, 0.2206817864094931)
------------------------------
PewDiePie-Marshmello
------------------------------
PewDiePie-VanossGaming
------------------------------
PewDiePie-DanTDM
Delta views Corr. : (0.21671007788004182, 0.009082424562368144)
Delta subs Corr. : (-0.3049933122250308, 0.0002016160174737867)
------------------------------
PewDiePie-Ali-A
Delta views Corr. : (-0.14995207236286384, 0.0728292606623543)
Delta subs Corr. : (-0.08709832598797937, 0.29924915285512843)
------------------------------
PewD

In [71]:
print(delta_views_corr.shape)

(10, 10)


In [69]:
channels_name = [channels_df[channels_df.channel == channel]['name_cc'].values[0] for channel in sample_channels]

fig = px.imshow(delta_views_corr, 
                x=channels_name,
                y=channels_name,
                text_auto=True)

fig.show()