In [1]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import pandas as pd
import numpy as np
from ipywidgets import interact, Dropdown, SelectionSlider


# Matplotlib text sizes used by every figure in this notebook: all text
# elements share one size, except the axes title which is slightly larger.
params = {
    "axes.titlesize": 14,
    **dict.fromkeys(
        [
            "axes.labelsize",
            "font.size",
            "legend.fontsize",
            "xtick.labelsize",
            "ytick.labelsize",
        ],
        12,
    ),
    # TODO: add a "font.family" entry
    # TODO: "text.usetex": True is currently left disabled
}

# Register the overrides in matplotlib's global runtime configuration.
mpl.rcParams.update(params)

import sys
# Local Modules
# Put '../' at the front of the module search path so that it is searched
# first on import. The path is relative, so it is resolved against the
# current working directory of the kernel -- presumably the folder holding
# this notebook; confirm if the notebook is launched from elsewhere.
sys.path.insert(0, '../')

# I. Load Data 

In [2]:
# Load only the four columns this notebook works with; the file is
# tab-separated and any compression is inferred by pandas.
timeseries_path = "./../data/df_timeseries_en.tsv"
df_timeseries = pd.read_csv(
    timeseries_path,
    sep="\t",
    compression='infer',
    usecols=['datetime', 'channel', 'views', 'subs'],
)

# The 'datetime' column is read as text; convert it to pandas datetimes.
df_timeseries['datetime'] = pd.to_datetime(df_timeseries['datetime'])

In [3]:
# TODO partial preprocessing : all dates do not match, need to set them to the start of each week
# Shift every timestamp forward by one hour (so 23:00:00 rolls over to
# 00:00:00 of the following day), then floor to the day so that every row
# carries a 00:00:00 time and the days all match.
df_timeseries['datetime'] = (
    df_timeseries['datetime'] + pd.DateOffset(hours=1)
).dt.floor('d')

# Quick summary of the dataset after the date normalisation.
n_channels = df_timeseries['channel'].nunique()
n_entries = df_timeseries.shape[0]
first_date = df_timeseries['datetime'].min()
last_date = df_timeseries['datetime'].max()
n_timestamps = df_timeseries['datetime'].nunique()

print("there are", n_channels, "unique channels among the", n_entries, "entries")
print("from", first_date, "to", last_date, ":", n_timestamps, "unique timestamps")

df_timeseries

there are 133516 unique channels among the 18872499 entries
from 2015-01-05 00:00:00 to 2019-09-30 00:00:00 : 248 unique timestamps


Unnamed: 0,channel,datetime,views,subs
0,UCBJuEqXfXTdcPSbGO9qqn1g,2017-07-03,2.024946e+05,650.222222
1,UCBJuEqXfXTdcPSbGO9qqn1g,2017-07-10,3.940857e+05,1046.000000
2,UCBJuEqXfXTdcPSbGO9qqn1g,2017-07-17,8.353938e+05,1501.500000
3,UCBJuEqXfXTdcPSbGO9qqn1g,2017-07-24,1.104577e+06,1750.000000
4,UCBJuEqXfXTdcPSbGO9qqn1g,2017-07-31,1.284406e+06,2008.300000
...,...,...,...,...
18872494,UC0UeVA9YHpOEr_Ng442xiRw,2019-09-02,6.012938e+06,61268.611111
18872495,UC0UeVA9YHpOEr_Ng442xiRw,2019-09-09,6.244579e+06,62631.666667
18872496,UC0UeVA9YHpOEr_Ng442xiRw,2019-09-16,6.480902e+06,64010.000000
18872497,UC0UeVA9YHpOEr_Ng442xiRw,2019-09-23,6.745317e+06,65480.000000


# II. Explore Data 

## 1. Subscriber Analysis 

In [4]:
# Relative subscriber loss (as a fraction of the previous row's count) at or
# beyond which a row is flagged as a bad buzz.
THRESHOLD_LOSS_SUBS = 0.1

# Order rows by channel, then chronologically, so that consecutive rows of a
# channel are consecutive data points.
df_timeseries = df_timeseries.sort_values(by=['channel', 'datetime'])

# Change in subscribers since the channel's previous row (NaN on the first
# row of every channel).
df_timeseries['delta_subs'] = df_timeseries.groupby('channel')['subs'].diff()

# Change relative to the previous row's subscriber count. The shift is not
# grouped by channel, but delta_subs is NaN on the first row of each channel,
# so the ratio is NaN there too and never mixes two channels.
previous_subs = df_timeseries['subs'].shift(1)
df_timeseries['percent_loss'] = df_timeseries['delta_subs'].div(previous_subs)

# Flag rows whose relative loss reaches the threshold (NaN compares False).
df_timeseries['bad_buzz'] = df_timeseries['percent_loss'].le(-THRESHOLD_LOSS_SUBS)

# Keep only the flagged rows and display them.
bad_buzz_events = df_timeseries.loc[df_timeseries['bad_buzz']]
bad_buzz_events


Unnamed: 0,channel,datetime,views,subs,delta_subs,percent_loss,bad_buzz
1911411,UC-DuRqsBQOEk_5o1q4Ze-Fg,2017-08-21,4.293630e+07,174143.000000,-30793.375000,-0.150258,True
3455279,UC-KGhpo5BmmBx6chQm32Ryg,2016-06-27,1.463740e+06,1286.555556,-360.888889,-0.219060,True
16891890,UC-KdughvLV6oHGKiNE31fQw,2016-10-03,3.880000e+02,4677.000000,-4711.000000,-0.501811,True
4121864,UC-QDukb9C5mbZIZTx-Ip81A,2018-01-29,1.028849e+01,4.000000,-0.755208,-0.158817,True
5393558,UC-QPGqV-_p6B4IYLFfOwekA,2018-12-17,4.407738e+05,1618.291667,-883.578125,-0.353167,True
...,...,...,...,...,...,...,...
12404612,UCzYxGD8ze4-865FK7JNaMag,2017-08-21,7.947234e+06,49344.000000,-5492.875000,-0.100168,True
10081373,UCzk0mdDVDMqAJP4_pzTy9Tw,2017-08-21,1.934067e+06,75406.250000,-11823.500000,-0.135544,True
7958600,UCzlTGvOsTFGzIdWL4B8fkRg,2017-08-14,3.177713e+06,14313.444444,-2523.111111,-0.149859,True
9533235,UCzqmBflQBorUStH5b4V8bJg,2018-08-13,3.853750e+02,7.125000,-0.875000,-0.109375,True


In [5]:
# Distinct channels that have at least one flagged row, in order of first
# appearance in bad_buzz_events.
bad_buzz_channels = pd.unique(bad_buzz_events['channel'])
print(f"With a threshold of {THRESHOLD_LOSS_SUBS*100}% there are {bad_buzz_channels.size} that suffered from a bad buzz")

With a threshold of 10.0% there are 1176 that suffered from a bad buzz


In [6]:
# Show the flagged rows of the first channel listed in bad_buzz_channels.
bad_buzz_events.loc[bad_buzz_events['channel'].eq(bad_buzz_channels[0])]

Unnamed: 0,channel,datetime,views,subs,delta_subs,percent_loss,bad_buzz
1911411,UC-DuRqsBQOEk_5o1q4Ze-Fg,2017-08-21,42936302.75,174143.0,-30793.375,-0.150258,True


In [7]:
def plot_channel(channel):
    """Plot one channel's subscriber count over time and mark its bad buzz events.

    Reads the notebook-level frames ``df_timeseries`` and ``bad_buzz_events``
    and shows the figure; nothing is returned.

    Parameters
    ----------
    channel
        Value matched against the 'channel' column of both frames.
    """
    # Select the rows of this channel once, for the line and for the markers.
    channel_history = df_timeseries[df_timeseries['channel'] == channel]
    channel_events = bad_buzz_events[bad_buzz_events['channel'] == channel]

    _, ax = plt.subplots(figsize=(15, 5))

    # Subscriber count over time.
    sns.lineplot(data=channel_history, x='datetime', y='subs', label='Subscribers', ax=ax)

    # Red markers on the rows flagged as bad buzz events.
    ax.scatter(
        channel_events['datetime'],
        channel_events['subs'],
        color='red',
        label='Bad Buzz Event',
    )

    ax.set_xlabel("Date")
    ax.set_ylabel("Subscribers")
    ax.set_title(f"Subscriber Trends for {channel}")
    ax.legend()
    # `ax` is the current axes, so this rotates its x tick labels.
    plt.xticks(rotation=45)
    plt.show()

# Create an interactive slider over the channels that suffered a bad buzz.
# `continuous_update=False` belongs on the widget: keyword arguments given to
# `interact` are matched against the parameters of `plot_channel`, which has
# no `continuous_update` parameter, so the flag would be ignored there. On the
# slider it makes the plot redraw only when the handle is released.
# The trailing semicolon hides the repr of the function `interact` returns.
interact(
    plot_channel,
    channel=SelectionSlider(
        options=bad_buzz_channels,
        description='Channel:',
        continuous_update=False,
    ),
);

interactive(children=(SelectionSlider(description='Channel:', options=('UC-DuRqsBQOEk_5o1q4Ze-Fg', 'UC-KGhpo5B…

<function __main__.plot_channel(channel)>

## 2. Views Analysis 

We want to understand whether there is a correlation between the loss of subscribers and the number of views.
Do bad buzzes influence the number of views? Usually, bad buzzes increase the number of views of the channel.
Can we confirm this hypothesis?

- Calculate the change in subscribers and views during bad buzz periods
- Statistical analysis of correlation:
    - Analyze the correlation between the magnitude of subscriber loss and the change in views during these periods
    - If bad buzz events do indeed boost views, we should see a positive correlation between subscriber loss and increase in views
- Hypothesis testing:
    - Conduct a statistical test (paired t-test) to determine whether there is a significant increase in views following a bad buzz event