In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from time import time
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import scipy.stats as stats
from ipywidgets import interact, Dropdown, SelectionSlider, widgets
from preprocessing import apply_complete_preprocessing
from datasets import load_processed_data, update_processed_data
from bbdatasets import *

# I. Load the data

In [None]:
# decomment the next line if we need to update the preprocessed data 
# update_processed_data(verbose=True)
original_data = load_processed_data(verbose=True)

  full_bar = Bar(frac,
Loading data in chunks of 1000: 100%|██████████| 18605/18604.825 [01:19<00:00, 233.57it/s]


In [3]:
# decomment the next line if we need to update the preprocessed data 
# update_processed_bb_timeseries(verbose=True)
data = load_bb_timeseries_processed(verbose=True)

Loading data in chunks of 1000: 6it [00:00, 239.92it/s]                  

Loaded 5030 rows





- i have a dataset that gathers all channels that have suffered at least once of a bad buzz 
- GOAL : find a general method that can find the bad buzz
- HOW CAN I DO IT ? 
    - i have the list of channels with the date of the bb 
    - i need to find these dates thanks to a statistical analysis 
    - i have for each pair of channel and week index : 
        - category
        - views : Total number of views the channel had this week.
        - delta_views : Delta views obtained this week.
        - subs : Total number of subscribers the channel had this week.
        - delta_subs : Delta subscribers obtained this week.
        - number of videos : Total number of videos the channel had this week.
        - delta_videos : Delta videos obtained this week.
        - activity : number of videos posted this week 
        - view_count => number of views f viedos posted
        - like_count => number of likes on pposted video 
        - dislike_count => number of dislikes on pposted video 

BAD BUZZ INDICATORS : 
    - increase in dislike count 
    - drop in subscriber growth or loss of subscribers
    - decrease in views or view growth 
    - changes in likes/dislikes ratios : higher dislikes ratio 
    - changes in number of videos posted, fewer views despite more content 

In [4]:
nb_channels = data.reset_index()['channel'].nunique()
print(f'There are {nb_channels} channels in the new dataset')

There are 33 channels in the new dataset


# II. Statistical Analysis : anomaly detection

## A. Compute rolling averages and standard deviations

In [5]:
def moving_avg_anomaly_detection(data, metric, window_size, bound_size): 
    data[f'moving_avg_{metric}'] = data.groupby('channel')[metric].transform(lambda x: x.rolling(window_size, min_periods=1).mean())
    data[f'moving_std_{metric}'] = data.groupby('channel')[metric].transform(lambda x: x.rolling(window_size, min_periods=1).std())
    data[f'upper_bound_{metric}'] = data[f'moving_avg_{metric}'] + bound_size * data[f'moving_std_{metric}']
    data[f'lower_bound_{metric}'] = data[f'moving_avg_{metric}'] - bound_size * data[f'moving_std_{metric}']
    data[f'is_anomaly_{metric}'] = data[metric] < data[f'lower_bound_{metric}']
    return data.drop(columns=[f'moving_avg_{metric}', f'moving_std_{metric}', f'upper_bound_{metric}', f'lower_bound_{metric}'])

In [6]:
# TODO : need to play with values 
WINDOW_SIZE = 8
BOUND_SIZE = 2

df_moving_avg = data.copy()
df_moving_avg = moving_avg_anomaly_detection(df_moving_avg, 'subs', WINDOW_SIZE, BOUND_SIZE)
df_moving_avg = moving_avg_anomaly_detection(df_moving_avg, 'views', WINDOW_SIZE, BOUND_SIZE)
df_moving_avg = moving_avg_anomaly_detection(df_moving_avg, 'dislike_count', WINDOW_SIZE, BOUND_SIZE)

channels_with_subs_anomalies = df_moving_avg.groupby('channel').filter(lambda x : x['is_anomaly_subs'].sum() > 0)
channels_with_views_anomalies = df_moving_avg.groupby('channel').filter(lambda x : x['is_anomaly_views'].sum() > 0)
channels_with_dislikes_anomalies = df_moving_avg.groupby('channel').filter(lambda x : x['is_anomaly_dislike_count'].sum() > 0)

nb_channels_with_subs_anomalies = channels_with_subs_anomalies.reset_index()['channel'].nunique()
nb_channels_with_views_anomalies = channels_with_views_anomalies.reset_index()['channel'].nunique()
nb_channels_with_dislikes_anomalies = channels_with_dislikes_anomalies.reset_index()['channel'].nunique()

print(f'Number of channels with subs anomalies : {nb_channels_with_subs_anomalies}')
print(f'Number of channels with views anomalies : {nb_channels_with_views_anomalies}')
print(f'Number of channels with dislikes anomalies : {nb_channels_with_dislikes_anomalies}')

Number of channels with subs anomalies : 16
Number of channels with views anomalies : 8
Number of channels with dislikes anomalies : 7


In [7]:
def plot_moving_avg(channel):
    df_plot = df_moving_avg.xs(channel, level='channel')
    plt.figure(figsize=(10, 6))
    sns.lineplot(data=df_plot, x='week', y='subs')
    plt.scatter(df_plot[df_plot['is_anomaly_subs']].reset_index()['week'], 
                df_plot[df_plot['is_anomaly_subs']]['subs'],
                color='red', label='moving_avg_anomalies', marker='+')
    plt.title(f'Channel: {channel}')
    plt.legend()
    plt.show()

# Create a dropdown widget for selecting channels
channel_selector = widgets.SelectionSlider(
    options=df_moving_avg.index.get_level_values('channel').unique(),
    description='Channel:',
    disabled=False
)

# Display the widget and update the plot when the selection changes
widgets.interactive(plot_moving_avg, channel=channel_selector)

interactive(children=(SelectionSlider(description='Channel:', options=('UC-lHJZR3Gqxm24_Vd_AJ5Yw', 'UC0v-tlzsn…

## B. Z-score analysis

Z-scores are useful because they indicate how many standard deviations a value is from the mean 

In [81]:
def z_score_anomaly_detection(data, metric, threshold): 
    data[f'z_score_{metric}'] = data.groupby('channel')[metric].transform(lambda x : stats.zscore(x.dropna()))
    data[f'is_anomaly_{metric}'] = data[f'z_score_{metric}'] < -threshold
    print(data.xs('UCDsO-0Yo5zpJk575nKXgMVA', level='channel')[[f'z_score_{metric}', 'delta_subs']])
    return data.drop(columns=[f'z_score_{metric}'])

In [82]:
Z_SCORE_THRESHOLD = 1

df_z_score = data.copy()
df_z_score = z_score_anomaly_detection(df_z_score, 'delta_subs', Z_SCORE_THRESHOLD)
df_z_score = z_score_anomaly_detection(df_z_score, 'delta_views', Z_SCORE_THRESHOLD)

channels_with_subs_z_score_anomalies = df_z_score.groupby('channel').filter(lambda x : x['is_anomaly_delta_subs'].sum() > 1)
channels_with_views_z_score_anomalies = df_z_score.groupby('channel').filter(lambda x : x['is_anomaly_delta_views'].sum() > 1)

nb_channels_with_subs_z_score_anomalies = channels_with_subs_z_score_anomalies.reset_index()['channel'].nunique()
nb_channels_with_views_z_score_anomalies = channels_with_views_z_score_anomalies.reset_index()['channel'].nunique()
nb_channels_subs_views_anomalies = df_z_score[(df_z_score['is_anomaly_delta_subs']) & 
                                              (df_z_score['is_anomaly_delta_views'])].reset_index()['channel'].nunique()

print(f'Number of channels with subs anomalies : {nb_channels_with_subs_z_score_anomalies}')
print(f'Number of channels with views anomalies : {nb_channels_with_views_z_score_anomalies}')
print(f'Number of channels that have both subs and views anomalies : {nb_channels_subs_views_anomalies}')

      z_score_delta_subs    delta_subs
week                                  
92                   NaN           NaN
93             -0.886522   1328.250000
94             -1.267507  -1249.549223
95             -1.208361   -849.362236
96             -1.088993    -41.697917
...                  ...           ...
243             0.534616  10943.875000
244             0.486878  10620.875000
245            -0.040638   7051.625000
246            -0.614432   3169.250000
247            -1.015916    452.750000

[153 rows x 2 columns]
      z_score_delta_views    delta_subs
week                                   
92                    NaN           NaN
93              -1.118289   1328.250000
94               0.467402  -1249.549223
95              -1.112394   -849.362236
96              -1.234311    -41.697917
...                   ...           ...
243              1.369339  10943.875000
244              1.139081  10620.875000
245              0.466373   7051.625000
246              0.050000   3

In [None]:
z_score_anomalies_count = df_z_score.groupby('channel')['is_anomaly_delta_subs'].sum()
z_score_anomalies_count.columns = ['channel', 'num_anomalies']
z_score_anomalies_count

channel
UC-lHJZR3Gqxm24_Vd_AJ5Yw     2
UC0v-tlzsn0QZwJnkiaUSJVQ    23
UC1r4VtVE__5K6c_L_3Vlxxg     8
UC2e0bNZ6CzT-Xvr070VaGsw     3
UC6-NBhOCP8DJqnpZE4TNE-A     1
UCAq9s3QQVCDMvg1iWQBVtxQ     1
UCBHu7LsKiwiYViR230RtsCA    16
UCDsO-0Yo5zpJk575nKXgMVA    39
UCEHf6KUY7Zw7hlXQ7hDemwQ     7
UCJZ7f6NQzGKZnFXzFW9y9UQ     4
UCKGiTasUqLcZUuUjQiyKotw     6
UCKMugoa0uHpjUuq14yOpagw     3
UCKlhpmbHGxBE6uw9B_uLeqQ    17
UCV9_KinVpV-snHe3C3n1hvA     1
UCVJK2AT3ea5RTXNRjX_kz8A    13
UCVtFOytbRpEvzLjvqGG5gxQ     2
UCWwWOFsW68TqXE-HZLC3WIA    13
UCX6OQ3DkcsbYNE6H8uQQuVA     0
UCXhSCMRRPyxSoyLSPFxK7VA    10
UC_DptbqTndVt_Im3KkuIK5Q     0
UCcgVECVN4OKV6DH1jLkqmcA    11
UCdJdEguB1F1CiYe7OEi3SBg     1
UCdoLeDxfcGwvj_PRl7TLTzQ     4
UCiH828EtgQjTyNIMH6YiOSw     8
UClWD8su9Sk6GzZDwy9zs3_w    19
UCnEn0EUV13IR-_TK7fiIp3g     2
UCoiIt_v1D-6z75LmrdIU2aw    23
UCtVubfONoPpn4kNuuZ1h6iQ     0
UCucot-Zp428OwkyRm2I7v2Q     2
UCxJf49T4iTO_jtzWX3rW_jg    23
UCy_YiQx1t8oOgz74QIB4Jrw    13
UCzJIliq68IHSn-Kwgjeg2AQ     0


If I keep a threshold of 2 (meaning that i keeo values that are away of 2 std or more from the mean) then i can get almost all the channels and in each channel i detect more than 1 anomaly 

It would be now interesting to look at the date of the anomaly to understand effetively is there was a bad buzz

I could first plot the number of subs along the time and add a point where an anomaly was detected 

In [None]:
def plot_z_score(channel):
    df_plot = df_z_score.xs(channel, level='channel')
    plt.figure(figsize=(10, 6))
    sns.lineplot(data=df_plot, x='week', y='subs')
    plt.scatter(df_plot[df_plot['is_anomaly_delta_subs']].reset_index()['week'], 
                df_plot[df_plot['is_anomaly_delta_subs']]['subs'],
                color='red', label='z_scores_anomalies', marker='+')
    plt.title(f'Channel: {channel}')
    plt.legend()
    plt.show()

# Create a dropdown widget for selecting channels
channel_selector = widgets.SelectionSlider(
    options=df_z_score.index.get_level_values('channel').unique(),
    description='Channel:',
    disabled=False
)

# Display the widget and update the plot when the selection changes
widgets.interactive(plot_z_score, channel=channel_selector)

interactive(children=(SelectionSlider(description='Channel:', options=('UC-lHJZR3Gqxm24_Vd_AJ5Yw', 'UC0v-tlzsn…

## C. Linear regression 

# D. Delta delta derivative analysis

In [97]:
def delta_delta_anomaly_detection(data, metric, threshold):
    data[f'delta_{metric}'] = data.groupby('channel')[metric].diff()
    data[f'delta_delta_z_score_{metric}'] = data.groupby('channel')[f'delta_{metric}'].transform(lambda x : stats.zscore(x.dropna()))
    data[f'is_anomaly_delta_{metric}'] = data[f'delta_delta_z_score_{metric}'] < -threshold
    return data.drop(columns=[f'delta_{metric}', f'delta_delta_z_score_{metric}'])

DELTA_DELTA_THRESHOLD = 2

df_delta_delta = data.copy()
df_delta_delta = delta_delta_anomaly_detection(df_delta_delta, 'delta_subs', DELTA_DELTA_THRESHOLD)
df_delta_delta = delta_delta_anomaly_detection(df_delta_delta, 'delta_views', DELTA_DELTA_THRESHOLD)

channels_with_subs_delta_delta_anomalies = df_delta_delta.groupby('channel').filter(lambda x : x['is_anomaly_delta_delta_subs'].sum() > 1)
channels_with_views_delta_delta_anomalies = df_delta_delta.groupby('channel').filter(lambda x : x['is_anomaly_delta_delta_views'].sum() > 1)

nb_channels_with_subs_delta_delta_anomalies = channels_with_subs_delta_delta_anomalies.reset_index()['channel'].nunique()
nb_channels_with_views_delta_delta_anomalies = channels_with_views_delta_delta_anomalies.reset_index()['channel'].nunique()
nb_channels_subs_views_delta_delta_anomalies = df_delta_delta[(df_delta_delta['is_anomaly_delta_delta_subs']) &
                                                                (df_delta_delta['is_anomaly_delta_delta_views'])].reset_index()['channel'].nunique()

print(f'Number of channels with subs anomalies : {nb_channels_with_subs_delta_delta_anomalies}')
print(f'Number of channels with views anomalies : {nb_channels_with_views_delta_delta_anomalies}')
print(f'Number of channels that have both subs and views anomalies : {nb_channels_subs_views_delta_delta_anomalies}')

def plot_delta_delta(channel):
    df_plot = df_delta_delta.xs(channel, level='channel')
    plt.figure(figsize=(10, 6))
    sns.lineplot(data=df_plot, x='week', y='subs')
    plt.scatter(df_plot[df_plot['is_anomaly_delta_delta_subs']].reset_index()['week'],
                df_plot[df_plot['is_anomaly_delta_delta_subs']]['subs'],
                color='red', label='delta_delta_anomalies', marker='+')
    plt.title(f'Channel: {channel}')
    plt.legend()
    plt.show()

# Create a dropdown widget for selecting channels
channel_selector = widgets.SelectionSlider(
    options=df_z_score.index.get_level_values('channel').unique(),
    description='Channel:',
    disabled=False
)

# Display the widget and update the plot when the selection changes
widgets.interactive(plot_delta_delta, channel=channel_selector)

Number of channels with subs anomalies : 31
Number of channels with views anomalies : 29
Number of channels that have both subs and views anomalies : 17


interactive(children=(SelectionSlider(description='Channel:', options=('UC-lHJZR3Gqxm24_Vd_AJ5Yw', 'UC0v-tlzsn…