# Making the decline detection better

Building on top of our findings in milestone 2, we find a technique to better detect declines.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.io as pio
import tqdm
import copy
import csv

from src.data.dataloader_functions import *
from src.utils.results_utils import *

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

# Preprocess the data

In [None]:
# Launch the preprocessing pipeline through bash
import subprocess

# Location of the pipeline script, relative to the current working directory
script_path = "./src/scripts/preprocessing_pipeline.sh"

pipeline_command = ["bash", script_path]

try:
    # check=True turns a non-zero exit status into CalledProcessError;
    # text=True + capture_output=True give us stdout/stderr as strings.
    result = subprocess.run(pipeline_command, check=True, text=True, capture_output=True)
except subprocess.CalledProcessError as err:
    # The script failed: show what it wrote on stderr
    print("Error running the script:")
    print(err.stderr)
else:
    # The script succeeded: show what it wrote on stdout
    print(result.stdout)

## Load the data:

In [None]:
# Processed dataset returned by the project loader; it is the input of the
# rolling-growth-rate computation below.
original_data = load_processed_data(verbose=True)

Re-compute the rolling growth rate and store it to file

/!\ Note: Running the following cell is very time-consuming and requires around 16GB of memory. Run it only if you don't already have the file `df_with_rgr_new.tsv` in the `data` folder.

In [None]:
ROLLING_WINDOW = 20  # Rolling window for the growth rate, in rows per channel (5 months by default, could be changed in the future)

# Compute the rolling mean of `delta_subs` channel by channel.
# The per-channel frames are collected in a list and concatenated once at the
# end: calling pd.concat inside the loop re-copies everything accumulated so
# far on every iteration, which is quadratic in the number of channels.
frames_with_rgr = []

for _, group in tqdm(original_data.groupby('channel'), desc="Processing channels"):
    # min_periods=ROLLING_WINDOW: the first ROLLING_WINDOW - 1 rows of each channel get NaN
    rolling_mean = group['delta_subs'].rolling(ROLLING_WINDOW, min_periods=ROLLING_WINDOW).mean()
    # `assign` returns a new frame, so nothing is written onto a groupby slice
    frames_with_rgr.append(group.assign(rolling_growth_rate=rolling_mean))

result = pd.concat(frames_with_rgr)

# Gap between the actual growth and its rolling mean (negative = growing slower than usual)
result['growth_diff'] = result['delta_subs'] - result['rolling_growth_rate']

# Cache the result on disk; the index is written too so that it can be restored on load
result.to_csv('data/df_with_rgr_new.tsv', sep='\t', index=True)

Load the data with the rolling growth rate

In [3]:
# Read the cached frame back from disk and index it by (channel, week)
df_with_rgr_new = pd.read_csv('data/df_with_rgr_new.tsv', sep='\t').set_index(['channel', 'week'])

# Flag every row whose growth is below its rolling growth rate, i.e. growth_diff < 0.
# Rows with a NaN growth_diff compare as False and are therefore not flagged.
df_with_rgr_new['decline_event_detected'] = df_with_rgr_new['growth_diff'].lt(0)

In [None]:
# Turn the (channel, week) index back into columns and group the rows per channel
df_with_rgr_grouped = df_with_rgr_new.reset_index().groupby('channel')

n_channels = len(df_with_rgr_grouped.groups)
print(f'Number of channels : {n_channels}')

In [None]:
# Map each channel id to the list of its decline events, stored as
# (start_week, end_week) tuples. `end_week` stays None while the event is
# still open, i.e. when the channel's data ends during the decline.
decline_events = {}

# Channel whose start/end transitions are echoed to stdout as a sanity check
DEBUG_CHANNEL = 'UC-lHJZR3Gqxm24_Vd_AJ5Yw'

for channel in tqdm(df_with_rgr_grouped.groups.keys(), desc="Processing channels"):
    channel_data = df_with_rgr_grouped.get_group(channel)

    # Extract both columns once: indexing a plain array is far cheaper than
    # calling `.iloc` on a Series for every single row.
    in_decline = channel_data['decline_event_detected'].to_numpy()
    weeks = channel_data['week'].to_numpy()

    # Walk over consecutive rows and react to the flag flipping
    for i in range(1, len(in_decline)):
        if in_decline[i] and not in_decline[i-1]:
            # not-declining -> declining: a new event starts at this week
            if channel not in decline_events:
                decline_events[channel] = []
            if channel == DEBUG_CHANNEL:
                print(f'Channel {channel} has a decline event that starts.')
            decline_events[channel].append((weeks[i], None))
        if (not in_decline[i]) and in_decline[i-1]:
            # declining -> not-declining: close the event that is currently open.
            # If the channel was already declining on its very first row, no start
            # was ever recorded; skip that end instead of failing with a KeyError.
            open_events = decline_events.get(channel)
            if not open_events or open_events[-1][1] is not None:
                continue
            open_events[-1] = (open_events[-1][0], weeks[i])
            if channel == DEBUG_CHANNEL:
                print(f'Channel {channel} has a decline event that ends.')


In [7]:
# Keep only the closed events: an event whose end week is still None never
# finished within the observed data and has no measurable duration.
decline_events_no_None = {
    channel_id: [event for event in events if event[1] is not None]
    for channel_id, events in decline_events.items()
}

# Re-shape every (start_week, end_week) event into ((start_week, end_week), duration),
# where duration = end_week - start_week. Fresh lists and tuples are built, so
# `decline_events_no_None` is left untouched.
decline_events_with_duration = {
    channel_id: [((first_week, last_week), last_week - first_week) for first_week, last_week in events]
    for channel_id, events in decline_events_no_None.items()
}

In [None]:
# Gather the duration of every decline event, all channels together
duration_list = np.array([
    duration
    for events in decline_events_with_duration.values()
    for _span, duration in events
])

# Summary statistics of the durations
print(f'Mean duration of decline events: {np.mean(duration_list)}')
print(f'Median duration of decline events: {np.median(duration_list)}')
print(f'Min duration of decline events: {np.min(duration_list)}')
print(f'Max duration of decline events: {np.max(duration_list)}')

# Histogram (with KDE overlay) of the durations
_, ax_durations = plt.subplots(figsize=(10, 6))
sns.histplot(duration_list, bins=20, kde=True, ax=ax_durations)
ax_durations.set_title('Distribution of the duration of decline events')
ax_durations.set_xlabel('Duration (weeks)')
ax_durations.set_ylabel('Count')
plt.show()

In [None]:
# Same histogram, displayed on a logarithmic x-axis to un-skew the distribution.
# Only the axis is switched to log scale: the 20 bins are still computed linearly.
_, ax_durations_log = plt.subplots(figsize=(10, 6))
sns.histplot(duration_list, bins=20, kde=True, ax=ax_durations_log)
ax_durations_log.set_title('Distribution of the duration of decline events')
ax_durations_log.set_xlabel('Duration (weeks)')
ax_durations_log.set_ylabel('Count')
ax_durations_log.set_xscale('log')
plt.show()

In [10]:
# Minimum duration for a detected decline to count as an effective decline event
DECLINE_MIN_DURATION = 8

# Per channel, keep the events lasting at least DECLINE_MIN_DURATION.
# A channel whose events are all too short stays in the dict with an empty list.
de_filtered_on_duration = {}
for channel_id, events in decline_events_with_duration.items():
    de_filtered_on_duration[channel_id] = [
        event for event in events if event[1] >= DECLINE_MIN_DURATION
    ]

In [None]:
# Total number of events that survived the duration filter, all channels together
n_long_events = sum(len(events) for events in de_filtered_on_duration.values())
print(f'There are {n_long_events} decline events with duration of at least {DECLINE_MIN_DURATION} weeks.')

In [None]:
# Same analysis as before, restricted to the events that passed the duration filter
duration_list_filtered = np.array([
    duration
    for events in de_filtered_on_duration.values()
    for _span, duration in events
])

# Summary statistics of the remaining durations
print(f'Mean duration of decline events: {np.mean(duration_list_filtered)}')
print(f'Median duration of decline events: {np.median(duration_list_filtered)}')
print(f'Min duration of decline events: {np.min(duration_list_filtered)}')
print(f'Max duration of decline events: {np.max(duration_list_filtered)}')

# Histogram (with KDE overlay), shown on a logarithmic x-axis
_, ax_filtered = plt.subplots(figsize=(10, 6))
sns.histplot(duration_list_filtered, bins=20, kde=True, ax=ax_filtered)
ax_filtered.set_title('Distribution of the duration of decline events')
ax_filtered.set_xlabel('Duration (weeks)')
ax_filtered.set_ylabel('Count')
ax_filtered.set_xscale('log')
plt.show()

In [None]:
# Express the growth difference as a percentage of the rolling growth rate.
# NOTE: `df_with_rgr_final` is another name for the same object as `df_with_rgr_new`
# (no copy is made), so the new column is visible through both names.
df_with_rgr_final = df_with_rgr_new

# Rows where the rolling growth rate is NaN give NaN here.
# NOTE(review): a rolling growth rate of exactly 0 would give +/-inf — confirm whether that can occur.
growth_diff_ratio = df_with_rgr_final['growth_diff'].div(df_with_rgr_final['rolling_growth_rate'])
df_with_rgr_final['growth_diff_percentage'] = growth_diff_ratio.mul(100)

df_with_rgr_final.head()

In [14]:
# Turn the (channel, week) index back into columns, then group the rows per channel
# so that each channel's data (including growth_diff_percentage) can be fetched by id.
df_with_rgr_grouped_final = (
    df_with_rgr_final
    .reset_index()
    .groupby('channel')
)

In [None]:
# Intensity filter: among ALL closed decline events (whatever their duration),
# keep those during which the growth dropped far enough below the rolling growth rate.

# An event is kept when its lowest growth_diff_percentage is below this value (in percent)
DECLINE_MIN_GROWTH_DIFF_PERCENTAGE = - 80

# Channel id -> list of ((start_week, end_week), duration) events that are intense enough.
# Only channels with at least one such event get a key.
de_filtered_on_growth_diff = {}

for channel in tqdm(df_with_rgr_grouped_final.groups.keys(), desc="Processing channels"):
    channel_data = df_with_rgr_grouped_final.get_group(channel)
    for event in decline_events_with_duration.get(channel, []):
        start_week, end_week = event[0]
        # Rows of this channel between the first and the last week of the event (both included)
        in_event = channel_data['week'].between(start_week, end_week)
        min_growth_diff = channel_data.loc[in_event, 'growth_diff_percentage'].min()
        # A NaN minimum compares as False and is therefore discarded as well
        if not (min_growth_diff < DECLINE_MIN_GROWTH_DIFF_PERCENTAGE):
            continue
        de_filtered_on_growth_diff.setdefault(channel, []).append(event)

In [24]:
# Among the events selected on growth difference, drop those shorter than
# ANTI_OUTLIERS_MIN_DURATION to avoid outliers.
ANTI_OUTLIERS_MIN_DURATION = 8

# A channel whose intense events are all too short stays in the dict with an empty list
de_filtered_on_growth_diff_no_outliers = {}
for channel_id, events in de_filtered_on_growth_diff.items():
    de_filtered_on_growth_diff_no_outliers[channel_id] = [
        event for event in events if event[1] >= ANTI_OUTLIERS_MIN_DURATION
    ]

In [None]:
# Total number of intense events that are also long enough, all channels together
n_intense_events = sum(len(events) for events in de_filtered_on_growth_diff_no_outliers.values())
print(f'There are {n_intense_events} decline events (based on growth rate) with duration of at least {ANTI_OUTLIERS_MIN_DURATION} weeks.')

In [None]:
# Final selection of decline events: the events that are intense enough AND long
# enough (de_filtered_on_growth_diff_no_outliers).
# The duration-only events (de_filtered_on_duration) are not added: a merged dict
# used to be built here but was overwritten on the very next line, so it never
# had any effect and has been removed.
# Channels left without any event are dropped so that the channel count printed
# below only covers channels that really have a decline event, and the dict keeps
# its insertion order instead of going through a set (whose order is arbitrary).
decline_events_final = {
    channel_id: events
    for channel_id, events in de_filtered_on_growth_diff_no_outliers.items()
    if events
}

print(f'There are {sum([len(v) for v in decline_events_final.values()])} decline events in total.')

# Sort each channel's events by starting week (new lists are created)
decline_events_final_sorted = {k: sorted(v, key=lambda x: x[0][0]) for k, v in decline_events_final.items()}
print(f'Number of channels with decline events detected: {len(decline_events_final_sorted)}')

In [33]:
# Dump the final decline events to CSV, one row per event
with open('data/decline_events_complete.csv', 'w', newline='') as file:
    writer = csv.writer(file)

    # Header row
    writer.writerow(["Channel", "Event", "Duration"])

    # Each event is ((start_week, end_week), duration): the week pair goes into
    # the "Event" column as-is and the duration into "Duration".
    writer.writerows(
        [channel_id, week_span, duration]
        for channel_id, events in decline_events_final_sorted.items()
        for week_span, duration in events
    )

In [34]:
# Average the `subs` column over all rows of each channel and expose the result
# as a two-column frame: channel id, mean number of subscribers.
mean_subscribers = (
    df_with_rgr_new
    .reset_index()
    .groupby('channel')['subs']
    .mean()
    .reset_index()
    .set_axis(['Channel', 'Mean_Number_of_Subscribers'], axis='columns')
)

In [35]:
# Convert mean_subscribers DataFrame to a dictionary: channel id -> mean number of subscribers
mean_subscribers_dict = mean_subscribers.set_index('Channel')['Mean_Number_of_Subscribers'].to_dict()

# Filter the dictionary to keep only channels with mean subscribers > 1e6
filtered_mean_subscribers_dict = {k: v for k, v in mean_subscribers_dict.items() if v > 1e6}

# Keep the decline events of those big channels only; each list gets the channel's
# mean number of subscribers as an extra LAST element.
# `events + [...]` builds a new list per channel. The previous shallow `.copy()`
# followed by `.append()` wrote the mean into the lists of
# `decline_events_final_sorted` itself, and appended it again on every re-run.
decline_events_bb = {
    channel_id: events + [filtered_mean_subscribers_dict[channel_id]]
    for channel_id, events in decline_events_final_sorted.items()
    if channel_id in filtered_mean_subscribers_dict
}

In [None]:
# Check whether this channel is present in the dictionary: 'UC-lHJZR3Gqxm24_Vd_AJ5Yw'
channel = 'UC-lHJZR3Gqxm24_Vd_AJ5Yw'

# Each list in `decline_events_bb` ends with the channel's mean number of
# subscribers, which is not an event: count the event tuples only, otherwise
# the number reported is one too high. A missing channel counts as 0.
nb_events = sum(isinstance(entry, tuple) for entry in decline_events_bb.get(channel, []))
print(f'Number of decline events detected for channel {channel}: {nb_events}')

In [None]:
# How many of the original BB channels also appear in the new subset of decline events?
data = load_bb_timeseries_processed(verbose=True)

# Distinct channel ids found in the 'channel' level of the BB index
bb_channels = data.index.get_level_values('channel').unique()

nb_channels = sum(1 for bb_channel in bb_channels if bb_channel in decline_events_bb)
print(f'Number of channels with decline events detected: {nb_channels}')

In [38]:
# Write the decline events of the big channels to a CSV file, one row per event
with open('data/bb_from_declined_events.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    # Write header
    writer.writerow(["Channel", "Event", "Duration", "Mean_Number_of_Subscribers"])

    # Write rows
    for channel, events in decline_events_bb.items():
        # The last element of each list is the channel's mean number of subscribers.
        # It is kept in its own local name so that the `mean_subscribers` DataFrame
        # is not overwritten by a float (which broke re-running the cell that uses it).
        channel_mean_subs = events[-1]
        # Every other element is an ((start_week, end_week), duration) event
        for event, duration in events[:-1]:
            writer.writerow([channel, event, duration, channel_mean_subs])

## Plot for the datastory

In [None]:
lance_stewart = 'UC6-NBhOCP8DJqnpZE4TNE-A'

# Colours used in the figure
DECLINE_FILL = '#DFC5FE'
GROWTH_LINE = '#004AAD'
ROLLING_LINE = '#FF0000'

# --- Previous technique: project helper plot -------------------------------
print("With the past technique, every red area was a decline event.")
plot_rolling_growth_rate2(lance_stewart, df_with_rgr_final)

# --- New technique: only the final decline events are shaded ----------------
print("Now, we obtain less, more meaningful decline events.")

channel_data = df_with_rgr_grouped_final.get_group(lance_stewart)

fig = go.Figure()

# Shade the decline events first so that they sit below the curves.
# Entries that are not 2-tuples are skipped.
for event in decline_events_final_sorted.get(lance_stewart, []):
    if not (isinstance(event, tuple) and len(event) == 2):
        continue
    week_span = event[0]
    fig.add_vrect(
        x0=week_span[0],
        x1=week_span[1],
        fillcolor=DECLINE_FILL,
        opacity=0.5,
        line_width=0,
        layer='below',
        name='Decline event',
    )

# Data-less trace whose only purpose is to create a legend entry for the shaded bands
decline_legend_proxy = go.Scatter(
    x=[None],
    y=[None],
    mode='lines',
    line={'color': DECLINE_FILL, 'width': 10},
    name='Decline Event',
)
fig.add_trace(decline_legend_proxy)

# Weekly `delta_subs` curve (blue), shown in the legend as 'Growth diff'
growth_trace = go.Scatter(
    x=channel_data['week'],
    y=channel_data['delta_subs'],
    mode='lines',
    name='Growth diff',
    line={'color': GROWTH_LINE},
)
fig.add_trace(growth_trace)

# Rolling growth rate curve (red)
rolling_trace = go.Scatter(
    x=channel_data['week'],
    y=channel_data['rolling_growth_rate'],
    mode='lines',
    name='Rolling growth rate',
    line={'color': ROLLING_LINE},
)
fig.add_trace(rolling_trace)

# Axes: outside ticks, one x tick every 10 weeks from 0 to 250
week_ticks = np.arange(0, 260, 10)
fig.update_xaxes(
    ticks='outside',
    tickvals=week_ticks,
    ticktext=list(map(str, week_ticks)),
)
fig.update_yaxes(ticks='outside')

# Horizontal legend, centred just above the plotting area
legend_style = {
    'orientation': 'h',
    'yanchor': 'bottom',
    'y': 1.02,
    'xanchor': 'center',
    'x': 0.5,
    'bgcolor': 'rgba(255, 255, 255, 0.5)',
    'bordercolor': 'grey',
    'borderwidth': 1.5,
}

fig.update_layout(
    title="Growth diff and rolling growth rate for Lance Stewart's channel",
    title_x=0.5,  # centre the title
    xaxis_title='Week',
    yaxis_title='Subscribers',
    legend=legend_style,
    template='plotly_white',
    plot_bgcolor='white',
    paper_bgcolor='white',
    xaxis_showgrid=False,
    yaxis_showgrid=False,
    yaxis_zeroline=False,
    margin={'l': 50, 'r': 50, 't': 100, 'b': 50},
    xaxis_tickcolor='black',
    yaxis_tickcolor='black',
    autosize=False,
    width=800,
    height=600,
)

# Grey frame around the whole plotting area (coordinates are in paper space)
fig.add_shape(
    type="rect",
    x0=0, y0=0, x1=1, y1=1,
    xref='paper', yref='paper',
    line={'color': "grey", 'width': 2},
)

fig.show()

# Export the figure as a standalone HTML file
pio.write_html(fig, file="plot_lancet.html", auto_open=False)