In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from src.data.dataloader_functions import *
from src.utils.results_utils import *
from src.utils.recovery_analysis_utils import match_declines

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

# Plot colours: GREEN is used for recovered declines, RED for declines without recovery.
GREEN = '#2ca02c'
RED = '#d62728'

# Load the data

In [None]:
# Table of decline events, read as-is from CSV.
df_all_declines_original = pd.read_csv('data/decline_events_complete_-80pc.csv')

# Channel metadata: only the category is needed, indexed by channel for direct lookup.
channel_columns = ['channel', 'category_cc']
df_channels = pd.read_csv(
    'data/df_channels_en.tsv',
    sep='\t',
    usecols=channel_columns,
    index_col='channel',
)

# Processed per-channel, per-week statistics, restricted to the columns used in this notebook.
processed_columns = ['channel', 'week', 'subs', 'delta_videos', 'activity', 'views']
df_data_processed = load_processed_data(usecols=processed_columns)

# Finding the outcome

Using the duration of the decline, determine whether the YouTuber recovered or not.

In [None]:
# Work on a copy so the raw table stays untouched and this cell can be re-run safely.
df_all_declines = df_all_declines_original.copy()

# A decline whose duration reaches RECOVERY_THRESHOLD is counted as "not recovered": the aim is
# to find strategies that lead to quick recoveries, so slow recoveries count as unsuccessful.
# NOTE(review): 4 * 4 = 16, i.e. about four months if Duration is expressed in weeks, whereas
# the previous comment spoke of "3 months" — confirm which threshold is intended.
RECOVERY_THRESHOLD = 4 * 4

# Add the decline outcome (strict comparison: a duration equal to the threshold is not recovered)
df_all_declines['Recovered'] = df_all_declines['Duration'] < RECOVERY_THRESHOLD

# Split the (decline start, decline end) pair into two separate integer columns.
# 'Event' arrives as text such as "(12, 30)" — presumably because the tuple was serialised
# when the CSV was written — so strip the surrounding brackets and split on ', '.
event_bounds = (
    df_all_declines['Event']
    .str[1:-1]
    .str.split(', ', expand=True)[[0, 1]]
    .astype('int64')
)
df_all_declines['Start'] = event_bounds[0]
df_all_declines['End'] = event_bounds[1]
df_all_declines = df_all_declines.drop(columns='Event')

# Add the channel category with a vectorised lookup instead of one .loc call per row.
# Unknown channels still raise a KeyError, as the per-row lookup did.
unknown_channels = ~df_all_declines['Channel'].isin(df_channels.index)
if unknown_channels.any():
    examples = df_all_declines.loc[unknown_channels, 'Channel'].unique()[:5].tolist()
    raise KeyError(f"Channels missing from df_channels (first few): {examples}")
df_all_declines['Category'] = df_all_declines['Channel'].map(df_channels['category_cc'])

# Look up the weekly statistics at the start of each decline once, for all columns.
# Assumes df_data_processed is indexed by (channel, week) — TODO confirm in load_processed_data.
decline_index = list(zip(df_all_declines['Channel'], df_all_declines['Start']))
start_stats = df_data_processed.loc[decline_index, ['subs', 'activity', 'delta_videos', 'views']]

# Add the channel's subs at the start of the decline
df_all_declines['Subs_start'] = start_stats['subs'].values

# Add the activity at the start of the decline
df_all_declines['Activity_start'] = start_stats['activity'].values

# Add the delta videos at the start of the decline
df_all_declines['Delta_videos'] = start_stats['delta_videos'].values

# Add the channel's views at the start of the decline
df_all_declines['Views_start'] = start_stats['views'].values

df_all_declines.head()

# How is the recovery distributed?

To get a first idea of what factors come into play when a YouTuber tries to recover from a decline, we make the dataset balanced by using a matched observational study.

In [None]:
fig, (ax_counts, ax) = plt.subplots(1, 2, figsize=(13, 5))

# --- Left panel: number of declines per outcome ---
# value_counts() orders by frequency, so the order depends on which outcome is more common.
# Pin it to (False, True) so that bar positions, the 'No'/'Yes' tick labels, the colours and
# the count annotations always agree.
recovery_counts = (
    df_all_declines['Recovered']
    .value_counts(normalize=False)
    .reindex([False, True], fill_value=0)
)

sns.barplot(
    x=recovery_counts.index,
    y=recovery_counts.values,
    hue=recovery_counts.index,
    palette=[RED, GREEN],
    legend=False,
    ax=ax_counts,
)
ax_counts.set_title('Recovery after a decline')
ax_counts.set_xticks([0, 1])
ax_counts.set_xticklabels(['No', 'Yes'])
ax_counts.set_yticks([100000, 200000, 300000])
ax_counts.set_yticklabels(['100k', '200k', '300k'])
ax_counts.set_xlabel('Managed to recover from the decline')
ax_counts.set_ylabel('Count')
# Leave 10% headroom so the annotations above the bars are not clipped
ax_counts.set_ylim(0, max(recovery_counts) * 1.1)

# Annotate each bar with its count
for position, count in enumerate(recovery_counts):
    ax_counts.text(position, count, str(count), ha='center', va='bottom')

# --- Right panel: share of each outcome per category (stacked to 100%) ---
counts = (
    df_all_declines.groupby('Category')['Recovered']
    .value_counts(normalize=True)
    .unstack()
    .reindex(columns=[False, True], fill_value=0)
    .fillna(0)
    * 100
)

# Overall percentage of declines that did NOT recover. The red (not recovered) share is
# stacked at the bottom, so this line marks the average boundary between red and green.
mean = (1 - df_all_declines['Recovered'].mean()) * 100
# Draw the line before the bars: the legend labels below rely on this artist order.
ax.axhline(mean, color='black', linestyle='--', linewidth=1)
counts.plot(kind='bar', stacked=True, color=[RED, GREEN], ax=ax, legend=False)
ax.set_title('Proportion of successful recoveries by category')
ax.set_xlabel('Category')
ax.set_ylim(0, 100)
ax.set_ylabel('Percentage')
ax.set_yticks([0, 20, 40, 60, 80, 100])
ax.set_yticklabels(['0%', '20%', '40%', '60%', '80%', '100%'])

# Show the value of the mean line on a secondary axis on the right
ax_right = ax.twinx()
ax_right.set_ylim(0, 100)
ax_right.set_yticks([mean])
ax_right.set_yticklabels([f'{mean:.2f}%'])
ax.legend(['Mean over all declines', 'Not recovered', 'Recovered'], loc='lower center')

plt.show()

In [None]:
# One histogram per feature measured at the start of the decline, split by outcome.
# Each entry is (column, title, x-axis label); panels fill the 2x2 grid in row-major order.
hist_panels = [
    (
        "Subs_start",
        'Distribution of channels by subscribers\nat the start of the decline',
        'Subscribers at the start of the decline',
    ),
    (
        "Views_start",
        'Distribution of channels by total number of\nviews at the start of the decline',
        'Views at the start of the decline',
    ),
    (
        "Activity_start",
        'Distribution of channels by activity\nat the start of the decline',
        'Activity at the start of the decline',
    ),
    (
        "Delta_videos",
        'Distribution of channels by delta videos\nat the start of the decline',
        'Delta videos at the start of the decline',
    ),
]

plt.figure(figsize=(14, 8))

for panel_idx, (feature, panel_title, panel_xlabel) in enumerate(hist_panels, start=1):
    ax = plt.subplot(2, 2, panel_idx)

    # Log-scaled x axis; RED/GREEN follow the (False, True) order of 'Recovered'
    sns.histplot(
        data=df_all_declines,
        x=feature,
        hue="Recovered",
        log_scale=True,
        element="step",
        palette=[RED, GREEN],
        ax=ax,
    )

    plt.title(panel_title)
    plt.xlabel(panel_xlabel)
    plt.ylabel('Number of channels')

plt.tight_layout()
plt.show()

Since we can observe that some features are not balanced between the treatment and control groups, especially views and subscribers at the start of the decline, we will perform matching between the two groups.

In [None]:
# Class balance of the outcome, followed by the overall number of declines
outcome_counts = df_all_declines['Recovered'].value_counts()
print(outcome_counts)
print(f"\nTotal number of declines: {len(df_all_declines)}")

Considering the size of the dataset, we use random sampling to ease the matching's computation.

To check that sampling does not distort the distribution of recoveries, we plot the proportions of recovered and non-recovered declines as a function of the sampling proportion.

In [None]:
# Make the code reproducible: DataFrame.sample() is called without random_state below, so it
# draws from NumPy's global random state, which is seeded here.
SEED = 42
np.random.seed(SEED)

# Sample the data at different sampling rates (1%, 2%, ..., 100%).
# The proportions are rounded to 2 decimals so that the dictionary keys are exactly the floats
# written as 0.01, 0.02, ..., 1.0; a lookup such as new_dfs[0.3] then cannot miss because of
# floating-point error in the linspace values.
sample_proportions = np.round(np.linspace(0.01, 1, 100), 2)
new_dfs = {}
for prop in sample_proportions:
    new_dfs[prop] = df_all_declines.sample(frac=prop, replace=False)

# Plot the recovery rates against the sampling proportion
recovered_props = [new_dfs[prop]['Recovered'].mean() for prop in sample_proportions]
unrecovered_props = [1 - recovered for recovered in recovered_props]
plt.figure(figsize=(6, 2))
plt.plot(sample_proportions, recovered_props, label='Recovered', color=GREEN)
plt.plot(sample_proportions, unrecovered_props, label='Not recovered', color=RED)
plt.xlabel('Sample proportion')
plt.ylabel('Proportion of declines')
plt.legend()
plt.show()

We choose to sample 30% of the data: the distribution of recoveries is left mostly unchanged, and it lets us keep a representative sample of the data.

In [None]:
# Keep the 30% sample drawn in the sampling sweep above
SAMPLE_PROPORTION = 0.3
df_sampled = new_dfs[SAMPLE_PROPORTION]

# Outcome balance and size of the retained sample
sampled_outcome_counts = df_sampled['Recovered'].value_counts()
print(sampled_outcome_counts)
print(f"\nTotal number of declines after sampling: {len(df_sampled)}")

### Propensity score matching

In [None]:
# Match the sampled declines using the project helper imported from
# src.utils.recovery_analysis_utils. NOTE(review): presumably this pairs recovered with
# non-recovered declines on their propensity score, as the section title suggests — confirm
# against the helper's implementation.
df_matched = match_declines(df_sampled)

The number of subscribers, as well as the channel's category, do not seem to have an impact on the outcome.

# YouTuber reactions

As we want to find the best ways to deal with a decline depending on the situation, we then take a look at how the YouTubers reacted to the decline, and what methods proved effective.