In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
import statsmodels.api as sm
from tqdm.notebook import tqdm

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.model_selection import cross_validate

from src.data.dataloader_functions import *
from src.utils.results_utils import *
from src.utils.recovery_analysis_utils import *

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

# Make the code reproducible
SEED = 42

# Load the data

In [2]:
# Raw decline events; later cells work on a copy so this frame stays untouched
df_all_declines_original = pd.read_csv('data/decline_events_complete.csv')

# Channel -> category lookup table (tab-separated file, indexed by channel)
df_channels = pd.read_csv(
    'data/df_channels_en.tsv',
    sep='\t',
    index_col='channel',
    usecols=['channel', 'category_cc'],
)

# Processed data, restricted to the columns used in this notebook
df_data_processed = load_processed_data(usecols=['channel', 'week', 'subs', 'activity', 'views'])

# Finding the outcome

Using the duration of the decline, determine whether the YouTuber recovered or not.

In [None]:
df_all_declines = df_all_declines_original.copy()

# If the decline is longer than 4 months without recovery, we consider the YouTuber was not successful in handling it.
# Our aim is to find strategies that lead to quick recoveries, therefore taking more than 4 months would be considered unsuccessful.
RECOVERY_THRESHOLD = 4 * 4

# Add the decline outcome
df_all_declines['Recovered'] = df_all_declines['Duration'] < RECOVERY_THRESHOLD

# 'Event' holds the string form of the tuple (decline start, decline end): parse it into two integer columns
event_bounds = df_all_declines['Event'].apply(lambda s: [int(week_id) for week_id in s[1:-1].split(', ')])
df_all_declines['Start'] = event_bounds.apply(lambda e: e[0])
df_all_declines['End'] = event_bounds.apply(lambda e: e[1])
df_all_declines = df_all_declines.drop(columns='Event')

# Add the channel category with one label-based lookup instead of one chained .loc call per row
# (same pattern as the start-of-decline lookups below)
df_all_declines['Category'] = df_channels.loc[df_all_declines['Channel'].to_numpy(), 'category_cc'].to_numpy()

# Fetch the channel's values at the week the decline starts, in a single lookup
decline_index = list(zip(df_all_declines['Channel'], df_all_declines['Start']))
start_values = df_data_processed.loc[decline_index, ['subs', 'activity', 'views']]

# Add the channel's subs at the start of the decline
df_all_declines['Subs_start'] = start_values['subs'].to_numpy()

# Add the activity at the start of the decline
df_all_declines['Activity_start'] = start_values['activity'].to_numpy()

# Add the channel's views at the start of the decline
df_all_declines['Views_start'] = start_values['views'].to_numpy()

print(f"Overall recovery rate: {df_all_declines['Recovered'].mean():.2f}")

df_all_declines.head()

# How is the recovery distributed?

To get a first idea of what factors come into play when a YouTuber tries to recover from a decline, we plot some distributions.

In [None]:
plot_recovered_by_categories(df_all_declines)

In [None]:
plot_group_distributions(df_all_declines)

We can observe that some features are not balanced between the declines that recovered and those that did not, especially views and subscribers at the start of the decline.

In [None]:
print(df_all_declines['Recovered'].value_counts())
print(f"\nTotal number of declines: {len(df_all_declines)}")

# YouTuber reactions

Aiming at finding the best ways to deal with a decline depending on the situation, we take a look at how the YouTubers reacted to the decline, and what methods proved effective.\
In order to observe the reaction's impact, we will conduct a matched observational study on the dataset by using propensity score matching.

Considering the size of the dataset, we use random sampling to ease the matching's computation.\
To check that sampling does not mess with the distribution of recoveries, we plot them depending on the sampling proportion.

In [None]:
plot_sampling_rates(df_all_declines, SEED)

We choose to sample 30% of the data since it considerably reduces the size of the dataset, and allows to keep a representative sample of the data without perturbing the recovery distribution.

In [None]:
# Reproducible 30% subsample of the declines, drawn without replacement
df_sampled = df_all_declines.sample(frac=0.3, random_state=SEED, replace=False)

recovered_counts = df_sampled['Recovered'].value_counts()
print(recovered_counts)
print(f"\nTotal number of declines after sampling: {len(df_sampled)}")

### Adding the reaction metrics whose impact we want to measure

The reactions that we are able to observe here are the following:
- Did the YouTuber change video publication frequency?
- Did the YouTuber change video length?
- Did the YouTuber change video category?

In [None]:
# Load the videos around the declines, from week (decline start - decline duration) to week (decline end)
videos_around_declines = pd.read_csv('data/videos_around_declines.csv')

# Add the declines with the indices of the corresponding videos
df_sampled = get_sampled_declines_with_videos(df_sampled, videos_around_declines)

# Augment the data with the video stats : videos per week and mean video duration, before and during the declines
df_sampled = add_video_stats(df_sampled, videos_around_declines)

# Lower bound applied to the "before" value so the relative change stays defined when it is zero
DIV_BY_ZERO_TOLERANCE = 1e-6
# A reaction is flagged when the relative change exceeds 50%, in either direction.
# The same threshold is used for video duration and for publication frequency.
RELATIVE_CHANGE_THRESHOLD = 0.5

# Indicate whether the channel changed mean video duration after the start of the decline.
# Computed column-wise; NaN inputs give NaN ratios, which compare as False exactly like the row-wise version did.
df_sampled['Mean_duration_difference'] = df_sampled['Mean_duration_after'] - df_sampled['Mean_duration_before']
duration_relative_change = df_sampled['Mean_duration_difference'] / df_sampled['Mean_duration_before'].clip(lower=DIV_BY_ZERO_TOLERANCE)
df_sampled['Posted_longer_videos'] = duration_relative_change > RELATIVE_CHANGE_THRESHOLD
df_sampled['Posted_shorter_videos'] = duration_relative_change < -RELATIVE_CHANGE_THRESHOLD
print(f"\n{df_sampled['Posted_longer_videos'].mean() * 100:.2f}% of the channels posted longer videos after the start of the decline.")
print(f"{df_sampled['Posted_shorter_videos'].mean() * 100:.2f}% of the channels posted shorter videos after the start of the decline.\n")

# Indicate whether the channel changed publishing frequency after the start of the decline (same 50% tolerance).
df_sampled['Mean_frequency_difference'] = df_sampled['Videos_per_week_after'] - df_sampled['Videos_per_week_before']
frequency_relative_change = df_sampled['Mean_frequency_difference'] / df_sampled['Videos_per_week_before'].clip(lower=DIV_BY_ZERO_TOLERANCE)
df_sampled['Posted_more'] = frequency_relative_change > RELATIVE_CHANGE_THRESHOLD
df_sampled['Posted_less'] = frequency_relative_change < -RELATIVE_CHANGE_THRESHOLD
print(f"{df_sampled['Posted_more'].mean() * 100:.2f}% of the channels posted more videos after the start of the decline.")
print(f"{df_sampled['Posted_less'].mean() * 100:.2f}% of the channels posted less videos after the start of the decline.")

# Add whether the topic of the videos changed between before and after the start of the decline
topic_change_data = pd.read_csv('data/df_topic_change_20_15w.csv')
topic_change_data.columns = ['Decline', 'Topic_Change', 'Topic_Before', 'Topic_After']

# Match df_sampled's index against 'Decline' and KEEP that index on the result.
# pd.merge(..., left_index=True, right_on='Decline') returns topic_change_data's row labels as the index instead,
# which loses the decline ids that later index-based joins and .loc lookups depend on.
# Only 'Topic_Change' is added here; the topics themselves remain available in topic_change_data.
# NOTE(review): if get_matches cached matches under the previous index labels, they need to be recomputed.
df_sampled = df_sampled.join(topic_change_data.set_index('Decline')['Topic_Change'], how='left')
print(f"{df_sampled['Topic_Change'].mean() * 100:.2f}% of the channels changed of topic after the start of the decline.")


# Remove every decline that has a missing value in any column
# (usually due to having no video before or after the decline, or no category)
df_sampled = df_sampled.dropna()

# Set the raw before/after statistics aside: they are used later on, but must not enter the models
df_videos_per_week = df_sampled[['Videos_per_week_before', 'Videos_per_week_after']]
df_video_duration = df_sampled[['Mean_duration_before', 'Mean_duration_after']]
df_sampled = df_sampled.drop(
    columns=['Mean_duration_before', 'Mean_duration_after', 'Videos_per_week_before', 'Videos_per_week_after']
)

# Declines that have at least one video both before and after the start of the decline
has_videos_on_both_sides = (df_sampled['Videos_before'].map(len) > 0) & (df_sampled['Videos_after'].map(len) > 0)

# The video indices are not needed anymore: drop them, then keep the filtered copy
df_sampled = df_sampled.drop(columns=['Videos_before', 'Videos_after'])
df_sampled_without_zero_videos = df_sampled[has_videos_on_both_sides]

df_sampled.head()

### Propensity score matching

Since the features of the declines are not distributed identically, we perform propensity score matching to balance the treatment and control groups. We then look at the effect that changing publication frequency, video duration and video category after the start of the decline has on the recovery.

After the propensity score matching, we can observe the effect of the different treatments on the recovery :

In [None]:
# Treatments used for the matching; treatments listed in the same inner list share one figure
TREATMENTS = [
    ['Posted_more', 'Posted_less'],
    ['Posted_longer_videos', 'Posted_shorter_videos'],
    ['Topic_Change'],
]

# Columns removed before matching each treatment (to avoid multicollinearity or strong correlation).
# The nesting mirrors TREATMENTS: one list of columns per treatment.
to_drop = [
    [['Mean_frequency_difference', 'Posted_less'], ['Mean_frequency_difference', 'Posted_more']],
    [['Mean_duration_difference', 'Posted_shorter_videos'], ['Mean_duration_difference', 'Posted_longer_videos']],
    [[]],
]

matched_dfs = {}
recovery_rows = []  # one [strategy, % recovered if adopted, % recovered if not adopted] entry per treatment
for plot_treatments, plot_dropped in zip(TREATMENTS, to_drop):
    n_panels = len(plot_treatments)
    fig, axes = plt.subplots(1, n_panels, figsize=(5 * n_panels, 4))

    # plt.subplots returns a bare Axes for a single panel: wrap it so both cases are iterated the same way
    for ax, treatment, dropped in zip(np.atleast_1d(axes), plot_treatments, plot_dropped):

        # Try to load the matches from the file, otherwise compute them
        matches = get_matches(treatment=treatment, declines=df_sampled.drop(columns=dropped), verbose=False)

        print(f"{treatment} matches :", matches)

        # Flatten the matches into one list of decline labels and select those declines
        matched_labels = [index for match in matches for index in match]
        matched = df_sampled.loc[matched_labels]
        matched_dfs[treatment] = matched

        # Recovery rate (in %) of the matched declines, with and without the treatment
        recovery_rate = matched.groupby(treatment)['Recovered'].mean() * 100
        recovery_rows.append([treatment, recovery_rate[True], recovery_rate[False]])

        plot_treatment_effect(matched, treatment, ax=ax)

    plt.tight_layout()
    plt.show()

plot_df = pd.DataFrame(recovery_rows, columns=['Strategy', 'Adopted the strategy', 'Did not adopt'])
plot_df.to_csv('plot_data/matches_results.csv', index=False)

### 2. Decision tree

In [None]:
def perform_decision_tree(X, y):
    """Cross-validate a depth-3 decision tree, then fit it on the whole dataset.

    Args:
        X: feature matrix passed to scikit-learn.
        y: target labels.

    Returns:
        A tuple (tree, cv_results): the classifier fitted on all of X and y,
        and the dictionary returned by `cross_validate` (5 folds, accuracy scoring).
    """
    tree = DecisionTreeClassifier(random_state=SEED, max_depth=3)

    # Score the model with cross-validation before the final fit
    cv_results = cross_validate(tree, X, y, scoring='accuracy', cv=5)

    # Final model, trained on every sample
    tree.fit(X, y)

    return tree, cv_results

# Identifiers, the target itself, columns that define the target (Duration) and the raw differences are excluded
drop_cols = ['Channel', 'Start', 'End', 'Duration', 'Recovered', 'Mean_duration_difference', 'Mean_frequency_difference']

# One-hot encode the category, dropping the first level
tree_X = pd.get_dummies(df_sampled.drop(columns=drop_cols), columns=['Category'], drop_first=True)
tree_y = df_sampled['Recovered']

tree, cv_results = perform_decision_tree(tree_X, tree_y)

mean_cv_accuracy = cv_results['test_score'].mean()
print(f"Decision tree accuracy: {mean_cv_accuracy:.2f}")

plt.figure(figsize=(20, 10))
plot_tree(tree, feature_names=tree_X.columns, class_names=['Not recovered', 'Recovered'], filled=True)
plt.show()

# New Start 

In [None]:
df_sampled.head()

## Summary Statistics 

In [None]:
df_sampled.describe()

## Correlation Analysis

In [None]:
df_sampled.head()

In [None]:
# Boolean columns that are converted to 0/1 integers
BOOLEAN_COLUMNS = [
    'Recovered',
    'Posted_more',
    'Posted_less',
    'Posted_longer_videos',
    'Posted_shorter_videos',
    'Topic_Change',
]

# One-hot encode the category (first level dropped), then cast the boolean columns to integers
df_sampled_processed = pd.get_dummies(df_sampled, columns=['Category'], drop_first=True)
df_sampled_processed = df_sampled_processed.astype({column: int for column in BOOLEAN_COLUMNS})

# Correlation matrix of every column except the channel identifier
correlation_matrix = df_sampled_processed.drop(columns=['Channel']).corr()

# Correlation of each variable with the recovery, strongest positive first
print(correlation_matrix['Recovered'].sort_values(ascending=False))

**Summary**:

* **Positive Correlations** : Posting more videos, changes in upload frequency, and certain categories (e.g., Gaming, News & Politics) are slightly associated with recovery.

* **Negative Correlations** : Longer decline durations and posting fewer videos are more strongly associated with lower chances of recovery.

## Visual Analysis

In [None]:
# Pairplot to visualize relationships
sns.pairplot(df_sampled_processed, vars=['Recovered', 'Duration', 'Subs_start', 'Mean_duration_difference', 'Mean_frequency_difference', 'Topic_Change'])
plt.show()

In [None]:
# Heatmap of the correlation matrix
sns.heatmap(correlation_matrix, cmap='coolwarm')
plt.show()

**Thoughts :** This visual analysis confirms the correlation analysis.

## Feature importance analysis 

First, do the regression on all the declines that we kept until now.

As a sanity check, we run the same regression, removing the declines that have no videos either before, during or both to check that the presence of zeros does not have a significant impact on the results.

In [None]:
# Columns that are not included in the regression
drop_cols = ['Channel', 'Start', 'End', 'Duration', 'Recovered', 'Mean_duration_difference', 'Mean_frequency_difference', 'Category']

# Prepare the data for the logistic regression
logit_X = df_sampled.drop(columns=drop_cols)
logit_y = df_sampled['Recovered']

# Perform the regression
logit_result = perform_logistic_regression(logit_X, logit_y)

plt.figure(figsize=(6, 5), dpi=400)
plot_logit_coefficients(logit_result, title='Logistic regression Coefficients', filename='logit_coefficients.png')

# Save the coefficients, p-values and variable names to a file for plotting.
# The path is given to pandas directly (as for matches_results.csv): a text-mode handle opened without
# newline='' can produce doubled line endings on Windows.
res = pd.DataFrame({'coef': logit_result.params, 'p-value': logit_result.pvalues}).reset_index()
res.columns = ['Variable', 'Coefficient', 'p-value']
res.to_csv('plot_data/logit_results.csv', index=False)


**Key Insights:**

* **Posted_more**:  Posting more videos during the decline period significantly increases the chances of recovery.

* **Posted_shorter_videos** : Posting shorter videos has a marginally significant positive effect on recovery.

* **Subs_start** : The number of subscribers at the start of the decline has a marginally significant positive effect on recovery.

* **Posted_less** : Posting fewer videos significantly decreases the chances of recovery.

* **Category Impact**: Categories with positive coefficients (e.g., Gaming, News & Politics) are more likely to recover. Focus on content that fits these categories.


**Actionable Advice:**

* **Increase Video Uploads** : Consistently post more videos during the decline period to engage your audience and increase the chances of recovery.

* **Avoid Reducing Uploads** : Avoid posting fewer videos, as this significantly decreases the chances of recovery.

* **Consider Video Length** : Posting shorter videos may have a positive impact on recovery.

* **Focus on Content Categories** : If possible, avoid focusing solely on Education and Music categories, as these are associated with lower recovery rates.

* **Leverage Subscriber Base** : Engage with your existing subscribers to maximize their support during the decline period.

### Let's look at how we could answer to Youtuber's questions 

Here are some potential questions that a YouTube creator could ask us. Let's work out which ones we can answer:

1. How often should I post new videos?

2. Should I focus on shorter or longer videos?
**Answer**: This can depend on your audience's preferences. Analyze your video performance data to see if shorter or longer videos perform better. You can also experiment with different lengths to find the optimal duration for your content.

3. What type of content should I focus on?
**Answer**: Focus on content that has historically performed well on your channel. Additionally, consider creating content that fits into popular categories like Gaming or News & Politics, as these have shown higher recovery rates.

4. How can I maintain quality while increasing quantity?
**Answer**: Plan your content in advance and create a content calendar. Batch filming and editing can also help you maintain quality while increasing the number of videos you post.

5. Will posting more videos affect my channel's overall quality?
**Answer**: It's important to strike a balance between quantity and quality. Ensure that each video provides value to your audience. If necessary, consider outsourcing tasks like editing to maintain quality.

6. How can I keep my audience engaged with more frequent uploads?
**Answer**: Engage with your audience through comments, community posts, and live streams. Ask for their feedback and involve them in your content creation process to keep them interested.

7. What if I don't see immediate results from posting more videos?
**Answer**: Recovery can take time, so be patient and consistent. Monitor your analytics to track progress and make adjustments as needed. Consistency is key to building and maintaining audience engagement.

8. How can I come up with more video ideas?
**Answer**: Use tools like YouTube Analytics, Google Trends, and social media to identify trending topics and popular content in your niche. Engage with your audience to get ideas and feedback on what they want to see.

9. What other strategies can complement posting more videos?
**Answer**: In addition to posting more videos, focus on optimizing your video titles, descriptions, and tags for SEO. Collaborate with other creators, promote your videos on social media, and engage with your audience to boost visibility and engagement.

10. How do I measure the success of posting more videos?
**Answer**: Use YouTube Analytics to track key metrics such as views, watch time, subscriber growth, and engagement. Compare these metrics before and after increasing your upload frequency to measure the impact.



## Q1 : How often should I post new videos?
We saw that **uploading more videos** should help to maximize the chances of recovery. Hence we should look at how many videos we should advise him to post each week 

As `Posted_more` has a positive impact on recovery, we want to look into what situations benefit the most from posting videos more often.

We therefore look at the characteristics of decline which reacted that way, and when it worked best :

- Did channels that increased publication frequency already post often, or did they post few videos before the decline?
- Is posting more often associated with a lower average video duration?

In [None]:
def _reaction_label(declines, increased_col, decreased_col):
    """Collapse two boolean reaction flags into one label per decline.

    A third flag, 'No_change', is set where neither input flag is True, and the three
    flags are decoded with `pd.from_dummies`, so each label is one of the two column
    names or 'No_change'.
    """
    flags = declines[[increased_col, decreased_col]].copy()
    flags['No_change'] = ~flags[increased_col] & ~flags[decreased_col]
    return pd.from_dummies(flags)


# Build a new dataframe useful for reaction analysis: the selected columns of df_sampled
# next to the before/after statistics that were set aside earlier
kept_cols = ['Channel', 'Duration', 'Start', 'End', 'Posted_more', 'Posted_less', 'Posted_longer_videos', 'Posted_shorter_videos', 'Recovered', 'Mean_duration_difference', 'Mean_frequency_difference']
df_reactions = pd.concat([df_sampled[kept_cols], df_videos_per_week, df_video_duration], axis=1)

# Replace each pair of boolean flags by a single categorical reaction column
df_reactions['Frequency_reaction'] = _reaction_label(df_reactions, 'Posted_more', 'Posted_less')
df_reactions['Video_duration_reaction'] = _reaction_label(df_reactions, 'Posted_longer_videos', 'Posted_shorter_videos')
df_reactions = df_reactions.drop(columns=['Posted_more', 'Posted_less', 'Posted_longer_videos', 'Posted_shorter_videos'])

df_reactions

In [None]:
plot_distribution_by_frequency_reaction(df_reactions, 'Videos_per_week_before', 'Distribution of videos per week, before the decline')
plot_distribution_by_frequency_reaction(df_reactions, 'Videos_per_week_after', 'Distribution of videos per week, after the decline')
plt.show()

**Plot description :**

* In red are the channels that did not change the frequency of their publications during their decline; we plot the distribution of videos published before and after their decline.

* In light blue are the channels that decreased the frequency of their publications during their decline; we plot the distribution of videos published before and after their decline.

* In dark blue are the channels that increased the frequency of their publications during their decline; we plot the distribution of videos published before and after their decline.

* The red line represents the average number of videos published before/after the decline.

It is interesting to note that the channels that increased video frequency after the start of the decline used to post less than average before the decline, while the ones that reduced video frequency used to post approximately as much as the average. The two groups almost switch places in terms of video frequency.

These observations are encouraging: the decline of these channels may have been caused by posting fewer videos, and they could have recovered by posting more videos.

**Next :** 

It would be interesting to give an indicator of the **number of videos to post per week** in order to have more chance to recover

In [None]:
# Declines where the channel increased its publication frequency.
# An explicit copy is taken so that later cells can add or modify columns without
# triggering pandas' SettingWithCopyWarning on a filtered view.
df_reaction_posted_more = df_reactions[df_reactions['Frequency_reaction'] == 'Posted_more'].copy()

# Among those declines, the ones that recovered
posted_more_and_recovered = df_reaction_posted_more[df_reaction_posted_more['Recovered'] == 1]

stats_before = posted_more_and_recovered['Videos_per_week_before'].describe()
stats_after = posted_more_and_recovered['Videos_per_week_after'].describe()

print("Statistics for Videos per Week Before Decline:")
print(stats_before)

print("\nStatistics for Videos per Week After Decline:")
print(stats_after)

The mean is not very meaningful here because there are quite a few outliers, so we should preferably look at the median. Here we can observe that 0.41 videos per week were posted before the decline and 1.2 videos per week after it. This means that we could advise posting at least one video per week in order to increase the chance of recovering from the decline.

In [None]:
from scipy.stats import ttest_ind

# Split the "posted more" declines by outcome once
posted_more_recovered = df_reaction_posted_more[df_reaction_posted_more['Recovered'] == 1]
posted_more_not_recovered = df_reaction_posted_more[df_reaction_posted_more['Recovered'] == 0]

# Compare the videos per week before the decline between both outcomes (unequal variances)
t_stat_before, p_value_before = ttest_ind(
    posted_more_recovered['Videos_per_week_before'],
    posted_more_not_recovered['Videos_per_week_before'],
    equal_var=False,
)

# Same comparison for the videos per week after the start of the decline
t_stat_after, p_value_after = ttest_ind(
    posted_more_recovered['Videos_per_week_after'],
    posted_more_not_recovered['Videos_per_week_after'],
    equal_var=False,
)

print(f'T-test for Videos per Week Before Decline: t-statistic = {t_stat_before}, p-value = {p_value_before}')
print(f'T-test for Videos per Week After Decline: t-statistic = {t_stat_after}, p-value = {p_value_after}')

Both t-tests suggest that the number of videos per week (both before and after the decline) is significantly different between channels that recovered and those that did not. This implies that the frequency of video uploads may play a role in a channel's recovery from a decline.

In [None]:
import statsmodels.api as sm

# Predictors of the logistic regression, restricted to the declines where the channel posted more
predictors = ['Mean_frequency_difference', 'Mean_duration_before', 'Mean_duration_after']

# Design matrix with an intercept column, and the recovery outcome as 0/1 integers
X = sm.add_constant(df_reaction_posted_more[predictors])
y = df_reaction_posted_more['Recovered'].astype(int)

print(X.dtypes)

# Fit the model and show its summary table
logit_model = sm.Logit(y, X).fit()
print(logit_model.summary())

**Summary:**
* **Intercept (const)**: `Statistically significant` (p-value = 0.022), indicating a baseline log-odds of recovery.
* **Mean_frequency_difference**: `Statistically significant` (p-value = 0.016), suggesting that the mean frequency difference has a significant positive effect on the recovery rate.
* **Mean_duration_before**: `Statistically significant` (p-value = 0.020), suggesting that the mean duration of videos before the decline has a significant positive effect on the recovery rate.
* **Mean_duration_after**: `Not statistically significant` (p-value = 0.703), suggesting that the mean duration of videos after the decline does not have a significant effect on the recovery rate.

The only variable that might be interesting to look into is the mean frequency difference.

In [None]:
# Analyse relationship between mean frequency difference and recovery
correlation = df_sampled_processed['Mean_frequency_difference'].corr(df_sampled_processed['Recovered'])
print(f'Correlation between Mean Frequency Difference and Recovery: {correlation}')

# Plot the relationship between mean frequency difference and recovery.
# The plotted variable is the change in videos per week (after - before), not the upload frequency itself,
# so the title and axis label say so.
sns.scatterplot(x='Mean_frequency_difference', y='Recovered', data=df_sampled_processed)
plt.title('Change in Upload Frequency vs. Recovery')
plt.xlabel('Change in upload frequency (videos per week)')
plt.ylabel('Recovery')
plt.show()

In [None]:
# TODO: remove if not relevant
# Density of the change in videos per week (after - before), split by recovery status.
# The labels name the change explicitly, since that is the variable being plotted.
plt.figure(figsize=(8, 6))
sns.kdeplot(df_sampled[df_sampled['Recovered'] == True]['Mean_frequency_difference'], label='Recovered', fill=True, alpha=0.5)
sns.kdeplot(df_sampled[df_sampled['Recovered'] == False]['Mean_frequency_difference'], label='Not Recovered', fill=True, alpha=0.5)
plt.xlim(-20, 20)
plt.title('Distribution of the Change in Video Frequency by Recovery Status')
plt.xlabel('Change in upload frequency (videos per week)')
plt.ylabel('Density')
plt.legend()
plt.show()

In order to advise a creator of the number of videos to upload, we should look at the probability to recover of each different frequency of upload

In [None]:
# Bins over the weekly upload frequency. The last bin is open-ended so that '>5' really covers every
# frequency above 5, and include_lowest keeps channels that posted nothing (frequency 0) in the first bin.
bins = [0, 0.5, 1, 2, 3, 4, 5, np.inf]
labels = ['<0.5', '0.5-1', '1-2', '2-3', '3-4', '4-5', '>5']

df = df_sampled_processed.copy()

# Use the upload frequency itself rather than 'Mean_frequency_difference': the difference is negative for
# every channel that slowed down, so binning it from 0 upwards dropped those declines and did not measure
# "videos per week". The frequency was set aside in df_videos_per_week, which shares df_sampled's index.
# NOTE(review): the frequency after the start of the decline is used, as the advice targets that period — confirm.
df['Videos_per_week_after'] = df_videos_per_week['Videos_per_week_after']

# Bin the upload frequencies
df['Frequency_bin'] = pd.cut(df['Videos_per_week_after'], bins=bins, labels=labels, include_lowest=True)

# Calculate the average recovery rate for each bin (every bin is kept, even if empty)
recovery_by_frequency = df.groupby('Frequency_bin', observed=False)['Recovered'].mean().reset_index()

# Plot the recovery rates by upload frequency
sns.barplot(x='Frequency_bin', y='Recovered', data=recovery_by_frequency)
plt.title('Average Recovery Rate by Upload Frequency')
plt.xlabel('Upload Frequency (videos per week)')
plt.ylabel('Average Recovery Rate')
# Full range of a rate, so that no bar is clipped
plt.ylim(0, 1)
plt.show()

In [None]:
# Average number of videos per week posted after the start of the decline.
# The mean of 'Mean_frequency_difference' is an average *change*, not an upload frequency, so it cannot be
# compared against the "videos per week" thresholds below.
current_upload_frequency = df_videos_per_week['Videos_per_week_after'].mean()

# Example recommendation based on the barplot (a NaN frequency falls through to the last branch)
if current_upload_frequency < 0.5:
    recommendation = "You should increase your upload frequency to at least 1 video per week to improve your chances of recovery."
elif current_upload_frequency < 1:
    recommendation = "You are currently uploading less than 1 video per week. Aim to increase your upload frequency to 1-2 videos per week."
elif current_upload_frequency < 2:
    recommendation = "You are in the optimal range of 1-2 videos per week. Maintain this frequency to maximize your chances of recovery."
elif current_upload_frequency < 3:
    recommendation = "You might want to upload less frequently to 1-2 videos per week to improve your chances of recovery or to upload more frequently to 3-4 videos per week"
else:
    recommendation = "You are uploading more than 3 videos per week. Ensure that you maintain the quality of your content while keeping up with this frequency."

print(recommendation)

## Q2 : Should I focus on shorter or longer videos? 

Now I know that I should upload more frequently, but what about the duration of my videos 

From the regression, we observed that changing the duration of a video doesn't change much. But let's still be a bit curious and see if we can extract something out of it 

Here we want to visualize if changing the duration of the videos uploaded change the rate of recovery. We observe that if there is no change in the duration of the videos uploaded then the creator has 50% chance of recovery. If the creator posts shorter videos, he has 50% chance of recovery as well and if he posts longer videos then he has 47% chance of recovery. 

So by just visualizing the data, we may want to conclude that during a decline changing the duration of the videos uploaded doesn't change the recovery. 

In [None]:
# Plot the relationship between the video duration reaction and recovery, among the channels that posted more.
# The 0/1 version of 'Recovered' is built on a new frame instead of being assigned into
# df_reaction_posted_more, which is a filtered selection of df_reactions.
posted_more_for_plot = df_reaction_posted_more.assign(Recovered=df_reaction_posted_more['Recovered'].astype(int))
sns.barplot(data=posted_more_for_plot, x='Video_duration_reaction', y='Recovered', errorbar=None)
plt.title('Recovery vs Video Duration Reaction')
plt.xlabel('Video Duration Reaction')
plt.ylabel('Recovery Rate')
plt.ylim(0.4, 0.55)
plt.show()

In [29]:
# Rename 'Mean_duration_difference' to 'Mean_video_duration' for the cells below.
# NOTE(review): despite its new name, this column still holds the after - before difference in mean duration.
df = df.rename(columns={'Mean_duration_difference': 'Mean_video_duration'})

In [None]:
# Calculate the correlation.
# 'Mean_video_duration' is the renamed 'Mean_duration_difference' column, i.e. the change in mean video
# duration, so the message and the labels describe a change rather than a duration.
correlation = df['Mean_video_duration'].corr(df['Recovered'])
print(f'Correlation between change in mean video duration and recovery: {correlation}')

# Plot the relationship
# NOTE(review): the unit is left out of the label; other cells treat durations as seconds — confirm.
sns.scatterplot(x='Mean_video_duration', y='Recovered', data=df)
plt.title('Change in Mean Video Duration vs. Recovery')
plt.xlabel('Change in Mean Video Duration')
plt.ylabel('Recovery')
plt.show()

In [None]:
# TODO: remove if not relevant
# Density of the change in mean video duration, split by recovery status.
# 'Mean_video_duration' is the renamed 'Mean_duration_difference' column, hence the "change" wording.
plt.figure(figsize=(8, 6))
sns.kdeplot(df[df['Recovered'] == True]['Mean_video_duration'], label='Recovered', fill=True, alpha=0.5)
sns.kdeplot(df[df['Recovered'] == False]['Mean_video_duration'], label='Not Recovered', fill=True, alpha=0.5)
plt.xlim(-2000, 2000)
plt.title('Distribution of the Change in Video Duration by Recovery Status')
plt.xlabel('Change in mean video duration')
plt.ylabel('Density')
plt.legend()
plt.show()

In [None]:
df.head()

In [None]:
# Define duration bins: edges are minutes * 60, matching the minute labels.
# The last bin is open-ended so that '>60' covers every longer video.
# NOTE(review): this assumes the durations are stored in seconds — confirm.
duration_bins = [0*60, 5*60, 10*60, 15*60, 20*60, 30*60, 60*60, np.inf]
duration_labels = ['<5', '5-10', '10-15', '15-20', '20-30', '30-60', '>60']

# Bin the mean video duration itself. 'Mean_video_duration' is the renamed duration *difference*, which is
# negative whenever videos got shorter, so binning it from 0 upwards dropped those declines and did not
# measure a video length. The durations were set aside in df_video_duration, which shares df_sampled's index.
# NOTE(review): the duration after the start of the decline is used — confirm this is the intended period.
df['Mean_duration_after'] = df_video_duration['Mean_duration_after']
df['Duration_bin'] = pd.cut(df['Mean_duration_after'], bins=duration_bins, labels=duration_labels, include_lowest=True)

# Calculate the average recovery rate for each bin (every bin is kept, even if empty)
recovery_by_duration = df.groupby('Duration_bin', observed=False)['Recovered'].mean().reset_index()

# Plot the recovery rates by video duration
sns.barplot(x='Duration_bin', y='Recovered', data=recovery_by_duration)
plt.title('Average Recovery Rate by Video Duration')
plt.xlabel('Mean Video Duration (minutes)')
plt.ylabel('Average Recovery Rate')
# Full range of a rate, so that no bar is clipped
plt.ylim(0, 1)
plt.show()

## Q3 : What type of content should i focus on ? 

In [None]:
df_sampled.head()

In [None]:
df = df_sampled.copy()

# Average recovery rate of each content category, best category first
recovery_by_category = (
    df.groupby('Category')['Recovered']
    .mean()
    .reset_index()
    .sort_values(by='Recovered', ascending=False)
)

# Display the recovery rates by category
print(recovery_by_category)


In [None]:
# Horizontal bar chart of the recovery rate of each content category
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=recovery_by_category, x='Recovered', y='Category', palette='viridis', ax=ax)
ax.set_title('Average Recovery Rate by Content Category')
ax.set_xlabel('Average Recovery Rate')
ax.set_ylabel('Content Category')
plt.show()

In [None]:
# Example recommendation based on the barplot: categories are split at a recovery rate of 0.45
above_threshold = recovery_by_category['Recovered'] > 0.45
at_or_below_threshold = recovery_by_category['Recovered'] <= 0.45
high_recovery_categories = recovery_by_category.loc[above_threshold, 'Category'].tolist()
low_recovery_categories = recovery_by_category.loc[at_or_below_threshold, 'Category'].tolist()

high_list = ', '.join(high_recovery_categories)
low_list = ', '.join(low_recovery_categories)

recommendation = (
    f"To maximize your chances of recovery, focus on creating content in the following categories: {high_list}. "
    f"These categories have shown higher recovery rates. "
    f"Consider avoiding or minimizing content in the following categories: {low_list}, "
    f"as they have shown lower recovery rates."
)

print(recommendation)

## Q? : Should I change the topic of my videos? 

### How does a change of topic influence the recovery?

In [None]:
# Attach the topic information to each decline and discard the declines with missing values.
# NOTE(review): this join is only correct if df_reactions.index holds the decline ids found in 'Decline'.
# Verify that, because merging with left_index/right_on returns the right frame's row labels as the index.
df_reactions_topics = df_reactions.merge(
    topic_change_data,
    how='left',
    left_index=True,
    right_on='Decline',
).dropna()

# Recovery rate with and without a change of topic
change_vs_unchanged = df_reactions_topics.groupby('Topic_Change')['Recovered'].mean().reset_index()

fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=change_vs_unchanged, x='Topic_Change', y='Recovered', palette='viridis', ax=ax)
ax.set_title('Impact of Topic Change on Recovery Rate')
ax.set_xlabel('Topic Changed')
ax.set_ylabel('Recovery Rate')
ax.set_xticks([0, 1])
ax.set_xticklabels(['No', 'Yes'])
# The y axis is zoomed on a narrow band, which visually exaggerates the difference between the bars
ax.set_ylim(0.4, 0.45)
plt.show()

print(change_vs_unchanged.head())

Very small variation, not statistically significant \
BUT maybe different topic transitions have different correlations with the recovery

### Analysis of different topic changes

In [None]:
# Keep only the declines where the topic changed, without the join key and the (now constant) flag
topic_changed = df_reactions_topics['Topic_Change'] == True
df_reactions_changed_topic = (
    df_reactions_topics.loc[topic_changed]
    .drop(columns=['Decline', 'Topic_Change'])
    .dropna()
)
df_reactions_changed_topic.head()

In [None]:
# Recovery rate and number of declines for every (topic before, topic after) pair
recovered_by_transition = df_reactions_changed_topic.groupby(['Topic_Before', 'Topic_After'])['Recovered']
topic_transitions = recovered_by_transition.agg(recovery_rate='mean', count='size').reset_index()

# Keep only the transitions observed more than 10 times
has_enough_cases = topic_transitions['count'] > 10
topic_transitions = topic_transitions[has_enough_cases]
topic_transitions.head()

In [None]:
# Matrix of recovery rates: one row per topic before, one column per topic after
pivot_data = topic_transitions.pivot(index='Topic_Before', columns='Topic_After', values='recovery_rate')

# Heatmap of the recovery rates by topic transitions
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    pivot_data,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    cbar_kws={'label': 'Recovery Rate'},
    ax=ax,
)
ax.set_title('Recovery Rate by Topic Transition')
ax.set_xlabel('Topic After')
ax.set_ylabel('Topic Before')
plt.show()

In [None]:
# Transitions ordered from the highest to the lowest recovery rate
sorted_transitions = topic_transitions.sort_values(by='recovery_rate', ascending=False)

# One "before -> after" label per transition, used as the bar names
transition_labels = sorted_transitions.apply(
    lambda row: f"{row['Topic_Before']} -> {row['Topic_After']}",
    axis=1,
)

# Horizontal bar plot of the recovery rate of each transition
plt.figure(figsize=(12, 8))
sns.barplot(data=sorted_transitions, x='recovery_rate', y=transition_labels, palette='viridis')
plt.title('Recovery Rate by Topic Transition')
plt.xlabel('Recovery Rate')
plt.ylabel('Topic Transition')
plt.tight_layout()
plt.show()

In [None]:
import plotly.express as px

# Interactive version of the bar chart, with the number of cases in each bar name and in the hover box.
# The keys of `labels` must be column names: the x values come from the 'recovery_rate' column, so that
# is the key to relabel (a key named 'x' matches no column). The unnamed label Series passed as y is
# referred to as 'y'.
fig = px.bar(
    sorted_transitions,
    x='recovery_rate',
    y=sorted_transitions.apply(lambda row: f"{row['Topic_Before']} -> {row['Topic_After']} (n={row['count']})", axis=1),
    orientation='h',
    title='Recovery Rate by Topic Transition',
    labels={'recovery_rate': 'Recovery Rate', 'y': 'Topic Transition'},
    hover_data=['count']
)
fig.update_layout(height=1000) 
fig.show()
