In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import Normalize
from matplotlib.cm import ScalarMappable

import seaborn as sns
import statsmodels.api as sm
from tqdm.notebook import tqdm
import json
from scipy.stats import ttest_ind

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.model_selection import cross_validate

from src.data.dataloader_functions import *
from src.utils.results_utils import *
from src.utils.recovery_analysis_utils import *
from src.utils.plots_utils import *

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

# Fixed random seed, reused wherever randomness is involved below
# (passed to plot_sampling_rates and to DataFrame.sample) so reruns are reproducible
SEED = 42

# Load the data

In [2]:
# Decline events, read as-is from CSV
df_all_declines_original = pd.read_csv('data/decline_events_complete.csv')
# Channel metadata: only the 'category_cc' column is kept, indexed by the 'channel' column
df_channels = pd.read_csv('data/df_channels_en.tsv', sep='\t', usecols=['channel', 'category_cc'], index_col='channel')
# NOTE(review): load_processed_data is a project helper; presumably one row per (channel, week)
# restricted to the listed columns — confirm the source file and schema in the helper
df_data_processed = load_processed_data(usecols=['channel', 'week', 'subs', 'activity', 'views'])

In [None]:
# Preview the first rows of the raw decline events
df_all_declines_original.head()

# Finding the outcome

Using the duration of the decline, determine whether the YouTuber recovered or not.

In [None]:
# Build the working decline table from the raw events, the channel metadata and the processed data.
# NOTE(review): add_declines_to_db is a project helper; its result must carry a 'Recovered'
# column (read on the next line) — confirm the exact recovery rule in the helper.
df_all_declines = add_declines_to_db(df_all_declines_original, df_channels, df_data_processed)
# Mean of 'Recovered', printed with two decimals
print(f"Overall recovery rate: {df_all_declines['Recovered'].mean():.2f}")
df_all_declines.head()

## How is the recovery distributed?

To get a first idea of what factors come into play when a YouTuber tries to recover from a decline, we plot some distributions.

In [None]:
# NOTE(review): project plotting helper; presumably shows recovery split by channel category — confirm
plot_recovered_by_categories(df_all_declines)

In [None]:
# NOTE(review): project plotting helper; presumably compares feature distributions of
# recovered vs non-recovered declines — confirm
plot_group_distributions(df_all_declines)

We can observe that some features are not balanced between the declines that recovered and those that did not, especially views and subscribers at the start of the decline.

In [None]:
# Number of declines per value of 'Recovered', followed by the total number of rows
print(df_all_declines['Recovered'].value_counts())
print(f"\nTotal number of declines: {len(df_all_declines)}")

# YouTuber reactions

Aiming at finding the best ways to deal with a decline depending on the situation, we take a look at how the YouTubers reacted to the decline, and what methods proved effective.\
In order to observe the reaction's impact, we will conduct a matched observational study on the dataset by using propensity score matching.

Considering the size of the dataset, we use random sampling to ease the matching's computation.\
To check that sampling does not mess with the distribution of recoveries, we plot them depending on the sampling proportion.

In [None]:
# NOTE(review): project plotting helper; presumably plots the recovery distribution for
# several sampling fractions, seeded with SEED — confirm
plot_sampling_rates(df_all_declines, SEED)

We choose to sample 30% of the data since it considerably reduces the size of the dataset while keeping a representative sample that does not perturb the recovery distribution.

In [None]:
# Draw a reproducible 30% random subsample of the declines, without replacement
SAMPLING_FRACTION = 0.3
df_sampled = df_all_declines.sample(
    frac=SAMPLING_FRACTION,
    replace=False,
    random_state=SEED,
)

# Report the outcome counts and the size of the subsample
recovered_counts_sampled = df_sampled['Recovered'].value_counts()
n_sampled_declines = len(df_sampled)
print(recovered_counts_sampled)
print(f"\nTotal number of declines after sampling: {n_sampled_declines}")

# Persist the subsample so the plots can be rebuilt from it
df_sampled.to_csv('plot_data/decline_events.csv', index=False)

### Adding the reaction metrics whose impact we want to measure

The reactions that we are able to observe here are the following:
- Did the YouTuber change video publication frequency?
- Did the YouTuber change video length?
- Did the YouTuber change video category?

In [None]:
# Build the reaction features (duration change, frequency change, topic change) on the sampled declines.
# NOTE(review): this cell reassigns df_sampled several times and finally drops the columns it
# reads at the top, so it cannot be re-run on its own — re-run from the sampling cell instead.

# Load the videos around the declines, from week (decline start - decline duration) to week (decline end)
videos_around_declines = pd.read_csv('data/videos_around_declines.csv')

# Add the declines with the indices of the corresponding videos
# (the 'Videos_before' / 'Videos_after' columns dropped at the end of this cell presumably come from here — confirm)
df_sampled = get_sampled_declines_with_videos(df_sampled, videos_around_declines)

# Augment the data with the video stats : videos per week and mean video duration, before and during the declines
df_sampled = add_video_stats(df_sampled, videos_around_declines)

# Passed to add_change_columns below; presumably guards the relative-change division
# when the "before" value is zero — confirm in the helper
DIV_BY_ZERO_TOLERANCE = 1e-6


# Calculate mean duration differences
# NOTE(review): the difference is presumably (after - before), and threshold=0.5 presumably a
# 50% relative change needed to flag a reaction — confirm in calculate_difference / add_change_columns
df_sampled = calculate_difference(df_sampled, 'Mean_duration_after', 'Mean_duration_before', 'Mean_duration_difference')
df_sampled = add_change_columns(df_sampled, 'Mean_duration_difference', 'Mean_duration_before', 'Posted_longer_videos', 'Posted_shorter_videos', DIV_BY_ZERO_TOLERANCE, threshold=0.5)
print_stats(df_sampled, 'Posted_longer_videos', 'Posted_shorter_videos', 'mean video duration')

# Calculate mean frequency differences (same helpers and threshold as for the duration)
df_sampled = calculate_difference(df_sampled, 'Videos_per_week_after', 'Videos_per_week_before', 'Mean_frequency_difference')
df_sampled = add_change_columns(df_sampled, 'Mean_frequency_difference', 'Videos_per_week_before', 'Posted_more', 'Posted_less', DIV_BY_ZERO_TOLERANCE, threshold=0.5)
print_stats(df_sampled, 'Posted_more', 'Posted_less', 'publishing frequency')

# Merge and analyze topic changes
df_sampled = merge_and_report_topic_changes(df_sampled, 'data/df_topic_change_20_15w.csv')
df_sampled = df_sampled.dropna()  # Drop the declines (rows) that have a missing value in any column

# Save the dataframes for later and drop unnecessary columns
# (both frames keep df_sampled's index, so they can be re-aligned with it later)
df_videos_per_week = df_sampled[['Videos_per_week_before', 'Videos_per_week_after']]
df_video_duration = df_sampled[['Mean_duration_before', 'Mean_duration_after']]
df_sampled = df_sampled.drop(columns=['Mean_duration_before', 'Mean_duration_after', 'Videos_per_week_before', 'Videos_per_week_after'])

# Drop videos indices
df_sampled = df_sampled.drop(columns=['Videos_before', 'Videos_after'])


In [None]:
# Histogram (for the datastory) of the mean video duration at the start of the decline,
# converted to minutes, on a log x-axis and split by recovery outcome
fig, ax = plt.subplots(figsize=(7, 3))

sns.histplot(
    data=df_video_duration / 60,
    x="Mean_duration_before",
    hue=df_sampled['Recovered'],
    log_scale=True,
    element="step",
    palette=[RED, GREEN],
    ax=ax,
)
ax.set_title('Distribution of channels by mean video duration \nat the start of the decline')
ax.set_xlabel('Mean video duration (minutes)')
ax.set_ylabel('Number of channels')

fig.savefig('plot_data/mean_video_duration_start_decline.png', bbox_inches='tight')
plt.show()

# Our analysis

In [None]:
# Preview the sampled declines with the reaction columns added above
df_sampled.head()

## Summary Statistics 

In [None]:
# Summary statistics of the columns of the sampled declines
df_sampled.describe()

## Correlation Analysis

In [None]:
# Reminder of the available columns before selecting the ones to correlate
# NOTE(review): same output as the preview under "Our analysis"
df_sampled.head()

In [None]:
# Outcome and reaction flags, cast to integers so they can be correlated
correlation_columns = [
    'Recovered',
    'Posted_more',
    'Posted_less',
    'Posted_longer_videos',
    'Posted_shorter_videos',
    'Topic_change',
]
df_correlation = df_sampled[correlation_columns].astype(int)

# Pairwise correlations; show how each reaction correlates with recovery, strongest first
correlation_matrix = df_correlation.corr()
recovery_correlations = correlation_matrix['Recovered'].sort_values(ascending=False)
print(recovery_correlations)

# Keep the full matrix on disk for the plots
correlation_matrix.to_csv('plot_data/correlation_matrix.csv')

**Summary**:

* **Positive Correlations** : Posting more videos (higher upload frequency) is slightly associated with recovery.

* **Negative Correlations** : Posting fewer videos is more strongly associated with lower chances of recovery.

* **No Correlations** : The other factors show almost no correlation with recovery.

## Visual Analysis

In [None]:
# Heatmap of the correlation matrix.
# The colour scale is pinned to the symmetric range [-1, 1] so that the diverging
# 'coolwarm' palette is centred on zero: red always means a positive correlation and
# blue a negative one. With an auto-scaled range the midpoint colour would sit between
# the smallest coefficient and the diagonal's 1.0, and colour would not encode the sign.
fig, ax = plt.subplots()
sns.heatmap(
    correlation_matrix,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    annot=True,   # print the coefficients: most of them are close to zero
    fmt='.2f',
    ax=ax,
)
ax.set_title('Correlation between reactions and recovery')
plt.show()

**Thoughts :** This visual analysis confirms the correlation analysis.

## Feature importance analysis 

First, do the regression on all the declines that we kept until now.

As a sanity check, we run the same regression, removing the declines that have no videos either before, during or both to check that the presence of zeros does not have a significant impact on the results.

In [None]:
# Columns that are not included in the regression: the outcome itself, plus the
# descriptive columns and raw differences that are not meant to be regressors here
drop_cols = ['Channel', 'Start', 'End', 'Duration', 'Recovered', 'Mean_duration_difference', 'Mean_frequency_difference', 'Category', 'Subs_start', 'Views_start', 'Activity_start']

# Prepare the data for the logistic regression: every remaining column is a regressor
logit_X = df_sampled.drop(columns=drop_cols)
logit_y = df_sampled['Recovered']

# Perform the regression
# NOTE(review): perform_logistic_regression is a project helper; its result is used below
# through `.params` and `.pvalues`, like a statsmodels results object
logit_result = perform_logistic_regression(logit_X, logit_y)

plt.figure(figsize=(6, 5), dpi=400)
plot_logit_coefficients(logit_result, title='Logistic regression Coefficients')

# Save the coefficients, p-values and variable names to a file for plotting.
# The path is handed to pandas directly: writing through a text handle opened without
# newline='' produces blank lines between rows on Windows.
res = pd.DataFrame({'coef': logit_result.params, 'p-value': logit_result.pvalues}).reset_index()
res.columns = ['Variable', 'Coefficient', 'p-value']
res.to_csv('plot_data/logit_results.csv', index=False)

**Key Insights:**

* **Posted_more**:  Posting more videos during the decline period significantly increases the chances of recovery.

* **Posted_shorter_videos** : Posting shorter videos has a marginally significant positive effect on recovery.

* **Posted_less** : Posting fewer videos significantly decreases the chances of recovery.

**Actionable Advice:**

* **Increase Video Uploads** : Consistently post more videos during the decline period to engage your audience and increase the chances of recovery.

* **Avoid Reducing Uploads** : Avoid posting fewer videos, as this significantly decreases the chances of recovery.

* **Consider Video Length** : Posting shorter videos may have a positive impact on recovery.

### Propensity score matching

Since the declines do not have the same feature distributions, we perform propensity score matching to balance the treatment and control groups, and then look at the effect that changing publication frequency, video duration and video category after the start of the decline has on the recovery.

After the propensity score matching, we can observe the effect of the different treatments on the recovery :

In [None]:
# Treatments to match on; treatments listed in the same inner list share one figure
TREATMENTS = [
    ['Posted_more', 'Posted_less'],
    ['Posted_longer_videos', 'Posted_shorter_videos'],
    ['Topic_change'],
]

# Columns removed before matching on each treatment (to avoid multicollinearity or
# strong correlation); laid out exactly like TREATMENTS
to_drop = [
    [
        ['Mean_frequency_difference', 'Posted_less'],
        ['Mean_frequency_difference', 'Posted_more'],
    ],
    [
        ['Mean_duration_difference', 'Posted_shorter_videos'],
        ['Mean_duration_difference', 'Posted_longer_videos'],
    ],
    [[]],
]

plot_df = pd.DataFrame(columns=['Strategy', 'Adopted the strategy', 'Did not adopt'])
matched_dfs = {}

for treatment_group, dropped_group in zip(TREATMENTS, to_drop):
    n_panels = len(treatment_group)
    # squeeze=False always returns a 2-D array of axes, even for a single panel
    fig, axes = plt.subplots(1, n_panels, figsize=(5 * n_panels, 4), squeeze=False)

    for panel_ax, treatment, dropped in zip(axes[0], treatment_group, dropped_group):
        # Try to load the matches from the file, otherwise compute them
        covariates = df_sampled.drop(columns=dropped)
        matches = get_matches(treatment=treatment, declines=covariates, verbose=False)

        print(f"{treatment} matches :", matches)

        # Flatten the matches into a single list of decline indices
        matched_indices = [decline_index for pair in matches for decline_index in pair]

        # Matched declines, with all the original columns
        matched = df_sampled.loc[matched_indices]
        matched_dfs[treatment] = matched

        # Recovery rate (in %) with and without the treatment
        counts = matched.groupby(treatment)['Recovered'].mean() * 100
        plot_df.loc[len(plot_df)] = [treatment, counts[True], counts[False]]

        plot_treatment_effect(matched, treatment, ax=panel_ax)

    plt.tight_layout()
    plt.show()

plot_df.to_csv('plot_data/matches_results.csv', index=False)

### T-tests

In [None]:
# Welch t-test (unequal variances) on the recovery outcome, adopters against
# non-adopters, run separately on the matched sample of each treatment
alpha = 0.05
t_test_results = []

for treatment, matched in matched_dfs.items():
    # Split the matched sample by treatment status; the outcome is cast to float for the test
    adopted_mask = matched[treatment] == True
    not_adopted_mask = matched[treatment] == False
    group_adopted = matched.loc[adopted_mask, 'Recovered'].astype(float)
    group_not_adopted = matched.loc[not_adopted_mask, 'Recovered'].astype(float)

    t_stat, p_value = ttest_ind(group_adopted, group_not_adopted, equal_var=False)

    # One report row per treatment
    t_test_results.append({
        "Treatment": treatment,
        "Mean Adopted": group_adopted.mean(),
        "Mean Not Adopted": group_not_adopted.mean(),
        "T-Statistic": t_stat,
        "P-Value": p_value,
        "Significant": p_value < alpha,
    })

# Display results as a DataFrame
t_test_results_df = pd.DataFrame(t_test_results)
print(t_test_results_df)

# Let's look at how we could answer YouTubers' questions

## Q1 : How often should I post new videos?
We saw that **uploading more videos** should help to maximize the chances of recovery. Hence we should look at how many videos we should advise him to post each week 

As `Posted_more` has a positive impact on recovery, we want to look into what situations benefit the most from posting videos more often.

We therefore look at the characteristics of the declines where the YouTuber reacted that way, and when it worked best:

- Did channels who increased publication frequency already post often, or did they post few videos before the decline?
- Is posting more often associated with a lower average video duration?

In [None]:
# Build the reaction dataframe
# NOTE(review): build_reaction_dataframe is a project helper; later cells read the columns
# 'Frequency_reaction', 'Recovered', 'Videos_per_week_before/after', 'Mean_frequency_difference'
# and 'Mean_duration_before/after' from its result
df_reactions = build_reaction_dataframe(df_sampled, df_videos_per_week, df_video_duration)
df_reactions


In [None]:
# Distribution of the weekly number of videos, once for the "before" column and once for the
# "after" column; the grouping by frequency reaction is done inside the project helper
plot_distribution_by_frequency_reaction(df_reactions, 'Videos_per_week_before', 'Distribution of videos per week, before the decline')
plot_distribution_by_frequency_reaction(df_reactions, 'Videos_per_week_after', 'Distribution of videos per week, after the decline')
plt.show()

**Plot description :**

* In red are the channels that did not change the frequency of their publications during their decline; we plot the distribution of videos published before and after their decline.

* In light blue are the channels that decreased the frequency of their publications during their decline; we plot the distribution of videos published before and after their decline.

* In dark blue are the channels that increased the frequency of their publications during their decline; we plot the distribution of videos published before and after their decline.

* The red line represents the average number of videos published before/after the decline.

It is interesting to note that the channels that increased video frequency after the start of the decline used to post less than average before the decline, while the ones that reduced video frequency used to post approximately as much as the average. The two groups almost switch places in terms of video frequency.

These observations are encouraging: the decline of these channels may have been caused by posting fewer videos, in which case posting more videos could explain their recovery.

**Next :** 

It would be interesting to give an indication of the **number of videos to post per week** in order to have a better chance of recovering.

In [None]:
# Declines where the channel reacted by posting more often
posted_more_mask = df_reactions['Frequency_reaction'] == 'Posted_more'
df_reaction_posted_more = df_reactions[posted_more_mask]

# Among those, keep the declines that recovered and summarise their weekly uploads
recovered_after_posting_more = df_reaction_posted_more[df_reaction_posted_more['Recovered'] == 1]
stats_before = recovered_after_posting_more['Videos_per_week_before'].describe()
stats_after = recovered_after_posting_more['Videos_per_week_after'].describe()

print("Statistics for Videos per Week Before Decline:", stats_before, sep="\n")
print("\nStatistics for Videos per Week After Decline:", stats_after, sep="\n")

The mean is not very meaningful here because there are quite a few outliers, so we should preferably look at the median. Among the channels that posted more and recovered, the median was 0.41 videos per week before the decline and 1.2 videos per week after it started. This suggests that we could advise posting at least one video per week in order to increase the chance of recovering from the decline.

In [None]:
# Within the "posted more" declines, compare recovered and non-recovered channels
# with Welch t-tests on the weekly number of videos
recovered_mask = df_reaction_posted_more['Recovered'] == 1
not_recovered_mask = df_reaction_posted_more['Recovered'] == 0

# Videos per week before the decline
t_stat_before, p_value_before = ttest_ind(
    df_reaction_posted_more.loc[recovered_mask, 'Videos_per_week_before'],
    df_reaction_posted_more.loc[not_recovered_mask, 'Videos_per_week_before'],
    equal_var=False,
)

# Videos per week after the start of the decline
t_stat_after, p_value_after = ttest_ind(
    df_reaction_posted_more.loc[recovered_mask, 'Videos_per_week_after'],
    df_reaction_posted_more.loc[not_recovered_mask, 'Videos_per_week_after'],
    equal_var=False,
)

print(f'T-test for Videos per Week Before Decline: t-statistic = {t_stat_before}, p-value = {p_value_before}')
print(f'T-test for Videos per Week After Decline: t-statistic = {t_stat_after}, p-value = {p_value_after}')

Both t-tests suggest that the number of videos per week (both before and after the decline) is significantly different between channels that recovered and those that did not. This implies that the frequency of video uploads may play a role in a channel's recovery from a decline.

In [None]:
# Logistic regression of recovery on the frequency change and the mean video
# durations, restricted to the declines where the channel posted more
logit_features = ['Mean_frequency_difference', 'Mean_duration_before', 'Mean_duration_after']

# Integer target, and a design matrix with an explicit constant column
y = df_reaction_posted_more['Recovered'].astype(int)
X = sm.add_constant(df_reaction_posted_more[logit_features])

print(X.dtypes)

# Fit the model and report it
logit_model = sm.Logit(y, X).fit()
print(logit_model.summary())

**Summary:**
* **Intercept (const)**: `Statistically significant` (p-value = 0.022), indicating a baseline log-odds of recovery.
* **Mean_frequency_difference**: `Statistically significant` (p-value = 0.016), suggesting that the mean frequency difference has a significant positive effect on the recovery rate.
* **Mean_duration_before**: `Statistically significant` (p-value = 0.020), suggesting that the mean duration of videos before the decline has a significant positive effect on the recovery rate.
* **Mean_duration_after**: `Not statistically significant` (p-value = 0.703), suggesting that the mean duration of videos after the decline does not have a significant effect on the recovery rate.

The only variable that might be interesting to look into is the mean frequency difference.

In [None]:
# Analyse relationship between mean frequency difference and recovery
# (Series.corr default method, computed on the whole sample rather than on the "posted more" subset)
correlation = df_sampled['Mean_frequency_difference'].corr(df_sampled['Recovered'])
print(f'Correlation between Mean Frequency Difference and Recovery: {correlation}')

In order to advise a creator of the number of videos to upload, we should look at the probability to recover of each different frequency of upload

#### Propensity Score Matching on each frequency bin

In [None]:
# Upload-frequency bins, in videos per week during the decline.
# The outer edges are open so that the labels are truthful: '<0.5' includes declines with
# no upload at all (0, via include_lowest=True) and '>5' has no upper bound. With a last
# edge of 10 and the default open left edge, those declines fell in no bin at all.
bins = [0, 0.5, 1, 2, 3, 4, 5, np.inf]
labels = ['<0.5', '0.5-1', '1-2', '2-3', '3-4', '4-5', '>5']

# The frequency-derived columns are removed so that they are not used as matching covariates
df = df_sampled.drop(columns=['Mean_frequency_difference', 'Posted_more', 'Posted_less'])
# Assignment aligns on the index shared by df_videos_per_week and df_sampled
df['Frequency_bin'] = pd.cut(
    df_videos_per_week['Videos_per_week_after'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)

# Kept under its own name so that the per-treatment `matched_dfs` used by the t-test
# cell is not overwritten (that cell would otherwise fail when re-run afterwards)
matched_dfs_by_frequency_bin = {}
recovery_records = []

for bin_label in labels:
    # Treatment indicator for this bin, added on a copy so `df` is not mutated in the loop
    bin_declines = df.assign(Is_in_bin=(df['Frequency_bin'] == bin_label))
    share_in_bin = bin_declines['Is_in_bin'].mean() * 100
    print(f'Processing bin: {bin_label} ({share_in_bin:.2f}% of declines)')

    # Perform PSM for the treatment of interest (the bin label itself is not a covariate)
    # NOTE(review): if get_matches caches its result per treatment name, every bin would
    # reuse the matches of the first one, since the treatment is always 'Is_in_bin' — confirm
    matches = get_matches(
        treatment='Is_in_bin',
        declines=bin_declines.drop(columns=['Frequency_bin']),
        verbose=False,
    )

    matched_indices_flat = [index for match in matches for index in match]
    matched_df = bin_declines.loc[matched_indices_flat]
    matched_dfs_by_frequency_bin[bin_label] = matched_df

    # Recovery rate (in %) of the matched declines that are in the bin
    recovery_rate = matched_df.groupby('Is_in_bin')['Recovered'].mean() * 100
    if True in recovery_rate.index:
        bin_recovery_rate = recovery_rate[True]
    else:
        # No matched decline in this bin: the rate is unknown, not 0%
        bin_recovery_rate = np.nan
    recovery_records.append({'Frequency_bin': bin_label, 'Recovery_rate': bin_recovery_rate})

# Build the result frame once instead of growing it row by row
plot_df = pd.DataFrame(recovery_records, columns=['Frequency_bin', 'Recovery_rate'])
print(plot_df)

In [None]:
# Recovery rate (in %) of the matched declines for each upload-frequency bin
fig, ax = plt.subplots()
sns.barplot(data=plot_df, x='Frequency_bin', y='Recovery_rate', errorbar=None, ax=ax)
ax.set_title('Recovery by Upload Frequency')
ax.set_xlabel('Upload Frequency (per week)')  # label typo fixed (was 'Upoad')
ax.set_ylabel('Recovery Rate (%)')
ax.set_ylim(0, 100)
plt.show()

We can see on this bar chart which upload frequencies have the best average recovery rates.

## Q2 : Should I focus on shorter or longer videos? 

Now I know that I should upload more frequently, but what about the duration of my videos 

From the regression, we observed that changing the duration of a video doesn't change much. But let's still be a bit curious and see if we can extract something out of it 

Here we want to visualize whether changing the duration of the uploaded videos changes the rate of recovery. We observe that if there is no change in the duration of the videos uploaded, then the creator has a 50% chance of recovery. If the creator posts shorter videos, he has a 50% chance of recovery as well, and if he posts longer videos, then he has a 47% chance of recovery.

So by just visualizing the data, we may want to conclude that during a decline changing the duration of the videos uploaded doesn't change the recovery. 

In [None]:
# Correlation between the mean video duration during the decline and recovery.
# The inputs are taken explicitly from df_video_duration and df_sampled (which share the
# same index) instead of from `df`, a scratch frame left over from the frequency-bin cell
# in which no visible cell creates a 'Mean_video_duration' column.
# NOTE(review): 'Mean_duration_after' is assumed to be the intended duration measure, as it
# is the one binned in the duration analysis below — confirm
mean_duration_after = df_video_duration['Mean_duration_after']
recovered = df_sampled['Recovered'].astype(float)
correlation = mean_duration_after.corr(recovered)
print(f'Correlation between mean video duration and recovery: {correlation}')

#### Propensity Score Matching on each duration bin

In [None]:
# Mean-video-duration bins: edges in seconds, labels in minutes.
# The last edge is open so that '>60' really means "longer than 60 minutes"; with an edge
# at 120 minutes, longer mean durations fell in no bin and were silently dropped.
duration_bins = [0*60, 5*60, 10*60, 15*60, 20*60, 30*60, 60*60, np.inf]
duration_labels = ['<5', '5-10', '10-15', '15-20', '20-30', '30-60', '>60']

df = df_sampled.copy()
df = df.dropna()

# Kept under its own name so that the per-treatment `matched_dfs` used by the t-test
# cell is not overwritten
matched_dfs_by_duration_bin = {}

# Bin the video durations (assignment aligns on the index shared with df_sampled)
df['Duration_bin'] = pd.cut(df_video_duration['Mean_duration_after'], bins=duration_bins, labels=duration_labels)

bin_counts = df['Duration_bin'].value_counts()
print("Number of samples in each bin:")
print(bin_counts)

# Declines whose mean duration falls in no bin (e.g. a mean duration of 0) are excluded once
# here; the original did the same through a dropna() inside the loop
df = df.dropna()

# Infinite covariates would break the matching. The check does not depend on the bin, so
# it is computed once, and it covers -inf as well as +inf
covariates_have_inf = np.isinf(
    df.drop(columns=['Duration_bin']).select_dtypes(include='number')
).any().any()

recovery_records = []

for bin_label in duration_labels:
    print(f'Processing bin: {bin_label}')

    if covariates_have_inf:
        print(f"Data contains NaN or Infinite values for {bin_label}.")
        continue  # Skip this bin if data is problematic

    is_in_bin = df['Duration_bin'] == bin_label
    if is_in_bin.nunique() < 2:
        # Empty bin (or a bin holding every decline): there is no treated/control pair to
        # match, and the constant-column filter below would remove the treatment column
        print(f'No treated/control split for bin {bin_label}, skipping the matching.')
        recovery_records.append({'Duration_bin': bin_label, 'Recovery_rate': np.nan})
        continue

    # Treatment indicator for this bin, added on a copy so `df` is not mutated in the loop
    bin_declines = df.assign(Is_in_bin_duration=is_in_bin)

    # Matching input: the bin label is not a covariate, and constant columns carry no information
    df_dropped = bin_declines.drop(columns=['Duration_bin'])
    df_dropped = df_dropped.loc[:, df_dropped.nunique() > 1]

    # Perform PSM for the treatment of interest
    # NOTE(review): if get_matches caches its result per treatment name, every bin would
    # reuse the matches of the first one — confirm
    matches = get_matches(treatment='Is_in_bin_duration', declines=df_dropped, verbose=False)

    matched_indices_flat = [index for match in matches for index in match]
    matched_df = bin_declines.loc[matched_indices_flat]
    matched_dfs_by_duration_bin[bin_label] = matched_df

    # Recovery rate (in %) of the matched declines that are in the bin
    recovery_rate = matched_df.groupby('Is_in_bin_duration')['Recovered'].mean() * 100
    if True in recovery_rate.index:
        bin_recovery_rate = recovery_rate[True]
    else:
        # No matched decline in this bin: the rate is unknown, not 0%
        bin_recovery_rate = np.nan
    recovery_records.append({'Duration_bin': bin_label, 'Recovery_rate': bin_recovery_rate})

# Build the result frame once instead of growing it row by row
plot_df = pd.DataFrame(recovery_records, columns=['Duration_bin', 'Recovery_rate'])
print(plot_df)

In [None]:
# Recovery rate (in %) of the matched declines for each mean-duration bin;
# the y-axis is restricted to the 35-50% range
fig, ax = plt.subplots()
sns.barplot(data=plot_df, x='Duration_bin', y='Recovery_rate', errorbar=None, ax=ax)
ax.set(
    title='Recovery by Video Duration',
    xlabel='Mean Video Duration (minutes)',
    ylabel='Recovery Rate (%)',
    ylim=(35, 50),
)
plt.show()

## Q3 : What type of content should I focus on?

In [None]:
df = df_sampled.copy()

# Mean recovery per content category, as a two-column frame sorted from the
# highest to the lowest recovery rate
recovery_by_category = (
    df.groupby('Category')['Recovered']
    .mean()
    .reset_index()
    .sort_values(by='Recovered', ascending=False)
)

# Display the recovery rates by category
print(recovery_by_category)


In [None]:
# Horizontal bar plot of the per-category recovery rates computed above
# NOTE(review): argument meaning (title, data, value column, label column) is inferred from
# the call — confirm in plot_horizontal_barplot
plot_horizontal_barplot('Content Category', recovery_by_category, 'Recovered', 'Category')

This is naive and does not lead to any conclusions regarding what reactions are best. 

We will therefore explore topic changes

In [None]:
# Example recommendation based on the barplot: categories are split at a fixed
# recovery rate of 0.45
recovery_threshold = 0.45
is_high_recovery = recovery_by_category['Recovered'] > recovery_threshold
is_low_recovery = recovery_by_category['Recovered'] <= recovery_threshold

high_recovery_categories = recovery_by_category.loc[is_high_recovery, 'Category'].tolist()
low_recovery_categories = recovery_by_category.loc[is_low_recovery, 'Category'].tolist()

# Same sentence as before, assembled from adjacent string literals
recommendation = (
    "To maximize your chances of recovery, focus on creating content in the following categories: "
    f"{', '.join(high_recovery_categories)}. "
    "These categories have shown higher recovery rates. "
    "Consider avoiding or minimizing content in the following categories: "
    f"{', '.join(low_recovery_categories)}, "
    "as they have shown lower recovery rates."
)

print(recommendation)

## Q4 : Should I change the topic of my videos?

### How does a change of topic influence the recovery?

In [39]:
# Topic-change table, with its columns renamed positionally
topic_change_columns = ['Decline', 'Topic_change', 'Topic_before', 'Topic_after']
topic_change_data = pd.read_csv('data/df_topic_change_20_15w.csv')
topic_change_data.columns = topic_change_columns

# Attach the topic information to each reaction row (reaction index <-> 'Decline'),
# then keep only the rows without any missing value
df_reactions_topics = (
    df_reactions
    .merge(topic_change_data, how='left', left_index=True, right_on='Decline')
    .dropna()
)

Very small variation, not statistically significant \
BUT maybe different topic transitions have different correlations with the recovery

### Analysis of different topic changes

In [None]:
# Keep the declines where the topic changed, without the merge key and the
# (now constant) change flag, and without rows that have missing values
topic_changed_mask = df_reactions_topics['Topic_change'] == True
df_reactions_changed_topic = (
    df_reactions_topics[topic_changed_mask]
    .drop(columns=['Decline', 'Topic_change'])
    .dropna()
)
df_reactions_changed_topic.head()

### Use the LLM-generated topic themes

In [None]:
# NOTE(review): project helper; presumably replaces the topics with the themes stored in
# data/LLM_topics.json — confirm. The frame is reassigned in place of its input, so this
# cell should not be re-run without re-running the previous one.
df_reactions_changed_topic = map_topics_to_llm_themes('data/LLM_topics.json', df_reactions_changed_topic)
df_reactions_changed_topic.head()

In [None]:
# NOTE(review): project helper; presumably aggregates the (topic before -> topic after)
# transitions and filters them — confirm the filtering rule in filter_topic_transitions
topic_transitions = filter_topic_transitions(df_reactions_changed_topic)
topic_transitions.head()

In [None]:
# Bar plot of the topic transitions (project helper, presumably built with Plotly given its name)
plot_barplot_topics_plotly(topic_transitions)

In [None]:
# Sankey diagram of the topic transitions (project helper)
sankey_diagram(topic_transitions)

In [None]:
# Stand-alone colour bar for the recovery rate.
# The names are chosen so that nothing defined earlier is overwritten: the original bound
# the mappable to `sm`, shadowing `statsmodels.api as sm` (any later re-run of a cell
# calling sm.add_constant / sm.Logit would fail), and unpacked the range into `x, y`,
# overwriting the regression target `y`.

# Range of the colour bar
# NOTE(review): hard-coded; presumably the min/max recovery rates of the topic-transition
# plots above — TODO confirm and derive them from the data
recovery_rate_min, recovery_rate_max = 0.31, 0.57

# Map the range linearly onto the coolwarm palette
norm = Normalize(vmin=recovery_rate_min, vmax=recovery_rate_max)
cmap = cm.coolwarm

# A ScalarMappable without data is enough to draw a colour bar on its own
recovery_rate_mappable = ScalarMappable(cmap=cmap, norm=norm)
recovery_rate_mappable.set_array([])

# Narrow figure whose only axes hosts the vertical colour bar
plt.figure(figsize=(0.5, 6))
cbar = plt.colorbar(recovery_rate_mappable, cax=plt.gca(), orientation='vertical')
cbar.set_label('Recovery Rate', fontsize=12)
cbar.ax.tick_params(labelsize=10)

plt.show()
