# Characterizing Patronage on YouTube

#### Library imports

In [None]:
import os 
import io
import pandas as pd
import json
import re
import zstandard
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.ticker import FuncFormatter
import numpy as np
import seaborn as sns
import gzip
from tqdm import tqdm
import timeit
import ast
import math
import datetime
import ruptures as rpt
from statsmodels.tsa.stattools import grangercausalitytests
import pickle

#### Global variables definitions

In [None]:
# data folder paths
# Both values end with "/" because later cells build file paths by plain string concatenation.
# DATA_FOLDER: absolute path of the directory holding the raw datasets (e.g. df_timeseries_en.tsv.gz)
DATA_FOLDER = "/dlabdata1/youtube_large/"
# LOCAL_DATA_FOLDER: relative path of the directory holding the pre-filtered intermediate files
LOCAL_DATA_FOLDER = "local_data/"

In [None]:
# list all files in DATA_FOLDER
# !ls -lh {DATA_FOLDER}

In [None]:
# list all files in LOCAL_DATA_FOLDER
# (IPython shell escape: {LOCAL_DATA_FOLDER} is interpolated from the Python variable)
!ls -lh {LOCAL_DATA_FOLDER}

## 1. Exploratory Data Analysis (EDA)

• Exploratory Data Analysis can be found in the [eda.ipynb](./eda.ipynb) notebook

## 2. Match data

Files used in this section

**YouNiverse dataset:**

- (`df_channels_en.tsv.gz`: channel metadata.)
- `df_timeseries_en.tsv.gz`: channel-level time-series.
- `yt_metadata_en.jsonl.gz`: raw video metadata.

**Graphtreon dataset:**
- `final_processed_file.jsonl.gz`: all Graphtreon time-series.

### 2.1. Filter YouTube metadata containing patreon id
_Extract Patreon urls from YouTube metadata description (if they exist) and keep only those rows_

YT_metadata_filter_results_040422.jpg _(filter script in script/scripts.ipynb)_
<div>
    <img src="img/YT_metadata_filter_results_040422.jpg" alt="YT_metadata_filter_results_040422.jpg" />
</div>

In [None]:
# declare global variable for size of original YT dataset
# Hard-coded row count (total number of videos); used as the denominator when reporting
# which share of the dataset contains a patreon link.
DF_YT_METADATA_ROWS = 72_924_794

In [None]:
# YT metadata containing patreon ids in description
# (IPython shell escape showing the size of the pre-filtered file that is loaded next)
!ls -lh {LOCAL_DATA_FOLDER}yt_metadata_en_pt_040422.tsv.gz

In [None]:
# Load the pre-filtered YouTube metadata (tab-separated, gzip-compressed); takes about 2 mins.
# lineterminator is forced to '\n' as in the original call.
df_yt_metadata_pt = pd.read_csv(
    LOCAL_DATA_FOLDER + "yt_metadata_en_pt_040422.tsv.gz",
    compression='gzip',
    sep="\t",
    lineterminator='\n',
)

In [None]:
# Clean the patreon ids.
#
# 1. Lowercase every patreon id FIRST, so that case variants of the same id collapse into one
#    value and case variants of the placeholders (e.g. 'Patreon.com/Posts') are also caught by
#    the filter below. Filtering before lowercasing let those variants through.
# 2. Drop rows whose "id" is only the generic patreon.com/posts or patreon.com/user prefix
#    (artefact of the extraction regex; to be fixed in the regex in the future).
#
# .assign() returns a new frame, so the column is never written onto a filtered slice
# (no SettingWithCopyWarning) and re-running the cell gives the same result.
_INVALID_PATREON_IDS = ['patreon.com/posts', 'patreon.com/user']

df_yt_metadata_pt = (
    df_yt_metadata_pt
    .assign(patreon_id=df_yt_metadata_pt['patreon_id'].str.lower())
    .loc[lambda df: ~df['patreon_id'].isin(_INVALID_PATREON_IDS)]
)

In [None]:
# preview the first row of df_yt_metadata_pt to check the available columns
df_yt_metadata_pt.head(1)

In [None]:
# Summary statistics of the filtered YouTube metadata.

# unique patreon ids and unique channel ids (both arrays are reused by later cells)
yt_patreon_list = df_yt_metadata_pt['patreon_id'].unique()
yt_pt_channel_list = df_yt_metadata_pt['channel_id'].unique()

nb_pt_videos = len(df_yt_metadata_pt)
pt_videos_share = nb_pt_videos / DF_YT_METADATA_ROWS

stats_lines = [
    "[YouTube metadata] Total number of videos:                                                {:>10,}".format(DF_YT_METADATA_ROWS),
    "[Filtered YouTube metadata] number of videos that contain a patreon link in description:  {:>10,} ({:.1%} of total dataset)".format(nb_pt_videos, pt_videos_share),
    "[Filtered YouTube metadata] total number of unique patreon ids:                           {:>9,}".format(len(yt_patreon_list)),
    "[Filtered YouTube metadata] number of unique channels that contain a patreon account:     {:>9,}".format(len(yt_pt_channel_list)),
]
for stats_line in stats_lines:
    print(stats_line)

**Observation:** \
We can see that we have _**more patreon ids than channels**_ . Let's investigate further:

#### Restrict to 1 patreon id per youtube channel

In [None]:
# For every (channel_id, patreon_id) pair, count the distinct videos (display_ids) that link to it
df_yt_metadata_pt_grp_chan = (
    df_yt_metadata_pt
    .groupby(['channel_id', 'patreon_id'])
    .agg(display_id_cnt=("display_id", "nunique"))
)
df_yt_metadata_pt_grp_chan.head()

In [None]:
# Turn the (channel_id, patreon_id) index levels back into regular columns.
# NOTE: this rebinds df_yt_metadata_pt_grp_chan, so the cell is meant to be run once after the groupby cell.
df_yt_metadata_pt_grp_chan = df_yt_metadata_pt_grp_chan.reset_index()

# Number of patreon ids per channel, largest first, as a one-column frame
pt_id_cnt_pr_chan = (
    df_yt_metadata_pt_grp_chan
    .groupby('channel_id')['patreon_id']
    .count()
    .sort_values(ascending=False)
    .to_frame(name='patreon_id_cnt')
)
pt_id_cnt_pr_chan.head()

In [None]:
# Histogram of the number of patreon ids per channel
fig, axs = plt.subplots(figsize=(6, 4))

# linear x axis, logarithmic y axis (only the y scale is switched to log below)
sns.histplot(data=pt_id_cnt_pr_chan, bins=50, kde=False, legend=False, color='C0', ax=axs)
axs.set_title('Distribution of patreon ids per channel (log scale)')
axs.set_xlabel("Number of patreon ids")
axs.set_ylabel("Count of channels (log scale)")
axs.set_yscale("log")

plt.show()

# descriptive statistics table
pt_id_cnt_pr_chan.describe().T

**Discussion:** \
As we observed earlier, some channels use more than 1 patreon id, and use different patreon urls for different videos. For example:
- [Patreon_Gaming](https://www.youtube.com/channel/UCAsLyFlWkbdhvri02tO6veA) uses 73 different patreon ids.
- [Artistic Maniacs](https://www.youtube.com/channel/UC3pcSD6_RRisNLaHGznemJA) uses 69 different patreon ids.

In [None]:
# Example: first (channel, patreon id) rows of the Artistic Maniacs channel
df_yt_metadata_pt_grp_chan.loc[
    df_yt_metadata_pt_grp_chan['channel_id'] == 'UC3pcSD6_RRisNLaHGznemJA'
].head()

_Optional: Keep only most used patreon_id per channel (patreon_id with most videos for each channel)_

In [None]:
# Order rows by channel, and within each channel by descending video count,
# so that the most used patreon id of every channel comes first
df_yt_metadata_pt_grp_chan = df_yt_metadata_pt_grp_chan.sort_values(
    by=['channel_id', 'display_id_cnt'],
    ascending=[True, False],
)
df_yt_metadata_pt_grp_chan.head(5)

In [None]:
# Rows beyond the first one of each channel, i.e. the additional patreon ids of a channel
is_extra_patreon_id = df_yt_metadata_pt_grp_chan.duplicated(subset=['channel_id'], keep='first')
dup_chan_id = df_yt_metadata_pt_grp_chan[is_extra_patreon_id]
print("Number of duplicate rows (same channel id with multiple patreon_ids): {:,}".format(len(dup_chan_id)))

In [None]:
# Keep one row per channel: the first one, which is the patreon id with the most videos
# provided the frame was sorted by descending display_id_cnt within each channel beforehand
df_yt_metadata_unique_pt = df_yt_metadata_pt_grp_chan.drop_duplicates(subset='channel_id', keep='first')

nb_removed_rows = len(df_yt_metadata_pt_grp_chan) - len(df_yt_metadata_unique_pt)
print('Removed {:,} rows'.format(nb_removed_rows))
df_yt_metadata_unique_pt.head()

#### "Match" dataframe (channel/patreon)

Consider them linked only if 
- [TODO] there is a Patreon link in more than 10% of their videos and the second most common Patreon link occurs in fewer than 2-3 videos.
- [TODO] Remove channels whose patreon ids are not unique
- Match YouTube channel to Patreon id which appears in most of its videos

In [None]:
# "Matched" dataframe: one (channel_id, patreon_id) pair per channel
df_matched_channel_patreon = df_yt_metadata_unique_pt.loc[:, ['channel_id', 'patreon_id']]
df_matched_channel_patreon.head()

In [None]:
# save "matched" dataframe to LOCAL SCRATCH FOLDER as a compressed tsv
# output_file_path = LOCAL_DATA_FOLDER+"df_matched_channel_patreon.tsv.gz"
# df_matched_channel_patreon.to_csv(output_file_path, index=False, sep='\t', compression='gzip')

##### [_Ignore for now_] Further Observation

**Further Observation:** \
When grouping YouTube metadata by `channel_id` and `patreon_id`, we also notice that we have more rows than the total number of unique patreon ids. \
This is because some `patreon_id` are used on multiple channels. 

In [None]:
# Compare the number of (channel, patreon id) rows with the number of distinct patreon ids
nb_grp_rows = len(df_yt_metadata_pt_grp_chan)
nb_unique_patreon_ids = df_yt_metadata_pt['patreon_id'].nunique()

print("total rows:                        {:,}".format(nb_grp_rows))
print("total number of unique patreon ids {:,}".format(nb_unique_patreon_ids))

In [None]:
# Rows whose patreon_id appears more than once, i.e. patreon ids used on several channels
is_shared_patreon_id = df_yt_metadata_pt_grp_chan.duplicated(subset=['patreon_id'], keep=False)
df_yt_metadata_pt_grp_chan[is_shared_patreon_id].sort_values(by='patreon_id')

In [None]:
# Number of distinct channels that use each patreon id, largest first.
# df_yt_metadata_pt has one row per video, so 'count' would report the number of video rows
# per patreon id; 'nunique' is required to actually count channels.
print("[Filtered YouTube metadata] number of channels per patreon id:")

chan_cnt_per_patreon_id = (
    df_yt_metadata_pt
    .groupby('patreon_id')
    .agg(channel_id_count=('channel_id', 'nunique'))
    .sort_values(by=['channel_id_count'], ascending=False)
)
chan_cnt_per_patreon_id

##### [_Ignore for now_] Number of videos per patreon id

In [None]:
# Number of distinct videos (display_ids) per patreon id, largest first
vids_cnt_per_patreon_id = (
    df_yt_metadata_pt
    .groupby('patreon_id')
    .agg(display_id_cnt=("display_id", "nunique"))
    .sort_values(by='display_id_cnt', ascending=False)
)

print("[Filtered YouTube metadata] number of videos per patreon id:")
vids_cnt_per_patreon_id

In [None]:
# Histogram of the number of videos per patreon id
fig, axs = plt.subplots(figsize=(6, 4))

# linear x axis, logarithmic y axis (only the y scale is switched to log below)
sns.histplot(data=vids_cnt_per_patreon_id, bins=50, kde=False, color='C0', ax=axs)
axs.set_title('Distribution of videos per patreon id (log scale)')
axs.set_xlabel("Number of videos")
axs.set_ylabel("# patreon ids (log scale)")
axs.set_yscale("log")

plt.tight_layout()
plt.show()

# descriptive statistics table
vids_cnt_per_patreon_id.describe().T

**Discussion:** \
From the above graph and table, we can see that the distribution of _videos_ among patreon ids follows a **power law**, meaning that most patreon accounts have only a few videos, while a few of them have a lot of videos.

More specifically:
- 25% of the Patreon accounts have 1 video
- 50% of the Patreon accounts have fewer than 4 videos

### 2.2 Filter YouTube timeseries - Restrict YouTube channels (4 filters)
Restrict YouTube channels according to the following criteria (filters are applied sequentially):
- Filter 1: Keep only YouTube channels that are in YouTube Timeseries dataset AND linked to a patreon account 
- Filter 2: At least 2 years between first and last video
- Filter 3: At least 20 videos with patreon ids
- Filter 4: At least 250k subscribers at data crawling time

In [None]:
# show the size of the channel-level time-series file (IPython shell escape)
!ls -lh {DATA_FOLDER}df_timeseries_en.tsv.gz

In [None]:
# Load the channel-level time-series (takes about 50 secs); 'datetime' is parsed as a timestamp
df_yt_timeseries = pd.read_csv(
    DATA_FOLDER + 'df_timeseries_en.tsv.gz',
    compression='gzip',
    sep="\t",
    parse_dates=['datetime'],
)

In [None]:
# preview the first rows of the channel-level time-series
df_yt_timeseries.head(3)

In [None]:
# Define global values for filters
# MIN_DAYS_DELTA is a string that is parsed with pd.Timedelta where it is used (730 days = 2 x 365)
MIN_DAYS_DELTA = "730 day"    # filter 2
NB_PATREON_VIDS = 20          # filter 3
NB_SUBS = 250_000             # filter 4

In [None]:
# Size of the original YT timeseries dataset (df_yt_timeseries must be loaded first)
yt_ts_uniq_chan_cnt = df_yt_timeseries['channel'].nunique()
yt_ts_row_cnt = len(df_yt_timeseries)

print("[YouTube Timeseries] Nb of rows of original dataset:                  {:>10,}".format(yt_ts_row_cnt))
print("[YouTube Timeseries] Nb of channels of original dataset:              {:>10,}".format(yt_ts_uniq_chan_cnt))

---
##### **• Filter 1:** Keep only YouTube channels that are in YouTube Timeseries dataset AND linked to a patreon account

In [None]:
# Filter 1: keep only the timeseries rows whose channel appears in the filtered YT metadata
# (requires yt_pt_channel_list, built from df_yt_metadata_pt in section 2.1)
in_patreon_channels = df_yt_timeseries['channel'].isin(yt_pt_channel_list)
df_yt_timeseries_filt1 = df_yt_timeseries[in_patreon_channels]

chan_list_filt1 = df_yt_timeseries_filt1['channel'].unique()
chan_list_filt1_cnt = len(chan_list_filt1)

rows_share_filt1 = len(df_yt_timeseries_filt1) / len(df_yt_timeseries)
chan_share_filt1 = chan_list_filt1_cnt / yt_ts_uniq_chan_cnt

print("[YouTube Timeseries] Nb of rows of after applying filter 1:           {:>10,} ({:5.1%} of original dataset)".format(len(df_yt_timeseries_filt1), rows_share_filt1))
print("[YouTube Timeseries] Nb of channels after applying filter 1:          {:>10,} ({:5.1%} of original dataset)".format(chan_list_filt1_cnt, chan_share_filt1))

---
##### **• Filter 2:** At least 2 years between first and last video

In [None]:
# Filter 2: keep channels whose timeseries spans at least MIN_DAYS_DELTA.
#
# For each filter-1 channel, take the earliest and latest 'datetime' present in the timeseries
# and compute the span between them.
# NOTE(review): this measures the span of the timeseries datapoints, used here as a proxy for
# "time between first and last video" - confirm that this is the intended definition.
datetime_data = df_yt_timeseries_filt1.groupby('channel').agg(datetime_min=('datetime', 'min'),
                                                              datetime_max=('datetime', 'max'))
datetime_data['delta_datetime'] = datetime_data['datetime_max'] - datetime_data['datetime_min']

# "at least" is inclusive, hence >= (a strict > would drop channels spanning exactly MIN_DAYS_DELTA)
datetime_data_filt2 = datetime_data[datetime_data['delta_datetime'] >= pd.Timedelta(MIN_DAYS_DELTA)]

# Apply filter on YT Timeseries dataset: retain only the channels that passed the span check
# (datetime_data_filt2 is indexed by channel)
df_yt_timeseries_filt2 = df_yt_timeseries_filt1[df_yt_timeseries_filt1['channel'].isin(datetime_data_filt2.index)]

chan_list_filt2 = df_yt_timeseries_filt2['channel'].unique()
chan_list_filt2_cnt = len(chan_list_filt2)

print("[YouTube Timeseries] Nb of rows of after applying filter 1+2:         {:>10,} ({:5.1%} of original dataset, {:5.1%} of filter 1 dataset)".format(len(df_yt_timeseries_filt2), len(df_yt_timeseries_filt2)/len(df_yt_timeseries), len(df_yt_timeseries_filt2)/len(df_yt_timeseries_filt1)))
print("[YouTube Timeseries] Nb of channels after applying filter 1+2:        {:>10,} ({:5.1%} of original dataset, {:5.1%} of filter 1 channels)".format(chan_list_filt2_cnt, chan_list_filt2_cnt/yt_ts_uniq_chan_cnt, chan_list_filt2_cnt/chan_list_filt1_cnt))

___

##### **• Filter 3:** At least 20 videos with patreon ids per channel 

In [None]:
# Filter 3: keep channels that have at least NB_PATREON_VIDS videos linking to a patreon id.
#
# df_yt_metadata_pt_grp_chan (built in section 2.1) holds, for every (channel_id, patreon_id)
# pair, the number of distinct videos (display_id_cnt). A channel passes if at least one of its
# pairs reaches the threshold.
# "at least" is inclusive, hence >= (a strict > would drop channels with exactly NB_PATREON_VIDS videos)
df_yt_metadata_pt_grp_chan_filt3 = df_yt_metadata_pt_grp_chan[df_yt_metadata_pt_grp_chan['display_id_cnt'] >= NB_PATREON_VIDS]

# get list of unique channels satisfying filter 3
chan_list_filt_3 = df_yt_metadata_pt_grp_chan_filt3['channel_id'].unique()

# Apply filter on YT Timeseries dataset: retain only those channels from filt 2 that are in the chan_list_filt_3
df_yt_timeseries_filt3 = df_yt_timeseries_filt2[df_yt_timeseries_filt2['channel'].isin(chan_list_filt_3)]

chan_list_filt3 = df_yt_timeseries_filt3['channel'].unique()
chan_list_filt3_cnt = len(chan_list_filt3)

print("[YouTube Timeseries] Nb of rows of after applying filter 1+2+3:       {:>10,} ({:5.1%} of original dataset, {:5.1%} of filter 2 dataset)".format(len(df_yt_timeseries_filt3), len(df_yt_timeseries_filt3)/len(df_yt_timeseries), len(df_yt_timeseries_filt3)/len(df_yt_timeseries_filt2)))
print("[YouTube Timeseries] Nb of channels after applying filter 1+2+3:      {:>10,} ({:5.1%} of original dataset, {:5.1%} of filter 2 channels)".format(chan_list_filt3_cnt, chan_list_filt3_cnt/yt_ts_uniq_chan_cnt, chan_list_filt3_cnt/chan_list_filt2_cnt))

---
##### **• Filter 4:** At least 250k subscribers at data crawling time

In [None]:
# Per-channel minimum and maximum subscriber counts over the filter-3 timeseries,
# ordered by descending maximum, with 'channel' as a regular column
subs_aggr_per_channel = (
    df_yt_timeseries_filt3
    .groupby('channel')
    .agg(min_subs=('subs', 'min'), max_subs=('subs', 'max'))
    .sort_values(by=['max_subs'], ascending=False)
    .reset_index()
)

In [None]:
# Filter 4: keep channels with at least NB_SUBS subscribers.
#
# Uses subs_aggr_per_channel (per-channel min/max of 'subs' over the filter-3 timeseries);
# max_subs is the highest subscriber count observed for the channel.
# NOTE(review): the filter is described as "subscribers at data crawling time"; max_subs equals
# that only if the subscriber count never decreases - confirm.
# "at least" is inclusive, hence >= (a strict > would drop channels with exactly NB_SUBS subscribers)
subs_per_channel_filt4 = subs_aggr_per_channel[subs_aggr_per_channel['max_subs'] >= NB_SUBS]

# get list of unique channels satisfying filter 4
chan_list_filt_4 = subs_per_channel_filt4['channel'].unique()

# Apply filter on YT Timeseries dataset: retain only those channels from filt_3 that are in the chan_list_filt_4
df_yt_timeseries_filt4 = df_yt_timeseries_filt3[df_yt_timeseries_filt3['channel'].isin(chan_list_filt_4)]

chan_list_filt4 = df_yt_timeseries_filt4['channel'].unique()
chan_list_filt4_cnt = len(chan_list_filt4)

print("[YouTube Timeseries] Nb of rows of after applying filter 1+2+3+4:     {:>10,} ({:5.1%} of original dataset, {:5.1%} of filter 3 dataset)".format(len(df_yt_timeseries_filt4), len(df_yt_timeseries_filt4)/len(df_yt_timeseries), len(df_yt_timeseries_filt4)/len(df_yt_timeseries_filt3)))
print("[YouTube Timeseries] Nb of channels after applying filter 1+2+3+4:    {:>10,} ({:5.1%} of original dataset, {:5.1%} of filter 3 channels)".format(chan_list_filt4_cnt, chan_list_filt4_cnt/yt_ts_uniq_chan_cnt, chan_list_filt4_cnt/chan_list_filt3_cnt))

---
##### **• Filter 4b**: At least 50k subscribers in the first 6 months

___
___
**• Filters summary**

In [None]:
# Recap of the four filters: definitions, then row counts, channel counts and covered time range
# before and after filtering. Repeated len() / Timedelta computations are hoisted into locals.
min_delta_days = pd.Timedelta(MIN_DAYS_DELTA).days
date_fmt = '%B %d, %Y'

rows_orig = len(df_yt_timeseries)
rows_filt1 = len(df_yt_timeseries_filt1)
rows_filt2 = len(df_yt_timeseries_filt2)
rows_filt3 = len(df_yt_timeseries_filt3)
rows_filt4 = len(df_yt_timeseries_filt4)

print("[YouTube Timeseries] Stats before and after filters:")
print()

# filter definitions
print("Filter 1 = \"keep only YouTube channels that are in YouTube Timeseries dataset AND linked to a patreon account\"")
print("Filter 2 = \"at least {:.1f} years ({} days) between first and last video\"".format(min_delta_days/365, min_delta_days))
print("Filter 3 = \"at least {:,} videos with patreon ids per channel\"".format(NB_PATREON_VIDS))
print("Filter 4 = \"at least {:,} subscribers at data crawling time\"".format(NB_SUBS))
print()

# row counts after each filter
print("[YouTube Timeseries] Nb of rows of original dataset:                  {:>10,}".format(rows_orig))
print("[YouTube Timeseries] Nb of rows of after applying filter 1:           {:>10,} ({:5.1%} of original dataset)".format(rows_filt1, rows_filt1/rows_orig))
print("[YouTube Timeseries] Nb of rows of after applying filter 1+2:         {:>10,} ({:5.1%} of original dataset, {:5.1%} of filter 1 dataset)".format(rows_filt2, rows_filt2/rows_orig, rows_filt2/rows_filt1))
print("[YouTube Timeseries] Nb of rows of after applying filter 1+2+3:       {:>10,} ({:5.1%} of original dataset, {:5.1%} of filter 2 dataset)".format(rows_filt3, rows_filt3/rows_orig, rows_filt3/rows_filt2))
print("[YouTube Timeseries] Nb of rows of after applying filter 1+2+3+4:     {:>10,} ({:5.1%} of original dataset, {:5.1%} of filter 3 dataset)".format(rows_filt4, rows_filt4/rows_orig, rows_filt4/rows_filt3))
print()

# channel counts after each filter
print("[YouTube Timeseries] Nb of channels of original dataset:              {:>10,}".format(yt_ts_uniq_chan_cnt))
print("[YouTube Timeseries] Nb of channels after applying filter 1:          {:>10,} ({:5.1%} of original dataset)".format(chan_list_filt1_cnt, chan_list_filt1_cnt/yt_ts_uniq_chan_cnt))
print("[YouTube Timeseries] Nb of channels after applying filter 1+2:        {:>10,} ({:5.1%} of original dataset, {:5.1%} of filter 1 channels)".format(chan_list_filt2_cnt, chan_list_filt2_cnt/yt_ts_uniq_chan_cnt, chan_list_filt2_cnt/chan_list_filt1_cnt))
print("[YouTube Timeseries] Nb of channels after applying filter 1+2+3:      {:>10,} ({:5.1%} of original dataset, {:5.1%} of filter 2 channels)".format(chan_list_filt3_cnt, chan_list_filt3_cnt/yt_ts_uniq_chan_cnt, chan_list_filt3_cnt/chan_list_filt2_cnt))
print("[YouTube Timeseries] Nb of channels after applying filter 1+2+3+4:    {:>10,} ({:5.1%} of original dataset, {:5.1%} of filter 3 channels)".format(chan_list_filt4_cnt, chan_list_filt4_cnt/yt_ts_uniq_chan_cnt, chan_list_filt4_cnt/chan_list_filt3_cnt))
print()

# time range covered before and after filtering
orig_first = df_yt_timeseries['datetime'].min().strftime(date_fmt)
orig_last = df_yt_timeseries['datetime'].max().strftime(date_fmt)
print('[YouTube Timeseries] Time range of original dataset                   {} and {}'.format(orig_first, orig_last))

filt4_first = df_yt_timeseries_filt4['datetime'].min().strftime(date_fmt)
filt4_last = df_yt_timeseries_filt4['datetime'].max().strftime(date_fmt)
print('[YouTube Timeseries] Time range after applying filter 1+2+3+4        {} and {}'.format(filt4_first, filt4_last))

display(df_yt_timeseries_filt4.head())

print("Restricted list of channels after 4 filters (count = {:,}):".format(chan_list_filt4_cnt))
print(chan_list_filt4)

[ignore] Match patreon_ids and channel_ids

In [None]:
# # filter YT metadata dataset by list of filtered channels from YT timeseries above
# df_yt_metadata_pt_restr = df_yt_metadata_pt[df_yt_metadata_pt['channel_id'].isin(chan_list_filt4)]

# # get unique channels for youtube metadata (original and restricted)
# yt_metadata_uniq_chan = df_yt_metadata_pt['channel_id'].unique()
# yt_metadata_uniq_chan_restr = df_yt_metadata_pt_restr['channel_id'].unique()

# # get unique patreon ids for youtube metadata (original and restricted)
# yt_metadata_uniq_pat = df_yt_metadata_pt['patreon_id'].unique()
# yt_metadata_uniq_pat_restr = df_yt_metadata_pt_restr['patreon_id'].unique()

# print("[YouTube Metadata]:")
# print()
# print("Restriction = \"keep only YouTube channels that are in YouTube Timeseries filtered (filters 1-4) dataset\"")
# print()
# # print("[YouTube Metadata] Nb of videos in original dataset:                                   {:>10,}".format(DF_YT_METADATA_ROWS))
# # print("[YouTube Metadata] Nb of videos in pre-filtered (containing patreon id) dataset:       {:>10,}".format(len(df_yt_metadata_pt)))
# # print("[YouTube Metadata] Nb of videos after filtering by restricted channels:                {:>10,} ({:5.1%} of pre-filtered dataset dataset)".format(len(df_yt_metadata_pt_restr), len(df_yt_metadata_pt_restr)/len(df_yt_metadata_pt)))
# # print()
# print("[YouTube Metadata] Nb of channels in pre-filtered (containing patreon id) dataset:     {:>10,}".format(len(yt_metadata_uniq_chan)))
# print("[YouTube Metadata] Nb of channels after filtering by restricted channels:              {:>10,} ({:5.1%} of pre-filtered dataset dataset)".format(len(yt_metadata_uniq_chan_restr), len(yt_metadata_uniq_chan_restr)/len(yt_metadata_uniq_chan)))
# print()
# print("[YouTube Metadata] Nb of patreon ids in pre-filtered (containing patreon id) dataset:  {:>10,}".format(len(yt_metadata_uniq_pat)))
# print("[YouTube Metadata] Nb of patreon ids after filtering by restricted channels:           {:>10,} ({:5.1%} of pre-filtered dataset dataset)".format(len(yt_metadata_uniq_pat_restr), len(yt_metadata_uniq_pat_restr)/len(yt_metadata_uniq_pat)))


### 2.3 Filter Graphtreon to keep only the ones matching patreon id

GT_timeseries_filter_results_032622.jpg _(filter script in scripts/scripts.ipynb)_
<div>
    <img src="img/GT_timeseries_filter_results_032622.jpg" alt="GT_timeseries_filter_results_032622.jpg" />
</div>

In [None]:
# declare global variable for size of original GT dataset
# Hard-coded row count; reported below as the total number of patreon ids and used as the
# denominator of the percentages.
GT_final_processed_file_ROWS = 232_269

In [None]:
# show the size of the pre-filtered Graphtreon time-series file (IPython shell escape)
!ls -lh {LOCAL_DATA_FOLDER}df_gt_timeseries_filtered.tsv.gz

In [None]:
# Load the pre-filtered Graphtreon time-series (tab-separated, gzip-compressed) and preview it
df_gt_timeseries_filtered = pd.read_csv(
    LOCAL_DATA_FOLDER + "df_gt_timeseries_filtered.tsv.gz",
    compression='gzip',
    sep="\t",
)
df_gt_timeseries_filtered.head(3)

In [None]:
# Size of the pre-filtered Graphtreon file relative to the original Graphtreon dataset
nb_gt_filtered = len(df_gt_timeseries_filtered)
gt_filtered_share = nb_gt_filtered / GT_final_processed_file_ROWS

print("Statistics of loaded pre-filtered Graphtreon Timeseries file:")
print("[Graphtreon Timeseries] Total number of patreon ids:                                                   {:>9,}".format(GT_final_processed_file_ROWS))
print("[Graphtreon Timeseries] Nb of patreon ids that exist in both GT Timeseries and YT metadata:            {:>9,} ({:.1%} of GT timeseries dataset)".format(nb_gt_filtered, gt_filtered_share))


#### 2.3.1. Join GT timeseries with matched channel_id
match the channels in the restricted list of channels of the matched dataframe 

In [None]:
# Inner join of the GT timeseries with the matched (channel_id, patreon_id) pairs:
# GT column 'patreon' is matched against 'patreon_id'.
# NOTE(review): a patreon id matched to several channels yields one output row per channel - confirm this is intended.
df_gt_timeseries_merged = pd.merge(
    df_gt_timeseries_filtered,
    df_matched_channel_patreon,
    how='inner',
    left_on='patreon',
    right_on='patreon_id',
)
df_gt_timeseries_merged.head(3)

#### 2.3.2. Filter/Restrict GT timeseries further
We now want to reduce the Graphtreon dataset by keeping only rows in filtered list of channels (chan_list_filt4)

In [None]:
# Restrict the merged Graphtreon rows to the channels that passed the four timeseries filters
in_restricted_channels = df_gt_timeseries_merged['channel_id'].isin(chan_list_filt4)
df_gt_timeseries_restricted = df_gt_timeseries_merged[in_restricted_channels]

# NOTE(review): the figures below are row counts (len); they equal the number of patreon ids
# only if each patreon id appears in a single row - confirm.
nb_gt_filtered_rows = len(df_gt_timeseries_filtered)
nb_gt_restricted_rows = len(df_gt_timeseries_restricted)

print("[Graphtreon Timeseries] Total number of patreon ids:                                                   {:>9,}".format(GT_final_processed_file_ROWS))
print("[Graphtreon Timeseries] Nb of patreon ids that exist in both GT Timeseries and YT metadata:            {:>9,} ({:.1%} of GT timeseries dataset)".format(nb_gt_filtered_rows, nb_gt_filtered_rows/GT_final_processed_file_ROWS))
print("[Graphtreon Timeseries] Nb of patreon ids that exist in both GT Timeseries and YT metadata restricted  {:>9,} ({:.1%} of GT timeseries dataset)".format(nb_gt_restricted_rows, nb_gt_restricted_rows/GT_final_processed_file_ROWS))


#### 2.3.3 Extract the date and daily earnings per patreon account

In [None]:
# Distinct patreon ids remaining in the restricted Graphtreon dataset
yt_gt_patreon_list_restricted = df_gt_timeseries_restricted['patreon'].unique()

print("list of restricted patreon ids", yt_gt_patreon_list_restricted)
print("number of restricted patreon ids", len(yt_gt_patreon_list_restricted))

In [None]:
# preview the first rows of the restricted Graphtreon dataset
df_gt_timeseries_restricted.head(3)

In [None]:
# example of NaN value
# df_gt_timeseries_sample[df_gt_timeseries_sample['creatorName'] == 'Comedy Trap House']

In [None]:
# # From the Graphtreon dataset, for each channel, extract the date and earnings from “dailyGraph_earningsSeriesData” (takes about 3 mins)
# input_file_path = DATA_FOLDER+"/final_processed_file.jsonl.gz"

# MAX_ITER = 100

# nb_rows_read = 0
# valid_predicate_count = 0
# JSONDecodeErrors_cnt = 0 
# dailyEarningsError_cnt = 0 
# lines_json = []    

# compressed_file_size = os.stat(input_file_path).st_size
# print("Compressed file size is :                 {:>8,.2f} GB".format(compressed_file_size / 2**30))

# uncompressed_file_size = 13_310_000_000
# print("Estimated Uncompressed file size is :     {:>8,.2f} GB".format(uncompressed_file_size / 2**30))

# start = timeit.default_timer()

# # Load tqdm with size counter instead of file counter
# with tqdm(total=uncompressed_file_size, unit='B', unit_scale=True, unit_divisor=1024) as pbar:
#     with gzip.open(input_file_path, "r") as f:
#         for i, line in enumerate(f): 

#             read_bytes = len(line)
#             if read_bytes:
#                 pbar.set_postfix(file=input_file_path[len(DATA_FOLDER)+1:], refresh=False)
#                 pbar.update(read_bytes)

#             nb_rows_read += 1
            
#             # set a maximum iteration for tests
#             if nb_rows_read >= MAX_ITER:
#                 break
    
#             try:
#                 line_json = json.loads(line)
#             except Exception as e:
#                 JSONDecodeErrors_cnt += 1
#                 continue
                
#             # add line if patreon id is exists in df_yt_metadata_pt
#             if line_json['patreon'] in yt_gt_patreon_list_restricted:
#                 valid_predicate_count += 1
                
#                 # Use ast.literal_eval to convert string of lists, to list of list
#                 dailyGraph_earningsSeriesData = line_json.get('dailyGraph_earningsSeriesData')
                
#                 if dailyGraph_earningsSeriesData:
#                     daily_earnings = ast.literal_eval(dailyGraph_earningsSeriesData)
#                 else:
#                     daily_earnings = [[np.nan, np.nan]]
                                            
#                 for daily_earning in daily_earnings:
#                     # case where there are multiple tuples per row
#                     if isinstance(daily_earning, list):
#                         date = daily_earning[0]
#                         earning = daily_earning[1]
#                         lines_json.append({
#                             'creatorName':   line_json.get('creatorName'), 
#                             'creatorRange':  line_json.get('creatorRange'), 
#                             'startDate':     line_json.get('startDate'),
#                             'categoryTitle': line_json.get('categoryTitle'),
#                             'patreon':       line_json.get('patreon'),
#                             'date':          date,
#                             'earning':       earning
#                         })
#                     else:
#                         dailyEarningsError_cnt += 1
#                         print(">>>> dailyEarningsError - skipped line value: ")
#                         print(line_json.get('creatorName'), line_json.get('creatorRange'), line_json.get('startDate'), line_json.get('categoryTitle'), line_json.get('patreon'), daily_earnings)

# stop = timeit.default_timer()
# time_diff = stop - start

# print()
# print("==> total time to read and filter graphtreon time series:                      {:>10.0f} min. ({:.0f}s.)".format(time_diff/60, time_diff)) 
# print("==> number of rows read:                                                       {:>10,}".format(nb_rows_read))
# print("==> number of patreon ids that exist in both GTts and restricted YT metadata:  {:>10,} ({:.2%})".format(valid_predicate_count, valid_predicate_count/nb_rows_read ))
# print("==> number of skipped rows (JSONDecodeErrors):                                 {:>10,}".format(JSONDecodeErrors_cnt))
# print("==> number of skipped rows (dailyEarningsError):                               {:>10,}".format(dailyEarningsError_cnt))

# # create new dataframe with the filtered lines
# df_dailyGraph_earningsSeries = pd.DataFrame(data=lines_json)

GT_timeseries_date_earnings_extract_040422.jpg _(filter script above)_
<div>
    <img src="img/GT_timeseries_date_earnings_extract_040422.jpg" alt="GT_timeseries_date_earnings_extract_040422.jpg" />
</div>

In [None]:
# check for NaN values
# df_dailyGraph_earningsSeries[df_dailyGraph_earningsSeries.isna().any(axis=1)]

In [None]:
# save filtered data to LOCAL SCRATCH FOLDER as a compressed tsv (5.3Mb)
# output_file_path = LOCAL_DATA_FOLDER+"dailyGraph_earningsSeries.tsv.gz"
# df_dailyGraph_earningsSeries.to_csv(output_file_path, index=False, sep='\t', compression='gzip')

#### 2.3.4 Extract the date and daily patrons per patreon account

In [None]:
# # From the Graphtreon dataset, for each channel, extract the date and patrons from “dailyGraph_patronSeriesData” (takes about 3 mins)
# input_file_path = DATA_FOLDER+"/final_processed_file.jsonl.gz"

# MAX_ITER = 1000

# nb_rows_read = 0
# valid_predicate_count = 0
# JSONDecodeErrors_cnt = 0 
# dailyPatronsError_cnt = 0 
# lines_json = []    

# compressed_file_size = os.stat(input_file_path).st_size
# print("Compressed file size is :                 {:>8,.2f} GB".format(compressed_file_size / 2**30))

# uncompressed_file_size = 13_310_000_000
# print("Estimated Uncompressed file size is :     {:>8,.2f} GB".format(uncompressed_file_size / 2**30))

# start = timeit.default_timer()

# # Load tqdm with size counter instead of file counter
# with tqdm(total=uncompressed_file_size, unit='B', unit_scale=True, unit_divisor=1024) as pbar:
#     with gzip.open(input_file_path, "r") as f:
#         for i, line in enumerate(f): 

#             read_bytes = len(line)
#             if read_bytes:
#                 pbar.set_postfix(file=input_file_path[len(DATA_FOLDER)+1:], refresh=False)
#                 pbar.update(read_bytes)

#             nb_rows_read += 1
            
#             # set a maximum iteration for tests
#             if nb_rows_read >= MAX_ITER:
#                 break
    
#             try:
#                 line_json = json.loads(line)
#             except Exception as e:
#                 JSONDecodeErrors_cnt += 1
#                 continue
                
#             # add line if patreon id is exists in df_yt_metadata_pt
#             if line_json['patreon'] in yt_gt_patreon_list_restricted:
#                 valid_predicate_count += 1
                
#                 # Use ast.literal_eval to convert string of lists, to list of list
#                 dailyGraph_patronSeriesData = line_json.get('dailyGraph_patronSeriesData')
                
#                 if dailyGraph_patronSeriesData:
#                     daily_patrons = ast.literal_eval(dailyGraph_patronSeriesData)
#                 else:
#                     daily_patrons = [[np.nan, np.nan]]
                                            
#                 for daily_patron in daily_patrons:
#                     # case where there are multiple tuples per row
#                     if isinstance(daily_patron, list):
#                         date = daily_patron[0]
#                         patrons = daily_patron[1]
#                         lines_json.append({
#                             'creatorName':   line_json.get('creatorName'), 
#                             'creatorRange':  line_json.get('creatorRange'), 
#                             'startDate':     line_json.get('startDate'),
#                             'categoryTitle': line_json.get('categoryTitle'),
#                             'patreon':       line_json.get('patreon'),
#                             'date':          date,
#                             'patrons':       patrons
#                         })
#                     else:
#                         dailyPatronsError_cnt += 1
#                         print(">>>> dailyPatronsError - skipped line value: ")
#                         print(line_json.get('creatorName'), line_json.get('creatorRange'), line_json.get('startDate'), line_json.get('categoryTitle'), line_json.get('patreon'), daily_patrons)

# stop = timeit.default_timer()
# time_diff = stop - start

# print()
# print("==> total time to read and filter graphtreon time series:                      {:>10.0f} min. ({:.0f}s.)".format(time_diff/60, time_diff)) 
# print("==> number of rows read:                                                       {:>10,}".format(nb_rows_read))
# print("==> number of patreon ids that exist in both GTts and restricted YT metadata:  {:>10,} ({:.2%})".format(valid_predicate_count, valid_predicate_count/nb_rows_read ))
# print("==> number of skipped rows (JSONDecodeErrors):                                 {:>10,}".format(JSONDecodeErrors_cnt))
# print("==> number of skipped rows (dailyPatronsError):                               {:>10,}".format(dailyPatronsError_cnt))

# # create new dataframe with the filtered lines
# df_dailyGraph_patronsSeries = pd.DataFrame(data=lines_json)
# df_dailyGraph_patronsSeries

GT_timeseries_date_patrons_extract_042922.jpg _(filter script above)_
<div>
    <img src="img/GT_timeseries_date_patrons_extract_042922.jpg" alt="GT_timeseries_date_patrons_extract_042922.jpg" />
</div>

In [None]:
# check for NaN values
# df_dailyGraph_patronsSeries[df_dailyGraph_patronsSeries.isna().any(axis=1)]

In [None]:
# save filtered data to LOCAL SCRATCH FOLDER as a compressed tsv (7.1Mb)
# output_file_path = LOCAL_DATA_FOLDER+"dailyGraph_patronsSeries.tsv.gz"
# df_dailyGraph_patronsSeries.to_csv(output_file_path, index=False, sep='\t', compression='gzip')

#### 2.3.5 Merge extracted time series of daily earnings and daily patrons

In [None]:
!ls -lh {LOCAL_DATA_FOLDER}dailyGraph_earningsSeries.tsv.gz

In [None]:
# Load the previously extracted daily earnings time series from disk.
# The `date` column is kept exactly as stored in the file: the conversion below is commented out.
df_dailyGraph_earningsSeries = pd.read_csv(
    f"{LOCAL_DATA_FOLDER}dailyGraph_earningsSeries.tsv.gz",
    sep="\t",
    compression="gzip",
)
# df_dailyGraph_earningsSeries.date = pd.to_datetime(df_dailyGraph_earningsSeries.date, unit='ms')
df_dailyGraph_earningsSeries

In [None]:
!ls -lh {LOCAL_DATA_FOLDER}dailyGraph_patronsSeries.tsv.gz

In [None]:
# Load the previously extracted daily patrons time series from disk.
# The `date` column is kept exactly as stored in the file: the conversion below is commented out.
df_dailyGraph_patronsSeries = pd.read_csv(
    f"{LOCAL_DATA_FOLDER}dailyGraph_patronsSeries.tsv.gz",
    sep="\t",
    compression="gzip",
)
# df_dailyGraph_patronsSeries.date = pd.to_datetime(df_dailyGraph_patronsSeries.date, unit='ms')
df_dailyGraph_patronsSeries

In [None]:
# Combine the earnings and patrons series into a single frame.
# No `on=` is given, so pandas joins on every column name the two frames have in common;
# the outer join keeps rows that are present in only one of the two series.
# `patrons` is then cast to the nullable Int64 dtype so that it can hold the missing
# values introduced by the outer join while remaining an integer column.
df_dailyGraph_patrons_and_earnings_Series = pd.merge(
    df_dailyGraph_earningsSeries,
    df_dailyGraph_patronsSeries,
    how='outer',
).astype({'patrons': 'Int64'})
df_dailyGraph_patrons_and_earnings_Series.head()

In [None]:
# save filtered data to LOCAL SCRATCH FOLDER as a compressed tsv (6.2Mb)
# output_file_path = LOCAL_DATA_FOLDER+"dailyGraph_patrons_and_earnings_Series.tsv.gz"
# df_dailyGraph_patrons_and_earnings_Series.to_csv(output_file_path, index=False, sep='\t', compression='gzip')

### 2.4 Plots

In [None]:
!ls -lh {LOCAL_DATA_FOLDER}dailyGraph_patrons_and_earnings_Series.tsv.gz

In [None]:
# Reload the merged patrons + earnings series from disk, then restore the column types:
#   - `date` is parsed as a millisecond epoch timestamp
#   - `patrons` becomes a nullable integer (Int64) so that missing values are preserved
df_dailyGraph_patrons_and_earnings_Series = (
    pd.read_csv(
        f"{LOCAL_DATA_FOLDER}dailyGraph_patrons_and_earnings_Series.tsv.gz",
        sep="\t",
        compression="gzip",
    )
    .assign(date=lambda frame: pd.to_datetime(frame['date'], unit='ms'))
    .astype({'patrons': 'Int64'})
)

print(df_dailyGraph_patrons_and_earnings_Series.dtypes)
df_dailyGraph_patrons_and_earnings_Series

#### 2.4.1 Plot Patreon Time Series

In [None]:
# Date tick helpers used by the time-series plots of this section:
# a major tick every year (labelled with the 4-digit year) and a minor tick every month.
# NOTE(review): Matplotlib's ticker documentation advises against attaching one Locator
# instance to more than one Axis; when plotting several subplots, prefer creating fresh
# instances per axis instead of reusing these module-level objects.
years = mdates.YearLocator()
months = mdates.MonthLocator()
years_fmt = mdates.DateFormatter('%Y')

In [None]:
# RE-declare global variable for size of original GT dataset
# (used below as the denominator when reporting the share of patreon ids that were kept)
GT_final_processed_file_ROWS = 232_269

##### Restrict to top Patreon accounts (by number of patrons)

In [None]:
# number of Patreon accounts (ranked by their maximum number of daily patrons) kept for the analysis
TOP_CNT = 863

# One summary row per patreon account: number of data points, covered date range and
# mean/max of daily earnings and patrons. Rows are ordered by the maximum number of
# patrons, largest first.
# NOTE: the 'lastest_date' spelling is kept as-is because other cells may rely on this column name.
dailyGraph_grp_patreon = (
    df_dailyGraph_patrons_and_earnings_Series
    .groupby('patreon')
    .agg(date_cnt=('date', 'count'),
         earliest_date=('date', 'min'),
         lastest_date=('date', 'max'),
         daily_earning_mean=('earning', 'mean'),
         daily_earning_max=('earning', 'max'),
         daily_patrons_mean=('patrons', 'mean'),
         daily_patrons_max=('patrons', 'max'))
    .sort_values(by=['daily_patrons_max'], ascending=False)
    .round(2)
)

# remove patreon accounts with no earnings data at all
dailyGraph_grp_patreon = dailyGraph_grp_patreon[dailyGraph_grp_patreon['daily_earning_mean'].notna()]
dailyGraph_grp_patreon = dailyGraph_grp_patreon.reset_index()

# remove hours from dates
# dailyGraph_grp_patreon.earliest_date = dailyGraph_grp_patreon.earliest_date.dt.date
# dailyGraph_grp_patreon.lastest_date = dailyGraph_grp_patreon.lastest_date.dt.date

# extract the top patreon accounts, i.e. the TOP_CNT accounts with the highest maximum
# number of daily patrons (the frame is sorted by 'daily_patrons_max', not by earnings)
top_patreons = dailyGraph_grp_patreon[:TOP_CNT]['patreon']

print("[Graphtreon Timeseries] Total number of patreon ids (original file):                      {:>9,}".format(GT_final_processed_file_ROWS))
print("[Graphtreon Timeseries] Nb of patreon ids in dailyGraph patreon + earnings time series:   {:>9,} ({:.1%} of original dataset)".format(len(dailyGraph_grp_patreon), len(dailyGraph_grp_patreon)/GT_final_processed_file_ROWS))

print()

# only the first 10 rows are displayed; the caption states the actual sort key
dailyGraph_grp_patreon[:10].style.set_caption(f"First 10 of the top {TOP_CNT} Patreon accounts (sorted by max daily patrons)")


In [None]:
# Keep only the daily rows that belong to one of the selected top Patreon accounts
df_top_pt_daily_earnings = df_dailyGraph_patrons_and_earnings_Series.loc[
    lambda frame: frame['patreon'].isin(top_patreons)
]
df_top_pt_daily_earnings

In [None]:
# Plot, for the first TOP_CNT_local top Patreon accounts, the daily number of patrons
# (left y-axis, blue) and the earnings (right y-axis, orange) on a 2-column grid.
TOP_CNT_local = 4  # number of accounts to plot; set to TOP_CNT to plot every selected account

accounts_to_plot = top_patreons[:TOP_CNT_local]

# squeeze=False keeps `axs` two-dimensional even when the grid has a single row,
# so that axs[row, col] is always valid
fig, axs = plt.subplots(math.ceil(TOP_CNT_local/2), 2, figsize=(12, TOP_CNT_local*1.2),
                        sharey=False, sharex=False, squeeze=False)

for idx, patreon in tqdm(enumerate(accounts_to_plot), total=len(accounts_to_plot)):
    row, col = divmod(idx, 2)
    ax1 = axs[row, col]

    tmp_df = df_top_pt_daily_earnings[df_top_pt_daily_earnings['patreon'] == patreon]

    color = 'tab:blue'
    # 'patrons' is a nullable Int64 column: cast it to float64 so that missing values
    # become NaN, which Matplotlib can draw as gaps (pd.NA is not supported)
    patrons, = ax1.plot(tmp_df['date'], tmp_df['patrons'].astype('float64'), color=color, label='patrons')
    ax1.set_ylabel('# Patrons', color=color)
    ax1.tick_params(axis='y', labelcolor=color)
    ax1.set(title=patreon)

    color = 'tab:orange'
    ax2 = ax1.twinx()  # Create a twin Axes sharing the xaxis.
    earnings, = ax2.plot(tmp_df['date'], tmp_df['earning'], color=color, label='earnings')
    ax2.set_ylabel("Earnings per month", color=color)
    ax2.tick_params(axis='y', labelcolor=color)

    # A Locator/Formatter instance is bound to the Axis it is attached to, so every
    # subplot gets its own instances instead of sharing module-level ones.
    ax1.xaxis.set_major_locator(mdates.YearLocator())         # every year
    ax1.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
    ax1.xaxis.set_minor_locator(mdates.MonthLocator())        # every month
    # ax1.legend(handles=[earnings, patrons], loc='upper left');

# hide the panels of the grid that received no data (e.g. odd number of accounts)
for unused_ax in axs.flat[len(accounts_to_plot):]:
    unused_ax.set_visible(False)

fig.suptitle(f'Timeseries of the top {TOP_CNT_local} Patreon accounts (most subscribers) \n', fontweight="bold")
fig.text(0.5,0, 'Month')
# fig.text(0,0.5, 'Earnings per month ($)', rotation = 90)
fig.tight_layout(pad=3, w_pad=5, h_pad=2)

**Observation:**
We can see a drop in income at the beginning of each month.
- This is probably due to people unsubscribing.
- We could smooth it out with some averaging (e.g. a moving average).

In [None]:
# Inspect a single Patreon account in detail: show its first 20 daily rows
patreon_account = 'patreon.com/tinymeatgang'

# temporarily raise the display limits so that the rows are not truncated
with pd.option_context('display.max_rows', 90, 'display.min_rows', 90):
    display(
        df_top_pt_daily_earnings
        .loc[df_top_pt_daily_earnings['patreon'].eq(patreon_account)]
        # optional extra filter: .loc[lambda frame: frame['date'] > pd.Timestamp('2021-01-01')]
        .head(20)
    )

# column types of the top-accounts frame
df_top_pt_daily_earnings.dtypes

# check for NaN values
# df_top_pt_daily_earnings[df_top_pt_daily_earnings.isna().any(axis=1)]

##### Detect breaks / shocks

In [None]:
# n_breaks = 3
# model = rpt.Dynp(model="l1")

# # plot Patreon daily earningsSeriesData for top patreon accounts
# fig, axs = plt.subplots(int(TOP_CNT/2), 2, figsize=(12, TOP_CNT*1.2), sharey=False, sharex=False)
# for idx, patreon in enumerate(top_patreons):
#     print(patreon)
#     row = math.floor(idx/2)
#     col = idx % 2
#     sbplt = axs[row, col]

#     tmp_df = df_top_pt_daily_earnings[df_top_pt_daily_earnings['patreon'] == patreon]

#     # convert the dataframe into a time series.
#     ts_df = tmp_df.set_index(tmp_df['date'])
#     ts = ts_df['earning']

#     y = np.array(ts.tolist())

#     model.fit(y)
#     breaks = model.predict(n_bkps=n_breaks-1)
    
#     breaks_rpt = []
#     for i in breaks:
#         breaks_rpt.append(ts.index[i-1])
#     breaks_rpt = pd.to_datetime(breaks_rpt)
    
#     sbplt.plot(ts, label='data')
#     sbplt.set(title=patreon)
#     print_legend = True
#     for i in breaks_rpt:
#         if print_legend:
#             sbplt.axvline(i, color='red',linestyle='dashed', label='breaks')
#             print_legend = False
#         else:
#             sbplt.axvline(i, color='red',linestyle='dashed')

#     sbplt.xaxis.set_major_locator(years)
#     sbplt.xaxis.set_major_formatter(years_fmt)
#     sbplt.xaxis.set_minor_locator(months)
#     sbplt.xaxis.grid(color="#CCCCCC", ls=":")
#     sbplt.yaxis.grid(color="#CCCCCC", ls=":")
    
    
# fig.suptitle(f'Timeseries of the top {TOP_CNT} highest-earning Patreon accounts \n (earnings per month in dollars)', fontweight="bold")
# fig.text(0.5,0, 'Month')
# fig.text(0,0.5, 'Earnings per month ($)', rotation = 90)
# fig.tight_layout(pad=3, w_pad=5, h_pad=2)

#### 2.4.2 Plot YouTube timeseries for channels matching top Patreon accounts

In [None]:
# Load the table that matches YouTube channels to Patreon accounts
df_matched_channel_patreon = pd.read_csv(
    f"{LOCAL_DATA_FOLDER}df_matched_channel_patreon.tsv.gz",
    sep="\t",
    compression="gzip",
)

In [None]:
# Attach the columns of the matching table (used below through `patreon_id`) to the
# YouTube time series: `channel` on the time-series side is joined to `channel_id`
# on the matching side (default inner join).
df_yt_timeseries_filt4_merged = pd.merge(
    df_yt_timeseries_filt4,
    df_matched_channel_patreon,
    left_on='channel',
    right_on='channel_id',
)
df_yt_timeseries_filt4_merged.head(1)

In [None]:
# Keep only the YouTube time-series rows whose Patreon account is one of the top accounts
df_yt_timeseries_top_pt = df_yt_timeseries_filt4_merged.loc[
    lambda frame: frame['patreon_id'].isin(top_patreons)
]

# Report the covered time range before and after restricting to the top accounts
for message, frame in (
    ('[YouTube Timeseries] Time range after applying filter 1+2+3+4              {} and {}', df_yt_timeseries_filt4),
    ('[YouTube Timeseries] Time range after matching top patreon accounts        {} and {}', df_yt_timeseries_top_pt),
):
    first_date = frame['datetime'].min().strftime('%B %d, %Y')
    last_date = frame['datetime'].max().strftime('%B %d, %Y')
    print(message.format(first_date, last_date))
    print()

# distinct Patreon accounts that have at least one matching YouTube time series
top_yt_patreons = df_yt_timeseries_top_pt['patreon_id'].unique()
top_yt_patreons

In [None]:
# Summary per (patreon account, channel) pair of the YouTube time series: number of
# data points, covered date range, maximum views and subscribers, and mean number of
# videos. Column labels state the aggregation that is actually applied
# (previously 'subs_date' held a max and 'videos_max' held a mean).
(
    df_yt_timeseries_top_pt
    .groupby(['patreon_id', 'channel_id'])
    .agg(datetime_cnt=('datetime', 'count'),
         date_min=('datetime', 'min'),
         date_max=('datetime', 'max'),
         views_max=('views', 'max'),
         subs_max=('subs', 'max'),
         videos_mean=('videos', 'mean'))
    .sort_values(by=['videos_mean'], ascending=False)
)

In [None]:
df_yt_timeseries_top_pt.head(3)

In [None]:
# # plot YT cumulative views timeseries for top patreon accounts
# fig, axs = plt.subplots(int(math.ceil(len(top_yt_patreons)/2)), 2, figsize=(12, len(top_yt_patreons)*1.2), sharey=False, sharex=False)
# for idx, patreon in enumerate(top_yt_patreons):
#     row = math.floor(idx/2)
#     col = idx % 2
#     sbplt = axs[row, col]

#     tmp_df = df_yt_timeseries_top_pt[df_yt_timeseries_top_pt['patreon_id'] == patreon]

#     sbplt.plot(tmp_df['datetime'], tmp_df['views'])
#     sbplt.set(title=patreon+"\n"+tmp_df['channel'].iloc[0])
#     sbplt.xaxis.set_major_locator(years)
#     sbplt.xaxis.set_major_formatter(years_fmt)
#     sbplt.xaxis.set_minor_locator(months)
    
    
# fig.suptitle(f'YouTube timeseries of the channels corresponging to the top {TOP_CNT} highest-earning Patreon accounts \n (YT views per week)', fontweight="bold")
# fig.text(0.5,0, 'Week')
# fig.text(0,0.5, 'Views', rotation = 90)
# fig.tight_layout(pad=3, w_pad=5, h_pad=2)

In [None]:
# # plot YT views timeseries for top patreon accounts
# print(f'YouTube views per week timeseries per channel (for the top {TOP_CNT} highest-earning Patreon accounts)')

# for idx, patreon in enumerate(top_yt_patreons):
#     fig, axs = plt.subplots(1, 2, figsize=(12, 3), sharey=False, sharex=True)
#     row = idx    

#     tmp_df = df_yt_timeseries_top_pt[df_yt_timeseries_top_pt['patreon_id'] == patreon]

#     # delta views per week
#     sbplt = axs[0]
#     sbplt.plot(tmp_df['datetime'], tmp_df['delta_views'])
#     sbplt.set(title="YouTube delta views per week")
#     sbplt.set_xlabel('Week')
#     sbplt.set_ylabel('Delta Views')
#     sbplt.xaxis.set_major_locator(years)
#     sbplt.xaxis.set_major_formatter(years_fmt)
#     sbplt.xaxis.set_minor_locator(months)
    
#     # cumulative views per week
#     sbplt = axs[1]
#     sbplt.plot(tmp_df['datetime'], tmp_df['views'])
#     sbplt.set(title="YouTube cumulative views per week")
#     sbplt.set_xlabel('Week')
#     sbplt.set_ylabel('Views')
#     sbplt.xaxis.set_major_locator(years)
#     sbplt.xaxis.set_major_formatter(years_fmt)
#     sbplt.xaxis.set_minor_locator(months)
    
    
#     print(f"{idx+1}: https://youtube.com/channel/{tmp_df['channel'].iloc[0]} \t\t {patreon}")
#     fig.suptitle(f"{patreon}\n{tmp_df['channel'].iloc[0]}", fontweight="bold") 
#     fig.tight_layout(w_pad=5)
    

In [None]:
# # explore the descending drop in cumulative views per week for sonicether channel 
# with pd.option_context('display.max_rows', 90, 'display.min_rows', 90):
#     display(df_yt_timeseries_top_pt[
#         (df_yt_timeseries_top_pt['channel_id'] == 'UCAbpj6UljjAz7JvJt-yJIjg') 
#      & (df_yt_timeseries_top_pt['datetime'] > pd.Timestamp('2018-04-08'))
#      & (df_yt_timeseries_top_pt['datetime'] < pd.Timestamp('2018-06-01'))
#     ].head(90))

In [None]:
# # plot YT subsciptions timeseries for top patreon accounts
# fig, axs = plt.subplots(int(math.ceil(len(top_yt_patreons)/2)), 2, figsize=(12, len(top_yt_patreons)*1.2), sharey=False, sharex=False)
# for idx, patreon in enumerate(top_yt_patreons):
#     row = math.floor(idx/2)
#     col = idx % 2
#     sbplt = axs[row, col]

#     tmp_df = df_yt_timeseries_top_pt[df_yt_timeseries_top_pt['patreon_id'] == patreon]

#     sbplt.plot(tmp_df['datetime'], tmp_df['subs'])
#     sbplt.set(title=patreon+"\n"+tmp_df['channel'].iloc[0])
#     sbplt.xaxis.set_major_locator(years)
#     sbplt.xaxis.set_major_formatter(years_fmt)
#     sbplt.xaxis.set_minor_locator(months)
    
    
# fig.suptitle(f'YouTube timeseries of the channels corresponging to the top {TOP_CNT} highest-earning Patreon accounts \n (YT subscriptions per week)', fontweight="bold")
# fig.text(0.5,0, 'Week')
# fig.text(0,0.5, 'Views', rotation = 90)
# fig.tight_layout(pad=3, w_pad=5, h_pad=2)

In [None]:
# # plot YT # videos timeseries for top patreon accounts

# fig, axs = plt.subplots(int(math.ceil(len(top_yt_patreons)/2)), 2, figsize=(12, len(top_yt_patreons)*1.2), sharey=False, sharex=False)
# for idx, patreon in enumerate(top_yt_patreons):
#     row = math.floor(idx/2)
#     col = idx % 2
#     sbplt = axs[row, col]

#     tmp_df = df_yt_timeseries_top_pt[df_yt_timeseries_top_pt['patreon_id'] == patreon]

#     sbplt.plot(tmp_df['datetime'], tmp_df['videos'])
#     sbplt.set(title=patreon+"\n"+tmp_df['channel'].iloc[0])
#     sbplt.xaxis.set_major_locator(years)
#     sbplt.xaxis.set_major_formatter(years_fmt)
#     sbplt.xaxis.set_minor_locator(months)
    
# fig.suptitle(f'YouTube timeseries of the channels corresponging to the top {TOP_CNT} highest-earning Patreon accounts \n (YT videos per week)', fontweight="bold")
# fig.text(0.5,0, 'Week')
# fig.text(0,0.5, 'Views', rotation = 90)
# fig.tight_layout(pad=3, w_pad=5, h_pad=2)

#### 2.4.3 Compare YouTube and top Patreon timeseries

In [None]:
# Keep only the Patreon accounts that are linked to exactly one YouTube channel.
# Step 1: one row per (patreon_id, channel_id) pair; step 2: count those rows per
# patreon_id, which gives the number of channels of each account.
df_yt_timeseries_top_pt_chan_id_cnt = (
    df_yt_timeseries_top_pt
    .groupby(['patreon_id', 'channel_id'])
    .agg(channel_id_cnt=("channel_id", "nunique"))
    .groupby('patreon_id')
    .count()
)

# accounts with a single channel
df_yt_timeseries_top_pt_unique_chan = df_yt_timeseries_top_pt_chan_id_cnt.loc[
    lambda frame: frame['channel_id_cnt'].eq(1)
]

# the index of the filtered frame holds the patreon ids
top_patreons_unique_chan = df_yt_timeseries_top_pt_unique_chan.index

print("Number of patreon accounts with only 1 YT channel:")
top_patreons_unique_chan.size

In [None]:
def KM(x, pos):
    """Format a tick value with a K (thousands) or M (millions) suffix.

    The suffix is chosen from the magnitude of the value, so negative values
    (e.g. negative weekly deltas) are abbreviated the same way as positive ones.

    Parameters
    ----------
    x : number
        The tick value.
    pos : object
        The tick position; unused, but required by the two-argument
        (value, position) callback signature.

    Returns
    -------
    str
        'x.yM' when |x| > 999,999, 'x.yK' when |x| > 999, otherwise the value
        rounded to an integer, right-aligned on 3 characters and followed by a space.
    """
    magnitude = abs(x)
    if magnitude > 999_999:
        return '%2.1fM' % (x * 1e-6)
    elif magnitude > 999:
        return '%2.1fK' % (x * 1e-3)
    else:
        return '%3.0f ' % (x)
# Matplotlib tick formatter that labels ticks with KM (e.g. via Axis.set_major_formatter)
KM_formatter = FuncFormatter(KM)

In [None]:
# def custom_plot(ax, x, y, title, x_axis_label="default x", y_axis_label="default y", color="#1f77b4", alpha=1):
#     ax.plot(x, y, color, alpha)
#     ax.set(title=title)
#     ax.set_xlabel(x_axis_label)    
#     ax.set_ylabel(y_axis_label)    

# custom_plot(axs[0,0], ts_pt_df['date'], ts_pt_df['patrons'], alpha=0.2)
# custom_plot(axs[0,0], ts_pt_df['date'], ts_pt_df['patrons_ravg'], "Number of patrons", y_axis_label="# Patrons")

##### Manual change point detection

In [None]:
# find max increase algo V2
def find_breakpoint_v2(df, column, window=30):
    """Return the date at which the growth of `column` accelerates the most.

    For every row i that has `window` rows on each side, the growth over the
    previous window (value[i] - value[i-window]) is compared with the growth over
    the next window (value[i+window] - value[i]). The row where the difference
    "next growth - previous growth" is the largest strictly positive value is the
    breakpoint; ties are resolved in favour of the earliest row.

    Only `df` is read: the function does not depend on any notebook-level variable.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows in chronological order, with a 'date' column and the column to scan.
    column : str
        Name of the numeric column to scan (missing values are skipped).
    window : int, default 30
        Number of rows looked at before and after each candidate row.

    Returns
    -------
    The 'date' value of the breakpoint row, or of the first row when no candidate
    row shows a positive acceleration (too few rows, only missing values, ...).

    Raises
    ------
    IndexError
        If `df` is empty.
    """
    # plain float array; missing values (NaN / pd.NA) all become NaN
    values = df[column].to_numpy(dtype='float64', na_value=np.nan)
    n_rows = len(values)
    max_index = 0

    if n_rows > 2 * window:
        before = values[:n_rows - 2 * window]       # value[i - window]
        centre = values[window:n_rows - window]     # value[i]
        after = values[2 * window:]                 # value[i + window]
        # growth after the point minus growth before the point
        acceleration = (after - centre) - (centre - before)

        # NaN entries can never be selected; nanargmax returns the first maximum
        if not np.isnan(acceleration).all():
            best = int(np.nanargmax(acceleration))
            if acceleration[best] > 0:
                max_index = best + window

    return df.iloc[max_index]['date']

In [None]:
# # Find breakpoint and store Patreon breakpoint related statistics in dataframe

# # variables declaration
# month_offset = pd.DateOffset(months=1)
# week_offset = pd.DateOffset(weeks=1)
# rolling_avg_window = 30
# pt_bkpnt = []


# print(f'Iterate over {len(top_patreons_unique_chan)} patreon accounts...')
# # LOOP OVER TOP PATREON ACCOUNTS
# for idx, patreon in tqdm(enumerate(top_patreons_unique_chan)):
    
#     ########################## RESTRICT DATAFRAMES TO 1 PATREON ACCOUNT ##########################

#     # patreon earnings and users
#     tmp_df_pt = df_top_pt_daily_earnings[df_top_pt_daily_earnings['patreon'] == patreon].copy()  
#     tmp_df_pt = tmp_df_pt.drop_duplicates()
    
#     # youtube videos
#     tmp_df_yt = df_yt_timeseries_top_pt[df_yt_timeseries_top_pt['patreon_id'] == patreon].copy()
        
        
#     ########################## RESTRICT DATES FOR ZOOM OUT ##########################
    
#     # set min and max dates for plots   
#     date_min = max([tmp_df_yt['datetime'].min(), tmp_df_pt['date'].min()])
#     date_max = min([tmp_df_yt['datetime'].max(), tmp_df_pt['date'].max()])
    
#     # if no overlap period between YT and Patreon datasets, skip account
#     if date_max < date_min:
#         # print(f":( no overlapping period between YouTube and Patreon datasets\n")
#         continue
        
#     # restrict datasets between min and max dates
#     tmp_df_pt = tmp_df_pt[(tmp_df_pt['date'] >= date_min) & (tmp_df_pt['date'] <= date_max)]
#     tmp_df_yt = tmp_df_yt[(tmp_df_yt['datetime'] >= date_min) & (tmp_df_yt['datetime'] <= date_max)]
    
#     # align both dataframes since youtube starts once a week
#     tmp_df_pt = tmp_df_pt[tmp_df_pt['date'] >= tmp_df_yt['datetime'].min()]
    
    
    
#     ########################## PATREON: CALCULATE MOVING AVERAGE AND WEEKLY DELTAS ##########################
    
#     tmp_df_pt['patrons_ma'] = tmp_df_pt['patrons'].rolling(rolling_avg_window, center=True).mean()
#     tmp_df_pt['earning_ma'] = tmp_df_pt['earning'].rolling(rolling_avg_window, center=True).mean()
#     ts_pt_df = tmp_df_pt.set_index(tmp_df_pt['date']) # set the date as the index
    
#     # resample time series to get 7 days intervals in order to calculate weekly deltas
#     ts_pt_weekly_avg_df = ts_pt_df.resample('7D').mean()
#     ts_pt_weekly_avg_df['delta_patrons'] = ts_pt_weekly_avg_df['patrons'].diff(periods=1)
#     ts_pt_weekly_avg_df['delta_earning'] = ts_pt_weekly_avg_df['earning'].diff(periods=1)
#     ts_pt_weekly_avg_df = ts_pt_weekly_avg_df[1:]  # remove 1st row (which is NA)
#     tmp_df_yt = tmp_df_yt[1:] # remove YT 1st row to start at the same time as PT
    
#     # reorder columns to have deltas columns next to their respective columns
#     patreon_column_names = ['earning', 'delta_earning', 'earning_ma', 'patrons', 'delta_patrons', 'patrons_ma']
#     ts_pt_weekly_avg_df = ts_pt_weekly_avg_df[patreon_column_names]
    
#     # convert Float64 columns to float64 to avoid Matplotlib NAType error
#     ts_pt_weekly_avg_df_float64 = ts_pt_weekly_avg_df.astype({'patrons': 'float64', 'delta_patrons': 'float64'})


    
#     ########################## PRINT TITLES ##########################
#     # print URLs for patreon, graphtreon, YT channel(s) related to this patreon account, and breakpoint date
#     # ch_ids = tmp_df_yt['channel'].unique()
#     # print(f"\n\033[1mRank {idx+1}: {patreon[12:]} \033[0m")
    
#     ########################## DETECT BREAKPOINT AND REJECT PATREON ACCOUNT IF NOT VALID ##########################

#     breakpoint_date = find_breakpoint_v2(tmp_df_pt, 'patrons_ma')
#     # print("Breakpoint date: ", breakpoint_date.date())
    
#     # check that dates prior and after breakpoint exist
#     if not (((breakpoint_date - 2*month_offset) in ts_pt_df.index and ((breakpoint_date + 2*month_offset)) in ts_pt_df.index)):
#         # print(f":( ERROR: Breakpoint too close to edge of patreon time series or missing data\n")
#         continue
    
    
#     ################################### CALCULATE INCREASE AND REJECT IF NOT VALID OR LESS THAN THRESHOLD ###################################

#     try: 
#         avg_patrons_bkpnt = ts_pt_df.loc[breakpoint_date, 'patrons_ma']
#         avg_patrons_sub30 = ts_pt_df.loc[breakpoint_date-month_offset, 'patrons_ma']
#         avg_patrons_add30 = ts_pt_df.loc[breakpoint_date+month_offset, 'patrons_ma']
#     except Exception:
#         # print("one of the dates was not present in dataset")
#         continue
    
    
#     d1 = avg_patrons_bkpnt - avg_patrons_sub30
#     d2 = avg_patrons_add30 - avg_patrons_bkpnt

#     d1 = 1 if 0 <= d1 < 1 else d1
#     d2 = 1 if 0 <= d2 < 1 else d2

#     r = d2/d1
    
#     pt_bkpnt.append((patreon, (breakpoint_date-month_offset), avg_patrons_sub30, breakpoint_date, avg_patrons_bkpnt, (breakpoint_date+month_offset), avg_patrons_add30, d1, d2, r))


# df_pt_bkpnt = pd.DataFrame(pt_bkpnt, columns =['patreon_id', 'bkpt_date_sub30', 'avg_patrons_sub30', 'bkpt_date', 'avg_patrons_bkpnt', 'bkpt_date_add30', 'avg_patrons_add30', 'd1', 'd2', 'ratio'])
# print(f'Patreon accounts matching dates criteria:  {len(df_pt_bkpnt)}')
# df_pt_bkpnt

In [None]:
# Find breakpoint and store Patreon breakpoint related statistics in dataframe V2

# variables declaration
month_offset = pd.DateOffset(months=1)
week_offset = pd.DateOffset(weeks=1)
rolling_avg_window = 30
pt_bkpnt = []


print(f'Iterate over {len(top_patreons_unique_chan)} patreon accounts...')
# LOOP OVER TOP PATREON ACCOUNTS
for idx, patreon in tqdm(enumerate(top_patreons_unique_chan[:3])):
    
    ########################## RESTRICT DATAFRAMES TO 1 PATREON ACCOUNT ##########################

    # patreon earnings and users
    tmp_df_pt = df_top_pt_daily_earnings[df_top_pt_daily_earnings['patreon'] == patreon].copy()  
    tmp_df_pt = tmp_df_pt.drop_duplicates()

    # youtube videos
    tmp_df_yt = df_yt_timeseries_top_pt[df_yt_timeseries_top_pt['patreon_id'] == patreon].copy()
    
    # youtube metadata
    tmp_df_yt_meta = df_yt_metadata_pt_filtered[df_yt_metadata_pt_filtered['patreon_id'] == patreon].copy()   
    tmp_df_yt_meta = tmp_df_yt_meta.sort_values('upload_date')
    
    # replace dates that were collected after 23:00 to their next day, and remove hour
    tmp_df_yt['datetime_original'] = tmp_df_yt['datetime']
    tmp_df_yt['datetime'] = tmp_df_yt['datetime'].apply(lambda date: (date + pd.DateOffset(days=1)) if date.hour >= 23 else date) 
    
    # remove hours and convert to datetime type
    tmp_df_yt['datetime'] = pd.to_datetime(tmp_df_yt['datetime'].dt.date)
    
        
    ########################## RESTRICT DATES FOR ZOOM OUT ##########################
    
    # set min and max dates for plots   
    date_min = max([tmp_df_yt['datetime'].min(), tmp_df_pt['date'].min()])
    date_max = min([tmp_df_yt['datetime'].max(), tmp_df_pt['date'].max()])
    
    # if no overlap period between YT and Patreon datasets, skip account
    if date_max < date_min:
        # print(f":( no overlapping period between YouTube and Patreon datasets\n")
        continue
        
    # restrict datasets between min and max dates
    tmp_df_pt = tmp_df_pt[(tmp_df_pt['date'] >= date_min) & (tmp_df_pt['date'] <= date_max)]
    tmp_df_yt = tmp_df_yt[(tmp_df_yt['datetime'] >= date_min) & (tmp_df_yt['datetime'] <= date_max)]
    
    # align both dataframes since youtube starts once a week
    tmp_df_pt = tmp_df_pt[tmp_df_pt['date'] >= tmp_df_yt['datetime'].min()]
    
    
    
    ########################## PATREON: CALCULATE MOVING AVERAGE AND WEEKLY DELTAS ##########################
    
    tmp_df_pt['patrons_ma'] = tmp_df_pt['patrons'].rolling(rolling_avg_window, center=True).mean()
    tmp_df_pt['earning_ma'] = tmp_df_pt['earning'].rolling(rolling_avg_window, center=True).mean()
    ts_pt_df = tmp_df_pt.set_index(tmp_df_pt['date']) # set the date as the index
    
    # resample time series to get 7 days intervals in order to calculate weekly deltas
    ts_pt_weekly_avg_df = ts_pt_df.resample('7D').mean()
    ts_pt_weekly_avg_df['delta_patrons'] = ts_pt_weekly_avg_df['patrons'].diff(periods=1)
    ts_pt_weekly_avg_df['delta_earning'] = ts_pt_weekly_avg_df['earning'].diff(periods=1)
    ts_pt_weekly_avg_df = ts_pt_weekly_avg_df[1:]  # remove 1st row (which is NA)
    tmp_df_yt = tmp_df_yt[1:] # remove YT 1st row to start at the same time as PT
    
    # reorder columns to have deltas columns next to their respective columns
    patreon_column_names = ['earning', 'delta_earning', 'earning_ma', 'patrons', 'delta_patrons', 'patrons_ma']
    ts_pt_weekly_avg_df = ts_pt_weekly_avg_df[patreon_column_names]
    
    # convert Float64 columns to float64 to avoid Matplotlib NAType error
    ts_pt_weekly_avg_df_float64 = ts_pt_weekly_avg_df.astype({'patrons': 'float64', 'delta_patrons': 'float64'})


    
    ########################## PRINT TITLES ##########################
    # print URLs for patreon, graphtreon, YT channel(s) related to this patreon account, and breakpoint date
    # ch_ids = tmp_df_yt['channel'].unique()
    # print(f"\n\033[1mRank {idx+1}: {patreon[12:]} \033[0m")
    
    ########################## DETECT BREAKPOINT AND REJECT PATREON ACCOUNT IF NOT VALID ##########################

    breakpoint_date = find_breakpoint_v2(tmp_df_pt, 'patrons_ma')
    # print("Breakpoint date: ", breakpoint_date.date())
    
    # check that dates prior and after breakpoint exist
    if not (((breakpoint_date - 2*month_offset) in ts_pt_df.index and ((breakpoint_date + 2*month_offset)) in ts_pt_df.index)):
        # print(f":( ERROR: Breakpoint too close to edge of patreon time series or missing data\n")
        continue
    
    
    
    ################################### CALCULATE INCREASE AND REJECT IF NOT VALID OR LESS THAN THRESHOLD ###################################

    bkpt_date       = breakpoint_date
    bkpt_date_sub30 = breakpoint_date-month_offset
    bkpt_date_add30 = breakpoint_date+month_offset
        
    try: 
        avg_patrons_bkpnt = ts_pt_df.loc[bkpt_date, 'patrons_ma']
        avg_patrons_sub30 = ts_pt_df.loc[bkpt_date_sub30, 'patrons_ma']
        avg_patrons_add30 = ts_pt_df.loc[bkpt_date_add30, 'patrons_ma']

    except Exception:
        # print("one of the dates was not present in dataset")
        continue
    
    
    d1 = avg_patrons_bkpnt - avg_patrons_sub30
    d2 = avg_patrons_add30 - avg_patrons_bkpnt

    d1 = 1 if 0 <= d1 < 1 else d1
    d2 = 1 if 0 <= d2 < 1 else d2

    r = d2/d1
    
    
    ################################### CALCULATE DELTA MEANS BEFORE AND AFTER BKPOINT ###################################  
    
    ##### PATREON #####
    tmp_df_PT_sub30 = ts_pt_weekly_avg_df_float64[(ts_pt_weekly_avg_df_float64.index >= bkpt_date_sub30) & (ts_pt_weekly_avg_df_float64.index <= bkpt_date)]
    tmp_df_PT_add30 = ts_pt_weekly_avg_df_float64[(ts_pt_weekly_avg_df_float64.index >= bkpt_date) & (ts_pt_weekly_avg_df_float64.index <= bkpt_date_add30)]

    # delta patrons
    mean_delta_patrons_befor = tmp_df_PT_sub30['delta_patrons'].mean()
    mean_delta_patrons_after = tmp_df_PT_add30['delta_patrons'].mean()
        
    # delta earnings
    mean_delta_earnings_befor = tmp_df_PT_sub30['delta_earning'].mean()
    mean_delta_earnings_after = tmp_df_PT_add30['delta_earning'].mean()  

    
    ##### YOUTUBE TIME SERIES #####
    tmp_df_YT_sub30 = tmp_df_yt[(tmp_df_yt['datetime'] >= bkpt_date_sub30) & (tmp_df_yt['datetime'] <= bkpt_date      )]
    tmp_df_YT_add30 = tmp_df_yt[(tmp_df_yt['datetime'] >= bkpt_date      ) & (tmp_df_yt['datetime'] <= bkpt_date_add30)]
    
    # delta videos
    mean_delta_videos_befor = tmp_df_YT_sub30['delta_videos'].mean()
    mean_delta_videos_after = tmp_df_YT_add30['delta_videos'].mean()  

    # delta views
    mean_delta_views_befor = tmp_df_YT_sub30['delta_views'].mean()
    mean_delta_views_after = tmp_df_YT_add30['delta_views'].mean()  

    # delta subscriptions
    mean_delta_subs_befor = tmp_df_YT_sub30['delta_subs'].mean()
    mean_delta_subs_after = tmp_df_YT_add30['delta_subs'].mean()  

    
    ##### YOUTUBE METADATA #####
    tmp_df_YT_META_sub30 = tmp_df_yt_meta[(tmp_df_yt_meta['upload_date'] >= bkpt_date_sub30) & (tmp_df_yt_meta['upload_date'] <= bkpt_date      )]
    tmp_df_YT_META_add30 = tmp_df_yt_meta[(tmp_df_yt_meta['upload_date'] >= bkpt_date      ) & (tmp_df_yt_meta['upload_date'] <= bkpt_date_add30)]
        
    # durations
    mean_duration_befor = tmp_df_YT_META_sub30['duration'].mean()
    mean_duration_after = tmp_df_YT_META_add30['duration'].mean()      
        
    # likes
    mean_likes_befor = tmp_df_YT_META_sub30['like_count'].mean()
    mean_likes_after = tmp_df_YT_META_add30['like_count'].mean()      
    
    
    yt_channel_id = tmp_df_yt['channel']
    
    pt_bkpnt.append(
        (          
            patreon, 
            yt_channel_id,   
            bkpt_date, 
            bkpt_date_sub30, 
            bkpt_date_add30,
            avg_patrons_bkpnt, 
            avg_patrons_sub30, 
            avg_patrons_add30, 
            d1, 
            d2, 
            r,
            mean_delta_patrons_befor, 
            mean_delta_patrons_after, 
            mean_delta_earnings_befor, 
            mean_delta_earnings_after, 
            mean_delta_videos_befor, 
            mean_delta_videos_after,
            mean_delta_views_befor,
            mean_delta_views_after,
            mean_delta_subs_befor,
            mean_delta_subs_after,
            mean_duration_befor,
            mean_likes_after
        )
    )

            
# df_pt_bkpnt = pd.DataFrame(pt_bkpnt, columns =['patreon_id', 'bkpt_date_sub30', 'avg_patrons_sub30', 'bkpt_date', 'avg_patrons_bkpnt', 'bkpt_date_add30', 'avg_patrons_add30', 'd1', 'd2', 'ratio'])
df_pt_bkpnt = pd.DataFrame(pt_bkpnt, columns = [
    'patreon_id',
    'yt_channel_id',
    'bkpt_date',     
    'bkpt_date_sub30', 
    'bkpt_date_add30', 
    'avg_patrons_bkpnt', 
    'avg_patrons_sub30', 
    'avg_patrons_add30', 
    'd1', 
    'd2', 
    'ratio',
    'mean_delta_patrons_befor', 
    'mean_delta_patrons_after', 
    'mean_delta_earnings_befor', 
    'mean_delta_earnings_after', 
    'mean_delta_videos_befor', 
    'mean_delta_videos_after',
    'mean_delta_views_befor',
    'mean_delta_views_after',
    'mean_delta_subs_befor',
    'mean_delta_subs_after',
    'mean_duration_befor',
    'mean_likes_after'
])
print(f'Patreon accounts matching dates criteria:  {len(df_pt_bkpnt)}')
df_pt_bkpnt

In [None]:
# save "breakpoints" dataframe to LOCAL SCRATCH FOLDER as a compressed tsv
# output_file_path = LOCAL_DATA_FOLDER+"df_pt_bkpnt.tsv.gz"
# df_pt_bkpnt.to_csv(output_file_path, index=False, sep='\t', compression='gzip')

In [None]:
# reload the cached breakpoint table; the three breakpoint date columns are read back
# from the TSV and converted to datetime so they can be compared with the time series
df_pt_bkpnt = pd.read_csv(
    LOCAL_DATA_FOLDER + "df_pt_bkpnt.tsv.gz",
    sep="\t",
    compression='gzip',
)
for date_col in ('bkpt_date', 'bkpt_date_sub30', 'bkpt_date_add30'):
    df_pt_bkpnt[date_col] = pd.to_datetime(df_pt_bkpnt[date_col])
df_pt_bkpnt

##### Filter patreons (monotonically increasing)

<div>
    <img src="img/increase_decrease_options_051322.jpg" alt="increase_decrease_options_051322.jpg" width="800" />
</div>

In [None]:
# keep only accounts whose patron growth accelerates across the breakpoint:
#   ratio (d2/d1) above the threshold, a positive increase before the breakpoint (d1 > 0),
#   and a larger increase after it than before it (d2 > d1)
incr_thresh_ratio = 1

keep_mask = (
    (df_pt_bkpnt['ratio'] > incr_thresh_ratio)
    & (df_pt_bkpnt['d1'] > 0)
    & (df_pt_bkpnt['d2'] > df_pt_bkpnt['d1'])
)

# strongest acceleration first, with a fresh 0..n-1 index
df_pt_bkpnt_filt = (
    df_pt_bkpnt[keep_mask]
    .sort_values('ratio', ascending=False)
    .reset_index(drop=True)
)

print(f'Number of accounts with a ratio threshold of d2 > d1 > 0 and r > {incr_thresh_ratio}: {len(df_pt_bkpnt_filt)}')
df_pt_bkpnt_filt

##### Patreon VS YouTube Plots

In [None]:
def color_neg_pos(ax, x, y):
    """Highlight the negative region of a plotted series.

    When `y` contains at least one negative value, a light red band is drawn
    between the minimum of `y` and zero over the horizontal extent given by `x`,
    and a thin black horizontal line is drawn at zero. Nothing is drawn when
    `y` holds only missing values or has no negative value.

    Parameters:
        ax: object exposing `fill_between` and `axhline` (e.g. a Matplotlib Axes).
        x:  x coordinates handed to `fill_between`.
        y:  pandas Series-like of values; only `isnull()` and `min()` are used.
    """
    # an all-missing series has nothing to highlight
    if y.isnull().all():
        return

    lowest = y.min()
    if not lowest < 0:
        return

    # red band from the lowest value up to zero, plus the zero reference line
    ax.fill_between(x, lowest, 0, color='red', alpha=0.05)
    ax.axhline(y=0, linestyle='solid', color='black', linewidth=0.5)
    # (the matching green band for positive values is intentionally not drawn)
    # ax.fill_between(x, 0, y.max(), color='green', alpha=0.05)

In [None]:
# compare YouTube and Patreon timeseries for top patreon accounts with rolling average - MANUAL VERSION 2
#
# For every selected Patreon account this cell:
#   1. restricts the Patreon and YouTube time series to their overlapping period,
#   2. computes a centered moving average and 7-day deltas for the Patreon series,
#   3. draws a 5x4 grid (columns 0-1: whole period, columns 2-3: +/- 2 months around the breakpoint),
#   4. runs pairwise Granger causality tests on the zoomed weekly deltas and records the
#      outcome in `df_granger`, `granger_dict` and `not_granger`.
month_offset = pd.DateOffset(months=1)
week_offset = pd.DateOffset(weeks=1)
rolling_avg_window = 30  # number of rows of the Patreon series (the printed summary calls it a 30 days window)

# variables for Granger Tests
MAXLAG = 2
granger_dict = {} # dictionary with  keys (cause --> effect) and values with list of corresponding patreon account(s)
not_granger = []  # patreon accounts for which no significant Granger link was found
YT_variables = ['yt_delta_videos', 'yt_delta_views', 'yt_delta_subs']
# PT_variables = ['pt_delta_patrons', 'pt_delta_earning']
PT_variables = ['pt_delta_patrons']

# one entry per YouTube metric:
# (grid row, column name, matplotlib colour, noun used in titles, noun used in y-labels)
yt_panels = [
    (2, 'videos', 'r', 'videos',        'Videos'),
    (3, 'views',  'g', 'views',         'Views'),
    (4, 'subs',   'm', 'subscriptions', 'Subscriptions'),
]

df_granger = df_pt_bkpnt_filt.copy()

# LOOP OVER TOP PATREON ACCOUNTS
for idx, row in df_granger[:2].iterrows():

    patreon = row['patreon_id']
    ########################## RESTRICT DATAFRAMES TO 1 PATREON ACCOUNT ##########################

    # patreon earnings and users
    tmp_df_pt = df_top_pt_daily_earnings[df_top_pt_daily_earnings['patreon'] == patreon].copy()

    # youtube videos
    tmp_df_yt = df_yt_timeseries_top_pt[df_yt_timeseries_top_pt['patreon_id'] == patreon].copy()

    # replace dates that were collected after 23:00 to their next day, and remove hour
    tmp_df_yt['datetime_original'] = tmp_df_yt['datetime']
    tmp_df_yt['datetime'] = tmp_df_yt['datetime'].apply(lambda date: (date + pd.DateOffset(days=1)) if date.hour >= 23 else date)

    # remove hours and convert to datetime type
    tmp_df_yt['datetime'] = pd.to_datetime(tmp_df_yt['datetime'].dt.date)

    ########################## PRINT TITLES ##########################

    # print URLs for patreon, graphtreon, YT channel(s) related to this patreon account, and breakpoint date
    # NOTE(review): patreon[12:] drops the first 12 characters of the id, presumably the "patreon.com/" prefix
    ch_ids = tmp_df_yt['channel'].unique()
    print(f"\n\n\n\033[1mRank {idx+1}: {patreon[12:]} \033[0m")
    print(f"https://www.{patreon}")
    print(f"https://graphtreon.com/creator/{patreon[12:]}")
    for ch_id in ch_ids:
        print(f"https://youtube.com/channel/{ch_id}")


    ########################## RESTRICT DATES FOR ZOOM OUT ##########################

    # set min and max dates for plots (intersection of the two series)
    date_min = max([tmp_df_yt['datetime'].min(), tmp_df_pt['date'].min()])
    date_max = min([tmp_df_yt['datetime'].max(), tmp_df_pt['date'].max()])

    if date_max < date_min:
        print(":( no overlapping period between YouTube and Patreon datasets\n")
        continue

    # restrict datasets between min and max dates
    tmp_df_pt = tmp_df_pt[(tmp_df_pt['date'] >= date_min) & (tmp_df_pt['date'] <= date_max)]
    tmp_df_yt = tmp_df_yt[(tmp_df_yt['datetime'] >= date_min) & (tmp_df_yt['datetime'] <= date_max)]

    # align both dataframes since youtube starts once a week
    tmp_df_pt = tmp_df_pt[tmp_df_pt['date'] >= tmp_df_yt['datetime'].min()]


    ########################## PATREON: CALCULATE MOVING AVERAGE AND WEEKLY DELTAS ##########################

    tmp_df_pt['patrons_ma'] = tmp_df_pt['patrons'].rolling(rolling_avg_window, center=True).mean()
    tmp_df_pt['earning_ma'] = tmp_df_pt['earning'].rolling(rolling_avg_window, center=True).mean()
    ts_pt_df = tmp_df_pt.set_index(tmp_df_pt['date']) # set the date as the index

    # resample time series to get 7 days intervals in order to calculate weekly deltas
    ts_pt_weekly_avg_df = ts_pt_df.resample('7D').mean()
    ts_pt_weekly_avg_df['delta_patrons'] = ts_pt_weekly_avg_df['patrons'].diff(periods=1)
    ts_pt_weekly_avg_df['delta_earning'] = ts_pt_weekly_avg_df['earning'].diff(periods=1)
    ts_pt_weekly_avg_df = ts_pt_weekly_avg_df[1:]  # remove 1st row (which is NA)
    tmp_df_yt = tmp_df_yt[1:] # remove YT 1st row to start at the same time as PT

    # reorder columns to have deltas columns next to their respective columns
    patreon_column_names = ['earning', 'delta_earning', 'earning_ma', 'patrons', 'delta_patrons', 'patrons_ma']
    ts_pt_weekly_avg_df = ts_pt_weekly_avg_df[patreon_column_names]

    # convert Float64 columns to float64 to avoid Matplotlib NAType error
    ts_pt_weekly_avg_df_float64 = ts_pt_weekly_avg_df.astype({'patrons': 'float64', 'delta_patrons': 'float64'})

    ########################## DETECT BREAKPOINT AND REJECT PATREON ACCOUNT IF NOT VALID ##########################

    # the breakpoint was computed beforehand and is read from the filtered table
    breakpoint_date = row['bkpt_date']
    # breakpoint_date = find_breakpoint_v2(tmp_df_pt, 'patrons_ma')
    # print("Breakpoint date: ", breakpoint_date.date())

    # check that the breakpoint and the dates one month before/after it exist:
    # all three are looked up in ts_pt_df further down when the marker points are drawn
    required_dates = [breakpoint_date - month_offset, breakpoint_date, breakpoint_date + month_offset]
    if not all(required_date in ts_pt_df.index for required_date in required_dates):
        print("ERROR: Breakpoint too close to edge of patreon time series or missing data\n")
        continue


    ################################### CALCULATE INCREASE AND REJECT IF NOT VALID OR LESS THAN THRESHOLD ###################################

    # all increase statistics were precomputed; they are only read back here for display
    avg_patrons_bkpnt = row['avg_patrons_bkpnt']
    avg_patrons_sub30 = row['avg_patrons_sub30']
    avg_patrons_add30 = row['avg_patrons_add30']

    bkpt_date       = row['bkpt_date']
    bkpt_date_sub30 = row['bkpt_date_sub30']
    bkpt_date_add30 = row['bkpt_date_add30']

    d1 = row['d1']
    d2 = row['d2']
    r = row['ratio']

    print('\nAverage number of patrons: (values calculated using a 30 days centered moving average)')
    print(f'• At breakpoint - 30days ({bkpt_date_sub30.date()}): {avg_patrons_sub30:,.1f}')
    print(f'• At breakpoint          ({bkpt_date.date()}): {avg_patrons_bkpnt:,.1f}')
    print(f'• At breakpoint + 30days ({bkpt_date_add30.date()}): {avg_patrons_add30:,.1f}')

    print('\nIncrease of patrons in the period before and after the breakpoint:')
    print(f"• Increase of patrons from {bkpt_date_sub30.date()} to {bkpt_date.date()}:        d1  = {d1:>+6.1f} patrons")
    print(f"• Increase of patrons from {bkpt_date.date()} to {bkpt_date_add30.date()}:        d2  = {d2:>+6.1f} patrons")

    print('\nRatio of the increases of the 2 periods: ')
    print(f"• Ratio between 2 increases:                            d2/d1  = {r:.2f}")
    print(f"• Percentage increase:                            |d2/d1|*100  = {abs(r):>+.0%}")


    ########################## RESTRICT DATES FOR ZOOM IN (+/- 2 months around breakpoint) ##########################

    # calculate min and max dates for zoom
    date_min_zoom = breakpoint_date - (2 * month_offset)
    date_max_zoom = breakpoint_date + (2 * month_offset)

    # restrict datasets between min and max dates
    tmp_df_pt_zoomed = tmp_df_pt[(tmp_df_pt['date'] >= date_min_zoom) & (tmp_df_pt['date'] <= date_max_zoom)].copy()
    tmp_df_yt_zoomed = tmp_df_yt[(tmp_df_yt['datetime'] >= date_min_zoom) & (tmp_df_yt['datetime'] <= date_max_zoom)].copy()

    # weekly Patreon deltas restricted to the zoom window (plots and Granger tests)
    ts_pt_weekly_avg_df_zoomed = ts_pt_weekly_avg_df_float64[(ts_pt_weekly_avg_df_float64.index >= date_min_zoom) & (ts_pt_weekly_avg_df_float64.index <= date_max_zoom)]


    ################################### CREATE FIGURE ###################################

    # the figure is created only once the account passed every check above,
    # so rejected accounts do not leave an empty figure behind
    fig, axs = plt.subplots(5, 4, figsize=(26, 10), sharey=False, sharex=False)


    ################################### PATREON PLOTS (ZOOM OUT + ZOOM IN) ###################################

    # number of patrons (delta)
    axs[0,0].scatter(ts_pt_weekly_avg_df_float64.index, ts_pt_weekly_avg_df_float64['delta_patrons'], c='orange', s=30, marker='+')
    axs[0,0].set(title="Delta patrons per week")
    axs[0,0].set_ylabel("Δ Patrons")
    color_neg_pos(axs[0,0], ts_pt_weekly_avg_df_float64.index, ts_pt_weekly_avg_df_float64['delta_patrons'])

    # number of patrons (cumulative): raw series (faded) + moving average
    axs[0,1].plot(tmp_df_pt['date'], tmp_df_pt['patrons'], alpha=0.2)
    axs[0,1].plot(tmp_df_pt['date'], tmp_df_pt['patrons_ma'])
    axs[0,1].set(title="Number of patrons")
    axs[0,1].set_ylabel("# Patrons")

    # patreon earnings (delta)
    axs[1,0].scatter(ts_pt_weekly_avg_df_float64.index, ts_pt_weekly_avg_df_float64['delta_earning'], color='royalblue', s=30, marker='+')
    axs[1,0].set(title="Patreon delta earnings per week")
    axs[1,0].set_ylabel("Δ Earnings")
    color_neg_pos(axs[1,0], ts_pt_weekly_avg_df_float64.index, ts_pt_weekly_avg_df_float64['delta_earning'])

    # patreon earnings (cumulative): raw series (faded) + moving average
    axs[1,1].plot(tmp_df_pt['date'], tmp_df_pt['earning'], alpha=0.2)
    axs[1,1].plot(tmp_df_pt['date'], tmp_df_pt['earning_ma'], color='royalblue')
    axs[1,1].set(title="Patreon earnings per month")
    axs[1,1].set_ylabel("Earnings")

    # zoomed in patron numbers (delta)
    # color_neg_pos gets the x values of the whole period so the red band spans the full
    # visible width, while its depth comes from the minimum of the zoomed values only
    axs[0,2].scatter(ts_pt_weekly_avg_df_zoomed.index, ts_pt_weekly_avg_df_zoomed['delta_patrons'], c='orange', s=30, marker='+')
    axs[0,2].plot(ts_pt_weekly_avg_df_zoomed.index, ts_pt_weekly_avg_df_zoomed['delta_patrons'], c='orange', alpha=0.3)
    axs[0,2].set(title="Delta patrons per week (zoomed in)")
    axs[0,2].set_ylabel("Δ Patrons")
    color_neg_pos(axs[0,2], ts_pt_weekly_avg_df_float64.index, ts_pt_weekly_avg_df_zoomed['delta_patrons'])

    # zoomed in patron numbers (cumulative)
    axs[0,3].plot(tmp_df_pt_zoomed['date'], tmp_df_pt_zoomed['patrons'], alpha=0.2)
    axs[0,3].plot(tmp_df_pt_zoomed['date'], tmp_df_pt_zoomed['patrons_ma'])
    axs[0,3].set(title="Number of patrons (zoomed in)")
    axs[0,3].set_ylabel("# Patrons")

    # zoomed in patron earnings (delta)
    axs[1,2].scatter(ts_pt_weekly_avg_df_zoomed.index, ts_pt_weekly_avg_df_zoomed['delta_earning'], color='royalblue', s=30, marker='+')
    axs[1,2].plot(ts_pt_weekly_avg_df_zoomed.index, ts_pt_weekly_avg_df_zoomed['delta_earning'], color='royalblue', alpha=0.3)
    axs[1,2].set(title="Delta Patreon earnings per week (zoomed in)")
    axs[1,2].set_ylabel("Δ Earnings")
    color_neg_pos(axs[1,2], ts_pt_weekly_avg_df_float64.index, ts_pt_weekly_avg_df_zoomed['delta_earning'])

    # zoomed in patron earnings (cumulative)
    axs[1,3].plot(tmp_df_pt_zoomed['date'], tmp_df_pt_zoomed['earning'], alpha=0.2)
    axs[1,3].plot(tmp_df_pt_zoomed['date'], tmp_df_pt_zoomed['earning_ma'], color='royalblue')
    axs[1,3].set(title="Patreon earnings per month (zoomed in)")
    axs[1,3].set_ylabel("Earnings")


    ################################### YOUTUBE PLOTS (ZOOM OUT + ZOOM IN) ###################################

    # videos, views and subscriptions share the same 4-panel layout, so they are drawn from one loop
    for grid_row, yt_col, colour, title_noun, label_noun in yt_panels:
        delta_col = 'delta_' + yt_col

        # weekly delta, whole period
        axs[grid_row,0].scatter(tmp_df_yt['datetime'], tmp_df_yt[delta_col], c=colour, s=30, marker='+')
        axs[grid_row,0].set(title=f"YouTube delta {title_noun} per week")
        axs[grid_row,0].set_ylabel(f"Δ {label_noun}")
        color_neg_pos(axs[grid_row,0], tmp_df_yt['datetime'], tmp_df_yt[delta_col])

        # cumulative, whole period
        axs[grid_row,1].plot(tmp_df_yt['datetime'], tmp_df_yt[yt_col], colour)
        axs[grid_row,1].set(title=f"YouTube cumulative {title_noun}")
        axs[grid_row,1].set_ylabel(f"# {label_noun}")

        # weekly delta, zoomed in (points joined by a faded line)
        axs[grid_row,2].scatter(tmp_df_yt_zoomed['datetime'], tmp_df_yt_zoomed[delta_col], c=colour, s=30, marker='+')
        axs[grid_row,2].plot(tmp_df_yt_zoomed['datetime'], tmp_df_yt_zoomed[delta_col], c=colour, alpha=0.3)
        axs[grid_row,2].set(title=f"YouTube delta {title_noun} per week (zoomed in)")
        axs[grid_row,2].set_ylabel(f"Δ {label_noun}")
        color_neg_pos(axs[grid_row,2], tmp_df_yt['datetime'], tmp_df_yt_zoomed[delta_col])

        # cumulative, zoomed in
        axs[grid_row,3].plot(tmp_df_yt_zoomed['datetime'], tmp_df_yt_zoomed[yt_col], colour)
        axs[grid_row,3].set(title=f"YouTube cumulative {title_noun} (zoomed in)")
        axs[grid_row,3].set_ylabel(f"# {label_noun}")


    ################################### FORMAT AXES AND PLOT BREAKPOINT LINES ###################################

    for axs_row in axs:
        for j, ax in enumerate(axs_row):
            if j < 2:
                # whole-period columns: yearly major ticks, monthly minor ticks
                ax.set_xlim([date_min, date_max])
                ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
                ax.xaxis.set_major_locator(mdates.YearLocator())
                ax.xaxis.set_minor_locator(mdates.MonthLocator())
            else:
                # zoomed columns: monthly major ticks
                ax.set_xlim([date_min_zoom, date_max_zoom])
                ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%b'))
                ax.xaxis.set_major_locator(mdates.MonthLocator())
                # ax.xaxis.set_minor_locator(mdates.WeekdayLocator())
            ax.xaxis.grid(color="#CCCCCC", ls=":")
            ax.yaxis.grid(color="#CCCCCC", ls=":")
            ax.yaxis.set_major_formatter(KM_formatter)

            # vertical lines for breakpoint, breakpoint-1month, breakpoint+1month
            # (labelled on every axes; the legend itself is only shown on axs[0,1] below)
            ax.axvline(breakpoint_date, color='red', linestyle='--', label='break', linewidth=2.5)
            ax.axvline(breakpoint_date - month_offset, color='green', linestyle=':', label='- ' + str(month_offset.months)+' months', linewidth=2)
            ax.axvline(breakpoint_date + month_offset, color='orange', linestyle=':', label='+' + str(month_offset.months)+' months', linewidth=2)
    # axs[0,0].legend()
    axs[0,1].legend()

    # plot point for mean nb of patrons for breakpoint, breakpoint-1month, breakpoint+1month
    axs[0,3].plot(breakpoint_date - month_offset, ts_pt_df.at[(breakpoint_date - month_offset), 'patrons_ma'], marker='o', color='green')
    axs[0,3].plot(breakpoint_date,                ts_pt_df.at[breakpoint_date, 'patrons_ma'],                  marker='o', color='red')
    axs[0,3].plot(breakpoint_date + month_offset, ts_pt_df.at[(breakpoint_date + month_offset), 'patrons_ma'], marker='o', color='orange')


    ################################### GRANGER CAUSALITY TESTS ###################################

    # create a new dataframe with merged columns (the dates might have a day difference)
    selected_pt_columns  = ['delta_earning', 'delta_patrons']
    df_pt = ts_pt_weekly_avg_df_zoomed[selected_pt_columns].reset_index().add_prefix('pt_')

    # selected_yt_columns = ['datetime', 'delta_views', 'delta_subs', 'delta_videos']
    selected_yt_columns = ['datetime', 'datetime_original', 'delta_views', 'delta_subs', 'delta_videos']
    df_yt = tmp_df_yt_zoomed[selected_yt_columns].reset_index().add_prefix('yt_')

    # concatenated 2 dfs and select and reorder columns
    # both frames were just re-indexed 0..n-1, so row i of Patreon is paired with row i of YouTube;
    # if the two frames differ in length the unmatched rows hold NaN and the tests below are skipped
    df_concat = pd.concat([df_pt, df_yt], axis=1)
    concat_columns = ['pt_date', 'yt_datetime', 'pt_delta_earning', 'pt_delta_patrons', 'yt_delta_views', 'yt_delta_subs', 'yt_delta_videos']
    df_concat = df_concat[concat_columns]
    # df_concat['dates_match'] = df_concat['pt_date'] == df_concat['yt_datetime']

    # display(df_concat.round())
    # display(df_concat.style.set_caption(f"df_concat"))

    print("\nGranger Causality Tests:")

    granger_causal_link = False
    for pt_var in PT_variables:
        for yt_var in YT_variables:

            # if nan values in this df, skip (and say so instead of skipping silently)
            if df_concat[[yt_var, pt_var]].isna().values.any():
                print(f'• {pt_var} <-> {yt_var}: test skipped (missing values)')
                continue

            try:
                # grangercausalitytests checks whether the 2nd column Granger-causes the 1st one
                # fwd: pt_var --> yt_var
                granger_test_fwd = grangercausalitytests(df_concat[[yt_var, pt_var]], maxlag=MAXLAG, verbose=False)
                # rev: yt_var --> pt_var
                granger_test_rev = grangercausalitytests(df_concat[[pt_var, yt_var]], maxlag=MAXLAG, verbose=False)
            except Exception as err:
                # best-effort: a failing test must not abort the whole account, but it is reported
                # NOTE(review): presumably raised for too short / degenerate series - confirm
                print(f'• {pt_var} <-> {yt_var}: test skipped ({err})')
                continue

            # p-value of the SSR-based F test for every lag
            pvalue_fwd = {lag: granger_test_fwd[lag][0]['ssr_ftest'][1] for lag in range(1, MAXLAG+1)}
            pvalue_rev = {lag: granger_test_rev[lag][0]['ssr_ftest'][1] for lag in range(1, MAXLAG+1)}

            # same bookkeeping for both directions, forward first
            for cause, effect, pvalues in ((pt_var, yt_var, pvalue_fwd), (yt_var, pt_var, pvalue_rev)):
                min_pvalue = min(pvalues.values())
                significant = min_pvalue < 0.05

                # add value to df (1 = significant link, 0 = no link)
                df_granger.loc[idx, cause+'->'+effect] = int(significant)

                if significant:
                    granger_causal_link = True
                    # first (smallest) lag reaching the minimal p-value
                    min_lag = next(lag for lag, pvalue in pvalues.items() if pvalue == min_pvalue)
                    print(f'• {cause} --> {effect} (pvalue={min_pvalue:.3f}, lag={min_lag})')
                    granger_dict.setdefault((cause, effect), []).append(patreon)


    if not granger_causal_link:
        print("• No Granger causality found for this account")
        not_granger.append(patreon)

    print("\n")

    fig.tight_layout(w_pad=0)
    plt.show()
    plt.close(fig)  # release the figure so figures do not accumulate across iterations

    print('\n\n\n---------------------------------------------------------------------------------------------------------------------------------------------------')
    
# print('\n\nGranger tests summary statistics:')
    
# print(f'• Number of patreon accounts analysed (patrons increase ratio > {incr_thresh_ratio}): {len(df_granger)}')
# print(f'• Number of patreon with no Granger-causal link: {len(not_granger)} ({len(not_granger)/len(df_granger):.0%})')

# print(f'• Number of patreon accounts per Granger-causal link:')
# # Converting granger dict into list of tuples (in order to sort it), the 2nd value of the tuple being the count of accounts
# granger_list = [(k, len(v)) for k, v in granger_dict.items()]
# # sort by count desc
# granger_list_desc = sorted(granger_list, key=lambda tup: -tup[1])
# for (k,v) in granger_list_desc:
#     print(f'    • {k[0]} \t--> {k[1]}:\t {v} ({v/len(df_granger):.0%})')


# df_granger[columns] = df_granger[columns].astype('Int64')
# df_granger

##### Granger Tests statistics

In [None]:
# # compare YouTube and Patreon timeseries for top patreon accounts with rolling average - MANUAL VERSION 2
# month_offset = pd.DateOffset(months=1)
# week_offset = pd.DateOffset(weeks=1)
# rolling_avg_window = 30

# # variables for Granger Tests
# MAXLAG = 2
# granger_dict = {} # dictionary with  keys (cause --> effect) and values with list of corresponding patreon account(s)
# not_granger = []
# YT_variables = ['yt_delta_videos', 'yt_delta_views', 'yt_delta_subs']
# # PT_variables = ['pt_delta_patrons', 'pt_delta_earning']
# PT_variables = ['pt_delta_patrons']

# df_granger = df_pt_bkpnt_filt.copy()

# # LOOP OVER TOP PATREON ACCOUNTS
# for idx, row in tqdm(df_granger.iterrows()):   

    
#     ########################## RESTRICT DATAFRAMES TO 1 PATREON ACCOUNT ##########################

#     patreon = row['patreon_id']

#     # patreon earnings and users
#     tmp_df_pt = df_top_pt_daily_earnings[df_top_pt_daily_earnings['patreon'] == patreon].copy()  
    
#     # youtube videos
#     tmp_df_yt = df_yt_timeseries_top_pt[df_yt_timeseries_top_pt['patreon_id'] == patreon].copy()
    
#     # replace dates that were collected after 23:00 to their next day, and remove hour
#     tmp_df_yt['datetime_original'] = tmp_df_yt['datetime']
#     tmp_df_yt['datetime'] = tmp_df_yt['datetime'].apply(lambda date: (date + pd.DateOffset(days=1)) if date.hour >= 23 else date) 
    
#     # remove hours and convert to datetime type
#     tmp_df_yt['datetime'] = pd.to_datetime(tmp_df_yt['datetime'].dt.date)
    
#     ########################## PRINT TITLES ##########################
    
#     # print URLs for patreon, graphtreon, YT channel(s) related to this patreon account, and breakpoint date
#     # ch_ids = tmp_df_yt['channel'].unique()
#     # print(f"\n\n\n\033[1mRank {idx+1}: {patreon[12:]} \033[0m")
#     # print(f"https://www.{patreon}")
#     # print(f"https://graphtreon.com/creator/{patreon[12:]}")
#     # for ch_id in ch_ids:
#     #     print(f"https://youtube.com/channel/{ch_id}")

    
#     ########################## RESTRICT DATES FOR ZOOM OUT ##########################
    
#     # set min and max dates for plots   
#     date_min = max([tmp_df_yt['datetime'].min(), tmp_df_pt['date'].min()])
#     date_max = min([tmp_df_yt['datetime'].max(), tmp_df_pt['date'].max()])
    
#     if date_max < date_min:
#         print(f":( no overlapping period between YouTube and Patreon datasets\n")
#         continue
    
#     # restrict datasets between min and max dates
#     tmp_df_pt = tmp_df_pt[(tmp_df_pt['date'] >= date_min) & (tmp_df_pt['date'] <= date_max)]
#     tmp_df_yt = tmp_df_yt[(tmp_df_yt['datetime'] >= date_min) & (tmp_df_yt['datetime'] <= date_max)]
    
#     # align both dataframes since youtube starts once a week
#     tmp_df_pt = tmp_df_pt[tmp_df_pt['date'] >= tmp_df_yt['datetime'].min()]
    
    
    
#     ########################## PATREON: CALCULATE MOVING AVERAGE AND WEEKLY DELTAS ##########################
    
#     tmp_df_pt['patrons_ma'] = tmp_df_pt['patrons'].rolling(rolling_avg_window, center=True).mean()
#     tmp_df_pt['earning_ma'] = tmp_df_pt['earning'].rolling(rolling_avg_window, center=True).mean()
#     ts_pt_df = tmp_df_pt.set_index(tmp_df_pt['date']) # set the date as the index
    
#     # resample time series to get 7 days intervals in order to calculate weekly deltas
#     ts_pt_weekly_avg_df = ts_pt_df.resample('7D').mean()
#     ts_pt_weekly_avg_df['delta_patrons'] = ts_pt_weekly_avg_df['patrons'].diff(periods=1)
#     ts_pt_weekly_avg_df['delta_earning'] = ts_pt_weekly_avg_df['earning'].diff(periods=1)
#     ts_pt_weekly_avg_df = ts_pt_weekly_avg_df[1:]  # remove 1st row (which is NA)
#     tmp_df_yt = tmp_df_yt[1:] # remove YT 1st row to start at the same time as PT
    
#     # reorder columns to have deltas columns next to their respective columns
#     patreon_column_names = ['earning', 'delta_earning', 'earning_ma', 'patrons', 'delta_patrons', 'patrons_ma']
#     ts_pt_weekly_avg_df = ts_pt_weekly_avg_df[patreon_column_names]
    
#     # convert Float64 columns to float64 to avoid Matplotlib NAType error
#     ts_pt_weekly_avg_df_float64 = ts_pt_weekly_avg_df.astype({'patrons': 'float64', 'delta_patrons': 'float64'})
    
               
#     ################################### CALCULATE INCREASE AND REJECT IF NOT VALID OR LESS THAN THRESHOLD ###################################

#     breakpoint_date = row['bkpt_date']

#     avg_patrons_bkpnt = row['avg_patrons_bkpnt']
#     avg_patrons_sub30 = row['avg_patrons_sub30']
#     avg_patrons_add30 = row['avg_patrons_add30']
    
#     bkpt_date       = row['bkpt_date']
#     bkpt_date_sub30 = row['bkpt_date_sub30']
#     bkpt_date_add30 = row['bkpt_date_add30']
    
#     d1 = row['d1']
#     d2 = row['d2']

    
#     r = row['ratio']

# #     print(f'\nAverage number of patrons: (values calculated using a 30 days centered moving average)')
# #     print(f'• At breakpoint - 30days ({bkpt_date_sub30.date()}): {avg_patrons_sub30:,.1f}')
# #     print(f'• At breakpoint          ({bkpt_date.date()}): {avg_patrons_bkpnt:,.1f}')
# #     print(f'• At breakpoint + 30days ({bkpt_date_add30.date()}): {avg_patrons_add30:,.1f}')
    
# #     print(f'\nIncrease of patrons in the period before and after the breakpoint:')
# #     print(f"• Increase of patrons from {bkpt_date_sub30.date()} to {bkpt_date.date()}:        d1  = {d1:>+6.1f} patrons")
# #     print(f"• Increase of patrons from {bkpt_date.date()} to {bkpt_date_add30.date()}:        d2  = {d2:>+6.1f} patrons")
    
# #     print(f'\nRatio of the increases of the 2 periods: ')
# #     print(f"• Ratio between 2 increases:                            d2/d1  = {r:.2f}")
# #     print(f"• Percentage increase:                            |d2/d1|*100  = {abs(r):>+.0%}")
    


#     ########################## RESTRICT DATES FOR ZOOM IN (+/- 2 months around breakpoint) ##########################

#     # calculate min and max dates for zoom
#     date_min_zoom = breakpoint_date - (2 * month_offset)
#     date_max_zoom = breakpoint_date + (2 * month_offset)
            
#     # restrict datasets between min and max dates
#     tmp_df_pt_zoomed = tmp_df_pt[(tmp_df_pt['date'] >= date_min_zoom) & (tmp_df_pt['date'] <= date_max_zoom)].copy()
#     tmp_df_yt_zoomed = tmp_df_yt[(tmp_df_yt['datetime'] >= date_min_zoom) & (tmp_df_yt['datetime'] <= date_max_zoom)].copy()

#     # used for coloration
#     ts_pt_weekly_avg_df_zoomed = ts_pt_weekly_avg_df_float64[(ts_pt_weekly_avg_df_float64.index >= date_min_zoom) & (ts_pt_weekly_avg_df_float64.index <= date_max_zoom)]
    


#     ################################### GRANGER CAUSALITY TESTS ###################################

#     # create a new dataframe with merged columns (the dates might have a day difference)
#     selected_pt_columns  = ['delta_earning', 'delta_patrons']
#     df_pt = ts_pt_weekly_avg_df_zoomed
#     df_pt = df_pt[selected_pt_columns].reset_index().add_prefix('pt_')

#     # selected_yt_columns = ['datetime', 'delta_views', 'delta_subs', 'delta_videos']
#     selected_yt_columns = ['datetime', 'datetime_original', 'delta_views', 'delta_subs', 'delta_videos']
#     df_yt = tmp_df_yt_zoomed
#     df_yt = df_yt[selected_yt_columns].reset_index().add_prefix('yt_')

#     # concatenated 2 dfs and select and reorder columns
#     df_concat = pd.concat([df_pt, df_yt], axis=1)
#     concat_columns = ['pt_date', 'yt_datetime', 'pt_delta_earning', 'pt_delta_patrons', 'yt_delta_views', 'yt_delta_subs', 'yt_delta_videos']
#     df_concat = df_concat[concat_columns]
#     # df_concat['dates_match'] = df_concat['pt_date'] == df_concat['yt_datetime']
    
#     # display(df_concat.round())
#     # display(df_concat.style.set_caption(f"df_concat"))
    
    
    
#     # print(f"\nGranger Causality Tests:")
    
#     granger_causal_link = False
#     for pt_var in PT_variables:
#         for yt_var in YT_variables:
            
#             # if nan values in this df, skip
#             if df_concat[[yt_var, pt_var]].isna().values.any():
#                 continue
                
#             pvalue_fwd = {}
#             pvalue_rev = {}
            
#             try:
#                 # print(f'\n\n• {pt_var} --> {yt_var}')
#                 granger_test_fwd = grangercausalitytests(df_concat[[yt_var, pt_var]], maxlag=MAXLAG, verbose=False)  
#                 # print(f'\n\n• {yt_var} --> {pt_var}')
#                 granger_test_rev = grangercausalitytests(df_concat[[pt_var, yt_var]], maxlag=MAXLAG, verbose=False) 
#             except Exception:
#                 continue


#             for lag in range(1, MAXLAG+1):           
#                 pvalue_fwd[lag] = granger_test_fwd[lag][0]['ssr_ftest'][1]
#                 pvalue_rev[lag] = granger_test_rev[lag][0]['ssr_ftest'][1]
                
            
            
            
#             min_pvalue_fwd = min(pvalue_fwd.values())
#             if min_pvalue_fwd < 0.05:
#                 granger_causal_link = True
#                 min_lag_fwd = [k for k, v in pvalue_fwd.items() if v == min_pvalue_fwd][0]
#                 # print(f'• {pt_var} --> {yt_var} (pvalue={min_pvalue_fwd:.3f}, lag={min_lag_fwd})')

#                 # add value to df
#                 df_granger.loc[idx, pt_var+'->'+yt_var] = 1

#                 if (pt_var, yt_var) in granger_dict:                   
#                     granger_dict[(pt_var, yt_var)].append(patreon)
#                 else:
#                     granger_dict[(pt_var, yt_var)] = [patreon]
#             else: 
#                 df_granger.loc[idx, pt_var+'->'+yt_var] = 0
                
                
                
#             min_pvalue_rev = min(pvalue_rev.values())
#             if min_pvalue_rev < 0.05:
#                 granger_causal_link = True
#                 min_lag_rev = [k for k, v in pvalue_rev.items() if v == min_pvalue_rev][0]
#                 # print(f'• {yt_var} --> {pt_var} (pvalue={min_pvalue_rev:.3f}, lag={min_lag_rev})')

#                 # add value to df
#                 df_granger.loc[idx, yt_var+'->'+pt_var] = 1
                
#                 if (yt_var, pt_var) in granger_dict:
#                     granger_dict[(yt_var, pt_var)].append(patreon)
#                 else:
#                     granger_dict[(yt_var, pt_var)] = [patreon]
#             else: 
#                 df_granger.loc[idx, yt_var+'->'+pt_var] = 0
                

#     if (granger_causal_link == False):
#         # print("• No Granger causality found for this account")
#         not_granger.append(patreon)
    
#     # print("\n")

#     # fig.tight_layout(w_pad=0)
#     # plt.show()
    
#     # print('\n\n\n---------------------------------------------------------------------------------------------------------------------------------------------------')
    
# print(F'\n\nGranger tests summary statistics: (with maxlag = {MAXLAG}')
    
# print(f'• Number of patreon accounts analysed (patrons increase ratio > {incr_thresh_ratio}): {len(df_granger)}')
# print(f'• Number of patreon with no Granger-causal link: {len(not_granger)} ({len(not_granger)/len(df_granger):.0%})')

# print(f'• Number of patreon accounts per Granger-causal link:')

# # Converting granger dict into list of tuples (in order to sort it), the 2nd value of the tuple being the count of accounts
# granger_list = [(k, len(v)) for k, v in granger_dict.items()]
# # sort by count desc
# granger_list_desc = sorted(granger_list, key=lambda tup: -tup[1])
# for (k,v) in granger_list_desc:
#     print(f'    • {k[0]} \t--> {k[1]}:\t {v} ({v/len(df_granger):.0%})')


# df_granger[columns] = df_granger[columns].astype('Int64')
# df_granger

In [None]:
# save "df_granger" dataframe to LOCAL SCRATCH FOLDER as a compressed tsv
# output_file_path = LOCAL_DATA_FOLDER+"df_granger.tsv.gz"
# df_granger.to_csv(output_file_path, index=False, sep='\t', compression='gzip')

##### Granger causality plots

In [None]:
!ls -lh {LOCAL_DATA_FOLDER}df_granger.tsv.gz

In [None]:
# names of the Granger-causal link columns: first the Patreon -> YouTube links,
# then the reverse YouTube -> Patreon links (same metric order in both groups)
columns = (
    [f'pt_delta_patrons->yt_delta_{metric}' for metric in ('videos', 'views', 'subs')]
    + [f'yt_delta_{metric}->pt_delta_patrons' for metric in ('videos', 'views', 'subs')]
)

In [None]:
# reload the saved Granger results; the TSV round-trip loses the dtypes, so the three
# breakpoint dates are parsed back to datetimes and the link flags to nullable integers
df_granger = (
    pd.read_csv(LOCAL_DATA_FOLDER + "df_granger.tsv.gz", sep="\t", compression='gzip')
    .assign(
        bkpt_date=lambda df: pd.to_datetime(df['bkpt_date']),
        bkpt_date_sub30=lambda df: pd.to_datetime(df['bkpt_date_sub30']),
        bkpt_date_add30=lambda df: pd.to_datetime(df['bkpt_date_add30']),
    )
    .astype({link_col: 'Int64' for link_col in columns})
)
df_granger.head()

In [None]:
# split the link columns by direction of the tested causality
# cols1: Patreon -> YouTube
cols1 = [f'pt_delta_patrons->yt_delta_{metric}' for metric in ('videos', 'views', 'subs')]
# cols2: YouTube -> Patreon (reverse direction)
cols2 = [f'yt_delta_{metric}->pt_delta_patrons' for metric in ('videos', 'views', 'subs')]

In [None]:
# For different minimum ratios of increase, plot sum of Granger-causal links between Patreon and YouTube time-series (in blue) and vice-versa (in orange)
# Each subplot keeps the accounts whose `ratio` exceeds the threshold (1, 2, ..., nb_plots) and
# shows, per link, the share of those accounts for which the link was flagged.

nb_plots = 10
sbplt_cols = 5
# round up so that a plot count that is not a multiple of the column count still gets a row
sbplt_rows = math.ceil(nb_plots / sbplt_cols)

# squeeze=False keeps `axs` two-dimensional even when the grid has a single row or column
fig, axs = plt.subplots(sbplt_rows, sbplt_cols, figsize=(16,12), sharey=True, sharex=True, squeeze=False)
for idx in range(0, nb_plots):

    row, col = divmod(idx, sbplt_cols)
    sbplt = axs[row, col]

    # accounts whose increase ratio at the breakpoint is above the current threshold
    ratio_df = df_granger[df_granger['ratio'] > idx+1]

    # share of the selected accounts flagged for each link (a fraction between 0 and 1)
    granger_series = ratio_df[cols1 + cols2].sum()/len(ratio_df)
    sbplt.bar(granger_series[cols1].index, granger_series[cols1].values, label='PT --> YT')
    sbplt.bar(granger_series[cols2].index, granger_series[cols2].values, label='YT --> PT')
    sbplt.set_title(f"ratio > {idx+1}\n # accnts = {len(ratio_df)}")
    # rotate the x tick labels only: tick_params defaults to axis='both', which would
    # also rotate the y tick labels
    sbplt.tick_params(axis='x', labelrotation=90)
    # the bars are fractions; display them as percentages to match the y-axis caption
    sbplt.yaxis.set_major_formatter(FuncFormatter(lambda value, _pos: f'{value:.0%}'))


# hide grid cells that did not receive a plot (none when nb_plots fills the grid)
for unused_ax in axs.flat[nb_plots:]:
    unused_ax.set_visible(False)

# legend on the first and on the last populated subplot
axs.flat[0].legend()
axs.flat[nb_plots - 1].legend()


fig.suptitle('Granger-causal links between Patreon and YouTube time-series, for different minimum ratios of increase at breakpoint \n (one account can have multiple causal-links)', fontweight="bold")
fig.text(0.5,0, 'Granger-causal links')
fig.text(0,0.5, 'Percentage of Patreon accts ', rotation = 90)


fig.tight_layout(pad=3, w_pad=3)
plt.show()

#### 2.4.4 Compare with YT videos durations and likes

In [None]:
# YT metadata containing patreon ids in description (already loaded in 2.1)
!ls -lh {LOCAL_DATA_FOLDER}yt_metadata_en_pt_040422.tsv.gz

In [None]:
# keep only the YouTube metadata rows whose Patreon id is one of the selected accounts
is_selected_patreon = df_yt_metadata_pt['patreon_id'].isin(df_pt_bkpnt_filt['patreon_id'])
df_yt_metadata_pt_filtered = df_yt_metadata_pt.loc[is_selected_patreon].copy()

# report how many rows survive the filter, in absolute and relative terms
nb_videos_kept = len(df_yt_metadata_pt_filtered)
share_of_pt_videos = nb_videos_kept / len(df_yt_metadata_pt)
print(f'Filter accounts that match selected Patreon ids: {nb_videos_kept:,} ({share_of_pt_videos:.1%} of videos containing a PT accounts) ')

In [None]:
# parse the two date columns to datetimes so they can be compared with the breakpoint dates
for date_col in ('crawl_date', 'upload_date'):
    df_yt_metadata_pt_filtered[date_col] = pd.to_datetime(df_yt_metadata_pt_filtered[date_col])
df_yt_metadata_pt_filtered.head()

In [None]:
# compare Patreon and YouTube timeseries + YouTube metadata

# date offsets used for the windows around the breakpoint, and the window size
# (number of rows) passed to the rolling means
week_offset = pd.DateOffset(weeks=1)
month_offset = pd.DateOffset(months=1)
rolling_avg_window = 30

# variables for Granger Tests
MAXLAG = 2
YT_variables = ['yt_delta_videos', 'yt_delta_views', 'yt_delta_subs']
# PT_variables = ['pt_delta_patrons', 'pt_delta_earning']
PT_variables = ['pt_delta_patrons']
# accumulators: (cause, effect) -> list of patreon accounts, and accounts without any link
granger_dict = dict()
not_granger = list()

# work on a copy of the breakpoint-filtered accounts
df_granger = df_pt_bkpnt_filt.copy()

# LOOP OVER TOP PATREON ACCOUNTS
for idx, row in df_granger[:2].iterrows():
    fig, axs = plt.subplots(7, 4, figsize=(26, 10), sharey=False, sharex=False)
    
    
    patreon = row['patreon_id']
    ########################## RESTRICT DATAFRAMES TO 1 PATREON ACCOUNT ##########################

    # patreon earnings and users
    tmp_df_pt = df_top_pt_daily_earnings[df_top_pt_daily_earnings['patreon'] == patreon].copy()  
    tmp_df_pt = tmp_df_pt.drop_duplicates()

    # youtube videos
    tmp_df_yt = df_yt_timeseries_top_pt[df_yt_timeseries_top_pt['patreon_id'] == patreon].copy()
    
    # youtube metadata
    tmp_df_yt_meta = df_yt_metadata_pt_filtered[df_yt_metadata_pt_filtered['patreon_id'] == patreon].copy()   
    tmp_df_yt_meta = tmp_df_yt_meta.sort_values('upload_date')
    # tmp_df_yt_meta['upload_date'] = pd.to_datetime(tmp_df_yt_meta['upload_date'])
    
    # replace dates that were collected after 23:00 to their next day, and remove hour
    tmp_df_yt['datetime_original'] = tmp_df_yt['datetime']
    tmp_df_yt['datetime'] = tmp_df_yt['datetime'].apply(lambda date: (date + pd.DateOffset(days=1)) if date.hour >= 23 else date) 
    
    # remove hours and convert to datetime type
    tmp_df_yt['datetime'] = pd.to_datetime(tmp_df_yt['datetime'].dt.date)
    
    
    ########################## PRINT TITLES ##########################
    
    # print URLs for patreon, graphtreon, YT channel(s) related to this patreon account, and breakpoint date
    ch_ids = tmp_df_yt['channel'].unique()
    print(f"\n\n\n\033[1mRank {idx+1}: {patreon[12:]} \033[0m")
    print(f"https://www.{patreon}")
    print(f"https://graphtreon.com/creator/{patreon[12:]}")
    for ch_id in ch_ids:
        print(f"https://youtube.com/channel/{ch_id}")
   
    print(f'\nYouTube Metadata: ')
    print('• YT videos were uploaded between {} and {}'.format(tmp_df_yt_meta['upload_date'].min().strftime('%B %d, %Y'),
                                                             tmp_df_yt_meta['upload_date'].max().strftime('%B %d, %Y')))

    print('• YT metadata was crawled between {} and {}'.format(tmp_df_yt_meta['crawl_date'].min().strftime('%B %d, %Y'),
                                                             tmp_df_yt_meta['crawl_date'].max().strftime('%B %d, %Y')))
    
    ########################## RESTRICT DATES FOR ZOOM OUT ##########################
    
    # set min and max dates for plots   
    date_min = max([tmp_df_yt['datetime'].min(), tmp_df_pt['date'].min()])
    date_max = min([tmp_df_yt['datetime'].max(), tmp_df_pt['date'].max()])
    
    if date_max < date_min:
        print(f":( no overlapping period between YouTube and Patreon datasets\n")
        continue
    
    # restrict datasets between min and max dates
    tmp_df_pt = tmp_df_pt[(tmp_df_pt['date'] >= date_min) & (tmp_df_pt['date'] <= date_max)]
    tmp_df_yt = tmp_df_yt[(tmp_df_yt['datetime'] >= date_min) & (tmp_df_yt['datetime'] <= date_max)]
    tmp_df_yt_meta = tmp_df_yt_meta[(tmp_df_yt_meta['upload_date'] >= date_min) & (tmp_df_yt_meta['upload_date'] <= date_max)]
    
    # align both dataframes since youtube starts once a week
    # explicit copy: moving-average columns are added to this frame further down, and
    # assigning to a filtered slice would raise pandas' SettingWithCopyWarning
    tmp_df_pt = tmp_df_pt[tmp_df_pt['date'] >= tmp_df_yt['datetime'].min()].copy()
    
    
    
    ########################## PATREON: CALCULATE MOVING AVERAGE AND WEEKLY DELTAS ##########################
    
    tmp_df_pt['patrons_ma'] = tmp_df_pt['patrons'].rolling(rolling_avg_window, center=True).mean()
    tmp_df_pt['earning_ma'] = tmp_df_pt['earning'].rolling(rolling_avg_window, center=True).mean()
    ts_pt_df = tmp_df_pt.set_index(tmp_df_pt['date']) # set the date as the index
    
    # resample time series to get 7 days intervals in order to calculate weekly deltas
    ts_pt_weekly_avg_df = ts_pt_df.resample('7D').mean()
    ts_pt_weekly_avg_df['delta_patrons'] = ts_pt_weekly_avg_df['patrons'].diff(periods=1)
    ts_pt_weekly_avg_df['delta_earning'] = ts_pt_weekly_avg_df['earning'].diff(periods=1)
    ts_pt_weekly_avg_df = ts_pt_weekly_avg_df[1:]  # remove 1st row (which is NA)
    tmp_df_yt = tmp_df_yt[1:] # remove YT 1st row to start at the same time as PT
    
    # reorder columns to have deltas columns next to their respective columns
    patreon_column_names = ['earning', 'delta_earning', 'earning_ma', 'patrons', 'delta_patrons', 'patrons_ma']
    ts_pt_weekly_avg_df = ts_pt_weekly_avg_df[patreon_column_names]
    
    # convert Float64 columns to float64 to avoid Matplotlib NAType error
    ts_pt_weekly_avg_df_float64 = ts_pt_weekly_avg_df.astype({'patrons': 'float64', 'delta_patrons': 'float64'})
    
              
    ########################## DETECT BREAKPOINT AND REJECT PATREON ACCOUNT IF NOT VALID ##########################

    breakpoint_date = row['bkpt_date']
    # breakpoint_date = find_breakpoint_v2(tmp_df_pt, 'patrons_ma')
    # print("Breakpoint date: ", breakpoint_date.date())

    # check that dates prior and after breakpoint exist
    if not (((breakpoint_date - 1*month_offset)) in ts_pt_df.index and ((breakpoint_date + 1*month_offset) in ts_pt_df.index)):
        print(f"ERROR: Breakpoint too close to edge of patreon time series or missing data\n")
        plt.figure().clear(); plt.close(); plt.cla(); plt.clf(); plt.show()
        continue
    
    
    ################################### CALCULATE INCREASE AND REJECT IF NOT VALID OR LESS THAN THRESHOLD ###################################

    avg_patrons_bkpnt = row['avg_patrons_bkpnt']
    avg_patrons_sub30 = row['avg_patrons_sub30']
    avg_patrons_add30 = row['avg_patrons_add30']
    
    bkpt_date       = row['bkpt_date']
    bkpt_date_sub30 = row['bkpt_date_sub30']
    bkpt_date_add30 = row['bkpt_date_add30']
    
    d1 = row['d1']
    d2 = row['d2']

    
    r = row['ratio']

    print(f'\nAverage number of patrons: (values calculated using a 30 days centered moving average)')
    print(f'• At breakpoint - 30days ({bkpt_date_sub30.date()}): {avg_patrons_sub30:,.1f}')
    print(f'• At breakpoint          ({bkpt_date.date()}): {avg_patrons_bkpnt:,.1f}')
    print(f'• At breakpoint + 30days ({bkpt_date_add30.date()}): {avg_patrons_add30:,.1f}')
    
    print(f'\nIncrease of patrons in the period before and after the breakpoint:')
    print(f"• Increase of patrons from {bkpt_date_sub30.date()} to {bkpt_date.date()}:        d1  = {d1:>+6.1f} patrons")
    print(f"• Increase of patrons from {bkpt_date.date()} to {bkpt_date_add30.date()}:        d2  = {d2:>+6.1f} patrons")
    
    print(f'\nRatio of the increases of the 2 periods: ')
    print(f"• Ratio between 2 increases:                            d2/d1  = {r:.2f}")
    print(f"• Percentage increase:                            |d2/d1|*100  = {abs(r):>+.0%}")

    
    
    

    ################################### CALCULATE DELTA MEANS BEFORE AND AFTER BKPOINT ###################################  
    
    ##### PATREON #####
    tmp_df_PT_sub30 = ts_pt_weekly_avg_df_float64[(ts_pt_weekly_avg_df_float64.index >= bkpt_date_sub30) & (ts_pt_weekly_avg_df_float64.index <= bkpt_date)]
    tmp_df_PT_add30 = ts_pt_weekly_avg_df_float64[(ts_pt_weekly_avg_df_float64.index >= bkpt_date) & (ts_pt_weekly_avg_df_float64.index <= bkpt_date_add30)]

    # delta patrons
    mean_delta_patrons_befor = tmp_df_PT_sub30['delta_patrons'].mean()
    mean_delta_patrons_after = tmp_df_PT_add30['delta_patrons'].mean()
        
    # delta earnings
    mean_delta_earnings_befor = tmp_df_PT_sub30['delta_earning'].mean()
    mean_delta_earnings_after = tmp_df_PT_add30['delta_earning'].mean()  

    
    ##### YOUTUBE TIME SERIES #####
    tmp_df_YT_sub30 = tmp_df_yt[(tmp_df_yt['datetime'] >= bkpt_date_sub30) & (tmp_df_yt['datetime'] <= bkpt_date      )]
    tmp_df_YT_add30 = tmp_df_yt[(tmp_df_yt['datetime'] >= bkpt_date      ) & (tmp_df_yt['datetime'] <= bkpt_date_add30)]
    
    # delta videos
    mean_delta_videos_befor = tmp_df_YT_sub30['delta_videos'].mean()
    mean_delta_videos_after = tmp_df_YT_add30['delta_videos'].mean()  

    # delta views
    mean_delta_views_befor = tmp_df_YT_sub30['delta_views'].mean()
    mean_delta_views_after = tmp_df_YT_add30['delta_views'].mean()  

    # delta subscriptions
    mean_delta_subs_befor = tmp_df_YT_sub30['delta_subs'].mean()
    mean_delta_subs_after = tmp_df_YT_add30['delta_subs'].mean()  

    
    ##### YOUTUBE METADATA #####
    tmp_df_YT_META_sub30 = tmp_df_yt_meta[(tmp_df_yt_meta['upload_date'] >= bkpt_date_sub30) & (tmp_df_yt_meta['upload_date'] <= bkpt_date      )]
    tmp_df_YT_META_add30 = tmp_df_yt_meta[(tmp_df_yt_meta['upload_date'] >= bkpt_date      ) & (tmp_df_yt_meta['upload_date'] <= bkpt_date_add30)]
        
    # durations
    mean_duration_befor = tmp_df_YT_META_sub30['duration'].mean()
    mean_duration_after = tmp_df_YT_META_add30['duration'].mean()      
        
    # likes
    mean_likes_befor = tmp_df_YT_META_sub30['like_count'].mean()
    mean_likes_after = tmp_df_YT_META_add30['like_count'].mean()      
        
    
    # plot dots in the middle of region for the region means   
    axs[0,2].plot(tmp_df_PT_sub30.index.mean(), mean_delta_patrons_befor, marker='o', color='green', markersize=15)
    axs[0,2].plot(tmp_df_PT_add30.index.mean(), mean_delta_patrons_after, marker='o', color='orange', markersize=15)
    axs[1,2].plot(tmp_df_PT_sub30.index.mean(), mean_delta_earnings_befor, marker='o', color='green', markersize=15)
    axs[1,2].plot(tmp_df_PT_add30.index.mean(), mean_delta_earnings_after, marker='o', color='orange', markersize=15)
    axs[2,2].plot(tmp_df_YT_sub30['datetime'].mean(), mean_delta_videos_befor, marker='o', color='green', markersize=15)
    axs[2,2].plot(tmp_df_YT_add30['datetime'].mean(), mean_delta_videos_after, marker='o', color='orange', markersize=15)
    axs[3,2].plot(tmp_df_YT_sub30['datetime'].mean(), mean_delta_views_befor, marker='o', color='green', markersize=15)
    axs[3,2].plot(tmp_df_YT_add30['datetime'].mean(), mean_delta_views_after, marker='o', color='orange', markersize=15)  
    axs[4,2].plot(tmp_df_YT_sub30['datetime'].mean(), mean_delta_subs_befor, marker='o', color='green', markersize=15)
    axs[4,2].plot(tmp_df_YT_add30['datetime'].mean(), mean_delta_subs_after, marker='o', color='orange', markersize=15)
    
    # sometimes there is no value at all for this period of time in YT meta --> error when plotting
    if not (tmp_df_YT_META_sub30.empty or tmp_df_YT_META_add30.empty):
        axs[5,2].plot(tmp_df_YT_META_sub30['upload_date'].mean(), mean_duration_befor, marker='o', color='green', markersize=15)
        axs[5,2].plot(tmp_df_YT_META_add30['upload_date'].mean(), mean_duration_after, marker='o', color='orange', markersize=15)  
        axs[6,2].plot(tmp_df_YT_META_sub30['upload_date'].mean(), mean_likes_befor, marker='o', color='green', markersize=15)
        axs[6,2].plot(tmp_df_YT_META_add30['upload_date'].mean(), mean_likes_after, marker='o', color='orange', markersize=15)  

    
    # plot horizontal lines for means
    # one entry per subplot row, in row order: patrons, earnings, videos, views, subs, duration, likes
    mean_befor_list = [mean_delta_patrons_befor, mean_delta_earnings_befor, mean_delta_videos_befor, mean_delta_views_befor, mean_delta_subs_befor, mean_duration_befor, mean_likes_befor]
    mean_afer_list = [mean_delta_patrons_after, mean_delta_earnings_after, mean_delta_videos_after, mean_delta_views_after, mean_delta_subs_after, mean_duration_after, mean_likes_after]

    # The row counter must not be named `idx`: that name holds the index of the account
    # yielded by the enclosing `iterrows()` loop, and reusing it here would overwrite it
    # for the remainder of the iteration.
    for ax_row, (mean_befor, mean_after) in enumerate(zip(mean_befor_list, mean_afer_list)):
        # a NaN mean (no data in that window) is skipped rather than drawn
        if not math.isnan(mean_befor):
            axs[ax_row,2].hlines(y=mean_befor, xmin=bkpt_date_sub30, xmax=bkpt_date, linewidth=2, linestyle='--', color='green')
        if not math.isnan(mean_after):
            axs[ax_row,2].hlines(y=mean_after, xmin=bkpt_date, xmax=bkpt_date_add30, linewidth=2, linestyle='--', color='orange')
        

    
    
    ################################### ZOOM OUT PLOTS ###################################
    
    # number of patrons (delta)
    axs[0,0].scatter(ts_pt_weekly_avg_df_float64.index, ts_pt_weekly_avg_df_float64['delta_patrons'], c='orange', s=30, marker='+')
    axs[0,0].set(title="Delta patrons per week")
    axs[0,0].set_ylabel("Δ Patrons")    
    color_neg_pos(axs[0,0], ts_pt_weekly_avg_df_float64.index, ts_pt_weekly_avg_df_float64['delta_patrons'])

    # number of patrons (cumulative)
    axs[0,1].plot(tmp_df_pt['date'], tmp_df_pt['patrons'], alpha=0.2)
    axs[0,1].plot(tmp_df_pt['date'], tmp_df_pt['patrons_ma'])
    axs[0,1].set(title="Number of patrons")
    axs[0,1].set_ylabel("# Patrons")

    # patreon earnings (delta)
    axs[1,0].scatter(ts_pt_weekly_avg_df_float64.index, ts_pt_weekly_avg_df_float64['delta_earning'], color='royalblue', s=30, marker='+')
    axs[1,0].set(title="Patreon delta earnings per week")
    axs[1,0].set_ylabel("Δ Earnings") 
    color_neg_pos(axs[1,0], ts_pt_weekly_avg_df_float64.index, ts_pt_weekly_avg_df_float64['delta_earning'])

    # patreon earnings (cumulative)
    axs[1,1].plot(tmp_df_pt['date'], tmp_df_pt['earning'], alpha=0.2)
    axs[1,1].plot(tmp_df_pt['date'], tmp_df_pt['earning_ma'], color='royalblue')
    axs[1,1].set(title="Patreon earnings per month")
    axs[1,1].set_ylabel("Earnings")
    
    # youtube videos (delta)
    axs[2,0].scatter(tmp_df_yt['datetime'], tmp_df_yt['delta_videos'], c='r', s=30, marker='+')
    axs[2,0].set(title="YouTube delta videos per week")
    axs[2,0].set_ylabel("Δ Videos")
    color_neg_pos(axs[2,0], tmp_df_yt['datetime'], tmp_df_yt['delta_videos'])

    # youtube videos (cumulative)
    axs[2,1].plot(tmp_df_yt['datetime'], tmp_df_yt['videos'], 'r')
    axs[2,1].set(title="YouTube cumulative videos")
    axs[2,1].set_ylabel("# Videos")

    # youtube views (delta)
    axs[3,0].scatter(tmp_df_yt['datetime'], tmp_df_yt['delta_views'], c='g', s=30, marker='+')
    axs[3,0].set(title="YouTube delta views per week")
    axs[3,0].set_ylabel("Δ Views")
    color_neg_pos(axs[3,0], tmp_df_yt['datetime'], tmp_df_yt['delta_views'])

    # youtube views (cumulative)
    axs[3,1].plot(tmp_df_yt['datetime'], tmp_df_yt['views'], 'g')
    axs[3,1].set(title="YouTube cumulative views")
    axs[3,1].set_ylabel("# Views")

    # youtube subs (delta)
    axs[4,0].scatter(tmp_df_yt['datetime'], tmp_df_yt['delta_subs'], c='m', s=30, marker='+')
    axs[4,0].set(title="YouTube delta subscriptions per week")
    axs[4,0].set_ylabel("Δ Subscriptions")
    color_neg_pos(axs[4,0], tmp_df_yt['datetime'], tmp_df_yt['delta_subs'])

    # youtube subs (cumulative)
    axs[4,1].plot(tmp_df_yt['datetime'], tmp_df_yt['subs'], 'm')
    axs[4,1].set(title="YouTube cumulative subscriptions")
    axs[4,1].set_ylabel("# Subscriptions")
    
    
    # youtube durations per uploads
    axs[5,0].scatter(tmp_df_yt_meta['upload_date'], tmp_df_yt_meta['duration'], c='brown', s=30, marker='+')
    axs[5,0].set(title="YouTube videos durations")
    axs[5,0].set_ylabel("Duration")
    
    
    # youtube likes at crawl date
    axs[6,0].scatter(tmp_df_yt_meta['upload_date'], tmp_df_yt_meta['like_count'], c='lightblue', s=30, marker='+')
    axs[6,0].set(title="YouTube likes (plotted against upload date)")
    axs[6,0].set_ylabel("Likes")
    

    ########################## RESTRICT DATES FOR ZOOM IN (+/- 2 months around breakpoint) ##########################

    # calculate min and max dates for zoom
    date_min_zoom = breakpoint_date - (2 * month_offset)
    date_max_zoom = breakpoint_date + (2 * month_offset)
            
    # restrict datasets between min and max dates
    tmp_df_pt_zoomed = tmp_df_pt[(tmp_df_pt['date'] >= date_min_zoom) & (tmp_df_pt['date'] <= date_max_zoom)].copy()
    tmp_df_yt_zoomed = tmp_df_yt[(tmp_df_yt['datetime'] >= date_min_zoom) & (tmp_df_yt['datetime'] <= date_max_zoom)].copy()
    tmp_df_yt_meta_zoomed = tmp_df_yt_meta[(tmp_df_yt_meta['upload_date'] >= date_min_zoom) & (tmp_df_yt_meta['upload_date'] <= date_max_zoom)].copy()

    # used for coloration
    ts_pt_weekly_avg_df_zoomed = ts_pt_weekly_avg_df_float64[(ts_pt_weekly_avg_df_float64.index >= date_min_zoom) & (ts_pt_weekly_avg_df_float64.index <= date_max_zoom)]
    
    
   ################################### ZOOM IN PLOTS  ###################################

    # zoomed in patron numbers (delta)
    axs[0,2].scatter(ts_pt_weekly_avg_df_zoomed.index, ts_pt_weekly_avg_df_zoomed['delta_patrons'], c='orange', s=30, marker='+')
    axs[0,2].plot(ts_pt_weekly_avg_df_zoomed.index, ts_pt_weekly_avg_df_zoomed['delta_patrons'], c='orange', alpha=0.3)
    axs[0,2].set(title="Delta patrons per week")
    axs[0,2].set_ylabel("Δ Patrons")
    color_neg_pos(axs[0,2], ts_pt_weekly_avg_df_float64.index, ts_pt_weekly_avg_df_zoomed['delta_patrons'])
    
    # zoomed in patron numbers (cumulative)
    axs[0,3].plot(tmp_df_pt_zoomed['date'], tmp_df_pt_zoomed['patrons'], alpha=0.2)
    axs[0,3].plot(tmp_df_pt_zoomed['date'], tmp_df_pt_zoomed['patrons_ma'])
    axs[0,3].set(title="Number of patrons (zoomed in)")
    axs[0,3].set_ylabel("# Patrons")
    
    # zoomed in patron earnings (delta)
    axs[1,2].scatter(ts_pt_weekly_avg_df_zoomed.index, ts_pt_weekly_avg_df_zoomed['delta_earning'], color='royalblue', s=30, marker='+')
    axs[1,2].plot(ts_pt_weekly_avg_df_zoomed.index, ts_pt_weekly_avg_df_zoomed['delta_earning'], color='royalblue', alpha=0.3)
    axs[1,2].set(title="Delta Patreon earnings per week (zoomed in)")
    axs[1,2].set_ylabel("Earnings")  
    color_neg_pos(axs[1,2], ts_pt_weekly_avg_df_float64.index, ts_pt_weekly_avg_df_zoomed['delta_earning'])

    # zoomed in patron earnings (cumulative)
    axs[1,3].plot(tmp_df_pt_zoomed['date'], tmp_df_pt_zoomed['earning'], alpha=0.2)
    axs[1,3].plot(tmp_df_pt_zoomed['date'], tmp_df_pt_zoomed['earning_ma'], color='royalblue')
    axs[1,3].set(title="Patreon earnings per month (zoomed in)")
    axs[1,3].set_ylabel("Earnings")
    
    # zoomed in youtube videos (delta)
    axs[2,2].scatter(tmp_df_yt_zoomed['datetime'], tmp_df_yt_zoomed['delta_videos'], c='r', s=30, marker='+')
    axs[2,2].plot(tmp_df_yt_zoomed['datetime'], tmp_df_yt_zoomed['delta_videos'], c='r', alpha=0.3)
    axs[2,2].set(title="YouTube delta videos per week (zoomed in)")
    axs[2,2].set_ylabel("Δ Videos")
    color_neg_pos(axs[2,2], tmp_df_yt['datetime'], tmp_df_yt_zoomed['delta_videos'])

    # zoomed in youtube videos (cumulative)
    axs[2,3].plot(tmp_df_yt_zoomed['datetime'], tmp_df_yt_zoomed['videos'], 'r')
    axs[2,3].set(title="YouTube cumulative videos (zoomed in)")
    axs[2,3].set_ylabel("# Videos")

    # zoomed in youtube views (delta)
    axs[3,2].scatter(tmp_df_yt_zoomed['datetime'], tmp_df_yt_zoomed['delta_views'], c='g', s=30, marker='+')
    axs[3,2].plot(tmp_df_yt_zoomed['datetime'], tmp_df_yt_zoomed['delta_views'], c='g', alpha=0.3)
    axs[3,2].set(title="YouTube delta views per week (zoomed in)")
    axs[3,2].set_ylabel("Δ Views")
    color_neg_pos(axs[3,2], tmp_df_yt['datetime'], tmp_df_yt_zoomed['delta_views'])

    # zoomed in youtube views (cumulative)
    axs[3,3].plot(tmp_df_yt_zoomed['datetime'], tmp_df_yt_zoomed['views'], 'g')
    axs[3,3].set(title="YouTube cumulative views (zoomed in)")
    axs[3,3].set_ylabel("# Views")
    
    # zoomed in youtube subs (delta)
    axs[4,2].scatter(tmp_df_yt_zoomed['datetime'], tmp_df_yt_zoomed['delta_subs'], c='m', s=30, marker='+')
    axs[4,2].plot(tmp_df_yt_zoomed['datetime'], tmp_df_yt_zoomed['delta_subs'], c='m', alpha=0.3)
    axs[4,2].set(title="YouTube delta subscriptions per week (zoomed in)")
    axs[4,2].set_ylabel("Δ Subscriptions")
    color_neg_pos(axs[4,2], tmp_df_yt['datetime'], tmp_df_yt_zoomed['delta_subs'])

    # zoomed in youtube subs (cumulative)
    axs[4,3].plot(tmp_df_yt_zoomed['datetime'], tmp_df_yt_zoomed['subs'], 'm')
    axs[4,3].set(title="YouTube cumulative subscriptions (zoomed in)")
    axs[4,3].set_ylabel("# Subscriptions")
    
    
    # youtube durations per uploads
    axs[5,2].scatter(tmp_df_yt_meta_zoomed['upload_date'], tmp_df_yt_meta_zoomed['duration'], c='brown', s=30, marker='+')
    axs[5,2].plot(tmp_df_yt_meta_zoomed['upload_date'], tmp_df_yt_meta_zoomed['duration'], c='brown', alpha=0.3)
    axs[5,2].set(title="YouTube videos durations (zoomed in)")
    axs[5,2].set_ylabel("Duration")
    color_neg_pos(axs[5,2], tmp_df_yt_meta_zoomed['upload_date'], tmp_df_yt_meta_zoomed['duration'])
    
        
   # youtube likes per uploads
    axs[6,2].scatter(tmp_df_yt_meta_zoomed['upload_date'], tmp_df_yt_meta_zoomed['like_count'], c='lightblue', s=30, marker='+')
    axs[6,2].plot(tmp_df_yt_meta_zoomed['upload_date'], tmp_df_yt_meta_zoomed['like_count'], c='lightblue', alpha=0.3)
    axs[6,2].set(title="YouTube likes (plotted against upload date) (zoomed in)")
    axs[6,2].set_ylabel("Likes")
    # apply the coloration to the likes panel itself (row 6) and against the same x values
    # as the points above; it previously targeted the duration panel (row 5) with crawl dates
    color_neg_pos(axs[6,2], tmp_df_yt_meta_zoomed['upload_date'], tmp_df_yt_meta_zoomed['like_count'])
    
    
    
    ################################### FORMAT AXES ###################################

    # format the axes
    for i in range(axs.shape[0]):
        for j in range(axs.shape[1]):
            if j < 2:
                axs[i,j].set_xlim([date_min, date_max])
                axs[i,j].xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
                axs[i,j].xaxis.set_major_locator(mdates.YearLocator())
                axs[i,j].xaxis.set_minor_locator(mdates.MonthLocator())
            if j >= 2:
                axs[i,j].set_xlim([date_min_zoom, date_max_zoom])
                axs[i,j].xaxis.set_major_formatter(mdates.DateFormatter('%Y-%b'))
                axs[i,j].xaxis.set_major_locator(mdates.MonthLocator())
                # axs[i,j].xaxis.set_minor_locator(mdates.WeekdayLocator())
            axs[i,j].xaxis.grid(color="#CCCCCC", ls=":")
            axs[i,j].yaxis.grid(color="#CCCCCC", ls=":")
            axs[i,j].yaxis.set_major_formatter(KM_formatter)
            
            
    ################################### PLOT BREAKPOINT LINES AND POINTS ###################################

    # plot vertical lines for breakpoint, breakpoint-1month, breakpoint+1month
    print_legend = True
    for i in range(axs.shape[0]):
        for j in range(axs.shape[1]):
            if print_legend:
                axs[i,j].axvline(breakpoint_date, color='red', linestyle='--', label='break', linewidth=2.5)
                axs[i,j].axvline(breakpoint_date - month_offset, color='green', linestyle=':', label='- ' + str(month_offset.months)+' months', linewidth=2)
                axs[i,j].axvline(breakpoint_date + month_offset, color='orange', linestyle=':', label='+' + str(month_offset.months)+' months', linewidth=2)          
                # print_legend = False
            else:
                axs[i,j].axvline(breakpoint_date, color='red', linestyle='--', linewidth=2.5)
                axs[i,j].axvline(breakpoint_date - month_offset, color='green', linestyle=':', linewidth=2)
                axs[i,j].axvline(breakpoint_date + month_offset, color='orange', linestyle=':', linewidth=2)
    # axs[0,0].legend()
    axs[0,1].legend()

    # plot point for mean nb of patrons for breakpoint, breakpoint-1month, breakpoint+1month    
    axs[0,3].plot(breakpoint_date - month_offset, ts_pt_df.at[(breakpoint_date - month_offset), 'patrons_ma'], marker='o', color='green')
    axs[0,3].plot(breakpoint_date,               ts_pt_df.at[breakpoint_date              , 'patrons_ma'], marker='o', color='red')    
    axs[0,3].plot(breakpoint_date + month_offset, ts_pt_df.at[(breakpoint_date + month_offset), 'patrons_ma'], marker='o', color='orange')    


    ################################### GRANGER CAUSALITY TESTS ###################################

    # create a new dataframe with merged columns (the dates might have a day difference)
    selected_pt_columns  = ['delta_earning', 'delta_patrons']
    df_pt = ts_pt_weekly_avg_df_zoomed
    df_pt = df_pt[selected_pt_columns].reset_index().add_prefix('pt_')

    # selected_yt_columns = ['datetime', 'delta_views', 'delta_subs', 'delta_videos']
    selected_yt_columns = ['datetime', 'datetime_original', 'delta_views', 'delta_subs', 'delta_videos']
    df_yt = tmp_df_yt_zoomed
    df_yt = df_yt[selected_yt_columns].reset_index().add_prefix('yt_')

    # concatenated 2 dfs and select and reorder columns
    df_concat = pd.concat([df_pt, df_yt], axis=1)
    concat_columns = ['pt_date', 'yt_datetime', 'pt_delta_earning', 'pt_delta_patrons', 'yt_delta_views', 'yt_delta_subs', 'yt_delta_videos']
    df_concat = df_concat[concat_columns]
    # df_concat['dates_match'] = df_concat['pt_date'] == df_concat['yt_datetime']
    
    # display(df_concat.round())
    # display(df_concat.style.set_caption(f"df_concat"))
    
    
    print(f"\nGranger Causality Tests:")
    
    granger_causal_link = False
    for pt_var in PT_variables:
        for yt_var in YT_variables:
            
            # if nan values in this df, skip
            if df_concat[[yt_var, pt_var]].isna().values.any():
                continue
                
            pvalue_fwd = {}
            pvalue_rev = {}
            
            try:
                # print(f'\n\n• {pt_var} --> {yt_var}')
                granger_test_fwd = grangercausalitytests(df_concat[[yt_var, pt_var]], maxlag=MAXLAG, verbose=False)  
                # print(f'\n\n• {yt_var} --> {pt_var}')
                granger_test_rev = grangercausalitytests(df_concat[[pt_var, yt_var]], maxlag=MAXLAG, verbose=False) 
            except Exception:
                continue


            for lag in range(1, MAXLAG+1):           
                pvalue_fwd[lag] = granger_test_fwd[lag][0]['ssr_ftest'][1]
                pvalue_rev[lag] = granger_test_rev[lag][0]['ssr_ftest'][1]
                
            
            
            
            min_pvalue_fwd = min(pvalue_fwd.values())
            if min_pvalue_fwd < 0.05:
                granger_causal_link = True
                min_lag_fwd = [k for k, v in pvalue_fwd.items() if v == min_pvalue_fwd][0]
                print(f'• {pt_var} --> {yt_var} (pvalue={min_pvalue_fwd:.3f}, lag={min_lag_fwd})')

                # add value to df
                df_granger.loc[idx, pt_var+'->'+yt_var] = 1

                if (pt_var, yt_var) in granger_dict:                   
                    granger_dict[(pt_var, yt_var)].append(patreon)
                else:
                    granger_dict[(pt_var, yt_var)] = [patreon]
            else: 
                df_granger.loc[idx, pt_var+'->'+yt_var] = 0
                
                
                
            min_pvalue_rev = min(pvalue_rev.values())
            if min_pvalue_rev < 0.05:
                granger_causal_link = True
                min_lag_rev = [k for k, v in pvalue_rev.items() if v == min_pvalue_rev][0]
                print(f'• {yt_var} --> {pt_var} (pvalue={min_pvalue_rev:.3f}, lag={min_lag_rev})')

                # add value to df
                df_granger.loc[idx, yt_var+'->'+pt_var] = 1
                
                if (yt_var, pt_var) in granger_dict:
                    granger_dict[(yt_var, pt_var)].append(patreon)
                else:
                    granger_dict[(yt_var, pt_var)] = [patreon]
            else: 
                df_granger.loc[idx, yt_var+'->'+pt_var] = 0
                

    if (granger_causal_link == False):
        print("• No Granger causality found for this account")
        not_granger.append(patreon)
    
    print("\n")

    fig.tight_layout(w_pad=0)
    plt.show()
    
    print('\n\n\n---------------------------------------------------------------------------------------------------------------------------------------------------')
    
# print('\n\nGranger tests summary statistics:')
    
# print(f'• Number of patreon accounts analysed (patrons increase ratio > {incr_thresh_ratio}): {len(df_granger)}')
# print(f'• Number of patreon with no Granger-causal link: {len(not_granger)} ({len(not_granger)/len(df_granger):.0%})')

# print(f'• Number of patreon accounts per Granger-causal link:')
# # Converting granger dict into list of tuples (in order to sort it), the 2nd value of the tuple being the count of accounts
# granger_list = [(k, len(v)) for k, v in granger_dict.items()]
# # sort by count desc
# granger_list_desc = sorted(granger_list, key=lambda tup: -tup[1])
# for (k,v) in granger_list_desc:
#     print(f'    • {k[0]} \t--> {k[1]}:\t {v} ({v/len(df_granger):.0%})')


# df_granger[columns] = df_granger[columns].astype('Int64')
# df_granger

### 2.5 Matching

In [None]:
# Load the saved per-account breakpoint table (df_pt_bkpnt.tsv.gz) from the local
# data folder and convert its three breakpoint date columns from text to datetimes.
# The columns are replaced in place, so the original column order is kept.
df_pt_bkpnt = (
    pd.read_csv(
        LOCAL_DATA_FOLDER + "df_pt_bkpnt.tsv.gz",
        sep="\t",
        compression='gzip',
    )
    .assign(
        bkpt_date=lambda frame: pd.to_datetime(frame['bkpt_date']),
        bkpt_date_sub30=lambda frame: pd.to_datetime(frame['bkpt_date_sub30']),
        bkpt_date_add30=lambda frame: pd.to_datetime(frame['bkpt_date_add30']),
    )
)

# last expression of the cell: render the table
df_pt_bkpnt

#### 2.5.1 Create a summary df

In [None]:
# # compare YouTube and Patreon timeseries for top patreon accounts with rolling average - MANUAL VERSION 2
# month_offset = pd.DateOffset(months=1)
# week_offset = pd.DateOffset(weeks=1)
# rolling_avg_window = 30

# # variables for Granger Tests
# MAXLAG = 2
# granger_dict = {} # dictionary with  keys (cause --> effect) and values with list of corresponding patreon account(s)
# not_granger = []
# YT_variables = ['yt_delta_videos', 'yt_delta_views', 'yt_delta_subs']
# # PT_variables = ['pt_delta_patrons', 'pt_delta_earning']
# PT_variables = ['pt_delta_patrons']

# df_granger = df_pt_bkpnt_filt.copy()

# # LOOP OVER TOP PATREON ACCOUNTS
# for idx, row in tqdm(df_granger[:10].iterrows()):   

    
#     ########################## RESTRICT DATAFRAMES TO 1 PATREON ACCOUNT ##########################

#     patreon = row['patreon_id']

#     # patreon earnings and users
#     tmp_df_pt = df_top_pt_daily_earnings[df_top_pt_daily_earnings['patreon'] == patreon].copy()  
    
#     # youtube videos
#     tmp_df_yt = df_yt_timeseries_top_pt[df_yt_timeseries_top_pt['patreon_id'] == patreon].copy()
    
#     # replace dates that were collected after 23:00 to their next day, and remove hour
#     tmp_df_yt['datetime_original'] = tmp_df_yt['datetime']
#     tmp_df_yt['datetime'] = tmp_df_yt['datetime'].apply(lambda date: (date + pd.DateOffset(days=1)) if date.hour >= 23 else date) 
    
#     # remove hours and convert to datetime type
#     tmp_df_yt['datetime'] = pd.to_datetime(tmp_df_yt['datetime'].dt.date)
    
#     ########################## PRINT TITLES ##########################
    
#     # print URLs for patreon, graphtreon, YT channel(s) related to this patreon account, and breakpoint date
#     # ch_ids = tmp_df_yt['channel'].unique()
#     # print(f"\n\n\n\033[1mRank {idx+1}: {patreon[12:]} \033[0m")
#     # print(f"https://www.{patreon}")
#     # print(f"https://graphtreon.com/creator/{patreon[12:]}")
#     # for ch_id in ch_ids:
#     #     print(f"https://youtube.com/channel/{ch_id}")

    
#     ########################## RESTRICT DATES FOR ZOOM OUT ##########################
    
#     # set min and max dates for plots   
#     date_min = max([tmp_df_yt['datetime'].min(), tmp_df_pt['date'].min()])
#     date_max = min([tmp_df_yt['datetime'].max(), tmp_df_pt['date'].max()])
    
#     if date_max < date_min:
#         print(f":( no overlapping period between YouTube and Patreon datasets\n")
#         continue
    
#     # restrict datasets between min and max dates
#     tmp_df_pt = tmp_df_pt[(tmp_df_pt['date'] >= date_min) & (tmp_df_pt['date'] <= date_max)]
#     tmp_df_yt = tmp_df_yt[(tmp_df_yt['datetime'] >= date_min) & (tmp_df_yt['datetime'] <= date_max)]
    
#     # align both dataframes since youtube starts once a week
#     tmp_df_pt = tmp_df_pt[tmp_df_pt['date'] >= tmp_df_yt['datetime'].min()]
    
    
    
#     ########################## PATREON: CALCULATE MOVING AVERAGE AND WEEKLY DELTAS ##########################
    
#     tmp_df_pt['patrons_ma'] = tmp_df_pt['patrons'].rolling(rolling_avg_window, center=True).mean()
#     tmp_df_pt['earning_ma'] = tmp_df_pt['earning'].rolling(rolling_avg_window, center=True).mean()
#     ts_pt_df = tmp_df_pt.set_index(tmp_df_pt['date']) # set the date as the index
    
#     # resample time series to get 7 days intervals in order to calculate weekly deltas
#     ts_pt_weekly_avg_df = ts_pt_df.resample('7D').mean()
#     ts_pt_weekly_avg_df['delta_patrons'] = ts_pt_weekly_avg_df['patrons'].diff(periods=1)
#     ts_pt_weekly_avg_df['delta_earning'] = ts_pt_weekly_avg_df['earning'].diff(periods=1)
#     ts_pt_weekly_avg_df = ts_pt_weekly_avg_df[1:]  # remove 1st row (which is NA)
#     tmp_df_yt = tmp_df_yt[1:] # remove YT 1st row to start at the same time as PT
    
#     # reorder columns to have deltas columns next to their respective columns
#     patreon_column_names = ['earning', 'delta_earning', 'earning_ma', 'patrons', 'delta_patrons', 'patrons_ma']
#     ts_pt_weekly_avg_df = ts_pt_weekly_avg_df[patreon_column_names]
    
#     # convert Float64 columns to float64 to avoid Matplotlib NAType error
#     ts_pt_weekly_avg_df_float64 = ts_pt_weekly_avg_df.astype({'patrons': 'float64', 'delta_patrons': 'float64'})
    
               
#     ################################### CALCULATE INCREASE AND REJECT IF NOT VALID OR LESS THAN THRESHOLD ###################################

#     breakpoint_date = row['bkpt_date']

#     avg_patrons_bkpnt = row['avg_patrons_bkpnt']
#     avg_patrons_sub30 = row['avg_patrons_sub30']
#     avg_patrons_add30 = row['avg_patrons_add30']
    
#     bkpt_date       = row['bkpt_date']
#     bkpt_date_sub30 = row['bkpt_date_sub30']
#     bkpt_date_add30 = row['bkpt_date_add30']
    
#     d1 = row['d1']
#     d2 = row['d2']

    
#     r = row['ratio']

# #     print(f'\nAverage number of patrons: (values calculated using a 30 days centered moving average)')
# #     print(f'• At breakpoint - 30days ({bkpt_date_sub30.date()}): {avg_patrons_sub30:,.1f}')
# #     print(f'• At breakpoint          ({bkpt_date.date()}): {avg_patrons_bkpnt:,.1f}')
# #     print(f'• At breakpoint + 30days ({bkpt_date_add30.date()}): {avg_patrons_add30:,.1f}')
    
# #     print(f'\nIncrease of patrons in the period before and after the breakpoint:')
# #     print(f"• Increase of patrons from {bkpt_date_sub30.date()} to {bkpt_date.date()}:        d1  = {d1:>+6.1f} patrons")
# #     print(f"• Increase of patrons from {bkpt_date.date()} to {bkpt_date_add30.date()}:        d2  = {d2:>+6.1f} patrons")
    
# #     print(f'\nRatio of the increases of the 2 periods: ')
# #     print(f"• Ratio between 2 increases:                            d2/d1  = {r:.2f}")
# #     print(f"• Percentage increase:                            |d2/d1|*100  = {abs(r):>+.0%}")
    


#     ########################## RESTRICT DATES FOR ZOOM IN (+/- 2 months around breakpoint) ##########################

#     # calculate min and max dates for zoom
#     date_min_zoom = breakpoint_date - (2 * month_offset)
#     date_max_zoom = breakpoint_date + (2 * month_offset)
            
#     # restrict datasets between min and max dates
#     tmp_df_pt_zoomed = tmp_df_pt[(tmp_df_pt['date'] >= date_min_zoom) & (tmp_df_pt['date'] <= date_max_zoom)].copy()
#     tmp_df_yt_zoomed = tmp_df_yt[(tmp_df_yt['datetime'] >= date_min_zoom) & (tmp_df_yt['datetime'] <= date_max_zoom)].copy()

#     # used for coloration
#     ts_pt_weekly_avg_df_zoomed = ts_pt_weekly_avg_df_float64[(ts_pt_weekly_avg_df_float64.index >= date_min_zoom) & (ts_pt_weekly_avg_df_float64.index <= date_max_zoom)]
    


#     ################################### GRANGER CAUSALITY TESTS ###################################

#     # create a new dataframe with merged columns (the dates might have a day difference)
#     selected_pt_columns  = ['delta_earning', 'delta_patrons']
#     df_pt = ts_pt_weekly_avg_df_zoomed
#     df_pt = df_pt[selected_pt_columns].reset_index().add_prefix('pt_')

#     # selected_yt_columns = ['datetime', 'delta_views', 'delta_subs', 'delta_videos']
#     selected_yt_columns = ['datetime', 'datetime_original', 'delta_views', 'delta_subs', 'delta_videos']
#     df_yt = tmp_df_yt_zoomed
#     df_yt = df_yt[selected_yt_columns].reset_index().add_prefix('yt_')

#     # concatenated 2 dfs and select and reorder columns
#     df_concat = pd.concat([df_pt, df_yt], axis=1)
#     concat_columns = ['pt_date', 'yt_datetime', 'pt_delta_earning', 'pt_delta_patrons', 'yt_delta_views', 'yt_delta_subs', 'yt_delta_videos']
#     df_concat = df_concat[concat_columns]
#     # df_concat['dates_match'] = df_concat['pt_date'] == df_concat['yt_datetime']
    
#     display(df_concat.round())
#     # display(df_concat.style.set_caption(f"df_concat"))
    
    
    
#     # print(f"\nGranger Causality Tests:")
    
#     granger_causal_link = False
#     for pt_var in PT_variables:
#         for yt_var in YT_variables:
            
#             # if nan values in this df, skip
#             if df_concat[[yt_var, pt_var]].isna().values.any():
#                 continue
                
#             pvalue_fwd = {}
#             pvalue_rev = {}
            
#             try:
#                 # print(f'\n\n• {pt_var} --> {yt_var}')
#                 granger_test_fwd = grangercausalitytests(df_concat[[yt_var, pt_var]], maxlag=MAXLAG, verbose=False)  
#                 # print(f'\n\n• {yt_var} --> {pt_var}')
#                 granger_test_rev = grangercausalitytests(df_concat[[pt_var, yt_var]], maxlag=MAXLAG, verbose=False) 
#             except Exception:
#                 continue


#             for lag in range(1, MAXLAG+1):           
#                 pvalue_fwd[lag] = granger_test_fwd[lag][0]['ssr_ftest'][1]
#                 pvalue_rev[lag] = granger_test_rev[lag][0]['ssr_ftest'][1]
                
            
            
            
#             min_pvalue_fwd = min(pvalue_fwd.values())
#             if min_pvalue_fwd < 0.05:
#                 granger_causal_link = True
#                 min_lag_fwd = [k for k, v in pvalue_fwd.items() if v == min_pvalue_fwd][0]
#                 # print(f'• {pt_var} --> {yt_var} (pvalue={min_pvalue_fwd:.3f}, lag={min_lag_fwd})')

#                 # add value to df
#                 df_granger.loc[idx, pt_var+'->'+yt_var] = 1

#                 if (pt_var, yt_var) in granger_dict:                   
#                     granger_dict[(pt_var, yt_var)].append(patreon)
#                 else:
#                     granger_dict[(pt_var, yt_var)] = [patreon]
#             else: 
#                 df_granger.loc[idx, pt_var+'->'+yt_var] = 0
                
                
                
#             min_pvalue_rev = min(pvalue_rev.values())
#             if min_pvalue_rev < 0.05:
#                 granger_causal_link = True
#                 min_lag_rev = [k for k, v in pvalue_rev.items() if v == min_pvalue_rev][0]
#                 # print(f'• {yt_var} --> {pt_var} (pvalue={min_pvalue_rev:.3f}, lag={min_lag_rev})')

#                 # add value to df
#                 df_granger.loc[idx, yt_var+'->'+pt_var] = 1
                
#                 if (yt_var, pt_var) in granger_dict:
#                     granger_dict[(yt_var, pt_var)].append(patreon)
#                 else:
#                     granger_dict[(yt_var, pt_var)] = [patreon]
#             else: 
#                 df_granger.loc[idx, yt_var+'->'+pt_var] = 0
                

#     if (granger_causal_link == False):
#         # print("• No Granger causality found for this account")
#         not_granger.append(patreon)
    
#     # print("\n")

#     # fig.tight_layout(w_pad=0)
#     # plt.show()
    
#     # print('\n\n\n---------------------------------------------------------------------------------------------------------------------------------------------------')
    
# print(F'\n\nGranger tests summary statistics: (with maxlag = {MAXLAG})')
    
# print(f'• Number of patreon accounts analysed (patrons increase ratio > {incr_thresh_ratio}): {len(df_granger)}')
# print(f'• Number of patreon with no Granger-causal link: {len(not_granger)} ({len(not_granger)/len(df_granger):.0%})')

# print(f'• Number of patreon accounts per Granger-causal link:')

# # Converting granger dict into list of tuples (in order to sort it), the 2nd value of the tuple being the count of accounts
# granger_list = [(k, len(v)) for k, v in granger_dict.items()]
# # sort by count desc
# granger_list_desc = sorted(granger_list, key=lambda tup: -tup[1])
# for (k,v) in granger_list_desc:
#     print(f'    • {k[0]} \t--> {k[1]}:\t {v} ({v/len(df_granger):.0%})')


# df_granger[columns] = df_granger[columns].astype('Int64')
# df_granger