# Characterizing Patronage on YouTube

## Scripts

In [None]:
import os 
import io
import pandas as pd
import json
import re
import zstandard
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import seaborn as sns
import gzip
from tqdm import tqdm
import timeit

In [None]:
# root folder of the input datasets: the cells below read
# yt_metadata_en.jsonl.gz and final_processed_file.jsonl.gz from it
DATA_FOLDER = "/dlabdata1/youtube_large/"

### 1. Filter YouTube metadata to keep only videos whose description contains a patreon id

In [None]:
# get uncompressed file size - problem: returns a negative ratio
# !gzip -l /dlabdata1/youtube_large/yt_metadata_en.jsonl.gz

In [None]:
# print(2**10, "Bytes = 1 KB")
# print(2**20, "Bytes = 1 MB")
# print(2**30, "Bytes = 1 GB")

In [None]:
def json_escape(str):
    """
    Return the given text with every escaped new line sequence
    (a backslash followed by the letter n) replaced by a single space.

    Real line-feed characters are left untouched.
    """
    escaped_new_line, blank = "\\n", " "
    return str.replace(escaped_new_line, blank)

In [None]:
# Stream the YouTube metadata dump line by line and retain only the records
# whose description contains a patreon url. The first url found is stored in
# a new 'patreon_id' field of the retained record.
input_file_path = os.path.join(DATA_FOLDER, "yt_metadata_en.jsonl.gz")

# MAX_ITER = 10_000

nb_rows_read = 0
JSONDecodeErrors_cnt = 0
lines_json = []

# match "patreon.com/" followed by at least 1 non-whitespace character
# (the dot is escaped so that it only matches a literal '.')
pattern = re.compile(r'patreon\.com/[^\s]+')


compressed_file_size = os.stat(input_file_path).st_size
print("Compressed file size is :                 {:>3,.2f} GB".format(compressed_file_size / 2**30))

# hard-coded estimate, only used as the total of the progress bar
uncompressed_file_size = 97_600_000_000
print("Estimated Uncompressed file size is :     {:>3,.2f} GB".format(uncompressed_file_size / 2**30))

start = timeit.default_timer()

# Load tqdm with size counter instead of file counter
with tqdm(total=uncompressed_file_size, unit='B', unit_scale=True, unit_divisor=1024) as pbar, \
        gzip.open(input_file_path, "rb") as f:

    # the file name shown next to the bar never changes: set it once
    pbar.set_postfix(file=os.path.basename(input_file_path), refresh=False)

    for line_byte in f:
        # progress is measured in uncompressed bytes read so far
        pbar.update(len(line_byte))
        nb_rows_read += 1

        # set a maximum iteration for tests
        # if nb_rows_read >= MAX_ITER:
        #     break

        # convert bytes into string and escape new line sequences before parsing
        line_str_esc = json_escape(line_byte.decode("utf-8"))
        try:
            line_json = json.loads(line_str_esc)
        except json.JSONDecodeError:
            # skip the row: falling through would re-test the previously
            # parsed record (or hit an undefined name on the very first row)
            JSONDecodeErrors_cnt += 1
            continue

        # add line if description contains a patreon.com id
        # (a single search is enough: its match is the first url found)
        patreon_match = pattern.search(line_json['description'])
        if patreon_match:
            line_json['patreon_id'] = patreon_match.group(0)
            lines_json.append(line_json)

stop = timeit.default_timer()
time_diff = stop - start

# avoid a ZeroDivisionError in the ratios below when no row was read
nb_rows_for_ratio = max(nb_rows_read, 1)

print()
print("==> total time to read and filter youtube metadata:                {:>10.0f} min. ({:.0f}s.)".format(time_diff/60, time_diff)) 
print("==> number of rows read:                                           {:>10,}".format(nb_rows_read))
print("==> number of videos containing a patreon link in the description: {:>10,} ({:.3%})".format(len(lines_json), len(lines_json)/nb_rows_for_ratio ))
print("==> number of skipped rows (JSONDecodeErrors):                     {:>10,} ({:.3%})".format(JSONDecodeErrors_cnt, JSONDecodeErrors_cnt/nb_rows_for_ratio))

# create new dataframe with the filtered lines
df_yt_metadata_pt = pd.DataFrame(data=lines_json, index=None)

# calculate memory usage of the new dataframe
mem_cons = df_yt_metadata_pt.memory_usage(index=True).sum()
print("==> memory usage of new (filtered) dataframe:                      {:12,.2f} GB ({:,} bytes)".format(mem_cons / 2**30, mem_cons))

In [None]:
# preview the first 2 rows of the filtered YouTube metadata
df_yt_metadata_pt.head(2)

In [None]:
# list all files in DATA_FOLDER
# !ls -lh /dlabdata1/youtube_large

In [None]:
# list all files in the parent directory (used as local scratch folder)
!ls -lh ../

In [None]:
# write the filtered YouTube metadata to the local scratch folder (parent
# directory) as a gzip-compressed, tab-separated file without the index column
output_file_path = "../yt_metadata_en_pt.tsv.gz"
df_yt_metadata_pt.to_csv(
    output_file_path,
    sep='\t',
    index=False,
    compression='gzip',
)

In [None]:
# save filtered data to LOCAL SCRATCH FOLDER as feather - PROBLEM: KERNEL DIES
# output_file_path = "../yt_metadata_en_pt.feather"
# df_yt_metadata_pt.to_feather(output_file_path)

In [None]:
!ls -lh "../yt_metadata_en_pt.tsv.gz"

In [None]:
# # read feather file
# df_yt_metadata_pt = pd.read_feather(output_file_path)
# df_yt_metadata_pt

In [None]:
# reload the filtered YouTube metadata from the compressed tsv (takes about 2 mins)
# for a quick test, read only the first rows:
# df_yt_metadata_pt = pd.read_csv('../yt_metadata_en_pt.tsv.gz', sep="\t", lineterminator='\n', compression='gzip', nrows=100_000)
df_yt_metadata_pt = pd.read_csv('../yt_metadata_en_pt.tsv.gz', sep="\t", lineterminator='\n', compression='gzip')
df_yt_metadata_pt

### 2. Filter Graphtreon to keep only the records whose patreon id exists in the YouTube metadata

Read filtered youtube metadata file from disk... 

In [None]:
!ls -lh "../yt_metadata_en_pt.tsv.gz"

In [None]:
# load the gzip-compressed, tab-separated filtered YouTube metadata back
# into a dataframe (takes about 2 mins) and preview its first 3 rows
df_yt_metadata_pt = pd.read_csv(
    "../yt_metadata_en_pt.tsv.gz",
    compression='gzip',
    sep="\t",
    lineterminator='\n',
)
df_yt_metadata_pt.head(3)

In [None]:
# report how many rows (videos) the filtered YouTube metadata holds
print(
    "[YouTube metadata] number of videos that contain a patreon link in description:      {:>10,}".format(
        df_yt_metadata_pt.shape[0]
    )
)

In [None]:
# collect the distinct values of the 'patreon_id' column of df_yt_metadata_pt
# and report how many there are
yt_patreon_list = df_yt_metadata_pt["patreon_id"].unique()
print(
    "[Filtered YouTube metadata] total number of unique patreon ids:                       {:>9,}".format(
        len(yt_patreon_list)
    )
)

In [None]:
!ls -lh /dlabdata1/youtube_large/final_processed_file.jsonl.gz

In [None]:
def json_escape(str):
    """
    Swap each escaped new line sequence (backslash + 'n') found in the
    given text for one space and return the result.

    Actual line-feed characters are not modified.
    """
    cleaned = str.replace("\\n", " ")
    return cleaned

In [None]:
# Stream the Graphtreon time series dump line by line and retain only the
# records whose 'patreon' field is one of the patreon ids found in the
# filtered YouTube metadata (yt_patreon_list).
input_file_path = os.path.join(DATA_FOLDER, "final_processed_file.jsonl.gz")

# MAX_ITER = 1_000

nb_rows_read = 0
JSONDecodeErrors_cnt = 0
lines_json = []

# yt_patreon_list is tested once per row: hashing it a single time gives
# constant-time lookups instead of a full scan of the ids for every row
yt_patreon_set = set(yt_patreon_list)

compressed_file_size = os.stat(input_file_path).st_size
print("Compressed file size is :                 {:>3,.2f} GB".format(compressed_file_size / 2**30))

# hard-coded estimate (~12.4 GB), only used as the total of the progress bar
uncompressed_file_size = 13_310_000_000
print("Estimated Uncompressed file size is :     {:>3,.2f} GB".format(uncompressed_file_size / 2**30))

start = timeit.default_timer()

# Load tqdm with size counter instead of file counter
with tqdm(total=uncompressed_file_size, unit='B', unit_scale=True, unit_divisor=1024) as pbar, \
        gzip.open(input_file_path, "rb") as f:

    # the file name shown next to the bar never changes: set it once
    pbar.set_postfix(file=os.path.basename(input_file_path), refresh=False)

    for line_byte in f:
        # progress is measured in uncompressed bytes read so far
        pbar.update(len(line_byte))
        nb_rows_read += 1

        # set a maximum iteration for tests
        # if nb_rows_read >= MAX_ITER:
        #     break

        # convert bytes into string and escape new line sequences before parsing
        line_str_esc = json_escape(line_byte.decode("utf-8"))
        try:
            line_json = json.loads(line_str_esc)
        except json.JSONDecodeError:
            # skip the row: falling through would re-test the previously
            # parsed record (or hit an undefined name on the very first row)
            JSONDecodeErrors_cnt += 1
            continue

        # add line if its patreon id exists in df_yt_metadata_pt
        if line_json['patreon'] in yt_patreon_set:
            lines_json.append(line_json)

stop = timeit.default_timer()
time_diff = stop - start

# avoid a ZeroDivisionError in the ratio below when no row was read
nb_rows_for_ratio = max(nb_rows_read, 1)

print()
print("==> total time to read and filter graphtreon time series:          {:>10.0f} min. ({:.0f}s.)".format(time_diff/60, time_diff)) 
print("==> number of rows read:                                           {:>10,}".format(nb_rows_read))
print("==> number of patreon ids that exist in both GTts and YT metadata: {:>10,} ({:.2%})".format(len(lines_json), len(lines_json)/nb_rows_for_ratio ))
print("==> number of skipped rows (JSONDecodeErrors):                     {:>10,}".format(JSONDecodeErrors_cnt))

# create new dataframe with the filtered lines
df_gt_timeseries_filtered = pd.DataFrame(data=lines_json)

# calculate memory usage of the new dataframe
mem_cons = df_gt_timeseries_filtered.memory_usage(index=True).sum()
print("==> memory usage of new (filtered) dataframe:                      {:12,.2f} MB ({:,} bytes)".format(mem_cons / 2**20, mem_cons))

In [None]:
# preview the first row of the filtered Graphtreon time series
df_gt_timeseries_filtered.head(1)

In [None]:
# save filtered data to disk as feather - ERROR
# output_file_path = "df_gt_timeseries_filtered.feather"
# df_gt_timeseries_filtered.to_feather(output_file_path)

In [None]:
# write the filtered Graphtreon time series to the local scratch folder
# (parent directory) as a gzip-compressed, tab-separated file without the
# index column
output_file_path = "../df_gt_timeseries_filtered.tsv.gz"
df_gt_timeseries_filtered.to_csv(
    output_file_path,
    sep='\t',
    index=False,
    compression='gzip',
)

In [None]:
# !ls -lh yt_metadata_en_pt.tsv.gz
!ls -lh ../df_gt_timeseries_filtered.tsv.gz