In [None]:
import pandas as pd
import json
import scipy
import gzip
from tqdm import tqdm

from datetime import datetime, date, time

import matplotlib
import matplotlib.pyplot as plt

In [None]:
# Mental-health related keywords. Each entry is a regular-expression fragment
# that is searched for (as a substring) in lower-cased text fields.
word_lists = [
    "mental health",
    "mental illness",
    "solitude",
    "alone",
    "lonely",
    "loneliness",
    "depress",  # stem: also matches "depression", "depressed", "depressing"
    "stress",  # stem: also matches "stressing", "stressed"
    "anxiety",
    "anxious",
    "suicide",
    "suicidal",
    "trauma",
    "ptsd",
    # "emo" only as a standalone token (not inside "demo", "lemon", ...).
    # Lookarounds are used instead of consuming a neighbouring character so the
    # token is also found at the very start or end of a field.
    r"(?<![a-z0-9])emo(?![a-z0-9])",
]

# Single alternation pattern: a text matches if it contains any of the keywords.
key_words = "|".join(word_lists)

In [None]:
##### metadata #####

def df_filter(metadata: list, pattern: "str | None" = None, min_matches: int = 2) -> pd.DataFrame:
    """
    Build a DataFrame from raw metadata records and keep the keyword-related rows.

    Each of the text fields (description, tags, title) is lower-cased and
    searched for the keyword pattern. A record is kept when at least
    `min_matches` of those fields contain a keyword.

    Parameters
    ----------
    metadata : list
        Records (dicts) that are turned into the rows of the DataFrame.
    pattern : str, optional
        Regular expression to search for. Defaults to the module-level
        `key_words` pattern.
    min_matches : int, default 2
        Minimum number of text fields that must match. Use 3 to require a
        match in every field.

    Returns
    -------
    pd.DataFrame
        An independent copy of the rows that pass the filter (possibly empty).
    """
    if pattern is None:
        pattern = key_words

    df_meta = pd.DataFrame(metadata)
    if len(df_meta) == 0:
        # Nothing to filter; avoids a KeyError on the missing text columns.
        return df_meta.copy()

    match_count = pd.Series(0, index=df_meta.index, dtype="int64")
    for field in ("description", "tags", "title"):
        if field not in df_meta.columns:
            # A field absent from the whole batch simply contributes no match.
            continue
        # Missing values become empty strings so they count as "no match"
        # instead of producing NaN, which cannot be cast to int.
        text = df_meta[field].fillna("").astype(str).str.lower()
        match_count = match_count + text.str.contains(pattern, regex=True).astype(int)

    return df_meta[match_count >= min_matches].copy()


metadata = []      # filtered DataFrames, one per processed batch
tmp_metadata = []  # raw records of the batch currently being accumulated

# Specify the file path and batch size
file_path = './data/yt_metadata_en.jsonl.gz'
batch = 200000

# The dump is streamed line by line and filtered in batches, so only `batch`
# raw records are buffered at any time.
with gzip.open(file_path, mode="rb") as f:
    # `total` only sizes the progress bar; presumably the line count of the dump.
    for row in tqdm(f, total=72924794):
        if not row.strip():
            # Tolerate blank lines instead of failing in json.loads.
            continue
        tmp_metadata.append(json.loads(row.decode("utf-8")))

        if len(tmp_metadata) >= batch:
            metadata.append(df_filter(tmp_metadata))
            tmp_metadata = []

# Flush the last, partially filled batch.
if tmp_metadata:
    metadata.append(df_filter(tmp_metadata))
    tmp_metadata = []

# pd.concat raises on an empty list, so fall back to an empty frame.
# ignore_index avoids duplicated row labels coming from the per-batch indexes.
df_meta_final = pd.concat(metadata, ignore_index=True) if metadata else pd.DataFrame()

In [None]:
# Persist the filtered records as CSV; the DataFrame index is not written.
df_meta_final.to_csv(path_or_buf="./metadata_more.csv", index=False)

In [None]:
# NOTE(review): this reads "./metadata.csv", while the export cell writes
# "./metadata_more.csv" — confirm this is the intended input file.
df_meta = pd.read_csv("./metadata.csv")
df_meta["upload_date"] = pd.to_datetime(df_meta["upload_date"])

# Monthly period (e.g. 2019-03) used to bucket the records.
df_meta["upload_month"] = df_meta["upload_date"].dt.to_period("M")

# Bar chart of the number of records per upload month, in chronological order.
fig, ax = plt.subplots(figsize=(10, 6))
df_meta["upload_month"].value_counts().sort_index().plot(kind="bar", color="skyblue", ax=ax)
ax.set(title="Videos per upload month", xlabel="Upload month", ylabel="Number of videos")
fig.tight_layout()

In [None]:
# Keep a single row per (channel, month): after sorting by upload date,
# keep="first" retains the earliest record of each channel in each month.
df_meta_channel = (
    df_meta.sort_values(by="upload_date")
    .drop_duplicates(subset=["channel_id", "upload_month"], keep="first")
    .copy()
)

# Bar chart of the number of distinct channels with at least one record per month.
fig, ax = plt.subplots(figsize=(10, 6))
df_meta_channel["upload_month"].value_counts().sort_index().plot(kind="bar", color="skyblue", ax=ax)
ax.set(
    title="Channels with at least one video per upload month",
    xlabel="Upload month",
    ylabel="Number of channels",
)
fig.tight_layout()