In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default theme to all subsequent matplotlib figures
sns.set()

# Input file locations (relative to the notebook's working directory)
# Comments table: tab-separated and gzip-compressed, read in chunks further down
comments_file_path = 'data/youtube_comments.tsv.gz'
# Parquet file with the authors to track (its 'author' column is used below)
top_commenters_file_path = 'data/top_commenters.parquet'
# Feather file with video metadata (only 'display_id' and 'categories' are used below)
video_metadata_file_path = 'data/yt_metadata_helper.feather'

#### Helper functions

In [34]:
def update_category_counts(chunk, top_commenters_df, video_metadata_df, comments_authors_cat_df):
    """
        Add the per-category comment counts found in `chunk` to comments_authors_cat_df.

        Parameters
        ----------
        chunk : DataFrame with at least the columns 'author' and 'video_id'
            (one row per comment).
        top_commenters_df : DataFrame with an 'author' column listing the authors to track.
        video_metadata_df : DataFrame with the columns 'video_id' and 'categories'.
        comments_authors_cat_df : accumulator with one '<category>_count' column per
            category. Authors are looked up in its index when the index is named
            'author', otherwise in its 'author' column.

        Returns
        -------
        The same comments_authors_cat_df object, updated in place.

        Raises
        ------
        KeyError if a matched comment belongs to a category that has no
        '<category>_count' column in comments_authors_cat_df.
    """

    # Keep only the comments written by tracked authors. Filtering with isin (rather
    # than an inner merge) counts every comment exactly once and avoids column clashes.
    is_tracked = chunk['author'].isin(top_commenters_df['author'])
    tracked_comments = chunk.loc[is_tracked, ['author', 'video_id']]

    # Attach the category of each commented video; comments on unknown videos are dropped
    matched = pd.merge(tracked_comments, video_metadata_df[['video_id', 'categories']],
                       on='video_id', how='inner')
    if matched.empty:
        return comments_authors_cat_df

    # Count the comments per (author, target column) in one pass instead of row by row
    matched = matched.assign(count_column=[f'{category}_count' for category in matched['categories']])
    increments = matched.groupby(['author', 'count_column']).size().unstack(fill_value=0)

    unknown_columns = [col for col in increments.columns if col not in comments_authors_cat_df.columns]
    if unknown_columns:
        raise KeyError(f"Missing count columns in comments_authors_cat_df: {unknown_columns}")

    # Locate the author ids of the accumulator rows
    if comments_authors_cat_df.index.name == 'author':
        author_keys = comments_authors_cat_df.index
    else:
        author_keys = pd.Index(comments_authors_cat_df['author'])

    # Align the increments with the accumulator rows (authors absent from this chunk get 0)
    target_columns = list(increments.columns)
    aligned = increments.reindex(author_keys).fillna(0).astype('int64').to_numpy()
    comments_authors_cat_df[target_columns] = comments_authors_cat_df[target_columns].to_numpy() + aligned

    return comments_authors_cat_df

#### Load dataset

In [35]:
# Load the top commenters and the videos' metadata.
# Only the two metadata columns used in this notebook are read from the feather file.
top_commenters_df = pd.read_parquet(top_commenters_file_path)
video_metadata_df = pd.read_feather(
    video_metadata_file_path, columns=['display_id', 'categories']
).rename(columns={'display_id': 'video_id'})

# Init comments_authors_cat_df: one row per top commenter, with the real author id in
# the 'author' column and every counter starting at 0.
# NOTE(review): 'number_comm' is only initialised here; no visible cell fills it — TODO confirm.
categories = video_metadata_df['categories'].unique()
columns = ['author', 'number_comm'] + [f'{cat}_count' for cat in categories]
comments_authors_cat_df = pd.DataFrame(0, index=pd.RangeIndex(len(top_commenters_df)), columns=columns)
# Fill the ids positionally so the column is not left as a zero-filled placeholder
comments_authors_cat_df['author'] = top_commenters_df['author'].to_numpy()

display(comments_authors_cat_df.head())

KeyboardInterrupt: 

In [None]:
# Chunk size
chunk_size = 10**6

# Total number of comments assumed when reporting progress (used for the printout only)
total_comments = 8.6e9
# Print a progress message each time another 20% of total_comments has been read
progress_step = 20
next_progress = progress_step

verbose_count = 0
# Pass and loop: only the two columns needed for the category counts are parsed
for chunk in pd.read_csv(comments_file_path, sep='\t', compression='gzip',
                         usecols=['author', 'video_id'], chunksize=chunk_size):

    comments_authors_cat_df = update_category_counts(chunk, top_commenters_df, video_metadata_df, comments_authors_cat_df)

    # Verbose: count rows (not the DataFrame itself) and report when a milestone is crossed
    verbose_count += len(chunk)
    percentage_covered = (verbose_count / total_comments) * 100
    if percentage_covered >= next_progress:
        print(f"Percentage of comments covered: {percentage_covered:.2f}%")
        # Move past every milestone that has already been reached
        next_progress = (percentage_covered // progress_step + 1) * progress_step

# Reset index
# When the author ids live in the index, copy them into the 'author' column first;
# a plain reset_index() would fail if a column with that name already exists.
if comments_authors_cat_df.index.name == 'author':
    comments_authors_cat_df['author'] = comments_authors_cat_df.index
comments_authors_cat_df = comments_authors_cat_df.reset_index(drop=True)

In [None]:
# Safety check: print a summary of the result (index, columns, dtypes, memory usage)
comments_authors_cat_df.info()

In [None]:
# Persist the per-author category counts as a parquet file
output_file_path = 'data/comments_authors_categories.parquet'
comments_authors_cat_df.to_parquet(output_file_path)