In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

# Path setting
comments_file_path = 'data/youtube_comments.tsv.gz'
top_commenters_file_path = 'data/top_commenters.parquet'
video_metadata_file_path = 'data/yt_metadata_helper.parquet'

#### Helper functions

In [2]:
def update_category_counts(chunk, top_commenters_df, video_metadata_df, comments_authors_cat_df):
    """
        Function to update the comments_authors_cat_df DataFrame
    """

    # Merge chunk to get author id match and video id match
    chunk_merged = pd.merge(chunk, top_commenters_df, on='author', how='inner')
    print('merged 1')
    chunk_merged = pd.merge(chunk, video_metadata_df, on='video_id', how='inner')

    # Iterate over rows and update comments_authors_cat_df
    for _, row in chunk_merged.iterrows():
        author_id = row['author']
        category = row['categories']
        comments_authors_cat_df.loc[comments_authors_cat_df['author'] == author_id, f'{category}_count'] += 1

    return comments_authors_cat_df

#### Load dataset

In [4]:
# Get Top comments and Videos's metadata
top_commenters_df = pd.read_parquet(top_commenters_file_path)
video_metadata_df = pd.read_parquet(video_metadata_file_path)

In [6]:
# Init comments_authors_cat_df
categories = video_metadata_df['categories'].unique()
columns = ['author', 'number_comm'] + [f'{cat}_count' for cat in categories]
data = {'author': top_commenters_df['author'], 'number_comm': top_commenters_df['number_comm']}
data.update({f'{cat}_count': 0 for cat in categories})

comments_authors_cat_df = pd.DataFrame(data, columns=columns)
display(comments_authors_cat_df.head())

Unnamed: 0,author,number_comm,Film & Animation_count,Gaming_count,Education_count,People & Blogs_count,Entertainment_count,Autos & Vehicles_count,Comedy_count,Sports_count,News & Politics_count,Music_count,Howto & Style_count,Science & Technology_count,Travel & Events_count,Pets & Animals_count,Nonprofits & Activism_count,_count,Shows_count,Movies_count
0,157898414,352020,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,309188934,285379,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,133567048,255775,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,472447001,202561,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,224828975,193858,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [7]:
# Chunk size
chunk_size = 10**5  

verbose_count = 0
# Pass and loop
for chunk in pd.read_csv(comments_file_path, sep='\t', compression='gzip', chunksize=chunk_size):
    
    comments_authors_cat_df = update_category_counts(chunk, top_commenters_df, video_metadata_df, comments_authors_cat_df)
    print(1)
    # Verbose
    verbose_count += chunk
    percentage_covered = (verbose_count / 8.6e9) * 100
    if percentage_covered % 10 == 0:
        print(f"Percentage of comments covered: {percentage_covered:.2f}%")

# Reset index 
comments_authors_cat_df.reset_index(inplace=True)

merged 1


KeyboardInterrupt: 

In [None]:
# Safety check
comments_authors_cat_df.info()

In [None]:
# Save it
comments_authors_cat_df.to_parquet(path='data/comments_authors_categories.parquet')