In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import itertools
import pickle
import random
import gzip
import json
import zstd
import polars as pl

# No. of Comments Posted Under a Single Video by Each Author

It creates dataset in "data_type1" which contains '***suspicious***' users that posted more than 8 comments (mean+2x*std*) and '***normal***' users that posted less than 5 comments (mean+1x*std*) under a single video


**NOTE:**
- Most of the users in the provided comments dataset have only a single comments. So, we had to filter out active users.
- We considered normal users with at least a total of 100 comments in lifetime to have sufficient data to distinguish between regular active users and bots.
- We choose 100 as the threshold since mean of total comments per user is 19 with S.D of 140. So, to consider active normal users we set a threshold of comments count going above around half a S.D over mean (~100). 
- This gives us a good conservative threshold to proceed easily with comparative data analysis of bots vs normal active users since comments data is huge containing mostly users with a single comment in lifetime.

In [3]:
# Load YouTube Videos Metadata
df_metadata_feather = pd.read_feather("./data/yt_metadata_helper.feather")
df_metadata_feather.rename(columns={"display_id":"video_id"}, inplace=True)

# Convert df_metadata_feather to a Polars DataFrame
df_metadata_polars = pl.from_pandas(df_metadata_feather).with_columns(
    pl.col("upload_date").dt.date().alias("upload_date")  
)

# remove Pandas df from memory since not used
del df_metadata_feather 

df_metadata_polars.head()

categories,channel_id,dislike_count,video_id,duration,like_count,upload_date,view_count
str,str,f64,str,i64,f64,date,f64
"""Film & Animation""","""UCzWrhkg9eK5I8Bm3HfV-unA""",1.0,"""SBqSc91Hn9g""",1159,8.0,2016-09-28,1057.0
"""Film & Animation""","""UCzWrhkg9eK5I8Bm3HfV-unA""",1.0,"""UuugEl86ESY""",2681,23.0,2016-09-28,12894.0
"""Film & Animation""","""UCzWrhkg9eK5I8Bm3HfV-unA""",779.0,"""oB4c-yvnbjs""",1394,1607.0,2016-09-28,1800602.0
"""Film & Animation""","""UCzWrhkg9eK5I8Bm3HfV-unA""",24.0,"""ZaV-gTCMV8E""",5064,227.0,2016-09-28,57640.0
"""Film & Animation""","""UCzWrhkg9eK5I8Bm3HfV-unA""",13.0,"""cGvL7AvMfM0""",3554,105.0,2016-09-28,86368.0


In [4]:
# Create filtered authors df (active users with at least 100 total comments in lifetime)
threshold_comments = 100
num_comments_authors_orignial = pd.read_csv("./data/num_comments_authors.tsv.gz", compression="infer", sep="\t")
active_users_df = num_comments_authors_orignial[num_comments_authors_orignial["video_id"]>threshold_comments]
active_users_df_polars = pl.from_pandas(active_users_df[["author"]])

del active_users_df # remove Pandas df from memory since not used

before_size = num_comments_authors_orignial.shape[0]
after_size = active_users_df_polars.shape[0]
print(f"before:\t{before_size} users\nafter:\t{after_size} users\nchange:\t{before_size/after_size:.1f} times")

before:	448810483 users
after:	15386449 users
change:	29.2 times


### Create Dataset for Normal Active Users (Sample)

In [None]:
%%time
# Create dataset for sampled normal users (with comments per video < threshold AND >100 comments in lifetime)

import os
import time

i = 0
chunksize = 100_000_000
processed_rows= 0
start_time = time.time()  # Record the start time
file_path = f"normal_users.parquet"

threshold = 5 # threshold = mean + 1*(std)

file_exists = os.path.exists(file_path) 
print(f"'{file_path}' exists? {file_exists}")

'''
NOTE: total_rows in youtube_comments.tsv.gz = 8_610_000_000
'''

for df in  pd.read_csv("./data/youtube_comments.tsv.gz", compression="infer", sep="\t", chunksize=chunksize): # chunksize = # of rows
    print("\nprocessing df chunk...")
    try:
        q = (
            pl.from_pandas(df)
            .lazy()  # Convert to lazy frame
            .group_by(['author', 'video_id'])
            .agg(
                comments=pl.col('video_id').count(),  # Count of video_id for each group
                likes=pl.col('likes').sum(),         # Sum of likes for each group
                replies=pl.col('replies').sum()      # Sum of replies for each group
            )
        )
    except Exception as e:
        print(f"Error encountered: {e}")
        print("Attempting to coerce non-numeric values and retry...")
        
        # Convert specific columns to numeric, coercing errors to NaN
        df['replies'] = pd.to_numeric(df['replies'], errors='coerce', downcast="integer").dropna().astype('int64')
        df['likes'] = pd.to_numeric(df['likes'], errors='coerce', downcast="integer").dropna().astype('int64')

        # Retry loading into Polars
        q = (
            pl.from_pandas(df)
            .lazy()
            .group_by(['author', 'video_id'])
            .agg(
                comments=pl.col('video_id').count(),
                likes=pl.col('likes').sum(),
                replies=pl.col('replies').sum()
            )
        )
    
    df_polars = q.collect()

    # filter normal users with comments per day < threshold
    df_polars = q.collect().filter(pl.col('comments') < threshold)

    # filter active users (with at least 100 comments in lifetime)
    df_polars = df_polars.join(active_users_df_polars, on="author", how="inner")

    # add metadata info 
    df_polars = df_polars.join(df_metadata_polars, on="video_id", how="inner")
    
    df_polars = df_polars.with_columns([
        pl.col("replies").cast(pl.Int64)
    ])
    df_polars = df_polars.with_columns([
        pl.col("likes").cast(pl.Int64)
    ])

    # Write the first chunk or append the next chunks
    if not file_exists:
        # Write the first chunk with header
        df_polars.write_parquet(file_path, compression='zstd')
        print(f"'{file_path}' saved.")
        file_exists = True
    else:
        # For subsequent chunks, load the existing Feather file, & overwrite it with updated data (append don't work with feather)
        print(f"reading '{file_path}'...")
        df_existing = pl.read_parquet(file_path, low_memory=True)
        df_combined = pl.concat([df_existing, df_polars])
        df_combined.write_parquet(file_path, compression='zstd')
        print(f"saved '{file_path}'. ")

    # Track the number of processed rows
    processed_rows += chunksize

    # Display progress
    print(f"iteration:{i+1}, processed_rows: {processed_rows:_}")
    i += 1

    current_time = time.time()     # Record the end time
    elapsed_time = current_time - start_time
    elapsed_hours = int(elapsed_time // 3600)
    elapsed_minutes = int((elapsed_time % 3600) // 60)
    elapsed_seconds = int(elapsed_time % 60)
    print(f"Elapsed time: {elapsed_hours:02}:{elapsed_minutes:02}:{elapsed_seconds:02}\n")


In [5]:
# Load dataset we just created of normal active users
normal_users = pl.read_parquet("normal_users.parquet")
normal_users

author,video_id,comments,likes,replies,categories,channel_id,dislike_count,duration,like_count,upload_date,view_count
i64,str,u32,i64,i64,str,str,f64,i64,f64,date,f64
3557393,"""PKk4m4ayicc""",1,4,3,"""Film & Animation""","""UCzWrhkg9eK5I8Bm3HfV-unA""",3747.0,2324,9691.0,2016-04-29,7.782766e6
834253,"""T8zHka4w-uw""",1,0,0,"""Film & Animation""","""UCzWrhkg9eK5I8Bm3HfV-unA""",430.0,10087,1304.0,2017-12-30,849965.0
594074,"""hn2zYwqSINY""",1,0,1,"""Gaming""","""UCzWrhkg9eK5I8Bm3HfV-unA""",1561.0,1146,6118.0,2017-12-07,4.028426e6
724503,"""UTZLSHaE4Sw""",1,0,0,"""Gaming""","""UCzWrhkg9eK5I8Bm3HfV-unA""",206.0,1706,2235.0,2017-11-18,585746.0
1328548,"""6vY6iO0nCLo""",2,0,0,"""Comedy""","""UCzWoOpdzQ-Lwv3ZiWGMDbGg""",4.0,58,537.0,2014-03-23,39850.0
…,…,…,…,…,…,…,…,…,…,…,…
33000590,"""BQGhbm7ygkY""",1,0,0,"""News & Politics""","""UCrwE8kVqtIUVUzKui2WVpuQ""",6224.0,1213,234072.0,2019-08-06,6.885147e6
33278446,"""BQGhbm7ygkY""",2,0,0,"""News & Politics""","""UCrwE8kVqtIUVUzKui2WVpuQ""",6224.0,1213,234072.0,2019-08-06,6.885147e6
33454175,"""BQGhbm7ygkY""",1,5,0,"""News & Politics""","""UCrwE8kVqtIUVUzKui2WVpuQ""",6224.0,1213,234072.0,2019-08-06,6.885147e6
27226318,"""zVg8uqM4TXs""",1,2,0,"""News & Politics""","""UCrwE8kVqtIUVUzKui2WVpuQ""",137.0,193,1808.0,2017-02-13,99581.0


In [27]:
# Unique authors per year in sampled normal users dataset
df = normal_users.clone()
result = (
    df.with_columns(pl.col("upload_date").dt.year().alias("year"))
    .group_by("year")
    .agg(pl.col("author").n_unique().alias("unique_authors"))
)

print(result)

shape: (15, 2)
┌──────┬────────────────┐
│ year ┆ unique_authors │
│ ---  ┆ ---            │
│ i32  ┆ u32            │
╞══════╪════════════════╡
│ 2010 ┆ 336004         │
│ 2007 ┆ 115234         │
│ 2013 ┆ 638841         │
│ 2016 ┆ 812782         │
│ 2019 ┆ 716328         │
│ …    ┆ …              │
│ 2015 ┆ 763894         │
│ 2018 ┆ 788401         │
│ 2009 ┆ 329552         │
│ 2006 ┆ 43929          │
│ 2012 ┆ 532896         │
└──────┴────────────────┘


In [30]:
# Display normal users dataset sorted by author id
normal_users_sorted = normal_users.sort(by="author")
normal_users_sorted

author,video_id,comments,likes,replies,categories,channel_id,dislike_count,duration,like_count,upload_date,view_count
i64,str,u32,i64,i64,str,str,f64,i64,f64,date,f64
16,"""uAaxjufrDjM""",1,0,0,"""Gaming""","""UCzNhowpzT4AwyIW7Unk_B5Q""",73.0,3309,2690.0,2018-12-10,228886.0
16,"""8Ig-FoKo15M""",1,4,0,"""Entertainment""","""UCrp8aFu6VjkZAY9Hhj6IrXA""",269.0,135,4658.0,2017-12-03,224938.0
16,"""YIRZtGvq694""",1,0,0,"""People & Blogs""","""UCrCkLipGria4UloMpvNil5g""",1664.0,702,25165.0,2018-08-13,563268.0
16,"""HJyfpL-O1WU""",1,0,0,"""People & Blogs""","""UCrCkLipGria4UloMpvNil5g""",9703.0,216,106212.0,2018-05-19,5.649267e6
16,"""NMCk4cVBr4I""",1,0,0,"""Music""","""UCrCkLipGria4UloMpvNil5g""",181088.0,221,382158.0,2017-10-06,2.9257455e7
…,…,…,…,…,…,…,…,…,…,…,…
33481868,"""lN-rVlMIJZs""",1,0,0,"""People & Blogs""","""UCsvn_Po0SmunchJYOWpOxMg""",3157.0,421,225249.0,2016-04-21,1.1922035e7
33481868,"""vlo0-GDi-qc""",2,1,1,"""People & Blogs""","""UCsvn_Po0SmunchJYOWpOxMg""",3488.0,337,104142.0,2016-04-01,5.782411e6
33481868,"""g_FB4Bw99KU""",3,5,2,"""Gaming""","""UCsvn_Po0SmunchJYOWpOxMg""",1350.0,313,153964.0,2016-03-13,8.625662e6
33481868,"""IR0-o1bqaig""",1,0,0,"""Gaming""","""UCsvn_Po0SmunchJYOWpOxMg""",1268.0,376,102792.0,2015-07-31,8.023453e6


### Create Dataset for Suspicious Users (Type-1)

In [15]:
%%time
# Filter original youtube dataset to only contain suspicious users with comments per video > threshold

import os
import time

i = 0
chunksize = 100_000_000
processed_rows= 0
start_time = time.time()  # Record the start time
file_path = f"youtube_comments_filtered.parquet"

threshold = 8.67 # threshold = mean + 2*(std) = 1.32 + 2*(3.67) = 8.67

file_exists = os.path.exists(file_path) 
print(f"'{file_path}' exists? {file_exists}")

'''
NOTE: total_rows in youtube_comments.tsv.gz = 8_610_000_000
'''

for df in  pd.read_csv("./data/youtube_comments.tsv.gz", compression="infer", sep="\t", chunksize=chunksize): # chunksize = # of rows
    print("\nprocessing df chunk...")
    try:
        q = (
            pl.from_pandas(df)
            .lazy()  # Convert to lazy frame
            .group_by(['author', 'video_id'])
            .agg(
                comments=pl.col('video_id').count(),  # Count of video_id for each group
                likes=pl.col('likes').sum(),         # Sum of likes for each group
                replies=pl.col('replies').sum()      # Sum of replies for each group
            )
        )
    except Exception as e:
        print(f"Error encountered: {e}")
        print("Attempting to coerce non-numeric values and retry...")
        
        # Convert specific columns to numeric, coercing errors to NaN
        df['replies'] = pd.to_numeric(df['replies'], errors='coerce', downcast="integer").dropna().astype('int64')
        df['likes'] = pd.to_numeric(df['likes'], errors='coerce', downcast="integer").dropna().astype('int64')

        # Retry loading into Polars
        q = (
            pl.from_pandas(df)
            .lazy()
            .group_by(['author', 'video_id'])
            .agg(
                comments=pl.col('video_id').count(),
                likes=pl.col('likes').sum(),
                replies=pl.col('replies').sum()
            )
        )
    
    df_polars = q.collect()
    
    # filter authors with comments per day > threshold
    df_polars = q.collect().filter(pl.col('comments') > threshold)

    df_polars = df_polars.with_columns([
        pl.col("replies").cast(pl.Int64)
    ])
    df_polars = df_polars.with_columns([
        pl.col("likes").cast(pl.Int64)
    ])

    # Write the first chunk or append the next chunks
    if not file_exists:
        # Write the first chunk with header
        df_polars.write_parquet(file_path, compression='zstd')
        print(f"'{file_path}' saved.")
        file_exists = True
    else:
        # For subsequent chunks, load the existing Feather file, & overwrite it with updated data (append don't work with feather)
        print(f"reading '{file_path}'...")
        df_existing = pl.read_parquet(file_path, low_memory=True)
        df_combined = pl.concat([df_existing, df_polars])
        df_combined.write_parquet(file_path, compression='zstd')
        print(f"saved '{file_path}'. ")

    
    # Track the number of processed rows
    processed_rows += chunksize

    # Display progress
    print(f"iteration:{i+1}, processed_rows: {processed_rows:_}")
    i += 1

    current_time = time.time()     # Record the end time
    elapsed_time = current_time - start_time
    elapsed_hours = int(elapsed_time // 3600)
    elapsed_minutes = int((elapsed_time % 3600) // 60)
    elapsed_seconds = int(elapsed_time % 60)
    print(f"Elapsed time: {elapsed_hours:02}:{elapsed_minutes:02}:{elapsed_seconds:02}\n")


'youtube_comments_filtered.parquet' exists? False

processing df chunk...
'youtube_comments_filtered.parquet' saved.
iteration:1, processed_rows: 100_000_000
Elapsed time: 00:01:28


processing df chunk...
reading 'youtube_comments_filtered.parquet'...
saved 'youtube_comments_filtered.parquet'. 
iteration:2, processed_rows: 200_000_000
Elapsed time: 00:02:59


processing df chunk...
reading 'youtube_comments_filtered.parquet'...
saved 'youtube_comments_filtered.parquet'. 
iteration:3, processed_rows: 300_000_000
Elapsed time: 00:04:26


processing df chunk...
reading 'youtube_comments_filtered.parquet'...
saved 'youtube_comments_filtered.parquet'. 
iteration:4, processed_rows: 400_000_000
Elapsed time: 00:05:59


processing df chunk...
reading 'youtube_comments_filtered.parquet'...
saved 'youtube_comments_filtered.parquet'. 
iteration:5, processed_rows: 500_000_000
Elapsed time: 00:07:29


processing df chunk...
reading 'youtube_comments_filtered.parquet'...
saved 'youtube_comments_fil




processing df chunk...
Error encountered: Could not convert '2' with type str: tried to convert to int64
Attempting to coerce non-numeric values and retry...
reading 'youtube_comments_filtered.parquet'...
saved 'youtube_comments_filtered.parquet'. 
iteration:64, processed_rows: 6_400_000_000
Elapsed time: 01:44:39


processing df chunk...
reading 'youtube_comments_filtered.parquet'...
saved 'youtube_comments_filtered.parquet'. 
iteration:65, processed_rows: 6_500_000_000
Elapsed time: 01:46:36


processing df chunk...
reading 'youtube_comments_filtered.parquet'...
saved 'youtube_comments_filtered.parquet'. 
iteration:66, processed_rows: 6_600_000_000
Elapsed time: 01:48:21


processing df chunk...
reading 'youtube_comments_filtered.parquet'...
saved 'youtube_comments_filtered.parquet'. 
iteration:67, processed_rows: 6_700_000_000
Elapsed time: 01:50:11


processing df chunk...
reading 'youtube_comments_filtered.parquet'...
saved 'youtube_comments_filtered.parquet'. 
iteration:68, proc

In [23]:
# Load dataset we just created of suspicious users with comments per day > threshold
suspicious_users = pl.read_parquet("youtube_comments_filtered.parquet")
suspicious_users

author,video_id,comments,likes,replies
i64,str,u32,i64,i64
4183535,"""C_Zmr79SX7A""",23,62,0
3580280,"""c9ho_gvXjjQ""",9,0,0
6426424,"""Nbr1aZ3XTxk""",53,8,0
2327535,"""OgBai7nR5E4""",11,6,0
5112800,"""NnhTuKa5FCw""",14,211,0
…,…,…,…,…
576541562,"""INIVR-VC-as""",9,11,1
576512090,"""OxOqXxl-pbs""",11,7,0
576327494,"""3SjuC0PYscU""",9,53,0
576204953,"""upL8g8RNRQY""",11,0,6


In [24]:
# Join videos metadata with suspicious users dataset & save it to disk
suspicious_users_chanel = suspicious_users.join(df_metadata_polars, on="video_id", how="inner")
suspicious_users_chanel.write_parquet("suspicious_users.parquet", compression='zstd')
suspicious_users_chanel.head()

author,video_id,comments,likes,replies,categories,channel_id,dislike_count,duration,like_count,upload_date,view_count
i64,str,u32,i64,i64,str,str,f64,i64,f64,datetime[ns],f64
59652612,"""ikPgqOPXiAw""",11,0,0,"""Film & Animation""","""UCzWrhkg9eK5I8Bm3HfV-unA""",4846.0,614,8759.0,2015-11-24 00:00:00,6044073.0
67508742,"""mp9gt45aHxY""",10,2,1,"""Gaming""","""UCzWrhkg9eK5I8Bm3HfV-unA""",11423.0,1762,29891.0,2017-12-08 00:00:00,23152662.0
297593699,"""yYRyznkNnL8""",11,0,0,"""Autos & Vehicles""","""UCzWrhkg9eK5I8Bm3HfV-unA""",1943.0,1525,6722.0,2017-06-13 00:00:00,4318130.0
45096664,"""3vQK78eUg2A""",11,0,8,"""Film & Animation""","""UCzWrhkg9eK5I8Bm3HfV-unA""",2548.0,2196,8780.0,2017-02-13 00:00:00,6674760.0
246052255,"""3vQK78eUg2A""",9,60,33,"""Film & Animation""","""UCzWrhkg9eK5I8Bm3HfV-unA""",2548.0,2196,8780.0,2017-02-13 00:00:00,6674760.0


In [26]:
# Sanity checks
min_value = suspicious_users_chanel["comments"].min()
print(f"Sanity check to see the lowest number of comments made : {min_value}.")
print(f"So it should be higher than the threshold : {min_value > threshold}")

Sanity check to see the lowest number of comments made : 9.
So it should be higher than the threshold : True


In [27]:
suspicious_users_chanel.sort("comments", descending = True).head(10)

author,video_id,comments,likes,replies,categories,channel_id,dislike_count,duration,like_count,upload_date,view_count
i64,str,u32,i64,i64,str,str,f64,i64,f64,datetime[ns],f64
260739815,"""qUWubraZ3Dg""",30990,0,0,"""People & Blogs""","""UCc5bKcABu7kLfZflAH1K73w""",1196.0,208,1754.0,2007-02-14 00:00:00,453386.0
327375141,"""gHbYJfwFgOU""",27973,51923,29765,"""Science & Technology""","""UCvQECJukTDE2i6aCoMnS-Vg""",28177.0,152,137619.0,2012-08-23 00:00:00,7867065.0
459327790,"""pRpeEdMmmQ0""",25107,1393,86,"""Music""","""UCGnjeahCJW1AF34HBmQTJ-Q""",630719.0,211,9570906.0,2010-06-04 00:00:00,2363300000.0
190348425,"""y6fThXQPT6I""",24246,1833,216,"""Music""","""UCm1dsgJNnhaLkY3uAdqN4mA""",46585.0,295,953772.0,2019-01-04 00:00:00,73180617.0
503224102,"""Hb-TWv0Qe24""",24229,7886,25,"""Entertainment""","""UCZJwVlZhnEIrHD0hyogRqsA""",18217.0,845,136696.0,2017-04-20 00:00:00,10221193.0
219760968,"""1J76wN0TPI4""",23475,9055,571,"""Entertainment""","""UCGqvJPRcv7aVFun-eTsatcA""",42078.0,184,931061.0,2018-05-30 00:00:00,71293503.0
503224102,"""4_FRxFioFzc""",22077,9506,370,"""Entertainment""","""UCZJwVlZhnEIrHD0hyogRqsA""",14400.0,590,41044.0,2015-11-05 00:00:00,5492312.0
393529827,"""p_c6uQHlhZ0""",21081,24055,0,"""Music""","""UC044qA_Mds7HgnKd9NLOe3g""",16738.0,318,395298.0,2017-11-04 00:00:00,23712496.0
184412459,"""TjxZ6MrBl9E""",20052,1361,2038,"""Science & Technology""","""UCnzItcpd7WGVn80TRRb8RcQ""",3174.0,154,30329.0,2009-10-30 00:00:00,2794953.0
30292780,"""gHbYJfwFgOU""",19678,44143,15625,"""Science & Technology""","""UCvQECJukTDE2i6aCoMnS-Vg""",28177.0,152,137619.0,2012-08-23 00:00:00,7867065.0


In [9]:
# Load dataset we just created of suspicious users with comments per day > threshold
suspicious_users = pl.read_parquet("suspicious_users.parquet")
suspicious_users

author,video_id,comments,likes,replies,categories,channel_id,dislike_count,duration,like_count,upload_date,view_count
i64,str,u32,i64,i64,str,str,f64,i64,f64,date,f64
59652612,"""ikPgqOPXiAw""",11,0,0,"""Film & Animation""","""UCzWrhkg9eK5I8Bm3HfV-unA""",4846.0,614,8759.0,2015-11-24,6.044073e6
67508742,"""mp9gt45aHxY""",10,2,1,"""Gaming""","""UCzWrhkg9eK5I8Bm3HfV-unA""",11423.0,1762,29891.0,2017-12-08,2.3152662e7
297593699,"""yYRyznkNnL8""",11,0,0,"""Autos & Vehicles""","""UCzWrhkg9eK5I8Bm3HfV-unA""",1943.0,1525,6722.0,2017-06-13,4.31813e6
45096664,"""3vQK78eUg2A""",11,0,8,"""Film & Animation""","""UCzWrhkg9eK5I8Bm3HfV-unA""",2548.0,2196,8780.0,2017-02-13,6.67476e6
246052255,"""3vQK78eUg2A""",9,60,33,"""Film & Animation""","""UCzWrhkg9eK5I8Bm3HfV-unA""",2548.0,2196,8780.0,2017-02-13,6.67476e6
…,…,…,…,…,…,…,…,…,…,…,…
430474027,"""BQGhbm7ygkY""",12,88,0,"""News & Politics""","""UCrwE8kVqtIUVUzKui2WVpuQ""",6224.0,1213,234072.0,2019-08-06,6.885147e6
437403189,"""BQGhbm7ygkY""",10,82,0,"""News & Politics""","""UCrwE8kVqtIUVUzKui2WVpuQ""",6224.0,1213,234072.0,2019-08-06,6.885147e6
446311653,"""BQGhbm7ygkY""",11,522,14,"""News & Politics""","""UCrwE8kVqtIUVUzKui2WVpuQ""",6224.0,1213,234072.0,2019-08-06,6.885147e6
495878616,"""BQGhbm7ygkY""",12,70,0,"""News & Politics""","""UCrwE8kVqtIUVUzKui2WVpuQ""",6224.0,1213,234072.0,2019-08-06,6.885147e6


In [18]:
print("Suspicious Users - Comments Per Video:")
suspicious_users['comments'].describe()

Suspicious Users - Comments Per Video:


statistic,value
str,f64
"""count""",18891930.0
"""null_count""",0.0
"""mean""",21.605663
"""std""",52.316513
"""min""",9.0
"""25%""",10.0
"""50%""",13.0
"""75%""",20.0
"""max""",30990.0


### Break Down Large Parquet Dataset into Small Partial Files

In [8]:
# Break down large Parquet file into smaller sub-files for uploading on Github
import polars as pl

dataset = 'normal_users'
rows_per_chunk = len(df) // 26 # for 'normal_users'

#dataset = 'suspicious_users'
#rows_per_chunk = len(df) // 5 # for 'suspicious_users'

df = pl.read_parquet(f"{dataset}.parquet")
n = 1
print(f"breaking down '{dataset}.parquet'...")

# Split and save each chunk based on rows
for i in range(0, len(df), rows_per_chunk):
    chunk_df = df.slice(i, rows_per_chunk)
    chunk_df.write_parquet(f'./data_type1/{dataset}_{n}.parquet')
    n += 1

print("done")


breaking down 'normal_users.parquet'...
done


### Read Partial Files to Create a DF

In [1]:
# To read partial Parquet files & create a single df
import polars as pl
import glob

dataset = 'normal_users'
#dataset = 'suspicious_users'

print(f"reading partial files '{dataset}_i.parquet'...")

# List all Parquet files
parquet_files = glob.glob(f'./data_type1/{dataset}_*.parquet')

# Read and concatenate all Parquet files
combined = pl.concat([pl.read_parquet(file) for file in parquet_files])
combined

reading partial files 'normal_users_i.parquet'...


author,video_id,comments,likes,replies,categories,channel_id,dislike_count,duration,like_count,upload_date,view_count
i64,str,u32,i64,i64,str,str,f64,i64,f64,datetime[ns],f64
9587630,"""gza2-iPzPi4""",1,0,0,"""People & Blogs""","""UCvtRTOMP2TqYqu51xNrqAzg""",1108.0,977,19517.0,2018-05-27 00:00:00,340224.0
9808653,"""gza2-iPzPi4""",1,0,0,"""People & Blogs""","""UCvtRTOMP2TqYqu51xNrqAzg""",1108.0,977,19517.0,2018-05-27 00:00:00,340224.0
9842650,"""gza2-iPzPi4""",1,0,0,"""People & Blogs""","""UCvtRTOMP2TqYqu51xNrqAzg""",1108.0,977,19517.0,2018-05-27 00:00:00,340224.0
10045235,"""gza2-iPzPi4""",1,0,0,"""People & Blogs""","""UCvtRTOMP2TqYqu51xNrqAzg""",1108.0,977,19517.0,2018-05-27 00:00:00,340224.0
10156676,"""gza2-iPzPi4""",1,0,0,"""People & Blogs""","""UCvtRTOMP2TqYqu51xNrqAzg""",1108.0,977,19517.0,2018-05-27 00:00:00,340224.0
…,…,…,…,…,…,…,…,…,…,…,…
2598751,"""lcHBHxhp6p4""",1,3,0,"""Sports""","""UCxFt75OIIvoN4AaL7lJxtTg""",47.0,1737,789.0,2019-06-01 00:00:00,76285.0
1689039,"""FWGCmt5a8O4""",1,0,0,"""Sports""","""UCxFt75OIIvoN4AaL7lJxtTg""",70.0,346,3411.0,2019-06-01 00:00:00,305577.0
582076,"""QXsW3R0EVaA""",1,0,0,"""Sports""","""UCxFt75OIIvoN4AaL7lJxtTg""",120.0,5480,3883.0,2019-06-01 00:00:00,385782.0
1689039,"""QXsW3R0EVaA""",2,5,3,"""Sports""","""UCxFt75OIIvoN4AaL7lJxtTg""",120.0,5480,3883.0,2019-06-01 00:00:00,385782.0


In [3]:
# Sanity Check
original = pl.read_parquet(f'{dataset}.parquet')
print("Original: ", original.describe())
print("Combined: ", combined.describe())

Original:  shape: (9, 13)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ statistic ┆ author    ┆ video_id  ┆ comments  ┆ … ┆ duration  ┆ like_coun ┆ upload_da ┆ view_cou │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ t         ┆ te        ┆ nt       │
│ str       ┆ f64       ┆ str       ┆ f64       ┆   ┆ f64       ┆ ---       ┆ ---       ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆           ┆ f64       ┆ str       ┆ f64      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ count     ┆ 2.2474631 ┆ 224746315 ┆ 2.2474631 ┆ … ┆ 2.2474631 ┆ 2.2379684 ┆ 224746315 ┆ 2.247457 │
│           ┆ 5e8       ┆           ┆ 5e8       ┆   ┆ 5e8       ┆ e8        ┆           ┆ 62e8     │
│ null_coun ┆ 0.0       ┆ 0         ┆ 0.0       ┆ … ┆ 0.0       ┆ 949475.0  ┆ 0         ┆ 553.0    │
│ t         ┆           ┆           ┆           ┆   ┆           ┆