In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import polars as pl

# Reading dataset (Parquet)

I'm using Polars' lazy loader because my computer is not powerful enough to load the whole dataset into memory.

In [4]:
# Open the Parquet file lazily: nothing is read until .collect() is called.
dataset_df = pl.scan_parquet(
    source="author_comments_per_video_2018.parquet",
    low_memory=True,
)

# Materialize only the first rows to preview the schema and a few values.
dataset_df.head().collect()

author,video_id,upload_date,comments,likes,replies
i64,str,date,u32,i64,i64
16,"""s9sgU7fWlV4""",2018-11-17,1,0,0
16,"""ujdUh7ZfkUY""",2018-07-08,1,6,0
16,"""OaLFcwHzPaA""",2018-12-27,1,1,0
16,"""aQoD6KfXZqg""",2018-03-21,1,0,0
16,"""Croosu_XdF4""",2018-07-14,1,0,0


In [5]:
# Count the rows through the lazy query, then unwrap the single-cell result
# into a plain Python integer.
row_count_df = dataset_df.select(pl.len()).collect()
numb_rows = row_count_df.item()

print(f"The number of rows in the Df is : {numb_rows}")

The number of rows in the Df is : 905447155


## Computing the first two statistical moments

Since the dataset **author_comments_per_video_2018.parquet** is too large to fit in memory, it can only be read in chunks. The mean and the variance are therefore computed manually, chunk by chunk.

In [9]:
# Compute the mean of "comments" by accumulating per-chunk sums.
chunk_size = 100000000  # The chunk size can be adapted to the computer performance
total_comments = 0  # Plain Python int: arbitrary precision, cannot overflow

for start in range(0, numb_rows, chunk_size):
    # "comments" is stored as u32 and Polars does not widen 32-bit integers
    # when summing, so cast to Int64 first to rule out a silent overflow.
    # .item() unwraps the 1x1 result so the accumulator stays a scalar
    # instead of turning into a DataFrame.
    total_comments += (
        dataset_df.slice(start, chunk_size)
        .select(pl.col("comments").cast(pl.Int64).sum())
        .collect()
        .item()
    )

# True division of two Python ints gives the mean as a float.
mean = total_comments / numb_rows

print(f'The mean number of comments per user and per video is : {mean}')

The mean number of comments per user and per video is : 1.325973376104981


In [10]:
# Running total of the squared deviations from the mean, filled chunk by chunk.
squared_deviation_total = 0

for offset in range(0, numb_rows, chunk_size):
    # Only the "comments" column of the current window is materialized.
    chunk = dataset_df.slice(offset, chunk_size).select("comments").collect()

    squared_deviation_total += np.square(np.array(chunk) - mean).sum()

del chunk  # Release the last chunk from memory

# Divide by (n - 1) to get the unbiased sample variance.
variance = squared_deviation_total / (numb_rows - 1)

std = np.sqrt(variance)

print(f'The standard deviation is : {std}')

The standard deviation is : 3.674350891355733


## Getting the indexes of the suspicious accounts

#### Defining the threshold for the filtering

In [12]:
# A row is flagged as suspicious when its comment count lies more than
# two standard deviations above the mean.
threshold = mean + std * 2
print(f'The number of suspicious comments per video is set to : {threshold}')

The number of suspicious comments per video is set to : 8.674675158816447


In [13]:
# Collect the global row positions (0-based, relative to the whole dataset)
# of every row whose "comments" value is strictly above the threshold.
index_suspicious_accounts = []

for start in range(0, numb_rows, chunk_size):
    chunk = dataset_df.slice(start, chunk_size).select("comments").collect()

    # Build the boolean mask directly in polars: no pandas copy of the chunk
    # is needed. arg_true() returns the in-chunk positions where it is True.
    local_positions = (chunk["comments"] > threshold).arg_true()

    # Shift in-chunk positions to dataset-wide positions. The cast to int64
    # keeps the addition safe whatever unsigned width polars uses for
    # positions, and .tolist() stores plain Python ints in the list.
    index_suspicious_accounts.extend(
        (local_positions.to_numpy().astype(np.int64) + start).tolist()
    )

    # Unload the memory before the next chunk is read. Doing it inside the
    # loop also avoids a NameError when the dataset is empty.
    del chunk, local_positions

# NOTE: the former "There is a problem..." check was removed. It could never
# trigger, because the rows had just been filtered with `> threshold`, and it
# relied on positional `Series[0]` access, which pandas has deprecated.

In [14]:
# The smallest flagged position should be a valid, non-negative row number.
smallest_index = min(index_suspicious_accounts)
print(f"Quick sanity check by looking at the smallest index : {smallest_index}")

Quick sanity check by looking at the smallest index : 681


In [15]:
# One entry was stored per flagged row, so the list length is the count.
numb_sus_accounts = len(index_suspicious_accounts)

print(
    f"The number of suspicious account is {numb_sus_accounts} "
    "which is much lower than the number of accounts"
)

The number of suspicious account is 3891275 which is much lower than the number of accounts


In [16]:
chunk_size = 10000000  # Ten times smaller than before

# Sorted array of the dataset-wide positions of the suspicious rows.
# Sorting lets each chunk find its share with a binary search instead of
# re-scanning the whole list of indexes for every chunk.
suspicious_positions = np.sort(
    np.asarray(index_suspicious_accounts, dtype=np.int64)
)

# Holds, for each chunk, the DataFrame of its suspicious rows
selected_rows = []

start = 0
while True:
    chunk = dataset_df.slice(start, chunk_size).collect()
    if chunk.is_empty():
        # Slicing past the end of the dataset yields an empty frame: done
        break
    # Bounds of the positions that fall in [start, start + chunk_size)
    lower, upper = np.searchsorted(
        suspicious_positions, [start, start + chunk_size], side="left"
    )
    # Convert dataset-wide positions to positions inside the current chunk
    chunk_row_indices = (suspicious_positions[lower:upper] - start).tolist()
    # Extract the rows of interest from the opened chunk
    if chunk_row_indices:
        selected_rows.append(chunk[chunk_row_indices])
    start += chunk_size

chunk = None  # Unload the memory

# Gather the per-chunk selections into a single dataframe
if selected_rows:
    selected_rows_df = pl.concat(selected_rows)
else:
    selected_rows_df = pl.DataFrame()

print(selected_rows_df)

shape: (3_891_275, 6)
┌───────────┬─────────────┬─────────────┬──────────┬───────┬─────────┐
│ author    ┆ video_id    ┆ upload_date ┆ comments ┆ likes ┆ replies │
│ ---       ┆ ---         ┆ ---         ┆ ---      ┆ ---   ┆ ---     │
│ i64       ┆ str         ┆ date        ┆ u32      ┆ i64   ┆ i64     │
╞═══════════╪═════════════╪═════════════╪══════════╪═══════╪═════════╡
│ 299       ┆ 4NDYXVKx2Jw ┆ 2018-11-22  ┆ 23       ┆ 376   ┆ 39      │
│ 933       ┆ 2SU6SWNxOYk ┆ 2018-02-26  ┆ 10       ┆ 8     ┆ 18      │
│ 1047      ┆ Nu2yQ1zYDYU ┆ 2018-04-02  ┆ 10       ┆ 25    ┆ 0       │
│ 1047      ┆ OHELU6I10wQ ┆ 2018-01-18  ┆ 10       ┆ 13    ┆ 1       │
│ 1560      ┆ hlb3gGNBHFE ┆ 2018-03-04  ┆ 14       ┆ 17    ┆ 1       │
│ …         ┆ …           ┆ …           ┆ …        ┆ …     ┆ …       │
│ 576549884 ┆ KOUP8AGYdr8 ┆ 2018-07-10  ┆ 10       ┆ 11    ┆ 1       │
│ 576549884 ┆ Nr3WBJPg0BE ┆ 2018-08-31  ┆ 10       ┆ 8     ┆ 0       │
│ 576550604 ┆ R5E43VdWvyI ┆ 2018-09-04  ┆ 10       ┆ 0 

In [17]:
# Every selected row was kept because it exceeded the threshold, so the
# smallest "comments" value among them must exceed it as well.
min_value = selected_rows_df.get_column("comments").min()

print(f"Sanity check to see the lowest number of comments made : {min_value}.")
print(f"So it should be higher than the threshold : {min_value > threshold}")

Sanity check to see the lowest number of comments made : 9.
So it should be higher than the threshold : True


In [24]:
# Show the ten rows with the highest comment counts, largest first.
(
    selected_rows_df
    .sort(by="comments", descending=True)
    .head(n=10)
)

author,video_id,upload_date,comments,likes,replies
i64,str,date,u32,i64,i64
219760968,"""1J76wN0TPI4""",2018-05-30,23475,9055,571
333172471,"""zPJl2e38S4g""",2018-03-31,18421,6000,69
314699484,"""xpVfcZ0ZcFM""",2018-02-16,12530,627,0
311898076,"""z7tRr49qZfo""",2018-03-14,10227,3081,0
298514026,"""XW6MpM0LTI0""",2018-06-06,9890,3458,0
52730071,"""b03U6BYF9L0""",2018-02-15,9095,1949,3837
177343493,"""6YNZlXfW6Ho""",2018-04-26,8902,1520,29
432685379,"""ZHkZgCF4_G4""",2018-12-07,8436,4349,0
309188934,"""st8dkGzJWtg""",2018-06-11,7477,1991,0
270113088,"""WReyepxMs34""",2018-10-13,7262,1037,419


In [91]:
# Persist the suspicious rows so later analyses can reuse them without
# re-scanning the full dataset.
selected_rows_df.write_parquet(file="suspicious_users_2018.parquet")