___

# <p style="text-align: center"> Invisible Influencers </p>
## <p style="text-align: center"> Investigating the YouTube Bot Phenomenon </p>

___

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import polars as pl
import glob
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
from scipy.stats import gaussian_kde
from sklearn.svm import SVC

# Ensure plotly.js is set to default: clear any custom plotly.js location on
# the Kaleido export scope (the figures below are exported with fig.write_image).
# NOTE(review): `pio.kaleido.scope` looks like the legacy Kaleido interface —
# confirm it is still available in the installed plotly/kaleido versions.
pio.kaleido.scope.plotlyjs = None

In [None]:
# Set default plotting parameters shared by Matplotlib, Seaborn and Plotly

font_family = "Arial"  # Replace with your desired font

# Matplotlib font sizes. They are passed to `sns.set_theme(rc=...)` instead of
# being written to `plt.rcParams` beforehand: `set_theme` applies its own
# context font sizes first and the `rc` overrides last, so values set before
# the call (e.g. the larger title size) would otherwise be discarded.
matplotlib_rc = {
    "font.size": 12,
    "axes.titlesize": 14,
    "axes.labelsize": 12,
}

# Apply the Seaborn theme (also sets the Matplotlib font family), then the
# size overrides above on top of it
sns.set_theme(style="whitegrid", font=font_family, rc=matplotlib_rc)

# Plotly counterpart of the font settings above.
# NOTE(review): this dict is not passed to any figure in the cells visible
# here — confirm whether `fig.update_layout(**plotly_layout)` was intended.
plotly_layout = {
    "font": {
        "family": font_family,
        "size": matplotlib_rc["font.size"],  # Keep in sync with Matplotlib
    }
}

___

## <p style="text-align: center"> <span style="text-decoration: underline"> **Type-1 Bot Analysis** </span> </p>

#### - **Data Preprocessing and Loading**

In [None]:
# One-off preprocessing step: merge the partial Parquet files of each user
# group into a single file per group. Switched off by default; set `combine`
# to True to (re)build the merged files.
combine = False

if combine:

    # Directory the "Load Data" cell reads the merged Type-1 files from
    type1_dir = './data/data_type1'

    datasets = ['normal', 'suspicious']

    for dataset in datasets:
        print(f"reading partial files '{dataset}_users_*.parquet'...")

        # List all partial Parquet files; sorted so that the row order of the
        # merged file does not depend on the file-system listing order
        parquet_files = sorted(glob.glob(f'{type1_dir}/{dataset}_users_*.parquet'))

        # Fail with an explicit message instead of letting pl.concat choke on
        # an empty list
        if not parquet_files:
            raise FileNotFoundError(
                f"no partial files matching '{dataset}_users_*.parquet' in {type1_dir}"
            )

        # Read and concatenate all Parquet files
        combined = pl.concat([pl.read_parquet(file) for file in parquet_files])

        # NOTE(review): for 'normal' this writes 'combi_normal_dataset1.parquet',
        # while the "Load Data" cell reads 'combi_dataset1.parquet' — confirm
        # which file name is the intended one.
        combined.write_parquet(f'{type1_dir}/combi_{dataset}_dataset1.parquet')

In [None]:
# Load Data
# `type_1` is the frame analysed under the "Normal Users" headings below,
# `df_sus` the one analysed under "Suspicious Users".

type_1, df_sus = (
    pl.read_parquet(parquet_path)
    for parquet_path in (
        'data/data_type1/combi_dataset1.parquet',
        'data/data_type1/combi_suspicious_dataset1.parquet',
    )
)

In [None]:
# Add a column 'year' to both dataframes

# All expressions of one `with_columns` call are evaluated against the input
# frame, so the year is derived from the cast expression itself rather than
# from the (possibly not yet date-typed) original column.
upload_date_as_date = pl.col("upload_date").cast(pl.Date)  # Ensure it's a date
year_columns = [
    upload_date_as_date,
    upload_date_as_date.dt.year().alias("year"),  # Extract year
]

type_1 = type_1.with_columns(year_columns)
df_sus = df_sus.with_columns(year_columns)

# Preview of the suspicious-user frame with the new column
df_sus.head()

#### <p style="text-align: center; margin-top: 0.5cm"> <span style="text-decoration: underline"> **Do Bots Target Specific Video Categories ?** </span> </p>

In [None]:
# Colormap for consistency: every video category keeps the same colour in all
# category plots, so each category needs a colour of its own.

colormap_categories = {
    "Entertainment": "red",
    "Other": "orange",
    "Gaming": "cyan",
    "People & Blogs": "yellow",
    "News & Politics": "lime",
    "Howto & Style": "lightblue",
    "Music": "blue",
    "Education": "lightgreen",
    "Science & Technology": "brown",
    "Film & Animation": "pink",
    "Comedy": "green",
    "Sports": "purple",
    "Pets & Animals": "teal",
    "Travel & Events": "lavender",
    "Autos & Vehicles": "salmon",
    "Nonprofits & Activism": "gold",
    "Shows": "magenta",  # distinct from "Nonprofits & Activism"
    "Trailers": "lightcoral"
}

# Stacking/legend order of the categories in the plots ("Other" always last)
order_categories = {
    "Entertainment": 1,
    "Gaming": 2,
    "People & Blogs": 3,
    "News & Politics": 4,
    "Howto & Style": 5,
    "Music": 6,
    "Education": 7,
    "Science & Technology": 8,
    "Film & Animation": 9,
    "Comedy": 10,
    "Sports": 11,
    "Pets & Animals": 12,
    "Travel & Events": 13,
    "Autos & Vehicles": 14,
    "Nonprofits & Activism": 15,
    "Shows": 16,
    "Trailers": 17,
    "Other": 18
}

#### - **Suspicious Users**

In [None]:
# Yearly share of suspicious-user comments per video category.

# Categories holding less than this share (in %) of a year's comments are
# merged into a single 'Other' category
min_category_share = 5.0

# Compute the number of comments per category of videos and per year,
# ignoring rows with an empty category
df_comm_per_cat = (
    df_sus.group_by(["year", "categories"])
    .agg(pl.col("comments").sum())
    .filter(pl.col("categories") != "")
)

# Add proportion of comments per category per year
comments_per_year = (
    df_comm_per_cat.group_by("year")
    .agg(pl.col("comments").sum().alias("total_comments"))
    .sort(by="year")
)
df_comm_per_cat = df_comm_per_cat.join(comments_per_year, on="year").with_columns(
    (pl.col("comments") / pl.col("total_comments") * 100.0).alias("proportion")
)

# Keep only categories reaching the minimum share, put others in a category 'Other'
df_comm_per_cat = (
    df_comm_per_cat.with_columns(
        pl.when(pl.col("proportion") < min_category_share)
        .then(pl.lit("Other"))
        .otherwise(pl.col("categories"))
        .alias("Categories")
    )
    .drop("categories")
    .group_by(["year", "Categories"])
    .agg(pl.col("comments").sum(), pl.col("proportion").sum())
)

# Add order column for plotting (explicit return dtype so polars does not have
# to infer it), then sort; (year, order) is unique per row, so this final sort
# fully determines the row order
df_comm_per_cat = df_comm_per_cat.with_columns(
    pl.col("Categories")
    .map_elements(lambda x: order_categories[x], return_dtype=pl.Int64)
    .alias("order")
).sort(by=["year", "order"])

df_comm_per_cat

In [None]:
# Stacked area chart: yearly share of suspicious-user comments per category
category_labels_sus = {
    "proportion": "Proportion of Comments (%)",
    "year": "Year",
    "Categories": "Category",
}
fig = px.area(
    df_comm_per_cat.to_pandas(),
    x="year",
    y="proportion",
    color="Categories",
    labels=category_labels_sus,
    color_discrete_map=colormap_categories,
)
fig.show()

# Export a static (SVG) and an interactive (HTML) copy of the figure
fig.write_image("./image_aurel/prop_comments_per_category_sus.svg")
fig.write_html("./image_aurel/prop_comments_per_category_sus.html")

#### - **Normal Users**

In [None]:
# Yearly share of normal-user comments per video category.

# Categories holding less than this share (in %) of a year's comments are
# merged into a single 'Other' category
min_category_share_normal = 5.0

# Compute the number of comments per category of videos and per year,
# ignoring rows with an empty category
df_comm_per_cat_normal = (
    type_1.group_by(["year", "categories"])
    .agg(pl.col("comments").sum())
    .filter(pl.col("categories") != "")
)

# Add proportion of comments per category per year
comments_per_year_normal = (
    df_comm_per_cat_normal.group_by("year")
    .agg(pl.col("comments").sum().alias("total_comments"))
    .sort(by="year")
)
df_comm_per_cat_normal = df_comm_per_cat_normal.join(
    comments_per_year_normal, on="year"
).with_columns(
    (pl.col("comments") / pl.col("total_comments") * 100.0).alias("proportion")
)

# Keep only categories reaching the minimum share, put others in a category 'Other'
df_comm_per_cat_normal = (
    df_comm_per_cat_normal.with_columns(
        pl.when(pl.col("proportion") < min_category_share_normal)
        .then(pl.lit("Other"))
        .otherwise(pl.col("categories"))
        .alias("Categories")
    )
    .drop("categories")
    .group_by(["year", "Categories"])
    .agg(pl.col("comments").sum(), pl.col("proportion").sum())
)

# Add order column for plotting (explicit return dtype so polars does not have
# to infer it), then sort; (year, order) is unique per row, so this final sort
# fully determines the row order
df_comm_per_cat_normal = df_comm_per_cat_normal.with_columns(
    pl.col("Categories")
    .map_elements(lambda x: order_categories[x], return_dtype=pl.Int64)
    .alias("order")
).sort(by=["year", "order"])

In [None]:
# Stacked area chart: yearly share of normal-user comments per category
area_options_normal = dict(
    x="year",
    y="proportion",
    color="Categories",
    labels={"proportion":"Proportion of Comments (%)", "year":"Year", "Categories":"Category"},
    color_discrete_map=colormap_categories,
)
fig = px.area(df_comm_per_cat_normal.to_pandas(), **area_options_normal)
fig.show()

# Export a static (SVG) and an interactive (HTML) copy of the figure
fig.write_image("./image_aurel/prop_comments_per_category_normal.svg")
fig.write_html("./image_aurel/prop_comments_per_category_normal.html")


#### <p style="text-align: center; margin-top: 0.5cm"> <span style="text-decoration: underline"> **Do Bots Target One or Many Channels ?** </span> </p>

In [None]:
# Color map for consistency

# Buckets for the number of distinct channels a user commented on, listed in
# display order, with the colour used for each bucket
_channel_buckets = ["1", "2", "3", "4", "5+"]
_channel_colors = ["red", "blue", "green", "orange", "cyan"]

# Bucket label -> colour
colormap_cat_channel = dict(zip(_channel_buckets, _channel_colors))

# Bucket label -> 1-based position used to sort rows before plotting
order_cat_channel = {
    bucket: rank for rank, bucket in enumerate(_channel_buckets, start=1)
}

#### - **Suspicious Users**

In [None]:
# Yearly distribution of suspicious users by the number of distinct channels
# they commented on.

# Number of distinct channels per (year, author)
df_nb_channels_per_bot = df_sus.group_by(["year", "author"]).agg(
    pl.col("channel_id").n_unique().alias("nb_channels")
)

# Bucket the counts into "1".."4" and "5+", then count the users per bucket.
# The numeric branch is cast to a string explicitly so that both branches of
# the when/otherwise share one dtype instead of relying on implicit coercion.
df_nb_channels_per_bot = (
    df_nb_channels_per_bot.with_columns(
        pl.when(pl.col("nb_channels") >= 5)
        .then(pl.lit("5+"))
        .otherwise(pl.col("nb_channels").cast(pl.Utf8))
        .alias("nb_channels")
    )
    .group_by(["year", "nb_channels"])
    .agg(pl.col("nb_channels").count().alias("nb_users"))
)

# Proportion per year

nb_bots_per_year = (
    df_nb_channels_per_bot.group_by("year")
    .agg(pl.col("nb_users").sum().alias("total_users"))
    .sort(by="year")
)

df_nb_channels_per_bot = df_nb_channels_per_bot.join(nb_bots_per_year, on="year").with_columns(
    (pl.col("nb_users") / pl.col("total_users") * 100.0).alias("proportion")
)

# Order column for plotting (explicit return dtype so polars does not have to
# infer it); (year, order) is unique per row, so the sort fixes the row order
df_nb_channels_per_bot = df_nb_channels_per_bot.with_columns(
    pl.col("nb_channels")
    .map_elements(lambda x: order_cat_channel[x], return_dtype=pl.Int64)
    .alias("order")
).sort(by=["year", "order"])

In [None]:
# Stacked area chart: yearly share of suspicious users per channel-count bucket
channel_labels_sus = {
    "proportion": "Proportion of Bots (%)",
    "year": "Year",
    "nb_channels": "Channels Targeted",
}
fig = px.area(
    df_nb_channels_per_bot.to_pandas(),
    x="year",
    y="proportion",
    color="nb_channels",
    labels=channel_labels_sus,
    color_discrete_map=colormap_cat_channel,
)
fig.show()

# Export a static (SVG) and an interactive (HTML) copy of the figure
fig.write_image("./image_aurel/prop_bots_per_channels_sus.svg")
fig.write_html("./image_aurel/prop_bots_per_channels_sus.html")

#### - **Normal Users**

In [None]:
# Yearly distribution of normal users by the number of distinct channels they
# commented on.

# Number of distinct channels per (year, author)
df_nb_channels_per_user = type_1.group_by(["year", "author"]).agg(
    pl.col("channel_id").n_unique().alias("nb_channels")
)

# Bucket the counts into "1".."4" and "5+", then count the users per bucket.
# The numeric branch is cast to a string explicitly so that both branches of
# the when/otherwise share one dtype instead of relying on implicit coercion.
df_nb_channels_per_user = (
    df_nb_channels_per_user.with_columns(
        pl.when(pl.col("nb_channels") >= 5)
        .then(pl.lit("5+"))
        .otherwise(pl.col("nb_channels").cast(pl.Utf8))
        .alias("nb_channels")
    )
    .group_by(["year", "nb_channels"])
    .agg(pl.col("nb_channels").count().alias("nb_users"))
)

# Proportion per year

nb_user_per_year = (
    df_nb_channels_per_user.group_by("year")
    .agg(pl.col("nb_users").sum().alias("total_users"))
    .sort(by="year")
)

df_nb_channels_per_user = df_nb_channels_per_user.join(nb_user_per_year, on="year").with_columns(
    (pl.col("nb_users") / pl.col("total_users") * 100.0).alias("proportion")
)

# Order column for plotting (explicit return dtype so polars does not have to
# infer it); (year, order) is unique per row, so the sort fixes the row order
df_nb_channels_per_user = df_nb_channels_per_user.with_columns(
    pl.col("nb_channels")
    .map_elements(lambda x: order_cat_channel[x], return_dtype=pl.Int64)
    .alias("order")
).sort(by=["year", "order"])

In [None]:
# Stacked area chart: yearly share of normal users per channel-count bucket
area_options_channels = dict(
    x="year",
    y="proportion",
    color="nb_channels",
    labels={"proportion":"Proportion of Normal users (%)", "year":"Year", "nb_channels":"Channels Targeted"},
    color_discrete_map=colormap_cat_channel,
)
fig = px.area(df_nb_channels_per_user.to_pandas(), **area_options_channels)
fig.show()

# Export a static (SVG) and an interactive (HTML) copy of the figure
fig.write_image("./image_aurel/prop_normal_per_channels.svg")
fig.write_html("./image_aurel/prop_normal_per_channels.html")

#### <p style="text-align: center; margin-top: 0.5cm"> <span style="text-decoration: underline"> **How Different Are Normal Users and Bots in Their Commenting Behavior?** </span> </p>

In [None]:
# Total number of comments per author, computed separately for each group and
# sorted from the most to the least active author
total_comments_expr = pl.col("comments").sum()

data_type1_comm_per_user = (
    type_1.select(["author", "comments"])
    .group_by("author")
    .agg(total_comments_expr)
    .sort(by="comments", descending=True)
)

df_sus_comm_per_bot = (
    df_sus.select(["author", "comments"])
    .group_by("author")
    .agg(total_comments_expr)
    .sort(by="comments", descending=True)
)

# Stack both tables, tagging every row with the group it comes from
normal_tagged = data_type1_comm_per_user.with_columns([pl.lit("Normal").alias("Type")])
suspicious_tagged = df_sus_comm_per_bot.with_columns([pl.lit("Suspicious").alias("Type")])
df_comm_per_bot = pl.concat([normal_tagged, suspicious_tagged])

In [None]:
# Box plot of comments per user for normal and suspicious users (log-scaled y axis)

fig = px.box(
    df_comm_per_bot.to_pandas(),
    x="Type",
    y="comments",
    color="Type",
    labels={"comments": "Number of Comments", "Type": "User Type"},
    log_y=True,
)

# On a log axis Plotly expects the range bounds as log10 exponents, so an
# upper limit of 10**5 comments is written as 5
fig.update_yaxes(range=[None, 5])
fig.show()

# Save the plot
fig.write_image("./image_aurel/boxplot_comments_per_user.png")

#### <p style="text-align: center; margin-top: 0.5cm"> <span style="text-decoration: underline"> **How Do Metrics Vary Over Time For Normal Users and Bots ?** </span> </p>

#### - **Suspicious Users**

In [None]:
# Generate Metrics Data: yearly activity totals for suspicious users

# Totals per year. Missing values count as 0; `total_videos` counts the rows
# with a non-null video_id.
final_metrics_sus = df_sus.group_by("year").agg([
    pl.col("comments").fill_null(0).sum().alias("total_comments"),
    pl.col("likes").fill_null(0).sum().alias("total_likes"),
    pl.col("replies").fill_null(0).sum().alias("total_replies"),
    pl.col("view_count").fill_null(0).sum().alias("total_views"),
    pl.col("video_id").count().alias("total_videos"),
])

# Distinct authors per year, counted over the whole frame in one pass.
# Distinct counts of separate row chunks cannot simply be added up: an author
# present in several chunks would be counted once per chunk.
unique_users_combined_sus = df_sus.group_by("year").agg(
    pl.col("author").n_unique().alias("unique_users")
)

# Merge the aggregated metrics with unique users, compute comments per user
# and sort by year
final_result_sus = (
    final_metrics_sus.join(unique_users_combined_sus, on="year", how="left")
    .with_columns(
        (pl.col("total_comments") / pl.col("unique_users")).alias("comments_per_user")
    )
    .sort("year")
)

# Display the final result
final_result_sus


In [None]:
# Normalize Metrics

# Display name -> source column. Each metric is divided by its maximum over all
# years present at this point (i.e. before the year filter below).
normalized_columns_sus = {
    "Total Likes": "total_likes",
    "Total Comments": "total_comments",
    "Comments per User": "comments_per_user",
    "Total Replies": "total_replies",
}
final_result_sus = final_result_sus.with_columns([
    (pl.col(source_column) / final_result_sus[source_column].max()).alias(display_name)
    for display_name, source_column in normalized_columns_sus.items()
])

# Keep only the years after 2005
final_result_sus = final_result_sus.filter(pl.col("year").gt(2005))

# Line colours for the metrics plot
colors = ["red", "blue", "green", "orange", "cyan"]

In [None]:
# Line plot of the normalized yearly metrics of suspicious users
metric_columns_sus = ["Total Likes", "Total Comments", "Comments per User", "Total Replies"]
tick_years_sus = list(range(2006, 2019, 2))  # every second year, 2006..2018

fig = px.line(
    final_result_sus.to_pandas(),
    x="year",
    y=metric_columns_sus,
    labels={"value":"Normalized Value", "year":"Year", "variable":"Metric"},
    color_discrete_sequence=colors,
    markers=True,
)
fig.update_layout(
    xaxis=dict(
        tickmode='array',
        tickvals=tick_years_sus,
        ticktext=[str(tick_year) for tick_year in tick_years_sus],
    )
)
fig.show()

# Export a static (SVG) and an interactive (HTML) copy of the figure
fig.write_image("./image_aurel/normalized_metrics_sus.svg")
fig.write_html("./image_aurel/normalized_metrics_sus.html")


#### - **Normal Users**

In [None]:
# Generate Metrics Data: yearly activity totals for normal users

# Totals per year. Missing values count as 0; `total_videos` counts the rows
# with a non-null video_id.
final_metrics = type_1.group_by("year").agg([
    pl.col("comments").fill_null(0).sum().alias("total_comments"),
    pl.col("likes").fill_null(0).sum().alias("total_likes"),
    pl.col("replies").fill_null(0).sum().alias("total_replies"),
    pl.col("view_count").fill_null(0).sum().alias("total_views"),
    pl.col("video_id").count().alias("total_videos"),
])

# Distinct authors per year, counted over the whole frame in one pass.
# Distinct counts of separate row chunks cannot simply be added up: an author
# present in several chunks would be counted once per chunk.
unique_users_combined = type_1.group_by("year").agg(
    pl.col("author").n_unique().alias("unique_users")
)

# Merge the aggregated metrics with unique users and compute comments per
# user. The rows are sorted by year because the line plot built from this
# frame connects the points in row order.
final_result = (
    final_metrics.join(unique_users_combined, on="year", how="left")
    .with_columns(
        (pl.col("total_comments") / pl.col("unique_users")).alias("comments_per_user")
    )
    .sort("year")
)

# Display the final result
final_result


In [None]:
# Normalize Metrics

# Display name -> source column; each metric is divided by its maximum over
# all years
normalized_columns = {
    "Total Likes": "total_likes",
    "Total Comments": "total_comments",
    "Comments per User": "comments_per_user",
    "Total Replies": "total_replies",
}
final_result = final_result.with_columns([
    (pl.col(source_column) / final_result[source_column].max()).alias(display_name)
    for display_name, source_column in normalized_columns.items()
])

# Line colours for the metrics plot
colors = ["red", "blue", "green", "orange", "cyan"]

# Marker symbol per metric
markers = dict(zip(
    ["Total Likes", "Total Comments", "Comments per User", "Total Replies"],
    ["circle", "square", "diamond", "triangle-up"],
))

In [None]:
# Line plot of the normalized yearly metrics of normal users
tick_years = [2006 + offset for offset in range(0, 13, 2)]  # 2006, 2008, ..., 2018
x_axis_ticks = dict(
    tickmode='array',
    tickvals=tick_years,
    ticktext=list(map(str, tick_years)),
)

fig = px.line(
    final_result.to_pandas(),
    x="year",
    y=["Total Likes", "Total Comments", "Comments per User", "Total Replies"],
    labels={"value":"Normalized Value", "year":"Year", "variable":"Metric"},
    color_discrete_sequence=colors,
    markers=True,
)
fig.update_layout(xaxis=x_axis_ticks)
fig.show()

# Export a static (SVG) and an interactive (HTML) copy of the figure
fig.write_image("./image_aurel/normalized_metrics.svg")
fig.write_html("./image_aurel/normalized_metrics.html")

#### <p style="text-align: center; margin-top: 0.5cm"> <span style="text-decoration: underline"> **How different are bots and normal users?** </span> </p>

#### - **Suspicious Users**

In [None]:
# Keep only the columns of the suspicious-user frame that the per-user,
# per-year statistics below are computed from
data_com_vid_year_sus = df_sus.select(
    pl.col(["year", "author", "comments", "video_id"])
)

In [None]:
# Per (year, author) statistics for suspicious users

# Total comments and number of distinct videos commented on, per user and year
per_user_totals_sus = data_com_vid_year_sus.group_by(["year", "author"]).agg([
    pl.col("comments").sum().alias("total_comments"),
    pl.col("video_id").n_unique().alias("distinct_videos_commented"),
])

# Comments per distinct video for each user and year (the column keeps the
# name 'avg_comments_per_user' that the plotting cells refer to)
comments_per_video_sus = pl.col("total_comments") / pl.col("distinct_videos_commented")
grouped_data_sus = per_user_totals_sus.with_columns([
    comments_per_video_sus.alias("avg_comments_per_user")
])

In [None]:
# Per-year sampling of the suspicious users for the plots below

# Group data by year
grouped_sus = grouped_data_sus.group_by('year')

# Target size of the sample. The divisor is 1, i.e. every suspicious user is
# kept and the sampling below only shuffles the rows within each year.
total_subsample_size = len(grouped_data_sus) // 1

# Sample each year proportionally to its size
subsampled_list_sus = []
for year_key, group in grouped_sus:
    # Convert with `to_pandas()` so that column names and dtypes are kept
    # (wrapping the polars frame in `pd.DataFrame(...)` would drop both)
    group = group.to_pandas()

    # Calculate the number of samples for this year (proportional to the group's size)
    year_sample_size = max(1, len(group) * total_subsample_size // len(grouped_data_sus))

    # Randomly sample the data for this year (fixed seed for reproducibility)
    subsampled_list_sus.append(group.sample(n=year_sample_size, random_state=42))

# Combine the sampled data from all years
subsampled_df_sus = pd.concat(subsampled_list_sus)

# Columns expected by the following cells, in this order
column_names = ['year', 'author', 'total_comments', 'distinct_videos_commented', 'avg_comments_per_user']
subsampled_df_sus = subsampled_df_sus[column_names]

In [None]:
# Suspicious users: make 'year' an integer so it sorts numerically, order the
# rows by year and renumber the index from 0
subsampled_df_sus = (
    subsampled_df_sus
    .assign(year=subsampled_df_sus['year'].astype(int))
    .sort_values(by='year', ascending=True)
    .reset_index(drop=True)
)

In [None]:
# Ridge Line Plot for Suspicious Users: one density curve of
# 'avg_comments_per_user' per year, stacked vertically

# Ensure Seaborn has a clean theme (transparent axes so the rows can overlap)
sns.set_theme(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})

# Plot from a copy whose 'year' is a string, so that it is treated as
# categorical. `subsampled_df_sus` itself keeps its integer years, which the
# 3D plot cell further down needs for its numeric computations.
ridge_df_sus = subsampled_df_sus.assign(
    year=subsampled_df_sus['year'].astype(int).astype(str)
)
year_labels_sus = ridge_df_sus['year'].unique()

# Range of 'avg_comments_per_user' shown and used to clip the density estimate
x_limits_sus = (0, 50)

# Generate a color palette for years
pal = sns.color_palette(palette='viridis', n_colors=len(year_labels_sus))

# Create the FacetGrid for ridgeline
g = sns.FacetGrid(
    ridge_df_sus,
    row='year',
    hue='year',
    aspect=15,  # Stretch plots horizontally
    height=0.5,  # Adjust height of each row
    palette=pal,
)

# Add density plots (kde)
g.map(
    sns.kdeplot,
    'avg_comments_per_user',
    bw_adjust=0.2,
    clip=x_limits_sus,  # Evaluate the density on 0-50 only
    fill=True,
    alpha=1,
    linewidth=1.5,
)

# Add a white contour line around each density plot
g.map(
    sns.kdeplot,
    'avg_comments_per_user',
    bw_adjust=0.2,
    clip=x_limits_sus,
    color="w",
    lw=2,
)

# Add a horizontal line at y=0 for each plot
g.map(plt.axhline, y=0, lw=2, clip_on=False)

for ax, year_label in zip(g.axes.flat, year_labels_sus):
    # Year label, drawn in the colour of the row's baseline (the last line added)
    ax.text(
        -0.5, 0.02,  # Adjust the position of the year label
        year_label,
        fontweight='bold',
        fontsize=12,
        color=ax.lines[-1].get_color(),
    )
    ax.set_ylabel("")  # Remove the y-axis labels for density
    ax.set_xlim(*x_limits_sus)  # Set x-axis range to 0 - 50

# Adjust subplot overlap
g.fig.subplots_adjust(hspace=-0.5)

# Remove unnecessary axes details
g.set_titles("")
g.set(yticks=[])
g.despine(bottom=True, left=True)

# Set x-axis label and figure title
plt.xlabel("Average Comments per User", fontweight='bold', fontsize=12)
g.fig.suptitle('Distribution of Avg Comments per User Across Years',
               ha='right', fontsize=16, fontweight='bold')

# Save before showing, so the export does not depend on the figure still being
# open. Only SVG is written: Matplotlib has no HTML output format.
g.savefig("./image_aurel/ridge_line_sus.svg")

# Show the plot
plt.show()

#### - **Normal Users**

In [None]:
# Keep only the columns of the normal-user frame that the per-user, per-year
# statistics below are computed from
columns_for_user_stats = ["year", "author", "comments", "video_id"]
data_com_vid_year = type_1.select(columns_for_user_stats)

In [None]:
# Per (year, author) statistics for normal users
user_year_aggregations = [
    pl.col("comments").sum().alias("total_comments"),  # Total comments by user per year
    pl.col("video_id").n_unique().alias("distinct_videos_commented"),  # Unique videos commented on
]

# Comments per distinct video for each user and year (the column keeps the
# name 'avg_comments_per_user' that the plotting cells refer to)
comments_per_video = (
    pl.col("total_comments") / pl.col("distinct_videos_commented")
).alias("avg_comments_per_user")

grouped_data = (
    data_com_vid_year.group_by(["year", "author"])
    .agg(user_year_aggregations)
    .with_columns([comments_per_video])
)

In [None]:
# Subsample the normal users, because plotting all of them would be impractical

# Group data by year
grouped = grouped_data.group_by('year')

# Calculate the total subsample size (1/100th of the total rows)
total_subsample_size = len(grouped_data) // 100

# Sample each year proportionally to its size
subsampled_list = []
for year_key, group in grouped:
    # Convert with `to_pandas()` so that column names and dtypes are kept
    # (wrapping the polars frame in `pd.DataFrame(...)` would drop both)
    group = group.to_pandas()

    # Calculate the number of samples for this year (proportional to the
    # group's size, but at least one row per year)
    year_sample_size = max(1, len(group) * total_subsample_size // len(grouped_data))

    # Randomly sample the data for this year (fixed seed for reproducibility)
    subsampled_list.append(group.sample(n=year_sample_size, random_state=42))

# Combine the sampled data from all years
subsampled_df = pd.concat(subsampled_list)

# Columns expected by the following cells, in this order
column_names = ['year', 'author', 'total_comments', 'distinct_videos_commented', 'avg_comments_per_user']
subsampled_df = subsampled_df[column_names]

In [None]:
# Ridge Line Plot for Normal Users: one density curve of
# 'avg_comments_per_user' per year, stacked vertically

# Make 'year' an integer so it sorts numerically, sort by year and renumber
# the index from 0
subsampled_df = (
    subsampled_df.assign(year=subsampled_df['year'].astype(int))
    .sort_values(by='year', ascending=True)
    .reset_index(drop=True)
)

# Ensure Seaborn has a clean theme (transparent axes so the rows can overlap)
sns.set_theme(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})

# Plot from a copy whose 'year' is a string, so that it is treated as
# categorical. `subsampled_df` itself keeps its integer years, which the 3D
# plot cell further down needs for its numeric computations.
ridge_df = subsampled_df.assign(year=subsampled_df['year'].astype(str))
year_labels = ridge_df['year'].unique()

# Generate a color palette for years
pal = sns.color_palette(palette='viridis', n_colors=len(year_labels))

# Create the FacetGrid for ridgeline
g = sns.FacetGrid(
    ridge_df,
    row='year',
    hue='year',
    aspect=15,  # Stretch plots horizontally
    height=0.5,  # Adjust height of each row
    palette=pal,
)

# Add density plots (kde)
g.map(
    sns.kdeplot,
    'avg_comments_per_user',
    bw_adjust=1,  # Bandwidth adjustment
    clip_on=False,
    fill=True,
    alpha=1,
    linewidth=1.5,
)

# Add a white contour line around each density plot
g.map(
    sns.kdeplot,
    'avg_comments_per_user',
    bw_adjust=1,
    clip_on=False,
    color="w",
    lw=2,
)

# Add a horizontal line at y=0 for each plot
g.map(plt.axhline, y=0, lw=2, clip_on=False)

for ax, year_label in zip(g.axes.flat, year_labels):
    # Year label, drawn in the colour of the row's baseline (the last line added)
    ax.text(
        -0.5, 0.02,  # Adjust the position of the year label
        year_label,
        fontweight='bold',
        fontsize=12,
        color=ax.lines[-1].get_color(),
    )
    ax.set_ylabel("")  # Remove the y-axis labels for density

# Adjust subplot overlap
g.fig.subplots_adjust(hspace=-0.5)

# Remove unnecessary axes details
g.set_titles("")
g.set(yticks=[])
g.despine(bottom=True, left=True)

# Set x-axis label and figure title
plt.xlabel("Average Comments per User", fontweight='bold', fontsize=12)
g.fig.suptitle('Distribution of Avg Comments per User Across Years',
               ha='right', fontsize=16, fontweight='bold')

# Save before showing, so the export does not depend on the figure still being open
g.savefig("./image_aurel/ridge_line_normal.svg")

# Show the plot
plt.show()

#### - **3D Plot**

In [None]:

# 3D scatter of normal vs. suspicious users (year, comments per video,
# distinct videos) with the separating plane of a linear SVM

feature_columns = ["year", "avg_comments_per_user", "distinct_videos_commented"]

# Work on copies whose feature columns are floats: earlier cells may have left
# 'year' as strings (or the columns as generic objects), which np.linspace and
# gaussian_kde cannot handle
normal_3d = subsampled_df.assign(
    **{column: subsampled_df[column].astype(float) for column in feature_columns}
)
suspicious_3d = subsampled_df_sus.assign(
    **{column: subsampled_df_sus[column].astype(float) for column in feature_columns}
)

# Combine both datasets for SVM training
combined_df = pd.concat([
    normal_3d.assign(label=0),  # Normal users
    suspicious_3d.assign(label=1)  # Suspicious users
])

# Features and labels
X = combined_df[feature_columns].values
y = combined_df["label"].values

# Train the SVM
svm = SVC(kernel="linear", C=1)
svm.fit(X, y)

# Extract hyperplane parameters
w = svm.coef_[0]  # Weights
b = svm.intercept_[0]  # Intercept

# Create mesh grid for the hyperplane over the observed (year, avg comments)
# ranges; separate names so the label vector `y` is not overwritten
x_range = np.linspace(combined_df["year"].min(), combined_df["year"].max(), 30)
y_range = np.linspace(combined_df["avg_comments_per_user"].min(), combined_df["avg_comments_per_user"].max(), 30)
plane_x, plane_y = np.meshgrid(x_range, y_range)
# Solve w0*x + w1*y + w2*z + b = 0 for z
# NOTE(review): assumes w[2] != 0; a zero weight would make the plane vertical
plane_z = (-w[0] * plane_x - w[1] * plane_y - b) / w[2]

# Density of each normal-user point within its own dataset
xyz1 = normal_3d[feature_columns].values.T
kde1 = gaussian_kde(xyz1)(xyz1)
subsampled_df["density"] = kde1

# Density of each suspicious-user point within its own dataset
xyz2 = suspicious_3d[feature_columns].values.T
kde2 = gaussian_kde(xyz2)(xyz2)
subsampled_df_sus["density"] = kde2

# Create the 3D Scatter Plot
fig = go.Figure()

# Add Dataset 1 (Normal Users) with density-based coloring
fig.add_trace(go.Scatter3d(
    x=normal_3d["year"],
    y=normal_3d["avg_comments_per_user"],
    z=normal_3d["distinct_videos_commented"],
    mode='markers',
    marker=dict(
        size=3,
        color=kde1,  # Color based on density
        colorscale="Viridis",
        opacity=0.6
    ),
    name="Normal Users"
))

# Add Dataset 2 (Suspicious Users) with density-based coloring
fig.add_trace(go.Scatter3d(
    x=suspicious_3d["year"],
    y=suspicious_3d["avg_comments_per_user"],
    z=suspicious_3d["distinct_videos_commented"],
    mode='markers',
    marker=dict(
        size=3,
        color=kde2,  # Color based on density
        colorscale="Jet",
        opacity=0.8
    ),
    name="Suspicious Users"
))

# Add the SVM hyperplane
fig.add_trace(go.Surface(
    x=plane_x, y=plane_y, z=plane_z,
    colorscale=[[0, 'lightgrey'], [1, 'lightgrey']],  # Single light-grey color
    opacity=0.5,
    showscale=False,  # Remove colorbar for hyperplane
    name="SVM Hyperplane"
))

# Add a box with the hyperplane metrics
fig.add_annotation(
    text=f"<b>SVM Hyperplane Metrics</b><br>"
         f"Equation: {w[0]:.2f}*x1 + {w[1]:.2f}*x2 + {w[2]:.2f}*x3 + {b:.2f} = 0",
    showarrow=False,
    xref="paper", yref="paper",
    x=0.05, y=0.95,  # Position: top-left corner
    bordercolor="black", borderwidth=1,
    bgcolor="white", font=dict(size=12)
)

# Update layout
fig.update_layout(
    title="",
    scene=dict(
        xaxis=dict(title="Year", tickvals=list(range(2005, 2020)), autorange="reversed"),
        yaxis=dict(title="Average Comments/User", range=[0, 50]),
        zaxis=dict(title="Distinct Videos Commented", range=[0, 1000])
    ),
    margin=dict(l=0, r=0, b=0, t=40)
)

# Show the plot
fig.show()

# Save the plot as an HTML file
fig.write_html("./image_aurel/3D_hyperplane_density.html")


___

## <p style="text-align: center"> <span style="text-decoration: underline"> **Type-2 Bot Analysis** </span> </p>

#### - **Data Preprocessing and Loading**

In [None]:
# One-off preprocessing step: merge the partial Parquet files of each user
# group into a single file per group. Switched off by default; set `combine`
# to True to (re)build the merged files.
combine = False

if combine:

    datasets = ['normal', 'suspicious']

    for dataset in datasets:
        print(f"reading partial files '{dataset}_users_*.parquet'...")

        # List all partial Parquet files; sorted so that the row order of the
        # merged file does not depend on the file-system listing order
        parquet_files = sorted(glob.glob(f'./data/data_type2/{dataset}_users_*.parquet'))

        # Fail with an explicit message instead of letting pl.concat choke on
        # an empty list
        if not parquet_files:
            raise FileNotFoundError(
                f"no partial files matching '{dataset}_users_*.parquet' in ./data/data_type2/"
            )

        # Read and concatenate all Parquet files
        combined = pl.concat([pl.read_parquet(file) for file in parquet_files])
        combined.write_parquet(f'./data/data_type2/combi_{dataset}_dataset2.parquet')

In [None]:
# Load the merged Type-2 datasets: normal users first, then suspicious users
df_normal_2, df_sus_2 = (
    pl.read_parquet(parquet_path)
    for parquet_path in (
        './data/data_type2/combi_normal_dataset2.parquet',
        './data/data_type2/combi_suspicious_dataset2.parquet',
    )
)

#### <p style="text-align: center; margin-top: 0.5cm"> <span style="text-decoration: underline"> **Do Bots Comment On More Videos ?** </span> </p>

In [None]:
# Build the pandas frame for the violin plot from the Type-2 polars frames.
# The frames are already in memory, so they are converted with `to_pandas()`
# (`pd.read_parquet` expects a file path, not a DataFrame).

# Normal users, tagged for the plot
df_norm = df_normal_2.to_pandas().assign(category='normal')

# Suspicious users with at least 10 videos commented, tagged for the plot.
# Stored under its own name so that the Type-1 `df_sus` frame, which the bot
# ranking cells below still use, is not overwritten.
df_sus_2_pd = df_sus_2.to_pandas()
df_sus_2_pd = df_sus_2_pd[df_sus_2_pd["videos_commented"] >= 10].assign(category='suspicious')

# Concatenating for the plot: every 5th normal user, and the suspicious users
# with fewer than 30 videos commented
df = pd.concat([df_norm[::5], df_sus_2_pd[df_sus_2_pd["videos_commented"] < 30]])

In [None]:
# Violin plot of the number of videos commented, normal vs. suspicious users
sns.set_theme(style="darkgrid")
violin_options = dict(
    x="category",
    y="videos_commented",
    hue="category",
    palette="Pastel1",
    split=True,
)
sns.violinplot(data=df, **violin_options)
plt.show()

___

#### <p style="text-align: center; margin-top: 0.5cm"> <span style="text-decoration: underline"> **BOT RANKING** </span> </p>


In [None]:
# Compute lifetime of suspicious users

# Earliest and latest upload date seen for each author
upload_date_bounds = [
    pl.col("upload_date").min().alias("min"),
    pl.col("upload_date").max().alias("max"),
]
lifetime_df = df_sus.group_by("author").agg(upload_date_bounds)

# Span between the two dates
date_diff = (lifetime_df['max'] - lifetime_df['min']).alias("date_diff")

# One extra day is added to the span, so an author whose earliest and latest
# dates coincide gets a lifetime of one day
one_day = pl.duration(days=1).alias("one_day")

lifetime_expr = (date_diff + one_day).alias("lifetime")
lifetime_df = lifetime_df.with_columns([lifetime_expr])

In [None]:
# Largest value of the 'lifetime' column over all authors
longuest_survival_type1=lifetime_df["lifetime"].max()
# Row(s) of lifetime_df whose lifetime equals that maximum (several rows if tied)
longest_survival_row_type1 = lifetime_df.filter(pl.col("lifetime") == pl.col("lifetime").max())