In [20]:
import os
import pandas as pd
import pickle
import io
import zstandard as zstd

def load_zst_csv(file_path):
    """
    Decompress a .csv.zst file and parse it into a pandas DataFrame.

    The zstd stream reader is wrapped in a UTF-8 text layer and handed
    directly to ``pd.read_csv``, so the data is decompressed and decoded
    incrementally while pandas parses it. Neither the full decompressed byte
    string nor a fully decoded text copy is materialised in memory; only the
    resulting DataFrame is.

    Parameters
    ----------
    file_path : str or os.PathLike
        Path to a zstd-compressed, UTF-8 encoded CSV file.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents, using ``pd.read_csv`` defaults.
    """
    dctx = zstd.ZstdDecompressor()
    with open(file_path, 'rb') as fh, dctx.stream_reader(fh) as reader:
        # newline='' disables newline translation, so pandas receives exactly
        # the characters it would have seen from the fully decoded text.
        text_stream = io.TextIOWrapper(reader, encoding='utf-8', newline='')
        # Parse inside the `with` block: the reader must still be open while
        # pandas pulls data from it.
        return pd.read_csv(text_stream)

def get_user_subreddit_counts(event_folder):
    """
    Count, for every author, the number of distinct subreddits they appear in.

    Reads ``submissions.csv.zst`` and ``comments.csv.zst`` from ``event_folder``,
    keeps only the ``author`` and ``subreddit`` columns (a missing column
    raises ``KeyError``), stacks submissions on top of comments, discards rows
    with a missing author or subreddit as well as rows authored by
    ``[deleted]``, and aggregates per author.

    Returns a DataFrame with the columns ``author`` and ``num_subreddits``.
    """
    wanted_cols = ['author', 'subreddit']

    # Load both sources first (submissions, then comments), then narrow them.
    raw_frames = [
        load_zst_csv(os.path.join(event_folder, file_name))
        for file_name in ("submissions.csv.zst", "comments.csv.zst")
    ]
    activity = pd.concat(
        [frame[wanted_cols] for frame in raw_frames],
        ignore_index=True,
    )

    # Remove incomplete rows, then the placeholder author.
    activity = activity.dropna(subset=wanted_cols)
    activity = activity[activity['author'].ne('[deleted]')]

    # One row per author holding the number of unique subreddits.
    return (
        activity.groupby('author')['subreddit']
        .nunique()
        .rename('num_subreddits')
        .reset_index()
    )

def label_bridging_nodes(user_sub_count):
    """
    Label each user as a bridging or non-bridging node.

    A user is "Bridging" when ``num_subreddits`` is greater than one and
    "Non-Bridging" otherwise (string labels rather than numeric flags).

    Parameters
    ----------
    user_sub_count : pandas.DataFrame
        Must contain a ``num_subreddits`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with all original columns plus a ``bridging`` column.
        The input frame is left unmodified, so re-running a cell that calls
        this function never sees state leaked from a previous call.
    """
    # Vectorised comparison instead of a per-row Python lambda.
    is_bridging = user_sub_count['num_subreddits'] > 1
    labels = is_bridging.map({True: "Bridging", False: "Non-Bridging"})
    # assign() returns a copy, so the caller's DataFrame is not mutated.
    return user_sub_count.assign(bridging=labels)

def merge_with_CRC(event_folder, user_labels):
    """
    Attach pre-computed CRC values to the per-user bridging labels.

    Unpickles ``CRC_values.pkl`` from ``event_folder`` (a mapping whose keys
    become the ``author`` column and whose values become the ``CRC`` column)
    and left-joins it onto ``user_labels`` by ``author``. Every row of
    ``user_labels`` is kept; authors absent from the mapping get a missing CRC.

    NOTE(review): ``pickle.load`` can execute arbitrary code, so this should
    only be pointed at CRC files from a trusted source.
    """
    pickle_location = os.path.join(event_folder, "CRC_values.pkl")
    with open(pickle_location, "rb") as pkl_file:
        crc_by_author = pickle.load(pkl_file)

    # Two parallel columns built from the mapping's keys and values.
    crc_frame = pd.DataFrame({
        'author': list(crc_by_author.keys()),
        'CRC': list(crc_by_author.values()),
    })

    return pd.merge(user_labels, crc_frame, how='left', on='author')

def print_top_crc_bridging_latex(merged, top_perc=0.25):
    """
    Print CRC summary statistics by bridging status for the top users by CRC.

    Users are ranked by CRC in descending order and the top ``top_perc``
    fraction is kept. Rows whose CRC is missing (NaN) cannot be ranked, so
    they are excluded before the fraction is applied; otherwise they would
    inflate the population size and the "top" slice would be too large.

    The function prints a LaTeX table of ``describe()`` statistics per
    bridging group, followed by the mean CRC of each group and the
    difference (Non-Bridging minus Bridging). A group that is absent from the
    selection is reported with a mean of ``nan``.

    Parameters
    ----------
    merged : pandas.DataFrame
        Must contain the columns ``CRC`` and ``bridging``.
    top_perc : float, default 0.25
        Fraction of ranked users to keep, between 0 and 1 inclusive
        (1 keeps every user that has a CRC value).

    Raises
    ------
    ValueError
        If ``top_perc`` lies outside ``[0, 1]``.
    """
    if not 0 <= top_perc <= 1:
        raise ValueError(f"top_perc must be between 0 and 1, got {top_perc!r}")

    # Rank only users that actually have a CRC value.
    ranked = merged.dropna(subset=['CRC']).sort_values(by="CRC", ascending=False)
    top_n = int(len(ranked) * top_perc)
    top_users = ranked.head(top_n)

    # Descriptive statistics of CRC per bridging status, rendered as LaTeX.
    crc_by_status = top_users.groupby('bridging')['CRC']
    latex_table = crc_by_status.describe().to_latex(float_format="%.2f")

    print("LaTeX table for CRC by Bridging Status (Top {:.0f}% Users):".format(top_perc*100))
    print(latex_table)

    # Group means and their difference; a missing group yields NaN.
    group_means = crc_by_status.mean()
    mean_bridging = group_means.get("Bridging", float('nan'))
    mean_nonbridging = group_means.get("Non-Bridging", float('nan'))
    diff = mean_nonbridging - mean_bridging
    print("\nSummary of Mean CRC Values for Top {:.0f}% Users:".format(top_perc*100))
    print(f"Mean CRC (Bridging): {mean_bridging:.2f}")
    print(f"Mean CRC (Non-Bridging): {mean_nonbridging:.2f}")
    print(f"Difference (Non-Bridging - Bridging): {diff:.2f}")

def main(event_folder, top_perc=0.25):
    """
    Run the bridging-versus-CRC analysis for a single event folder.

    Steps: compute per-user distinct-subreddit counts, label users as
    Bridging / Non-Bridging, merge in the pre-computed CRC values, write the
    merged table to ``bridging_CRC.csv`` inside ``event_folder``, and print the
    LaTeX summary for the top ``top_perc`` fraction of users.

    Returns the merged DataFrame.
    """
    # Distinct-subreddit counts per user, labelled by bridging status.
    labelled_users = label_bridging_nodes(get_user_subreddit_counts(event_folder))

    # Join the labels with the pre-computed CRC values.
    merged = merge_with_CRC(event_folder, labelled_users)

    # Persist the merged table next to the input data.
    csv_destination = os.path.join(event_folder, "bridging_CRC.csv")
    merged.to_csv(csv_destination, index=False)
    print(f"Merged data saved to: {csv_destination}")

    # Report CRC summary statistics by bridging status for the top users.
    print_top_crc_bridging_latex(merged, top_perc=top_perc)

    return merged

In [25]:
# Run the full analysis on the data/2008_elections folder.
# top_perc=1 keeps 100% of the users, i.e. no top-CRC cut-off is applied.
event_folder = "data/2008_elections"
merged_data = main(event_folder, top_perc=1)

Merged data saved to: data/2008_elections/bridging_CRC.csv
LaTeX table for CRC by Bridging Status (Top 100% Users):
\begin{tabular}{lrrrrrrrr}
\toprule
 & count & mean & std & min & 25% & 50% & 75% & max \\
bridging &  &  &  &  &  &  &  &  \\
\midrule
Bridging & 1749.00 & 1.20 & 0.13 & 1.00 & 1.09 & 1.20 & 1.27 & 1.77 \\
Non-Bridging & 11435.00 & 1.12 & 0.11 & 1.00 & 1.02 & 1.10 & 1.21 & 2.00 \\
\bottomrule
\end{tabular}


Summary of Mean CRC Values for Top 100% Users:
Mean CRC (Bridging): 1.20
Mean CRC (Non-Bridging): 1.12
Difference (Non-Bridging - Bridging): -0.07


In [26]:
# Run the full analysis on the data/2011_wallstreet folder.
# top_perc=1 keeps 100% of the users, i.e. no top-CRC cut-off is applied.
event_folder = "data/2011_wallstreet"
merged_data = main(event_folder, top_perc=1)

Merged data saved to: data/2011_wallstreet/bridging_CRC.csv
LaTeX table for CRC by Bridging Status (Top 100% Users):
\begin{tabular}{lrrrrrrrr}
\toprule
 & count & mean & std & min & 25% & 50% & 75% & max \\
bridging &  &  &  &  &  &  &  &  \\
\midrule
Bridging & 459.00 & 1.18 & 0.11 & 1.01 & 1.10 & 1.19 & 1.25 & 1.52 \\
Non-Bridging & 31168.00 & 1.12 & 0.10 & 1.00 & 1.02 & 1.10 & 1.21 & 1.68 \\
\bottomrule
\end{tabular}


Summary of Mean CRC Values for Top 100% Users:
Mean CRC (Bridging): 1.18
Mean CRC (Non-Bridging): 1.12
Difference (Non-Bridging - Bridging): -0.06


In [27]:
# Run the full analysis on the data/2016_elections folder.
# top_perc=1 keeps 100% of the users, i.e. no top-CRC cut-off is applied.
event_folder = "data/2016_elections"
merged_data = main(event_folder, top_perc=1)

Merged data saved to: data/2016_elections/bridging_CRC.csv
LaTeX table for CRC by Bridging Status (Top 100% Users):
\begin{tabular}{lrrrrrrrr}
\toprule
 & count & mean & std & min & 25% & 50% & 75% & max \\
bridging &  &  &  &  &  &  &  &  \\
\midrule
Bridging & 37849.00 & 1.21 & 0.13 & 1.00 & 1.10 & 1.23 & 1.30 & 2.02 \\
Non-Bridging & 257980.00 & 1.14 & 0.12 & 1.00 & 1.02 & 1.11 & 1.25 & 1.66 \\
\bottomrule
\end{tabular}


Summary of Mean CRC Values for Top 100% Users:
Mean CRC (Bridging): 1.21
Mean CRC (Non-Bridging): 1.14
Difference (Non-Bridging - Bridging): -0.07


In [28]:
# Run the full analysis on the data/2017_rally folder.
# top_perc=1 keeps 100% of the users, i.e. no top-CRC cut-off is applied.
event_folder = "data/2017_rally"
merged_data = main(event_folder, top_perc=1)

Merged data saved to: data/2017_rally/bridging_CRC.csv
LaTeX table for CRC by Bridging Status (Top 100% Users):
\begin{tabular}{lrrrrrrrr}
\toprule
 & count & mean & std & min & 25% & 50% & 75% & max \\
bridging &  &  &  &  &  &  &  &  \\
\midrule
Bridging & 3984.00 & 1.21 & 0.15 & 1.00 & 1.07 & 1.21 & 1.32 & 1.65 \\
Non-Bridging & 100141.00 & 1.17 & 0.13 & 1.00 & 1.04 & 1.17 & 1.30 & 2.50 \\
\bottomrule
\end{tabular}


Summary of Mean CRC Values for Top 100% Users:
Mean CRC (Bridging): 1.21
Mean CRC (Non-Bridging): 1.17
Difference (Non-Bridging - Bridging): -0.03


In [29]:
# Run the full analysis on the data/2021_riot folder.
# top_perc=1 keeps 100% of the users, i.e. no top-CRC cut-off is applied.
event_folder = "data/2021_riot"
merged_data = main(event_folder, top_perc=1)

Merged data saved to: data/2021_riot/bridging_CRC.csv
LaTeX table for CRC by Bridging Status (Top 100% Users):
\begin{tabular}{lrrrrrrrr}
\toprule
 & count & mean & std & min & 25% & 50% & 75% & max \\
bridging &  &  &  &  &  &  &  &  \\
\midrule
Bridging & 6958.00 & 1.16 & 0.12 & 1.00 & 1.05 & 1.15 & 1.27 & 1.55 \\
Non-Bridging & 247522.00 & 1.15 & 0.11 & 1.00 & 1.03 & 1.14 & 1.26 & 4.21 \\
\bottomrule
\end{tabular}


Summary of Mean CRC Values for Top 100% Users:
Mean CRC (Bridging): 1.16
Mean CRC (Non-Bridging): 1.15
Difference (Non-Bridging - Bridging): -0.01
