In [7]:
import pandas as pd
import numpy as np
import networkx as nx
import multinetx as mnet
import re
from datetime import datetime, timedelta
import pickle
import random
import os
import os.path
import torch
import subprocess
from tqdm import tqdm
import math
import glob
from tdigest import TDigest

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Create every scratch and output directory up front; exist_ok makes re-runs harmless.
for _directory in (
    'cslasl-pre',
    'cslasl-pre/edges',
    'network',
    'network/uil',
    'network/tdl',
    'network/csl',
    'network/asl',
):
    os.makedirs(_directory, exist_ok=True)

In [8]:
def compute_network_metrics(authors_path, edges_path):
    """Print node count, edge count, mean edge weight and directed density.

    Args:
        authors_path: Text file with one node per line; blank lines are ignored.
        edges_path: Text file of ``source;target;weight`` lines. Lines that do
            not split into exactly three ``;``-separated fields, or whose third
            field does not parse as a float, are skipped.

    Returns:
        None. The four metrics are written to stdout.
    """
    # Nodes: every non-blank line of the authors file.
    with open(authors_path, "r", encoding="utf-8") as nodes_in:
        num_nodes = sum(1 for row in nodes_in if row.strip())

    # Edges: count well-formed lines and accumulate their weights.
    num_edges, weight_sum = 0, 0.0
    with open(edges_path, "r", encoding="utf-8") as edges_in:
        for row in edges_in:
            fields = row.strip().split(";")
            if len(fields) != 3:
                continue
            try:
                value = float(fields[2])
            except ValueError:
                continue
            num_edges += 1
            weight_sum += value

    avg_weight = 0.0 if num_edges <= 0 else weight_sum / num_edges

    # Directed-graph density: m / (n * (n - 1)), defined as 0 when n <= 1.
    if num_nodes > 1:
        density = num_edges / (num_nodes * (num_nodes - 1))
    else:
        density = 0

    print(f"Number of nodes: {num_nodes}")
    print(f"Number of edges: {num_edges}")
    print(f"Average edge weight: {avg_weight:.4f}")
    print(f"Density: {density:.5f}")


def custom_round(num):
    """Round ``num`` to three decimals, then force the third decimal digit to 9.

    Example: 0.4567 -> "0.457" -> 0.459.
    """
    three_places = format(round(num, 3), ".3f")
    # Swap the final character (the third decimal) for '9' and parse it back.
    return float(three_places[:-1] + '9')

In [9]:
def load_submissions(submissions_file):
    """Read a zstd-compressed submissions CSV into a DataFrame."""
    return pd.read_csv(submissions_file, compression="zstd")


def load_comments(comments_file):
    """Read a zstd-compressed comments CSV into a DataFrame."""
    return pd.read_csv(comments_file, compression="zstd")


# Load both dumps through the helpers so they are read the same (explicit zstd) way.
submissions = load_submissions('submissions.csv.zst')
# Missing self-texts are read as NaN; replace them with "" so later string handling is safe.
submissions['selftext'] = submissions['selftext'].fillna("")
comments = load_comments('comments.csv.zst')

In [10]:
# Node list: every distinct author seen in submissions or comments.
# NOTE(review): pickle.load runs whatever the file contains; only load an authors.pkl you created.
if os.path.isfile("cslasl-pre/authors.pkl"):
    with open("cslasl-pre/authors.pkl", "rb") as f:
        authors = pickle.load(f)
else:
    # Stringify first so missing authors (NaN) collapse into the single "nan"
    # token that is written to authors.txt anyway, then sort: set iteration
    # order for strings changes between interpreter runs, and the embedding /
    # sentiment arrays built later are index-aligned with this list.
    all_authors = list(submissions['author']) + list(comments['author'])
    authors = sorted({str(name) for name in all_authors})

with open("network/authors.txt", "w", encoding="utf-8") as fout:
    for author in authors:
        fout.write(f"{author}\n")

print(f"Number of nodes: {len(authors)}")

Number of nodes: 13184


In [11]:
import os
import subprocess
import pandas as pd
from tqdm import tqdm

# Scratch files (raw and sorted edge dumps) live under cslasl-pre/edges;
# the aggregated per-layer edge lists are written under network/<layer>/.
uil_temp_file, uil_sorted_file, uil_output_file = (
    "cslasl-pre/edges/uil_temp_edges.txt",
    "cslasl-pre/edges/uil_temp_edges_sorted.txt",
    "network/uil/edges.txt",
)
tdl_temp_file, tdl_sorted_file, tdl_output_file = (
    "cslasl-pre/edges/tdl_temp_edges.txt",
    "cslasl-pre/edges/tdl_temp_edges_sorted.txt",
    "network/tdl/edges.txt",
)

# Ensure the output directories exist
for _edges_path in (uil_output_file, tdl_output_file):
    os.makedirs(os.path.dirname(_edges_path), exist_ok=True)

def edge_exists_in_file(file_path, edge):
    """Return True if ``file_path`` already contains a line for ``edge``.

    The file is streamed line by line, so it is never loaded into memory.

    Args:
        file_path: Path of a ``;``-separated edge file; a missing file counts
            as "edge not present".
        edge: Edge key such as ``"author;reply_to"``.

    Returns:
        True when some line is exactly ``edge`` or starts with ``edge`` followed
        by the ``;`` separator; False otherwise.
    """
    if not os.path.exists(file_path):
        return False
    # Require the separator after the key: a bare startswith(edge) would let
    # "alice;bob" match the unrelated line "alice;bobby;3".
    key_with_separator = edge + ";"
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if line.startswith(key_with_separator) or line.rstrip("\r\n") == edge:
                return True
    return False

# Process comments and generate only new edges.
def _first_author_by_id(frame):
    """Map every ``id`` in ``frame`` to the author of its first occurrence."""
    lookup = {}
    for row_id, row_author in zip(frame['id'], frame['author']):
        lookup.setdefault(row_id, row_author)
    return lookup


# Build the id -> author lookups once; scanning both DataFrames for every
# comment makes the loop quadratic in the number of rows.
submission_author_by_id = _first_author_by_id(submissions)
comment_author_by_id = _first_author_by_id(comments)

with open(uil_temp_file, "w", encoding="utf-8") as fout_uil, \
     open(tdl_temp_file, "w", encoding="utf-8") as fout_tdl:

    # Wrapping the iterator keeps the progress bar accurate even for skipped comments.
    for comment in tqdm(comments.itertuples(), total=len(comments),
                        desc="Processing comments for UIL and TDL", unit="comment"):
        author = comment.author
        try:
            parent = comment.parent_id.split("_")[1]  # Extract parent ID
        except (AttributeError, IndexError):
            # parent_id is missing (not a string) or has no "<prefix>_<id>" form.
            continue

        # Determine the reply-to author: check submissions first, then comments.
        if parent in submission_author_by_id:
            reply_to = submission_author_by_id[parent]
        else:
            reply_to = comment_author_by_id.get(parent)

        if reply_to:
            edge_str = f"{author};{reply_to}"
            edge_tdl_str = f"{author};{reply_to};{comment.created_utc}"

            # Write only new UIL edges
            if not edge_exists_in_file(uil_output_file, edge_str):
                fout_uil.write(f"{edge_str}\n")

            # Write only new TDL edges with timestamp
            if not edge_exists_in_file(tdl_output_file, edge_str):
                fout_tdl.write(f"{edge_tdl_str}\n")

# --------------------------
# Process UIL Temporary File
# --------------------------
# Sort so identical "author;reply_to" lines become adjacent.
# check=True raises if `sort` fails instead of silently continuing with a
# missing or stale sorted file; LC_ALL=C gives byte-wise ordering that does
# not depend on the machine's locale.
subprocess.run(
    ["sort", uil_temp_file, "-o", uil_sorted_file],
    check=True,
    env={**os.environ, "LC_ALL": "C"},
)

# Aggregate sorted UIL edges: count occurrences (i.e., weight).
with open(uil_sorted_file, "r", encoding="utf-8") as fin, \
     open(uil_output_file, "a", encoding="utf-8") as fout:  # Append mode

    current_edge = None
    count = 0
    for line in fin:
        edge = line.strip()  # Format: "author;reply_to"
        if current_edge is None:
            current_edge = edge
            count = 1
        elif edge == current_edge:
            count += 1
        else:
            # The run of identical lines ended: emit it and start the next one.
            fout.write(f"{current_edge};{count}\n")
            current_edge = edge
            count = 1
    # Emit the final run (skipped when the sorted file was empty).
    if current_edge is not None:
        fout.write(f"{current_edge};{count}\n")

# --------------------------
# Process TDL Temporary File
# --------------------------
# Sort by author, then reply_to, then timestamp so each (author, reply_to)
# pair forms one contiguous run. LC_ALL=C forces byte-wise key comparison, so
# two different names can never collate as equal and interleave their runs;
# check=True raises if `sort` fails.
subprocess.run(
    ["sort", "-t", ";", "-k1,1", "-k2,2", "-k3,3", tdl_temp_file, "-o", tdl_sorted_file],
    check=True,
    env={**os.environ, "LC_ALL": "C"},
)

DELTA_T = 3600  # 1 hour


def compute_sliding_window_max(timestamps, window=DELTA_T):
    """Return the largest number of timestamps that fit inside one window.

    Args:
        timestamps: Sequence of numbers, expected in ascending order (the
            function does not sort or verify this).
        window: Maximum allowed difference between the last and the first
            timestamp of a group; a difference equal to ``window`` still counts.

    Returns:
        The size of the largest such group, or 0 for an empty input.
    """
    if not timestamps:
        return 0
    best = 0
    left = 0
    for right, moment in enumerate(timestamps):
        # Shrink from the left until the span [left, right] fits in the window.
        while moment - timestamps[left] > window:
            left += 1
        best = max(best, right - left + 1)
    return best

# Aggregate sorted TDL edges: one output line per consecutive (author, reply_to)
# run, weighted by the busiest DELTA_T-wide window among that run's timestamps.
with open(tdl_sorted_file, "r", encoding="utf-8") as fin, \
     open(tdl_output_file, "a", encoding="utf-8") as fout:  # Append mode

    current_edge = None  # tuple: (author, reply_to)
    timestamps = []

    for line in fin:
        parts = line.strip().split(";")
        if len(parts) != 3:
            continue
        author, reply_to, ts_str = parts
        try:
            ts = float(ts_str)
        except ValueError:
            continue
        edge = (author, reply_to)
        if edge != current_edge:
            # A new pair starts: flush the previous run first (if there was one).
            if current_edge is not None:
                burst_weight = compute_sliding_window_max(sorted(timestamps), window=DELTA_T)
                source, target = current_edge
                fout.write(f"{source};{target};{burst_weight}\n")
            current_edge, timestamps = edge, []
        timestamps.append(ts)

    # Flush the last run; nothing is written when no valid line was read.
    if current_edge is not None:
        burst_weight = compute_sliding_window_max(sorted(timestamps), window=DELTA_T)
        source, target = current_edge
        fout.write(f"{source};{target};{burst_weight}\n")

# Clean up temporary files
for _scratch_file in (uil_temp_file, uil_sorted_file, tdl_temp_file, tdl_sorted_file):
    os.remove(_scratch_file)

# Summary metrics for the two interaction layers.
print("UIL")
compute_network_metrics("network/authors.txt", "network/uil/edges.txt")
print("\n")
print("TDL")
compute_network_metrics("network/authors.txt", "network/tdl/edges.txt")

Processing comments for UIL and TDL: 100%|████████████████████████████████████████████████████████████████████████████| 65511/65511 [05:09<00:00, 211.85comment/s]


UIL
Number of nodes: 13184
Number of edges: 56209
Average edge weight: 1.1655
Density: 0.00032


TDL
Number of nodes: 13184
Number of edges: 56209
Average edge weight: 1.0498
Density: 0.00032


In [None]:
# BUILD CSL
def clean(text):
    """Return ``text`` lower-cased, with only ASCII letters kept.

    Every character that is neither an ASCII letter nor whitespace becomes a
    space; whitespace runs are then collapsed to one space and the ends trimmed.
    """
    letters_only = re.compile(r'[^a-zA-Z\s]').sub(' ', text)
    single_spaced = re.compile(r'\s+').sub(' ', letters_only)
    return single_spaced.strip().lower()

# --------------------------
# AGGREGATE TEXTS PER AUTHOR
# --------------------------
def aggregate_texts(df, text_fields):
    """Join the given columns of each row into one space-separated string.

    Missing values in the selected columns are treated as empty strings.
    Returns a Series indexed like ``df``.
    """
    selected = df[text_fields].fillna("")
    return selected.agg(" ".join, axis=1)

submissions["full_text"] = aggregate_texts(submissions, ["title", "selftext"])
comments["body"] = comments["body"].fillna("")
submissions["selftext"] = submissions["selftext"].fillna("")

contents_dir = "cslasl-pre/contents_individual"
os.makedirs(contents_dir, exist_ok=True)

# The per-author files are written in append mode, so files left over from an
# earlier run of this cell would have their text duplicated. Remove them so the
# cell gives the same result every time it is run.
for _stale_name in os.listdir(contents_dir):
    _stale_path = os.path.join(contents_dir, _stale_name)
    if _stale_name.endswith(".txt") and os.path.isfile(_stale_path):
        os.remove(_stale_path)

# Final merged output file (one line per author, in order)
final_output_file = "cslasl-pre/contents.txt"

with open("network/authors.txt", "r", encoding="utf-8") as f:
    # Keep order; also create a set for fast membership check.
    authors_ordered = [line.strip() for line in f if line.strip()]
authors_set = set(authors_ordered)

def write_author_text(author, text):
    """Append ``text`` plus a newline to ``<contents_dir>/<author>.txt``."""
    # The author name is used verbatim as the file stem (no sanitising is done).
    destination = os.path.join(contents_dir, f"{author}.txt")
    with open(destination, "a", encoding="utf-8") as sink:
        sink.write(text + "\n")

# --- PROCESS SUBMISSIONS ---
# Use the pre-aggregated `full_text` column (title + selftext with missing
# values blanked) instead of formatting the raw fields: an f-string would turn
# a missing title into the literal word "nan" in the author's text.
for submission in submissions.itertuples():
    author = submission.author
    if author in authors_set:
        full_text = clean(submission.full_text)
        write_author_text(author, full_text)

# --- PROCESS COMMENTS ---
for comment in comments.itertuples():
    author = comment.author
    if author in authors_set:
        body = clean(comment.body)
        write_author_text(author, body)

# --- MERGE PER-AUTHOR FILES INTO A SINGLE OUTPUT ---
# One output line per entry of authors_ordered, in that order, so that line i
# of the merged file belongs to author i.
with open(final_output_file, "w", encoding="utf-8") as fout_final:
    for author in authors_ordered:
        filepath = os.path.join(contents_dir, f"{author}.txt")
        if not os.path.exists(filepath):
            # No text was collected for this author: keep the line slot, but empty.
            fout_final.write("\n")
            continue
        with open(filepath, "r", encoding="utf-8") as f:
            # Drop blank lines and join the rest with single spaces.
            pieces = [piece for piece in map(str.strip, f) if piece]
        fout_final.write(" ".join(pieces) + "\n")

# Read the merged per-author texts unconditionally: `contents` is indexed by
# author position further down the notebook, including when the embeddings
# below come from the cache.
with open(final_output_file, "r", encoding="utf-8") as f:
    contents = f.readlines()  # One line per author, in authors.txt order

# NOTE(review): pickle.load runs whatever the file contains; only load caches you created.
if os.path.isfile('cslasl-pre/csl_embeddings.pkl'):
    with open('cslasl-pre/csl_embeddings.pkl', 'rb') as f:
        embeddings = pickle.load(f)
else:
    model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2", trust_remote_code=True, device="cuda:1")
    # `normalize_embeddings` is the encode() keyword that L2-normalises the
    # vectors, which is what makes the later dot products cosine similarities.
    embeddings = model.encode(contents, normalize_embeddings=True)

    with open('cslasl-pre/csl_embeddings.pkl', 'wb') as f:
        pickle.dump(embeddings, f)

# Row i must describe author i; a cache built from a different author list
# would otherwise be used silently with the wrong names attached.
if len(embeddings) != len(contents):
    raise ValueError(
        f"csl_embeddings has {len(embeddings)} rows but contents.txt has "
        f"{len(contents)} lines; delete cslasl-pre/csl_embeddings.pkl and re-run."
    )

threshold = 0.4
edges_file = "cslasl-pre/edges/edges.txt"
os.makedirs(os.path.dirname(edges_file), exist_ok=True)

num_users = len(authors)

# --- STEP 1: Write edges to a text file (line by line) ---
embedding_matrix = np.asarray(embeddings)
with open(edges_file, "w", encoding="utf-8") as fout:
    for i in tqdm(range(num_users), total=num_users, desc="Processing authors", unit="author"):
        # One matrix-vector product yields the similarity of author i to every
        # later author (later only, to avoid duplicate pairs), replacing one
        # Python-level np.dot call per pair.
        sims = embedding_matrix[i + 1:num_users] @ embedding_matrix[i]
        for offset in np.flatnonzero(sims > threshold):
            # Write as "author1;author2;similarity"
            fout.write(f"{authors[i]};{authors[i + 1 + offset]};{sims[offset]}\n")

del embeddings, embedding_matrix

# Initialize a TDigest instance and feed it every similarity written above.
digest = TDigest()
num_similarities = 0

with open(edges_file, "r", encoding="utf-8") as fin:
    for line in fin:
        parts = line.strip().split(";")
        if len(parts) == 3:
            try:
                sim_val = float(parts[2])
            except ValueError:
                continue
            digest.update(sim_val)
            num_similarities += 1

# Always bind tau_c: the filtering step tests `tau_c is not None`, which would
# raise NameError if it were only assigned on the success path.
tau_c = None

# Compute the 90th percentile using the digest (only when it holds data).
tau_c_approx = digest.percentile(90) if num_similarities else None

if tau_c_approx is not None:
    tau_c = custom_round(tau_c_approx)  # Apply custom rounding.
    print("Estimated tau_c (90th percentile):", tau_c)
else:
    print("No similarity values above threshold were recorded.")

# --- STEP 3: Filter edges for those with weight greater than tau_c ---

if tau_c is not None:
    filtered_edges_file = "network/csl/edges.txt"
    with open(edges_file, "r", encoding="utf-8") as fin, \
         open(filtered_edges_file, "w", encoding="utf-8") as fout:
        for line in fin:
            parts = line.strip().split(";")
            if len(parts) != 3:
                continue
            try:
                weight = float(parts[2])
            except ValueError:
                continue
            # Copy the line through unchanged when it clears the threshold.
            if not weight <= tau_c:
                fout.write(line)

compute_network_metrics("network/authors.txt", "network/csl/edges.txt")

In [16]:
# BUILD ASL

# NOTE(review): pickle.load runs whatever the file contains; only load caches you created.
if os.path.isfile("cslasl-pre/edges/authors_sentiments.pkl"):
    with open("cslasl-pre/edges/authors_sentiments.pkl", 'rb') as f:
        authors_sentiments = pickle.load(f)
else:
    # Load the per-author texts here instead of relying on a `contents`
    # variable left in the kernel by an earlier cell, so this cell also works
    # on a fresh kernel.
    with open("cslasl-pre/contents.txt", "r", encoding="utf-8") as f:
        contents = f.readlines()
    if len(contents) != len(authors):
        raise ValueError(
            f"contents.txt has {len(contents)} lines but there are {len(authors)} "
            "authors; rebuild cslasl-pre/contents.txt before computing sentiments."
        )

    sentiment_device = "cuda:1"
    tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
    model = AutoModelForSequenceClassification.from_pretrained(
        "cardiffnlp/twitter-roberta-base-sentiment"
    ).to(sentiment_device)

    def get_sentiment_vector(text):
        """Return the classifier's logits for ``text`` as a NumPy array.

        Inputs longer than 512 tokens are truncated by the tokenizer.
        """
        inputs = tokenizer(
            text, return_tensors="pt", truncation=True, padding=True, max_length=512
        ).to(sentiment_device)
        with torch.no_grad():
            output = model(**inputs).logits  # Get logits
        return output.squeeze().cpu().numpy()

    # Line i of contents.txt holds the text of authors[i].
    authors_sentiments = {
        author_name: get_sentiment_vector(author_text)
        for author_name, author_text in zip(authors, contents)
    }

    with open("cslasl-pre/edges/authors_sentiments.pkl", 'wb') as f:
        pickle.dump(authors_sentiments, f)

if not os.path.isfile("cslasl-pre/edges/normalized_matrix.pkl"):
    # Stack the per-author vectors (dict order) into one 2-D array.
    sentiment_matrix = np.array(list(authors_sentiments.values()))
    norms = np.linalg.norm(sentiment_matrix, axis=1, keepdims=True)
    # Rows with a zero norm stay all-zero instead of being divided by zero.
    valid_norms = norms.ravel() > 0
    normalized_matrix = np.zeros_like(sentiment_matrix)
    normalized_matrix[valid_norms] = sentiment_matrix[valid_norms] / norms[valid_norms]

    with open("cslasl-pre/edges/normalized_matrix.pkl", 'wb') as f:
        pickle.dump(normalized_matrix, f)
else:
    with open("cslasl-pre/edges/normalized_matrix.pkl", 'rb') as f:
        normalized_matrix = pickle.load(f)

del authors_sentiments

threshold_sent = 0.7  # Only write edges with similarity > 0.7.
edges_file = "cslasl-pre/edges/edges_sentiments.txt"
os.makedirs(os.path.dirname(edges_file), exist_ok=True)

num_users = len(authors)

# --- STEP 1: Write edges to a text file (line by line) ---
sentiment_rows = np.asarray(normalized_matrix)
with open(edges_file, "w", encoding="utf-8") as fout:
    for i in tqdm(range(num_users), total=num_users, desc="Processing authors", unit="author"):
        # One matrix-vector product yields the similarity of author i to every
        # later author, replacing one Python-level np.dot call per pair.
        sims = sentiment_rows[i + 1:num_users] @ sentiment_rows[i]
        for offset in np.flatnonzero(sims > threshold_sent):
            fout.write(f"{authors[i]};{authors[i + 1 + offset]};{sims[offset]}\n")

# Free memory if needed.
del normalized_matrix, sentiment_rows

# --- STEP 2: Compute empirical tau_a using a TDigest ---
digest = TDigest()
with open(edges_file, "r", encoding="utf-8") as fin:
    for line in fin:
        parts = line.strip().split(";")
        if len(parts) != 3:
            continue
        try:
            digest.update(float(parts[2]))
        except ValueError:
            continue

tau_a_approx = digest.percentile(90)  # 90th percentile.
if tau_a_approx is None:
    print("No similarity values above threshold were recorded.")
    TAU_A = None
else:
    TAU_A = custom_round(tau_a_approx)
    print("Estimated tau_a (90th percentile):", TAU_A)

# --- STEP 3: Filter edges for those with weight greater than TAU_A ---
if TAU_A is not None:
    filtered_edges_file = "network/asl/edges.txt"
    os.makedirs(os.path.dirname(filtered_edges_file), exist_ok=True)
    with open(edges_file, "r", encoding="utf-8") as fin, \
         open(filtered_edges_file, "w", encoding="utf-8") as fout:
        for line in fin:
            parts = line.strip().split(";")
            if len(parts) != 3:
                continue
            try:
                weight = float(parts[2])
            except ValueError:
                continue
            # Copy the line through unchanged when it clears the threshold.
            if not weight <= TAU_A:
                fout.write(line)
    print(f"Filtered edges written to {filtered_edges_file}")

compute_network_metrics("network/authors.txt", "network/asl/edges.txt")

Processing authors: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 13184/13184 [04:08<00:00, 53.00author/s]


Estimated tau_a (90th percentile): 0.999
Filtered edges written to network/asl/edges.txt
Number of nodes: 13184
Number of edges: 3170370
Average edge weight: 0.9995
Density: 0.01824


In [17]:
import os
import itertools
from tqdm import tqdm

# Interlayer coupling weight and the layers to be coupled.
omega = 1.0
layers = ["UIL", "TDL", "CSL", "ASL"]

# Input (authors) and output (interlayer edges) files.
authors_file = "network/authors.txt"
interlayer_edges_file = "network/interlayer_edges.txt"

# Ensure the output directory exists.
os.makedirs(os.path.dirname(interlayer_edges_file), exist_ok=True)

# Read authors (one author per line, preserving order, skipping blank lines).
with open(authors_file, "r", encoding="utf-8") as f:
    authors = [name for name in (raw.strip() for raw in f) if name]

# Every unordered pair of distinct layers; the same pairs apply to every author.
layer_pairs = list(itertools.combinations(layers, 2))

edge_count = 0
with open(interlayer_edges_file, "w", encoding="utf-8") as fout:
    # Couple each author's node in one layer to the same author's node in every other layer.
    for author in tqdm(authors, desc="Processing authors", unit="author"):
        fout.writelines(
            f"{author}@{layer1};{author}@{layer2};{omega}\n"
            for layer1, layer2 in layer_pairs
        )
        edge_count += len(layer_pairs)

print(f"Total number of interlayer edges: {edge_count}")

Processing authors: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 13184/13184 [00:00<00:00, 123436.78author/s]

Total number of interlayer edges: 79104



