In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import multinetx as mnet
import re
from datetime import datetime, timedelta
import pickle
import random
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/shared/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [2]:
# --------------------------
# PARAMETERS & THRESHOLDS
# --------------------------
TAU_C = 0.39        # Threshold for content similarity (CSL)
TAU_A = 0.99      # Threshold for affective similarity (ASL)
OMEGA = 1.0        # Uniform interlayer coupling weight
DELTA_T = 3600     # Time window (in seconds) for temporal burst calculation (e.g., 1 hour)

In [2]:
def load_submissions(submissions_file):
    return pd.read_csv(submissions_file, compression="zstd")

def load_comments(comments_file):
    return pd.read_csv(comments_file, compression="zstd")
    
submissions = pd.read_csv('submissions.csv.zst')
submissions['selftext'] = submissions['selftext'].fillna("")
comments = pd.read_csv('comments.csv.zst')

In [3]:
all_authors = list(submissions['author'])
all_authors.extend(list(comments['author']))
all_authors = list(set(all_authors))
print(len(all_authors))

380704


In [4]:
### Build UIL

def get_edges():
    edges = {}

    for comment in comments.itertuples():
        author = comment.author
        parent = comment.parent_id.split("_")[1]
    
        reply_to = None
    
        try:
            reply_to = submissions.loc[submissions['id'] == parent]['author'].values[0]
        except:
            reply_to = comments.loc[comments['id'] == parent]['author'].values[0]
    
        if reply_to:
            edge = (author, reply_to)
            if edge in edges:
                edges[edge] += 1
            else:
                edges[edge] = 1    

    return edges

UIL = nx.DiGraph()
for author in all_authors:
    UIL.add_node(author)

for edge, weight in get_edges().items():
    UIL.add_edge(edge[0], edge[1], weight=weight)

print(f"UIL nodes: {len(UIL.nodes())}")
print(f"UIL edges: {len(UIL.edges())}")
print(f"UIL density: {nx.density(UIL)}")

def average_edge_weight(G):
    """Compute the average edge weight in a NetworkX graph."""
    edge_weights = [data["weight"] for _, _, data in G.edges(data=True)]  # Extract weights
    return sum(edge_weights) / len(edge_weights) if edge_weights else 0 

avg_weight = average_edge_weight(UIL)
print(f"Average UIL Edge Weight: {avg_weight:.4f}")

with open('uil.pkl', 'wb') as f:
    pickle.dump(UIL, f)

UIL nodes: 380704
UIL edges: 1409660
UIL density: 9.726142707446654e-06
Average UIL Edge Weight: 1.2266


In [7]:
### Build TDL

def compute_sliding_window_max(timestamps, window=DELTA_T):
    """
    Given a sorted list of timestamps (in seconds), compute the maximum number of events
    occurring within any window of length `window`.
    """
    if len(timestamps) == 0:
        return 0
    max_count = 0
    start = 0
    for end in range(len(timestamps)):
        while timestamps[end] - timestamps[start] > window:
            start += 1
        count = end - start + 1
        if count > max_count:
            max_count = count
    return max_count

def get_edges():
    edges = {}

    for comment in comments.itertuples():
        author = comment.author
        parent = comment.parent_id.split("_")[1]
    
        reply_to = None
    
        try:
            reply_to = submissions.loc[submissions['id'] == parent]['author'].values[0]
        except:
            reply_to = comments.loc[comments['id'] == parent]['author'].values[0]
    
        if reply_to:
            edge = (author, reply_to)
            timestamp = comment.created_utc
            
            if edge in edges:
                edges[edge].append(timestamp)
            else:
                edges[edge] = [timestamp]   

    return edges

TDL = nx.DiGraph()
for author in all_authors:
    TDL.add_node(author)

for edge, times in get_edges().items():
    times_sorted = sorted(times)
    burst_weight = compute_sliding_window_max(times_sorted, window=DELTA_T)
    TDL.add_edge(edge[0], edge[1], weight=burst_weight)

print(f"TDL nodes: {len(TDL.nodes())}")
print(f"TDL edges: {len(TDL.edges())}")
print(f"TDL density: {nx.density(TDL)}")

def average_edge_weight(G):
    """Compute the average edge weight in a NetworkX graph."""
    edge_weights = [data["weight"] for _, _, data in G.edges(data=True)]  # Extract weights
    return sum(edge_weights) / len(edge_weights) if edge_weights else 0 

avg_weight = average_edge_weight(TDL)
print(f"Average TDL Edge Weight: {avg_weight:.4f}")

with open('tdl.pkl', 'wb') as f:
    pickle.dump(TDL, f)

TDL nodes: 380704
TDL edges: 1409660
TDL density: 9.726142707446654e-06
Average TDL Edge Weight: 1.0880


In [2]:
# --------------------------
# CLEANING FUNCTION
# --------------------------
def clean(text):
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip().lower()

# --------------------------
# LOAD DATA
# --------------------------
def load_submissions(submissions_file):
    return pd.read_csv(submissions_file, compression="zstd")

def load_comments(comments_file):
    return pd.read_csv(comments_file, compression="zstd")

submissions = pd.read_csv("submissions.csv.zst")
comments = pd.read_csv("comments.csv.zst")


# CLS/ASL Preprocessing
# --------------------------
# AGGREGATE TEXTS PER AUTHOR
# --------------------------
def aggregate_texts(df, text_fields):
    """Concatenate selected fields into a single string per row."""
    return df[text_fields].fillna("").agg(" ".join, axis=1)

submissions["full_text"] = aggregate_texts(submissions, ["title", "selftext"])
comments["body"] = comments["body"].fillna("")
submissions["selftext"] = submissions["selftext"].fillna("")

contents_per_author = {}

# Process submissions
for submission in submissions.itertuples():
    author = submission.author
    full_text = clean(f"{submission.title} {submission.selftext}")
    contents_per_author.setdefault(author, []).append(full_text)

# Process comments
for comment in comments.itertuples():
    author = comment.author
    body = clean(comment.body)
    contents_per_author.setdefault(author, []).append(body)

# Merge all texts for each author
for author, contents in contents_per_author.items():
    contents_per_author[author] = " ".join(contents)

authors = list(contents_per_author.keys())
contents = list(contents_per_author.values())

os.makedirs('cslasl-pre', exist_ok=True)
with open('cslasl-pre/authors.pkl', 'wb') as f:
    pickle.dump(authors, f)
with open('cslasl-pre/contents.pkl', 'wb') as f:
    pickle.dump(contents, f)

In [3]:
# Get CSL edges and with tau_c at 0.3, then compute estimated tau_c

model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2", trust_remote_code=True, device="cuda:1")
embeddings = model.encode(contents, normalize=True)
with open('cslasl-pre/csl_embeddings.pkl', 'wb') as f:
    pickle.dump(embeddings, f)

In [5]:
from tqdm import tqdm

with open("cslasl-pre/csl_embeddings.pkl", "rb") as f:
    embeddings = pickle.load(f)

with open("cslasl-pre/authors.pkl", "rb") as f:
    authors = pickle.load(f)

# Compute cosine similarity in **batches** to prevent MemoryError
num_users = len(authors)
batch_size = 200  # Adjust based on memory constraints

# Initialize tqdm progress bar
total_batches = (num_users // batch_size) ** 2  # Total iterations (i, j) pairs
progress_bar = tqdm(total=total_batches, desc="Computing Similarities", unit="batch")

for i in range(0, num_users, batch_size):
    edges = {}
    end_i = min(i + batch_size, num_users)
    
    for j in range(i, num_users, batch_size):
        end_j = min(j + batch_size, num_users)
        
        # Compute batch-wise similarity
        similarity_batch = np.dot(embeddings[i:end_i], embeddings[j:end_j].T)
        
        # Extract upper triangle indices to avoid duplicate calculations
        if i == j:
            upper_tri_indices = np.triu_indices(end_i - i, k=1)
            similarity_values = similarity_batch[upper_tri_indices]
        else:
            similarity_values = similarity_batch.flatten()

        # Store only meaningful similarities
        for x, sim in enumerate(similarity_values):
            if sim > 0.3:  # Adjust threshold as needed
                author_1 = authors[i + x // (end_j - j)]
                author_2 = authors[j + x % (end_j - j)]
                edges[(author_1, author_2)] = sim

        # Update progress bar
        progress_bar.update(1)
                
    # Save edges to a file
    os.makedirs('cslasl-pre/edges', exist_ok=True)
    with open(f"cslasl-pre/edges/edges_{i}.pkl", "wb") as f:
        pickle.dump(edges, f)
    del edges

# Close progress bar
progress_bar.close()

Computing Similarities:  50%|█████████████▌             | 1813560/3621409 [9:16:23<9:14:38, 54.33batch/s]


In [None]:
import math
import glob
from tdigest import TDigest
from tqdm import tqdm

digest = TDigest()

def custom_round(num):
    """
    Ceil if the third digit after the decimal is >= 5, otherwise floor.
    Always returns a number with two decimal places.
    """
    # Convert to string and split on '.'
    num_str = f"{num:.10f}"  # Ensure precision to check third decimal
    integer_part, decimal_part = num_str.split('.')

    # Extract the relevant digits
    third_digit = int(decimal_part[2]) if len(decimal_part) > 2 else 0

    # Apply ceil or floor rounding
    if third_digit >= 5:
        return math.ceil(num * 100) / 100  # Round up
    else:
        return math.floor(num * 100) / 100


# Compute empirical threshold
edges_files = glob.glob(os.path.join("cslasl-pre/edges", "*.pkl"))
progress_bar = tqdm(total=len(edges_files), desc="Computing TAU_C")

for edges_file in edges_files:
    with open(edges_file, "rb") as f:
        current = pickle.load(f)
    # Update the digest with each similarity value
    for value in current.values():
        digest.update(value)
    del current
    progress_bar.update(1)

estimated_90th = digest.percentile(90)
TAU_C = custom_round(estimated_90th)  # 90th percentile
print("Estimated tau_c (90th percentile):", TAU_C)

progress_bar = tqdm(total=len(edges_files), desc="Generating all edges")
current_edges = {}
for i in range(0, len(edges_files)):
    with open(edges_files[i], "rb") as f:
        current = pickle.load(f)
        current_edges = {key: f_val for key, value in current.items() if (f_val := float(value)) > TAU_C}
    del current
    with open(f"cslasl-pre/edges/all_edges_{i}.pkl", "wb") as f:
        pickle.dump(current_edges, f)
    del current_edges
    progress_bar.update(1)

Computing TAU_C:  23%|████████▎                           | 442/1904 [20:52:44<43:05:34, 106.11s/it]

In [125]:
### Build CSL

def aggregate_texts(df, text_fields):
    # Concatenate selected fields into a single string per row.
    return df[text_fields].fillna("").agg(" ".join, axis=1)

submissions['full_text'] = aggregate_texts(submissions, ['title', 'selftext'])
comments['body'] = comments['body'].fillna("")

def get_edges():
    from sentence_transformers import SentenceTransformer
    import torch
    
    def clean(text):
        text = re.sub(r'[^a-zA-Z\s]', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip().lower()
    
    edges = {}
    
    contents_per_author = {}
    for submission in submissions.itertuples():
        author = submission.author
        title = submission.title
        selftext = submission.selftext
        full_text = f"{title} {selftext}"
        full_text = clean(full_text)
        
        if author in contents_per_author:
            contents_per_author[author].extend(full_text)
        else:
            contents_per_author[author] = [full_text]
    
    for comment in comments.itertuples():
        author = comment.author
        body = comment.body
        body = clean(body)
    
        if author in contents_per_author:
            contents_per_author[author].extend(body)
        else:
            contents_per_author[author] = [body]
    
    for author, contents in contents_per_author.items():
        contents_per_author[author] = " ".join(contents_per_author[author])
    
    authors = list(contents_per_author.keys())
    contents = list(contents_per_author.values())
    
    model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', trust_remote_code=True, device="cuda:1")
    embeddings = model.encode(contents, normalize=True)
    similarity_matrix = np.dot(embeddings, embeddings.T)  # Dot product of normalized vectors
    indices = np.triu_indices(len(authors), k=1)  # Get indices of the upper triangle (excluding diagonal)
    
    author_pairs = [(authors[i], authors[j]) for i, j in zip(indices[0], indices[1])]
    similarities = similarity_matrix[indices]  # Fetch corresponding similarity values

    # Store edges where similarity > TAU_C
    edges = {pair: sim for pair, sim in zip(author_pairs, similarities) if sim > TAU_C}
    
    return edges

CSL = nx.Graph()
for author in all_authors:
    CSL.add_node(author)

for edge, weight in get_edges().items():
    CSL.add_edge(edge[0], edge[1], weight=weight)

print(f"CSL nodes: {len(CSL.nodes())}")
print(f"CSL edges: {len(CSL.edges())}")
print(f"CSL density: {nx.density(CSL)}")

def average_edge_weight(G):
    """Compute the average edge weight in a NetworkX graph."""
    edge_weights = [data["weight"] for _, _, data in G.edges(data=True)]  # Extract weights
    return sum(edge_weights) / len(edge_weights) if edge_weights else 0 

avg_weight = average_edge_weight(CSL)
print(f"Average CSL Edge Weight: {avg_weight:.4f}")

with open('csl.pkl', 'wb') as f:
    pickle.dump(CSL, f)

CSL nodes: 16474
CSL edges: 12208150


In [117]:
### Build ASL

def aggregate_texts(df, text_fields):
    # Concatenate selected fields into a single string per row.
    return df[text_fields].fillna("").agg(" ".join, axis=1)

submissions['full_text'] = aggregate_texts(submissions, ['title', 'selftext'])
comments['full_text'] = comments['body'].fillna("")

def get_edges():
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    import torch
    
    def clean(text):
        text = re.sub(r'[^a-zA-Z\s]', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip().lower()
        
    edges = {}
    
    contents_per_author = {}
    for submission in submissions.itertuples():
        author = submission.author
        title = submission.title
        selftext = submission.selftext
        full_text = f"{title} {selftext}"
        full_text = clean(full_text)
        
        if author in contents_per_author:
            contents_per_author[author].extend(full_text)
        else:
            contents_per_author[author] = [full_text]
    
    for comment in comments.itertuples():
        author = comment.author
        body = comment.body
        body = clean(body)
    
        if author in contents_per_author:
            contents_per_author[author].extend(body)
        else:
            contents_per_author[author] = [body]
    
    for author, contents in contents_per_author.items():
        contents_per_author[author] = " ".join(contents_per_author[author])

    tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
    model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment").to("cuda:1")

    def get_sentiment_vector(text):
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to("cuda:1")  # Move input to GPU
        with torch.no_grad():
            output = model(**inputs).logits  # Get logits
        return output.squeeze().cpu().numpy()

    authors_sentiments = {}
    for author, contents in contents_per_author.items():
        authors_sentiments[author] = get_sentiment_vector(contents)
         
    sentiment_matrix = np.array(list(authors_sentiments.values()))
    norms = np.linalg.norm(sentiment_matrix, axis=1, keepdims=True)
    valid_norms = norms.flatten() > 0  # Mask for non-zero norms
    normalized_matrix = np.zeros_like(sentiment_matrix)
    normalized_matrix[valid_norms] = sentiment_matrix[valid_norms] / norms[valid_norms]
    similarity_matrix = np.dot(normalized_matrix, normalized_matrix.T)

    edges = {}
    indices = np.triu_indices(len(authors), k=1)  # Get upper triangle indices
    
    author_pairs = [(authors[i], authors[j]) for i, j in zip(indices[0], indices[1])]
    similarities = similarity_matrix[indices]  # Fetch similarity values
    
    # Store Edges Where Similarity > TAU_A
    edges = {pair: sim for pair, sim in zip(author_pairs, similarities) if sim > TAU_A}
    
    return edges

ASL = nx.Graph()
for author in all_authors:
    ASL.add_node(author)

for edge, weight in get_edges().items():
    ASL.add_edge(edge[0], edge[1], weight=weight)

print(f"ASL nodes: {len(ASL.nodes())}")
print(f"ASL edges: {len(ASL.edges())}")
print(f"ASL density: {nx.density(ASL)}")

def average_edge_weight(G):
    """Compute the average edge weight in a NetworkX graph."""
    edge_weights = [data["weight"] for _, _, data in G.edges(data=True)]  # Extract weights
    return sum(edge_weights) / len(edge_weights) if edge_weights else 0 

avg_weight = average_edge_weight(ASL)
print(f"Average ASL Edge Weight: {avg_weight:.4f}")

with open('asl.pkl', 'wb') as f:
    pickle.dump(ASL, f)

ASL nodes: 16474
ASL edges: 15248971


In [134]:
with open('uil.pkl', 'rb') as f:
    UIL = pickle.load(f)
with open('csl.pkl', 'rb') as f:
    CSL = pickle.load(f)
with open('tdl.pkl', 'rb') as f:
    TDL = pickle.load(f)
with open('asl.pkl', 'rb') as f:
    ASL = pickle.load(f)

layers = {
    "UIL": UIL,
    "CSL": CSL,
    "TDL": TDL,
    "ASL": ASL
}

In [136]:
multilayer = mnet.MultilayerGraph()

# Add nodes from UIL (the same for all layers).
for layer_name, G in layers.items():
    for node in G.nodes():
        multilayer.add_node((node, layer_name))

# Add intralayer edges.
for layer_name, G in layers.items():
    for u, v, data in G.edges(data=True):
        multilayer.add_edge((u, layer_name), (v, layer_name), weight=data.get('weight', 1))

# Add interlayer edges.
all_users = set()
for G in layers.values():
    all_users.update(G.nodes())
layer_names = list(layers.keys())
for user in all_users:
    for i in range(len(layer_names)):
        for j in range(i+1, len(layer_names)):
            multilayer.add_edge((user, layer_names[i]), (user, layer_names[j]), weight=OMEGA)

interlayer_edges = [e for e in multilayer.edges() if e[0][1] != e[1][1]]
num_interlayer_edges = len(interlayer_edges)

print("Multilayer network constructed with {} interlayer edges.".format(len(interlayer_edges)))

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [137]:
# --------------------------
# SAVE THE MULTILAYER NETWORK
# --------------------------
with open('network.pkl', 'wb') as f:
    pickle.dump(multilayer, f)
print("Multilayer network saved to 'multilayer_network.pkl'")

Multilayer network saved to 'multilayer_network.pkl'


In [138]:
print("Multilayer network constructed with {} interlayer edges.".format(len(interlayer_edges)))

Multilayer network constructed with 98844 interlayer edges.
