In [4]:
%load_ext autoreload
%autoreload 2

from datetime import datetime, timedelta
from dateutil import rrule

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import community  # For Louvain community detection
from sklearn.metrics.cluster import normalized_mutual_info_score

from tqdm.notebook import tqdm
from concurrent.futures import as_completed
from multiprocessing import Pool

## Retweet Network Backbone Extraction and Edge Classification

This notebook cell processes user-user **retweet co-engagement networks** and extracts their **informative backbones** based on two weight dimensions:

- **`nij_c`**: the number of times a pair of users retweeted the same tweet (retweet count).
- **`nij_t`**: the average temporal proximity (inverted) between users when they co-retweeted the same tweet (retweet time).

### Methodology

1. **Input**: Edge lists from preprocessed retweet networks located in the `networks/` folder (one file per target date).
2. **Backbone Extraction**:
   - **Count-based backbone** (`nij_c`) is extracted using the **Polya Urn Filter** as proposed in [Marcaccioli & Livan, 2019](https://www.nature.com/articles/s41467-019-08667-3).
   - **Time-based backbone** (`nij_t`) is computed after inverting and normalizing the temporal weights, then filtered with the same method.
   - ⚠️ **Note**: The actual backbone filtering must be performed externally using the official [MATLAB implementation](https://www.mathworks.com/matlabcentral/fileexchange/69501-pf).
3. **Classification**:
   - Edges are assigned a class (1–4) based on their statistical significance in each backbone:
     - `Class 4`: Significant in both dimensions.
     - `Class 2`: Significant only in count.
     - `Class 3`: Significant only in time.
     - `Class 1`: Not significant in either.

### Output

For each date, the final classified backbone is saved as `Backbone-{target_date}.csv` within the `networks/` folder.

In [None]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

# Experiment parameters
# NOTE(review): `alpha` is not referenced by any code in this cell; presumably it is the
# parameter handed to the external MATLAB Polya filter -- confirm its meaning there.
alpha = 0.5
p_value_threshold_t = 0.05   # p_valor_t at or below this marks an edge as time-significant
p_value_threshold_c = 0.1    # p_valor_c at or below this marks an edge as count-significant

# Dates whose networks are processed, stored as datetime.date objects
target_dates = [
    pd.to_datetime(day).date()
    for day in ('2022-11-01', '2023-01-08')
]


def classify_edges(row):
    """
    Map one edge to its class (1-4) from its two backbone p-values.

    Parameters:
    - row: Mapping-like object (e.g. a DataFrame row) exposing 'p_valor_c' and 'p_valor_t'.

    Returns:
    - int: 4 if both p-values are at or below their thresholds, 2 if only the count
      p-value is, 3 if only the time p-value is, 1 otherwise.

    The thresholds are read from the module-level `p_value_threshold_c` and
    `p_value_threshold_t` at call time.
    """
    count_significant = row['p_valor_c'] <= p_value_threshold_c
    time_significant = row['p_valor_t'] <= p_value_threshold_t

    if count_significant:
        return 4 if time_significant else 2
    return 3 if time_significant else 1

# Build and save the classified backbone for every target date.
# Reads  networks/{target_date}-edges-data.csv  (columns used: src, trg, weight_count, weight_time)
# Writes networks/Backbone-{target_date}.csv    (columns: src, trg, nij_c, nij_t, edge_class)
for target_date in target_dates:
    print(f"Processing date: {target_date}")

    # --------------------------------------------
    # 1. Load the edge list for the given date
    # --------------------------------------------
    edge_df = pd.read_csv(f"networks/{target_date}-edges-data.csv")

    # --------------------------------------------
    # 2. Retweet count backbone (nij_c)
    # --------------------------------------------

    # NOTE: Replace this block with MATLAB Polya Filter
    # The proper backbone extraction must be performed using:
    # https://www.mathworks.com/matlabcentral/fileexchange/69501-pf
    # from the paper: https://www.nature.com/articles/s41467-019-08667-3

    # Placeholder: only the raw count weights, no p-values yet.
    # `rename` returns a new frame, so edge_df is left untouched.
    df_count = (
        edge_df[['src', 'trg', 'weight_count']]
        .rename(columns={'weight_count': 'nij_c'})
    )

    # Here, insert: MATLAB Polya Filter execution and load result with columns: ['src', 'trg', 'nij_c', 'p_valor_c']

    # --------------------------------------------
    # 3. Temporal backbone (nij_t)
    # --------------------------------------------

    df_time = (
        edge_df[['src', 'trg', 'weight_time']]
        .rename(columns={'weight_time': 'nij_t'})
    )

    # Invert the time weights so that a smaller raw value yields a larger weight:
    #   nij_t = ceil(max + 1 - raw / 60)
    # (the division by 60 assumes weight_time is in seconds, i.e. converts to minutes)
    minutes = df_time['nij_t'] / 60
    df_time = df_time.assign(nij_t=np.ceil(minutes.max() + 1 - minutes))

    # Again, insert MATLAB backbone here and load result with ['src', 'trg', 'nij_t', 'p_valor_t']

    # --------------------------------------------
    # 4. Merge both backbone results and classify
    # --------------------------------------------

    # classify_edges reads 'p_valor_c' and 'p_valor_t'. Until the MATLAB output is loaded
    # above, those columns do not exist; fail here with an actionable message instead of
    # an opaque KeyError raised from inside DataFrame.apply.
    missing = [
        column
        for frame, column in ((df_count, 'p_valor_c'), (df_time, 'p_valor_t'))
        if column not in frame.columns
    ]
    if missing:
        raise KeyError(
            f"Missing p-value column(s) {missing} for {target_date}: run the MATLAB "
            "Polya Filter on the count and time weights and load its output into "
            "df_count / df_time before classifying edges."
        )

    # df_count must contain ['src', 'trg', 'nij_c', 'p_valor_c']
    # df_time must contain  ['src', 'trg', 'nij_t', 'p_valor_t']
    merged = pd.merge(df_count, df_time, on=['src', 'trg'])

    # Classify edges based on statistical significance in each backbone
    merged['edge_class'] = merged.apply(classify_edges, axis=1)

    # --------------------------------------------
    # 5. Save the final result
    # --------------------------------------------

    output_path = f"networks/Backbone-{target_date}.csv"
    merged[['src', 'trg', 'nij_c', 'nij_t', 'edge_class']].to_csv(output_path, index=False)
    print(f"Saved classified backbone to: {output_path}\n")

## CDFs


This section presents the **Empirical Cumulative Distribution Functions (ECDFs)** of two edge weight dimensions extracted from the retweet co-engagement networks:

- **`nij_c`** — Number of times two users co-retweeted the same original tweet (retweet count).
- **`nij_t`** — Average time proximity (in minutes) between co-retweets by two users (inverted and normalized).

The ECDFs are plotted **separately for each edge class**, as defined by the dual significance in the count-based and time-based backbones.



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Restore the plotting rc parameters to their defaults before drawing
sns.reset_defaults()

# Dates to plot, stored as datetime.date objects
target_dates = [
    pd.to_datetime(day).date()
    for day in ('2022-11-01', '2023-01-08')
]

# Figures are written under figs/; create the folder if it is missing
os.makedirs('figs', exist_ok=True)

def plot_ecdf_for_classes(df, variable, save_path, x_axis_label, log_x_axis=False):
    """
    Plot one ECDF curve per edge class for a single variable, save it and show it.

    Parameters:
    - df (pd.DataFrame): Data containing the 'edge_class' column and the variable to plot.
    - variable (str): Name of the column to plot ('nij_c' or 'nij_t').
    - save_path (str): File path to save the resulting plot.
    - x_axis_label (str): Label for the x-axis in the plot.
    - log_x_axis (bool): Whether to apply a logarithmic scale to the x-axis.

    Returns:
    - None. The figure is written to `save_path`, displayed, and then closed.
    """
    # Explicit figure/axes instead of the pyplot state machine, so every call below
    # targets this figure regardless of what else is open.
    fig, ax = plt.subplots(figsize=(4, 3))

    # One curve per class, in ascending class order so legend order is stable
    for edge_class in sorted(df['edge_class'].unique()):
        subset = df[df['edge_class'] == edge_class]
        sns.ecdfplot(data=subset, x=variable, label=f'Class {edge_class}', ax=ax)

    if log_x_axis:
        ax.set_xscale('log')

    ax.set_xlabel(x_axis_label)
    ax.set_ylabel('P(X ≤ x)')
    # Legend is anchored just outside the right edge of the axes
    ax.legend(title='Edge Class', bbox_to_anchor=(1.05, 1), loc='upper left')
    fig.tight_layout()
    fig.savefig(save_path)
    plt.show()

    # Release the figure: this function is called repeatedly in a loop, and figures
    # that are never closed stay registered with pyplot and accumulate in memory.
    plt.close(fig)


# -------------------------------------
# ECDF plots per date: temporal weight first, then retweet count
# -------------------------------------
# (column to plot, file-name suffix, x-axis label)
ecdf_panels = (
    ('nij_t', 'time', 'Avg. retweet time (minutes)'),
    ('nij_c', 'count', '# shared retweets'),
)

for target_date in target_dates:
    print(f"Generating ECDF plots for: {target_date}")

    # Classified backbone written by the extraction cell
    df = pd.read_csv(f"networks/Backbone-{target_date}.csv")

    # Rescale nij_t before plotting: truncate to int, add 1, divide by 60, round up.
    # NOTE(review): the extraction cell stores nij_t as ceil(max + 1 - seconds / 60).
    # The step below does not algebraically reverse that (the per-date max is not
    # available here), so the plotted values may not be average minutes as the axis
    # label states -- confirm the intended transform.
    shifted = df['nij_t'].astype(int) + 1
    df['nij_t'] = np.ceil(shifted / 60)

    for column, suffix, label in ecdf_panels:
        plot_ecdf_for_classes(
            df=df,
            variable=column,
            save_path=f'figs/{target_date}-{suffix}.pdf',
            x_axis_label=label,
            log_x_axis=True,
        )

## Edge Class Diversity per User

This section shows the **distribution of users** according to the **number of distinct edge classes** they are connected to in the backbone network.

Each bar represents the **fraction of users** who participate in 1, 2, 3, or 4 structurally distinct types of edges (as defined by count/time-based significance).


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os

# List of target experiment dates (kept as strings; only used to build file names)
target_dates = ['2022-11-01', '2023-01-08']

# Maximum Y-axis fraction to display
ylim_fraction = 0.8

# Ensure output folder exists
os.makedirs('figs', exist_ok=True)


def user_class_distribution(backbone_df):
    """
    Compute the fraction of users by number of distinct edge classes they touch.

    Parameters:
    - backbone_df (pd.DataFrame): Edge table with 'src', 'trg' and 'edge_class' columns.

    Returns:
    - pd.Series: Indexed by the number of distinct edge classes (ascending); values are
      the fraction of users with that many classes. A user is any value appearing in
      'src' or 'trg'.
    """
    # Long format: one row per (edge endpoint, edge class)
    edges_per_user = backbone_df.melt(id_vars='edge_class', value_vars=['src', 'trg'],
                                      var_name='role', value_name='user')

    # Number of distinct classes among each user's incident edges
    classes_per_user = edges_per_user.groupby('user')['edge_class'].nunique()

    return classes_per_user.value_counts(normalize=True).sort_index()


def class_count_ticks(user_distribution):
    """
    Return x tick positions 1..max observed class count (empty list for no data).

    Using the maximum index value rather than the number of bars keeps every bar
    labeled when the observed counts are not contiguous (e.g. only 1 and 3 occur).
    """
    if user_distribution.empty:
        return []
    return list(range(1, int(user_distribution.index.max()) + 1))


for date in target_dates:
    file_path = f"networks/Backbone-{date}.csv"

    # Only the load is guarded: a missing backbone file is reported and skipped,
    # while any error in the computation or plotting below still surfaces.
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        continue

    user_distribution = user_class_distribution(df)

    # Plot the distribution
    fig, ax = plt.subplots(figsize=(4, 3))
    ax.bar(user_distribution.index, user_distribution.values, color='skyblue')

    # Axis labels and styling
    ax.set_xlabel('# Distinct Edge Classes')
    ax.set_ylabel('Fraction of Users')
    ax.set_xticks(class_count_ticks(user_distribution))
    ax.set_ylim(0, ylim_fraction)
    fig.tight_layout()

    # Save the plot
    output_path = f"figs/{date}_User_EdgeClass_Distribution.pdf"
    fig.savefig(output_path)
    plt.show()

    # Close the figure so repeated iterations do not accumulate open figures
    plt.close(fig)
