In [14]:
event_folder = "data/2008_elections/"

In [8]:
import os
import sys
import math
import pickle
import numpy as np
from tqdm import tqdm
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer

MIN_SIM = 0.20      # soglia minima di similarità per scrivere un edge
BLOCK   = 2048      # block size per moltiplicazioni a blocchi
BATCH_EMB = 64      # batch per encode() degli embeddings
DEVICE_STR = None   # es: "cuda:0" oppure None per auto

# --- paths (usano event_folder) ---
AUTHORS_FILE = os.path.join(event_folder, "network", "authors.txt")
CONTENTS_FILE = os.path.join(event_folder, "cslasl-pre", "contents.txt")
EMB_CACHE = os.path.join(event_folder, "cslasl-pre", "csl_embeddings.pkl")
EDGES_DIR = os.path.join(event_folder, "cslasl-pre", "edges")
EDGES_FILE = os.path.join(EDGES_DIR, "edges.txt")

# --- funzioni utili ---
def load_authors(path):
    with open(path, "r", encoding="utf-8") as f:
        authors = [line.strip() for line in f if line.strip()]
    return authors

def load_contents(path):
    with open(path, "r", encoding="utf-8") as f:
        contents = [line.rstrip("\n") for line in f]
    return contents

def ensure_dirs():
    os.makedirs(os.path.dirname(EMB_CACHE), exist_ok=True)
    os.makedirs(EDGES_DIR, exist_ok=True)

def get_device(device_arg):
    if device_arg:
        return torch.device(device_arg)
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")

def load_or_build_embeddings(contents, device, batch_size=64):
    if os.path.isfile(EMB_CACHE):
        with open(EMB_CACHE, "rb") as f:
            emb = pickle.load(f)
        emb = emb.astype(np.float32, copy=False)
        norms = np.linalg.norm(emb, axis=1, keepdims=True)
        norms[norms == 0.0] = 1.0
        emb = emb / norms
        return emb

    print("Loading model: sentence-transformers/all-mpnet-base-v2 ...")
    model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2", device=str(device))
    emb = model.encode(
        contents,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True
    ).astype(np.float32)

    with open(EMB_CACHE, "wb") as f:
        pickle.dump(emb, f)
    return emb

def write_edges_blockwise(authors, emb, min_sim, block, device, flush_every=1_000_000):
    """
    Calcola similarità a blocchi (upper-triangular) e scrive "u;v;sim" (undirected, i<j).
    """
    n, d = emb.shape
    print(f"Embeddings: n={n}, d={d}")
    if n != len(authors):
        raise ValueError(f"Authors ({len(authors)}) e embeddings ({n}) non coincidono.")

    use_gpu = (device.type == "cuda")
    print(f"Device: {device} (GPU={use_gpu})")
    print(f"Block size: {block}, min_sim: {min_sim}")

    with open(EDGES_FILE, "w", encoding="utf-8") as fout:
        total_written = 0
        emb_torch = torch.from_numpy(emb)
        if use_gpu:
            emb_torch = emb_torch.to(device, non_blocking=True)

        n_blocks = math.ceil(n / block)
        pbar = tqdm(total=n_blocks * (n_blocks + 1) // 2, desc="Blocks", unit="blk")
        for bi in range(n_blocks):
            i0 = bi * block
            i1 = min((bi + 1) * block, n)
            Ei = emb_torch[i0:i1]

            for bj in range(bi, n_blocks):
                j0 = bj * block
                j1 = min((bj + 1) * block, n)
                Ej = emb_torch[j0:j1]

                S = Ei @ Ej.T  # (i_len, j_len)

                if bi == bj:
                    tri_mask = torch.triu(torch.ones((i1 - i0, j1 - j0), dtype=torch.bool, device=S.device), diagonal=1)
                    S = torch.where(tri_mask, S, torch.full_like(S, float("-inf")))

                keep = S > float(min_sim)
                if not torch.any(keep):
                    pbar.update(1)
                    continue

                idx_i, idx_j = torch.nonzero(keep, as_tuple=True)
                sims = S[idx_i, idx_j].detach().float().cpu().numpy()
                gi = (i0 + idx_i.cpu().numpy()).tolist()
                gj = (j0 + idx_j.cpu().numpy()).tolist()

                buf = []
                for k in range(len(sims)):
                    u = authors[gi[k]]
                    v = authors[gj[k]]
                    w = float(sims[k])
                    buf.append(f"{u};{v};{w:.6f}\n")

                fout.writelines(buf)
                total_written += len(buf)

                if total_written >= flush_every:
                    fout.flush()
                    os.fsync(fout.fileno())
                    total_written = 0

                del keep, idx_i, idx_j, sims, buf, S
                if use_gpu:
                    torch.cuda.synchronize()

                pbar.update(1)

            if use_gpu:
                torch.cuda.synchronize()

        pbar.close()

def print_edge_percentiles():
    import array
    vals = array.array("f")
    if not os.path.isfile(EDGES_FILE):
        print(f"Missing {EDGES_FILE}")
        return
    with open(EDGES_FILE, "r", encoding="utf-8") as fin:
        for line in fin:
            try:
                w = float(line.rsplit(";", 1)[1])
                vals.append(w)
            except Exception:
                continue
    if len(vals) == 0:
        print("No edges written; try lowering MIN_SIM.")
        return
    arr = np.frombuffer(vals, dtype=np.float32)
    for p in [50, 60, 70, 75, 80, 85, 90, 95]:
        print(f"Similarity p{p:02d}: {np.percentile(arr, p):.4f}")
    print(f"Edges total: {len(arr):,}")

# --- run ---
ensure_dirs()

if not os.path.isfile(AUTHORS_FILE):
    raise SystemExit(f"Missing {AUTHORS_FILE}")
if not os.path.isfile(CONTENTS_FILE):
    raise SystemExit(f"Missing {CONTENTS_FILE} (one line per author, same order).")

authors = load_authors(AUTHORS_FILE)
contents = load_contents(CONTENTS_FILE)

if len(authors) != len(contents):
    raise SystemExit(f"authors ({len(authors)}) != contents lines ({len(contents)}) → allinea i file.")

device = get_device(DEVICE_STR)
emb = load_or_build_embeddings(contents, device, batch_size=BATCH_EMB)
del contents  # libera RAM

write_edges_blockwise(authors, emb, min_sim=MIN_SIM, block=BLOCK, device=device)

del emb
if device.type == "cuda":
    torch.cuda.empty_cache()

print_edge_percentiles()
print(f"Done. Edges file -> {EDGES_FILE}")

Loading model: sentence-transformers/all-mpnet-base-v2 ...


README.md: 0.00B [00:00, ?B/s]

Batches:   0%|          | 0/206 [00:00<?, ?it/s]

Embeddings: n=13184, d=768
Device: cuda (GPU=True)
Block size: 2048, min_sim: 0.2


Blocks: 100%|██████████████████████████████████████████████████████████████████████████| 28/28 [00:41<00:00,  1.47s/blk]


Similarity p50: 0.3023
Similarity p60: 0.3288
Similarity p70: 0.3600
Similarity p75: 0.3784
Similarity p80: 0.3998
Similarity p85: 0.4256
Similarity p90: 0.4592
Similarity p95: 0.5104
Edges total: 39,773,721
Done. Edges file → data/2008_elections/cslasl-pre/edges/edges.txt


In [12]:
import os
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

AUTHORS_FILE = os.path.join(event_folder, "network", "authors.txt")
EDGES_FILE   = os.path.join(event_folder, "cslasl-pre", "edges", "edges.txt")  # "u;v;sim"
OUT_CSV      = os.path.join("data", "thresholds_analysis", "results_threshold_csl_density_gcc.csv")
OUT_FIG      = os.path.join("data", "thresholds_analysis", "fig_threshold_csl_density_gcc.png")

PERCENTILES  = [50, 60, 70, 75, 80, 85, 90, 95]
BOOTSTRAP    = False   # True per ribbon 95% CI
B_REPS       = 200     # numero bootstrap
ALPHA        = 0.05    # 95% CI

# --- helpers ---
def custom_round(x: float) -> float:
    r = round(float(x), 3)
    s = f"{r:.3f}"
    s = s[:-1] + "9"
    return float(s)

def load_authors(path):
    with open(path, "r", encoding="utf-8") as f:
        return [ln.strip() for ln in f if ln.strip()]

def load_edges_df(path):
    # u;v;w  (trattiamo come undirected e deduplichiamo coppie simmetriche)
    df = pd.read_csv(path, sep=";", header=None, names=["u","v","w"],
                     dtype={"u":"string","v":"string","w":"float32"}, engine="c")
    # canonicalizza (min, max) per togliere eventuali duplicati
    uu = np.where(df["u"].values < df["v"].values, df["u"].values, df["v"].values)
    vv = np.where(df["u"].values < df["v"].values, df["v"].values, df["u"].values)
    df = pd.DataFrame({"u": uu, "v": vv, "w": df["w"].values})
    df = df.drop_duplicates(subset=["u","v"], keep="first").reset_index(drop=True)
    return df

def build_graph_undirected(authors, df_edges):
    G = nx.Graph()
    G.add_nodes_from(authors)
    if not df_edges.empty:
        G.add_edges_from(zip(df_edges["u"].tolist(), df_edges["v"].tolist()))
    return G

def graph_metrics(G, n_nodes):
    dens = nx.density(G)
    if G.number_of_edges() == 0 or G.number_of_nodes() == 0:
        gcc_nodes = 1 if n_nodes > 0 else 0
    else:
        gcc_nodes = max((len(c) for c in nx.connected_components(G)), default=1)
    gcc_pct = 100.0 * (gcc_nodes / n_nodes) if n_nodes > 0 else 0.0
    return dens, gcc_pct, G.number_of_edges()

def edge_bootstrap_ci(authors, df_edges, n_nodes, B=200, alpha=0.05, rng=None):
    if df_edges.empty:
        return (np.nan, np.nan, np.nan, np.nan)
    if rng is None:
        rng = np.random.default_rng(42)
    M = len(df_edges)
    dens, gccs = [], []
    idx = np.arange(M)
    for _ in range(B):
        sample_idx = rng.choice(idx, size=M, replace=True)
        G_b = build_graph_undirected(authors, df_edges.iloc[sample_idx])
        d_b, g_b, _ = graph_metrics(G_b, n_nodes)
        dens.append(d_b); gccs.append(g_b)
    lo = 100*alpha/2; hi = 100*(1-alpha/2)
    den_lo, den_hi = np.percentile(dens, [lo, hi])
    gcc_lo, gcc_hi = np.percentile(gccs, [lo, hi])
    return (den_lo, den_hi, gcc_lo, gcc_hi)

# --- load data ---
authors = load_authors(AUTHORS_FILE)
n_all   = len(authors)
df_all  = load_edges_df(EDGES_FILE)
sims    = df_all["w"].astype("float64").values

# --- compute tau_c per percentile (con custom_round) ---
taus = {p: custom_round(np.percentile(sims, p)) for p in PERCENTILES}
print("tau_c by percentile:", {k: f"{v:.3f}" for k,v in taus.items()})

# --- sweep & metrics ---
rows = []
rng = np.random.default_rng(123)
for p in PERCENTILES:
    tau = taus[p]
    df_tau = df_all[df_all["w"] > tau]  # STRICT > come nel tuo pipeline

    G = build_graph_undirected(authors, df_tau)
    dens, gcc_pct, m_edges = graph_metrics(G, n_all)

    if BOOTSTRAP:
        den_lo, den_hi, gcc_lo, gcc_hi = edge_bootstrap_ci(authors, df_tau, n_all,
                                                            B=B_REPS, alpha=ALPHA, rng=rng)
    else:
        den_lo = den_hi = gcc_lo = gcc_hi = np.nan

    rows.append({
        "percentile": p,
        "tau_c": tau,
        "edges": m_edges,
        "density": dens,
        "density_lo": den_lo,
        "density_hi": den_hi,
        "gcc_pct": gcc_pct,
        "gcc_pct_lo": gcc_lo,
        "gcc_pct_hi": gcc_hi
    })

res = pd.DataFrame(rows).sort_values("percentile")
res.to_csv(OUT_CSV, index=False)
print(f"Saved metrics → {OUT_CSV}")

# --- plot ---
x = res["percentile"].values
y_den = res["density"].values
y_gcc = res["gcc_pct"].values

# --- cosmetic + baseline line + (optional) bootstrap ribbons already supported ---
BASELINE = 90  # percentile baseline

fig, ax1 = plt.subplots(figsize=(7.0, 3.6))

# left axis: density
(line1,) = ax1.plot(x, y_den, marker="o", linestyle="-", label="CSL density", linewidth=2, markersize=6)
ax1.set_xlabel(r"Content similarity threshold percentile ($\tau_c$)")
ax1.set_ylabel("CSL density")
ax1.grid(True, which="both", axis="both", alpha=0.3)

# ribbons se disponibili
if BOOTSTRAP and not res["density_lo"].isna().all():
    ax1.fill_between(x, res["density_lo"].values, res["density_hi"].values, alpha=0.2)

# baseline marker
ax1.axvline(BASELINE, linestyle="--", linewidth=1)
ax1.text(BASELINE+0.5, ax1.get_ylim()[1]*0.95, "baseline", rotation=90, va="top")

# right axis: GCC%
ax2 = ax1.twinx()
(line2,) = ax2.plot(x, y_gcc, marker="s", linestyle="--", label="GCC size (%)", linewidth=2, markersize=6)
ax2.set_ylabel("GCC size (% of nodes)")

if BOOTSTRAP and not res["gcc_pct_lo"].isna().all():
    ax2.fill_between(x, res["gcc_pct_lo"].values, res["gcc_pct_hi"].values, alpha=0.15)

# legenda combinata
lines = [line1, line2]
labels = [l.get_label() for l in lines]
ax1.legend(lines, labels, loc="best", frameon=False)

fig.tight_layout()
fig.savefig(OUT_FIG, bbox_inches="tight")
plt.close(fig)
print(f"Saved figure → {OUT_FIG}")

tau_c by percentile: {50: '0.309', 60: '0.329', 70: '0.369', 75: '0.379', 80: '0.409', 85: '0.429', 90: '0.459', 95: '0.519'}
Saved metrics → data/thresholds_analysis/results_threshold_csl_density_gcc.csv
Saved figure → data/thresholds_analysis/fig_threshold_csl_density_gcc.png


In [20]:
import os, math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
from matplotlib.lines import Line2D
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

# -------------------------------------------------
# PATHS / OUTPUT
# -------------------------------------------------
AUTHORS_FILE   = os.path.join(event_folder, "network", "authors.txt")
LABELS_FILE    = os.path.join(event_folder, "CRC_radicalization_analysis.csv")  # columns: author, radical
CSL_ALL_EDGES  = os.path.join(event_folder, "cslasl-pre", "edges", "edges.txt") # u;v;sim
UIL_FILE       = os.path.join(event_folder, "network", "uil", "edges.txt")      # u;v;w
TDL_FILE       = os.path.join(event_folder, "network", "tdl", "edges.txt")      # u;v;w
ASL_FILE       = os.path.join(event_folder, "network", "asl", "edges.txt")      # u;v;w

OUT_DIR = os.path.join("data", "thresholds_analysis")
os.makedirs(OUT_DIR, exist_ok=True)
OUT_CSV = os.path.join(OUT_DIR, "results_threshold_crc_coef_auc.csv")
OUT_FIG = os.path.join(OUT_DIR, "fig_threshold_crc_coef_auc.png")

PERCENTILES = [50, 60, 70, 75, 80, 85, 90, 95]
BASELINE = 90

# CRC hyperparams (coerenti con la tua funzione)
omega = 1.0
beta  = {"uil": 1.0, "csl": 1.0, "tdl": 1.0, "asl": 1.0}

# -------------------------------------------------
# HELPERS
# -------------------------------------------------
def custom_round(x: float) -> float:
    r = round(float(x), 3)
    s = f"{r:.3f}"
    return float(s[:-1] + "9")

def load_authors(path):
    with open(path, "r", encoding="utf-8") as f:
        return [ln.strip() for ln in f if ln.strip()]

def load_edges_semicolon(path):
    if not os.path.isfile(path):
        return pd.DataFrame(columns=["u","v","w"])
    return pd.read_csv(path, sep=";", header=None, names=["u","v","w"],
                       dtype={"u":"string","v":"string","w":"float64"}, engine="c")

def strength_from_stream_file(path, authors_set):
    strengths = {a: 0.0 for a in authors_set}
    if not os.path.isfile(path):
        return strengths
    chunksize = 1_000_00  # 100k righe per chunk (snappy e memory-friendly)
    for chunk in pd.read_csv(path, sep=";", header=None, names=["u","v","w"],
                             dtype={"u":"string","v":"string","w":"float64"},
                             engine="c", chunksize=chunksize):
        chunk = chunk.dropna(subset=["u","v","w"])
        if chunk.empty:
            continue
        tw = np.log1p(chunk["w"].values)
        s_u = pd.Series(tw, index=chunk["u"].values).groupby(level=0).sum()
        s_v = pd.Series(tw, index=chunk["v"].values).groupby(level=0).sum()
        s = s_u.add(s_v, fill_value=0.0)
        for node, val in s.items():
            if node in strengths:
                strengths[node] += float(val)
            else:
                strengths[node] = float(val)
    return strengths

def strength_from_csl_df(df_csl, authors_set):
    strengths = {a: 0.0 for a in authors_set}
    if df_csl.empty:
        return strengths
    tw = np.log1p(df_csl["w"].values)
    s_u = pd.Series(tw, index=df_csl["u"].values).groupby(level=0).sum()
    s_v = pd.Series(tw, index=df_csl["v"].values).groupby(level=0).sum()
    s = s_u.add(s_v, fill_value=0.0)
    for node, val in s.items():
        if node in strengths:
            strengths[node] += float(val)
        else:
            strengths[node] = float(val)
    return strengths

def compute_crc_from_strengths(str_by_layer, authors, omega=1.0, beta=None):
    if beta is None:
        beta = {l: 1.0 for l in str_by_layer.keys()}
    n = len(authors)
    CRC = {}
    total_phi = {a: sum(str_by_layer[l].get(a, 0.0) for l in str_by_layer.keys()) for a in authors}
    for a in authors:
        prod = 1.0
        for l in str_by_layer.keys():
            phi_val = str_by_layer[l].get(a, 0.0)
            eff = (phi_val + omega * (total_phi[a] - phi_val)) / max(n - 1, 1)
            prod *= (1.0 + beta.get(l, 1.0) * eff)
        CRC[a] = prod
    return CRC

def fit_logit_and_ci(x, y):
    alpha = np.nan; lo = np.nan; hi = np.nan; auc = np.nan
    try:
        import statsmodels.api as sm
        X = sm.add_constant(x)
        res = sm.Logit(y, X).fit(disp=0)
        alpha = float(res.params[1])
        ci = res.conf_int(alpha=0.05)
        lo, hi = float(ci.iloc[1,0]), float(ci.iloc[1,1])
        p = res.predict(X)
        auc = roc_auc_score(y, p) if len(np.unique(y)) == 2 else np.nan
        return alpha, lo, hi, auc
    except Exception:
        rng = np.random.default_rng(42)
        n = len(y)
        coefs = []
        for _ in range(500):
            idx = rng.integers(0, n, size=n)
            xb = x[idx].reshape(-1,1)
            yb = y[idx]
            if len(np.unique(yb)) < 2:
                continue
            lr = LogisticRegression(solver="liblinear")
            lr.fit(xb, yb)
            coefs.append(lr.coef_.ravel()[0])
        if len(coefs) > 0:
            alpha = float(np.median(coefs))
            lo, hi = np.percentile(coefs, [2.5, 97.5])
        lr = LogisticRegression(solver="liblinear")
        lr.fit(x.reshape(-1,1), y)
        p = lr.predict_proba(x.reshape(-1,1))[:,1]
        auc = roc_auc_score(y, p)
        return alpha, lo, hi, auc

# -------------------------------------------------
# LOAD
# -------------------------------------------------
authors = load_authors(AUTHORS_FILE)
labels_df = pd.read_csv(LABELS_FILE, usecols=["author","radical"])
labels_df["author"] = labels_df["author"].astype(str)
labels_map = dict(zip(labels_df["author"], labels_df["radical"].astype(int)))

# allinea autori con etichette disponibili
y = np.array([labels_map.get(a, np.nan) for a in authors])
mask = ~np.isnan(y)
authors_lab = [a for a, m in zip(authors, mask) if m]
y = y[mask].astype(int)
if len(np.unique(y)) < 2:
    raise SystemExit("Labels must contain both classes 0/1.")

authors_set = set(authors_lab)

# layer fissi (UIL/TDL/ASL): strength streaming
str_uil = strength_from_stream_file(UIL_FILE, authors_set)
str_tdl = strength_from_stream_file(TDL_FILE, authors_set)
str_asl = strength_from_stream_file(ASL_FILE, authors_set)

# CSL candidati (filtra su autori con label)
df_csl_all = load_edges_semicolon(CSL_ALL_EDGES)
if not df_csl_all.empty:
    df_csl_all = df_csl_all[df_csl_all["u"].isin(authors_set) & df_csl_all["v"].isin(authors_set)].reset_index(drop=True)

sims = df_csl_all["w"].astype("float64").values if not df_csl_all.empty else np.array([0.0])
taus = {p: custom_round(np.percentile(sims, p)) for p in PERCENTILES}
print("tau_c by percentile:", {k: f"{v:.3f}" for k, v in taus.items()})

# -------------------------------------------------
# SWEEP
# -------------------------------------------------
rows = []
for p in PERCENTILES:
    tau = taus[p]
    # CSL strict >
    if df_csl_all.empty:
        df_csl_tau = pd.DataFrame(columns=["u","v","w"])
    else:
        df_csl_tau = df_csl_all[df_csl_all["w"] > tau].reset_index(drop=True)

    # strengths per layer
    str_csl = strength_from_csl_df(df_csl_tau, authors_set)
    str_by_layer = {"uil": str_uil, "csl": str_csl, "tdl": str_tdl, "asl": str_asl}

    # CRC per autore
    CRC = compute_crc_from_strengths(str_by_layer, authors_lab, omega=omega, beta=beta)
    x = np.array([CRC[a] for a in authors_lab], dtype=float)

    # z-score
    mu, sd = x.mean(), x.std()
    if sd == 0: sd = 1.0
    xz = (x - mu) / sd

    alpha, lo, hi, auc = fit_logit_and_ci(xz, y)
    rows.append({
        "percentile": p,
        "tau_c": tau,
        "alpha": alpha,
        "alpha_lo": lo,
        "alpha_hi": hi,
        "auc": auc,
        "edges_csl": len(df_csl_tau)
    })

res = pd.DataFrame(rows).sort_values("percentile")
res.to_csv(OUT_CSV, index=False)
print(f"Saved metrics → {OUT_CSV}")

# -------------------------------------------------
# PLOT (legend clean)
# -------------------------------------------------
xp  = res["percentile"].values
yal = res["alpha"].values
ylo = res["alpha_lo"].values
yhi = res["alpha_hi"].values
yauc= res["auc"].values

# safe yerr (evita NaN)
yerr_low  = np.nan_to_num(yal - ylo, nan=0.0, posinf=0.0, neginf=0.0)
yerr_high = np.nan_to_num(yhi - yal, nan=0.0, posinf=0.0, neginf=0.0)
yerr = np.vstack([yerr_low, yerr_high])

fig, ax1 = plt.subplots(figsize=(7.0, 3.6))

# α + CI (senza label per evitare _nolegend_)
err_container = ax1.errorbar(xp, yal, yerr=yerr, fmt="o", capsize=3, elinewidth=1.2)

ax1.set_xlabel(r"Content similarity threshold percentile ($\tau_c$)")
ax1.set_ylabel(r"Logistic coefficient $\alpha$ (CRC)")
ax1.grid(True, which="both", axis="both", alpha=0.3)

# baseline (no label)
baseline_line = ax1.axvline(BASELINE, linestyle="--", linewidth=1, zorder=0)

# asse destro per AUC
ax2 = ax1.twinx()
(line_auc,) = ax2.plot(xp, yauc, marker="s", linestyle="-")
ax2.set_ylabel("AUC")

# stringi asse destro e formatta
if np.isfinite(yauc).any():
    m, M = float(np.nanmin(yauc)), float(np.nanmax(yauc))
    pad = max(0.0025, 0.15 * (M - m if M > m else 0.005))
    ax2.set_ylim(m - pad, M + pad)
ax2.yaxis.set_major_formatter(FormatStrFormatter('%.3f'))

# LEGEND pulita con proxy (colori presi dagli handle reali)
alpha_color = err_container.lines[0].get_color()
auc_color   = line_auc.get_color()
try:
    base_color = baseline_line.get_edgecolor()
except Exception:
    base_color = "0.35"

alpha_proxy    = Line2D([], [], marker="o", linestyle="none", color=alpha_color, label=r"$\alpha$ (95% CI)")
auc_proxy      = Line2D([], [], marker="s", linestyle="-",  color=auc_color,   label="AUC")
baseline_proxy = Line2D([], [], linestyle="--", color=base_color,              label="baseline")

ax1.legend([alpha_proxy, auc_proxy, baseline_proxy],
           [alpha_proxy.get_label(), "AUC", "baseline"],
           loc="best", frameon=False)

fig.tight_layout()
fig.savefig(OUT_FIG, bbox_inches="tight", dpi=150)
plt.close(fig)
print(f"Saved figure -> {OUT_FIG}")

tau_c by percentile: {50: '0.309', 60: '0.329', 70: '0.369', 75: '0.379', 80: '0.409', 85: '0.429', 90: '0.459', 95: '0.519'}
Saved metrics → data/thresholds_analysis/results_threshold_crc_coef_auc.csv
Saved figure -> data/thresholds_analysis/fig_threshold_crc_coef_auc.png


In [23]:
import os, math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import ticker

# ----------------- PATHS -----------------
AUTHORS_FILE   = os.path.join(event_folder, "network", "authors.txt")
LABELS_FILE    = os.path.join(event_folder, "CRC_radicalization_analysis.csv")  # columns: author, radical
CSL_ALL_EDGES  = os.path.join(event_folder, "cslasl-pre", "edges", "edges.txt") # u;v;sim
UIL_FILE       = os.path.join(event_folder, "network", "uil", "edges.txt")      # u;v;w
TDL_FILE       = os.path.join(event_folder, "network", "tdl", "edges.txt")      # u;v;w
ASL_FILE       = os.path.join(event_folder, "network", "asl", "edges.txt")      # u;v;w

OUT_DIR = os.path.join("data", "thresholds_analysis")
os.makedirs(OUT_DIR, exist_ok=True)
OUT_CSV = os.path.join(OUT_DIR, "results_threshold_crc_jaccard.csv")
OUT_FIG = os.path.join(OUT_DIR, "fig_threshold_jaccard_heatmap.png")

# ----------------- CONFIG -----------------
PERCENTILES = [50, 60, 70, 75, 80, 85, 90, 95]
BASELINE = 90
K_FRACS = [0.005, 0.01, 0.05]     # 0.5%, 1%, 5%
K_LABELS = ["0.5%", "1%", "5%"]

omega = 1.0
beta  = {"uil": 1.0, "csl": 1.0, "tdl": 1.0, "asl": 1.0}

# ----------------- HELPERS -----------------
def custom_round(x: float) -> float:
    r = round(float(x), 3)
    s = f"{r:.3f}"
    return float(s[:-1] + "9")

def load_authors(path):
    with open(path, "r", encoding="utf-8") as f:
        return [ln.strip() for ln in f if ln.strip()]

def load_edges_semicolon(path):
    if not os.path.isfile(path):
        return pd.DataFrame(columns=["u","v","w"])
    return pd.read_csv(path, sep=";", header=None, names=["u","v","w"],
                       dtype={"u":"string","v":"string","w":"float64"}, engine="c")

def strength_from_stream_file(path, authors_set):
    strengths = {a: 0.0 for a in authors_set}
    if not os.path.isfile(path):
        return strengths
    chunksize = 100_000
    for chunk in pd.read_csv(path, sep=";", header=None, names=["u","v","w"],
                             dtype={"u":"string","v":"string","w":"float64"},
                             engine="c", chunksize=chunksize):
        chunk = chunk.dropna(subset=["u","v","w"])
        if chunk.empty: 
            continue
        tw = np.log1p(chunk["w"].values)
        s_u = pd.Series(tw, index=chunk["u"].values).groupby(level=0).sum()
        s_v = pd.Series(tw, index=chunk["v"].values).groupby(level=0).sum()
        s = s_u.add(s_v, fill_value=0.0)
        for node, val in s.items():
            if node in strengths:
                strengths[node] += float(val)
            else:
                strengths[node] = float(val)
    return strengths

def strength_from_csl_df(df_csl, authors_set):
    strengths = {a: 0.0 for a in authors_set}
    if df_csl.empty:
        return strengths
    tw = np.log1p(df_csl["w"].values)
    s_u = pd.Series(tw, index=df_csl["u"].values).groupby(level=0).sum()
    s_v = pd.Series(tw, index=df_csl["v"].values).groupby(level=0).sum()
    s = s_u.add(s_v, fill_value=0.0)
    for node, val in s.items():
        if node in strengths:
            strengths[node] += float(val)
        else:
            strengths[node] = float(val)
    return strengths

def compute_crc_from_strengths(str_by_layer, authors, omega=1.0, beta=None):
    if beta is None:
        beta = {l: 1.0 for l in str_by_layer.keys()}
    n = len(authors)
    CRC = {}
    total_phi = {a: sum(str_by_layer[l].get(a, 0.0) for l in str_by_layer.keys()) for a in authors}
    for a in authors:
        prod = 1.0
        for l in str_by_layer.keys():
            phi_val = str_by_layer[l].get(a, 0.0)
            eff = (phi_val + omega * (total_phi[a] - phi_val)) / max(n - 1, 1)
            prod *= (1.0 + beta.get(l, 1.0) * eff)
        CRC[a] = prod
    return CRC

def topk_set(crc_dict, k):
    """Ritorna l'insieme dei top-k (k intero) ordinando per score decrescente, tie-break per autore."""
    items = sorted(crc_dict.items(), key=lambda kv: (-kv[1], kv[0]))
    k = max(1, min(k, len(items)))
    return set([a for a, _ in items[:k]])

def jaccard(a, b):
    if not a and not b:
        return 1.0
    inter = len(a & b)
    union = len(a | b)
    return inter / union if union > 0 else 0.0

# ----------------- LOAD -----------------
authors = load_authors(AUTHORS_FILE)

# se vuoi limitarti agli autori con etichetta (coerenza con le altre figure)
labels_df = pd.read_csv(LABELS_FILE, usecols=["author","radical"])
labels_df["author"] = labels_df["author"].astype(str)
lab_set = set(labels_df["author"].tolist())
authors = [a for a in authors if a in lab_set]   # mantieni ordine
authors_set = set(authors)
nA = len(authors)

df_csl_all = load_edges_semicolon(CSL_ALL_EDGES)
if not df_csl_all.empty:
    df_csl_all = df_csl_all[df_csl_all["u"].isin(authors_set) & df_csl_all["v"].isin(authors_set)].reset_index(drop=True)

# layer fissi strengths una volta sola
str_uil = strength_from_stream_file(UIL_FILE, authors_set)
str_tdl = strength_from_stream_file(TDL_FILE, authors_set)
str_asl = strength_from_stream_file(ASL_FILE, authors_set)

# percentili su CSL candidati
sims = df_csl_all["w"].astype("float64").values if not df_csl_all.empty else np.array([0.0])
taus = {p: custom_round(np.percentile(sims, p)) for p in PERCENTILES}
print("tau_c by percentile:", {k: f"{v:.3f}" for k, v in taus.items()})

# ----------------- CRC per ciascun tau -----------------
crc_by_tau = {}
for p in PERCENTILES:
    tau = taus[p]
    if df_csl_all.empty:
        df_csl_tau = pd.DataFrame(columns=["u","v","w"])
    else:
        df_csl_tau = df_csl_all[df_csl_all["w"] > tau].reset_index(drop=True)
    str_csl = strength_from_csl_df(df_csl_tau, authors_set)
    str_by_layer = {"uil": str_uil, "csl": str_csl, "tdl": str_tdl, "asl": str_asl}
    crc_by_tau[p] = compute_crc_from_strengths(str_by_layer, authors, omega=omega, beta=beta)

# baseline sets (90th)
if BASELINE not in crc_by_tau:
    raise SystemExit("Baseline percentile not computed.")
baseline_crc = crc_by_tau[BASELINE]

k_sizes = [max(1, int(np.ceil(fr * nA))) for fr in K_FRACS]

baseline_sets = {K_LABELS[i]: topk_set(baseline_crc, k_sizes[i]) for i in range(len(K_FRACS))}

# ----------------- Jaccard matrix -----------------
M = np.zeros((len(PERCENTILES), len(K_FRACS)), dtype=float)
for i, p in enumerate(PERCENTILES):
    crc_p = crc_by_tau[p]
    for j, ksz in enumerate(k_sizes):
        s_tau = topk_set(crc_p, ksz)
        base_set = baseline_sets[K_LABELS[j]]
        M[i, j] = jaccard(s_tau, base_set)

# salva CSV (formato wide: una riga per percentile)
df_out = pd.DataFrame({
    "percentile": PERCENTILES,
    K_LABELS[0]: M[:, 0],
    K_LABELS[1]: M[:, 1],
    K_LABELS[2]: M[:, 2],
})
df_out.to_csv(OUT_CSV, index=False)
print(f"Saved Jaccard table → {OUT_CSV}")

# ----------------- HEATMAP -----------------
fig, ax = plt.subplots(figsize=(6.8, 3.8))
im = ax.imshow(M, aspect="auto", vmin=0.0, vmax=1.0)

# axis ticks/labels
ax.set_yticks(range(len(PERCENTILES)))
ax.set_yticklabels([f"{p}%" for p in PERCENTILES])
ax.set_xticks(range(len(K_LABELS)))
ax.set_xticklabels([f"top-{lbl}" for lbl in K_LABELS])

ax.set_xlabel("Top-k set")
ax.set_ylabel(r"Threshold percentile $\tau_c$")

# colorbar
cbar = plt.colorbar(im, ax=ax)
cbar.set_label("Jaccard overlap")

# annotate cells
for i in range(M.shape[0]):
    for j in range(M.shape[1]):
        ax.text(j, i, f"{M[i,j]:.2f}", ha="center", va="center", fontsize=9)

# baseline row marker
try:
    base_idx = PERCENTILES.index(BASELINE)
    ax.hlines(base_idx, -0.5, len(K_LABELS)-0.5, colors="k", linestyles="--", linewidth=1)
except ValueError:
    pass

fig.tight_layout()
fig.savefig(OUT_FIG, bbox_inches="tight")
plt.close(fig)
print(f"Saved figure -> {OUT_FIG}")

tau_c by percentile: {50: '0.309', 60: '0.329', 70: '0.369', 75: '0.379', 80: '0.409', 85: '0.429', 90: '0.459', 95: '0.519'}
Saved Jaccard table → data/thresholds_analysis/results_threshold_crc_jaccard.csv
Saved figure -> data/thresholds_analysis/fig_threshold_jaccard_heatmap.png


In [2]:
import os, math, gc
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

# ---------- CONFIG ----------
EVENTS = {
    "2008 Elections": "data/2008_elections",
    "2011 Occupy Wall Street": "data/2011_wallstreet",
    "2016 Elections": "data/2016_elections",
    "2017 Charlottesville Rally": "data/2017_rally",
    "2021 Capitol Riot": "data/2021_riot",
}

OUT_DIR = os.path.join("data", "thresholds_analysis")
os.makedirs(OUT_DIR, exist_ok=True)
OUT_CSV = os.path.join(OUT_DIR, "robustness_summary.csv")

P_SWEEP_ALPHA = [70, 75, 80, 85, 90, 95]   # for α and GCC
P_SWEEP_JACC  = [80, 85, 90, 95]           # for top-1% Jaccard
BASELINE = 90
AUC_DROP_OK = 0.05   # <= 5%
JACC_MIN_OK = 0.60
GCC_MIN_OK  = 80.0

omega = 1.0
beta  = {"uil": 1.0, "csl": 1.0, "tdl": 1.0, "asl": 1.0}

# ---------- helpers ----------
def custom_round(x: float) -> float:
    r = round(float(x), 3); s = f"{r:.3f}"
    return float(s[:-1] + "9")

def load_authors(path):
    with open(path, "r", encoding="utf-8") as f:
        return [ln.strip() for ln in f if ln.strip()]

def strength_from_stream_file(path, authors_set):
    """UIL/TDL/ASL: stream 'u;v;w' and accumulate log1p(w). Sparse dict."""
    strengths = defaultdict(float)
    if not os.path.isfile(path):
        return strengths
    with open(path, "r", encoding="utf-8") as fin:
        for line in fin:
            parts = line.rstrip("\n").split(";")
            if len(parts) != 3: continue
            u, v, ws = parts
            if (u not in authors_set) or (v not in authors_set): continue
            try:
                w = float(ws)
            except ValueError:
                continue
            inc = math.log1p(w)
            strengths[u] += inc
            strengths[v] += inc
    return strengths

def get_tau_percentiles_stream(path, percentiles):
    """Compute CSL similarity percentiles by streaming. Works across tdigest versions."""
    from tdigest import TDigest
    d = TDigest()
    count = 0
    if os.path.isfile(path):
        with open(path, "r", encoding="utf-8") as fin:
            for line in fin:
                parts = line.rstrip("\n").split(";")
                if len(parts) != 3:
                    continue
                try:
                    d.update(float(parts[2]))
                    count += 1
                except Exception:
                    continue
    taus = {}
    for p in sorted(set(percentiles)):
        if count == 0:
            val = 0.0
        else:
            # Prefer percentile([0..100]); fall back to quantile([0..1])
            try:
                val = d.percentile(p)             # tdigest ≥0.5.2
            except Exception:
                val = d.quantile(p / 100.0)       # older API
        taus[p] = custom_round(float(val) if val is not None else 0.0)
    return taus

class DSU:
    """Union–find for GCC% without building a graph."""
    __slots__ = ("parent","size")
    def __init__(self, n):
        self.parent = list(range(n))
        self.size = [1]*n
    def find(self, x):
        while self.parent[x] != x:
            self.parent[x] = self.parent[self.parent[x]]
            x = self.parent[x]
        return x
    def union(self, a, b):
        ra, rb = self.find(a), self.find(b)
        if ra == rb: return
        if self.size[ra] < self.size[rb]:
            ra, rb = rb, ra
        self.parent[rb] = ra
        self.size[ra] += self.size[rb]
    def max_size(self):
        # Only sizes of roots matter; but scanning size is fine.
        return max(self.size) if self.size else 0

def csl_strength_and_gcc_stream(path, authors, authors_set, tau):
    """
    Stream CSL once for a given tau:
      - accumulate strengths (log1p(w)) for u,v when w>tau
      - union endpoints in DSU to track GCC size
    Returns (strengths_dict, gcc_pct, edges_kept)
    """
    strengths = defaultdict(float)
    n = len(authors)
    idx = {a:i for i,a in enumerate(authors)}
    dsu = DSU(n)
    edges_kept = 0
    if os.path.isfile(path):
        with open(path, "r", encoding="utf-8") as fin:
            for line in fin:
                parts = line.rstrip("\n").split(";")
                if len(parts) != 3: continue
                u, v, ws = parts
                if (u not in authors_set) or (v not in authors_set): continue
                try:
                    w = float(ws)
                except ValueError:
                    continue
                if w > tau:
                    edges_kept += 1
                    inc = math.log1p(w)
                    strengths[u] += inc
                    strengths[v] += inc
                    dsu.union(idx[u], idx[v])
    gcc_pct = 100.0 * dsu.max_size() / max(n,1)
    return strengths, gcc_pct, edges_kept

def compute_crc_from_strengths(str_by_layer, authors, omega=1.0, beta=None):
    if beta is None: beta = {l: 1.0 for l in str_by_layer.keys()}
    # Precompute totals per author lazily
    layers = list(str_by_layer.keys())
    CRC = {}
    n = len(authors)
    for a in authors:
        total_phi = 0.0
        for l in layers:
            total_phi += str_by_layer[l].get(a, 0.0)
        prod = 1.0
        for l in layers:
            phi_val = str_by_layer[l].get(a, 0.0)
            eff = (phi_val + omega * (total_phi - phi_val)) / max(n - 1, 1)
            prod *= (1.0 + beta.get(l, 1.0) * eff)
        CRC[a] = prod
    return CRC

def fit_logit_alpha_auc(x, y):
    alpha = np.nan; lo = np.nan; hi = np.nan; auc = np.nan
    try:
        import statsmodels.api as sm
        X = sm.add_constant(x)
        res = sm.Logit(y, X).fit(disp=0)
        alpha = float(res.params[1])
        ci = res.conf_int(alpha=0.05)
        lo, hi = float(ci.iloc[1,0]), float(ci.iloc[1,1])
        p = res.predict(X)
        auc = roc_auc_score(y, p) if len(np.unique(y))==2 else np.nan
        return alpha, lo, hi, auc
    except Exception:
        lr = LogisticRegression(solver="liblinear")
        lr.fit(x.reshape(-1,1), y)
        p = lr.predict_proba(x.reshape(-1,1))[:,1]
        auc = roc_auc_score(y, p)
        alpha = float(lr.coef_.ravel()[0])
        lo = hi = np.nan
        return alpha, lo, hi, auc

def topk_set(crc_dict, k):
    items = sorted(crc_dict.items(), key=lambda kv: (-kv[1], kv[0]))
    k = max(1, min(k, len(items)))
    return set(a for a,_ in items[:k])

def jaccard(a,b):
    if not a and not b: return 1.0
    u = len(a|b)
    return len(a&b)/u if u>0 else 0.0

# ---------- main loop ----------
rows = []
for ev_name, ev_folder in EVENTS.items():
    AUTHORS_FILE   = os.path.join(ev_folder, "network", "authors.txt")
    LABELS_FILE    = os.path.join(ev_folder, "CRC_radicalization_analysis.csv")
    CSL_ALL_EDGES  = os.path.join(ev_folder, "cslasl-pre", "edges", "edges.txt")
    UIL_FILE       = os.path.join(ev_folder, "network", "uil", "edges.txt")
    TDL_FILE       = os.path.join(ev_folder, "network", "tdl", "edges.txt")
    ASL_FILE       = os.path.join(ev_folder, "network", "asl", "edges.txt")

    # Authors + labels
    authors = load_authors(AUTHORS_FILE)
    labs = pd.read_csv(LABELS_FILE, usecols=["author","radical"]).astype({"author":str})
    lab_set = set(labs["author"])
    authors = [a for a in authors if a in lab_set]  # keep order, ensure labels exist
    y = labs.set_index("author").loc[authors, "radical"].astype(int).values
    authors_set = set(authors)
    if len(np.unique(y)) < 2:
        # skip event if only one class
        continue

    # Fixed-layer strengths (streaming, sparse dicts)
    str_uil = strength_from_stream_file(UIL_FILE, authors_set)
    str_tdl = strength_from_stream_file(TDL_FILE, authors_set)
    str_asl = strength_from_stream_file(ASL_FILE, authors_set)

    # Percentiles for CSL via streaming digest
    taus = get_tau_percentiles_stream(CSL_ALL_EDGES, P_SWEEP_ALPHA + P_SWEEP_JACC)
    # Baseline top-1% set (stream CSL once at tau=90 for strengths + GCC not needed here)
    str_csl_base, _, _ = csl_strength_and_gcc_stream(CSL_ALL_EDGES, authors, authors_set, taus[BASELINE])
    crc_base = compute_crc_from_strengths(
        {"uil":str_uil,"csl":str_csl_base,"tdl":str_tdl,"asl":str_asl},
        authors, omega=omega, beta=beta
    )
    k1 = max(1, int(math.ceil(0.01 * len(authors))))
    base_top1 = topk_set(crc_base, k1)
    del str_csl_base, crc_base
    gc.collect()

    # Sweep α/AUC/GCC with single-pass per tau (low RAM)
    alphas, los, his, aucs, gccs = [], [], [], [], []
    for p in P_SWEEP_ALPHA:
        tau = taus[p]
        str_csl_tau, gcc_pct, _ = csl_strength_and_gcc_stream(CSL_ALL_EDGES, authors, authors_set, tau)
        str_by_layer = {"uil":str_uil,"csl":str_csl_tau,"tdl":str_tdl,"asl":str_asl}
        CRC = compute_crc_from_strengths(str_by_layer, authors, omega=omega, beta=beta)
        x = np.array([CRC[a] for a in authors], dtype=float)
        mu, sd = x.mean(), x.std(); sd = sd if sd>0 else 1.0
        xz = (x - mu) / sd
        alpha, lo, hi, auc = fit_logit_alpha_auc(xz, y)
        alphas.append(alpha); los.append(lo); his.append(hi); aucs.append(auc); gccs.append(gcc_pct)
        del str_csl_tau, str_by_layer, x, xz
        gc.collect()

    # Jaccard min (top-1%) on 80..95 — stream once per tau
    jmins = []
    for p in P_SWEEP_JACC:
        tau = taus[p]
        str_csl_tau, _, _ = csl_strength_and_gcc_stream(CSL_ALL_EDGES, authors, authors_set, tau)
        crc_tau = compute_crc_from_strengths({"uil":str_uil,"csl":str_csl_tau,"tdl":str_tdl,"asl":str_asl},
                                             authors, omega=omega, beta=beta)
        s_tau = topk_set(crc_tau, k1)
        jmins.append(jaccard(s_tau, base_top1))
        del str_csl_tau, crc_tau, s_tau
        gc.collect()
    jmin = float(np.min(jmins)) if jmins else np.nan

    # Baseline indices e metriche
    idx90 = P_SWEEP_ALPHA.index(BASELINE)
    alpha90, lo90, hi90 = alphas[idx90], los[idx90], his[idx90]
    auc90 = aucs[idx90]

    # Max drop AUC rel. vs baseline
    auc_drop_rel = max([max(0.0, (auc90 - a)/auc90) for a in aucs if np.isfinite(a) and np.isfinite(auc90) and auc90>0] + [0.0])
    auc_drop_pct = 100.0 * auc_drop_rel

    # Min GCC% 70..95
    gcc_min = float(np.nanmin(gccs)) if len(gccs)>0 else np.nan

    # pass/fail
    alpha_pass = all((not np.isnan(l) and l>0.0) for l in los)  # CI low > 0 in 70..95
    auc_pass   = auc_drop_rel <= AUC_DROP_OK
    net_pass   = (not np.isnan(jmin) and jmin >= JACC_MIN_OK) and (not np.isnan(gcc_min) and gcc_min >= GCC_MIN_OK)

    rows.append({
        "Event": ev_name,
        "alpha@90": alpha90,
        "alpha_ci": f"[{lo90:.2f},{hi90:.2f}]" if np.isfinite(lo90) and np.isfinite(hi90) else "",
        "AUC@90": auc90,
        "Max AUC drop (%)": auc_drop_pct,
        "Min Jaccard top-1%": jmin,
        "Min GCC%": gcc_min,
        "✓_alpha": "✓" if alpha_pass else "✗",
        "✓_AUC":   "✓" if auc_pass else "✗",
        "✓_Net":   "✓" if net_pass else "✗",
    })

    # free per-event
    del str_uil, str_tdl, str_asl, authors, authors_set, labs, y
    gc.collect()

df = pd.DataFrame(rows)
df.to_csv(OUT_CSV, index=False)
print(f"Saved -> {OUT_CSV}")
df

Saved -> data/thresholds_analysis/robustness_summary.csv


Unnamed: 0,Event,alpha@90,alpha_ci,AUC@90,Max AUC drop (%),Min Jaccard top-1%,Min GCC%,✓_alpha,✓_AUC,✓_Net
0,2008 Elections,1.301132,,0.78106,1.196286,0.808219,82.78216,✗,✓,✓
1,2011 Occupy Wall Street,1.076047,,0.767263,2.567598,0.853801,59.265817,✗,✓,✗
2,2016 Elections,1.124495,,0.779893,1.878128,0.849375,86.639579,✗,✓,✓
3,2017 Charlottesville Rally,0.989415,,0.755183,0.0,1.0,0.00096,✗,✓,✗
4,2021 Capitol Riot,0.897817,,0.737315,0.0,1.0,0.000393,✗,✓,✗
