In [None]:
# anes_social_media_composition_csv.py
# ANES 2020 vs 2024 (CSV) — weighted composition by platform

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from textwrap import wrap

import matplotlib.gridspec as gridspec

# ------------------ PATHS ------------------
PATH20 = "anes_timeseries_2020_csv_20220210.csv"
PATH24 = "anes_timeseries_2024_csv_20250808.csv"

OUTDIR = "figures"
os.makedirs(OUTDIR, exist_ok=True)

# ------------------ VARIABLE MAPS (BRITTLE) ------------------
# Weights (post, full sample)
WEIGHT_20 = "V200010b"   # 2020 post-election
WEIGHT_24 = "V240107b"   # 2024 post-election full sample

# Presidential vote (kept for later figures; not used directly here)
VOTE_20 = "V202073"
VOTE_24 = "V242067"

# Demographics
VARS_20 = dict(
    gender="V201600",      # PRE: sex (1 male, 2 female)
    age   ="V201507x",     # PRE: summary age (80=80+)
    race  ="V201549x",     # PRE: summary race/ethnicity
    educ  ="V201511x",     # PRE: 5-cat education
)
VARS_24 = dict(
    gender="V241550",      # PRE: sex (1 male, 2 female)
    age   ="V241458x",     # PRE: summary age (80=80+)
    race  ="V241501x",     # PRE: summary race/ethnicity
    educ  ="V241465x",     # PRE: 5-cat education
)

# Platform flags: visited/mentioned (1) vs not (0)
PLAT_20 = {
    "Facebook":"V202541a","Twitter":"V202541b","Instagram":"V202541c",
    "Reddit":"V202541d","YouTube":"V202541e","Snapchat":"V202541f","TikTok":"V202541g","Other":"V202541h"
}
PLAT_24 = {
    "Facebook":"V242577a","Twitter":"V242577b","Instagram":"V242577c",
    "Reddit":"V242577d","YouTube":"V242577e","Snapchat":"V242577f","TikTok":"V242577g","Other":"V242577h"
}

# Add a virtual "Population" group (flag=None means use entire weighted sample)
PLAT_20_WITH_POP = {"Population": None, **PLAT_20}
PLAT_24_WITH_POP = {"Population": None, **PLAT_24}

# ------------------ IO HELPERS ------------------
def read_anes_csv(path, keep_cols):
    df = pd.read_csv(path, dtype=str, usecols=lambda c: c in keep_cols)
    for c in keep_cols:
        if c not in df.columns:
            df[c] = pd.NA
    df.columns = df.columns.str.strip()
    return df

def coerce_numeric(df, cols):
    for c in cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")
    return df

# ------------------ RECODERS (STRICT) ------------------
def recode_gender_sex(series):
    s = pd.to_numeric(series, errors="coerce")
    out = pd.Series(pd.NA, index=s.index, dtype="object")
    out[s == 1] = "Male"
    out[s == 2] = "Female"
    return pd.Categorical(out, categories=["Male","Female"], ordered=False)

def recode_age_band(series):
    a = pd.to_numeric(series, errors="coerce")
    return pd.cut(a, bins=[17,29,44,64,200],
                  labels=["18–29","30–44","45–64","65+"],
                  right=True, include_lowest=True)

def recode_race_5(series):
    r = pd.to_numeric(series, errors="coerce")
    out = pd.Series(pd.NA, index=r.index, dtype="object")
    out[r == 1] = "White NH"
    out[r == 2] = "Black"
    out[r == 3] = "Hispanic"
    out[r == 4] = "Asian"
    out[r.isin([5,6])] = "Other/Mixed"
    return pd.Categorical(out, categories=["White NH","Black","Hispanic","Asian","Other/Mixed"], ordered=False)

def recode_educ_4(series):
    e = pd.to_numeric(series, errors="coerce")
    out = pd.Series(pd.NA, index=e.index, dtype="object")
    out[e <= 2] = "HS or less"
    out[e == 3] = "Some college"
    out[e == 4] = "BA"
    out[e == 5] = "Postgrad"
    return pd.Categorical(out, categories=["HS or less","Some college","BA","Postgrad"], ordered=True)

# ------------------ WEIGHTED SHARE ------------------
def weighted_share(mask, weights):
    """Share of total weight where mask==True. weights must be >0 and finite."""
    w = np.asarray(weights, dtype=float)
    m = np.asarray(mask, dtype=bool)
    ok = np.isfinite(w) & (w > 0) & np.isfinite(m)
    w = w[ok]; m = m[ok]
    if w.size == 0:
        return np.nan
    return w[m].sum() / w.sum()

def weighted_mean(series, weights):
    s = pd.to_numeric(series, errors="coerce")
    w = np.asarray(weights, float)
    m = np.isfinite(s) & np.isfinite(w) & (w > 0)
    return float(np.sum(s[m] * w[m]) / np.sum(w[m])) if m.any() else np.nan

def compute_platform_age_means(df, age_col, weight_col, platform_map, year):
    """
    Returns {(year, platform): mean_age} for each platform (and 'Population' if present).
    """
    out = {}
    for plat, flag in platform_map.items():
        if flag is None:
            users = df.copy()
        else:
            if flag not in df.columns:
                out[(year, plat)] = np.nan
                continue
            users = df[pd.to_numeric(df[flag], errors="coerce") == 1].copy()
        out[(year, plat)] = weighted_mean(users[age_col], users[weight_col])
    return out

def tidy_comp(df, demo_col, weight_col, platform_map, year, categories_order,
              include_missing=True, missing_label="Missing/NA"):
    """
    Weighted composition for each platform + 'Population' (if present).
    Adds an explicit Missing/NA slice so stacks sum to 1.
    """
    rows = []
    base_cats = list(categories_order)  # categories to compute explicitly

    for plat, flag in platform_map.items():
        # Choose users (platform visitors) or the whole sample (Population)
        if flag is None:
            users = df.copy()
        else:
            if flag not in df.columns:
                # Still emit zeros to keep stacks aligned
                for cat in (base_cats + ([missing_label] if include_missing else [])):
                    rows.append(dict(year=year, platform=plat, category=cat, p=0.0))
                continue
            users = df[pd.to_numeric(df[flag], errors="coerce") == 1].copy()

        if users.empty:
            for cat in (base_cats + ([missing_label] if include_missing else [])):
                rows.append(dict(year=year, platform=plat, category=cat, p=0.0))
            continue

        # Main categories
        total_known = 0.0
        for cat in base_cats:
            p = weighted_share(users[demo_col] == cat, users[weight_col])
            p = float(p) if np.isfinite(p) else 0.0
            rows.append(dict(year=year, platform=plat, category=cat, p=p))
            total_known += p

        # Missing/NA slice so the stack sums to ~1
        if include_missing:
            p_miss = weighted_share(users[demo_col].isna(), users[weight_col])
            if not np.isfinite(p_miss):
                p_miss = 0.0
            # numerical safety & residual catch-all
            p_resid = max(0.0, min(1.0, float(p_miss)))
            # if rounding error leaves a tiny gap, fill with residual
            gap = max(0.0, 1.0 - (total_known + p_resid))
            rows.append(dict(year=year, platform=plat, category=missing_label, p=p_resid + gap))

    return pd.DataFrame(rows)

# ------------------ PLOTTER ------------------
CB_PALETTE = ["#0072B2","#E69F00","#009E73","#D55E00","#CC79A7",
              "#56B4E9","#F0E442","#000000"]  # color-blind friendly

def plot_grouped_stacked(comp_df, title, outfile, categories_order,
                         show_missing=True, annotations=None, anno_fmt="Mean: {m:.0f}", ypad=0.025):
    """
    If `annotations` is provided, it should be a dict {(year, platform): mean_age}.
    We'll print a short label above each bar (2020 left, 2024 right).
    """
    cats = list(categories_order) + (
        ["Missing/NA"] if show_missing and "Missing/NA" in comp_df["category"].unique() else []
    )
    platforms = ["Population"] + [
        p for p in PLAT_20.keys() if p in comp_df["platform"].unique()
    ]
    width = 0.38
    x = np.arange(len(platforms))

    # --- slightly smaller legend column ---
    fig = plt.figure(figsize=(12, 5.5), dpi=300)
    gs = gridspec.GridSpec(nrows=1, ncols=2, width_ratios=[1.0, 0.18], wspace=0.02)
    ax = fig.add_subplot(gs[0])
    ax_leg = fig.add_subplot(gs[1])
    ax_leg.axis("off")

    def draw_stack(year, xoffset, hatch=None, edgecolor="black", alpha=0.95):
        for i, plat in enumerate(platforms):
            sub = comp_df[(comp_df["platform"] == plat) & (comp_df["year"] == year)]
            bottoms = 0.0
            for j, cat in enumerate(cats):
                row = sub[sub["category"] == cat]
                val = float(row["p"].values[0]) if not row.empty else 0.0
                ax.bar(
                    x[i] + xoffset,
                    val,
                    width,
                    bottom=bottoms,
                    color=CB_PALETTE[j % len(CB_PALETTE)],
                    edgecolor=edgecolor,
                    linewidth=0.6,
                    hatch=hatch,
                    alpha=alpha,
                )
                bottoms += val

            # --- annotation (mean age) ---
            if annotations is not None:
                m = annotations.get((year, plat), np.nan)
                if np.isfinite(m):
                    ax.text(
                        x[i] + xoffset, 1.0 + ypad, anno_fmt.format(m=m),
                        ha="center", va="bottom", fontsize=6
                    )

    draw_stack(2020, -width / 2, hatch=None)
    draw_stack(2024,  width / 2, hatch="///", edgecolor="black", alpha=0.85)

    ax.axvline(x=0.5, ymin=0, ymax=1, color="#aaaaaa", linewidth=0.8, linestyle="--")
    ax.set_xticks(x, ["\n".join(wrap(p, 12)) for p in platforms])
    ax.set_ylim(0, 1.08 if annotations is not None else 1.0)  # headroom for labels
    ax.set_ylabel("Share of platform users / population")
    ax.set_title(title, loc="left", fontweight="bold")
    ax.spines[["top", "right"]].set_visible(False)

    # --- compact legend layout ---
    comp_handles = [
        plt.Rectangle((0, 0), 1, 1, color=CB_PALETTE[i % len(CB_PALETTE)], ec="black", lw=0.5)
        for i, _ in enumerate(cats)
    ]
    leg1 = ax_leg.legend(comp_handles, cats, title="Composition", loc="upper left", frameon=False)
    ax_leg.add_artist(leg1)

    ypatch_solid = plt.Rectangle((0, 0), 1, 1, color="#BBBBBB", ec="black", lw=0.6)
    ypatch_hatch = plt.Rectangle(
        (0, 0), 1, 1, facecolor="#BBBBBB", ec="black", lw=0.6, hatch="///", alpha=0.85
    )
    ax_leg.legend(
        [ypatch_solid, ypatch_hatch],
        ["2020", "2024"],
        title="Year",
        loc="upper left",
        bbox_to_anchor=(0, 0.6),
        frameon=False,
    )

    fig.savefig(os.path.join(OUTDIR, outfile), dpi=300, bbox_inches="tight")
    plt.close(fig)


# ------------------ LOAD DATA ------------------
keep_20 = [WEIGHT_20, VOTE_20, *VARS_20.values(), *PLAT_20.values()]
keep_24 = [WEIGHT_24, VOTE_24, *VARS_24.values(), *PLAT_24.values()]

d20 = read_anes_csv(PATH20, keep_20)
d24 = read_anes_csv(PATH24, keep_24)

num_20 = [WEIGHT_20, VOTE_20, VARS_20["gender"], VARS_20["age"], VARS_20["race"], VARS_20["educ"], *PLAT_20.values()]
num_24 = [WEIGHT_24, VOTE_24, VARS_24["gender"], VARS_24["age"], VARS_24["race"], VARS_24["educ"], *PLAT_24.values()]
d20 = coerce_numeric(d20, num_20)
d24 = coerce_numeric(d24, num_24)

d20 = d20.rename(columns={WEIGHT_20:"weight"})
d24 = d24.rename(columns={WEIGHT_24:"weight"})
d20 = d20[(d20["weight"] > 0) & np.isfinite(d20["weight"])]
d24 = d24[(d24["weight"] > 0) & np.isfinite(d24["weight"])]

# ------------------ RECODE DEMOGRAPHICS ------------------
d20["gender2"] = recode_gender_sex(d20[VARS_20["gender"]])
d24["gender2"] = recode_gender_sex(d24[VARS_24["gender"]])

d20["age_band"] = recode_age_band(d20[VARS_20["age"]])
d24["age_band"] = recode_age_band(d24[VARS_24["age"]])

d20["race5"] = recode_race_5(d20[VARS_20["race"]])
d24["race5"] = recode_race_5(d24[VARS_24["race"]])

d20["educ4"] = recode_educ_4(d20[VARS_20["educ"]])
d24["educ4"] = recode_educ_4(d24[VARS_24["educ"]])

# ------------------ BUILD COMPOSITION TABLES ------------------
ORDER_GENDER = ["Male","Female"]
ORDER_AGE    = ["18–29","30–44","45–64","65+"]
ORDER_RACE   = ["White NH","Black","Hispanic","Asian","Other/Mixed"]
ORDER_EDUC   = ["HS or less","Some college","BA","Postgrad"]

comp_gender = pd.concat([
    tidy_comp(d20, "gender2", "weight", PLAT_20_WITH_POP, 2020, ORDER_GENDER, include_missing=True),
    tidy_comp(d24, "gender2", "weight", PLAT_24_WITH_POP, 2024, ORDER_GENDER, include_missing=True)
], ignore_index=True)

comp_age = pd.concat([
    tidy_comp(d20, "age_band", "weight", PLAT_20_WITH_POP, 2020, ORDER_AGE, include_missing=True),
    tidy_comp(d24, "age_band", "weight", PLAT_24_WITH_POP, 2024, ORDER_AGE, include_missing=True)
], ignore_index=True)

comp_race = pd.concat([
    tidy_comp(d20, "race5", "weight", PLAT_20_WITH_POP, 2020, ORDER_RACE, include_missing=True),
    tidy_comp(d24, "race5", "weight", PLAT_24_WITH_POP, 2024, ORDER_RACE, include_missing=True)
], ignore_index=True)

comp_educ = pd.concat([
    tidy_comp(d20, "educ4", "weight", PLAT_20_WITH_POP, 2020, ORDER_EDUC, include_missing=True),
    tidy_comp(d24, "educ4", "weight", PLAT_24_WITH_POP, 2024, ORDER_EDUC, include_missing=True)
], ignore_index=True)

# ------------------ PLOTS ------------------
plot_grouped_stacked(
    comp_gender,
    title="Gender (Sex) Composition by Social Media Platform (ANES 2020 vs 2024, weighted)",
    outfile="A_gender_by_platform_2020_vs_2024.png",
    categories_order=ORDER_GENDER,
)

# Weighted mean ages by platform & year (used only for the age plot)
age_means = {}
age_means.update(compute_platform_age_means(d20, VARS_20["age"], "weight", PLAT_20_WITH_POP, 2020))
age_means.update(compute_platform_age_means(d24, VARS_24["age"], "weight", PLAT_24_WITH_POP, 2024))

plot_grouped_stacked(
    comp_age,
    title="Age Composition by Social Media Platform (ANES 2020 vs 2024, weighted)",
    outfile="B_age_by_platform_2020_vs_2024.png",
    categories_order=ORDER_AGE,
    annotations=age_means,           # <— add
    anno_fmt=r"$\mu={m:.0f}$",        # <— format as requested
    ypad=0.03                        # <— small headroom above bars
)

plot_grouped_stacked(
    comp_race,
    title="Race/Ethnicity Composition by Social Media Platform (ANES 2020 vs 2024, weighted)",
    outfile="C_race_by_platform_2020_vs_2024.png",
    categories_order=ORDER_RACE,
)

plot_grouped_stacked(
    comp_educ,
    title="Education Composition by Social Media Platform (ANES 2020 vs 2024, weighted)",
    outfile="D_educ_by_platform_2020_vs_2024.png",
    categories_order=ORDER_EDUC,
)

print("Saved figures to:", OUTDIR)


Saved figures to: figures


In [188]:
# === Repair loader (final): keep all columns needed for posting vs. affective polarization ===
import pandas as pd, numpy as np

# Weights
WEIGHT_20, WEIGHT_24 = "V200010b", "V240107b"

# Presidential vote (kept for other figures; NOT used in these plots)
VOTE_20, VOTE_24 = "V202073", "V242067"

# Ideology (kept for other figs)
IDEO_20, IDEO_24 = "V201201", "V241178"

# Feeling thermometers (used here for affective polarization)
DEMFT_20, REPFT_20 = "V201156", "V201157"
DEMFT_24, REPFT_24 = "V241166", "V241167"

# Platform visit flags (already in PLAT_20/PLAT_24)

# Visit frequency (optional) and posting frequency (needed)
FREQ_FB_20, POST_FB_20 = "V202542", "V202543"
FREQ_TW_20, POST_TW_20 = "V202544", "V202545"
FREQ_FB_24, POST_FB_24 = "V242578", "V242579"
FREQ_TW_24, POST_TW_24 = "V242580", "V242581"

# Reddit posting frequency (2024 only)
POST_RD_24 = "V202547"

# -------- Build explicit read lists (include visit flags!) --------
needed_20 = [
    WEIGHT_20, VOTE_20, IDEO_20, DEMFT_20, REPFT_20,
    FREQ_FB_20, POST_FB_20, FREQ_TW_20, POST_TW_20,
    *PLAT_20.values()  # Facebook/Twitter/Reddit/... visit flags incl. V202541a/b/d etc.
]
needed_24 = [
    WEIGHT_24, VOTE_24, IDEO_24, DEMFT_24, REPFT_24,
    FREQ_FB_24, POST_FB_24, FREQ_TW_24, POST_TW_24, POST_RD_24,
    *PLAT_24.values()  # Facebook/Twitter/Reddit/... visit flags incl. V242577a/b/d etc.
]

def read_needed(path, cols, weight_name):
    df = pd.read_csv(path, dtype=str, usecols=lambda c: c in cols)
    # ensure every requested column exists
    for c in cols:
        if c not in df.columns:
            df[c] = pd.NA
    # coerce to numeric (weights, FTs, visit flags, posting freq)
    for c in cols:
        df[c] = pd.to_numeric(df[c], errors="coerce")
    # rename weight, filter positives
    df = df.rename(columns={weight_name: "weight"})
    df = df[(df["weight"] > 0) & np.isfinite(df["weight"])].copy()
    return df

d20x = read_needed(PATH20, needed_20, WEIGHT_20)
d24x = read_needed(PATH24, needed_24, WEIGHT_24)

# -------- Vote recodes (useful elsewhere; not used in the FT plots) --------
def recode_pres_vote_2020(series):
    v = pd.to_numeric(series, errors="coerce")
    out = pd.Series("N/A", index=v.index, dtype="object")
    out[v == 1] = "Democrat"          # Biden
    out[v == 2] = "Republican"        # Trump
    out[v.isin([3,4,5,8])] = "Other"  # Jorgensen/Hawkins/Other/Libertarian
    out[v == 7] = "Republican"        # "Specified as Republican candidate"
    out[v.isin([-9,-8,-7,-6,-1,11,12])] = "N/A"
    return pd.Categorical(out, ["Republican","Democrat","Other","N/A"])

def recode_pres_vote_2024(series):
    v = pd.to_numeric(series, errors="coerce")
    out = pd.Series("N/A", index=v.index, dtype="object")
    out[v == 1] = "Democrat"          # Harris
    out[v == 2] = "Republican"        # Trump
    out[v.isin([4,5,6])] = "Other"    # West/Stein/Another
    out[v.isin([9,-8,-7,-6,-1])] = "N/A"
    return pd.Categorical(out, ["Republican","Democrat","Other","N/A"])

d20x["vote_party"] = recode_pres_vote_2020(d20x.get(VOTE_20))
d24x["vote_party"] = recode_pres_vote_2024(d24x.get(VOTE_24))

# -------- Overwrite globals for downstream code --------
d20, d24 = d20x, d24x

# -------- Sanity prints so plots won't be "white" again --------
required_20 = ["weight", DEMFT_20, REPFT_20, POST_FB_20, POST_TW_20, PLAT_20["Facebook"], PLAT_20["Twitter"], PLAT_20["Reddit"]]
required_24 = ["weight", DEMFT_24, REPFT_24, POST_FB_24, POST_TW_24, POST_RD_24, PLAT_24["Facebook"], PLAT_24["Twitter"], PLAT_24["Reddit"]]

missing_20 = [c for c in required_20 if c not in d20.columns]
missing_24 = [c for c in required_24 if c not in d24.columns]
print("Missing in 2020 (should be empty):", missing_20)
print("Missing in 2024 (should be empty):", missing_24)

print("Rows after filters — 2020:", len(d20), " / 2024:", len(d24))


Missing in 2020 (should be empty): []
Missing in 2024 (should be empty): []
Rows after filters — 2020: 7453  / 2024: 4964


In [124]:
# =========================
# Helpers used in all figs
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools, math, collections, textwrap, os

OUT2 = "figures_extra"
os.makedirs(OUT2, exist_ok=True)

# Color-blind palette (same as before)
CB = ["#0072B2","#E69F00","#009E73","#D55E00","#CC79A7","#56B4E9","#F0E442","#000000"]

# ---------- weighted-stat helpers ----------
def kish_neff(w):
    w = np.asarray(w, float)
    w = w[np.isfinite(w) & (w>0)]
    return (w.sum()**2)/np.square(w).sum() if w.size else np.nan

def wmean(x, w):
    x = np.asarray(x, float); w = np.asarray(w, float)
    m = np.isfinite(x) & np.isfinite(w) & (w>0)
    if not m.any(): return np.nan
    return np.sum(w[m]*x[m]) / np.sum(w[m])

def wvar(x, w):
    # freq-weighted variance around weighted mean
    x = np.asarray(x, float); w = np.asarray(w, float)
    m = np.isfinite(x) & np.isfinite(w) & (w>0)
    if not m.any(): return np.nan
    wm = np.sum(w[m]*x[m]) / np.sum(w[m])
    return np.sum(w[m]*(x[m]-wm)**2) / np.sum(w[m])

def platform_users(df, flag):
    return df[pd.to_numeric(df[flag], errors="coerce")==1].copy()

def left_label(ax, title):
    ax.set_title(title, loc="left", fontweight="bold")

# ---------- PARTY: use PRESIDENTIAL VOTE by default ----------
# If earlier cells already created df["vote_party"], we use it; otherwise we
# create it here from the raw presidential-vote variables.

# Raw vote variables (brittle names)
VOTE_20 = "V202073"     # 2020 POST: presidential vote
VOTE_24 = "V242067"     # 2024 POST: presidential vote

def _recode_pres_vote_2020(series):
    v = pd.to_numeric(series, errors="coerce")
    out = pd.Series("N/A", index=v.index, dtype="object")
    out[v == 1] = "Democrat"                  # Biden
    out[v == 2] = "Republican"                # Trump
    out[v.isin([3,4,5,8])] = "Other"          # Jorgensen/Hawkins/Other/Libertarian
    out[v == 7] = "Republican"                # Specified as GOP candidate
    out[v.isin([-9,-8,-7,-6,-1,11,12])] = "N/A"
    return pd.Categorical(out, categories=["Republican","Democrat","Other","N/A"], ordered=False)

def _recode_pres_vote_2024(series):
    v = pd.to_numeric(series, errors="coerce")
    out = pd.Series("N/A", index=v.index, dtype="object")
    out[v == 1] = "Democrat"                  # Harris
    out[v == 2] = "Republican"                # Trump
    out[v.isin([4,5,6])] = "Other"            # West, Stein, Another
    out[v.isin([9,-8,-7,-6,-1])] = "N/A"
    return pd.Categorical(out, categories=["Republican","Democrat","Other","N/A"], ordered=False)

def ensure_vote_party(df, year):
    """Ensure df has a categorical column 'vote_party' (Rep/Dem/Other/N/A)."""
    if "vote_party" in df.columns:
        return df
    if year == 2020 and VOTE_20 in df.columns:
        df = df.copy()
        df["vote_party"] = _recode_pres_vote_2020(df[VOTE_20])
        return df
    if year == 2024 and VOTE_24 in df.columns:
        df = df.copy()
        df["vote_party"] = _recode_pres_vote_2024(df[VOTE_24])
        return df
    # If we get here, we can't build it; leave as-is and you can explicitly supply another party col.
    return df

# Convenience to get boolean masks for Dem/Rep/Other based on vote_party (default)
def party_masks(df, party_col="vote_party"):
    dem = df[party_col].astype(object).eq("Democrat")
    rep = df[party_col].astype(object).eq("Republican")
    oth = df[party_col].astype(object).eq("Other")
    return dem, rep, oth

# ---------- OPTIONAL: FEELING THERMOMETER party (only use if you explicitly request it) ----------
# Kept here for explicit FT-based analyses; NOT used by default.
def derive_party_family_ft(df, dem_ft, rep_ft, thresh=10):
    d = pd.to_numeric(df[dem_ft], errors="coerce")
    r = pd.to_numeric(df[rep_ft], errors="coerce")
    diff = d - r
    out = pd.Series("Other", index=df.index, dtype="object")
    out[diff >= thresh] = "Democrat"
    out[diff <= -thresh] = "Republican"
    out = out.where(out.isin(["Democrat","Republican"]), "Other")
    return pd.Categorical(out, categories=["Republican","Democrat","Other"], ordered=False)

# --------------------------
# Variable names (brittle)
# --------------------------
# Ideology (forced L/C)
IDEO_20 = "V201201"   # 1 Liberal, 2 Conservative, 3 Moderate(VOL) -> drop
IDEO_24 = "V241178"   # 1 Liberal, 2 Conservative

# Feeling thermometers (loaded but only used when explicitly asked)
DEMFT_20, REPFT_20 = "V201156", "V201157"
DEMFT_24, REPFT_24 = "V241166", "V241167"

# FB/Twitter frequency and posting frequency
FREQ_FB_20, POST_FB_20 = "V202542", "V202543"
FREQ_TW_20, POST_TW_20 = "V202544", "V202545"
FREQ_FB_24, POST_FB_24 = "V242578", "V242579"
FREQ_TW_24, POST_TW_24 = "V242580", "V242581"

# Core platform lists for overlap, counts, etc. (6 major)
PLATS6_20 = {
    "Facebook":PLAT_20["Facebook"], "YouTube":PLAT_20["YouTube"], "Instagram":PLAT_20["Instagram"],
    "Twitter":PLAT_20["Twitter"], "TikTok":PLAT_20["TikTok"], "Reddit":PLAT_20["Reddit"]
}
PLATS6_24 = {
    "Facebook":PLAT_24["Facebook"], "YouTube":PLAT_24["YouTube"], "Instagram":PLAT_24["Instagram"],
    "Twitter":PLAT_24["Twitter"], "TikTok":PLAT_24["TikTok"], "Reddit":PLAT_24["Reddit"]
}


In [None]:
def num_platforms_dist(
    df, plats, year,
    weight_col="weight",
    include_other=True,
    other_col=None  # pass PLAT_20.get("Other") / PLAT_24.get("Other") or leave None to auto-detect
):
    # Normalize platform columns list
    if isinstance(plats, dict):
        plat_cols = list(plats.values())
        # If the dict has an "Other" key and include_other=True, append its column
        if include_other and ("Other" in plats):
            oc = plats["Other"]
            if oc and (oc not in plat_cols):
                plat_cols.append(oc)
    else:
        plat_cols = list(plats)

    # Manual override: if other_col is provided and present, append it
    if include_other and other_col:
        if (other_col in df.columns) and (other_col not in plat_cols):
            plat_cols.append(other_col)

    # Keep only columns that actually exist in df
    plat_cols = [c for c in plat_cols if c in df.columns]

    # Build 0/1 flags for each platform (treat exactly 1 as visited)
    flags = pd.DataFrame({
        c: pd.to_numeric(df[c], errors="coerce").eq(1).astype(int)
        for c in plat_cols
    }, index=df.index)

    # Count of platforms visited
    k_raw = flags.sum(axis=1).astype(float)

    # Top-code for the histogram bin labels: e.g., if 7 platforms, labels 0..6 and "7+"
    nplat = len(plat_cols)
    k_top = k_raw.clip(lower=0, upper=nplat)

    # Make labels dynamically: "0","1",...,"(nplat-1)","nplat+"
    labels = [str(i) for i in range(nplat)] + [f"{nplat}+"]
    # Bin edges: -0.5, 0.5, 1.5, ..., (nplat-0.5), big number
    edges = [-0.5] + [i + 0.5 for i in range(nplat)] + [1e9]
    cats = pd.cut(k_top, bins=edges, labels=labels, include_lowest=True, right=True)

    # Weighted share in each bin
    # (fallback wprop_ci if not defined upstream)
    def _wprop_ci(mask, w):
        m = np.asarray(mask, bool); w = np.asarray(w, float)
        ok = np.isfinite(m) & np.isfinite(w) & (w>0)
        if not ok.any(): return (np.nan, np.nan, np.nan, np.nan)
        m, w = m[ok], w[ok]
        W = w.sum(); p = (w[m].sum())/W if W>0 else np.nan
        neff = (W**2)/np.square(w).sum() if w.size else np.nan
        se = math.sqrt(p*(1-p)/neff) if np.isfinite(neff) and neff>0 and np.isfinite(p) else np.nan
        return p, p-1.96*se, p+1.96*se, se

    w = df[weight_col]
    rows = []
    for lab in labels:
        mask = (cats == lab)
        p, _, _, _ = (_wprop_ci(mask, w) if "wprop_ci" not in globals() else wprop_ci(mask, w))
        rows.append((lab, p))

    T = pd.DataFrame(rows, columns=["k", "share"]).assign(year=year)

    # Weighted mean + CI for the (un-top-coded) number of platforms
    mu = wmean(k_raw, w)
    neff = kish_neff(w)
    se_mu = np.sqrt(wvar(k_raw, w)) / np.sqrt(neff) if (np.isfinite(neff) and neff > 0) else np.nan
    ci = (mu - 1.96*se_mu, mu + 1.96*se_mu)

    return T, mu, ci


# ---------- Build tables for 2020 and 2024 ----------
# Pass the actual "Other" column from your maps (robust to None if missing)
t20, mu20, ci20 = num_platforms_dist(
    d20, PLATS6_20, 2020,
    weight_col="weight",
    include_other=True,
    other_col=PLAT_20.get("Other")
)
t24, mu24, ci24 = num_platforms_dist(
    d24, PLATS6_24, 2024,
    weight_col="weight",
    include_other=True,
    other_col=PLAT_24.get("Other")
)

# Ensure same x order for both (labels are identical from the function)
t20 = t20.sort_values("k").reset_index(drop=True)
t24 = t24.sort_values("k").reset_index(drop=True)

# ---------- Plot (larger text everywhere) ----------
fig, ax = plt.subplots(figsize=(9.5, 6.0), dpi=400)

# Base font sizes
FS_TICKS = 20
FS_LABEL = 19
FS_TITLE = 20
FS_LEG = 19
FS_ANN = 18

x = np.arange(len(t20))
barw = 0.36

ax.bar(x - 0.18, t20["share"], width=barw, color=CB[0], edgecolor="black", linewidth=0.8, label="2020", zorder=2)
ax.bar(x + 0.18, t24["share"], width=barw, color=CB[2], edgecolor="black", linewidth=0.8, label="2024", zorder=2)

ax.set_xticks(x, t20["k"], fontsize=FS_TICKS)
ax.set_ylabel("Share of population", fontsize=FS_LABEL)
ax.set_xlabel("Number of Platforms Visited", fontsize=FS_LABEL)

ax.tick_params(axis="y", labelsize=FS_TICKS)
ax.yaxis.grid(True, linestyle=":", alpha=0.35)
ax.set_axisbelow(True)

ax.legend(frameon=False, fontsize=FS_LEG)

ax.text(
    0.55, 0.98,
    f"Mean (2020): {mu20:.2f} [{ci20[0]:.2f}, {ci20[1]:.2f}]\n"
    f"Mean (2024): {mu24:.2f} [{ci24[0]:.2f}, {ci24[1]:.2f}]",
    transform=ax.transAxes, va="top", ha="left", fontsize=FS_ANN
)

ax.spines[['top', 'right']].set_visible(False)

fig.tight_layout()
fig.savefig(os.path.join(OUT2, "M_num_platforms_distribution_including_other.png"), bbox_inches="tight")
plt.close(fig)


In [125]:
# ============================
# Usage, party composition, ideology (with "Other" platform),
# plus compact change-bar panels with larger text
# ============================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

# ---------- Font sizes ----------
FS_TICKS = 22
FS_LABEL = 22
FS_TITLE = 22
FS_LEG   = 22
FS_ANN   = 22

# ---------- Small helpers ----------
def _with_other(plats, df, include_other=True, other_col="visited_other"):
    """Append 'Other' platform if requested and present."""
    if isinstance(plats, dict):
        platmap = plats.copy()
    else:
        platmap = {c: c for c in plats}
    if include_other and (other_col in df.columns) and ("Other" not in platmap):
        platmap["Other"] = other_col
    return platmap


# --- fallback if not already defined ---
if "platform_users" not in globals():
    def platform_users(df, flag_col):
        """Return respondents who visited the platform (flag_col == 1) and have nonmissing weight."""
        m = pd.to_numeric(df[flag_col], errors="coerce").eq(1)
        return df.loc[m & df["weight"].notna()].copy()


if "derive_party_family" not in globals():
    def derive_party_family(df, dem_ft_col, rep_ft_col, thresh=10):
        """If RepFT - DemFT >= thresh → Rep; <= -thresh → Dem; else Other."""
        dem = pd.to_numeric(df[dem_ft_col], errors="coerce")
        rep = pd.to_numeric(df[rep_ft_col], errors="coerce")
        diff = rep - dem
        party = pd.Series("Other", index=df.index, dtype="object")
        party.loc[diff >= thresh] = "Rep"
        party.loc[diff <= -thresh] = "Dem"
        return party


# ---------- Usage rate among ALL respondents by platform ----------
def usage_rate(df, platmap, weight_col="weight", include_other=True, other_col="visited_other"):
    pm = _with_other(platmap, df, include_other=include_other, other_col=other_col)
    rows = []
    for plat, flag in pm.items():
        visited = pd.to_numeric(df[flag], errors="coerce").eq(1)
        p, _, _, _ = wprop_ci(visited, df[weight_col])
        rows.append((plat, p))
    return pd.DataFrame(rows, columns=["platform", "share"])


u20 = usage_rate(d20, PLAT_20, weight_col="weight", include_other=True)
u24 = usage_rate(d24, PLAT_24, weight_col="weight", include_other=True)

all_plats = sorted(set(u20["platform"]).union(u24["platform"]))
u20 = u20.set_index("platform").reindex(all_plats).reset_index()
u24 = u24.set_index("platform").reindex(all_plats).reset_index()

dU = u24.merge(u20, on="platform", suffixes=("_24", "_20"))
dU["delta"] = dU["share_24"] - dU["share_20"]


# ---------- Party composition among visitors (Dem/Rep shares) ----------
def party_comp(df, platmap, demft, repft, weight_col="weight",
               include_other=True, other_col="visited_other"):
    tmp = df.copy()
    tmp["_party"] = derive_party_family(tmp, demft, repft, thresh=10)
    pm = _with_other(platmap, tmp, include_other=include_other, other_col=other_col)
    rows = []
    for plat, flag in pm.items():
        users = platform_users(tmp, flag)
        for p in ["Dem", "Rep"]:
            m = users["_party"] == p
            val, _, _, _ = wprop_ci(m, users[weight_col])
            rows.append((plat, p, val))
    return pd.DataFrame(rows, columns=["platform", "party", "share"])


pc20 = party_comp(d20, PLAT_20, DEMFT_20, REPFT_20, include_other=True)
pc24 = party_comp(d24, PLAT_24, DEMFT_24, REPFT_24, include_other=True)

# align
all_plats_pc = sorted(set(pc20["platform"]).union(pc24["platform"]))
full_mi = pd.MultiIndex.from_product([all_plats_pc, ["Dem", "Rep"]], names=["platform", "party"])

pc20 = pc20.set_index(["platform", "party"]).reindex(full_mi).reset_index()
pc24 = pc24.set_index(["platform", "party"]).reindex(full_mi).reset_index()

dPC = pc24.merge(pc20, on=["platform", "party"], suffixes=("_24", "_20"))
dPC["delta"] = dPC["share_24"] - dPC["share_20"]


# ---------- Median ideology (share Conservative) among visitors ----------
def cons_share_visitors(df, platmap, ideovar, weight_col="weight",
                        include_other=True, other_col="visited_other"):
    pm = _with_other(platmap, df, include_other=include_other, other_col=other_col)
    x = pd.to_numeric(df[ideovar], errors="coerce")
    valid = df[np.isin(x, [1, 2])].copy()
    valid["_cons"] = (x == 2)
    rows = []
    for plat, flag in pm.items():
        users = platform_users(valid, flag)
        p, _, _, _ = wprop_ci(users["_cons"].astype(bool), users[weight_col])
        rows.append((plat, p))
    return pd.DataFrame(rows, columns=["platform", "pcon"])


c20 = cons_share_visitors(d20, PLAT_20, IDEO_20, include_other=True)
c24 = cons_share_visitors(d24, PLAT_24, IDEO_24, include_other=True)

all_plats_c = sorted(set(c20["platform"]).union(c24["platform"]))
c20 = c20.set_index("platform").reindex(all_plats_c).reset_index()
c24 = c24.set_index("platform").reindex(all_plats_c).reset_index()

dC = c24.merge(c20, on="platform", suffixes=("_24", "_20"))
dC["delta"] = dC["pcon_24"] - dC["pcon_20"]


# ---------- Plot: compact change-bar panels ----------
def change_bar(df, value_col, title, fname, outdir=OUT2):
    df2 = df.sort_values("delta", key=lambda s: s.abs())
    fig, ax = plt.subplots(figsize=(9.5, 6.0), dpi=400)
    y = np.arange(len(df2))
    colors = np.where(df2["delta"] >= 0, CB[2], CB[3])
    ax.barh(y, df2["delta"], color=colors, edgecolor="black", linewidth=0.8, zorder=2)
    ax.set_yticks(y, df2["platform"], fontsize=FS_TICKS)
    ax.tick_params(axis="x", labelsize=FS_TICKS)
    ax.axvline(0, color="#888888", linewidth=0.9)
    # ax.set_xlabel("Change (2024 − 2020)", fontsize=FS_LABEL)
    ax.set_xlabel(title, fontsize=FS_LABEL)

    # ax.set_title(title, fontsize=FS_TITLE)
    ax.grid(axis="x", linestyle=":", alpha=0.35)
    ax.set_axisbelow(True)
    ax.spines[['top', 'right']].set_visible(False)
    fig.tight_layout()
    fig.savefig(os.path.join(outdir, fname), bbox_inches="tight")
    plt.close(fig)


# ---------- Generate publication-ready figures ----------
change_bar(dU, "delta", "Δ Usage rate among all respondents", "N_delta_usage.png")
change_bar(dPC[dPC["party"]=="Dem"], "delta", "Δ Democratic share among visitors", "N_delta_party_Dem.png")
change_bar(dPC[dPC["party"]=="Rep"], "delta", "Δ Republican share among visitors", "N_delta_party_Rep.png")
change_bar(dC, "delta", "Δ Share Conservative among visitors", "N_delta_conservative_share.png")


In [101]:
# === Change in "no-platform users" by age band (2024 − 2020) — No CI version ===
import numpy as np, pandas as pd, math, os, matplotlib.pyplot as plt

# ---- paths ----
PATH20 = globals().get("PATH20", "data/anes_timeseries_2020.csv")
PATH24 = globals().get("PATH24", "data/anes_timeseries_2024.csv")
OUTDIR_FIG = globals().get("OUT2", "figures_extra")
os.makedirs(OUTDIR_FIG, exist_ok=True)

# ---- variable names ----
WEIGHT_20, WEIGHT_24 = "V200010b", "V240107b"
AGE_20, AGE_24 = "V201507x", "V241458x"

PLAT_20 = globals().get("PLAT_20", {
    "Facebook":"V202541a","Twitter":"V202541b","Instagram":"V202541c",
    "Reddit":"V202541d","YouTube":"V202541e","Snapchat":"V202541f","TikTok":"V202541g"
})
PLAT_24 = globals().get("PLAT_24", {
    "Facebook":"V242577a","Twitter":"V242577b","Instagram":"V242577c",
    "Reddit":"V242577d","YouTube":"V242577e","Snapchat":"V242577f","TikTok":"V242577g"
})

# ---- style constants ----
FS_TICKS = 17
FS_LABEL = 19
FS_TITLE = 21
CB_GREEN = "#009E73"
CB_RED   = "#D55E00"

# ---- helpers ----
def recode_age_band(a):
    a = pd.to_numeric(a, errors="coerce")
    return pd.cut(a, bins=[17,29,44,64,200], labels=["18–29","30–44","45–64","65+"],
                  right=True, include_lowest=True)

def kish_neff(w):
    w = np.asarray(w, float)
    return (w.sum()**2)/np.square(w).sum() if w.size else np.nan

def wprop_se(mask, w):
    m = np.asarray(mask, bool); w = np.asarray(w, float)
    ok = np.isfinite(m) & np.isfinite(w) & (w>0)
    if not ok.any(): return (np.nan, np.nan, np.nan, np.nan)
    m, w = m[ok], w[ok]
    W = w.sum()
    p = (w[m].sum())/W if W>0 else np.nan
    neff = (W**2)/np.square(w).sum() if w.size else np.nan
    se = math.sqrt(p*(1-p)/neff) if np.isfinite(neff) and neff>0 and np.isfinite(p) else np.nan
    return p, p-1.96*se, p+1.96*se, se

def ensure_frame(year):
    """Load minimal frame with age, weights, and platform flags."""
    if year==2020:
        need_cols = [WEIGHT_20, AGE_20, *PLAT_20.values()]
        df = globals().get("d20", None)
        if df is None or any(c not in df.columns for c in [AGE_20, *PLAT_20.values(), "weight"]):
            df = pd.read_csv(PATH20, dtype=str, usecols=lambda c: c in need_cols)
            for c in need_cols:
                if c not in df.columns: df[c] = pd.NA
            for c in need_cols: df[c] = pd.to_numeric(df[c], errors="coerce")
            df = df.rename(columns={WEIGHT_20: "weight"})
            df = df[(df["weight"]>0) & np.isfinite(df["weight"])].copy()
        return df
    else:
        need_cols = [WEIGHT_24, AGE_24, *PLAT_24.values()]
        df = globals().get("d24", None)
        if df is None or any(c not in df.columns for c in [AGE_24, *PLAT_24.values(), "weight"]):
            df = pd.read_csv(PATH24, dtype=str, usecols=lambda c: c in need_cols)
            for c in need_cols:
                if c not in df.columns: df[c] = pd.NA
            for c in need_cols: df[c] = pd.to_numeric(df[c], errors="coerce")
            df = df.rename(columns={WEIGHT_24: "weight"})
            df = df[(df["weight"]>0) & np.isfinite(df["weight"])].copy()
        return df

def zero_platform_mask(df, platmap):
    flags = pd.DataFrame({p: pd.to_numeric(df[c], errors="coerce") for p,c in platmap.items()})
    valid = flags.apply(lambda s: s.isin([0,1]).all(), axis=1)
    any_use = (flags == 1).any(axis=1)
    zero = valid & (~any_use)
    return zero, valid

def share_zero_by_age(df, platmap, age_var):
    out=[]
    age_band = recode_age_band(df[age_var])
    zero, valid = zero_platform_mask(df, platmap)
    for band in ["18–29","30–44","45–64","65+"]:
        sel = (age_band==band) & valid
        p, _, _, se = wprop_se(zero[sel], df.loc[sel, "weight"])
        out.append(dict(age_band=band, share=p, se=se, n=int(sel.sum()), neff=kish_neff(df.loc[sel,"weight"])))
    return pd.DataFrame(out)

# ---- compute ----
d20_use = ensure_frame(2020)
d24_use = ensure_frame(2024)
t20 = share_zero_by_age(d20_use, PLAT_20, AGE_20)
t24 = share_zero_by_age(d24_use, PLAT_24, AGE_24)
chg = t24.merge(t20, on="age_band", suffixes=("_24","_20"))
chg["delta"] = chg["share_24"] - chg["share_20"]

# ---- PLOT (no error bars) ----
fig, ax = plt.subplots(figsize=(8.5, 5.5), dpi=400)
order = ["18–29","30–44","45–64","65+"]
plotdf = chg.set_index("age_band").loc[order].reset_index()
y = np.arange(len(plotdf))

ax.barh(
    y,
    plotdf["delta"],
    color=np.where(plotdf["delta"]>=0, CB_GREEN, CB_RED),
    edgecolor="black",
    linewidth=1.0,
    zorder=3,
)
ax.axvline(0, color="#666666", linewidth=0.9)
ax.set_yticks(y, plotdf["age_band"], fontsize=FS_TICKS)
ax.tick_params(axis="x", labelsize=FS_TICKS)
ax.set_xlabel("Δ No–social-media users by age group (2024 − 2020)", fontsize=FS_LABEL)
# ax.set_title("Δ No–social-media users by age group", fontsize=FS_TITLE, loc="left")

ax.spines[['top','right']].set_visible(False)
ax.grid(axis="x", linestyle=":", alpha=0.35)
ax.set_axisbelow(True)

fig.tight_layout()
outpath = os.path.join(OUTDIR_FIG, "O_delta_no_platform_by_age_pub.png")
fig.savefig(outpath, bbox_inches="tight")
plt.close(fig)

print("✅ Saved publication-ready figure (no CIs):", outpath)


✅ Saved publication-ready figure (no CIs): figures_extra/O_delta_no_platform_by_age_pub.png


In [126]:
# === Change in "no-platform users" by age band (2024 − 2020) — No CI version ===
import numpy as np, pandas as pd, math, os, matplotlib.pyplot as plt

# ---- paths ----
PATH20 = globals().get("PATH20", "anes_timeseries_2020_csv_20220210.csv")
PATH24 = globals().get("PATH24", "anes_timeseries_2024_csv_20250808.csv")
OUTDIR_FIG = globals().get("OUT2", "figures_extra")
os.makedirs(OUTDIR_FIG, exist_ok=True)

# ---- variable names ----
WEIGHT_20, WEIGHT_24 = "V200010b", "V240107b"
AGE_20, AGE_24 = "V201507x", "V241458x"

# Use your in-scope maps if available; otherwise define with "Other"
PLAT_20 = globals().get("PLAT_20", {
    "Facebook":"V202541a","Twitter":"V202541b","Instagram":"V202541c",
    "Reddit":"V202541d","YouTube":"V202541e","Snapchat":"V202541f","TikTok":"V202541g","Other":"V202541h"
})
PLAT_24 = globals().get("PLAT_24", {
    "Facebook":"V242577a","Twitter":"V242577b","Instagram":"V242577c",
    "Reddit":"V242577d","YouTube":"V242577e","Snapchat":"V242577f","TikTok":"V242577g","Other":"V242577h"
})

# ---- style constants ----
FS_TICKS = 17
FS_LABEL = 19
FS_TITLE = 21
CB_GREEN = "#009E73"
CB_RED   = "#D55E00"

# ---- helpers ----
def recode_age_band(a):
    a = pd.to_numeric(a, errors="coerce")
    return pd.cut(a, bins=[17,29,44,64,200], labels=["18–29","30–44","45–64","65+"],
                  right=True, include_lowest=True)

def kish_neff(w):
    w = np.asarray(w, float)
    w = w[np.isfinite(w) & (w>0)]
    return (w.sum()**2)/np.square(w).sum() if w.size else np.nan

def wprop_se(mask, w):
    m = np.asarray(mask, bool); w = np.asarray(w, float)
    ok = np.isfinite(m) & np.isfinite(w) & (w>0)
    if not ok.any(): return (np.nan, np.nan, np.nan, np.nan)
    m, w = m[ok], w[ok]
    W = w.sum()
    p = (w[m].sum())/W if W>0 else np.nan
    neff = (W**2)/np.square(w).sum() if w.size else np.nan
    se = math.sqrt(p*(1-p)/neff) if np.isfinite(neff) and neff>0 and np.isfinite(p) else np.nan
    return p, p-1.96*se, p+1.96*se, se

def ensure_frame(year):
    """Load minimal frame with age, weights, and platform flags (including 'Other' if present)."""
    if year == 2020:
        platmap = PLAT_20
        need_cols = [WEIGHT_20, AGE_20, *platmap.values()]
        df = globals().get("d20", None)
        if df is None or any(c not in df.columns for c in [AGE_20, *platmap.values(), "weight"]):
            df = pd.read_csv(PATH20, dtype=str, usecols=lambda c: c in need_cols)
            for c in need_cols:
                if c not in df.columns: df[c] = pd.NA
            for c in need_cols: df[c] = pd.to_numeric(df[c], errors="coerce")
            df = df.rename(columns={WEIGHT_20: "weight"})
            df = df[(df["weight"]>0) & np.isfinite(df["weight"])].copy()
        return df
    else:
        platmap = PLAT_24
        need_cols = [WEIGHT_24, AGE_24, *platmap.values()]
        df = globals().get("d24", None)
        if df is None or any(c not in df.columns for c in [AGE_24, *platmap.values(), "weight"]):
            df = pd.read_csv(PATH24, dtype=str, usecols=lambda c: c in need_cols)
            for c in need_cols:
                if c not in df.columns: df[c] = pd.NA
            for c in need_cols: df[c] = pd.to_numeric(df[c], errors="coerce")
            df = df.rename(columns={WEIGHT_24: "weight"})
            df = df[(df["weight"]>0) & np.isfinite(df["weight"])].copy()
        return df

def zero_platform_mask(df, platmap):
    # Expect platform flags coded 0=not mentioned, 1=mentioned; treat others as invalid
    flags = pd.DataFrame({p: pd.to_numeric(df[c], errors="coerce") for p,c in platmap.items()})
    valid = flags.apply(lambda row: row.isin([0,1]).all(), axis=1)
    any_use = (flags == 1).any(axis=1)
    zero = valid & (~any_use)
    return zero, valid

def share_zero_by_age(df, platmap, age_var):
    out=[]
    age_band = recode_age_band(df[age_var])
    zero, valid = zero_platform_mask(df, platmap)
    for band in ["18–29","30–44","45–64","65+"]:
        sel = (age_band==band) & valid
        p, _, _, se = wprop_se(zero[sel], df.loc[sel, "weight"])
        out.append(dict(age_band=band, share=float(p) if np.isfinite(p) else np.nan,
                        se=float(se) if np.isfinite(se) else np.nan,
                        n=int(sel.sum()), neff=float(kish_neff(df.loc[sel,"weight"]))))
    return pd.DataFrame(out)

# ---- compute ----
d20_use = ensure_frame(2020)
d24_use = ensure_frame(2024)
t20 = share_zero_by_age(d20_use, PLAT_20, AGE_20)
t24 = share_zero_by_age(d24_use, PLAT_24, AGE_24)
chg = t24.merge(t20, on="age_band", suffixes=("_24","_20"))
chg["delta"] = chg["share_24"] - chg["share_20"]

# ---- PLOT (no error bars) ----
fig, ax = plt.subplots(figsize=(8.5, 5.5), dpi=400)
order = ["18–29","30–44","45–64","65+"]
plotdf = chg.set_index("age_band").loc[order].reset_index()
y = np.arange(len(plotdf))

ax.barh(
    y,
    plotdf["delta"],
    color=np.where(plotdf["delta"]>=0, CB_GREEN, CB_RED),
    edgecolor="black",
    linewidth=1.0,
    zorder=3,
)
ax.axvline(0, color="#666666", linewidth=0.9)
ax.set_yticks(y, plotdf["age_band"], fontsize=FS_TICKS)
ax.tick_params(axis="x", labelsize=FS_TICKS)
ax.set_xlabel("Δ No–social-media users by age group (2024 − 2020)", fontsize=FS_LABEL)

ax.spines[['top','right']].set_visible(False)
ax.grid(axis="x", linestyle=":", alpha=0.35)
ax.set_axisbelow(True)

fig.tight_layout()
outpath = os.path.join(OUTDIR_FIG, "O_delta_no_platform_by_age_pub.png")
fig.savefig(outpath, bbox_inches="tight")
plt.close(fig)

print("✅ Saved publication-ready figure (no CIs):", outpath)


✅ Saved publication-ready figure (no CIs): figures_extra/O_delta_no_platform_by_age_pub.png


In [None]:
# === Heatmap: Presidential vote lean among visitors (Dem − Rep), 2020 vs 2024 ===
# Uses presidential vote (V202073 / V242067) → vote_party (Republican/Democrat/Other/N/A).
# Falls back to recoding if vote_party isn't already in d20/d24.

import numpy as np, pandas as pd, matplotlib.pyplot as plt, os, math

OUTDIR = "figures_extra"; os.makedirs(OUTDIR, exist_ok=True)

# ---- vote-party helpers (safe if not already defined earlier) ----
VOTE_20 = "V202073"
VOTE_24 = "V242067"

def _recode_pres_vote_2020(series):
    v = pd.to_numeric(series, errors="coerce")
    out = pd.Series("N/A", index=v.index, dtype="object")
    out[v == 1] = "Democrat"                  # Joe Biden
    out[v == 2] = "Republican"                # Donald Trump
    out[v.isin([3,4,5,8])] = "Other"          # Jorgensen/Hawkins/Other/Libertarian
    out[v == 7] = "Republican"                # Specified as GOP candidate
    out[v.isin([-9,-8,-7,-6,-1,11,12])] = "N/A"
    return pd.Categorical(out, categories=["Republican","Democrat","Other","N/A"], ordered=False)

def _recode_pres_vote_2024(series):
    v = pd.to_numeric(series, errors="coerce")
    out = pd.Series("N/A", index=v.index, dtype="object")
    out[v == 1] = "Democrat"                  # Kamala Harris
    out[v == 2] = "Republican"                # Donald Trump
    out[v.isin([4,5,6])] = "Other"            # West, Stein, Another
    out[v.isin([9,-8,-7,-6,-1])] = "N/A"
    return pd.Categorical(out, categories=["Republican","Democrat","Other","N/A"], ordered=False)

def ensure_vote_party(df, year):
    if "vote_party" in df.columns: return df
    df = df.copy()
    if year == 2020 and VOTE_20 in df.columns:
        df["vote_party"] = _recode_pres_vote_2020(df[VOTE_20])
    elif year == 2024 and VOTE_24 in df.columns:
        df["vote_party"] = _recode_pres_vote_2024(df[VOTE_24])
    return df

# Weighted proportion with CI (fallback if not defined elsewhere)
def wprop_ci(mask, w):
    m = np.asarray(mask, bool); w = np.asarray(w, float)
    ok = np.isfinite(m) & np.isfinite(w) & (w>0)
    if not ok.any(): return (np.nan,)*4
    m, w = m[ok], w[ok]
    W = w.sum(); p = (w[m].sum())/W if W>0 else np.nan
    neff = (W**2)/np.square(w).sum() if w.size else np.nan
    se = math.sqrt(p*(1-p)/neff) if np.isfinite(neff) and neff>0 else np.nan
    return p, p-1.96*se, p+1.96*se, se

# ---- Compute Dem − Rep among platform visitors (by actual vote) ----
def lean_among_visitors(df, platmap, year):
    df = ensure_vote_party(df, year)
    rows = []
    for plat in ["Facebook","YouTube","Instagram","Twitter","TikTok","Snapchat","Reddit","Other"]:
        if plat not in platmap: 
            continue
        flag = platmap[plat]
        if flag not in df.columns: 
            continue
        visited = pd.to_numeric(df[flag], errors="coerce").eq(1)
        users = df.loc[visited & df["weight"].notna()].copy()
        if users.empty:
            continue
        p_dem, *_ = wprop_ci(users["vote_party"].eq("Democrat"), users["weight"])
        p_rep, *_ = wprop_ci(users["vote_party"].eq("Republican"), users["weight"])
        rows.append((plat, p_dem - p_rep))
    return pd.DataFrame(rows, columns=["platform","lean"])

lean20 = lean_among_visitors(d20, PLAT_20, 2020).assign(year=2020)
lean24 = lean_among_visitors(d24, PLAT_24, 2024).assign(year=2024)
LEAN = pd.concat([lean20, lean24], ignore_index=True)

platforms = [p for p in ["Facebook","YouTube","Instagram","Twitter","TikTok","Snapchat","Reddit","Other"]
             if p in set(LEAN["platform"])]
years = [2020, 2024]

# matrix rows=platforms, cols=years
M = np.full((len(platforms), len(years)), np.nan)
for i, plat in enumerate(platforms):
    for j, yr in enumerate(years):
        r = LEAN[(LEAN["platform"]==plat) & (LEAN["year"]==yr)]
        if not r.empty:
            M[i, j] = float(r.iloc[0]["lean"])

# ---- Plot: diverging blue↔red (positive = more Democratic, negative = more Republican) ----
fig, ax = plt.subplots(figsize=(5.4, 6.2), dpi=350)

vmax = np.nanmax(np.abs(M))
im = ax.imshow(M, aspect="auto", cmap="RdBu", vmin=-vmax, vmax=vmax)  # RdBu: low=red, high=blue

ax.set_yticks(np.arange(len(platforms)), labels=platforms)
ax.set_xticks(np.arange(len(years)), labels=[str(y) for y in years])
ax.set_xlabel("Year")
ax.set_title("Presidential vote lean among visitors (Dem − Rep)", loc="left")

cbar = plt.colorbar(im, ax=ax)
cbar.set_label("Dem − Rep (share among visitors)")

# annotate signed percentage points
for i in range(M.shape[0]):
    for j in range(M.shape[1]):
        val = M[i, j]
        if np.isfinite(val):
            ax.text(j, i, f"{val*100:+.0f}pp",
                    ha="center", va="center", fontsize=9,
                    bbox=dict(boxstyle="round,pad=0.15", fc="white", ec="none", alpha=0.6))

ax.spines[['top','right']].set_visible(False)
fig.tight_layout()
outpath = os.path.join(OUTDIR, "Q_vote_lean_heatmap_diverging.png")
fig.savefig(outpath, bbox_inches="tight")
print("Saved:", outpath)


In [193]:
# --- ULTRA-COMPACT: platform-grouped bars with minimal whitespace ---
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import os

OUTDIR = "figures_extra"; os.makedirs(OUTDIR, exist_ok=True)

platforms      = ["Facebook", "Twitter"]
metrics        = ["member", "visit", "post"]
metric_labels  = {"member":"User", "visit":"Visit", "post":"Post"}
colors         = {"member":"#1f77b4", "visit":"#2ca02c", "post":"#ff7f0e"}  # blue/green/orange
pair_sequence  = [("member", 2020), ("member", 2024),
                  ("visit",  2020), ("visit",  2024),
                  ("post",   2020), ("post",   2024)]

# Compact geometry
bar_w     = 0.22        # bar width
pair_gap  = 0.06        # space between 2020 and 2024 bars in a pair
trio_gap  = 0.28        # space between User/Visit/Post pairs within platform
group_gap = 0.55        # space between platforms (smaller => tighter)

bars = []               # dicts with x, platform, metric, year, value
major_ticks, minor_ticks = [], []

x = 0.0
for plat in platforms:
    pair_centers = []
    for k, (m, yr) in enumerate(pair_sequence):
        bars.append(dict(x=x, plat=plat, m=m, yr=yr))
        if k % 2 == 1:  # after each pair, compute center
            x_prev = x - (bar_w + pair_gap)
            pair_centers.append((x_prev + x) / 2)
            x += trio_gap
        else:
            x += bar_w + pair_gap

    major_ticks.append((np.mean(pair_centers), plat))
    # for c, lbl in zip(pair_centers, ["User","Visit","Post"]):
    #     minor_ticks.append((c, lbl))

    x += group_gap  # move to next platform

# Pull values from RES
def _val(p, y, m):
    row = RES[(RES["platform"]==p) & (RES["year"]==y)]
    return float(row.iloc[0][m]) if (not row.empty and np.isfinite(row.iloc[0][m])) else np.nan

vals = []
for b in bars:
    b["val"] = _val(b["plat"], b["yr"], b["m"])
    if np.isfinite(b["val"]): vals.append(b["val"])

# Tight y-limits with small pad
if vals:
    vmax = max(abs(min(vals)), abs(max(vals)))
else:
    vmax = 0.3
pad = 0.02
ylim = min(0.5, max(0.25, vmax + pad))

# ---- PLOT ----
fig, ax = plt.subplots(figsize=(7.2, 3.8), dpi=400)  # smaller & wider than tall

# Draw bars
for b in bars:
    v = b["val"]
    if not np.isfinite(v): 
        continue
    ax.bar(b["x"], v, width=bar_w,
           color=colors[b["m"]],
           edgecolor="black", lw=0.7,
           hatch=("//" if b["yr"]==2024 else ""),
           zorder=3)

# Small numeric annotations
for b in bars:
    v = b["val"]
    if not np.isfinite(v): continue
    ax.text(b["x"], v + (0.010 if v>=0 else -0.010),
            f"{v*100:+.0f}pp",
            ha="center", va=("bottom" if v>=0 else "top"),
            fontsize=7.8, color="#333")

# Axes & styling
ax.axhline(0, color="#666", lw=0.8)
ax.set_ylim(-0.1, ylim)
ax.yaxis.set_major_formatter(lambda v,_: f"{int(v*100)}pp")
ax.set_ylabel("Democratic − Republican lean (pp)", fontsize=10)
ax.set_title("Partisanship of Users, Visits, and Posts (Facebook & Twitter/X)", loc="left", fontsize=12.5)
ax.grid(axis="y", linestyle=":", alpha=0.28)
ax.spines[['top','right']].set_visible(False)

# Major ticks = platform labels
ax.set_xticks([p for p,_ in major_ticks], [lab for _,lab in major_ticks], fontsize=10.5)

# Minor ticks = "User / Visit / Post" under pairs
ax.set_xticks([p for p,_ in minor_ticks], [lab for _,lab in minor_ticks], minor=True)
ax.tick_params(axis="x", which="minor", length=0, pad=2, labelsize=9.5)

# Legends (compact, inside)
from matplotlib.patches import Patch
metric_handles = [Patch(facecolor=colors[m], edgecolor="black", label=metric_labels[m]) for m in metrics]
year_handles   = [Patch(facecolor="white", edgecolor="black", hatch=h, label=str(y)) for y,h in [(2020,""), (2024,"//")]]
leg1 = ax.legend(handles=metric_handles, title="Measure", loc="upper left", frameon=False, fontsize=9, title_fontsize=9.5)
ax.add_artist(leg1)
# ax.legend(handles=year_handles, title="Year", loc="upper left", frameon=False, fontsize=9, title_fontsize=9.5) 

ax.legend(
    handles=year_handles,
    title="Year",
    loc="upper left",               # anchor position (relative to bbox)
    bbox_to_anchor=(0.15, 1),      # move right (+x) or up (+y)
    frameon=False,
    fontsize=9,
    title_fontsize=9.5
)

# Remove side whitespace: set x-limits to bars span with tiny pad
xs = [b["x"] for b in bars]
xmin = min(xs) - bar_w*0.8
xmax = max(xs) + bar_w*0.8
ax.set_xlim(xmin, xmax)
ax.margins(x=0)  # no extra automatic padding

# Note kept tiny & tucked in
ax.text(0.012, 0.03, "Positive = more Democratic", transform=ax.transAxes, fontsize=8.5, color="#666")

fig.tight_layout(pad=0.4)
outpath = os.path.join(OUTDIR, "R_member_visit_post_partisanship_FB_TW.png")
fig.savefig(outpath, bbox_inches="tight")
plt.close(fig)
print("✅ Saved:", outpath)


✅ Saved: figures_extra/R_member_visit_post_partisanship_FB_TW.png


In [None]:
# ==== Posting frequency (higher = more frequent) vs affective polarization (|RepFT - DemFT|) ====
# Prereqs from your loader: d20, d24, PLAT_20, PLAT_24,
# DEMFT_20/24, REPFT_20/24, POST_FB_20/24, POST_TW_20/24, and (optionally) POST_RD_24="V202547" for 2024.

import numpy as np, pandas as pd, matplotlib.pyplot as plt, os, math

OUT = "figures_extra"; os.makedirs(OUT, exist_ok=True)

# ----------------- weighted helpers -----------------
def kish_neff(w):
    w = np.asarray(w, float); w = w[np.isfinite(w) & (w>0)]
    return (w.sum()**2)/np.square(w).sum() if w.size else np.nan

def wmean(x, w):
    x = np.asarray(x, float); w = np.asarray(w, float)
    m = np.isfinite(x) & np.isfinite(w) & (w>0)
    return np.sum(w[m]*x[m]) / np.sum(w[m]) if m.any() else np.nan

def wvar(x, w):
    x = np.asarray(x, float); w = np.asarray(w, float)
    m = np.isfinite(x) & np.isfinite(w) & (w>0)
    if not m.any(): return np.nan
    mu = np.sum(w[m]*x[m]) / np.sum(w[m])
    return np.sum(w[m]*(x[m]-mu)**2) / np.sum(w[m])

def wse_mean(x, w):
    ne = kish_neff(w)
    if not np.isfinite(ne) or ne <= 0: return np.nan
    v = wvar(x, w)
    return math.sqrt(v / ne) if np.isfinite(v) else np.nan

# ----------------- core computation -----------------
def post_vs_affpol_quartiles(df, demft, repft, visit_flag, post_var, reverse=True, verbose=False):
    """Return DataFrame with quartile mean (reversed so higher=more) and 95% CI."""
    if visit_flag not in df.columns or post_var not in df.columns:
        if verbose: print(f"⚠️ Missing columns: {visit_flag} or {post_var}")
        return None

    visit = pd.to_numeric(df[visit_flag], errors="coerce").eq(1)
    dft   = pd.to_numeric(df[demft], errors="coerce")
    rft   = pd.to_numeric(df[repft], errors="coerce")
    post  = pd.to_numeric(df[post_var], errors="coerce")  # 1=Always … 5=Never
    w     = pd.to_numeric(df["weight"], errors="coerce")

    # sample: visitors with valid FTs, weights, and valid posting code 1..5
    m = visit & np.isfinite(dft) & np.isfinite(rft) & post.isin([1,2,3,4,5]) & np.isfinite(w) & (w>0)
    if not m.any():
        if verbose: print(f"⚠️ No valid sample for {visit_flag}/{post_var}")
        return None

    pol  = (rft[m] - dft[m]).abs().astype(float)
    yraw = post[m].astype(float)
    # reverse so higher = more frequent posting (match earlier pipeline)
    y = (6 - yraw) if reverse else yraw
    wt = w[m].astype(float)

    # weighted quartiles of polarization
    def weighted_quartiles(x, w):
        x = np.asarray(x, float); w = np.asarray(w, float)
        ord_idx = np.argsort(x); x = x[ord_idx]; w = w[ord_idx]
        cw = np.cumsum(w) / w.sum()
        qs = []
        for q in (0.25, 0.50, 0.75):
            i = np.searchsorted(cw, q, side="left")
            i = min(i, len(x)-1)
            qs.append(x[i])
        return qs

    try:
        q1,q2,q3 = weighted_quartiles(pol, wt)
    except Exception:
        q1,q2,q3 = np.quantile(pol, [0.25,0.5,0.75])

    bins = [-np.inf, q1, q2, q3, np.inf]
    labels = ["Q1 (low)","Q2","Q3","Q4 (high)"]
    qcat = pd.cut(pol, bins=bins, labels=labels, include_lowest=True, right=True)

    rows = []
    for lab in labels:
        s = (qcat == lab)
        if not s.any():
            rows.append((lab, np.nan, np.nan))
            continue
        mu = wmean(y[s], wt[s])
        se = wse_mean(y[s], wt[s])
        moe = 1.96*se if np.isfinite(se) else np.nan
        rows.append((lab, mu, moe))
    out = pd.DataFrame(rows, columns=["quartile","mean_rev","moe"])  # mean_rev: higher=more posting
    out["quartile"] = pd.Categorical(out["quartile"], categories=labels, ordered=True)
    return out.sort_values("quartile").reset_index(drop=True)

# ----------------- plotting -----------------
def plot_platform(series20, series24, platform_label, outfile):
    fig, ax = plt.subplots(figsize=(5.6, 3.9), dpi=350)
    x = np.arange(4)
    def draw(line, color, label):
        if line is None or line["mean_rev"].isna().all(): return
        y  = line["mean_rev"].to_numpy(float)
        mo = line["moe"].to_numpy(float)
        ax.plot(x, y, marker="o", lw=2.0, label=label, color=color, zorder=3)
        if np.isfinite(mo).any():
            ax.fill_between(x, y-mo, y+mo, color=color, alpha=0.18, linewidth=0)

    draw(series20, "#1f77b4", "2020")
    draw(series24, "#d62728", "2024")

    ax.set_xticks(x, ["Q1 (low)","Q2","Q3","Q4 (high)"])
    ax.set_ylabel("Mean posting frequency (reversed: higher = more often)")
    ax.set_xlabel("Affective polarization |Rep FT − Dem FT| (quartiles)")
    ax.set_title(f"{platform_label}: Political posting vs. affective polarization")
    ax.grid(axis="y", linestyle=":", alpha=0.35)
    ax.spines[['top','right']].set_visible(False)
    ax.legend(frameon=False, loc="upper left")
    fig.tight_layout()
    path = os.path.join(OUT, outfile)
    fig.savefig(path, bbox_inches="tight")
    plt.close(fig)
    print("Saved:", path)

# ----------------- wire variables -----------------
DEMFT_20, REPFT_20 = "V201156", "V201157"
DEMFT_24, REPFT_24 = "V241166", "V241167"

POST_FB_20, POST_FB_24 = "V202543", "V242579"
POST_TW_20, POST_TW_24 = "V202545", "V242581"
POST_RD_24             = "V202546"   # Reddit (2024 only)

FB20, FB24 = PLAT_20["Facebook"], PLAT_24["Facebook"]
TW20, TW24 = PLAT_20["Twitter"],  PLAT_24["Twitter"]
RD24 = PLAT_24.get("Reddit", None)

# ----------------- compute & plot -----------------
fb20 = post_vs_affpol_quartiles(d20, DEMFT_20, REPFT_20, FB20, POST_FB_20, reverse=True, verbose=True)
fb24 = post_vs_affpol_quartiles(d24, DEMFT_24, REPFT_24, FB24, POST_FB_24, reverse=True, verbose=True)
plot_platform(fb20, fb24, "Facebook", "S_postfreq_vs_affpol_Facebook_rev.png")

tw20 = post_vs_affpol_quartiles(d20, DEMFT_20, REPFT_20, TW20, POST_TW_20, reverse=True, verbose=True)
tw24 = post_vs_affpol_quartiles(d24, DEMFT_24, REPFT_24, TW24, POST_TW_24, reverse=True, verbose=True)
plot_platform(tw20, tw24, "Twitter/X", "S_postfreq_vs_affpol_Twitter_rev.png")

# # Reddit: 2020 only 
# We skip this since data is not available for 2024
# if RD24 is not None and POST_RD_24 in d24.columns:
#     rd24 = post_vs_affpol_quartiles(d24, DEMFT_24, REPFT_24, RD24, POST_RD_24, reverse=True, verbose=True)
#     plot_platform(None, rd24, "Reddit", "S_postfreq_vs_affpol_Reddit_rev.png")
# else:
#     print("⚠️ Reddit posting variable (V202547) or Reddit flag missing; skipping Reddit plot.")


Saved: figures_extra/S_postfreq_vs_affpol_Facebook_rev.png
Saved: figures_extra/S_postfreq_vs_affpol_Twitter_rev.png


In [192]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

OUT = "figures_extra"; os.makedirs(OUT, exist_ok=True)

# ---- variable names already loaded earlier ----
# Feeling thermometers
DEMFT_20, REPFT_20 = "V201156", "V201157"
DEMFT_24, REPFT_24 = "V241166", "V241167"

# Facebook use & post (ANES coding: 1=most frequent; higher number=less)
FREQ_FB_20, POST_FB_20 = "V202542", "V202543"
FREQ_FB_24, POST_FB_24 = "V242578", "V242579"

# Twitter/X use & post
FREQ_TW_20, POST_TW_20 = "V202544", "V202545"
FREQ_TW_24, POST_TW_24 = "V242580", "V242581"

# ---------- helpers (mirror the old script’s behavior) ----------
def _safe_num(s):
    return pd.to_numeric(s, errors="coerce")

def set_missing(series: pd.Series) -> pd.Series:
    """ANES-style cleanup: negatives and {998,999} -> NaN."""
    s = _safe_num(series)
    s = s.where(~(s < 0), np.nan)
    s = s.where(~s.isin([998, 999]), np.nan)
    return s

def reverse_1_to_k(series, k):
    """Reverse a 1..k scale so higher = more frequent."""
    x = set_missing(series)
    return (k + 1) - x

def wmean(x, w):
    x = np.asarray(x, float); w = np.asarray(w, float)
    m = np.isfinite(x) & np.isfinite(w) & (w > 0)
    return np.sum(w[m]*x[m]) / np.sum(w[m]) if m.any() else np.nan

def wbootstrap_ci_mean(x, w, B=2000, alpha=0.05, random_state=0):
    """
    Weighted bootstrap for the mean:
    sample indices with replacement using normalized weights; take the
    unweighted mean of each bootstrap sample. Returns (mean, (lo, hi)).
    """
    rng = np.random.default_rng(random_state)
    x = np.asarray(x, float); w = np.asarray(w, float)
    m = np.isfinite(x) & np.isfinite(w) & (w > 0)
    if not m.any():
        return (np.nan, (np.nan, np.nan))
    x, w = x[m], w[m]
    p = w / w.sum()
    n = x.size
    # point estimate (weighted)
    mu = np.sum(p * x)
    # bootstrap
    draws = rng.choice(n, size=(B, n), replace=True, p=p)
    boot = x[draws].mean(axis=1)
    lo, hi = np.quantile(boot, [alpha/2, 1 - alpha/2])
    return (float(mu), (float(lo), float(hi)))

def prep_year(df, demft, repft, use_var, post_var):
    """Return tidy with x (Rep−Dem FT), use/post (reversed), weight."""
    need = {demft, repft, use_var, post_var, "weight"}
    if not need.issubset(df.columns):
        return pd.DataFrame(columns=["x","use","post","weight"])
    # x: Rep - Dem thermometer (clean, clipped like old code)
    dem = set_missing(df[demft])
    rep = set_missing(df[repft])
    x = (rep - dem).clip(lower=-100, upper=100)

    # y’s: reversed to "higher = more frequent"
    y_use  = reverse_1_to_k(df[use_var], 7)   # 1..7
    y_post = reverse_1_to_k(df[post_var], 5)  # 1..5

    w = _safe_num(df["weight"])
    out = pd.DataFrame({"x": x, "use": y_use, "post": y_post, "weight": w})
    # keep rows with valid x, w (y handled later per outcome)
    return out.dropna(subset=["x", "weight"])

def summarize_by_quantile_bins(tidy, ycol, n_bins=10):
    """
    Per-year quantile bins (like your old helper).
    For each bin: weighted mean + weighted bootstrap CI.
    """
    if tidy.empty:
        return pd.DataFrame(columns=["x_mid","m","lo","hi"])
    tmp = tidy[tidy[ycol].notna()].copy()
    if tmp.empty:
        return pd.DataFrame(columns=["x_mid","m","lo","hi"])

    # robust quantiles; allow duplicates to drop
    q = np.linspace(0, 1, n_bins+1)
    cuts = np.quantile(tmp["x"], q)
    cuts = np.unique(cuts)
    if cuts.size < 3:
        # fallback equal-width
        cuts = np.linspace(tmp["x"].min(), tmp["x"].max(), min(n_bins, max(2, tmp["x"].nunique()-1)) + 1)

    cats = pd.cut(tmp["x"], bins=cuts, include_lowest=True, duplicates="drop")

    rows = []
    for iv, g in tmp.groupby(cats, dropna=True):
        if g.empty or iv is pd.NA:
            continue
        xmid = float(np.mean([iv.left, iv.right]))
        mu, (lo, hi) = wbootstrap_ci_mean(g[ycol].values, g["weight"].values, B=2000)
        rows.append((xmid, mu, lo, hi))
    res = pd.DataFrame(rows, columns=["x_mid","m","lo","hi"]).sort_values("x_mid")
    return res

def plot_two_years_line_ci(res20, res24, title, ylabel, outfile):
    fig, ax = plt.subplots(figsize=(6.8, 4.6), dpi=300)
    # 2020 (blue)
    if not res20.empty:
        ax.plot(res20["x_mid"], res20["m"], marker="o", lw=1.8, label="2020")
        ax.fill_between(res20["x_mid"], res20["lo"], res20["hi"], alpha=0.18, linewidth=0)
    # 2024 (red)
    if not res24.empty:
        ax.plot(res24["x_mid"], res24["m"], marker="o", lw=1.8, label="2024", color="#d62728")
        ax.fill_between(res24["x_mid"], res24["lo"], res24["hi"], alpha=0.18, linewidth=0, color="#d62728")

    ax.axvline(0, color="0.35", linestyle=":", linewidth=1)
    ax.set_xlim(-100, 100)
    ax.set_xlabel("Rep − Dem thermometer (−100=max Dem, +100=max Rep)")
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.grid(axis="y", linestyle=":", alpha=0.45)
    ax.spines[['top','right']].set_visible(False)
    ax.legend()
    fig.tight_layout()
    fig.savefig(os.path.join(OUT, outfile), bbox_inches="tight")
    plt.close(fig)
    print("Saved:", os.path.join(OUT, outfile))

# ---- prepare per-year tidy frames (use whatever d20/d24 currently hold) ----
fb20 = prep_year(d20, DEMFT_20, REPFT_20, FREQ_FB_20, POST_FB_20)
fb24 = prep_year(d24, DEMFT_24, REPFT_24, FREQ_FB_24, POST_FB_24)
tw20 = prep_year(d20, DEMFT_20, REPFT_20, FREQ_TW_20, POST_TW_20)
tw24 = prep_year(d24, DEMFT_24, REPFT_24, FREQ_TW_24, POST_TW_24)

# ---- summarize with QUANTILE bins (per year) ----
fb20_use  = summarize_by_quantile_bins(fb20, "use",  n_bins=10)
fb24_use  = summarize_by_quantile_bins(fb24, "use",  n_bins=10)
fb20_post = summarize_by_quantile_bins(fb20, "post", n_bins=10)
fb24_post = summarize_by_quantile_bins(fb24, "post", n_bins=10)

tw20_use  = summarize_by_quantile_bins(tw20, "use",  n_bins=10)
tw24_use  = summarize_by_quantile_bins(tw24, "use",  n_bins=10)
tw20_post = summarize_by_quantile_bins(tw20, "post", n_bins=10)
tw24_post = summarize_by_quantile_bins(tw24, "post", n_bins=10)

# ---- plot the four requested panels ----
plot_two_years_line_ci(fb20_use,  fb24_use,
                       "Facebook use vs. party thermometer difference",
                       "Weighted mean frequency (higher = more often)",
                       "fb_use_vs_thermo_ci.png")

plot_two_years_line_ci(fb20_post, fb24_post,
                       "Facebook posting vs. party thermometer difference",
                       "Weighted mean political posting (higher = more often)",
                       "fb_post_vs_thermo_ci.png")

plot_two_years_line_ci(tw20_use,  tw24_use,
                       "Twitter/X use vs. party thermometer difference",
                       "Weighted mean frequency (higher = more often)",
                       "tw_use_vs_thermo_ci.png")

plot_two_years_line_ci(tw20_post, tw24_post,
                       "Twitter/X posting vs. party thermometer difference",
                       "Weighted mean political posting (higher = more often)",
                       "tw_post_vs_thermo_ci.png")

  for iv, g in tmp.groupby(cats, dropna=True):


Saved: figures_extra/fb_use_vs_thermo_ci.png
Saved: figures_extra/fb_post_vs_thermo_ci.png
Saved: figures_extra/tw_use_vs_thermo_ci.png
Saved: figures_extra/tw_post_vs_thermo_ci.png
