Based on https://bambinos.github.io/bambi/notebooks/mister_p.html

In [None]:
import warnings

import arviz as az
import bambi as bmb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc as pm
from data_utils import download_single_csv
from pathlib import Path

warnings.simplefilter(action="ignore", category=FutureWarning)

In [None]:
# Toy cohort of 20 individuals: a binary risk stratum, a binary treatment
# indicator and a binary outcome for each named person.
names = [
    "Rheia", "Kronos", "Demeter", "Hades", "Hestia",
    "Poseidon", "Hera", "Zeus", "Artemis", "Apollo",
    "Leto", "Ares", "Athena", "Hephaestus", "Aphrodite",
    "Cyclope", "Persephone", "Hermes", "Hebe", "Dionysus",
]
risk_strata = [0] * 8 + [1] * 12
treatment = [0] * 4 + [1] * 4 + [0] * 3 + [1] * 9
outcome = [0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0]

df = pd.DataFrame(
    {
        "name": names,
        "Risk_Strata": risk_strata,
        "Treatment": treatment,
        "Outcome": outcome,
    }
)

# Explicit interaction column so the stratified regression can use it as a plain term.
df["Treatment_x_Risk_Strata"] = df["Treatment"] * df["Risk_Strata"]

df

In [None]:
# Naive comparison: mean outcome within each treatment arm, ignoring risk strata.
outcome_by_treatment = df.groupby("Treatment")[["Outcome"]].mean()
simple_average = outcome_by_treatment.rename(columns={"Outcome": "Share"})
simple_average

In [None]:
# Rows are ordered by the Treatment group key, so position 0 is untreated and 1 is treated.
untreated_share = simple_average.iloc[0]["Share"]
treated_share = simple_average.iloc[1]["Share"]
causal_risk_ratio = treated_share / untreated_share
print("Causal Risk Ratio:", causal_risk_ratio)

In [None]:
# Number of individuals in each risk stratum and the share of the cohort it represents.
strata_counts = df.groupby("Risk_Strata")[["Treatment"]].count()
strata_counts.assign(proportion=strata_counts["Treatment"] / len(df))

In [None]:
# Mean outcome for every (risk stratum, treatment) cell ...
cell_means = df.groupby(["Risk_Strata", "Treatment"])[["Outcome"]].mean().reset_index()

# ... reshaped so rows are treatment arms and columns are risk strata.
outcomes_controlled = cell_means.pivot(
    index="Treatment", columns=["Risk_Strata"], values="Outcome"
)

outcomes_controlled

In [None]:
# Share of the cohort in each risk stratum; these are the standardisation weights.
low_risk_share = (df["Risk_Strata"] == 0).mean()
high_risk_share = (df["Risk_Strata"] == 1).mean()

# The `formula` column is only a human-readable reminder of the arithmetic.
weighted_avg = outcomes_controlled.assign(formula="0.4*0.25 + 0.6*0.66")
weighted_avg["weighted_average"] = (
    weighted_avg[0] * low_risk_share + weighted_avg[1] * high_risk_share
)

weighted_avg

In [None]:
# Same ratio as before, but using the stratum-weighted averages (row 0 = untreated, row 1 = treated).
adjusted_untreated = weighted_avg.iloc[0]["weighted_average"]
adjusted_treated = weighted_avg.iloc[1]["weighted_average"]
causal_risk_ratio = adjusted_treated / adjusted_untreated

print("Causal Risk Ratio:", causal_risk_ratio)

In [None]:
# Naive regression: outcome on treatment only.
naive_formula = "Outcome ~ 1 + Treatment"
reg = bmb.Model(naive_formula, df)
results = reg.fit()

# Stratified regression: adds the risk stratum and its interaction with treatment.
stratified_formula = "Outcome ~ 1 + Treatment + Risk_Strata + Treatment_x_Risk_Strata"
reg_strata = bmb.Model(stratified_formula, df)
results_strata = reg_strata.fit()

In [None]:
# Posterior summary table for the naive (treatment-only) regression.
az.summary(results)

In [None]:
# Posterior summary table for the regression that also includes risk stratum and interaction.
az.summary(results_strata)

In [None]:
# Overlay the posterior of the Treatment coefficient from both models as ridgelines.
ridge_options = dict(
    kind="ridgeplot",
    ridgeplot_alpha=0.4,
    combined=True,
    figsize=(10, 6),
)
ax = az.plot_forest(
    [results, results_strata],
    model_names=["naive_model", "stratified_model"],
    var_names=["Treatment"],
    **ridge_options,
)
treatment_panel = ax[0]
# Dashed reference line at zero, i.e. "no treatment effect".
treatment_panel.axvline(0, color="black", linestyle="--")
treatment_panel.set_title("Treatment Effects under Stratification/Non-stratification");

In [None]:
# Counterfactual frame: every individual keeps their own risk stratum but is
# set to treated. The interaction column has to follow the same definition used
# for the training data (Treatment * Risk_Strata), so it is 0 for low-risk
# individuals and 1 for high-risk ones rather than a constant 1.
new_df = df[["Risk_Strata"]].assign(
    Treatment=1,
    Treatment_x_Risk_Strata=lambda d: d["Treatment"] * d["Risk_Strata"],
)
# kind="response" requests posterior-predictive draws; it is the same spelling
# the later predict calls in this file use (the older name was "pps").
new_preds = reg_strata.predict(
    results_strata, kind="response", data=new_df, inplace=False
)
print("Expected Outcome in the Treated")
new_preds["posterior_predictive"]["Outcome"].mean().item()

In [None]:
# Counterfactual frame: every individual keeps their own risk stratum but is
# set to untreated, which makes the Treatment * Risk_Strata interaction 0 everywhere.
new_df = df[["Risk_Strata"]].assign(
    Treatment=0,
    Treatment_x_Risk_Strata=lambda d: d["Treatment"] * d["Risk_Strata"],
)
# kind="response" requests posterior-predictive draws; it is the same spelling
# the later predict calls in this file use (the older name was "pps").
new_preds = reg_strata.predict(
    results_strata, kind="response", data=new_df, inplace=False
)
print("Expected Outcome in the Untreated")

new_preds["posterior_predictive"]["Outcome"].mean().item()

In [None]:
# Side-by-side predicted outcome by treatment: naive model on the left, stratified on the right.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(20, 6))
axs = axs.flatten()

panel_specs = [
    (reg, results, "Non Stratified Regression \n Model Predictions"),
    (reg_strata, results_strata, "Stratified Regression \n Model Predictions"),
]
for panel_ax, (panel_model, panel_idata, panel_title) in zip(axs, panel_specs):
    bmb.interpret.plot_predictions(
        panel_model, panel_idata, conditional=["Treatment"], ax=panel_ax
    )
    panel_ax.set_title(panel_title)

In [None]:
# Fetch the three data files used by the MrP example into ./data.
data_dir = Path("./data")
for csv_name in (
    "mr_p_cces18_common_vv.csv.gz",
    "mr_p_statelevel_predictors.csv",
    "mr_p_poststrat_df.csv",
):
    download_single_csv(data_dir, csv_name)

In [None]:
# Load the compressed survey file; low_memory=False makes pandas read the file
# in one pass so each column's dtype is inferred from the whole column.
cces_all_df = pd.read_csv("data/mr_p_cces18_common_vv.csv.gz", low_memory=False)
cces_all_df.head()

In [None]:
# State FIPS code -> postal abbreviation, per the CCES coding of `inputstate`.
# FIPS codes are NOT consecutive: 3, 7, 14, 43 and 52 are unassigned and
# 11 is the District of Columbia, so the codes run from 1 to 56 for 50 states.
# Enumerating the abbreviations 1..50 would shift almost every state onto a
# neighbour's code. DC (11) is deliberately absent, so it maps to NaN.
lkup_states = {
    1: "AL", 2: "AK", 4: "AZ", 5: "AR", 6: "CA", 8: "CO", 9: "CT", 10: "DE",
    12: "FL", 13: "GA", 15: "HI", 16: "ID", 17: "IL", 18: "IN", 19: "IA", 20: "KS",
    21: "KY", 22: "LA", 23: "ME", 24: "MD", 25: "MA", 26: "MI", 27: "MN", 28: "MS",
    29: "MO", 30: "MT", 31: "NE", 32: "NV", 33: "NH", 34: "NJ", 35: "NM", 36: "NY",
    37: "NC", 38: "ND", 39: "OH", 40: "OK", 41: "OR", 42: "PA", 44: "RI", 45: "SC",
    46: "SD", 47: "TN", 48: "TX", 49: "UT", 50: "VT", 51: "VA", 53: "WA", 54: "WV",
    55: "WI", 56: "WY",
}
# The 50 abbreviations in FIPS order, kept under the original name.
states = list(lkup_states.values())


# Survey race codes 1..8 -> label (codes are consecutive, so enumerate is safe here).
ethnicity = [
    "White",
    "Black",
    "Hispanic",
    "Asian",
    "Native American",
    "Mixed",
    "Other",
    "Middle Eastern",
]
lkup_ethnicity = dict(enumerate(ethnicity, start=1))


# Survey education codes 1..6 -> label.
edu = ["No HS", "HS", "Some college", "Associates", "4-Year College", "Post-grad"]
lkup_edu = dict(enumerate(edu, start=1))


def clean_df(df):
    """Recode the raw survey columns into the modelling variables.

    The caller's frame is left untouched: only the raw columns that are needed
    are copied before any derived column is written.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``CC18_321d``, ``inputstate``, ``gender``,
        ``race``, ``birthyr``, ``educ`` and ``caseid``.

    Returns
    -------
    pandas.DataFrame
        Columns ``abortion``, ``state``, ``eth``, ``male``, ``age``, ``edu``
        and ``caseid`` (original index preserved), with every row that has a
        missing value in any of them removed.
    """
    raw_columns = ["CC18_321d", "inputstate", "gender", "race", "birthyr", "educ", "caseid"]
    # Copy only what is needed so the derived columns never leak into the input frame.
    df = df[raw_columns].copy()

    ## 0 Oppose and 1 Support (raw answer 1 -> 1, raw answer 2 -> 0)
    df["abortion"] = np.abs(df["CC18_321d"] - 2)
    df["state"] = df["inputstate"].map(lkup_states)
    ## dichotomous (coded as -0.5 Female, +0.5 Male)
    df["male"] = np.abs(df["gender"] - 2) - 0.5

    # Collapse the smaller ethnicity categories into a single "Other" level.
    df["eth"] = df["race"].map(lkup_ethnicity)
    df["eth"] = np.where(
        df["eth"].isin(
            ["Asian", "Other", "Middle Eastern", "Mixed", "Native American"]
        ),
        "Other",
        df["eth"],
    )

    # Age in 2018, binned into ordered brackets. No integer cast is applied:
    # a missing birth year then simply yields a missing bracket, which the
    # final dropna() removes, instead of raising during the cast.
    df["age"] = pd.cut(
        2018 - df["birthyr"],
        [0, 29, 39, 49, 59, 69, 120],
        labels=["18-29", "30-39", "40-49", "50-59", "60-69", "70+"],
        ordered=True,
    )

    # Merge "Associates" into "Some college".
    df["edu"] = df["educ"].map(lkup_edu)
    df["edu"] = np.where(
        df["edu"].isin(["Some college", "Associates"]), "Some college", df["edu"]
    )

    df = df[["abortion", "state", "eth", "male", "age", "edu", "caseid"]]
    return df.dropna()


# State-level covariates; merged onto the survey and post-stratification frames by `state`.
statelevel_predictors_df = pd.read_csv("data/mr_p_statelevel_predictors.csv")

# Replace the raw survey frame with its recoded version (rows with missing values dropped).
cces_all_df = clean_df(cces_all_df)
cces_all_df.head()

In [None]:
# Attach the state-level predictors, then draw a deliberately biased sample of
# 5,000 respondents: the sampling weight grows with the state's `repvote`,
# with age bracket, for men, and for White respondents.
cces_df = cces_all_df.merge(statelevel_predictors_df, on="state", how="left")
cces_df["weight"] = (
    5 * cces_df["repvote"]
    + (cces_df["age"] == "18-29") * 0.5
    + (cces_df["age"] == "30-39") * 1
    + (cces_df["age"] == "40-49") * 2
    + (cces_df["age"] == "50-59") * 4
    + (cces_df["age"] == "60-69") * 6
    + (cces_df["age"] == "70+") * 8
    # `male` is coded +0.5 (male) / -0.5 (female) by clean_df, so men are
    # identified by 0.5; comparing against 1 would never match any row.
    + (cces_df["male"] == 0.5) * 20
    + (cces_df["eth"] == "White") * 1.05
)

# Fixed random_state keeps the biased sample reproducible.
cces_df = cces_df.sample(5000, weights="weight", random_state=1000)
cces_df.head()

In [None]:
# Layout: four small panels (A-D) on the top row and one full-width panel (E) below.
mosaic = """
    ABCD
    EEEE
    """

fig = plt.figure(layout="constrained", figsize=(20, 10))
# Maps each mosaic letter to its Axes; plot_var looks panels up by letter.
ax_dict = fig.subplot_mosaic(mosaic)


def plot_var(var, ax):
    """Plot, per level of `var`, the abortion share in the sample vs. the full data.

    `var` is a column present in both `cces_df` and `cces_all_df`; `ax` is the
    key of the target panel in `ax_dict`. Each level gets a blue dot (mean in
    `cces_df`), a red dot (mean in `cces_all_df`) and a vertical line joining
    the two. Levels are ordered by the `cces_df` share.
    """

    def share_by_level(frame, share_name):
        # Mean of the 0/1 `abortion` column within each level of `var`.
        level_means = frame.groupby(var, observed=False)[["abortion"]].mean()
        return level_means.rename(columns={"abortion": share_name}).reset_index()

    sample_shares = share_by_level(cces_df, "share")
    full_shares = share_by_level(cces_all_df, "share_census")
    # The only shared column is `var`, so the default merge joins on it.
    shares = sample_shares.merge(full_shares).sort_values("share")

    panel = ax_dict[ax]
    panel.vlines(shares[var], shares["share"], shares["share_census"])
    panel.scatter(shares[var], shares["share"], color="blue", label="Sample")
    panel.scatter(shares[var], shares["share_census"], color="red", label="Census")
    panel.set_ylabel("Proportion")


# One panel per demographic variable; the wide bottom panel holds the states.
for column_name, panel_key in [
    ("age", "A"),
    ("edu", "B"),
    ("male", "C"),
    ("eth", "D"),
    ("state", "E"),
]:
    plot_var(column_name, panel_key)

ax_dict["E"].legend()

# The numeric sex coding is not informative on the axis: hide ticks and label it instead.
sex_panel = ax_dict["C"]
sex_panel.set_xticklabels([])
sex_panel.set_xlabel("Female / Male")
plt.suptitle("Comparison of Proportions: Survey Sample V Census", fontsize=20);

In [None]:
def get_se_bernoulli(p, n):
    """Return sqrt(p * (1 - p) / n), the standard error of a proportion `p` over `n` trials."""
    variance_of_mean = p * (1 - p) / n
    return np.sqrt(variance_of_mean)


# Raw support share and its standard error in the biased 5,000-row sample ...
biased_share = cces_df["abortion"].astype(float).mean()
sample_cces_estimate = {
    "mean": biased_share,
    "se": get_se_bernoulli(biased_share, len(cces_df)),
}

# ... and in the full cleaned survey.
full_share = cces_all_df["abortion"].astype(float).mean()
sample_cces_all_estimate = {
    "mean": full_share,
    "se": get_se_bernoulli(full_share, len(cces_all_df)),
}

summary = pd.DataFrame([sample_cces_all_estimate, sample_cces_estimate])
summary["data"] = ["Full Data", "Biased Data"]
summary

In [None]:
# Collapse the biased sample to one row per demographic cell:
# n = number of distinct respondents, abortion = number of supporters.
cell_columns = ["state", "eth", "male", "age", "edu"]
cell_counts = (
    cces_df.groupby(cell_columns, observed=False)
    .agg({"caseid": "nunique", "abortion": "sum"})
    .reset_index()
)
cell_counts = cell_counts.sort_values("abortion", ascending=False).rename(
    columns={"caseid": "n"}
)

# Re-attach the state-level predictors and store both counts as integers.
model_df = cell_counts.merge(statelevel_predictors_df, on="state", how="left")
model_df = model_df.astype({"abortion": int, "n": int})
model_df.head()

In [None]:
# Binomial regression of supporters out of n respondents per cell, with state,
# ethnicity and education as plain categorical terms.
formula = """ p(abortion, n) ~ C(state) + C(eth) + C(edu) + male + repvote"""

# The model is fit on a 200-row subsample of the cells to prevent an EOFError.
# The subsample is seeded so that, together with the seeded sampler below,
# the whole fit is reproducible from run to run.
base_model = bmb.Model(
    formula,
    model_df.sample(n=200, replace=False, random_state=100),
    family="binomial",
)

result = base_model.fit(
    random_seed=100,
    target_accept=0.95,
    idata_kwargs={"log_likelihood": True},
)

In [None]:
# Two half-width panels (A, B) above one full-width panel (C).
mosaic = """
    AABB
    CCCC
    """

fig = plt.figure(layout="constrained", figsize=(20, 7))
axs = fig.subplot_mosaic(mosaic)

# Predicted support by ethnicity, education and state, one covariate per panel.
for covariate, panel_key in (("eth", "A"), ("edu", "B"), ("state", "C")):
    bmb.interpret.plot_predictions(base_model, result, covariate, ax=axs[panel_key])
plt.suptitle("Plot Prediction per Class", fontsize=20);

In [None]:
# Difference in predicted support between the "Black" and "White" levels of `eth`,
# shown along `age` and grouped by `edu`.
# NOTE(review): `age` does not appear in base_model's formula
# (C(state) + C(eth) + C(edu) + male + repvote) — confirm that conditioning on
# it here is intended and supported.
fig, ax = bmb.interpret.plot_comparisons(
    model=base_model,
    idata=result,
    contrast={"eth": ["Black", "White"]},
    conditional=["age", "edu"],
    comparison_type="diff",
    subplot_kwargs={"main": "age", "group": "edu"},
    fig_kwargs={"figsize": (12, 5), "sharey": True},
    legend=True,
)
ax[0].set_title(
    "Comparison of Difference in Ethnicity \n within Age and Educational Strata"
);

In [None]:
# Tabulate the "Post-grad" vs "No HS" difference in predicted support for each
# combination of the listed ethnicities and states.
# NOTE(review): base_model was fit on a 200-row subsample of model_df —
# presumably all four states listed here are present in it; verify.
bmb.interpret.comparisons(
    model=base_model,
    idata=result,
    contrast={"edu": ["Post-grad", "No HS"]},
    conditional={"eth": ["Black", "White"], "state": ["NY", "CA", "ID", "VA"]},
    comparison_type="diff",
)

In [None]:
# Multilevel version: varying intercepts for state, ethnicity, education and
# three two-way interactions, plus `male` and `repvote` as common effects.
formula = "p(abortion, n) ~ male + repvote + (1 | state) + (1 | eth) + (1 | edu) + (1 | male:eth) + (1 | edu:age) + (1 | edu:eth)"

# The model is fit on a 400-row subsample of the cells to prevent an EOFError.
# The subsample is seeded so that, together with the seeded sampler below,
# the whole fit is reproducible from run to run.
model_hierarchical = bmb.Model(
    formula,
    model_df.sample(n=400, replace=False, random_state=100),
    family="binomial",
)

result = model_hierarchical.fit(
    random_seed=100,
    target_accept=0.99,
    idata_kwargs={"log_likelihood": True},
)

In [None]:
# Posterior summary restricted to the common effects and the education / ethnicity intercepts.
az.summary(result, var_names=["Intercept", "male", "1|edu", "1|eth", "repvote"])

In [None]:
# Generate posterior-predictive draws for the training rows; no `inplace`
# argument is passed, so this presumably augments `result` itself (bambi's
# default), which is what the predictive check below reads.
model_hierarchical.predict(result, kind="response")
# Cumulative posterior-predictive check against the observed counts.
ax = az.plot_ppc(result, figsize=(8, 5), kind="cumulative", observed_rug=True)
ax.set_title("Posterior Predictive Checks \n On Biased Sample");

In [None]:
poststrat_df = pd.read_csv("data/mr_p_poststrat_df.csv")

cell_keys = ["state", "eth", "male", "age", "edu"]

# Post-stratification table with state-level predictors attached; its education
# column is renamed to match the modelling frame.
new_data = poststrat_df.merge(statelevel_predictors_df, on="state", how="left")
new_data = new_data.rename(columns={"educ": "edu"})

# Align the table with the rows of model_df (left join keeps model_df's row
# order) and keep the right-hand `n` and `repvote` columns under their plain names.
new_data = model_df.merge(new_data, how="left", on=cell_keys)
new_data = new_data.rename(columns={"n_y": "n", "repvote_y": "repvote"})
new_data = new_data[cell_keys + ["n", "repvote"]]


# Each cell's share of its state's total count: the post-stratification weight.
state_totals = (
    new_data.groupby("state")
    .agg({"n": "sum"})
    .reset_index()
    .rename(columns={"n": "state_total"})
)
new_data = new_data.merge(state_totals)
new_data["state_percent"] = new_data["n"] / new_data["state_total"]
new_data.head()

In [None]:
# Predict for every row of the post-stratification frame; inplace=False returns
# a new inference-data object and leaves `result` unchanged.
result_adjust = model_hierarchical.predict(
    result, data=new_data, inplace=False, kind="response"
)
result_adjust

NOTE: The code below is currently broken. The models above are fit on random subsamples of `model_df`, whereas the code below relies on the complete dataframe.

In [None]:
# Build one row of state-level estimates per state: the unweighted "base"
# estimate with its 2.5% / 97.5% quantiles, and the post-stratified (MrP)
# estimate with bounds.
estimates = []
# NOTE(review): `result` comes from a model fit on a 400-row sample of model_df,
# while the positions in `idx` below are taken from `new_data`, which is built
# from the full model_df. Indexing `abortion_posterior_base` with them looks
# like it will misalign or run out of range — confirm before relying on it.
abortion_posterior_base = az.extract(result, num_samples=2000)["p"]
abortion_posterior_mrp = az.extract(result_adjust, num_samples=2000)["p"]

for s in new_data["state"].unique():
    # Index labels of this state's rows, used positionally below
    # (assumes new_data has a default 0..n-1 index — TODO confirm).
    idx = new_data.index[new_data["state"] == s].tolist()
    # MrP point estimate: per-cell posterior mean, weighted by the cell's share of the state.
    predicted_mrp = (
        (
            abortion_posterior_mrp[idx].mean(dim="sample")
            * new_data.iloc[idx]["state_percent"]
        )
        .sum()
        .item()
    )
    # NOTE(review): the two bounds below are weighted sums of per-cell
    # quantiles, not quantiles of the per-draw weighted sum.
    predicted_mrp_lb = (
        (
            abortion_posterior_mrp[idx].quantile(0.025, dim="sample")
            * new_data.iloc[idx]["state_percent"]
        )
        .sum()
        .item()
    )
    predicted_mrp_ub = (
        (
            abortion_posterior_mrp[idx].quantile(0.975, dim="sample")
            * new_data.iloc[idx]["state_percent"]
        )
        .sum()
        .item()
    )
    # Base estimate: mean and quantiles over all selected cells and draws, unweighted.
    predicted = abortion_posterior_base[idx].mean().item()
    base_lb = abortion_posterior_base[idx].quantile(0.025).item()
    base_ub = abortion_posterior_base[idx].quantile(0.975).item()

    # The MrP upper bound is appended before the lower bound; whatever labels
    # these values later must use the same order.
    estimates.append(
        [
            s,
            predicted,
            base_lb,
            base_ub,
            predicted_mrp,
            predicted_mrp_ub,
            predicted_mrp_lb,
        ]
    )


# Column labels follow the order in which each state's values were appended
# (MrP upper bound before lower bound).
estimate_columns = [
    "state",
    "base_expected",
    "base_lb",
    "base_ub",
    "mrp_adjusted",
    "mrp_ub",
    "mrp_lb",
]
state_predicted = pd.DataFrame(estimates, columns=estimate_columns)

# Observed share per state in the full cleaned survey, used as the benchmark.
benchmark_share = cces_all_df.groupby("state")[["abortion"]].mean().reset_index()

# The only shared column is `state`, so the default merge joins on it.
state_predicted = state_predicted.merge(benchmark_share)
state_predicted = state_predicted.sort_values("mrp_adjusted").rename(
    columns={"abortion": "census_share"}
)
state_predicted.head()