In [1]:
import simulation_tools as st

import time

from scipy import stats
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from cmdstanpy import CmdStanModel

from plotnine import ggplot, aes, geom_line, geom_point, labs, theme_minimal
from IPython.display import display

In [2]:
# Silence cmdstanpy's log output so it does not clutter the notebook.
import logging
logger = logging.getLogger('cmdstanpy')
# Drop every record below CRITICAL at the logger itself.
logger.setLevel(logging.CRITICAL)
# Do not hand records on to ancestor loggers (e.g. the root logger).
logger.propagate = False
# Attach a handler that discards whatever it receives.
logger.addHandler(logging.NullHandler())

In [3]:
def simulate_across_sigma_b(
    model,
    num_repetitions,
    num_subjects_per_expt,
    prop_treatment,
    mu_b,
    mu_theta,
    sigma_b_grid,
    sigma_theta,
    sigma_treatment,
    sigma_control,
    show_progress=False
):
    """Call ``st.repeat_inferences`` once for every value in ``sigma_b_grid``.

    Each grid value is passed as the ``sigma_b`` keyword; every other
    argument except ``sigma_b_grid`` and ``show_progress`` is forwarded
    unchanged, by keyword, to ``st.repeat_inferences``.

    Parameters
    ----------
    model, num_repetitions, num_subjects_per_expt, prop_treatment, mu_b,
    mu_theta, sigma_theta, sigma_treatment, sigma_control
        Forwarded as-is to ``st.repeat_inferences``.
    sigma_b_grid
        Iterable of ``sigma_b`` values to sweep. Values must be hashable
        because they become the keys of the returned dict. It must also
        support ``len()`` when ``show_progress`` is true.
    show_progress : bool, default False
        When true, print one progress line per grid value and the total
        elapsed time at the end.

    Returns
    -------
    dict
        Maps each ``sigma_b`` value to whatever ``st.repeat_inferences``
        returned for it. A grid value that appears more than once keeps
        only the result of its last run.
    """
    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # wall-clock adjustments the way a time.time() difference can.
    begin_time = time.perf_counter()

    summary_stats = {}
    for i, sigma_b in enumerate(sigma_b_grid):
        if show_progress:
            print(f"Sigma b: {sigma_b}. Run: {i + 1}/{len(sigma_b_grid)}")

        summary_stats[sigma_b] = st.repeat_inferences(
            model=model,
            num_repetitions=num_repetitions,
            num_subjects_per_expt=num_subjects_per_expt,
            prop_treatment=prop_treatment,
            mu_b=mu_b,
            mu_theta=mu_theta,
            sigma_b=sigma_b,
            sigma_theta=sigma_theta,
            sigma_treatment=sigma_treatment,
            sigma_control=sigma_control,
        )

    if show_progress:
        print(f"Took {time.perf_counter() - begin_time:.2f} seconds.")

    return summary_stats

In [4]:
def summary_stats_dict_to_df(summary_stats_dict):
    """Flatten nested per-sigma_b summary statistics into a long DataFrame.

    Parameters
    ----------
    summary_stats_dict : dict
        Nested mapping ``{sigma_b: {estimator: {statistic: values}}}`` where
        ``values`` is anything ``np.array`` can turn into a numeric array.
        The estimator and statistic names are read from the first
        ``sigma_b`` entry, and every other entry is expected to provide the
        same names.

    Returns
    -------
    pandas.DataFrame
        One row per (estimator, sigma_b) pair, ordered estimator-major, i.e.
        all sigma_b values for the first estimator, then the next estimator.
        Columns are ``sigma_b``, ``estimator`` and one column per statistic
        holding the mean of that statistic's values. An empty input yields
        an empty frame with only the ``sigma_b`` and ``estimator`` columns.
    """
    sigma_b_grid = list(summary_stats_dict)
    # Guard the "first entry" look-ups so empty input does not raise IndexError.
    first_entry = summary_stats_dict[sigma_b_grid[0]] if sigma_b_grid else {}
    estimators = list(first_entry)
    statistics = list(first_entry[estimators[0]]) if estimators else []

    # Index the dict with its own keys rather than with values read back from
    # a DataFrame column, so look-ups never depend on NumPy/pandas coercion.
    records = [
        {
            "sigma_b": sigma_b,
            "estimator": estimator,
            **{
                statistic: np.array(
                    summary_stats_dict[sigma_b][estimator][statistic]
                ).mean()
                for statistic in statistics
            },
        }
        for estimator in estimators
        for sigma_b in sigma_b_grid
    ]

    return pd.DataFrame(records, columns=["sigma_b", "estimator", *statistics])

In [5]:
def simulate_across_sigma_b_df(
    model,
    num_repetitions,
    num_subjects_per_expt,
    prop_treatment,
    mu_b,
    mu_theta,
    sigma_b_grid,
    sigma_theta,
    sigma_treatment,
    sigma_control,
    show_progress=False
):
    """Run the sigma_b sweep and return its results as a DataFrame.

    Convenience wrapper: forwards every argument by keyword to
    ``simulate_across_sigma_b`` and passes the resulting dict through
    ``summary_stats_dict_to_df``.
    """
    sweep_kwargs = dict(
        model=model,
        num_repetitions=num_repetitions,
        num_subjects_per_expt=num_subjects_per_expt,
        prop_treatment=prop_treatment,
        mu_b=mu_b,
        mu_theta=mu_theta,
        sigma_b_grid=sigma_b_grid,
        sigma_theta=sigma_theta,
        sigma_treatment=sigma_treatment,
        sigma_control=sigma_control,
        show_progress=show_progress,
    )
    per_sigma_stats = simulate_across_sigma_b(**sweep_kwargs)
    return summary_stats_dict_to_df(per_sigma_stats)

In [6]:
# Constant-allocation run: prop_treatment is 0.5 for every experiment.
# The vector is sized with st.CHICK_NUM_EXPTS instead of a literal 38 so it
# always has the same length as num_subjects_per_expt below.
# NOTE(review): assumes st.CHICK_NUM_EXPTS == 38, the value previously
# hard-coded here -- confirm against simulation_tools.
df_const_p = simulate_across_sigma_b_df(
    model=CmdStanModel(stan_file="dynamic_hier.stan"),
    num_repetitions=20,
    num_subjects_per_expt=np.repeat(
        st.CHICK_NUM_SUBJECTS_PER_EXPT, st.CHICK_NUM_EXPTS
    ),
    prop_treatment=np.full(st.CHICK_NUM_EXPTS, 0.5),
    mu_b=0,
    mu_theta=st.CHICK_MU_THETA,
    sigma_b_grid=st.CHICK_SIGMA_B_GRID,
    sigma_theta=st.CHICK_SIGMA_THETA,
    sigma_treatment=st.CHICK_SIGMA_TREATMENT,
    sigma_control=st.CHICK_SIGMA_CONTROL,
    show_progress=True,
)

Sigma b: 0.0. Run: 1/11
Sigma b: 0.01. Run: 2/11
Sigma b: 0.02. Run: 3/11
Sigma b: 0.03. Run: 4/11
Sigma b: 0.04. Run: 5/11
Sigma b: 0.05. Run: 6/11
Sigma b: 0.06. Run: 7/11
Sigma b: 0.07. Run: 8/11
Sigma b: 0.08. Run: 9/11
Sigma b: 0.09. Run: 10/11
Sigma b: 0.1. Run: 11/11
Took 115.26 seconds.


In [7]:
# Varying-allocation schedule for prop_treatment:
#   * experiments 1-5 stay at p = 0.5;
#   * the remaining st.CHICK_NUM_EXPTS - 5 experiments ramp linearly
#     from p = 0.5 up to p = 0.95.
prop_treatment_varying = np.concatenate(
    [
        np.full(5, 0.5),
        np.linspace(0.5, 0.95, st.CHICK_NUM_EXPTS - 5),
    ]
)

# Same sweep settings as the constant-p run; only prop_treatment differs.
df_vary_p = simulate_across_sigma_b_df(
    model=CmdStanModel(stan_file="dynamic_hier.stan"),
    num_repetitions=20,
    num_subjects_per_expt=np.repeat(
        st.CHICK_NUM_SUBJECTS_PER_EXPT, st.CHICK_NUM_EXPTS
    ),
    prop_treatment=prop_treatment_varying,
    mu_b=0,
    mu_theta=st.CHICK_MU_THETA,
    sigma_b_grid=st.CHICK_SIGMA_B_GRID,
    sigma_theta=st.CHICK_SIGMA_THETA,
    sigma_treatment=st.CHICK_SIGMA_TREATMENT,
    sigma_control=st.CHICK_SIGMA_CONTROL,
    show_progress=True,
)

Sigma b: 0.0. Run: 1/11
Sigma b: 0.01. Run: 2/11
Sigma b: 0.02. Run: 3/11
Sigma b: 0.03. Run: 4/11
Sigma b: 0.04. Run: 5/11
Sigma b: 0.05. Run: 6/11
Sigma b: 0.06. Run: 7/11
Sigma b: 0.07. Run: 8/11
Sigma b: 0.08. Run: 9/11
Sigma b: 0.09. Run: 10/11
Sigma b: 0.1. Run: 11/11
Took 123.11 seconds.


In [62]:
# Keep only the columns needed to plot MSE against sigma_b per estimator.
# NOTE(review): this cell's execution count (62) is out of sequence and
# `plot_df_varying` is not referenced by any other visible cell -- confirm
# whether it is still needed.
plot_df_varying = df_vary_p.loc[:, ["sigma_b", "estimator", "mse"]]

In [11]:
import plotly.express as px

# Stack the two result frames, tagging every row with the run it came from
# so that tag can drive the line dash pattern.
mse_by_design = pd.concat(
    [
        df_const_p.assign(LineStyle="Constant p"),
        df_vary_p.assign(LineStyle="Varying p"),
    ]
)

# MSE against sigma_b: colour encodes the estimator, dash encodes the run.
fig = px.line(
    mse_by_design,
    x="sigma_b",
    y="mse",
    color="estimator",
    line_dash="LineStyle",
    markers=True,
    labels={"sigma_b": "sigma_b", "mse": "Mean Squared Error"},
    title="MSE for Different Estimators vs. sigma_b",
)

# Set the legend title; the legend entries themselves are left unchanged.
fig.update_layout(legend_title_text="Estimator")

fig.show()

In [14]:
# Mean of the `mse` column over the rows whose estimator is "posterior":
# varying-p run first, then constant-p run.
print(df_vary_p.loc[df_vary_p["estimator"] == "posterior", "mse"].mean())
print(df_const_p.loc[df_const_p["estimator"] == "posterior", "mse"].mean())

0.0015230922624490189
0.0014609168723589089
