In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.special import expit, logit

# Fix the legacy global NumPy RNG so every draw below is reproducible.
np.random.seed(2025)

# 0) SETTINGS
N = 1000                        # number of simulated respondents
target_effective_share = 0.55   # base rate the binary outcome is centred on
sigma_beta_m = 0.25             # prior sd for mediator betas
sigma_beta_y = 0.25             # prior sd for outcome betas
noise_sd_m = 1.0                # mediator noise before Likert clamping
likert_min = 1
likert_max = 5


# 1) DRAW "LAWS" FOR GENERATING X (no fixed effects)

def _draw_levels(levels):
    """Draw random level probabilities (flat Dirichlet), then N samples from them.

    Returns the probabilities as a plain list together with the sampled array.
    The Dirichlet draw always precedes the sampling draw, which keeps the RNG
    stream in a fixed order.
    """
    probs = np.random.dirichlet(np.ones(len(levels))).tolist()
    sample = np.random.choice(levels, size=N, p=probs)
    return probs, sample


def _ordinal_codes(values, levels):
    """Map each value to its 0-based position within the ordered `levels`."""
    return pd.Categorical(values, categories=levels, ordered=True).codes


# Respondent role: nominal, probabilities drawn at random rather than fixed.
roles = ["Analyst", "Project manager", "Developer", "Executive director", "Other"]
role_probs, role = _draw_levels(roles)

# Experience bracket: ordinal, also stored as integer codes.
exp_bins = ["0-1", "2-4", "5-7", "8+"]
exp_probs, experience = _draw_levels(exp_bins)
experience_code = _ordinal_codes(experience, exp_bins)

# Number of people participating: ordinal, also stored as integer codes.
pp_bins = ["2-3", "4-6", "7-10", "more than 10"]
pp_probs, peopleparticipation = _draw_levels(pp_bins)
peopleparticipation_code = _ordinal_codes(peopleparticipation, pp_bins)

# Usage rates are themselves random (Beta draws) ...
p_dashboard = np.random.beta(6, 4)   # Beta(6, 4) has mean 0.6
p_ai = np.random.beta(5, 5)          # Beta(5, 5) has mean 0.5
# ... and each respondent's usage flag is a Bernoulli draw at that rate.
dashboarduse = np.random.binomial(1, p_dashboard, size=N)
aiuse = np.random.binomial(1, p_ai, size=N)



In [None]:
# 2) RANDOM COEFFICIENTS FOR MEDIATORS
# Mediators: understanding, access, equity, traceability (values 1–5)

# One-hot encode role; the first category ("Analyst") is dropped as the
# reference level.
role_cat = pd.Categorical(role, categories=roles)
role_dummies = pd.get_dummies(role_cat, drop_first=True)

# Design matrix shared by all mediators.
# The frame is cast to float once here: with pandas >= 2.0 `get_dummies`
# returns bool columns, and a frame mixing bool with integer columns makes
# `X_m.values` an object-dtype array, so the `X_m.values @ beta` product would
# run as a slow element-wise object matmul. Casting keeps the matrix a plain
# float ndarray regardless of pandas version (True/False become 1.0/0.0).
X_m = pd.concat([
    pd.Series(dashboarduse, name="dashboarduse"),
    pd.Series(aiuse,       name="aiuse"),
    role_dummies,
    pd.Series(experience_code, name="experience_code"),
    pd.Series(peopleparticipation_code, name="peopleparticipation_code")
], axis=1).astype(float)

# Column labels, kept so drawn coefficients can be reported by predictor name.
pred_names = X_m.columns.tolist()
import numpy as np

def to_likert(latent, min_val=1, max_val=5):
    """Convert latent scores to integer Likert responses.

    The input is coerced to a flat float array, rounded to the nearest
    integer (``np.rint``, so exact halves go to the even neighbour), limited
    to ``[min_val, max_val]`` and returned as a 1-D ``int64`` array.
    """
    flat_scores = np.asarray(latent, dtype=float).ravel()
    bounded = np.clip(np.rint(flat_scores), min_val, max_val)
    return bounded.astype(np.int64)

def draw_mediator(latent_intercept_sd=0.5):
    """Simulate one mediator as a Likert score driven by the columns of X_m.

    Draws, in this order: one coefficient per X_m column from
    Normal(0, sigma_beta_m), an intercept from Normal(3.0, latent_intercept_sd),
    and N noise terms from Normal(0, noise_sd_m). The latent score
    (intercept + X_m @ beta + noise) is then discretised by ``to_likert`` onto
    ``likert_min..likert_max``.

    Returns:
        (likert, alpha, beta): the integer responses, the drawn intercept and
        the drawn coefficient vector.
    """
    n_predictors = X_m.shape[1]
    beta = np.random.normal(loc=0.0, scale=sigma_beta_m, size=n_predictors)
    alpha = np.random.normal(loc=3.0, scale=latent_intercept_sd)

    systematic = alpha + X_m.values @ beta
    noise = np.random.normal(loc=0, scale=noise_sd_m, size=N)
    latent = systematic + noise

    return to_likert(latent, likert_min, likert_max), alpha, beta

In [None]:
# One independent draw per mediator. The dict is filled in the listed order,
# so the RNG stream is consumed as: understanding, access, equity, traceability.
_mediator_order = ("understanding", "access", "equity", "traceability")
_mediator_draws = {label: draw_mediator() for label in _mediator_order}

understanding, alpha_u, beta_u = _mediator_draws["understanding"]
access, alpha_a, beta_a = _mediator_draws["access"]
equity, alpha_e, beta_e = _mediator_draws["equity"]
traceability, alpha_t, beta_t = _mediator_draws["traceability"]

In [None]:
# 3) RANDOM COEFFICIENTS FOR BINARY OUTCOME (weak priors)
# Outcome predictors: mediators + dashboarduse + aiuse + role dummies + experience_code + peopleparticipation_code

def _as_named_series(pairs):
    """Wrap each (label, values) pair in a Series carrying that label."""
    return [pd.Series(values, name=label) for label, values in pairs]


_before_roles = _as_named_series([
    ("understanding", understanding),
    ("access", access),
    ("equity", equity),
    ("traceability", traceability),
    ("dashboarduse", dashboarduse),
    ("aiuse", aiuse),
])
_after_roles = _as_named_series([
    ("experience_code", experience_code),
    ("peopleparticipation_code", peopleparticipation_code),
])

# Column order matters: beta_y is matched to these columns by position.
X_y = pd.concat(
    _before_roles + [role_dummies.add_prefix("role_")] + _after_roles,
    axis=1,
)

# One coefficient per outcome predictor, drawn from Normal(0, sigma_beta_y).
n_outcome_terms = X_y.shape[1]
beta_y = np.random.normal(0.0, sigma_beta_y, size=n_outcome_terms)

# Fail loudly on any non-numeric column, then get a plain float matrix.
X_y = X_y.apply(pd.to_numeric, errors="raise")
Xy = X_y.to_numpy(dtype=float, copy=True)
beta_y = np.asarray(beta_y, dtype=float)

# Linear predictor without intercept.
mu = Xy @ beta_y

# The intercept shifts the *mean logit* to logit(target_effective_share);
# the mean probability is therefore only approximately the target.
_intercept_shift = logit(target_effective_share) - mu.mean()
alpha_y = float(_intercept_shift)

# Per-respondent success probability and the Bernoulli outcome.
p_eff = expit(alpha_y + mu)
effectiveness = np.random.binomial(1, p_eff, size=N)

In [None]:

# 4) Assemble dataset
# Keyword order fixes the column order of the saved CSV.
df = pd.DataFrame(dict(
    role=role,
    experience=experience,
    experience_code=experience_code,
    peopleparticipation=peopleparticipation,
    peopleparticipation_code=peopleparticipation_code,
    dashboarduse=dashboarduse,
    aiuse=aiuse,
    understanding=understanding,
    access=access,
    equity=equity,
    traceability=traceability,
    effectiveness=effectiveness,
))

# 5) Save data & realized draws for transparency
df.to_csv("simulated_collab_survey_1000_randomized.csv", index=False)


def _mediator_record(alpha, beta):
    """Pair a mediator's intercept with its coefficients keyed by predictor name."""
    return {"alpha": float(alpha), "beta": dict(zip(pred_names, beta))}


mediator_records = {
    "understanding": _mediator_record(alpha_u, beta_u),
    "access": _mediator_record(alpha_a, beta_a),
    "equity": _mediator_record(alpha_e, beta_e),
    "traceability": _mediator_record(alpha_t, beta_t),
}
outcome_record = {
    "alpha_y": float(alpha_y),
    "beta_y": dict(zip(X_y.columns.tolist(), beta_y)),
}

# Everything that was drawn at random, plus the target vs. realized base rate.
draws = {
    "role_probs": role_probs,
    "exp_probs": exp_probs,
    "pp_probs": pp_probs,
    "p_dashboard": float(p_dashboard),
    "p_ai": float(p_ai),
    "mediator_coeffs": mediator_records,
    "outcome_coeffs": outcome_record,
    "target_effective_share": target_effective_share,
    "realized_effective_share": float(df["effectiveness"].mean()),
}

# Flatten the nested dict into one record (keys joined by "__") and write it.
flat_draws = pd.json_normalize(draws, sep="__")
flat_draws.to_json("random_draws_summary.json", orient="records", lines=False)

print("Saved data to simulated_collab_survey_1000_randomized.csv")
print("Saved coefficient draws to random_draws_summary.json")
print("Realized effective share:", round(df['effectiveness'].mean(), 3))