Based on: https://mc-stan.org/users/documentation/case-studies/pool-binary-trials.html

In [1]:
library(tidyverse)
library(yaml)

── Attaching core tidyverse packages ──────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.3     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors


In [2]:
# Name of the experiment configuration; it selects which directory under
# ../experiments/configs/ the data settings are read from.
config_id <- "pooling_sim"

In [3]:
# Load the simulation settings from the data.yaml file that belongs to the
# experiment configuration named by `config_id`.
data_config <- yaml.load_file(
    input = paste0("../experiments/configs/", config_id, "/data.yaml")
)

In [4]:
# Resolve where the simulated data set is written and make sure the target
# directory exists.
#
# Fail fast when `output_dir` is absent or empty in the config: string
# concatenation with a NULL value would otherwise collapse the path to "../"
# and the CSV would silently be written into the parent directory.
if (length(data_config$output_dir) != 1L ||
    is.na(data_config$output_dir) ||
    !nzchar(data_config$output_dir)) {
    stop("`output_dir` must be set to a non-empty value in data.yaml", call. = FALSE)
}

# The configured directory is interpreted relative to the parent directory.
data_base_dir <- file.path("..", data_config$output_dir)
data_path <- file.path(data_base_dir, "scores_data.csv")

# Create the directory, including missing parents, only when it is not there yet.
if (!dir.exists(data_base_dir)) {
    dir.create(data_base_dir, recursive = TRUE)
}

In [5]:
# ---- Simulation design, read from the data config ----

# Sample sizes: number of classes, students in each class, and overall total
N_classes <- data_config$N_classes
N_students_per_class <- data_config$N_students_per_class
N_total_students <- N_classes * N_students_per_class

# Fixed effects of the score model
beta_0 <- data_config$beta_0  # intercept: average exam score at zero study hours
beta_1 <- data_config$beta_1  # slope: effect of study hours on exam score

# Scales of the random variation
sigma_class <- data_config$sigma_class      # SD of the classroom-level random effects
sigma_student <- data_config$sigma_student  # SD of the student-level random errors

# Upper bound of the range from which study hours are drawn
max_study_hours <- data_config$max_study_hours

# Seed for the random number generator
data_seed <- data_config$seed

Our true data-generating process is:

$y_{ij} = \beta_{0} + \beta_{1} X_{ij} + u_{0j} + e_{ij}$

Where:
- $y_{ij} \sim \mathcal{N}(\beta_{0} + \beta_{1} X_{ij} + u_{0j}, \sigma_{\text{student}}^2)$ = Exam score of student $i$ in classroom $j$
- $u_{0j} \sim \mathcal{N}(0, \sigma_{\text{class}}^2)$ = the class-level effect for class $j$
- $e_{ij} \sim \mathcal{N}(0, \sigma_{\text{student}}^2)$ = the residual error term for student $i$ in classroom $j$
- $X_{ij}$ = Hours spent studying by student $i$ in classroom $j$

## Simulate data

In [6]:
# Seed R's random number generator with the configured seed so the simulated
# data set can be reproduced.
set.seed(data_seed)

In [7]:
# Simulate one exam score per student under the random-intercept model
#   score = beta_0 + beta_1 * study_hours + class_effect + student_error_term
#
# Random draws happen in this order: class effects, then study hours, then
# student errors. Keep that order so a given seed keeps producing the same
# data set.
#
# seq_len() is used instead of 1:N so that a count of zero yields an empty
# sequence rather than c(1, 0).
scores_df <- tibble(
        # One row per class with its random intercept
        class_id = seq_len(N_classes),
        class_effect = rnorm(N_classes, mean = 0, sd = sigma_class)
    ) %>%
    inner_join(
        tibble(
            # Students are laid out class by class; student_id is unique
            # across all classes
            class_id = rep(seq_len(N_classes), each = N_students_per_class),
            study_hours = runif(N_total_students, min = 0, max = max_study_hours),
            student_error_term = rnorm(N_total_students, mean = 0, sd = sigma_student),
            student_id = seq_len(N_total_students)
        ),
        by = "class_id"
    ) %>%
    mutate(
        score = beta_0 + beta_1 * study_hours + class_effect + student_error_term
    )

In [8]:
# Save the simulated scores as a CSV file at `data_path`.
write_csv(scores_df, data_path)