<a href="https://colab.research.google.com/github/miaojingang/mc/blob/master/notebooks/simulations_R.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this notebook, I replicate some of the simulations from the papers and demonstrate how to use the `mc` package.

In [1]:
library(devtools)

Loading required package: usethis



In [2]:
# Install the `mc` package from its GitHub repository; install_github() is
# available because devtools was attached earlier in the notebook.
install_github("facebookresearch/mc")
# Install data.table, which the simulation code below uses for tabulation.
install.packages("data.table")

Downloading GitHub repo facebookresearch/mc@HEAD




[32m✔[39m  [90mchecking for file ‘/tmp/Rtmpur36a8/remotes662e73a82/facebookresearch-mc-632289b/DESCRIPTION’[39m[36m[39m
[90m─[39m[90m  [39m[90mpreparing ‘mc’:[39m[36m[39m
[32m✔[39m  [90mchecking DESCRIPTION meta-information[39m[36m[39m
[90m─[39m[90m  [39m[90mchecking for LF line-endings in source and make files and shell scripts[39m[36m[39m
[90m─[39m[90m  [39m[90mchecking for empty or unneeded directories[39m[36m[39m
[90m─[39m[90m  [39m[90mbuilding ‘mc_0.0.2.tar.gz’[39m[36m[39m
   


Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)



In [3]:
library(mc)
library(data.table)

In [4]:
#@title function mc {form-width: "20%"}
# Simulate one data set with misclassified group labels and estimate the
# group means of a binary outcome y with the `mc` package.
#
# The first N simulated units form the main sample (only the observed group j
# and y are used) and the last n units form the validation/calibration sample
# (true group i, observed group j and y are all used).
#
# Arguments:
#   N     sample size for main data
#   n     sample size for calibration data
#   pie   true membership proportions; a unit is in group 2 with prob. pie[2]
#   p     2 x 2 x 2 array; p[2, i, y + 1] is used as the probability that a
#         unit with true group i and outcome y is observed in group 2
#   mu    true mean of y in each group
#   seed  seed for random data generation
#
# Returns:
#   The matrix cbind(mu, mom, rmle): one row per group, the true mu in the
#   first column followed by the columns returned by mc::mc_mom() and
#   mc::mc_rmle(). Every column except the first and the last is set to NA
#   entirely when any of its values is missing or outside [0, 1]. The last
#   column is left untouched (presumably a variance estimate rather than a
#   mean -- confirm against the mc package documentation).
one_run = function(
    N=400,  # sample size for main data
    n=100,  # sample size for calibration data
    pie=c(0.2, 0.8),  # true membership proportions
    p=array(rep(c(0.9, 0.1, 0.2, 0.8), 2), c(2, 2, 2)),  # misclassification matrix
    mu=c(0.8, 0.4),  # true mean of y
    seed=123  # seed for random data generation

) {
    set.seed(seed)
    n_total = N + n

    i = rbinom(n=n_total, size=1, prob=pie[2]) + 1  # true group
    y = rbinom(n=n_total, size=1, prob=mu[i])  # y_value depends on true group info i

    # Look up each unit's own probability of being observed in group 2.
    # Indexing with a 3-column matrix extracts p[2, i[k], y[k] + 1] element by
    # element; p[2, i, y + 1] would instead build the full cross product of i
    # and y, so every unit would end up using the y of the first unit.
    prob_obs_2 = p[cbind(2, i, y + 1)]
    j = rbinom(n=n_total, size=1, prob=prob_obs_2) + 1  # observed group

    dt = data.table(i, y, j)[, y2 := y ^ 2]

    # split into main (P) and validation (V) samples
    dt_P = head(dt, N)
    dt_V = tail(dt, n)

    # sufficient statistics: counts and sums of y by observed group (main
    # sample) and by observed x true group (validation sample)
    n_jd_P = c(dt_P[, .N, keyby=j]$N)
    y_sum_jd_P = c(dt_P[, sum(y), keyby=j]$V1)
    # [, -1] drops the leading `j` label column that dcast keeps
    n_ji_V =  as.matrix(dcast(dt_V, j ~ i, fun.aggregate = length))[, -1]
    y_sum_ji_V = as.matrix(dcast(dt_V, j ~ i, fun.aggregate = sum, value.var = "y"))[, -1]
    y2_sum_ji_V = as.matrix(dcast(dt_V, j ~ i, fun.aggregate = sum, value.var = "y2"))[, -1]

    # get estimates
    mom = mc::mc_mom(n_jd_P, y_sum_jd_P, n_ji_V, y_sum_ji_V)
    rmle = mc::mc_rmle(n_jd_P,  y_sum_jd_P, n_ji_V, y_sum_ji_V, y2_sum_ji_V)
    out = cbind(mu, mom, rmle)

    # Blank out any estimator that produced an invalid mean for a binary
    # outcome. Missing values count as invalid so the check cannot fail with
    # "missing value where TRUE/FALSE needed".
    for (idx in setdiff(seq_len(ncol(out)), c(1, ncol(out)))) {
        est = out[, idx]
        if (any(is.na(est) | est < 0 | est > 1)) {
            out[, idx] = NA
        }
    }

    out
}

In [9]:
#@title function simulation {form-width: "20%"}
# Repeat one_run() with seeds 1..n_reps and summarise the estimators.
#
# Arguments:
#   n_reps   number of simulated data sets (at least 1)
#   verbose  if TRUE, print the first rows of the raw results followed by the
#            bias, mse, estimated variance and empirical variance tables
#
# Returns (invisibly) a list with
#   res            one row per replication and parameter ("mu1"/"mu2")
#   pct_bad        share of missing values in each column of res
#   bias, mse      mean error and mean squared error of every estimator,
#                  by parameter, ignoring missing estimates
#   estimated_var  mean of the `mak_li_var` column by parameter
#   empirical_var  variance of the `mak_li` errors by parameter
#
# The code expects one_run() to return two rows per call and columns named
# `mu`, `mak_li` and `mak_li_var`.
simulation = function(n_reps=1000, verbose=TRUE) {
    stopifnot(length(n_reps) == 1, n_reps >= 1)
    n_reps = as.integer(n_reps)

    runs = lapply(seq_len(n_reps), function(seed) one_run(seed=seed))
    res = data.table(do.call(rbind, runs))
    res[, param := rep(c("mu1", "mu2"), n_reps)]
    pct_bad = colMeans(is.na(res))

    # estimator columns: everything except the truth, the label and the
    # variance estimate
    est_cols = setdiff(names(res), c("mu", "param", "mak_li_var"))
    err = copy(res)  # copy so the := below does not modify res by reference
    err[, (est_cols) := lapply(.SD, "-", err$mu), .SDcols=est_cols]

    bias = err[, lapply(.SD, mean, na.rm=TRUE), by=.(param), .SDcols=est_cols]
    mse = err[, lapply(.SD^2, mean, na.rm=TRUE), by=.(param), .SDcols=est_cols]

    estimated_var = err[, mean(mak_li_var, na.rm=TRUE), by=.(param)]
    # mu is constant within param, so the variance of the error equals the
    # variance of the estimate
    empirical_var = err[, var(mak_li, na.rm=TRUE), by=.(param)]

    if (verbose) {
        print(head(res))
        cat("\n\nbias:\n"); print(bias)
        cat("\n\nmse:\n"); print(mse)
        cat("\n\nestimated_var\n"); print(estimated_var)
        cat("\n\nempirical_var\n"); print(empirical_var)
    }

    # Returned invisibly so a bare simulation() call prints nothing extra,
    # while simulation(verbose=FALSE) still yields usable results.
    invisible(list(
        res=res,
        pct_bad=pct_bad,
        bias=bias,
        mse=mse,
        estimated_var=estimated_var,
        empirical_var=empirical_var
    ))
}

# Run the simulation with its defaults (1000 replications, verbose output).
simulation()

    mu     naive validation    no_y_V  with_y_V    mak_li  mak_li_var param
1: 0.8 0.6222222  0.8695652        NA 0.8551084 0.8576355 0.003876251   mu1
2: 0.4 0.3849057  0.4415584        NA 0.3833648 0.3859078 0.001014543   mu2
3: 0.8 0.6223776  0.7727273        NA 0.8574663 0.8181515 0.005912905   mu1
4: 0.4 0.4163424  0.3717949        NA 0.3939839 0.3974948 0.001034175   mu2
5: 0.8 0.6268657  0.7500000 0.7488856 0.7526997 0.7762974 0.004786532   mu1
6: 0.4 0.4210526  0.4342105 0.4050926 0.4112221 0.4030330 0.001005772   mu2


bias:
   param       naive   validation       no_y_V     with_y_V       mak_li
1:   mu1 -0.18841189 -0.008645615 -0.018648243 -0.003048568 -0.002562847
2:   mu2  0.01314251  0.001977685  0.001351776  0.001719917  0.001912470


mse:
   param       naive  validation      no_y_V     with_y_V       mak_li
1:   mu1 0.037273789 0.008025549 0.010482617 0.0066875830 0.0059830399
2:   mu2 0.001084084 0.002931810 0.001202868 0.0008498328 0.0009464252


estimated_var
   pa