### Missing Data

- This adds a missing data sampling process to our simple synthetic dataset
- The underlying p(C, A, Y) distribution can be randomized or not: with `randomized=True`, A is a fair coin flip drawn independently of C; otherwise A depends on C
- The `missingness` keyword (MCAR, MAR, or MNAR) specifies the type of missingness on C.

In [78]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from statsmodels.imputation import mice

In [151]:
def synthetic_data(n=100, c_dim=6, missingness="mcar", randomized=False):
    """
    Sample a synthetic (C, A, Y) dataset and mask some values of C.

    Args:
        n: how many samples from the distribution
        c_dim: how many sides to the die being rolled for C
        missingness: either mcar, mar, or mnar (case-insensitive)
        randomized: if True, A is a single fair coin flip drawn
                    independently of C

    The data distribution:
      C: roll a c_dim-sided die and record the result
      A: flip `1 + c_dim - C` fair coins, and record 1 if at least one flip
         lands heads (when `randomized`, flip one fair coin instead)
      Y: flip `C + A` fair coins, and record the number of heads
      R_C: 1 if C is observed, 0 if C is missing
        mcar: C is observed with probability 0.9
        mar:  flip 2 + Y coins; if fewer than 2 heads, C is missing
        mnar: C is observed with probability C / c_dim when A = 1
              and 1 - C / c_dim when A = 0

    Returns:
        A DataFrame with columns c, a, y, r_c and one row per sample.
        c is NaN wherever r_c is 0; a and r_c are int32 0/1 indicators.

    Raises:
        ValueError: if `missingness` is not mcar, mar, or mnar. This is
            checked before any sampling, so a bad argument does not
            advance the global NumPy random state.
    """

    # Fail fast: reject a bad mechanism before consuming random numbers.
    missingness = missingness.lower()
    if missingness not in ("mcar", "mar", "mnar"):
        raise ValueError(f"Unknown missingness: {missingness}")

    # NOTE: the order of the random draws below (c, a, y, r_c) is kept
    # fixed so that seeded runs reproduce the same values.
    c = np.random.randint(1, 1 + c_dim, n)
    if randomized:
        a = np.random.binomial(n=1, p=0.5, size=n)
    else:
        a = np.random.binomial(n=1 + c_dim - c, p=0.5, size=n) > 0
    # Same dtype for the treatment indicator in both branches.
    a = a.astype(np.int32)

    y = np.random.binomial(n=a + c, p=0.5)

    if missingness == "mcar":
        r_c = np.random.binomial(n=1, p=0.9, size=n)
    elif missingness == "mar":
        # Depends only on the fully observed outcome y.
        r_c = np.random.binomial(n=2 + y, p=0.5, size=n) >= 2
    else:  # mnar
        # Depends on c itself, the variable being masked.
        r_c_prob = a * (c / c_dim) + (1 - a) * (1 - c / c_dim)
        r_c = np.random.binomial(n=1, p=r_c_prob, size=n)
    # Same dtype for the observation indicator in every branch.
    r_c = r_c.astype(np.int32)

    # Masking with NaN turns the c column into floats.
    c = np.where(r_c, c, np.nan)

    return pd.DataFrame(data=dict(c=c, a=a, y=y, r_c=r_c))

In [154]:
def experiment(n=100, c_dim=6, missingness="mcar", randomized=False,
               impute_model="a + y", ols_model="y ~ a + c",
               mice_repeats=1, data_repeats=1, verbose=False):
    """
    Estimate the coefficient on `a` after imputing the missing values of `c`,
        using datasets sampled from the specified synthetic distribution,
        and print a summary of the estimates.
    Args:
        n, c_dim, missingness, randomized: same as in `synthetic_data`
        impute_model: features to use when imputing c
        ols_model: formula of the regression fitted on each imputed dataset
        data_repeats: how many datasets to sample from the distribution
        mice_repeats: for each dataset sampled with `data_repeats`,
                      how many imputed datasets to sample
        verbose: if True, print for each sampled dataset the fraction of
                 observed c values at every value of y

    Prints the mean estimate; when more than one estimate was collected,
    the standard deviation over all of them (pooled across datasets and
    imputations) is appended after a "±". Nothing is returned.

    Note: the global NumPy seed is reset inside both loops (to the dataset
    index, then to the imputation index), so runs are reproducible but the
    caller's random state is overwritten.
    """

    estimates = []
    for data_seed in range(data_repeats):
        # Dataset number `data_seed` is always generated from the same seed.
        np.random.seed(data_seed)
        df = synthetic_data(
            n=n, c_dim=c_dim,
            missingness=missingness,
            randomized=randomized)

        if verbose:
            print("Missingness distribution")
            print("y_val  E[R_c | Y=y_val]")
            for y_val in sorted(np.unique(df["y"])):
                c_at_y = df["c"][df["y"] == y_val]
                observed_frac = 1 - np.mean(np.isnan(c_at_y))
                print(f"{y_val:5d} {observed_frac:.1f}")

        for mice_seed in range(mice_repeats):
            # Re-seed so imputation number `mice_seed` is reproducible too.
            np.random.seed(mice_seed)
            imputed = mice.MICEData(df, perturbation_method="boot")
            imputed.set_imputer('c', formula=impute_model, model_class=sm.GLM)
            imputed.update_all(5)
            fit = smf.ols(ols_model, data=imputed.data).fit()
            estimates.append(fit.params["a"])

    # A spread is only meaningful with at least two estimates.
    spread = f" ± {np.std(estimates):.3f}" if len(estimates) > 1 else ""
    print(f"{np.mean(estimates):.3f}{spread}")

### Experiments

The code below loops over the three missingness models and runs the same analysis on the datasets produced by each. While you can play around with any of the keyword args, here are a few things to try:
- What happens if you replace `impute_model` with just `"a"` or just `"y"`? Why?
- Which of `n`, `data_repeats`, or `mice_repeats` affects your estimates the most when you change it? Why?

In [173]:
# Settings shared by every run; only the missingness mechanism varies.
kwargs = dict(
    n=100,
    data_repeats=10,
    mice_repeats=10,
    ols_model="y ~ a + c",
    impute_model="a + y",
    randomized=False,
    verbose=False,
)

# Run the same analysis once per mechanism, labelling each printed result.
for m in ("mcar", "mar", "mnar"):
    kw = {**kwargs, "missingness": m}
    print(m)
    experiment(**kw)

mcar
0.539 ± 0.141
mar
0.535 ± 0.125
mnar
-0.263 ± 0.182
