### Synthetic data example

- C: Result of rolling a fair k-sided die.
- A: Flip 1 + k - C fair coins. A is 1 if at least one flip comes up heads, and 0 otherwise.
- Y: Flip C + A fair coins and write down the number of heads.

In [1]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression

In [2]:
def get_smf_model_a_param(ols, df):
    """
    Estimate the treatment coefficient using statsmodels.

      ols: formula string handed to `smf.ols` (e.g. "y ~ a")
      df: data frame supplying the columns named in the formula

    Returns the fitted parameter labelled 'a'.
    """
    fitted = smf.ols(ols, data=df).fit()
    return fitted.params['a']

In [3]:
def get_sklearn_model_a_param(ols, df):
    """
    Fit a model with sklearn
    Return the parameter corresponding to the treatment

      ols: formula of the form "target ~ x1 + x2 + ..."; only plain column
           names joined by "+" are understood (no interactions or transforms)
      df: data frame containing the target and every input column

    Raises IndexError if `ols` contains no "~", and ValueError if "a" is not
    one of the inputs.
    """
    sides = ols.split("~")
    target = sides[0].strip()
    # Split on the bare "+" and strip each term so that spacing around the
    # operator does not matter ("a+c", "a + c" and "a  +  c" all parse alike).
    inputs = [term.strip() for term in sides[1].split("+")]

    model = LinearRegression()
    model.fit(df[inputs], df[target])

    return model.coef_[inputs.index("a")]

In [4]:
def observed(n=100, c_dim=6, ols="y ~ a"):
    """
    Draw `n` samples from the observational distribution and regress on them.
      C: outcome of a fair `c_dim`-sided die
      A: 1 when at least one of `1 + c_dim - C` fair coins lands heads, else 0
      Y: number of heads among `C + A` fair coins
    Returns the coefficient on `a` from fitting `ols` with statsmodels.
    """
    die = np.random.randint(1, 1 + c_dim, n)
    # The number of coins flipped for A shrinks as the die roll grows.
    heads = np.random.binomial(n=1 + c_dim - die, p=0.5, size=n)
    treated = (heads >= 1).astype(np.int32)
    outcome = np.random.binomial(n=treated + die, p=0.5)

    samples = pd.DataFrame(data={"c": die, "a": treated, "y": outcome})
    # get_sklearn_model_a_param(ols, samples) takes the same arguments and
    # can be swapped in here to fit with sklearn instead.
    return get_smf_model_a_param(ols, samples)

In [5]:
def randomized(n=100, c_dim=6, ols="y ~ a"):
    """
    Draw `n` samples with the treatment assigned at random, then regress.
      C: outcome of a fair `c_dim`-sided die
      A: 1 when a single fair coin lands heads, else 0 (does not depend on C)
      Y: number of heads among `C + A` fair coins
    Returns the coefficient on `a` from fitting `ols` with statsmodels.
    """
    die = np.random.randint(1, 1 + c_dim, n)
    treated = np.random.binomial(n=1, p=0.5, size=n)
    outcome = np.random.binomial(n=treated + die, p=0.5)

    samples = pd.DataFrame(data={"c": die, "a": treated, "y": outcome})
    return get_smf_model_a_param(ols, samples)

In [6]:
def experiment(dist, n=100, c_dim=6, ols="y ~ a", repeats=1):
    """
    Repeatedly estimate the coefficient on `a` and print a summary.
      dist: which sampler to use, "observed" or "randomized"
      n: sample size for each repeat
      c_dim: number of sides of the die that generates C
      ols: regression formula, e.g. "y ~ a" or "y ~ a + c"
      repeats: how many estimates to compute
    Prints the mean estimate, followed by " ± std" when repeats > 1.
    Raises ValueError for an unrecognised `dist`.
    """
    if dist not in ("observed", "randomized"):
        raise ValueError(dist)
    sampler = observed if dist == "observed" else randomized

    # Reseed on every call so that each experiment is reproducible.
    np.random.seed(42)
    estimates = [sampler(n=n, c_dim=c_dim, ols=ols) for _ in range(repeats)]

    spread = f" ± {np.std(estimates):.3f}" if repeats > 1 else ""
    print(f"{np.mean(estimates):.3f}{spread}")

In [13]:
# Randomized A, large sample (n=1000), 10 repeats. A adds one fair coin to Y,
# so the coefficient on `a` should be near 0.5.
experiment("randomized", n=1000, c_dim=6, ols="y ~ a", repeats=10)

0.546 ± 0.055


In [14]:
# Randomized A again, but with tiny samples (n=10) and 1000 repeats, to show
# how much the individual estimates vary around the mean.
experiment("randomized", n=10, c_dim=6, ols="y ~ a", repeats=1000)

0.454 ± 0.847
