### Propensity score + synthetic data

- C: Result of rolling a fair k-sided die.
- A: Flip 1 + k - C fair coins. A is 1 if at least one flip comes up heads, and 0 otherwise.
- Y: Flip C + A fair coins and record the number of heads.

In [49]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

In [16]:
def get_sklearn_model(ols, df):
    """
    Fit a logistic regression with sklearn and score the rows it was fit on

    `ols` is a minimal formula string of the form "target ~ x1 + x2".
    The column named left of "~" is the label; the "+"-separated names on
    the right are the feature columns. Whitespace around terms is ignored,
    and no other formula syntax (intercepts, interactions) is interpreted:
    every term must be a column of `df`.

    Returns a 1-d array with one entry per row of `df`: the fitted
    probability of the second class in `model.classes_`, which for a 0/1
    target is the estimated P(target = 1 | inputs).

    Raises ValueError if `ols` has no "~", no target, or no input terms.
    """
    target, tilde, rhs = ols.partition("~")
    if not tilde:
        raise ValueError(f"formula must look like 'target ~ x1 + x2': {ols!r}")

    target = target.strip()
    # Split on a bare "+" so that "c+d" and "c + d" are treated alike.
    inputs = [term.strip() for term in rhs.split("+") if term.strip()]
    if not target or not inputs:
        raise ValueError(f"formula needs a target and at least one input: {ols!r}")

    model = LogisticRegression(solver="lbfgs")
    model.fit(df[inputs], df[target])

    # Column 1 of predict_proba is the probability of model.classes_[1].
    return model.predict_proba(df[inputs])[:, 1]

In [7]:
def observed(n=100, c_dim=6):
    """
    Draw `n` samples from the observational (confounded) distribution
      C: outcome of rolling a `c_dim`-sided die
      A: 1 if any of `1 + c_dim - C` fair coin flips lands heads, else 0
      Y: number of heads among `C + A` fair coin flips

    Returns a DataFrame with columns `c`, `a` and `y`.
    """
    die_rolls = np.random.randint(1, 1 + c_dim, n)

    # The number of coins deciding treatment shrinks as the die roll grows.
    treatment_coins = 1 + c_dim - die_rolls
    heads_seen = np.random.binomial(n=treatment_coins, p=0.5, size=n)
    treated = (heads_seen > 0).astype(np.int32)

    outcome = np.random.binomial(n=treated + die_rolls, p=0.5)

    return pd.DataFrame(data={"c": die_rolls, "a": treated, "y": outcome})

In [8]:
def randomized(n=100, c_dim=6):
    """
    Draw `n` samples from the randomized version of the distribution,
    where A no longer depends on C
      C: outcome of rolling a `c_dim`-sided die
      A: 1 if a single fair coin flip lands heads, else 0
      Y: number of heads among `C + A` fair coin flips

    Returns a DataFrame with columns `c`, `a` and `y`.
    """
    die_rolls = np.random.randint(1, 1 + c_dim, n)
    treated = np.random.binomial(n=1, p=0.5, size=n)
    outcome = np.random.binomial(n=treated + die_rolls, p=0.5)

    return pd.DataFrame(data={"c": die_rolls, "a": treated, "y": outcome})

In [50]:
def _ipw_estimate(df, formula):
    """
    Inverse-propensity-weighted difference in mean outcome between A=1 and A=0

    The propensity P(A=1 | inputs) comes from `get_sklearn_model(formula, df)`.
    """
    scored = df.assign(a_prob=get_sklearn_model(formula, df))

    treated = scored[scored["a"] == 1]
    control = scored[scored["a"] == 0]
    # Each row is weighted by 1 / P(the treatment it actually received).
    weighted_y_a1 = np.sum(treated["y"] / treated["a_prob"])
    weighted_y_a0 = np.sum(control["y"] / (1 - control["a_prob"]))

    # Both sums are normalised by the full sample size, not the arm sizes.
    return (weighted_y_a1 - weighted_y_a0) / scored.shape[0]


def _stratified_estimate(df):
    """
    Size-weighted average over values of C of mean(Y | A=1) - mean(Y | A=0)

    Strata in which only one value of A was seen are skipped.
    Returns NaN when every stratum is skipped.
    """
    total = 0
    denominator = 0
    unique_c, counts = np.unique(df["c"], return_counts=True)

    for value, count in zip(unique_c, counts):
        stratum = df[df["c"] == value]
        # Without both arms present there is no within-stratum contrast
        # (this also covers strata that contain a single row).
        if stratum["a"].nunique() < 2:
            continue
        e_y_a1 = stratum[stratum["a"] == 1]["y"].mean()
        e_y_a0 = stratum[stratum["a"] == 0]["y"].mean()
        total += count * (e_y_a1 - e_y_a0)
        denominator += count

    if denominator == 0:
        return np.nan
    return total / denominator


def experiment(dist, n=100, c_dim=6, method="a ~ c", repeats=1):
    """
    Run an experiment with the given kwargs and print the estimated effect
      dist: either "observed" or "randomized" distribution
      n: the number of samples to draw from the distribution
      c_dim: possible values that C can take (number of sides of the die)
      method: method for estimation: either "count" (stratify on C) or a
        formula such as "a ~ c" (inverse propensity weighting, with the
        propensity fit from that formula)
      repeats: number of independent datasets to draw and estimate on

    Prints the mean estimate over the repeats to three decimals, followed by
    " ± <std>" when `repeats > 1`. Returns nothing.

    The global NumPy seed is reset to 42 on every call, so identical
    arguments print identical output.

    Raises ValueError if `dist` or `method` is not one of the options above.
    """
    if dist not in ("observed", "randomized"):
        raise ValueError(dist)
    # Reject unknown methods up front rather than collecting no results and
    # printing "nan".
    if "~" not in method and method != "count":
        raise ValueError(
            f"unknown method {method!r}; expected 'count' or a formula like 'a ~ c'"
        )

    func = observed if dist == "observed" else randomized

    np.random.seed(42)
    results = []
    for _ in range(repeats):
        df = func(n=n, c_dim=c_dim)
        if "~" in method:
            results.append(_ipw_estimate(df, method))
        else:
            # Open question from the original notebook: what if we match on
            # a_prob instead of on c?
            results.append(_stratified_estimate(df))

    err = ""
    if repeats > 1:
        err = f" ± {np.std(results):.3f}"
    print(f"{np.mean(results):.3f}{err}")

In [46]:
# Confounded data, estimate by stratifying on C. Y counts heads over C + A
# fair coins, so switching A from 0 to 1 adds 0.5 expected heads.
experiment("observed", n=10000, c_dim=2, method="count", repeats=10)

0.506 ± 0.014


In [47]:
# Same confounded data and seed as the "count" run, but estimated by
# inverse propensity weighting with the propensity fit from "a ~ c".
experiment("observed", n=10000, c_dim=2, method="a ~ c", repeats=10)

0.506 ± 0.014


In [34]:
# Randomized data (A is a fair coin independent of C), only 100 samples per
# repeat, estimated by inverse propensity weighting with "a ~ c".
experiment("randomized", n=100, c_dim=2, method="a ~ c", repeats=10)

0.588 ± 0.139
