In [4]:
import numpy as np
import pandas as pd

In [None]:
# DAG used to generate the data (every variable is binary):

# A -> B -> C
# X -> A <- Y
# J -> B <- K

In [5]:
def get_params(intercept=0.5, std=0.1):
    """Draw a random coefficient set for the DAG's linear probability model.

    Keys of the form ``"1->v"`` are the baseline terms for node ``v`` and are
    centred on ``intercept``; keys of the form ``"u->v"`` are edge weights and
    are centred on zero. Every value gets its own Gaussian noise draw with
    standard deviation ``std`` from the global ``np.random`` state.

    Args:
        intercept: Centre of the three baseline terms.
        std: Standard deviation of the noise used for every entry.

    Returns:
        dict mapping the nine coefficient names to floats.
    """
    baseline_keys = ("1->a", "1->b", "1->c")
    edge_keys = ("x->a", "y->a", "a->b", "j->b", "k->b", "b->c")

    coefficients = {}
    # One scalar draw per key, baselines first, so the sequence of RNG calls
    # (and therefore the result under a fixed seed) is well defined.
    for key in baseline_keys:
        coefficients[key] = intercept + np.random.normal(0, std)
    for key in edge_keys:
        coefficients[key] = np.random.normal(0, std)
    return coefficients

In [12]:
def sample(params=None, n=100):
    """Draw ``n`` joint samples of the binary variables a, b, c, x, y, j, k.

    x, y, j and k are independent fair coin flips. a, b and c are Bernoulli
    draws whose success probability is a linear function of their parents:

        p(a=1) = 1->a + x->a * x + y->a * y
        p(b=1) = 1->b + j->b * j + k->b * k + a->b * a
        p(c=1) = 1->c + b->c * b

    Args:
        params: Mapping with the nine coefficient keys used above. When
            ``None``, a random set is drawn with ``get_params()``.
        n: Number of rows to sample.

    Returns:
        pandas.DataFrame of shape ``(n, 7)`` with flat string column labels
        ``a, b, c, x, y, j, k``.

    Raises:
        KeyError: If ``params`` lacks one of the coefficient keys.

    Note:
        The probabilities are not clipped. ``np.random.binomial`` rejects
        values of ``p`` outside [0, 1], so coefficient sets whose linear
        combination leaves that range will fail inside NumPy.
    """
    if params is None:
        params = get_params()

    # Four exogenous fair coins; each split is an (n, 1) column so it
    # broadcasts against the (n, 1) draws below.
    independent = np.random.binomial(n=1, p=0.5, size=(n, 4))
    x, y, j, k = np.array_split(independent, 4, axis=1)

    a_prob = params["1->a"] + params["x->a"] * x + params["y->a"] * y
    a = np.random.binomial(n=1, p=a_prob, size=(n, 1))

    b_prob = params["1->b"] + params["j->b"] * j + params["k->b"] * k + params["a->b"] * a
    b = np.random.binomial(n=1, p=b_prob, size=(n, 1))

    c_prob = params["1->c"] + params["b->c"] * b
    c = np.random.binomial(n=1, p=c_prob, size=(n, 1))

    arr = np.concatenate([a, b, c, x, y, j, k], axis=1)
    # Pass the names as a flat list. Wrapping them in another list makes
    # pandas build single-level MultiIndex columns, so labels become 1-tuples.
    return pd.DataFrame(arr, columns="a b c x y j k".split(" "))

In [73]:
# Seed the global NumPy RNG so the sampled data, and every probability printed
# from it in later cells, is reproducible on Restart & Run All.
np.random.seed(0)

# Hand-picked coefficients. With binary parents every node's probability stays
# inside [0, 1]: a in [0.05, 0.4], b in [0.05, 0.8], c in [0.2, 0.6].
params = {
    "1->a": 0.2,
    "1->b": 0.2,
    "1->c": 0.2,
    "x->a": 0.2,
    "y->a": -0.15,
    "a->b": 0.4,
    "j->b": 0.2,
    "k->b": -0.15,
    "b->c": 0.4,
}
df = sample(n=10_000_000, params=params)
df.head()

Unnamed: 0,a,b,c,x,y,j,k
0,0,0,1,0,1,0,1
1,0,0,0,1,0,0,1
2,1,1,1,1,1,1,1
3,0,1,1,1,0,0,0
4,0,1,1,0,1,1,1


In [81]:
def check_independence(df, var1, var2, where=""):
    """Print p(var1)p(var2) next to p(var1, var2) for a visual comparison.

    The joint term is the share of rows in which the two columns sum to 2,
    i.e. both equal 1 when the columns hold 0/1 values (assumed here, not
    checked). The marginals are the column means. Nothing is returned; the
    two numbers are printed with three decimals.

    Args:
        df: Frame containing the columns ``var1`` and ``var2``.
        var1: Name of the first column.
        var2: Name of the second column.
        where: Text appended inside each ``p(...)`` label, e.g. a
            conditioning clause such as ``"| a=1"``.
    """
    both_set = df[[var1, var2]].sum(axis=1) == 2
    joint = np.mean(both_set)
    marginal_product = np.mean(df[var1]) * np.mean(df[var2])

    product_part = f"p({var1}{where})p({var2}{where}) = {marginal_product:.3f};"
    joint_part = f" p({var1}, {var2}{where}) = {joint:.3f}"
    print(product_part + joint_part)

In [61]:
def check_conditional_independence(df, var1, var2, cond_var):
    """Run ``check_independence`` within each stratum of ``cond_var``.

    Rows where ``cond_var`` equals 1 are reported first, then rows where it
    equals 0. Each report is labelled with the conditioning clause
    ``"| <cond_var>=<value>"``.

    Args:
        df: Frame containing ``var1``, ``var2`` and ``cond_var``.
        var1: Name of the first column to compare.
        var2: Name of the second column to compare.
        cond_var: Name of the column to condition on.
    """
    for level in (1, 0):
        # Select matching rows by position.
        row_positions = np.where(df[cond_var] == level)[0]
        stratum = df.iloc[row_positions]
        check_independence(stratum, var1, var2, where=f"| {cond_var}={level}")

In [80]:
# x and y are the two parents of a (X -> A <- Y in the DAG comment above).
# Compare them marginally, then within each stratum of a, b and c.
check_independence(df, "x", "y")
for cond_var in ("a", "b", "c"):
    check_conditional_independence(df, "x", "y", cond_var)

p(x)p(y) = 0.250;p(x, y) = 0.250
p(x| a=1)p(y| a=1) = 0.241;p(x, y| a=1) = 0.278
p(x| a=0)p(y| a=0) = 0.239;p(x, y| a=0) = 0.242
p(x| b=1)p(y| b=1) = 0.250;p(x, y| b=1) = 0.250
p(x| b=0)p(y| b=0) = 0.250;p(x, y| b=0) = 0.250
p(x| c=1)p(y| c=1) = 0.250;p(x, y| c=1) = 0.250
p(x| c=0)p(y| c=0) = 0.250;p(x, y| c=0) = 0.250


In [75]:
# j and k are the two exogenous parents of b (J -> B <- K in the DAG comment above).
# Compare them marginally, then within each stratum of a, b and c.
check_independence(df, "j", "k")
for cond_var in ("a", "b", "c"):
    check_conditional_independence(df, "j", "k", cond_var)

p(j)p(k) = 0.250; p(j, k) = 0.250
p(j| a=1)p(k| a=1) = 0.250; p(j, k| a=1) = 0.250
p(j| a=0)p(k| a=0) = 0.250; p(j, k| a=0) = 0.250
p(j| b=1)p(k| b=1) = 0.241; p(j, k| b=1) = 0.278
p(j| b=0)p(k| b=0) = 0.239; p(j, k| b=0) = 0.242
p(j| c=1)p(k| c=1) = 0.255; p(j, k| c=1) = 0.259
p(j| c=0)p(k| c=0) = 0.246; p(j, k| c=0) = 0.247
