# Data generating process (example/test)

In [1]:
import numpy as np
import pandas as pd
import os
import sys
# Put the parent directory on the module search path before the project import
# below; presumably that is where the local `deconfounder` package lives —
# confirm against the repository layout.
sys.path.append("..")
### IMPORTS
from deconfounder.deconfounder_v2 import DeconfounderTreeV2
import time

# Seed NumPy's global RNG so the np.random.* draws used to simulate the data
# are repeatable when the notebook is run top to bottom on a fresh kernel.
np.random.seed(42)


Generating simulated data

In [2]:
sample_size = 100000

# NOTE: the order of the random draws (t, x1, x2, then y) is kept fixed so the
# simulated data stays identical for a given global NumPy seed.

# Treatment is assigned by a fair coin flip, independently of the covariates.
t = np.random.binomial(n=1, p=0.5, size=sample_size)
# Two independent binary covariates, each equal to 1 with probability 0.5.
x1 = np.random.binomial(n=1, p=0.5, size=sample_size)
x2 = np.random.binomial(n=1, p=0.5, size=sample_size)

# True treatment effect depends only on x1:
#   x1 = 0 -> effect = -1 (not treating is better)
#   x1 = 1 -> effect = +1 (treating is better)
effects = 2 * x1 - 1

# Expected outcome: a baseline driven by x1, plus the effect for treated units.
# Algebraically the same as 4 - 3*x1 - t + 2*t*x1.
mu = 4 - 3 * x1 + t * effects

# Biased effect predictions: the bias is 2*x2, i.e. 0 when x2 = 0 and 2 when x2 = 1.
preds = effects + 2 * x2

# Observed outcome: unit-variance Gaussian noise around the expected outcome.
y = np.random.normal(loc=mu, scale=1, size=sample_size)

data = pd.DataFrame(
    {
        "x1": x1,
        "x2": x2,
        "t": t,
        "y": y,
        "pred": preds.astype(np.float64),  # stored as floats
        "c": effects.copy(),               # true effect
        "mu": mu,
    }
)
data.head(20)

Unnamed: 0,x1,x2,t,y,pred,c,mu
0,1,0,0,0.917365,1.0,1,1
1,1,0,1,0.871434,1.0,1,2
2,0,0,1,2.652828,-1.0,-1,3
3,0,0,1,3.69976,-1.0,-1,3
4,0,0,0,3.508528,-1.0,-1,4
5,1,0,0,0.938702,1.0,1,1
6,1,0,0,-0.673316,1.0,1,1
7,1,1,1,3.043559,3.0,1,2
8,0,1,1,3.780937,1.0,-1,3
9,1,0,1,3.844283,1.0,1,2


Fit the deconfounder tree on the simulated data

In [3]:
# Assemble the inputs for the deconfounder: the two covariates plus the
# treatment indicator (renamed from 't' to 'treated'), the observed outcome,
# and the biased effect predictions as a plain NumPy array.
feature_columns = ['x1', 'x2', 't']
X_exp = data[feature_columns].rename(columns={'t': 'treated'})
y_exp = data['y']
eff_pred = data['pred'].values

# Time the fit with wall-clock timestamps.
start_time = time.time()
deconfounder = DeconfounderTreeV2(
    min_weight_fraction_leaf=0.1,
    random_state=42,
)
deconfounder.fit(X_exp, y_exp, eff_pred)
fit_seconds = time.time() - start_time
print("--- Time to fit (and tune) causal tree %s seconds ---" % fit_seconds)

# Count how many rows receive each distinct prediction, formatted to 7 decimals.
leaf_predictions = pd.Series(deconfounder.predict(X_exp))
leaf_predictions.map('{:,.7f}'.format).value_counts()

--- Time to fit (and tune) causal tree 0.14861607551574707 seconds ---


inf          49834
2.9999999    25102
0.9999999    25064
dtype: int64

In [4]:
from sklearn.tree import export_text

# Dump the fitted tree's split rules as indented text.
# Only x1 and x2 are named as features; presumably the tree does not split on
# the 'treated' column — confirm against DeconfounderTreeV2.
split_feature_names = ['x1', 'x2']
tree_rules = export_text(deconfounder, feature_names=split_feature_names)
print(tree_rules)

|--- x1 <= 0.50
|   |--- value: [inf]
|--- x1 >  0.50
|   |--- x2 <= 0.50
|   |   |--- value: [1.00]
|   |--- x2 >  0.50
|   |   |--- value: [3.00]



In [9]:
# Compare three treatment policies by the mean of (true effect x treat flag).
# Untreated individuals contribute 0, so each figure is an average over the
# whole population rather than over the treated subset only.
print("Average effects of individuals we decide to treat")

# Policy 1: treat whenever the raw prediction is positive.
treat_uncorrected = data.pred > 0
value = (data.c * treat_uncorrected).mean()
print(f"without any correction: {value}")

# Policy 2: remove the 2*x2 bias term by hand before thresholding.
treat_manual = (data.pred - 2 * data.x2) > 0
value = (data.c * treat_manual).mean()
print(f"corrected manually: {value}")

# Policy 3: subtract the deconfounder's prediction from the raw prediction.
# Where the model predicts inf, the corrected score is -inf and the row is
# never treated.
corr = eff_pred - deconfounder.predict(X_exp)
treat_deconfounded = corr > 0
value = (data.c * treat_deconfounded).mean()
print(f"corrected by deconfounder: {value}")

Average effects of individuals we decide to treat
without any correction: 0.25166
corrected manually: 0.50166
corrected by deconfounder: 0.50166
