In [4]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

As an example, we will use the following dataset:

- Treatment T: sodium intake (which can be binarized as `sodium intake > 3.5`)
- Outcome Y: systolic blood pressure
- Covariates X: age and protein in urine

We simulate the data using the following ground-truth equations:

$$bloodPressure = \beta_1 \cdot sodium + 2 \cdot age$$

$$protein = \alpha_1 \cdot sodium + \alpha_2 \cdot bloodPressure$$



In [5]:
def generate_data(n=1000, seed=0, beta1=1.05, alpha1=0.4, alpha2=0.3, binary_treatment=True, binary_cutoff=3.5):
    """Simulate the sodium / blood-pressure dataset from the ground-truth equations.

    The data are generated as::

        age            = Normal(65, 5)
        sodium         = age / 18 + noise
        blood_pressure = beta1 * sodium + 2 * age + noise
        proteinuria    = alpha1 * sodium + alpha2 * blood_pressure + noise

    where each ``noise`` term is a separate standard-normal draw of size ``n``.

    Parameters
    ----------
    n : int
        Number of rows to simulate.
    seed : int
        Seed of the random generator. The draws are identical to those obtained
        after ``np.random.seed(seed)``, but NumPy's global random state is left
        untouched.
    beta1 : float
        Coefficient of ``sodium`` in the blood-pressure equation.
    alpha1, alpha2 : float
        Coefficients of ``sodium`` and ``blood_pressure`` in the proteinuria
        equation.
    binary_treatment : bool
        If true, ``sodium`` is replaced by the 0/1 indicator
        ``sodium > binary_cutoff`` before the outcome is generated.
    binary_cutoff : float or None
        Threshold used to binarize ``sodium``. ``None`` means "use the sample
        mean of the continuous sodium values". Ignored when
        ``binary_treatment`` is false.

    Returns
    -------
    pandas.DataFrame
        Columns ``blood_pressure``, ``sodium``, ``age`` and ``proteinuria``.
    """
    # A private legacy generator reproduces the np.random.seed(seed) stream
    # without reseeding the process-wide RNG as a side effect.
    rng = np.random.RandomState(seed)

    # Draw order matters for reproducibility: age, sodium noise,
    # blood-pressure noise, proteinuria noise.
    age = rng.normal(65, 5, n)
    sodium = age / 18 + rng.normal(size=n)
    if binary_treatment:
        if binary_cutoff is None:
            binary_cutoff = sodium.mean()
        sodium = (sodium > binary_cutoff).astype(int)
    blood_pressure = beta1 * sodium + 2 * age + rng.normal(size=n)
    proteinuria = alpha1 * sodium + alpha2 * blood_pressure + rng.normal(size=n)
    # A binary outcome such as (blood_pressure >= 140) could be derived here
    # if needed; it is not part of the returned frame.
    return pd.DataFrame({'blood_pressure': blood_pressure, 'sodium': sodium,
                         'age': age, 'proteinuria': proteinuria})

def estimate_causal_effect(Xt, y, model=None, treatment_idx=0, regression_coef=False):
    """Estimate the average effect of one column of ``Xt`` on ``y``.

    The model is fit on ``(Xt, y)``. The effect is then either read off the
    fitted coefficient or computed as the mean difference between predictions
    with the treatment column forced to 1 and forced to 0.

    Parameters
    ----------
    Xt : pandas.DataFrame
        Treatment and covariate columns. Not modified.
    y : array-like
        Outcome values aligned with the rows of ``Xt``.
    model : estimator or None
        Object exposing ``fit(X, y)`` and ``predict(X)`` (and ``coef_`` when
        ``regression_coef`` is true). ``None`` creates a new
        ``LinearRegression`` for this call.
    treatment_idx : int
        Position of the treatment column within ``Xt.columns``.
    regression_coef : bool
        If true, return ``model.coef_[treatment_idx]`` instead of the
        prediction-difference estimate.

    Returns
    -------
    float
        The estimated effect.
    """
    if model is None:
        # Build the estimator per call. A default instance in the signature is
        # created once at definition time and would be shared, and refit, by
        # every call that relies on the default.
        model = LinearRegression()
    model.fit(Xt, y)

    if regression_coef:
        return model.coef_[treatment_idx]

    treatment_col = Xt.columns[treatment_idx]
    # Two counterfactual copies of the data: everyone treated / nobody treated.
    Xt1 = Xt.copy()
    Xt1[treatment_col] = 1
    Xt0 = Xt.copy()
    Xt0[treatment_col] = 0
    return (model.predict(Xt1) - model.predict(Xt0)).mean()

In [12]:
# Coefficient of sodium in the simulated blood-pressure equation; the
# relative-error printout further down compares estimates against it.
beta1 = 1.05

df = generate_data(
    n=10 ** 7,
    beta1=beta1,
    alpha1=0.4,
    alpha2=0.3,
    binary_treatment=True,
)
df.head()


Unnamed: 0,blood_pressure,sodium,age,proteinuria
0,146.809261,1,73.820262,43.118187
1,133.14517,0,67.000786,37.669696
2,139.28964,0,69.89369,41.37032
3,153.894444,1,76.204466,47.021801
4,150.615711,1,74.33779,46.574317


In [11]:
# Pairwise Pearson correlations between the columns of the simulated data.
df.corr(method="pearson")

Unnamed: 0,blood_pressure,sodium,age,proteinuria
blood_pressure,1.0,0.260983,0.993889,0.95014
sodium,0.260983,1.0,0.213186,0.304685
age,0.993889,0.213186,1.0,0.941516
proteinuria,0.95014,0.304685,0.941516,1.0


We will estimate the effect of sodium intake on blood pressure. Our data also contains `age` and the amount of `protein in urine`, which we will include as input variables of our model since, in practice, we would not know the ground-truth equations.

Because we are running simulations, we know that the true Average Treatment Effect (ATE) of sodium on blood pressure is $\beta_1$.


Now, how do we actually estimate the ATE? First, we assume consistency, positivity, and unconfoundedness given X. This means that we’ve identified the ATE as

$$\widehat{ATE} = \mathbb{E}_X \Big[\mathbb{E}[Y \mid T = 1, X] - \mathbb{E}[Y \mid T = 0, X]\Big]$$

For that, we need to fit a model to estimate $\mathbb{E}[Y | t, x]$. However, in a real-world scenario we don't know that blood pressure depends only on sodium and age. So we would include the amount of protein in urine, in order to use all the data available in the dataset, and then compute the estimate as an empirical mean over X.

In [24]:
def estimate_avg_causal_effect(covariates, outcome, treatment, data=None, true_effect=None, model=None):
    """Fit a regression and print the estimated average causal effect.

    A model for E[Y | t, x] is fit on ``data[covariates]`` against
    ``data[outcome]``. The effect is the mean difference between predictions
    with the ``treatment`` column set to 1 and set to 0. The estimate and its
    relative error with respect to ``true_effect`` are printed; nothing is
    returned.

    Parameters
    ----------
    covariates : list of str
        Model input columns. Must include ``treatment``.
    outcome : str
        Name of the outcome column.
    treatment : str
        Name of the treatment column.
    data : pandas.DataFrame or None
        Dataset to use. ``None`` falls back to the notebook-level ``df``.
    true_effect : float or None
        Reference value for the relative error. ``None`` falls back to the
        notebook-level ``beta1``.
    model : estimator or None
        Object exposing ``fit(X, y)`` and ``predict(X)``. ``None`` creates a
        new ``LinearRegression``.

    Raises
    ------
    ValueError
        If ``treatment`` is not one of ``covariates``.
    """
    # Fall back to the notebook globals so existing three-argument calls keep
    # working; passing the values explicitly avoids relying on hidden state.
    if data is None:
        data = df
    if true_effect is None:
        true_effect = beta1
    if treatment not in covariates:
        # Otherwise the assignments below would append a new column and the
        # model would be asked to predict on inputs it was not fit on.
        raise ValueError(
            "treatment {!r} must be one of the covariates {!r}".format(treatment, covariates)
        )
    if model is None:
        model = LinearRegression()

    # Fitting model to estimate E[Y|t,x]
    Xt = data[covariates]
    y = data[outcome]
    model.fit(Xt, y)

    # Estimating average causal effect: everyone treated vs. nobody treated
    Xt1 = Xt.copy()
    Xt1[treatment] = 1
    Xt0 = Xt.copy()
    Xt0[treatment] = 0
    ate_est = np.mean(model.predict(Xt1) - model.predict(Xt0))
    print('ACE estimate:', ate_est)
    print ('Relative error: {:.01f}%'.format(100*abs(ate_est-true_effect)/true_effect))

# Use every available column as model input. Note that generate_data builds
# proteinuria from both sodium and blood_pressure.
estimate_avg_causal_effect(covariates=['sodium', 'age', 'proteinuria'],
                           outcome='blood_pressure',
                           treatment='sodium')


ACE estimate: 0.8537946431496021
Relative error: 18.7%


In [22]:
# Treatment as the only model input: no adjustment for age, which enters both
# the sodium and the blood_pressure equations in generate_data.
estimate_avg_causal_effect(covariates=['sodium'],
                           outcome='blood_pressure',
                           treatment='sodium')

ACE estimate: 5.328501680864975
Relative error: 407.5%
