```
Copyright (c) Gradient Institute. All rights reserved.
Licensed under the Apache 2.0 License.
```


This notebook tests the following fix for estimating treatment effects in regularised linear models:

Hahn, P.R., Carvalho, C.M., Puelz, D., He, J., 2018. Regularization and Confounding in Linear Regression for Treatment Effect Estimation. Bayesian Anal. 13. https://doi.org/10.1214/16-BA1044


In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression, BayesianRidge, Lasso, Ridge
from sklearn.model_selection import train_test_split
from sklearn.kernel_approximation import RBFSampler

from twostageridge import TwoStageRidge

# Generate the data

In [2]:
# Seed the global NumPy RNG first: the draws of A, gamma and beta below
# (in that order) all come from this stream.
np.random.seed(42)

# --- Dataset size and feature-map settings ---
N = 500          # number of samples in each generated dataset
proj_D = 100     # up-projection dimensionality (number of RBFSampler components)
rbf_gamma = 0.1  # `gamma` argument passed to RBFSampler


# --- Latent confounders (X) ---
D = 5                      # dimensionality of the latent confounders
mu_x = np.zeros(D)         # zero mean
A = np.random.randn(D, D)  # random factor used to build the covariance
cov_x = (A @ A.T) / D      # A A^T scaled by 1/D


# --- Treatment (Z) ---
sig_eps = 0.4                    # scale of the additive noise on the treatment
gamma = np.random.randn(proj_D)  # weights mapping projected features to treatment


# --- Target (Y) ---
alpha = 0.3                     # true treatment effect the models try to recover
sig_nu = 0.5                    # scale of the additive noise on the target
beta = np.random.randn(proj_D)  # weights mapping projected features to target


# Generation function
def generate_data():
    """Sample one synthetic dataset with confounded treatment and target.

    Latent covariates are drawn from a multivariate normal with mean ``mu_x``
    and covariance ``cov_x``, then passed through a freshly fitted
    ``RBFSampler`` to obtain ``proj_D`` non-linear features ``X``. Because a
    new sampler is fitted on every call, each dataset gets its own feature map.

    The causal structure is X -> Z, X -> Y and Z -> Y:

        Z = X @ gamma + sig_eps * noise
        Y = alpha * Z + X @ beta + sig_nu * noise

    All settings are read from module-level variables, and randomness comes
    from the global NumPy RNG.

    Returns
    -------
    W : ndarray
        The treatment ``Z`` as column 0, followed by the columns of ``X``.
    Y : ndarray
        The target values, one per sample.
    """
    # Latent confounders, then a smooth non-linear up-projection of them
    latent = np.random.multivariate_normal(mean=mu_x, cov=cov_x, size=N)
    projector = RBFSampler(n_components=proj_D, gamma=rbf_gamma)
    X = projector.fit_transform(latent)

    # Treatment depends on the confounders only
    treatment_noise = sig_eps * np.random.randn(N)
    Z = X @ gamma + treatment_noise

    # Target depends on both the treatment and the confounders
    target_noise = sig_nu * np.random.randn(N)
    Y = alpha * Z + X @ beta + target_noise

    # Treatment goes in the first column so models can find it at index 0
    W = np.hstack((Z[:, np.newaxis], X))
    return W, Y

# Generate bias comparison results

In [3]:
S = 20  # number of independently generated datasets

# Estimators under comparison; the treatment is column 0 of W in every case
models = {
    "ols": LinearRegression(),
    "lasso": Lasso(alpha=1e-3, max_iter=20000),
    "ridge": Ridge(alpha=1.),
    "bayes": BayesianRidge(),
    "ts": TwoStageRidge(treatment_index=0, regulariser1=1., regulariser2=1.)
}

# One list per model, filled with one value per dataset and reduced to a
# single number after the loop.
scores = {model_name: [] for model_name in models}
mae = {model_name: [] for model_name in models}
rmse = {model_name: [] for model_name in models}  # squared errors until the final sqrt

for s in range(S):

    # Fresh dataset, 90% for fitting and the remainder held out for scoring
    W, Y = generate_data()
    W_train, W_test, Y_train, Y_test = train_test_split(W, Y, train_size=0.9)

    for name, model in models.items():

        model.fit(W_train, Y_train)

        # The two-stage model exposes the treatment effect as `alpha_`;
        # for the others it is the coefficient of the treatment column.
        if name == "ts":
            alpha_hat = model.alpha_
        else:
            alpha_hat = model.coef_[0]

        # Error of the estimated treatment effect against the true alpha
        err = alpha - alpha_hat
        abs_err = np.abs(err)
        sq_err = err ** 2

        # Held-out score plus the two error measures for this dataset
        scores[name].append(model.score(W_test, Y_test))
        mae[name].append(abs_err)
        rmse[name].append(sq_err)

    # One dot per completed dataset as a progress indicator
    print('.', end='')
print()


# Reduce the per-dataset values to one summary number per model
scores = {model_name: np.mean(values) for model_name, values in scores.items()}
mae = {model_name: np.mean(values) for model_name, values in mae.items()}
rmse = {model_name: np.sqrt(np.mean(values)) for model_name, values in rmse.items()}


# Rows are models, columns are the three summary measures
results = pd.DataFrame({
    "scores": scores,
    "mean absolute error": mae,
    "root mean square error": rmse
})

results

....................


Unnamed: 0,scores,mean absolute error,root mean square error
ols,0.28548,0.04331,0.051603
lasso,0.639247,0.040646,0.053998
ridge,0.637409,0.039604,0.051681
bayes,0.650928,0.032775,0.041293
ts,0.638831,0.036214,0.045465


We can also get statistics for the estimated treatment effect from the `TwoStageRidge` model.

In [5]:
# Report statistics for the two-stage model. The "ts" estimator is refit on
# every dataset in the loop above, so this reflects only its most recent fit
# (the last generated training split).
models["ts"].model_statistics()

Statistical results:
            alpha =
                0.3165366492235731,
            s.e.(alpha) =
                0.0632061027540799
            t-statistic(s):
                5.008007699116379
            p-value(s):
                8.760122205409004e-07
            Degrees of freedom: 348
            