In [1]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib?
%matplotlib inline

import seaborn as sns

In [3]:
# import scipy.special
#
# np.random.seed(123)
# n=2000 # number of raw samples
# d=10 # number of binary features + 1
#
# # Generating random segments aka binary features. We will use features 0,...,3 for heterogeneity.
# # The rest for controls. Just as an example.
# X = np.random.binomial(1, .5, size=(n, d))
# # Generating an imbalanced A/B test
# T = np.random.binomial(1, scipy.special.expit(X[:, 0]))
# # Generating an outcome with treatment effect heterogeneity. The first binary feature creates heterogeneity
# # We also have confounding on the first variable. We also have heteroskedastic errors.
# y = (-1 + 2 * X[:, 0]) * T + X[:, 0] + (1*X[:, 0] + 1)*np.random.normal(0, 1, size=(n,))
#
# X_test = np.random.binomial(1, .5, size=(100, d))
# T_test = np.random.binomial(1, scipy.special.expit(X_test[:, 0]))
# y_test = (-1 + 2 * X_test[:, 0]) * T_test + X_test[:, 0] + (1*X_test[:, 0] + 1)*np.random.normal(0, 1, size=(100,))

In [4]:
time_budget = 600 # time budget for auto-ml in seconds (advisable at least 120)
verbose = 0 # verbosity of auto-ml
n_splits = 5 # cross-fitting and cross-validation splits
data = '401k' # which dataset, one of {'401k', 'criteo', 'welfare', 'poverty', 'star'}
plot = True # whether to plot results
xfeat = 'inc' # feature to use as x axis in plotting, e.g. for criteo 'f1', for 401k 'inc', for welfare 'polviews'
# Formula for the BLP of CATE regression.
blp_formula = 'np.log(inc)' # e.g. 'f1' for criteo, np.log(inc)' for 401k, 'C(polviews)' for the welfare case.
hetero_feats = ['inc'] # list of subset of features to be used for CATE model or the string 'all' for everything
binary_y = False

## For semi-synthetic data generation
semi_synth = False # Whether true outcome y should be replaced by a fake outcome from a known CEF
simple_synth = True # Whether the true CEF of the fake y should be simple or fitted from data
max_depth = 2 # max depth of random forest during for semi-synthetic model fitting
scale = .2 # magnitude of noise in semi-synthetic data
def simple_true_cef(D, X): # simple CEF of the outcome for semi-synthetic data
    return .5 * np.array(X)[:, 1] * D + np.array(X)[:, 1]

In [5]:
from datasets import fetch_data_generator

get_data, abtest, true_cef, true_cate = fetch_data_generator(data=data, semi_synth=semi_synth,
                                                             simple_synth=simple_synth,
                                                             scale=scale, true_f=simple_true_cef,
                                                             max_depth=max_depth)
X, D, y, groups = get_data()

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit

if groups is None:
    X, Xval, D, Dval, y, yval = train_test_split(X, D, y, train_size=.6, shuffle=True, random_state=123)
    Xval, Xtest, Dval, Dtest, yval, ytest = train_test_split(Xval, Dval, yval, train_size=.5, shuffle=True, random_state=123)
    groupsval, groupstest = None, None
else:
    train, val = next(GroupShuffleSplit(n_splits=2, train_size=.6, random_state=123).split(X, y, groups=groups))
    X, Xval, D, Dval, y, yval = X.iloc[train], X.iloc[val], D[train], D[val], y[train], y[val]
    groups, groupsval = groups[train], groups[val]

    val, test = next(GroupShuffleSplit(n_splits=2, train_size=.5, random_state=123).split(Xval, yval, groups=groupsval))
    Xval, Xtest, Dval, Dtest, yval, ytest = Xval.iloc[val], Xval.iloc[test], Dval[val], Dval[test], yval[val], yval[test]
    groupsval, groupstest = groupsval[val], groupsval[test]

In [7]:
from sklearn.linear_model import LassoCV
from econml.dr import LinearDRLearner
from sklearn.linear_model import LogisticRegressionCV
from sklearn.dummy import DummyClassifier

In [8]:
# One can replace model_y and model_t with any scikit-learn regressor and classifier correspondingly
# as long as it accepts the sample_weight keyword argument at fit time.
est = LinearDRLearner(model_regression=LassoCV(cv=3),
                      model_propensity=DummyClassifier(strategy='prior'))
est.fit(y, D, X = X)

Co-variance matrix is underdetermined. Inference will be invalid!


<econml.dr._drlearner.LinearDRLearner at 0x7f85dc710a00>

In [9]:
from sklearn.base import clone

In [12]:
# Xtrain = X[:, :4]
# Xval = X_test[:, :4]
# Dtrain = T
# Dval = T_test
# ytrain = y
# yval = y_test

Xtrain = X
Dtrain = D
ytrain = y

model_reg_zero = est.model_regression
model_reg_one = clone(model_reg_zero, safe = False)
model_t = est.model_propensity
reg_zero = model_reg_zero.fit(Xtrain[Dtrain==0], ytrain[Dtrain==0])
reg_one = model_reg_one.fit(Xtrain[Dtrain==1], ytrain[Dtrain==1])
reg_zero_preds_t = reg_zero.predict(Xval)
reg_one_preds_t = reg_one.predict(Xval)
reg_preds_t = reg_zero_preds_t * (1 - Dval) + reg_one_preds_t * Dval
prop_preds = model_t.fit(Xtrain, Dtrain).predict(Xval)

dr = reg_one_preds_t - reg_zero_preds_t
reisz = (Dval - prop_preds) / np.clip(prop_preds * (1 - prop_preds), .09, np.inf)
dr += (yval - reg_preds_t) * reisz


In [13]:
cate_preds = est.fitted_models_final[0].predict(Xval)
overall_ate_val_dr = dr.mean()

In [14]:
dr_val = dr
drscore_t = np.mean((dr_val - cate_preds)**2)
drscore_b = np.mean((dr_val - overall_ate_val_dr)**2)
1 - drscore_t / drscore_b

0.0007685073901400052

In [None]:
reg_zero = model_reg_zero().fit(Xtrain[Dtrain==0], ytrain[Dtrain==0])
reg_one = model_reg_one().fit(Xtrain[Dtrain==1], ytrain[Dtrain==1])
reg_zero_preds_t = reg_zero.predict(Xval)
reg_one_preds_t = reg_one.predict(Xval)
reg_preds_t = reg_zero_preds_t * (1 - Dval) + reg_one_preds_t * Dval
prop_preds = model_t().fit(Xtrain, Dtrain).predict(Xval)

In [14]:
est.cate(X[:, :4])

AttributeError: 'LinearDRLearner' object has no attribute 'cate'

In [15]:
dir(est)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_cached_values',
 '_check_fitted_dims',
 '_check_fitted_dims_w_z',
 '_check_input_dims',
 '_d_t',
 '_d_t_in',
 '_d_w',
 '_d_x',
 '_d_y',
 '_d_z',
 '_defer_to_inference',
 '_expand_treatments',
 '_fit_final',
 '_fit_nuisances',
 '_gen_featurizer',
 '_gen_model_final',
 '_gen_ortho_learner_model_final',
 '_gen_ortho_learner_model_nuisance',
 '_get_inference',
 '_get_inference_options',
 '_illegal_refit_inference_methods',
 '_inference',
 '_input_names',
 '_models_nuisance',
 '_original_treatment_featurizer',
 '_ortho_learner_model_final',
 '_ortho_learner_model_nuisance',
 '_postfit',


In [22]:
from econml.metalearners import XLearner
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

est = XLearner(models=GradientBoostingRegressor(),
              propensity_model=GradientBoostingClassifier(),
              cate_models=GradientBoostingRegressor())
est.fit(y, T, X=np.hstack([X]))

AttributeError: Can't call 'ate_interval' because 'inference' is None

In [24]:
est.effect(X)

array([ 1.45631509, -1.01028833,  1.2851483 , ..., -0.65917494,
        1.15644912,  0.51027234])

In [26]:
est.models

[GradientBoostingRegressor(), GradientBoostingRegressor()]