In [9]:
import numpy as np
import matplotlib.pyplot as plt
import scipy
import scipy.special
from sklearn.linear_model import LassoCV, LinearRegression, ElasticNetCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.base import clone
# import joblib
from statsmodels.api import OLS
from sklearn.model_selection import StratifiedGroupKFold, GroupKFold, KFold, StratifiedKFold
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
import pandas as pd

from flaml import AutoML
from flaml import AutoML

from myflaml import auto_reg, auto_clf, auto_weighted_reg

In [10]:
# Notebook-wide configuration. Later cells read these names directly.
time_budget = 60 # time budget for auto-ml in seconds (advisable at least 120)
verbose = 0 # verbosity of auto-ml
n_splits = 5 # cross-fitting and cross-validation splits
data = '401k' # which dataset, one of {'401k', 'criteo', 'welfare', 'poverty', 'star'}
plot = True # whether to plot results
xfeat = 'inc' # feature to use as x axis in plotting, e.g. for criteo 'f1', for 401k 'inc', for welfare 'polviews'
# Formula for the BLP of CATE regression.
blp_formula = 'np.log(inc)' # e.g. 'f1' for criteo, 'np.log(inc)' for 401k, 'C(polviews)' for the welfare case.
hetero_feats = ['inc'] # list of subset of features to be used for CATE model or the string 'all' for everything
binary_y = False # NOTE(review): not read by any visible cell; presumably flags a binary outcome - confirm

## For semi-synthetic data generation
semi_synth = False # Whether true outcome y should be replaced by a fake outcome from a known CEF
simple_synth = True # Whether the true CEF of the fake y should be simple or fitted from data
max_depth = 2 # max depth of the random forest used for semi-synthetic model fitting
scale = .2 # magnitude of noise in semi-synthetic data
def simple_true_cef(D, X):
    """Simple conditional expectation function of the outcome for semi-synthetic data.

    Uses only the second column of ``X`` (index 1): the baseline outcome equals that
    column and treatment adds half of it.

    :param D: treatment indicator(s), broadcastable against a column of ``X``
    :param X: 2-D array-like of covariates with at least two columns
    :return: ``0.5 * X[:, 1] * D + X[:, 1]``
    """
    second_col = np.asarray(X)[:, 1]
    treatment_part = .5 * second_col * D
    return treatment_part + second_col

In [11]:
from datasets import fetch_data_generator

# Build the data generator for the dataset selected in the configuration cell.
# When semi_synth is True the real outcome is replaced by a synthetic one; simple_true_cef
# is handed over as the known CEF (true_f).
# NOTE(review): abtest, true_cef and true_cate are not used in the visible cells - their exact
# meaning comes from datasets.fetch_data_generator; confirm there.
get_data, abtest, true_cef, true_cate = fetch_data_generator(data=data, semi_synth=semi_synth,
                                                             simple_synth=simple_synth,
                                                             scale=scale, true_f=simple_true_cef,
                                                             max_depth=max_depth)
# X is used as a DataFrame below (X.iloc, X.columns); D and y are indexed positionally;
# groups may be None, in which case the ungrouped splitters are used.
X, D, y, groups = get_data()

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit

# 60% train / 20% validation / 20% test. X, D, y (and groups) are rebound to the
# training portion, so re-running this cell without reloading the data splits again.
if groups is not None:
    # Grouped data: keep every group entirely inside one of the three parts.
    train, val = next(
        GroupShuffleSplit(n_splits=2, train_size=.6, random_state=123).split(X, y, groups=groups)
    )
    # Carve out the hold-out part first, then shrink the training names.
    Xval, Dval, yval, groupsval = X.iloc[val], D[val], y[val], groups[val]
    X, D, y, groups = X.iloc[train], D[train], y[train], groups[train]

    # Second split: indices are positions within the hold-out part.
    val, test = next(
        GroupShuffleSplit(n_splits=2, train_size=.5, random_state=123).split(Xval, yval, groups=groupsval)
    )
    Xtest, Dtest, ytest, groupstest = Xval.iloc[test], Dval[test], yval[test], groupsval[test]
    Xval, Dval, yval, groupsval = Xval.iloc[val], Dval[val], yval[val], groupsval[val]
else:
    X, Xval, D, Dval, y, yval = train_test_split(
        X, D, y, train_size=.6, shuffle=True, random_state=123
    )
    Xval, Xtest, Dval, Dtest, yval, ytest = train_test_split(
        Xval, Dval, yval, train_size=.5, shuffle=True, random_state=123
    )
    groupsval, groupstest = None, None

In [13]:
import joblib

# Pre-tuned nuisance model templates. joblib.load unpickles arbitrary objects, so
# only load 'nuisance.jbl' from a trusted source.
mreg, my, mt, mreg_zero, mreg_one = joblib.load('nuisance.jbl')


def model_reg():
    """Fresh clone of the outcome model that is fit on the stacked (D, X) features."""
    return clone(mreg)


def model_y():
    """Fresh clone of the outcome model that is fit on X alone."""
    return clone(my)


def model_t():
    """Fresh clone of the treatment model that is fit on X with D as target."""
    return clone(mt)


def model_reg_zero():
    """Fresh clone of the outcome model that is fit on control units (D == 0)."""
    return clone(mreg_zero)


def model_reg_one():
    """Fresh clone of the outcome model that is fit on treated units (D == 1)."""
    return clone(mreg_one)

In [14]:
# Cross-fitting folds, stratified on treatment (and group-aware when groups are given).
if groups is not None:
    cv = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=123)
    splits = list(cv.split(X, D, groups=groups))
else:
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123)
    splits = list(cv.split(X, D))

n = X.shape[0]
# Out-of-fold predictions. No suffix: single regression on (D, X).
# Suffix `_t`: separate regressions per treatment arm.
reg_preds, reg_zero_preds, reg_one_preds = np.zeros(n), np.zeros(n), np.zeros(n)
reg_preds_t, reg_zero_preds_t, reg_one_preds_t = np.zeros(n), np.zeros(n), np.zeros(n)

DX = np.column_stack((D, X))
for train, test in splits:
    X_fold_train, D_fold_train, y_fold_train = X.iloc[train], D[train], y[train]
    X_fold_test = X.iloc[test]
    n_fold_test = len(test)

    # Single model on (D, X): predict at the observed D, then with D forced to 1 and to 0.
    reg = model_reg().fit(DX[train], y_fold_train)
    reg_preds[test] = reg.predict(DX[test])
    reg_one_preds[test] = reg.predict(np.column_stack([np.ones(n_fold_test), X_fold_test]))
    reg_zero_preds[test] = reg.predict(np.column_stack([np.zeros(n_fold_test), X_fold_test]))

    # Separate models per arm, each trained only on that arm's training units.
    is_control = D_fold_train == 0
    is_treated = D_fold_train == 1
    reg_zero = model_reg_zero().fit(X_fold_train[is_control], y_fold_train[is_control])
    reg_one = model_reg_one().fit(X_fold_train[is_treated], y_fold_train[is_treated])
    reg_zero_preds_t[test] = reg_zero.predict(X_fold_test)
    reg_one_preds_t[test] = reg_one.predict(X_fold_test)
    # Prediction at the observed treatment: pick the arm that matches D.
    reg_preds_t[test] = reg_zero_preds_t[test] * (1 - D[test]) + reg_one_preds_t[test] * D[test]

# Out-of-fold predictions of y from X and of D from X, on the same folds.
res_preds = cross_val_predict(model_y(), X, y, cv=splits)
prop_preds = cross_val_predict(model_t(), X, D, cv=splits)

In [15]:
# Resolve the 'all' shortcut to every column of X.
# The isinstance guard matters: once hetero_feats holds a list or a pandas Index
# (e.g. when this cell is re-run after 'all' was resolved), `hetero_feats == 'all'`
# is an elementwise comparison and `if` on a multi-element result raises ValueError.
if isinstance(hetero_feats, str) and hetero_feats == 'all':
    hetero_feats = X.columns
# Covariate subsets on which effect heterogeneity is modelled, one per data split.
Z, Zval, Ztest = X[hetero_feats], Xval[hetero_feats], Xtest[hetero_feats]

In [16]:
# 'auto' when there are no groups; otherwise a group-aware K-fold splitter.
split_type = 'auto' if groups is None else GroupKFold(n_splits=n_splits)

In [18]:
# joblib.load unpickles arbitrary objects - only load 'slearner.jbl' from a trusted source.
# The stored object is indexable; its first element is the model to use.
slearner_best = joblib.load('slearner.jbl')[0]
# Target: out-of-fold effect estimate from the single (D, X) regression
# (prediction with D forced to 1 minus prediction with D forced to 0).
# NOTE(review): no clone() is taken, so presumably `slearner` and `slearner_best`
# end up being the same fitted object - confirm that fit returns self.
slearner = slearner_best.fit(Z, reg_one_preds - reg_zero_preds)
cate_model = slearner

### Linear Regression

In [19]:
class DRLinear:
    """
    Linear check of a CATE model against doubly robust outcomes.

    Doubly robust pseudo-outcomes are computed from already-fitted nuisance
    models and regressed (OLS with intercept) on the CATE model's predictions.

    Usage::

        x = DRLinear(cate, zero, one, t)
        x_fitted = x.fit(X, D, Y, Z)

        x_fitted.model ...  # fitted statsmodels OLS results

    :param cate_model: fitted model whose ``predict(Z)`` returns CATE predictions
    :param model_y_zero: fitted outcome model for control units, ``predict(X)``
    :param model_y_one: fitted outcome model for treated units, ``predict(X)``
    :param model_t: fitted treatment model; ``predict(X)`` is used as the
        propensity, so it is expected to return probabilities rather than labels
    :param min_prop_var: lower bound applied to ``p * (1 - p)`` before dividing,
        to keep the weights bounded when propensities are extreme
    """
    def __init__(
        self,
        cate_model,
        model_y_zero,
        model_y_one,
        model_t,
        min_prop_var=.09,
    ):
        self.cate_model = cate_model
        self.model_y_zero = model_y_zero
        self.model_y_one = model_y_one
        self.model_t = model_t
        self.min_prop_var = min_prop_var

    def calculate_dr_outcomes(
        self,
        X,
        D,
        y
    ):
        """
        Compute doubly robust pseudo-outcomes for each unit.

        :param X: covariate data
        :param D: treatment assignment (0/1)
        :param y: outcomes
        :return: array ``(m1 - m0) + (y - m_D) * (D - p) / max(p * (1 - p), min_prop_var)``
            where ``m0``/``m1`` are the arm-specific outcome predictions, ``m_D`` the
            prediction for the observed arm and ``p`` the propensity prediction
        """
        D = np.asarray(D)
        y = np.asarray(y)

        reg_zero_preds_t = self.model_y_zero.predict(X)
        reg_one_preds_t = self.model_y_one.predict(X)
        reg_preds_t = reg_zero_preds_t * (1 - D) + reg_one_preds_t * D
        prop_preds = self.model_t.predict(X)

        riesz = (D - prop_preds) / np.clip(prop_preds * (1 - prop_preds), self.min_prop_var, np.inf)
        # Build a new array instead of `+=`: an in-place add fails with a casting
        # error when the outcome models return integer predictions.
        return (reg_one_preds_t - reg_zero_preds_t) + (y - reg_preds_t) * riesz

    def fit(
        self,
        X,
        D,
        Y,
        Z
    ):
        """
        Regress the doubly robust outcomes on the CATE predictions.

        :param X: covariate data
        :param D: treatment assignment
        :param Y: outcomes
        :param Z: subsetted covariates on which to test heterogeneity
        :return: self, with ``dr_outcomes_``, ``cate_predictions_`` and ``model``
            (the fitted OLS results) set
        """
        # Imported here because the notebook's import cell only brings in OLS.
        from statsmodels.api import add_constant

        self.dr_outcomes_ = self.calculate_dr_outcomes(X, D, Y)

        self.cate_predictions_ = self.cate_model.predict(Z)

        self.model = OLS(self.dr_outcomes_, add_constant(self.cate_predictions_)).fit()

        return self

In [20]:
# Refit the arm-specific outcome models and the treatment model on the whole
# training split; these fitted models are then applied to the held-out splits.
is_control_train = D == 0
is_treated_train = D == 1
reg_zero = model_reg_zero().fit(X[is_control_train], y[is_control_train])
reg_one = model_reg_one().fit(X[is_treated_train], y[is_treated_train])
reg_t = model_t().fit(X, D)

In [26]:
# Linear check of the S-learner on the test split, using keyword arguments so the
# (X, D, Y, Z) order cannot be mixed up.
val_model = DRLinear(
    cate_model=slearner,
    model_y_zero=reg_zero,
    model_y_one=reg_one,
    model_t=reg_t,
)
fitted = val_model.fit(X=Xtest, D=Dtest, Y=ytest, Z=Ztest)
fitted.model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.005
Model:,OLS,Adj. R-squared:,0.004
Method:,Least Squares,F-statistic:,9.563
Date:,"Wed, 26 Apr 2023",Prob (F-statistic):,0.00201
Time:,14:19:10,Log-Likelihood:,-25255.0
No. Observations:,1944,AIC:,50510.0
Df Residuals:,1942,BIC:,50520.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-7612.4087,6502.176,-1.171,0.242,-2.04e+04,5139.570
x1,2.8019,0.906,3.092,0.002,1.025,4.579

0,1,2,3
Omnibus:,3130.243,Durbin-Watson:,2.018
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3363335.232
Skew:,9.945,Prob(JB):,0.0
Kurtosis:,205.798,Cond. No.,19400.0


### Calibration

In [33]:
class cal_scorer:
    """
    Calibration score of a CATE model based on doubly robust outcomes.

    Units are grouped by quantiles of the CATE predictions; within each group the
    average doubly robust outcome (GATE) is compared with the average prediction.

    Usage::

        x = cal_scorer(cate, zero, one, t, n_groups)
        x_scored = x.score(Xval, Dval, Yval, Zval, Ztest)

        x_scored.cal_score ...

    :param cate_model: fitted model whose ``predict(Z)`` returns CATE predictions
    :param model_y_zero: fitted outcome model for control units, ``predict(X)``
    :param model_y_one: fitted outcome model for treated units, ``predict(X)``
    :param model_t: fitted treatment model; ``predict(X)`` is used as the
        propensity, so it is expected to return probabilities rather than labels
    :param n_groups: number of quantile groups
    :param min_prop_var: lower bound applied to ``p * (1 - p)`` before dividing
    """
    def __init__(
        self,
        cate_model,
        model_y_zero,
        model_y_one,
        model_t,
        n_groups,
        min_prop_var=.01,
    ):
        self.cate_model = cate_model
        self.model_y_zero = model_y_zero
        self.model_y_one = model_y_one
        self.model_t = model_t
        self.n_groups = n_groups
        self.min_prop_var = min_prop_var

    def calculate_dr_outcomes(
        self,
        X,
        D,
        y
    ):
        """
        Compute doubly robust pseudo-outcomes for each unit.

        :param X: covariate data
        :param D: treatment assignment (0/1)
        :param y: outcomes
        :return: array ``(m1 - m0) + (y - m_D) * (D - p) / max(p * (1 - p), min_prop_var)``
        """
        D = np.asarray(D)
        y = np.asarray(y)

        reg_zero_preds = self.model_y_zero.predict(X)
        reg_one_preds = self.model_y_one.predict(X)
        reg_preds = reg_zero_preds * (1 - D) + reg_one_preds * D
        prop_preds = self.model_t.predict(X)

        riesz = (D - prop_preds) / np.clip(prop_preds * (1 - prop_preds), self.min_prop_var, np.inf)
        # New array instead of `+=` so integer predictions do not trigger a casting error.
        return (reg_one_preds - reg_zero_preds) + (y - reg_preds) * riesz

    def score(
        self,
        Xval,
        Dval,
        Yval,
        Zval,
        Ztest
    ):
        """
        Compute the calibration score on the validation data.

        Group boundaries are the quantiles of the CATE predictions on ``Ztest``;
        validation units are assigned to groups by their own CATE prediction.

        The score is ``1 - sum_g |GATE_g - CATE_g| * P_g / sum_g |GATE_g - ATE| * P_g``,
        summed over non-empty groups. It is NaN when the denominator is zero.

        :param Xval: covariates of the validation units
        :param Dval: treatment assignment of the validation units
        :param Yval: outcomes of the validation units
        :param Zval: heterogeneity covariates of the validation units
        :param Ztest: heterogeneity covariates used to define the group boundaries
        :return: self, with ``cal_score``, ``gate``, ``g_cate`` and ``probs`` set
            (``gate``/``g_cate`` are NaN for empty groups)
        """
        self.dr_outcomes_val_ = self.calculate_dr_outcomes(Xval, Dval, Yval)

        self.cate_preds_val_ = self.cate_model.predict(Zval)
        self.cate_preds_test_ = self.cate_model.predict(Ztest)

        dr_val = np.asarray(self.dr_outcomes_val_)
        preds_val = np.asarray(self.cate_preds_val_)

        probs = np.zeros(self.n_groups)
        g_cate = np.full(self.n_groups, np.nan)
        gate = np.full(self.n_groups, np.nan)

        cuts = np.quantile(self.cate_preds_test_, np.linspace(0, 1, self.n_groups + 1))
        last = self.n_groups - 1
        for i in range(self.n_groups):
            above = preds_val >= cuts[i]
            # The top group is closed on the right; otherwise units sitting exactly
            # on the largest cut would belong to no group at all.
            below = preds_val <= cuts[i + 1] if i == last else preds_val < cuts[i + 1]
            ind = above & below
            probs[i] = np.mean(ind)
            # Tied quantiles produce empty groups; leave their averages as NaN.
            if ind.any():
                gate[i] = np.mean(dr_val[ind])
                g_cate[i] = np.mean(preds_val[ind])

        ate = np.mean(dr_val)

        # Empty groups carry zero weight; exclude them so NaN does not propagate.
        filled = probs > 0
        diff1 = np.sum(np.abs(gate[filled] - g_cate[filled]) * probs[filled])
        diff2 = np.sum(np.abs(gate[filled] - ate) * probs[filled])

        # With no between-group variation in GATE the ratio is undefined.
        self.cal_score = 1 - (diff1 / diff2) if diff2 > 0 else np.nan

        self.gate = gate
        self.g_cate = g_cate
        self.probs = probs

        return self

In [34]:
# Calibration scorer for the S-learner with four quantile groups.
cal_model = cal_scorer(
    cate_model=slearner,
    model_y_zero=reg_zero,
    model_y_one=reg_one,
    model_t=reg_t,
    n_groups=4,
)

In [35]:
# Score on the validation split; group boundaries come from predictions on Ztest.
fitted = cal_model.score(Xval=Xval, Dval=Dval, Yval=yval, Zval=Zval, Ztest=Ztest)
fitted.cal_score

-0.12329395471163718

In [39]:
# Group average treatment effects (mean doubly robust outcome per quantile group).
fitted.gate

array([ 6511.78222656, -2845.09619141,  6624.68408203,  5454.87402344])