In [None]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyreadstat
import re
import string
import sklearn
from sklearn_pandas import DataFrameMapper
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, LogisticRegression,LogisticRegressionCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR


from tqdm import tqdm_notebook as tqdm
from reed import *
#from cinspect import dependence, importance
from sklearn.model_selection import cross_val_score, cross_validate

from econml.orf import DMLOrthoForest, DROrthoForest
from econml.dml import CausalForestDML
from econml.sklearn_extensions.linear_model import WeightedLassoCVWrapper, WeightedLasso, WeightedLassoCV
from sklearn.linear_model import LogisticRegression, Lasso


# set global notebook options
# Let pandas display up to 200 columns / 500 rows before truncating output.
pd.options.display.max_columns = 200
pd.options.display.max_rows = 500
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Re-import edited modules automatically before each cell executes
# (mode 2 = all modules), so changes to local helper modules are picked up.
%load_ext autoreload
%autoreload 2

import logging.config
# Logging configuration: set the root logger ('') to INFO and leave any
# loggers that already exist enabled.
DEFAULT_LOGGING = {
    'version': 1,
    'disable_existing_loggers': False,
    'loggers': {
        '': {
            'level': 'INFO',
        },
    }
}

# Apply the configuration to the logging module.
logging.config.dictConfig(DEFAULT_LOGGING)

# Load the data

In [None]:
# --- Analysis parameters -------------------------------------------------
# Columns of the input table used as outcome and treatment.
outcome = "y_Dwsce"  # alternative outcome column: 'y_wsce'
treatment = "redufl"

# Input file and optional log-transform of the outcome (applied when loading).
data_file = "all_vars.csv"
log_outcome = False

# Scoring names (presumably scikit-learn scorer strings - confirm where used).
optimisation_metric = "neg_mean_squared_error"
evaluation_metrics = ("r2", "neg_mean_squared_error")

In [None]:
# Read the analysis table, indexed by the 'xwaveid' column.
data = pd.read_csv(data_file, index_col='xwaveid')

# The return value is ignored, so this helper presumably drops rows with a
# missing treatment or outcome in place - TODO confirm against its definition.
drop_missing_treatment_or_outcome(data, treatment, outcome)

if log_outcome:
    # Shift the outcome so its minimum maps to 0 before taking log(1 + x).
    # The previous form, log(y + min(y)), yields NaN/-inf whenever min(y) <= 0
    # (e.g. for a change-score outcome that can be negative).
    data[outcome] = np.log1p(data[outcome] - data[outcome].min())

### Prepare data for modeling

In [None]:
from direct_regression import seperate_and_transform_data

# Unpack the eight values returned by the project helper. Going by the labels
# printed below, X0 holds the control rows and X1 the treated rows; the exact
# transformation applied is defined in `direct_regression` (not shown here).
X0, X1, y0, y1, X, y, T, features = seperate_and_transform_data(data, treatment, outcome)

# Report the shape of each arm's design matrix.
for label, design in (("Control data dimensions: ", X0), ("Treated data dimensions:", X1)):
    print(label, design.shape)

### Basic check on E[Y] and E[Y|T]

In [None]:
from direct_regression import print_unconditional_effects
# Sanity check before modelling: project helper that reports the raw
# (unadjusted) outcome summaries from the full table and the two outcome
# vectors y0 / y1. Exact output format is defined in `direct_regression`.
print_unconditional_effects(data, treatment, y0, y1)

### Separate covariates X into Xh and W

EconML requires `X` (covariates with which to compute heterogeneity) to be set, even if you only want to estimate ATE. 

We will use `Xh` for the covariates passed to EconML as `X`, and `W` for the remaining variables, i.e. `Union(Xh, W) = X`.

For initial testing, let's just take the feature with the largest-magnitude coefficient to be the one to compute CATE with respect to. 

In [None]:
def find_most_predictive_feature(X, T, y, features, model=None):
    """Rank covariates (and the treatment) by absolute linear coefficient.

    A linear model is fitted on the covariates with the treatment appended
    as the final column; the fitted coefficients are returned sorted by
    absolute value, largest first.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Covariate matrix.
    T : array-like of shape (n_samples,)
        Treatment indicator; any array-like (ndarray, list, pandas Series).
    y : array-like of shape (n_samples,)
        Outcome.
    features : sequence of str
        Names of the columns of ``X``, in order (list, tuple, Index, ...).
    model : estimator, optional
        Object exposing ``fit(V, y)`` and ``coef_``. Defaults to ``Lasso()``
        with its default settings, matching the previous behaviour.

    Returns
    -------
    pandas.DataFrame
        Indexed by feature name plus a final ``'treatment'`` row, with columns
        ``'coef'`` and ``'magnitude'`` (= ``abs(coef)``), sorted by
        ``'magnitude'`` in descending order.

    Notes
    -----
    Coefficient magnitudes are only comparable across features if the columns
    of ``X`` are on a common scale - assumed to be handled upstream (TODO
    confirm). The ``'treatment'`` row takes part in the ranking, so callers
    that need a covariate must exclude it.
    """
    if model is None:
        model = Lasso()
    # np.asarray lets T be a list or pandas Series, not only an ndarray.
    treatment_col = np.asarray(T).reshape(-1, 1)
    V = np.hstack((X, treatment_col))
    # list(...) avoids element-wise "addition" when features is an Index/array
    # and the TypeError raised for tuples.
    features_ext = list(features) + ['treatment']
    model.fit(V, y)
    coef = pd.DataFrame({"coef": model.coef_}, index=features_ext)
    coef['magnitude'] = coef['coef'].abs()
    return coef.sort_values('magnitude', ascending=False)

# Rank every covariate (plus the appended 'treatment' column) by absolute
# coefficient and display the three largest.
coef_mag = find_most_predictive_feature(X,T,y,features)
coef_mag.head(3)

In [None]:
# `coef_mag` also contains a row for the appended 'treatment' column. If the
# treatment has the largest coefficient it would be picked here and
# `features.index('treatment')` would raise ValueError, so exclude it first.
covariate_ranking = coef_mag.drop(index='treatment', errors='ignore')
hetero_feature = covariate_ranking.index[0]
print(f"Computing treatment effect heterogeneity with respect to: {hetero_feature}")

# Split the covariate matrix: the chosen column becomes Xh (the heterogeneity
# feature passed to EconML as X); all remaining columns become the controls W.
x_indx = features.index(hetero_feature)
Xh = X[:, x_indx].reshape(-1, 1)  # X for EconML models
W = np.delete(X, x_indx, axis=1)

### LinearDML

In [None]:
from econml.dml import LinearDML
# Double ML with a linear final stage and a binary (discrete) treatment.
# random_state is fixed (same seed as the CausalForestDML cell) so the
# cross-fitting splits, and therefore the estimates, are reproducible
# across Restart & Run All.
ldml = LinearDML(discrete_treatment=True, random_state=123)
ldml.fit(y, T, X=Xh, W=W)

In [None]:
# LinearDML: average treatment effect over the sample Xh and its confidence
# interval (EconML's default confidence level - confirm in the API docs).
ldml.ate(X=Xh), ldml.ate_interval(X=Xh)

In [None]:
from econml.dml import SparseLinearDML
# Sparse-linear variant of double ML with a binary (discrete) treatment.
# random_state is fixed (same seed as the CausalForestDML cell) so the
# cross-fitting splits, and therefore the estimates, are reproducible.
sldml = SparseLinearDML(discrete_treatment=True, random_state=123)
sldml.fit(y, T, X=Xh, W=W)

In [None]:
# SparseLinearDML: average treatment effect over Xh and its confidence interval.
# Both values must come from `sldml`; the interval was previously taken from
# the LinearDML model (`ldml`), pairing the estimate with the wrong interval.
sldml.ate(X=Xh), sldml.ate_interval(X=Xh)

### LinearDR Learner

In [None]:
from econml.dr import LinearDRLearner
# Doubly-robust learner with a linear final stage.
# random_state is fixed (same seed as the CausalForestDML cell) so the
# cross-fitting splits, and therefore the estimates, are reproducible.
ldr = LinearDRLearner(random_state=123)
ldr.fit(y, T, X=Xh, W=W)

In [None]:
# LinearDRLearner: average treatment effect over Xh and its confidence
# interval (EconML's default confidence level - confirm in the API docs).
ldr.ate(X=Xh), ldr.ate_interval(X=Xh)

### CausalForestDML

In [None]:
from econml.dml import CausalForestDML

# First-stage (nuisance) models: Lasso for the outcome, logistic regression
# for the binary treatment, both with their default settings.
outcome_model = Lasso()
treatment_model = LogisticRegression()

# Causal forest final stage; seeded so repeated runs give the same forest.
cf = CausalForestDML(
    model_y=outcome_model,
    model_t=treatment_model,
    discrete_treatment=True,
    random_state=123,
)
cf.fit(y, T, X=Xh, W=W)

In [None]:
# CausalForestDML: average treatment effect over Xh and its confidence
# interval (EconML's default confidence level - confirm in the API docs).
cf.ate(X=Xh), cf.ate_interval(X=Xh)

### Meta Learners

In [None]:
from econml.metalearners import TLearner, SLearner, XLearner, DomainAdaptationLearner
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

#### Lasso

In [None]:
# Cross-validated Lasso over 30 log-spaced penalties from 1e-2 to 1e4.
lasso_alphas = np.logspace(-2, 4, 30)
models = LassoCV(alphas=lasso_alphas, max_iter=5000)

# T-learner on the full covariate matrix X (no Xh/W split here), with
# bootstrap-based inference for the intervals.
est_t = TLearner(models=models)
est_t.fit(y, T, X=X, inference='bootstrap')

In [None]:
# (Removed the leftover interactive `LassoCV?` help lookup - it opens the help
# pager on every run. See the scikit-learn LassoCV documentation for its
# parameters.)

In [None]:
# T-learner (LassoCV): average treatment effect over the full sample X and
# its bootstrap interval (EconML's default confidence level - confirm).
est_t.ate(X=X), est_t.ate_interval(X=X)

#### GradientBoosting

In [None]:
# Same T-learner setup as the Lasso variant, but with gradient-boosted
# regression trees (default hyperparameters) as the outcome model.
# NOTE(review): `models` is rebound here, replacing the LassoCV instance.
models = GradientBoostingRegressor()
est_t2 = TLearner(models=models)
# Bootstrap inference refits the learner on resampled data, so this cell can
# be slow. The number of resamples is presumably configurable by passing an
# `econml.inference.BootstrapInference` object instead of the string - TODO confirm.
est_t2.fit(y, T, X=X, inference='bootstrap')

In [None]:
# T-learner (gradient boosting): average treatment effect over the full
# sample X and its bootstrap interval (EconML's default confidence level - confirm).
est_t2.ate(X=X), est_t2.ate_interval(X=X)

In [None]:
# TODO - figure out how to set the number of bootstrap samples
# TODO - implement manual bootstrapping
# TODO - plot CATE
# TODO - figure out why we have to set X (look at the implementation in EconML)
# TODO - test out some other implementations of DoubleML/CausalForest (maybe in R)
# TODO - look at the criticism (refutation) methods in DoWhy and test them out (see if we can get the same results through the DoWhy interface)