In [None]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyreadstat
import re
import string
import sklearn
from sklearn_pandas import DataFrameMapper
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, LogisticRegression,LogisticRegressionCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR

from skopt import BayesSearchCV
from tqdm import tqdm_notebook as tqdm
from reed import *
from cinspect import dependence, importance
from sklearn.model_selection import cross_val_score, cross_validate


# set global notebook options
pd.options.display.max_columns = 200
pd.options.display.max_rows = 500
%matplotlib inline

%load_ext autoreload
%autoreload 2

sklearn.__version__


In [None]:
import logging.config

# dictConfig schema (version 1): leave already-created loggers enabled and set
# the logger registered under the empty name to level INFO.
DEFAULT_LOGGING = dict(
    version=1,
    disable_existing_loggers=False,
    loggers={
        '': dict(level='INFO'),
    },
)

logging.config.dictConfig(DEFAULT_LOGGING)

# Load the data

### Treatment variables


   - **rehllt**
   - **redllt** 
   - **refllt** 
   - **reduhl**	Completed re-education based on highest level of attainment
   - **redudl**	Completed re-education based on detailed qualifications
   - **redufl**	Completed re-education using highest lvl and detailed qualifications.

### Outcome variables
   - Mental health in 2019 (**mh**). This is the transformed mental health scores from the aggregation of mental health items of the SF-36 Health Survey, as reported by the individual in 2019. It ranges from 0 to 100, with higher scores indicating better mental health.  
   - Working hours in 2019 (**wkhr**) records the total number of hours the individual works in all jobs in a week on average. Working hours are set to 0 for those not working. 
   - Hourly Wages in 2019 (**rlwage**) records the average hourly wage for the individual’s main job in 2019. Hourly wages are set to 0 for those not working and set to missing for those reporting working more than 100 hours a week. 

In [None]:
# Regex patterns for every candidate treatment and outcome column. All of them
# go into `exclude`, so none of them can end up in the feature set.
treatments = ['^reduhl$', '^rehllt$', '^redudl$', '^redufl$', '^redllt$', '^refllt$']
outcomes = ['^rlwage$', '^mh$', '^mhbm$', '^wkhr$']

# Other column patterns that must not be used as features.
# NOTE: every entry needs its own trailing comma. Without the comma after
# 'p_rcom18', Python concatenates it with the next literal into the single
# pattern 'p_rcom18^aedcq', which matches nothing and silently drops the
# '^aedcq' exclusion.
other = [
    '^p_rcom',
    '^p_rdf',
    '^p_cotrl',
    '^xwaveid$',
    'p_rcom18',  # ?
    '^aedcq',  # indicate studying at start - these people should already have been removed
    '^abnfsty',
    '^aedcqfpt',
    '^aedqstdy',
]
exclude = treatments + outcomes + other


# Analysis configuration: which outcome / treatment column this run uses.
outcome = 'wkhr'#'rlwage'
treatment = 'redudl'
optimisation_metric = 'neg_mean_squared_error'

# Preprocessing applied to the feature matrix: an imputation step followed by
# a scaling step, both with their default settings.
# NOTE(review): Pipeline / SimpleImputer / StandardScaler are not imported
# explicitly in the cells shown; presumably they are the scikit-learn classes
# brought in by `from reed import *` - confirm.
transform = Pipeline([
    ('impute_missing', SimpleImputer()),
    ('scale', StandardScaler()),
])

## Data

In [None]:
# load_all_data() returns four objects; the last three are each passed through
# the missing-treatment/outcome filter for the configured columns.
# NOTE(review): the helper's return value is discarded, so it presumably
# modifies each frame in place - confirm.
meta, basic, df, raw = load_all_data()
for frame in (basic, df, raw):
    drop_missing_treatment_or_outcome(frame, treatment, outcome)

## Response Model

How well can we predict outcomes $Y$ conditional on treatment $T$ and other covariates $Z$?
   - fit ML models on kitchen sink, Anna's set & basic set
   - fit basic LR on basic set

#### Columns explicitly excluded
   - **xwaveid** (unique identifier)
   - **p_rcom*** (timing of completion of re-education, proxies treatment) TODO think about how we would include this
   - **p_cotrl** (first avail 2003)
   - **p_rdf*** (first avail 2012)

In [None]:
from econml.orf import DMLOrthoForest, DROrthoForest
from econml.dml import CausalForestDML
from econml.sklearn_extensions.linear_model import WeightedLassoCVWrapper, WeightedLasso, WeightedLassoCV
from sklearn.linear_model import LogisticRegression, Lasso

In [None]:
# Run this analysis on the `raw` dataset.
data = raw
# Select feature columns from the `exclude` patterns
# (presumably exclude=True keeps the columns that do NOT match - confirm against regex_select).
features = regex_select(data.columns, exclude, exclude=True)
# X, y: transformed feature matrix and outcome; later cells index X as a 2-D NumPy array.
X,y = split_and_transform(data, features, outcome, transform)
# Treatment column as an array (assumed row-aligned with X and y - confirm).
T = data[treatment].values

### Basic check on E[Y] and E[Y|T]

In [None]:
# Sanity check: overall mean outcome, treatment values and prevalence, and the
# raw difference in mean outcome between rows with T == 1 and rows with T == 0.
print(
    f'Average outcome: {y.mean():.2f}',
    f'Treatment values: {np.unique(T)}, proportion treated: {T.mean():.2f}',
    f'Average outcome in treated: {y[T==1].mean():.2f}',
    f'Average outcome in control: {y[T==0].mean():.2f}',
    f'Unadjusted ATE: {y[T==1].mean() - y[T==0].mean():.2f}',
    sep='\n',
)

In [None]:
# Do a basic check looking at the difference in outcomes, rather than the total
# Note that the basic variable check does not include the initial value for income.

### Identify most predictive feature given X,T

EconML requires $X$ to be set, even if you only want to estimate the ATE. We could try the sparse estimator with the full set of covariates, but to test out the other estimators, let's just take the feature with the largest coefficient as the one with respect to which the CATE is computed.

   - Note that with the basic variables included, treatment is being deemed non-significant by Lasso

In [None]:
def find_most_predictive_feature(X,T,y,features):
    """Rank covariates (plus the treatment) by absolute Lasso coefficient.

    A default ``Lasso()`` is fitted to predict ``y`` from the columns of ``X``
    with ``T`` appended as one extra, final column.

    Parameters
    ----------
    X : 2-D array of covariates, one column per entry of ``features``.
    T : array of treatment values; reshaped to a single column.
    y : array of outcomes.
    features : list of column names for ``X`` (a list, since ``['treatment']``
        is concatenated onto it).

    Returns
    -------
    pandas.DataFrame indexed by ``features + ['treatment']`` with columns
    ``coef`` (fitted coefficient) and ``magnitude`` (its absolute value),
    sorted by ``magnitude`` in descending order.
    """
    # Design matrix: covariates followed by the treatment as the last column.
    design = np.hstack((X, T.reshape(-1, 1)))
    labels = features + ['treatment']

    lasso = Lasso()
    lasso.fit(design, y)

    ranked = (
        pd.DataFrame({"coef": lasso.coef_}, index=labels)
        .assign(magnitude=lambda frame: frame['coef'].abs())
        .sort_values('magnitude', ascending=False)
    )
    return ranked

# Rank the covariates (and the treatment) by absolute Lasso coefficient and
# display the resulting table as the cell output.
coef_mag = find_most_predictive_feature(X,T,y,features)
coef_mag

In [None]:
# Use the covariate with the largest absolute Lasso coefficient as the single
# effect modifier Xh; all remaining covariates become the controls W.
# The 'treatment' row that find_most_predictive_feature appends is skipped: it
# is not a column of X, so it could be neither selected nor deleted below.
hetero_feature = coef_mag.drop(index='treatment', errors='ignore').index[0]
print(f"Computing treatment effect heterogeneity with respect to: {hetero_feature}")
# `features_ext` exists only inside find_most_predictive_feature, so the column
# position is looked up in `features` (assumes X's columns follow `features`
# order, which the coefficient table already relies on).
x_indx = list(features).index(hetero_feature)
Xh = X[:,x_indx].reshape(-1,1)
W = np.delete(X,x_indx,axis=1)

### LinearDML

In [None]:
from econml.dml import LinearDML
# Linear double-ML estimator with a discrete treatment: Xh is the effect
# modifier and W holds the remaining covariates as controls.
# random_state is pinned (same seed as the CausalForestDML cell) so that
# re-running the notebook gives repeatable estimates.
ldml = LinearDML(discrete_treatment=True, random_state=123)
ldml.fit(y,T,X=Xh,W=W)
# Cell output: (ATE point estimate, ATE interval).
ldml.ate(X=Xh), ldml.ate_interval(X=Xh)

In [None]:
from econml.dml import SparseLinearDML
# Sparse linear double-ML estimator, fitted on the same Xh / W split.
# random_state is pinned (same seed as the CausalForestDML cell) so that
# re-running the notebook gives repeatable estimates.
sldml = SparseLinearDML(discrete_treatment=True, random_state=123)
sldml.fit(y,T,X=Xh,W=W)
# Cell output: (ATE point estimate, ATE interval). Both values come from
# `sldml`; the interval was previously taken from the LinearDML model `ldml`.
sldml.ate(X=Xh), sldml.ate_interval(X=Xh)

### LinearDR Learner

In [None]:
from econml.dr import LinearDRLearner
# Linear doubly-robust learner, fitted on the same Xh / W split.
# random_state is pinned (same seed as the CausalForestDML cell) so that
# re-running the notebook gives repeatable estimates.
ldr = LinearDRLearner(random_state=123)
ldr.fit(y,T,X=Xh,W=W)
# Cell output: (ATE point estimate, ATE interval).
ldr.ate(X=Xh), ldr.ate_interval(X=Xh)

### CausalForestDML

In [None]:
from econml.dml import CausalForestDML
# Causal forest DML with explicit nuisance models: a default Lasso for the
# outcome and a default LogisticRegression for the (discrete) treatment.
# random_state is fixed at 123.
cf = CausalForestDML(model_y=Lasso(),
                       model_t=LogisticRegression(),
                       discrete_treatment=True,
                       random_state=123)
# Xh is the effect modifier; W holds the remaining covariates as controls.
cf.fit(y, T, X=Xh, W=W)
# Cell output: (ATE point estimate, ATE interval).
cf.ate(X=Xh), cf.ate_interval(X=Xh)

### Meta Learners

In [None]:
from econml.metalearners import TLearner, SLearner, XLearner, DomainAdaptationLearner
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

In [None]:
# T-learner using LassoCV as the outcome model, fitted on the full covariate
# matrix X (no separate controls W are passed here).
models = LassoCV()
est_t = TLearner(models=models)
# inference='bootstrap' is requested so that ate_interval below is available.
est_t.fit(y, T, X=X, inference='bootstrap')
# Cell output: (ATE point estimate, ATE interval).
est_t.ate(X=X), est_t.ate_interval(X=X)

In [None]:
# Same T-learner setup, with a default GradientBoostingRegressor as the
# outcome model. Note that `models` is rebound here.
models = GradientBoostingRegressor()
est_t2 = TLearner(models=models)
# inference='bootstrap' is requested so that ate_interval below is available.
est_t2.fit(y, T, X=X, inference='bootstrap')
# Cell output: (ATE point estimate, ATE interval).
est_t2.ate(X=X), est_t2.ate_interval(X=X)

In [None]:
# TODO - figure out how to set the number of bootstrap samples
# TODO - implement manual bootstrapping
# TODO - plot CATE
# TODO - figure out why we have to set X (look at the implementation in EconML)
# TODO - test out some other implementations of DoubleML/CausalForest (maybe in R)
# TODO - look at the criticism methods in DoWhy and test them out (see if we can get the same results through the DoWhy interface)