In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyreadstat
import re
import string
import sklearn
from sklearn_pandas import DataFrameMapper
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from skopt import BayesSearchCV
from tqdm import tqdm_notebook as tqdm
from reed import *
from cinspect import dependence, importance
from sklearn.model_selection import cross_val_score, cross_validate
import pickle
import time

# set global notebook options
pd.options.display.max_columns = 200
pd.options.display.max_rows = 500
%matplotlib inline

%load_ext autoreload
%autoreload 2

sklearn.__version__

'0.24.2'

In [2]:
outcome = 'y_Dwsce'#'y_wsce'
treatment = 'redufl'
optimisation_metric = 'neg_mean_squared_error'
log_outcome=False

In [3]:
data = pd.read_csv("all_vars_950.csv",index_col='xwaveid')
drop_missing_treatment_or_outcome(data, treatment, outcome)
if log_outcome:
    data[outcome] = np.log(data[outcome]+data[outcome].min())

Dropped 592 rows missing treatment or outcome.


In [4]:
from sklearn.linear_model import Lasso, Ridge
from sklearn.ensemble import GradientBoostingRegressor

def construct_models():
    models = [
        Model('ridge',Ridge(), 
              parameters = {
                  'alpha':np.logspace(-1,4,30)
              }
        ),
        Model('lasso',Lasso(),
              parameters = {
                  'alpha':np.logspace(-2,4,30)
              }
        ), 
        Model('gbr',GradientBoostingRegressor(n_iter_no_change=20, max_depth=2),
              parameters = {
                'max_features':[10,20,40,60,80],
                'learning_rate':np.logspace(-3,0,10),
                'min_samples_leaf':np.logspace(0,3,10).astype(int)
              }
        ),
    ]
    return models

In [5]:
# %load -s exclude_vars,seperate_and_transform_data direct_regression.py
def exclude_vars():
    """Return a list of variables that should not be included as features."""
    treatments = ['^reduhl$', '^rehllt$', '^redudl$', '^redufl$', '^redllt$', '^refllt$']
    outcomes = ['^rlwage$', '^mh$', '^mhbm$', '^wkhr$', '^y_']
    other = [
        '^p_rcom',
        '^p_rdf',
        '^p_cotrl',
        '^xwaveid$',
        'p_rcom18'  # ?
        '^aedcq',  # indicate studying at start - these people should already have been removed
        '^abnfsty',
        '^aedcqfpt',
        '^aedqstdy'
    ]
    exclude = treatments + outcomes + other
    return exclude

def seperate_and_transform_data(data, treatment, outcome):

    transform = Pipeline([
        ('impute_missing', SimpleImputer()),
        ('scale', StandardScaler()),
    ])

    exclude = exclude_vars()

    control, treated = treatment_control_split(data, treatment)
    features = regex_select(data.columns, exclude, exclude=True)
    X0, y0 = split_and_transform(control, features, outcome, transform)
    X1, y1 = split_and_transform(treated, features, outcome, transform)

    # construct the full dataset (remove ordering by treatment in case of any order dependance in fit)
    X = np.vstack((X0, X1))
    y = np.concatenate((y0, y1))
    indx = np.arange(len(y))
    np.random.shuffle(indx)
    X = X[indx, :]
    y = y[indx]

    return X0, X1, y0, y1, X, y, features


In [6]:
X0, X1, y0, y1, X, y, features = seperate_and_transform_data(data, treatment, outcome)

Treated:773, Control:4181


In [7]:
# %load -s print_unconditional_effects direct_regression.py
def print_unconditional_effects(data, treatment, y0, y1):
    print(f"Proportion Treated:{100*data[treatment].mean():.0f}%")
    print(f"Average outcome under Control:{y0.mean():.2f}±{y0.std()/np.sqrt(len(y0)):.2f}")
    print(f"Average outcome under Treatment:{y1.mean():.2f}±{y1.std()/np.sqrt(len(y1)):.2f}")
    print(f"Unadjusted treatment estimate {y1.mean() - y0.mean():.2f}")


In [8]:
print_unconditional_effects(data, treatment, y0, y1)

Proportion Treated:16%
Average outcome under Control:76.98±14.61
Average outcome under Treatment:425.08±37.85
Unadjusted treatment estimate 348.10


In [9]:
evaluation_metrics = ('r2','neg_mean_squared_error')


def nested_cross_val(
    load_from_cache=False, 
    cache_name = "nested_cv_results.pkl",
):
    if load_from_cache:
        with open(cache_name,'rb') as f:
            models, results = pickle.load(f)
            
    else:
        models = construct_models()
        results = {}
        for model in models:
            print(f"Fitting {model.name} ...",end='')
            results0 = model.nested_cv_fit_evaluate(X0,y0,optimisation_metric,evaluation_metrics)
            results1 = model.nested_cv_fit_evaluate(X1,y1,optimisation_metric,evaluation_metrics)
            results[model.name] = (results0, results1)
            print("Done")
        
        print(f"Caching results to {cache_name}")
        with open(cache_name,'wb') as f:
            pickle.dump((models,results),f)
            
    return models,results
 

models, results = nested_cross_val(load_from_cache=True)

In [10]:
def estimate_causal_effect(X, models0, models1):
    tau = []
    for e0, e1 in zip(models0,models1):
        y0 = e0.predict(X)
        y1 = e1.predict(X)
        tau.append(y1-y0)
    
    # array of shape len(modelsi),len(X)
    cate = np.array(tau) 
    
    # array of shape len(modelsi) with the ate estimate for each sample
    ate = np.mean(cate,axis=1) 
    return ate

rows = []
index = []
for model_name, r in results.items():
    tau = estimate_causal_effect(X, r[0]['estimator'],r[1]['estimator'])
    row = {'ACE':tau.mean(),'ACE_std':tau.std()}
    for m in evaluation_metrics:
        key = f'test_{m}'
        for name, result in zip(('control','treated'),r):
            label=f"{name}_{m}"
            label_std=f"{label}_std"
            row[label]= result[key].mean()
            std = result[key].std()
            row[label_std] = result[key].std()
    rows.append(row)
    index.append(model_name)
metrics = pd.DataFrame(rows,index=index)

with pd.option_context('display.float_format', '{:,.2f}'.format):
    display(metrics)

Unnamed: 0,ACE,ACE_std,control_r2,control_r2_std,treated_r2,treated_r2_std,control_neg_mean_squared_error,control_neg_mean_squared_error_std,treated_neg_mean_squared_error,treated_neg_mean_squared_error_std
ridge,348.7,15.14,0.22,0.06,0.11,0.03,-698017.69,78428.38,-992360.87,326884.08
lasso,349.53,9.82,0.26,0.04,0.1,0.03,-657597.47,84078.73,-991618.5,166074.33
gbr,483.75,142.32,0.25,0.06,0.06,0.06,-677879.9,125161.17,-1044093.85,205093.54


In [11]:
#c,t = results['ridge'][0],results['ridge'][1]

In [12]:
# feature_coef = pd.DataFrame({
#     "feature":features,
#     'coef0':c['estimator'][0].best_estimator_.coef_,
#     'coef1':c['estimator'][1].best_estimator_.coef_
# })
# absv = np.vstack((feature_coef['coef0'].abs().values,feature_coef['coef1'].abs().values)).T
# feature_coef['importance'] = absv.max(axis=1)
# feature_coef['hetero'] = (feature_coef['coef1']-feature_coef['coef0']).abs()
# feature_coef.sort_values('importance',ascending=False).head(10)

In [13]:
#feature_coef.sort_values('hetero',ascending=False).head(10)

In [14]:
# fig, ax = plt.subplots(1,2,figsize=(15,5),sharey=True)
# ax[0].bar(metrics.index, metrics['control_r2'], yerr=metrics['control_r2_std'], align='center', alpha=0.5, capsize=10)
# ax[1].bar(metrics.index, metrics['treated_r2'], yerr=metrics['treated_r2_std'], align='center', alpha=0.5,capsize=10)
# ax[0].set_ylabel('$R^2$')
# ax[0].set_title('control model')
# ax[1].set_title('treated model');

In [15]:
# def extract_params(estimator):
#     return estimator.coef_

# def bootstrapped_cross_val(load_from_cache=False, cache_name="bootstrap_cv_results.pkl", samples=10):
#     if load_from_cache:
#         with open(cache_name, 'rb') as f:
#             results = pickle.load(f)
#     else:
#         models = construct_models()
#         results = {}
#         start = time.time()
#         for model in models:
#             print(f"Fitting {model.name} ...",end='')
#             results0 = model.bootstrap_cv_evaluate(X0,y0,optimisation_metric,extract_params,
#                                                    bootstrap_samples=samples,return_estimator=True)
#             results1 = model.bootstrap_cv_evaluate(X1,y1,optimisation_metric,extract_params,
#                                                    bootstrap_samples=samples,return_estimator=True)
#             results[model.name] = (results0, results1)
#             print("Done")
#         total = time.time()-start
#         print(f"Total time:{total} seconds")
#         print(f"Caching results to: {cache_name}")
#         with open(cache_name,'wb') as f:
#             pickle.dump(results,f)
    
#     return results

# bootstrap_results = bootstrapped_cross_val(load_from_cache=False,samples=10)

In [16]:
# for model_name, (results0, results1) in bootstrap_results.items():
#     models0 = [r['estimator'] for r in results0]
#     models1 = [r['estimator'] for r in results1]
#     ate = estimate_causal_effect(X,models0, models1)
#     print(model_name, ate.mean(),ate.std()/np.sqrt(len(ate)-1))  

In [17]:
# from collections import defaultdict
# def hyperparam_distributions(samples) -> {str:[]}:
#     """Returns a dict from hyper-parameter name to the best values for that hyper-parameter over the samples."""
#     distributions = defaultdict(list)
#     for sample in samples:
#         h = sample['estimator'].best_params_
#         for key, value in h.items():
#             distributions[key].append(value)
#     return distributions

# def plot_hyperparam_distributions(samples, title) -> None:
#     distributions = hyperparam_distributions(samples)
#     k = len(distributions)
#     fig, axes = plt.subplots(1,k,figsize=(k*5,4))
#     if k == 1:
#         axes = [axes]
#     for i, (key, values) in enumerate(distributions.items()):
#         ax = axes[i]
#         ax.hist(values)
#         ax.set_title(title)
#         ax.set_xlabel(key)
#         ax.set_ylabel('count')
#     return fig,axes

# for model, (results0, results1) in bootstrap_results.items():
#     plot_hyperparam_distributions(results0,f"{model}-control")
#     plot_hyperparam_distributions(results1,f"{model}-treated")

In [18]:
# TODO, could also think about visualising distribution of coefficeints for linear models. 
# TODO, why is this results so different to what I am getting from T-learners in econml