# Comparing validation approaches

In [21]:
import numpy as np
import pandas as pd

from sklearn.linear_model import Ridge
from reed import Model

pd.options.display.max_columns = 200
pd.options.display.max_rows = 500
%matplotlib inline

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
# --- Analysis configuration ---------------------------------------------------
# Columns of the data file used as outcome and treatment
# (alternative outcome column: 'y_wsce').
outcome, treatment = "y_Dwsce", "redufl"

# Scorer handed to the hyper-parameter search, and the scorers reported afterwards.
optimisation_metric = "neg_mean_squared_error"
evaluation_metrics = ("r2", "neg_mean_squared_error")

# When True, the loading cell log-transforms the outcome column.
log_outcome = False
data_file = "all_vars.csv"

# No cache locations configured.
cross_val_cache = bootstrap_cache = None

In [23]:
from reed import drop_missing_treatment_or_outcome
# Load the variable table, using the 'xwaveid' column as the index.
data = pd.read_csv(data_file, index_col='xwaveid')
# NOTE(review): the return value is discarded, so this presumably filters `data`
# in place (it reports the number of dropped rows) — confirm in `reed`.
drop_missing_treatment_or_outcome(data, treatment, outcome)
if log_outcome:
    # Shift the outcome so its minimum maps to 0, then take log(1 + shifted).
    # Subtracting the minimum is what guarantees a non-negative argument: adding it
    # (log(y + y.min())) pushes outcomes with a negative minimum further below zero
    # and yields NaN, and yields -inf when the minimum is exactly 0.
    data[outcome] = np.log1p(data[outcome] - data[outcome].min())

Dropped 592 rows missing treatment or outcome.


In [24]:
from direct_regression import seperate_and_transform_data
# Split the cleaned table into the eight pieces used downstream. Judging by the
# labels printed below, X0 holds the control rows and X1 the treated rows.
split = seperate_and_transform_data(data, treatment, outcome)
X0, X1, y0, y1, X, y, t, features = split

# Report the size of each arm's design matrix.
for caption, design in (("Control data dimensions: ", X0),
                        ("Treated data dimensions:", X1)):
    print(caption, design.shape)

Control data dimensions:  (3659, 638)
Treated data dimensions: (1295, 638)


In [25]:
from direct_regression import importance_from_coef
def construct_models():
    """Build a fresh list of candidate models on every call.

    Returns a one-element list holding a ``Model`` named ``'ridge'`` that wraps
    a default ``Ridge()`` estimator, searches ``alpha`` over ten log-spaced
    values from 1e-1 to 1e4, and uses ``importance_from_coef`` as its
    importance function.
    """
    alpha_grid = np.logspace(-1, 4, 10)
    ridge = Model(
        'ridge',
        Ridge(),
        parameters={'alpha': alpha_grid},
        importance_func=importance_from_coef,
    )
    return [ridge]

In [26]:
from direct_regression import print_unconditional_effects
# Print the raw, unadjusted summary (proportion treated, average outcome per arm
# and their difference) as a baseline for the model-based estimates.
print_unconditional_effects(data, treatment, y0, y1)

Proportion Treated:26%
Average outcome under Control:67.85±15.36
Average outcome under Treatment:310.57±29.39
Unadjusted treatment estimate 242.72


In [83]:
# Scratch check: two separately constructed Model instances placed in a numpy
# array are stored as an object-dtype array.
l = [0, 1, 2, 3, 4]
l2 = ['a', 'b', 'j', 'c', 'd']
l3 = [construct_models()[0] for copy_no in range(2)]

np.array(l3)

array([<reed.Model object at 0x7f4e147ea910>,
       <reed.Model object at 0x7f4e147ead30>], dtype=object)

In [27]:
from direct_regression import nested_cross_val
# Run nested cross-validation for the control (X0, y0) and treated (X1, y1) data.
# `construct_models` is passed uncalled, as a factory. The cache is bypassed
# (`cross_val_cache` is None and load_from_cache=False), so every run refits.
# `results` is consumed below as {model name: (control result, treated result)}.
# NOTE(review): innercv/outercv are presumably the fold counts of the inner
# hyper-parameter search and the outer evaluation loop — confirm in direct_regression.
models0, models1, results = nested_cross_val(
    construct_models,
    cross_val_cache,
    X0, X1, y0, y1,
    optimisation_metric,
    evaluation_metrics,
    innercv=5,
    outercv=10,
    load_from_cache=False
)

Fitting ridge ...Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting

In [55]:
from direct_regression import estimate_causal_effect
def compute_ate(results, X, evaluation_metrics):
    """Tabulate the estimated causal effect and test scores of every model.

    Parameters
    ----------
    results : mapping
        ``{model name: (control result, treated result)}``. Each result is
        indexed by ``'estimator'`` and by ``'test_<metric>'`` for every metric
        in ``evaluation_metrics``; the score entries must offer ``mean()`` and
        ``std()``.
    X : passed unchanged to ``estimate_causal_effect`` together with the
        control and treated estimators.
    evaluation_metrics : iterable of str
        Metric names whose ``test_<metric>`` scores are summarised.

    Returns
    -------
    pandas.DataFrame
        One row per model (indexed by model name) with columns ``ACE``,
        ``ACE_std`` and, per metric, ``control_<m>``, ``control_<m>_std``,
        ``treated_<m>``, ``treated_<m>_std``.
    """
    summary = {}
    for model_name, (contr_result, treat_result) in results.items():
        effects = estimate_causal_effect(
            X, contr_result['estimator'], treat_result['estimator']
        )
        stats = {'ACE': effects.mean(), 'ACE_std': effects.std()}

        arms = (('control', contr_result), ('treated', treat_result))
        for metric in evaluation_metrics:
            for arm, arm_result in arms:
                scores = arm_result[f'test_{metric}']
                stats[f"{arm}_{metric}"] = scores.mean()
                stats[f"{arm}_{metric}_std"] = scores.std()
        summary[model_name] = stats

    # Rows and index are passed separately so an empty `results` gives an empty frame.
    return pd.DataFrame(list(summary.values()), index=list(summary.keys()))

# Summarise every model's effect estimate and test scores; left as the last
# expression so the resulting frame is rendered as the cell output.
compute_ate(results,X,evaluation_metrics)

Unnamed: 0,ACE,ACE_std,control_r2,control_r2_std,treated_r2,treated_r2_std,control_neg_mean_squared_error,control_neg_mean_squared_error_std,treated_neg_mean_squared_error,treated_neg_mean_squared_error_std
ridge,39.59143,20.345715,0.206389,0.059814,0.139703,0.070269,-688768.403648,168647.264813,-963302.016527,320024.574429


In [58]:
# Empty-bodied loop used only for its leftover bindings: afterwards `model`,
# `results0` and `results1` refer to the last entry of `results` (nothing is
# bound when `results` is empty). Later cells inspect these names.
for model, (results0, results1) in results.items():
    pass

In [65]:
# Display the fitted search objects stored under 'estimator' in the first
# (control) result of the last model bound by the loop cell.
results0['estimator']

[GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
              estimator=Ridge(), n_jobs=-1,
              param_grid={'alpha': array([1.00000000e-01, 3.59381366e-01, 1.29154967e+00, 4.64158883e+00,
        1.66810054e+01, 5.99484250e+01, 2.15443469e+02, 7.74263683e+02,
        2.78255940e+03, 1.00000000e+04])},
              scoring='neg_mean_squared_error', verbose=1),
 GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
              estimator=Ridge(), n_jobs=-1,
              param_grid={'alpha': array([1.00000000e-01, 3.59381366e-01, 1.29154967e+00, 4.64158883e+00,
        1.66810054e+01, 5.99484250e+01, 2.15443469e+02, 7.74263683e+02,
        2.78255940e+03, 1.00000000e+04])},
              scoring='neg_mean_squared_error', verbose=1),
 GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
              estimator=Ridge(), n_jobs=-1,
              param_grid={'alpha': array([1.00000000e-01, 3.59381366e-01, 1.29154967e+00, 4.6415

In [None]:
def hyperparam_distributions(samples) -> tuple:
    """Collect the selected value of each hyper-parameter across fitted searches.

    Parameters
    ----------
    samples : mapping or iterable of mappings
        Either a mapping whose ``'estimator'`` entry is a sequence of fitted
        search objects (the layout of the per-arm results in this notebook),
        or an iterable of mappings that each hold one fitted search under
        ``'estimator'``. Every search must expose ``best_params_`` and
        ``param_grid``.

    Returns
    -------
    (distributions, bounds) : tuple
        ``distributions`` maps each hyper-parameter name to the list of best
        values, one per search. ``bounds`` maps each hyper-parameter name found
        in a search's ``param_grid`` to ``[lowest, highest]`` value of its
        search space over all searches.
    """
    # Local imports: the notebook's import cell provides neither name.
    from collections import defaultdict
    from collections.abc import Mapping

    if isinstance(samples, Mapping):
        # A result dict: iterating it directly would yield its string keys
        # ("string indices must be integers"), so read the estimators from it.
        searches = samples['estimator']
    else:
        searches = (sample['estimator'] for sample in samples)

    distributions = defaultdict(list)
    # Start from an empty interval so the first search space always widens it.
    bounds = defaultdict(lambda: [np.inf, -np.inf])
    for search in searches:
        grid = search.param_grid
        for key, value in search.best_params_.items():
            distributions[key].append(value)
            if key in grid:
                search_space = grid[key]
                bounds[key][0] = min(bounds[key][0], np.min(search_space))
                bounds[key][1] = max(bounds[key][1], np.max(search_space))
    return distributions, bounds

In [56]:
from direct_regression import plot_hyperparam_distributions
# Plot, per model, the hyper-parameter choices for the control and treated fits.
# NOTE(review): this cell is recorded as failing with
# "TypeError: string indices must be integers". results0/results1 are indexed by
# string keys ('estimator', 'test_<metric>') elsewhere in this notebook, so a helper
# that iterates them directly would receive those keys rather than per-sample
# records — presumably the cause; confirm against plot_hyperparam_distributions.
for model, (results0, results1) in results.items():
    plot_hyperparam_distributions(results0,f"{model}-control")
    plot_hyperparam_distributions(results1,f"{model}-treated")

TypeError: string indices must be integers

In [None]:
def nested_cv_fit_evaluate(self, X, y,
                           optimisation_metric,
                           evaluation_metrics,
                           inner_cv=None,
                           outer_cv=None):
    """Cross-validate the estimator built by ``self`` and return the results.

    Parameters
    ----------
    self : object providing ``setup_estimator(optimisation_metric, inner_cv)``
        and ``_setup_cv(outer_cv)``.
    X, y : data forwarded to ``cross_validate``.
    optimisation_metric : forwarded to ``self.setup_estimator``.
    evaluation_metrics : forwarded to ``cross_validate`` as ``scoring``.
    inner_cv : forwarded to ``self.setup_estimator``.
    outer_cv : converted by ``self._setup_cv`` and used as the ``cv`` of
        ``cross_validate``.

    Returns
    -------
    The value returned by ``cross_validate`` with ``return_estimator=True``,
    so the fitted estimators are included alongside the scores.
    """
    # Imported here because no cell of this notebook imports cross_validate;
    # without it, calling this function raises NameError.
    from sklearn.model_selection import cross_validate

    estimator = self.setup_estimator(optimisation_metric, inner_cv)
    outer_cv = self._setup_cv(outer_cv)

    return cross_validate(estimator, X=X, y=y, cv=outer_cv,
                          scoring=evaluation_metrics, return_estimator=True)

In [30]:
# Empty-bodied loop used only for its leftover bindings: afterwards `model_name`,
# `contr_result` and `treat_result` refer to the last entry of `results`
# (nothing is bound when `results` is empty).
for model_name, (contr_result, treat_result) in results.items():
    pass