# Comparing validation approaches

In [None]:
import numpy as np
import pandas as pd

from sklearn.linear_model import Ridge
from reed import Model

pd.options.display.max_columns = 200
pd.options.display.max_rows = 500
%matplotlib inline

%load_ext autoreload
%autoreload 2

In [None]:
# Notebook-wide configuration; the cells below read these names.
# Name of the outcome column in the loaded data (an earlier choice is kept in the trailing comment).
outcome = 'y_Dwsce'#'y_wsce'
# Name of the treatment column in the loaded data.
treatment = 'redufl'
# Scoring name passed to nested_cross_val as the optimisation metric.
optimisation_metric = 'neg_mean_squared_error'
# Scoring names passed to nested_cross_val; compute_ate reads the matching 'test_<metric>' entries.
evaluation_metrics = ('r2','neg_mean_squared_error')
# When True, the outcome column is log-transformed right after loading.
log_outcome=False
# CSV file read with pandas in the loading cell.
data_file = "all_vars.csv"
# Passed as the second positional argument to nested_cross_val; presumably a cache location — TODO confirm.
cross_val_cache = None
# Not referenced by any other visible cell.
bootstrap_cache = None

In [None]:
from reed import drop_missing_treatment_or_outcome
# Load the data, using the 'xwaveid' column as the row index.
data = pd.read_csv(data_file,index_col='xwaveid')
# The return value is discarded, so this presumably filters `data` in place — TODO confirm.
drop_missing_treatment_or_outcome(data, treatment, outcome)
if log_outcome:
    # NOTE(review): this adds the column minimum before taking the log. If the minimum
    # is zero or negative the result contains -inf/NaN; confirm whether a shift that
    # makes every value positive (e.g. subtracting the minimum and adding 1) was intended.
    data[outcome] = np.log(data[outcome]+data[outcome].min())

In [None]:
from direct_regression import seperate_and_transform_data
# Split the loaded data into the arrays used by the later cells. Judging by the names
# and the prints below, X0/y0 belong to the control group and X1/y1 to the treated
# group — TODO confirm against direct_regression.
X0, X1, y0, y1, X, y, t, features = seperate_and_transform_data(data, treatment, outcome)

# Quick sanity check of the two group sizes.
print("Control data dimensions: ",X0.shape)
print("Treated data dimensions:",X1.shape)

In [None]:
from direct_regression import importance_from_coef
def construct_models():
    """Return a freshly built list of the candidate models to evaluate.

    The list currently holds a single ``Model`` named ``'ridge'`` that wraps a
    default ``Ridge()`` estimator, searches ``alpha`` over 10 log-spaced values
    from 1e-1 to 1e4, and uses ``importance_from_coef`` as its importance
    function. New objects are created on every call.
    """
    alpha_grid = {'alpha': np.logspace(-1, 4, 10)}
    ridge_model = Model(
        'ridge',
        Ridge(),
        parameters=alpha_grid,
        importance_func=importance_from_coef,
    )
    return [ridge_model]

In [None]:
from direct_regression import print_unconditional_effects
# Print the unconditional effects from the data and the two outcome vectors
# (the helper lives in direct_regression; its output format is not visible here).
print_unconditional_effects(data, treatment, y0, y1)

In [None]:
# Scratch cell: builds two separately constructed Model objects and wraps them in a
# NumPy array; the last expression is shown as the cell output.
# NOTE(review): `l` and `l2` are not used by any other visible cell.
l = list(range(5))
l2 = list('abjcd')
l3 = [construct_models()[0],construct_models()[0]]

np.array(l3)

In [None]:
from direct_regression import nested_cross_val
# Run nested cross-validation on the (X0, y0) and (X1, y1) data for every model
# produced by construct_models. Later cells iterate `results` as a mapping of
# model name -> (results for the first group, results for the second group).
# innercv/outercv are presumably the inner and outer fold counts — TODO confirm.
models0, models1, results = nested_cross_val(
    construct_models,
    cross_val_cache,
    X0, X1, y0, y1,
    optimisation_metric,
    evaluation_metrics,
    innercv=5,
    outercv=10,
    load_from_cache=False
)

In [None]:
from direct_regression import estimate_causal_effect
def compute_ate(results, X, evaluation_metrics):
    """Summarise the estimated causal effect and test scores for each model.

    Parameters
    ----------
    results : mapping
        Model name -> ``(control_result, treated_result)``. Each result is
        indexed with ``'estimator'`` and with ``'test_<metric>'`` for every
        entry of ``evaluation_metrics``.
    X : passed unchanged to ``estimate_causal_effect``.
    evaluation_metrics : iterable of str
        Metric names whose ``'test_<metric>'`` scores are summarised.

    Returns
    -------
    pandas.DataFrame
        One row per model (indexed by model name) with the columns ``ACE`` and
        ``ACE_std`` (mean and std of the values returned by
        ``estimate_causal_effect``) followed by, for each metric,
        ``control_<metric>``, ``control_<metric>_std``, ``treated_<metric>``
        and ``treated_<metric>_std``.
    """
    arm_labels = ('control', 'treated')
    model_names = []
    summaries = []
    for model_name, (contr_result, treat_result) in results.items():
        tau = estimate_causal_effect(
            X, contr_result['estimator'], treat_result['estimator']
        )
        summary = {'ACE': tau.mean(), 'ACE_std': tau.std()}

        for metric in evaluation_metrics:
            score_key = f'test_{metric}'
            for arm, arm_result in zip(arm_labels, (contr_result, treat_result)):
                column = f"{arm}_{metric}"
                summary[column] = arm_result[score_key].mean()
                summary[f"{column}_std"] = arm_result[score_key].std()

        model_names.append(model_name)
        summaries.append(summary)
    return pd.DataFrame(summaries, index=model_names)

# Display the per-model summary table for the nested cross-validation results.
compute_ate(results,X,evaluation_metrics)

In [None]:
# The loop body is empty: its only effect is to leave `model`, `results0` and
# `results1` bound to the last entry of `results` for use in later cells.
for model, (results0, results1) in results.items():
    pass

In [None]:
# Display the 'estimator' entry of the `results0` left bound by the previous loop.
results0['estimator']

In [None]:
def hyperparam_distributions(samples) -> tuple:
    """Collect the selected hyper-parameter values and search bounds over the samples.

    Parameters
    ----------
    samples : iterable of mapping
        Each item provides ``sample['estimator']``, an object exposing
        ``best_params_`` (hyper-parameter name -> selected value) and
        ``param_grid`` (hyper-parameter name -> values that were searched).

    Returns
    -------
    distributions : defaultdict(list)
        Hyper-parameter name -> list of the selected values, one per sample in
        which that hyper-parameter appears in ``best_params_``.
    bounds : defaultdict
        Hyper-parameter name -> ``[lowest, highest]`` value found in the search
        grids across the samples. Only hyper-parameters that appear in a
        sample's ``param_grid`` get an entry.
    """
    # Imported locally so the function does not depend on a notebook-level import.
    from collections import defaultdict

    distributions = defaultdict(list)
    # Start from an "empty" interval so the first grid seen always replaces it.
    bounds = defaultdict(lambda: [np.inf, -np.inf])
    for sample in samples:
        search = sample['estimator']
        grid = search.param_grid
        for key, value in search.best_params_.items():
            distributions[key].append(value)
            if key in grid:
                search_space = grid[key]
                interval = bounds[key]
                interval[0] = min(interval[0], np.min(search_space))
                interval[1] = max(interval[1], np.max(search_space))
    return distributions, bounds

In [None]:
from direct_regression import plot_hyperparam_distributions
# For every model, plot the hyper-parameter distributions of both result sets,
# labelling the first "<model>-control" and the second "<model>-treated".
for model, (results0, results1) in results.items():
    plot_hyperparam_distributions(results0,f"{model}-control")
    plot_hyperparam_distributions(results1,f"{model}-treated")

In [None]:
def nested_cv_fit_evaluate(self, X, y,
                           optimisation_metric,
                           evaluation_metrics,
                           inner_cv=None,
                           outer_cv=None):
    """Cross-validate the estimator built by ``self.setup_estimator`` on ``(X, y)``.

    Parameters
    ----------
    X, y : training inputs and targets, forwarded to ``cross_validate``.
    optimisation_metric : forwarded to ``self.setup_estimator``.
    evaluation_metrics : forwarded to ``cross_validate`` as ``scoring``.
    inner_cv : forwarded to ``self.setup_estimator``; presumably the
        cross-validation setting for the inner hyper-parameter search —
        TODO confirm.
    outer_cv : converted with ``self._setup_cv`` and used as the ``cv`` of the
        outer ``cross_validate`` call.

    Returns
    -------
    dict
        The result of ``sklearn.model_selection.cross_validate``; because
        ``return_estimator=True`` it includes the fitted estimators under
        ``'estimator'``.
    """
    # Imported locally: no visible cell of this notebook imports cross_validate.
    from sklearn.model_selection import cross_validate

    estimator = self.setup_estimator(optimisation_metric, inner_cv)
    outer_cv = self._setup_cv(outer_cv)

    return cross_validate(estimator, X=X, y=y, cv=outer_cv,
                          scoring=evaluation_metrics, return_estimator=True)

In [None]:
# The loop body is empty: its only effect is to leave `model_name`, `contr_result`
# and `treat_result` bound to the last entry of `results` for interactive inspection.
for model_name, (contr_result, treat_result) in results.items():
    pass