In [None]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyreadstat
import re
import string
import sklearn
from sklearn_pandas import DataFrameMapper
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from skopt import BayesSearchCV
from tqdm import tqdm_notebook as tqdm
from reed import *
from cinspect import dependence, importance
from sklearn.model_selection import cross_val_score, cross_validate


# Global notebook options: raise the pandas display limits so wide/long frames
# are shown in full (up to 200 columns and 500 rows) instead of being truncated.
pd.options.display.max_columns = 200
pd.options.display.max_rows = 500
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Reload imported modules automatically before running each cell, so edits to
# local project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Bare expression recording the scikit-learn version in use; it is only echoed
# when it is the last expression of its cell.
sklearn.__version__

def drop_missing_treatment_or_outcome(df, treatment, outcome):
    """
    Remove, in place, every row of `df` whose treatment or outcome is null.

    Parameters
    ----------
    df: pd.DataFrame
        frame to filter; it is modified in place.
    treatment: str
        name of the treatment column.
    outcome: str
        name of the outcome column.

    Returns
    -------
    Index of dropped rows.
    """
    rows_before = len(df)
    # Collect the labels of rows with a null in either column; `union` merges
    # the two label sets so a row missing both is only listed once.
    no_treatment = df.index[df[treatment].isnull()]
    no_outcome = df.index[df[outcome].isnull()]
    drop = no_treatment.union(no_outcome)
    df.drop(index=drop, inplace=True)
    n_dropped = rows_before - len(df)
    print(f"Dropped {n_dropped} rows missing treatment or outcome.")
    return drop

def treatment_control_split(df, treatment):
    """
    Separate the control rows from the treated rows.

    Rows whose treatment value is neither 0 nor 1 appear in neither subset.

    Parameters
    ----------
    df: pd.DataFrame
        frame containing the treatment column.
    treatment: str
        name of the treatment indicator column.

    Returns
    --------
    control: pd.DataFrame
        subset of rows where treatment == 0

    treated: pd.DataFrame
        subset of rows where treatment == 1
    """
    assignment = df[treatment]
    by_arm = {arm: df[assignment == arm] for arm in (0, 1)}
    control, treated = by_arm[0], by_arm[1]
    print(f"Treated:{len(treated)}, Control:{len(control)}")
    return control, treated


# Load the data

### Treatment variables


   - **redhllt**, 
   - **redllt** 
   - **refllt** 
   - **reduhl**	Completed re-education based on highest level of attainment
   - **redudl**	Completed re-education based on detailed qualifications
   - **redufl**	Completed re-education using highest lvl and detailed qualifications.

### Outcome variables
   - Mental health in 2019 (**mh**). This is the transformed mental health scores from the aggregation of mental health items of the SF-36 Health Survey, as reported by the individual in 2019. It ranges from 0 to 100, with higher scores indicating better mental health.  
   - Working hours in 2019 (**wkhr**) records the total number of hours the individual works in all jobs in a week on average. Working hours are set to 0 for those not working. 
   - Hourly Wages in 2019 (**rlwage**) records the average hourly wage for the individual’s main job in 2019. Hourly wages are set to 0 for those not working and set to missing for those reporting working more than 100 hours a week. 

In [None]:
# Regex patterns for columns that must not be used as features. They are
# concatenated into `exclude`, which is matched against the column names when
# the feature list is built.
# NOTE(review): the markdown above lists a treatment called `redhllt`, while the
# pattern here is `rehllt` - confirm which spelling matches the data.
treatments = ['^reduhl$', '^rehllt$', '^redudl$', '^redufl$', '^redllt$', '^refllt$']
outcomes = ['^rlwage$', '^mh$', '^mhbm$', '^wkhr$']
other = [
    '^p_rcom',
    '^p_rdf',
    '^p_cotrl',
    '^xwaveid$',
    # The trailing comma matters: without it Python joins this literal with the
    # next one into the single pattern 'p_rcom18^aedcq', silently dropping the
    # '^aedcq' exclusion.
    'p_rcom18',  # ?
    '^aedcq',  # indicate studying at start - these people should already have been removed
    '^abnfsty',
    '^aedcqfpt',
    '^aedqstdy',
]
exclude = treatments + outcomes + other


# Analysis configuration: the outcome and treatment columns under study and the
# scoring metric used for hyper-parameter optimisation.
outcome = 'rlwage'
treatment = 'redudl'
optimisation_metric = 'neg_mean_squared_error'

In [None]:
# Load the metadata and the three candidate data sets, then drop (in place)
# every row that is missing the chosen treatment or outcome.
meta, basic, df, raw = load_all_data()
for d in [basic, df, raw]:  # the comma between `df` and `raw` was missing (SyntaxError)
    drop_missing_treatment_or_outcome(d, treatment, outcome)

## Data

## Response Model

How well can we predict outcomes $Y$ conditional on treatment $T$ and other covariates $Z$?
   - fit ML models on kitchen sink, Anna's set & basic set
   - fit basic LR on basic set

#### Columns explicitly excluded
   - **xwaveid** (unique identifier)
   - **p_rcom*** (timing of completion of re-education, proxies treatment) TODO think about how we would include this
   - **p_cotrl** (first avail 2003)
   - **p_rdf*** (first avail 2012)

### Set up models

In [None]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR

def construct_models():
    """
    Build the list of candidate response models.

    Each entry wraps an sklearn regressor in a `Model` together with the
    hyper-parameter values to search over (none for plain linear regression).

    Returns
    -------
    list of Model
        linear regression, ridge, lasso and gradient boosted trees, in that order.
    """
    ridge_grid = {'alpha': np.logspace(-1, 3, 20)}
    lasso_grid = {'alpha': np.logspace(-2, 5, 40)}
    gbr_grid = {
        'max_features': [10, 20, 40, 60, 80],
        'learning_rate': np.logspace(-6, -1, 10),
        # log-spaced leaf sizes from 1 to 1000, truncated to integers
        'min_samples_leaf': np.logspace(0, 3, 10).astype(int),
    }
    boosted_trees = GradientBoostingRegressor(n_iter_no_change=20, max_depth=2)

    return [
        Model('linear-regression', LinearRegression()),
        Model('ridge', Ridge(), parameters=ridge_grid),
        Model('lasso', Lasso(), parameters=lasso_grid),
        Model('gbr', boosted_trees, parameters=gbr_grid),
    ]

### Fit models and visualise performance

In [None]:
def split_and_transform(data, features, outcome, pipeline):
    """
    Extract the feature matrix and outcome from `data` and transform the features.

    Parameters
    ----------
    data: pd.DataFrame
        frame holding both the feature columns and the outcome column.
    features: list of str
        names of the columns to use as features.
    outcome: str
        name of the outcome column.
    pipeline: object with a `fit_transform(X)` method
        transformer that is fitted on, and applied to, the selected features.

    Returns
    -------
    X:
        the transformed features, as returned by `pipeline.fit_transform`.
    y: pd.Series
        the untransformed outcome column.

    Raises
    ------
    AssertionError
        if the transform changes the number of rows or columns.
    """
    X = data[features]
    n, m = X.shape
    y = data[outcome]
    # Use the pipeline that was passed in; the previous code ignored the
    # argument and relied on a global named `transform` being defined.
    X = pipeline.fit_transform(X)
    assert X.shape == (n, m), f"Transform changed data dimensions: {(n,m)} -> {X.shape}"
    return X, y

In [None]:
# Metrics reported for every model on the held-out folds.
evaluation_metrics = ['r2', 'neg_mean_squared_error']

# Pre-processing applied to the features: impute missing values, then standardise.
transform = Pipeline([
    ('impute_missing', SimpleImputer()),
    ('scale', StandardScaler()),
])

data = raw


control, treated = treatment_control_split(data, treatment)
features = regex_select(data.columns, exclude, exclude=True)

# Fit the imputer/scaler ONCE on all rows and only then split into arms.
# Fitting it separately per arm (as before) gives the control and treated
# matrices different imputation values and scaling, yet both matrices are later
# stacked and passed to both the control and the treated models.
# NOTE(review): the transform is still fitted outside the cross-validation
# folds; move it inside the estimator if that leakage matters.
X_all, y_all = split_and_transform(data, features, outcome, transform)
control_rows = (data[treatment] == 0).to_numpy()
treated_rows = (data[treatment] == 1).to_numpy()
X0, y0 = X_all[control_rows], y_all[control_rows]
X1, y1 = X_all[treated_rows], y_all[treated_rows]

# Fit every candidate model separately on the control and treated rows and keep
# both sets of results, keyed by model name as (control, treated).
models = construct_models()
results = {}
for model in models:
    print(f"Fitting {model.name} ...",end='')
    results0 = model.nested_cv_fit_evaluate(X0,y0,optimisation_metric,evaluation_metrics)
    results1 = model.nested_cv_fit_evaluate(X1,y1,optimisation_metric,evaluation_metrics)
    results[model.name] = (results0, results1)
    print("Done")

In [None]:
import pickle

# Persist the per-model (control, treated) results so they can be reloaded
# without re-running the fits.
with open('nested_cv_results.pkl', 'wb') as results_file:
    pickle.dump(results, results_file)

## Visualise and Report results

  - Mean and Std of prediction performance for each model (both treatment & control surface)
  - Mean and Std of average treatment effect for each model
  - Features responsible for treatment effect heterogeneity & functional form (with uncertainty)
      - coefficients for linear models
      - partial dependence curves for non-linear models

In [None]:
def estimate_causal_effect(X, models0, models1):
    """
    Estimate the average treatment effect once per pair of fitted models.

    Parameters
    ----------
    X: array-like
        feature matrix passed to every model's `predict`.
    models0: iterable
        fitted models for the control surface.
    models1: iterable
        fitted models for the treated surface, paired with `models0` by position.

    Returns
    -------
    np.ndarray
        one value per model pair: the mean over the rows of `X` of
        (treated prediction - control prediction).
    """
    def _individual_effects(control_model, treated_model):
        # Predict the control surface first, then the treated one.
        baseline = control_model.predict(X)
        return treated_model.predict(X) - baseline

    effects = np.array([
        _individual_effects(control_model, treated_model)
        for control_model, treated_model in zip(models0, models1)
    ])
    # Average over the rows of X (axis 1), leaving one estimate per model pair.
    return effects.mean(axis=1)

In [None]:
invalid = 1000 # threshold to avoid displaying results that failed to converge entirely

# Evaluate the treatment effect on all rows: control rows stacked above treated rows.
X = np.vstack((X0,X1))

# One summary row per model: the average causal effect (mean and std over the
# fitted estimators) followed by the mean and std of every evaluation metric on
# the control and treated surfaces.
rows = []
index = []
for model_name, r in results.items():
    control_result, treated_result = r[0], r[1]
    tau = estimate_causal_effect(X, control_result['estimator'], treated_result['estimator'])
    row = {'ACE': tau.mean(), 'ACE_std': tau.std()}
    for m in evaluation_metrics:
        for name, result in (('control', control_result), ('treated', treated_result)):
            scores = result[f'test_{m}']
            row[f"{name}_{m}"] = scores.mean()
            row[f"{name}_{m}_std"] = scores.std()
    rows.append(row)
    index.append(model_name)

metrics = pd.DataFrame(rows, index=index)
# Blank out any entry whose magnitude exceeds the threshold.
too_large = metrics.abs() > invalid
metrics[too_large] = np.nan

In [None]:
# Show the summary table with floats formatted to two decimals and thousands
# separators; the option only applies inside this `with` block.
with pd.option_context('display.float_format', '{:,.2f}'.format):
    display(metrics)