In [None]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyreadstat
import re
import string
import sklearn
from sklearn_pandas import DataFrameMapper
from skopt import BayesSearchCV
from tqdm import tqdm_notebook as tqdm
from reed import *
from cinspect import dependence, importance

# set global notebook options
pd.options.display.max_columns = 200
pd.options.display.max_rows = 500
%matplotlib inline

%load_ext autoreload
%autoreload 2

sklearn.__version__

# Load the data

### Treatment variables


   - **redhllt**, ?
   - **redllt** ?
   - **refllt** ?
   - **reduhl**	Completed re-education based on highest level of attainment
   - **redudl**	Completed re-education based on detailed qualifications
   - **redufl**	Completed re-education using highest lvl and detailed qualifications.

### Outcome variables
   - Mental health in 2019 (**mh**). This is the transformed mental health scores from the aggregation of mental health items of the SF-36 Health Survey, as reported by the individual in 2019. It ranges from 0 to 100, with higher scores indicating better mental health.  
   - Working hours in 2019 (**wkhr**) records the total number of hours the individual works in all jobs in a week on average. Working hours are set to 0 for those not working. 
   - Hourly Wages in 2019 (**rlwage**) records the average hourly wage for the individual’s main job in 2019. Hourly wages are set to 0 for those not working and set to missing for those reporting working more than 100 hours a week. 

In [None]:
# Candidate treatment indicators and outcome columns; both lists are passed to
# select_features further down.
# NOTE(review): the markdown above lists 'redhllt' but this list has 'rehllt' —
# confirm which spelling the data actually uses.
treatments = ['reduhl', 'rehllt', 'redudl', 'redufl', 'redllt', 'refllt']
outcomes = ['rlwage', 'mh', 'mhbm', 'wkhr']
# The single outcome / treatment pair analysed in this run.
outcome = 'rlwage'
treatment = 'redudl'
# Scoring name handed to fit_models for hyper-parameter selection.
optimisation_metric = 'neg_mean_squared_error'

### Prepare data

In [None]:
# Load the metadata and the three feature sets (helpers presumably come from
# the `reed` star-import — they are not defined in this notebook).
meta, basic, df, raw = load_all_data()
# Returns overall train/test indices plus per-group (suffix 0 / 1) indices.
# test_size=0 here: presumably every complete row lands in the training split,
# because the causal models below are fitted on all the data — TODO confirm.
train_indx, test_indx, train_indx0, test_indx0, train_indx1, test_indx1 = drop_missing_and_split(
    [basic,df,raw],
    outcome=outcome,
    treatment=treatment,
    test_size=0 
)

features = select_features(df,treatments,outcomes,outcome)

In [None]:
# Keep only the training-side features, outcome and treatment; the remaining
# four return values are discarded.
X_train, _, y_train, _,t_train,_,_ = prepare_data(df,features,outcome,treatment,train_indx,test_indx)

## Causal Model

In [None]:
from econml.dml import LinearDML, SparseLinearDML
# Fit a linear double-ML model with mc_iters=10. All covariates are passed as
# controls (W); the heterogeneity argument X is deliberately left commented out.
# NOTE(review): SparseLinearDML is imported but not used in the visible cells.
model = LinearDML(mc_iters=10).fit(
    Y = y_train,
    T = t_train,
    W = X_train,
    #X = X_train
)

In [None]:
model.ate_inference()

In [None]:
from econml.metalearners import XLearner
from sklearn.linear_model import Ridge, LogisticRegression
# X-learner using ridge outcome models and a logistic-regression propensity model.
xm = XLearner(models = Ridge(),propensity_model=LogisticRegression(max_iter=1000))
# Fit with bootstrap-based inference requested.
xm.fit(Y=y_train,T=t_train,X=X_train,inference='bootstrap')

In [None]:
xm.ate_interval(X=X_train,T0=0,T1=1)

In [None]:
# NOTE(review): a later cell defines its own class named `TLearner`, which
# shadows this econml import; re-running this cell after that one will fail.
from econml.metalearners import SLearner,TLearner
# T-learner with ridge outcome models, fitted with bootstrap inference.
tm = TLearner(models=Ridge())
tm.fit(Y=y_train,T=t_train, X=X_train,inference='bootstrap')
# Interval for the average effect of moving treatment from 0 to 1.
tm.ate_interval(X=X_train,T0=0,T1=1)

In [None]:
# S-learner: a single ridge model, fitted with bootstrap inference.
sm = SLearner(overall_model=Ridge())
sm.fit(Y=y_train,T=t_train,X=X_train,inference='bootstrap')
# Interval for the average effect of moving treatment from 0 to 1.
sm.ate_interval(X=X_train,T0=0,T1=1)

## Response Model

How well can we predict outcomes $Y$ conditional on treatment $T$ and other covariates $Z$?
   - fit ML models on kitchen sink, Anna's set & basic set
   - fit basic LR on basic set

In [None]:
# NOTE(review): this cell repeats the configuration cell near the top of the
# notebook verbatim; keep the two in sync or remove one.
treatments = ['reduhl', 'rehllt', 'redudl', 'redufl', 'redllt', 'refllt']
outcomes = ['rlwage', 'mh', 'mhbm', 'wkhr']
outcome = 'rlwage'
treatment = 'redudl'
optimisation_metric = 'neg_mean_squared_error'

In [None]:
# NOTE(review): train_test_split is imported but not called in the visible cells.
from sklearn.model_selection import train_test_split

# Reload the data; this rebinds meta, basic, df and raw from the earlier cell.
meta, basic, df, raw = load_all_data()

# Re-split, this time holding out a test fraction of 0.33. This overwrites the
# index variables produced by the earlier split.
train_indx, test_indx, train_indx0, test_indx0, train_indx1, test_indx1 = drop_missing_and_split(
    [basic,df,raw],
    outcome=outcome,
    treatment=treatment,
    test_size=.33 
)

#### Columns explicitly excluded
   - **xwaveid** (unique identifier)
   - **p_rcom*** (timing of completion of re-education, proxies treatment) TODO think about how we would include this
   - **p_cotrl** (first avail 2003)
   - **p_rdf*** (first avail 2012)

### Set up models

In [None]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR

def construct_models():
    """Build a fresh list of unfitted candidate outcome regressors.

    Returns
    -------
    list
        ``Model`` wrappers in the order linear-regression, ridge, lasso, gbr.
        Every wrapper except plain linear regression carries a hyper-parameter
        grid under ``parameters``.
    """
    # Hyper-parameter grids, one per tunable estimator.
    ridge_grid = {'alpha': np.logspace(-1, 3, 20)}
    lasso_grid = {'alpha': np.logspace(-2, 5, 40)}
    gbr_grid = {
        'max_features': [10, 20, 40, 60, 80],
        'learning_rate': np.logspace(-6, -1, 10),
        'min_samples_leaf': np.logspace(0, 3, 10).astype(int),
    }

    # Shallow boosted trees with early stopping after 20 stagnant iterations.
    boosted_trees = GradientBoostingRegressor(n_iter_no_change=20, max_depth=2)

    return [
        Model('linear-regression', LinearRegression()),
        Model('ridge', Ridge(), parameters=ridge_grid),
        Model('lasso', Lasso(), parameters=lasso_grid),
        Model('gbr', boosted_trees, parameters=gbr_grid),
    ]

### Fit models and visualise performance

#### Large feature set

This is a set of features selected by Anna as the broad set that may be relevant. Some variables have been one-hot encoded. 

In [None]:
features_l = select_features(df,treatments,outcomes,outcome)

In [None]:
# Prepare the large feature set separately for each treatment group, using the
# per-group indices (suffix 0 / 1; presumably 0 = untreated — TODO confirm).
Xl_train0, Xl_test0, yl_train0, yl_test0, t_train0, t_test0,_ = prepare_data(df,features_l,outcome,treatment,train_indx0,test_indx0)
Xl_train1, Xl_test1, yl_train1, yl_test1, t_train1,t_test1,_ = prepare_data(df,features_l,outcome,treatment,train_indx1,test_indx1)
# Stack every row (train and test, both groups) in matching order for X and y.
Xl = np.vstack((Xl_train0,Xl_train1,Xl_test0,Xl_test1))
yl = np.concatenate((yl_train0,yl_train1, yl_test0,yl_test1))

In [None]:
# Fit one independent set of candidate models per treatment group; each call to
# construct_models() returns new, unfitted wrappers.
models_l0 = construct_models()
fit_models(models_l0,optimisation_metric,Xl_train0,yl_train0)
models_l1 = construct_models()
fit_models(models_l1,optimisation_metric,Xl_train1,yl_train1)

In [None]:
visualise_regression_performance(models_l0,Xl_test0,yl_test0)
visualise_regression_performance(models_l1,Xl_test1,yl_test1)

In [None]:
# Index 1 is the 'ridge' entry of construct_models(); take its best estimator
# for group 0.
# NOTE(review): this rebinds `model`, which earlier held the fitted LinearDML.
model = models_l0[1].fit_estimator.best_estimator_
coef = pd.DataFrame({"feature":features_l,"coef":model.coef_})
# Ten largest coefficients.
coef.sort_values('coef',ascending=False).head(10)

In [None]:
f_indx = features_l.index("p_wh01")
f_indx

In [None]:
# NOTE(review): the column index 117 is hard-coded although `f_indx` was computed
# in the previous cell — confirm they refer to the same feature. Also note that
# `model` was fitted on Xl_train0 but is evaluated on Xl_train1 here.
dependence.individual_conditional_expectation(model, Xl_train1, 117,20)

In [None]:
Xl_train0.shape

In [None]:
plt.hist(Xl_train0[:,111])

In [None]:
Xl_train0.min(),Xl_train0.max()

In [None]:
model

In [None]:
from cinspect.dependence import PartialDependencePlot
        

In [None]:
# if categorical, 

In [None]:
# Partial dependence of `model` on feature column 111 (labelled 'hours worked'),
# with a histogram of the feature values.
pdp = PartialDependencePlot("pd")
pdp.add_dependence(model, Xl_train0, 111, 'hours worked',density='hist')
pdp.plot();

In [None]:
values = Xl_train0[:,117]

In [None]:
np.quantile(values, np.array([.01,.99]))

In [None]:
np.unique(values) # only two values?

In [None]:
plt.hist(Xl_train0[:,117])

In [None]:
grid, predictions, color, name = pdp.curves[0]

In [None]:
# now we may want to compute coefficients and partial dependence etc with respect to variables pre-scaling

In [None]:
n_samples = 100
total = len(predictions)



In [None]:
# Draw row indices without replacement. The size is capped at `total` because
# np.random.choice raises ValueError when asked for more unique items than exist.
sample = np.random.choice(np.arange(total),size=min(n_samples,total),replace=False)

In [None]:
fig, ax = plt.subplots(figsize=(10,8))
# Mean curve over all rows, overlaid with the sampled individual curves.
ax.plot(grid,predictions.mean(axis=0),color='black')
ax.plot(grid,predictions[sample,:].T, color='black',alpha=0.1,lw=1);


In [None]:
 if color is None:
        color = "black"

    if ax is None:
        fig, ax = plt.subplots()

    if sample is not None:
        ax.plot(grid_values, predictions[sample].T, alpha=0.1, color="grey")
        if density:
            for x in X[sample, feature_indx]:
                ax.axvline(x, color="grey", ymin=0, ymax=0.03, alpha=0.2)

    ax.plot(grid_values, predictions.mean(axis=0), color=color, label=label)

    ax.set_ylabel("prediction")
    ax.set_xlabel(feature_name)
    if title is None:
        ax.set_title("ICE & Partial Dependence for {}".format(feature_name))
    else:
        ax.set_title(title)


In [None]:
# need to write a version of feature importance that looks just at how much the outcome changes with respect to input

class TLearner:
    """Treatment-effect estimator built from two separately fitted models.

    ``model0`` and ``model1`` are wrappers exposing ``fit_estimator.predict``
    (presumably fitted on the untreated and treated groups respectively —
    verify against the caller).

    NOTE(review): this class shares its name with ``econml.metalearners.TLearner``
    imported in an earlier cell and replaces it in the notebook namespace.
    """

    def __init__(self,name,model0,model1):
        self.name = name
        self.model0 = model0
        self.model1 = model1

    def y0(self,X):
        """Predict outcomes for ``X`` with ``model0``."""
        estimator = self.model0.fit_estimator
        return estimator.predict(X)

    def y1(self,X):
        """Predict outcomes for ``X`` with ``model1``."""
        estimator = self.model1.fit_estimator
        return estimator.predict(X)

    def tau(self,X):
        """Per-row effect estimate: ``y1(X) - y0(X)``."""
        with_model1 = self.y1(X)
        with_model0 = self.y0(X)
        return with_model1 - with_model0

    def ate(self,X):
        """Average of ``tau(X)`` over all rows."""
        return np.mean(self.tau(X))
    

def visualise_causal_estimation(models0,models1,X):
    """Pair up fitted models, print each pair's ATE and plot its predictions.

    ``models0`` and ``models1`` are walked in parallel; each pair becomes a
    ``TLearner`` named after the ``models0`` entry. For every pair the ATE over
    ``X`` is printed and a two-panel figure is drawn: a scatter of y0 against
    y1, and overlaid histograms of the two.

    Returns
    -------
    dict
        Maps model name to its ``TLearner``.
    """
    estimators = {}
    for first, second in zip(models0, models1):
        learner = TLearner(first.name, first, second)
        estimators[first.name] = learner

        ate = learner.ate(X)
        print(f"{learner.name}:ATE={ate:.2f}")

        y0 = learner.y0(X)
        y1 = learner.y1(X)

        fig, (scatter_ax, hist_ax) = plt.subplots(1, 2, figsize=(15, 5))

        # Left panel: predicted y0 against predicted y1.
        scatter_ax.set_title(learner.name)
        scatter_ax.scatter(y0, y1, alpha=0.1)
        scatter_ax.set_xlabel('y0')
        scatter_ax.set_ylabel('y1')

        # Right panel: the two prediction distributions overlaid.
        for values, tag in ((y0, "y0"), (y1, "y1")):
            hist_ax.hist(values, alpha=0.5, label=tag)
        hist_ax.legend(loc="upper left")
    return estimators
        
# feature importance ...
# how much does changing X change tau?

from sklearn.metrics import mean_squared_error

def permutation_importance(X,func,metric,repeat=5,random_state=None):
    """Compute the extent to which the function depends on each column of X.

    Each column is shuffled in turn on a copy of ``X`` (``X`` itself is never
    modified), ``func`` is re-evaluated after every shuffle, and ``metric``
    compares the un-permuted output with the permuted outputs.

    Parameters
    ----------
    X : 2-D numpy array
        Input matrix; must support ``.copy()`` and ``X[:, c]`` column views.
    func : callable
        Maps a matrix shaped like ``X`` to a 1-D array of outputs.
    metric : callable
        ``metric(baseline, permuted)`` returning a number, e.g. mean squared error.
    repeat : int
        Number of successive shuffles per column.
    random_state : int or None
        ``None`` (the default) shuffles with the global ``np.random`` state,
        as before. An integer seeds a private generator so that repeated calls
        give identical results.

    Returns
    -------
    list
        One metric value per column of ``X``, in column order.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Baseline output, repeated so it lines up with the concatenated permuted runs.
    baseline = np.tile(func(X), repeat)

    change = []
    for c in range(X.shape[1]):
        X0 = X.copy()  # fresh copy per column so only column c is ever permuted
        permuted = []
        for _ in range(repeat):
            rng.shuffle(X0[:, c])  # shuffles in place through the column view
            permuted.append(func(X0))
        change.append(metric(baseline, np.concatenate(permuted)))
    return change
    

In [None]:
causal_estimators = visualise_causal_estimation(models_l0,models_l1,Xl)

In [None]:
yl_train1.mean() - yl_train0.mean() # unadjusted, hmm

In [None]:
# visualise individual or partial dependence curves (for the difference and for each regression model side separately)

In [None]:
# feature importance? on tau. 
pi = permutation_importance(Xl, causal_estimators['gbr'].tau, mean_squared_error)
fi = pd.DataFrame({'feature':features_l,'importance':pi}).sort_values(by='importance',ascending=False)
fi.head(20)

In [None]:
for k in fi.head(10)['feature']:
    print(k, meta.column_names_to_labels.get(k))

In [None]:
## we filtered to those who got re-educated in 2002-2017 (in current data, those who completed in 2018)
## dummy variable for those who got re-educated in 2018 or 2019 (drop those people)
## 25th November presentation deadline, returning to education

#### Basic feature set


In [None]:
# Split the basic feature set by treatment value.
basic0 = basic[basic[treatment]==0]
basic1 = basic[basic[treatment]==1]
features_b = select_features(basic,treatments,outcomes,outcome)
# NOTE(review): both calls store their last return value in `tr0`, so the first
# is overwritten; they also rebind t_train0/t_test0/t_train1/t_test1 from the
# large-feature-set cell.
Xb_train0, Xb_test0, yb_train0, yb_test0, t_train0,t_test0,tr0 = prepare_data(basic0,features_b,outcome,treatment,train_indx0,test_indx0)
Xb_train1, Xb_test1, yb_train1, yb_test1, t_train1,t_test1,tr0 = prepare_data(basic1,features_b,outcome,treatment,train_indx1,test_indx1)
# Stack every row (train and test, both groups) in matching order for X and y.
Xb = np.vstack((Xb_train0,Xb_train1,Xb_test0,Xb_test1))
yb = np.concatenate((yb_train0,yb_train1, yb_test0,yb_test1))

In [None]:
# Fit one independent set of candidate models per treatment group on the
# basic feature set.
models_b0 = construct_models()
fit_models(models_b0,optimisation_metric,Xb_train0,yb_train0)
models_b1 = construct_models()
fit_models(models_b1,optimisation_metric,Xb_train1,yb_train1)


In [None]:
visualise_regression_performance(models_b0,Xb_test0,yb_test0)
visualise_regression_performance(models_b1,Xb_test1,yb_test1)

In [None]:
visualise_causal_estimation(models_b0,models_b1,Xb)

In [None]:
# `causal_estimator` only ever existed as a local variable inside
# visualise_causal_estimation, so referencing it here raised NameError.
# Build the basic-set T-learners explicitly; 'gbr' matches the model used for
# the large feature set.
causal_estimators_b = {
    m0.name: TLearner(m0.name, m0, m1) for m0, m1 in zip(models_b0, models_b1)
}
pi = permutation_importance(Xb, causal_estimators_b['gbr'].tau, mean_squared_error)
pd.DataFrame({'feature':features_b,'importance':pi}).sort_values(by='importance',ascending=False)

#### Raw feature set

This feature set contains every variable observed in 2001, with very little filtering or pre-processing. The minimal preprocessing includes:
   - removing variables that are more than 95% missing
   - merging variables that are almost perfectly correlated (> .95) 
   - removing variables with 0 variance
   - changing dates to days past an epoch

In [None]:
# NOTE(review): this cell calls load_data, select_features(df), a four-value
# prepare_data and visualise_performance, whereas the cells above use
# load_all_data, select_features(df, treatments, outcomes, outcome), a
# seven-value prepare_data and visualise_regression_performance. It looks like
# it was written against an older helper API — confirm before running.
# It also rebinds `df` and `meta`.
models_raw = construct_models()
treatment = 'redudl' #reduhl, #refllt
df,meta = load_data('raw',treatments,outcomes)
features_r = select_features(df)
Xr_train, Xr_test, yr_train, yr_test = prepare_data(df, features_r, treatment,train_indx,test_indx)
fit_models(models_raw,optimisation_metric,Xr_train,yr_train)
visualise_performance(models_raw,Xr_test,yr_test)

In [None]:
importances_r = extract_importance(models_raw,Xr_test,yr_test,features_r)

In [None]:
visualise_importance_distribution(importances_r)

In [None]:
# NOTE(review): same call shapes as the raw-feature-set cell, which differ from
# the helper signatures used earlier in the notebook — confirm before running.
# This cell rebinds `df`, `features_l`, and the Xl_*/yl_* names used above.
models_large = construct_models()
treatment = 'redudl' #reduhl, #refllt
df,meta_l = load_data('anna',treatments,outcomes)
features_l = select_features(df)
Xl_train, Xl_test, yl_train, yl_test = prepare_data(df, features_l, treatment,train_indx,test_indx)
fit_models(models_large,optimisation_metric,Xl_train,yl_train)
visualise_performance(models_large,Xl_test,yl_test)

In [None]:
importances_l = extract_importance(models_large,Xl_test,yl_test,features_l)

In [None]:
visualise_importance_distribution(importances_l)

##### Features ranked by permutation importance

In [None]:
# Attach a human-readable label to each feature (empty string when none exists).
column_labels = meta_l.column_names_to_labels
importances_l['label'] = [ column_labels.get(name,"") for name in importances_l.index]
# NOTE(review): construct_models() names its ridge model 'ridge', so confirm
# that extract_importance really produces a 'permutation-lr-ridge' column.
importances_l.sort_values('permutation-lr-ridge',ascending=False).head(20)

#### Minimal feature set
This is the minimal set of features used in the original paper. It consists of four variables (sex, age, education, employment). Each is one-hot encoded, and interactions are added between sex and the other variables.

In [None]:
# Only the first candidate (plain linear regression) is used for this set.
# NOTE(review): same call shapes as the raw-feature-set cell, which differ from
# the helper signatures used earlier in the notebook — confirm before running.
# This cell rebinds `df`, `meta`, `features_b`, and the Xb_*/yb_* names.
models_basic = [construct_models()[0]]
treatment = 'redudl' #reduhl, #refllt
df,meta = load_data('basic',treatments,outcomes)
features_b = select_features(df) 
Xb_train, Xb_test, yb_train, yb_test = prepare_data(df, features_b, treatment,train_indx,test_indx)
fit_models(models_basic,optimisation_metric,Xb_train,yb_train)
visualise_performance(models_basic,Xb_test,yb_test)

In [None]:
importances_b = extract_importance(models_basic,Xb_test,yb_test,features_b)

In [None]:
visualise_importance_distribution(importances_b)

In [None]:
importances_b.sort_values('permutation-lr',ascending=False).head(10)

In [None]:
## TODO add permutation curve for age

# Causal Models

## Direct Regression

Predict the outcome $Y$ based on pre-treatment variables and the treatment variable.

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

In [None]:
outcome = 'wkhr'

# Impute missing values (SimpleImputer defaults), then standardise every feature.
transform = Pipeline([
    ('impute_missing', SimpleImputer()),
    ('scale', StandardScaler())
])

# NOTE(review): `df` and `features` are whatever earlier cells last bound them
# to — confirm they still refer to the same feature set at this point.
XD = transform.fit_transform(df[features])
yD = df[outcome]

# Fixed: the original asserted on `y`, which is not defined in this notebook.
assert np.ndim(yD)==1

# Drop rows whose outcome is missing.
valid = ~np.isnan(yD)
XD = XD[valid,:]
yD = yD[valid]

models = [
    Model('gbc',GradientBoostingRegressor(n_iter_no_change=20, max_depth=2),
          parameters = {
            'max_features':[10,20,40,60,80],
            'learning_rate':np.logspace(-6,-1,10),
            'min_samples_leaf':np.logspace(0,3,10).astype(int)
          }
    ),
    # Fixed: LinearRegression has no `C` hyper-parameter, so searching over it
    # fails with an invalid-parameter error. An empty grid fits the estimator
    # once with its defaults.
    Model('lr',LinearRegression(),
          parameters = {}
    )
]

inner_cv = KFold(n_splits=5)
outer_cv = KFold(n_splits=3)

In [None]:
optimisation_metric = 'neg_mean_squared_error'
# Removed `fit_models = []`: the list was never used, and the assignment
# replaced the imported `fit_models` helper, breaking every earlier cell that
# calls it if re-run afterwards.
for model in models:
    # Exhaustive grid search per model; the fitted search object (refit on the
    # best parameters) is stored back on the wrapper.
    search = GridSearchCV(
        estimator=model.estimator, param_grid=model.parameters, verbose=2,
        n_jobs=-1, scoring = optimisation_metric, cv = inner_cv, refit=True
    )
    search.fit(XD,yD)
    model.fit_estimator = search

In [None]:
# TODO - add calibration curves
# Fixed: the fitted models are regressors, which have no predict_proba, and the
# original referred to undefined `X` / `y`. Use the regression visualiser that
# the rest of the notebook uses.
# NOTE(review): XD / yD are the data the models were fitted on, so this shows
# in-sample performance — hold out a test split for an honest estimate.
visualise_regression_performance(models, XD, yD)

# Additional code

### Fit a single model

In [None]:
# NOTE(review): `X` is not defined anywhere in the visible notebook, and this
# section uses a classifier scored by ROC AUC — it looks carried over from a
# propensity-model notebook. Confirm inputs before running.
sqrt_features = np.sqrt(X.shape[1])

# Search space in the tuple form used by the commented-out BayesSearchCV below.
gbc_params = {
     #'min_samples_split': (1e-3, .2, 'log-uniform'),
     'max_features': (int(sqrt_features/2),int(sqrt_features*5)),
     'learning_rate':(0.00001,0.1,'log-uniform'),
     'min_samples_leaf':(1,2,4,8,16,32,64,128,256,512,1024)
}

# Explicit grid actually used by the GridSearchCV below.
gbc_param_grid = {
    'max_features':[10,20,40,60,80],
    'learning_rate':np.logspace(-6,-1,10),
    'min_samples_leaf':np.logspace(0,3,10).astype(int)
}

search = GridSearchCV(estimator=GradientBoostingClassifier(n_iter_no_change=20, max_depth=2),verbose=2,param_grid=gbc_param_grid,n_jobs=-1,scoring='roc_auc')

# ncalls = 300
# search = BayesSearchCV(
#     estimator=GradientBoostingClassifier(n_iter_no_change=20, max_depth=2),search_spaces = gbc_params, cv=inner_cv,n_iter=ncalls,
#     scoring = 'roc_auc',
#     n_jobs = 10
# )

In [None]:
search.fit(X,y)
#search.fit(X,y,callback=tqdm_skopt(total=ncalls, desc="Searching Hyperparams"))

In [None]:
results = pd.DataFrame(search.cv_results_)

In [None]:
from sklearn.neighbors import KNeighborsRegressor

def plot_knn_surface(results,param, ax):
    """Draw a k-nearest-neighbours smoothed curve of mean test score against one parameter.

    Parameters
    ----------
    results : DataFrame
        Must contain ``"param_<param>"`` and ``"mean_test_score"`` columns.
    param : str
        Parameter name without the ``"param_"`` prefix.
    ax : matplotlib Axes
        Axes the smoothed curve is drawn on.
    """
    column = "param_"+param
    observed = results[column].values.reshape(-1,1)
    scores = results["mean_test_score"]

    # Choose the number of neighbours by cross-validation; candidates run from
    # 2 up to (but excluding) a tenth of the number of rows.
    neighbour_grid = {"n_neighbors":range(2,int(len(scores)/10))}
    smoother = GridSearchCV(KNeighborsRegressor(),param_grid=neighbour_grid)
    smoother.fit(observed,scores)

    # Evaluate the smoother on 100 evenly spaced points across the observed range.
    lowest, highest = results[column].min(), results[column].max()
    query = np.linspace(lowest,highest,100).reshape(-1,1)
    ax.plot(query.ravel(),smoother.predict(query))

# Fixed: `nparams` was never defined. It is the number of searched parameters.
nparams = len(gbc_params)
# squeeze=False keeps `ax` two-dimensional even when there is a single parameter,
# so ax[row, col] indexing below always works.
fig, ax = plt.subplots(2,nparams,figsize=(5*nparams,12),squeeze=False)
for i, pname in enumerate(gbc_params.keys()):
    values = results[f"param_{pname}"]
    
    # Top row: raw scatter of score against parameter value.
    ax[0,i].scatter(values,results['mean_test_score'],alpha=0.2)
    ax[0,i].set_title(pname)
    ax[0,i].set_xlabel(pname)
    ax[0,i].set_ylabel("mean score")
    
    # Bottom row: mean per value when there are few distinct values,
    # otherwise a k-NN smoothed curve.
    if values.nunique() < 10:
        results.groupby(f"param_{pname}")['mean_test_score'].mean().plot(ax=ax[1,i])
    else:
        plot_knn_surface(results, pname, ax[1,i])

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# NOTE(review): this is a verbatim redefinition of plot_knn_surface from the
# previous cell; one of the two cells can be removed.
def plot_knn_surface(results,param, ax):
    """Draw a k-nearest-neighbours smoothed curve of mean test score against one parameter.

    ``results`` must contain ``"param_<param>"`` and ``"mean_test_score"``
    columns; the curve is drawn on ``ax``.
    """

    param_name = "param_"+param
    X = results[param_name].values.reshape(-1,1)
    y = results["mean_test_score"]
    # Candidate neighbour counts run from 2 up to (excluding) len(y)/10.
    max_neighbours = int(len(y)/10)
    model = GridSearchCV(KNeighborsRegressor(),param_grid={"n_neighbors":range(2,max_neighbours)})
    model.fit(X,y)
    # Predict on 100 evenly spaced points across the observed parameter range.
    X_test = np.linspace(results[param_name].min(),results[param_name].max(),100).reshape(-1,1)
    y_test = model.predict(X_test)
    ax.plot(X_test.ravel(),y_test)

# Fixed: `nparams` was never defined. It is the number of searched parameters.
nparams = len(gbc_params)
# squeeze=False keeps `ax` two-dimensional even when there is a single parameter,
# so ax[row, col] indexing below always works.
fig, ax = plt.subplots(2,nparams,figsize=(5*nparams,12),squeeze=False)
for i, pname in enumerate(gbc_params.keys()):
    values = results[f"param_{pname}"]
    
    # Top row: raw scatter of score against parameter value.
    ax[0,i].scatter(values,results['mean_test_score'],alpha=0.2)
    ax[0,i].set_title(pname)
    ax[0,i].set_xlabel(pname)
    ax[0,i].set_ylabel("mean score")
    
    # Bottom row: mean per value when there are few distinct values,
    # otherwise a k-NN smoothed curve.
    if values.nunique() < 10:
        results.groupby(f"param_{pname}")['mean_test_score'].mean().plot(ax=ax[1,i])
    else:
        plot_knn_surface(results, pname, ax[1,i])

### Fit models

In [None]:
# Nested cross-validation: the grid search (inner_cv) is itself scored by
# cross_val_score over outer_cv.
# NOTE(review): `lr`, `X` and `y` are not defined in the visible notebook —
# confirm where they are meant to come from.
model = GridSearchCV(lr.estimator,param_grid = lr.parameters, cv=inner_cv,scoring='roc_auc')
scores = cross_val_score(model,X,y,cv=outer_cv)

In [None]:
#scores = {}
#for mname, model in models.items():
#    scores[mname] = cross_val_score(model, X, y, scoring='roc_auc') 

### Propensity model results

In [None]:
# NOTE(review): `models` is built as a list in the cells above, so indexing it
# with 'lr' would fail; this cell presumably expects a dict of fitted
# estimators — confirm.
coef = pd.DataFrame({'coef':models['lr'].coef_[0]}, index = features)
# Absolute value, used to rank features by coefficient magnitude.
coef['abs'] = coef['coef'].abs()

In [None]:
# Print the 50 largest-magnitude coefficients with their labels. `.get` is used
# (as in the earlier label lookup) so a feature without a label prints None
# instead of raising KeyError.
for name in coef.sort_values('abs',ascending=False).head(50).index:
    print(name, meta.column_names_to_labels.get(name))