In [None]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyreadstat
import re
import string
import sklearn


from reed import *

# Notebook-wide pandas display limits, so wide frames and long text cells
# are shown without truncation.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 1000)

%matplotlib inline

# Load the data

### Treatment variables


   - **rehllt**
   - **redllt** 
   - **refllt** 
   - **reduhl**: Completed re-education based on highest level of attainment.
   - **redudl**: Completed re-education based on detailed qualifications.
   - **redufl**: Completed re-education using highest level and detailed qualifications.

### Outcome variables
   - Mental health in 2019 (**mh**). This is the transformed mental health scores from the aggregation of mental health items of the SF-36 Health Survey, as reported by the individual in 2019. It ranges from 0 to 100, with higher scores indicating better mental health.  
   - Working hours in 2019 (**wkhr**) records the total number of hours the individual works in all jobs in a week on average. Working hours are set to 0 for those not working. 
   - Hourly Wages in 2019 (**rlwage**) records the average hourly wage for the individual’s main job in 2019. Hourly wages are set to 0 for those not working and set to missing for those reporting working more than 100 hours a week. 

In [None]:
# Candidate treatment (re-education) columns; the full list is passed to
# load_data and select_features in the cells below.
treatments = ['reduhl', 'rehllt', 'redudl', 'redufl', 'redllt', 'refllt']
# Outcome columns, also passed to load_data / select_features.
# NOTE(review): 'mhbm' is not described in the "Outcome variables" list above — confirm its meaning.
outcomes = ['rlwage', 'mh', 'mhbm', 'wkhr']

## Propensity Model

What features are predictive of someone undertaking re-education?

### Specify target and features
For this model, we are predicting whether or not an individual is treated (e.g. is re-educated), so the target will be one of the measures of re-education.

#### Columns explicitly excluded
   - **xwaveid** (unique identifier)
   - **p_rcom*** (timing of completion of re-education, proxies treatment)
   - **p_cotrl** (first avail 2003)
   - **p_rdf*** (first avail 2012)

### Set up models

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from interpret.glassbox import ExplainableBoostingClassifier

# select splits in advance so the same ones are used for all models.
# Seeding NumPy's global RNG makes the unseeded train_test_split call below
# reproducible each time this cell is run.
np.random.seed(666)
# NOTE(review): 5298 is hard-coded — presumably the row count of every dataset
# returned by load_data; confirm, because these positional indices are reused
# for the raw, large and basic feature sets.
indx = np.arange(5298)
# No test_size/train_size is given, so scikit-learn's default 75%/25% split applies.
train_indx, test_indx = train_test_split(indx)


def construct_models():
    """Build a fresh list of the candidate propensity models.

    Returns a list of four ``Model`` wrappers, in this order:

    * ``'lr'``       - unpenalised logistic regression (saga solver), no grid.
    * ``'svc'``      - RBF-kernel SVC with probability estimates; grid over C and gamma.
    * ``'lr-ridge'`` - default (L2-penalised) logistic regression; grid over C.
    * ``'gbc'``      - depth-2 gradient boosting with early stopping after 20
                       non-improving iterations; grid over max_features,
                       learning_rate and min_samples_leaf.

    NOTE(review): ``Model`` and ``LogisticRegression`` are not imported in this
    cell; presumably both come from ``from reed import *``. The ``parameters``
    dict is presumably the hyper-parameter grid searched by ``fit_models`` -
    confirm against the ``reed`` module.
    """
    # Hyper-parameter grids, one per tunable model.
    svc_grid = {
        'C': np.logspace(-3, 2, 20),
        'gamma': [*np.logspace(-6, -1, 10), 'scale'],
    }
    ridge_grid = {
        'C': np.logspace(-5, 0, 20),
    }
    gbc_grid = {
        'max_features': [10, 20, 40, 60, 80],
        'learning_rate': np.logspace(-6, -1, 10),
        'min_samples_leaf': np.logspace(0, 3, 10).astype(int),
    }

    return [
        Model('lr', LogisticRegression(penalty='none', solver='saga', max_iter=1000)),
        Model('svc', SVC(kernel='rbf', probability=True), parameters=svc_grid),
        Model('lr-ridge', LogisticRegression(), parameters=ridge_grid),
        Model(
            'gbc',
            GradientBoostingClassifier(n_iter_no_change=20, max_depth=2),
            parameters=gbc_grid,
        ),
    ]

In [None]:
# TODO: write code to check where in the grid the best parameters were found, and raise a warning if they lie on an edge of the grid.

### Fit models and visualise performance

In [None]:
# Metric name handed to fit_models for every feature set (and to
# extract_importance for the large feature set).
optimisation_metric = 'roc_auc'        

#### Raw feature set

This feature set contains every variable observed in 2001, with very little filtering or pre-processing. The minimal preprocessing includes:
   - removing variables that are more than 95% missing
   - merging variables that are almost perfectly correlated (> .95) 
   - removing variables with 0 variance
   - changing dates to days past an epoch

In [None]:
# Fit and evaluate a fresh model list on the 'raw' feature set.
models_raw = construct_models()
# Treatment column used as the prediction target (alternatives kept in the trailing comment).
treatment = 'redudl' #reduhl, #refllt
df,meta = load_data('raw',treatments,outcomes)
# NOTE(review): the large-feature-set cell calls
# select_features(df,treatments,outcomes,treatment); confirm that the
# one-argument form used here still excludes treatment and outcome columns.
features_r = select_features(df)
# Uses the pre-computed train_indx/test_indx so all feature sets share one split.
Xr_train, Xr_test, yr_train, yr_test = prepare_data(df, features_r, treatment,train_indx,test_indx)
fit_models(models_raw,optimisation_metric,Xr_train,yr_train)
visualise_performance(models_raw,Xr_test,yr_test)

In [None]:
# Feature importances for the raw-feature-set models, computed on the test split.
# NOTE(review): the large-feature-set cell also passes optimisation_metric to
# extract_importance; confirm the default used here is intended.
importances_r = extract_importance(models_raw,Xr_test,yr_test,features_r)

In [None]:
# Plot the raw-feature-set importances (helper presumably provided by `reed`).
visualise_importance_distribution(importances_r)

In [None]:
# get column labels
# `s` fills the placeholder in the file name below (presumably the survey-wave
# letter — confirm).
s='a'
# Only the metadata (variable names -> labels) is used, so ask pyreadstat to
# skip the data rows instead of loading the whole file and discarding the frame.
meta1 = pyreadstat.read_sav(f'../part1/Combined {s}190c.sav', metadataonly=True)[1]

In [None]:
# Map each feature name in the importance table to its label from the file
# metadata; features without a label get an empty string.
column_labels = meta1.column_names_to_labels
importances_r['label'] = [ column_labels.get(name,"") for name in importances_r.index]
# Show the 20 features ranked highest by the 'permutation-lr-ridge' column.
importances_r.sort_values('permutation-lr-ridge',ascending=False).head(20)

#### Large feature set

This is a set of features selected by Anna as the broad set that may be relevant. Some variables have been one-hot encoded. 

In [None]:
# Fit and evaluate a fresh model list on the 'anna' (large) feature set.
models_large = construct_models()
# Treatment column used as the prediction target (alternatives kept in the trailing comment).
treatment = 'redudl' #reduhl, #refllt
# NOTE: `df` is rebound here, replacing the frame loaded for the raw feature set.
df,meta_l = load_data('anna',treatments,outcomes)
features_l = select_features(df,treatments,outcomes,treatment)
# Uses the pre-computed train_indx/test_indx so all feature sets share one split.
Xl_train, Xl_test, yl_train, yl_test = prepare_data(df, features_l, treatment,train_indx,test_indx)
fit_models(models_large,optimisation_metric,Xl_train,yl_train)
visualise_performance(models_large,Xl_test,yl_test)

In [None]:
# Inspect the selected hyper-parameters of models_large[1], which is the 'svc'
# entry in construct_models (fit_estimator is presumably the fitted search object).
models_large[1].fit_estimator.best_params_

In [None]:
# Scratch: preview 20 log-spaced values from 1e-5 to 1e2.
np.logspace(-5,2,20)

In [None]:
# Candidate alternative search grid for the 'svc' model: same C range as in
# construct_models, but gamma spans 1e-4..1e0 instead of 1e-6..1e-1.
# The original cell held the bare `key: value` pairs, which is a SyntaxError;
# wrapping them in a dict keeps the notebook runnable top to bottom.
# This grid is only displayed here and is not wired into construct_models.
svc_grid_candidate = {
    'C': np.logspace(-3, 2, 20),
    'gamma': list(np.logspace(-4, 0, 10)) + ['scale'],
}
svc_grid_candidate

In [None]:
# Feature importances for the large-feature-set models, computed on the test
# split; unlike the raw and basic cells, optimisation_metric is passed explicitly.
importances_l = extract_importance(models_large,Xl_test,yl_test,features_l,optimisation_metric)

In [None]:
# Plot the large-feature-set importances (helper presumably provided by `reed`).
visualise_importance_distribution(importances_l)

##### Features ranked by permutation importance

In [None]:
# Map each feature name to its label from the large-feature-set metadata;
# features without a label get an empty string.
# NOTE: this rebinds `column_labels`, previously built from meta1.
column_labels = meta_l.column_names_to_labels
importances_l['label'] = [ column_labels.get(name,"") for name in importances_l.index]
# Show the 20 features ranked highest by the 'permutation-lr-ridge' column.
importances_l.sort_values('permutation-lr-ridge',ascending=False).head(20)

#### Minimal feature set
This is the minimal set of features used in the original paper. It consists of four variables (sex, age, education, employment). Each is one-hot encoded, and interactions are added between sex and the other variables.

In [None]:
# Only the first model from construct_models (the unpenalised 'lr') is used
# for the basic feature set.
models_basic = [construct_models()[0]]
# Treatment column used as the prediction target (alternatives kept in the trailing comment).
treatment = 'redudl' #reduhl, #refllt
# NOTE: `df` and `meta` are rebound here, replacing the earlier loads.
df,meta = load_data('basic',treatments,outcomes)
# NOTE(review): the large-feature-set cell calls
# select_features(df,treatments,outcomes,treatment); confirm that the
# one-argument form used here still excludes treatment and outcome columns.
features_b = select_features(df) 
# Uses the pre-computed train_indx/test_indx so all feature sets share one split.
Xb_train, Xb_test, yb_train, yb_test = prepare_data(df, features_b, treatment,train_indx,test_indx)
fit_models(models_basic,optimisation_metric,Xb_train,yb_train)
visualise_performance(models_basic,Xb_test,yb_test)

In [None]:
# Feature importances for the basic-feature-set model, computed on the test split.
# NOTE(review): the large-feature-set cell also passes optimisation_metric to
# extract_importance; confirm the default used here is intended.
importances_b = extract_importance(models_basic,Xb_test,yb_test,features_b)

In [None]:
# Plot the basic-feature-set importances (helper presumably provided by `reed`).
visualise_importance_distribution(importances_b)

In [None]:
# Show the 10 features ranked highest by the 'permutation-lr' column ('lr' is
# the only model fitted on the basic feature set).
importances_b.sort_values('permutation-lr',ascending=False).head(10)