In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyreadstat
import re
import string
from sklearn_pandas import DataFrameMapper
pd.options.display.max_columns = 200
%matplotlib inline
import sklearn

sklearn.__version__

'0.23.2'

In [2]:
# define some functions

def regex_select(lst, regex):
    """
    Return all values from a list of strings that match any of the supplied regexes.
    """
    if isinstance(regex, str):
        regex = [regex]
        
    results = []
    for value in lst:
        for pattern in regex:
            if re.search(pattern, value):
                results.append(value)
                break
    return results

def invert_dict(d):
    """
    Invert a dictionary to return mapping from values to keys. 
    
    Errors if values are not unique.
    """
    result = {}
    for key, value in d.items():
        if value in result:
            raise ValueError(f"Duplicate key: {value}")
        result[value] = key
    return result

# Load the data

In [3]:
df, meta = pyreadstat.read_dta("reduregvars.dta")
df.shape

(5298, 190)

In [4]:
df.head()

Unnamed: 0,xwaveid,p_fem,reduhl,rehllt,rlwage,mh,mhbm,wkhr,p_mh01,p_mb01,p_wh01,p_hrw01,redudl,redufl,redllt,refllt,p_rcom1,p_rcom2,p_rcom3,p_rcom4,p_rcom5,p_rcom6,p_rcom7,p_rcom8,p_rcom9,p_rcom10,p_rcom11,p_rcom12,p_rcom13,p_rcom14,p_rcom15,p_rcom16,p_rcom17,p_rcom18,p_losat,p_jahpj,p_jadnm,p_noch,p_lfs1,p_lfs2,p_lfs3,p_occ1,p_occ2,p_occ3,p_occ4,p_occ5,p_occ6,p_occ7,p_occ8,p_occ9,p_occ10,p_occ11,p_age1,p_age2,p_age3,p_age4,p_age5,p_cob1,p_cob2,p_cob3,p_cob4,p_poeng,p_urdg1,p_urdg2,p_urdg3,p_mar1,p_mar2,p_mar3,p_mar4,p_mar5,p_mar6,p_ddeg1,p_ddeg2,p_ddeg3,p_ddeg4,p_emp1,p_emp2,p_emp3,p_emp4,p_emp5,p_con1,p_con2,p_con3,p_con4,p_con5,p_con6,p_whp1,p_whp2,p_whp3,p_whp4,p_rehdi,p_plfs1,p_plfs2,p_plfs3,p_plfs4,p_fcob1,p_fcob2,p_fcob3,p_mcob1,p_mcob2,p_mcob3,p_fpsm,p_fedu1,p_fedu2,p_fedu3,p_fedu4,p_fedu5,p_fedu6,p_mpsm,p_medu1,p_medu2,p_medu3,p_medu4,p_medu5,p_medu6,p_femp1,p_femp2,p_femp3,p_memp1,p_memp2,p_memp3,p_fsue1,p_fsue2,p_fsue3,p_focc1,p_focc2,p_focc3,p_focc4,p_focc5,p_focc6,p_focc7,p_focc8,p_focc9,p_focc10,p_mocc1,p_mocc2,p_mocc3,p_mocc4,p_mocc5,p_mocc6,p_mocc7,p_mocc8,p_mocc9,p_mocc10,p_rdf1,p_rdf2,p_rdf3,p_rdf4,p_rdf5,p_rdf6,p_rdf7,p_cotrl,p_jbwk,p_femmiss,p_mh01miss,p_mb01miss,p_wh01miss,p_hrw01miss,p_rcom1miss,p_losatmiss,p_jahpjmiss,p_jadnmmiss,p_nochmiss,p_lfs1miss,p_occ1miss,p_age1miss,p_cob1miss,p_poengmiss,p_urdg1miss,p_mar1miss,p_ddeg1miss,p_emp1miss,p_con1miss,p_whp1miss,p_rehdimiss,p_plfs1miss,p_fcob1miss,p_mcob1miss,p_fpsmmiss,p_fedu1miss,p_mpsmmiss,p_medu1miss,p_femp1miss,p_memp1miss,p_fsue1miss,p_focc1miss,p_mocc1miss,p_rdf1miss,p_cotrlmiss,p_jbwkmiss
0,100003,0.0,0.0,0.0,0.0,84.0,0.0,0.0,92.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,7,2,3.0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0.0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,44072.386719,0,0,0,0,1,0,0,1,0,0,1.0,0,0,1,0,0,0,1.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,16.0,4.5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,100014,0.0,,,0.0,76.0,0.0,0.0,60.0,1.0,0.0,39.66251,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,2,4,1.0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0.0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,53916.707031,0,1,0,0,1,0,0,1,0,0,0.0,0,1,0,0,0,0,0.0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,13.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100015,1.0,1.0,0.0,22.222122,60.0,1.0,38.0,56.0,1.0,38.0,21.683214,1.0,1.0,0.0,0.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,7,7,7,1.0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0.0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,53916.707031,0,1,0,0,0,1,0,1,0,0,1.0,0,0,1,0,0,0,1.0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,24.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100018,1.0,0.0,0.0,24.929432,64.0,1.0,30.0,44.0,1.0,30.0,0.0,1.0,1.0,0.0,0.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,6,3,4,2.0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0.0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,63112.148438,0,1,0,0,1,0,0,1,0,0,1.0,0,0,1,0,0,0,1.0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,30.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,100019,0.0,1.0,0.0,60.021801,72.0,1.0,45.0,48.0,1.0,45.0,50.729443,1.0,1.0,0.0,0.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,5,6,2.0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0.0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,63112.148438,0,0,0,1,1,0,0,1,0,0,0.0,0,0,0,0,1,0,0.0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,33.0,5.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# make a mapping from labels to column names
labels_to_names = invert_dict(meta.column_names_to_labels)
labels_to_names

{'XW Cross wave ID': 'xwaveid',
 'Female': 'p_fem',
 'Completed re-education based on hightest level of attainment': 'reduhl',
 'Completion re-education after 2017: highest attainment': 'rehllt',
 'Real hourly wage for 2019': 'rlwage',
 'Mental health in 2019': 'mh',
 'Mental health below norm in 2019': 'mhbm',
 'Total weekly working hours in 2019': 'wkhr',
 'Mental health in 2001': 'p_mh01',
 'Mental health below norm in 2001': 'p_mb01',
 'Total weekly working hours in 2001': 'p_wh01',
 'Real hourly wage from main job in 2001': 'p_hrw01',
 'Completed re-education based on detailed qualifications': 'redudl',
 'Completed re-education using hightest lvl and det. qual.': 'redufl',
 'Completion re-education after 2017: detailed qualifications': 'redllt',
 'Completion re-education after 2017: highes lvl and det. qualifications': 'refllt',
 'First wave of re-eduation completion: 2002': 'p_rcom1',
 'First wave of re-eduation completion: 2003': 'p_rcom2',
 'First wave of re-eduation completion

# Clean and prepare for modelling

## Identify columns that should not be included as covariates

### Treatent variables

   - Re-education based on highest attainment (**reduhl**) records whether the individual has had re-education between 2002 and 2017, based on whether there was a change in the highest education level attained stated in the two years. 
   - **redhllt**, 
   - **refllt**

### Outcome variables
   - Mental health in 2019 (**mh**). This is the transformed mental health scores from the aggregation of mental health items of the SF-36 Health Survey, as reported by the individual in 2019. It ranges from 0 to 100, with higher scores indicating better mental health.  
   - Working hours in 2019 (**wkhr**) records the total number of hours the individual works in all jobs in a week on average. Working hours are set to 0 for those not working. 
   - Hourly Wages in 2019 (**rlwage**) records the average hourly wage for the individual’s main job in 2019. Hourly wages are set to 0 for those not working and set to missing for those reporting working more than 100 hours a week. 
   
### Other variables
   - **xwaveid** (unique identifier)
   - **p_rcom*** (timing of completion of re-education, proxies treatment)
   - **p_cotrl** (first avail 2003)
   - **p_rdf*** (first avail 2012)
   

### Questions
   - should I check for any other derived variables (ie that do not have a description in meta?)
reduhl derived variable
rehllt derived variable
redudl derived variable
redufl derived variable
redllt derived variable
refllt derived variable

In [6]:
treatments = regex_select(meta.column_names,
    [
        '^re',
    ]
)

outcomes = regex_select(meta.column_names,
    [
        '^mh',
        '^wkhr$'
        '^rlwage$'
    ]
)

exclude = regex_select(meta.column_names,
    [
        '^p_rcom',
        '^p_rdf',
        '^p_cotrl',
        '^xwaveid$',  
    ]
)

## Redundancy & Highly correlated features


## Propensity Model

What features are predictive of someone undertaking re-education?

### Specify target and features
For this model, we are predicting whether or not an individual is treated (eg is re-educated) so the target will be one of the measures of re-education.

In [7]:
features = list(set(meta.column_names) - set(exclude) - set(treatments) - set(outcomes))

target = 'rehllt'

print(f'Original columns:{len(meta.column_names)}, excluded:{len(meta.column_names) - len(features)}, # features:{len(features)}')

Original columns:190, excluded:38, # features:152


### Set up models

In [8]:
from dataclasses import dataclass
from typing import Any

@dataclass
class Model:
    """Class keeping a model, its human readable name and the set of parameters to search over for it together."""
    name:str
    estimator:Any
    parameters:dict

In [68]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

transform = Pipeline([
    ('impute_missing', SimpleImputer()),
    ('scale', StandardScaler())
])

X = transform.fit_transform(df[features])
y = df[target]

assert np.ndim(y)==1

valid = ~np.isnan(y)
X = X[valid,:]
y = y[valid]

#lr = Model('lr',LogisticRegression(),{'C':np.logspace(-4,0)})
gb = Model('gbc',GradientBoostingClassifier(),parameters = {
    'min_samples_split':np.logspace(-3,0)
})

inner_cv = KFold(n_splits=5)
outer_cv = KFold(n_splits=3)

### Fit a single model to get an idea of the grid range

In [69]:
model = GridSearchCV(gb.estimator,param_grid=gb.parameters)
model.fit(X,y)

GridSearchCV(estimator=GradientBoostingClassifier(),
             param_grid={'min_samples_split': array([0.001     , 0.0011514 , 0.00132571, 0.00152642, 0.00175751,
       0.00202359, 0.00232995, 0.0026827 , 0.00308884, 0.00355648,
       0.00409492, 0.00471487, 0.00542868, 0.00625055, 0.00719686,
       0.00828643, 0.00954095, 0.01098541, 0.01264855, 0.01456348,
       0.01676833, 0.01930698, 0.02222996, 0.02559548, 0.02947052,
       0.03393222, 0.0390694 , 0.04498433, 0.05179475, 0.05963623,
       0.06866488, 0.07906043, 0.09102982, 0.10481131, 0.12067926,
       0.13894955, 0.15998587, 0.184207  , 0.21209509, 0.24420531,
       0.28117687, 0.32374575, 0.37275937, 0.42919343, 0.49417134,
       0.5689866 , 0.65512856, 0.75431201, 0.86851137, 1.        ])})

In [70]:
results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.009274,0.001189,0.001306,5.5e-05,0.0001,{'C': 0.0001},0.636611,0.765669,0.676558,0.770648,0.502226,0.670342,0.098578,22
1,0.007714,0.000901,0.001216,4.2e-05,0.000121,{'C': 0.00012067926406393288},0.636611,0.765528,0.676805,0.771019,0.502473,0.670487,0.098545,21
2,0.006788,0.000147,0.001188,2.3e-05,0.000146,{'C': 0.00014563484775012445},0.636611,0.766093,0.676805,0.771637,0.502226,0.670674,0.098865,20
3,0.006575,0.000188,0.001155,1.8e-05,0.000176,{'C': 0.00017575106248547912},0.636487,0.766516,0.676682,0.772132,0.502226,0.670808,0.099055,19
4,0.006534,0.000196,0.001166,1e-05,0.000212,{'C': 0.00021209508879201905},0.636734,0.767081,0.676805,0.772626,0.501855,0.67102,0.099375,17


In [None]:
plt.semilogx(results[],results[])

### Fit models

In [66]:
model = GridSearchCV(lr.estimator,param_grid = lr.parameters, cv=inner_cv,scoring='roc_auc')
scores = cross_val_score(model,X,y,cv=outer_cv)

In [None]:
#scores = {}
#for mname, model in models.items():
#    scores[mname] = cross_val_score(model, X, y, scoring='roc_auc') 

### Propensity model results

In [96]:
coef = pd.DataFrame({'coef':models['lr'].coef_[0]}, index = features)
coef['abs'] = coef['coef'].abs()

In [101]:
for name in coef.sort_values('abs',ascending=False).head(50).index:
    print(name, meta.column_names_to_labels[name])

p_age4 Age group in 2001: 55-64
p_age5 Age group in 2001: 65+
p_age1 Age group in 2001: 25-34
p_jbwk Attitude towards having job in 2001
p_jadnm SCQ:D1b I would enjoy having a job even if I didnt need the money
p_whp1miss Missing indicator for extent of hours match with preference of respondent
p_jbwkmiss Missing indiator for attitude towards having job of respondent
p_age2 Age group in 2001: 35-44
p_ddeg4 Severity of health condition in 2001: Severe disability
p_losatmiss Missing indicator for life satisfaction of respondent
p_mocc2 Mother's Occupation in 2001: Armed forces
p_fedu1 Father's highest education: None
p_fem Female
p_lfs2 Labour market status in 2001: Unemployed
p_memp2 Mother's LM status at age 14: Not employed
p_jahpj SCQ:D1a In order to be happy in life it is important to have a paying job
p_plfs4 Partner LM status in 2001: Not in labour force
p_mocc1miss Missing indicator for occupation of respondent's mother
p_plfs1 Partner LM status in 2001: No partner or no resident