# HW06: Python 
(due November 3rd)

# Heterogeneous Treatment Effects with Keras MLP

In [2]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from tensorflow import keras

### Dataset from an RCT studying the effect of case management on mental health outcomes.


In [3]:
# Show the variable labels stored in the Stata file.
# With iterator=True, read_stata returns a StataReader instead of a DataFrame;
# using it as a context manager closes the reader once the labels are read.
with pd.read_stata('http://www.homepages.ucl.ac.uk/~rmjwiww/stata/missing/uk500.dta', iterator=True) as reader:
    variable_labels = reader.variable_labels()
variable_labels

{'trialid': 'Trial ID',
 'centreid': 'Trial centre',
 'status': 'Patient status at baseline',
 'age': 'Age in years at baseline',
 'sex': 'Sex',
 'afcarib': 'Ethnic group',
 'ocfabth': "Father's social class at birth",
 'chron1l': 'Months since onset of psychosis, logged',
 'hos94': 'Days in hospital for psychiatric reasons: 2 years before baseline',
 'cprs94': 'Psychopathology at baseline (CPRS)',
 'das94': 'Disability at baseline (DAS)',
 'sat94': '(Dis)satisfaction with services at baseline',
 'rand': 'Randomised group',
 'hos96': 'Days in hospital for psychiatric reasons: 2 years after baseline',
 'cprs96': 'Psychopathology at 2 years (CPRS)',
 'sat96': '(Dis)satisfaction with services at 2 years'}

In [7]:
# Load the trial data and keep only complete cases
# (any row with at least one missing value is dropped)
df = pd.read_stata('http://www.homepages.ucl.ac.uk/~rmjwiww/stata/missing/uk500.dta').dropna()

# Summary of the remaining rows, columns and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 246 entries, 1 to 497
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   trialid   246 non-null    float64 
 1   centreid  246 non-null    category
 2   status    246 non-null    category
 3   age       246 non-null    float64 
 4   sex       246 non-null    category
 5   afcarib   246 non-null    category
 6   ocfabth   246 non-null    category
 7   chron1l   246 non-null    float32 
 8   hos94     246 non-null    float64 
 9   cprs94    246 non-null    float64 
 10  das94     246 non-null    float64 
 11  sat94     246 non-null    float64 
 12  rand      246 non-null    category
 13  hos96     246 non-null    float64 
 14  cprs96    246 non-null    float64 
 15  sat96     246 non-null    float64 
dtypes: category(6), float32(1), float64(9)
memory usage: 22.4 KB


In [8]:
# Encode the categorical covariates as numbers and cast every covariate to
# float32 so the frame can be fed to TensorFlow.
# OrdinalEncoder is imported with the other dependencies in the notebook's
# import cell rather than in the middle of the analysis.
covariates = ['status', 'sex', 'sat94', 'ocfabth', 'hos94', 'das94', 'cprs94', 'age', 'afcarib']
covariates_cat = ['status', 'sex', 'ocfabth', 'afcarib']

# Every categorical covariate must also be listed as a covariate
assert set(covariates_cat) <= set(covariates), "covariates_cat must be a subset of covariates"

# Replace each category label by a numeric code (one code per distinct category)
encoder = OrdinalEncoder()
df[covariates_cat] = encoder.fit_transform(df[covariates_cat])

# Use a single float dtype for all model inputs
df[covariates] = df[covariates].astype('float32')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 246 entries, 1 to 497
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   trialid   246 non-null    float64 
 1   centreid  246 non-null    category
 2   status    246 non-null    float32 
 3   age       246 non-null    float32 
 4   sex       246 non-null    float32 
 5   afcarib   246 non-null    float32 
 6   ocfabth   246 non-null    float32 
 7   chron1l   246 non-null    float32 
 8   hos94     246 non-null    float32 
 9   cprs94    246 non-null    float32 
 10  das94     246 non-null    float32 
 11  sat94     246 non-null    float32 
 12  rand      246 non-null    category
 13  hos96     246 non-null    float64 
 14  cprs96    246 non-null    float64 
 15  sat96     246 non-null    float64 
dtypes: category(2), float32(10), float64(4)
memory usage: 20.0 KB


In [9]:
# Post-treatment outcomes available for analysis
outcomes = [
    'sat96',   # (dis)satisfaction with services at 2 years
    'hos96',   # days in hospital for psychiatric reasons, 2 years after baseline
    'cprs96',  # psychopathology at 2 years (CPRS)
]
df.loc[:, outcomes].describe()

Unnamed: 0,sat96,hos96,cprs96
count,246.0,246.0,246.0
mean,17.271341,65.5,17.790587
std,4.723009,104.046722,14.090911
min,9.0,0.0,0.0
25%,14.0,0.0,7.0
50%,17.0,15.0,15.0
75%,20.1875,93.5,26.0
max,32.0,692.0,71.0


In [10]:
# Name of the column that records each patient's randomised group
treatvar = 'rand'

# Number of patients in each group
df.loc[:, treatvar].value_counts()

Intensive case management    130
Standard case management     116
Name: rand, dtype: int64

In [11]:
# Summary statistics of the covariates used to predict the outcome
# conditional on treatment
df.loc[:, covariates].describe()

Unnamed: 0,status,sex,sat94,ocfabth,hos94,das94,cprs94,age,afcarib
count,246.0,246.0,246.0,246.0,246.0,246.0,246.0,246.0,246.0
mean,0.605691,0.544715,18.837906,2.715447,94.776421,1.072794,19.362692,38.593494,0.723577
std,0.489698,0.499012,4.907598,1.188518,94.375053,0.820939,13.35019,11.050043,0.44814
min,0.0,0.0,9.0,0.0,1.0,0.0,0.0,20.0,0.0
25%,0.0,0.0,15.75,2.0,33.25,0.428571,9.0,30.0,0.0
50%,1.0,1.0,19.0,3.0,63.0,1.0,17.0,36.0,1.0
75%,1.0,1.0,22.0,4.0,126.0,1.5,27.0,47.0,1.0
max,1.0,1.0,36.0,5.0,730.0,4.714283,67.0,65.0,1.0


In [None]:
# Subset the dataset by treatment (intensive) and control (standard).
# The two labels are the categories of `rand` listed by value_counts() above.
is_treated = df[treatvar] == 'Intensive case management'
is_control = df[treatvar] == 'Standard case management'

# .copy() gives each arm its own frame, independent of `df`
df_treat = df.loc[is_treated].copy()
df_control = df.loc[is_control].copy()

# Every patient must fall in exactly one of the two arms
assert len(df_treat) + len(df_control) == len(df)

### Choose one of the three outcomes to analyze. Build two feed-forward neural network (FFNN / MLP) models to predict that outcome from the covariates. Train the first model on the treatment dataset and the second model on the control dataset.

In [None]:
# One empty Sequential container per arm (treatment first, then control).
# Layers still to be added to both: at least 2 hidden layers, ReLU activation,
# batch normalization, dropout
model_treat, model_control = keras.models.Sequential(), keras.models.Sequential()
#TODO

In [19]:
# compile the models
#TODO

In [None]:
# fit separate models on the treatment dataset and control dataset
# use early stopping
#TODO

### Form predicted outcomes for each data point, using both `model_treat` and `model_control`.

In [None]:
# Predict the outcome for every patient in `df` (treated and control alike)
# with both models, so each individual gets a "treated" and a "control" prediction.
# NOTE(review): assumes both models take the `covariates` columns, in this
# order, as their input — confirm against the model-building cell.
X_all = df[covariates].to_numpy()
y_pred_treat = model_treat.predict(X_all)
y_pred_control = model_control.predict(X_all)

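### Find the 3 individuals who are most responsive and the 3 who are least responsive to treatment (i.e. with the largest and smallest predicted difference between the treatment and control models), and `describe` their covariates.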

In [None]:
#TODO

In [None]:
# Bonus: explore what features matter for the predicted difference between control and treatment