## Part 2: Double/Debiased machine learning in observational data

2.1 Load the data

In [1]:
using CSV, DataFrames, StatsModels, GLM, Random, RData, MLBase, MLJ, PrettyTables, FixedEffectModels

In [5]:
df = CSV.read("../data/observational/biased_control.csv", DataFrame);

display(df)

Row,treat,age,educ,black,hisp,marr,nodegree,re74,re75,re78,agesq,agecube,educsq,u74,u75,interaction1,re74sq,re75sq
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Float64,Float64,Float64,Int64,Int64,Int64,Int64,Int64,Float64,Float64,Float64
1,0,45,11,0,0,1,1,21.5167,25.2436,25.5647,2025,91125,121,0,0,236.683,462.967,637.237
2,0,21,14,0,0,0,0,3.17597,5.85256,13.4961,441,9261,196,0,0,44.4636,10.0868,34.2525
3,0,38,12,0,0,1,0,23.039,25.1308,25.5647,1444,54872,144,0,0,276.468,530.796,631.555
4,0,48,6,0,0,1,1,24.9944,25.2436,25.5647,2304,110592,36,0,0,149.966,624.718,637.237
5,0,18,8,0,0,1,1,1.6693,10.7276,9.86087,324,5832,64,0,0,13.3544,2.78655,115.082
6,0,22,11,0,0,1,1,16.3658,18.4493,25.5647,484,10648,121,0,0,180.023,267.838,340.376
7,0,48,10,0,0,1,1,16.8046,16.3546,18.0593,2304,110592,100,0,0,168.046,282.396,267.473
8,0,18,11,0,0,0,1,1.14421,3.62003,15.7393,324,5832,121,0,0,12.5863,1.30922,13.1046
9,0,48,9,0,0,1,1,25.8623,25.2436,25.5647,2304,110592,81,0,0,232.761,668.86,637.237
10,0,45,12,0,0,1,0,25.8623,0.0,3.92484,2025,91125,144,0,1,310.348,668.86,0.0


2.2 Group comparisons

In [7]:
#First of all, we separate the data into the treatment and control group. The column "treat" allows us to identify the groups.!
df_treatment = df[df.treat.==1, :];
df_control = df[df.treat.==0, :];

In [25]:
display(describe(df_treatment[!, 2:10]))
display(summary(df_treatment))

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Float64,Real,Float64,Real,Int64,DataType
1,age,25.8162,17.0,25.0,48.0,0,Int64
2,educ,10.3459,4.0,11.0,16.0,0,Int64
3,black,0.843243,0.0,1.0,1.0,0,Int64
4,hisp,0.0594595,0.0,0.0,1.0,0,Int64
5,marr,0.189189,0.0,0.0,1.0,0,Int64
6,nodegree,0.708108,0.0,1.0,1.0,0,Int64
7,re74,2.09557,0.0,0.0,35.0401,0,Float64
8,re75,1.53206,0.0,0.0,25.1422,0,Float64
9,re78,6.34914,0.0,4.23231,60.3079,0,Float64


"185×18 DataFrame"

In [26]:
display(describe(df_control[!, 2:10]))
display(summary(df_control))

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Float64,Real,Float64,Real,Int64,DataType
1,age,33.2252,16.0,31.0,55.0,0,Int64
2,educ,12.0275,0.0,12.0,18.0,0,Int64
3,black,0.0735368,0.0,0.0,1.0,0,Int64
4,hisp,0.072036,0.0,0.0,1.0,0,Int64
5,marr,0.711731,0.0,1.0,1.0,0,Int64
6,nodegree,0.295835,0.0,0.0,1.0,0,Int64
7,re74,14.0168,0.0,15.1236,25.8623,0,Float64
8,re75,13.6508,0.0,14.5571,25.2436,0,Float64
9,re78,14.8467,0.0,16.422,25.5647,0,Float64


"15992×18 DataFrame"

In [32]:
display("The National Supported Work Demonstration (NSW) was a job-training program designed in the mid-1970s to help disadvantaged workers that lacked basic job skills move into the labour market.")
display("For that goal they were to provide them with work experience and counselling in a sheltered environment (training).")
display("As such, we can observe that the group that received the treatment had overall fewer years of education compared to the control group.")
display("The treated group also consisted of mostly black people and were, in average, younger than the control group.")

"The National Supported Work Demonstration (NSW) was a job-training program designed in the mid-1970s to help disadvantaged workers that lacked basic job skills move into the labour market."

"For that goal they were to provide them with work experience and counselling in a sheltered environment (training)."

"As such, we can observe that the group that received the treatment had overall fewer years of education compared to the control group."

"The treated group also consisted of mostly black people and were, in average, younger than the control group."

2.3 Compute the SMD

In [41]:
#For the SMD we must first identify our variables:

y = df[!, 10];
d = df[!, 1];
x = df[:, Not([1, 10])];
scitype(x)

Table{Union{AbstractVector{Continuous}, AbstractVector{Count}}}

In [42]:
#And fix the variable x to continue the analysis
coerce!(x, Count => MLJ.Continuous);

In [None]:
#Find the SMD or naive ATE
fm = term(:re78) ~ term(:treat) + sum(term.(Symbol.(names(df[Not([1, 10]), :]))));
lres = reg(df, fm);
first(DataFrame(GLM.coeftable(lres)))

Row,Name,Estimate,Std. Error,t-stat,Pr(>|t|),Lower 95%,Upper 95%
Unnamed: 0_level_1,String,Float64,Float64,Float64,Float64,Float64,Float64
1,treat,-8.9366e-14,1.1747e-14,-7.60755,2.9469e-14,-1.12392e-13,-6.63406e-14




In [None]:
display("The Naive ATE we found seems to be very small and comparative to 0."
display("In comparison to what was found using causal trees and forests, we can see a great gap between the two results.")

2.4 Use DML to find a better estimate of the ATE

In [47]:
function training_sample_append(cv_split, test_sample_index)
    training_indices = []
    for vector in cv_split[Not(test_sample_index)]
        training_indices = [training_indices; vector]
    end
    return training_indices, cv_split[test_sample_index]
end

function dml(x, d, y, dreg, yreg, nfold)
    n = length(y)
    cv = [partition(eachindex(y), fill(1 / nfold, nfold - 1)..., shuffle=true, rng=1234)...]
    machine_y = machine(yreg, x, y, scitype_check_level=0)
    machine_d = machine(dreg, x, d, scitype_check_level=0)
    y_hat = zeros(n)
    d_hat = zeros(n)

    for fold in 1:nfold
        training_fold, test_fold = training_sample_append(cv, fold)
        y_hat[test_fold] = MLJ.predict(MLJ.fit!(machine_y, rows=training_fold), x[test_fold, :])
        d_hat[test_fold] = MLJ.predict(MLJ.fit!(machine_d, rows=training_fold), x[test_fold, :])
    end

    resy = y .- y_hat
    resd = reshape(d .- d_hat, (n, 1))
    estimate = lm(resd, resy)
    coef_est = GLM.coef(estimate)[1]
    se = GLM.coeftable(estimate).cols[2][1]
    println(" coef (se) = ", coef_est, "(", se, ")")
    return coef_est, se, resy, resd
end

function summarize(point, stderr, resy, resd, name)
    return DataFrame(
        model=[name],
        estimate=[point], stderr=[stderr],
        rmse_y=[sqrt(mean(resy .^ 2))],
        rmse_d=[sqrt(mean(resd .^ 2))]
    )
end

summarize (generic function with 1 method)

In [50]:
import Pkg
Pkg.add("MLJScikitLearnInterface")

[32m[1m   Resolving[22m[39m package versions...
[32m[1m   Installed[22m[39m CondaPkg ──────────────── v0.2.24
[32m[1m   Installed[22m[39m UnsafePointers ────────── v1.0.0
[32m[1m   Installed[22m[39m micromamba_jll ────────── v1.5.8+0
[32m[1m   Installed[22m[39m MLJScikitLearnInterface ─ v0.7.0
[32m[1m   Installed[22m[39m Pidfile ───────────────── v1.3.0
[32m[1m   Installed[22m[39m StructTypes ───────────── v1.11.0
[32m[1m   Installed[22m[39m JSON3 ─────────────────── v1.14.1
[32m[1m   Installed[22m[39m PythonCall ────────────── v0.9.23
[32m[1m   Installed[22m[39m MicroMamba ────────────── v0.1.14
[32m[1m    Updating[22m[39m `C:\Users\sophi\.julia\environments\v1.10\Project.toml`
  [90m[5ae90465] [39m[92m+ MLJScikitLearnInterface v0.7.0[39m
[32m[1m    Updating[22m[39m `C:\Users\sophi\.julia\environments\v1.10\Manifest.toml`
  [90m[992eb4ea] [39m[92m+ CondaPkg v0.2.24[39m
  [90m[0f8b85d8] [39m[92m+ JSON3 v1.14.1[39m
  [90m[5ae9

In [51]:
LinearRegressor = @load LinearRegressor pkg = MLJScikitLearnInterface verbosity = 0
dreg = Standardizer() |> LinearRegressor()
yreg = Standardizer() |> LinearRegressor()
result_ols = dml(x, d, y, dreg, yreg, 10);
table_ols = summarize(result_ols..., "OLS")

LassoCVRegressor = @load LassoCVRegressor pkg = MLJScikitLearnInterface verbosity = 0
dreg = Standardizer() |> LassoCVRegressor(max_iter=200000)
yreg = Standardizer() |> LassoCVRegressor(max_iter=200000)
results_lasso = dml(x, d, y, dreg, yreg, 10);
table_lasso = summarize(results_lasso..., "LassoCV")

RandomForestRegressor = @load RandomForestRegressor pkg = MLJScikitLearnInterface verbosity = 0
dreg = RandomForestRegressor()
yreg = RandomForestRegressor()
results_rf = dml(x, d, y, dreg, yreg, 10);
table_rf = summarize(results_rf..., "RF");

dreg = Standardizer() |> LassoCVRegressor(max_iter=200000)
results_mix = dml(x, d, y, dreg, yreg, 10)
table_mix = summarize(results_mix..., "RF/LassoCV");

[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(DeterministicPipeline(standardizer = Standardizer(features = Symbol[], …), …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:standardizer, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:linear_regressor, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(DeterministicPipeline(standardizer = Standardizer(features = Symbol[], …), …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:standardizer, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:linear_regressor, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(DeterministicPipeline(standardizer = Standardizer(features = Symbol[], …), …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:standardizer, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:linear_regressor, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining mac

 coef (se) = 1.5709386603564963(0.5566576621515649)


[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(DeterministicPipeline(standardizer = Standardizer(features = Symbol[], …), …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:standardizer, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:lasso_cv_regressor, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(DeterministicPipeline(standardizer = Standardizer(features = Symbol[], …), …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:standardizer, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:lasso_cv_regressor, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(DeterministicPipeline(standardizer = Standardizer(features = Symbol[], …), …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:standardizer, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:lasso_cv_regressor, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraini

 coef (se) = 1.4553455503963622(0.5561085672312804)


[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:lasso_cv_regressor, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(RandomForestRegressor(n_estimators = 100, …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(RandomForestRegressor(n_estimators = 100, …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(RandomForestRegressor(n_estimators = 100, …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(RandomForestRegressor(n_estimators = 100, …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(RandomForestRegressor(n_estimators = 100, …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(RandomForestRegressor(n_estimators = 100, …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(RandomForestRegressor(n_estimators = 100, …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(RandomForestRegressor(n_estimators = 100, …), …).
[36m[1

 coef (se) = 1.3566685887342813(0.64436342299211)


[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(RandomForestRegressor(n_estimators = 100, …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(DeterministicPipeline(standardizer = Standardizer(features = Symbol[], …), …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:standardizer, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:lasso_cv_regressor, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(RandomForestRegressor(n_estimators = 100, …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(DeterministicPipeline(standardizer = Standardizer(features = Symbol[], …), …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:standardizer, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:lasso_cv_regressor, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(RandomForestRegressor(n_estimators = 100, …), …).
[36m[1m[ [22m[39m[36m[1

 coef (se) = 0.935841098453697(0.5886766811548105)


[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:lasso_cv_regressor, …).


In [53]:
display("We present now our results:")
pretty_table([table_ols; table_lasso; table_rf; table_mix])

"We present now our results:"

┌────────────┬──────────┬──────────┬─────────┬───────────┐
│[1m      model [0m│[1m estimate [0m│[1m   stderr [0m│[1m  rmse_y [0m│[1m    rmse_d [0m│
│[90m     String [0m│[90m  Float64 [0m│[90m  Float64 [0m│[90m Float64 [0m│[90m   Float64 [0m│
├────────────┼──────────┼──────────┼─────────┼───────────┤
│        OLS │  1.57094 │ 0.556658 │  6.9873 │ 0.0986685 │
│    LassoCV │  1.45535 │ 0.556109 │ 6.99038 │ 0.0988129 │
│         RF │  1.35667 │ 0.644363 │ 7.39913 │ 0.0902723 │
│ RF/LassoCV │ 0.935841 │ 0.588677 │ 7.39878 │ 0.0988129 │
└────────────┴──────────┴──────────┴─────────┴───────────┘


In [56]:
display("We can see that our results show that the OLS function estimates the highest coefficient and the second lowest standard error. Lasso cross-validation shows a similar result.")
display("However, in comparison, the random forest regressor shows a bigger standard error. Furthermore, its MSE value for Y is bigger than in the other two methods while the MSE for D is smaller.")

"We can see that our results show that the OLS function estimates the highest coefficient and the second lowest standard error. Lasso cross-validation shows a similar result."

"However, in comparison, the random forest regressor shows a bigger standard error. Furthermore, its MSE value for Y is bigger than in the other two methods while the MSE for D is smaller."