In [43]:
# !wget https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb
# !dpkg -i cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb
# !apt-key add /var/cuda-repo-9-0-local/7fa2af80.pub
# !apt update -q
# !apt install cuda gcc-6 g++-6 -y -q
# !ln -s /usr/bin/gcc-6 /usr/local/cuda/bin/gcc
# !ln -s /usr/bin/g++-6 /usr/local/cuda/bin/g++

In [44]:
# !curl -sSL "https://julialang-s3.julialang.org/bin/linux/x64/1.7/julia-1.7.3-linux-x86_64.tar.gz" -o julia.tar.gz
# !tar -xzf julia.tar.gz -C /usr --strip-components 1
# !rm -rf julia.tar.gz*
# !julia -e 'using Pkg; pkg"add IJulia; precompile"'

In [45]:
# using Pkg
# Pkg.add("CSV"), using CSV
# Pkg.add("DataFrames"), using DataFrames
# Pkg.add("StatsModels"), using StatsModels
# Pkg.add("GLM"), using GLM
# Pkg.add("Random"), using Random
# Pkg.add("MLDataUtils"), using MLDataUtils
# Pkg.add("MLBase"), using MLBase
# Pkg.add("FixedEffectModels"), using FixedEffectModels
# Pkg.add("Lasso"), using Lasso
# Pkg.add("MLJ"), using MLJ
# Pkg.add("DecisionTree"), using DecisionTree
# Pkg.add("RData"), using RData
# Pkg.add("GLMNet"), using GLMNet
# Pkg.add("PrettyTables"), using PrettyTables

In [46]:
using Pkg, CSV, DataFrames, StatsModels, GLM, Random, RData, MLDataUtils, MLBase, FixedEffectModels, Lasso, MLJ, DecisionTree, GLMNet, PrettyTables

# Debiased ML for Partially Linear Model in Julia

This is a simple implementation of Debiased Machine Learning for the Partially Linear Regression Model.

Reference: 

https://arxiv.org/abs/1608.00060


https://www.amazon.com/Business-Data-Science-Combining-Accelerate/dp/1260452778

The code is based on the book.

## DML algorithm

Here we perform estimation and inference of predictive coefficient $\alpha$ in the partially linear statistical model,

$$
Y = D\alpha + g(X) + U, \quad E (U | D, X) = 0. 
$$

For $\tilde Y = Y- E(Y|X)$ and $\tilde D= D- E(D|X)$, we can write

$$
\tilde Y = \alpha \tilde D + U, \quad E (U |\tilde D) =0.
$$

Parameter $\alpha$ is then estimated using cross-fitting approach to obtain the residuals $\tilde D$ and $\tilde Y$.
The algorithm comsumes $Y, D, X$, and machine learning methods for learning the residuals $\tilde Y$ and $\tilde D$, where
the residuals are obtained by cross-validation (cross-fitting).

The statistical parameter $\alpha$ has a causal intertpreation of being the effect of $D$ on $Y$ in the causal DAG $$ D\to Y, \quad X\to (D,Y)$$ or the counterfactual outcome model with conditionally exogenous (conditionally random) assignment of treatment $D$ given $X$:

$$
Y(d) = d\alpha + g(X) + U(d),\quad  U(d) \perp\kern-5pt\perp  D |X, \quad Y = Y(D), \quad U = U(D).
$$


## Load data

In [47]:
url = "https://github.com/d2cml-ai/14.388_jl/raw/main/data/GrowthData.RData"
download(url, "data.RData")
rdata_read = RData.load("data.RData")
data = rdata_read["GrowthData"]
names(data)
println("Number of Rows : ", size(data)[1],"\n","Number of Columns : ", size(data)[2],) #rows and columns

Number of Rows : 90
Number of Columns : 63


In [63]:
data

Row,Outcome,intercept,gdpsh465,bmp1l,freeop,freetar,h65,hm65,hf65,p65,pm65,pf65,s65,sm65,sf65,fert65,mort65,lifee065,gpop1,fert1,mort1,invsh41,geetot1,geerec1,gde1,govwb1,govsh41,gvxdxe41,high65,highm65,highf65,highc65,highcm65,highcf65,human65,humanm65,humanf65,hyr65,hyrm65,hyrf65,no65,nom65,nof65,pinstab1,pop65,worker65,pop1565,pop6565,sec65,secm65,secf65,secc65,seccm65,seccf65,syr65,syrm65,syrf65,teapri65,teasec65,ex1,im1,xr65,tot1
Unnamed: 0_level_1,Float64,Int32,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Int32,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,-0.0243358,1,6.59167,0.2837,0.153491,0.043888,0.007,0.013,0.001,0.29,0.37,0.21,0.04,0.06,0.02,6.67,0.16,3.69387,0.0203,6.68,0.165,0.11898,0.0195,0.0176,0.019,0.0931,0.1158,0.07877,0.12,0.23,0.01,0.09,0.18,0.01,0.301,0.568,0.043,0.004,0.008,0.0,89.46,79.98,98.61,0.0,12359,0.3469,0.4441,0.0275912,0.45,0.75,0.17,0.13,0.21,0.04,0.033,0.057,0.01,47.6,17.3,0.0729,0.0667,0.348,-0.014727
2,0.100473,1,6.82979,0.6141,0.313509,0.061827,0.019,0.032,0.007,0.91,1.0,0.65,0.16,0.23,0.09,6.97,0.145,3.93378,0.0185,7.114,0.154,0.12048,0.0556,0.0369,0.019,0.1589,0.156,0.09999,0.7,1.18,0.2,0.63,1.04,0.2,0.706,1.138,0.257,0.027,0.045,0.008,89.1,82.35,96.1,0.02325,4630,0.2703,0.4474,0.0356371,3.0,4.74,1.2,1.36,2.05,0.64,0.173,0.274,0.067,57.1,18.0,0.094,0.1438,0.525,0.00575
3,0.0670515,1,8.89508,0.0,0.204244,0.009186,0.26,0.325,0.201,1.0,1.0,1.0,0.56,0.62,0.51,3.11,0.024,4.27388,0.0188,3.662,0.027,0.23098,0.0465,0.0365,0.04,0.1442,0.1367,0.06,16.67,17.95,15.41,4.5,5.7,3.31,8.317,8.249,8.384,0.424,0.473,0.375,1.4,1.4,1.4,0.0,19678,0.3874,0.3175,0.0766846,36.74,33.5,39.95,15.68,13.19,18.14,2.573,2.478,2.667,26.5,20.7,0.1741,0.175,1.082,-0.01004
4,0.0640892,1,7.56528,0.1997,0.248714,0.03627,0.061,0.07,0.051,1.0,1.0,1.0,0.24,0.22,0.31,6.26,0.072,4.16821,0.0345,6.83,0.085,0.12928,0.0375,0.035,0.011,0.1165,0.2018,0.15616,3.1,3.4,2.8,2.11,2.28,1.95,3.833,3.86,3.807,0.104,0.114,0.095,20.6,20.6,20.6,0.0,1482,0.3011,0.4671,0.0310391,7.6,7.5,7.7,2.76,2.89,2.63,0.438,0.453,0.424,27.8,22.7,0.1265,0.1496,6.625,-0.002195
5,0.0279295,1,7.1624,0.174,0.299252,0.037367,0.017,0.027,0.007,0.82,0.85,0.81,0.17,0.15,0.13,6.71,0.12,3.9982,0.031,6.816,0.131,0.07932,0.0257,0.0224,0.012,0.0971,0.169,0.13427,0.67,0.98,0.36,0.45,0.66,0.25,1.9,2.084,1.72,0.022,0.033,0.012,58.73,55.56,61.82,0.2,3006,0.3314,0.4561,0.0262808,5.07,5.37,4.78,2.17,2.23,2.11,0.257,0.287,0.229,34.5,17.6,0.1211,0.1308,2.5,0.003283
6,0.0464074,1,7.21891,0.0,0.258865,0.02088,0.023,0.038,0.006,0.5,0.55,0.5,0.08,0.1,0.07,6.7,0.112,3.88978,0.0303,6.83,0.119,0.07608,0.0151,0.0156,0.009,0.0713,0.0734,0.04899,0.7,1.09,0.3,0.48,0.74,0.21,1.426,1.622,1.227,0.024,0.037,0.01,69.2,64.27,74.2,0.4242,4568,0.3105,0.4599,0.0273643,3.5,3.6,3.4,1.02,0.59,1.46,0.16,0.174,0.146,34.3,8.1,0.0634,0.0762,1.0,-0.001747
7,0.0673323,1,7.8536,0.0,0.182525,0.014385,0.039,0.063,0.014,0.92,0.94,0.92,0.17,0.21,0.12,6.72,0.082,4.08766,0.032,6.744,0.087,0.17044,0.0139,0.0134,0.007,0.0615,0.0675,0.0471,1.85,3.11,0.66,1.09,1.74,0.46,2.789,3.426,2.182,0.059,0.097,0.022,42.07,34.56,49.23,0.0,44752,0.2822,0.4612,0.0334287,5.28,6.64,3.98,2.42,3.28,1.59,0.342,0.484,0.207,46.6,14.7,0.0342,0.0428,12.499,0.009092
8,0.0209777,1,7.70391,0.2776,0.215275,0.029713,0.024,0.035,0.013,0.69,0.69,0.69,0.14,0.14,0.13,7.19,0.121,3.91999,0.0268,7.302,0.131,0.13006,0.0173,0.0165,0.018,0.0796,0.1137,0.07942,0.6,1.13,0.1,0.41,0.77,0.07,2.148,2.21,2.09,0.02,0.038,0.003,49.6,49.6,49.6,0.0,1750,0.2984,0.4812,0.0234286,4.3,4.83,3.8,1.25,0.85,1.63,0.184,0.219,0.152,34.0,16.1,0.0864,0.0931,7.0,0.01163
9,0.0335512,1,9.06346,0.0,0.109614,0.002171,0.402,0.488,0.314,1.0,1.0,1.0,0.9,0.9,0.9,2.91,0.025,4.25135,0.0146,3.448,0.025,0.20156,0.0474,0.0383,0.088,0.1985,0.1842,0.05831,19.18,22.0,16.59,11.6,14.14,9.27,9.359,9.389,9.332,0.616,0.723,0.517,2.27,2.25,2.29,0.001585,194303,0.4111,0.2956,0.0947283,46.48,42.06,50.55,22.01,19.07,24.72,3.206,3.154,3.253,28.2,20.6,0.0594,0.046,1.0,0.008169
10,0.0391465,1,8.15191,0.1484,0.110885,0.028579,0.145,0.173,0.114,1.0,1.0,1.0,0.28,0.26,0.4,3.07,0.058,4.18662,0.0155,3.08,0.06,0.25374,0.0236,0.0223,0.021,0.0998,0.0984,0.05517,3.74,5.22,2.24,2.25,3.06,1.43,5.336,5.549,5.122,0.12,0.166,0.074,10.5,9.01,12.01,0.90485,22283,0.3906,0.3007,0.0622448,13.07,13.72,12.41,6.19,5.62,6.76,0.703,0.785,0.62,20.3,7.2,0.0524,0.0523,2.119,0.007584


In [61]:
y = data[!,1]
y= reshape(y, (length(y),1))
d = data[!,3]
d= reshape(d, (length(y),1))
x = data[!,4:end]
x = Matrix(x);

## Naive OLS that uses all features w/o cross-fitting

In [67]:
println("\n length of y is \n", size(y,1) )
println("\n num features x is \n", size(x,1 ) )

# Naive OLS
print( "\n Naive OLS that uses all features w/o cross-fitting \n" )
fm = term(:Outcome) ~ term(:gdpsh465) +sum(term.(Symbol.(names(data[:,4:size(data,2)]))));
lres = reg(data, fm);
first(DataFrame(GLM.coeftable(lres)))


 length of y is 
90

 num features x is 
90

 Naive OLS that uses all features w/o cross-fitting 


Row,Name,Estimate,Std. Error,t-stat,Pr(>|t|),Lower 95%,Upper 95%
Unnamed: 0_level_1,String,Float64,Float64,Float64,Float64,Float64,Float64
1,gdpsh465,-0.00937799,0.0298877,-0.313774,0.756104,-0.0707025,0.0519466


## DML with OLS w/o feature selection

In [68]:
function DML2_for_PLM(x , d , y, dreg , yreg , nfold)
    
    # Num ob observations
    nobser = size(x,1)
    
    # Define folds indices 
    foldid = collect(Kfold(size(x)[1], nfold))
    
    # Create array to save errors 
    ytil = ones(nobser)
    dtil = ones(nobser)
    println("Folds: " )
    
    # loop to save results
    for i in 1:nfold
        
        # Lasso regression, excluding folds selected 
        dfit = dreg(x[foldid[i],:], d[foldid[i]])
        yfit = yreg(x[foldid[i],:], y[foldid[i]]) 
        
        # Predict estimates using the 
        dhat = GLM.predict(dfit, x[Not(foldid[i]),:])
        yhat = GLM.predict(yfit, x[Not(foldid[i]),:])
        
        # Save errors 
        dtil[Not(foldid[i])] = (d[Not(foldid[i])] - dhat)
        ytil[Not(foldid[i])] = (y[Not(foldid[i])] - yhat)
        println(i)
    end
    
    # Create dataframe 
    data = DataFrame(ytil = ytil, dtil = dtil)
    
    # OLS clustering at the County level
    rfit = reg(data, @formula(ytil ~ dtil ))
    coef_est = GLM.coef(rfit)[2]
    se = GLM.coeftable(rfit).cols[2][2]
    
    println(" coef (se) = ", coef_est ,"(",se,")")
    
    return rfit, data;
    
end

DML2_for_PLM (generic function with 1 method)

In [69]:
#DML with OLS
print( "\n DML with OLS w/o feature selection \n" )

dreg(x, d) = lm(x,vec(d))    
yreg(x, y) = lm(x,vec(y))

DML2_ols = DML2_for_PLM(x, d, y, dreg, yreg, 10 );


 DML with OLS w/o feature selection 
Folds: 
1
2
3
4
5
6
7
8
9
10
 coef (se) = -0.031728188902734794(0.005897663821878339)


## DML with Lasso

In [70]:
function DML2_lasso_cv(x , d , y, dreg , yreg , nfold)
    
    # Num ob observations
    nobser = size(x,1)
    
    # Define folds indices
    foldid = collect(Kfold(size(x)[1], nfold))
    
    # Create array to save errors 
    ytil = ones(nobser)
    dtil = ones(nobser)
    println("Folds: " )
    
    # loop to save results
    for i in 1:nfold
        dfit = dreg(x[foldid[i],:], d[foldid[i]])
        yfit = yreg(x[foldid[i],:], y[foldid[i]])
        
        dhat = GLMNet.predict(dfit, x[Not(foldid[i]),:])
        yhat = GLMNet.predict(yfit, x[Not(foldid[i]),:])
        
        dtil[Not(foldid[i])]   = (d[Not(foldid[i])] - dhat)
        ytil[Not(foldid[i])]   = (y[Not(foldid[i])] - yhat)
        println(i)
    end
    
    # Create dataframe 
    data = DataFrame(ytil = ytil, dtil = dtil)
    
    # OLS clustering at the County level
    rfit = lm(@formula(ytil ~ dtil), data)
    coef_est = GLM.coef(rfit)[2]
    se = GLM.coeftable(rfit).cols[2][2]

    println(" coef (se) = ", coef_est ,"(",se,")")
    
    return rfit, data;
    
end

DML2_lasso_cv (generic function with 1 method)

In [71]:
# DML with LASSO
print( "\n DML with Lasso \n" )

##ML method = lasso from glmnet 
dreg(x, d) = glmnetcv(x, d, alpha = 1)    
yreg(x, y) = glmnetcv(x, y, alpha = 1)  
DML2_lasso_cv_1 = DML2_lasso_cv(x, d, y, dreg, yreg, 10);


 DML with Lasso 
Folds: 
1
2
3
4
5
6
7
8
9
10
 coef (se) = -0.024715831143724415(0.01595781516785478)


## DML with Random Forest

In [72]:
function DML2_RF(x , d , y, dreg , yreg , nfold)
    
    # Num ob observations
    nobser = size(x,1)
    
    # Define folds indices
    foldid = collect(Kfold(size(x)[1], nfold))
    
    # Create array to save errors 
    ytil = ones(nobser)
    dtil = ones(nobser)
    println("Folds: " )
    
    # loop to save results
    for i in 1:nfold
        dfit = dreg(x[foldid[i],:], d[foldid[i]])
        yfit = yreg(x[foldid[i],:], y[foldid[i]])
        
        dhat = apply_forest(dfit,x[Not(foldid[1]),:])
        yhat = apply_forest(yfit,x[Not(foldid[1]),:])
        
        dtil[Not(foldid[i])]   = (d[Not(foldid[i])] - dhat)
        ytil[Not(foldid[i])]   = (y[Not(foldid[i])] - yhat)
        println(i)
    end
    
    # Create dataframe 
    data = DataFrame(ytil = ytil, dtil = dtil)
    
    # OLS clustering at the County level
    rfit = reg(data, @formula(ytil ~ dtil)) #unico cambio
    coef_est = GLM.coef(rfit)[2]
    se = GLM.coeftable(rfit).cols[2][2]

    println(" coef (se) = ", coef_est ,"(",se,")")
    
    return rfit, data;
    
end

DML2_RF (generic function with 1 method)

In [73]:
print( "\n DML with Random Forest \n" )
function dreg(x,d)
    min_samples_leaf = 5
    rng = 3
    RFmodel = build_forest(d,x, min_samples_leaf, rng)
end
function yreg(x,y)
    min_samples_leaf = 5
    rng = 3
    RFmodel = build_forest(y,x, min_samples_leaf, rng)
end

DML2_RF_1 = DML2_RF(x, d, y, dreg, yreg, 5);


 DML with Random Forest 
Folds: 
1
2
3
4
5
 coef (se) = 0.0023955182062859477(0.006333088985099034)


## DML with Lasso/Random Forest

In [74]:
function DML2_lasso_RF(x , d , y, dreg , yreg , nfold)
    
    # Num ob observations
    nobser = size(x,1)
    
    # Define folds indices
    foldid = collect(Kfold(size(x)[1], nfold))
    
    # Create array to save errors 
    ytil = ones(nobser)
    dtil = ones(nobser)
    println("Folds: " )
    
    # loop to save results
    for i in 1:nfold
        dfit = dreg(x[foldid[i],:], d[foldid[i]])
        yfit = yreg(x[foldid[i],:], y[foldid[i]])
        
        dhat = GLMNet.predict(dfit,x[Not(foldid[1]),:])
        yhat = apply_forest(yfit,x[Not(foldid[1]),:])
        
        dtil[Not(foldid[i])]   = (d[Not(foldid[i])] - dhat)
        ytil[Not(foldid[i])]   = (y[Not(foldid[i])] - yhat)
        println(i)
    end
    
    # Create dataframe 
    data = DataFrame(ytil = ytil, dtil = dtil)
    
    # OLS clustering at the County level
    rfit = reg(data, @formula(ytil ~ dtil)) #unico cambio
    coef_est = GLM.coef(rfit)[2]
    se = GLM.coeftable(rfit).cols[2][2]

    println(" coef (se) = ", coef_est ,"(",se,")")
    
    return rfit, data;
    
end

DML2_lasso_RF (generic function with 1 method)

In [75]:
print( "\n DML with Lasso/Random Forest \n" )

dreg(x, d) = glmnetcv(x, d, alpha = 1)

    min_samples_leaf = 5
    rng = 3
yreg(x,y) = build_forest(y,x, min_samples_leaf, rng)

DML2_lasso_RF_1 = DML2_lasso_RF(x , d , y, dreg , yreg , 2);


 DML with Lasso/Random Forest 
Folds: 
1
2
 coef (se) = -0.005744384715407882(0.006024222705680695)


## Root Mean Square Error

In [76]:
mods = [DML2_ols, DML2_lasso_cv_1, DML2_RF_1];
mods_name = ["DML2_ols", "DML2_lasso", "DML2_RF"];

In [77]:
RMSE_Y = []
RMSE_D = []

for i in mods
    push!(RMSE_Y, sqrt(sum(i[2][!,1].^2)/length(i[2][!,1])))
    push!(RMSE_D,sqrt(sum(i[2][!,2].^2)/length(i[2][!,2])))
end

result = DataFrame([mods_name RMSE_Y RMSE_D], [:Models, :RMSE_Y, :RMSE_D])
pretty_table(result; formatters = ft_printf("%5.10f"))

┌────────────┬──────────────┬──────────────┐
│[1m     Models [0m│[1m       RMSE_Y [0m│[1m       RMSE_D [0m│
│[90m        Any [0m│[90m          Any [0m│[90m          Any [0m│
├────────────┼──────────────┼──────────────┤
│   DML2_ols │ 0.0571382009 │ 0.4501549270 │
│ DML2_lasso │ 0.0544140859 │ 0.3584153226 │
│    DML2_RF │ 0.0582499591 │ 1.0812695669 │
└────────────┴──────────────┴──────────────┘
