# HIGH-DIMENSIONAL METRICS IN R

## 2. How to get started

In [1]:
library(hdm)
library(stats)
library(glmnet)

"package 'hdm' was built under R version 4.2.1"
"package 'glmnet' was built under R version 4.2.1"
Loading required package: Matrix

Loaded glmnet 4.1-4



### 3.1. Prediction in Linear Models using Approximate Sparsity

### 3.2. A Joint Significance Test for Lasso Regression.

In [28]:
# Simulate a sparse linear model: only the first s coefficients are non-zero.
set.seed(12345)
n <- 100 # sample size
p <- 100 # number of variables
s <- 3 # number of variables with non-zero coefficients
X <- matrix(rnorm(n * p), nrow = n, ncol = p)
beta <- rep(c(5, 0), times = c(s, p - s))
Y <- X %*% beta + rnorm(n)
# Optional export of the simulated data:
# data <- data.frame(cbind(Y, X))
# write.csv(data, "../data/3_2.csv", row.names = FALSE)

In [29]:
# Fit plain lasso (post = FALSE turns the post-lasso refit off).
# Matrix-interface alternative: lasso.reg <- rlasso(X, Y, post = FALSE)
lasso.reg <- rlasso(Y ~ X, post = FALSE)
# Summarise the fit; print(lasso.reg, all = FALSE) is an alternative.
sum.lasso <- summary(lasso.reg, all = FALSE)


Call:
rlasso.formula(formula = Y ~ X, post = FALSE)

Post-Lasso Estimation:  FALSE 

Total number of variables: 100
Number of selected variables: 11 

Residuals: 
     Min       1Q   Median       3Q      Max 
-2.09008 -0.45801 -0.01237  0.50291  2.25098 

            Estimate
(Intercept)    0.057
1              4.771
2              4.693
3              4.766
13            -0.045
15            -0.047
16            -0.005
19            -0.092
22            -0.027
40            -0.011
61             0.114
100           -0.025

Residual standard error: 0.8039
Multiple R-squared:  0.9913
Adjusted R-squared:  0.9902
Joint significance test:
 the sup score statistic for joint significance test is 64.02 with a p-value of     0


In [30]:
# Predictions from the lasso fit on the training data and on a fresh sample
# drawn from the same model.
set.seed(123)
yhat.lasso <- predict(lasso.reg) # in-sample prediction
Xnew <- matrix(rnorm(n * p), nrow = n, ncol = p) # new X
Ynew <- Xnew %*% beta + rnorm(n) # new Y
yhat.lasso.new <- predict(lasso.reg, newdata = Xnew) # out-of-sample prediction
# Optional export of the new sample:
# data <- data.frame(cbind(Ynew, Xnew))
# write.csv(data, "../data/3_2_2.csv", row.names = FALSE)

In [31]:
# Same regression, now with the post-lasso refit enabled.
post.lasso.reg <- rlasso(Y ~ X, post = TRUE)
# summary(post.lasso.reg, all = FALSE) is an alternative display.
print(post.lasso.reg, all = FALSE)


Call:
rlasso.formula(formula = Y ~ X, post = TRUE)

(Intercept)            1            2            3  
     0.0341       4.9241       4.8579       4.9644  



In [32]:
# Post-lasso predictions, then mean absolute error of both fits on the new sample.
yhat.postlasso <- predict(post.lasso.reg) # in-sample prediction
yhat.postlasso.new <- predict(post.lasso.reg, newdata = Xnew) # out-of-sample prediction
MAE <- c(
  mean(abs(Ynew - yhat.lasso.new)),
  mean(abs(Ynew - yhat.postlasso.new))
)
names(MAE) <- c("lasso MAE", "Post-lasso MAE")
print(MAE, digits = 2) # MAE for Lasso and Post-Lasso

     lasso MAE Post-lasso MAE 
          0.88           0.78 


## 4. Inference on Target Regression Coefficients

### 4.1. Intuition for the Orthogonality Principle in Linear Models via Partialling Out.

In [8]:
# Simulate n observations of p regressors; the first column is the target
# variable "d", the remaining ones are the controls x1..x19. All coefficients are 1.
set.seed(1)
n <- 5000
p <- 20
X <- matrix(
  rnorm(n * p),
  nrow = n, ncol = p,
  dimnames = list(NULL, c("d", paste0("x", 1:19)))
)
xnames <- colnames(X)[-1]
beta <- rep(1, 20)
y <- X %*% beta + rnorm(n)
dat <- data.frame(y = y, X)
# Optional export:
# save(dat, file = "../data/4_1.csv")
# write.csv(dat, "../data/4_1.csv", row.names = FALSE)


In [9]:
# Full OLS fit of y on d and all controls; show estimate and std. error for d.
rhs.full <- paste(colnames(X), collapse = "+")
fmla <- as.formula(paste("y ~ ", rhs.full))
full.fit <- lm(fmla, data = dat)
coef(summary(full.fit))["d", 1:2]

In [10]:
# Partialling out via OLS: regress y and d separately on the controls, then
# regress the y-residuals on the d-residuals.
fmla.y <- as.formula(paste("y ~ ", paste(xnames, collapse = "+")))
fmla.d <- as.formula(paste("d ~ ", paste(xnames, collapse = "+")))
# Use the residuals() extractor instead of `$res`, which only works through
# partial matching of the list element name.
rY <- residuals(lm(fmla.y, data = dat))
rD <- residuals(lm(fmla.d, data = dat))
partial.fit.ls <- lm(rY ~ rD)
# Estimate and standard error of the coefficient on the residualised d.
coef(summary(partial.fit.ls))["rD", 1:2]

In [11]:
# Partialling out with rlasso in place of OLS for the two auxiliary regressions.
# The element is spelled out as `$residuals` rather than relying on `$res`
# partial matching.
rY <- rlasso(fmla.y, data = dat)$residuals
rD <- rlasso(fmla.d, data = dat)$residuals
partial.fit.postlasso <- lm(rY ~ rD)
# Estimate and standard error of the coefficient on the residualised d.
summary(partial.fit.postlasso)$coef["rD", 1:2]

In [12]:
# Same estimate in one call: controls are all columns but the first, target is column 1.
Eff <- rlassoEffect(x = X[, -1], y = y, d = X[, 1], method = "partialling out")
summary(Eff)$coef

Estimate.,Std. Error,t value,Pr(>|t|)
0.9727387,0.01368677,71.07148,0


In [25]:
# Same target and controls, using the "double selection" method; show the
# first two columns of the coefficient table.
Eff <- rlassoEffect(x = X[, -1], y = y, d = X[, 1], method = "double selection")
summary(Eff)$coef[, 1:2]

### 4.2. Inference: Confidence Intervals and Significance Testing. The function rlassoEffects

In [26]:
# Simulate a sparse model with intercept 1 and build the data frame / formula
# used by the formula interface of rlassoEffects.
set.seed(1)
n <- 100 # sample size
p <- 100 # number of variables
s <- 3 # number of non-zero coefficients
X <- matrix(rnorm(n * p), nrow = n, ncol = p)
colnames(X) <- paste0("X", 1:p)
beta <- rep(c(3, 0), times = c(s, p - s))
y <- 1 + X %*% beta + rnorm(n)
data <- data.frame(cbind(y, X))
# write.csv(data, "../data/4_2.csv", row.names = FALSE)
colnames(data)[1] <- "y" # the outcome column arrives unnamed from cbind()
fm <- as.formula(paste("y ~", paste(colnames(X), collapse = "+")))

In [27]:
# Inference on the target coefficients X1, X2, X3 and X50 (formula interface).
# Matrix-interface alternative:
# lasso.effect <- rlassoEffects(X, y, index = c(1, 2, 3, 50))
lasso.effect <- rlassoEffects(fm, I = ~X1 + X2 + X3 + X50, data = data)
print(lasso.effect)


Call:
rlassoEffects.formula(formula = fm, data = data, I = ~X1 + X2 + 
    X3 + X50)

Coefficients:
     X1       X2       X3      X50  
2.94448  3.04127  2.97540  0.07196  



In [28]:
# Coefficient table (estimate, std. error, t value, p-value) for the target variables.
summary(lasso.effect)

[1] "Estimates and significance testing of the effect of target variables"
    Estimate. Std. Error t value Pr(>|t|)    
X1    2.94448    0.08815  33.404   <2e-16 ***
X2    3.04127    0.08389  36.253   <2e-16 ***
X3    2.97540    0.07804  38.127   <2e-16 ***
X50   0.07196    0.07765   0.927    0.354    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1



In [29]:
# Confidence interval for each target coefficient (default settings).
confint(lasso.effect)


Unnamed: 0,2.5 %,97.5 %
X1,2.77171308,3.1172421
X2,2.87685121,3.2056979
X3,2.82244962,3.1283583
X50,-0.08022708,0.2241377


In [31]:
# Joint confidence intervals (joint = TRUE) at level 0.95 for the same coefficients.
confint(lasso.effect, level = 0.95, joint = TRUE)


Unnamed: 0,2.5 %,97.5 %
X1,2.7357949,3.1531603
X2,2.84452,3.2380291
X3,2.7902789,3.160529
X50,-0.1086592,0.2525698


### 4.3. Application: the effect of gender on wage.

In [50]:
library(hdm)
data(cps2012)
# Design matrix without intercept: female, female interacted with every
# control, and the controls with all their two-way interactions.
X <- model.matrix(
  ~ -1 + female +
    female:(widowed + divorced + separated + nevermarried + hsd08 + hsd911 +
      hsg + cg + ad + mw + so + we + exp1 + exp2 + exp3) +
    (widowed + divorced + separated + nevermarried + hsd08 + hsd911 +
      hsg + cg + ad + mw + so + we + exp1 + exp2 + exp3)^2,
  data = cps2012
)
dim(X)
## [1] 29217 136
X <- X[, which(apply(X, 2, var) != 0)] # drop columns with zero variance (constants)
dim(X)
## [1] 29217 116
# Positions of all columns whose name contains "female": the target variables.
index.gender <- grep("female", colnames(X))
y <- cps2012$lnw

In [51]:
# Estimate the effects of all columns indexed by index.gender (the columns
# whose name contains "female") and display the coefficient table.
effects.female <- rlassoEffects(x = X, y = y, index = index.gender)
summary(effects.female)

[1] "Estimates and significance testing of the effect of target variables"
                    Estimate. Std. Error t value Pr(>|t|)    
female              -0.154923   0.050162  -3.088 0.002012 ** 
female:widowed       0.136095   0.090663   1.501 0.133325    
female:divorced      0.136939   0.022182   6.174 6.68e-10 ***
female:separated     0.023303   0.053212   0.438 0.661441    
female:nevermarried  0.186853   0.019942   9.370  < 2e-16 ***
female:hsd08         0.027810   0.120914   0.230 0.818092    
female:hsd911       -0.119335   0.051880  -2.300 0.021435 *  
female:hsg          -0.012890   0.019223  -0.671 0.502518    
female:cg            0.010139   0.018327   0.553 0.580114    
female:ad           -0.030464   0.021806  -1.397 0.162405    
female:mw           -0.001063   0.019192  -0.055 0.955811    
female:so           -0.008183   0.019357  -0.423 0.672468    
female:we           -0.004226   0.021168  -0.200 0.841760    
female:exp1          0.004935   0.007804   0.632 0.527139

In [52]:
# Joint confidence intervals (joint = TRUE) at level 0.95 for the female effects.
joint.CI <- confint(effects.female, level = 0.95, joint = TRUE)
joint.CI

Unnamed: 0,2.5 %,97.5 %
female,-0.29478224,-0.01506433
female:widowed,-0.13475121,0.40694218
female:divorced,0.07454815,0.19933062
female:separated,-0.11727723,0.16388275
female:nevermarried,0.12902723,0.24467974
female:hsd08,-0.376113,0.43173362
female:hsd911,-0.26962418,0.0309541
female:hsg,-0.06534868,0.03956912
female:cg,-0.04188922,0.06216633
female:ad,-0.09609866,0.03517117


### 4.4. Application: Estimation of the treatment effect in a linear model with many confounding factors

In [53]:
# Load the growth data and split it by column position: column 1 is used as
# the outcome y, column 3 as the target variable d, and everything after
# column 3 as the control matrix X.
data(GrowthData)
dim(GrowthData)
## [1] 90 63
# drop = FALSE keeps y and d as one-column data frames. The literal FALSE is
# used because `F` is an ordinary variable that can be reassigned.
y <- GrowthData[, 1, drop = FALSE]
d <- GrowthData[, 3, drop = FALSE]
X <- as.matrix(GrowthData)[, -c(1, 2, 3)]
varnames <- colnames(GrowthData)

In [54]:
# OLS benchmark: regress Outcome on d and all controls.
xnames <- varnames[-(1:3)] # names of X variables
dandxnames <- varnames[-(1:2)] # names of D and X variables
# Build the formula by pasting the names together instead of typing them out.
rhs.growth <- paste(dandxnames, collapse = "+")
fmla <- as.formula(paste("Outcome ~ ", rhs.growth))
ls.effect <- lm(fmla, data = GrowthData)

In [55]:
# Combined matrix of d and X; not used by the calls in this cell.
dX <- as.matrix(cbind(d, X))
# Effect of d on y via the "partialling out" method.
lasso.effect <- rlassoEffect(x = X, y = y, d = d, method = "partialling out")
summary(lasso.effect)

[1] "Estimates and significance testing of the effect of target variables"
     Estimate. Std. Error t value Pr(>|t|)    
[1,]  -0.04981    0.01394  -3.574 0.000351 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1



In [56]:
# Effect of d on y via the "double selection" method.
doublesel.effect <- rlassoEffect(x = X, y = y, d = d, method = "double selection")
summary(doublesel.effect)

[1] "Estimates and significance testing of the effect of target variables"
         Estimate. Std. Error t value Pr(>|t|)   
gdpsh465  -0.05001    0.01579  -3.167  0.00154 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1



In [58]:
library(xtable)
# Collect estimate and standard error of the gdpsh465 effect from the three
# fits (OLS, partialling out, double selection) into one table.
table <- rbind(
  summary(ls.effect)$coef["gdpsh465", 1:2],
  summary(lasso.effect)$coef[, 1:2],
  summary(doublesel.effect)$coef[, 1:2]
)
colnames(table) <- c("Estimate", "Std. Error") # names(summary(full.fit)$coef)[1:2]
# The second label must sit on a single source line: a string literal broken
# across lines would embed a newline character in the row name.
rownames(table) <- c(
  "full reg via ols",
  "partial reg via post-lasso ",
  "partial reg via double selection"
)
tab <- xtable(table, digits = c(2, 2, 5))
tab

Unnamed: 0_level_0,Estimate,Std. Error
Unnamed: 0_level_1,<dbl>,<dbl>
full reg via ols,-0.009377989,0.02988773
partial reg via post-lasso,-0.049811465,0.01393636
partial reg via double selection,-0.050005855,0.01579138


## 5. Instrumental Variable Estimation in a High-Dimensional Setting

### 5.2. Application: Economic Development and Institutions.

In [2]:
# AJR data: outcome GDP, endogenous variable Exprop, instrument logMort, and
# controls built from the geography variables plus their two-way interactions.
data(AJR)
y <- AJR$GDP
d <- AJR$Exprop
z <- AJR$logMort
x <- model.matrix(
  ~ -1 + (Latitude + Latitude2 + Africa + Asia + Namer + Samer)^2,
  data = AJR
)
dim(x)

In [59]:
# IV estimation with selection among the controls only
# (select.X = TRUE, select.Z = FALSE).
AJR.Xselect <- rlassoIV(
  GDP ~ Exprop + (Latitude + Latitude2 + Africa + Asia + Namer + Samer)^2 |
    logMort + (Latitude + Latitude2 + Africa + Asia + Namer + Samer)^2,
  data = AJR, select.X = TRUE, select.Z = FALSE
)
summary(AJR.Xselect)


[1] "Estimation and significance testing of the effect of target variables in the IV regression model"
       coeff.    se. t-value p-value   
Exprop 0.8450 0.2699   3.131 0.00174 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1




In [60]:
# Confidence interval for the Exprop coefficient.
confint(AJR.Xselect)

           2.5 %   97.5 %
Exprop 0.3159812 1.374072


In [62]:
# Partialling out by linear model: residualise outcome, endogenous variable
# and instrument on the controls with OLS, then run 2SLS on the residuals.
fmla.y <- GDP ~ (Latitude + Latitude2 + Africa + Asia + Namer + Samer)^2
fmla.d <- Exprop ~ (Latitude + Latitude2 + Africa + Asia + Namer + Samer)^2
fmla.z <- logMort ~ (Latitude + Latitude2 + Africa + Asia + Namer + Samer)^2
# residuals() replaces `$res`, which only worked through partial matching of
# the element name.
rY <- residuals(lm(fmla.y, data = AJR))
rD <- residuals(lm(fmla.d, data = AJR))
rZ <- residuals(lm(fmla.z, data = AJR))
# Matrix-interface alternative:
# ivfit.lm = tsls(y=rY,d=rD, x=NULL, z=rZ, intercept=FALSE)
ivfit.lm <- tsls(rY ~ rD | rZ, intercept = FALSE)
# Coefficient and standard error side by side.
print(cbind(ivfit.lm$coef, ivfit.lm$se), digits = 3)

   [,1] [,2]
rD 1.27 1.73


In [63]:
# Partialling out by lasso: same three auxiliary regressions, fitted with
# rlasso; the element is spelled out as `$residuals` rather than relying on
# `$res` partial matching.
rY <- rlasso(fmla.y, data = AJR)$residuals
rD <- rlasso(fmla.d, data = AJR)$residuals
rZ <- rlasso(fmla.z, data = AJR)$residuals
# Matrix-interface alternative:
# ivfit.lasso = tsls(y=rY,d=rD, x=NULL, z=rZ, intercept=FALSE)
ivfit.lasso <- tsls(rY ~ rD | rZ, intercept = FALSE)

In [64]:
# Coefficient table of the 2SLS fit on the lasso residuals.
summary(ivfit.lasso)

[1] "Estimates and Significance Testing from from tsls"
   Estimate Std. Error t value p value   
rD   0.8450     0.2699   3.131 0.00174 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1




### 5.3. Application: Impact of Eminent Domain Decisions on Economic Outcomes.

In [68]:
# Eminent domain data (logGDP entry): instruments z, controls x, outcome y,
# treatment d.
data(EminentDomain)
z <- as.matrix(EminentDomain$logGDP$z)
x <- as.matrix(EminentDomain$logGDP$x)
y <- EminentDomain$logGDP$y
d <- EminentDomain$logGDP$d
# Keep only the columns whose mean (ignoring NAs) exceeds 0.05.
x <- x[, apply(x, 2, mean, na.rm = TRUE) > 0.05]
z <- z[, apply(z, 2, mean, na.rm = TRUE) > 0.05]

In [69]:
# Baselines: OLS of y on d and x, and 2SLS using only the first two columns of z.
ED.ols <- lm(y ~ cbind(d, x))
ED.2sls <- tsls(y = y, d = d, x = x, z = z[, 1:2], intercept = FALSE)

In [72]:
# IV estimation with selection among the instruments only
# (select.X = FALSE, select.Z = TRUE).
lasso.IV.Z <- rlassoIV(x = x, d = d, y = y, z = z, select.X = FALSE, select.Z = TRUE)
summary(lasso.IV.Z)

[1] "Estimates and significance testing of the effect of target variables in the IV regression model"
   coeff.    se. t-value p-value
d1 0.4146 0.2902   1.428   0.153




In [73]:
# Confidence interval for the treatment coefficient (selection on Z only).
confint(lasso.IV.Z)

        2.5 %    97.5 %
d1 -0.1542764 0.9834796


In [74]:
# IV estimation with selection among both controls and instruments
# (select.X = TRUE, select.Z = TRUE).
lasso.IV.XZ <- rlassoIV(x = x, d = d, y = y, z = z, select.X = TRUE, select.Z = TRUE)
summary(lasso.IV.XZ)

Estimates and Significance Testing of the effect of target variables in the IV regression model 
     coeff.      se. t-value p-value
d1 -0.02383  0.12851  -0.185   0.853




In [75]:
# Confidence interval for the treatment coefficient (selection on X and Z).
confint(lasso.IV.XZ)

        2.5 %    97.5 %
d1 -0.2757029 0.2280335


In [79]:
library(xtable)
# One row per estimator, holding the estimate and its standard error.
table <- matrix(
  0,
  nrow = 4, ncol = 2,
  dimnames = list(
    c("ols regression", "IV estimation ", "selection on Z", "selection on X and Z"),
    c("Estimate", "Std. Error")
  )
)
table[1, ] <- summary(ED.ols)$coef[2, 1:2]
table[2, ] <- cbind(ED.2sls$coef[1], ED.2sls$se[1])
# The two summary() calls below also print their tables as a side effect.
table[3, ] <- summary(lasso.IV.Z)[, 1:2]
table[4, ] <- summary(lasso.IV.XZ)[, 1:2]
tab <- xtable(table, digits = c(2, 2, 7))
tab

[1] "Estimates and significance testing of the effect of target variables in the IV regression model"
   coeff.    se. t-value p-value
d1 0.4146 0.2902   1.428   0.153


Estimates and Significance Testing of the effect of target variables in the IV regression model 
     coeff.      se. t-value p-value
d1 -0.02383  0.12851  -0.185   0.853




Unnamed: 0_level_0,Estimate,Std. Error
Unnamed: 0_level_1,<dbl>,<dbl>
ols regression,0.007864732,0.009865927
IV estimation,-0.010733269,0.033766362
selection on Z,0.414601641,0.290249208
selection on X and Z,-0.023834697,0.128506538


## 6. Inference on Treatment Effects in a High-Dimensional Setting

### 6.3. Application: 401(k) plan participation.

In [2]:
# Pension data: outcome tw, treatment p401, instrument e401, plus the control
# columns listed in xvar.
data(pension)
y <- pension$tw
d <- pension$p401
z <- pension$e401
xvar <- c(
  "i2", "i3", "i4", "i5", "i6", "i7", "a2", "a3", "a4", "a5", "fsize", "hs",
  "smcol", "col", "marr", "twoearn", "db", "pira", "hown"
)
X <- pension[, xvar] # simple model
xpart <- paste(xvar, collapse = "+")
# Two-part formulas: outcome ~ treatment + controls | second part.
# form repeats the controls after "|"; formZ additionally lists e401 there.
form <- as.formula(paste("tw ~ ", paste(c("p401", xvar), collapse = "+"), "|", xpart))
formZ <- as.formula(paste(
  "tw ~ ", paste(c("p401", xvar), collapse = "+"), "|",
  paste(c("e401", xvar), collapse = "+")
))

In [3]:
# Average treatment effect (formula interface).
pension.ate <- rlassoATE(form, data = pension)
summary(pension.ate)

Estimation and significance testing of the treatment effect 
Type: ATE 
Bootstrap: not applicable 
   coeff.   se. t-value  p-value    
TE  10180  1931   5.273 1.34e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1




In [4]:
# Average treatment effect on the treated (formula interface).
# Matrix-interface alternative: pension.atet <- rlassoATET(X, d, y)
pension.atet <- rlassoATET(form, data = pension)
summary(pension.atet)

Estimation and significance testing of the treatment effect 
Type: ATET 
Bootstrap: not applicable 
   coeff.   se. t-value p-value    
TE  12628  2944   4.289 1.8e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1




In [6]:
# rlassoLATE called positionally with X, d, y and z.
# NOTE(review): the recorded run of this cell stopped with an error
# ("argumentos no compatibles", i.e. non-conformable arguments), so
# pension.late was not created by it. X is a data.frame here, which is
# possibly related — TODO confirm the cause before relying on later cells.
pension.late = rlassoLATE(X, d, y, z)
summary(pension.late)

ERROR: Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'drop': argumentos no compatibles


In [None]:
# rlassoLATET with the same positional inputs (X, d, y, z).
pension.latet <- rlassoLATET(X, d, y, z)
summary(pension.latet)

In [None]:
library(xtable)
# Estimate and standard error of the four treatment-effect estimators.
table <- matrix(
  0,
  nrow = 4, ncol = 2,
  dimnames = list(
    c("ATE", "ATET ", "LATE", "LATET"),
    c("Estimate", "Std. Error")
  )
)
table[1, ] <- summary(pension.ate)[, 1:2]
table[2, ] <- summary(pension.atet)[, 1:2]
table[3, ] <- summary(pension.late)[, 1:2]
table[4, ] <- summary(pension.latet)[, 1:2]
tab <- xtable(table, digits = c(2, 2, 2))

In [18]:
# Extended specification: the controls plus all their two-way interactions.
# Matrix equivalent, for reference:
# X = model.matrix(~ -1 + (i2 + i3 + i4 + i5 + i6 + i7 + a2 + a3 + a4 + a5 +
# fsize + hs + smcol + col + marr + twoearn + db + pira + hown)^2, data =
# pension) # model with interactions
#
# In a formula, `(a + b)^2` expands to a, b and a:b, whereas `(a)^2` on a
# single variable is just `a`. The controls are therefore summed inside ONE
# pair of parentheses before squaring; squaring each variable separately
# would add no interaction terms at all.
xvar2 <- paste0("(", paste(xvar, collapse = "+"), ")^2")
formExt <- as.formula(paste(
  "tw ~ ", paste(c("p401", xvar2), collapse = "+"), "|",
  paste(xvar2, collapse = "+")
))
formZExt <- as.formula(paste(
  "tw ~ ", paste(c("p401", xvar2), collapse = "+"), "|",
  paste(c("e401", xvar2), collapse = "+")
))

In [24]:
# Re-estimate the four effects through the matrix interface and tabulate them.
# Note that rlassoATE / rlassoATET receive z as their second argument here.
pension.ate <- rlassoATE(X, z, y)
pension.atet <- rlassoATET(X, z, y)
pension.late <- rlassoLATE(X, d, y, z)
pension.latet <- rlassoLATET(X, d, y, z)
# Formula-interface alternatives using the extended formulas:
# pension.ate = rlassoATE(formExt, data = pension)
# pension.atet = rlassoATET(formExt, data = pension)
# pension.late = rlassoLATE(formZExt, data = pension)
# pension.latet = rlassoLATET(formZExt, data = pension)
table <- matrix(
  0,
  nrow = 4, ncol = 2,
  dimnames = list(
    c("ATE", "ATET ", "LATE", "LATET"),
    c("Estimate", "Std. Error")
  )
)
table[1, ] <- summary(pension.ate)[, 1:2]
table[2, ] <- summary(pension.atet)[, 1:2]
table[3, ] <- summary(pension.late)[, 1:2]
table[4, ] <- summary(pension.latet)[, 1:2]
tab <- xtable(table, digits = c(2, 2, 2))

## 7. The Lasso Methods for Discovery of Significant Causes amongst Many Potential Causes, with Many Controls

In [50]:
# library(hdm) library(stats)
# Simulate p1 potential causes and p2 controls; the outcome depends only on
# the first cause and the first control (both with coefficient 5).
set.seed(1)
n <- 100
p1 <- 20
p2 <- 20
D <- matrix(rnorm(n * p1), nrow = n, ncol = p1) # Causes
W <- matrix(rnorm(n * p2), nrow = n, ncol = p2) # Controls
X <- cbind(D, W) # Regressors
Y <- 5 * D[, 1] + 5 * W[, 1] + rnorm(n) # Outcome

In [51]:
# Joint confidence intervals for the first p1 columns of X (the causes).
confint(rlassoEffects(x = X, y = Y, index = seq_len(p1)), joint = TRUE)

Unnamed: 0,2.5 %,97.5 %
V1,4.5145877,5.21430498
V2,-0.3142909,0.3049465
V3,-0.3524109,0.1867888
V4,-0.254243,0.28738914
V5,-0.2765802,0.27627177
V6,-0.3214676,0.29422684
V7,-0.2262507,0.30094168
V8,-0.0473541,0.47366372
V9,-0.1865636,0.3902352
V10,-0.2372356,0.26411185
