# HIGH-DIMENSIONAL METRICS IN R

## 2. How to get started

In [1]:
library(hdm)
library(stats)

"package 'hdm' was built under R version 4.2.1"


In [115]:
data(GrowthData)
dim(GrowthData)
## [1] 90 63
y = GrowthData[, 1, drop = F]
d = GrowthData[, 3, drop = F]
x = as.matrix(GrowthData)[, -c(1, 2, 3)]

# a = (rlasso(x = x, y = y)

In [118]:
rlasso(x = x, y = y)


Call:
rlasso.default(x = x, y = y)

Coefficients:
(Intercept)        bmp1l       freeop      freetar          h65         hm65  
    0.05810     -0.07557      0.00000      0.00000      0.00000      0.00000  
       hf65          p65         pm65         pf65          s65         sm65  
    0.00000      0.00000      0.00000      0.00000      0.00000      0.00000  
       sf65       fert65       mort65     lifee065        gpop1        fert1  
    0.00000      0.00000      0.00000      0.00000      0.00000      0.00000  
      mort1      invsh41      geetot1      geerec1         gde1       govwb1  
    0.00000      0.00000      0.00000      0.00000      0.00000      0.00000  
    govsh41     gvxdxe41       high65      highm65      highf65      highc65  
    0.00000      0.00000      0.00000      0.00000      0.00000      0.00000  
   highcm65     highcf65      human65     humanm65     humanf65        hyr65  
    0.00000      0.00000      0.00000      0.00000      0.00000      0.00000  
 

In [117]:
a = 5

## 4. Inference on Target Regression Coefficients

### 4.1. Intuition for the Orthogonality Principle in Linear Models via Partialling Out.

In [469]:
set.seed(1)
n = 5000
p = 20
X = matrix(rnorm(n * p), ncol = p)
colnames(X) = c("d", paste("x", 1:19, sep = ""))
xnames = colnames(X)[-1]
beta = rep(1, 20)
y = X %*% beta + rnorm(n)
dat = data.frame(y = y, X)
#save(dat, file = "../data/4_1.csv")
#write.csv(dat,"../data/4_1.csv", row.names = FALSE)


In [471]:
# full fit
fmla = as.formula(paste("y ~ ", paste(colnames(X), collapse = "+")))
full.fit = lm(fmla, data = dat)
summary(full.fit)$coef["d", 1:2]

In [485]:
fmla.y = as.formula(paste("y ~ ", paste(xnames, collapse = "+")))
fmla.d = as.formula(paste("d ~ ", paste(xnames, collapse = "+")))
# partial fit via ols
rY = lm(fmla.y, data = dat)$res
rD = lm(fmla.d, data = dat)$res
partial.fit.ls = lm(rY ~ rD)
summary(partial.fit.ls)$coef["rD", 1:2]

In [488]:
rY = rlasso(fmla.y, data = dat)$res
rD = rlasso(fmla.d, data = dat)$res
partial.fit.postlasso = lm(rY ~ rD)
summary(partial.fit.postlasso)$coef["rD", 1:2]

### 4.2. Inference: Confidence Intervals and Significance Testing. The function rlassoEffects

In [496]:
set.seed(1)
n = 100 #sample size
p = 100 # number of variables
s = 3 # nubmer of non-zero variables
X = matrix(rnorm(n * p), ncol = p)
colnames(X) <- paste("X", 1:p, sep = "")
beta = c(rep(3, s), rep(0, p - s))
y = 1 + X %*% beta + rnorm(n)
data = data.frame(cbind(y, X))
#write.csv(data,"../data/4_2.csv", row.names = FALSE)
colnames(data)[1] <- "y"
fm = paste("y ~", paste(colnames(X), collapse = "+"))
fm = as.formula(fm)

In [497]:
# lasso.effect = rlassoEffects(X, y, index=c(1,2,3,50))
lasso.effect = rlassoEffects(fm, I = ~X1 + X2 + X3 + X50, data = data)
print(lasso.effect)


Call:
rlassoEffects.formula(formula = fm, data = data, I = ~X1 + X2 + 
    X3 + X50)

Coefficients:
     X1       X2       X3      X50  
2.94448  3.04127  2.97540  0.07196  



In [499]:
summary(lasso.effect)

[1] "Estimates and significance testing of the effect of target variables"
    Estimate. Std. Error t value Pr(>|t|)    
X1    2.94448    0.08815  33.404   <2e-16 ***
X2    3.04127    0.08389  36.253   <2e-16 ***
X3    2.97540    0.07804  38.127   <2e-16 ***
X50   0.07196    0.07765   0.927    0.354    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1



In [500]:
confint(lasso.effect)


Unnamed: 0,2.5 %,97.5 %
X1,2.77171308,3.1172421
X2,2.87685121,3.2056979
X3,2.82244962,3.1283583
X50,-0.08022708,0.2241377


In [501]:
confint(lasso.effect, level = 0.95, joint = TRUE)


Unnamed: 0,2.5 %,97.5 %
X1,2.7279477,3.1610075
X2,2.8371214,3.2454278
X3,2.7833176,3.1674903
X50,-0.1154509,0.2593615


### 4.3. Application: the effect of gender on wage.

In [3]:
library(hdm)
data(cps2012)
X <- model.matrix(~-1 + female + female:(widowed + divorced + separated + nevermarried +
hsd08 + hsd911 + hsg + cg + ad + mw + so + we + exp1 + exp2 + exp3) + +(widowed +
divorced + separated + nevermarried + hsd08 + hsd911 + hsg + cg + ad + mw + so +
we + exp1 + exp2 + exp3)^2, data = cps2012)
dim(X)

In [4]:
X <- X[, which(apply(X, 2, var) != 0)] # exclude all constant variables
dim(X)

In [8]:
index.gender <- grep("female", colnames(X))
index.gender

In [5]:
index.gender <- grep("female", colnames(X))
y <- cps2012$lnw

In [6]:
effects.female <- rlassoEffects(x = X, y = y, index = index.gender)
summary(effects.female)

[1] "Estimates and significance testing of the effect of target variables"
                    Estimate. Std. Error t value Pr(>|t|)    
female              -0.154923   0.050162  -3.088 0.002012 ** 
female:widowed       0.136095   0.090663   1.501 0.133325    
female:divorced      0.136939   0.022182   6.174 6.68e-10 ***
female:separated     0.023303   0.053212   0.438 0.661441    
female:nevermarried  0.186853   0.019942   9.370  < 2e-16 ***
female:hsd08         0.027810   0.120914   0.230 0.818092    
female:hsd911       -0.119335   0.051880  -2.300 0.021435 *  
female:hsg          -0.012890   0.019223  -0.671 0.502518    
female:cg            0.010139   0.018327   0.553 0.580114    
female:ad           -0.030464   0.021806  -1.397 0.162405    
female:mw           -0.001063   0.019192  -0.055 0.955811    
female:so           -0.008183   0.019357  -0.423 0.672468    
female:we           -0.004226   0.021168  -0.200 0.841760    
female:exp1          0.004935   0.007804   0.632 0.527139

In [9]:
X[, index.gender]

Unnamed: 0,female,female:widowed,female:divorced,female:separated,female:nevermarried,female:hsd08,female:hsd911,female:hsg,female:cg,female:ad,female:mw,female:so,female:we,female:exp1,female:exp2,female:exp3
1663459,1,0,0,0,0,0,0,0,0,0,0,0,0,22.0,4.8400,10.648000
1663462,1,0,0,0,0,0,1,0,0,0,0,0,0,30.0,9.0000,27.000000
1663463,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0000,0.000000
1663465,1,0,0,0,0,0,0,1,0,0,0,0,0,14.0,1.9600,2.744000
1663468,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0000,0.000000
1663469,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0000,0.000000
1663473,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0000,0.000000
1663475,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0000,0.000000
1663476,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0000,0.000000
1663477,1,0,0,0,0,0,0,0,0,0,0,0,0,15.5,2.4025,3.723875


### 4.4. Application: Estimation of the treatment effect in a linear model with many confounding factors

In [2]:
data(GrowthData)
dim(GrowthData)
## [1] 90 63
y = GrowthData[, 1, drop = F]
d = GrowthData[, 3, drop = F]
X = as.matrix(GrowthData)[, -c(1, 2, 3)]
varnames = colnames(GrowthData)

In [5]:
xnames = varnames[-c(1, 2, 3)] # names of X variables
dandxnames = varnames[-c(1, 2)] # names of D and X variables
# create formulas by pasting names (this saves typing times)
fmla = as.formula(paste("Outcome ~ ", paste(dandxnames, collapse = "+")))
ls.effect = lm(fmla, data = GrowthData)

In [6]:
ls.effect


Call:
lm(formula = fmla, data = GrowthData)

Coefficients:
(Intercept)     gdpsh465        bmp1l       freeop      freetar          h65  
  2.472e-01   -9.378e-03   -6.886e-02    8.007e-02   -4.890e-01   -2.362e+00  
       hm65         hf65          p65         pm65         pf65          s65  
  7.071e-01    1.693e+00    2.655e-01    1.370e-01   -3.313e-01    3.908e-02  
       sm65         sf65       fert65       mort65     lifee065        gpop1  
 -3.067e-02   -1.799e-01    6.881e-03   -2.335e-01   -1.491e-02    9.702e-01  
      fert1        mort1      invsh41      geetot1      geerec1         gde1  
  8.838e-03    6.656e-02    7.446e-02   -7.151e-01    6.300e-01   -4.436e-01  
     govwb1      govsh41     gvxdxe41       high65      highm65      highf65  
  3.375e-01    4.632e-01   -7.934e-01   -7.525e-01   -3.903e-01   -4.177e-01  
    highc65     highcm65     highcf65      human65     humanm65     humanf65  
 -2.216e+00    2.797e-01    3.921e-01    2.337e+00   -1.209e+00   -1.10

In [4]:
dX = as.matrix(cbind(d, X))
lasso.effect = rlassoEffect(x = X, y = y, d = d, method = "partialling out")
summary(lasso.effect)

[1] "Estimates and significance testing of the effect of target variables"
     Estimate. Std. Error t value Pr(>|t|)    
[1,]  -0.04981    0.01394  -3.574 0.000351 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1



In [121]:
lasso.effect = rlassoEffect(x = X, y = y, d = d, method = "partialling out")
summary(lasso.effect)


[1] "Estimates and significance testing of the effect of target variables"
     Estimate. Std. Error t value Pr(>|t|)    
[1,]  -0.04981    0.01394  -3.574 0.000351 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1



In [122]:
doublesel.effect = rlassoEffect(x = X, y = y, d = d, method = "double selection")

In [123]:
summary(doublesel.effect)

[1] "Estimates and significance testing of the effect of target variables"
         Estimate. Std. Error t value Pr(>|t|)   
gdpsh465  -0.05001    0.01579  -3.167  0.00154 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1



In [495]:
xnames = varnames[-c(1, 2, 3)] # names of X variables
dandxnames = varnames[-c(1, 2)] # names of D and X variables
# create formulas by pasting names (this saves typing times)
fmla = as.formula(paste("Outcome ~ ", paste(dandxnames, collapse = "+")))
ls.effect = lm(fmla, data = GrowthData)
summary((ls.effect))


Call:
lm(formula = fmla, data = GrowthData)

Residuals:
      Min        1Q    Median        3Q       Max 
-0.040338 -0.011298 -0.000863  0.011813  0.043247 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)   
(Intercept)  2.472e-01  7.845e-01   0.315  0.75506   
gdpsh465    -9.378e-03  2.989e-02  -0.314  0.75602   
bmp1l       -6.886e-02  3.253e-02  -2.117  0.04329 * 
freeop       8.007e-02  2.079e-01   0.385  0.70300   
freetar     -4.890e-01  4.182e-01  -1.169  0.25214   
h65         -2.362e+00  8.573e-01  -2.755  0.01019 * 
hm65         7.071e-01  5.231e-01   1.352  0.18729   
hf65         1.693e+00  5.032e-01   3.365  0.00223 **
p65          2.655e-01  1.643e-01   1.616  0.11727   
pm65         1.370e-01  1.512e-01   0.906  0.37284   
pf65        -3.313e-01  1.651e-01  -2.006  0.05458 . 
s65          3.908e-02  1.855e-01   0.211  0.83469   
sm65        -3.067e-02  1.168e-01  -0.263  0.79479   
sf65        -1.799e-01  1.181e-01  -1.523  0.13886   
fert65       6.8

## 5. Instrumental Variable Estimation in a High-Dimensional Setting

### 5.2. Application: Economic Development and Institutions.

In [2]:
data(AJR)
y = AJR$GDP
d = AJR$Exprop
z = AJR$logMort
x = model.matrix(~-1 + (Latitude + Latitude2 + Africa + Asia + Namer + Samer)^2,
data = AJR)
dim(x)


In [3]:
AJR.Xselect = rlassoIV(GDP ~ Exprop + (Latitude + Latitude2 + Africa + Asia + Namer +
Samer)^2 | logMort + (Latitude + Latitude2 + Africa + Asia + Namer + Samer)^2,
data = AJR, select.X = TRUE, select.Z = FALSE)
summary(AJR.Xselect)


[1] "Estimation and significance testing of the effect of target variables in the IV regression model"
       coeff.    se. t-value p-value   
Exprop 0.8450 0.2699   3.131 0.00174 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1




In [4]:
confint(AJR.Xselect)

           2.5 %   97.5 %
Exprop 0.3159812 1.374072


In [5]:
# parialling out by linear model
fmla.y = GDP ~ (Latitude + Latitude2 + Africa + Asia + Namer + Samer)^2
fmla.d = Exprop ~ (Latitude + Latitude2 + Africa + Asia + Namer + Samer)^2
fmla.z = logMort ~ (Latitude + Latitude2 + Africa + Asia + Namer + Samer)^2
rY = lm(fmla.y, data = AJR)$res
rD = lm(fmla.d, data = AJR)$res
rZ = lm(fmla.z, data = AJR)$res
# ivfit.lm = tsls(y=rY,d=rD, x=NULL, z=rZ, intercept=FALSE)
ivfit.lm = tsls(rY ~ rD | rZ, intercept = FALSE)
print(cbind(ivfit.lm$coef, ivfit.lm$se), digits = 3)

   [,1] [,2]
rD 1.27 1.73


In [6]:
# parialling out by lasso
rY = rlasso(fmla.y, data = AJR)$res
rD = rlasso(fmla.d, data = AJR)$res
rZ = rlasso(fmla.z, data = AJR)$res
# ivfit.lasso = tsls(y=rY,d=rD, x=NULL, z=rZ, intercept=FALSE)
ivfit.lasso = tsls(rY ~ rD | rZ, intercept = FALSE)

In [10]:
summary(ivfit.lasso)

[1] "Estimates and Significance Testing from from tsls"
   Estimate Std. Error t value p value   
rD   0.8450     0.2699   3.131 0.00174 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1




### 5.3. Application: Impact of Eminent Domain Decisions on Economic Outcomes.

In [45]:
data(EminentDomain)
z <- as.matrix(EminentDomain$logGDP$z)
x <- as.matrix(EminentDomain$logGDP$x)
y <- EminentDomain$logGDP$y
d <- EminentDomain$logGDP$d
x <- x[, apply(x, 2, mean, na.rm = TRUE) > 0.05] #
z <- z[, apply(z, 2, mean, na.rm = TRUE) > 0.05] #

In [46]:
ED.ols = lm(y ~ cbind(d, x))
ED.2sls = tsls(y = y, d = d, x = x, z = z[, 1:2], intercept = FALSE)

In [47]:
lasso.IV.Z = rlassoIV(x = x, d = d, y = y, z = z, select.X = FALSE, select.Z = TRUE)
summary(lasso.IV.Z)

[1] "Estimates and significance testing of the effect of target variables in the IV regression model"
   coeff.    se. t-value p-value
d1 0.4146 0.2902   1.428   0.153




In [22]:
confint(lasso.IV.Z)

        2.5 %    97.5 %
d1 -0.1542764 0.9834796


In [49]:
lasso.IV.XZ = rlassoIV(x = x, d = d, y = y, z = z, select.X = TRUE, select.Z = TRUE)
summary(lasso.IV.XZ)

Estimates and Significance Testing of the effect of target variables in the IV regression model 
     coeff.      se. t-value p-value
d1 -0.02383  0.12851  -0.185   0.853




In [24]:
confint(lasso.IV.XZ)

        2.5 %    97.5 %
d1 -0.2757029 0.2280335


In [30]:
summary(ED.ols)$coef[2, 1:2]

In [31]:
cbind(ED.2sls$coef[1], ED.2sls$se[1])

0,1,2
d1,-0.01073327,0.03376636


In [32]:
library(xtable)
table = matrix(0, 4, 2)
table[1, ] = summary(ED.ols)$coef[2, 1:2]
table[2, ] = cbind(ED.2sls$coef[1], ED.2sls$se[1])
table[3, ] = summary(lasso.IV.Z)[, 1:2]

[1] "Estimates and significance testing of the effect of target variables in the IV regression model"
   coeff.    se. t-value p-value
d1 0.4146 0.2902   1.428   0.153




In [27]:
table

0,1
0.007864732,0.009865927
-0.010733269,0.033766362
0.414601641,0.290249208
0.0,0.0


## 7. The Lasso Methods for Discovery of Significant Causes amongst Many Potential Causes, with Many Controls

In [50]:
# library(hdm) library(stats)
set.seed(1)
n = 100
p1 = 20
p2 = 20
D = matrix(rnorm(n * p1), n, p1) # Causes
W = matrix(rnorm(n * p2), n, p2) # Controls
X = cbind(D, W) # Regressors
Y = D[, 1] * 5 + W[, 1] * 5 + rnorm(n) #Outcome

In [233]:
# data = data.frame(cbind(Y, X))
# write.csv(data,"../data/7_.csv", row.names = FALSE)

In [51]:
confint(rlassoEffects(X, Y, index = c(1:p1)), joint = TRUE)

Unnamed: 0,2.5 %,97.5 %
V1,4.5145877,5.21430498
V2,-0.3142909,0.3049465
V3,-0.3524109,0.1867888
V4,-0.254243,0.28738914
V5,-0.2765802,0.27627177
V6,-0.3214676,0.29422684
V7,-0.2262507,0.30094168
V8,-0.0473541,0.47366372
V9,-0.1865636,0.3902352
V10,-0.2372356,0.26411185
