# HIGH-DIMENSIONAL METRICS IN R (Julia replication with HDMjl.jl)

## 2. How to get started

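Install the package from GitHub by uncommenting the `Pkg.add` cell below, or load it from a local checkout with `include`, as done further down.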

In [445]:
#import Pkg; Pkg.add(url = "https://github.com/d2cml-ai/HDMjl.jl")

In [1]:
using CodecXz, RData, DataFrames, StatsModels, Statistics, Distributions, PrettyTables, Distributions

In [1]:
import Pkg

In [88]:
include("E:/causal_ml/hdm_paper/prueba/HDMjl.jl/src/HDMjl.jl")



Main.HDMjl

In [444]:
#Pkg.develop(path = "E:/causal_ml/hdm_paper/HDMjl.jl")

## 3. Prediction using Approximate Sparsity

### 3.2. A Joint Significance Test for Lasso Regression.

In [3]:
# Simulated sparse linear model used to illustrate rlasso.
using Random
Random.seed!(1234);   # fixed seed so the draws below are reproducible
n = 100;              # number of rows of X
p = 100;              # number of columns of X
s = 3;                # number of non-zero entries in beta
X = randn(n, p);      # standard-normal design matrix
beta = vcat(fill(5, s), zeros(p - s));   # first s coefficients are 5, the rest 0
Y = X * beta + randn(n);                 # response = signal + standard-normal noise

In [4]:
# Fit on the simulated data with `post = false`; the following cell repeats the
# fit with `post = true` (post-lasso).
lasso_reg = HDMjl.rlasso(X, Y, post = false);

In [5]:
post_lasso_reg = HDMjl.rlasso(X, Y, post = true) #now use post-lasso
# Transposed only so the coefficient vector displays as a single row.
post_lasso_reg["coefficients"]'

1×101 adjoint(::Vector{Float64}) with eltype Float64:
 -0.00682754  5.00958  4.93178  5.17705  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0

## 4. Inference on Target Regression Coefficients

### 4.1. Intuition for the Orthogonality Principle in Linear Models via Partialling Out.

### 4.2. Inference: Confidence Intervals and Significance Testing. The function rlassoEffects

In [31]:
"""
    print_effects(object, digits = 3)

Print the values in `object["coefficients"]`, rounded to `digits` decimals, as
borderless tables whose column headers are `X<i>` for each entry `i` of
`object["index"]`.

The coefficients are laid out at most ten per table. When there are no
coefficients the text `No coefficients` is printed instead. Always returns
`nothing`.
"""
function print_effects(object, digits = 3)
    coefs = object["coefficients"]
    if length(coefs) == 0
        print("No coefficients\n")
        return nothing
    end

    # `collect` + `vec` normalise scalars, vectors and k×1 matrices to a plain vector.
    labels = vec(collect("X$y" for y in object["index"]))
    values = vec(collect(round.(coefs, digits = digits)))

    println("Coefficients:\n")
    # Integer chunking. The previous `trunc(k / 10)` arithmetic yielded Float64
    # bounds, which are not valid array indices, and an empty trailing table
    # whenever the number of coefficients was a multiple of ten.
    for cols in Iterators.partition(eachindex(values), 10)
        pretty_table(permutedims(values[cols]), tf = tf_borderless, header = labels[cols])
    end
    return nothing
end

print_effects (generic function with 2 methods)

In [4]:
"""
    summary_effects(object)

Print a table with one row per coefficient in `object["coefficients"]` and the
columns estimate, standard error (`object["se"]`), their ratio, and the
two-sided standard-normal p-value of that ratio. Rows are labelled `X<i>` for
each entry `i` of `object["index"]`.

Prints `No coefficients` when `object` is empty or holds no coefficients.
Always returns `nothing`.
"""
function summary_effects(object)
    # Guard on the coefficients themselves; the result container is never empty
    # once it carries its keys, so checking only `length(object)` hid this case.
    if length(object) == 0 || length(object["coefficients"]) == 0
        print("No coefficients\n")
        return nothing
    end

    k = length(object["coefficients"])
    table = zeros(k, 4)
    table[:, 1] .= object["coefficients"]
    table[:, 2] .= object["se"]
    table[:, 3] .= table[:, 1] ./ table[:, 2]
    # `cdf` is defined for scalars, so it is broadcast over the column explicitly.
    table[:, 4] .= 2 .* cdf.(Normal(), .-abs.(table[:, 3]))

    print("Estimates and significance testing of the effect of target variables",
            "\n")
    pretty_table(
        table,
        show_row_number = false,
        header = ["Estimate.", "Std. Error", "t value", "Pr(>|t|)"],
        tf = tf_borderless,
        row_names = ["X$y" for y = object["index"]],
    )
    print("---", "\n", "Signif. codes:","\n", "0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1")
    print("\n")
    return nothing
end

summary_effects (generic function with 1 method)

In [5]:
"""
    confint_effects(object, level = 0.95)

Print normal-approximation confidence intervals at coverage `level` for every
coefficient in `object["coefficients"]`, using the standard errors in
`object["se"]`. Rows are labelled `X<i>` for each entry `i` of
`object["index"]`; the two columns are labelled with the lower and upper tail
percentages (for example `2.5%` and `97.5%`).

Prints `No coefficients` when there are no coefficients. Always returns
`nothing`.
"""
function confint_effects(object, level = 0.95)
    cf = object["coefficients"]
    if length(cf) == 0
        print("No coefficients\n")
        return nothing
    end
    ses = object["se"]

    tail = (1 - level) / 2
    probs = [tail, 1 - tail]
    fac = quantile.(Normal(), probs)
    # Scale to percent before rounding so labels do not pick up floating-point
    # noise (e.g. 0.07 * 100 == 7.000000000000001).
    pct = string.(round.(100 .* probs; digits = 1), "%")

    # k×2 matrix of [lower upper] bounds: column of estimates plus the column of
    # standard errors times the row of normal quantiles.
    c_i = vec(collect(cf)) .+ vec(collect(ses)) .* permutedims(fac)

    pretty_table(
        c_i;
        header = pct,
        show_row_number = false,
        tf = tf_borderless,
        row_names = ["X$y" for y = object["index"]],
    )
    return nothing
end

confint_effects (generic function with 2 methods)

In [6]:
using Random, Distributions, PrettyTables, DataFrames
# Simulated sparse linear model with an intercept, used to illustrate rlassoEffects.
Random.seed!(1234);   # fixed seed so the draws below are reproducible
n = 100;              # number of rows of x
p = 100;              # number of columns of x
s = 3;                # number of non-zero entries in beta
x = randn(n, p);      # standard-normal design matrix
beta = vcat(fill(3, s), zeros(p - s));   # first s coefficients are 3, the rest 0
y =1 .+ x * beta + randn(n);             # intercept 1 + signal + standard-normal noise

In [7]:
# Request effects for the entries listed in `index` (reported as X1, X2, X3 and
# X50 in the printed output below).
lasso_effects = HDMjl.rlassoEffects(x, y, index = [1,2,3,50]);

In [10]:
HDMjl.r_print(lasso_effects)

Coefficients:

 [1m    X1 [0m [1m    X2 [0m [1m    X3 [0m [1m    X50 [0m

  2.925   2.903   3.101   -0.227


In [12]:
HDMjl.r_summary(lasso_effects)

Estimates and significance testing of the effect of target variables
 [1m     [0m [1m Estimate. [0m [1m Std. Error [0m [1m  t value [0m [1m     Pr(>|t|) [0m

 [1m  X1 [0m    2.92541     0.103597    28.2384   1.97828e-175
 [1m  X2 [0m    2.90258     0.105907     27.407   2.26599e-165
 [1m  X3 [0m    3.10095     0.110626     28.031   6.80893e-173
 [1m X50 [0m   -0.22712    0.0910927   -2.49329      0.0126566
---
Signif. codes:
0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


In [13]:
HDMjl.r_confint(lasso_effects)

 [1m     [0m [1m      2.5% [0m [1m     97.5% [0m

 [1m  X1 [0m    2.72236     3.12845
 [1m  X2 [0m    2.69501     3.11016
 [1m  X3 [0m    2.88413     3.31778
 [1m X50 [0m  -0.405659   -0.048582


### 4.3. Application: the effect of gender on wage

### 4.4. Application: Estimation of the treatment effect in a linear model with many confounding factors

In [44]:
using CodecXz
using RData
using DataFrames
# Download the GrowthData .rda file from the CRAN mirror of the R package hdm
# and pull out the object stored under the key "GrowthData".
url = "https://github.com/cran/hdm/raw/master/data/GrowthData.rda";
GrowthData = load(download(url))["GrowthData"];
y = GrowthData[:, 1];                      # column 1, passed as `y` to rlassoEffect below
d = GrowthData[:, 3];                      # column 3, passed as `d` to rlassoEffect below
X = Matrix(GrowthData[:, Not(1, 2, 3)]);   # every remaining column except 1-3, as a matrix

In [45]:
# Effect of d on y given X, using the "partialling out" method.
lasso_effect = HDMjl.rlassoEffect(X, y, d, method = "partialling out");

In [18]:
HDMjl.r_print(lasso_effect)

Coefficients:

 [1m     X1 [0m

  -0.053


In [20]:
HDMjl.r_summary(lasso_effect);

Estimates and significance testing of the effect of target variables
 [1m Row [0m [1m Estimate. [0m [1m Std. Error [0m [1m t value [0m [1m    Pr(>|t|) [0m

    1    -0.05333    0.0143283    -3.722   0.000197655
---
Signif. codes:
0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


In [21]:
# Same inputs as the partialling-out fit above, now with the "double selection" method.
doublesel_effect = HDMjl.rlassoEffect(X, y, d, method = "double selection");

In [22]:
HDMjl.r_print(doublesel_effect)

Coefficients:

 [1m     X1 [0m

  -0.045


In [26]:
HDMjl.r_summary(doublesel_effect);

Estimates and significance testing of the effect of target variables
 [1m Row [0m [1m  Estimate. [0m [1m Std. Error [0m [1m  t value [0m [1m  Pr(>|t|) [0m

    1   -0.0453558     0.018656   -2.43116   0.0150506
---
Signif. codes:
0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


In [62]:
select_Z = true
select_X = false

# True only when neither flag is set. Written with `!`/`&&` because in Julia `&`
# binds tighter than `==`: `a==false & b==false` parses as the chained comparison
# `a == (false & b) == false`, which does not test both flags.
!select_Z && !select_X

false

## 5. Instrumental Variable Estimation in a High-Dimensional Setting

### 5.2. Application: Economic Development and Institutions.

In [115]:
include("E:/causal_ml/hdm_paper/prueba/HDMjl.jl/src/HDMjl.jl")



Main.HDMjl

In [92]:
using Statistics, StatsModels
# Download the AJR .rda file from the CRAN mirror of the R package hdm and pull
# out the object stored under the key "AJR".
url = "https://github.com/cran/hdm/raw/master/data/AJR.rda";
AJR = load(download(url))["AJR"];
y = AJR[!,"GDP"]
d = AJR[!,"Exprop"]
z = AJR[!,"logMort"];
# Formula for the control terms: six base variables and their pairwise
# `*` combinations. NOTE(review): `-1` is presumably the no-intercept term —
# confirm against the StatsModels formula documentation.
x_formula = @formula(GDP ~ -1 + Latitude + Latitude2 + Africa + Asia + Namer + Samer
    + Latitude*Latitude2 + Latitude*Africa + Latitude*Asia + Latitude*Namer + Latitude*Samer
    + Latitude2*Africa + Latitude2*Asia + Latitude2*Namer + Latitude2*Samer
    + Africa*Asia + Africa*Namer + Africa*Samer
    + Asia*Namer + Asia*Samer
    + Namer*Samer)
# Build the model frame and model matrix from the formula, then take the `m`
# field as the plain matrix handed to rlassoIV below.
x_dframe = ModelFrame( x_formula, AJR)
x1 = ModelMatrix(x_dframe)
x = x1.m
size(x)   # displayed as (64, 21) in the recorded output

(64, 21)

In [109]:
# IV fit with `select_X` switched on and `select_Z` switched off
# (NOTE(review): presumably selection over the controls x only — confirm in HDMjl).
AJR_Xselect  = HDMjl.rlassoIV(x, d, y, z, select_X=true, select_Z=false);

In [110]:
HDMjl.r_print(AJR_Xselect)

Coefficients:

 [1m    X1 [0m

  1.276


In [111]:
HDMjl.r_summary(AJR_Xselect);

Estimates and Significance Testing of the effect of target variables in the IV regression model
 [1m Row [0m [1m  coeff. [0m [1m     se. [0m [1m t-value [0m [1m  p-value [0m

    1   1.27623   0.87538   1.45791   0.144865
---
Signif. codes:
0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


In [112]:
HDMjl.r_confint(AJR_Xselect);

 [1m Row [0m [1m      2.5% [0m [1m   97.5% [0m

    1   -0.439485   2.99194


### 5.3. Application: Impact of Eminent Domain Decisions on Economic Outcomes.

In [113]:
using Statistics, GLM
# Download the EminentDomain .rda file from the CRAN mirror of the R package hdm
# and pull the z, x, d and y entries out of its "logGDP" element.
url = "https://github.com/cran/hdm/raw/master/data/EminentDomain.rda";
EminentDomain = load(download(url))["EminentDomain"];
z = EminentDomain["logGDP"]["z"];
x = EminentDomain["logGDP"]["x"];
d = EminentDomain["logGDP"]["d"];
y = EminentDomain["logGDP"]["y"];
# Keep only the columns whose mean exceeds 0.05. `mean(…, dims = 1)` gives one
# value per column as a row; the adjoint reorients that boolean mask before it
# is used as the column index (assumes x and z are matrices — TODO confirm).
x = x[:, (mean(x, dims = 1) .> 0.05)'];
z = z[:, (mean(z, dims = 1) .> 0.05)'];

In [117]:
# IV fit with `select_Z` switched on and `select_X` switched off
# (NOTE(review): presumably selection over the instruments z only — confirm in HDMjl).
lasso_IV_Z = HDMjl.rlassoIV(x, d, y, z, select_X = false, select_Z = true);

In [120]:
HDMjl.r_summary(lasso_IV_Z);

Estimates and Significance Testing of the effect of target variables in the IV regression model
 [1m Row [0m [1m    coeff. [0m [1m     se. [0m [1m   t-value [0m [1m  p-value [0m

    1   -0.451027   2.32377   -0.194093   0.846103
---
Signif. codes:
0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


In [122]:
HDMjl.r_confint(lasso_IV_Z);

 [1m Row [0m [1m     2.5% [0m [1m   97.5% [0m

    1   -5.00553   4.10348


In [125]:
# IV fit with both `select_X` and `select_Z` switched on.
lasso_IV_XZ = HDMjl.rlassoIV(x, d, y, z, select_X = true, select_Z = true);

In [127]:
HDMjl.r_summary(lasso_IV_XZ);

Estimates and Significance Testing of the effect of target variables in the IV regression model
 [1m Row [0m [1m     coeff. [0m [1m       se. [0m [1m   t-value [0m [1m  p-value [0m

    1   -0.0449578   0.0801865   -0.560665   0.575026
---
Signif. codes:
0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


In [129]:
HDMjl.r_confint(lasso_IV_XZ);

 [1m Row [0m [1m      2.5% [0m [1m    97.5% [0m

    1   -0.202121   0.112205


## 6. Inference on Treatment Effects in a High-Dimensional Setting

### 6.3. Application: 401(k) plan participation.

In [26]:
# Download the pension .rda file from the CRAN mirror of the R package hdm and
# pull out the object stored under the key "pension".
url = "https://github.com/cran/hdm/raw/master/data/pension.rda";
pension = load(download(url))["pension"];
y = pension[:, "tw"];
d = pension[:, "p401"];
z = pension[:, "e401"];
# Matrix of the listed columns, passed as `X` to the treatment-effect calls.
X = Matrix(pension[:, [
    "i2", "i3", "i4", "i5", "i6", "i7", "a2", "a3", "a4", "a5",
    "fsize", "hs", "smcol", "col", "marr", "twoearn", "db", "pira", "hown",
]]);
# Module-qualified like every other HDMjl call in this notebook: `include`
# binds the module as `Main.HDMjl` without bringing its functions into scope.
HDMjl.rlassoATE(X, d, y)

Dict{String, Any} with 5 entries:
  "se"          => 1930.68
  "individual"  => [-30618.3, -57537.6, -71442.9, 21383.3, -2.32925e5, 3.40765e…
  "sample_size" => 9915
  "te"          => 10180.1
  "type"        => "ATE"

In [51]:
# Module-qualified: `include` binds `Main.HDMjl` without importing its functions.
HDMjl.rlassoATET(X, d, y)

Dict{String, Any} with 5 entries:
  "se"          => 2944.43
  "individual"  => [-21536.4, -52877.2, -1.44867e5, -2739.29, -307741.0, 7.3912…
  "sample_size" => 9915
  "te"          => 12628.5
  "type"        => "ATET"

In [52]:
# Module-qualified: `include` binds `Main.HDMjl` without importing its functions.
HDMjl.rlassoLATE(X, d, y, z)

Dict{String, Any} with 5 entries:
  "se"          => 2326.9
  "individual"  => [-50526.8, -1.39158e5, -1.37102e5, 38508.0, -6.5644e5, 7.943…
  "sample_size" => 9915
  "te"          => 12992.1
  "type"        => "LATE"

In [53]:
# Module-qualified: `include` binds `Main.HDMjl` without importing its functions.
HDMjl.rlassoLATET(X, d, y, z)

Dict{String, Any} with 5 entries:
  "se"          => 3645.28
  "individual"  => [-35580.5, -90558.0, -1.83628e5, -5303.13, -8.0766e5, 1.8866…
  "sample_size" => 9915
  "te"          => 15323.2
  "type"        => "LATET"