# Double Lasso - heterogenous wage effects
- Luis
- Álvaro
- Diego

In [2]:
using RData, LinearAlgebra, GLM, DataFrames, Statistics, Random, Distributions, 
DataStructures, NamedArrays, PrettyTables, StatsModels, Combinatorics

import CodecBzip2

In [None]:
using Pkg
Pkg.add("Lasso")
using Lasso

In [4]:
# Importing .Rdata file

cps2012 = load("../../data/cps2012.RData")

Dict{String, Any} with 1 entry:
  "data" => [1m29217×23 DataFrame[0m…

In [6]:
cps2012 = cps2012["data"]

names(cps2012)
#get names of each variable

23-element Vector{String}:
 "year"
 "lnw"
 "female"
 "widowed"
 "divorced"
 "separated"
 "nevermarried"
 "hsd08"
 "hsd911"
 "hsg"
 "cg"
 "ad"
 "mw"
 "so"
 "we"
 "exp1"
 "exp2"
 "exp3"
 "exp4"
 "weight"
 "married"
 "ne"
 "sc"

In [7]:
# couples variables combinations 
    combinations_upto(x, n) = Iterators.flatten(combinations(x, i) for i in 1:n)

    # combinations without same couple
    expand_exp(args, deg::ConstantTerm) =
        tuple(((&)(terms...) for terms in combinations_upto(args, deg.n))...)

    StatsModels.apply_schema(t::FunctionTerm{typeof(^)}, sch::StatsModels.Schema, ctx::Type) =
        apply_schema.(expand_exp(t.args_parsed...), Ref(sch), ctx)

In [8]:
# Basic model 


reg = @formula(lnw ~ -1 + female + female&(widowed + divorced + separated + nevermarried +
hsd08 + hsd911 + hsg + cg + ad + mw + so + we + exp1 + exp2 + exp3) + (widowed +
divorced + separated + nevermarried + hsd08 + hsd911 + hsg + cg + ad + mw + so +
we + exp1 + exp2 + exp3)^2 )


formula_basic = apply_schema(reg, schema(reg, cps2012))

FormulaTerm
Response:
  lnw(continuous)
Predictors:
  0
  female(continuous)
  widowed(continuous)
  divorced(continuous)
  separated(continuous)
  nevermarried(continuous)
  hsd08(continuous)
  hsd911(continuous)
  hsg(continuous)
  cg(continuous)
  ad(continuous)
  mw(continuous)
  so(continuous)
  we(continuous)
  exp1(continuous)
  exp2(continuous)
  exp3(continuous)
  widowed(continuous) & divorced(continuous)
  widowed(continuous) & separated(continuous)
  widowed(continuous) & nevermarried(continuous)
  widowed(continuous) & hsd08(continuous)
  widowed(continuous) & hsd911(continuous)
  widowed(continuous) & hsg(continuous)
  widowed(continuous) & cg(continuous)
  widowed(continuous) & ad(continuous)
  widowed(continuous) & mw(continuous)
  widowed(continuous) & so(continuous)
  widowed(continuous) & we(continuous)
  widowed(continuous) & exp1(continuous)
  widowed(continuous) & exp2(continuous)
  widowed(continuous) & exp3(continuous)
  divorced(continuous) & separated(continuo

In [9]:
coefnames(formula_basic)
#Y , and regressors

("lnw", Any["female", "widowed", "divorced", "separated", "nevermarried", "hsd08", "hsd911", "hsg", "cg", "ad"  …  "female & hsd911", "female & hsg", "female & cg", "female & ad", "female & mw", "female & so", "female & we", "female & exp1", "female & exp2", "female & exp3"])

In [10]:
Y = select(cps2012,:lnw)  # uptcome variable
control = coefnames(formula_basic)[2]  # regresors 
names_col = Symbol.(control)  # string to Symbol to create varaible's name

136-element Vector{Symbol}:
 :female
 :widowed
 :divorced
 :separated
 :nevermarried
 :hsd08
 :hsd911
 :hsg
 :cg
 :ad
 :mw
 :so
 :we
 ⋮
 Symbol("female & nevermarried")
 Symbol("female & hsd08")
 Symbol("female & hsd911")
 Symbol("female & hsg")
 Symbol("female & cg")
 Symbol("female & ad")
 Symbol("female & mw")
 Symbol("female & so")
 Symbol("female & we")
 Symbol("female & exp1")
 Symbol("female & exp2")
 Symbol("female & exp3")

In [11]:
X = StatsModels.modelmatrix(formula_basic,cps2012) #interaaction

29217×136 Matrix{Float64}:
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  22.0  4.84    10.648
 1.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0     0.0  0.0  30.0  9.0     27.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0     0.0  0.0   0.0  0.0      0.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0     0.0  0.0  14.0  1.96     2.744
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0   0.0  0.0      0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  …  0.0  0.0   0.0  0.0      0.0
 0.0  0.0  0.0  0.0  1.0  0.0  0.0  1.0     0.0  0.0   0.0  0.0      0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0   0.0  0.0      0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0   0.0  0.0      0.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  15.5  2.4025   3.72388
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  …  0.0  0.0   0.0  0.0      0.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0   7.0  0.49     0.343
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0   0.0  0.0      0.0
 ⋮            

In [12]:
X = DataFrame(X, names_col)

Unnamed: 0_level_0,female,widowed,divorced,separated,nevermarried,hsd08,hsd911,hsg
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Function to get index of constant columns   

cons_column = []

#recoge todos los valores que tengan varianza 0
for i in 1:size(X,2)
    if var(X[!,i]) == 0
        append!(cons_column  , i)      
    end       
end


# Drop constant columns 

names(X)[cons_column]
select!(X, Not(names(X)[cons_column]))

Unnamed: 0_level_0,female,widowed,divorced,separated,nevermarried,hsd08,hsd911,hsg
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
#New matrix has dropped allconstants
X

Unnamed: 0_level_0,female,widowed,divorced,separated,nevermarried,hsd08,hsd911,hsg
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# demean function
function desv_mean(a)
    a = Matrix(a)   # dataframe to matrix 
    A = mean(a, dims = 1)
    M = zeros(Float64, size(X,1), size(X,2))
    
    for i in 1:size(a,2)
          M[:,i] = a[:,i] .- A[i]
    end
    
    return M
end    


# Matrix Model & demean

X = DataFrame(desv_mean(X), names(X)) # Dataframe and names

Unnamed: 0_level_0,female,widowed,divorced,separated,nevermarried,hsd08,hsd911
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,0.571243,-0.00797481,-0.113393,-0.0165999,-0.156347,-0.0041072,-0.0221789
2,0.571243,-0.00797481,-0.113393,-0.0165999,-0.156347,-0.0041072,0.977821
3,-0.428757,-0.00797481,-0.113393,-0.0165999,-0.156347,-0.0041072,-0.0221789
4,0.571243,-0.00797481,-0.113393,-0.0165999,-0.156347,-0.0041072,-0.0221789
5,-0.428757,-0.00797481,-0.113393,-0.0165999,-0.156347,-0.0041072,-0.0221789
6,-0.428757,-0.00797481,-0.113393,-0.0165999,-0.156347,-0.0041072,-0.0221789
7,-0.428757,-0.00797481,-0.113393,-0.0165999,0.843653,-0.0041072,-0.0221789
8,-0.428757,-0.00797481,-0.113393,-0.0165999,-0.156347,-0.0041072,-0.0221789
9,-0.428757,-0.00797481,-0.113393,-0.0165999,-0.156347,-0.0041072,-0.0221789
10,0.571243,-0.00797481,-0.113393,-0.0165999,-0.156347,-0.0041072,-0.0221789


In [18]:
# index to get columns that contains female
# f and interacions
index = []

for i in 1:size(X,2)  
        if contains( names(X)[i] , "female")
            append!(index, i)
        end  
end

In [24]:
index
# female y 15 interacciones

16-element Vector{Any}:
   1
 102
 103
 104
 105
 106
 107
 108
 109
 110
 111
 112
 113
 114
 115
 116

In [25]:
# Control variables 
# do not considerate female and interactions variables
W = select(X, Not(names(X)[index]))

Unnamed: 0_level_0,widowed,divorced,separated,nevermarried,hsd08,hsd911,hsg
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,-0.00797481,-0.113393,-0.0165999,-0.156347,-0.0041072,-0.0221789,-0.247288
2,-0.00797481,-0.113393,-0.0165999,-0.156347,-0.0041072,0.977821,-0.247288
3,-0.00797481,-0.113393,-0.0165999,-0.156347,-0.0041072,-0.0221789,0.752712
4,-0.00797481,-0.113393,-0.0165999,-0.156347,-0.0041072,-0.0221789,0.752712
5,-0.00797481,-0.113393,-0.0165999,-0.156347,-0.0041072,-0.0221789,-0.247288
6,-0.00797481,-0.113393,-0.0165999,-0.156347,-0.0041072,-0.0221789,0.752712
7,-0.00797481,-0.113393,-0.0165999,0.843653,-0.0041072,-0.0221789,0.752712
8,-0.00797481,-0.113393,-0.0165999,-0.156347,-0.0041072,-0.0221789,-0.247288
9,-0.00797481,-0.113393,-0.0165999,-0.156347,-0.0041072,-0.0221789,-0.247288
10,-0.00797481,-0.113393,-0.0165999,-0.156347,-0.0041072,-0.0221789,-0.247288


## Partialling out (3 stages)

We have the model:

$ln(wage) = \beta_1 + \beta_2fem + \beta_{3-17}(fem * D) + \theta W + \epsilon$ 

### First stage:
Regress interest variable for covariates (W):

Y ~ W.


Then, we obtain $ \hat{\alpha_y} $ estimator, and we will use it to get residuals
$ Y - \hat{\alpha_y} W =  \hat{e_Y}$

### Second stage:
Regress regressors F (a matrix that contains fem and femxD) for covariates (W):

F ~ W.
Then, we obtain $ \hat{\alpha_f} $ estimator, and we will use it to get residuals
$ F - \hat{\alpha_f} W =  \hat{e_F}$

### Third stage:

Regress the residuals from F to Y, and get the $\hat{\beta}$ for each of F in the original equation.



In [40]:


table = NamedArray(zeros(16, 2))

j = 0

for i in 1:length(index)

j = j + 1
    
#first step
D = select(X, names(X)[index[i]])
    
D_reg_0  = rlasso_arg( W, D, nothing, true, true, true, false, false, 
                    nothing, 1.1, nothing, 5000, 15, 10^(-5), -Inf, true, Inf, true )


D_resid[!,j] = rlasso(D_reg_0)["residuals"]

#second step
    
# third step
    
#Lasso_HDM = lm(D_resid, Y_resid)

#table[j,1] = GLM.coeftable(Lasso_HDM).cols[5][1]
#table[j,2] = GLM.coeftable(Lasso_HDM).cols[6][1]

    
end

LoadError: UndefVarError: rlasso_arg not defined