Replication

In [None]:
using CovarianceMatrices
using DataFrames
using GLM
using Statistics

# Load data from Rdata file
# NOTE(review): `CSV` is not among the packages imported above, and an `.Rdata`
# file is R's binary serialization format rather than delimited text, so
# `CSV.File` is unlikely to parse it. A dedicated reader (e.g. RData.jl) is
# probably required here — confirm and replace.
# NOTE(review): the path is an absolute, machine-specific location; consider
# making it relative or configurable.
data = DataFrame(CSV.File("/Users/jeffr/OneDrive/Escritorio/wage2015_subsample_inference.Rdata"))

# Restrict the sample to rows where at least one of the `scl`, `clg` or `ad`
# indicators equals 1.
Submuestra_sup = filter(
    [:scl, :clg, :ad] => ((scl, clg, ad) -> scl == 1 || clg == 1 || ad == 1),
    data,
)

# Columns summarised in the descriptive-statistics table (order matters: the
# table's row labels are matched to these columns by position).
Z = select(
    Submuestra_sup,
    [:lwage, :sex, :shs, :hsg, :scl, :clg, :ad, :ne, :mw, :so, :we, :exp1],
)

# Split by the `sex` indicator (1 -> female subsample, 0 -> male subsample) and
# keep the same columns, in the same order, as `Z`.
data_female = filter(:sex => ==(1), Submuestra_sup)
Z_female = select(data_female, names(Z))

data_male = filter(:sex => ==(0), Submuestra_sup)
Z_male = select(data_male, names(Z))

# Descriptive statistics: column means for the full subsample, men, and women.
# Each column of `table` holds the means of every variable in `Z`, `Z_male`
# and `Z_female` respectively; iterating with `eachcol` avoids hard-coding the
# number of variables.
table = hcat(
    mean.(eachcol(Z)),
    mean.(eachcol(Z_male)),
    mean.(eachcol(Z_female)),
)

# Row labels follow the column order used to build `Z`; column labels follow
# the order of the three samples above.
row_names = ["Log Wage", "Sex", "Less than High School", "High School Graduate", "Some College", "College Graduate", "Advanced Degree", "Northeast", "Midwest", "South", "West", "Experience"]
col_names = ["All", "Men", "Women"]

# DataFrames has no `rownames`/`colnames` keywords: name the columns through
# the (matrix, names) constructor and carry the row labels as a leading column.
summary_table = DataFrame(table, col_names)
insertcols!(summary_table, 1, :Variable => row_names)

# Print the table
println("Table:")
println(summary_table)

# Fit linear model without controls: regress log wage on the sex indicator only.
nocontrol_fit = lm(@formula(lwage ~ sex), Submuestra_sup)

# Coefficient 1 is the intercept, so the coefficient on `sex` is element 2.
nocontrol_est = coef(nocontrol_fit)[2]

# Heteroskedasticity-robust (HC0) covariance of the coefficients.
# CovarianceMatrices expects an estimator *instance* passed to `vcov`;
# `cov(HC0, fit)` would dispatch to the generic statistical `cov` and fail.
HCV_coefs_nocontrol = vcov(HC0(), nocontrol_fit)
nocontrol_se = sqrt(HCV_coefs_nocontrol[2, 2])

println("The estimated gender coefficient is ", nocontrol_est, " and the corresponding robust standard error is ", nocontrol_se)

# Fit linear model with controls: `sex` plus the experience polynomial terms
# (exp1..exp4) crossed with the education, occupation, industry and region
# variables (`*` expands to main effects and interactions).
flex = @formula(lwage ~ sex + (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+occ2+ind2+mw+so+we))
control_fit = lm(flex, Submuestra_sup)

# `sex` is the first regressor after the intercept, hence index 2.
control_est = coef(control_fit)[2]

# Heteroskedasticity-robust (HC0) covariance; CovarianceMatrices takes an
# estimator instance via `vcov`, not the estimator type via `cov`.
HCV_coefs_control = vcov(HC0(), control_fit)
control_se = sqrt(HCV_coefs_control[2, 2])

println("Coefficient for OLS with controls: ", control_est)

# Partialling-Out using OLS: regress the outcome and the `sex` indicator
# separately on the same set of controls, then regress the outcome residuals on
# the `sex` residuals.
flex_y = @formula(lwage ~ (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+occ2+ind2+mw+so+we))
flex_d = @formula(sex ~ (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+occ2+ind2+mw+so+we))

t_Y = residuals(lm(flex_y, Submuestra_sup))
t_D = residuals(lm(flex_d, Submuestra_sup))

# `lm` with a formula needs a data table: the formula refers to columns by
# name, so the residual vectors must be wrapped in a DataFrame first.
partial_data = DataFrame(t_Y = t_Y, t_D = t_D)
partial_fit = lm(@formula(t_Y ~ t_D), partial_data)

# Element 1 is the intercept; element 2 is the slope on the residualised `sex`.
partial_est = coef(partial_fit)[2]

println("Coefficient for D via partialling-out: ", partial_est)

# Heteroskedasticity-robust (HC0) covariance via the CovarianceMatrices API
# (`vcov` with an estimator instance).
HCV_coefs_partial = vcov(HC0(), partial_fit)
partial_se = sqrt(HCV_coefs_partial[2, 2])

println("Estimated standard error: ", partial_se)

# Create and print table: one row per specification, with the estimated
# coefficient on `sex` and its robust standard error.
table = zeros(3, 2)
table[1, :] .= (nocontrol_est, nocontrol_se)
table[2, :] .= (control_est, control_se)
table[3, :] .= (partial_est, partial_se)

row_names = ["Without controls", "Full regression", "Partial regression"]
col_names = ["Estimate", "Std. Error"]

# DataFrames has no `rownames`/`colnames` keywords: name the columns through
# the (matrix, names) constructor and carry the row labels as a leading column.
results_table = DataFrame(table, col_names)
insertcols!(results_table, 1, :Specification => row_names)

println("Table:")
println(results_table)

# Fit linear model with extra-flex controls.
# NOTE(review): the `(...)^2` term looks intended as R-style "all main effects
# plus two-way interactions". Base StatsModels may instead treat `^` as an
# arithmetic function call (the R-style expansion is provided by
# RegressionFormulae.jl) — confirm the design matrix has the intended columns.
extraflex = @formula(lwage ~ sex + (exp1+exp2+exp3+exp4+shs+hsg+scl+clg+occ2+ind2+mw+so+we)^2)
control_fit_extraflex = lm(extraflex, Submuestra_sup)

# `sex` is the first regressor after the intercept, hence index 2.
control_est_extraflex = coef(control_fit_extraflex)[2]

# Every coefficient except the one on `sex` counts as a control (this count
# includes the intercept).
println("Number of Extra-Flex Controls: ", length(coef(control_fit_extraflex)) - 1)
println("Coefficient for OLS with extra-flex controls: ", control_est_extraflex)

# Heteroskedasticity-robust (HC0) covariance via the CovarianceMatrices API
# (`vcov` with an estimator instance; `cov(HC0, fit)` is not a valid call).
HCV_coefs_extraflex = vcov(HC0(), control_fit_extraflex)

# Rescale the HC0 standard error by sqrt(n / (n - p)) as a degrees-of-freedom
# correction, where n is the sample size and p the number of coefficients.
# NOTE(review): if the fit drops collinear columns, `length(coef(...))` may
# overstate the effective number of parameters — confirm.
n = nrow(Submuestra_sup)
p = length(coef(control_fit_extraflex))
control_se_extraflex = sqrt(HCV_coefs_extraflex[2, 2]) * sqrt(n / (n - p))

println("Estimated standard errors: ", control_se_extraflex)