# A Case Study: The Effect of Gun Ownership on Gun-Homicide Rates

We consider the problem of estimating the effect of gun
ownership on the homicide rate. For this purpose, we estimate the following partially
linear model

$$
 Y_{j,t} = \beta D_{j,(t-1)} + g(Z_{j,t}) + \epsilon_{j,t}.
$$

## Data

$Y_{j,t}$ is log homicide rate in county $j$ at time $t$, $D_{j, t-1}$ is log  fraction of suicides committed with a firearm in county $j$ at time $t-1$, which we use as a proxy for gun ownership,  and  $Z_{j,t}$ is a set of demographic and economic characteristics of county $j$ at time $t$. The parameter $\beta$ is the effect of gun ownership on the
homicide rates, controlling for county-level demographic and economic characteristics. 

The sample covers 195 large United States counties between the years 1980 through 1999, giving us 3900 observations.

In [83]:
using Pkg
#Pkg.add("CSV"), using CSV
#Pkg.add("DataFrames"), using DataFrames
#Pkg.add("StatsModels"), using StatsModels
#Pkg.add("GLM"), using GLM
#Pkg.add("Random"), using Random

In [84]:
using CSV, DataFrames, StatsModels, GLM, Random

In [85]:
data = CSV.File("../../../data/gun_clean.csv") |> DataFrame;
println("Number of rows: ",size(data,1))
println("Number of columns: ",size(data,2))

Number of rows: 3900
Number of columns: 415


In [86]:
data

Unnamed: 0_level_0,Column1,CountyCode,logfssl,BPS030D,BPS130D,BPS230D,BNK010D,BNK050D,AGE010D
Unnamed: 0_level_1,Int64,Int64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,1,1073,3.57217,6.55878,10.3356,6.38017,4.05163,12.2093,10.9079
2,2,1073,3.54975,6.55878,10.3356,6.38017,4.05163,12.2093,10.9079
3,3,1073,3.55551,6.55878,10.3356,6.38017,4.05163,12.2093,10.9079
4,4,1073,3.57985,6.55878,10.3356,6.38017,4.05163,12.2093,10.9079
5,5,1073,3.53965,6.55878,10.3356,6.38017,4.05163,12.2093,10.9079
6,6,1073,3.48119,6.55878,10.3356,6.38017,4.04605,12.4579,10.9079
7,7,1073,3.57639,6.55878,10.3356,6.38017,4.07361,12.6188,10.9079
8,8,1073,3.3858,6.55878,10.3356,6.38017,4.07361,12.6969,10.9079
9,9,1073,3.46706,6.55878,10.3356,6.38017,4.095,12.7511,10.9079
10,10,1073,3.43092,6.55878,10.3356,6.38017,4.11585,12.826,10.9079


In [87]:
describe(data)

Unnamed: 0_level_0,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Float64,Real,Float64,Real,Int64,DataType
1,Column1,1950.5,1,1950.5,3900,0,Int64
2,CountyCode,27837.3,1073,27123.0,55133,0,Int64
3,logfssl,2.9783,1.24684,2.63039,11.677,0,Float64
4,BPS030D,6.02131,2.09628,5.12222,27.9254,0,Float64
5,BPS130D,9.51085,4.85407,8.03839,42.2708,0,Float64
6,BPS230D,5.70655,0.0,4.91669,26.2124,0,Float64
7,BNK010D,3.80986,1.38362,3.16765,21.6038,0,Float64
8,BNK050D,11.8449,6.14298,10.2418,54.6835,0,Float64
9,AGE010D,10.1894,5.86912,8.83267,46.7731,0,Float64
10,AGE050D,2.63741,1.59889,2.33065,10.0118,0,Float64


### Preprocessing

To account for heterogeneity across counties and time trends in  all variables, we remove from them county-specific and time-specific effects in the following preprocessing.

In [88]:
#################################  Find Variable Names from Dataset ########################

function varlist(df = nothing , type_dataframe = ["numeric","categorical","string"], pattern=String , exclude =  nothing)

    varrs = []
    if "numeric" in type_dataframe
        append!(varrs, [i for i in names(data) if eltype(eachcol(data)[i]) <: Number])    
    end
    if "categorical" in type_dataframe
        append!(varrs,[i for i in names(data) if eltype(eachcol(data)[i]) <: CategoricalVector])
    end
    if "string" in type_dataframe
        append!(varrs,[i for i in names(data) if eltype(eachcol(data)[i]) <: String])
    end
    varrs[(!varrs in exclude) & varrs[findall(x->contains(x,pattern),names(data))]]
end

varlist (generic function with 5 methods)

In [89]:
################################# Create Variables ###############################

# Dummy Variables for Year and County Fixed Effects
fixed = filter(x->contains(x, "X_Jfips"), names(data));
year = filter(x->contains(x, "X_Tyear"), names(data));

In [90]:
data[!,year]
data[!,fixed]

Unnamed: 0_level_0,X_Jfips1,X_Jfips2,X_Jfips3,X_Jfips4,X_Jfips5,X_Jfips6,X_Jfips7,X_Jfips8
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64
1,1,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0
5,1,0,0,0,0,0,0,0
6,1,0,0,0,0,0,0,0
7,1,0,0,0,0,0,0,0
8,1,0,0,0,0,0,0,0
9,1,0,0,0,0,0,0,0
10,1,0,0,0,0,0,0,0


In [91]:
census = []
census_var = ["AGE", "BN", "BP", "BZ", "ED", "EL", "HI", "HS", "INC", "LF", "LN", "PI", "PO", "PP", "PV", "SPR", "VS"]

for i in 1:size(census_var,1) 
    append!(census, filter(x->contains(x, census_var[i]), names(data)))
end

In [92]:
################################ Variables ##################################

# Treatment Variable
d = ["logfssl"];

# Outcome Variable
y = ["logghomr"];

# Other Control Variables
X1 = ["logrobr", "logburg", "burg_missing", "robrate_missing"];
X2 = ["newblack", "newfhh", "newmove", "newdens", "newmal"];

In [93]:
#################################  Partial out Fixed Effects ########################

# Variables to be Partialled-out
variable = [y, d,X1, X2, census]
varlis = []

# Partial out Variables in varlist from year and county fixed effect
for i in variable
    append!(varlis,i)
end

In [94]:
# Running the following lines takes aprox. 10 minutes (depends on your CPU)

example = DataFrame(CountyCode = data[:,"CountyCode"]);
rdata = DataFrame(CountyCode = data[:,"CountyCode"]);

for i in 1:size(varlis,1)
    rdata[!,varlis[i]]= residuals(lm(term(Symbol(varlis[i])) ~ sum(term.(Symbol.(year))) + sum(term.(Symbol.(fixed))), data))
end

In [95]:
example

Unnamed: 0_level_0,CountyCode
Unnamed: 0_level_1,Int64
1,1073
2,1073
3,1073
4,1073
5,1073
6,1073
7,1073
8,1073
9,1073
10,1073


In [96]:
first(rdata, 6)

Unnamed: 0_level_0,CountyCode,logghomr,logfssl,logrobr,logburg,burg_missing,robrate_missing
Unnamed: 0_level_1,Int64,Float64,Float64,Float64,Float64,Float64,Float64
1,1073,-0.134778,0.0961271,0.150893,-0.124395,0.0104613,-0.021229
2,1073,-0.239622,0.0808094,0.0401683,-0.134781,0.0104613,-0.0194181
3,1073,-0.0786772,0.0573399,-0.017679,-0.167909,0.0104613,-0.0220374
4,1073,-0.331465,0.0816945,-0.00963344,-0.22925,0.0104613,-0.0194181
5,1073,-0.31664,0.0253655,-0.0267151,-0.176635,0.00324793,-0.0208037
6,1073,0.105132,-0.00677726,-0.151487,-0.189069,0.0104613,0.016953


## We check that our results are equal to R results at 6 decimals

In [97]:
# load dataset
rdata_read = CSV.File("../../../data/gun_clean2.csv") |> DataFrame
data_1 = rdata_read[!, names(rdata)]
n = size(data_1,1)

3900

In [98]:
data_1

Unnamed: 0_level_0,CountyCode,logghomr,logfssl,logrobr,logburg,burg_missing,robrate_missing
Unnamed: 0_level_1,Int64,Float64,Float64,Float64,Float64,Float64,Float64
1,1073,-0.134778,0.0961271,0.150893,-0.124395,0.0104613,-0.021229
2,1073,-0.239622,0.0808094,0.0401683,-0.134781,0.0104613,-0.0194181
3,1073,-0.0786772,0.0573399,-0.017679,-0.167909,0.0104613,-0.0220374
4,1073,-0.331465,0.0816945,-0.00963344,-0.22925,0.0104613,-0.0194181
5,1073,-0.31664,0.0253655,-0.0267151,-0.176635,0.00324793,-0.0208037
6,1073,0.105132,-0.00677726,-0.151487,-0.189069,0.0104613,0.016953
7,1073,-0.0373401,0.0773061,-0.166729,-0.117739,0.0104613,0.0245505
8,1073,-0.0520609,-0.108433,-0.0996453,-0.0833094,0.00448964,0.021457
9,1073,0.0547007,-0.0340988,0.151557,0.319282,-0.0448348,-0.0366629
10,1073,0.122094,-0.0824292,0.0476034,-0.0144728,-0.00233214,0.00765442


In [99]:
column_names = names(data_1)
result = []

for i in 1:size(data_1,1)
    for j in 1:size(data_1,2)
        data_1[i,j] = round(data_1[i,j], digits=6)
        rdata[i,j] = round(rdata[i,j], digits=6)
    end
end

for col in column_names
    result = sum(data_1[!,col] .== rdata[!,col])

    if result .== 3900
        println("Column ", col,  " are equal at 6 decimals")
    else
        println("Column ", col,  " are not equal at 6 decimals")
    end

end

Column CountyCode are equal at 6 decimals
Column logghomr are equal at 6 decimals
Column logfssl are equal at 6 decimals
Column logrobr are equal at 6 decimals
Column logburg are equal at 6 decimals
Column burg_missing are equal at 6 decimals
Column robrate_missing are equal at 6 decimals
Column newblack are equal at 6 decimals
Column newfhh are equal at 6 decimals
Column newmove are equal at 6 decimals
Column newdens are equal at 6 decimals
Column newmal are equal at 6 decimals
Column AGE010D are equal at 6 decimals
Column AGE050D are equal at 6 decimals
Column AGE110D are equal at 6 decimals
Column AGE170D are equal at 6 decimals
Column AGE180D are equal at 6 decimals
Column AGE270D are equal at 6 decimals
Column AGE310D are equal at 6 decimals
Column AGE320D are equal at 6 decimals
Column AGE350D are equal at 6 decimals
Column AGE380D are equal at 6 decimals
Column AGE410D are equal at 6 decimals
Column AGE470D are equal at 6 decimals
Column AGE570D are equal at 6 decimals
Column AG

Now, we can construct the treatment variable, the outcome variable and the matrix $Z$ that includes the control variables.

In [100]:
# Treatment variable
D = rdata[!,d]

# Outcome variable
Y = rdata[!,y];

# Construct matrix Z
Z = rdata[!, varlis[3:end]];


We have in total 195 control variables. The control variables $Z_{j,t}$ are from the U.S. Census Bureau and  contain demographic and economic characteristics of the counties such as  the age distribution, the income distribution, crime rates, federal spending, home ownership rates, house prices, educational attainment, voting paterns, employment statistics, and migration rates. 

In [101]:
clu = select(rdata,:CountyCode)
data = hcat(Y,D,Z,clu);
first(data, 6)

Unnamed: 0_level_0,logghomr,logfssl,logrobr,logburg,burg_missing,robrate_missing,newblack
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,-0.134778,0.096127,0.150893,-0.124395,0.010461,-0.021229,0.030947
2,-0.239622,0.080809,0.040168,-0.134781,0.010461,-0.019418,0.030947
3,-0.078677,0.05734,-0.017679,-0.167909,0.010461,-0.022037,0.030947
4,-0.331465,0.081694,-0.009633,-0.22925,0.010461,-0.019418,0.030947
5,-0.31664,0.025366,-0.026715,-0.176635,0.003248,-0.020804,0.030947
6,0.105132,-0.006777,-0.151487,-0.189069,0.010461,0.016953,0.030947


In [102]:
size(data), size(rdata)

((3900, 198), (3900, 198))

## The effect of gun ownership


### OLS

After preprocessing the data, we first look at simple regression of $Y_{j,t}$ on $D_{j,t-1}$ without controls as a baseline model.

In [103]:
#Pkg.add("FixedEffectModels")

In [104]:
using FixedEffectModels

In [105]:
# OLS clustering at the County level

fm_1 = @formula(logghomr ~ 0 + logfssl + fe(CountyCode))
baseline_ols = reg(data, fm_1, Vcov.cluster(:CountyCode))

                        Fixed Effect Model                        
Number of obs:              3900  Degrees of freedom:            2
R2:                        0.006  R2 Adjusted:               0.006
F-Stat:                  18.9732  p-value:                   0.000
R2 within:                 0.006  Iterations:                    1
logghomr | Estimate Std.Error t value Pr(>|t|) Lower 95% Upper 95%
------------------------------------------------------------------
logfssl  | 0.282304 0.0648108 4.35582    0.000  0.155238   0.40937


In [106]:
println("2.5% : ", GLM.coeftable(baseline_ols).cols[5])
println("97.5% : " , GLM.coeftable(baseline_ols).cols[6])
println("Estimate: ", GLM.coeftable(baseline_ols).cols[1])
println("Cluster s.e. : " , GLM.r2(baseline_ols))
println("T-value : ", GLM.coeftable(baseline_ols).cols[3])
println("Pr(>|t|) : " , GLM.coeftable(baseline_ols).cols[4])

2.5% : [0.15523789539597124]
97.5% : [0.409370438768637]
Estimate: [0.2823041670823041]
Cluster s.e. : 0.006193251272214595
T-value : [4.355820406592105]
Pr(>|t|) : [1.3597928318408112e-5]


The point estimate is $0.282$ with the confidence interval ranging from 0.155 to 0.409. This
suggests that increases in gun ownership rates are related to gun homicide rates - if gun ownership increases by 1% relative
to a trend then the predicted gun homicide rate goes up by 0.28%, without controlling for counties' characteristics.

Since our goal is to estimate the effect of gun ownership after controlling for a rich set county characteristics we next include the controls. First, we estimate the model by ols and then by an array of the modern regression methods using the double machine learning approach.

In [107]:
control_formula = term(:logghomr) ~ term(:logfssl) + sum(term.(Symbol.(names(Z)))) + fe(:CountyCode)
control_ols = reg(data, control_formula)

                                Fixed Effect Model                                
Number of obs:                      3900  Degrees of freedom:                  375
R2:                                0.203  R2 Adjusted:                       0.118
F-Stat:                           4.9868  p-value:                           0.000
R2 within:                         0.203  Iterations:                            1
logghomr        |   Estimate Std.Error    t value Pr(>|t|)   Lower 95%   Upper 95%
----------------------------------------------------------------------------------
logfssl         |   0.190671  0.057892    3.29355    0.001   0.0771653    0.304176
logrobr         |    0.18903 0.0439518    4.30086    0.000    0.102857    0.275204
logburg         |   0.219294  0.062193    3.52602    0.000   0.0973559    0.341232
burg_missing    |     1.5298  0.452913    3.37768    0.001    0.641798      2.4178
robrate_missing |    1.13297  0.258478    4.38322    0.000    0.626185     1.63975
newb

In [108]:
println("For <<logfssl>> variable: ")
println("2.5% : ", GLM.coeftable(control_ols).cols[5][1])
println("97.5% : " , GLM.coeftable(control_ols).cols[6][1])
println("Estimate: ", GLM.coeftable(control_ols).cols[1][1])
println("Cluster s.e. : " , GLM.r2(control_ols))
println("T-value : ", GLM.coeftable(control_ols).cols[3][1])
println("Pr(>|t|) : " , GLM.coeftable(control_ols).cols[4][1])

For <<logfssl>> variable: 
2.5% : 0.07716526389391981
97.5% : 0.3041757996811467
Estimate: 0.19067053178753327
Cluster s.e. : 0.20296178723373293
T-value : 3.2935540687742195
Pr(>|t|) : 0.0009990606412285847


# DML algorithm

Here we perform inference of the predictive coefficient $\beta$ in our partially linear statistical model, 

$$
Y = D\beta + g(Z) + \epsilon, \quad E (\epsilon | D, Z) = 0,
$$

using the **double machine learning** approach. 

For $\tilde Y = Y- E(Y|Z)$ and $\tilde D= D- E(D|Z)$, we can write
$$
\tilde Y = \alpha \tilde D + \epsilon, \quad E (\epsilon |\tilde D) =0.
$$

Using cross-fitting, we employ modern regression methods
to build estimators $\hat \ell(Z)$ and $\hat m(Z)$ of $\ell(Z):=E(Y|Z)$ and $m(Z):=E(D|Z)$ to obtain the estimates of the residualized quantities:

$$
\tilde Y_i = Y_i  - \hat \ell (Z_i),   \quad \tilde D_i = D_i - \hat m(Z_i), \quad \text{ for each } i = 1,\dots,n.
$$

Finally, using ordinary least squares of $\tilde Y_i$ on $\tilde D_i$, we obtain the 
estimate of $\beta$.

In [109]:
using  MLDataUtils, MLBase

In [110]:
foldid = collect(Kfold(size(Z)[1], 10))[1]

3510-element Vector{Int64}:
    1
    2
    3
    4
    6
    7
    8
    9
   10
   11
   13
   14
   15
    ⋮
 3889
 3890
 3891
 3892
 3893
 3894
 3895
 3896
 3897
 3898
 3899
 3900

In [111]:
function DML2_for_PLM(z , d , y, dreg , yreg , nfold, clu)
    
    # Num ob observations
    nobser = size(z,1)
    
    # Define folds indices 
    foldid = collect(Kfold(size(z)[1], nfold))
    
    # Create array to save errors 
    ytil = ones(nobser)
    dtil = ones(nobser)
    println("Folds: " )
    
    # loop to save results
    for i in 1:nfold
        
        # Lasso regression, excluding folds selected 
        dfit = dreg(z[foldid[i],:], d[foldid[i]])
        yfit = yreg(z[foldid[i],:], y[foldid[i]])
        
        # Predict estimates using the 
        dhat = GLM.predict(dfit, z[Not(foldid[i]),:])
        yhat = GLM.predict(yfit, z[Not(foldid[i]),:])
        
        # Save errors 
        dtil[Not(foldid[i])] = (d[Not(foldid[i])] - dhat)
        ytil[Not(foldid[i])] = (y[Not(foldid[i])] - yhat)
        println(i)
    end
    
    # Create dataframe 
    data = DataFrame(ytil = ytil, dtil = dtil, clu=clu)
    
    # OLS clustering at the County level
    rfit = reg(data, @formula(ytil ~ dtil +fe(clu)))
    coef_est = coef(rfit)[1]
    se = FixedEffectModels.coeftable(rfit).cols[2]

    println(" coef (se) = ", coef_est ,"(",se,")")
    
    return rfit, data;
    
end

DML2_for_PLM (generic function with 1 method)

Now, we apply the Double Machine Learning (DML) approach with different machine learning methods. First, we load the relevant libraries.

Let us, construct the input matrices.

In [112]:
# Create main variables
z = Matrix(Z);
d = D[!,1];
y = Y[!,1];
clu = rdata[!, :CountyCode];
first(DataFrame(logghomr = y,logfssl = d,CountyCode = clu ),6)

Unnamed: 0_level_0,logghomr,logfssl,CountyCode
Unnamed: 0_level_1,Float64,Float64,Int64
1,-0.134778,0.096127,1073
2,-0.239622,0.080809,1073
3,-0.078677,0.05734,1073
4,-0.331465,0.081694,1073
5,-0.31664,0.025366,1073
6,0.105132,-0.006777,1073


In the following, we apply the DML approach with the differnt versions of lasso.


## Lasso

In [113]:
using Lasso

In [114]:
Random.seed!(123)
dreg(z,d) = fit(LassoModel, z, d, standardize = false)
yreg(z,y) = fit(LassoModel, z, y, standardize = false)
DML2_lasso = DML2_for_PLM(z, d, y, dreg, yreg, 10, clu);

Folds: 
1
2
3
4
5
6
7
8
9
10
 coef (se) = 0.212029671273402([0.056430449973936744])


## HDM.JL
We are going to replicate the above regressions but using `hmd` library.

In [115]:
include("../hdmjl/hdmjl.jl")

In [116]:
function DML2_lasso_hdm(z , d , y, dreg , yreg , nfold, clu)
    
    # Num ob observations
    nobser = size(z,1)
    
    # Define folds indices
    foldid = collect(Kfold(size(z)[1], nfold))
    
    # Create array to save errors 
    ytil = ones(nobser)
    dtil = ones(nobser)
    println("Folds: " )
    
    # loop to save results
    for i in 1:nfold
        
        # Lasso regression, excluding folds selected 
        coef_D= dreg(Z[foldid[i],:], D[foldid[i],:]) # coefficients from not Double Lasso
        coef_Y= dreg(Z[foldid[i],:], Y[foldid[i],:]) # coefficients from not Double Lasso
        
        # Predict estimates using the 
        dhat = hcat(ones(size(z[Not(foldid[i]),:])[1]),Matrix(z[Not(foldid[i]),:]))*coef_D
        yhat = hcat(ones(size(z[Not(foldid[i]),:])[1]),Matrix(z[Not(foldid[i]),:]))*coef_Y
        
        # Save errors 
        dtil[Not(foldid[i])] = (d[Not(foldid[i])] - dhat)
        ytil[Not(foldid[i])] = (y[Not(foldid[i])] - yhat)
        
        println(i)
    end
    
    # Create dataframe 
    data = DataFrame(ytil = ytil, dtil = dtil)
    
    # OLS clustering at the County level
    rfit = reg(data, @formula(ytil ~ 0 + dtil))
    coef_est = coef(rfit)[1]
    se = FixedEffectModels.coeftable(rfit).cols[2]

    println(" coef (se) = ", coef_est ,"(",se,")")
    
    return rfit, data;
    
end

DML2_lasso_hdm (generic function with 1 method)

In [117]:
function dreg(Z, Y)
    res_Y_0 = rlasso_arg( Z, Y, nothing, false, true, true, false, false, 
                    nothing, 1.1, nothing, 5000, 15, 10^(-5), -Inf, true, Inf, true )
    betas_Y = rlasso(res_Y_0)["coefficients"][:,2] # coefficients from not Double Lasso
end

function yreg(Z, D)
    res_D_0 = rlasso_arg( Z, D, nothing, false, true, true, false, false, 
                        nothing, 1.1, nothing, 5000, 15, 10^(-5), -Inf, true, Inf, true )
    betas_D = rlasso(res_D_0)["coefficients"][:,2] # coefficients from not Double Lasso
end

DML_lasso_hdm = DML2_lasso_hdm(z, d, y, dreg, yreg, 10, clu);

Folds: 
1
2
3
4
5
6
7
8
9
10
 coef (se) = 0.2187170988213041([0.057064691768603334])


## Post - Lasso (HDM)

In [118]:
function DML2_post_lasso_hdm(z , d , y, dreg , yreg , nfold, clu)
    
    # Num ob observations
    nobser = size(z,1)
    
    # Define folds indices
    foldid = collect(Kfold(size(z)[1], nfold))
    
    # Create array to save errors 
    ytil = ones(nobser)
    dtil = ones(nobser)
    println("Folds: " )
    
    # loop to save results
    for i in 1:nfold
        
        # Lasso regression, excluding folds selected 
        coef_D= dreg(Z[foldid[i],:], D[foldid[i],:]) # coefficients from not Double Lasso
        coef_Y= dreg(Z[foldid[i],:], Y[foldid[i],:]) # coefficients from not Double Lasso
        
        # Predict estimates using the 
        dhat = hcat(ones(size(z[Not(foldid[i]),:])[1]),Matrix(z[Not(foldid[i]),:]))*coef_D
        yhat = hcat(ones(size(z[Not(foldid[i]),:])[1]),Matrix(z[Not(foldid[i]),:]))*coef_Y
        
        # Save errors 
        dtil[Not(foldid[i])] = (d[Not(foldid[i])] - dhat)
        ytil[Not(foldid[i])] = (y[Not(foldid[i])] - yhat)
        
        println(i)
    end
    
    # Create dataframe 
    data = DataFrame(ytil = ytil, dtil = dtil)
    
    # OLS clustering at the County level
    rfit = reg(data, @formula(ytil ~ 0 + dtil))
    coef_est = coef(rfit)[1]
    se = FixedEffectModels.coeftable(rfit).cols[2]

    println(" coef (se) = ", coef_est ,"(",se,")")
    
    return rfit, data;
    
end

DML2_post_lasso_hdm (generic function with 1 method)

In [119]:
function dreg(Z, Y)
    res_Y_0 = rlasso_arg( Z, Y, nothing, true, true, true, false, false, 
                    nothing, 1.1, nothing, 5000, 15, 10^(-5), -Inf, true, Inf, true )
    betas_Y = rlasso(res_Y_0)["coefficients"][:,2] # coefficients from not Double Lasso
end

function yreg(Z, D)
    res_D_0 = rlasso_arg( Z, D, nothing, true, true, true, false, false, 
                        nothing, 1.1, nothing, 5000, 15, 10^(-5), -Inf, true, Inf, true )
    betas_D = rlasso(res_D_0)["coefficients"][:,2] # coefficients from not Double Lasso
end

DML_post_lasso_hdm = DML2_post_lasso_hdm(z, d, y, dreg, yreg, 10, clu);

Folds: 
1
2
3
4
5
6
7
8
9
10
 coef (se) = 0.22551301323407133([0.056432894506908915])


## GLMNET

In [120]:
using GLMNet

In [121]:
function DML2_lasso_cv(z , d , y, dreg , yreg , nfold, clu)
    
    # Num ob observations
    nobser = size(z,1)
    
    # Define folds indices
    foldid = collect(Kfold(size(z)[1], nfold))
    
    # Create array to save errors 
    ytil = ones(nobser)
    dtil = ones(nobser)
    println("Folds: " )
    
    # loop to save results
    for i in 1:nfold
        dfit = dreg(z[foldid[i],:], d[foldid[i]])
        yfit = yreg(z[foldid[i],:], y[foldid[i]])
        dhat = GLMNet.predict(dfit, z[Not(foldid[i]),:])
        yhat = GLMNet.predict(yfit, z[Not(foldid[i]),:])
        dtil[Not(foldid[i])]   = (d[Not(foldid[i])] - dhat)
        ytil[Not(foldid[i])]   = (y[Not(foldid[i])] - yhat)
        println(i)
    end
    
    # Create dataframe 
    data = DataFrame(ytil = ytil, dtil = dtil, clu=clu)
    
    # OLS clustering at the County level
    rfit = reg(data, @formula(ytil ~ dtil +fe(clu)))
    coef_est = coef(rfit)[1]
    se = FixedEffectModels.coeftable(rfit).cols[2]

    println(" coef (se) = ", coef_est ,"(",se,")")
    
    return rfit, data;
    
end

DML2_lasso_cv (generic function with 1 method)

#### ML method = lasso from glmnet 

In [122]:
##ML method = lasso from glmnet 
dreg(z, d) = glmnetcv(z, d, alpha = 1)    
yreg(z, y) = glmnetcv(z, y, alpha = 1)   

yreg (generic function with 1 method)

In [123]:
DML2_lasso_cv_1 = DML2_lasso_cv(z, d, y, dreg, yreg, 10, clu);

Folds: 
1
2
3
4
5
6
7
8
9
10
 coef (se) = 0.2126013359306773([0.05692742601004886])


#### ML method = Elastic net from glmnet 

In [124]:
##ML method = elastic net from glmnet 
dreg(z, d) = glmnetcv(z, d, alpha = 0.5) 
yreg(z, y) = glmnetcv(z, y, alpha = 0.5)

DML2_elnet =  DML2_lasso_cv(z, d, y, dreg, yreg, 10, clu);

Folds: 
1
2
3
4
5
6
7
8
9
10
 coef (se) = 0.2054309363031906([0.056549975407562784])


#### ML method = Ridge from glmnet 

In [125]:
##ML method = elastic net from glmnet 
dreg(z, d) = glmnetcv(z, d, alpha = 0) 
yreg(z, y) = glmnetcv(z, y, alpha = 0)

DML2_ridge = DML2_lasso_cv(z, d, y, dreg, yreg, 10, clu);

Folds: 
1
2
3
4
5
6
7
8
9
10
 coef (se) = 0.21555142508838096([0.05674055794597588])


Here we also compute DML with OLS used as the ML method

In [141]:
Random.seed!(123)
dreg(z,d) = lm(z,d)
yreg(z,y) = lm(z,y)
DML2_ols = DML2_for_PLM(z, d, y, dreg, yreg, 10, clu);

Folds: 
1
2
3
4
5
6
7
8
9
10
 coef (se) = 0.1984429455595527([0.05536820383975084])


Next, we also apply Random Forest for comparison purposes.

### Random Forest


In [127]:
#import Pkg; Pkg.add("MLJ")
#import Pkg; Pkg.add("DecisionTree")
#Pkg.add("ScikitLearn")

In [128]:
using DecisionTree, MLJ, ScikitLearn

In [139]:
function DML2_RF(z , d , y, dreg , yreg , nfold, clu)
    
    # Num ob observations
    nobser = size(z,1)
    
    # Define folds indices
    foldid = collect(Kfold(size(z)[1], nfold))
    
    # Create array to save errors 
    ytil = ones(nobser)
    dtil = ones(nobser)
    println("Folds: " )
    
    # loop to save results
    for i in 1:nfold
        dfit = dreg(z[foldid[i],:], d[foldid[i]])
        yfit = yreg(z[foldid[i],:], y[foldid[i]])
        dhat = apply_forest(dfit,z[Not(foldid[i]),:])
        yhat = apply_forest(yfit,z[Not(foldid[i]),:])
        dtil[Not(foldid[i])]   = (d[Not(foldid[i])] - dhat)
        ytil[Not(foldid[i])]   = (y[Not(foldid[i])] - yhat)
        println(i)
    end
    
    # Create dataframe 
    data = DataFrame(ytil = ytil, dtil = dtil, clu=clu)
    
    # OLS clustering at the County level
    rfit = reg(data, @formula(ytil ~ dtil +fe(clu)))
    coef_est = coef(rfit)[1]
    se = FixedEffectModels.coeftable(rfit).cols[2]

    println(" coef (se) = ", coef_est ,"(",se,")")
    
    return rfit, data;
    
end

DML2_RF (generic function with 1 method)

In [140]:
function dreg(z,d)
    # rf = RandomForestRegressor(n_trees = 500, n_subfeatures = 65, min_samples_leaf = 5, rng =0)
   # dfit = ScikitLearn.fit!(rf, z, d )
    RFmodel = build_forest(d,z)
end
function yreg(z,y)
   # rf = RandomForestRegressor(n_trees = 500, n_subfeatures = 65, min_samples_leaf = 5, rng =0)
   # yfit = ScikitLearn.fit!(rf, z, y )
    RFmodel = build_forest(y,z)
end

DML2_RF_1 = DML2_RF(z, d, y, dreg, yreg, 10, clu);

Folds: 
1
2
3
4
5
6
7
8
9
10


LoadError: "Some observations for the dependent variable are infinite"

We conclude that the gun ownership rates are related to gun homicide rates - if gun ownership increases by 1% relative
to a trend then the predicted gun homicide rate goes up by about 0.20% controlling for counties' characteristics.

Finally, let's see which method is actually better. We compute RMSE for predicting D and Y, and see which
of the methods works better.

In [131]:
#Pkg.add("PrettyTables")

In [132]:
using PrettyTables

In [133]:
mods = [DML2_ols, DML_lasso_hdm, DML_post_lasso_hdm, DML2_lasso_cv_1, DML2_ridge, DML2_elnet, DML2_RF_1];
mods_name = ["DML2_ols", "DML2_lasso", "DML2_post_lasso", "DML2_lasso_cv", "DML2_ridge", "DML2_elnet", "DML2_RF"];

LoadError: UndefVarError: DML2_RF_1 not defined

In [134]:
RMSE_Y = []
RMSE_D = []

for i in mods
    push!(RMSE_Y, sqrt(mean(i[2][!,1])^2))
    push!(RMSE_D, sqrt(mean(i[2][!,2])^2))
end

result = DataFrame([mods_name RMSE_Y RMSE_D], [:Models, :RMSE_Y, :RMSE_D])
pretty_table(result; formatters = ft_printf("%5.10f"))

LoadError: UndefVarError: mods not defined

It looks like the best method for predicting D is Lasso, and the best method for predicting Y is CV Ridge.

In [135]:
#DML with cross-validated Lasso:
dreg(z,d) = glmnetcv(z,d, alpha = 1)
yreg(z,y) = glmnetcv(z,y, alpha = 0)

DML2_best = DML2_lasso_cv(z, d, y, dreg, yreg, 10, clu);

Folds: 
1
2
3
4
5
6
7
8
9
10
 coef (se) = 0.22111658387364352([0.056539747708417326])


In [136]:
ols_coef = GLM.coeftable(baseline_ols).cols[1][1]
ols_std = GLM.coeftable(baseline_ols).cols[2][1]
control_ols_coef = GLM.coeftable(control_ols).cols[1][1]
control_ols_std = GLM.coeftable(control_ols).cols[2][1]
lasso_coef = GLM.coeftable(DML_lasso_hdm[1]).cols[1][1]
lasso_std = GLM.coeftable(DML_lasso_hdm[1]).cols[2][1]
lasso_post_coef = GLM.coeftable(DML_post_lasso_hdm[1]).cols[1][1]
lasso_post_std = GLM.coeftable(DML_post_lasso_hdm[1]).cols[2][1]
DML2_lasso_cv_1_coef = GLM.coeftable(DML2_lasso_cv_1[1]).cols[1][1]
DML2_lasso_cv_1_std = GLM.coeftable(DML2_lasso_cv_1[1]).cols[2][1]
DML2_elnet_coef = GLM.coeftable(DML2_elnet[1]).cols[1][1]
DML2_elnet_std = GLM.coeftable(DML2_elnet[1]).cols[2][1]
DML2_ridge_coef = GLM.coeftable(DML2_ridge[1]).cols[1][1]
DML2_ridge_std = GLM.coeftable(DML2_ridge[1]).cols[2][1]
DML2_RF_1_coef = GLM.coeftable(DML2_RF_1[1]).cols[1][1]
DML2_RF_1_std = GLM.coeftable(DML2_RF_1[1]).cols[2][1]
DML2_best_coef = GLM.coeftable(DML2_best).cols[1][1]
DML2_best_std = GLM.coeftable(DML2_best).cols[2][1];

LoadError: UndefVarError: DML2_RF_1 not defined

In [137]:
tabla = DataFrame(modelos = ["Baseline OLS", "Least Squares with controls", "Lasso", "Post_Lasso", "CV Lasso", "CV Elnet", "CV Ridge", "Random Forest", "Best"], 
Estimate = [ols_coef, control_ols_coef, lasso_coef, lasso_post_coef , DML2_lasso_cv_1_coef, DML2_elnet_coef, DML2_ridge_coef, DML2_RF_1_coef, DML2_best_coef], 
StdError = [ols_std, control_ols_std, lasso_std, lasso_post_std , DML2_lasso_cv_1_std, DML2_elnet_std, DML2_ridge_std, DML2_RF_1_std, DML2_best_std])

LoadError: UndefVarError: DML2_RF_1_coef not defined

In [138]:
# The table in latex
show(stdout, MIME("text/latex"), DataFrame(modelos = ["Baseline OLS", "Least Squares with controls", "Lasso", "Post_Lasso","CV Lasso", "CV Elnet", "CV Ridge"], 
Estimate = [ols_coef, control_ols_coef, lasso_coef, asso_post_coef , DML2_lasso_cv_1_coef, DML2_elnet_coef, DML2_ridge_coef], 
StdError = [ols_std, control_ols_std, lasso_std, lasso_post_std, DML2_lasso_cv_1_std, DML2_elnet_std, DML2_ridge_std]))

LoadError: UndefVarError: asso_post_coef not defined