In [1]:
using RData, LinearAlgebra, GLM, DataFrames, Statistics, Random, Distributions, DataStructures, NamedArrays, PrettyTables,
        Plots, StatsBase,StatsPlots, GLM
import CodecBzip2

In [2]:
using DelimitedFiles, DataFrames, Lasso

In [3]:
using CategoricalArrays

## Bootstraping

In [4]:
mat, head = readdlm("GitHub/ECO224/data/penn_jae.dat", header=true, Float64)
mat
df =DataFrame(mat, vec(head))
df

Unnamed: 0_level_0,abdt,tg,inuidur1,inuidur2,female,black,hispanic,othrace,dep
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,10824.0,0.0,18.0,18.0,0.0,0.0,0.0,0.0,2.0
2,10635.0,2.0,7.0,3.0,0.0,0.0,0.0,0.0,0.0
3,10551.0,5.0,18.0,6.0,1.0,0.0,0.0,0.0,0.0
4,10824.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
5,10747.0,0.0,27.0,27.0,0.0,0.0,0.0,0.0,0.0
6,10544.0,6.0,7.0,7.0,0.0,0.0,0.0,0.0,0.0
7,10845.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
8,10670.0,3.0,3.0,3.0,1.0,0.0,0.0,0.0,2.0
9,10768.0,3.0,28.0,11.0,1.0,0.0,0.0,0.0,0.0
10,10754.0,2.0,20.0,20.0,1.0,0.0,0.0,0.0,0.0


In [5]:
# Filter only rows which have tg = 0 or 4
penn = filter(row -> row[:tg] in [4,0], df)

replace!(penn.tg, 4 => 1)
rename!(penn, "tg" => "T4")

# from float to string
penn[!,:dep] = string.(penn[!,:dep]) 

# dep varaible in categorical format 
penn[!,:dep] = categorical(penn[!,:dep]);


Create a function to make an OLS regression

In [6]:
function boot_fn(data,index)
            ols_1 = lm(@formula(log(inuidur1)~T4+ (female+black+othrace+dep+q2+q3+q4+q5+q6+agelt35+agegt54+durable+lusd+husd)), penn[index,:])
            T4 = GLM.coeftable(ols_1).cols[1][2]
            female = GLM.coeftable(ols_1).cols[1][3]
            black = GLM.coeftable(ols_1).cols[1][4]
            return [T4, female, black]
end

boot_fn (generic function with 1 method)

This function selects observations randomly, with replacement, and then passes it as an argument to boot_fn (bootstrap)

In [7]:
function boot_2(data,func,R)
            T4_coef = []
            fem_coef = []
            black_coef = []
            for i in 1:R
                append!(T4_coef,func(data,sample([1:5099;], 5099, replace = true))[1])
                append!(fem_coef,func(data,sample([1:5099;], 5099, replace = true))[2])
                append!(black_coef,func(data,sample([1:5099;], 5099, replace = true))[3])
            end
        table = NamedArray(zeros(3, 3))

        table[1,2] = mean(T4_coef)
        table[1,3] = std(T4_coef, corrected=true)
        table[2,2] = mean(fem_coef)
        table[2,3] = std(fem_coef, corrected=true)
        table[3,2] = mean(black_coef)
        table[3,3] = std(black_coef, corrected=true)
        T = DataFrame(table, [ :"Variable", :"Coefficient (boostrap)", :"Standard error (boostrap)"]) 
        T[!,:Variable] = string.(T[!,:Variable]) 

        T[1,1] = "T4"
        T[2,1] = "Female"
        T[3,1] = "Black"
        
        bootstrap_statistics = Dict{String,Any}("Table" => T, "T4" => T4_coef, "Female" => fem_coef, "Black" => black_coef)
    return bootstrap_statistics
end

boot_2 (generic function with 1 method)

Just checking the function is ok

In [8]:
lm(@formula(log(inuidur1)~T4+ (female+black+othrace+dep+q2+q3+q4+q5+q6+agelt35+agegt54+durable+lusd+husd)), penn[1:5099,:])

StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, CholeskyPivoted{Float64, Matrix{Float64}}}}, Matrix{Float64}}

:(log(inuidur1)) ~ 1 + T4 + female + black + othrace + dep + q2 + q3 + q4 + q5 + q6 + agelt35 + agegt54 + durable + lusd + husd

Coefficients:
───────────────────────────────────────────────────────────────────────────────
                   Coef.  Std. Error      t  Pr(>|t|)    Lower 95%    Upper 95%
───────────────────────────────────────────────────────────────────────────────
(Intercept)   2.17846      0.159015   13.70    <1e-41   1.86672      2.4902
T4           -0.0716925    0.0354633  -2.02    0.0433  -0.141216    -0.00216922
female        0.126368     0.0348249   3.63    0.0003   0.0580965    0.19464
black        -0.293768     0.0529756  -5.55    <1e-07  -0.397623    -0.189913
othrace      -0.472445     0.198398   -2.38    0.0173  -0.86139     -0.0835005
dep: 1.0      0.0298669    0.0541402   0.55    0.5812  -0.07627

In [9]:
boot_2(penn,boot_fn,1000)["Table"]

Unnamed: 0_level_0,Variable,Coefficient (boostrap),Standard error (boostrap)
Unnamed: 0_level_1,String,Float64,Float64
1,T4,-0.0731726,0.0372499
2,Female,0.126347,0.0339095
3,Black,-0.293819,0.0597755


## Comparative Models

In [23]:
#Load data
rdata_read = RData.load("GitHub/ECO224/data/cps2012.RData")
data = rdata_read["data"];

In [30]:
#Basic model
formula_basic = @formula(lnw ~ female + female*(widowed + divorced + separated + nevermarried +
hsd08 + hsd911 + hsg + cg + ad + mw + so + we + exp1 + exp2 + exp3))

#Flexible model
formula_flex  = @formula(lnw ~ female + female*(widowed + divorced + separated + nevermarried + hsd08 + hsd911 + hsg + cg + ad + 
        mw + so + we + exp1 + exp2 + exp3) + (widowed + divorced + separated + nevermarried + hsd08 + hsd911 + hsg + cg + 
        ad + mw + so +we + exp1 + exp2 + exp3)*(widowed + divorced + separated + nevermarried + hsd08 + hsd911 + hsg + cg + 
        ad + mw + so +we + exp1 + exp2 + exp3))

FormulaTerm
Response:
  lnw(unknown)
Predictors:
  female(unknown)
  widowed(unknown)
  divorced(unknown)
  separated(unknown)
  nevermarried(unknown)
  hsd08(unknown)
  hsd911(unknown)
  hsg(unknown)
  cg(unknown)
  ad(unknown)
  mw(unknown)
  so(unknown)
  we(unknown)
  exp1(unknown)
  exp2(unknown)
  exp3(unknown)
  female(unknown) & widowed(unknown)
  female(unknown) & divorced(unknown)
  female(unknown) & separated(unknown)
  female(unknown) & nevermarried(unknown)
  female(unknown) & hsd08(unknown)
  female(unknown) & hsd911(unknown)
  female(unknown) & hsg(unknown)
  female(unknown) & cg(unknown)
  female(unknown) & ad(unknown)
  female(unknown) & mw(unknown)
  female(unknown) & so(unknown)
  female(unknown) & we(unknown)
  female(unknown) & exp1(unknown)
  female(unknown) & exp2(unknown)
  female(unknown) & exp3(unknown)
  widowed(unknown) & widowed(unknown)
  widowed(unknown) & divorced(unknown)
  widowed(unknown) & separated(unknown)
  widowed(unknown) & nevermarried(unknown)

Partition the data in two. One to get the coefficients and another one to test them (X_train,Y_train,X_test,Y_test)

In [32]:

training = sample( collect(1:nrow( data ) ), trunc(Int, 3 * nrow( data ) / 4 ),  replace= false )

data_train = data[ vec(training), : ]
data_test = data[ Not(training), : ]

Unnamed: 0_level_0,year,lnw,female,widowed,divorced,separated,nevermarried,hsd08
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,2012.0,1.36577,1.0,0.0,0.0,0.0,0.0,0.0
2,2012.0,2.54022,0.0,0.0,0.0,0.0,0.0,0.0
3,2012.0,3.14226,1.0,0.0,1.0,0.0,0.0,0.0
4,2012.0,2.43361,0.0,0.0,0.0,0.0,0.0,0.0
5,2012.0,2.65676,1.0,0.0,0.0,0.0,0.0,0.0
6,2012.0,2.43361,0.0,0.0,0.0,0.0,1.0,0.0
7,2012.0,2.05094,1.0,0.0,0.0,0.0,1.0,0.0
8,2012.0,2.49424,0.0,0.0,0.0,0.0,0.0,0.0
9,2012.0,3.92972,1.0,0.0,0.0,0.0,0.0,0.0
10,2012.0,2.0124,1.0,0.0,0.0,0.0,0.0,0.0


In [33]:
model_X_basic_train = ModelMatrix(ModelFrame(formula_basic,data_train)).m
model_X_basic_test = ModelMatrix(ModelFrame(formula_basic,data_test)).m
p_basic = size(model_X_basic_test)[2]

32

In [34]:
model_X_flex_train = ModelMatrix(ModelFrame(formula_flex,data_train)).m
model_X_flex_test = ModelMatrix(ModelFrame(formula_flex,data_test)).m
p_flex = size(model_X_flex_test)[2]

257

In [37]:
Y_train = data_train[!, ["lnw"]] # Dataframe format
Y_test = data_test[ !,  ["lnw"]]

Unnamed: 0_level_0,lnw
Unnamed: 0_level_1,Float64
1,1.36577
2,2.54022
3,3.14226
4,2.43361
5,2.65676
6,2.43361
7,2.05094
8,2.49424
9,3.92972
10,2.0124


**We start to make the regressions for every model**

### 2.1. OLS - Basic

In [137]:
# Get betas from basic regression
fit_lm_basic = lm(formula_basic, data_train);

In [138]:
# Compute the Out-Of-Sample Performance
yhat_lm_basic = GLM.predict( fit_lm_basic , data_test )
res_lm_basic = ( Y_test[!,1] - yhat_lm_basic ).^2
print("The mean squared error (MSE) using the basic model is equal to " , mean( res_lm_basic ) ) # MSE OLS (basic model)

The mean squared error (MSE) using the basic model is equal to 0.34418046057969054

In [139]:
#Get the MSE, std(MSE) and R2
matrix_ones = ones( size(res_lm_basic)[1] ,1 )
mean_residuals = lm(  matrix_ones, res_lm_basic )   # first argument (X), secind argument (Y)
MSE_lm_basic = [ coef( mean_residuals ) , stderror( mean_residuals ) ]
R2_lm_basic = 1 .- ( MSE_lm_basic[1] / var(Y_test[!,1]) ) 

1-element Vector{Float64}:
 0.2388076180858274

In [140]:
1 .- ( MSE_lm_basic[1] / var(Y_test[!,1]) ) 

1-element Vector{Float64}:
 0.2388076180858274

### 2.1. OLS - Flexible

In [141]:
# Get betas from flexible regression
fit_lm_flex = lm(formula_flex, data_train);

In [142]:
# Compute the Out-Of-Sample Performance
yhat_lm_flex = GLM.predict( fit_lm_flex , data_test )
res_lm_flex = ( Y_test[!,1] - yhat_lm_flex ).^2
print("The mean squared error (MSE) using the flexible model is equal to " , mean( res_lm_flex ) ) # MSE OLS (flex model)

The mean squared error (MSE) using the flexible model is equal to 0.3451138214923413

In [143]:
# Get the MSE
matrix_ones = ones( size(res_lm_flex)[1] ,1 )
mean_residuals = lm(  matrix_ones, res_lm_flex )   # first argument (X), secind argument (Y)
MSE_lm_flex = [ coef( mean_residuals ) , stderror( mean_residuals ) ]
R2_lm_flex = 1 .- ( MSE_lm_flex[1] / var(Y_test[!,1]) ) 

1-element Vector{Float64}:
 0.23674338929407812

### 2.2 Lasso - Basic

In [144]:
include("GitHub/ECO224/Labs/Julia_Notebooks/hdmjl/hdmjl.jl")

In [145]:
names_col1 = Symbol.(coefnames(fit_lm_basic))
X1 = DataFrame(model_X_basic_train, names_col1)

Unnamed: 0_level_0,(Intercept),female,widowed,divorced,separated,nevermarried,hsd08,hsd911
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
10,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [146]:
# basic model. Lasso sets some variables' coeff. to zero.

rlasso_basic  = rlasso_arg( X1, Y_train, nothing, false, false, true, false, false, 
                    nothing, 1.1, nothing, 5000, 15, 10^(-5), -Inf, true, Inf, true )

fit_rlasso_basic = rlasso(rlasso_basic)

#Get prediction
yhat_rlasso = model_X_basic_test*fit_rlasso_basic["coefficients"] 

#Get erorrs^2, MSE, std(MSE) and R^2 from the regression
res_rlasso_basic = ( Y_test[!,1] - yhat_rlasso ).^ 2
matrix_ones = ones( size(res_rlasso_basic)[1] ,1 )
mean_residuals = lm(  matrix_ones, res_rlasso_basic)  
MSE_rlasso_basic = [ coef( mean_residuals ) , stderror( mean_residuals ) ]
R2_rlasso_basic = 1 .- ( MSE_rlasso_basic[1] / var(Y_test[!,1]) ) 


1-element Vector{Float64}:
 0.22036420343017815

### 2.2 Lasso - Flexible

In [147]:
names_col1 = Symbol.(coefnames(fit_lm_flex))
X1 = DataFrame(model_X_flex_train, names_col1);

In [148]:
#Flexible model. Lasso sets some variables' coeff. to zero.


rlasso_flex  = rlasso_arg( X1, Y_train, nothing, false, false, true, false, false, 
                    nothing, 1.1, nothing, 5000, 15, 10^(-5), -Inf, true, Inf, true )

fit_rlasso_flex = rlasso(rlasso_flex)

#Get prediction
yhat_rlasso = model_X_flex_test*fit_rlasso_flex["coefficients"] 

#Get erorrs^2, MSE, std(MSE) and R^2 from the regression
res_rlasso_flex = ( Y_test[!,1] - yhat_rlasso ).^ 2
matrix_ones = ones( size(res_rlasso_flex)[1] ,1 )
mean_residuals = lm(  matrix_ones, res_rlasso_flex )  
MSE_rlasso_flex = [ coef( mean_residuals ) , stderror( mean_residuals ) ]
R2_rlasso_flex = 1 .- ( MSE_rlasso_flex[1] / var(Y_test[!,1]) ) 

1-element Vector{Float64}:
 0.21799742839218872

### 2.3 Lasso CV - Basic

In [149]:
using GLMNet

In [150]:
fit_lasso_cv   = GLMNet.glmnetcv(model_X_basic_train, Y_train[!,1], alpha=1)

#We fit the model using the coefficients from each model, but using the testing sample
yhat_lasso_cv    = GLMNet.predict(fit_lasso_cv,  model_X_basic_test)

#Getting the errors^2, MSE, std(MSE) and R^2 from regression
res_lasso_cv = ( Y_test[!,1] - yhat_lasso_cv ).^2
matrix_ones = ones( size(res_lasso_cv)[1] ,1 )
mean_residuals = lm(  matrix_ones, res_lasso_cv )
MSE_lasso_cv_basic = [ coef( mean_residuals ) , stderror( mean_residuals ) ]
R2_lasso_cv_basic = 1 .- ( MSE_lasso_cv_basic[1] / var( Y_test[!,1] ) )

1-element Vector{Float64}:
 0.23918712760745175

### 2.3 Lasso CV - Flexible

In [151]:
fit_lasso_cv   = GLMNet.glmnetcv(model_X_flex_train, Y_train[!,1], alpha=1)

#We fit the model using the coefficients from each model, but using the testing sample
yhat_lasso_cv    = GLMNet.predict(fit_lasso_cv,  model_X_flex_test)

#Getting the errors^2, MSE, std(MSE) and R^2 from regression
res_lasso_cv = ( Y_test[!,1] - yhat_lasso_cv ) .^ 2
matrix_ones = ones( size(res_lasso_cv)[1] ,1 )
mean_residuals = lm(  matrix_ones, res_lasso_cv )
MSE_lasso_cv_flex = [ coef( mean_residuals ) , stderror( mean_residuals ) ]
R2_lasso_cv_flex = 1 .- ( MSE_lasso_cv_flex[1] / var( Y_test[!,1] ) )

1-element Vector{Float64}:
 0.23978491704457594

### 2.3 Ridge - Basic

In [152]:
#Fit the model with training data
fit_ridge   = GLMNet.glmnetcv(model_X_basic_train, Y_train[!,1], alpha=0) 

#Get the predictors
yhat_ridge   = GLMNet.predict(fit_ridge,  model_X_basic_test)

#Get the residuals squared for each obs and MSE 
res_ridge = ( Y_test[!,1] - yhat_ridge ) .^ 2
matrix_ones = ones( size(res_ridge)[1] ,1 )
mean_residuals = lm(  matrix_ones, res_ridge )
MSE_ridge_basic = [ coef( mean_residuals ) , stderror( mean_residuals ) ]

#Get the R^2 from the model
R2_ridge_basic = 1 .- ( MSE_ridge_basic[1] / var( Y_test[!,1] ) )

1-element Vector{Float64}:
 0.23762088600288211

### 2.3 Ridge - Flexible

In [153]:
#Fit the model with training data
fit_ridge   = GLMNet.glmnetcv(model_X_flex_train, Y_train[!,1], alpha=0)

#Get the predictors
yhat_ridge   = GLMNet.predict(fit_ridge,  model_X_flex_test)

#Get MSE, std(MSE) and R2
res_ridge = ( Y_test[!,1] - yhat_ridge ) .^ 2
matrix_ones = ones( size(res_ridge)[1] ,1 )
mean_residuals = lm(  matrix_ones, res_ridge )
MSE_ridge_flex = [ coef( mean_residuals ) , stderror( mean_residuals ) ]
R2_ridge_flex = 1 .- ( MSE_ridge_flex[1] / var( Y_test[!,1] ) )

1-element Vector{Float64}:
 0.23835123306951278

### 2.3 Elastic Net - Basic

In [154]:
#Fit the model with training data
fit_elnet   = GLMNet.glmnetcv(model_X_basic_train, Y_train[!,1], alpha= 0.5)

#Get the predictors
yhat_elnet   = GLMNet.predict(fit_elnet,  model_X_basic_test)

#Get MSE, std(MSE) and R2
res_elnet = ( Y_test[!,1] - yhat_elnet ) .^ 2
matrix_ones = ones( size(res_elnet)[1] ,1 )
mean_residuals = lm(  matrix_ones, res_elnet )
MSE_elnet_basic = [ coef( mean_residuals ) , stderror( mean_residuals ) ]
R2_elnet_basic = 1 .- ( MSE_elnet_basic[1] / var( Y_test[!,1] ) )

1-element Vector{Float64}:
 0.23918916860114692

### 2.3 Elastic Net - Flexible

In [155]:
#Fit the model with training data
fit_elnet   = GLMNet.glmnetcv(model_X_flex_train, Y_train[!,1], alpha= 0.5)

#Get the predictors
yhat_elnet   = GLMNet.predict(fit_elnet,  model_X_flex_test)

#Get MSE, std(MSE) and R2
res_elnet = ( Y_test[!,1] - yhat_elnet ) .^ 2
matrix_ones = ones( size(res_elnet)[1] ,1 )
mean_residuals = lm(  matrix_ones, res_elnet )
MSE_elnet_flex = [ coef( mean_residuals ) , stderror( mean_residuals ) ]
R2_elnet_flex = 1 .- ( MSE_elnet_flex[1] / var( Y_test[!,1] ) )

1-element Vector{Float64}:
 0.23965737760094663

## Tree Regression

In [156]:
using ScikitLearn, DecisionTree

In [157]:
tree0 = DecisionTreeRegressor(min_purity_increase = 0, min_samples_leaf=1, min_samples_split = 2,rng = 0)

DecisionTreeRegressor
max_depth:                -1
min_samples_leaf:         1
min_samples_split:        2
min_purity_increase:      0.0
pruning_purity_threshold: 1.0
n_subfeatures:            0
root:                     nothing

In [158]:
trees_fit0 =  ScikitLearn.fit!(tree0, model_X_basic_train, Y_train[!,1] )

DecisionTreeRegressor
max_depth:                -1
min_samples_leaf:         1
min_samples_split:        2
min_purity_increase:      0.0
pruning_purity_threshold: 1.0
n_subfeatures:            0
root:                     Decision Tree
Leaves: 3917
Depth:  27

In [159]:
y_hat_t = ScikitLearn.predict(trees_fit0, model_X_basic_test)

res_tree_noprun = ( Y_test[!,1] - y_hat_t ) .^ 2
mean_residuals = lm(  matrix_ones, res_tree_noprun )
MSE_tree_noprun = [ coef( mean_residuals ) , stderror( mean_residuals ) ]

R2_tree_noprun = ( 1 .- ( MSE_tree_noprun[1] / var( Y_test[!,1] ) ) )[1]

print("R^2 using tree regression:", R2_tree)

R^2 using tree regression:0.19891936997646242

## Pruned Tree Regression

In [160]:
# Using prun purity parameter = 0.010

tree1 = DecisionTreeRegressor( min_samples_leaf=1, min_samples_split = 2, rng = 0, min_purity_increase = 0.01)
trees_fit1 =  ScikitLearn.fit!(tree1, model_X_basic_train, Y_train[!,1] )

DecisionTreeRegressor
max_depth:                -1
min_samples_leaf:         1
min_samples_split:        2
min_purity_increase:      0.01
pruning_purity_threshold: 1.0
n_subfeatures:            0
root:                     Decision Tree
Leaves: 16
Depth:  9

In [161]:
y_hat_t = ScikitLearn.predict(trees_fit1, model_X_basic_test)

res_tree_prun = ( Y_test[!,1] - y_hat_t ) .^ 2
mean_residuals = lm(  matrix_ones, res_tree_prun )
MSE_tree_prun = [ coef( mean_residuals ) , stderror( mean_residuals ) ]

R2_tree_prun = ( 1 .- ( MSE_tree_prun[1] / var( Y_test[!,1] ) ) )[1]

print("R^2 using tree regression:", R2_tree)

R^2 using tree regression:0.19891936997646242

In [163]:
table = NamedArray(zeros(12, 4))

table[1,2:3] = [MSE_lm_basic[1][1], MSE_lm_basic[2][1]]
table[2,2:3] = [MSE_lm_flex[1][1], MSE_lm_flex[2][1]]
table[3,2:3] = [MSE_rlasso_basic[1][1], MSE_rlasso_basic[2][1]]
table[4,2:3] = [MSE_rlasso_flex[1][1], MSE_rlasso_flex[2][1]]
table[5,2:3] = [MSE_lasso_cv_basic[1][1], MSE_lasso_cv_basic[2][1]]
table[6,2:3] = [MSE_lasso_cv_flex[1][1], MSE_lasso_cv_flex[2][1]]
table[7,2:3] = [MSE_ridge_basic[1][1], MSE_ridge_basic[2][1]]
table[8,2:3] = [MSE_ridge_flex[1][1], MSE_ridge_flex[2][1]]
table[9,2:3] = [MSE_elnet_basic[1][1], MSE_elnet_basic[2][1]]
table[10,2:3] = [MSE_elnet_flex[1][1], MSE_elnet_flex[2][1]]
table[11,2:3] = [MSE_tree_noprun[1][1], MSE_tree_noprun[2][1]]
table[12,2:3] = [MSE_tree_prun[1][1], MSE_tree_prun[2][1]]

table[1,4] = R2_lm_basic[1]
table[2,4] = R2_lm_flex[1]
table[3,4] = R2_rlasso_basic[1]
table[4,4] = R2_rlasso_flex[1]
table[5,4] = R2_lasso_cv_basic[1]
table[6,4] = R2_lasso_cv_flex[1]
table[7,4] = R2_ridge_basic[1]
table[8,4] = R2_ridge_flex[1]
table[9,4] = R2_elnet_basic[1]
table[10,4] = R2_elnet_flex[1]
table[11,4] = R2_tree_noprun[1]
table[12,4] = R2_tree_prun[1]

T = DataFrame(table, [ :"Model",:"MSE", :"S.E. for MSE", :"R-squared"]) 
T[!,:Model] = string.(T[!,:Model]) 

T[1,1] = "Least Squares (basic)"
T[2,1] = "Least Squares (flexible)"
T[3,1] = "Lasso (basic)"
T[4,1] = "Lasso (flexible)"
T[5,1] = "Cross-Validated lasso (basic)"
T[6,1] = "Cross-Validated lasso (flexible)"
T[7,1] = "Cross-Validated ridge (basic)"
T[8,1] = "Cross-Validated ridge (flexible)"
T[9,1] = "Cross-Validated elnet (basic)"
T[10,1] = "Cross-Validated elnet (flexible)"
T[11,1] = "Non Pruned Tree"
T[12,1] = "Pruned Tree"

header = (["Model", "MSE", "S.E. for MSE", "R-squared"])

pretty_table(T; backend = Val(:html), header = header, formatters=ft_round(4), alignment=:c)

Model,MSE,S.E. for MSE,R-squared
Least Squares (basic),0.3442,0.0247,0.2388
Least Squares (flexible),0.3451,0.0248,0.2367
Lasso (basic),0.3525,0.0241,0.2204
Lasso (flexible),0.3536,0.024,0.218
Cross-Validated lasso (basic),0.344,0.0247,0.2392
Cross-Validated lasso (flexible),0.3437,0.0246,0.2398
Cross-Validated ridge (basic),0.3447,0.0245,0.2376
Cross-Validated ridge (flexible),0.3444,0.0246,0.2384
Cross-Validated elnet (basic),0.344,0.0247,0.2392
Cross-Validated elnet (flexible),0.3438,0.0246,0.2397


## Tree regression 

In some cases, it will impossible to use a linear regression to predict well. So, tree regression would be a good option. In this case, it is neccesary to  divide the predictor space, which is the set of values of $X1, X2,..., Xp$ in non-overlapping regions $R1, R2, ..., RJ$, where we can analyze which variables are more related to the output for each one. Apart from that, we divide the sample in two groups: training and test. To construct the tree, we use the training data and the goal is to minimize the RSS given by:

\begin{equation}
\sum_{j=1}^{J}\sum_{i\in R_j}(y_i - \hat{y}R_j)^2 
\end{equation}

The prediction is simply the mean of the values for training observations in region $Rj$.

### To build a tree regression using a single predictor you need to follow the next steps:
1. Step 1
- Given a training data, we want to build a regression tree that uses the variable $X$ to predict the variable $Y$. Let's say that X is drug dose and Y is drug effectiveness.
2. Step 2
- Just like a clasification trees, the first thing we do for a regression tree is decide what goes in the root.
3. Step 3
- To make the decision, we calculate the average of the first 2 doses, wich is 3. Then, we buld a very simple tree that splits the measurements in two groups based on whether or not the dose < 3. 
4. Step 4
- Because only one point has a dose < 3, and its average effectiveness is 0, we put 0 and in the corresponding leaf.
- All other points have dose >= 3, and their effectiveness is 38.8, so we put 38.8 in the other leaf.
5. Step 5
- Assuming the following: for the one point with dose < 3, the regression tree makes a good prediction.
6. Setp 6
- Assuming the following: for the one point with dose >= 3, the regression tree makes a bad prediction.
7. Setp 7
- We can visualiaze how good or bad the regression trees is at making predictions by drawning the residuals, the differences between the observed and predicted values.
- We can also quantify how good or bad the predictions are by calculating the Sum of the Squared Residuals (SSR).
- Lastly, we can compare the SSR for different thresholds by plotting them on this graph, wich has dose on the x-axis and SSR on the y-axis.
8. Step 8
- Looking at the SSRs for each dose thershold, the root will be the thershold that had the smaller SSR.
9. Step 9
- If It's necessary, we will repeat step 7 and 8 to split the tree and add more nodes to it.
10. Setp 10 

### To build a regression tree with multiple features
1. Step 1
- For each predictor we select the thershold that give us the smallest SSR. However, instead of that threshold instantly becoming the root, it only becomes a candidate for the root.

2. Step 2
- The root will be the candidate with the  lowest SSR.

3. Step 3
- If It's necessary, you repeat the same process for each node.

## Why is it important to prune a tree? 

We prune a tree to penalize the inclusion of leaves. This is to avoid overfitting with the training observations. So, cutting down the branches, we can improve the predictive perfomance.

\begin{equation}
\sum_{m=1}^{|T|}\sum_{i: x_i\in R_m}(y_i - \hat{y}_{R_m})^2 +\alpha|T|   T \subset T_o
\end{equation}

Where: T indicates the number of leaves of the tree T, R_m is the region corresponding to the mth terminal node and $\hat{y}_{R_j}$ is the same as before. $\alpha$ is the value of penalization