# Elastic Net

I used gradient descent with finite-difference gradients to search for the $\alpha$ and $\lambda$ that maximise the cross-validated AUC:

In [1]:
library(caret)
library(MLmetrics)
library(glmnet)
library(tidyverse)
library(magrittr)
source("helpers.r")

# Load the cleaned training data.
# NOTE(review): get_training_df_clean() is presumably defined in helpers.r.
df <- get_training_df_clean()

# Fix the RNG state so that the fold assignment is reproducible.
set.seed(25)
number_of_folds <- 10

# One list element per fold, each holding the row indices of `df` that are
# held out for testing in that fold.
folds <- createFolds(
  y = df$target,
  k = number_of_folds,
  list = TRUE,
  returnTrain = FALSE
)

# Cross-validated objective for the gradient-descent search.
#
# Fits an elastic-net logistic regression on the training part of every fold
# and scores it on the matching held-out part.
#
# @param x Numeric vector of length 2: x[1] is the glmnet mixing parameter
#   alpha, x[2] is the penalty strength lambda.
# @return The negated mean AUC over all folds (negated so that minimising
#   this function maximises the AUC).
#
# Reads `df` and `folds` from the enclosing environment. A later cell
# redefines `f` with a different signature, so this definition has to be
# re-run before the gradient-descent loop is executed again.
f <- function(x) {
  # Encode predictors and labels once for the whole data set instead of twice
  # per fold: the encoding is loop-invariant, and building it once guarantees
  # that training and test rows share exactly the same dummy columns even if
  # a category is missing from one fold.
  # drop = FALSE keeps a matrix even if only one predictor column remains.
  # NOTE(review): assumes `df` has no missing values, otherwise model.matrix
  # drops rows and the fold indices no longer line up -- confirm.
  design <- model.matrix(target ~ ., df)[, -1, drop = FALSE]
  labels <- ifelse(df$target == "no_disease", 0, 1)

  auc <- numeric(length(folds))

  # seq_along() stays correct for an empty fold list, unlike 1:n.
  for (fold_index in seq_along(folds)) {
    held_out <- folds[[fold_index]]

    fit <- glmnet(
      design[-held_out, , drop = FALSE],
      labels[-held_out],
      alpha = x[1],
      family = "binomial",
      lambda = x[2],
      standardize = TRUE
    )

    # With a single lambda, predict() yields a one-column matrix; flatten it.
    y_probabilities <- as.numeric(
      predict(fit, design[held_out, , drop = FALSE], type = "response")
    )

    auc[fold_index] <- AUC(
      y_true = labels[held_out],
      y_pred = y_probabilities
    )
  }

  -mean(auc)
}

# Gradient descent on `f` (the negated cross-validated AUC). `f` has no
# analytic gradient, so each partial derivative is approximated with a
# central finite difference.

x_0 <- c(alpha = 0.5, lambda = 0.5)

h <- 0.1            # half-width of the finite-difference stencil
steplength <- 0.05  # gradient-descent step size
eps <- 0.01         # stop once the gradient norm drops below this value
iter_max <- 50      # hard cap on the number of iterations

# glmnet expects 0 <= alpha <= 1 and a non-negative lambda. Neither the
# iterate nor the finite-difference probes may leave that box.
lower_bound <- c(alpha = 0, lambda = 0)
upper_bound <- c(alpha = 1, lambda = Inf)

# Clamp a parameter vector to the feasible box (names of `x` are kept).
project_to_bounds <- function(x) {
  pmin(pmax(x, lower_bound), upper_bound)
}

# Central difference of `f` along coordinate `i`. Near a bound the probes are
# clamped, so the divisor is the distance actually covered, not 2 * h.
partial_derivative <- function(x, i) {
  offset <- replace(numeric(length(x)), i, h)
  x_plus <- project_to_bounds(x + offset)
  x_minus <- project_to_bounds(x - offset)
  (f(x_plus) - f(x_minus)) / (x_plus[[i]] - x_minus[[i]])
}

x_current <- x_0
iter <- 0L

# One slot per iteration; rows are bound together once after the loop instead
# of growing a tibble row by row.
history <- vector("list", iter_max)

repeat {
  gradient <- vapply(
    seq_along(x_current),
    function(i) partial_derivative(x_current, i),
    numeric(1)
  )
  norm_grad <- sqrt(sum(gradient^2))

  x_current <- project_to_bounds(x_current - steplength * gradient)
  iter <- iter + 1L

  # Record the step before testing the stop conditions, so that the final
  # (converged or last permitted) iteration also shows up in `result`.
  # `gradient` is the norm at the point the step was taken from; `auc`,
  # `alpha` and `lambda` describe the point reached by the step.
  history[[iter]] <- tibble(
    auc = -f(x_current),
    gradient = norm_grad,
    alpha = x_current[["alpha"]],
    lambda = x_current[["lambda"]],
    k = iter
  )

  if (norm_grad < eps) {
    break
  }

  if (iter >= iter_max) {
    print("max iter reached")
    break
  }
}

# Unused (NULL) slots are skipped by bind_rows().
result <- bind_rows(history)

print(tail(result))


"package 'caret' was built under R version 3.6.1"Loading required package: lattice
Loading required package: ggplot2
Registered S3 methods overwritten by 'ggplot2':
  method         from 
  [.quosures     rlang
  c.quosures     rlang
  print.quosures rlang
"package 'MLmetrics' was built under R version 3.6.1"
Attaching package: 'MLmetrics'

The following objects are masked from 'package:caret':

    MAE, RMSE

The following object is masked from 'package:base':

    Recall

"package 'glmnet' was built under R version 3.6.1"Loading required package: Matrix
Loaded glmnet 3.0

"package 'tidyverse' was built under R version 3.6.1"-- Attaching packages --------------------------------------- tidyverse 1.2.1 --
v tibble  2.1.1       v purrr   0.3.2  
v tidyr   0.8.3       v dplyr   0.8.0.1
v readr   1.3.1       v stringr 1.4.0  
v tibble  2.1.1       v forcats 0.4.0  
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x tidyr::expand() masks Matrix::expand()
x d

[1] "max iter reached"
# A tibble: 6 x 5
    auc gradient alpha lambda     k
  <dbl>    <dbl> <dbl>  <dbl> <dbl>
1 0.909   0.0623 0.164  0.254    45
2 0.909   0.0556 0.161  0.253    46
3 0.909   0.0636 0.159  0.252    47
4 0.909   0.0683 0.155  0.251    48
5 0.909   0.0683 0.152  0.249    49
6 0.909   0.0555 0.150  0.248    50


With a grid search I obtained a better result:

In [2]:
# Same seed and fold count as in the first cell, so the grid search is scored
# on the same folds as the gradient-descent run.
set.seed(25)
number_of_folds <- 10

# One list element per fold, each holding the held-out row indices of `df`.
folds <- createFolds(
  y = df$target,
  k = number_of_folds,
  list = TRUE,
  returnTrain = FALSE
)

# Cross-validated score of one (alpha, lambda) pair for the grid search.
#
# Fits an elastic-net logistic regression on the training part of every fold
# and scores it on the matching held-out part.
#
# @param alpha glmnet mixing parameter.
# @param lambda Penalty strength passed to glmnet.
# @return The mean AUC over all folds (higher is better).
#
# Reads `df` and `folds` from the enclosing environment. This definition
# replaces the single-argument `f` from the first cell.
f <- function(alpha, lambda) {
  # Encode predictors and labels once for the whole data set instead of twice
  # per fold: the encoding is loop-invariant, and building it once guarantees
  # that training and test rows share exactly the same dummy columns even if
  # a category is missing from one fold.
  # drop = FALSE keeps a matrix even if only one predictor column remains.
  # NOTE(review): assumes `df` has no missing values, otherwise model.matrix
  # drops rows and the fold indices no longer line up -- confirm.
  design <- model.matrix(target ~ ., df)[, -1, drop = FALSE]
  labels <- ifelse(df$target == "no_disease", 0, 1)

  auc <- numeric(length(folds))

  # seq_along() stays correct for an empty fold list, unlike 1:n.
  for (fold_index in seq_along(folds)) {
    held_out <- folds[[fold_index]]

    fit <- glmnet(
      design[-held_out, , drop = FALSE],
      labels[-held_out],
      alpha = alpha,
      family = "binomial",
      lambda = lambda,
      standardize = TRUE
    )

    # With a single lambda, predict() yields a one-column matrix; flatten it.
    y_probabilities <- as.numeric(
      predict(fit, design[held_out, , drop = FALSE], type = "response")
    )

    auc[fold_index] <- AUC(
      y_true = labels[held_out],
      y_pred = y_probabilities
    )
  }

  mean(auc)
}


# Exhaustive grid search: score every (alpha, lambda) combination with the
# cross-validated AUC returned by `f`.

alphas <- seq(1, 0, -0.05)
lambdas <- seq(1, 0, -0.05)

# expand.grid() varies its first column fastest, so listing lambda first
# gives the same row order as an outer loop over alpha with an inner loop
# over lambda.
param_grid <- expand.grid(lambda = lambdas, alpha = alphas)

# Build the result in one go instead of appending a row per combination,
# which would copy the whole tibble on every iteration.
result <- tibble(
  auc = map2_dbl(param_grid$alpha, param_grid$lambda, f),
  alpha = param_grid$alpha,
  lambda = param_grid$lambda
)

# Show the six best-scoring combinations.
result %>%
  arrange(desc(auc)) %>%
  head()

auc,alpha,lambda
0.9196304,0.0,0.5
0.9189111,0.0,0.3
0.9188611,0.0,0.35
0.9182318,0.0,0.45
0.9177023,0.05,0.1
0.9175624,0.0,0.25


So the best hyperparameters found by the grid search are $\alpha = 0$ (pure ridge regression) and $\lambda = \frac{1}{2}$.