In [4]:
library(remotes)
remotes::install_github("eawerner306/biostat615_project/lassosolve615")

Using GitHub PAT from the git credential store.

Downloading GitHub repo eawerner306/biostat615_project@HEAD




[36m--[39m [36mR CMD build[39m [36m-----------------------------------------------------------------[39m
1: Setting LC_CTYPE failed, using "C" 
2: Setting LC_TIME failed, using "C" 
3: Setting LC_MESSAGES failed, using "C" 
4: Setting LC_MONETARY failed, using "C" 
* checking for file '/private/var/folders/6c/6bhhsz5x5lg5y3g963_kjmpc0000gn/T/Rtmpan2lXa/remotesca1f4a6b645c/eawerner306-biostat615_project-49ceac2/lassosolve615/DESCRIPTION' ... OK
* preparing 'lassosolve615':
* checking DESCRIPTION meta-information ... OK
* checking for LF line-endings in source and make files and shell scripts
* checking for empty or unneeded directories
Omitted 'LazyData' from DESCRIPTION
* building 'lassosolve615_0.1.0.tar.gz'



Installing package into '/opt/homebrew/lib/R/4.4/site-library'
(as 'lib' is unspecified)



In [5]:
library(lassosolve615)

In [6]:
library(glmnet)

Loading required package: Matrix

Loaded glmnet 4.1-8



In [7]:
generate_simulated_data <- function(feature_size = "low", data_size = "small", sparsity_level = "low", seed = 123) {
  set.seed(seed)

  # Define the ranges for feature size, data size, and sparsity
  feature_size_map <- list("low" = 50, "medium" = 500, "high" = 1500)
  data_size_map <- list("small" = 500, "medium" = 5000, "large" = 50000, "very_large" = 200000)
  sparsity_map <- list("low" = 0.3, "medium" = 0.65, "high" = 0.9)  # Sparsity as zero proportion

  # Validate inputs
  if (!(feature_size %in% names(feature_size_map))) {
    stop("Invalid feature_size. Must be one of: low, medium, high.")
  }
  if (!(data_size %in% names(data_size_map))) {
    stop("Invalid data_size. Must be one of: small, medium, large, very_large.")
  }
  if (!(sparsity_level %in% names(sparsity_map))) {
    stop("Invalid sparsity_level. Must be one of: low, medium, high.")
  }

  # Get the number of features and samples
  p <- feature_size_map[[feature_size]]
  n <- data_size_map[[data_size]]

  # Sparsity level (proportion of zero elements in X)
  sparsity <- sparsity_map[[sparsity_level]]

  # Generate the feature matrix (X) with the specified sparsity
  X <- matrix(0, nrow = n, ncol = p)  # Initialize with zeros
  num_non_zero <- round((1 - sparsity) * n * p)  # Total non-zero elements
  non_zero_indices <- sample(1:(n * p), size = num_non_zero, replace = FALSE)
  X[non_zero_indices] <- rnorm(num_non_zero)  # Fill non-zero entries with random values

  # Generate the true coefficients (beta)
  beta <- rep(0, p)
  num_non_zero_beta <- max(1, round(0.1 * p))  # Assume 10% of beta are non-zero
  non_zero_beta_indices <- sample(1:p, size = num_non_zero_beta, replace = FALSE)
  beta[non_zero_beta_indices] <- runif(length(non_zero_beta_indices), min = -1, max = 1)

  # Generate the response vector (y) with some noise
  y <- X %*% beta + rnorm(n)

  # Return the generated data
  return(list(X = X, y = y, beta = beta))
}

In [8]:
calculate_sparsity <- function(beta, threshold = 1e-6) {
  p <- length(beta)  # the total number of beta
  nnz <- sum(abs(beta) > threshold)  # the number of non-zero
  sparsity <- 1 - (nnz / p)  # sparsity level
  return(sparsity)
}

In [9]:
compare_lasso_methods <- function(X, y, lambda) {
  methods <- c("auto","CGDA", "ISTA", "FISTA", "LARS", "PFA", "SLA")
  results <- list()

  for (method in methods) {
    cat("\nRunning lasso with method:", method, "\n")

    # Measure system time for robust_lasso
    robust_time <- system.time(
      robust_result <- robust_lasso(X, y, lambda, method = method)
    )
    # print(robust_result$method, robust_result$fit$iter, robust_result$fit$convergence)
    print(robust_result$method)
    print(robust_result$fit$iter)
    print(robust_result$fit$convergence)

    # Extract predictions and residuals from robust_lasso
    robust_beta <- robust_result$fit$beta
    y_pred_robust <- cbind(1, X) %*% robust_beta  # Include intercept
    residuals_robust <- y - y_pred_robust


    # Calculate metrics for robust_lasso
    mse_robust <- mean(residuals_robust^2)
    r2_robust <- 1 - sum(residuals_robust^2) / sum((y - mean(y))^2)
    sparsity_robust <- calculate_sparsity(robust_beta)

    # Store results for robust_lasso
    results[[method]] <- list(
      time = robust_time["user.self"],
      mse = mse_robust,
      r2 = r2_robust,
      sparsity = sparsity_robust
    )

    # Plot residuals for robust_lasso
    # plot(y_pred_robust, residuals_robust,
    #      main = paste("Residual Plot for robust_lasso (", method, ")"),
    #      xlab = "Predicted Values", ylab = "Residuals",
    #      col = "blue", pch = 20)
    # abline(h = 0, col = "red", lty = 2)
  }

  # Run glmnet
  cat("\nRunning glmnet\n")
  glmnet_time <- system.time(
    glmnet_result <- glmnet(X, y, alpha = 1, lambda = lambda, intercept = TRUE)
  )

  # Extract predictions and residuals from glmnet
  glmnet_beta <- as.vector(coef(glmnet_result, s = lambda))
  y_pred_glmnet <- cbind(1, X) %*% glmnet_beta  # Include intercept
  residuals_glmnet <- y - y_pred_glmnet

  # Calculate metrics for glmnet
  mse_glmnet <- mean(residuals_glmnet^2)
  r2_glmnet <- 1 - sum(residuals_glmnet^2) / sum((y - mean(y))^2)
  sparsity_glmnet <- calculate_sparsity(glmnet_beta)

  # Store results for glmnet
  results[["glmnet"]] <- list(
    time = glmnet_time["user.self"],
    mse = mse_glmnet,
    r2 = r2_glmnet,
    sparsity = sparsity_glmnet
  )

  # Plot residuals for glmnet
  # plot(y_pred_glmnet, residuals_glmnet,
  #      main = "Residual Plot for glmnet",
  #      xlab = "Predicted Values", ylab = "Residuals",
  #      col = "green", pch = 20)
  # abline(h = 0, col = "red", lty = 2)

  # Print and return results
  print(results)
  return(results)
}

In [10]:
# feature_size:low
data_low_small_low <- generate_simulated_data(feature_size = "low", data_size = "small", sparsity_level = "low")
# data_low_small_medium <- generate_simulated_data(feature_size = "low", data_size = "small", sparsity_level = "medium")
# data_low_small_high <- generate_simulated_data(feature_size = "low", data_size = "small", sparsity_level = "high")

# data_low_medium_low <- generate_simulated_data(feature_size = "low", data_size = "medium", sparsity_level = "low")
# data_low_medium_medium <- generate_simulated_data(feature_size = "low", data_size = "medium", sparsity_level = "medium")
# data_low_medium_high <- generate_simulated_data(feature_size = "low", data_size = "medium", sparsity_level = "high")

# data_low_large_low <- generate_simulated_data(feature_size = "low", data_size = "large", sparsity_level = "low")
# data_low_large_medium <- generate_simulated_data(feature_size = "low", data_size = "large", sparsity_level = "medium")
# data_low_large_high <- generate_simulated_data(feature_size = "low", data_size = "large", sparsity_level = "high")

# data_low_vlarge_low <- generate_simulated_data(feature_size = "low", data_size = "very_large", sparsity_level = "low")
# data_low_vlarge_medium <- generate_simulated_data(feature_size = "low", data_size = "very_large", sparsity_level = "medium")
# data_low_vlarge_high <- generate_simulated_data(feature_size = "low", data_size = "very_large", sparsity_level = "high")


# # # feature_size:medium
# data_medium_small_low <- generate_simulated_data(feature_size = "medium", data_size = "small", sparsity_level = "low")
# data_medium_small_medium <- generate_simulated_data(feature_size = "medium", data_size = "small", sparsity_level = "medium")
# data_medium_small_high <- generate_simulated_data(feature_size = "medium", data_size = "small", sparsity_level = "high")

# data_medium_medium_low <- generate_simulated_data(feature_size = "medium", data_size = "medium", sparsity_level = "low")
# data_medium_medium_medium <- generate_simulated_data(feature_size = "medium", data_size = "medium", sparsity_level = "medium")
# data_medium_medium_high <- generate_simulated_data(feature_size = "medium", data_size = "medium", sparsity_level = "high")

# data_medium_large_low <- generate_simulated_data(feature_size = "medium", data_size = "large", sparsity_level = "low")
# data_medium_large_medium <- generate_simulated_data(feature_size = "medium", data_size = "large", sparsity_level = "medium")
# data_medium_large_high <- generate_simulated_data(feature_size = "medium", data_size = "large", sparsity_level = "high")

# data_medium_vlarge_low <- generate_simulated_data(feature_size = "medium", data_size = "very_large", sparsity_level = "low")
# data_medium_vlarge_medium <- generate_simulated_data(feature_size = "medium", data_size = "very_large", sparsity_level = "medium")
# data_medium_vlarge_high <- generate_simulated_data(feature_size = "medium", data_size = "very_large", sparsity_level = "high")

# # feature_size:high
# data_high_small_low <- generate_simulated_data(feature_size = "high", data_size = "small", sparsity_level = "low")
# data_high_small_medium <- generate_simulated_data(feature_size = "high", data_size = "small", sparsity_level = "medium")
# data_high_small_high <- generate_simulated_data(feature_size = "high", data_size = "small", sparsity_level = "high")

# data_high_medium_low <- generate_simulated_data(feature_size = "high", data_size = "medium", sparsity_level = "low")
# data_high_medium_medium <- generate_simulated_data(feature_size = "high", data_size = "medium", sparsity_level = "medium")
# data_high_medium_high <- generate_simulated_data(feature_size = "high", data_size = "medium", sparsity_level = "high")

# data_high_large_low <- generate_simulated_data(feature_size = "high", data_size = "large", sparsity_level = "low")
# data_high_large_medium <- generate_simulated_data(feature_size = "high", data_size = "large", sparsity_level = "medium")
# data_high_large_high <- generate_simulated_data(feature_size = "high", data_size = "large", sparsity_level = "high")

# fail to generate
# data_high_vlarge_low <- generate_simulated_data(feature_size = "high", data_size = "very_large", sparsity_level = "low")
# data_high_vlarge_medium <- generate_simulated_data(feature_size = "high", data_size = "very_large", sparsity_level = "medium")
# data_high_vlarge_high <- generate_simulated_data(feature_size = "high", data_size = "very_large", sparsity_level = "high")


In [11]:
data=data_low_small_low
compare_lasso_methods(data$X, data$y, lambda = 0.1)


Running lasso with method: auto 
[1] "LARS"
NULL
[1] TRUE

Running lasso with method: CGDA 
[1] "CGDA"
[1] 5
[1] TRUE

Running lasso with method: ISTA 
[1] "ISTA"
[1] 16
[1] TRUE

Running lasso with method: FISTA 
[1] "FISTA"
[1] 24
[1] TRUE

Running lasso with method: LARS 
[1] "LARS"
NULL
[1] TRUE

Running lasso with method: PFA 
[1] "PFA"
[1] 5
[1] TRUE

Running lasso with method: SLA 
[1] "SLA"
[1] 1000
[1] FALSE

Running glmnet
$auto
$auto$time
user.self 
    0.007 

$auto$mse
[1] 0.8208355

$auto$r2
[1] 0.5222557

$auto$sparsity
[1] 0.03921569


$CGDA
$CGDA$time
user.self 
    0.001 

$CGDA$mse
[1] 0.9449868

$CGDA$r2
[1] 0.4499969

$CGDA$sparsity
[1] 0.9215686


$ISTA
$ISTA$time
user.self 
    0.001 

$ISTA$mse
[1] 0.9446945

$ISTA$r2
[1] 0.4501671

$ISTA$sparsity
[1] 0.9019608


$FISTA
$FISTA$time
user.self 
    0.001 

$FISTA$mse
[1] 0.9449869

$FISTA$r2
[1] 0.4499969

$FISTA$sparsity
[1] 0.9215686


$LARS
$LARS$time
user.self 
    0.003 

$LARS$mse
[1] 0.8208355

$LARS$r2
[1

In [None]:
# Function to compare glmnet and robust_lasso results for multiple datasets
compare_lasso_across_datasets <- function(datasets, lambda = 0.1) {
  results <- list()

  for (i in 1:length(datasets)) {
    data_name = names(datasets)[i]
    cat("\nProcessing dataset:", names(datasets)[i], "\n")

    # Extract X and y from the dataset
    data <- datasets[[data_name]]
    X <- data$X
    y <- data$y

    # Compare methods
    dataset_results <- compare_lasso_methods(X, y, lambda)

    # Store results by dataset name
    results[[data_name]] <- dataset_results
  }

  return(results)
}

simulated_datasets <- list(
  data_low_small_low = data_low_small_low,
  data_low_small_medium = data_low_small_medium,
  data_low_small_high = data_low_small_high,
  data_low_medium_low = data_low_medium_low,
  data_low_medium_medium = data_low_medium_medium,
  data_low_medium_high = data_low_medium_high,
  data_low_large_low = data_low_large_low,
  data_low_large_medium = data_low_large_medium,
  data_low_large_high = data_low_large_high,
  data_low_vlarge_low = data_low_vlarge_low,
  data_low_vlarge_medium = data_low_vlarge_medium,
  data_low_vlarge_high = data_low_vlarge_high,
  data_medium_small_low = data_medium_small_low,
  data_medium_small_medium = data_medium_small_medium,
  data_medium_small_high = data_medium_small_high,
  data_medium_medium_low = data_medium_medium_low,
  data_medium_medium_medium = data_medium_medium_medium,
  data_medium_medium_high = data_medium_medium_high,
  data_medium_large_low = data_medium_large_low,
  data_medium_large_medium = data_medium_large_medium,
  data_medium_large_high = data_medium_large_high,
  data_medium_vlarge_low = data_medium_vlarge_low,
  data_medium_vlarge_medium = data_medium_vlarge_medium,
  data_medium_vlarge_high = data_medium_vlarge_high,
  data_high_small_low = data_high_small_low,
  data_high_small_medium = data_high_small_medium,
  data_high_small_high = data_high_small_high,
  data_high_medium_low = data_high_medium_low,
  data_high_medium_medium = data_high_medium_medium,
  data_high_medium_high = data_high_medium_high,
  data_high_large_low = data_high_large_low,
  data_high_large_medium = data_high_large_medium,
  data_high_large_high = data_high_large_high
  # data_high_vlarge_low = data_high_vlarge_low,
  # data_high_vlarge_medium = data_high_vlarge_medium,
  # data_high_vlarge_high = data_high_vlarge_high
)

# Run comparisons


# Print results
# print(comparison_results)


In [None]:
comparison_results <- compare_lasso_across_datasets(simulated_datasets)

In [None]:
save_results_to_csv <- function(results, file_name = "lasso_results.csv") {
  # Flatten results into a data frame
  results_table <- do.call(rbind, lapply(names(results), function(dataset_name) {
    dataset_results <- results[[dataset_name]]
    data.frame(
      Dataset = dataset_name,
      Method = names(dataset_results),
      UserTime = sapply(dataset_results, function(x) x$time),
      MSE = sapply(dataset_results, function(x) x$mse),
      R2 = sapply(dataset_results, function(x) x$r2),
      Sparsity = sapply(dataset_results, function(x) x$sparsity),
      stringsAsFactors = FALSE
    )
  }))

  # Write to CSV
  write.csv(results_table, file_name, row.names = FALSE)
  cat("Results saved to", file_name, "\n")
}

# Save to CSV
save_results_to_csv(comparison_results, "lasso_comparison_results.csv")
