In [2]:
suppressPackageStartupMessages({
  library(caret)
  library(dplyr)
  library(glmnet)
  library(nestedcv)
  library(readr)
})

In [3]:
# Gene annotation table; supplies the gene_id, chromosome_name and
# transcription_start_site columns used further down.
annot <- readr::read_tsv(
  "../data/albert2018/processed/albert2018_genes.tsv"
)
# Training-set expression table with one numeric column per gene ID.
expr_irn_rc_train <- readr::read_tsv(
  "../data/albert2018/interim/albert2018_expression_logtpm_irn_regcov_train.tsv"
)
# Training-set genotype table; carries chr, pos and variant_id columns.
# NOTE(review): the remaining columns are presumably one per sample, in the
# same order as the rows of the expression table -- confirm.
gen_train <- readr::read_tsv(
  "../data/albert2018/interim/albert2018_genotypes_train.tsv"
)

Rows: 7109 Columns: 7
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (3): gene_id, external_gene_name, chromosome_name
dbl (2): strand, transcription_start_site
lgl (2): essential, is_tf

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 808 Columns: 5711
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
dbl (5711): YAL062W, YAL061W, YAL060W, YAL059W, YAL058W, YAL056W, YAL055W, Y...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the colu

In [23]:
# Half-width of the window around a gene's transcription start site within
# which variants are kept as predictors (presumably in base pairs -- confirm).
CIS_WINDOW <- 20000

In [10]:
# Random seed for cross-validation
set.seed(42)

# Row indices of the expression table (one per sample). seq_len() stays empty
# for a zero-row table, whereas 1:nrow() would silently yield c(1, 0).
ids <- seq_len(nrow(expr_irn_rc_train))

# Outer CV: 5 folds over all samples.
cv_out_folds <- caret::createFolds(ids, k = 5)

# Inner CV: for each outer fold, 5 folds built from the samples left after
# dropping that outer fold. createFolds() is given the reduced vector, so the
# inner indices are positions within that outer-training subset.
# NOTE(review): presumably the layout nestcv.glmnet() expects for
# `inner_folds` -- confirm against the nestedcv documentation.
cv_in_folds <- lapply(cv_out_folds, \(x) caret::createFolds(ids[-x], k = 5))

In [67]:
# Gene under study: the name of the 4th column of the expression table.
# NOTE(review): the index is hard-coded; the `next` statements further down
# suggest this is meant to become the variable of a per-gene loop -- confirm.
gene <- names(expr_irn_rc_train)[4]

In [68]:
# Expression values of the selected gene, looked up by exact column name;
# used as the response vector of the model fit.
gene_expr <- expr_irn_rc_train[[gene, exact = TRUE]]

In [69]:
# Annotation row(s) of the selected gene, then its chromosome and
# transcription start site, which together define the cis window.
gene_annot <- annot |>
  dplyr::filter(gene_id == gene)
gene_chr <- gene_annot |>
  dplyr::pull(chromosome_name)
gene_tss <- gene_annot |>
  dplyr::pull(transcription_start_site)

In [70]:
# Predictor matrix for the selected gene: keep variants on the gene's
# chromosome whose position lies within CIS_WINDOW of the TSS (bounds
# inclusive), drop the coordinate columns, move variant_id into the row names
# and transpose so that each variant becomes a column.
gene_var <- gen_train |>
  dplyr::filter(chr == gene_chr) |>
  dplyr::filter(
    dplyr::between(pos, gene_tss - CIS_WINDOW, gene_tss + CIS_WINDOW)
  ) |>
  dplyr::select(-c(chr, pos)) |>
  tibble::column_to_rownames(var = "variant_id") |>
  t()

In [79]:
# Accumulators, both starting as empty lists. `res` receives one-row summary
# data frames per gene; `res_coef` is presumably meant for per-gene
# coefficients (it is not filled anywhere in the visible code -- confirm).
res <- res_coef <- list()

In [72]:
# No variant falls inside the cis window of this gene: store a placeholder
# summary row (zero variants, every model metric NA) and skip the model fit.
# NOTE(review): `next` is only valid inside a loop. No enclosing loop is
# visible here, so if this branch is taken at top level R raises an error
# after the row has been stored. This looks like a fragment of a per-gene
# loop body -- confirm before running it as a stand-alone cell.
if (ncol(gene_var) == 0) {
  res[[length(res) + 1L]] <- data.frame(
    gene_id = gene,
    n_var = 0,
    n_var_sig = NA,
    r2 = NA,
    r2cv_avg = NA,
    rmse = NA
  )

  next
}

In [74]:
# Nested cross-validated glmnet fit of the gene's expression on its cis
# variants, using the predefined outer and inner folds and a gaussian
# (continuous) response. Alpha is tuned over a grid from 0.1 to 0.9 in steps
# of 0.2.
# NOTE(review): cv.cores = 10 is hard-coded -- confirm the host has that many
# cores available.
res_glm <- nestedcv::nestcv.glmnet(
  x = gene_var,
  y = gene_expr,
  family = "gaussian",
  alphaSet = seq(from = 0.1, to = 0.9, by = 0.2),
  inner_folds = cv_in_folds,
  outer_folds = cv_out_folds,
  cv.cores = 10
)

In [82]:
# Inspect the row count of the final-model coefficients. nrow() gives NULL
# when the element has no dim attribute (e.g. it is NULL or a plain vector).
res_glm[["final_coef"]] |> nrow()

NULL

In [36]:
# The final-model coefficients carry no dim attribute (not a table):
# presumably no variant kept a non-zero coefficient -- verify against the
# nestedcv documentation. Store a summary row with the number of candidate
# variants, zero selected variants and NA metrics, then skip the rest.
# NOTE(review): `next` is only valid inside a loop. No enclosing loop is
# visible here, so if this branch is taken at top level R raises an error
# after the row has been stored -- confirm the intended loop context.
if (is.null(dim(res_glm$final_coef))) {
  res[[length(res) + 1L]] <- data.frame(
    gene_id = gene,
    n_var = ncol(gene_var),
    n_var_sig = 0,
    r2 = NA,
    r2cv_avg = NA,
    rmse = NA
  )

  next
}

0,1
Control {base},R Documentation

0,1
cond,"A length-one logical vector that is not NA. Other types are coerced to logical if possible, ignoring any class. (Conditions of length greater than one are an error.)"
var,A syntactical name for a variable.
seq,An expression evaluating to a vector (including a list and an expression) or to a pairlist or NULL. A factor value will be coerced to a character vector. This can be a long vector.
"expr, cons.expr, alt.expr, x, y","An expression in a formal sense. This is either a simple expression or a so-called compound expression, usually of the form { expr1 ; expr2 }."
