In [1]:
suppressPackageStartupMessages({
  library(caret)
  library(dplyr)
  library(readr)
  library(RNOmni)
})

"package 'caret' was built under R version 4.4.2"


In [2]:
# Read the four processed albert2018 TSV tables. Going by the file names these
# are the gene annotation, the expression matrix, the per-segregant covariates
# and the genotypes.
annot <- "../data/albert2018/processed/albert2018_genes.tsv" |>
  readr::read_tsv()
expr <- "../data/albert2018/processed/albert2018_expression_logtpm.tsv" |>
  readr::read_tsv()
cov <- "../data/albert2018/processed/albert2018_segregant_covariates.tsv" |>
  readr::read_tsv()
gen <- "../data/albert2018/processed/albert2018_genotypes.tsv" |>
  readr::read_tsv()

Rows: 7109 Columns: 7
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (3): gene_id, external_gene_name, chromosome_name
dbl (2): strand, transcription_start_site
lgl (2): essential, is_tf

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 5720 Columns: 1013
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr    (1): gene_id
dbl (1012): A01_01, A01_02, A01_03, A01_04, A01_05, A01_06, A01_07, A01_08, ...

ℹ Use `spec()` to retrieve the full column specification for this data.

In [4]:
# Keep only the expression rows whose gene_id also appears in the annotation
# table, then move gene_id from a column into the row names so that every
# remaining column is a sample.
expr <- expr |>
  dplyr::semi_join(annot, by = "gene_id") |>
  tibble::column_to_rownames(var = "gene_id")

In [8]:
# Keep only the part of each segregant_id that precedes the first "-".
# NOTE(review): presumably this makes the IDs match the sample column names of
# the expression table (e.g. "A01_01") - confirm.
cov <- cov |>
  dplyr::mutate(
    segregant_id = stringr::str_split_i(segregant_id, pattern = "-", i = 1)
  )

In [22]:
# Apply RNOmni::RankNorm to each gene (row) across samples. apply() returns
# one column per gene, so the result is transposed back to genes x samples.
expr_irn <- expr |>
  as.matrix() |>
  apply(MARGIN = 1, FUN = RNOmni::RankNorm) |>
  t()

In [25]:
# For every gene (row of expr_irn), fit a linear model of its values on the
# batch and od_covariate columns of `cov` and keep the residuals. apply()
# stores each gene's residual vector as a column, so the result has one row
# per sample and one column per gene.
# NOTE(review): cbind() pairs values by position, so this assumes the rows of
# `cov` are in the same order as the columns of `expr_irn` - confirm.
expr_irn_rc <- apply(expr_irn, 1, function(gene_values) {
  model_data <- cbind(cov, gene = gene_values)
  fit <- lm(gene ~ batch + od_covariate, data = model_data)
  fit$residuals
})

In [29]:
# Fix the random seed so the holdout partition is reproducible.
set.seed(101)
# Draw roughly 20% of the rows (samples) of expr_irn_rc as the holdout set.
# createDataPartition() returns a list by default; [[1]] extracts the vector
# of selected row indices. seq_len() is used instead of 1:nrow() so that a
# zero-row matrix yields an empty index vector rather than c(1, 0).
holdout_ids <- caret::createDataPartition(
  seq_len(nrow(expr_irn_rc)),
  p = 0.2
)[[1]]

In [35]:
# Split the residual matrix by row (sample) into training and holdout parts.
# The training rows are selected positively with setdiff() rather than with
# negative indexing, because x[-integer(0), ] selects *no* rows when
# holdout_ids is empty. drop = FALSE keeps both results as matrices (with row
# names) even if only a single row is selected.
expr_irn_rc_train <- expr_irn_rc[
  setdiff(seq_len(nrow(expr_irn_rc)), holdout_ids), ,
  drop = FALSE
]
expr_irn_rc_holdout <- expr_irn_rc[holdout_ids, , drop = FALSE]

In [33]:
# Drop the ref/alt allele columns and rescale every column whose name starts
# with an upper-case "A" (the sample columns) via (x + 1) / 2.
# NOTE(review): presumably the input coding is -1/1, which this maps to 0/1 -
# confirm. This is a binary recoding of a single column per sample, not a
# one-hot encoding.
gen <- gen |>
  dplyr::select(!c(ref, alt)) |>
  dplyr::mutate(
    dplyr::across(
      .cols = dplyr::starts_with("A", ignore.case = FALSE),
      .fns = function(dosage) (dosage + 1) / 2
    )
  )

In [37]:
# Build the genotype tables for each split: the three variant descriptor
# columns followed by the sample columns, taken in the same order as the rows
# of the corresponding expression matrix.
gen_train <- gen |>
  dplyr::select(
    variant_id, chr, pos,
    dplyr::all_of(rownames(expr_irn_rc_train))
  )
gen_holdout <- gen |>
  dplyr::select(
    variant_id, chr, pos,
    dplyr::all_of(rownames(expr_irn_rc_holdout))
  )

In [42]:
# write_tsv() does not write row names, so the sample IDs (row names) are not
# stored in these files; the column headers are kept. The sample IDs are not
# needed here because the rows are in the same order as the sample columns of
# the matching genotype table.
expr_irn_rc_train |>
  as.data.frame() |>
  readr::write_tsv(
    "../data/albert2018/interim/albert2018_expression_logtpm_irn_regcov_train.tsv"
  )
expr_irn_rc_holdout |>
  as.data.frame() |>
  readr::write_tsv(
    "../data/albert2018/interim/albert2018_expression_logtpm_irn_regcov_holdout.tsv"
  )

In [41]:
# Save the training and holdout genotype tables as tab-separated files.
gen_train |>
  readr::write_tsv("../data/albert2018/interim/albert2018_genotypes_train.tsv")
gen_holdout |>
  readr::write_tsv("../data/albert2018/interim/albert2018_genotypes_holdout.tsv")