# Description

This notebook contains code taken from the MultiPLIER repo to download and process recount2 data.
The code was taken from [here](https://github.com/greenelab/rheum-plier-data/blob/master/recount2/1-get_all_recount_dataset.R) and [here](https://github.com/greenelab/rheum-plier-data/blob/master/recount2/2-prep_recount_for_plier.R).

The outputs are Python pickle files: one with a large matrix with genes in rows and samples in columns, and another with gene ID mappings.

# Modules

In [1]:
`%>%` <- dplyr::`%>%`
library(recount)

Loading required package: SummarizedExperiment

Loading required package: MatrixGenerics

Loading required package: matrixStats


Attaching package: ‘MatrixGenerics’


The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOrderStats, rowProds, rowQuantiles, rowRanges

# Settings

In [2]:
# Directory that receives this notebook's output files, read from the
# CM_RECOUNT2FULL_DATA_DIR environment variable. Sys.getenv() yields "" when
# the variable is not set.
recount2full.data.dir <- Sys.getenv("CM_RECOUNT2FULL_DATA_DIR")

In [3]:
# Echo the resolved output directory so it appears in the cell output.
recount2full.data.dir

In [4]:
# Create the output directory together with any missing parent directories.
# Warnings are silenced so that an already existing directory is not reported.
dir.create(recount2full.data.dir, recursive = TRUE, showWarnings = FALSE)
# showWarnings = FALSE also hides genuine failures (an empty path coming from
# an unset environment variable, missing permissions), so check the outcome
# explicitly instead of failing much later when the results are written.
if (!dir.exists(recount2full.data.dir)) {
  stop(
    "could not create output directory: '", recount2full.data.dir, "'",
    call. = FALSE
  )
}

In [5]:
# Directory that holds the files downloaded per study (one sub-directory per
# project), read from the CM_RECOUNT2FULL_INTERNAL_DATA_DIR environment
# variable. Sys.getenv() yields "" when the variable is not set.
data.dir <- Sys.getenv("CM_RECOUNT2FULL_INTERNAL_DATA_DIR")

In [6]:
# Echo the resolved download directory so it appears in the cell output.
data.dir

In [7]:
# Create the download directory together with any missing parent directories.
# Warnings are silenced so that an already existing directory is not reported.
dir.create(data.dir, recursive = TRUE, showWarnings = FALSE)
# showWarnings = FALSE also hides genuine failures (an empty path coming from
# an unset environment variable, missing permissions), so check the outcome
# explicitly: with an empty data.dir the downloads would otherwise be written
# to paths resolved against the filesystem root.
if (!dir.exists(data.dir)) {
  stop("could not create data directory: '", data.dir, "'", call. = FALSE)
}

# Functions

In [8]:
# Get RPKM value for each gene - adapted from recount package
getRPKM <- function(rse, length_var = "bp_length", mapped_var = NULL) {
  # Converts the counts assay of `rse` to RPKM: each count is divided by the
  # gene length in kilobases and by the sample's mapped reads in millions.
  #
  # Args:
  #  rse: A RangedSummarizedExperiment-class object in recount package
  #  length_var: Name (length 1 character) of the rowData(rse) column holding
  #              the coding length; bp_length for gene level objects from
  #              recount. When NULL, width(rowRanges(rse)) is used instead,
  #              which is the choice for exon RSEs.
  #  mapped_var: Name (length 1 character) of the colData(rse) column holding
  #              the number of mapped reads. When NULL (default), the column
  #              sums of the counts matrix are used instead.
  # Returns:
  #   The counts assay rescaled to RPKM, with one row per row of `rse` and
  #   one column per column of `rse`
  n_genes <- nrow(rse)
  n_samples <- ncol(rse)

  # per-sample scaling factor: number of mapped reads
  library_size <- if (is.null(mapped_var)) {
    colSums(assays(rse)$counts)
  } else {
    colData(rse)[, mapped_var]
  }

  # per-gene scaling factor: length in base pairs
  gene_length <- if (is.null(length_var)) {
    width(rowRanges(rse))
  } else {
    rowData(rse)[, length_var]
  }

  # expand both factors to the shape of the counts matrix: library sizes are
  # repeated down the rows, gene lengths across the columns
  per_sample <- matrix(
    library_size,
    nrow = n_genes, ncol = n_samples, byrow = TRUE
  )
  per_gene <- matrix(
    gene_length,
    nrow = n_genes, ncol = n_samples, byrow = FALSE
  )

  assays(rse)$counts / (per_gene / 1000) / (per_sample / 1e6)
}

# Download

In [9]:
# Fetch the recount metadata for the "sra" subset and keep it as a plain
# data frame
metasample.sra <- as.data.frame(
  all_metadata(subset = "sra", verbose = TRUE)
)

2021-12-24 02:47:51 downloading the metadata to /tmp/RtmpPx1lPX/metadata_clean_sra.Rdata



In [10]:
# Keep only the samples whose `characteristics` field is not NA, then collect
# the distinct projects those samples belong to
has.characteristics <- !is.na(metasample.sra$characteristics)
metadata.nonempty <- metasample.sra[has.characteristics, ]
included.sample.list <- unique(metadata.nonempty$project)

In [11]:
# Download the "rse-gene" file of every project in included.sample.list,
# each one into its own sub-directory of data.dir
lapply(
  included.sample.list,
  function(project.id) {
    project.dir <- file.path(data.dir, project.id)
    download_study(project.id, type = "rse-gene", outdir = project.dir)
  }
)

2021-12-24 02:47:54 downloading file rse_gene.Rdata to /opt/data/data/recount2full/data/SRP000599

2021-12-24 02:47:57 downloading file rse_gene.Rdata to /opt/data/data/recount2full/data/SRP001313

2021-12-24 02:47:59 downloading file rse_gene.Rdata to /opt/data/data/recount2full/data/SRP001349

2021-12-24 02:48:00 downloading file rse_gene.Rdata to /opt/data/data/recount2full/data/SRP001462

2021-12-24 02:48:00 downloading file rse_gene.Rdata to /opt/data/data/recount2full/data/SRP001540

2021-12-24 02:48:02 downloading file rse_gene.Rdata to /opt/data/data/recount2full/data/SRP001558

2021-12-24 02:48:03 downloading file rse_gene.Rdata to /opt/data/data/recount2full/data/SRP001563

2021-12-24 02:48:04 downloading file rse_gene.Rdata to /opt/data/data/recount2full/data/SRP001758

2021-12-24 02:48:05 downloading file rse_gene.Rdata to /opt/data/data/recount2full/data/SRP001893

2021-12-24 02:48:06 downloading file rse_gene.Rdata to /opt/data/data/recount2full/data/SRP001997

2021-12-24

# Normalize with RPKM

In [12]:
# get RPKM for each experiment and add to list (one data frame per
# experiment, stored under the experiment's name)
rpkm.list <- list()
for (experiment in included.sample.list) {
  rse.file <- file.path(data.dir, experiment, "rse_gene.Rdata")
  # load() restores objects into the current environment and returns their
  # names. Check them: if a file did not contain `rse_gene`, the object left
  # behind by the previous iteration would otherwise be silently reused and
  # stored under the wrong experiment.
  loaded.objects <- load(rse.file)
  if (!"rse_gene" %in% loaded.objects) {
    stop("no `rse_gene` object found in ", rse.file, call. = FALSE)
  }
  rpkm <- as.data.frame(getRPKM(rse_gene))
  # keep the row names (gene identifiers) as a regular column as well
  rpkm$id <- rownames(rpkm)
  rpkm.list[[experiment]] <- rpkm
}

In [13]:
# combine experiments column-wise into a single data frame
#
# base::cbind() does not join on a key: it places the tables side by side in
# their existing row order (a `by = "id"` argument would merely be added as an
# extra constant column). The result is therefore only correct when every
# experiment lists the same genes in the same order, so verify that first.
if (length(rpkm.list) == 0) {
  stop("rpkm.list is empty: there are no experiments to combine", call. = FALSE)
}
reference.ids <- rpkm.list[[1]][["id"]]
same.genes <- vapply(
  rpkm.list,
  function(experiment.rpkm) identical(experiment.rpkm[["id"]], reference.ids),
  logical(1)
)
if (!all(same.genes)) {
  stop(
    "gene IDs differ from those of the first experiment in: ",
    paste(names(rpkm.list)[!same.genes], collapse = ", "),
    call. = FALSE
  )
}

rpkm.df <- do.call(base::cbind, rpkm.list)
# every experiment contributed its own gene id column -- drop these extraneous
# columns; the identifiers are still available as row names
rpkm.df <- rpkm.df %>% dplyr::select(-dplyr::ends_with("id"))
# move the row names into a leading "ENSG" column
rpkm.df <- tibble::rownames_to_column(rpkm.df, "ENSG")

# Save

In [14]:
# Output path inside recount2full.data.dir, without a file extension.
output_filepath <- file.path(recount2full.data.dir, "recount2_rpkm_raw")

In [15]:
# Echo the output path so it appears in the cell output.
output_filepath

In [16]:
# Serialize the combined RPKM data frame to "<output_filepath>.rds".
saveRDS(rpkm.df, file = paste0(output_filepath, ".rds"))