# Description

This notebook contains code taken from the MultiPLIER repo to download and process recount2 data.
The code was taken from [here](https://github.com/greenelab/rheum-plier-data/blob/master/recount2/1-get_all_recount_dataset.R) and [here](https://github.com/greenelab/rheum-plier-data/blob/master/recount2/2-prep_recount_for_plier.R).

The outputs are saved both as RDS files and as Python pickle files: one contains a large expression matrix with genes in rows and samples in columns, and the other contains the gene ID mappings.

# Modules

In [None]:
`%>%` <- dplyr::`%>%`
library(biomaRt)
library(reticulate)

# Settings

In [None]:
# Directory holding the recount2 data, taken from the environment.
# Sys.getenv() returns "" for an unset variable; every path built from an
# empty directory would then point at the filesystem root (including the
# file that is deleted in the cleanup step), so stop early instead.
recount2full.data.dir <- Sys.getenv("CM_RECOUNT2FULL_DATA_DIR")
if (!nzchar(recount2full.data.dir)) {
  stop(
    "Environment variable CM_RECOUNT2FULL_DATA_DIR is not set or is empty",
    call. = FALSE
  )
}

In [None]:
# Show the resolved recount2 data directory in the cell output.
recount2full.data.dir

In [None]:
# Make sure the recount2 data directory exists, creating parent directories
# as needed; warnings (e.g. "already exists") are silenced.
dir.create(
  path = recount2full.data.dir,
  showWarnings = FALSE,
  recursive = TRUE
)

In [None]:
# Internal data directory, taken from the environment ("" when unset).
data.dir <- Sys.getenv(
  x = "CM_RECOUNT2FULL_INTERNAL_DATA_DIR"
)

In [None]:
# Show the resolved internal data directory in the cell output.
data.dir

In [None]:
# Make sure the internal data directory exists, creating parent directories
# as needed; warnings (e.g. "already exists") are silenced.
dir.create(
  path = data.dir,
  showWarnings = FALSE,
  recursive = TRUE
)

# Load raw data

In [None]:
# Path of the raw recount2 RPKM file inside the recount2 data directory.
input.file <- file.path(
  recount2full.data.dir,
  "recount2_rpkm_raw.rds"
)

In [None]:
# Show the raw input file path in the cell output.
input.file

In [None]:
# Read the serialized raw RPKM object from disk.
rpkm.df <- readRDS(file = input.file)

In [None]:
# Report the number of rows and columns of the data just loaded.
dim(x = rpkm.df)

In [None]:
# Preview the first rows of the first 10 columns. The column range is clamped
# to the number of columns actually present so the preview does not fail with
# "undefined columns selected" on narrow tables, and drop = FALSE keeps the
# result a data frame even when only one column is selected.
head(rpkm.df[, seq_len(min(10L, ncol(rpkm.df))), drop = FALSE])

# Preprocess data

In [None]:
# Connect to the Ensembl BioMart and select the human gene dataset; this
# mart object is used to translate Ensembl gene IDs into gene symbols.
mart <- biomaRt::useDataset(
  dataset = "hsapiens_gene_ensembl",
  mart = biomaRt::useMart(biomart = "ensembl")
)

In [None]:
# Strip the version suffix (everything from the first ".") from the
# identifiers in the ENSG column. This is done once, vectorized, and the
# result is reused both as the BioMart query values and as the join key
# column added to the expression table.
genes <- sub("[.].*$", "", rpkm.df$ENSG)

rpkm.df$ensembl_gene_id <- genes

# Query BioMart for the HGNC symbol of each Ensembl gene ID.
gene.df <- biomaRt::getBM(
  filters = "ensembl_gene_id",
  attributes = c("ensembl_gene_id", "hgnc_symbol"),
  values = genes,
  mart = mart
)

In [None]:
# Keep only the mapping rows that have no missing (NA) values.
# NOTE(review): if BioMart reports a missing symbol as an empty string rather
# than NA, those rows are kept by this filter - confirm this is intended.
gene.df <- dplyr::filter(gene.df, complete.cases(gene.df))

In [None]:
# Attach gene symbols to the expression table: only rows whose
# ensembl_gene_id appears in both tables are kept, and the mapping columns
# come first in the result.
rpkm.df <- gene.df %>%
  dplyr::inner_join(rpkm.df, by = "ensembl_gene_id")

In [None]:
# Rebuild the gene mapping table from the joined data so that it contains
# exactly the rows present in the expression table.
gene.df <- dplyr::select(rpkm.df, ensembl_gene_id, hgnc_symbol)

In [None]:
# Report the number of rows and columns of the gene mapping table.
dim(x = gene.df)

In [None]:
# Preview the first six rows of the gene mapping table.
head(gene.df, n = 6L)

In [None]:
# Use the Ensembl gene IDs as row names; make.names(unique = TRUE) turns them
# into syntactically valid names and disambiguates repeated IDs.
row.names(rpkm.df) <- make.names(
  names = rpkm.df[["ensembl_gene_id"]],
  unique = TRUE
)

In [None]:
# Drop the gene identifier columns: every column in the positional range
# from ensembl_gene_id through ENSG is removed.
rpkm.df <- dplyr::select(rpkm.df, -c(ensembl_gene_id:ENSG))

In [None]:
# Report the number of rows and columns of the processed expression table.
dim(x = rpkm.df)

In [None]:
# Preview the first rows of the first 10 columns of the processed table. The
# column range is clamped to the number of columns actually present so the
# preview does not fail with "undefined columns selected" on narrow tables,
# and drop = FALSE keeps the result a data frame even for a single column.
head(rpkm.df[, seq_len(min(10L, ncol(rpkm.df))), drop = FALSE])

# Save

## Gene ID mappings

In [None]:
# Output path (without extension) for the gene ID mapping files.
output_filepath <- file.path(
  recount2full.data.dir,
  "recount2_gene_ids_mappings"
)

In [None]:
# Show the extension-less output path for the gene ID mappings.
output_filepath

In [None]:
# Write the gene ID mappings as an RDS file.
saveRDS(
  object = gene.df,
  file = paste0(output_filepath, ".rds")
)

In [None]:
# Write the gene ID mappings as a Python pickle file through reticulate.
reticulate::py_save_object(
  object = gene.df,
  filename = paste0(output_filepath, ".pkl")
)

## Gene expression data

In [None]:
# Output path (without extension) for the processed expression data files.
output_filepath <- file.path(
  recount2full.data.dir,
  "recount2_rpkm"
)

In [None]:
# Show the extension-less output path for the expression data.
output_filepath

In [None]:
# Write the processed expression table as an RDS file.
saveRDS(
  object = rpkm.df,
  file = paste0(output_filepath, ".rds")
)

In [None]:
# Write the processed expression table as a Python pickle file through
# reticulate.
reticulate::py_save_object(
  object = rpkm.df,
  filename = paste0(output_filepath, ".pkl")
)

# Cleanup

In [None]:
# The raw input file is no longer necessary once the processed outputs have
# been written, so remove it from disk.
if (file.exists(input.file)) {
  # Only attempt the deletion when the file is actually present.
  file.remove(input.file)
}