# Description

This notebook contains code taken from the MultiPLIER repo to download and process recount2 data.
The code was taken from [here](https://github.com/greenelab/rheum-plier-data/blob/master/recount2/1-get_all_recount_dataset.R) and [here](https://github.com/greenelab/rheum-plier-data/blob/master/recount2/2-prep_recount_for_plier.R).

The outputs are RDS and Python pickle files: one with a large matrix of gene expression values (genes in rows, samples in columns), and another with gene ID mappings.

# Modules

In [1]:
`%>%` <- dplyr::`%>%`
library(biomaRt)
library(reticulate)

# Settings

In [2]:
# Directory holding the recount2 files read and written by this notebook.
# It is taken from the CM_RECOUNT2FULL_DATA_DIR environment variable; stop
# right away when the variable is unset or empty instead of silently building
# paths from an empty string.
recount2full.data.dir <- Sys.getenv("CM_RECOUNT2FULL_DATA_DIR", unset = NA)
if (is.na(recount2full.data.dir) || !nzchar(recount2full.data.dir)) {
  stop(
    "Environment variable CM_RECOUNT2FULL_DATA_DIR is not set",
    call. = FALSE
  )
}

In [3]:
# Display the data directory so its value is recorded in the notebook output.
recount2full.data.dir

In [4]:
# Create the data directory together with any missing parent directories.
# Warnings (for example when the directory already exists) are turned off.
dir.create(
  path = recount2full.data.dir,
  showWarnings = FALSE,
  recursive = TRUE
)

In [5]:
# Internal data directory, taken from the CM_RECOUNT2FULL_INTERNAL_DATA_DIR
# environment variable; stop right away when the variable is unset or empty
# instead of silently continuing with an empty path.
data.dir <- Sys.getenv("CM_RECOUNT2FULL_INTERNAL_DATA_DIR", unset = NA)
if (is.na(data.dir) || !nzchar(data.dir)) {
  stop(
    "Environment variable CM_RECOUNT2FULL_INTERNAL_DATA_DIR is not set",
    call. = FALSE
  )
}

In [6]:
# Display the internal data directory so its value is recorded in the output.
data.dir

In [7]:
# Create the internal data directory together with any missing parent
# directories. Warnings (for example when it already exists) are turned off.
dir.create(
  path = data.dir,
  showWarnings = FALSE,
  recursive = TRUE
)

# Load raw data

In [8]:
# Path of the raw RPKM file inside the recount2 data directory.
input.file <- file.path(
  recount2full.data.dir,
  "recount2_rpkm_raw.rds"
)

In [9]:
# Display the full path of the raw input file.
input.file

In [10]:
# Read the raw expression table from the RDS file.
rpkm.df <- readRDS(file = input.file)

In [11]:
# Number of rows and columns of the raw table.
dim(rpkm.df)

In [12]:
# Preview the first rows of the raw table, restricted to its first ten columns.
head(rpkm.df[, 1:10])

Unnamed: 0_level_0,ENSG,SRP000599.SRR013549,SRP000599.SRR013550,SRP000599.SRR013551,SRP000599.SRR013552,SRP000599.SRR013553,SRP000599.SRR013554,SRP000599.SRR013555,SRP000599.SRR013556,SRP000599.SRR013557
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,ENSG00000000003.14,0.0,0,0.0,0,0,0,0.9293512,1.175312,0.4598586
2,ENSG00000000005.5,0.0,0,0.0,0,0,0,0.0,0.0,0.0
3,ENSG00000000419.12,0.0,0,0.0,0,0,0,0.0,0.0,5.5613668
4,ENSG00000000457.13,0.0,0,0.0,0,0,0,0.0,0.0,0.0
5,ENSG00000000460.16,0.08456512,0,0.4970834,0,0,0,1.177199,0.0,0.0
6,ENSG00000000938.12,0.0,0,0.0,0,0,0,0.0,0.0,0.0


# Preprocess data

In [13]:
# Open the "ensembl" BioMart and select the human gene dataset. The resulting
# mart object is used below to map Ensembl gene IDs to gene symbols.
mart <- biomaRt::useDataset(
  dataset = "hsapiens_gene_ensembl",
  mart = biomaRt::useMart(biomart = "ensembl")
)

In [14]:
# Strip the version suffix (everything from the first ".") from the versioned
# identifiers in the ENSG column, e.g. "ENSG00000000003.14" becomes
# "ENSG00000000003". The result is computed once and reused both as the
# BioMart query values and as the join key column.
genes <- sub("[.].*$", "", rpkm.df$ENSG)

rpkm.df$ensembl_gene_id <- genes

# Retrieve the HGNC symbol for the unversioned Ensembl gene IDs.
gene.df <- biomaRt::getBM(
  filters = "ensembl_gene_id",
  attributes = c("ensembl_gene_id", "hgnc_symbol"),
  values = genes,
  mart = mart
)

In [15]:
# Remove genes without a gene symbol. BioMart can report a missing HGNC
# symbol as an empty string rather than NA, so rows with an empty symbol are
# dropped in addition to rows containing NA.
gene.df <- gene.df %>%
  dplyr::filter(complete.cases(.), nzchar(hgnc_symbol))

In [16]:
# Attach the gene symbols to the expression table; the inner join keeps only
# the Ensembl gene IDs present in both tables.
rpkm.df <- gene.df %>%
  dplyr::inner_join(rpkm.df, by = "ensembl_gene_id")

In [17]:
# Keep the Ensembl ID / HGNC symbol pairs taken from the joined table, so the
# mapping rows follow the same order as the expression rows.
gene.df <- dplyr::select(rpkm.df, ensembl_gene_id, hgnc_symbol)

In [18]:
# Number of rows and columns of the gene ID mapping table.
dim(gene.df)

In [19]:
# Preview the first rows of the gene ID mapping table.
head(gene.df)

Unnamed: 0_level_0,ensembl_gene_id,hgnc_symbol
Unnamed: 0_level_1,<chr>,<chr>
1,ENSG00000000003,TSPAN6
2,ENSG00000000005,TNMD
3,ENSG00000000419,DPM1
4,ENSG00000000457,SCYL3
5,ENSG00000000460,C1orf112
6,ENSG00000000938,FGR


In [20]:
# Use the Ensembl IDs as row names. make.names(unique = TRUE) turns them into
# syntactically valid names and makes repeated IDs distinct, as row names
# must be unique.
row.names(rpkm.df) <- make.names(
  rpkm.df[["ensembl_gene_id"]],
  unique = TRUE
)

In [21]:
# Remove the gene identifier columns so only the sample columns remain. The
# columns are listed by name instead of as a positional range, so the result
# does not depend on the column order produced by the join.
rpkm.df <- rpkm.df %>%
  dplyr::select(-ensembl_gene_id, -hgnc_symbol, -ENSG)

In [22]:
# Number of rows and columns of the processed expression table.
dim(rpkm.df)

In [23]:
# Preview the first rows of the processed table, restricted to its first ten
# columns.
head(rpkm.df[, 1:10])

Unnamed: 0_level_0,SRP000599.SRR013549,SRP000599.SRR013550,SRP000599.SRR013551,SRP000599.SRR013552,SRP000599.SRR013553,SRP000599.SRR013554,SRP000599.SRR013555,SRP000599.SRR013556,SRP000599.SRR013557,SRP000599.SRR013558
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
ENSG00000000003,0.0,0,0.0,0,0,0,0.9293512,1.175312,0.4598586,1.2544816
ENSG00000000005,0.0,0,0.0,0,0,0,0.0,0.0,0.0,0.0
ENSG00000000419,0.0,0,0.0,0,0,0,0.0,0.0,5.5613668,2.9458751
ENSG00000000457,0.0,0,0.0,0,0,0,0.0,0.0,0.0,0.2622675
ENSG00000000460,0.08456512,0,0.4970834,0,0,0,1.177199,0.0,0.0,0.0
ENSG00000000938,0.0,0,0.0,0,0,0,0.0,0.0,0.0,0.0


# Save

## Gene ID mappings

In [24]:
# Base path (without file extension) shared by the gene ID mapping outputs.
output_filepath <- file.path(
  recount2full.data.dir,
  "recount2_gene_ids_mappings"
)

In [25]:
# Display the base output path for the gene ID mappings.
output_filepath

In [26]:
# Write the gene ID mappings as an RDS file.
saveRDS(
  object = gene.df,
  file = paste0(output_filepath, ".rds")
)

In [27]:
# Write the gene ID mappings as a Python pickle file through reticulate.
py_save_object(
  object = gene.df,
  filename = paste0(output_filepath, ".pkl")
)

## Gene expression data

In [28]:
# Base path (without file extension) shared by the gene expression outputs.
output_filepath <- file.path(
  recount2full.data.dir,
  "recount2_rpkm"
)

In [29]:
# Display the base output path for the gene expression data.
output_filepath

In [30]:
# Write the processed expression table as an RDS file.
saveRDS(
  object = rpkm.df,
  file = paste0(output_filepath, ".rds")
)

In [31]:
# Write the processed expression table as a Python pickle file through
# reticulate.
py_save_object(
  object = rpkm.df,
  filename = paste0(output_filepath, ".pkl")
)

# Cleanup

In [32]:
# The raw file is no longer necessary at this point, so it is removed.
if (file.exists(input.file)) {
  # Delete the raw input file; the check above skips this when it is absent
  file.remove(input.file)
}