In [1]:
library(tidyverse)
library(DESeq2)
library(BiocParallel)

# Custom package
library(rutils)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
✔ ggplot2 3.3.2     ✔ purrr   0.3.4
✔ tibble  3.0.3     ✔ dplyr   1.0.0
✔ tidyr   1.1.0     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.5.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
Loading required package: S4Vectors
Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:dplyr’:

    combine, intersect, setdiff, union

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects

In [2]:
# Leave two cores free for the rest of the system, but never ask for fewer
# than one worker: detectCores() may return NA, and on machines with <= 2
# cores the subtraction would otherwise yield a non-positive worker count.
n_cores <- max(1L, parallel::detectCores() - 2L, na.rm = TRUE)
BiocParallel::register(MulticoreParam(n_cores))

In [3]:
# Directory locations are read from the dev-paths file one level up
dirs <- rutils::get_dev_directories(dev_paths_file = "../dev_paths.txt")

# TCGA project identifiers, split into the two groups used below
gyn_projects <- c(
    "TCGA-CESC",
    "TCGA-UCS",
    "TCGA-UCEC",
    "TCGA-OV"
)
nongyn_projects <- c(
    "TCGA-BRCA",
    "TCGA-COAD",
    "TCGA-LGG"
)

In [4]:
# One counts file and one coldata file per project (gyn projects first)
all_projects <- c(gyn_projects, nongyn_projects)
matrix_data_prefix <- paste0(dirs$data_dir, "/TCGA_RNA_matrix_count_data/")

count_files <- paste0(matrix_data_prefix, all_projects, "_counts.tsv")
coldata_files <- paste0(matrix_data_prefix, all_projects, "_coldata.tsv")

# Echo the resolved paths
count_files
coldata_files

# Load data

In [5]:
# Read the matrisome master list from the data directory
matrisome_masterlist_path <- paste0(dirs$data_dir, "/matrisome/matrisome_hs_masterlist.tsv")
matrisome_df <- rutils::load_matrisome_df(matrisome_masterlist_path)

Parsed with column specification:
cols(
  Division = col_character(),
  Category = col_character(),
  `Gene Symbol` = col_character(),
  `Gene Name` = col_character(),
  Synonyms = col_character(),
  HGNC_IDs = col_double(),
  `HGNC_IDs Links` = col_double(),
  UniProt_IDs = col_character(),
  Refseq_IDs = col_character(),
  Orthology = col_character(),
  Notes = col_character()
)


In [6]:
# Load every project's count matrix and coldata and combine them; count
# matrices are joined on the two gene identifier columns.
data_ls <- rutils::load_and_combine_count_matrix_data(
    count_files,
    coldata_files,
    count_join_symbols = c("ensembl_gene_id", "external_gene_name")
)

# Separate out counts & coldata
# Also, for this analysis, we only care about tumor data
pan_coldata_df <- data_ls$coldata_df %>%
    dplyr::filter(condition == "Primary solid Tumor")

# all_of() marks the sample names as an external character vector, so the
# selection is strict (errors on a missing sample column) and unambiguous.
pan_counts_df <- data_ls$counts_df %>%
    dplyr::select(
        ensembl_gene_id,
        external_gene_name,
        dplyr::all_of(pan_coldata_df$sample_name)
    )

# Sanity check: sample columns must match the coldata rows one-to-one and in
# order. Lengths are compared first because `==` recycles on a mismatch.
pan_sample_cols <- colnames(pan_counts_df)[-c(1:2)]
length(pan_sample_cols) == nrow(pan_coldata_df) &&
    all(pan_coldata_df$sample_name == pan_sample_cols)

Parsed with column specification:
cols(
  .default = col_double(),
  ensembl_gene_id = col_character(),
  external_gene_name = col_character()
)
See spec(...) for full column specifications.
Parsed with column specification:
cols(
  .default = col_double(),
  ensembl_gene_id = col_character(),
  external_gene_name = col_character()
)
See spec(...) for full column specifications.
Parsed with column specification:
cols(
  .default = col_double(),
  ensembl_gene_id = col_character(),
  external_gene_name = col_character()
)
See spec(...) for full column specifications.
Parsed with column specification:
cols(
  .default = col_double(),
  ensembl_gene_id = col_character(),
  external_gene_name = col_character()
)
See spec(...) for full column specifications.
Parsed with column specification:
cols(
  .default = col_double(),
  ensembl_gene_id = col_character(),
  external_gene_name = col_character()
)
See spec(...) for full column specifications.
Parsed with column specification:
cols(
  .de

In [7]:
# Restrict the tumor coldata and counts to the projects in gyn_projects
gyn_coldata_df <- pan_coldata_df %>% dplyr::filter(project %in% gyn_projects)
gyn_counts_df <- pan_counts_df[c("ensembl_gene_id", "external_gene_name", gyn_coldata_df$sample_name)]

# Sanity check: sample columns must match the coldata rows one-to-one and in
# order. Lengths are compared first because `==` recycles on a mismatch.
gyn_sample_cols <- colnames(gyn_counts_df)[-c(1:2)]
length(gyn_sample_cols) == nrow(gyn_coldata_df) &&
    all(gyn_coldata_df$sample_name == gyn_sample_cols)

# Filter and normalize

## Pre-filter lowly expressed genes

In [8]:
# A gene is retained when it has a non-zero count in more than one third of
# the samples. The first two columns hold gene identifiers, not counts.
pan_min_samples <- 1 / 3 * nrow(pan_coldata_df)
pan_n_expressing <- rowSums(pan_counts_df[, -c(1:2)] > 0)
pan_expression_mask <- pan_n_expressing > pan_min_samples
prefiltered_pan_counts_df <- pan_counts_df[pan_expression_mask, ]

gyn_min_samples <- 1 / 3 * nrow(gyn_coldata_df)
gyn_n_expressing <- rowSums(gyn_counts_df[, -c(1:2)] > 0)
gyn_expression_mask <- gyn_n_expressing > gyn_min_samples
prefiltered_gyn_counts_df <- gyn_counts_df[gyn_expression_mask, ]

# Number of genes surviving the pre-filter in each set
nrow(prefiltered_pan_counts_df)
nrow(prefiltered_gyn_counts_df)

## Normalize and filter to matrisome genes

In [9]:
# Variance-stabilize the pre-filtered gyn counts. The two leading identifier
# columns are dropped so only the per-sample columns are transformed.
gyn_count_matrix <- as.matrix(prefiltered_gyn_counts_df[, -c(1:2)])
norm_gyn_counts <- varianceStabilizingTransformation(gyn_count_matrix, blind = TRUE)

# Re-attach the gene identifiers as the leading columns, then keep only the
# genes whose symbol appears in the matrisome list.
gyn_norm_tbl <- as_tibble(norm_gyn_counts)
gyn_norm_tbl <- dplyr::mutate(
    gyn_norm_tbl,
    ensembl_gene_id = prefiltered_gyn_counts_df$ensembl_gene_id,
    external_gene_name = prefiltered_gyn_counts_df$external_gene_name
)
gyn_norm_tbl <- dplyr::select(gyn_norm_tbl, ensembl_gene_id, external_gene_name, everything())
prefiltered_norm_gyn_counts_df <- dplyr::filter(
    gyn_norm_tbl,
    external_gene_name %in% matrisome_df$gene_symbol
)

converting counts to integer mode


In [10]:
# Variance-stabilize the pre-filtered pan counts. The two leading identifier
# columns are dropped so only the per-sample columns are transformed.
pan_count_matrix <- as.matrix(prefiltered_pan_counts_df[, -c(1:2)])
norm_pan_counts <- varianceStabilizingTransformation(pan_count_matrix, blind = TRUE)

# Re-attach the gene identifiers as the leading columns, then keep only the
# genes whose symbol appears in the matrisome list.
pan_norm_tbl <- as_tibble(norm_pan_counts)
pan_norm_tbl <- dplyr::mutate(
    pan_norm_tbl,
    ensembl_gene_id = prefiltered_pan_counts_df$ensembl_gene_id,
    external_gene_name = prefiltered_pan_counts_df$external_gene_name
)
pan_norm_tbl <- dplyr::select(pan_norm_tbl, ensembl_gene_id, external_gene_name, everything())
prefiltered_norm_pan_counts_df <- dplyr::filter(
    pan_norm_tbl,
    external_gene_name %in% matrisome_df$gene_symbol
)

converting counts to integer mode


In [11]:
# Make sure the output directory for the combined matrices exists. The path
# is built once, and recursive = TRUE also creates any missing parent
# directory instead of failing with only a warning.
combined_data_dir <- paste0(dirs$data_dir, "/TCGA_RNA_combined_matrix_count_data")
if (!dir.exists(combined_data_dir)) {
    dir.create(combined_data_dir, recursive = TRUE)
}

In [12]:
# Write the normalized, matrisome-filtered gyn counts and the gyn coldata
gyn_counts_out_path <- paste0(dirs$data_dir, "/TCGA_RNA_combined_matrix_count_data/prefiltered_norm_matrisome_gyn_counts.tsv")
gyn_coldata_out_path <- paste0(dirs$data_dir, "/TCGA_RNA_combined_matrix_count_data/matrisome_gyn_coldata.tsv")

write_tsv(prefiltered_norm_gyn_counts_df, gyn_counts_out_path)
write_tsv(gyn_coldata_df, gyn_coldata_out_path)

In [13]:
# Write the normalized, matrisome-filtered pan counts and the pan coldata
pan_counts_out_path <- paste0(dirs$data_dir, "/TCGA_RNA_combined_matrix_count_data/prefiltered_norm_matrisome_pan_counts.tsv")
pan_coldata_out_path <- paste0(dirs$data_dir, "/TCGA_RNA_combined_matrix_count_data/matrisome_pan_coldata.tsv")

write_tsv(prefiltered_norm_pan_counts_df, pan_counts_out_path)
write_tsv(pan_coldata_df, pan_coldata_out_path)