# Preliminary TCGA DGE analysis
### Primary concerns about this analysis:
1. Low statistical power: healthy condition has few samples
2. Missing data: ovarian cancer cohort has no healthy samples

In [None]:
library(tidyverse)
library(TCGAbiolinks)
library(DESeq2)
library(BiocParallel)

## Take advantage of parallelization in DESeq2 functions

In [None]:
# Ask the machine how many cores it has. `detectCores()` is defined in the base
# `parallel` package; it is namespace-qualified here so the call does not rely
# on another package happening to attach `parallel`.
n_cores <- parallel::detectCores()
# detectCores() returns NA when the core count cannot be determined; fall back
# to a single worker rather than passing NA on to MulticoreParam().
if (is.na(n_cores)) {
    n_cores <- 1L
}
# Register a multicore backend so DESeq2 calls made with `parallel = TRUE` use it
BiocParallel::register(MulticoreParam(n_cores))

## Constants

In [None]:
# TCGA projects this notebook knows about; `proj_idx` below picks the one to analyse
projects <- c("TCGA-UCEC", "TCGA-CESC")
# Root directory under which each project's download directory lives
data_root <- "../../../../../mnt/d/TCGA"
# One download directory per project, named "<date>-<project>-TCGAbiolinks".
# vapply() (instead of sapply()) guarantees a named character vector.
# NOTE(review): the name embeds today's date, so a session started on another
# day resolves to a different directory than an earlier download used.
project_dirs <- vapply(
    projects,
    function(p) paste0(data_root, "/", as.character(Sys.Date()), "-", p, "-", "TCGAbiolinks"),
    character(1)
)
# Sample `definition` values that are lumped into each of the two conditions
tumor_levels <- c("Primary solid Tumor", "Metastatic")
healthy_levels <- c("Solid Tissue Normal")
# Labels used for the two consolidated conditions
tumor_def <- "Tumor"
healthy_def <- "Healthy"
# Index into `projects` / `project_dirs`; 2 selects "TCGA-CESC"
proj_idx <- 2
# max_padj <- 0.05
# min_l2fc <- log2(2)

## Functions

In [None]:
# Consolidate levels and (optionally) filter remaining levels
# Ex:
# - {"Primary solid Tumor", "Metastatic"} -> "Tumor"
# - {"Solid Tissue Normal"} -> "Healthy"
# - {"Recurrent Tumor"} -> (rows removed)
#
# Arguments:
#   d                  object with a `definition` field; when `drop_remaining`
#                      is TRUE it must also support `d[, mask]` subsetting
#   old_tumor_levels   `definition` values to relabel as `new_tumor_level`
#   old_healthy_levels `definition` values to relabel as `new_healthy_level`
#   new_tumor_level    replacement label for the tumor levels
#   new_healthy_level  replacement label for the healthy levels
#   drop_remaining     if TRUE, keep only entries carrying one of the two new labels
# Returns:
#   `d` with relabelled (and, if requested, filtered) `definition`
consolidate_levels <- function(d, old_tumor_levels, old_healthy_levels, new_tumor_level, new_healthy_level, drop_remaining = TRUE) {
    # Assigning a label that is not an existing factor level would produce NA,
    # so relabel on a character vector instead
    if (is.factor(d$definition)) {
        d$definition <- as.character(d$definition)
    }

    # Compute both masks before relabelling so the two assignments cannot interact
    tumor_mask <- d$definition %in% old_tumor_levels
    healthy_mask <- d$definition %in% old_healthy_levels
    d$definition[tumor_mask] <- new_tumor_level
    d$definition[healthy_mask] <- new_healthy_level

    if (drop_remaining) {
        level_mask <- d$definition %in% c(new_tumor_level, new_healthy_level)
        d <- d[, level_mask]
    }

    # Always hand the object back; previously nothing was returned when
    # drop_remaining was FALSE
    d
}


# Keep only the rows whose summed counts reach `min_expr` in the tumor samples
# AND in the healthy samples (assumes "Tumor"/"Healthy" are only levels)
#
# Arguments:
#   dds           dataset exposing `$definition` and readable by DESeq2::counts()
#   tumor_level   `definition` label identifying tumor samples
#   healthy_level `definition` label identifying healthy samples
#   min_expr      minimum per-row count total required within each condition
# Returns:
#   `dds` restricted to the rows passing the threshold in both conditions
filter_by_expression <- function(dds, tumor_level, healthy_level, min_expr) {
    # Per-row flag: does the count total over the samples of `level` reach min_expr?
    expressed_in <- function(level) {
        in_condition <- dds$definition == level
        rowSums(DESeq2::counts(dds[, in_condition])) >= min_expr
    }

    keep_rows <- expressed_in(tumor_level) & expressed_in(healthy_level)
    dds[keep_rows, ]
}


# Select which rows to keep based on adjusted p-value and log2 fold-change
#
# Arguments:
#   df       data frame with `padj` and `log2FoldChange` columns
#   max_padj largest adjusted p-value a row may have (inclusive)
#   min_l2fc smallest log2 fold-change a row may have (inclusive)
# Returns:
#   the rows of `df` satisfying both thresholds
#
# NOTE(review): the fold-change test is one-sided (log2FoldChange >= min_l2fc),
# so rows with large negative fold-changes are dropped -- confirm this is intended.
filter_DGE_res <- function(df, max_padj = 0.05, min_l2fc = log2(2)) {
    # A missing value counts as "fails the threshold"; without the is.na()
    # guards an NA in the mask would index in a row made entirely of NAs
    padj_mask <- !is.na(df$padj) & df$padj <= max_padj
    l2fc_mask <- !is.na(df$log2FoldChange) & df$log2FoldChange >= min_l2fc
    df[padj_mask & l2fc_mask, ]
}

## Run Query & download

In [None]:
# Build the GDC query for the selected project: gene expression quantification
# files from the "Transcriptome Profiling" category, HTSeq counts workflow.
# NOTE(review): check that the GDC still serves the "HTSeq - Counts" workflow
# type for this project -- confirm against the current TCGAbiolinks documentation.
query <- GDCquery(
    project = projects[proj_idx],
    data.category = "Transcriptome Profiling",
    data.type = "Gene Expression Quantification",
    workflow.type = "HTSeq - Counts"
)

In [None]:
# Download the queried files through the GDC API into the selected project's
# directory, requesting 10 files per chunk
GDCdownload(query, method = "api", directory = project_dirs[proj_idx], files.per.chunk = 10)

## Prepare data for analysis

In [None]:
# Read the downloaded files for `query` from the project directory into one R
# object (presumably a SummarizedExperiment -- confirm)
data <- GDCprepare(query, directory = project_dirs[proj_idx])

In [None]:
# Print the prepared object for a quick visual check
data

### Only want two levels

Lump "Metastatic" and "Primary solid Tumor" together as "Tumor", relabel "Solid Tissue Normal" (non-tumor) as "Healthy", and filter out samples from all other levels

In [None]:
# Relabel the tumor levels as `tumor_def` and the healthy levels as
# `healthy_def`, and drop every sample whose `definition` is neither
data_consolidated <- consolidate_levels(
    data,
    old_tumor_levels = tumor_levels,
    old_healthy_levels = healthy_levels,
    new_tumor_level = tumor_def,
    new_healthy_level = healthy_def,
    drop_remaining = TRUE
)

In [None]:
# Show which `definition` values remain after consolidation, then print the object
unique(data_consolidated$definition)
data_consolidated

In [None]:
# Number of samples in each condition (tumor first, then healthy)
sum(data_consolidated$definition == tumor_def)
sum(data_consolidated$definition == healthy_def)

## Prep data for DGE analysis

In [None]:
# Wrap the consolidated data in a DESeqDataSet with `definition` as the only
# design variable
dds <- DESeqDataSet(data_consolidated, design = ~ definition)

In [None]:
# Print the DESeqDataSet for a quick visual check
dds

### Filter out genes which are not expressed in either tumor or healthy conditions
I.e., we want to keep only genes that are expressed in both the healthy AND the tumor conditions

In [None]:
# Keep genes whose summed counts are at least 1 in the tumor samples and at
# least 1 in the healthy samples. The input is `dds`, the DESeqDataSet built
# above (the previous `ddsSE` is not defined anywhere in this notebook).
dds_filtered <- filter_by_expression(dds, tumor_level = tumor_def, healthy_level = healthy_def, min_expr = 1)

In [None]:
# Print the filtered dataset to see how many genes remain
dds_filtered

## Perform DGE analysis

In [None]:
# Run the DESeq2 pipeline on the expression-filtered dataset, using the
# registered parallel backend
# ddsSESeq <- DESeq(dds, parallel = TRUE)
ddsSeq <- DESeq(dds_filtered, parallel = TRUE)

In [None]:
# Print the fitted DESeq2 object
# ddsSESeq
ddsSeq

In [None]:
# List the result names available from the fitted object
# resultsNames(ddsSESeq)
resultsNames(ddsSeq)

In [None]:
# Extract the tumor-vs-healthy comparison on `definition` with Benjamini-Hochberg
# adjusted p-values. The contrast uses the `tumor_def` / `healthy_def` constants
# so it stays in sync with the labels assigned during consolidation.
# res <- results(ddsSESeq, contrast = c("definition", "Tumor", "Healthy"), pAdjustMethod = "BH", parallel = TRUE)
res <- results(ddsSeq, contrast = c("definition", tumor_def, healthy_def), pAdjustMethod = "BH", parallel = TRUE)

## Combine results with external gene IDs

In [None]:
# Convert the results to a tibble, moving the row names into a `geneID` column
res_df <- as_tibble(res, rownames = "geneID")
head(res_df)
# Convert the row (gene) metadata of the fitted object to a tibble as well
ddsSeq_row_data_df <- rowData(ddsSeq) %>% as_tibble()
head(ddsSeq_row_data_df)

### Make sure gene ordering is preserved between results and ddsSeq row data

In [None]:
# TRUE when every gene ID in the row data equals the ID at the same position in
# the results; the columns are combined by position below, so this must hold.
# NOTE(review): the comparison yields NA rather than FALSE if either column
# contains NA and no mismatch is found.
all(ddsSeq_row_data_df$ensembl_gene_id == res_df$geneID)

In [None]:
# Attach the gene identifiers from the row data (matched by position) and move
# the three ID columns to the front; `geneID` is carried over as
# `ensembl_gene_id` and then dropped
res_genes_df <- res_df %>%
    mutate(
        ensembl_gene_id = geneID,
        original_ensembl_gene_id = ddsSeq_row_data_df$original_ensembl_gene_id,
        external_gene_name = ddsSeq_row_data_df$external_gene_name
    ) %>%
    select(
        ensembl_gene_id,
        original_ensembl_gene_id,
        external_gene_name,
        everything(),
        -geneID
    )

In [None]:
# Rows (genes) and columns of the combined results table
dim(res_genes_df)

## Create DEG list

### Drop rows (genes) with NA values

In [None]:
# Flag every row with at least one missing value, then keep only the complete rows
na_mask <- !complete.cases(res_genes_df)
res_genes_df <- res_genes_df[!na_mask, ]

No NAs left?

In [None]:
# TRUE when the total number of NA cells across the table is zero
sum(rowSums(is.na(res_genes_df))) == 0

How many rows left?

In [None]:
# Number of genes remaining after dropping rows with NA values
nrow(res_genes_df)

### Apply final filters (adj. $p$-values, $\log_2$ fold-change)

In [None]:
# Keep genes with adjusted p-value <= 0.05 and log2 fold-change >= log2(2) (i.e. 1)
DEG1_df <- filter_DGE_res(res_genes_df, max_padj = 0.05, min_l2fc = log2(2))

How many rows left?

In [None]:
# Number of genes that pass the final filters
nrow(DEG1_df)