In [1]:
library(tidyverse)
library(WGCNA)

# Custom package
library(rutils)

-- [1mAttaching packages[22m --------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --

[32mv[39m [34mggplot2[39m 3.3.3     [32mv[39m [34mpurrr  [39m 0.3.4
[32mv[39m [34mtibble [39m 3.0.6     [32mv[39m [34mdplyr  [39m 1.0.4
[32mv[39m [34mtidyr  [39m 1.1.2     [32mv[39m [34mstringr[39m 1.4.0
[32mv[39m [34mreadr  [39m 1.4.0     [32mv[39m [34mforcats[39m 0.5.1

-- [1mConflicts[22m ------------------------------------------------------------------------------------------------------------ tidyverse_conflicts() --
[31mx[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31mx[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Loading required package: dynamicTreeCut

Loading required package: fastcluster


Attaching package: 'fastcluster'


The following object is masked from 'package:stats':

    hclust





Attaching package: 'WGCNA'


The foll

In [2]:
# Let WGCNA run its calculations in parallel
WGCNA::enableWGCNAThreads()

Allowing parallel execution with up to 15 working processes.


In [3]:
# Project-wide constants

# Development directories, resolved by the custom rutils package from the paths file
dirs <- rutils::get_dev_directories(dev_paths_file = "../dev_paths.txt")
matrisome_path <- paste0(dirs$data_dir, "/matrisome/matrisome_hs_masterlist.tsv")

# TCGA project identifiers
# NOTE(review): four projects are listed but only three unified datasets -- confirm the intended mapping
projects <- c(
    "TCGA-CESC",
    "TCGA-UCS",
    "TCGA-UCEC",
    "TCGA-OV"
)

# Names of the unified datasets; dset_idx selects one of them
unified_dsets <- c(
    "unified_cervical_data",
    "unified_uterine_data",
    "unified_uterine_endometrial_data"
)


In [4]:
# Whether the final results table is written to disk
save_res <- TRUE
# Position in unified_dsets of the dataset to analyse
dset_idx <- 3

In [5]:
# Numeric coding of vital status handed to the survival loader
event_code <- list("Alive" = 0, "Dead" = 1)

# Directory of the unified dataset selected by dset_idx
dset_dir <- paste0(dirs$data_dir, "/", unified_dsets[dset_idx])

survival_path <- paste0(dset_dir, "/survival_data.tsv")
survival_df <- load_survival_df(survival_path, event_code)

# Keep only sample name and FIGO stage, drop rows with a missing value in either,
# and store the stage as character.
# Make sure the same samples are used as in the classification models.
filtered_survival_df <- decode_figo_stage(survival_df, to = "n") %>%
    dplyr::select(sample_name, figo_stage) %>%
    tidyr::drop_na() %>%
    dplyr::mutate(figo_stage = as.character(figo_stage))

# Normalized matrisome counts, restricted to the retained samples and
# transposed with transpose_df so that sample_name becomes the key column
norm_matrisome_counts_path <- paste0(dset_dir, "/norm_matrisome_counts.tsv")
norm_survival_counts_t_df <- norm_matrisome_counts_path %>%
    read_tsv() %>%
    dplyr::select(dplyr::all_of(c("geneID", filtered_survival_df$sample_name))) %>%
    transpose_df("geneID", "sample_name")

# One row per sample: FIGO stage followed by the normalized counts
joined_df <- inner_join(
    filtered_survival_df,
    norm_survival_counts_t_df,
    by = "sample_name"
)



[36m--[39m [1m[1mColumn specification[1m[22m [36m--------------------------------------------------------------------------------------------------------------------------[39m
cols(
  sample_name = [31mcol_character()[39m,
  vital_status = [31mcol_character()[39m,
  survival_time = [32mcol_double()[39m,
  figo_stage = [31mcol_character()[39m,
  days_to_last_follow_up = [32mcol_double()[39m,
  days_to_death = [32mcol_double()[39m,
  age_at_diagnosis = [32mcol_double()[39m,
  age_at_index = [32mcol_double()[39m,
  height = [32mcol_double()[39m,
  weight = [32mcol_double()[39m,
  bmi = [32mcol_double()[39m,
  race = [31mcol_character()[39m,
  ethnicity = [31mcol_character()[39m
)



[36m--[39m [1m[1mColumn specification[1m[22m [36m--------------------------------------------------------------------------------------------------------------------------[39m
cols(
  .default = col_double(),
  geneID = [31mcol_character()[39m
)
[36mi[39m Use [30

In [6]:
# Response matrix built by to_one_hot from the figo_stage column
# (presumably one indicator column per stage -- confirm against rutils)
y <- as.matrix(to_one_hot(joined_df, "figo_stage"))
# Predictor matrix: every column of joined_df except the sample name and the stage
x <- as.matrix(dplyr::select(joined_df, -sample_name, -figo_stage))
# Label the rows of both matrices with the sample names
rownames(x) <- rownames(y) <- joined_df$sample_name

In [7]:
# Correlate every column of x with every column of y, using pairwise-complete
# observations (use = "p")
gene_cor <- WGCNA::cor(x, y, use = "p")

# Because the correlations use pairwise deletion, the number of observations
# behind each coefficient can differ; count the samples where both values are
# present. Without missing values every entry equals nrow(x).
n_obs <- crossprod(1 - is.na(x), 1 - is.na(y))

# Student p-values for the correlations.
# Rows containing NA are removed since qvalue can't handle them.
gene_pval <- corPvalueStudent(gene_cor, n_obs) %>%
    as_tibble(rownames = "geneID") %>%
    dplyr::filter(rowSums(is.na(.)) == 0) %>%
    column_to_rownames("geneID") %>%
    as.matrix()

# q-values, computed separately for each column of p-values
gene_qval <- apply(gene_pval, MARGIN = 2, function(pvals) {
    WGCNA::qvalue(pvals)$qvalues
})
# apply() only carries row names over when the function returns a named vector;
# set them explicitly so the later join on geneID does not depend on that
dimnames(gene_qval) <- dimnames(gene_pval)

# Suffix the column names so the three tables can be joined side by side
colnames(gene_cor) <- paste0(colnames(gene_cor), "_cor")
colnames(gene_pval) <- paste0(colnames(gene_pval), "_pval")
colnames(gene_qval) <- paste0(colnames(gene_qval), "_qval")

In [8]:
# Turn each matrix into a tibble keyed by geneID, then inner-join them in the
# order p-values, q-values, correlations
pbc_df <- list(gene_pval, gene_qval, gene_cor) %>%
    purrr::map(as_tibble, rownames = "geneID") %>%
    purrr::reduce(inner_join, by = "geneID")

In [9]:
# Write the combined results table as TSV when saving is enabled
if (save_res) {
    pbc_df %>%
        write_tsv(paste0(
            dirs$analysis_dir,
            "/feature_selection/",
            unified_dsets[dset_idx],
            "_pbc_results.tsv"
        ))
}