In [1]:
library(tidyverse)
library(WGCNA)
library(cowplot)
library(matrixStats)

# Custom package
library(rutils)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
✔ ggplot2 3.3.2     ✔ purrr   0.3.4
✔ tibble  3.0.3     ✔ dplyr   1.0.0
✔ tidyr   1.1.0     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.5.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
Loading required package: dynamicTreeCut
Loading required package: fastcluster

Attaching package: ‘fastcluster’

The following object is masked from ‘package:stats’:

    hclust



Attaching package: ‘WGCNA’

The following object is masked from ‘package:stats’:

    cor


********************************************************
Note: As of version 1.0.0, cowplot does not change the
  default ggplot2 theme anymore. To recover the previous
  behavior, execute:
  theme_set(theme_cowplot())
********************************************************


Attaching package: ‘matrixStats’

The following object is masked 

# Load data

In [2]:
# Session setup: keep strings as plain character vectors and allow WGCNA to
# run its computations on multiple threads.
options(stringsAsFactors = FALSE)
WGCNA::enableWGCNAThreads()

Allowing parallel execution with up to 15 working processes.


In [3]:
# Project directories, cohort identifiers, and dataset folder names used by
# the cells below.
dirs <- rutils::get_dev_directories(dev_paths_file = "../dev_paths.txt")
projects <- c(
    "TCGA-CESC",
    "TCGA-UCS",
    "TCGA-UCEC",
    "TCGA-OV"
)
unified_dsets <- c(
    "unified_cervical_data",
    "unified_uterine_data",
    "unified_uterine_endometrial_data"
)
matrisome_path <- paste0(
    dirs$data_dir,
    "/matrisome/matrisome_hs_masterlist.tsv"
)

# Numeric codes for vital status; handed to load_survival_df() further down.
event_code <- list(Alive = 0, Dead = 1)

In [4]:
# Position in `unified_dsets` of the dataset that every load below reads from
# (1 selects "unified_cervical_data").
dset_idx <- 1

## Simple data

In [31]:
# Sample annotations for the selected dataset.
coldata_df <- paste0(dirs$data_dir, "/", unified_dsets[dset_idx], "/coldata.tsv") %>%
    read_tsv()

# Raw matrisome counts, transposed so the `geneID` values become the column
# names and the former column names are stored in `sample_name`.
m_counts_df <- paste0(dirs$data_dir, "/", unified_dsets[dset_idx], "/matrisome_counts.tsv") %>%
    read_tsv() %>%
    transpose_df(future_colnames_col = "geneID", previous_colnames_col = "sample_name")

# Normalized matrisome counts, reshaped the same way.
nm_counts_df <- paste0(dirs$data_dir, "/", unified_dsets[dset_idx], "/norm_matrisome_counts.tsv") %>%
    read_tsv() %>%
    transpose_df(future_colnames_col = "geneID", previous_colnames_col = "sample_name")

Parsed with column specification:
cols(
  sample_name = col_character(),
  condition = col_character(),
  data_source = col_character()
)
Parsed with column specification:
cols(
  .default = col_double(),
  geneID = col_character()
)
See spec(...) for full column specifications.
Parsed with column specification:
cols(
  .default = col_double(),
  geneID = col_character()
)
See spec(...) for full column specifications.


## Manipulated data

In [36]:
# Sample names split by the `condition` column of the annotations.
healthy_samples <- dplyr::pull(
    dplyr::filter(coldata_df, condition == "healthy"),
    sample_name
)
tumor_samples <- dplyr::pull(
    dplyr::filter(coldata_df, condition == "tumor"),
    sample_name
)

# Clinical table for tumor samples: complete rows only, with `figo_stage`
# replaced by the columns returned from to_one_hot().
# NOTE(review): decode_figo_stage(to = "n") presumably maps the stage labels to
# numbers before they are cast back to character -- confirm against rutils.
tumor_clinical_data_df <- paste0(dirs$data_dir, "/", unified_dsets[dset_idx], "/survival_data.tsv") %>%
    load_survival_df(event_code) %>%
    dplyr::select(sample_name, vital_status, survival_time, figo_stage) %>%
    # Drop any row that has a missing value in one of the four columns
    dplyr::filter(rowSums(is.na(.)) == 0) %>%
    decode_figo_stage(to = "n") %>%
    dplyr::mutate(figo_stage = as.character(figo_stage)) %>%
    dplyr::bind_cols(., to_one_hot(., col = "figo_stage")) %>%
    dplyr::select(-figo_stage) %>%
    # Restricting to tumor samples is not strictly needed (healthy samples
    # would have an NA FIGO stage and already be dropped), but it is kept as
    # an explicit safeguard.
    dplyr::inner_join(coldata_df, by = "sample_name") %>%
    dplyr::filter(condition == "tumor") %>%
    dplyr::select(-condition, -data_source)

# Raw matrisome counts for each group of samples.
healthy_m_counts_df <- dplyr::filter(m_counts_df, sample_name %in% healthy_samples)

tumor_m_counts_df <- dplyr::filter(m_counts_df, sample_name %in% tumor_samples)

Parsed with column specification:
cols(
  sample_name = col_character(),
  vital_status = col_character(),
  survival_time = col_double(),
  figo_stage = col_character(),
  days_to_last_follow_up = col_double(),
  days_to_death = col_double(),
  age_at_diagnosis = col_double(),
  age_at_index = col_double(),
  height = col_double(),
  weight = col_double(),
  bmi = col_double(),
  race = col_character(),
  ethnicity = col_character()
)


In [37]:
# Display the raw matrisome counts for tumor samples (one row per sample)
tumor_m_counts_df

sample_name,PGF,TIMP4,C1QTNF6,TNC,PRL,OGN,C1QL3,FGB,NDNF,⋯,PIK3IP1,C1QTNF2,PCSK5,ANXA1,HGF,VWA2,FGF3,POSTN,NTF3,S100A6
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
TCGA-C5-A1BF-01B-11R-A13Y-07,2078,41,3842,9771,3,1,3,490,52,⋯,562,294,812,20650,63,137,0,5063,16,115505
TCGA-MU-A51Y-01A-11R-A26T-07,313,20,1068,12343,1,485,53,1,24,⋯,4838,250,1462,125042,126,124,1,2083,8,28629
TCGA-EK-A2RM-01A-21R-A18M-07,837,120,1773,8877,0,3,14,3,18,⋯,1961,58,326,62706,168,47,2,3645,28,154835
TCGA-Q1-A73P-01A-11R-A32P-07,219,5,915,10318,0,17,113,40,882,⋯,1826,13,739,18678,84,64,0,587,1,104049
TCGA-C5-A8YT-01A-11R-A37O-07,208,18,6332,15891,10,9650,4,3,1685,⋯,748,49,5675,12291,1319,373,0,3931,7,13739
TCGA-C5-A2LZ-01A-11R-A213-07,1333,1,411,2213,1,6,73,0,5,⋯,3506,157,825,77073,31,154,0,169,10,33768
TCGA-UC-A7PI-01A-11R-A42S-07,119,7,696,21561,1,13,22,8,814,⋯,1744,134,336,24072,25,586,0,258,23,90732
TCGA-VS-A9V1-01A-11R-A42T-07,42,22,1019,2629,0,2,2,36,14,⋯,330,72,207,2663,14,723,0,96,4,59927
TCGA-FU-A5XV-01A-11R-A28H-07,392,3,813,10164,3,1,16,0,12,⋯,2495,32,2026,100716,26,473,0,427,234,40916
TCGA-C5-A1BE-01B-11R-A13Y-07,2101,9,604,14785,0,27,18,0,19,⋯,4435,57,1122,57822,45,134,2,1437,6,65652


In [None]:
# Survival/clinical records limited to the columns used below, keeping only
# rows that have no missing value in any of them.
clinical_df <- load_survival_df(paste0(dirs$data_dir, "/", unified_dsets[dset_idx], "/survival_data.tsv"), event_code) %>%
    dplyr::select(sample_name, vital_status, survival_time, figo_stage) %>%
    dplyr::filter(rowSums(is.na(.)) == 0)

# Sample annotations (sample_name, condition, data_source).
coldata_df <- read_tsv(paste0(dirs$data_dir, "/", unified_dsets[dset_idx], "/coldata.tsv"))

# Genes whose raw counts sum to zero over the samples listed in coldata_df.
unexpressed_genes <- read_tsv(paste0(dirs$data_dir, "/", unified_dsets[dset_idx], "/matrisome_counts.tsv")) %>%
    dplyr::select(one_of("geneID", coldata_df$sample_name)) %>%
    dplyr::filter(rowSums(.[, -1]) == 0) %>%
    dplyr::pull(geneID)

# Normalized matrisome counts, one row per sample, joined with the sample
# annotations so that `condition` and `data_source` are available for the
# healthy/tumor split performed in a later cell.
# NOTE(review): the previous call passed only `dirs$data_dir` (a directory) to
# read_tsv(). The file name used here mirrors the normalized-counts load
# earlier in this notebook -- confirm it is the intended input.
matrisome_norm_counts_df <- read_tsv(paste0(dirs$data_dir, "/", unified_dsets[dset_idx], "/norm_matrisome_counts.tsv")) %>%
    transpose_df(future_colnames_col = "geneID", previous_colnames_col = "sample_name") %>%
    dplyr::inner_join(coldata_df, by = "sample_name")

# Trait table keyed by sample name: `figo_stage` is replaced by the columns
# returned from to_one_hot(), and `sample_name` is moved into the row names.
data_traits <- clinical_df %>%
    decode_figo_stage(to = "n") %>%
    dplyr::mutate(figo_stage = as.character(figo_stage)) %>%
    dplyr::bind_cols(to_one_hot(., col = "figo_stage")) %>%
    dplyr::select(-figo_stage) %>%
    column_to_rownames("sample_name")

In [None]:
# Display the gene IDs whose raw counts are zero in every selected sample
unexpressed_genes

In [None]:
# Split the normalized counts by `condition`, then drop the two annotation
# columns from each subset.
healthy_mn_counts_df <- dplyr::filter(matrisome_norm_counts_df, condition == "healthy") %>%
    dplyr::select(-condition, -data_source)
tumor_mn_counts_df <- dplyr::filter(matrisome_norm_counts_df, condition == "tumor") %>%
    dplyr::select(-condition, -data_source)

In [None]:
# Display the normalized matrisome counts for healthy samples
healthy_mn_counts_df