In [1]:
library(tidyverse)
library(WGCNA)
library(matrixStats)

# Custom package
library(rutils)

-- Attaching packages -------------------------------------------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --

v ggplot2 3.3.3     v purrr   0.3.4
v tibble  3.0.6     v dplyr   1.0.4
v tidyr   1.1.2     v stringr 1.4.0
v readr   1.4.0     v forcats 0.5.1

-- Conflicts ----------------------------------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()

Loading required package: dynamicTreeCut

Loading required package: fastcluster


Attaching package: 'fastcluster'


The following object is m

# Load data

In [2]:
# Session-wide setup: keep strings as character vectors when data frames are
# built, and switch on WGCNA's multi-threaded execution.
options(stringsAsFactors = FALSE)
WGCNA::enableWGCNAThreads()

Allowing parallel execution with up to 15 working processes.


In [3]:
# Project directory paths, resolved by the custom rutils helper from the
# dev-paths file one level above this notebook.
dirs <- rutils::get_dev_directories(dev_paths_file = "../dev_paths.txt")

# Sub-directory names (under dirs$data_dir) of the available unified datasets.
unified_dsets <- c(
    "unified_cervical_data",
    "unified_uterine_data",
    "unified_uterine_endometrial_data"
)

# Numeric coding of the vital-status labels; handed to load_survival_df().
event_code <- list(Alive = 0, Dead = 1)

In [4]:
# Position in unified_dsets of the dataset to process; 3 selects
# "unified_uterine_endometrial_data".
dset_idx <- 3

## Simple data

In [5]:
# Read the three tab-separated inputs for the selected dataset: sample
# annotations, matrisome counts and normalised matrisome counts.
coldata_df <- read_tsv(
    paste0(dirs$data_dir, "/", unified_dsets[dset_idx], "/coldata.tsv")
)
m_counts_df <- read_tsv(
    paste0(dirs$data_dir, "/", unified_dsets[dset_idx], "/matrisome_counts.tsv")
)
nm_counts_df <- read_tsv(
    paste0(
        dirs$data_dir, "/", unified_dsets[dset_idx],
        "/norm_matrisome_counts.tsv"
    )
)


[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------------------------------------------------------------------------------------[39m
cols(
  sample_name = [31mcol_character()[39m,
  condition = [31mcol_character()[39m,
  data_source = [31mcol_character()[39m
)



[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------------------------------------------------------------------------------------[39m
cols(
  .default = col_double(),
  geneID = [31mcol_character()[39m
)
[36mi[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m for the full column specifications.



[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------------------------------------------------------------------------------------[39m
cols(
  .de

## Manipulated data

In [6]:
# Clinical traits for tumor samples only. Samples missing any of vital status,
# survival time or FIGO stage are dropped, and the FIGO stage column is
# replaced by the columns that to_one_hot() returns for it.
tumor_clinical_data_df <- load_survival_df(
    paste0(dirs$data_dir, "/", unified_dsets[dset_idx], "/survival_data.tsv"),
    event_code
) %>%
    dplyr::select(sample_name, vital_status, survival_time, figo_stage) %>%
    # Keep only rows with no NA in any of the selected columns
    dplyr::filter(rowSums(is.na(.)) == 0) %>%
    # NOTE(review): presumably maps FIGO stage labels to numeric codes --
    # confirm against rutils::decode_figo_stage
    decode_figo_stage(to = "n") %>%
    dplyr::mutate(figo_stage = as.character(figo_stage)) %>%
    dplyr::bind_cols(to_one_hot(., col = "figo_stage")) %>%
    dplyr::select(-figo_stage) %>%
    # The following steps may not be necessary since FIGO stage should be NA
    # for healthy samples, but including anyway
    inner_join(coldata_df, by = "sample_name") %>%
    dplyr::filter(condition == "tumor") %>%
    dplyr::select(-c(condition, data_source))

# Restrict both count tables to the retained tumor samples. all_of() replaces
# the superseded one_of(): a sample missing from a counts table is now an
# error instead of a warning, so expression and trait data cannot silently
# end up covering different sample sets.
tumor_m_counts_df <- m_counts_df %>%
    dplyr::select(all_of(c("geneID", tumor_clinical_data_df$sample_name)))

tumor_nm_counts_df <- nm_counts_df %>%
    dplyr::select(all_of(c("geneID", tumor_clinical_data_df$sample_name)))

# Genes whose raw counts sum to zero across all retained tumor samples
# (geneID is the first column, hence the -1).
unexpressed_genes <- tumor_m_counts_df %>%
    dplyr::filter(rowSums(.[, -1]) == 0) %>%
    dplyr::pull(geneID)

# Expression table with sample names as row names, minus unexpressed genes.
# any_of() drops the listed genes that are present and ignores the rest.
# NOTE(review): assumes transpose_df() turns genes into columns and keeps the
# sample order of the input columns -- verify before relying on row alignment
# with data_traits.
data_expr <- tumor_nm_counts_df %>%
    transpose_df("geneID", "sample_name") %>%
    dplyr::select(-any_of(unexpressed_genes)) %>%
    column_to_rownames("sample_name")

# Trait table keyed by sample name, for use alongside data_expr.
data_traits <- tumor_clinical_data_df %>%
    column_to_rownames("sample_name")


[36m--[39m [1m[1mColumn specification[1m[22m [36m-------------------------------------------------------------------------------------------------------------------------------------------------------------------[39m
cols(
  sample_name = [31mcol_character()[39m,
  vital_status = [31mcol_character()[39m,
  survival_time = [32mcol_double()[39m,
  figo_stage = [31mcol_character()[39m,
  days_to_last_follow_up = [32mcol_double()[39m,
  days_to_death = [32mcol_double()[39m,
  age_at_diagnosis = [32mcol_double()[39m,
  age_at_index = [32mcol_double()[39m,
  height = [32mcol_double()[39m,
  weight = [32mcol_double()[39m,
  bmi = [32mcol_double()[39m,
  race = [31mcol_character()[39m,
  ethnicity = [31mcol_character()[39m
)




In [7]:
# Create the output directory (warnings suppressed, e.g. when it already
# exists), then save the expression and trait tables together in one .RData
# file named after the selected dataset.
dir.create(
    paste0(dirs$data_dir, "/saved_network_objects/"),
    showWarnings = FALSE
)
save(
    data_expr,
    data_traits,
    file = paste0(
        dirs$data_dir,
        "/saved_network_objects/",
        unified_dsets[dset_idx],
        "_tumor_data.RData"
    )
)