In [2]:
### library(reshape2)
library(tibble)
library(igraph)
library(xtable)

## Plotting
library(RColorBrewer)
library(ggplot2)
library(cowplot)
# library(rgl)
library(pROC)

library(umap)
library(pheatmap)
# library(VennDiagram)
# library(UpSetR)
# library(Rtsne)
# library(dendextend)
# library(repr)
options(repr.plot.width=8, repr.plot.height=4)

theme_set(theme_bw())


Attaching package: ‘igraph’

The following object is masked from ‘package:tibble’:

    as_data_frame

The following objects are masked from ‘package:stats’:

    decompose, spectrum

The following object is masked from ‘package:base’:

    union


********************************************************
Note: As of version 1.0.0, cowplot does not change the
  default ggplot2 theme anymore. To recover the previous
  behavior, execute:
  theme_set(theme_cowplot())
********************************************************

Type 'citation("pROC")' for a citation.

Attaching package: ‘pROC’

The following objects are masked from ‘package:stats’:

    cov, smooth, var



In [3]:
source("R/calc.R")
source("R/misc.R")
source("R/normalise.R")
source("R/plot.R")
source("R/predict.R")
source("R/subset.R")
source("R/utils.R")

## Import data

In [4]:
## Metadata
METADATA_RPATH <- "data/GSE67684/processed/metadata/sid-metadata_v2.tsv"
LABEL_RPATH <- "data/GSE67684/processed/metadata/pid-metadata_v4.tsv"
BATCH_RPATH <- "data/GSE67684/processed/metadata/sid-batch.tsv"

metadata_df <- read.table(METADATA_RPATH, sep = "\t")
yeoh_batch <- read.table(BATCH_RPATH, sep = "\t", header = T, row.names = 1)
yeoh_label <- read.table(LABEL_RPATH, sep = "\t", header = T, row.names = 1)
yeoh_label[] <- lapply(yeoh_label, as.factor)

## Subset of original data
# Removed outliers, patients with timepoints from different batches and batch 5
SUBSET_RPATH <- "data/GSE67684/processed/subset_yeoh.tsv"
raw_yeoh <- read.table(SUBSET_RPATH, sep = "\t")
# SCALE->REMOVE->FILTER->LOG
scaled_yeoh <- normaliseMeanScaling(raw_yeoh)
selected_yeoh <- removeProbesets(scaled_yeoh)
data <- log2_transform(filterProbesets(selected_yeoh, 0.7, metadata_df))

  P001_D0   P004_D0   P005_D0   P007_D0   P008_D0   P009_D0 
 5.796952  4.123342  3.981577  6.317643  4.841458 11.978124 
[1] "No. of ambiguous and AFFY probesets removed: 10503"
[1] D0 D0 D0 D0 D0 D0
Levels: D0 D8 N
           D0    D8     N
1053_at  TRUE  TRUE  TRUE
117_at  FALSE  TRUE  TRUE
121_at   TRUE  TRUE  TRUE
1294_at  TRUE  TRUE  TRUE
1316_at  TRUE  TRUE  TRUE
1320_at FALSE FALSE FALSE
[1] "No. of probesets removed = 6321"


#### Data
- 405 samples (201 D0, 201 D8, 3 N)
- Originally 210 patients
- Removed 2 outliers and 7 patients whose timepoints came from different batches

In [5]:
# Input files: D33 expression matrix and sample annotation
path1 <- "data/leuk_D33/processed/mas5_filtered.tsv"
path2 <- "data/GSE67684/processed/metadata/sid-metadata_v1.tsv"

yeoh_d33 <- read.table(path1, sep = "\t", header = TRUE)
annot_all <- read.table(path2, sep = "\t", header = TRUE)
annot_all[] <- lapply(annot_all, as.factor)

# Mean-scale, then log2-transform all probesets of the D33 data
yeoh_d33_allps <- log2_transform(normaliseMeanScaling(yeoh_d33))

# Samples that have been curated out of the data set no longer appear in
# annot_all, so membership in its row names decides which columns are kept
is_annotated <- colnames(yeoh_d33_allps) %in% rownames(annot_all)
missing_samples <- colnames(yeoh_d33_allps)[!is_annotated]

# Restrict to the probesets of `data` and to the annotated samples
yeoh_d33 <- yeoh_d33_allps[rownames(data), is_annotated]

P001_D33 P006_D33 P007_D33 P008_D33 P009_D33 P010_D33 
2.106859 2.132480 1.498776 2.559182 1.576584 2.657973 


In [6]:
annot_all$class_info <- factor(
  annot_all$class_info,
  levels = c('D0', 'D8', 'D33', 'N')
)

In [7]:
COL_LABEL <- c("darkolivegreen3", "tomato3")

# Annotation of the samples present in `data`, with every column as a factor
annot <- metadata_df[colnames(data), ]
annot[] <- lapply(annot, as.factor)

# List subtypes
# as.factor() keeps this working when read.table() leaves strings as
# character (default since R 4.0): levels() of a character vector is NULL,
# which would silently empty all three subtype lists below.
subtypes9 <- levels(as.factor(metadata_df$subtype))
subtypes7 <- setdiff(subtypes9, c("Hypodiploid", "Normal"))
subtypes5 <- setdiff(subtypes7, c("Hyperdiploid", "Others"))

In [8]:
# Define train/test split
# Patients with a missing d33_mrd value; each contributes a D0 and a D8 sample
pid_mrd_na <- rownames(yeoh_label)[is.na(yeoh_label$d33_mrd)]
sid_mrd_na <- paste0(rep(pid_mrd_na, each = 2), c("_D0", "_D8"))

# Reusable sample masks over the rows of annot
has_mrd <- !(rownames(annot) %in% sid_mrd_na)
in_subtypes7 <- !(annot$subtype %in% c("Hypodiploid", "Normal"))
in_subtypes5 <- !(
  annot$subtype %in% c("Hypodiploid", "Hyperdiploid", "Others", "Normal")
)

sid_alltrain_local <- rownames(annot)[in_subtypes7 & has_mrd]
sid_alltrain <- rownames(annot)[in_subtypes5 & has_mrd]
# Batches 1-7 form the training set, batches 8-10 the test set
sid_train <- rownames(annot)[
  annot$batch_info %in% 1:7 & in_subtypes5 & has_mrd
]
sid_test <- rownames(annot)[
  annot$batch_info %in% 8:10 & in_subtypes5 & has_mrd
]

sid_remission <- rownames(annot)[annot$label == 0]
sid_normal <- paste0("N0", c(1, 2, 4))

### Subnetworks

In [9]:
# 1. Removes affymetrix ambiguous and control probesets
# 2. Map probesets to IDs
# Removes one-to-many probesets and probesets with no ID
# Selects maximum if two probesets match to same gene
# CHECK: What microarray platform is the data from?
data_gfs <- normaliseGFS(data, lower = .6)
ENTREZ_GPL570 <- "../info/microarray/HG-U133_Plus_2/annot_entrez-GPL570.tsv"
# entrez_data <- affy2id(data, ENTREZ_GPL570)
entrez_gfs <- affy2id(data_gfs, ENTREZ_GPL570)

## SYMBOL_GPL570 <- "../info/microarray/HG-U133_Plus_2/annot_genesymbol-GPL570.tsv"
## symbol_yeoh <- affy2id(data_yeoh, SYMBOL_GPL570)

Top 0.05 of expressed genes are assigned GFS scores of 1
Genes below the top 0.60 of expressed genes are assigned GFS scores of 0
No. of probesets with no ID removed: 160
Total no. of probesets removed (incl. probesets mapping to same gene): 746


In [10]:
## Import NEA subnetworks
NEA_RPATH <- paste0("../diff_expr/data/subnetwork/nea-hsa/",
                    "ovarian_cancer/geneset-nea_kegg_ovarian.tsv")
nea_df <- read.table(NEA_RPATH, sep = "\t", header = T, stringsAsFactors = F)
subnetwork_nea <- split(as.character(nea_df$gene_id), nea_df$subnetwork_id)

In [11]:
#' Calculates QPSP profiles of samples
#'
#' For every complex whose members are all present in the row names of X,
#' the profile value of a sample is the mean of that sample's values over the
#' members of the complex. Complexes with any missing member are discarded.
#' Rownames of X has to be the same annotation type as list of network vectors
#'
#' @param X dataframe of discrete GFS-transformed expression data with
#' features in rows and samples in columns
#' @param list_complex list of vectors that contain components of networks
#' @return dataframe of QPSP profiles with complexes in rows and samples in
#' columns (zero rows if no complex is completely represented in X)
calc_qpsp <- function(X, list_complex) {
  X_mat <- as.matrix(X)
  feature_ids <- rownames(X_mat)
  n_samples <- ncol(X_mat)

  # IDs are matched by name, so coerce them (e.g. numeric Entrez IDs) to
  # character; otherwise they would be used as row positions
  list_complex <- lapply(list_complex, as.character)

  # Complexes with all IDs present
  is_complete <- vapply(
    list_complex,
    function(ids) all(ids %in% feature_ids),
    logical(1)
  )
  complete_complexes <- list_complex[is_complete]

  # samples x complexes; vapply pins the shape of each per-complex result
  qpsp <- vapply(
    complete_complexes,
    function(ids) colMeans(X_mat[ids, , drop = FALSE]),
    numeric(n_samples)
  )
  # Re-impose the dimensions explicitly: with a single sample (or a single /
  # no complete complex) the simplified result would otherwise lose its shape
  qpsp <- matrix(
    qpsp,
    nrow = n_samples,
    ncol = length(complete_complexes),
    dimnames = list(colnames(X_mat), names(complete_complexes))
  )

  data.frame(t(qpsp))
}

In [12]:
qpsp_nea <- calc_qpsp(entrez_gfs, subnetwork_nea)

In [17]:
# write.table(qpsp_nea, 'tmp/nea_gfs.tsv', quote = F, sep = '\t')

### Plots

In [154]:
## Subsetting data
d0_qpsp <- qpsp_nea[, annot[colnames(qpsp_nea), 'class_info'] == 'D0']
d0_hyper <- d0_qpsp[,
  annot[colnames(d0_qpsp), 'subtype'] == 'Hyperdiploid']
d0_others <- d0_qpsp[,
  annot[colnames(d0_qpsp), 'subtype'] == 'Others']

qpsp_hvg <- select_hvg(qpsp_nea, 100)
d0_qpsp_hvg <- qpsp_hvg[, annot[colnames(qpsp_hvg), 'class_info'] == 'D0']
d0_hyper_hvg <- d0_qpsp_hvg[,
  annot[colnames(d0_qpsp_hvg), 'subtype'] == 'Hyperdiploid']

In [155]:
# # Colour settings
# # set1_colours <- ggplot_palette(9)
# set1_colours <- brewer.pal(n = 9, name = "Set1")
# names(set1_colours) <- rev(levels(annot$subtype))
# annot_colours <- list(subtype = set1_colours) 

# pheatmap(
#   qpsp_hvg,
# #   col = brewer.pal(n = 11, name = "RdBu"),
#   col = brewer.pal(9, "Blues"),
#   display_numbers = F, legend = T, border_color = NULL,
#   cluster_method = "complete", cluster_rows = T, cluster_cols = T,
#   clustering_distance_rows = "euclidean", clustering_distance_cols = "euclidean",
#   scale = "none", show_colnames = F, show_rownames = F,
#   annotation_col = annot, annotation_colors = annot_colours,
#   fontsize = 6, cellwidth = 2, cellheight = 3,
#   filename = "~/Dropbox/tmp/heatmap-qpsp_hvg100.pdf"
# )

In [156]:
# umap_obj <- umap(t(d0_hyper))
# ax <- plot_umap(umap_obj, annot, col = 'subtype')

# PCA of D0 hyperdiploid samples: colour by subtype, point shape by label
ax <- plot_pca(d0_hyper, annot, col = 'subtype', pch = 'label')
# `height` spelled out in full; `heigh` only worked via partial matching
ggsave('~/Dropbox/tmp/qpsp_pca-d0_hyperdiploid.pdf', ax, width = 9, height = 5)

In [147]:
#' Scatter plot of the first two principal components
#'
#' Fits a PCA on the samples of X with prcomp, optionally projects the
#' samples of newdata into the same space, and plots PC1 against PC2 with
#' the proportion of variance explained in the axis labels.
#'
#' @param X dataframe with features as rows and samples as columns
#' @param annot dataframe of annotation with samples as rows
#' @param cex point size passed to geom_point
#' @param newdata dataframe of data to be predicted by prcomp object;
#' samples whose names already occur in X are not added again
#' @param ... optional arguments are passed to aes_string in ggplot; every
#' value is treated as the name of a column in annot
#' @return ggplot object
plot_pca <- function(X, annot, cex = 2, newdata = NULL, ...) {
  # PCA
  pca_obj <- prcomp(t(X))
  # Keep up to three components; fewer exist when there are few samples
  n_pc <- min(3L, ncol(pca_obj$x))
  if (n_pc < 2L)
    stop("At least two principal components are required for plotting")
  idx_pc <- seq_len(n_pc)

  Z <- data.frame(pca_obj$x[, idx_pc, drop = FALSE])
  eigenvalues <- (pca_obj$sdev)^2
  var_pc <- eigenvalues[idx_pc] / sum(eigenvalues)
  pc_labels <- sprintf("PC%d (%.2f%%)", idx_pc, var_pc * 100)

  # Projects newdata into PCA space
  if (!is.null(newdata)) {
    Z_new <- predict(pca_obj, newdata = t(newdata))[, idx_pc, drop = FALSE]
    # Remove duplicate rows; drop = FALSE keeps the sample name when only
    # one projected sample remains
    Z_new <- Z_new[!(rownames(Z_new) %in% rownames(Z)), , drop = FALSE]
    Z <- rbind(Z, data.frame(Z_new))
  }

  # concat with annotations
  annot_cols <- unlist(list(...))
  annot1 <- annot[rownames(Z), annot_cols, drop = FALSE]
  Z_annot <- cbind(Z, annot1)

  # NOTE: aes_string is kept for compatibility with the rest of the file
  ggplot(Z_annot, aes_string(x = "PC1", y = "PC2", ...)) +
    geom_point(cex = cex) +
    labs(x = pc_labels[1], y = pc_labels[2]) +
    geom_vline(xintercept = 0, color = "black", alpha = 0.5) +
    geom_hline(yintercept = 0, color = "black", alpha = 0.5)
}

In [158]:
ax <- plot_pca(d0_others, annot, newdata = d0_qpsp, col = 'subtype')
ggsave('~/Dropbox/tmp/qpsp_pca-d0_others_project.pdf', ax, width = 9, height = 5)

#### Only half of the KEGG ids are represented in the data

In [None]:
# ## Import KEGG pathways
# KEGG_RPATH <- "../info/KEGG/kegg-human_allpathway_genes.tsv"
# kegg_df <- read.table(KEGG_RPATH, sep = "\t", header = T, stringsAsFactors = F)
# kegg_df$Pathway <- substring(kegg_df$Pathway, 6)
# list_kegg <- split(kegg_df$Entrez.Gene.ID, kegg_df$Pathway)

In [None]:
# # kegg_size <- sapply(subnetwork_pwapi, length)
# # kegg_hit <- sapply(subnetwork_pwapi,
# #                    function(ids) sum(ids %in% rownames(araw)))

# ## Only include the IDs represented in the data
# incomplete_kegg <- lapply(list_kegg,
#                           function(ids) ids[ids %in% rownames(araw)])
# kegg_size <- sapply(incomplete_kegg, length)
# SIZE <- 5
# incomplete_kegg_sub <- incomplete_kegg[kegg_size > SIZE]
# kegg_size1 <- sapply(incomplete_kegg_sub, length)

In [None]:
# ## Import pathwayAPI
# PWAPI_RPATH <- "../info/pathwayAPI/pwapi_id_human-filtered_entrez.tsv"
# pwapi_df <- read.table(PWAPI_RPATH, sep = "\t", header = T, stringsAsFactors = F)
# list_pwapi <- split(pwapi_df[,2:3], pwapi_df$pathway_id)
# subnetwork_pwapi <- lapply(list_pwapi,
#                            function(X) unique(as.numeric(data.matrix(X))))

In [None]:
# pwapi_size <- sapply(subnetwork_pwapi, length)
# pwapi_hit <- sapply(subnetwork_pwapi,
#                     function(ids) sum(ids %in% rownames(araw)))

# ## Only include the IDs represented in the data in pathwayAPI
# subnetwork_pwapi_sub <- lapply(subnetwork_pwapi,
#                                 function(ids) ids[ids %in% rownames(araw)])
# pwapi_size1 <- sapply(subnetwork_pwapi_sub, length)
# SIZE <- 5
# subnetwork_pwapi_sub1 <- subnetwork_pwapi_sub[pwapi_size1 > SIZE]
# pwapi_size2 <- sapply(subnetwork_pwapi_sub1, length)
# print(cbind(pwapi_hit[pwapi_size1 > SIZE], pwapi_size2))

## Subnetworks

In [None]:
# ## NEA - GFS
# file <- 'tmp/nea_gfs.tsv'
# raw_nea <- read.table(file, sep = "\t", header = T, row.names = 1)
# nea_fltr <- remove_rows(raw_nea, sum(row) == 0)

In [None]:
# # Subnetworks are resistant to batch effects
# batch_subnetworks <- identify_batch_features(nea_fltr, annot, method = 'aov')
# length(batch_subnetworks)

In [None]:
# # Prediction (Drug genes) --------------------------------------------
# ## Drug responsive genes
# #' @param X_subtype df of patients from a specific subtype (D0 followed by D8)
# select_subnetworks <- function(
#   X_subtype, sid_remission,
#   alpha = 0.05, N = 50
# ) {
#   sid <- intersect(sid_remission, colnames(X_subtype))
#   X_subtype_remission <- X_subtype[, sid, drop = F]
#   n_pairs <- ncol(X_subtype_remission) / 2
  
#   if (!is_paired(X_subtype_remission))
#     stop("Patient IDs are not paired..")
  
#   # P-value
#   pvalue <- calc_ttest(X_subtype_remission, n_pairs, is_paired = T) # nan values!
#   features <- names(pvalue)[pvalue < alpha & !is.na(pvalue)]
  
#   return(features)
# }

In [None]:
#' Predicts relapse for one subtype from selected subnetwork features
#'
#' Does not perform PCA transform on data
#' Used to predict relapse for all subtypes
#'
#' Features are chosen by select_subnetworks on the remission samples
#' (label == 0), optionally minus batch_features. The selected features of
#' the subtype and normal samples are passed to compute_features, the
#' d33_mrd column is appended, and the result is handed to predict_plot.
#'
#' @param subtype name of the subtype, used to index list_X
#' @param list_X list of data (features in rows, samples in columns), one
#' element per subtype
#' @param X_normal data of normal samples with features in rows
#' @param metadata dataframe indexed by sample ID with a "label" column
#' @param metadata_mrd dataframe with a "d33_mrd" column, indexed by the
#' row names of the output of compute_features
#' @param batch_features optional vector of features to exclude
#' @param save.rds if TRUE, saves list(response, normal) to
#' tmp/subnetworks-<subtype>.RDS
#' @return object returned by predict_plot
predict_subnetwork <- function(
  subtype, list_X, X_normal,
  metadata, metadata_mrd,
  batch_features = NULL,
  save.rds = FALSE
) {
  X_subtype <- list_X[[subtype]]

  # which() drops samples with a missing label instead of yielding NA IDs
  sid_remission <- colnames(X_subtype)[
    which(metadata[colnames(X_subtype), "label"] == 0)
  ]

  class_features <- select_subnetworks(X_subtype, sid_remission)
  if (is.null(batch_features)) {
    selected_genes <- class_features
  } else {
    selected_genes <- setdiff(class_features, batch_features)
  }

  cat(sprintf("No. of selected genes = %d\n", length(class_features)))
  cat(sprintf("No. of final genes = %d\n", length(selected_genes)))

  # Subtype and normal samples
  # drop = FALSE keeps samples in rows after t() when one feature is selected
  response <- t(X_subtype[selected_genes, , drop = FALSE])
  normal <- t(X_normal[selected_genes, , drop = FALSE])

  if (save.rds) {
    file <- sprintf('tmp/subnetworks-%s.RDS', subtype)
    saveRDS(list(response, normal), file)
  }

  # Collate MRD results as well
  V <- compute_features(response, normal, colnames(X_subtype), sid_remission)
  V$mrd <- metadata_mrd[rownames(V), "d33_mrd"]

  predict_plot(
    V, metadata,
    bigpos_names = "angle_d0d8_d0normal",
    smallpos_names = c("erm1_ratio2", "l2norm_ratio2", "mrd")
  )
}

In [None]:
X_normal <- nea_fltr[, sid_normal]
list_X_subtypes7 <- lapply(
  subtypes7,
  function(subtype) nea_fltr[,
    sid_alltrain_local[annot[sid_alltrain_local, "subtype"] == subtype]
  ]
)
names(list_X_subtypes7) <- subtypes7

list_obj1 <- lapply(
  subtypes7,
  predict_subnetwork,
  list_X_subtypes7,
  X_normal, annot, yeoh_label,
  save.rds = FALSE
)

In [None]:
list_boxplots <- lapply(list_obj1, function(obj) obj$plot)
names(list_boxplots) <- subtypes7

In [None]:
for (subtype in subtypes7) {
  wpath <- sprintf('~/Dropbox/tmp/nea_gfs-%s.pdf', subtype)
  ggsave(wpath, list_boxplots[[subtype]], width = 9, height = 2.5)
}

### Concatenate subnetwork features

In [None]:
# Save scaled subnetwork values to concatenate with probeset values
# Unit scaling of each row
scaled_nea <- t(scale(t(nea_fltr)))

X_normal <- scaled_nea[, sid_normal]
list_X_subtypes7 <- lapply(
  subtypes7,
  function(subtype) scaled_nea[,
    sid_alltrain_local[metadata_sid[sid_alltrain_local, "subtype"] == subtype]
  ]
)
names(list_X_subtypes7) <- subtypes7
list_obj1 <- lapply(
  subtypes7,
  predict_subnetwork,
  list_X_subtypes7,
  X_normal, metadata_sid, yeoh_label,
  save.rds = TRUE
)

In [None]:
#' Concatenates probesets with subnetworks
#'
#' Does not perform PCA transform on data
#' Used to predict relapse for all subtypes
#'
#' Selected features of the subtype are column-bound to the subnetwork
#' features cached in tmp/subnetworks-<subtype>.RDS, a list whose first
#' element holds the subtype samples and whose second element holds the
#' normal samples. The cache has to exist before this function is called.
#'
#' @param subtype name of the subtype, used to index list_X and to locate
#' the cached RDS file
#' @param list_X list of data (features in rows, samples in columns), one
#' element per subtype
#' @param X_normal data of normal samples with features in rows
#' @param metadata dataframe indexed by sample ID with a "label" column
#' @param metadata_mrd dataframe with a "d33_mrd" column, indexed by the
#' row names of the output of compute_features
#' @param batch_genes optional vector of features to exclude
#' @return object returned by predict_plot
predict_concat <- function(
  subtype, list_X, X_normal,
  metadata, metadata_mrd,
  batch_genes = NULL
) {
  X_subtype <- list_X[[subtype]]
  # which() drops samples with a missing label instead of yielding NA IDs
  sid_remission <- colnames(X_subtype)[
    which(metadata[colnames(X_subtype), "label"] == 0)
  ]
  # Scaled features will fail selection by getLocalGenes due to expression threshold
  class_genes <- select_subnetworks(X_subtype, sid_remission)

  if (is.null(batch_genes)) {
    selected_genes <- class_genes
  } else {
    selected_genes <- setdiff(class_genes, batch_genes)
  }

  cat(sprintf("No. of selected genes = %d\n", length(class_genes)))
  cat(sprintf("No. of final genes = %d\n", length(selected_genes)))

  file <- sprintf('tmp/subnetworks-%s.RDS', subtype)
  if (!file.exists(file))
    stop("Cached subnetwork features not found: ", file)
  subnetworks <- readRDS(file)

  # Subtype and normal samples
  # drop = FALSE keeps samples in rows after t() when one feature is selected
  response_ps <- t(X_subtype[selected_genes, , drop = FALSE])
  normal_ps <- t(X_normal[selected_genes, , drop = FALSE])

  # cbind matches rows by position only, so the cached samples have to be
  # the same samples in the same order
  stopifnot(
    identical(rownames(response_ps), rownames(subnetworks[[1]])),
    identical(rownames(normal_ps), rownames(subnetworks[[2]]))
  )
  response <- cbind(response_ps, subnetworks[[1]])
  normal <- cbind(normal_ps, subnetworks[[2]])

  # Collate MRD results as well
  V <- compute_features(response, normal, colnames(X_subtype), sid_remission)
  V$mrd <- metadata_mrd[rownames(V), "d33_mrd"]

  predict_plot(
    V, metadata,
    bigpos_names = "angle_d0d8_d0normal",
    smallpos_names = c("erm1_ratio2", "l2norm_ratio2", "mrd")
  )
}

In [None]:
# Unit scaling of each row
data_scaled <- t(scale(t(data)))

X_normal <- data_scaled[, sid_normal]
list_X_subtypes7 <- lapply(
  subtypes7,
  function(subtype, sid) data_scaled[, sid[metadata_sid[sid, "subtype"] == subtype]],
  sid_alltrain_local
)
names(list_X_subtypes7) <- subtypes7

list_obj1 <- lapply(
  subtypes7,
  predict_concat,
  list_X_subtypes7,
  X_normal, metadata_sid, yeoh_label,
  batch_ps
)

In [None]:
list_boxplots <- lapply(list_obj1, function(obj) obj$plot)
names(list_boxplots) <- subtypes7
for (subtype in subtypes7) {
  wpath <- sprintf('~/Dropbox/tmp/concat_features-%s.pdf', subtype)
  ggsave(wpath, list_boxplots[[subtype]], width = 9, height = 2.5)
}

In [None]:
# Ablation test: Scaled probesets only
# Unit scaling of each row
data_scaled <- t(scale(t(data)))
X_normal <- data_scaled[, sid_normal]
list_X_subtypes7 <- lapply(
  subtypes7,
  function(subtype, sid) data_scaled[, sid[metadata_sid[sid, "subtype"] == subtype]],
  sid_alltrain_local
)
names(list_X_subtypes7) <- subtypes7

list_obj1 <- lapply(
  subtypes7,
  predict_subnetwork,
  list_X_subtypes7,
  X_normal, metadata_sid, yeoh_label,
  batch_ps,
  save.rds = FALSE
)

In [None]:
list_boxplots <- lapply(list_obj1, function(obj) obj$plot)
names(list_boxplots) <- subtypes7
for (subtype in subtypes7) {
  wpath <- sprintf('~/Dropbox/tmp/scaled_pval-%s.pdf', subtype)
  ggsave(wpath, list_boxplots[[subtype]], width = 9, height = 2.5)
}

### Plot: DE subnetworks

In [None]:
X_normal <- nea_fltr[, sid_normal]
list_X_subtypes7 <- lapply(
  subtypes7,
  function(subtype) nea_fltr[,
    sid_alltrain_local[metadata_sid[sid_alltrain_local, "subtype"] == subtype]
  ]
)
names(list_X_subtypes7) <- subtypes7

list_obj1 <- lapply(
  subtypes7,
  predict_subnetwork,
  list_X_subtypes7,
  X_normal, metadata_sid, yeoh_label,
  save.rds = TRUE
)

In [None]:
list_differential_subnetworks <- list()
for (subtype in subtypes7) {
  file <- sprintf('tmp/subnetworks-%s.RDS', subtype) 
  obj <- readRDS(file)
  list_differential_subnetworks[[subtype]] <- colnames(obj[[1]])
}
differential_subnetworks <- unique(do.call(c, list_differential_subnetworks))

In [None]:
d0_nea_differential <- nea_fltr[
  differential_subnetworks,
  metadata_sid[colnames(nea_fltr), 'class_info'] == 'D0'
]
umap_obj <- umap(t(d0_nea_differential))
ax_umap <- ggplot_umap(umap_obj, metadata_sid, col = 'subtype')
ggsave('~/Dropbox/tmp/umap-differential_nea.pdf',
       ax_umap, width = 9, height = 5)

In [None]:
ggplot_pca(d0_nea_differential, metadata_sid, col = 'subtype')

## Subnetworks

In [None]:
# 1. Removes affymetrix ambiguous and control probesets
# 2. Map probesets to IDs
# Removes one-to-many probesets and probesets with no ID
# Selects maximum if two probesets match to same gene
# CHECK: What microarray platform is the data from?
ENTREZ_GPL570 <- "../info/microarray/HG-U133_Plus_2/annot_entrez-GPL570.tsv"
entrez_data <- affy2id(data, ENTREZ_GPL570)

## SYMBOL_GPL570 <- "../info/microarray/HG-U133_Plus_2/annot_genesymbol-GPL570.tsv"
## symbol_yeoh <- affy2id(data_yeoh, SYMBOL_GPL570)

In [None]:
# Calculate individual D0-N magnitude (normal centroid)
normal <- entrez_data[, startsWith(colnames(entrez_data), "N")]
D0 <- entrez_data[, endsWith(colnames(entrez_data), "D0")]
D8 <- entrez_data[, endsWith(colnames(entrez_data), "D8")]

#### Unfiltered data

In [None]:
raw_data <- log2_transform(selected_yeoh)
araw <- affy2id(raw_data, ENTREZ_GPL570)
lidx1 <- sapply(subnetwork_nea,
                function(ids) all(ids %in% rownames(araw)))
subnetwork2 <- subnetwork_nea[lidx1]
print(length(subnetwork2))
                
# Calculate individual D0-N magnitude (normal centroid)
normal_raw <- araw[, startsWith(colnames(araw), "N")]
D0_raw <- araw[, endsWith(colnames(araw), "D0")]
D8_raw <- araw[, endsWith(colnames(araw), "D8")]

In [None]:
## Import NEA subnetworks
NEA_RPATH <- paste0("../diff_expr/data/subnetwork/nea-hsa/",
                    "ovarian_cancer/geneset-nea_kegg_ovarian.tsv")
nea_df <- read.table(NEA_RPATH, sep = "\t", header = T, stringsAsFactors = F)
subnetwork_nea <- split(as.character(nea_df$gene_id), nea_df$subnetwork_id)

In [None]:
# Filter out subnetworks with genes that are missing in the data
lidx <- sapply(subnetwork_nea,
               function(ids) all(ids %in% rownames(entrez_data)))
nea_fltr <- subnetwork_nea[lidx]
               
lidx1 <- sapply(subnetwork_nea,
               function(ids) all(ids %in% rownames(araw)))
nea_unfltr <- subnetwork_nea[lidx1]

#### Only half of the KEGG ids are represented in the data

In [None]:
## Import KEGG pathways
KEGG_RPATH <- "../info/KEGG/kegg-human_allpathway_genes.tsv"
kegg_df <- read.table(KEGG_RPATH, sep = "\t", header = T, stringsAsFactors = F)
kegg_df$Pathway <- substring(kegg_df$Pathway, 6)
list_kegg <- split(kegg_df$Entrez.Gene.ID, kegg_df$Pathway)

In [None]:
# kegg_size <- sapply(subnetwork_pwapi, length)
# kegg_hit <- sapply(subnetwork_pwapi,
#                    function(ids) sum(ids %in% rownames(araw)))

## Only include the IDs represented in the data
incomplete_kegg <- lapply(list_kegg,
                          function(ids) ids[ids %in% rownames(araw)])
kegg_size <- sapply(incomplete_kegg, length)
SIZE <- 5
incomplete_kegg_sub <- incomplete_kegg[kegg_size > SIZE]
kegg_size1 <- sapply(incomplete_kegg_sub, length)

In [None]:
## Import pathwayAPI
PWAPI_RPATH <- "../info/pathwayAPI/pwapi_id_human-filtered_entrez.tsv"
pwapi_df <- read.table(PWAPI_RPATH, sep = "\t", header = T, stringsAsFactors = F)
list_pwapi <- split(pwapi_df[,2:3], pwapi_df$pathway_id)
subnetwork_pwapi <- lapply(list_pwapi,
                           function(X) unique(as.numeric(data.matrix(X))))

In [None]:
pwapi_size <- sapply(subnetwork_pwapi, length)
pwapi_hit <- sapply(subnetwork_pwapi,
                    function(ids) sum(ids %in% rownames(araw)))

## Only include the IDs represented in the data in pathwayAPI
subnetwork_pwapi_sub <- lapply(subnetwork_pwapi,
                                function(ids) ids[ids %in% rownames(araw)])
pwapi_size1 <- sapply(subnetwork_pwapi_sub, length)
SIZE <- 5
subnetwork_pwapi_sub1 <- subnetwork_pwapi_sub[pwapi_size1 > SIZE]
pwapi_size2 <- sapply(subnetwork_pwapi_sub1, length)
print(cbind(pwapi_hit[pwapi_size1 > SIZE], pwapi_size2))

#### Subnetwork deltas (D0-N)
- Comparing variance of subnetwork deltas was not conclusive
- Means of the random sample seem to have a larger proportion of higher mean subnetworks

- Sum of deltas (D0-D8) is not a good feature
- Percentage of sum of deltas is not a good feature
    - Does not take into account the sign of the logfc
    - Does not consider individual subnetwork
- Percentage of each subnetwork
    - Filter out subnetwork that has delta_D0N of less than threshold?

In [None]:
#' Mean absolute change of a subnetwork's genes for every patient
#'
#' Subtracts the D0 values (X) of the subnetwork's genes from the values in
#' Y and averages the absolute differences over the genes.
#'
#' @param ids vector of gene IDs belonging to a subnetwork; numeric and
#' factor IDs are converted to character and matched by name
#' @param X dataframe of expr values at D0
#' @param Y vector or dataframe of expr values at specific timepoint; when
#' two-dimensional, the first four characters of its column names have to
#' match those of X position by position
#' @return vector of deltas of patients of a particular subnetwork
calc_subnetwork_delta <- function(ids, X, Y) {
  # Always match by name: numeric IDs and factor codes would otherwise be
  # interpreted as row positions
  ids <- as.character(ids)

  # drop = FALSE keeps two dimensions for single-gene subnetworks
  X_sub <- X[ids, , drop = FALSE]

  if (is.null(dim(Y))) {
    # Y is a per-gene vector (e.g. a centroid) shared by all patients
    logfc <- Y[ids] - X_sub
  } else {
    # Checking the column count first prevents silent recycling in the
    # element-wise comparison of the patient ID prefixes
    stopifnot(
      ncol(X) == ncol(Y),
      substring(colnames(X), 1, 4) == substring(colnames(Y), 1, 4)
    )
    logfc <- Y[ids, , drop = FALSE] - X_sub
  }
  colMeans(abs(logfc))
}

#### Filtered data - NEA

In [None]:
norm_centroid <- rowMeans(normal)

delta_D0N <- data.frame(t(sapply(nea_fltr,
                                 calc_subnetwork_delta,
                                 D0, norm_centroid)))
delta_D0D8 <- data.frame(t(sapply(nea_fltr,
                                  calc_subnetwork_delta,
                                  D0, D8)))

In [None]:
subtype <- subtypes[[1]]
print(subtype)
delta_D0N_1 <- delta_D0N[
  , Y[colnames(delta_D0N), "subtype"] == subtype
]
delta_D0D8_1 <- delta_D0D8[
  , Y[colnames(delta_D0D8), "subtype"] == subtype
]

stopifnot(substring(colnames(delta_D0N_1),1,4) ==
          substring(colnames(delta_D0D8_1),1,4))

mu_delta <- apply(delta_D0N_1, 1, mean)
var_delta <- apply(delta_D0N_1, 1, var)
# plot(mu_delta, var_delta)

THRESHOLD <- 3
idx_subnetwork <- names(mu_delta)[mu_delta > THRESHOLD]
print(length(idx_subnetwork))
delta_D0D8_2 <- delta_D0D8_1[idx_subnetwork, ]
delta_D0N_2 <- delta_D0N_1[idx_subnetwork, ]

# print(colnames(delta_D0D8_1)[
#   Y[colnames(delta_D0D8_1), "label"] == 1
# ])

pct_delta <- colMeans(delta_D0D8_2) / colMeans(delta_D0N_2)

par(mfrow=c(1,3))
plot(colMeans(delta_D0N_2), col = Y[names(pct_delta), "label"] + 1)
plot(colMeans(delta_D0D8_2), col = Y[names(pct_delta), "label"] + 1)
plot(pct_delta, col = Y[names(pct_delta), "label"] + 1)

In [None]:
# NOTE(review): this cell sits under "Filtered data - NEA", but it plots
# raw_delta_D0D8_1, which is only assigned in the "Unfiltered data - NEA"
# cells. Confirm whether delta_D0D8_1 was intended here.
plotPCA2D(raw_delta_D0D8_1, Y)

#### Unfiltered data - NEA

In [None]:
norm_centroid1 <- rowMeans(normal_raw)

raw_delta_D0N <- data.frame(t(sapply(nea_unfltr,
                                 calc_subnetwork_delta,
                                 D0_raw, norm_centroid1)))
raw_delta_D0D8 <- data.frame(t(sapply(nea_unfltr,
                                  calc_subnetwork_delta,
                                  D0_raw, D8_raw)))

In [None]:
for (subtype in subtypes) {
  print(subtype)
  raw_delta_D0N_1 <- raw_delta_D0N[
    , Y[colnames(raw_delta_D0N), "subtype"] == subtype
  ]
  raw_delta_D0D8_1 <- raw_delta_D0D8[
    , Y[colnames(raw_delta_D0D8), "subtype"] == subtype
  ]

  stopifnot(substring(colnames(raw_delta_D0N_1),1,4) ==
            substring(colnames(raw_delta_D0D8_1),1,4))

  mu_delta <- apply(raw_delta_D0N_1, 1, mean)
  var_delta <- apply(raw_delta_D0N_1, 1, var)
  # plot(mu_delta, var_delta)
  THRESHOLD <- 3
  idx_subnetwork <- names(mu_delta)[mu_delta > THRESHOLD]

  ## Selecting subnetworks with significant dysregulation
  raw_delta_D0D8_2 <- raw_delta_D0D8_1[idx_subnetwork, ]
  raw_delta_D0N_2 <- raw_delta_D0N_1[idx_subnetwork, ]

  # print(colnames(delta_D0D8_1)[
  #   Y[colnames(delta_D0D8_1), "label"] == 1
  # ])

  pct_delta <- raw_delta_D0D8_2 / raw_delta_D0N_2
  
  #   pct_delta_rem <- pct_delta[, Y[colnames(pct_delta), "label"] == 0]
#   mean_pct <- rowMeans(pct_delta_rem)
#   PCT <- 0.7
#   nea_responsive <- names(mean_pct)[mean_pct > PCT]
#   print(length(nea_responsive))
#   pct_delta_responsive <- colMeans(pct_delta[nea_responsive, ])

  fpath <- sprintf("~/Dropbox/temp/delta-%s.pdf", subtype)
  pdf(fpath, 7, 3)
  par(mfrow=c(1,3))
  plot(colMeans(raw_delta_D0N_2),
       col = Y[names(pct_delta), "label"] + 1,
       ylab = "mean(delta_D0N)",
       pch = 16, cex = 1.5)
  plot(colMeans(raw_delta_D0D8_2),
       col = Y[names(pct_delta), "label"] + 1,
       ylab = "mean(delta_D0D8)",
       pch = 16, cex = 1.5)
  plot(colMeans(pct_delta),
       col = Y[names(pct_delta), "label"] + 1,
       ylab = "mean(pct_delta)",
       pch = 16, cex = 1.5)
#   plot(pct_delta_responsive,
#        col = Y[names(pct_delta), "label"] + 1,
#        ylab = "mean_responsive(pct_delta)")
  dev.off()
  
  ## Relapse prediction
  pid_subtype <- rownames(Y)[Y$subtype == subtype]
  feat <- predict_relapse(pid_subtype, data, pid_remission, Y,
                          batch_genes = batch_genes)  
  
  fpath <- sprintf("~/Dropbox/temp/delta_corr-%s.pdf", subtype)
  pdf(fpath, 7, 3)
  par(mfrow=c(1,3))
  plot(colMeans(pct_delta), feat[, "erm1_ratio2"],
       col = Y[names(pct_delta), "label"] + 1,
       ylab = "erm_ratio",
       pch = 16, cex = 1.5)
  plot(colMeans(pct_delta), feat[, "l2norm_ratio2"],
       col = Y[names(pct_delta), "label"] + 1,
       ylab = "l2norm_ratio",
       pch = 16, cex = 1.5)
  plot(colMeans(pct_delta), feat[, "angle_d0d8_normal"],
       col = Y[names(pct_delta), "label"] + 1,
       ylab = "angle_d0d8_normal",
       pch = 16, cex = 1.5)
  dev.off()
}

In [None]:
subtype <- subtypes[[1]]
print(subtype)
raw_delta_D0N_1 <- raw_delta_D0N[
  , Y[colnames(raw_delta_D0N), "subtype"] == subtype
]
raw_delta_D0D8_1 <- raw_delta_D0D8[
  , Y[colnames(raw_delta_D0D8), "subtype"] == subtype
]

stopifnot(substring(colnames(raw_delta_D0N_1),1,4) ==
          substring(colnames(raw_delta_D0D8_1),1,4))

mu_delta <- apply(raw_delta_D0N_1, 1, mean)
var_delta <- apply(raw_delta_D0N_1, 1, var)
# plot(mu_delta, var_delta)
THRESHOLD <- 3
idx_subnetwork <- names(mu_delta)[mu_delta > THRESHOLD]
print(length(idx_subnetwork))

## Selecting subnetworks with significant dysregulation
raw_delta_D0D8_2 <- raw_delta_D0D8_1[idx_subnetwork, ]
raw_delta_D0N_2 <- raw_delta_D0N_1[idx_subnetwork, ]

# print(colnames(delta_D0D8_1)[
#   Y[colnames(delta_D0D8_1), "label"] == 1
# ])

pct_delta <- raw_delta_D0D8_2 / raw_delta_D0N_2
pct_delta1 <- colMeans(raw_delta_D0D8_2) / colMeans(raw_delta_D0N_2)

In [None]:
# Subset the unfiltered subnetwork deltas to the patients of one subtype
subtype <- subtypes[[2]]
print(subtype)
raw_delta_D0N_1 <- raw_delta_D0N[, Y[colnames(raw_delta_D0N), "subtype"] == subtype]
raw_delta_D0D8_1 <- raw_delta_D0D8[, Y[colnames(raw_delta_D0D8), "subtype"] == subtype]

# Check the objects created above; the previous check compared
# delta_D0N_1 / delta_D0D8_1, which belong to the filtered analysis
stopifnot(substring(colnames(raw_delta_D0N_1),1,4) ==
          substring(colnames(raw_delta_D0D8_1),1,4))

# pct_delta <- delta_D0D8_1 / delta_D0N_1
# print(Y[colnames(pct_delta), "label"])
# plot(pct_delta, col = Y[names(pct_delta), "label"] + 1)

#### Unfiltered data - PathwayAPI

In [None]:
norm_centroid1 <- rowMeans(normal_raw)

pwapi_delta_D0N <- data.frame(t(sapply(subnetwork_pwapi_sub1,
                                       calc_subnetwork_delta,
                                       D0_raw, norm_centroid1)))
pwapi_delta_D0D8 <- data.frame(t(sapply(subnetwork_pwapi_sub1,
                                        calc_subnetwork_delta,
                                        D0_raw, D8_raw)))

In [None]:
subtype <- subtypes[[4]]
print(subtype)
pwapi_delta_D0N_1 <- pwapi_delta_D0N[
  , Y[colnames(pwapi_delta_D0N), "subtype"] == subtype
]
pwapi_delta_D0D8_1 <- pwapi_delta_D0D8[
  , Y[colnames(pwapi_delta_D0D8), "subtype"] == subtype
]

In [None]:
stopifnot(substring(colnames(pwapi_delta_D0N_1),1,4) ==
          substring(colnames(pwapi_delta_D0D8_1),1,4))

pwapi_mu_delta <- apply(pwapi_delta_D0N_1, 1, mean)
pwapi_var_delta <- apply(pwapi_delta_D0N_1, 1, var)
# plot(pwapi_mu_delta, pwapi_var_delta)

THRESHOLD <- 2.5
idx_subnetwork3 <- names(pwapi_mu_delta)[pwapi_mu_delta > THRESHOLD]
pwapi_delta_D0D8_2 <- pwapi_delta_D0D8_1[idx_subnetwork3, ]
pwapi_delta_D0N_2 <- pwapi_delta_D0N_1[idx_subnetwork3, ]

# print(colnames(pwapi_delta_D0D8_1)[
#   Y[colnames(pwapi_delta_D0D8_1), "label"] == 1
# ])

pct_delta <- pwapi_delta_D0D8_2 / pwapi_delta_D0N_2

plot(colMeans(pct_delta), col = Y[names(pct_delta), "label"] + 1)

#### Unfiltered data - KEGG

In [None]:
norm_centroid1 <- rowMeans(normal_raw)

kegg_delta_D0N <- data.frame(t(sapply(incomplete_kegg_sub,
                                       calc_subnetwork_delta,
                                       D0_raw, norm_centroid1)))
kegg_delta_D0D8 <- data.frame(t(sapply(incomplete_kegg_sub,
                                        calc_subnetwork_delta,
                                        D0_raw, D8_raw)))

In [None]:
subtype <- subtypes[[7]]
print(subtype)
kegg_delta_D0N_1 <- kegg_delta_D0N[
  , Y[colnames(kegg_delta_D0N), "subtype"] == subtype
]
kegg_delta_D0D8_1 <- kegg_delta_D0D8[
  , Y[colnames(kegg_delta_D0D8), "subtype"] == subtype
]

In [None]:
stopifnot(substring(colnames(kegg_delta_D0N_1),1,4) ==
          substring(colnames(kegg_delta_D0D8_1),1,4))

kegg_mu_delta <- apply(kegg_delta_D0N_1, 1, mean)
kegg_var_delta <- apply(kegg_delta_D0N_1, 1, var)
plot(kegg_mu_delta, kegg_var_delta)

THRESHOLD <- 2
idx_subnetwork4 <- names(kegg_mu_delta)[kegg_mu_delta > THRESHOLD]
kegg_delta_D0D8_2 <- kegg_delta_D0D8_1[idx_subnetwork4, ]
kegg_delta_D0N_2 <- kegg_delta_D0N_1[idx_subnetwork4, ]

# print(colnames(kegg_delta_D0D8_1)[
#   Y[colnames(kegg_delta_D0D8_1), "label"] == 1
# ])

pct_delta <- kegg_delta_D0D8_2 / kegg_delta_D0N_2

plot(colMeans(pct_delta), col = Y[names(pct_delta), "label"] + 1)

#### Misc

In [None]:
## Distribution of subnetwork sizes
subnetwork_sizes <- sapply(subnetwork_nea, length)
print(table(subnetwork_sizes))

## Representation of gene IDs in data
all_ids <- unlist(subnetwork_nea)
print(head(sort(names(table(all_ids)))))
print(head(sort(rownames(entrez_data))))