In [9]:
### library(reshape2)
library(tibble)
library(igraph)
library(xtable)

## Plotting
library(RColorBrewer)
library(ggplot2)
library(cowplot)
# library(rgl)
library(pROC)

library(umap)
library(pheatmap)
# library(VennDiagram)
# library(UpSetR)
# library(Rtsne)
# library(dendextend)
# library(repr)
# Plot size options read by the `repr` package
# (presumably the inline figure size of the notebook, in inches - confirm)
options(repr.plot.width=8, repr.plot.height=4)

# Make the black-and-white theme the default for every ggplot created below
theme_set(theme_bw())

## Custom
# Project helper functions
# NOTE(review): presumably the origin of normaliseMeanScaling(),
# removeProbesets(), filterProbesets(), log2_transform(), normaliseGFS() and
# select_hvg() used further down - verify against ../functions.R
source("../functions.R")

## Import data

In [11]:
## Metadata
# Locations of the input tables (relative to the working directory)
METADATA_RPATH <- "data/GSE67684/processed/metadata/sid-metadata_v2.tsv"
LABEL_RPATH <- "data/GSE67684/processed/metadata/pid-metadata_v4.tsv"
BATCH_RPATH <- "data/GSE67684/processed/metadata/sid-batch.tsv"

# No header/row.names given here: read.table() infers them from the file
metadata_df <- read.table(METADATA_RPATH, sep = "\t")
yeoh_batch <- read.table(
  BATCH_RPATH,
  sep = "\t", header = TRUE, row.names = 1
)
# Label table with every column stored as a factor
yeoh_label <- read.table(
  LABEL_RPATH,
  sep = "\t", header = TRUE, row.names = 1
)
yeoh_label[] <- lapply(yeoh_label, as.factor)

## Subset of original data
# Removed outliers, patients with timepoints from different batches and batch 5
SUBSET_RPATH <- "data/GSE67684/processed/subset_yeoh.tsv"
raw_yeoh <- read.table(SUBSET_RPATH, sep = "\t")

# Preprocessing order: SCALE -> REMOVE -> FILTER -> LOG
scaled_yeoh <- normaliseMeanScaling(raw_yeoh)
selected_yeoh <- removeProbesets(scaled_yeoh)
filtered_yeoh <- filterProbesets(selected_yeoh, 0.7, metadata_df)
data <- log2_transform(filtered_yeoh)

  P001_D0   P004_D0   P005_D0   P007_D0   P008_D0   P009_D0 
 5.796952  4.123342  3.981577  6.317643  4.841458 11.978124 
[1] "No. of ambiguous and AFFY probesets removed: 10503"
[1] D0 D0 D0 D0 D0 D0
Levels: D0 D8 N
           D0    D8     N
1053_at  TRUE  TRUE  TRUE
117_at  FALSE  TRUE  TRUE
121_at   TRUE  TRUE  TRUE
1294_at  TRUE  TRUE  TRUE
1316_at  TRUE  TRUE  TRUE
1320_at FALSE FALSE FALSE
[1] "No. of probesets removed = 6321"


In [12]:
path1 <- "data/leuk_D33/processed/mas5_filtered.tsv"
path2 <- "data/GSE67684/processed/metadata/sid-metadata_v1.tsv"

# D33 expression values and the annotation of all curated samples
yeoh_d33 <- read.table(path1, sep = "\t", header = TRUE)
annot_all <- read.table(path2, sep = "\t", header = TRUE)
annot_all[] <- lapply(annot_all, as.factor)

# Mean-scale, then log2-transform, all probesets of the D33 data
yeoh_d33_allps <- log2_transform(normaliseMeanScaling(yeoh_d33))

# Samples that have been curated out of the data set are no longer listed in
# annot_all, so membership in its rownames decides which columns are kept
is_curated <- colnames(yeoh_d33_allps) %in% rownames(annot_all)
missing_samples <- colnames(yeoh_d33_allps)[!is_curated]

# Keep the probesets of `data` and drop the curated-out D33 samples
yeoh_d33 <- yeoh_d33_allps[rownames(data), is_curated]

P001_D33 P006_D33 P007_D33 P008_D33 P009_D33 P010_D33 
2.106859 2.132480 1.498776 2.559182 1.576584 2.657973 


In [13]:
# Fix the level order of class_info to D0, D8, D33, N instead of the default
# alphabetical order; any value outside these four levels becomes NA
annot_all$class_info <- factor(
  annot_all$class_info,
  levels = c("D0", "D8", "D33", "N")
)

In [14]:
# Two plot colours (presumably one per outcome label - confirm at the call site)
COL_LABEL <- c("darkolivegreen3", "tomato3")

# Annotation rows for exactly the samples (columns) of `data`, in the same
# order, with every column stored as a factor
annot <- metadata_df[colnames(data), ]
annot[] <- lapply(annot, as.factor)

# List subtypes
# as.factor() keeps this working when read.table() returned `subtype` as a
# character column (stringsAsFactors defaults to FALSE since R 4.0.0):
# levels() of a character vector is NULL, which would silently leave all
# three subtype lists empty. For a factor column the result is unchanged.
subtypes9 <- levels(as.factor(metadata_df$subtype))
subtypes7 <- setdiff(subtypes9, c("Hypodiploid", "Normal"))
subtypes5 <- setdiff(
  subtypes9,
  c("Hypodiploid", "Normal", "Hyperdiploid", "Others")
)

In [15]:
# Define train/test split
# Sample IDs (<patient>_D0 and <patient>_D8) of every patient whose d33_mrd
# value is missing
sid_mrd_na <- rownames(yeoh_label)[is.na(yeoh_label$d33_mrd)] %>%
  rep(each = 2) %>%
  paste0(c("_D0", "_D8"))

# Row masks over `annot` shared by the splits below
has_mrd <- !(rownames(annot) %in% sid_mrd_na)
in_subtypes7 <- !(annot$subtype %in% c("Hypodiploid", "Normal"))
in_subtypes5 <- !(
  annot$subtype %in% c("Hypodiploid", "Hyperdiploid", "Others", "Normal")
)

# All usable samples, with / without the Hyperdiploid and Others subtypes
sid_alltrain_local <- rownames(annot)[in_subtypes7 & has_mrd]
sid_alltrain <- rownames(annot)[in_subtypes5 & has_mrd]
# Batches 1-7 form the training set, batches 8-10 the test set
sid_train <- rownames(annot)[
  annot$batch_info %in% 1:7 & in_subtypes5 & has_mrd
]
sid_test <- rownames(annot)[
  annot$batch_info %in% 8:10 & in_subtypes5 & has_mrd
]

# which() skips samples whose label is NA; indexing with the raw comparison
# would insert an NA sample ID for each of them
sid_remission <- rownames(annot)[which(annot$label == 0)]
sid_normal <- paste0("N0", c(1, 2, 4))

In [16]:
#' Annotates affymetrix probesets to gene ID of choice
#'
#' Maps the probeset rownames of X to the IDs listed in the annotation
#' file. Option of returning the vector of annotations with probesets as
#' names instead of the re-labelled data.
#'
#' Removes probesets with no matching ID (blank ID, or probeset absent from
#' the annotation file). If multiple probesets map to a single ID, the
#' probeset with the max row sum is kept. The number of removed probesets
#' is printed to stdout.
#'
#' @param X data.frame with affy probesets as rownames
#' @param file name of the annotation file (tab-separated, with header,
#'   probesets in the first column)
#' @param ret.annot logical indicating whether to return vector of annotations
#' @return If ret.annot is FALSE: X restricted to the kept probesets, with
#'   the IDs as rownames. If ret.annot is TRUE: vector of IDs named by the
#'   probesets of X, with NA for every removed probeset.
affy2id <- function(X, file, ret.annot = FALSE) {
  # probesets are rownames of dataframe
  annot_table <- read.table(
    file, sep = "\t", header = TRUE, row.names = 1,
    stringsAsFactors = FALSE, strip.white = TRUE
  )
#   # Filters out ambiguous and AFFY probesets from annot
#   fltr_annot <- annot_table[
#     grepl("[0-9]_at", rownames(annot_table))
#     & !startsWith(rownames(annot_table), "A"),
#     , drop = F
#   ]
  # NOTE(review): assumes the file has exactly one ID column, so that this
  # subset collapses to a vector aligned with rownames(X)
  ids <- annot_table[rownames(X), ]

  # Blank IDs and probesets that are absent from the annotation file (NA)
  # both count as "no ID"
  no_id <- is.na(ids) | ids == ""
  cat(sprintf("No. of probesets with no ID removed: %d\n", sum(no_id)))

  # For every ID shared by several probesets, keep the probeset with the
  # maximum row sum and mark the remaining ones for removal
  to_drop <- no_id
  rows_by_id <- split(which(!no_id), ids[!no_id])
  for (rows in rows_by_id[lengths(rows_by_id) > 1]) {
    best <- which.max(rowSums(X[rows, , drop = FALSE]))
    to_drop[rows[-best]] <- TRUE
  }

  cat(sprintf(
    "Total no. of probesets removed (incl. probesets mapping to same gene): %d\n",
    sum(to_drop)
  ))

  if (ret.annot) {
    ids[to_drop] <- NA
    names(ids) <- rownames(X)
    return(ids)
  }

  # A logical mask is used instead of X[-idx, ]: a negative index of length
  # zero selects no rows at all, which would empty X when nothing has to be
  # removed. drop = FALSE keeps a single-column X two-dimensional.
  X_genes <- X[!to_drop, , drop = FALSE]
  # Assigning id to X
  rownames(X_genes) <- ids[!to_drop]
  X_genes
}

### Subnetworks

In [17]:
# Probeset -> Entrez ID annotation table for GPL570 (HG-U133_Plus_2)
# CHECK: What microarray platform is the data from?
ENTREZ_GPL570 <- "../info/microarray/HG-U133_Plus_2/annot_entrez-GPL570.tsv"

# 1. GFS-transform the expression data
data_gfs <- normaliseGFS(data)
# 2. Map probesets to IDs
# Removes probesets with no ID
# Selects maximum if two probesets match to same gene
# entrez_data <- affy2id(data, ENTREZ_GPL570)
entrez_gfs <- affy2id(data_gfs, ENTREZ_GPL570)

## SYMBOL_GPL570 <- "../info/microarray/HG-U133_Plus_2/annot_genesymbol-GPL570.tsv"
## symbol_yeoh <- affy2id(data_yeoh, SYMBOL_GPL570)

Top 0.05 of expressed genes are assigned GFS scores of 1
Genes below the top 0.15 of expressed genes are assigned GFS scores of 0
No. of probesets with no ID removed: 160
Total no. of probesets removed (incl. probesets mapping to same gene): 746


In [18]:
## Import NEA subnetworks
NEA_RPATH <- paste0(
  "../diff_expr/data/subnetwork/nea-hsa/",
  "ovarian_cancer/geneset-nea_kegg_ovarian.tsv"
)
nea_df <- read.table(
  NEA_RPATH,
  sep = "\t", header = TRUE, stringsAsFactors = FALSE
)
# One character vector of gene IDs per subnetwork, named by subnetwork_id
subnetwork_nea <- split(
  as.character(nea_df$gene_id),
  f = nea_df$subnetwork_id
)

In [19]:
#' Calculates QPSP profiles of samples
#'
#' For every network whose members are all present in the rownames of X,
#' the profile value of a sample is the mean of that sample's values over
#' the members of the network. Networks with at least one missing member
#' are skipped.
#'
#' Rownames of X has to be the same annotation type as list of network vectors
#' @param X dataframe of discrete GFS-transformed expression data with
#' features in rows and samples in columns
#' @param list_complex list of vectors that contain components of networks.
#' IDs are converted with as.character() and matched by name, so prefer
#' character or integer IDs (a double such as 100000 becomes "1e+05")
#' @return dataframe of QPSP profiles with networks in rows (named like
#' list_complex) and samples in columns
calc_qpsp <- function(X, list_complex) {
  # Same coercion that apply() performs on a data.frame, done once
  M <- as.matrix(X)

  # Always look members up by name: numeric IDs would otherwise pass the
  # membership check by name but index the rows by position
  list_ids <- lapply(list_complex, as.character)

  # vapply keeps the index logical even for an empty list of networks
  is_complete <- vapply(
    list_ids,
    function(ids) all(ids %in% rownames(M)),
    logical(1)
  )

  # List of complexes with all IDs present
  complete_complexes <- list_ids[is_complete]

  # Filled row by row so that the networks-in-rows layout is kept even when
  # zero or one network is complete
  qpsp <- matrix(
    NA_real_,
    nrow = length(complete_complexes), ncol = ncol(M),
    dimnames = list(names(complete_complexes), colnames(M))
  )
  for (i in seq_along(complete_complexes)) {
    members <- M[complete_complexes[[i]], , drop = FALSE]
    qpsp[i, ] <- apply(members, 2, mean)
  }

  data.frame(qpsp)
}

In [20]:
# QPSP profiles of all samples over the NEA subnetworks
qpsp_nea <- calc_qpsp(X = entrez_gfs, list_complex = subnetwork_nea)

In [22]:
# Save the QPSP profiles as an unquoted, tab-separated table
write.table(
  qpsp_nea,
  file = "tmp/nea_gfs.tsv",
  sep = "\t", quote = FALSE
)

### Plots

In [154]:
## Subsetting data
# Columns (samples) of X whose value in column `field` of `annot` equals
# `value`; samples are looked up in `annot` by their column name
subset_cols_by_annot <- function(X, field, value) {
  X[, annot[colnames(X), field] == value]
}

# D0 samples, and the Hyperdiploid / Others subtypes among them
d0_qpsp <- subset_cols_by_annot(qpsp_nea, "class_info", "D0")
d0_hyper <- subset_cols_by_annot(d0_qpsp, "subtype", "Hyperdiploid")
d0_others <- subset_cols_by_annot(d0_qpsp, "subtype", "Others")

# Same subsets after select_hvg(qpsp_nea, 100)
qpsp_hvg <- select_hvg(qpsp_nea, 100)
d0_qpsp_hvg <- subset_cols_by_annot(qpsp_hvg, "class_info", "D0")
d0_hyper_hvg <- subset_cols_by_annot(d0_qpsp_hvg, "subtype", "Hyperdiploid")

In [155]:
# # Colour settings
# # set1_colours <- ggplot_palette(9)
# set1_colours <- brewer.pal(n = 9, name = "Set1")
# names(set1_colours) <- rev(levels(annot$subtype))
# annot_colours <- list(subtype = set1_colours) 

# pheatmap(
#   qpsp_hvg,
# #   col = brewer.pal(n = 11, name = "RdBu"),
#   col = brewer.pal(9, "Blues"),
#   display_numbers = F, legend = T, border_color = NULL,
#   cluster_method = "complete", cluster_rows = T, cluster_cols = T,
#   clustering_distance_rows = "euclidean", clustering_distance_cols = "euclidean",
#   scale = "none", show_colnames = F, show_rownames = F,
#   annotation_col = annot, annotation_colors = annot_colours,
#   fontsize = 6, cellwidth = 2, cellheight = 3,
#   filename = "~/Dropbox/tmp/heatmap-qpsp_hvg100.pdf"
# )

In [156]:
# umap_obj <- umap(t(d0_hyper))
# ax <- plot_umap(umap_obj, annot, col = 'subtype')

# PCA of the D0 Hyperdiploid samples; colour mapped to `subtype` and point
# shape mapped to `label`
ax <- plot_pca(d0_hyper, annot, col = "subtype", pch = "label")
# `height` is spelled out in full instead of relying on R's partial
# argument matching
ggsave(
  "~/Dropbox/tmp/qpsp_pca-d0_hyperdiploid.pdf", ax,
  width = 9, height = 5
)

In [147]:
#' Plots samples on the first two principal components
#'
#' Fits a PCA on the samples of X and plots PC1 against PC2, with the
#' percentage of variance of each component in the axis labels. Samples of
#' newdata that are not already in X are projected into the same PCA space
#' and plotted alongside.
#'
#' @param X dataframe with features as rows and samples as columns
#' @param annot dataframe of annotation with samples as rows
#' @param cex size of the plotted points
#' @param newdata dataframe of data to be predicted by prcomp object
#' @param ... optional arguments are passed to aes_string in ggplot; every
#' value has to be the name of a column in annot
#' @return ggplot object
plot_pca <- function(X, annot, cex = 2, newdata = NULL, ...) {
  # PCA
  pca_obj <- prcomp(t(X))
  Z <- data.frame(pca_obj$x[, 1:3])
  eigenvalues <- (pca_obj$sdev)^2
  var_pc <- eigenvalues[1:3] / sum(eigenvalues)
  pc_labels <- sprintf("PC%d (%.2f%%)", 1:3, var_pc * 100)

  # Projects newdata into PCA space
  if (!is.null(newdata)) {
    # drop = FALSE keeps the projection a matrix with sample rownames even
    # when only one sample is projected or left after removing duplicates;
    # a bare vector would lose its sample name in rbind() and could not be
    # matched to its annotation afterwards
    Z_new <- predict(pca_obj, newdata = t(newdata))[, 1:3, drop = FALSE]
    # Remove duplicate rows
    Z_new <- Z_new[!(rownames(Z_new) %in% rownames(Z)), , drop = FALSE]
    Z <- rbind(Z, Z_new)
  }

  # concat with annotations
  annot_cols <- unlist(list(...))
  annot1 <- annot[rownames(Z), annot_cols, drop = FALSE]
  Z_annot <- cbind(Z, annot1)

  # NOTE(review): aes_string() is deprecated in recent ggplot2 releases; it
  # is kept because callers pass column names as strings through `...`
  ggplot(Z_annot, aes_string(x = "PC1", y = "PC2", ...)) +
    geom_point(cex = cex) +
    labs(x = pc_labels[1], y = pc_labels[2]) +
    geom_vline(xintercept = 0, color = "black", alpha = 0.5) +
    geom_hline(yintercept = 0, color = "black", alpha = 0.5)
}

In [158]:
# PCA fitted on the D0 "Others" samples, with all D0 samples passed as
# newdata, coloured by subtype
ax <- plot_pca(
  d0_others, annot,
  newdata = d0_qpsp, col = "subtype"
)
ggsave(
  "~/Dropbox/tmp/qpsp_pca-d0_others_project.pdf", ax,
  width = 9, height = 5
)

#### Only half of the KEGG ids are represented in the data

In [None]:
# ## Import KEGG pathways
# KEGG_RPATH <- "../info/KEGG/kegg-human_allpathway_genes.tsv"
# kegg_df <- read.table(KEGG_RPATH, sep = "\t", header = T, stringsAsFactors = F)
# kegg_df$Pathway <- substring(kegg_df$Pathway, 6)
# list_kegg <- split(kegg_df$Entrez.Gene.ID, kegg_df$Pathway)

In [None]:
# # kegg_size <- sapply(subnetwork_pwapi, length)
# # kegg_hit <- sapply(subnetwork_pwapi,
# #                    function(ids) sum(ids %in% rownames(araw)))

# ## Only include the IDs represented in the data
# incomplete_kegg <- lapply(list_kegg,
#                           function(ids) ids[ids %in% rownames(araw)])
# kegg_size <- sapply(incomplete_kegg, length)
# SIZE <- 5
# incomplete_kegg_sub <- incomplete_kegg[kegg_size > SIZE]
# kegg_size1 <- sapply(incomplete_kegg_sub, length)

In [None]:
# ## Import pathwayAPI
# PWAPI_RPATH <- "../info/pathwayAPI/pwapi_id_human-filtered_entrez.tsv"
# pwapi_df <- read.table(PWAPI_RPATH, sep = "\t", header = T, stringsAsFactors = F)
# list_pwapi <- split(pwapi_df[,2:3], pwapi_df$pathway_id)
# subnetwork_pwapi <- lapply(list_pwapi,
#                            function(X) unique(as.numeric(data.matrix(X))))

In [None]:
# pwapi_size <- sapply(subnetwork_pwapi, length)
# pwapi_hit <- sapply(subnetwork_pwapi,
#                     function(ids) sum(ids %in% rownames(araw)))

# ## Only include the IDs represented in the data in pathwayAPI
# subnetwork_pwapi_sub <- lapply(subnetwork_pwapi,
#                                 function(ids) ids[ids %in% rownames(araw)])
# pwapi_size1 <- sapply(subnetwork_pwapi_sub, length)
# SIZE <- 5
# subnetwork_pwapi_sub1 <- subnetwork_pwapi_sub[pwapi_size1 > SIZE]
# pwapi_size2 <- sapply(subnetwork_pwapi_sub1, length)
# print(cbind(pwapi_hit[pwapi_size1 > SIZE], pwapi_size2))