In [1]:
library(dplyr)
library(reshape2)
library(survival)
library(tibble)
library(xtable)

## Plotting
library(RColorBrewer)
library(ggplot2)
library(cowplot)
library(pheatmap)
library(pROC)
# library(rgl)
# library(UpSetR)

# Use the black-and-white ggplot2 theme as the default for every plot below.
ggplot2::theme_set(ggplot2::theme_bw())


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


********************************************************
Note: As of version 1.0.0, cowplot does not change the
  default ggplot2 theme anymore. To recover the previous
  behavior, execute:
  theme_set(theme_cowplot())
********************************************************

Type 'citation("pROC")' for a citation.

Attaching package: ‘pROC’

The following objects are masked from ‘package:stats’:

    cov, smooth, var



In [45]:
# Load the project's helper scripts into the global environment, in the same
# order as before (later scripts may rely on earlier ones).
helper_scripts <- c(
  "R/calc.R",
  "R/misc.R",
  "R/normalise.R",
  "R/plot.R",
  "R/predict.R",
  "R/subset.R",
  "R/utils.R"
)
for (helper_script in helper_scripts) {
  source(helper_script)
}

# Import data

In [182]:
### ANNOTATIONS
# Sample-level (sid) and patient-level (pid) metadata tables.
METADATA_SID <- "data/GSE67684/processed/metadata/sid-metadata_v2.tsv"
METADATA_PID <- "data/GSE67684/processed/metadata/pid-metadata_v7.tsv"

# stringsAsFactors is pinned explicitly: its default changed from TRUE to FALSE
# in R 4.0.0, and later cells call levels() on metadata_sid$subtype, which
# returns NULL for a character column. With the flag set, the tables are read
# the same way on every R version.
metadata_sid <- read.table(METADATA_SID, sep = "\t", stringsAsFactors = TRUE)
metadata_pid <- read.table(
  METADATA_PID,
  sep = "\t", row.names = 1, quote = '"', stringsAsFactors = TRUE
)
metadata_pid$label <- as.factor(metadata_pid$label)

### DATA
# Removed outliers, patients with timepoints from different batches and batch 5
SUBSET_RPATH <- "data/GSE67684/processed/subset_yeoh.tsv"
raw_yeoh <- read.table(SUBSET_RPATH, sep = "\t")
# SCALE->REMOVE->FILTER->LOG
scaled_yeoh <- normaliseMeanScaling(raw_yeoh)
selected_yeoh <- removeProbesets(scaled_yeoh)
yeoh <- log2_transform(filterProbesets(selected_yeoh, 0.7, metadata_sid))

# Log-transformed variants that skip later pipeline steps:
#   yeoh_allps  - scaled only (neither removeProbesets nor filterProbesets)
#   yeoh_unfltr - scaled + removeProbesets (no filterProbesets)
yeoh_allps <- log2_transform(scaled_yeoh)
yeoh_unfltr <- log2_transform(selected_yeoh)

  P001_D0   P004_D0   P005_D0   P007_D0   P008_D0   P009_D0 
 5.796952  4.123342  3.981577  6.317643  4.841458 11.978124 
[1] "No. of ambiguous and AFFY probesets removed: 10503"
[1] D0 D0 D0 D0 D0 D0
Levels: D0 D8 N
           D0    D8     N
1053_at  TRUE  TRUE  TRUE
117_at  FALSE  TRUE  TRUE
121_at   TRUE  TRUE  TRUE
1294_at  TRUE  TRUE  TRUE
1316_at  TRUE  TRUE  TRUE
1320_at FALSE FALSE FALSE
[1] "No. of probesets removed = 6321"


# Global variables

In [183]:
### GLOBAL VARIABLES
# Sample metadata restricted to (and ordered like) the columns of `yeoh`.
metadata <- metadata_sid[colnames(yeoh), ]
heatmap_metadata <- metadata[, c("batch_info", "label"), drop = FALSE]
COL_LABEL <- c("darkolivegreen3", "tomato3")

# Subtypes excluded from the two families of sample splits defined below.
subtypes_excl_local <- c("Hypodiploid", "Normal")
subtypes_excl_global <- c("Hypodiploid", "Hyperdiploid", "Others", "Normal")

# List subtypes
# factor() makes this work whether the column was read as a factor or as a
# character vector (levels() of a character vector is NULL).
subtypes9 <- levels(factor(metadata_sid$subtype))
subtypes7 <- setdiff(subtypes9, subtypes_excl_local)
subtypes6 <- setdiff(subtypes7, "Others")
subtypes5 <- setdiff(subtypes9, subtypes_excl_global)

# %in% (rather than ==) never yields NA, so samples with a missing subtype
# cannot produce NA column indices.
others <- yeoh[, metadata$subtype %in% "Others"]
others_normal <- yeoh[, metadata$subtype %in% c("Others", "Normal")]

# Define train/test split
# Sample IDs (both timepoints) of patients without a day-33 MRD value.
# The row names of metadata_sid are sample IDs that already carry a timepoint
# suffix (they are indexed by colnames(yeoh) above, e.g. "P001_D0"), so the
# suffix is stripped before "_D0"/"_D8" are appended. Appending to the raw row
# names would produce IDs such as "P001_D0_D0" that match no sample, silently
# disabling the exclusion.
# NOTE(review): assumes metadata_sid has a d33_mrd column - confirm; if it is
# only present in metadata_pid, nothing is excluded here.
pid_mrd_na <- unique(sub(
  "_(D0|D8)$", "",
  rownames(metadata_sid)[is.na(metadata_sid$d33_mrd)]
))
sid_mrd_na <- if (length(pid_mrd_na) > 0) {
  paste0(rep(pid_mrd_na, each = 2), c("_D0", "_D8"))
} else {
  character(0)
}

# Reusable row masks over `metadata`.
has_d33_mrd <- !(rownames(metadata) %in% sid_mrd_na)
keep_local <- !(metadata$subtype %in% subtypes_excl_local)
keep_global <- !(metadata$subtype %in% subtypes_excl_global)

sid_alltrain_local <- rownames(metadata)[keep_local & has_d33_mrd]
sid_alltrain <- rownames(metadata)[keep_global & has_d33_mrd]
# Train and test sets are split by batch: batches 1-7 vs batches 8-10.
sid_train <- rownames(metadata)[
  metadata$batch_info %in% 1:7 & keep_global & has_d33_mrd
]
sid_test <- rownames(metadata)[
  metadata$batch_info %in% 8:10 & keep_global & has_d33_mrd
]

# %in% drops samples whose label is NA instead of returning NA sample IDs.
sid_remission <- rownames(metadata)[metadata$label %in% 0]
sid_normal <- paste0("N0", c(1, 2, 4))

### Batch features

In [184]:
# Features returned by identify_batch_features() with its "aov" method
# (presumably batch-associated probesets - confirm against the helper's
# definition), followed by how many were returned.
batch_ps <- yeoh %>%
  identify_batch_features(metadata, method = "aov")
length(batch_ps)

[1] "No. of NaNs = 0"


### Goh (2019): SPS genes
- SPS: super-proliferative set
- Obtained from the intersection of 48 sets of breast cancer signatures

In [163]:
source("R/utils.R")

In [23]:
# Read the Goh (2019) signature table; the first column holds the gene IDs.
GOH_RPATH <- "data/goh_2019/goh_signatures.tsv"
# TRUE rather than T: T is an ordinary variable that can be reassigned.
genes_tab <- read.table(GOH_RPATH, sep = "\t", header = TRUE)
goh_genes <- genes_tab[, 1]

In [164]:
# Re-index the all-probeset expression table using the GPL570 Entrez
# annotation file (affy2id reports how many probesets it drops).
ENTREZ_GPL570 <- "../info/microarray/HG-U133_Plus_2/annot_entrez-GPL570.tsv"
entrez_yeoh <- yeoh_allps %>%
  affy2id(ENTREZ_GPL570)

No. of probesets with no ID removed = 1314
Total no. of probesets removed (incl. probesets mapping to same gene): 9033


In [169]:
# Keep only the signature genes that are present as rows of entrez_yeoh.
goh_genes1 <- goh_genes %>%
  intersect(rownames(entrez_yeoh))

In [170]:
# Run predict_pipeline() once per subtype on the Entrez-indexed data,
# restricted to the SPS signature genes.
#
# The metadata tables used here are the ones defined in this notebook
# (`metadata`, `metadata_sid`, `metadata_pid`); the names `annot` and
# `annot_pid` are not defined anywhere in it, so referring to them fails on a
# fresh run.
# NOTE(review): assumes predict_pipeline() expects the sample-level and
# patient-level metadata tables as its 3rd and 4th arguments - confirm against
# R/predict.R.
X_normal <- entrez_yeoh[, sid_normal]
list_X_subtypes7 <- lapply(
  subtypes7,
  function(subtype) {
    sids <- sid_alltrain_local[
      metadata[sid_alltrain_local, "subtype"] %in% subtype
    ]
    # drop = FALSE keeps a 2-D table even when a subtype has a single sample.
    entrez_yeoh[, sids, drop = FALSE]
  }
)
list_obj1 <- lapply(
  list_X_subtypes7,
  predict_pipeline,
  X_normal, metadata_sid, metadata_pid,
  class_genes = goh_genes1
)
names(list_obj1) <- subtypes7

[1] "No. of selected genes = " "80"                      
[1] "No. of final genes = " "80"                   
[1] "NO. OF SAMPLES IN CENTROID: 5"
[1] "No. of selected genes = " "80"                      
[1] "No. of final genes = " "80"                   
[1] "NO. OF SAMPLES IN CENTROID: 5"
[1] "No. of selected genes = " "80"                      
[1] "No. of final genes = " "80"                   
[1] "NO. OF SAMPLES IN CENTROID: 31"
[1] "No. of selected genes = " "80"                      
[1] "No. of final genes = " "80"                   
[1] "NO. OF SAMPLES IN CENTROID: 3"
[1] "No. of selected genes = " "80"                      
[1] "No. of final genes = " "80"                   
[1] "NO. OF SAMPLES IN CENTROID: 63"
[1] "No. of selected genes = " "80"                      
[1] "No. of final genes = " "80"                   
[1] "NO. OF SAMPLES IN CENTROID: 8"
[1] "No. of selected genes = " "80"                      
[1] "No. of final genes = " "80"                   
[1] "NO. OF 

In [171]:
# One boxplot figure per subtype, built from each pipeline result's X_y element
# and keyed by subtype name.
boxplots <- setNames(
  lapply(list_obj1, function(result) plot_boxplots(result$X_y)),
  subtypes7
)

“cannot compute exact p-value with ties”

In [172]:
# Write each subtype's SPS boxplot figure to its own PDF.
for (subtype in subtypes7) {
  ggsave(
    filename = sprintf("~/Dropbox/tmp/boxplot_sps-%s.pdf", subtype),
    plot = boxplots[[subtype]],
    width = 8,
    height = 2.5
  )
}

In [None]:
# # All D0 samples
# ayeoh_fltr <- ann_yeoh[
#   gene_signatures,
#   Y[colnames(ann_yeoh), "class_info"] == "D0"
# ]
# ayeoh_fltr <- ayeoh_fltr[rowSums(ayeoh_fltr) != 0, ]

# heatmap_annot <- Y[, c("batch_info", "label", "subtype"), drop = F]
# heatmap_annot$batch_info <- as.double(heatmap_annot$batch_info)

# sps_hmap <- pheatmap(
#   sps_yeoh,
#   col = brewer.pal(n = 11, name = "RdBu"),
# #   col = brewer.pal(9, "Blues"),
#   display_numbers = F, legend = T, border_color = NULL,
#   cluster_method = "ward.D2", cluster_rows = T, cluster_cols = T,
# #   cutree_cols = 4,
#   scale = "row", show_colnames = F, show_rownames = F,
#   annotation_col = Y,
#   fontsize = 3, cellwidth = 2, cellheight = 5,
#   filename = "~/Dropbox/temp/heatmap-sps_signatures.pdf"
# )

### B cell marker proteins

In [173]:
# Gene symbols of the B-cell markers examined in this section.
strict_bcell <- c(
  "CD19", "CD38", "CD72", "CD79A", "CD79B"
)

In [174]:
# Re-index the all-probeset expression table using the GPL570 gene-symbol
# annotation file (affy2id reports how many probesets it drops).
SYMBOL_GPL570 <- "../info/microarray/HG-U133_Plus_2/annot_genesymbol-GPL570.tsv"
symbol_yeoh <- yeoh_allps %>%
  affy2id(SYMBOL_GPL570)

No. of probesets with no ID removed = 1052
Total no. of probesets removed (incl. probesets mapping to same gene): 8762


In [176]:
# Keep only the marker genes that are present as rows of symbol_yeoh.
strict_bcell1 <- strict_bcell %>%
  intersect(rownames(symbol_yeoh))

In [177]:
# Run predict_pipeline() once per subtype on the gene-symbol-indexed data,
# restricted to the B-cell marker genes.
#
# The metadata tables used here are the ones defined in this notebook
# (`metadata`, `metadata_sid`, `metadata_pid`); the names `annot` and
# `annot_pid` are not defined anywhere in it, so referring to them fails on a
# fresh run.
# NOTE(review): assumes predict_pipeline() expects the sample-level and
# patient-level metadata tables as its 3rd and 4th arguments - confirm against
# R/predict.R.
X_normal <- symbol_yeoh[, sid_normal]
list_X_subtypes7 <- lapply(
  subtypes7,
  function(subtype) {
    sids <- sid_alltrain_local[
      metadata[sid_alltrain_local, "subtype"] %in% subtype
    ]
    # drop = FALSE keeps a 2-D table even when a subtype has a single sample.
    symbol_yeoh[, sids, drop = FALSE]
  }
)
list_obj1 <- lapply(
  list_X_subtypes7,
  predict_pipeline,
  X_normal, metadata_sid, metadata_pid,
  class_genes = strict_bcell1
)
names(list_obj1) <- subtypes7

[1] "No. of selected genes = " "5"                       
[1] "No. of final genes = " "5"                    
[1] "NO. OF SAMPLES IN CENTROID: 5"
[1] "No. of selected genes = " "5"                       
[1] "No. of final genes = " "5"                    
[1] "NO. OF SAMPLES IN CENTROID: 5"
[1] "No. of selected genes = " "5"                       
[1] "No. of final genes = " "5"                    
[1] "NO. OF SAMPLES IN CENTROID: 31"
[1] "No. of selected genes = " "5"                       
[1] "No. of final genes = " "5"                    
[1] "NO. OF SAMPLES IN CENTROID: 3"
[1] "No. of selected genes = " "5"                       
[1] "No. of final genes = " "5"                    
[1] "NO. OF SAMPLES IN CENTROID: 63"
[1] "No. of selected genes = " "5"                       
[1] "No. of final genes = " "5"                    
[1] "NO. OF SAMPLES IN CENTROID: 8"
[1] "No. of selected genes = " "5"                       
[1] "No. of final genes = " "5"                    
[1] "NO. OF 

In [178]:
# One boxplot figure per subtype, built from each pipeline result's X_y element
# and keyed by subtype name.
boxplots <- setNames(
  lapply(list_obj1, function(result) plot_boxplots(result$X_y)),
  subtypes7
)

“cannot compute exact p-value with ties”

In [179]:
# Write each subtype's B-cell marker boxplot figure to its own PDF.
for (subtype in subtypes7) {
  ggsave(
    filename = sprintf("~/Dropbox/tmp/boxplot_cd-%s.pdf", subtype),
    plot = boxplots[[subtype]],
    width = 8,
    height = 2.5
  )
}