# Haemosphere 

In [None]:
# Tools
library(edgeR)
library(limma)

library(SingleCellExperiment)
library(SingleR)
library(Seurat)

# Source
library(biomaRt)

# Data 
library(dplyr)
library(reshape2)

# Plotting
library(ggplot2)
library(patchwork)
library(ComplexHeatmap)
library(ggplotify)
library(gridExtra)
library(grid)
library(viridis)

In [None]:
# Set working directory to project root
# NOTE(review): hard-coded absolute path; every relative path used below
# ("plotting_global.R", "data/...", "result/plot/...") is resolved against it.
setwd("/research/peer/fdeckert/FD20200109SPLENO")

In [None]:
# Source files
# Project-wide plotting helpers. theme_global_set() is taken from here; the `color`
# palette list used further down is presumably defined here as well - TODO confirm.
source("plotting_global.R")

# Parameter 

In [None]:
# Input files: Haemopedia raw counts and sample annotation
cnt_file <- "data/haemosphere/Haemopedia-Mouse-RNASeq_raw.txt"
meta_file <- "data/haemosphere/Haemopedia-Mouse-RNASeq_samples.txt"

# Seurat object file
so_file <- "data/object/seurat.rds"

# Differential expression settings
filter_min_cpm <- 0.5
filter_min_expressed_samples <- 2
normalization_method <- "TMM"
adj_p_cutoff <- 0.05
min_rows <- 50

# Use the project theme (from the sourced plotting_global.R) for all ggplot2 figures
ggplot2::theme_set(theme_global_set())

# Print numbers with 4 significant digits
options(digits = 4)

# Helper functions 

In [None]:
# Scatter plot of the first two principal components of a prcomp object.
#   pca         - result of prcomp(); its $x scores and $sdev are used
#   group       - one group label per row of pca$x, mapped to the point fill
#   group_color - fill colours handed to scale_fill_manual()
# Returns the ggplot object.
pca_scattern <- function(pca, group, group_color) {

    # Share of the total variance carried by each component, in percent
    pct_var <- 100 * pca$sdev^2 / sum(pca$sdev^2)
    x_label <- paste("PC1 (", round(pct_var[1], digits = 1), " %)")
    y_label <- paste("PC2 (", round(pct_var[2], digits = 1), " %)")

    # Sample scores with the group label as an extra column
    scores <- data.frame(pca$x, group = group, check.names = FALSE)

    ggplot(scores, aes(x = PC1, y = PC2, fill = group)) +
        geom_point(shape = 21, colour = "black", size = 3) +
        scale_fill_manual(values = group_color) +
        guides(fill = guide_legend(title = "Group")) +
        labs(title = "log2(CPM) - z-score", x = x_label, y = y_label) +
        theme(
            aspect.ratio = 1,
            panel.border = element_blank(),
            axis.line    = element_line()
        )
}

In [None]:
# Heat map of the top marker genes of one group.
#
# Arguments
#   deg                  - DEG table of the contrast; only its row names (genes) are used
#   cnt                  - raw count table, genes in rows, samples in columns
#   meta                 - sample annotation with columns sample_id, main_labels, fine_labels
#   level                - "main_labels" or "fine_labels": the meta column that defines the groups
#   contrast             - name of the group of interest (one value of meta[[level]])
#   top                  - maximum number of genes shown
#   min_contrast_log2cpm - genes must exceed this mean log2(CPM) in the contrast group
#
# Genes are ranked by log2(mean CPM in the contrast group / highest mean CPM of any
# other group), restricted to the genes in `deg`.
#
# Returns list(<heat map grob>, <data.frame of mean log2(CPM) per group for the plotted genes>).
top_hm <- function(deg, cnt = NULL, meta = NULL, level = "", contrast = "", top = 50, min_contrast_log2cpm = 2) {

    # Resolve the grouping column; fail early instead of hitting an undefined `group` later
    if (length(level) != 1 || !level %in% c("main_labels", "fine_labels")) {
        stop("level must be \"main_labels\" or \"fine_labels\"", call. = FALSE)
    }
    group <- as.character(meta[[level]])
    groups <- unique(group)

    # A factor contrast would index columns by its integer code, so always work with the name
    contrast <- as.character(contrast)
    if (length(contrast) != 1 || !contrast %in% groups) {
        stop("contrast must be one of the groups in meta$", level, call. = FALSE)
    }

    # Columns of cnt must correspond to meta sample ids (same order) to average by group
    if (!all(meta$sample_id %in% colnames(cnt))) {
        stop("cnt is missing samples listed in meta$sample_id", call. = FALSE)
    }
    cnt <- cnt[, match(meta$sample_id, colnames(cnt)), drop = FALSE]

    # Compute cpm and log2(cpm)
    cpm <- edgeR::cpm(cnt, normalized.lib.sizes = FALSE, log = FALSE, prior.count = 0)
    log_cpm <- edgeR::cpm(cnt, normalized.lib.sizes = FALSE, log = TRUE, prior.count = 1)

    # Mean per group. Samples are selected by exact label equality (substring matching
    # would merge e.g. "Eo" with "EoP"); drop = FALSE keeps single-sample groups a matrix.
    group_means <- function(mat) {
        means <- lapply(groups, function(g) data.frame(X = rowMeans(mat[, group == g, drop = FALSE])))
        means <- do.call("cbind", means)
        colnames(means) <- groups
        means
    }
    cpm <- group_means(cpm)
    log_cpm <- group_means(log_cpm)

    # log2 fold change of the contrast group against the highest other group
    other_max <- apply(cpm[, colnames(cpm) != contrast, drop = FALSE], 1, max)
    max_log2fc <- log2(cpm[, contrast] / other_max)

    # Restrict to DEG genes; log_cpm and max_log2fc share the same gene order
    in_deg <- rownames(log_cpm) %in% rownames(deg)
    log_cpm <- log_cpm[in_deg, , drop = FALSE]
    max_log2fc <- max_log2fc[in_deg]

    # Sort by decreasing fold change (stable; NA/NaN last)
    log_cpm <- log_cpm[order(-max_log2fc), , drop = FALSE]

    # Filter for min expression in contrast group
    log_cpm <- log_cpm[log_cpm[, contrast] > min_contrast_log2cpm, , drop = FALSE]

    # Keep the top hits
    log_cpm <- head(log_cpm, top)
    if (nrow(log_cpm) == 0) {
        stop("no gene passes the filters for contrast '", contrast, "'", call. = FALSE)
    }

    # ComplexHeatmap color ramp from 0 to the largest absolute value
    mat <- as.matrix(log_cpm)
    color_range <- max(abs(mat))
    color_break <- seq(0, color_range, 0.01)
    color_ramp <- viridis(length(color_break), option = "magma")

    # Heat map, captured as a grob so it can be arranged in a grid
    hm <- grid.grabExpr(draw(ComplexHeatmap::pheatmap(
        mat           = mat,
        main          = contrast,
        fontsize_row  = 10,
        scale         = "none",
        cluster_rows  = nrow(mat) > 1, # clustering needs at least two genes
        cluster_cols  = FALSE,
        cellwidth     = 12,
        cellheight    = 12,
        clustering_distance_rows = "euclidean",
        clustering_distance_cols = "euclidean",
        clustering_method        = "complete",
        show_row_dend = FALSE,
        show_rownames = TRUE,
        show_colnames = TRUE,
        color         = color_ramp,
        breaks        = color_break,
        border_color  = NA)))

    return(list(hm, log_cpm))

}

In [None]:
# Contrast matrices for "one group versus the average of all other groups".
#   design - design matrix whose column names are the group names
#            (they must be syntactically valid names for makeContrasts)
# Returns a list with one single-column contrast matrix per design column,
# in the order of colnames(design).
contrast_one_vs_average <- function(design) {

    exp_variables <- colnames(design)

    # With fewer than two groups there is nothing to average over ("A-()/0")
    if (length(exp_variables) < 2) {
        stop("design needs at least two named columns to build one-vs-average contrasts", call. = FALSE)
    }

    contrast <- lapply(seq_along(exp_variables), function(i) {
        others <- exp_variables[-i]
        # e.g. "A-(B+C)/2"
        design_string <- paste0(exp_variables[i], "-(", paste(others, collapse = "+"), ")/", length(others))
        makeContrasts(contrasts = design_string, levels = exp_variables)
    })

    return(contrast)

}

In [None]:
# Differential expression for one contrast with edgeR normalization and limma-voom.
#
# Arguments
#   cnt                          - raw count table, genes in rows, samples in columns
#   design                       - design matrix; row names are the sample ids to use
#   contrast                     - contrast matrix matching the columns of design
#   filter_min_cpm               - CPM a gene must exceed ...
#   filter_min_expressed_samples - ... in at least this many samples to be kept
#   normalization_method         - method passed to calcNormFactors()
#   adj_p_cutoff                 - genes with adjusted p-value below this are returned
#
# Returns a data.frame (genes in rows) with columns logFC, adjPValue and AveExpr.
deg <- function(

    cnt, design, contrast,
    filter_min_cpm = 0.5,
    filter_min_expressed_samples = 2,
    normalization_method = "TMM",
    adj_p_cutoff = 0.05

) {

    # Select the design samples from cnt, in design order
    missing_samples <- setdiff(rownames(design), colnames(cnt))
    if (length(missing_samples) > 0) {
        stop("cnt is missing samples of the design matrix: ", paste(missing_samples, collapse = ", "), call. = FALSE)
    }
    cnt <- cnt[, match(rownames(design), colnames(cnt)), drop = FALSE]

    # Create a DGEList object from cnt and normalize
    dge <- DGEList(cnt)
    dge <- calcNormFactors(dge, method = normalization_method)

    # Keep genes whose normalized CPM exceeds filter_min_cpm in at least
    # filter_min_expressed_samples samples (counted over all samples of the design)
    norm_cpm <- edgeR::cpm(dge, normalized.lib.sizes = TRUE, log = FALSE)
    keep <- rowSums(norm_cpm > filter_min_cpm) >= filter_min_expressed_samples
    dge <- dge[keep, ]

    # voom transformation and linear model fit of the explanatory variable
    fit <- lmFit(voom(dge, design = design, normalize.method = "none"), design)
    # Moderated t-statistics for the contrast by empirical Bayes moderation of the standard errors
    eb <- eBayes(contrasts.fit(fit, contrast))

    # All genes with FDR-adjusted p-values (topTable's default ordering is kept)
    m <- topTable(eb, adjust.method = "fdr", number = Inf)

    # Significant genes only; which() drops NA p-values instead of creating NA rows
    m <- m[which(m$adj.P.Val < adj_p_cutoff), c("logFC", "adj.P.Val", "AveExpr"), drop = FALSE]

    # Remove the dots from the adj.P.Val column name
    colnames(m)[2] <- "adjPValue"

    return(m)
}

# Import data 

In [None]:
# Raw counts: tab-delimited, first column (gene ids) becomes the row names, one column per sample
# NOTE(review): read.delim() defaults to check.names = TRUE, which rewrites column names that
# are not syntactically valid; the sample_id values in meta are not rewritten. Verify that the
# sample ids still match the count columns, otherwise samples are dropped in the filter below.
cnt <- read.delim(cnt_file, row.names = 1)
# Sample annotation: tab-delimited, one row per sample
meta <- read.delim(meta_file)

# Select samples by meta

In [None]:
# Cell populations kept for the reference, selected by cell_name
cell_name <- c(
    # B cells (spleen)
    "Fob", "MZB", "GCB", "SplPlsC",
    # Basophil (BM)
    "BasoBM",
    # Conventional and plasmacytoid DC (spleen)
    "cDC2", "pDC",
    # Eosinophil progenitor and adult (BM)
    "EoP", "Eo",
    # Erythrocyte lineage (BM)
    "MEP", "PreCFUE", "CFUE", "EryBlPB", "EryBlPO", "Retic",
    # Monocyte (BM) and macrophage (peritoneal)
    "MonoBM", "Mac",
    # Mast cell (peritoneal)
    "Mast",
    # Megakaryocyte cultured from BM
    "MegTPO",
    # Multipotent progenitors (BM)
    "STHSC", "LSK", "MPP",
    # Neutrophil (BM)
    "NeutBM",
    # Natural killer cell (spleen)
    "NK",
    # Common myeloid progenitor (BM)
    "CMP",
    # Granulocyte macrophage progenitor (BM)
    "GMP",
    # Common lymphoid progenitor (BM)
    "CLP",
    # T cells, total (spleen and lymph nodes)
    "CD4T", "CD8T"
)

# Keep only the annotation rows of the selected populations
meta <- meta[is.element(meta$cell_name, cell_name), ]


# Select sample and order count matrix by meta 

In [None]:
# Keep only count columns that have an annotation row, then put them in meta row order
annotated <- is.element(colnames(cnt), meta$sample_id)
cnt <- cnt[, annotated]
sample_order <- match(meta$sample_id, colnames(cnt))
cnt <- cnt[, sample_order]

# Reset meta sample id and cnt column names

In [None]:
# Rename samples to "<fine label>.<running index>", made syntactically valid with make.names().
# seq_len() yields an empty index for an empty meta table (seq(1:0) would yield 1:2).
stopifnot(ncol(cnt) == nrow(meta))
meta$sample_id <- make.names(paste0(meta$fine_labels, ".", seq_len(nrow(meta))))
colnames(cnt) <- meta$sample_id

# Translate count matrix ENSEMBL to Symbol 

In [None]:
# Get ensembl to symbol relationship from biomaRt
mart <- useDataset("mmusculus_gene_ensembl", useMart("ensembl"))
ens_to_sym <- getBM(filters = "ensembl_gene_id", attributes = c("ensembl_gene_id", "mgi_symbol"), values = rownames(cnt), mart = mart)

# Drop ids without a symbol; an empty string is not a usable gene name
ens_to_sym <- ens_to_sym[!is.na(ens_to_sym$mgi_symbol) & nzchar(ens_to_sym$mgi_symbol), ]

# Remove duplicates (first occurrence wins) so ids and symbols map one to one
ens_to_sym <- ens_to_sym[!duplicated(ens_to_sym$ensembl_gene_id), ]
ens_to_sym <- ens_to_sym[!duplicated(ens_to_sym$mgi_symbol), ]

# Filter data by ensembl id in reference
cnt <- cnt[rownames(cnt) %in% ens_to_sym$ensembl_gene_id, ]

# Order ens_to_sym like the rows of cnt: for every count row, look up its position in the
# mapping (match(x, table) returns positions in `table`, so cnt row names must come first)
ens_to_sym <- ens_to_sym[match(rownames(cnt), ens_to_sym$ensembl_gene_id), ]
stopifnot(identical(ens_to_sym$ensembl_gene_id, rownames(cnt)))

# Set mgi_symbol as data rownames
rownames(cnt) <- ens_to_sym$mgi_symbol

# Filter count matrix by genes 

In [None]:
# Drop genes without a single count across all samples
gene_total <- rowSums(cnt)
cnt <- cnt[gene_total > 0, ]

# Set factor level 

In [None]:
# Turn the label columns into factors whose level order follows the names of the colour palettes
main_levels <- names(color$main_labels_haemosphere)
fine_levels <- names(color$fine_labels_haemosphere)

meta$main_labels <- factor(meta$main_labels, levels = main_levels)
meta$fine_labels <- factor(meta$fine_labels, levels = fine_levels)

# PCA on all data 

In [None]:
options(repr.plot.width = 10, repr.plot.height = 5)

# Keep genes above the CPM threshold in enough samples (same thresholds as the DEG filter)
expressed <- rowSums(edgeR::cpm(cnt, normalized.lib.sizes = FALSE, log = FALSE) > filter_min_cpm) >= filter_min_expressed_samples
cnt_pca <- cnt[expressed, ]

# PCA on centered and scaled log2(CPM), samples in rows
# (prcomp's argument is `scale.`; `scale` only worked through partial matching)
pca <- prcomp(t(edgeR::cpm(cnt_pca, normalized.lib.sizes = FALSE, log = TRUE)), center = TRUE, scale. = TRUE)
pca_1 <- pca_scattern(pca, meta$main_labels, color$main_labels_haemosphere)
pca_2 <- pca_scattern(pca, meta$fine_labels, color$fine_labels_haemosphere)

# Main and fine labels side by side
sc_pca <- pca_1 + pca_2
sc_pca
ggsave(sc_pca, filename = "result/plot/haemosphere/sc_pca.png", width = 10, height = 5)

# Design and contrast matrix

Explanatory variable (factor): Cell lineage (design_cl) or cell type (design_ct)  
Independent measurements (character): Cell type with batch information   
Statistical model: Means model to get all possible combinations of levels  
Design matrix: Coded as model.matrix(~ 0+explanatory_variable), which fits the same model as model.matrix(~ explanatory_variable) but estimates one mean per level instead of an intercept plus differences to the first level   


# Compare main label

In [None]:
meta_ml <- meta
meta_ml$main_labels <- make.names(meta_ml$main_labels)

# Make design for means model parameterization.
# model.matrix() orders its columns by factor level, so the levels are fixed to the order of
# first appearance; column i then really belongs to unique(meta_ml$main_labels)[i].
group_ml <- factor(meta_ml$main_labels, levels = unique(meta_ml$main_labels))
design_ml <- model.matrix(~ 0 + group_ml)
colnames(design_ml) <- levels(group_ml)
rownames(design_ml) <- meta_ml$sample_id

# Create contrast
contrast_ml <- contrast_one_vs_average(design_ml)

# Compute deg for all contrast combination; deg_ml[[i]] belongs to levels(group_ml)[i]
deg_ml <- lapply(contrast_ml, function(contrast) {
    deg(
        cnt = cnt, design = design_ml, contrast = contrast,
        filter_min_cpm = filter_min_cpm,
        filter_min_expressed_samples = filter_min_expressed_samples,
        normalization_method = normalization_method,
        adj_p_cutoff = adj_p_cutoff
    )
})

In [None]:
options(repr.plot.width = 30, repr.plot.height = 10)

# Heat map of marker genes, one per main label; deg_ml[[i]] is paired with the i-th unique label
labels_ml <- unique(meta_ml$main_labels)
hm_ml <- lapply(seq_along(labels_ml), function(i) {
    top_hm(deg = deg_ml[[i]], cnt = cnt, meta = meta_ml, level = "main_labels", contrast = labels_ml[i], top = 25, min_contrast_log2cpm = 2)
})

# Plot: first element of each top_hm() result is the heat map grob
hm_ml_grid <- gridExtra::arrangeGrob(grobs = lapply(hm_ml, "[[", 1), ncol = 7)
ggsave(hm_ml_grid, filename = "result/plot/haemosphere/hm_ml_grid.png", width = 30, height = 10)
grid::grid.draw(hm_ml_grid)

# Compare fine labels 

In [None]:
meta_fl <- meta
meta_fl$fine_labels <- make.names(meta_fl$fine_labels)

# Make design for means model parameterization, one column per cell_name.
# model.matrix() orders its columns by factor level, so the levels are fixed to the order of
# first appearance; column i then really belongs to unique(meta_fl$cell_name)[i].
# NOTE(review): the heat maps pair deg_fl[[i]] with unique(meta_fl$fine_labels)[i]; this
# assumes cell_name and fine_labels map one to one - confirm against the sample sheet.
cell_fl <- as.character(meta_fl$cell_name)
group_fl <- factor(cell_fl, levels = unique(cell_fl))
design_fl <- model.matrix(~ 0 + group_fl)
colnames(design_fl) <- levels(group_fl)
rownames(design_fl) <- meta_fl$sample_id

# Create contrast
contrast_fl <- contrast_one_vs_average(design_fl)

# Compute deg for all contrast combination; deg_fl[[i]] belongs to levels(group_fl)[i]
deg_fl <- lapply(contrast_fl, function(contrast) {
    deg(
        cnt = cnt, design = design_fl, contrast = contrast,
        filter_min_cpm = filter_min_cpm,
        filter_min_expressed_samples = filter_min_expressed_samples,
        normalization_method = normalization_method,
        adj_p_cutoff = adj_p_cutoff
    )
})

In [None]:
options(repr.plot.width = 30, repr.plot.height = 25)

# Heat map of marker genes, one per fine label; deg_fl[[i]] is paired with the i-th unique label
labels_fl <- unique(meta_fl$fine_labels)
hm_fl <- lapply(seq_along(labels_fl), function(i) {
    top_hm(deg = deg_fl[[i]], cnt = cnt, meta = meta_fl, level = "fine_labels", contrast = labels_fl[i], top = 25, min_contrast_log2cpm = 1)
})

# Plot: first element of each top_hm() result is the heat map grob
hm_fl_grid <- gridExtra::arrangeGrob(grobs = lapply(hm_fl, "[[", 1), ncol = 5)
ggsave(hm_fl_grid, filename = "result/plot/haemosphere/hm_fl_grid.png", width = 30, height = 25)
grid::grid.draw(hm_fl_grid)

# Compare erythroblast stages 

In [None]:
# Erythroid samples only; which() drops samples whose main label is NA
meta_ery <- meta[which(meta$main_labels == "Ery"), ]
meta_ery$fine_labels <- make.names(meta_ery$fine_labels)

# Make design for means model parameterization, one column per cell_name.
# model.matrix() orders its columns by factor level, so the levels are fixed to the order of
# first appearance; column i then really belongs to unique(meta_ery$cell_name)[i].
# NOTE(review): the heat maps pair deg_ery[[i]] with unique(meta_ery$fine_labels)[i]; this
# assumes cell_name and fine_labels map one to one - confirm against the sample sheet.
cell_ery <- as.character(meta_ery$cell_name)
group_ery <- factor(cell_ery, levels = unique(cell_ery))
design_ery <- model.matrix(~ 0 + group_ery)
colnames(design_ery) <- levels(group_ery)
rownames(design_ery) <- meta_ery$sample_id

# Create contrast
contrast_ery <- contrast_one_vs_average(design_ery)

# Compute deg for all contrast combination; deg_ery[[i]] belongs to levels(group_ery)[i]
deg_ery <- lapply(contrast_ery, function(contrast) {
    deg(
        cnt = cnt, design = design_ery, contrast = contrast,
        filter_min_cpm = filter_min_cpm,
        filter_min_expressed_samples = filter_min_expressed_samples,
        normalization_method = normalization_method,
        adj_p_cutoff = adj_p_cutoff
    )
})

In [None]:
options(repr.plot.width = 15, repr.plot.height = 6)

# Heat map of marker genes, one per erythroid fine label; deg_ery[[i]] is paired with the i-th unique label
labels_ery <- unique(meta_ery$fine_labels)
hm_ery <- lapply(seq_along(labels_ery), function(i) {
    top_hm(deg = deg_ery[[i]], cnt = cnt, meta = meta_ery, level = "fine_labels", contrast = labels_ery[i], top = 25, min_contrast_log2cpm = 1)
})

# Plot: first element of each top_hm() result is the heat map grob
# (the stray empty argument ",," of the arrangeGrob() call is removed)
hm_ery_grid <- gridExtra::arrangeGrob(grobs = lapply(hm_ery, "[[", 1), ncol = 6)
ggsave(hm_ery_grid, filename = "result/plot/haemosphere/hm_ery_grid.png", width = 15, height = 6)
grid::grid.draw(hm_ery_grid)

# Compare multipotent progenitor cells 

In [None]:
# Multipotent progenitor samples only; which() drops samples whose main label is NA
meta_mpp <- meta[which(meta$main_labels == "MPP"), ]
meta_mpp$fine_labels <- make.names(meta_mpp$fine_labels)

# Make design for means model parameterization, one column per cell_name.
# model.matrix() orders its columns by factor level, so the levels are fixed to the order of
# first appearance; column i then really belongs to unique(meta_mpp$cell_name)[i].
# NOTE(review): the heat maps pair deg_mpp[[i]] with unique(meta_mpp$fine_labels)[i]; this
# assumes cell_name and fine_labels map one to one - confirm against the sample sheet.
cell_mpp <- as.character(meta_mpp$cell_name)
group_mpp <- factor(cell_mpp, levels = unique(cell_mpp))
design_mpp <- model.matrix(~ 0 + group_mpp)
colnames(design_mpp) <- levels(group_mpp)
rownames(design_mpp) <- meta_mpp$sample_id

# Create contrast
contrast_mpp <- contrast_one_vs_average(design_mpp)

# Compute deg for all contrast combination; deg_mpp[[i]] belongs to levels(group_mpp)[i]
deg_mpp <- lapply(contrast_mpp, function(contrast) {
    deg(
        cnt = cnt, design = design_mpp, contrast = contrast,
        filter_min_cpm = filter_min_cpm,
        filter_min_expressed_samples = filter_min_expressed_samples,
        normalization_method = normalization_method,
        adj_p_cutoff = adj_p_cutoff
    )
})

In [None]:
options(repr.plot.width = 10, repr.plot.height = 6)

# Heat map of marker genes, one per progenitor fine label; deg_mpp[[i]] is paired with the i-th unique label
labels_mpp <- unique(meta_mpp$fine_labels)
hm_mpp <- lapply(seq_along(labels_mpp), function(i) {
    top_hm(deg = deg_mpp[[i]], cnt = cnt, meta = meta_mpp, level = "fine_labels", contrast = labels_mpp[i], top = 25, min_contrast_log2cpm = 1)
})

# Plot: first element of each top_hm() result is the heat map grob
hm_mpp_grid <- gridExtra::arrangeGrob(grobs = lapply(hm_mpp, "[[", 1), ncol = 3)
ggsave(hm_mpp_grid, filename = "result/plot/haemosphere/hm_mpp_grid.png", width = 10, height = 6)
grid::grid.draw(hm_mpp_grid)

# Compare restricted progenitor cells 

In [None]:
# Restricted progenitor samples only; which() drops samples whose main label is NA
meta_rpp <- meta[which(meta$main_labels == "RPP"), ]
meta_rpp$fine_labels <- make.names(meta_rpp$fine_labels)

# Make design for means model parameterization, one column per cell_name.
# model.matrix() orders its columns by factor level, so the levels are fixed to the order of
# first appearance; column i then really belongs to unique(meta_rpp$cell_name)[i].
# NOTE(review): pairing deg_rpp[[i]] with the i-th unique fine label assumes cell_name and
# fine_labels map one to one - confirm against the sample sheet.
cell_rpp <- as.character(meta_rpp$cell_name)
group_rpp <- factor(cell_rpp, levels = unique(cell_rpp))
design_rpp <- model.matrix(~ 0 + group_rpp)
colnames(design_rpp) <- levels(group_rpp)
rownames(design_rpp) <- meta_rpp$sample_id

# Create contrast
contrast_rpp <- contrast_one_vs_average(design_rpp)

# Compute deg for all contrast combination; deg_rpp[[i]] belongs to levels(group_rpp)[i]
deg_rpp <- lapply(contrast_rpp, function(contrast) {
    deg(
        cnt = cnt, design = design_rpp, contrast = contrast,
        filter_min_cpm = filter_min_cpm,
        filter_min_expressed_samples = filter_min_expressed_samples,
        normalization_method = normalization_method,
        adj_p_cutoff = adj_p_cutoff
    )
})

In [None]:
# Heat map of marker genes, one per restricted-progenitor fine label.
# The groups inside top_hm() are fine labels (level = "fine_labels"), so the contrasts must be
# fine labels too; iterating over main_labels produced a single panel whose factor contrast
# ("RPP") is not a fine-label group.
labels_rpp <- unique(meta_rpp$fine_labels)
hm_rpp <- lapply(seq_along(labels_rpp), function(i) {
    top_hm(deg = deg_rpp[[i]], cnt = cnt, meta = meta_rpp, level = "fine_labels", contrast = labels_rpp[i], top = 25, min_contrast_log2cpm = 1)
})

# One panel per label in a single row; the width grows with the number of panels
width_rpp <- max(5, 3.5 * length(hm_rpp))
options(repr.plot.width = width_rpp, repr.plot.height = 6)

# Plot: first element of each top_hm() result is the heat map grob
hm_rpp_grid <- gridExtra::arrangeGrob(grobs = lapply(hm_rpp, "[[", 1), ncol = length(hm_rpp))
ggsave(hm_rpp_grid, filename = "result/plot/haemosphere/hm_rpp_grid.png", width = width_rpp, height = 6)
grid::grid.draw(hm_rpp_grid)

# Create SingleR object 

In [None]:
# Build the reference from the Haemosphere RNA-seq data:
# raw counts and log2(CPM) as assays, main and fine labels as sample annotation
assay_list <- list(
    counts    = cnt,
    logcounts = edgeR::cpm(cnt, log = TRUE, prior.count = 1)
)
ref <- SummarizedExperiment(assay_list)

ref$label.main <- as.character(meta$main_labels)
ref$label.fine <- as.character(meta$fine_labels)

# Store the reference on disk
saveRDS(ref, "data/haemosphere/se_haemosphere.rds")