# QC and filter seurat object

In [34]:
# Seurat 
library(Seurat)
library(SeuratDisk)

# Single R
library(SingleCellExperiment)
library(SingleR)
library(celldex)

# Data 
library(dplyr)

# Plotting
library(ggplot2)
library(RColorBrewer)
library(patchwork)

In [35]:
# Move to the project root: remove the notebook sub-directory
# "/script/seurat" from the current path so that all relative paths used
# below resolve from the root.
setwd(
  gsub(pattern = "/script/seurat", replacement = "", x = getwd(), fixed = TRUE)
)

In [36]:
# Source files
# Project-wide plotting helpers. theme_global_set(), so_color and
# integer_breaks() are used below but not defined in this notebook, so they
# presumably come from here - TODO confirm.
source("plotting_global.R")

# Parameter settings

In [37]:
# QC thresholds. Each threshold exists twice: the "_m" value is applied to
# cells with tissue == "Myeloid", the "_p" value to all other cells.

# Feature count per cell
nFeature_RNA_min_m <- 250
nFeature_RNA_max_m <- 4000
nFeature_RNA_min_p <- 500
nFeature_RNA_max_p <- 4000

# UMI count per cell
nCount_RNA_min_m <- 1000
nCount_RNA_max_m <- 20000
nCount_RNA_min_p <- 1500
nCount_RNA_max_p <- 20000

# Upper limit for the mitochondrial percentage (pMt_RNA)
pMt_RNA_max_m <- 5
pMt_RNA_max_p <- 5

# Input and output files (relative to the project root)
so_raw_file <- "data/seurat_object/so_raw.rds"
so_qc_file <- "data/seurat_object/so_qc.rds"
# Written as h5Seurat first and converted to h5ad afterwards
h5ad_qc_file <- "data/h5ad_object/h5ad_qc.h5Seurat"

# Make the project theme (from the sourced plotting helpers) the default for
# every ggplot created below
ggplot2::theme_set(theme_global_set())

# Import Seurat object

In [None]:
# Load the raw Seurat object from so_raw_file; all QC below starts from it.
so_raw <- readRDS(so_raw_file)

# Add filter meta data to Seurat object 

In [None]:
# Store the thresholds that apply to each cell as metadata columns: Myeloid
# cells get the "_m" values, every other cell the "_p" values.
is_myeloid <- so_raw$tissue == "Myeloid"

so_raw$nFeature_RNA_max <- ifelse(is_myeloid, nFeature_RNA_max_m, nFeature_RNA_max_p)
so_raw$nFeature_RNA_min <- ifelse(is_myeloid, nFeature_RNA_min_m, nFeature_RNA_min_p)

so_raw$nCount_RNA_max <- ifelse(is_myeloid, nCount_RNA_max_m, nCount_RNA_max_p)
so_raw$nCount_RNA_min <- ifelse(is_myeloid, nCount_RNA_min_m, nCount_RNA_min_p)

so_raw$pMt_RNA_max <- ifelse(is_myeloid, pMt_RNA_max_m, pMt_RNA_max_p)

# QC class: a barcode passes only if CellRanger called it a cell and every
# metric lies within the thresholds of its tissue group.
called_cell <- so_raw$cellranger_class == "Cell"
feature_ok <- so_raw$nFeature_RNA <= so_raw$nFeature_RNA_max &
  so_raw$nFeature_RNA >= so_raw$nFeature_RNA_min
count_ok <- so_raw$nCount_RNA <= so_raw$nCount_RNA_max &
  so_raw$nCount_RNA >= so_raw$nCount_RNA_min
mt_ok <- so_raw$pMt_RNA <= so_raw$pMt_RNA_max

so_raw$qc_class <- ifelse(called_cell & feature_ok & count_ok & mt_ok, "pass", "fail")

# Filter cells by rank plot

Empty droplets were identified with the EmptyDrops heuristic (Lun et al., 2019) of CellRanger v3.0.2. Because of their RNase activity, granulocytes might be wrongly classified as empty droplets by CellRanger.

**Typical Sample** A steep drop-off is indicative of good separation between the cell-associated barcodes and the barcodes associated with empty GEMs. An ideal barcode rank plot has a distinctive shape, which is referred to as a "cliff and knee".

**Heterogeneous Sample** Heterogeneous populations of cells in a sample result in two "cliff and knee" distributions. However, there should still be a clear separation between the barcodes. 

**Compromised Sample** Round curve and lack of steep cliff may indicate low sample quality or loss of single-cell behavior. This can be due to a wetting failure, premature cell lysis, or low cell viability. 

**Compromised Sample** Defined cliff and knee, but the total number of barcodes detected may be lower than expected. This can be caused by a sample clog or inaccurate cell count. 

In [None]:
# Barcode rank plot: log10 UMI count against log10 barcode rank, coloured by
# the CellRanger class, one panel per tissue x treatment. The dashed red line
# is the minimum UMI threshold stored for each cell (nCount_RNA_min).
rank_plot <- ggplot(
  data = so_raw@meta.data,
  mapping = aes(
    x = log10(nCount_RNA_rank),
    y = log10(nCount_RNA),
    color = cellranger_class
  )
) +
  geom_point() +
  geom_hline(
    mapping = aes(yintercept = log10(nCount_RNA_min)),
    color = "red",
    linetype = "longdash"
  ) +
  scale_color_manual(values = so_color$cellranger_class) +
  labs(
    title = "Barcode rank plot",
    x = "log10(cell barcode rank)",
    y = "log10(cell UMI counts)"
  ) +
  facet_grid(tissue ~ treatment) +
  theme(aspect.ratio = 1, legend.position = "bottom")

# Notebook display size, then show and save the figure
options(repr.plot.width = 5, repr.plot.height = 5)
rank_plot
ggsave(filename = "result/plot/seurat/rank_plot.png", plot = rank_plot, width = 4, height = 4)

# Filter by cell_ranger class

In [None]:
# Keep only the barcodes whose cellranger_class is "Cell"; so_qc is the object
# that all following filter steps work on.
so_qc <- subset(so_raw, subset = cellranger_class == "Cell")

# Filter by UMI and Feature count 

In [None]:
# Density of the three filter metrics, one panel per tissue x treatment. The
# dashed red line in each panel is the threshold stored per cell in the
# metadata (nCount_RNA_min, nFeature_RNA_min, pMt_RNA_max).

# UMI count
qc_1 <- ggplot(so_qc@meta.data, aes(x = log10(nCount_RNA), fill = tissue)) + 
  geom_density() + 
  ggtitle("Density plot UMI count") + xlab("log10(UMI count)") + ylab("Density") +
  geom_vline(aes(xintercept = log10(nCount_RNA_min)), color = "red", linetype = "longdash") +
  scale_x_continuous(breaks = integer_breaks()) + 
  scale_fill_manual(values = so_color$tissue) +
  facet_grid(tissue~treatment) + 
  theme(legend.position = "bottom", aspect.ratio = 1) 

# Feature count
qc_2 <- ggplot(so_qc@meta.data, aes(x = log10(nFeature_RNA), fill = tissue)) + 
  geom_density() + 
  ggtitle("Density plot Feature count") + xlab("log10(Feature count)") + ylab("Density") +
  geom_vline(aes(xintercept = log10(nFeature_RNA_min)), color = "red", linetype = "longdash") +
  scale_x_continuous(breaks = integer_breaks()) + 
  scale_fill_manual(values = so_color$tissue) +
  facet_grid(tissue~treatment) + 
  theme(legend.position = "bottom", aspect.ratio = 1)

# Mitochondrial percentage, shown for 0-20 %. The window is set through
# `limits` of the scale that also carries the integer breaks; a separate
# xlim() call would replace that scale and drop the breaks.
qc_3 <- ggplot(so_qc@meta.data, aes(x = pMt_RNA, fill = tissue)) + 
  geom_density() + 
  ggtitle("Density plot Mt %") + xlab("Mt [%]") + ylab("Density") +
  geom_vline(aes(xintercept = pMt_RNA_max), color = "red", linetype = "longdash") +
  scale_x_continuous(breaks = integer_breaks(), limits = c(0, 20)) +
  scale_fill_manual(values = so_color$tissue) +
  facet_grid(tissue~treatment)  + 
  theme(legend.position = "bottom", aspect.ratio = 1)

# Show the three plots side by side with one shared legend, then save each
# plot to its own file
options(repr.plot.width = 15, repr.plot.height = 5)
qc_1 + qc_2 + qc_3 + plot_layout(guides = "collect") & theme(legend.position = "bottom")
ggsave(qc_1, filename = "result/plot/seurat/density_umi.png", width = 4, height = 4)
ggsave(qc_2, filename = "result/plot/seurat/density_feature.png", width = 4, height = 4)
ggsave(qc_3, filename = "result/plot/seurat/density_mt.png", width = 4, height = 4)

In [None]:
# Scatter plot of log10 UMI count (x) against log10 feature count (y), one
# panel per tissue x treatment, coloured by one metadata column.
#
# Arguments
#   data   data frame of cell metadata; must contain nCount_RNA, nFeature_RNA,
#          nCount_RNA_min, nFeature_RNA_min, nFeature_RNA_max, tissue and
#          treatment plus the column given as `color`.
#   color  unquoted column mapped to the point colour.
#   title  plot title.
#
# Returns the ggplot object.
qc_scatter_FUN <- function(data, color, title) {
  ggplot(data, aes(x = log10(nCount_RNA), y = log10(nFeature_RNA), color = {{ color }})) +
    geom_point() +
    ggtitle(title) + ylab("log10(feature count)") + xlab("log10(umi count)") +
    # Dashed red lines mark the per-cell thresholds: minimum UMI count and
    # minimum / maximum feature count. The maximum UMI count is not drawn.
    geom_vline(aes(xintercept = log10(nCount_RNA_min)), color = "red", linetype = "longdash") +
    geom_hline(aes(yintercept = log10(nFeature_RNA_min)), color = "red", linetype = "longdash") +
    geom_hline(aes(yintercept = log10(nFeature_RNA_max)), color = "red", linetype = "longdash") +
    facet_grid(tissue~treatment) +
    theme(aspect.ratio = 1, legend.position = "bottom")
}

sc_1 <- qc_scatter_FUN(so_qc@meta.data, pMt_RNA, "Mitochondrial gene percentage")
sc_2 <- qc_scatter_FUN(so_qc@meta.data, pHb_RNA, "Hemoglobin gene percentage")
sc_3 <- qc_scatter_FUN(so_qc@meta.data, pRp_RNA, "Ribosomal gene percentage")

# Show the three plots side by side, then save each plot to its own file
options(repr.plot.width = 15, repr.plot.height = 5)
sc_1 + sc_2 + sc_3 & theme(legend.position = "bottom")
ggsave(sc_1, filename = "result/plot/seurat/sc_mt.png", width = 4, height = 4)
ggsave(sc_2, filename = "result/plot/seurat/sc_hg.png", width = 4, height = 4)
ggsave(sc_3, filename = "result/plot/seurat/sc_rb.png", width = 4, height = 4)

In [None]:
# Apply the QC thresholds. nFeature_RNA_min/max, nCount_RNA_min/max and
# pMt_RNA_max exist only as metadata columns (the plain variables carry an
# _m/_p suffix), so subset() presumably resolves them per cell and each cell is
# compared against the limits of its own tissue group - TODO confirm.
so_qc <- subset(so_qc, subset = nFeature_RNA >= nFeature_RNA_min & nFeature_RNA <= nFeature_RNA_max & nCount_RNA >= nCount_RNA_min & nCount_RNA <= nCount_RNA_max & pMt_RNA <= pMt_RNA_max)

# Cell Cycle Raw Seurat Object

In [None]:
# Cell-cycle gene table; keep genes with sig_population >= 5.
cc_kowalczyk <- read.csv("cc_kowalczyk.csv")
cc_kowalczyk <- cc_kowalczyk[cc_kowalczyk$sig_population >= 5, ]

# Split by state: "down" genes form the G0 set, "up" genes the "int" set.
cc_kowalczyk_g0 <- cc_kowalczyk[cc_kowalczyk$state == "down", ]
cc_kowalczyk_int <- cc_kowalczyk[cc_kowalczyk$state == "up", ]

# Assign each "up" gene to S or G2/M by the larger of its S and G2_M values
# (genes with equal values end up in neither set).
cc_kowalczyk_s <- cc_kowalczyk_int[cc_kowalczyk_int$S > cc_kowalczyk_int$G2_M, ]
cc_kowalczyk_g2m <- cc_kowalczyk_int[cc_kowalczyk_int$G2_M > cc_kowalczyk_int$S, ]

so_raw <- CellCycleScoring(so_raw, s.features = cc_kowalczyk_s$gene, g2m.features = cc_kowalczyk_g2m$gene, set.ident = FALSE)

# Rename the CellCycleScoring columns to the project naming scheme. Columns
# are matched by their exact name, so no other column can be touched.
cc_rename <- c(Phase = "cc_phase_class", S.Score = "msS_RNA", G2M.Score = "msInt_RNA")
cc_found <- intersect(names(cc_rename), colnames(so_raw@meta.data))
colnames(so_raw@meta.data)[match(cc_found, colnames(so_raw@meta.data))] <- unname(cc_rename[cc_found])

# Difference between the S score and the G2M score
so_raw$msCC_diff_RNA <- so_raw$msS_RNA - so_raw$msInt_RNA

# Module scores for the G0 and the "int" gene sets. The argument is `assay`
# (singular); AddModuleScore appends the set index to `name`, so the columns
# are msG0_RNA1 and msInt_RNA1.
# NOTE(review): msInt_RNA (G2M score, renamed above) and msInt_RNA1 (module
# score of all "up" genes) are different quantities with near-identical names.
so_raw <- AddModuleScore(so_raw, features = list(cc_kowalczyk_g0$gene), assay = "RNA", slot = "data", ctrl = 100, nbin = 25, name = "msG0_RNA")
so_raw <- AddModuleScore(so_raw, features = list(cc_kowalczyk_int$gene), assay = "RNA", slot = "data", ctrl = 100, nbin = 25, name = "msInt_RNA")

# Box plots 

In [None]:
# Side-by-side QC box plots of one metadata column: all rows of `data` (left
# panel, "Cell containing GEM") and only the rows with qc_class == "pass"
# (right panel, "Filtered").
#
# Arguments
#   data           data frame of cell metadata; must contain the columns
#                  treatment, tissue and qc_class plus the columns given as
#                  `y` and `fill`.
#   y              unquoted column plotted on the y axis.
#   fill           unquoted column mapped to the box colour; the colours are
#                  taken from so_color$tissue.
#   ylab           y axis label.
#   scale_y_log10  TRUE for a log10 y axis with lower limit `ymin` (`ymax` is
#                  not used then), FALSE for a linear axis limited to
#                  [ymin, ymax].
#   ymin, ymax     y axis limits.
#
# Returns a patchwork of both panels with one shared legend at the bottom.
qc_vln_FUN <- function(data, y, fill, ylab = "", scale_y_log10, ymin, ymax) {

  # Built here, in the function's own frame, so that {{ }} captures the
  # columns named by the caller.
  panel_mapping <- aes(x = treatment, y = {{ y }}, color = {{ fill }})

  # The log10 scale and the linear limits are alternatives: exactly one y
  # scale is created per panel, so no scale is replaced after the fact.
  y_scale_FUN <- function() {
    if (scale_y_log10) {
      ggplot2::scale_y_log10(limits = c(ymin, NA))
    } else {
      ylim(ymin, ymax)
    }
  }

  # One panel: jittered points in gray underneath the box plot, one facet per
  # tissue (facet strips hidden).
  panel_FUN <- function(panel_data, title) {
    ggplot(panel_data, panel_mapping) +
      geom_jitter(alpha = 0.2, shape = 16, color = "gray") +
      geom_boxplot(alpha = 1.0) +
      y_scale_FUN() +
      scale_color_manual(values = so_color$tissue) +
      ggtitle(title) + xlab("") + ylab(ylab) +
      facet_wrap(~tissue, scales = "free_x") +
      theme(
        plot.title = element_text(size = 12, face = "bold", margin = margin(t = 0, r = 0, b = 5, l = 0)),
        axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
        strip.text = element_blank()
      )
  }

  vln_plot_1 <- panel_FUN(data, "Cell containing GEM")
  # %in% leaves out rows with a missing qc_class instead of turning them into
  # all-NA rows.
  vln_plot_2 <- panel_FUN(data[data$qc_class %in% "pass", ], "Filtered")

  vln_plot_1 + vln_plot_2 + plot_layout(ncol = 2, guides = "collect") & theme(legend.position = "bottom")
}

# One two-panel box plot (all barcodes vs. QC-passed barcodes) per QC metric.
qc_vln_1 <- qc_vln_FUN(so_raw@meta.data, nCount_RNA, tissue, "UMI [count]", FALSE, ymin = 0, ymax = max(so_raw$nCount_RNA))
qc_vln_2 <- qc_vln_FUN(so_raw@meta.data, nFeature_RNA, tissue, "Feature [count]", FALSE, ymin = 0, ymax = max(so_raw$nFeature_RNA))

# msCC_diff_RNA is a difference of two scores and can be negative, so the axis
# limits come from its own range rather than from 0 and the feature count.
cc_diff_range <- range(so_raw$msCC_diff_RNA, na.rm = TRUE)
qc_vln_3 <- qc_vln_FUN(so_raw@meta.data, msCC_diff_RNA, tissue, "cc diff [score]", FALSE, ymin = cc_diff_range[1], ymax = cc_diff_range[2])

# Percentages are shown on a fixed 0-100 axis
qc_vln_4 <- qc_vln_FUN(so_raw@meta.data, pMt_RNA, tissue, "Mt [%]", FALSE, ymin = 0, ymax = 100)
qc_vln_5 <- qc_vln_FUN(so_raw@meta.data, pHb_RNA, tissue, "Hb [%]", FALSE, ymin = 0, ymax = 100)
qc_vln_6 <- qc_vln_FUN(so_raw@meta.data, pRp_RNA, tissue, "Rbl [%]", FALSE, ymin = 0, ymax = 100)

# Show all six plots in a 3 x 2 grid, then save each plot to its own file
options(repr.plot.width = 15, repr.plot.height = 10)
qc_vln_1 + qc_vln_2 + qc_vln_3 + qc_vln_4 + qc_vln_5 + qc_vln_6 + plot_layout(ncol = 3, nrow = 2) + plot_layout(guides = "collect") & theme(legend.position = "bottom")
ggsave(qc_vln_1, filename = "result/plot/seurat/qc_vln_1.png", width = 5, height = 2.5)
ggsave(qc_vln_2, filename = "result/plot/seurat/qc_vln_2.png", width = 5, height = 2.5)
ggsave(qc_vln_3, filename = "result/plot/seurat/qc_vln_3.png", width = 5, height = 2.5)
ggsave(qc_vln_4, filename = "result/plot/seurat/qc_vln_4.png", width = 5, height = 2.5)
ggsave(qc_vln_5, filename = "result/plot/seurat/qc_vln_5.png", width = 5, height = 2.5)
ggsave(qc_vln_6, filename = "result/plot/seurat/qc_vln_6.png", width = 5, height = 2.5)

# Cell cycle QC Seurat object

In [None]:
# Cell-cycle scoring of the filtered object with the S and G2/M gene sets.
# NOTE(review): this runs before NormalizeData() below - confirm that the data
# slot of so_qc already holds normalized values at this point.
so_qc <- CellCycleScoring(so_qc, s.features = cc_kowalczyk_s$gene, g2m.features = cc_kowalczyk_g2m$gene, set.ident = FALSE)

# Rename the CellCycleScoring columns to the project naming scheme. Columns
# are matched by their exact name, so no other column can be touched.
cc_rename <- c(Phase = "cc_phase_class", S.Score = "msS_RNA", G2M.Score = "msInt_RNA")
cc_found <- intersect(names(cc_rename), colnames(so_qc@meta.data))
colnames(so_qc@meta.data)[match(cc_found, colnames(so_qc@meta.data))] <- unname(cc_rename[cc_found])

# Difference between the S score and the G2M score; regressed out in ScaleData
so_qc$msCC_diff_RNA <- so_qc$msS_RNA - so_qc$msInt_RNA

# Module scores for the G0 and the "int" gene sets. The argument is `assay`
# (singular); AddModuleScore appends the set index to `name`, so the columns
# are msG0_RNA1 and msInt_RNA1.
so_qc <- AddModuleScore(so_qc, features = list(cc_kowalczyk_g0$gene), assay = "RNA", slot = "data", ctrl = 100, nbin = 25, name = "msG0_RNA")
so_qc <- AddModuleScore(so_qc, features = list(cc_kowalczyk_int$gene), assay = "RNA", slot = "data", ctrl = 100, nbin = 25, name = "msInt_RNA")

# Normalize

In [None]:
# LogNormalize the RNA assay with a scale factor of 10000
so_qc <- NormalizeData(so_qc, assay = "RNA", normalization.method = "LogNormalize", scale.factor = 10000)

# Variable feature selection 

In [None]:
# Select the 2000 most variable features of the RNA assay with the "vst" method
so_qc <- FindVariableFeatures(so_qc, assay = "RNA", selection.method = "vst", nfeatures = 2000)

# Remove cell cycle variable feature

In [None]:
# variable_features <- VariableFeatures(so_qc)
# VariableFeatures(so_qc) <- variable_features[!variable_features %in% c(cc_kowalczyk$gene)]

# Scale data
Regress out nCount_RNA (total UMI per cell), pMt_RNA (mitochondrial UMI percentage) and msCC_diff_RNA (difference between the S and G2M cell cycle scores)

In [None]:
# Center and scale the RNA assay while regressing out UMI count,
# mitochondrial percentage and the cell cycle score difference
so_qc <- ScaleData(
  so_qc,
  assay = "RNA",
  vars.to.regress = c("nCount_RNA", "pMt_RNA", "msCC_diff_RNA"),
  do.scale = TRUE, do.center = TRUE
)

# Dim reduction 

In [None]:
# dim_clust() is not defined in this notebook; presumably it comes from the
# file sourced here and runs PCA, UMAP, t-SNE and clustering - TODO confirm.
source("script/seurat/seurat_function.R")
so_qc <- dim_clust(
    # Seurat object and assay
    so = so_qc, 
    assay = "RNA",
    # PCA features
    blacklist_genes = NULL, 
    # Dim reduction
    dims_pca  = 30,  # Default 20 - RunPCA dims
    dims_umap = 30,  # Default NULL - RunUMAP dims 
    dims_tsne = 20,  # Default 1:5
    min_dist  = 0.3,  # Default 0.3 - RunUMAP min.dist - controls how tightly the embedding packs points together
    # Cluster 
    dims_cluster = 30,  # Default 1:10 - FindNeighbors dims
    cluster_res  = 0.8 # Default 0.8 - FindClusters resolution - above (below) 1.0 to obtain a larger (smaller) number of communities
)


# SingleR annotation 
SingleR identifies marker genes from the reference and uses them to compute assignment scores (based on the Spearman correlation across markers) for each cell in the test dataset against each label in the reference. The label with the highest score is then assigned to the test cell, possibly with further fine-tuning to resolve closely related labels.  

first.labels: Labels before fine-tuning  
labels: Labels after fine-tuning  
pruning: labels after pruning  

In [None]:
# Load the celldex ImmGen reference
ref <- celldex::ImmGenData(ensembl = FALSE)

# Seurat to SingleCellExperiment: only the RNA counts matrix is carried over
sce <- SingleCellExperiment(list(counts = so_qc@assays$RNA@counts))

# Predict labels twice against the same reference: once with the main labels
# and once with the fine labels
label_main <- SingleR::SingleR(test = sce, ref = ref, labels = ref$label.main, assay.type.test = "counts", de.method = "classic")
label_fine <- SingleR::SingleR(test = sce, ref = ref, labels = ref$label.fine, assay.type.test = "counts", de.method = "classic")

# Add labels to Seurat object: the pruned labels and the column
# tuning.scores.first are kept and renamed
# NOTE(review): tuning.scores.first is stored as "*_delta_score"; whether this
# column is a delta and whether it exists depends on the SingleR version - confirm.
label_main_meta <- as.data.frame(label_main) %>% dplyr::select(pruned.labels, tuning.scores.first) %>% dplyr::rename(main_labels = pruned.labels, main_delta_score = tuning.scores.first)
so_qc <- AddMetaData(so_qc, label_main_meta)
# Factor levels follow the order of the main label colour palette
so_qc$main_labels <- factor(so_qc$main_labels, levels = names(so_color$main_labels))

label_fine_meta <- as.data.frame(label_fine) %>% dplyr::select(pruned.labels, tuning.scores.first) %>% dplyr::rename(fine_labels = pruned.labels, fine_delta_score = tuning.scores.first)
so_qc <- AddMetaData(so_qc, label_fine_meta)
# so_qc$fine_labels <- factor(so_qc$main_labels, levels = names(so_color$main_labels)) # No level order defined for the fine labels yet

In [None]:
# Dim and Feature plots 

In [None]:
# Name of the dimensional reduction used by every DimPlot/FeaturePlot below;
# presumably created by dim_clust() - TODO confirm.
reduction <- "rna_umap_nno"

In [None]:
# Pieces shared by the three plots that show a legend
legend_bottom <- theme(aspect.ratio = 1, legend.position = "bottom")
legend_guide <- guides(color = guide_legend(ncol = 3, override.aes = list(size = 2)))

# Clusters (labelled on the plot, no legend)
dplot_1 <- DimPlot(so_qc, reduction = reduction, group.by = "rna_snn_res.0.8", label = TRUE) &
  theme(aspect.ratio = 1, legend.position = "none")

# SingleR main labels, tissue and treatment, each with its project palette
dplot_2 <- DimPlot(so_qc, reduction = reduction, group.by = "main_labels", label = FALSE) &
  legend_bottom &
  scale_color_manual(values = so_color$main_labels, na.value = "dark gray") &
  legend_guide
dplot_3 <- DimPlot(so_qc, reduction = reduction, group.by = "tissue", label = FALSE) &
  legend_bottom &
  scale_color_manual(values = so_color$tissue, na.value = "dark gray") &
  legend_guide
dplot_4 <- DimPlot(so_qc, reduction = reduction, group.by = "treatment", label = FALSE) &
  legend_bottom &
  scale_color_manual(values = so_color$treatment, na.value = "dark gray") &
  legend_guide

# Arrange in a 2 x 2 grid, show and save
dplot <- dplot_1 + dplot_2 + dplot_3 + dplot_4 + plot_layout(ncol = 2)
options(repr.plot.width = 10, repr.plot.height = 10)
dplot
ggsave(filename = "result/plot/seurat/dimplot_1.png", plot = dplot, width = 9, height = 9)

In [None]:
# UMI count, feature count and main-label score on the embedding
fplot_1 <- FeaturePlot(object = so_qc, features = "nCount_RNA", reduction = reduction) &
  theme(aspect.ratio = 1)
fplot_2 <- FeaturePlot(object = so_qc, features = "nFeature_RNA", reduction = reduction) &
  theme(aspect.ratio = 1)
fplot_3 <- FeaturePlot(object = so_qc, features = "main_delta_score", reduction = reduction) &
  theme(aspect.ratio = 1)

# One row of three plots: show and save
fplot <- fplot_1 + fplot_2 + fplot_3 + plot_layout(ncol = 3)
options(repr.plot.width = 15, repr.plot.height = 5)
fplot
ggsave(filename = "result/plot/seurat/fplot_1.png", plot = fplot, width = 9, height = 6)

In [None]:
# Mitochondrial, ribosomal and hemoglobin percentages on the embedding
fplot_1 <- FeaturePlot(object = so_qc, features = "pMt_RNA", reduction = reduction) &
  theme(aspect.ratio = 1)
fplot_2 <- FeaturePlot(object = so_qc, features = "pRp_RNA", reduction = reduction) &
  theme(aspect.ratio = 1)
fplot_3 <- FeaturePlot(object = so_qc, features = "pHb_RNA", reduction = reduction) &
  theme(aspect.ratio = 1)

# One row of three plots: show and save
fplot <- fplot_1 + fplot_2 + fplot_3 + plot_layout(ncol = 3)
options(repr.plot.width = 15, repr.plot.height = 5)
fplot
ggsave(filename = "result/plot/seurat/fplot_2.png", plot = fplot, width = 9, height = 6)

In [None]:
# Cell cycle module scores and the S - G2M score difference on the embedding
fplot_1 <- FeaturePlot(object = so_qc, features = "msG0_RNA1", reduction = reduction) &
  theme(aspect.ratio = 1)
fplot_2 <- FeaturePlot(object = so_qc, features = "msInt_RNA1", reduction = reduction) &
  theme(aspect.ratio = 1)
fplot_3 <- FeaturePlot(object = so_qc, features = "msCC_diff_RNA", reduction = reduction) &
  theme(aspect.ratio = 1)

# One row of three plots: show and save
fplot <- fplot_1 + fplot_2 + fplot_3 + plot_layout(ncol = 3)
options(repr.plot.width = 15, repr.plot.height = 5)
fplot
ggsave(filename = "result/plot/seurat/fplot_3.png", plot = fplot, width = 9, height = 6)

# Bar plot cluster overlap

In [None]:
# Composition of each cluster as stacked fractions (bars are filled to 1),
# once by tissue and once by treatment
cluster_tissue <- ggplot(data = so_qc@meta.data, mapping = aes(x = seurat_clusters, fill = tissue)) +
  geom_bar(stat = "count", position = "fill") +
  scale_fill_manual(values = so_color$tissue) +
  labs(title = "Cluster frequency", x = "Cluster", y = "Cell frequency") +
  theme(legend.position = "bottom")

cluster_treatment <- ggplot(data = so_qc@meta.data, mapping = aes(x = seurat_clusters, fill = treatment)) +
  geom_bar(stat = "count", position = "fill") +
  scale_fill_manual(values = so_color$treatment) +
  labs(title = "Cluster frequency", x = "Cluster", y = "Cell frequency") +
  theme(legend.position = "bottom")

# Show side by side, save each plot to its own file
options(repr.plot.width = 10, repr.plot.height = 5)
cluster_tissue + cluster_treatment + plot_layout(ncol = 2)
ggsave(filename = "result/plot/seurat/cluster_tissue.png", plot = cluster_tissue, width = 6, height = 3)
ggsave(filename = "result/plot/seurat/cluster_treatment.png", plot = cluster_treatment, width = 6, height = 3)

In [None]:
# Per-cluster box plots of the hemoglobin, mitochondrial and ribosomal
# percentages (displayed only, not saved)
bar_1 <- ggplot(data = so_qc@meta.data, mapping = aes(x = seurat_clusters, y = pHb_RNA)) +
  geom_boxplot()
bar_2 <- ggplot(data = so_qc@meta.data, mapping = aes(x = seurat_clusters, y = pMt_RNA)) +
  geom_boxplot()
bar_3 <- ggplot(data = so_qc@meta.data, mapping = aes(x = seurat_clusters, y = pRp_RNA)) +
  geom_boxplot()

options(repr.plot.width = 10, repr.plot.height = 10)
bar_1 + bar_2 + bar_3 + plot_layout(ncol = 2)

# Cluster marker 

## FACS marker 
CD45 - Ptprc for Immune cells 
CD71 - Tfrc, erythroblast marker; CD71 is an integral membrane protein that mediates the uptake of transferrin-iron complexes

In [None]:
# Expression of the FACS marker genes on the embedding
fplot_1 <- FeaturePlot(so_qc, reduction = reduction, features = "Ptprc") & theme(aspect.ratio = 1) & ggtitle("Ptprc (CD45)")
fplot_2 <- FeaturePlot(so_qc, reduction = reduction, features = "Cd34") & theme(aspect.ratio = 1) & ggtitle("Cd34")
fplot_3 <- FeaturePlot(so_qc, reduction = reduction, features = "Kit") & theme(aspect.ratio = 1) & ggtitle("Kit")
fplot_4 <- FeaturePlot(so_qc, reduction = reduction, features = "Tfrc") & theme(aspect.ratio = 1) & ggtitle("Tfrc (CD71)")
fplot_5 <- FeaturePlot(so_qc, reduction = reduction, features = "Gata2") & theme(aspect.ratio = 1) & ggtitle("Gata2")
fplot_6 <- FeaturePlot(so_qc, reduction = reduction, features = "Gata1") & theme(aspect.ratio = 1) & ggtitle("Gata1")

fplot <- fplot_1 + fplot_2 + fplot_3 + fplot_4 + fplot_5 + fplot_6 + plot_layout(ncol = 3)
options(repr.plot.width = 15, repr.plot.height = 10)
fplot
# Saved under its own file name: "fplot_2.png" already holds the Mt/Rp/Hb
# percentage figure and would be overwritten.
ggsave(fplot, filename = "result/plot/seurat/fplot_facs.png", width = 9, height = 6)

## Histocompatibility Class I
H2-K1 - Class I histocompatibility antigen, kappa-B alpha chain   
H2-D1 - Class I histocompatibility antigen, D-B alpha chain   
H2-L1 - not found   

## Histocompatibility Class II
H2-Aa - Class II histocompatibility antigen, A-B alpha chain   
H2-Ab1 - Class II histocompatibility antigen, A beta chain   

H2-Eb1 - Class II histocompatibility antigen, I-E beta chain   
H2-Eb2 - Class II histocompatibility antigen, E beta chain  

In [None]:
# Gene sets scored below
mhcI_genes <- c("H2-K1", "H2-D1")
mhcII_genes <- c("H2-Aa", "H2-Ab1", "H2-Eb1", "H2-Eb2")
cd4_genes <- c("Cd4") 
cd8_genes <- c("Cd8a")

# One module score per gene set. The argument is `assay` (singular);
# AddModuleScore appends the set index to `name`, hence the "1" suffix of the
# columns plotted below.
so_qc <- AddModuleScore(so_qc, features = list(mhcI_genes), assay = "RNA", slot = "data", ctrl = 100, nbin = 25, name = "msMHCI_RNA")
so_qc <- AddModuleScore(so_qc, features = list(mhcII_genes), assay = "RNA", slot = "data", ctrl = 100, nbin = 25, name = "msMHCII_RNA")
so_qc <- AddModuleScore(so_qc, features = list(cd4_genes), assay = "RNA", slot = "data", ctrl = 100, nbin = 25, name = "msCd4_RNA")
so_qc <- AddModuleScore(so_qc, features = list(cd8_genes), assay = "RNA", slot = "data", ctrl = 100, nbin = 25, name = "msCd8_RNA")

fplot_mhcI <- FeaturePlot(so_qc, reduction = reduction, features = "msMHCI_RNA1") & theme(aspect.ratio = 1) & ggtitle("MHC class I")
fplot_mhcII <- FeaturePlot(so_qc, reduction = reduction, features = "msMHCII_RNA1") & theme(aspect.ratio = 1) & ggtitle("MHC class II")
fplot_cd4 <- FeaturePlot(so_qc, reduction = reduction, features = "msCd4_RNA1") & theme(aspect.ratio = 1) & ggtitle("CD4")
fplot_cd8 <- FeaturePlot(so_qc, reduction = reduction, features = "msCd8_RNA1") & theme(aspect.ratio = 1) & ggtitle("CD8")

# 2 x 2 grid: show and save
fplot_mhc <- fplot_mhcI + fplot_mhcII + fplot_cd4 + fplot_cd8 + plot_layout(ncol = 2)
options(repr.plot.width = 15, repr.plot.height = 10)
fplot_mhc
ggsave(fplot_mhc, filename = "result/plot/seurat/fplot_mhc.png", width = 6, height = 6)

## T-cell TCR-CD3 complex 
Trac - T cell receptor alpha constant  
Trbc1 - T cell receptor beta constant 1  
Trbc2 - T cell receptor beta constant 2  
Trdc - T cell receptor delta constant  
Trgc1 - T cell receptor gamma constant 1  
Trgc2 - T cell receptor gamma constant 2  

Cd247 - T-cell surface glycoprotein Cd3 zeta chain (Cd3z)  
Cd3g - T-cell surface glycoprotein Cd3 gamma chain   
Cd3e - T-cell surface glycoprotein Cd3 epsilon chain   
Cd3d - T-cell surface glycoprotein Cd3 delta chain   

In [None]:
# Gene sets scored below
tcr_genes <- c("Trac", "Trbc1", "Trbc2", "Trdc", "Trgc1", "Trgc2")
tcr_cd3_genes <- c("Cd247", "Cd3g", "Cd3e", "Cd3d")

# One module score per gene set. The argument is `assay` (singular);
# AddModuleScore appends the set index to `name`, hence the "1" suffix of the
# columns plotted below.
so_qc <- AddModuleScore(so_qc, features = list(tcr_genes), assay = "RNA", slot = "data", ctrl = 100, nbin = 25, name = "msTcr_RNA")
so_qc <- AddModuleScore(so_qc, features = list(tcr_cd3_genes), assay = "RNA", slot = "data", ctrl = 100, nbin = 25, name = "msTcr_cd3_RNA")

fplot_tcr <- FeaturePlot(so_qc, reduction = reduction, features = "msTcr_RNA1") & theme(aspect.ratio = 1) & ggtitle("TCR")
fplot_cd3 <- FeaturePlot(so_qc, reduction = reduction, features = "msTcr_cd3_RNA1") & theme(aspect.ratio = 1) & ggtitle("Cd247/Cd3 family")

# Both plots side by side: show and save. The single CD3 plot has its own
# name so that fplot_tcr_cd3 only ever holds the combined figure.
fplot_tcr_cd3 <- fplot_tcr + fplot_cd3 + plot_layout(ncol = 2)
options(repr.plot.width = 10, repr.plot.height = 5)
fplot_tcr_cd3
ggsave(fplot_tcr_cd3, filename = "result/plot/seurat/fplot_tcr_cd3.png", width = 6, height = 3)

## B-cell Immunoglobulin complex 
Naive B-cells produce the following Ig classes:  
Ighm - Immunoglobulin heavy constant mu (naive B-cells)  
Ighd - Immunoglobulin heavy constant delta (naive B-cells)  

Through isotype switching the following Ig classes can be produced:   
Ighg1 - Immunoglobulin heavy constant gamma (mice with Igh1b have the Igg2c isotype instead of Igg2a)  
Ighg2a - Immunoglobulin heavy constant gamma (NA)  
Ighg2b - Immunoglobulin heavy constant gamma  
Ighg2c - Immunoglobulin heavy constant gamma   
Ighg3 - Immunoglobulin heavy constant gamma   
Igha - Immunoglobulin heavy constant alpha   
Ighe - Immunoglobulin heavy constant epsilon (NA)  

Igkc - Immunoglobulin kappa constant (light chain)  
Iglc - Immunoglobulin lambda constant (light chain)  

In [None]:
# Immunoglobulin gene sets, one per heavy chain class / light chain type
Ighm_genes <- c("Ighm")
Ighd_genes <- c("Ighd")
Ighg_genes <- c("Ighg1", "Ighg2a", "Ighg2b", "Ighg2c", "Ighg3")
Igha_genes <- c("Igha")
Ighe_genes <- c("Ighe") 
Igkc_genes <- c("Igkc")
Iglc_genes <- c("Iglc1", "Iglc2", "Iglc3")

# One module score per gene set. The argument is `assay` (singular);
# AddModuleScore appends the set index to `name`, hence the "1" suffix of the
# columns plotted below.
so_qc <- AddModuleScore(so_qc, features = list(Ighm_genes), assay = "RNA", slot = "data", ctrl = 100, nbin = 25, name = "msIghm_RNA")
so_qc <- AddModuleScore(so_qc, features = list(Ighd_genes), assay = "RNA", slot = "data", ctrl = 100, nbin = 25, name = "msIghd_RNA")
so_qc <- AddModuleScore(so_qc, features = list(Ighg_genes), assay = "RNA", slot = "data", ctrl = 100, nbin = 25, name = "msIghg_RNA")
so_qc <- AddModuleScore(so_qc, features = list(Igha_genes), assay = "RNA", slot = "data", ctrl = 100, nbin = 25, name = "msIgha_RNA")
# The Ighe score stays disabled; the gene notes mark Ighe as NA, so the gene
# is presumably absent from the data - TODO confirm.
#so_qc <- AddModuleScore(so_qc, features = list(Ighe_genes), assay = "RNA", slot = "data", ctrl = 100, nbin = 25, name = "msIghe_RNA")
so_qc <- AddModuleScore(so_qc, features = list(Igkc_genes), assay = "RNA", slot = "data", ctrl = 100, nbin = 25, name = "msIgkc_RNA")
so_qc <- AddModuleScore(so_qc, features = list(Iglc_genes), assay = "RNA", slot = "data", ctrl = 100, nbin = 25, name = "msIglc_RNA")

fplot_Ighm <- FeaturePlot(so_qc, reduction = reduction, features = "msIghm_RNA1") & theme(aspect.ratio = 1) & ggtitle("Ighm")
fplot_Ighd <- FeaturePlot(so_qc, reduction = reduction, features = "msIghd_RNA1") & theme(aspect.ratio = 1) & ggtitle("Ighd")
fplot_Ighg <- FeaturePlot(so_qc, reduction = reduction, features = "msIghg_RNA1") & theme(aspect.ratio = 1) & ggtitle("Ighg")
fplot_Igha <- FeaturePlot(so_qc, reduction = reduction, features = "msIgha_RNA1") & theme(aspect.ratio = 1) & ggtitle("Igha")
fplot_Igkc <- FeaturePlot(so_qc, reduction = reduction, features = "msIgkc_RNA1") & theme(aspect.ratio = 1) & ggtitle("Igkc")
fplot_Iglc <- FeaturePlot(so_qc, reduction = reduction, features = "msIglc_RNA1") & theme(aspect.ratio = 1) & ggtitle("Iglc")

# 3 x 2 grid: show and save
fplot_ig <- fplot_Ighm + fplot_Ighd + fplot_Ighg + fplot_Igha + fplot_Igkc + fplot_Iglc + plot_layout(ncol = 3)
options(repr.plot.width = 15, repr.plot.height = 10)
fplot_ig
ggsave(fplot_ig, filename = "result/plot/seurat/fplot_ig.png", width = 9, height = 6)

# Save output 

In [None]:
# Write one barcodes.qc.tsv per sample into its 10x sample directory. Each
# file lists the cell_id of every cell of that sample that is still in so_qc.
sample_paths <- unique(so_qc$sample_path)
# Cells without a sample_path have no directory to write to
sample_paths <- sample_paths[!is.na(sample_paths)]

for (sample_path in sample_paths) {

    tsv_file <- paste0(sample_path, "/filtered_feature_bc_matrix/barcodes.qc.tsv")
    print(tsv_file)

    # %in% never returns NA, so cells with a missing sample_path cannot add
    # NA entries to the barcode list.
    in_sample <- so_qc@meta.data$sample_path %in% sample_path
    cell_id <- so_qc@meta.data$cell_id[in_sample]

    # One barcode per line, no header, no quoting
    write.table(x = cell_id,
                file = tsv_file,
                row.names = FALSE, 
                col.names = FALSE, 
                sep = "\t", 
                quote = FALSE)

}

In [None]:
# Write the filtered and annotated Seurat object to so_qc_file
saveRDS(so_qc, so_qc_file)

It is only possible to store two out of the three count matrices. Data can be recovered from counts but not vice versa. However, data cannot be removed and is prioritized over counts. Therefore we copy the counts into the data slot so that the counts get exported.  

In [None]:
# Write h5ad
# Start from the object that was just written to so_qc_file.
so_qc <- readRDS(so_qc_file)
# Overwrite the data slot with the counts matrix so that the counts are what
# gets exported. From here on so_qc in this session no longer holds the
# normalized data.
so_qc@assays$RNA@data <- so_qc@assays$RNA@counts
# Save as h5Seurat first, then convert that file to h5ad.
SaveH5Seurat(so_qc, filename = h5ad_qc_file, overwrite = TRUE)
Convert(h5ad_qc_file, dest = "h5ad", overwrite = TRUE)

# Session info

In [None]:
# Print the R version and the loaded package versions of this run.
sessionInfo()