In [188]:
library(Matrix)
suppressPackageStartupMessages(library(Seurat))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(viridis))
library(glmGamPoi)
library(RColorBrewer)
library(stringr)
options(future.globals.maxSize = 16000 * 1024^2)


In [189]:
#setwd("../../snrna/")
# All relative paths below (ref/, scrublet/, plots/, seurat/) resolve against this directory.
setwd("/share/crsp/lab/seyedam/share/enc4_mouse/snrna/")
# Tissue processed by this notebook; used to filter `meta` and to name the output files.
tissue <- "C2C12"
# Metadata table; joined onto the per-cell barcodes by library_accession in get_metadata().
meta <- read.delim("ref/enc4_mouse_snrna_metadata.tsv")

# Functions
# Read one batch's sparse count matrix from scrublet/ and attach dimnames.
#
# batch: batch identifier used as the file-name prefix under scrublet/.
# Returns (invisibly) the matrix from readMM(), with rows named by the first
# column of <batch>_genes.tsv and columns named by the cellID column of
# <batch>_barcodes_scrublet.tsv.
get_counts <- function(batch) {
    prefix <- paste0("scrublet/", batch)

    mat <- readMM(paste0(prefix, "_matrix.mtx"))
    cell_info <- read.delim(paste0(prefix, "_barcodes_scrublet.tsv"), header = FALSE,
                            col.names = c("cellID", "doublet_scores", "doublets"))
    gene_info <- read.delim(paste0(prefix, "_genes.tsv"), header = FALSE)

    colnames(mat) <- cell_info$cellID
    rownames(mat) <- gene_info$V1

    invisible(mat)
}

# Read one batch's per-cell scrublet table and annotate it with library metadata.
#
# batch: batch identifier used as the file-name prefix under scrublet/.
# Returns (invisibly) a data frame with one row per barcode: cellID,
# doublet_scores, doublets, library_accession, plus every column of the
# global `meta` table matched on library_accession.
get_metadata <- function(batch) {
    cell_info <- read.delim(paste0("scrublet/", batch, "_barcodes_scrublet.tsv"), header = FALSE,
                            col.names = c("cellID", "doublet_scores", "doublets"))

    # cellID is split on "."; the second piece is used as the library accession
    id_parts <- do.call("rbind", strsplit(cell_info$cellID, "[.]"))
    cell_info$library_accession <- id_parts[, 2]

    annotated <- left_join(cell_info, meta, by = "library_accession")
    invisible(annotated)
}

# Merge the count matrices of several experimental "batches" into one matrix.
#
# batches_list: vector of batch identifiers, each passed to get_counts().
# Returns a single sparse matrix: the first batch's matrix when only one batch
# is given, otherwise the left-to-right RowMergeSparseMatrices() merge of all
# batches (same order as batches_list).
# Stops with an error when batches_list is empty, instead of failing on an
# out-of-range index as 1:length() would.
merge_counts <- function(batches_list) {
    if (length(batches_list) == 0) {
        stop("merge_counts(): 'batches_list' is empty; no count matrices to read",
             call. = FALSE)
    }

    matrix_list <- lapply(seq_along(batches_list),
                          function(i) get_counts(batches_list[i]))

    # Reduce() folds pairwise from the left and returns the single element
    # unchanged when there is only one batch.
    Reduce(RowMergeSparseMatrices, matrix_list)
}

# Stack the per-cell metadata of several experimental "batches".
#
# batches_list: vector of batch identifiers, each passed to get_metadata().
# Returns one data frame with the rows of every batch, in batches_list order
# (the same order merge_counts() uses for the matrix columns).
# Stops with an error when batches_list is empty, instead of failing on an
# out-of-range index as 1:length() would.
merge_metadata <- function(batches_list) {
    if (length(batches_list) == 0) {
        stop("merge_metadata(): 'batches_list' is empty; no metadata to read",
             call. = FALSE)
    }

    meta_list <- lapply(seq_along(batches_list),
                        function(i) get_metadata(batches_list[i]))

    # Bind once instead of growing a data frame inside a loop; this also avoids
    # a local variable named `meta` shadowing the global metadata table.
    do.call(rbind, meta_list)
}

# Build a Seurat object from a count matrix and its per-cell metadata.
#
# counts:   feature x cell matrix with cell IDs as column names.
# metadata: data frame with one row per cell, in the same order as the
#           columns of `counts`.
# Returns the Seurat object with `metadata` appended to its meta.data and the
# percent.mt / percent.ribo columns added.
# Stops with an error if `metadata` does not line up with the cells, because
# cbind() would otherwise attach rows to the wrong cells (or recycle them)
# without any warning.
seurat_obj <- function(counts, metadata) {
    obj <- CreateSeuratObject(counts = counts, min.cells = 0, min.features = 0)

    if (nrow(metadata) != ncol(obj)) {
        stop("seurat_obj(): metadata has ", nrow(metadata), " rows but the object has ",
             ncol(obj), " cells", call. = FALSE)
    }
    # When a cellID column is present it must match the cell names exactly and in order.
    if ("cellID" %in% colnames(metadata) &&
        !identical(as.character(metadata[["cellID"]]), colnames(obj))) {
        stop("seurat_obj(): metadata$cellID does not match the cell names of 'counts'",
             call. = FALSE)
    }

    obj@meta.data <- cbind(obj@meta.data, metadata)
    # Percentage of counts from features matching the mitochondrial / ribosomal name patterns
    obj[["percent.mt"]] <- PercentageFeatureSet(obj, pattern = "^mt-")
    obj[["percent.ribo"]] <- PercentageFeatureSet(obj, pattern = "^Rp[sl][[:digit:]]|^Rplp[[:digit:]]|^Rpsa")
    obj
}


In [190]:
# Read in data
# Build one Seurat object for the Parse data with the helper functions defined above.
# Keep only the metadata rows for the tissue handled by this notebook.
meta <- meta[meta$tissue == tissue,]

# Experimental batches that were run with the Parse technology
is_parse <- meta$technology == "Parse"
parse_batches <- unique(meta$experiment_batch[is_parse])

# Both helpers walk the batches in the same order, so metadata rows line up with matrix columns.
parse_meta <- merge_metadata(parse_batches)
parse_counts <- merge_counts(parse_batches)

# Make Seurat object
obj_parse <- seurat_obj(counts = parse_counts, metadata = parse_meta)

obj_parse

"Non-unique features (rownames) present in the input matrix, making unique"


An object of class Seurat 
47721 features across 25615 samples within 1 assay 
Active assay: RNA (47721 features, 0 variable features)

In [191]:
# Filter by unique cells in original object
# NOTE(review): this .rda is expected to define `mb_mt`, which is used immediately afterwards — confirm.
load(file = "ref/mb_mt_36869_cells_20clusters.rda")


In [192]:
# Derive a cellID for every cell of mb_mt from its column name:
# strip the fixed alignment prefix, then keep the text before the first "_".
raw_ids <- colnames(mb_mt)
trimmed_ids <- gsub("single_cells_barcoded_headAligned_sorted_ZE3GQ:", "", raw_ids)
id_parts <- do.call("rbind", strsplit(as.character(trimmed_ids), "_"))
mb_mt$cellID <- id_parts[, 1]


In [193]:
# Cluster number -> subtype label for the MT_nuclei libraries.
# The rules are applied in this order with whole-word gsub() matching, which is
# what the original one-gsub()-per-cluster code did.
# NOTE(review): the original also contained a second rule mapping cluster 7 to
# "Pax7hi_nuclei", but it ran after 7 had already been rewritten to "MB_nuclei"
# and therefore never matched. The effective behaviour (7 -> "MB_nuclei") is
# kept here; confirm which label cluster 7 is really meant to get.
mt_cluster_labels <- c(
    setNames(rep("MB_cells", 2), 1:2),
    setNames(rep("MB_nuclei", 5), 3:7),
    setNames(rep("Pax7hi_nuclei", 8), 8:15),
    setNames(rep("Myoghi_nuclei", 5), 16:20)
)

# Replace each cluster number in `clusters` by its subtype label.
# Values that match no rule are returned unchanged.
relabel_mt_clusters <- function(clusters) {
    for (cluster_id in names(mt_cluster_labels)) {
        clusters <- gsub(paste0("\\<", cluster_id, "\\>"),
                         mt_cluster_labels[[cluster_id]], clusters)
    }
    clusters
}

# ---- "1k" library: one subset per sample type, cellID suffixed with its library accession ----
mb_mt_1k <- subset(mb_mt, subset = Library == "1k")

mb_mt_1k_mb_nuc <- subset(mb_mt_1k, subset = SampleType == "MB_nuclei")
mb_mt_1k_mb_nuc$cellID <- paste0(mb_mt_1k_mb_nuc$cellID,".ENCLB514RKT")
mb_mt_1k_mb_nuc$subtypes <- mb_mt_1k_mb_nuc$SampleType

mb_mt_1k_mb_cells <- subset(mb_mt_1k, subset = SampleType == "MB_cells")
mb_mt_1k_mb_cells$cellID <- paste0(mb_mt_1k_mb_cells$cellID,".ENCLB952MZJ")
mb_mt_1k_mb_cells$subtypes <- mb_mt_1k_mb_cells$SampleType

mb_mt_1k_mt <- subset(mb_mt_1k, subset = SampleType == "MT_nuclei")
mb_mt_1k_mt$cellID <- paste0(mb_mt_1k_mt$cellID,".ENCLB273ZWE")
# MT_nuclei subtypes come from the cluster assignment rather than the sample type
mb_mt_1k_mt$subtypes <- relabel_mt_clusters(mb_mt_1k_mt$final_clusters_ordered)

# ---- every other library ("9k"): same layout, but any cellID that occurs more than
# ---- once within a subset is dropped entirely (all copies are removed)
mb_mt_9k <- subset(mb_mt, subset = Library != "1k")

mb_mt_9k_mb_nuc <- subset(mb_mt_9k, subset = SampleType == "MB_nuclei")
mb_mt_9k_mb_nuc$cellID <- paste0(mb_mt_9k_mb_nuc$cellID,".ENCLB294FBZ")
mb_mt_9k_mb_nuc <- subset(mb_mt_9k_mb_nuc, cellID %in% mb_mt_9k_mb_nuc$cellID[!(duplicated(mb_mt_9k_mb_nuc$cellID)|duplicated(mb_mt_9k_mb_nuc$cellID, fromLast=TRUE))])
mb_mt_9k_mb_nuc$subtypes <- mb_mt_9k_mb_nuc$SampleType

mb_mt_9k_mb_cells <- subset(mb_mt_9k, subset = SampleType == "MB_cells")
mb_mt_9k_mb_cells$cellID <- paste0(mb_mt_9k_mb_cells$cellID,".ENCLB129LMS")
mb_mt_9k_mb_cells <- subset(mb_mt_9k_mb_cells, cellID %in% mb_mt_9k_mb_cells$cellID[!(duplicated(mb_mt_9k_mb_cells$cellID)|duplicated(mb_mt_9k_mb_cells$cellID, fromLast=TRUE))])
mb_mt_9k_mb_cells$subtypes <- mb_mt_9k_mb_cells$SampleType

mb_mt_9k_mt <- subset(mb_mt_9k, subset = SampleType == "MT_nuclei")
mb_mt_9k_mt$cellID <- paste0(mb_mt_9k_mt$cellID,".ENCLB527JDU")
mb_mt_9k_mt <- subset(mb_mt_9k_mt, cellID %in% mb_mt_9k_mt$cellID[!(duplicated(mb_mt_9k_mt$cellID)|duplicated(mb_mt_9k_mt$cellID, fromLast=TRUE))])
mb_mt_9k_mt$subtypes <- relabel_mt_clusters(mb_mt_9k_mt$final_clusters_ordered)


                            

In [194]:
# Combine the six per-library subsets back into a single object.
mb_mt_unique <- merge(
    x = mb_mt_1k_mb_nuc,
    y = c(mb_mt_1k_mb_cells, mb_mt_1k_mt,
          mb_mt_9k_mb_nuc, mb_mt_9k_mb_cells, mb_mt_9k_mt)
)

In [195]:
# bc file
# Lookup table: round-1 well index -> round-1 barcode sequence
bc <- read.csv("ref/Rnd1_bc_8nt_v2.csv", header = FALSE)
colnames(bc) <- c("Index", "rnd1_Barcode")

# One row per cell of obj_parse; `barcode` is the object's own cell name
barcodes <- data.frame(barcode = colnames(obj_parse))

# Cell names are split on "_": the first piece is kept as rnd3_rnd2, and the
# numeric part of the second piece (the text before ".") is the round-1 index.
barcode_parts <- do.call("rbind", strsplit(as.character(barcodes$barcode), "_"))
barcodes$rnd3_rnd2 <- barcode_parts[, 1]
barcodes$Index <- as.numeric(do.call("rbind", strsplit(barcode_parts[, 2], "[.]"))[, 1])
barcodes$library <- obj_parse$library_accession

# Join keys are spelled out so an extra shared column can never silently
# change the join (and so the "Joining, by = ..." messages go away).
barcodes <- left_join(barcodes, bc, by = "Index")

# Rebuild the cellID in the format used by mb_mt_unique, then keep only the
# cells present there and pull in their annotations.
barcodes$cellID <- paste0(barcodes$rnd3_rnd2, barcodes$rnd1_Barcode, ".", barcodes$library)
barcodes <- barcodes[barcodes$cellID %in% mb_mt_unique$cellID,]
barcodes <- left_join(barcodes,
                      mb_mt_unique@meta.data[,c("cellID","final_clusters_ordered","SampleType","CellType","subtypes")],
                      by = "cellID")

# General cell type: expand the MB / MT abbreviations (whole-word matches only)
barcodes$gen_celltype <- barcodes$CellType
barcodes$gen_celltype <- gsub("\\<MB\\>","Myoblast",barcodes$gen_celltype)
barcodes$gen_celltype <- gsub("\\<MT\\>","Myotube",barcodes$gen_celltype)
barcodes$celltypes <- barcodes$SampleType
barcodes <- barcodes[barcodes$barcode %in% obj_parse$cellID,]

[1m[22mJoining, by = "Index"
[1m[22mJoining, by = "cellID"


In [197]:
# Filter
# Keep only the cells listed in `barcodes`, then put the rows of `barcodes`
# in the same order as the cells of the filtered object.
obj_parse <- subset(obj_parse, subset = cellID %in% barcodes$barcode)
row_order <- match(obj_parse$cellID, barcodes$barcode)
barcodes <- barcodes[row_order,]

In [198]:
# Sanity check: every row of `barcodes` should line up with the matching cell (expect only TRUE).
table(obj_parse$cellID == barcodes$barcode)


 TRUE 
23699 

In [202]:
# Copy the three annotation columns from `barcodes` (already in cell order) onto the object.
for (annotation in c("gen_celltype", "celltypes", "subtypes")) {
    obj_parse[[annotation]] <- barcodes[[annotation]]
}


In [203]:
# Split the object by the depth1 metadata column.
obj_parse_deep <- subset(obj_parse, subset = depth1 == "deep")
obj_parse_standard <- subset(obj_parse, subset = depth1 == "shallow")


In [204]:
# Filter
#Use QC information in metadata to filter by # UMIs and # genes detected per nucleus as well as doublet scores and percent mitochondrial gene expression.

# Keep the cells of `obj` that pass the QC thresholds stored in its metadata.
#
# obj: Seurat object whose meta.data holds the measured values (nCount_RNA,
#      nFeature_RNA, doublet_scores, percent.mt) and the threshold columns
#      (lower_nCount_RNA, upper_nCount_RNA, lower_nFeature_RNA,
#      upper_doublet_scores, upper_percent.mt).
# Returns the subset of `obj` containing only the passing cells.
#
# Each cell is compared against the thresholds on its own metadata row. This
# gives the same result as comparing against unique(threshold) when a group
# has a single threshold value, and stays correct (instead of silently
# recycling) when a group mixes libraries with different thresholds.
filter_by_qc_thresholds <- function(obj) {
    md <- obj@meta.data
    passes <- md[["nCount_RNA"]] > md[["lower_nCount_RNA"]] &
        md[["nCount_RNA"]] < md[["upper_nCount_RNA"]] &
        md[["nFeature_RNA"]] > md[["lower_nFeature_RNA"]] &
        md[["doublet_scores"]] < md[["upper_doublet_scores"]] &
        md[["percent.mt"]] < md[["upper_percent.mt"]]
    # which() drops cells whose comparison is NA
    subset(obj, cells = rownames(md)[which(passes)])
}

# Split by depth, then into whole cells vs. nuclei
obj_parse_standard <- subset(obj_parse, subset = depth1 == "shallow")
obj_parse_standard_cells <- subset(obj_parse_standard, subset = sample == "C2C12_0hr_cells")
obj_parse_standard_nuclei <- subset(obj_parse_standard, subset = sample != "C2C12_0hr_cells")

obj_parse_deep <- subset(obj_parse, subset = depth1 == "deep")
obj_parse_deep_cells <- subset(obj_parse_deep, subset = sample == "C2C12_0hr_cells")
obj_parse_deep_nuclei <- subset(obj_parse_deep, subset = sample != "C2C12_0hr_cells")

# Apply the same QC rule to each of the four groups
obj_parse_standard_cells <- filter_by_qc_thresholds(obj_parse_standard_cells)
obj_parse_standard_nuclei <- filter_by_qc_thresholds(obj_parse_standard_nuclei)
obj_parse_deep_cells <- filter_by_qc_thresholds(obj_parse_deep_cells)
obj_parse_deep_nuclei <- filter_by_qc_thresholds(obj_parse_deep_nuclei)

# Recombine in the original group order
obj_parse_filt <- merge(obj_parse_standard_cells,c(obj_parse_standard_nuclei,
                      obj_parse_deep_cells,obj_parse_deep_nuclei))


In [None]:
## SCT , PCA, etc. No need to integrate based on depth based on previous work
obj_parse_filt <- SCTransform(
    obj_parse_filt,
    method = "glmGamPoi",
    vars.to.regress = c("percent.mt", "nFeature_RNA"),
    verbose = FALSE
)

# Dimensionality reduction and clustering
# PCA: 50 components are computed, the first 30 feed UMAP and the neighbour graph
obj_parse_filt <- RunPCA(obj_parse_filt, verbose = TRUE, npcs = 50)
pca_dims <- 1:30

# UMAP and clustering
obj_parse_filt <- RunUMAP(obj_parse_filt, reduction = "pca", dims = pca_dims, verbose = FALSE)
obj_parse_filt <- FindNeighbors(obj_parse_filt, reduction = "pca", dims = pca_dims, verbose = FALSE)
obj_parse_filt <- FindClusters(obj_parse_filt, resolution = 1.6, verbose = FALSE)


In [206]:
# Add cell cycle scores
# NOTE(review): this .rda is expected to provide m.s.genes and m.g2m.genes, which are used below — confirm.
load(file = "ref/mouse_cellcycle_genes.rda")
# Score on the SCT assay
DefaultAssay(obj_parse_filt) <- "SCT"
obj_parse_filt <- CellCycleScoring(
    obj_parse_filt,
    s.features = m.s.genes,
    g2m.features = m.g2m.genes
)


# Plot annotations

In [210]:
# Create the output directory for the annotation plots.
# dir.create() is portable (no shell needed); recursive = TRUE also creates
# plots/ and plots/c2c12/ when they are missing, and showWarnings = FALSE
# keeps re-runs quiet when the directory already exists.
dir.create("plots/c2c12/annotation", recursive = TRUE, showWarnings = FALSE)

In [214]:
# Colour reference table: one colour column for each of the three annotation levels
color_ref <- read.delim(
    "ref/enc4_mouse_snrna_celltypes_c2c12.csv",
    sep = ",",
    col.names = c("tissue", "gen_celltype", "celltypes",
                  "subtypes", "gen_celltype_color",
                  "celltype_color", "subtype_color")
)

# Distinct gen_celltype/colour pairs for C2C12, indexed by name and then
# reduced to the labels present in the object, in sorted order.
is_c2c12 <- color_ref$tissue == "C2C12"
gen_celltype_colors <- unique(color_ref[is_c2c12, c("gen_celltype", "gen_celltype_color")])
rownames(gen_celltype_colors) <- gen_celltype_colors$gen_celltype
labels_present <- sort(unique(obj_parse_filt$gen_celltype))
gen_celltype_colors <- gen_celltype_colors[labels_present,]

pdf(file = "plots/c2c12/annotation/UMAP_final_gen_celltype.pdf",
    width = 15, height = 10)

DimPlot(obj_parse_filt,
        reduction = "umap",
        group.by = "gen_celltype",
        label = TRUE,
        label.size = 8,
        repel = TRUE,
        cols = gen_celltype_colors$gen_celltype_color)

dev.off()

In [216]:
# Distinct celltype/colour pairs for C2C12, indexed by name and then reduced
# to the labels present in the object, in sorted order.
celltype_colors <- unique(color_ref[color_ref$tissue == "C2C12",c("celltypes","celltype_color")])
rownames(celltype_colors) <- celltype_colors$celltypes
celltype_colors <- celltype_colors[sort(unique(obj_parse_filt$celltypes)),]

# Write next to the other C2C12 annotation plots. The original wrote to
# plots/cortex/annotation, a directory this notebook never creates and which
# belongs to a different tissue.
pdf(file="plots/c2c12/annotation/UMAP_final_celltypes.pdf",
    width = 15, height = 10)

DimPlot(obj_parse_filt, reduction = "umap",
        group.by = "celltypes",
        label = TRUE, label.size = 8, repel = TRUE,
        cols = celltype_colors$celltype_color)

dev.off()

In [218]:
# Distinct subtype/colour pairs for C2C12, indexed by name and then reduced
# to the labels present in the object, in sorted order.
subtype_colors <- unique(color_ref[color_ref$tissue == "C2C12",c("subtypes","subtype_color")])
rownames(subtype_colors) <- subtype_colors$subtypes
subtype_colors <- subtype_colors[sort(unique(obj_parse_filt$subtypes)),]

# Write next to the other C2C12 annotation plots. The original wrote to
# plots/cortex/annotation, a directory this notebook never creates and which
# belongs to a different tissue.
pdf(file="plots/c2c12/annotation/UMAP_final_subtypes.pdf",
    width = 15, height = 10)

DimPlot(obj_parse_filt, reduction = "umap",
        group.by = "subtypes",
        label = TRUE, label.size = 8, repel = TRUE,
        cols = subtype_colors$subtype_color)

dev.off()

In [219]:
# SAVE
# Both outputs go under seurat/ and are named after the lower-cased tissue.
tissue_tag <- str_to_lower(tissue)
saveRDS(obj_parse_filt,
        file = paste0("seurat/", tissue_tag, "_Parse_integrated.rds"))
write.csv(obj_parse_filt@meta.data,
          file = paste0("seurat/", tissue_tag, "_Parse_integrated_metadata.csv"))
