# Load libraries and Themes

In [1]:
suppressPackageStartupMessages({
        suppressWarnings({
        library(Seurat)
        library(SoupX)
        library(ggplot2)
        library(tidyverse)
        library(scDblFinder)
        library(harmony)
        library(SeuratDisk)
        library(SingleCellExperiment)
        library(dplyr)
        library(ggpubr)
        library(pals)
        library(viridis)
        library(scCustomize)
            })})

In [None]:
# Work from the project directory: the relative output paths used further
# down (e.g. "seurat_objects/...") are resolved against it.
setwd("/media/daten/dmeral/scseq_analysis/2024_LV_CTRL_ALDO_REC")

In [None]:
# Fix the random number generator seed for the steps that follow.
set.seed(1234)

# SoupX (remove ambient signal)

In [None]:
# Contamination fraction handed to SoupX's setContaminationFraction() in
# make_soup(); it is also embedded in the names of the files written below.
ContaminationFraction <- 0.15

In [None]:
# Sample identifiers: each one is inserted into the Cell Ranger input path of
# its sample and used to name the per-sample objects.
samples <- paste0("LV_", c(2108:2115, 2310:2313))

In [None]:
# Simple pre-process

# Flag cells whose value of one metadata column is a MAD-based outlier.
#
# sobj:   object with a `meta.data` slot holding per-cell metrics.
# metric: name of the column in `sobj@meta.data` to test.
# nmads:  number of median absolute deviations tolerated on either side of
#         the median.
#
# Returns a logical vector that is TRUE where the value lies more than
# `nmads` MADs below or above the median. NA values are ignored when the
# median and the MAD are computed.
mad_outlier <- function(sobj, metric, nmads){
  values <- sobj@meta.data[[metric]]
  centre <- median(values, na.rm = TRUE)
  spread <- mad(values, na.rm = TRUE)
  lower_bound <- centre - nmads * spread
  upper_bound <- centre + nmads * spread
  too_low <- values < lower_bound
  too_high <- values > upper_bound
  too_low | too_high
}

# Load the filtered Cell Ranger output of one sample and drop MAD outliers
# Read one sample's filtered Cell Ranger matrix into a Seurat object and drop
# the cells that are MAD outliers on any QC metric.
#
# sample_id: sample directory name; it is inserted into the input path and
#            stored in the `sample_id` metadata column.
#
# Returns the Seurat object restricted to cells that mad_outlier() does not
# flag (5 MADs) for log1p total counts, log1p detected genes or percent.mt.
filter_mad_outliers <- function(sample_id){
  matrix_dir <- paste0("/media/daten/dmeral/scseq/2024_LV_CTRL_ALDO_REC/", sample_id, "/outs/filtered_feature_bc_matrix/")
  counts <- Read10X(data.dir = matrix_dir)
  sobj <- CreateSeuratObject(counts = counts, min.cells = 0, min.features = 200)
  sobj$sample_id <- sample_id

  # QC metrics: log-transformed depth and complexity, plus the percentage of
  # counts coming from features whose name matches "^mt-"
  sobj$log1p_total_counts <- log1p(sobj@meta.data$nCount_RNA)
  sobj$log1p_n_genes_by_counts <- log1p(sobj@meta.data$nFeature_RNA)
  sobj[["percent.mt"]] <- PercentageFeatureSet(sobj, pattern = "^mt-")

  # A cell is kept only if none of the metrics flags it
  qc_metrics <- c("log1p_total_counts", "log1p_n_genes_by_counts", "percent.mt")
  keep <- rep(TRUE, ncol(sobj))
  for (metric in qc_metrics) {
    keep <- keep & !mad_outlier(sobj, metric, 5)
  }

  subset(sobj, cells = which(keep))
}

In [None]:
# Build one QC-filtered Seurat object per sample. lapply() always returns a
# list (sapply() may simplify its result), and setNames() keeps the objects
# addressable by sample id, which later cells rely on.
data_list <- setNames(lapply(samples, filter_mad_outliers), samples)

In [None]:
# Basic seurat normalization and clustering
# Quick per-sample clustering that provides the cell groups for SoupX.
#
# sobj: Seurat object for one sample.
#
# Runs normalisation, selection of 2000 variable features ("vst"), scaling,
# a 35-component PCA, neighbour search on those 35 components and clustering
# at resolution 0.5. Returns the resulting `seurat_clusters` metadata column.
get_soup_groups <- function(sobj){
  clustered <- sobj %>%
    NormalizeData(verbose = FALSE) %>%
    FindVariableFeatures(nfeatures = 2000, verbose = FALSE, selection.method = "vst") %>%
    ScaleData(verbose = FALSE) %>%
    RunPCA(npcs = 35, verbose = FALSE) %>%
    FindNeighbors(dims = 1:35, verbose = FALSE) %>%
    FindClusters(resolution = 0.5, verbose = FALSE)

  clustered@meta.data[["seurat_clusters"]]
}

In [None]:
# Attach the clustering used by SoupX to a Seurat object.
#
# sobj: Seurat object for one sample.
#
# Returns `sobj` with the result of get_soup_groups() stored in the
# `soup_group` metadata column.
add_soup_groups <- function(sobj){
  sobj$soup_group <- get_soup_groups(sobj)
  sobj
}

# lapply() keeps the sample names and always returns a list, whereas
# sapply() may simplify its result.
data_list <- lapply(data_list, add_soup_groups)

In [None]:
# Remove ambient RNA from one sample with SoupX, using a fixed contamination
# fraction.
#
# sobj: Seurat object for one sample, carrying the `sample_id` and
#       `soup_group` metadata columns.
# contamination_fraction: value passed to setContaminationFraction();
#       defaults to the notebook-level `ContaminationFraction`.
#
# Returns `sobj` with the unadjusted counts preserved in a new
# "original.counts" assay and the SoupX-adjusted counts (rounded to integers)
# written back to the counts layer.
make_soup <- function(sobj, contamination_fraction = ContaminationFraction){
  sample_id <- as.character(sobj$sample_id[1])  # e.g., LV_2109
  path <- paste0("/media/daten/dmeral/scseq/2024_LV_CTRL_ALDO_REC/", sample_id, "/outs/raw_feature_bc_matrix/")
  raw <- Read10X(data.dir = path)

  # Fetch the filtered counts once; they feed SoupX and are kept as a backup
  filtered_counts <- GetAssayData(sobj, layer = "counts")

  # The raw (unfiltered) matrix is paired with the filtered counts in the
  # SoupChannel
  sc <- SoupChannel(raw, filtered_counts)
  sc <- setClusters(sc, sobj$soup_group)
  sc <- setContaminationFraction(sc, contamination_fraction)
  out <- adjustCounts(sc, roundToInt = TRUE)

  # Optional: keep original counts
  sobj[["original.counts"]] <- CreateAssayObject(counts = filtered_counts)

  # Set adjusted counts back to the RNA assay
  sobj <- SetAssayData(sobj, layer = "counts", new.data = out)

  sobj
}

In [None]:
# Apply the SoupX correction to every sample. lapply() keeps the sample names
# and always returns a list, whereas sapply() may simplify its result.
# NOTE(review): suppressWarnings() silences every warning raised for every
# sample - confirm that none of them needs attention.
suppressWarnings({
    data_list <- lapply(data_list, make_soup)
})

In [None]:
# Check that SoupX removed a plausible share of the counts in each sample.
# One row per sample; the count columns are filled in by the loop below.
results <- data.frame(Sample = samples, Counts_Before = NA, Counts_After = NA, Fraction_Left = NA)

for (idx in seq_along(samples)) {
  current <- data_list[[idx]]
  available <- names(current)

  # Both the preserved and the corrected assay have to be present
  if (!("original.counts" %in% available && "RNA" %in% available)) {
    message(paste("Assay not found for sample:", samples[idx]))
    next
  }

  total_before <- sum(GetAssayData(current, assay = "original.counts", layer = "counts"))
  total_after <- sum(GetAssayData(current, assay = "RNA", layer = "counts"))

  results[idx, "Counts_Before"] <- total_before
  results[idx, "Counts_After"] <- total_after

  # Fraction_Left stays NA when there is nothing to divide by
  if (!is.na(total_before) && total_before > 0) {
    results[idx, "Fraction_Left"] <- total_after / total_before
  }
}

# Show the per-sample summary
print(results)

# scDblFinder (remove doublets)

In [None]:
# Checkpoint: persist the per-sample list; the file name carries the
# contamination fraction.
saveRDS(data_list, paste0("seurat_objects/rds_data_list_", ContaminationFraction, ".rds"))

In [None]:
# Reload the per-sample list from the RDS checkpoint for the current
# contamination fraction.
data_list <- readRDS(paste0("seurat_objects/rds_data_list_", ContaminationFraction, ".rds"))

In [None]:
# Normalise, then centre and scale, the expression matrix of every sample
data_list_scale <- lapply(data_list, function(seu) {
  seu %>%
    NormalizeData(verbose = FALSE) %>%
    ScaleData(verbose = FALSE)
})

In [None]:
# Convert the RNA assay of every Seurat object to a SingleCellExperiment
obj_sce <- lapply(data_list_scale, as.SingleCellExperiment, assay = "RNA")

In [None]:
# Run scDblFinder with its default settings on every sample
obj_scDblFinder <- lapply(obj_sce, scDblFinder)

In [None]:
# Convert back to Seurat objects, taking counts from the "counts" assay and
# normalised data from the "logcounts" assay of each SingleCellExperiment
obj_seu <- lapply(obj_scDblFinder, as.Seurat, counts = "counts", data = "logcounts")

In [None]:
# Tabulate the scDblFinder class calls per sample
table_singlets <- lapply(obj_seu, function(seu) table(seu$scDblFinder.class))

print(table_singlets)

# Add meta.data, filter singlets and merge

In [None]:
# Sample-level metadata: one row per variable, one column per sample (in the
# order of `samples`). Mixing numbers and strings makes every entry a
# character string.
meta.data_tab <- rbind(
  replicate = rep(1, 12),
  treatment = c("CTRL", "CTRL", "ALDO", "ALDO", "ALDO", "ALDO",
                "CTRL", "CTRL", "REC", "REC", "REC", "REC"),
  sex       = c("f", "f", "f", "f", "m", "m", "m", "m", "f", "f", "m", "m"),
  batch     = c(2, 1, 2, 1, 1, 2, 2, 1, rep(3, 4)),
  chamber   = rep("LV", 12),
  unique    = 1:12
)
colnames(meta.data_tab) <- samples

meta.data_tab <- as.table(meta.data_tab)

# Display the table for a visual check
meta.data_tab

In [None]:
# Add the sample-level metadata to every cell of the matching Seurat object.
# Produces a list named by sample id, in the order of `samples`.
obj_seu_newmeta <- lapply(samples, function(nam) {
    # Retrieve the Seurat object for the current sample; fail early with a
    # clear message instead of an obscure error further down
    scobj <- obj_seu[[nam]]
    if (is.null(scobj)) {
        stop("No Seurat object found for sample: ", nam, call. = FALSE)
    }
    if (!nam %in% colnames(meta.data_tab)) {
        stop("No metadata column found for sample: ", nam, call. = FALSE)
    }

    # Metadata values of this sample (one entry per metadata variable)
    meta <- meta.data_tab[, nam]

    # Repeat the sample-level values once per cell. Rows are named by cell
    # barcode so the metadata is matched to cells by name rather than by
    # position; columns are named after the metadata variables.
    n_cells <- ncol(scobj)
    meta_df <- as.data.frame(matrix(rep(meta, n_cells),
                                    nrow = n_cells,
                                    byrow = TRUE,
                                    dimnames = list(colnames(scobj), rownames(meta.data_tab))))

    # Add metadata to the Seurat object and return it
    AddMetaData(scobj, metadata = meta_df)
})

# Assign names to the new list
names(obj_seu_newmeta) <- samples

In [None]:
# Merge all samples into one object. Cell names are prefixed with the
# position of their sample in the list; seq_along() keeps the ids in step
# with the number of samples instead of hard-coding 12.
# NOTE(review): the project label says "LA" although all sample ids start
# with "LV" - confirm that the label is intended.
obj_seu_merge <- Merge_Seurat_List(obj_seu_newmeta, add.cell.ids = seq_along(obj_seu_newmeta),  merge.data = TRUE, project = "LA_C57Bl6_Aldosterone")

In [None]:
# Save the merged object to disk (doublets are still included at this
# point); the file name carries the contamination fraction.
SaveH5Seurat(obj_seu_merge, paste0("seurat_objects/setContaminationFraction/setContaminationFraction_", ContaminationFraction, "_dbl"))

# Harmony (integration)

In [None]:
# Keep only the cells that scDblFinder classified as "singlet"
obj_seu_merge_singlet <- subset(obj_seu_merge, scDblFinder.class == "singlet")

In [None]:
# Make "RNA" the default assay for the steps that follow
DefaultAssay(object = obj_seu_merge_singlet) <- "RNA"

In [None]:
# Hard QC thresholds on detected genes, total counts and mitochondrial
# percentage
obj <- subset(obj_seu_merge_singlet, subset = nFeature_RNA > 300 & nFeature_RNA < 5000 & 
                        nCount_RNA > 500 & nCount_RNA < 15000 &
                        percent.mt < 5)

# Standard preprocessing up to PCA on the filtered cells
obj <- obj %>%
  NormalizeData(verbose = FALSE) %>%
  FindVariableFeatures(verbose = FALSE) %>%
  ScaleData(verbose = FALSE) %>%
  RunPCA(verbose = FALSE)

In [None]:
# Integrate across the "batch" metadata variable with Harmony (the RNA assay
# is passed explicitly)
obj <- obj %>%
  RunHarmony(group.by.vars = c("batch"), plot_convergence = FALSE, assay.use = "RNA", verbose = FALSE)

# Extract the Harmony-corrected embedding as a matrix
obj_embed <- Embeddings(obj, "harmony")

In [None]:
# UMAP, neighbour graph and clustering, all computed on the first 35 Harmony
# dimensions
obj_harmony <- RunUMAP(obj, reduction = "harmony", dims = 1:35, verbose = FALSE)
obj_harmony <- FindNeighbors(obj_harmony, reduction = "harmony", dims = 1:35, verbose = FALSE)
obj_harmony <- FindClusters(obj_harmony, resolution = 0.15)

In [None]:
# Save the integrated, clustered singlet object; the file name carries the
# contamination fraction.
SaveH5Seurat(obj_harmony, paste0("seurat_objects/setContaminationFraction/setContaminationFraction_", ContaminationFraction, "_harmony_sgl"))

In [None]:
# Size and resolution of the plots rendered in the notebook
options(repr.plot.width = 15, repr.plot.height = 10, repr.plot.res = 300)

# UMAP of the integrated singlets; despite the variable name, the cells are
# grouped by "treatment"
UMAP_dbl_seurat_clusters <- DimPlot(obj_harmony, group.by = "treatment", label = TRUE, shuffle = TRUE, raster = FALSE)
UMAP_dbl_seurat_clusters <- UMAP_dbl_seurat_clusters +
  ggtitle("UMAP excluded doublets") +
  theme(text = element_text(size = 15))

# Render the plot
UMAP_dbl_seurat_clusters

# FindAllMarkers

In [None]:
# Positive markers for every cluster of the integrated object (used assay RNA)
all.markers <- FindAllMarkers(obj_harmony, only.pos = TRUE, min.pct = 0.50, logfc.threshold = 0.58)

# Show the 10 markers with the highest log2 fold change per cluster.
# FindAllMarkers() stores the cluster in the `cluster` column, which has to be
# passed unquoted: a quoted string would create a single constant group and
# return the top 10 rows overall.
all.markers %>%
  group_by(cluster) %>%
  slice_max(n = 10, order_by = avg_log2FC)

# Write the full marker table
write.csv(all.markers, paste0("setContaminationFraction/all.markers_wilcox_setContaminationFraction_", ContaminationFraction, "_harmony_sgl.csv"))

In [None]:
# Top markers of clusters 0 to 5, computed separately within every sample.
# Results are written to one CSV per sample and collected in `marker_results`.
marker_results <- list()

# The loop variable is not called `sample`, to avoid shadowing base::sample()
for (current_sample in samples) {

  # Subset the Seurat object based on the sample_id
  obj_sample <- subset(obj_harmony, subset = sample_id == current_sample)

  # Further subset the Seurat object to include only clusters 0 to 5
  obj_sample <- subset(obj_sample, subset = seurat_clusters %in% 0:5)

  # Find markers for this sample (only clusters 0-5). A dedicated variable is
  # used so the dataset-wide `all.markers` table is not overwritten.
  sample.markers <- FindAllMarkers(obj_sample,
                                   only.pos = TRUE,
                                   min.pct = 0.75,
                                   logfc.threshold = 0.58)

  # Without any marker the result has no `cluster` column to group by
  if (nrow(sample.markers) == 0) {
    message("No markers found for sample: ", current_sample)
    next
  }

  # Group by clusters and get top 10 markers per cluster
  top.markers <- sample.markers %>%
    group_by(cluster) %>%
    slice_max(n = 10, order_by = avg_log2FC) %>%
    ungroup()

  # Save the markers to a CSV file with ContaminationFraction and sample in the name
  write.csv(top.markers, paste0("setContaminationFraction/all.markers_wilcox_setContaminationFraction_",
                                ContaminationFraction, "_", current_sample, "_harmony_sgl.csv"))

  # Store the result in the list for later use
  marker_results[[current_sample]] <- top.markers
}

In [2]:
# Record the R version and the attached packages used for this run
sessionInfo()

R version 4.3.3 (2024-02-29)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 22.04.2 LTS

Matrix products: default
BLAS/LAPACK: /media/daten/dmeral/micromamba/envs/scrna_dm/lib/libopenblasp-r0.3.27.so;  LAPACK version 3.12.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

time zone: Etc/UTC
tzcode source: system (glibc)

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] scCustomize_3.0.0           viridis_0.6.5              
 [3] viridisLite_0.4.2           pals_1.9                   
 [5] ggpubr_0.6.0                SeuratDisk_0.0.0.9021      
 [7] harmony_1.2.3      