In [1]:
# libraries
library(Seurat)
library(tidyverse)
library(igraph)
library(circlize) # library() stops if the package is missing; require() only warns
library(R.utils) # lets data.table::fread() read the gzip-compressed counts file
library(data.table) # provides fread(), used to load the counts table

Attaching SeuratObject

── [1mAttaching packages[22m ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.2 ──
[32m✔[39m [34mggplot2[39m 3.4.2      [32m✔[39m [34mpurrr  [39m 0.3.5 
[32m✔[39m [34mtibble [39m 3.2.1      [32m✔[39m [34mdplyr  [39m 1.0.10
[32m✔[39m [34mtidyr  [39m 1.2.1      [32m✔[39m [34mstringr[39m 1.5.0 
[32m✔[39m [34mreadr  [39m 2.1.3      [32m✔[39m [34mforcats[39m 0.5.2 
── [1mConflicts[22m ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Attaching package: ‘igraph’


The following objects are masked from ‘package:dpl

In [2]:
# directory holding the preprocessed input files; the trailing "/" is needed
# because file names are appended to it with paste0()
input_dir <- "../../../../results/data_preprocessing/Lasry/preprocessed/"

In [3]:
# load the counts table (genes in rows, one column per cell)
counts <- fread(
  paste0(input_dir, "counts_corr.csv.gz"),
  header = TRUE,
  check.names = FALSE
)
counts <- as.data.frame(counts)

# fail early with a clear message if the gene column is absent
stopifnot("gene_symbol" %in% colnames(counts))

# use the gene symbols as row names, then remove that column by name rather
# than by position; drop = FALSE keeps a data frame even if only one column
# is left
rownames(counts) <- counts[["gene_symbol"]]
counts <- counts[, colnames(counts) != "gene_symbol", drop = FALSE]

In [4]:
# read the per-cell annotation table (tab-separated, with a header line and
# the first column used as row names)
print("load cell annotation")
anno_cells <- read.table(
  paste0(input_dir, "anno_cells_corr.txt"),
  sep = "\t",
  row.names = 1,
  header = TRUE,
  check.names = FALSE
)

[1] "load cell annotation"


In [5]:
# the annotation must provide a `cell` column and one row per count column;
# [[ ]] matches the column name exactly, whereas `$` can silently fall back to
# a partial match or to NULL (which would reset the row names to 1..n)
stopifnot(
  "cell" %in% colnames(anno_cells),
  ncol(counts) == nrow(anno_cells)
)

# set rownames of annotation to cell_ids
rownames(anno_cells) <- anno_cells[["cell"]]

# set colnames of counts to cell_ids
# NOTE(review): the count columns are relabelled purely by position, so this
# assumes counts columns and annotation rows are in the same order -- confirm
colnames(counts) <- rownames(anno_cells)

In [6]:
# build the Seurat object from the count table and attach the per-cell
# annotation as metadata
srt <- CreateSeuratObject(counts = counts, meta.data = anno_cells)

In [7]:
# tabulate how many cells fall into each health status (case/control)
srt@meta.data[["health_status"]] %>% table()

.
    AML healthy 
  21311   25391 

In [8]:
# tabulate how many cells belong to each cell type
srt@meta.data[["cell_type"]] %>% table()

.
    B    DC   Ery  Gran  HSPC  Mono    NK     T 
 4765  1634  1674  2332  3169 18004  3078 12046 

In [9]:
# set the active cell identities (Idents) to the "cell_type" metadata column
Idents(srt) <- "cell_type"

In [53]:
# Differential expression (AML vs healthy) within every cell type.
# All FindMarkers() pre-filters are switched off so that every gene is tested
# and reported; no significance or fold-change cut-off is applied here.
# NOTE(review): FindMarkers() runs on the object exactly as created above, with
# no NormalizeData() step in between -- confirm that counts_corr is already
# normalised.

# specify the two conditions to compare (the same for every cell type)
condition_oi <- "AML"
condition_reference <- "healthy"

# collect one DE table per cell type and bind them once after the loop,
# instead of growing a data frame with rbind() on every iteration
cell_types <- as.character(unique(srt@meta.data$cell_type))
DEG_tables <- vector("list", length(cell_types))

# iterate over each unique cell type
for (i in seq_along(cell_types)) {
  cell_type <- cell_types[[i]]

  # subset Seurat object to only include cells of current cell type
  seurat_obj_receiver <- subset(srt, idents = cell_type)

  # set cell identity using the "health_status" metadata column
  Idents(seurat_obj_receiver) <- "health_status"

  # test AML against healthy; thresholds of -Inf / 1 disable the default
  # expression, fold-change and minimum-cell filters.
  # thresh.use is no longer passed: it is not a FindMarkers() argument in
  # Seurat releases that report avg_log2FC (logfc.threshold replaced it).
  DE_table_receiver <- FindMarkers(
    object = seurat_obj_receiver,
    ident.1 = condition_oi,
    ident.2 = condition_reference,
    min.pct = -Inf,
    logfc.threshold = -Inf,
    min.cells.feature = 1,
    min.cells.group = 1
  ) %>%
    # convert row names to a separate "gene" column
    rownames_to_column("gene")

  # add cell type information to the DEG table
  DE_table_receiver <- data.frame(cluster = cell_type, DE_table_receiver)

  # print cell type, number of genes in the RNA assay and number of genes
  # returned by the test (the two counts should match with filters disabled)
  print(cell_type)
  print("number of genes in the cell type")
  print(nrow(seurat_obj_receiver[["RNA"]]))
  print("number of genes output from DE")
  print(nrow(DE_table_receiver))

  # store the table for this cell type
  DEG_tables[[i]] <- DE_table_receiver
}

# combine the per-cell-type tables into one data frame
# (the list is unnamed, so row names stay the automatic 1..N)
DEGs <- do.call(rbind, DEG_tables)

[1] "Mono"
[1] "number of genes in the cell type"
[1] 15770
[1] "number of genes output from DE"
[1] 15770
[1] "Gran"
[1] "number of genes in the cell type"
[1] 15770
[1] "number of genes output from DE"
[1] 15770
[1] "T"
[1] "number of genes in the cell type"
[1] 15770
[1] "number of genes output from DE"
[1] 15770
[1] "NK"
[1] "number of genes in the cell type"
[1] 15770
[1] "number of genes output from DE"
[1] 15770
[1] "B"
[1] "number of genes in the cell type"
[1] 15770
[1] "number of genes output from DE"
[1] 15770
[1] "HSPC"
[1] "number of genes in the cell type"
[1] 15770
[1] "number of genes output from DE"
[1] 15770
[1] "Ery"
[1] "number of genes in the cell type"
[1] 15770
[1] "number of genes output from DE"
[1] 15770
[1] "DC"
[1] "number of genes in the cell type"
[1] 15770
[1] "number of genes output from DE"
[1] 15770


In [13]:
# save the combined DE table to "DEG_table.csv" in the working directory
write.csv(x = DEGs, file = "DEG_table.csv")

In [39]:
# open the help page for FindMarkers (interactive lookup only)
?FindMarkers

In [54]:
# inspect the DE results of a single cell type
# NOTE(review): the cell type to show was missing from this expression, which
# made it unparseable; "Mono" is inferred from the rows in the recorded
# output -- confirm
DEGs[DEGs$cluster == "Mono", ]

cluster,gene,p_val,avg_log2FC,pct.1,pct.2,p_val_adj
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Mono,ISG15,0,0.69203995,0.739,0.243,0
Mono,RPL22,0,-0.08436030,0.965,0.955,0
Mono,RBP7,0,-0.43000121,0.113,0.473,0
Mono,C1QA,0,0.36905454,0.360,0.043,0
Mono,RPL11,0,-0.06997682,0.987,0.991,0
Mono,CD52,0,-0.61667412,0.295,0.791,0
Mono,IFI6,0,0.67420242,0.742,0.224,0
Mono,CITED4,0,0.33017937,0.489,0.133,0
Mono,RPS8,0,-0.08066314,0.983,0.990,0
Mono,PLK3,0,0.29892299,0.550,0.188,0


In [40]:
# re-run the unfiltered AML-vs-healthy test outside the loop
# NOTE: relies on seurat_obj_receiver, condition_oi and condition_reference as
# left behind by the per-cell-type loop, i.e. the last cell type processed.
# thresh.use is no longer passed: it is not a FindMarkers() argument in Seurat
# releases that report avg_log2FC (logfc.threshold replaced it).
FindMarkers(
  object = seurat_obj_receiver,
  ident.1 = condition_oi,
  ident.2 = condition_reference,
  min.pct = -Inf,
  logfc.threshold = -Inf,
  min.cells.feature = 1,
  min.cells.group = 1
)

Unnamed: 0_level_0,p_val,avg_log2FC,pct.1,pct.2,p_val_adj
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
MTRNR2L8,1.659515e-106,0.55656057,0.568,0.091,2.617055e-102
ARL4C,5.857174e-102,-0.68506693,0.231,0.857,9.236764e-98
MT-ND4,1.355322e-65,0.09641028,0.992,0.967,2.137343e-61
MS4A4E,4.651743e-64,0.34124641,0.373,0.053,7.335798e-60
AC007952.4,1.741676e-63,-0.49788160,0.147,0.631,2.746624e-59
AHR,1.537076e-62,0.42779198,0.717,0.362,2.423968e-58
CCL23,1.094474e-60,0.24893574,0.224,0.004,1.725986e-56
HOXA9,5.168143e-60,0.26862590,0.306,0.030,8.150161e-56
ANKRD28,1.893597e-58,0.40515700,0.635,0.249,2.986203e-54
AC253572.2,5.614950e-55,-0.47929299,0.157,0.619,8.854776e-51


In [51]:
# number of genes (rows) in the RNA assay's data slot of the current subset
nrow(seurat_obj_receiver@assays$RNA@data)