In [1]:
# libraries
library(Seurat)
library(tidyverse)
library(igraph)
# library() stops with an error when the package is missing; require() would
# only return FALSE and let the notebook continue until a later, less obvious
# failure.
library(circlize)
library(R.utils)
library(data.table) #to read gz file

Attaching SeuratObject

── [1mAttaching packages[22m ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.2 ──
[32m✔[39m [34mggplot2[39m 3.4.2      [32m✔[39m [34mpurrr  [39m 0.3.5 
[32m✔[39m [34mtibble [39m 3.2.1      [32m✔[39m [34mdplyr  [39m 1.0.10
[32m✔[39m [34mtidyr  [39m 1.2.1      [32m✔[39m [34mstringr[39m 1.5.0 
[32m✔[39m [34mreadr  [39m 2.1.3      [32m✔[39m [34mforcats[39m 0.5.2 
── [1mConflicts[22m ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Attaching package: ‘igraph’


The following objects are masked from ‘package:dpl

In [2]:
# Directory holding the preprocessed input files. File names are appended to
# it with paste0(), so the trailing slash must stay.
input_dir <- "../../../../results/data_preprocessing/Lasry/preprocessed/"

In [3]:
# Load the gzipped count table as a plain data.frame. header = TRUE takes the
# column names from the first line; check.names = FALSE leaves them unmodified.
counts <- paste0(input_dir, "counts_corr.csv.gz") %>%
  fread(header = TRUE, check.names = FALSE) %>%
  as.data.frame()

# Use the gene symbols as row names, then drop the first column
# (presumably the gene_symbol column itself -- confirm against the file).
rownames(counts) <- counts$gene_symbol
counts <- counts[, -1]

In [4]:
# Read the tab-separated cell annotation table. The first column is consumed
# as row names and the remaining column names are kept exactly as written.
print("load cell annotation")
anno_cells <- read.table(
  paste0(input_dir, "anno_cells_corr.txt"),
  sep = "\t",
  row.names = 1,
  header = TRUE,
  check.names = FALSE
)

[1] "load cell annotation"


In [5]:
# Set the row names of the annotation to the cell ids.
# `[[` is used instead of `$` because `$` partially matches column names and
# returns NULL when nothing matches; assigning NULL row names would silently
# reset them to 1..n instead of failing.
if (is.null(anno_cells[["cell"]])) {
  stop("anno_cells has no column named 'cell'", call. = FALSE)
}
rownames(anno_cells) <- anno_cells[["cell"]]

# Set the column names of counts to the same cell ids.
# The relabelling is purely positional: it assumes the columns of `counts` are
# in the same order as the rows of `anno_cells`. Only the lengths can be
# checked here, so fail early if they disagree.
if (ncol(counts) != nrow(anno_cells)) {
  stop(
    "counts has ", ncol(counts), " columns but anno_cells has ",
    nrow(anno_cells), " rows; cannot relabel cells positionally",
    call. = FALSE
  )
}
colnames(counts) <- rownames(anno_cells)

In [6]:
# Build the Seurat object from the count table, attaching the cell annotation
# as per-cell metadata.
srt <- CreateSeuratObject(
  counts = counts,
  meta.data = anno_cells
)

In [7]:
# Number of cells per health status (case vs. control)
srt@meta.data[["health_status"]] %>% table()

.
    AML healthy 
  21311   25391 

In [8]:
# Number of cells per annotated cell type
srt@meta.data[["cell_type"]] %>% table()

.
    B    DC   Ery  Gran  HSPC  Mono    NK     T 
 4765  1634  1674  2332  3169 18004  3078 12046 

In [9]:
# Use the "cell_type" metadata column as the active cell identity
srt <- SetIdent(srt, value = "cell_type")

In [10]:
# Differential expression between AML and healthy cells, run separately
# within each cell type. The per-cell-type tables are collected in a list and
# combined once at the end into `DEGs`, a data frame with one row per
# (cell type, gene) pair and the cell type stored in the `cluster` column.
#
# No significance filter is applied here: `DEGs` holds whatever FindMarkers
# reports. To keep only significant hits, filter afterwards, e.g. on
# p_val_adj <= 0.05 & abs(avg_log2FC) >= 0.25.

# specify the two conditions to compare (same for every cell type, so set once)
condition_oi <- "AML"
condition_reference <- "healthy"

# as.character() so that indexing below yields plain strings even if the
# metadata column is a factor
cell_types <- as.character(unique(srt@meta.data$cell_type))

# one slot per cell type; filled in the loop instead of growing a data frame
# with rbind() on every iteration
DE_tables <- vector("list", length(cell_types))

for (i in seq_along(cell_types)) {
  cell_type <- cell_types[[i]]

  # subset Seurat object to only include cells of current cell type
  seurat_obj_receiver <- subset(srt, idents = cell_type)

  # set cell identity using the "health_status" feature
  Idents(seurat_obj_receiver) <- "health_status"

  # find differentially expressed genes between the two conditions
  DE_table_receiver <- FindMarkers(
    object = seurat_obj_receiver,
    slot = "counts",
    ident.1 = condition_oi,
    ident.2 = condition_reference,
    min.pct = 0.10
  ) %>%
    # convert row names to a separate "gene" column
    rownames_to_column("gene") %>%
    # add the cell type as the first column; mutate() recycles the label to
    # the table's row count, so a table with zero rows does not raise the
    # "differing number of rows" error that data.frame(cluster = ..., <table>)
    # would
    mutate(cluster = .env$cell_type, .before = 1)

  # print cell type and number of DEGs found
  print(cell_type)
  print("number of genes in the cell type")
  # nrow() of a Seurat object is the feature count of its default assay
  # (RNA here, as created by CreateSeuratObject above)
  print(nrow(seurat_obj_receiver))
  print("number of genes output from DE")
  print(nrow(DE_table_receiver))

  DE_tables[[i]] <- DE_table_receiver
}

# combine the per-cell-type tables into one data frame
DEGs <- bind_rows(DE_tables)

[1] "Mono"
[1] "number of genes in the cell type"
[1] 15770
[1] "number of genes output from DE"
[1] 178
[1] "Gran"
[1] "number of genes in the cell type"
[1] 15770
[1] "number of genes output from DE"
[1] 103
[1] "T"
[1] "number of genes in the cell type"
[1] 15770
[1] "number of genes output from DE"
[1] 48
[1] "NK"
[1] "number of genes in the cell type"
[1] 15770
[1] "number of genes output from DE"
[1] 164
[1] "B"
[1] "number of genes in the cell type"
[1] 15770
[1] "number of genes output from DE"
[1] 74
[1] "HSPC"
[1] "number of genes in the cell type"
[1] 15770
[1] "number of genes output from DE"
[1] 90
[1] "Ery"
[1] "number of genes in the cell type"
[1] 15770
[1] "number of genes output from DE"
[1] 645
[1] "DC"
[1] "number of genes in the cell type"
[1] 15770
[1] "number of genes output from DE"
[1] 56


In [12]:
# Load the previously saved DE table for comparison with the current run
pre <- read.csv(file = "DEG_significant_table.csv")

In [13]:
# Compare the saved fold changes with the freshly computed ones.
# identical() demands bit-for-bit equality, which doubles that were written to
# CSV as decimal text and read back are not guaranteed to satisfy. all.equal()
# compares within a numeric tolerance; isTRUE() turns its result into a plain
# TRUE/FALSE (it returns a description of the differences, not FALSE, when the
# vectors differ, e.g. in length).
isTRUE(all.equal(pre$avg_log2FC, DEGs$avg_log2FC))

In [63]:
# Scratch vector for checking how identical() behaves on numeric vectors
x <- c(-0.2, 2, 3)

In [64]:
# Second scratch vector holding the same values as x
y <- c(-0.2, 2, 3)

In [56]:
# Display the scratch vector x
x

In [57]:
# Show the type and length of x
str(x)

 num [1:3] 1.2 2 3


In [58]:
# Show the type and length of y
str(y)

 num [1:3] 1.2 2 3


In [65]:
# Exact-equality check on the two scratch vectors
identical(x,y)

In [14]:
# Display the combined DE table (one row per cell type and gene)
DEGs

cluster,gene,p_val,avg_log2FC,pct.1,pct.2,p_val_adj
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Mono,ISG15,0,0.6920399,0.739,0.243,0
Mono,RBP7,0,-0.4300012,0.113,0.473,0
Mono,C1QA,0,0.3690545,0.360,0.043,0
Mono,CD52,0,-0.6166741,0.295,0.791,0
Mono,IFI6,0,0.6742024,0.742,0.224,0
Mono,CITED4,0,0.3301794,0.489,0.133,0
Mono,PLK3,0,0.2989230,0.550,0.188,0
Mono,IFI44L,0,0.6104812,0.643,0.061,0
Mono,IFI44,0,0.2887123,0.412,0.076,0
Mono,GBP1,0,0.2674201,0.383,0.085,0


In [57]:
# Save the combined DE table. write.csv() keeps its default of writing the row
# names as the first column.
write.csv(x = DEGs, file = "DEG_table.csv")

In [59]:
# Save the DE table under the "significant" file name.
# NOTE(review): in the code visible in this notebook DEGs is never filtered
# (the p_val_adj / avg_log2FC filter in the DE loop is commented out), so this
# file has the same content as DEG_table.csv -- confirm that this is intended.
write.csv(DEGs, "DEG_significant_table.csv")

In [19]:
# Open the help page for FindMarkers
?FindMarkers