## reformat the adata for ATAC

- reformat the adata converted from Seurat objects
- (1) make sure that the "raw counts" end up in adata.X - if they do not, a workaround is to read them from adata.raw.X instead of adata.X
- (2) export the "gene.activity" as a csv
- (3) export "cell_id", "var_names", and "count matrices"


In [1]:
# load the libraries
suppressMessages(library(Seurat))
suppressMessages(library(Signac))
library(SeuratData)
library(SeuratDisk)
library(Matrix)

# genome info
library(GenomeInfoDb)
# library(ggplot2)
# library(patchwork)
# library(stringr)
library(BSgenome.Drerio.UCSC.danRer11)

# Record the interpreter and Seurat versions so the run can be reproduced.
print(R.version)
print(packageVersion("Seurat"))

# parallelization in Signac: https://stuartlab.org/signac/articles/future
library(future)
plan() # show the strategy in effect before changing it

# Use forked workers, but never more than the cores this session was actually
# granted: availableCores() takes scheduler / cgroup limits into account, so a
# smaller allocation is not over-subscribed. Still 20 workers when >= 20 cores
# are available.
n_workers <- min(20L, as.integer(availableCores()))
plan("multicore", workers = n_workers)
plan() # confirm the new strategy

Registered S3 method overwritten by 'SeuratDisk':
  method            from  
  as.sparse.H5Group Seurat

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min


Loading required package: S4Vectors

Loading required package: stats4


Attaching package: ‘S4Vectors’


The following objects are masked from ‘package:Matrix’:

    expand, unname


The following object is masked from ‘package:utils’:

    findMatches


The following object

               _                           
platform       x86_64-pc-linux-gnu         
arch           x86_64                      
os             linux-gnu                   
system         x86_64, linux-gnu           
status                                     
major          4                           
minor          3.2                         
year           2023                        
month          10                          
day            31                          
svn rev        85441                       
language       R                           
version.string R version 4.3.2 (2023-10-31)
nickname       Eye Holes                   
[1] ‘4.4.0’


In [9]:
# seurat <- readRDS("/hpc/projects/data.science/yangjoon.kim/zebrahub_multiome/data/processed_data/01_Signac_processed/TDR118reseq/TDR118_processed.RDS")
# seurat

An object of class Seurat 
1573599 features across 13614 samples within 7 assays 
Active assay: peaks_merged (485357 features, 485357 variable features)
 6 other assays present: RNA, ATAC, SCT, peaks_bulk, peaks_celltype, Gene.Activity
 5 dimensional reductions calculated: pca, umap.rna, lsi, umap.atac, umap.joint

In [7]:
# seurat@assays$peaks_merged@counts

In [None]:
# # Access the counts matrix from the 'Gene.Activity' assay, which is stored as a sparse matrix
# gene_activity_counts_sparse <- GetAssayData(seurat_obj, assay = "Gene.Activity", slot = "counts")

# # Convert the sparse matrix to a dense matrix
# gene_activity_counts_dense <- as.matrix(gene_activity_counts_sparse)

# # Optionally, convert the dense matrix to a data frame
# gene_activity_counts_df <- as.data.frame(gene_activity_counts_dense)

# # Export the dense matrix (or data frame) to a CSV file
# write.csv(gene_activity_counts_df, "gene_activity_counts.csv", row.names = TRUE)

In [10]:
# Save the Gene.Activity count matrix of `seurat` as a CSV in 01_Signac_processed.
# NOTE(review): `seurat` must already exist in the session; the readRDS() call
# that creates it is commented out in an earlier cell.
# as.matrix() turns the sparse counts into a dense matrix before writing, which
# is memory-hungry (the recorded run reported a ~2.5 GiB allocation).
write.table(
  as.matrix(GetAssayData(object = seurat, assay = "Gene.Activity", slot = "counts")),
  '/hpc/projects/data.science/yangjoon.kim/zebrahub_multiome/data/processed_data/01_Signac_processed/TDR118reseq/gene_activity_counts.csv',
  sep = ',',
  row.names = TRUE,
  # With row.names = TRUE, col.names = NA writes an empty header cell above the
  # row-name column, so the header has as many fields as every data row.
  # (col.names = TRUE would leave the header one field short and shift it.)
  col.names = NA,
  quote = FALSE
)

“sparse->dense coercion: allocating vector of size 2.5 GiB”


In [3]:
# Export the "gene.activity" count matrix of every dataset to its own folder
# under 01_Signac_processed, one CSV per dataset.
data.list <- c("TDR118reseq", "TDR119reseq", "TDR124reseq", "TDR125reseq", "TDR126", "TDR127", "TDR128")
data.list

# Base directory holding one sub-folder per dataset (keep the trailing slash:
# the paths below are assembled with paste0()).
seurat_data_path <- "/hpc/projects/data.science/yangjoon.kim/zebrahub_multiome/data/processed_data/01_Signac_processed/"

for (data_id in data.list) {
    # The RDS file inside each dataset folder is named without the "reseq"
    # suffix, so strip it (literal match, no regex needed).
    filename_id <- gsub("reseq", "", data_id, fixed = TRUE)
    seurat_obj <- readRDS(paste0(seurat_data_path, data_id, "/", filename_id, "_processed.RDS"))

    # Write next to the input object. The output path is derived from
    # seurat_data_path so reads and writes always use the same base directory.
    file_path <- paste0(seurat_data_path, data_id, "/gene_activity_counts.csv")

    # Extract the counts matrix and write it as CSV. as.matrix() densifies the
    # sparse matrix first, which needs several GiB of RAM per dataset.
    write.table(
        as.matrix(GetAssayData(object = seurat_obj, assay = "Gene.Activity", slot = "counts")),
        file_path,
        sep = ",",
        row.names = TRUE,
        # With row.names = TRUE, NA adds an empty header cell above the
        # row-name column so header and data rows have the same field count.
        col.names = NA,
        quote = FALSE
    )
    print(paste0(filename_id, " gene.activity exported"))
}



“sparse->dense coercion: allocating vector of size 2.5 GiB”


[1] "TDR118 gene.activity exported"


“sparse->dense coercion: allocating vector of size 2.3 GiB”


[1] "TDR119 gene.activity exported"


“sparse->dense coercion: allocating vector of size 1.2 GiB”


[1] "TDR124 gene.activity exported"


“sparse->dense coercion: allocating vector of size 3.2 GiB”


[1] "TDR125 gene.activity exported"


“sparse->dense coercion: allocating vector of size 2.8 GiB”


[1] "TDR126 gene.activity exported"


“sparse->dense coercion: allocating vector of size 3.6 GiB”


[1] "TDR127 gene.activity exported"


“sparse->dense coercion: allocating vector of size 2.1 GiB”


[1] "TDR128 gene.activity exported"
