# 07d scATAC HSC MPP merge

Merge HSCs and MPPs into LSKs

This notebook was run inside the following Docker image:

docker run --rm -d --name signac -p 8880:8888 -e JUPYTER_ENABLE_LAB=YES -v /Users/efast/Documents/:/home/jovyan/work  signac:0.2.5



#### The following steps were run on the command line, because it was too hard to make them work inside the R notebook
`cd /home/jovyan/work/Z_TRASH/raw_data`

#### decompress files and add the same cell prefix as was added to the Seurat object
`gzip -dc /home/jovyan/work/Z_TRASH/raw_data/ATAC_LT/fragments.tsv.gz | awk 'BEGIN {FS=OFS="\t"} {print $1,$2,$3,"HSC_"$4,$5}' - > HSC_fragments.tsv`  
`gzip -dc /home/jovyan/work/Z_TRASH/raw_data/ATAC_MPP/fragments.tsv.gz | awk 'BEGIN {FS=OFS="\t"} {print $1,$2,$3,"MPP_"$4,$5}' - > MPP_fragments.tsv`

#### merge files (avoids having to re-sort)
`sort -m -k 1,1V -k2,2n HSC_fragments.tsv MPP_fragments.tsv > fragments.tsv`

#### block gzip compress the merged file
`bgzip -@ 4 fragments.tsv # -@ 4 uses 4 threads`

#### index the bgzipped file
`tabix -p bed fragments.tsv.gz`

#### remove intermediate files
`rm HSC_fragments.tsv MPP_fragments.tsv`

In [1]:
library(Signac)
library(Seurat)
library(SingleCellExperiment)

“package ‘Seurat’ was built under R version 3.6.3”Loading required package: SummarizedExperiment
Loading required package: GenomicRanges
Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
 

In [2]:
## read in LTs

# Load the saved LT (HSC) peak-count object.
# NOTE(review): presumably built from MACS2-called peaks, judging by the
# file name -- confirm against the notebook that wrote it.
LT <- readRDS("./sc_objects/LT_macs2.rds")

# Per-barcode metadata for the LT sample; the first CSV column is used as
# the row names.
metadata <- read.csv(
  file = "./raw_data/ATAC_LT/singlecell.csv",
  header = TRUE,
  row.names = 1
)

# Convert to SingleCellExperiment (a format Seurat can import), then to a
# Seurat object. One coercion is enough: coercing an object that is already
# a SingleCellExperiment a second time does nothing.
LT <- as(LT, "SingleCellExperiment")
LT <- as.Seurat(LT, counts = "counts", data = "counts")
LT <- AddMetaData(object = LT, metadata = metadata)

# The imported matrix lives in the "RNA" assay; expose it under the name
# "peaks" and make that the default assay for the Signac calls below.
LT@assays$peaks <- LT@assays$RNA
DefaultAssay(LT) <- 'peaks'

# Attach the (un-prefixed) per-sample fragment file to the object.
fragment.path <- './raw_data/ATAC_LT/fragments.tsv.gz'

LT <- SetFragments(
  object = LT,
  file = fragment.path
)

In [3]:
## read in MPPs

# Load the saved MPP peak-count object.
MPP_r <- readRDS("./sc_objects/scATAC_MPP_macs2.rds")

# The row names have to be set explicitly: they are taken from column `X`
# of a DataFrame built from the object's rowRanges.
# NOTE(review): confirm what `X` holds for this object (peak ranges vs. a
# name column) and that the resulting row names look like "chr:start-end".
temp <- DataFrame(MPP_r@rowRanges)
rownames(MPP_r) <- temp$X

# Per-barcode metadata for the MPP sample; the first CSV column is used as
# the row names.
metadata <- read.csv(
  file = "./raw_data/ATAC_MPP/singlecell.csv",
  header = TRUE,
  row.names = 1
)

# Convert to SingleCellExperiment (a format Seurat can import), then to a
# Seurat object. One coercion is enough: coercing an object that is already
# a SingleCellExperiment a second time does nothing.
MPP <- as(MPP_r, "SingleCellExperiment")
MPP <- as.Seurat(MPP, counts = "counts", data = "counts")
MPP <- AddMetaData(object = MPP, metadata = metadata)

# The imported matrix lives in the "RNA" assay; expose it under the name
# "peaks" and make that the default assay for the Signac calls below.
MPP@assays$peaks <- MPP@assays$RNA
DefaultAssay(MPP) <- 'peaks'

# Attach the (un-prefixed) per-sample fragment file to the object.
fragment.path <- './raw_data/ATAC_MPP/fragments.tsv.gz'

MPP <- SetFragments(
  object = MPP,
  file = fragment.path
)

In [4]:
# Build one common peak set from the LT and MPP objects ("reduce" mode),
# then preview the first few ranges.
combined.peaks <- UnifyPeaks(
  object.list = list(LT, MPP),
  mode = "reduce"
)
head(combined.peaks)

GRanges object with 6 ranges and 0 metadata columns:
      seqnames          ranges strand
         <Rle>       <IRanges>  <Rle>
  [1]     chr1 3052614-3052915      *
  [2]     chr1 3067067-3067368      *
  [3]     chr1 3140056-3140357      *
  [4]     chr1 3158733-3159034      *
  [5]     chr1 3284384-3284685      *
  [6]     chr1 3361767-3362068      *
  -------
  seqinfo: 22 sequences from an unspecified genome; no seqlengths

In [5]:
# Count fragments per cell over the unified peak set, once per dataset.
# Each call reads the fragment file attached to that object and is
# restricted to the barcodes present in the object.

# LT (HSC) cells
LT.counts <- FeatureMatrix(
  cells = colnames(LT),
  features = combined.peaks,
  fragments = GetFragments(LT),
  sep = c(":", "-")
)

# MPP cells
MPP.counts <- FeatureMatrix(
  cells = colnames(MPP),
  features = combined.peaks,
  fragments = GetFragments(MPP),
  sep = c(":", "-")
)


Extracting reads overlapping genomic regions
Constructing matrix
Extracting reads overlapping genomic regions
Constructing matrix


In [6]:
# Swap each object's "peaks" assay for one built from the counts that were
# just computed over the unified peak set.
LT[["peaks"]] <- CreateAssayObject(counts = LT.counts)
MPP[["peaks"]] <- CreateAssayObject(counts = MPP.counts)

In [7]:
# Tag every cell with the population it came from.
LT$dataset <- "HSC"
MPP$dataset <- "MPP"

# Merge the two objects. The cell-ID prefixes keep the barcodes unique and
# correspond to the "HSC_" / "MPP_" prefixes written into the merged
# fragment file on the command line.
combined <- merge(
  x = LT,
  y = MPP,
  add.cell.ids = c("HSC", "MPP")
)

In [8]:
# Point the merged object at the merged, prefixed fragment file.
combined <- SetFragments(
  object = combined,
  file = "./raw_data/fragments.tsv.gz"
)

In [9]:
# Display a summary of the merged object (assays, feature and cell counts).
combined

An object of class Seurat 
586709 features across 10750 samples within 2 assays 
Active assay: peaks (243581 features, 0 variable features)
 1 other assay present: RNA

In [10]:
# Write the merged HSC + MPP object to disk.
saveRDS(
  object = combined,
  file = "./sc_objects/scATAC_combined_macs2.rds"
)

In [11]:
# Display a summary of the LT (HSC) object for comparison with the merged one.
LT

An object of class Seurat 
405477 features across 730 samples within 2 assays 
Active assay: peaks (223370 features, 0 variable features)
 1 other assay present: RNA

In [12]:
# Display a summary of the MPP object for comparison with the merged one.
MPP

An object of class Seurat 
406237 features across 10020 samples within 2 assays 
Active assay: peaks (242335 features, 0 variable features)
 1 other assay present: RNA

In [13]:
# Record the R version, platform and attached packages used for this run.
sessionInfo()

R version 3.6.1 (2019-07-05)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: Ubuntu 18.04.2 LTS

Matrix products: default
BLAS/LAPACK: /opt/conda/lib/libopenblasp-r0.3.7.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats4    stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] SingleCellExperiment_1.8.0  SummarizedExperiment_1.16.0
 [3] DelayedArray_0.12.0         BiocParallel_1.20.0        
 [5] matrixStats_0.56.0          Biobase_2.46.0             
 [7] GenomicRanges_1.38.0        GenomeInfoDb_1.22.0        
 [9] IRanges_2.20.0              S4Vectors_0.24.0        