# PoissonVI: Analyzing quantitative scATAC-seq fragment counts

In [2]:
!pip install pooch scanpy scvi torch muon anndata

import os
import tempfile
from pathlib import Path
from scipy.sparse import csr_matrix
import numpy as np
import anndata
import pooch
import scanpy as sc
import torch
import muon

Collecting pooch
  Downloading pooch-1.8.2-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.6/64.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scanpy
  Downloading scanpy-1.10.2-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m65.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scvi
  Downloading scvi-0.6.8-py2.py3-none-any.whl (158 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.8/158.8 kB[0m [31m332.7 MB/s[0m eta [36m0:00:00[0m
Collecting muon
  Downloading muon-0.1.6-py3-none-any.whl (293 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m293.3/293.3 kB[0m [31m361.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting anndata
  Downloading anndata-0.10.8-py3-none-any.whl (124 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m124.4/124.4 kB[0m [31m327.6 MB/s[0m eta [36m0:00:00[0m
Collecting numba>=0.56
  Dow

In [5]:
# Download the 10x filtered peak-barcode matrix once. Note that `curl -O` saves
# into the current directory, while the read below expects the file under data/.
# curl -O https://cf.10xgenomics.com/samples/cell-atac/1.2.0/atac_pbmc_5k_nextgem/atac_pbmc_5k_nextgem_filtered_peak_bc_matrix.h5

# Read the 10x HDF5 file with muon.
mdata = muon.read_10x_h5("data/atac_pbmc_5k_nextgem_filtered_peak_bc_matrix.h5")
# Not the last expression of the cell, so nothing is displayed for this line.
mdata

# Pull out the "atac" modality and display its per-peak annotation table.
adata = mdata.mod["atac"]
adata.var

Unnamed: 0,gene_ids,feature_types,derivation,genome
chr1:10404-10411,chr1:10404-10411,Peaks,,hg19
chr1:237567-237947,chr1:237567-237947,Peaks,,hg19
chr1:565116-565538,chr1:565116-565538,Peaks,,hg19
chr1:569178-569639,chr1:569178-569639,Peaks,,hg19
chr1:713460-715296,chr1:713460-715296,Peaks,,hg19
...,...,...,...,...
chrY:23602417-23602787,chrY:23602417-23602787,Peaks,,hg19
chrY:23898794-23899450,chrY:23898794-23899450,Peaks,,hg19
chrY:28816591-28817535,chrY:28816591-28817535,Peaks,,hg19
chrY:58827188-58827516,chrY:58827188-58827516,Peaks,,hg19


In [6]:
# Count the entries of adata.X that are exactly 1.
(adata.X == 1).sum()


1617289

In [7]:
# Count the entries of adata.X that are exactly 2.
(adata.X == 2).sum()


25632158

The matrix contains far more 2s than 1s, which indicates that it stores read counts rather than fragment counts (each fragment usually contributes two reads). We can convert to fragment counts using `scvi.data.reads_to_fragments`, but since I am having issues with scvi, I will use an alternative approach to convert read counts to fragment counts: round each odd read count up to the next even count and divide by two, which in most cases is very close to the true fragment count.

In [8]:
def round_to_even_csr(csr_mat):
    """Convert a sparse matrix of read counts to approximate fragment counts.

    Every stored odd value is rounded up to the next even value and all
    stored values are then halved (1 -> 1, 2 -> 1, 3 -> 2, 4 -> 2, ...).

    The input is never modified: the values are processed on a copy and the
    result gets its own index arrays. Writing into ``csr_mat.data`` directly
    would silently round the caller's matrix as well.

    Parameters
    ----------
    csr_mat
        A ``scipy.sparse.csr_matrix``, or anything ``csr_matrix`` can convert
        (other sparse formats, dense arrays).

    Returns
    -------
    scipy.sparse.csr_matrix
        New matrix with the same shape holding the halved values. True
        division is used, so integer input yields floating-point values.
    """
    # For CSR input this only wraps the existing arrays; other inputs are converted.
    csr_mat = csr_matrix(csr_mat)

    # Work on a private copy so the caller's values stay untouched.
    data = csr_mat.data.copy()
    is_odd = data % 2 != 0
    data[is_odd] += 1
    data = data / 2

    # Copy the index arrays too, so the result shares no memory with the input.
    return csr_matrix(
        (data, csr_mat.indices.copy(), csr_mat.indptr.copy()),
        shape=csr_mat.shape,
    )

In [9]:
# Store the halved counts computed from adata.X as the 'fragments' layer.
adata.layers['fragments'] = round_to_even_csr(adata.X)

In [10]:
# Count the entries of the 'fragments' layer that are exactly 1.
(adata.layers['fragments'] == 1).sum()

27249447

In [11]:
# Count the entries of the 'fragments' layer that are exactly 2.
(adata.layers['fragments'] == 2).sum()

7233892

In [12]:
# Report how many regions (variables) the matrix holds before filtering.
print("# regions before filtering:", adata.n_vars)

# Threshold: 5% of the cells (observations), truncated to an integer.
min_cells = int(0.05 * adata.n_obs)
# Filter regions in place using that minimum number of cells.
sc.pp.filter_genes(adata, min_cells=min_cells)

# Report how many regions remain.
print("# regions after filtering:", adata.n_vars)

# regions before filtering: 115554
# regions after filtering: 33142
