In [1]:
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt
from pathlib import Path
from scipy import stats
import scvelo as scv
import pandas as pd
from sim_low_umi import downsample_anndata
import gc

In [None]:
# Root folder holding all datasets, and the sub-folder used by this notebook.
DATA_DIR = Path("/root/autodl-tmp/dataset")
DATASET = "gastrulation_downsample"

# Ensure the output folder for the preprocessed files exists
# (no error if it is already there; missing parents are created too).
processed_dir = DATA_DIR / DATASET / "processed"
processed_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Read the raw dataset from <DATA_DIR>/<DATASET>/raw and display its summary.
raw_h5ad_path = DATA_DIR / DATASET / "raw" / "MouseErythroid_anndata.h5ad"
adata = sc.read_h5ad(raw_h5ad_path)
adata

AnnData object with n_obs × n_vars = 9815 × 53801
    obs: 'sample', 'stage', 'sequencing.batch', 'theiler', 'celltype', 'clusters'
    var: 'Accession', 'Chromosome', 'End', 'Start', 'Strand', 'MURK_gene', 'Δm', 'scaled Δm'
    uns: 'celltype_colors'
    obsm: 'X_pca', 'X_umap'
    layers: 'spliced', 'unspliced'

In [4]:
# Keep an untouched copy of each count layer under a 'raw_' prefix before
# the layers are filtered and normalized.
for layer_name in ('spliced', 'unspliced'):
    adata.layers[f'raw_{layer_name}'] = adata.layers[layer_name].copy()

# Gene filtering (>= 20 shared counts), normalization, selection of the
# 2000 most variable genes and log-transform of X, all done on `adata` itself.
scv.pp.filter_and_normalize(adata, min_shared_counts=20, n_top_genes=2000)

Filtered out 47456 genes that are detected 20 counts (shared).
Normalized count data: X, spliced, unspliced.
Extracted 2000 highly variable genes.
Logarithmized X.


  log1p(adata)


In [6]:
# Build and save one preprocessed dataset per downsampling rate.
# 1.0 is the original data, 0.2 is 20% of the original data, etc.
# The file suffix `idx` is the position of the rate in the list below.
for idx, downsample_rate in enumerate([0.2, 0.4, 0.6, 0.8, 1.0]):
    downsampled_adata = downsample_anndata(
        adata,
        downsample_rate=downsample_rate,
        seed=0
    )

    # Snapshot the downsampled layers BEFORE normalization. The `.copy()` matters:
    # without it the 'raw_*' entries are the very same matrix objects as
    # 'spliced'/'unspliced', so any in-place normalization performed by
    # filter_and_normalize could silently change the "raw" layers as well.
    downsampled_adata.layers['raw_spliced'] = downsampled_adata.layers['spliced'].copy()
    downsampled_adata.layers['raw_unspliced'] = downsampled_adata.layers['unspliced'].copy()
    scv.pp.filter_and_normalize(downsampled_adata, min_shared_counts=20, n_top_genes=2000)

    # Remove PCA / neighbor-graph results that may already be present on the
    # object, so they are not reused by the steps below.
    for store, stale_key in (
        (downsampled_adata.obsm, 'X_pca'),
        (downsampled_adata.uns, 'pca'),
        (downsampled_adata.uns, 'neighbors'),
    ):
        if stale_key in store:
            del store[stale_key]

    scv.pp.moments(downsampled_adata, n_neighbors=30, n_pcs=30)
    sc.tl.umap(downsampled_adata, random_state=0)

    # Per-cell library sizes taken from the raw snapshots. Summing along axis 1
    # directly on the stored matrix avoids materializing a dense
    # (cells x genes) array; asarray + ravel flattens the result to 1-D.
    downsampled_adata.obs['u_lib_size_raw'] = np.asarray(
        downsampled_adata.layers['raw_unspliced'].sum(axis=1)
    ).ravel()
    downsampled_adata.obs['s_lib_size_raw'] = np.asarray(
        downsampled_adata.layers['raw_spliced'].sum(axis=1)
    ).ravel()

    downsampled_adata.write_h5ad(DATA_DIR / DATASET / "processed" / f"adata_preprocessed_{idx}.h5ad")

    # Release the per-rate object before the next iteration to limit peak memory.
    del downsampled_adata
    gc.collect()

Filtered out 1046 genes that are detected 20 counts (shared).
Normalized count data: spliced, unspliced.
Skip filtering by dispersion since number of variables are less than `n_top_genes`.


  log1p(adata)
  scv.pp.moments(downsampled_adata, n_neighbors=30, n_pcs=30)
  neighbors(
  _set_pca(adata=adata, n_pcs=n_pcs, use_highly_variable=use_highly_variable)


Logarithmized X.
computing neighbors
    finished (0:00:53) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing moments based on connectivities
    finished (0:00:00) --> added 
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)
Filtered out 613 genes that are detected 20 counts (shared).
Normalized count data: spliced, unspliced.
Skip filtering by dispersion since number of variables are less than `n_top_genes`.
Logarithmized X.


  log1p(adata)
  scv.pp.moments(downsampled_adata, n_neighbors=30, n_pcs=30)
  neighbors(
  _set_pca(adata=adata, n_pcs=n_pcs, use_highly_variable=use_highly_variable)


computing neighbors
    finished (0:00:05) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing moments based on connectivities
    finished (0:00:01) --> added 
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)
Filtered out 362 genes that are detected 20 counts (shared).
Normalized count data: spliced, unspliced.
Skip filtering by dispersion since number of variables are less than `n_top_genes`.
Logarithmized X.


  log1p(adata)
  scv.pp.moments(downsampled_adata, n_neighbors=30, n_pcs=30)
  neighbors(
  _set_pca(adata=adata, n_pcs=n_pcs, use_highly_variable=use_highly_variable)


computing neighbors
    finished (0:00:04) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing moments based on connectivities
    finished (0:00:01) --> added 
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)
Filtered out 154 genes that are detected 20 counts (shared).
Normalized count data: spliced, unspliced.
Skip filtering by dispersion since number of variables are less than `n_top_genes`.
Logarithmized X.


  log1p(adata)
  scv.pp.moments(downsampled_adata, n_neighbors=30, n_pcs=30)
  neighbors(
  _set_pca(adata=adata, n_pcs=n_pcs, use_highly_variable=use_highly_variable)


computing neighbors
    finished (0:00:04) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing moments based on connectivities
    finished (0:00:01) --> added 
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)
Normalized count data: spliced, unspliced.
Extracted 2000 highly variable genes.
Logarithmized X.


  log1p(adata)
  scv.pp.moments(downsampled_adata, n_neighbors=30, n_pcs=30)
  neighbors(
  _set_pca(adata=adata, n_pcs=n_pcs, use_highly_variable=use_highly_variable)


computing neighbors
    finished (0:00:04) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing moments based on connectivities
    finished (0:00:01) --> added 
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)


In [7]:
# Sanity check: reload every saved dataset and print the mean of its
# 'raw_spliced' layer (taken over all cells and genes).
processed_dir = DATA_DIR / DATASET / "processed"
rates = [0.2, 0.4, 0.6, 0.8, 1.0]
for idx in range(len(rates)):
    downsample_rate = rates[idx]
    adata = sc.read_h5ad(processed_dir / f"adata_preprocessed_{idx}.h5ad")
    # Densify and cast to float32 before averaging.
    spliced = adata.layers['raw_spliced'].toarray().astype(np.float32)
    mean_expression = spliced.mean()
    print(f"Downsample rate: {downsample_rate}, Mean spliced expression: {mean_expression:.2f}")

Downsample rate: 0.2, Mean spliced expression: 0.47
Downsample rate: 0.4, Mean spliced expression: 0.68
Downsample rate: 0.6, Mean spliced expression: 0.88
Downsample rate: 0.8, Mean spliced expression: 1.05
Downsample rate: 1.0, Mean spliced expression: 1.23
