In [4]:
import scanpy as sc
import scvelo as scv
import numpy as np
from pathlib import Path
import pandas as pd
import utils
import matplotlib.pyplot as plt

In [None]:
# Name of the dataset folder; joined onto DATA_DIR to locate the "raw" inputs
# and the "processed" outputs of this notebook.
DATASET: str = "pancreas_quantification"
# Base directory under which the dataset folder lives.
DATA_DIR: Path = Path("/root/autodl-tmp/dataset/")

In [3]:
# Quantification pipelines whose count matrices are preprocessed below.
# The order is significant: the position of each entry is used as the numeric
# suffix of the corresponding preprocessed output file.
quantification_methods = [
    "alevin",
    "dropest",
    "kallisto_bustools",
    "starsolo",
    "velocyto",
]

In [None]:
# Run the same scVelo preprocessing on every quantification result and store
# one preprocessed .h5ad per method (file suffix = position in the list).
n_top_genes = 2000  # number of highly variable genes requested per dataset
processed_dir = DATA_DIR / DATASET / "processed"
# Make sure the output directory exists before anything is written to it.
processed_dir.mkdir(parents=True, exist_ok=True)

for idx, method in enumerate(quantification_methods):
    adata = sc.read_h5ad(DATA_DIR / DATASET / "raw" / f"adata_{method}.h5ad")

    # Drop cells without a cluster label. The explicit copy turns the AnnData
    # view into an independent object, so the layer assignments below do not
    # rely on (and warn about) an implicit view-to-copy conversion.
    adata = adata[~pd.isna(adata.obs["clusters"]), :].copy()

    # Snapshot the raw counts as real copies: the 'spliced'/'unspliced' layers
    # are normalised by filter_and_normalize, and an alias could be modified
    # together with them.
    adata.layers['raw_spliced'] = adata.layers['spliced'].copy()
    adata.layers['raw_unspliced'] = adata.layers['unspliced'].copy()

    scv.pp.filter_and_normalize(adata, min_shared_counts=20, n_top_genes=n_top_genes)
    if adata.n_vars < n_top_genes:
        # Fewer genes than requested survived the filter; select all of them.
        # NOTE(review): presumably done so the highly-variable annotations
        # exist in this case as well; confirm intent.
        sc.pp.highly_variable_genes(adata, n_top_genes=adata.n_vars, subset=True)

    # Remove any PCA stored in the object so that moments() does not reuse it.
    if 'X_pca' in adata.obsm:
        del adata.obsm['X_pca']
    scv.pp.moments(adata, n_neighbors=30, n_pcs=30)
    # Project helper; presumably stores neighbor indices derived from the graph
    # computed above (TODO confirm against utils).
    utils.fill_in_neighbors_indices(adata)
    sc.tl.umap(adata, random_state=1234)

    # Per-cell raw library sizes. Summing along axis 1 avoids densifying the
    # matrix and also works if a layer happens to be a dense array.
    adata.obs['u_lib_size_raw'] = np.asarray(adata.layers['raw_unspliced'].sum(axis=1)).ravel()
    adata.obs['s_lib_size_raw'] = np.asarray(adata.layers['raw_spliced'].sum(axis=1)).ravel()

    adata.write_h5ad(processed_dir / f"adata_preprocessed_{idx}.h5ad")
    print(f"Processed {method} with {adata.n_obs} cells and {adata.n_vars} genes.")

Filtered out 45279 genes that are detected 20 counts (shared).
Extracted 2000 highly variable genes.
Logarithmized X.


  log1p(adata)
  scv.pp.moments(adata, n_neighbors=30, n_pcs=30)
  neighbors(
  _set_pca(adata=adata, n_pcs=n_pcs, use_highly_variable=use_highly_variable)


computing neighbors
    finished (0:00:07) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing moments based on connectivities
    finished (0:00:00) --> added 
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)
Processed alevin with 3696 cells and 2000 genes.
Filtered out 46399 genes that are detected 20 counts (shared).
Normalized count data: X, spliced, unspliced.
Extracted 2000 highly variable genes.
Logarithmized X.


  log1p(adata)
  scv.pp.moments(adata, n_neighbors=30, n_pcs=30)
  neighbors(
  _set_pca(adata=adata, n_pcs=n_pcs, use_highly_variable=use_highly_variable)


computing neighbors
    finished (0:00:00) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing moments based on connectivities
    finished (0:00:00) --> added 
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)
Processed dropest with 3696 cells and 2000 genes.
Filtered out 47014 genes that are detected 20 counts (shared).
Normalized count data: X, spliced, unspliced.
Extracted 2000 highly variable genes.
Logarithmized X.


  log1p(adata)
  scv.pp.moments(adata, n_neighbors=30, n_pcs=30)
  neighbors(
  _set_pca(adata=adata, n_pcs=n_pcs, use_highly_variable=use_highly_variable)


computing neighbors
    finished (0:00:00) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing moments based on connectivities
    finished (0:00:00) --> added 
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)
Processed kallisto_bustools with 3696 cells and 2000 genes.
Filtered out 47304 genes that are detected 20 counts (shared).
Normalized count data: X, spliced, unspliced.
Extracted 2000 highly variable genes.
Logarithmized X.


  log1p(adata)
  scv.pp.moments(adata, n_neighbors=30, n_pcs=30)
  neighbors(
  _set_pca(adata=adata, n_pcs=n_pcs, use_highly_variable=use_highly_variable)


computing neighbors
    finished (0:00:00) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing moments based on connectivities
    finished (0:00:00) --> added 
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)
Processed starsolo with 3696 cells and 2000 genes.
Filtered out 47036 genes that are detected 20 counts (shared).
Normalized count data: X, spliced, unspliced.
Extracted 2000 highly variable genes.
Logarithmized X.


  log1p(adata)
  scv.pp.moments(adata, n_neighbors=30, n_pcs=30)
  neighbors(
  _set_pca(adata=adata, n_pcs=n_pcs, use_highly_variable=use_highly_variable)


computing neighbors
    finished (0:00:00) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing moments based on connectivities
    finished (0:00:00) --> added 
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)
Processed velocyto with 3696 cells and 2000 genes.


In [None]:
# NOTE: this cell is intentionally disabled (everything below is commented out).
# When enabled it would reload each preprocessed AnnData, draw a UMAP scatter
# coloured by the 'clusters' annotation, and save one PNG per quantification
# method under DATA_DIR / DATASET / "umap_figures".
# NOTE(review): the "umap_figures" directory is not created anywhere in the
# visible cells; it presumably has to exist before savefig is called.
# for idx, method in enumerate(quantification_methods):
#     adata = sc.read_h5ad(DATA_DIR / DATASET / "processed" / f"adata_preprocessed_{idx}.h5ad")
#     clusters = adata.obs['clusters'].astype('category')

#     plt.figure(figsize=(8, 6))
#     colors = plt.cm.tab20(np.linspace(0, 1, len(clusters.cat.categories)))
#     for i, cluster in enumerate(clusters.cat.categories):
#         mask = (clusters == cluster)
#         plt.scatter(
#             adata.obsm['X_umap'][mask, 0],
#             adata.obsm['X_umap'][mask, 1],
#             s=2,
#             color=colors[i],
#             alpha=0.7,
#             label=cluster
#         )
#     plt.title(f"UMAP projection of {method}")
#     plt.legend(markerscale=4, fontsize=8, frameon=False)
#     plt.savefig(DATA_DIR / DATASET / "umap_figures" / f"umap_{method}.png", dpi=300)
#     plt.close()