In [1]:
import argparse
import dynamo as dyn
import scanpy as sc
import gc
from pathlib import Path
import warnings
import utils
import importlib
import pandas as pd
import anndata as ad
import mygene
import scvelo as scv
# Re-import the local `utils` module so edits made after the kernel started take effect.
importlib.reload(utils)
# Blanket filter: ignore every warning category for the rest of the session.
warnings.filterwarnings('ignore')
# NOTE(review): this message-specific filter looks redundant after the blanket
# 'ignore' above — confirm before removing.
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
# Configure dynamo's figure parameters ('dynamo' preset, white background).
dyn.configuration.set_figure_params('dynamo', background='white')
# Report the installed versions of dynamo's dependencies (see the table in the cell output).
dyn.get_all_dependencies_version()
%load_ext autoreload
%autoreload 2

package,mudata,umap-learn,pynndescent,numdifftools,leidenalg,igraph,seaborn,statsmodels,numba,session-info,dynamo-release,scipy,pandas,loompy,colorcet,openpyxl,networkx,matplotlib,pre-commit,typing-extensions,tqdm,setuptools
version,0.2.3,0.5.7,0.5.13,0.9.41,0.10.2,0.11.8,0.13.2,0.14.4,0.61.0,1.0.0,1.4.2rc1,1.15.2,2.2.3,3.0.8,3.1.0,3.1.5,3.4.2,3.10.0,4.1.0,4.12.2,4.62.3,59.5.0


In [2]:
# Root folder that holds all datasets.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
DATA_DIR = Path("/root/autodl-tmp/dataset") 
# Sub-folder of DATA_DIR for this dataset (contains "raw" and "processed").
DATASET = "labeled_embryo"
# Number of folds used for the split and for the per-fold preprocessing loop.
K_FOLD = 3
# obs column used as the cluster label.
# NOTE(review): not referenced by the cells visible here, which spell out "time" directly.
CLUSTER_KEY = 'time'
# Random seed passed to the UMAP computation.
SEED = 1234

In [3]:
# Toggle for writing the processed AnnData files to disk.
SAVE_DATA = True

# "raw" is always needed; "processed" only when results are going to be saved.
dataset_dir = DATA_DIR / DATASET
required_subdirs = ["raw"] + (["processed"] if SAVE_DATA else [])
for subdir in required_subdirs:
    (dataset_dir / subdir).mkdir(parents=True, exist_ok=True)

In [4]:
# Both input files live in the dataset's "raw" folder.
raw_dir = DATA_DIR / DATASET / "raw"
neuron_labeling = sc.read(raw_dir / "neuron_labeling.h5ad")
neuron_splicing = sc.read(raw_dir / "neuron_splicing.h5ad")

In [5]:
# Per-gene flag column (expected to be boolean); keep the names of the flagged genes.
activity_flags = neuron_labeling.var["activity_genes"]
activity_genes = activity_flags[activity_flags].index.tolist()

In [6]:
# Assign the same constant labeling time (2) to every cell.
# NOTE(review): the unit is not stated in this notebook — confirm.
neuron_labeling.obs['label_time'] = 2 # this is the labeling time 
# Name of the obs column that stores the labeling time.
tkey = 'label_time'

In [7]:
# Genes present in both datasets. Sorted so that the gene order is the same on
# every run: iterating a set of strings follows hash order, which changes between
# interpreter sessions when hash randomization is active.
common_genes = sorted(set(neuron_labeling.var_names) & set(neuron_splicing.var_names))

In [None]:
# Restrict both datasets to the shared genes and put the splicing cells in the
# same order as the labeling cells. The explicit copies turn the slices into
# independent AnnData objects, so the assignments below do not rely on anndata's
# implicit copy-on-modify behaviour for views.
neuron_labeling = neuron_labeling[:, common_genes].copy()
neuron_splicing = neuron_splicing[neuron_labeling.obs_names, common_genes].copy()
# Rescale the time column by 1/60 (presumably minutes -> hours; confirm units).
# NOTE: not idempotent — re-running this cell divides by 60 again.
neuron_labeling.obs['time'] = neuron_labeling.obs['time'] / 60
# Give the splicing data the same per-cell annotations as the labeling data.
neuron_splicing.obs = neuron_labeling.obs.copy()

In [None]:
# Store the time column as string-valued categories in both datasets.
for adata in (neuron_labeling, neuron_splicing):
    adata.obs["time"] = adata.obs["time"].astype(str).astype("category")

In [None]:
# Split the labeling data into K_FOLD subsets with the project helper, keyed on
# the cluster column from the config cell (CLUSTER_KEY == 'time').
# NOTE(review): SEED is not passed here — confirm the helper is deterministic.
sub_adata_labeling_lst = utils.split_anndata_stratified(
    neuron_labeling, n_splits=K_FOLD, cluster_key=CLUSTER_KEY
)

In [10]:
def _reset_neighbor_cache(adata):
    """Delete stored PCA and neighbor results from ``adata`` if they exist."""
    for store, key in ((adata.obsm, 'X_pca'), (adata.uns, 'pca'), (adata.uns, 'neighbors')):
        if key in store:
            del store[key]


def _compute_graph_and_umap(adata):
    """Recompute moments/neighbors, fill in neighbor indices and run UMAP in place."""
    _reset_neighbor_cache(adata)
    scv.pp.moments(adata, n_neighbors=30, n_pcs=30)
    utils.fill_in_neighbors_indices(adata)
    sc.tl.umap(adata, random_state=SEED)


def _preprocess_splicing(adata):
    """Filter, normalize, gene-select and embed a spliced/unspliced AnnData.

    Parameters
    ----------
    adata
        AnnData (or view) with 'spliced' and 'unspliced' layers. It is copied
        first, so the caller's object is left untouched.

    Returns
    -------
    tuple
        ``(processed_adata, genes)`` where ``genes`` is the sorted list of
        retained genes: the top 2000 highly variable genes plus every gene in
        ``activity_genes`` that survived filtering.
    """
    adata = adata.copy()
    # Keep the untouched counts in separate layers. They are copied so that any
    # in-place normalization of 'spliced'/'unspliced' cannot alter them.
    adata.layers['raw_spliced'] = adata.layers['spliced'].copy()
    adata.layers['raw_unspliced'] = adata.layers['unspliced'].copy()
    scv.pp.filter_and_normalize(
        adata,
        min_shared_counts=5,
        n_top_genes=None  # do not cap the number of genes here; only filter first
    )
    sc.pp.highly_variable_genes(adata, n_top_genes=2000, subset=False)
    hvg_genes = set(adata.var_names[adata.var["highly_variable"]])
    kept_activity_genes = set(adata.var_names) & set(activity_genes)
    # Sorted: a set union has no stable order, and the gene order ends up in the saved files.
    genes = sorted(hvg_genes | kept_activity_genes)
    adata = adata[:, genes].copy()
    print(adata.n_vars)
    _compute_graph_and_umap(adata)
    # Per-cell sums of the raw counts over the retained genes only.
    # NOTE(review): confirm a library size over all genes was not intended.
    adata.obs['u_lib_size_raw'] = adata.layers['raw_unspliced'].toarray().sum(-1)
    adata.obs['s_lib_size_raw'] = adata.layers['raw_spliced'].toarray().sum(-1)
    return adata, genes


def _preprocess_labeling(adata, genes):
    """Return a copy of the labeling AnnData restricted to ``genes`` and embedded."""
    adata = adata[:, genes].copy()
    _compute_graph_and_umap(adata)
    return adata


def _save_processed(adata, filename):
    """Write ``adata`` into the dataset's "processed" folder when SAVE_DATA is set."""
    if SAVE_DATA:
        adata.write_h5ad(DATA_DIR / DATASET / "processed" / filename)


def _iter_runs():
    """Yield ``(tag, splicing_source, labeling_source)`` for the full data, then each fold."""
    yield "full", neuron_splicing, neuron_labeling
    for i in range(K_FOLD):
        fold_labeling = sub_adata_labeling_lst[i]
        yield i, neuron_splicing[fold_labeling.obs_names, :], fold_labeling


# Same pipeline for the full dataset and for every fold: the splicing data picks
# the genes, and the matching labeling data is restricted to those same genes.
for tag, splicing_source, labeling_source in _iter_runs():
    sub_adata, final_genes = _preprocess_splicing(splicing_source)
    _save_processed(sub_adata, f"adata_preprocessed_{tag}.h5ad")

    sub_adata = _preprocess_labeling(labeling_source, final_genes)
    _save_processed(sub_adata, f"adata_preprocessed_label_{tag}.h5ad")

    # Release the large intermediate before starting the next run.
    del sub_adata
    gc.collect()

Filtered out 15521 genes that are detected 5 counts (shared).
Normalized count data: X, spliced, unspliced.
Logarithmized X.


  log1p(adata)


2008


  scv.pp.moments(sub_adata, n_neighbors=30, n_pcs=30)
  neighbors(
  _set_pca(adata=adata, n_pcs=n_pcs, use_highly_variable=use_highly_variable)


computing neighbors
    finished (0:00:10) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing moments based on connectivities
    finished (0:00:01) --> added 
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)


  scv.pp.moments(sub_adata, n_neighbors=30, n_pcs=30)
  neighbors(
  _set_pca(adata=adata, n_pcs=n_pcs, use_highly_variable=use_highly_variable)


computing neighbors
    finished (0:00:05) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)


100005