In [1]:
import anndata

In [2]:
# Root of the shared data directory, given as a relative path.
DATA_FOLDER = "../../../../data/"
# Read the preprocessed VASA AnnData file from disk.
adata = anndata.read_h5ad(f"{DATA_FOLDER}cellcycle_maxine/VASA_preprocesseed.h5ad")
# Bare expression so the notebook renders the AnnData summary (shape, obs/var columns, layers).
adata

AnnData object with n_obs × n_vars = 38913 × 16011
    obs: 'Cell_ID', 'obs_names', 'Stage', 'Dataset', 'G0?', 'UMAP_phase', 'UMI_SUM', 'S-phase', 'Celltype', 's_counts', 'u_counts', 'percent_unspliced', 'total_counts', 'pca_theta'
    var: 'var_names', 'Type', 'n_cells'
    uns: 'log1p'
    layers: 'matrix', 's_log', 's_norm', 'spliced', 'u_log', 'u_norm', 'unspliced'

In [4]:
import numpy as np

# Short curated list of cell-cycle gene symbols used later as `retain_genes`.
# Entries are in alphabetical order; each row below holds one or two initial letters.
SMALL_CELL_CYCLE_GENE_SET = np.array(
    [
        # A-B
        "Anln", "Anp32e", "Atad2", "Aurka", "Aurkb",
        "Birc5", "Blm", "Brip1", "Bub1",
        # C
        "Casp8ap2", "Cbx5", "Ccnb2", "Ccne2", "Cdc20", "Cdc25c", "Cdc45", "Cdc6",
        "Cdca2", "Cdca3", "Cdca7", "Cdca8", "Cdk1", "Cenpa", "Cenpe", "Cenpf",
        "Cenpu", "Chaf1b", "Ckap2", "Ckap2l", "Ckap5", "Cks1b", "Cks2", "Clspn",
        "Ctcf",
        # D-F
        "Dlgap5", "Dscc1", "Dtl",
        "E2f8", "Ect2", "Esco2", "Exo1",
        "Fen1",
        # G-L
        "G2e3", "Gas2l3", "Gins2", "Gmnn", "Gtse1",
        "Hells", "Hjurp", "Hmgb2", "Hmmr",
        "Jpt1",
        "Kif11", "Kif20b", "Kif23", "Kif2c",
        "Lbr",
        # M-P
        "Mcm2", "Mcm4", "Mcm5", "Mcm6", "Mki67", "Msh2",
        "Nasp", "Ncapd2", "Ndc80", "Nek2", "Nuf2", "Nusap1",
        "Pcna", "Pimreg", "Pola1", "Pold3", "Prim1", "Psrc1",
        # R-S
        "Rad51", "Rad51ap1", "Rangap1", "Rfc2", "Rpa2", "Rrm1", "Rrm2",
        "Slbp", "Smc4",
        # T-W
        "Tacc3", "Tipin", "Tmpo", "Top2a", "Tpx2", "Ttk", "Tubb4b", "Tyms",
        "Ube2c", "Ubr7", "Uhrf1", "Ung", "Usp1",
        "Wdr76",
    ]
)

In [None]:
# Keep only the listed genes that are present in this dataset's var_names
# (np.intersect1d returns the sorted, unique common values).
# This rebinds the constant, so the full list is only recoverable by re-running its definition cell.
SMALL_CELL_CYCLE_GENE_SET = np.intersect1d(SMALL_CELL_CYCLE_GENE_SET, adata.var_names)

In [None]:
import scvelo as scv

# Preprocess `adata` in place. Per the logged output this filters genes by shared
# counts, normalizes the spliced/unspliced counts, selects highly variable genes
# and log-transforms X.
# `retain_genes` passes the cell-cycle gene set so those genes are kept; presumably
# this is why 2086 genes remain rather than exactly `n_top_genes` — TODO confirm.
scv.pp.filter_and_normalize(
    adata,
    min_shared_counts=20,
    n_top_genes=2000,
    retain_genes=SMALL_CELL_CYCLE_GENE_SET,
)

# Compute the neighbor graph and the 'Ms'/'Mu' moment layers (see logged output).
scv.pp.moments(adata, n_pcs=30, n_neighbors=30)

Filtered out 2233 genes that are detected 20 counts (shared).
Normalized count data: spliced, unspliced.
Extracted 2086 highly variable genes.


  log1p(adata)


Logarithmized X.


  scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
  neighbors(
  _set_pca(adata=adata, n_pcs=n_pcs, use_highly_variable=use_highly_variable)


computing neighbors
    finished (0:00:36) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing moments based on connectivities
    finished (0:00:05) --> added 
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)


In [9]:
# Display the AnnData summary again to inspect the object after preprocessing.
adata

AnnData object with n_obs × n_vars = 38913 × 2086
    obs: 'Cell_ID', 'obs_names', 'Stage', 'Dataset', 'G0?', 'UMAP_phase', 'UMI_SUM', 'S-phase', 'Celltype', 's_counts', 'u_counts', 'percent_unspliced', 'total_counts', 'pca_theta', 'initial_size_unspliced', 'initial_size_spliced', 'initial_size', 'n_counts'
    var: 'var_names', 'Type', 'n_cells', 'means', 'dispersions', 'dispersions_norm', 'highly_variable'
    uns: 'log1p', 'pca', 'neighbors'
    obsm: 'X_pca'
    varm: 'PCs'
    layers: 'matrix', 's_log', 's_norm', 'spliced', 'u_log', 'u_norm', 'unspliced', 'Ms', 'Mu'
    obsp: 'distances', 'connectivities'

In [8]:
import anndata as ad
import numpy as np
import scipy.sparse as sp

# Make sure Ms and Mu are dense *inside the AnnData object*.
# Converting only local copies would leave `adata.layers` untouched and the
# written file unchanged, so the dense arrays are assigned back before saving.
Ms = adata.layers["Ms"]
Mu = adata.layers["Mu"]

if sp.issparse(Ms):
    Ms = Ms.toarray()
if sp.issparse(Mu):
    Mu = Mu.toarray()

# Store the (possibly converted) matrices so the file on disk holds dense layers.
adata.layers["Ms"] = Ms
adata.layers["Mu"] = Mu

# Write legacy-compatible file (the path used as --input_adata for the DeepCycle command).
adata.write_h5ad(f"{DATA_FOLDER}cellcycle_maxine/VASA_preprocesseed_moments.h5ad")

Command that was run:

```bash
python DeepCycle.py \
  --input_adata /data/cellcycle_maxine/VASA_preprocesseed_moments.h5ad \
  --gene_list go_annotation/small_cycling_set.txt \
  --base_gene Top2a \
  --expression_threshold 0.5 \
  --gpu \
  --output_adata CoPhaser/paper/code/figure_2/vasa/deepcycle_res.h5ad
```

In [4]:
import anndata
import pickle
from CoPhaser import utils

# Directory holding the pickled results (relative path); separate from the
# DATA_FOLDER root used by the preprocessing cells.
DATA = "../data/"

# Load the AnnData file with the DeepCycle results (presumably the --output_adata
# of the command above — confirm the working directory). This rebinds `adata`,
# replacing the object built by the preprocessing cells.
adata = anndata.read_h5ad("deepcycle_res.h5ad")

In [5]:
# Load the previously saved results dict.
# A context manager closes the file handle deterministically instead of leaving
# it to the garbage collector.
# NOTE: unpickling can execute arbitrary code — only load files you produced yourself.
with open(DATA + "vasa_res.pkl", "rb") as results_file:
    vasa_results = pickle.load(results_file)

In [8]:
# Inspect the per-cell metadata; this file's obs includes a `cell_cycle_theta`
# column that is not among the obs columns listed for the preprocessed object.
adata.obs

Unnamed: 0,Cell_ID,obs_names,Stage,Dataset,G0?,UMAP_phase,UMI_SUM,S-phase,Celltype,s_counts,u_counts,percent_unspliced,total_counts,pca_theta,initial_size_unspliced,initial_size_spliced,initial_size,n_counts,cell_cycle_theta
0,AAACAAACAAAGCCTA_E8.5-19_i36,0,E8.5,E8.5-19_i36,0.0,2.234725,3716.0,,,3714,4097,0.524517,3714,3.116877,4097,3714,3644.830308,3499.841937,0.11
1,AAACAAACAAAGCCTA_E8.5-7_i20,1,E8.5,E8.5-7_i20,0.0,4.659329,6066.0,,,6066,5598,0.479938,6066,0.584829,5598,6066,4052.869163,3905.797666,0.74
2,AAACAAACAACAATCC_E9.5-6_i42,2,E9.5,E9.5-6_i42,0.0,1.721789,6203.0,1.0,Intermediate Mesoderm I,6203,5817,0.483943,6203,-1.683279,5817,6203,4001.021097,3784.249579,0.16
3,AAACAAACAACCCTTG_E8.5-2_i16,3,E8.5,E8.5-2_i16,0.0,5.050425,26326.0,,,26326,22491,0.460721,26326,-0.140485,22491,26326,4865.090157,4709.485790,0.95
4,AAACAAACAACCCTTG_E9.5-4_i27,4,E9.5,E9.5-4_i27,0.0,2.447364,7234.0,1.0,Paraxial Mesoderm,7233,10065,0.581859,7233,3.073479,10065,7233,4260.573320,4075.968091,0.17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46117,TGTTATCATTAGGACC_E9.5-5_i28,46117,E9.5,E9.5-5_i28,0.0,6.099113,12009.0,0.0,Intermediate Mesoderm I,12008,12064,0.501163,12008,0.965278,12064,12008,4625.876817,4487.888805,0.87
46118,TGTTATCATTCCAGAC_E9.5-5_i28,46118,E9.5,E9.5-5_i28,0.0,1.952546,8341.0,1.0,Paraxial Mesoderm,8341,8797,0.513304,8341,3.028000,8797,8341,4342.116611,4132.540839,0.22
46120,TGTTATCATTGATCTA_E8.5-2_i16,46120,E8.5,E8.5-2_i16,0.0,6.067719,3548.0,0.0,Somites,3548,5096,0.589542,3548,0.331325,5096,3548,3595.733302,3517.078543,0.80
46121,TGTTATCATTGCATAT_E9.5-8_i44,46121,E9.5,E9.5-8_i44,0.0,1.767173,4804.0,,,4804,5316,0.525296,4804,2.944686,5316,4804,3915.664443,3756.094152,0.19


In [9]:
# Per-cell `cell_cycle_theta` values from the DeepCycle result, taken as a column of adata.obs.
# NOTE(review): the displayed values lie between 0 and 1 — confirm the unit expected downstream.
deepcycle_phase = adata.obs["cell_cycle_theta"]

In [None]:
# Score the DeepCycle phases against the "author_labels" entry of the results dict
# using CoPhaser's helper; the returned value is printed as the JS distance.
# NOTE(review): assumes both inputs describe the same cells in the same order —
# confirm against utils.get_jensenshannon_raw.
res = utils.get_jensenshannon_raw(deepcycle_phase, vasa_results["author_labels"].values)
print("DeepCycle JS distance:", res)
# Record the score (wrapped in a one-element list) under the "DeepCycle" key and
# keep the phases alongside the other stored results.
vasa_results["JS_res"]["DeepCycle"] = [res]
vasa_results["deepcycle_phase"] = deepcycle_phase

In [None]:
# Saving is currently disabled: uncommenting the two lines below would overwrite
# vasa_res.pkl with the updated `vasa_results` dict.
# with open(DATA + "vasa_res.pkl", "wb") as f:
#     pickle.dump(vasa_results, f)