In [None]:
import anndata
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import scanpy as sc
import scvelo as scv
import pandas as pd
import pyreadr

# Mouse Embryo Dataset
Cao, J., Spielmann, M., Qiu, X. et al. The single-cell transcriptional landscape of mammalian organogenesis. Nature 566, 496–502 (2019). https://doi.org/10.1038/s41586-019-0969-x

In [None]:
# Load the gene and cell ID tables. The _u / _s suffixes correspond to the
# unspliced / spliced matrices assembled further down in the notebook.
genes_u, cells_u, genes_s, cells_s = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/scratch/blaauw_root/blaauw1/gyichen/gene_name_u.csv",
        "/scratch/blaauw_root/blaauw1/gyichen/cell_name_u.csv",
        "/scratch/blaauw_root/blaauw1/gyichen/gene_name_s.csv",
        "/scratch/blaauw_root/blaauw1/gyichen/cell_name_s.csv",
    )
)

In [None]:
# Keep only the ID column of each table, as a NumPy array.
genes_u, genes_s = genes_u['genes'].to_numpy(), genes_s['genes'].to_numpy()
cells_u, cells_s = cells_u['cells'].to_numpy(), cells_s['cells'].to_numpy()

In [None]:
# The unspliced and spliced exports must list genes and cells in the same
# order, because later cells use genes_s / cells_s to label both matrices.
# np.array_equal also returns False (instead of erroring or broadcasting)
# when the two arrays differ in length.
genes_match = np.array_equal(genes_u, genes_s)
cells_match = np.array_equal(cells_u, cells_s)
print(genes_match, cells_match)
assert genes_match and cells_match, \
    "gene/cell IDs differ between the unspliced and spliced exports"

In [None]:
# rows are genes
# Each table holds one record per stored matrix entry: 'row' is the row
# (gene) index of the entry and 'val' its value.
U_ix = pd.read_csv("/scratch/blaauw_root/blaauw1/gyichen/U_ix.csv")
S_ix = pd.read_csv("/scratch/blaauw_root/blaauw1/gyichen/S_ix.csv")
row_u, val_u = U_ix['row'].to_numpy(), U_ix['val'].to_numpy()
row_s, val_s = S_ix['row'].to_numpy(), S_ix['val'].to_numpy()

In [None]:
# Inspect the largest row (gene) index among the spliced entries.
np.max(row_s)

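$U_j$ and $S_j$ contain the cumulative number of nonzero elements up to each column, so the difference between consecutive entries is the number of nonzero elements in that column (cell).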

In [None]:
# columns are cells
U_j = pd.read_csv("/scratch/blaauw_root/blaauw1/gyichen/U_j.csv")
S_j = pd.read_csv("/scratch/blaauw_root/blaauw1/gyichen/S_j.csv")
# The 'col' column is cumulative; consecutive differences give the number
# of stored entries in each column.
col_u_sum = np.diff(U_j['col'].to_numpy())
col_s_sum = np.diff(S_j['col'].to_numpy())

In [None]:
# Read the per-cell annotation table from the RDS file; the table is stored
# under the key None in the mapping returned by read_r.
anno = pyreadr.read_r("/nfs/turbo/umms-welchjd/yichen/data/scRNA/MOCA_df_cell.rds")[None]
# Rename the two annotation columns.
anno.columns = ['clusters','day']

In [None]:
# The annotation rows must be in the same order as cells_s, because anno is
# attached as the per-cell table of a matrix whose rows follow cells_s.
# np.array_equal also returns False (instead of erroring or broadcasting)
# when the two arrays differ in length.
anno_matches_cells = np.array_equal(anno.index.to_numpy(), cells_s)
print(anno_matches_cells)
assert anno_matches_cells, \
    "annotation index does not match the cell IDs of the count matrices"

In [None]:
# Matrix dimensions: N cells by G genes.
N, G = len(cells_s), len(genes_s)
print(f"Number of cells: {N}, Number of Genes: {G}")

In [None]:
# Build cell-by-gene sparse matrices from the exported triplets.
# col_*_sum[i] is the number of stored values belonging to cell i, and the
# values are laid out cell after cell, so repeating each cell index by its
# count yields one cell index per stored value.
assert len(col_u_sum) == N and len(col_s_sum) == N, \
    "expected exactly one nonzero count per cell"
assert col_u_sum.sum() == len(row_u) and col_s_sum.sum() == len(row_s), \
    "per-cell nonzero counts do not add up to the number of stored values"

cell_index = np.arange(N)
col_u = np.repeat(cell_index, col_u_sum)
col_s = np.repeat(cell_index, col_s_sum)

# Cells become the rows and genes the columns of the resulting matrices.
U = sp.sparse.csr_matrix((val_u, (col_u, row_u)), shape=(N,G))
S = sp.sparse.csr_matrix((val_s, (col_s, row_s)), shape=(N,G))

In [None]:
# Main expression matrix: element-wise sum of the unspliced and spliced matrices.
X = U+S

In [None]:
# Assemble the AnnData object: X holds U + S, obs the per-cell annotation,
# var an empty table indexed by gene ID, and the two count matrices are
# kept as separate layers.
gene_table = pd.DataFrame({}, index=pd.Index(genes_s))
count_layers = {'unspliced': U, 'spliced': S}
adata = anndata.AnnData(X=X, obs=anno, var=gene_table, layers=count_layers)

In [None]:
# Display the per-cell annotation table stored on the AnnData object.
adata.obs

In [None]:
# Save the assembled dataset to disk; the annotation section below reads
# the same file back.
adata.write_h5ad("/nfs/turbo/umms-welchjd/yichen/data/scRNA/mouse_E9_13.h5ad")

## Add Major Cell Type Annotation

In [None]:
# Load the cell annotation table; later cells read its 'sample' and
# 'Main_trajectory' columns.
cell_anno = pd.read_csv("/scratch/blaauw_root/blaauw1/gyichen/cell_annotate.csv")

In [None]:
# List the columns available in the annotation table.
cell_anno.columns

In [None]:
# Count the distinct trajectory labels, ignoring entries that are not
# strings.
x = cell_anno["Main_trajectory"].to_numpy()
isstr = np.array([isinstance(label, str) for label in x])
x = x[isstr]
np.unique(x).size

In [None]:
# Reload the dataset written earlier in the notebook.
adata = anndata.read_h5ad("/nfs/turbo/umms-welchjd/yichen/data/scRNA/mouse_E9_13.h5ad")

In [None]:
# Cell IDs on both sides of the match: the index of adata.obs and the
# 'sample' column of the annotation table.
cell_id = adata.obs.index.to_numpy()
cell_id_anno = cell_anno['sample'].to_numpy()

In [None]:
# Boolean mask over the annotation rows: True where the annotated cell ID
# is present in adata. Index.isin does the whole membership test in one
# vectorised call instead of scanning cell_id once per annotated cell.
is_in_adata = pd.Index(cell_id_anno).isin(cell_id)
# NOTE: cell_labels follows the row order of cell_anno, not of adata.obs;
# align by cell ID before attaching the labels to adata.
cell_labels = cell_anno["Main_trajectory"][is_in_adata].to_numpy()