In [1]:
import pandas as pd
import scanpy as sc
import h5py
import numpy as np
import anndata
from metacells import tools
from metacells import pipeline

In [2]:
single_cell_h5py = h5py.File('/data/passala/Data_from_CoCoCoNet/single_cell_data/Ara_data.hdf5','r')
list(single_cell_h5py.keys())
col_data = single_cell_h5py['coldata']
row_data = single_cell_h5py['rowdata']
embedding_data = single_cell_h5py['embedding']
normalized_counts = single_cell_h5py['normalized_counts']

row_data_decoded = []

for gene_name in row_data:
    row_data_decoded.append(gene_name[0].decode())

cell_type_number =[]
study_number = []
study_id = []
batch_cluster = []
meta_cluster = []
umap_coordinates = []

for cell_identity in col_data:
    cell_type_number.append(cell_identity[0])  
    study_number.append(cell_identity[1])
    study_id.append(cell_identity[2])
    batch_cluster.append(cell_identity[3])
    meta_cluster.append(cell_identity[4])

barcode_for_each_cell = [] 
for barcode in embedding_data:
    barcode_for_each_cell.append(barcode[2])
    current_umap_coordinates =[barcode[1],barcode[0]]
    umap_coordinates.append(current_umap_coordinates)
umap_coordinates = np.array(umap_coordinates)


In [3]:
obs_arabidop = pd.DataFrame(index = barcode_for_each_cell, data = list(zip(cell_type_number,study_number,batch_cluster, meta_cluster)), columns = ['Cell Type','Study Number','Batch Cluster','Meta Cluster'] )

vars_arabidop = pd.DataFrame(index = row_data_decoded)
single_cell_arabidopsis_root_4_datasets = anndata.AnnData(X = normalized_counts[:],obs = obs_arabidop, var = vars_arabidop)
single_cell_arabidopsis_root_4_datasets.obsm['X_umap'] = umap_coordinates
umap_df = pd.DataFrame(data = single_cell_arabidopsis_root_4_datasets.obsm['X_umap'], columns = ['Axis 1','Axis 2'], index = barcode_for_each_cell)
bad_values = umap_df.sort_values(by = 'Axis 1', ascending = False).head(6).index
single_cell_arabidopsis_root_4_datasets.obs.loc[bad_values]
good_obs = single_cell_arabidopsis_root_4_datasets.obs.loc[~single_cell_arabidopsis_root_4_datasets.obs.index.isin(bad_values)]
single_cell_arabidopsis_root_4_datasets  = single_cell_arabidopsis_root_4_datasets[good_obs.index,:]

  single_cell_arabidopsis_root_4_datasets = anndata.AnnData(X = normalized_counts[:],obs = obs_arabidop, var = vars_arabidop)


In [4]:
arabi_cell_cycle = pd.read_csv('/data/passala/Collaborator_Data/Maize_arabi_Jack_Collab/arabi_cell_cycle_genes.csv')


In [5]:
single_cell_arabidopsis_root_4_datasets

View of AnnData object with n_obs × n_vars = 16635 × 22271
    obs: 'Cell Type', 'Study Number', 'Batch Cluster', 'Meta Cluster'
    obsm: 'X_umap'

In [6]:
np.asmatrix(single_cell_arabidopsis_root_4_datasets.X)

matrix([[5.8552675, 0.       , 0.       , ..., 5.8552675, 0.       ,
         0.       ],
        [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
         0.       ],
        [0.       , 0.       , 0.       , ..., 5.2714353, 0.       ,
         0.       ],
        ...,
        [5.1976123, 0.       , 0.       , ..., 4.1099987, 0.       ,
         0.       ],
        [0.       , 4.5786586, 0.       , ..., 0.       , 0.       ,
         0.       ],
        [0.       , 3.337244 , 0.       , ..., 3.337244 , 0.       ,
         0.       ]], dtype=float32)

In [7]:
tools.find_bursty_lonely_genes(single_cell_arabidopsis_root_4_datasets, random_seed= 201, min_gene_total= 0)

set unnamed.var[bursty_lonely_gene]: 0 true (0%) out of 22271 bools
  adata.var[name] = data


In [8]:
cell_cycle_list = arabi_cell_cycle['Arabidopsis Genes'].to_list()

In [9]:
pipeline.mark.mark_lateral_genes(single_cell_arabidopsis_root_4_datasets, lateral_gene_names = cell_cycle_list)

set unnamed.var[lateral_gene]: 237 true (1.064%) out of 22271 bools


In [10]:
pipeline.related_genes.relate_to_lateral_genes(single_cell_arabidopsis_root_4_datasets)

set unnamed.var[lateral_genes_module]: 9197 outliers (41.3%) and 13074 grouped (58.7%) out of 22271 int32 elements with 323 groups with mean size 40.48
set unnamed.varp[lateral_genes_similarity]: csr_matrix 22271 X 22271 float32s (170929476 > 0, 34.46%)


In [11]:
single_cell_arabidopsis_root_4_datasets.var

Unnamed: 0,bursty_lonely_gene,lateral_gene,lateral_genes_module
AT1G01010,False,False,302
AT1G01020,False,False,145
AT1G01030,False,False,248
AT1G01040,False,False,70
AT1G01050,False,False,186
...,...,...,...
ATCG01110,False,False,-1
ATCG01120,False,False,-1
ATCG01130,False,False,17
ATCG01230,False,False,-1


In [12]:
pipeline.divide_and_conquer_pipeline(single_cell_arabidopsis_root_4_datasets, random_seed= 101)

set unnamed.var[selected_gene]: * -> False
set unnamed.var[rare_gene]: 10 true (0.0449%) out of 22271 bools
set unnamed.var[rare_gene_module]: 22261 outliers (99.96%) and 10 grouped (0.0449%) out of 22271 int32 elements with 1 groups with mean size 10
set unnamed.obs[cells_rare_gene_module]: 16553 outliers (99.51%) and 82 grouped (0.4929%) out of 16635 int32 elements with 1 groups with mean size 82
set unnamed.obs[rare_cell]: 82 true (0.4929%) out of 16635 bools
