# 5.1 Generate Overview of Embryos

In [2]:
from clustergrammer2 import net
# Empty dict placeholder.
# NOTE(review): no visible cell reads or writes `df` - confirm whether it is still needed.
df = {}

In [32]:
import pandas as pd
from glob import glob
import os
from copy import deepcopy

### Cell Type Distributions

In [48]:
# Gather the `Main_cell_type` label of every cell across all embryo sample directories.
new_samples = sorted(glob('../data/big_data/cao_2million-cell_2019_61-embryo_parquet_files_binder/*'))

list_cell_types = []
for inst_sample in new_samples:

    # per-cell metadata for this sample
    df_meta = pd.read_parquet(inst_sample + '/meta_cell.parquet')

    # `Series.get_values()` was removed in pandas 1.0; `tolist()` yields the
    # same flat list of labels and works on old and new pandas alike
    list_cell_types.extend(df_meta['Main_cell_type'].tolist())

# total number of cells seen across all samples
print(len(list_cell_types))

1386587


In [51]:
# Wrap the flat label list in a Series so pandas can tally cells per type.
ser_cell_types = pd.Series(list_cell_types)

# number of distinct labels = number of rows in the tally
print('there are', len(ser_cell_types.value_counts()), 'cell types')

# displayed as the cell output: cell counts per type, most frequent first
ser_cell_types.value_counts()

there are 38 cell types


Chondrocytes & osteoblasts       104698
Connective tissue progenitors     98964
Intermediate Mesoderm             89518
Jaw and tooth progenitors         82289
Early mesenchyme                  71949
Excitatory neurons                68567
Epithelial cells                  66209
Radial glia                       65428
Neural progenitor cells           58332
Postmitotic premature neurons     56033
Oligodendrocyte Progenitors       54606
Isthmic organizer cells           48498
Neural Tube                       45985
Inhibitory neurons                44658
Myocytes                          43197
Definitive erythroid lineage      34205
Chondroctye progenitors           33539
Inhibitory neuron progenitors     31214
Premature oligodendrocyte         29538
Limb mesenchyme                   26559
Sensory neurons                   26477
Endothelial cells                 26431
Stromal cells                     23259
Osteoblasts                       23223
Schwann cell precursor            23145


In [23]:
# Load the expression matrix and the per-cell metadata of the first embryo sample only.
if not new_samples:
    # Fail here with a clear message instead of leaving df_gex/df_meta undefined
    # (or stale from an earlier cell) for the cells that follow.
    raise FileNotFoundError('no embryo sample directories were found in new_samples')

inst_sample = new_samples[0]
df_gex = pd.read_parquet(inst_sample + '/gex.parquet')
df_meta = pd.read_parquet(inst_sample + '/meta_cell.parquet')

In [24]:
# Dimensions of the expression matrix read from gex.parquet; its column labels
# are the ones later looked up in df_meta.
df_gex.shape

(5000, 15666)

In [26]:
# Dimensions of the per-cell metadata read from meta_cell.parquet; the row count
# is expected to match the number of columns in df_gex.
df_meta.shape

(15666, 36)

### Working on adding categories from metadata

In [11]:
def add_cats_from_meta(barcodes, df_meta, add_cat_list):
    '''
    Build category-annotated column labels from df_meta.

    Parameters
    ----------
    barcodes : iterable
        Labels to annotate; each one must be present in the index of df_meta.
    df_meta : pandas.DataFrame
        Metadata table indexed by the same labels as `barcodes`.
    add_cat_list : list of str, or str
        Metadata column(s) to attach, in the order they should appear.
        A single column name may be given as a bare string.

    Returns
    -------
    list of tuple
        One tuple per barcode, in the order of `barcodes`:
        (barcode, 'title: value', 'title: value', ...).

    Raises
    ------
    KeyError
        If a barcode or a requested column is missing from df_meta.
    ValueError
        If the lookup does not return exactly one row per barcode
        (e.g. df_meta has duplicated index labels).
    '''

    # accept a single column name as well as a list of names
    if isinstance(add_cat_list, str):
        add_cat_list = [add_cat_list]
    add_cat_list = list(add_cat_list)
    barcodes = list(barcodes)

    # get metadata of interest (add_cat_list) from barcodes of interest
    df_cats = df_meta.loc[barcodes, add_cat_list]

    # zip() below would silently pair barcodes with the wrong rows otherwise
    if len(df_cats) != len(barcodes):
        raise ValueError(
            'expected one metadata row per barcode, got ' + str(len(df_cats)) +
            ' rows for ' + str(len(barcodes)) + ' barcodes'
        )

    # Iterate row-wise with itertuples rather than `.values`: `.values` casts all
    # columns to one common dtype, which turns integer metadata into floats
    # (e.g. 9 -> '9.0') whenever a float column is requested alongside it.
    new_cols = []
    for inst_barcode, inst_values in zip(barcodes, df_cats.itertuples(index=False, name=None)):

        # add titles to cats
        inst_cats = [str(title) + ': ' + str(value) for title, value in zip(add_cat_list, inst_values)]

        # add barcode in front of its titled cats
        new_cols.append(tuple([inst_barcode] + inst_cats))

    return new_cols

In [13]:
# Preview the first metadata rows to see which columns are available as categories.
df_meta.head()

Unnamed: 0_level_0,Unnamed: 0,all_exon_count,all_intron_count,all_read_count,intergenic_rate,embryo_id,embryo_sex,nuclei_extraction_date,development_stage,Total_mRNAs,...,Main_trajectory_umap_3,Main_trajectory_refined_by_cluster,Main_trajectory_refined_umap_1,Main_trajectory_refined_umap_2,Main_trajectory_refined_umap_3,Sub_trajectory_name,Sub_trajectory_umap_1,Sub_trajectory_umap_2,Sub_trajectory_louvain_component,Sub_trajectory_Pseudotime
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
sci3-me-001.ATTAGTCTGTGTATAATACG,0,1612.0,382.0,2442.0,0.183456,9,M,2,11.5,1989.0,...,1.723381,Endothelial trajectory,1.208097,1.13821,1.053613,Venous and capillary endothelial trajectory,1.130985,1.567624,1.0,0.068362
sci3-me-002.GTACCTCTTATTCGACCAA,12,138.0,416.0,672.0,0.175595,9,M,2,11.5,554.0,...,0.864672,Neural tube and notochord trajectory,1.81137,1.316749,1.021694,Neural epithelial trajectory,2.394788,1.07892,1.0,1.570391
sci3-me-002.ACCGTAGCTAGGAGAGAAC,13,181.0,682.0,997.0,0.134403,9,M,2,11.5,858.0,...,0.721429,Neural tube and notochord trajectory,1.557839,0.811976,0.521807,Neuron progenitor trajectory,1.782221,0.887197,1.0,0.168541
sci3-me-002.ATAACTTCCTTATGAGTTAA,15,359.0,935.0,1470.0,0.119728,9,M,2,11.5,1293.0,...,1.093316,Neural tube and notochord trajectory,1.524517,1.654862,1.773734,Oligodendrocyte trajectory,0.304404,0.886391,1.0,1.583377
sci3-me-002.TCTCTCCATAATGCCGCTT,17,160.0,367.0,602.0,0.124585,9,M,2,11.5,527.0,...,1.089299,Neural tube and notochord trajectory,1.586869,1.665207,1.784211,Oligodendrocyte trajectory,0.293331,0.843669,1.0,1.580017


In [27]:
# Annotate every expression-matrix column with its cell type, trajectory and stage.
new_cols = add_cats_from_meta(
    df_gex.columns.tolist(),
    df_meta,
    ['Main_cell_type', 'Main_trajectory', 'development_stage'],
)

In [28]:
# One label tuple is built per barcode, so this should equal the number of df_gex columns.
len(new_cols)

15666

In [29]:
# Each entry is (barcode, 'title: value', ...) in the order the categories were requested.
new_cols[0]

('sci3-me-002.AGATCGGATGCGTTGGAGC',
 'Main_cell_type: Early mesenchyme',
 'Main_trajectory: Mesenchymal trajectory',
 'development_stage: 9.5')

In [30]:
# Re-check the expression matrix dimensions before its columns are relabelled.
df_gex.shape

(5000, 15666)

In [31]:
# Re-check the metadata dimensions; the row count should match the df_gex column count.
df_meta.shape

(15666, 36)

In [33]:
# Work on an independent copy so df_gex keeps its plain barcode columns.
df_cat = deepcopy(df_gex)
# Replace the column labels with the (barcode, category, ...) tuples built above;
# new_cols must have exactly one entry per column, in column order.
df_cat.columns = new_cols

In [38]:
# Position 1 of every column tuple holds the 'Main_cell_type: ...' label.
ser_traj = pd.Series([col_label[1] for col_label in df_cat.columns])

# distinct cell-type labels, most frequent first
cell_types = list(ser_traj.value_counts().index)

# displayed as the cell output: number of cells per cell type
ser_traj.value_counts()

Main_cell_type: Early mesenchyme                 3623
Main_cell_type: Neural Tube                      2448
Main_cell_type: Intermediate Mesoderm            1560
Main_cell_type: Epithelial cells                 1319
Main_cell_type: Isthmic organizer cells           925
Main_cell_type: Oligodendrocyte Progenitors       784
Main_cell_type: Radial glia                       696
Main_cell_type: Endothelial cells                 585
Main_cell_type: Stromal cells                     527
Main_cell_type: Primitive erythroid lineage       495
Main_cell_type: Neural progenitor cells           439
Main_cell_type: Chondroctye progenitors           388
Main_cell_type: Notochord cells                   370
Main_cell_type: Jaw and tooth progenitors         344
Main_cell_type: Schwann cell precursor            307
Main_cell_type: Cardiac muscle lineages           263
Main_cell_type: Sensory neurons                   142
Main_cell_type: Myocytes                           80
Main_cell_type: Cholinergic 

In [43]:
# For each cell type, report how many distinct trajectory labels (tuple position 2)
# occur among the columns carrying that cell type (tuple position 1).
for inst_cell_type in cell_types:
    print(inst_cell_type)
    ser_traj = pd.Series(
        [col_label[2] for col_label in df_cat.columns if col_label[1] == inst_cell_type]
    )
    print(len(ser_traj.value_counts()))

Main_cell_type: Early mesenchyme
4
Main_cell_type: Neural Tube
4
Main_cell_type: Intermediate Mesoderm
6
Main_cell_type: Epithelial cells
6
Main_cell_type: Isthmic organizer cells
5
Main_cell_type: Oligodendrocyte Progenitors
5
Main_cell_type: Radial glia
5
Main_cell_type: Endothelial cells
4
Main_cell_type: Stromal cells
6
Main_cell_type: Primitive erythroid lineage
1
Main_cell_type: Neural progenitor cells
1
Main_cell_type: Chondroctye progenitors
1
Main_cell_type: Notochord cells
1
Main_cell_type: Jaw and tooth progenitors
1
Main_cell_type: Schwann cell precursor
5
Main_cell_type: Cardiac muscle lineages
5
Main_cell_type: Sensory neurons
2
Main_cell_type: Myocytes
3
Main_cell_type: Cholinergic neurons
1
Main_cell_type: Osteoblasts
4
Main_cell_type: Ependymal cell
4
Main_cell_type: Megakaryocytes
4
Main_cell_type: Hepatocytes
2
Main_cell_type: Melanocytes
3
Main_cell_type: Postmitotic premature neurons
2
Main_cell_type: Premature oligodendrocyte
2
Main_cell_type: White blood cells
1


In [35]:
# Position 3 of every column tuple holds the 'development_stage: ...' label.
ser_traj = pd.Series([col_label[3] for col_label in df_cat.columns])

# displayed as the cell output: number of cells per developmental stage
ser_traj.value_counts()

development_stage: 9.5    15666
dtype: int64

In [None]:
# This cell has no execution count, i.e. it has not been run yet.
# NOTE(review): presumably computes per-category signatures from the
# 'Main_cell_type' labels stored in the df_cat column tuples; the return value
# is not assigned here - confirm against the clustergrammer2 API before relying on it.
net.generate_signatures(df_cat, 'Main_cell_type')