In [1]:
import os
import numpy as np
import pandas as pd
import h5py
from pathlib import Path
from scipy.sparse import csr_matrix, csc_matrix
from settings import DATA_DIR

In [2]:
data_dir = Path(DATA_DIR)
gene_csv_name = 'gene.csv'
non_gene_csv_name = 'non_gene.csv'
non_gene_table_columns = ['tissue_type', 'cell_type', 'size', 'disease', 'sex', 'ethnicity', 'development_stage','assay', 'cell_type_ontology_id', 'dataset']
gene_table_columns = [*non_gene_table_columns[:2], 'gene_ids', *non_gene_table_columns[2:]]

Structure of the CSV.

- Dataset Name
- Dataset ID
- Organ
- AS UBERON IDs 
- Num Cells
- Sex
- Age
- Donor ID
- RUI ID



In [4]:
def read_dataset(dataset_name):
    f = h5py.File(dataset_name, "r")
    title = f['uns']['title'][()].decode('utf-8')
    print(f"Reading {dataset_name} ", title)

    tissue_dict = pd.Series(f['obs']['__categories/tissue']).str.decode('utf-8')
    cell_dict = pd.Series(f['obs']['__categories/cell_type']).str.decode('utf-8')
    # TODO: add blank gene_id values rather than skipping the file
    if '_index' not in f['var']:
        print(f"No gene _index in {dataset_name}")
        return
    if not os.path.exists('data'):
        os.mkdir('data')
    if not os.path.exists('data/gene'):
        os.mkdir('data/gene')
    if not os.path.exists('data/non_gene'):
        os.mkdir('data/non_gene')
    gene_dict = pd.Series(f['var']['_index']).str.decode('utf-8')
    disease_dict = pd.Series(f['obs']['__categories/disease']).str.decode('utf-8')
    assay_dict = pd.Series(f['obs']['__categories/assay']).str.decode('utf-8')
    cell_id_dict = pd.Series(f['obs']['__categories/cell_type_ontology_term_id']).str.decode('utf-8')
    # TODO: add sex/ethnicity/development_stage/disease data too
    sex_types = pd.Series(f['obs']['__categories/sex']).str.decode('utf-8')[f['obs']['sex']] if ('sex' in pd.Series(f['obs']['__categories'])) else pd.Series(['data_unavailable'] * len(f['obs']['sex']))
    #disease_types = pd.Series(f['obs']['__categories/disease']).str.decode('utf-8')[f['obs']['disease']] if ('disease' in pd.Series(f['obs']['__categories'])) else pd.Series(['data_unavailable'] * len(f['obs']['disease']))
    stage_types = pd.Series(f['obs']['__categories/development_stage']).str.decode('utf-8')[f['obs']['development_stage']] if ('development_stage' in pd.Series(f['obs']['__categories'])) else pd.Series(['data_unavailable'] * len(f['obs']['development_stage']))
    ethnicity_types = pd.Series(f['obs']['__categories/ethnicity']).str.decode('utf-8')[f['obs']['ethnicity']] if ('ethnicity' in pd.Series(f['obs']['__categories'])) else pd.Series(['data_unavailable'] * len(f['obs']['ethnicity']))

    tissue_types = tissue_dict[f['obs']['tissue']]
    cell_types = cell_dict[f['obs']['cell_type']]
    cell_type_ids = cell_id_dict[f['obs']['cell_type_ontology_term_id']]
    diseases = disease_dict[f['obs']['disease']]

    attrs = dict(f['X'].attrs)
    if 'encoding-type' in attrs and attrs['encoding-type'] == 'csc_matrix':
        data, indices, indptr = f['X']['data'], f['X']['indices'], f['X']['indptr']
        data = csc_matrix((data, indices, indptr), shape=attrs['shape']).toarray()
    elif 'encoding-type' in attrs and attrs['encoding-type'] == 'csr_matrix':
        data, indices, indptr = f['X']['data'], f['X']['indices'], f['X']['indptr']
        data = csr_matrix((data, indices, indptr), shape=attrs['shape']).toarray()
    else:
        data = np.array(f['X'])

    x, y = data.nonzero()
    gene_ids = []
    for i in range(max(x) + 1):
        start = np.searchsorted(x, i, side='left')
        end = np.searchsorted(x, i, side='right')
        cell_gene_ids = ';'.join(gene_dict[y[start: end]])
        gene_ids.append(cell_gene_ids)

    new_data = pd.DataFrame({
        'tissue_type': tissue_types.values,
        'cell_type': cell_types.values,
        'cell_type_ontology_id': cell_type_ids.values,
        'gene_id': gene_ids,
        'disease': diseases.values,
        'sex' : sex_types.values,
        'ethnicity' : ethnicity_types.values,
        'development_stage' : stage_types.values
    })
    

    new_data = new_data.loc[new_data.disease == 'normal', :]
    #print('Data : \n', new_data)
    gene_group = new_data.groupby(['tissue_type', 'cell_type', 'cell_type_ontology_id'])
    #print('GG \n', gene_group)
    # gene_group = new_data.groupby(['tissue', 'cell_type', 'cell_type_ontology_id'])
    #stage_group = new_data.groupby(['tissue_type', 'cell_type', 'cell_type_ontology_id'])['development_stage']
    #ethnicity_group = new_data.groupby(['tissue_type', 'cell_type', 'cell_type_ontology_id'])['ethnicity']
    #sex_group = new_data.groupby(['tissue_type', 'cell_type', 'cell_type_ontology_id'])['sex']

    #print('Count info : \n', gene_group.count())
    def merge_field_values(x):
        gene_id_set = set()
        for cell_gene_ids in x:
            gene_id_set.update(cell_gene_ids.split(';'))
        if '' in gene_id_set:
            gene_id_set.remove('')
        return ';'.join(gene_id_set)
    
    genes = gene_group.agg(merge_field_values)
    genes['count'] = gene_group.count()['gene_id']
    print('genes : \n', genes)
    genes.to_csv('data/gene/organ_' + str(title).replace(' ', '_') + '.csv')
    non_genes = genes.drop(['gene_id'], axis=1)
    print('Non - genes : \n', non_genes)

    non_genes.to_csv('data/non_gene/organ_' + str(title).replace(' ', '_') + '.csv')


In [None]:
if __name__ == '__main__':
    with open(gene_csv_name, 'w') as gene_out:
        gene_out.write('|'.join(gene_table_columns) + "\n")
    with open(non_gene_csv_name, 'w') as non_gene_out:
        non_gene_out.write('|'.join(non_gene_table_columns) + "\n")
    with open(non_gene_csv_name, 'w') as non_gene_out:
        non_gene_out.write('|'.join(non_gene_table_columns) + "\n")

    for dataset_name in data_dir.glob('*.h5ad'):
    #for dataset_name in [str(data_dir)+'/c42c8ad3-9761-49e5-b9bf-ee8ebd50416f.h5ad', str(data_dir)+'/f75f2ff4-2884-4c2d-b375-70de37a34507.h5ad']:
        try:
            read_dataset(str(dataset_name))
        except Exception as e:
            print(f"{dataset_name} => {str(e)}")
        break

In [3]:
import pandas as pd

pd.set_option('max_rows', 20)
pd.set_option('max_columns', None)

In [3]:
import scanpy as sc

ls = [ds for ds in data_dir.glob('*.h5ad')]
#print(ls)



In [4]:
len(ls)

107

In [22]:
data =  sc.read_h5ad(str(ls[4]), )

In [23]:
data

AnnData object with n_obs × n_vars = 13782 × 22231
    obs: 'assay_ontology_term_id', 'development_stage_ontology_term_id', 'donor_BMI', 'donor_uuid', 'ethnicity_ontology_term_id', 'library_uuid', 'mapped_reference_annotation', 'organism_ontology_term_id', 'sample_preservation_method', 'sample_uuid', 'tissue_ontology_term_id', 'tissue_section_thickness', 'tissue_section_uuid', 'is_primary_data', 'author_predicted_cell_type', 'cell_type_ontology_term_id', 'disease_ontology_term_id', 'reported_diseases', 'sex_ontology_term_id', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'ethnicity', 'development_stage'
    var: 'feature_biotype', 'feature_is_filtered', 'feature_name', 'feature_reference'
    uns: 'X_normalization', 'cell_type_ontology_term_id_colors', 'default_embedding', 'layer_descriptions', 'schema_version', 'title'
    obsm: 'X_spatial', 'X_umap'

In [24]:
data.obs

Unnamed: 0,assay_ontology_term_id,development_stage_ontology_term_id,donor_BMI,donor_uuid,ethnicity_ontology_term_id,library_uuid,mapped_reference_annotation,organism_ontology_term_id,sample_preservation_method,sample_uuid,...,reported_diseases,sex_ontology_term_id,cell_type,assay,disease,organism,sex,tissue,ethnicity,development_stage
AAAAAAAAAAGTAA,EFO:0009920,HsapDv:0000169,28.0,6caaa3d2-bed4-4253-bd57-dca17e93598e,HANCESTRO:0005,cf23c778-0665-4cd5-ba35-d14302fbf16b,GENCODE 19,NCBITaxon:9606,flash-freezing,2dab56f7-555c-4737-a25f-54eb30d21f8f,...,"[hypertensive disorder,clear cell renal carcin...",PATO:0000384,kidney proximal convoluted tubule epithelial cell,Slide-seq,normal,Homo sapiens,male,renal medulla,European,75-year-old human stage
AAAAAAAATCATAA,EFO:0009920,HsapDv:0000169,28.0,6caaa3d2-bed4-4253-bd57-dca17e93598e,HANCESTRO:0005,cf23c778-0665-4cd5-ba35-d14302fbf16b,GENCODE 19,NCBITaxon:9606,flash-freezing,2dab56f7-555c-4737-a25f-54eb30d21f8f,...,"[hypertensive disorder,clear cell renal carcin...",PATO:0000384,kidney collecting duct intercalated cell,Slide-seq,normal,Homo sapiens,male,renal medulla,European,75-year-old human stage
AAAAAAACAAAGAC,EFO:0009920,HsapDv:0000169,28.0,6caaa3d2-bed4-4253-bd57-dca17e93598e,HANCESTRO:0005,cf23c778-0665-4cd5-ba35-d14302fbf16b,GENCODE 19,NCBITaxon:9606,flash-freezing,2dab56f7-555c-4737-a25f-54eb30d21f8f,...,"[hypertensive disorder,clear cell renal carcin...",PATO:0000384,endothelial cell,Slide-seq,normal,Homo sapiens,male,renal medulla,European,75-year-old human stage
AAAAAAACAATCAC,EFO:0009920,HsapDv:0000169,28.0,6caaa3d2-bed4-4253-bd57-dca17e93598e,HANCESTRO:0005,cf23c778-0665-4cd5-ba35-d14302fbf16b,GENCODE 19,NCBITaxon:9606,flash-freezing,2dab56f7-555c-4737-a25f-54eb30d21f8f,...,"[hypertensive disorder,clear cell renal carcin...",PATO:0000384,glomerular visceral epithelial cell,Slide-seq,normal,Homo sapiens,male,renal medulla,European,75-year-old human stage
AAAAAAACCAAGAC,EFO:0009920,HsapDv:0000169,28.0,6caaa3d2-bed4-4253-bd57-dca17e93598e,HANCESTRO:0005,cf23c778-0665-4cd5-ba35-d14302fbf16b,GENCODE 19,NCBITaxon:9606,flash-freezing,2dab56f7-555c-4737-a25f-54eb30d21f8f,...,"[hypertensive disorder,clear cell renal carcin...",PATO:0000384,blood vessel smooth muscle cell,Slide-seq,normal,Homo sapiens,male,renal medulla,European,75-year-old human stage
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTTTTTTCTGTTT,EFO:0009920,HsapDv:0000169,28.0,6caaa3d2-bed4-4253-bd57-dca17e93598e,HANCESTRO:0005,cf23c778-0665-4cd5-ba35-d14302fbf16b,GENCODE 19,NCBITaxon:9606,flash-freezing,2dab56f7-555c-4737-a25f-54eb30d21f8f,...,"[hypertensive disorder,clear cell renal carcin...",PATO:0000384,kidney collecting duct intercalated cell,Slide-seq,normal,Homo sapiens,male,renal medulla,European,75-year-old human stage
TTTTTTTTGTTCTT,EFO:0009920,HsapDv:0000169,28.0,6caaa3d2-bed4-4253-bd57-dca17e93598e,HANCESTRO:0005,cf23c778-0665-4cd5-ba35-d14302fbf16b,GENCODE 19,NCBITaxon:9606,flash-freezing,2dab56f7-555c-4737-a25f-54eb30d21f8f,...,"[hypertensive disorder,clear cell renal carcin...",PATO:0000384,leukocyte,Slide-seq,normal,Homo sapiens,male,renal medulla,European,75-year-old human stage
TTTTTTTTTATCTT,EFO:0009920,HsapDv:0000169,28.0,6caaa3d2-bed4-4253-bd57-dca17e93598e,HANCESTRO:0005,cf23c778-0665-4cd5-ba35-d14302fbf16b,GENCODE 19,NCBITaxon:9606,flash-freezing,2dab56f7-555c-4737-a25f-54eb30d21f8f,...,"[hypertensive disorder,clear cell renal carcin...",PATO:0000384,endothelial cell,Slide-seq,normal,Homo sapiens,male,renal medulla,European,75-year-old human stage
TTTTTTTTTTATTT,EFO:0009920,HsapDv:0000169,28.0,6caaa3d2-bed4-4253-bd57-dca17e93598e,HANCESTRO:0005,cf23c778-0665-4cd5-ba35-d14302fbf16b,GENCODE 19,NCBITaxon:9606,flash-freezing,2dab56f7-555c-4737-a25f-54eb30d21f8f,...,"[hypertensive disorder,clear cell renal carcin...",PATO:0000384,endothelial cell,Slide-seq,normal,Homo sapiens,male,renal medulla,European,75-year-old human stage


In [25]:
data.var

Unnamed: 0,feature_biotype,feature_is_filtered,feature_name,feature_reference
ENSG00000121410,gene,True,A1BG,NCBITaxon:9606
ENSG00000268895,gene,False,A1BG-AS1,NCBITaxon:9606
ENSG00000148584,gene,True,A1CF,NCBITaxon:9606
ENSG00000175899,gene,False,A2M,NCBITaxon:9606
ENSG00000245105,gene,False,A2M-AS1,NCBITaxon:9606
...,...,...,...,...
ENSG00000036549,gene,False,ZZZ3,NCBITaxon:9606
ENSG00000141854,gene,False,MISP3,NCBITaxon:9606
ENSG00000234352,gene,False,AC009264.1,NCBITaxon:9606
ENSG00000215769,gene,False,ARHGAP27P1-BPTFP1-KPNA2P3,NCBITaxon:9606


In [26]:
raw_data = data.obs

In [28]:

pd.set_option('max_rows', 20)
pd.set_option('max_columns', None)
raw_data

Unnamed: 0,assay_ontology_term_id,development_stage_ontology_term_id,donor_BMI,donor_uuid,ethnicity_ontology_term_id,library_uuid,mapped_reference_annotation,organism_ontology_term_id,sample_preservation_method,sample_uuid,tissue_ontology_term_id,tissue_section_thickness,tissue_section_uuid,is_primary_data,author_predicted_cell_type,cell_type_ontology_term_id,disease_ontology_term_id,reported_diseases,sex_ontology_term_id,cell_type,assay,disease,organism,sex,tissue,ethnicity,development_stage
AAAAAAAAAAGTAA,EFO:0009920,HsapDv:0000169,28.0,6caaa3d2-bed4-4253-bd57-dca17e93598e,HANCESTRO:0005,cf23c778-0665-4cd5-ba35-d14302fbf16b,GENCODE 19,NCBITaxon:9606,flash-freezing,2dab56f7-555c-4737-a25f-54eb30d21f8f,UBERON:0000362,10μm,7df8890e-3960-41d9-bb18-d5cbf6b4c05e,True,PCT,CL:1000838,PATO:0000461,"[hypertensive disorder,clear cell renal carcin...",PATO:0000384,kidney proximal convoluted tubule epithelial cell,Slide-seq,normal,Homo sapiens,male,renal medulla,European,75-year-old human stage
AAAAAAAATCATAA,EFO:0009920,HsapDv:0000169,28.0,6caaa3d2-bed4-4253-bd57-dca17e93598e,HANCESTRO:0005,cf23c778-0665-4cd5-ba35-d14302fbf16b,GENCODE 19,NCBITaxon:9606,flash-freezing,2dab56f7-555c-4737-a25f-54eb30d21f8f,UBERON:0000362,10μm,7df8890e-3960-41d9-bb18-d5cbf6b4c05e,True,CD-B-IC,CL:1001432,PATO:0000461,"[hypertensive disorder,clear cell renal carcin...",PATO:0000384,kidney collecting duct intercalated cell,Slide-seq,normal,Homo sapiens,male,renal medulla,European,75-year-old human stage
AAAAAAACAAAGAC,EFO:0009920,HsapDv:0000169,28.0,6caaa3d2-bed4-4253-bd57-dca17e93598e,HANCESTRO:0005,cf23c778-0665-4cd5-ba35-d14302fbf16b,GENCODE 19,NCBITaxon:9606,flash-freezing,2dab56f7-555c-4737-a25f-54eb30d21f8f,UBERON:0000362,10μm,7df8890e-3960-41d9-bb18-d5cbf6b4c05e,True,Endothelial,CL:0000115,PATO:0000461,"[hypertensive disorder,clear cell renal carcin...",PATO:0000384,endothelial cell,Slide-seq,normal,Homo sapiens,male,renal medulla,European,75-year-old human stage
AAAAAAACAATCAC,EFO:0009920,HsapDv:0000169,28.0,6caaa3d2-bed4-4253-bd57-dca17e93598e,HANCESTRO:0005,cf23c778-0665-4cd5-ba35-d14302fbf16b,GENCODE 19,NCBITaxon:9606,flash-freezing,2dab56f7-555c-4737-a25f-54eb30d21f8f,UBERON:0000362,10μm,7df8890e-3960-41d9-bb18-d5cbf6b4c05e,True,Podocyte,CL:0000653,PATO:0000461,"[hypertensive disorder,clear cell renal carcin...",PATO:0000384,glomerular visceral epithelial cell,Slide-seq,normal,Homo sapiens,male,renal medulla,European,75-year-old human stage
AAAAAAACCAAGAC,EFO:0009920,HsapDv:0000169,28.0,6caaa3d2-bed4-4253-bd57-dca17e93598e,HANCESTRO:0005,cf23c778-0665-4cd5-ba35-d14302fbf16b,GENCODE 19,NCBITaxon:9606,flash-freezing,2dab56f7-555c-4737-a25f-54eb30d21f8f,UBERON:0000362,10μm,7df8890e-3960-41d9-bb18-d5cbf6b4c05e,True,vSMC,CL:0019018,PATO:0000461,"[hypertensive disorder,clear cell renal carcin...",PATO:0000384,blood vessel smooth muscle cell,Slide-seq,normal,Homo sapiens,male,renal medulla,European,75-year-old human stage
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTTTTTTCTGTTT,EFO:0009920,HsapDv:0000169,28.0,6caaa3d2-bed4-4253-bd57-dca17e93598e,HANCESTRO:0005,cf23c778-0665-4cd5-ba35-d14302fbf16b,GENCODE 19,NCBITaxon:9606,flash-freezing,2dab56f7-555c-4737-a25f-54eb30d21f8f,UBERON:0000362,10μm,7df8890e-3960-41d9-bb18-d5cbf6b4c05e,True,CD-B-IC,CL:1001432,PATO:0000461,"[hypertensive disorder,clear cell renal carcin...",PATO:0000384,kidney collecting duct intercalated cell,Slide-seq,normal,Homo sapiens,male,renal medulla,European,75-year-old human stage
TTTTTTTTGTTCTT,EFO:0009920,HsapDv:0000169,28.0,6caaa3d2-bed4-4253-bd57-dca17e93598e,HANCESTRO:0005,cf23c778-0665-4cd5-ba35-d14302fbf16b,GENCODE 19,NCBITaxon:9606,flash-freezing,2dab56f7-555c-4737-a25f-54eb30d21f8f,UBERON:0000362,10μm,7df8890e-3960-41d9-bb18-d5cbf6b4c05e,True,Immune,CL:0000738,PATO:0000461,"[hypertensive disorder,clear cell renal carcin...",PATO:0000384,leukocyte,Slide-seq,normal,Homo sapiens,male,renal medulla,European,75-year-old human stage
TTTTTTTTTATCTT,EFO:0009920,HsapDv:0000169,28.0,6caaa3d2-bed4-4253-bd57-dca17e93598e,HANCESTRO:0005,cf23c778-0665-4cd5-ba35-d14302fbf16b,GENCODE 19,NCBITaxon:9606,flash-freezing,2dab56f7-555c-4737-a25f-54eb30d21f8f,UBERON:0000362,10μm,7df8890e-3960-41d9-bb18-d5cbf6b4c05e,True,Endothelial,CL:0000115,PATO:0000461,"[hypertensive disorder,clear cell renal carcin...",PATO:0000384,endothelial cell,Slide-seq,normal,Homo sapiens,male,renal medulla,European,75-year-old human stage
TTTTTTTTTTATTT,EFO:0009920,HsapDv:0000169,28.0,6caaa3d2-bed4-4253-bd57-dca17e93598e,HANCESTRO:0005,cf23c778-0665-4cd5-ba35-d14302fbf16b,GENCODE 19,NCBITaxon:9606,flash-freezing,2dab56f7-555c-4737-a25f-54eb30d21f8f,UBERON:0000362,10μm,7df8890e-3960-41d9-bb18-d5cbf6b4c05e,True,Endothelial,CL:0000115,PATO:0000461,"[hypertensive disorder,clear cell renal carcin...",PATO:0000384,endothelial cell,Slide-seq,normal,Homo sapiens,male,renal medulla,European,75-year-old human stage


In [18]:
#print('Regions : \n {}\n'.format(raw_data['region'].unique()))
#print('Age Group : \n {}\n'.format(raw_data['age_group'].unique()))#
print('Disease : \n {}\n'.format(raw_data['disease'].unique()))
print('Ethnicity : \n {}\n'.format(raw_data['ethnicity'].unique()))
print('Cell type : \n {}\n'.format(raw_data['cell_type'].unique()))


len(raw_data.columns)

Disease : 
 ['normal']
Categories (1, object): ['normal']

Ethnicity : 
 ['African American']
Categories (1, object): ['African American']

Cell type : 
 ['endothelial cell', 'glomerular visceral epithelial cell', 'kidney collecting duct principal cell', 'kidney proximal convoluted tubule epithelial ..., 'kidney collecting duct intercalated cell', ..., 'blood vessel smooth muscle cell', 'kidney loop of Henle thick ascending limb epi..., 'macrophage', 'macula densa epithelial cell', 'mesangial cell']
Length: 14
Categories (14, object): ['blood vessel smooth muscle cell', 'endothelial cell', 'glomerular visceral epithelial cell', 'kidney collecting duct intercalated cell', ..., 'leukocyte', 'macrophage', 'macula densa epithelial cell', 'mesangial cell']



28

Scrublet score : % of repitition  - Single-cell remover of doublets(SCRUBLET)

In [19]:
### SORTING LOGIC TO FILTER OUT DISEASE CELLS.


In [34]:
drop_cols = ['NRP', 'cell_source', 'donor', 'percent_mito', 'percent_ribo', 'sample', 'scrublet_score', 'source', 'type', 'version', 'cell_states', \
    'Used', 'assay_ontology_term_id', 'assay', 'cell_type_original', 'development_stage_ontology_term_id', 'ethnicity_ontology_term_id', \
    'disease_ontology_term_id', 'n_genes', 'n_counts', 'gender', 'disease']
refined_data = raw_data[['tissue', 'tissue_ontology_term_id', 'cell_type', 'cell_type_ontology_term_id', 'disease', 'sex', 'donor_uuid', 'ethnicity', 'development_stage', 'assay']]

In [35]:
refined_data

Unnamed: 0,tissue,tissue_ontology_term_id,cell_type,cell_type_ontology_term_id,disease,sex,donor_uuid,ethnicity,development_stage,assay
AAAAAAAAAAGTAA,renal medulla,UBERON:0000362,kidney proximal convoluted tubule epithelial cell,CL:1000838,normal,male,6caaa3d2-bed4-4253-bd57-dca17e93598e,European,75-year-old human stage,Slide-seq
AAAAAAAATCATAA,renal medulla,UBERON:0000362,kidney collecting duct intercalated cell,CL:1001432,normal,male,6caaa3d2-bed4-4253-bd57-dca17e93598e,European,75-year-old human stage,Slide-seq
AAAAAAACAAAGAC,renal medulla,UBERON:0000362,endothelial cell,CL:0000115,normal,male,6caaa3d2-bed4-4253-bd57-dca17e93598e,European,75-year-old human stage,Slide-seq
AAAAAAACAATCAC,renal medulla,UBERON:0000362,glomerular visceral epithelial cell,CL:0000653,normal,male,6caaa3d2-bed4-4253-bd57-dca17e93598e,European,75-year-old human stage,Slide-seq
AAAAAAACCAAGAC,renal medulla,UBERON:0000362,blood vessel smooth muscle cell,CL:0019018,normal,male,6caaa3d2-bed4-4253-bd57-dca17e93598e,European,75-year-old human stage,Slide-seq
...,...,...,...,...,...,...,...,...,...,...
TTTTTTTTCTGTTT,renal medulla,UBERON:0000362,kidney collecting duct intercalated cell,CL:1001432,normal,male,6caaa3d2-bed4-4253-bd57-dca17e93598e,European,75-year-old human stage,Slide-seq
TTTTTTTTGTTCTT,renal medulla,UBERON:0000362,leukocyte,CL:0000738,normal,male,6caaa3d2-bed4-4253-bd57-dca17e93598e,European,75-year-old human stage,Slide-seq
TTTTTTTTTATCTT,renal medulla,UBERON:0000362,endothelial cell,CL:0000115,normal,male,6caaa3d2-bed4-4253-bd57-dca17e93598e,European,75-year-old human stage,Slide-seq
TTTTTTTTTTATTT,renal medulla,UBERON:0000362,endothelial cell,CL:0000115,normal,male,6caaa3d2-bed4-4253-bd57-dca17e93598e,European,75-year-old human stage,Slide-seq


In [36]:
refined_data = refined_data.reset_index()
refined_data['tissues'] = refined_data['tissue'].astype(str) +  '|' +  refined_data['tissue_ontology_term_id'].astype(str) 
refined_data = refined_data.drop(['index', 'tissue', 'tissue_ontology_term_id'], axis=1)
refined_data

Unnamed: 0,cell_type,cell_type_ontology_term_id,disease,sex,donor_uuid,ethnicity,development_stage,assay,tissues
0,kidney proximal convoluted tubule epithelial cell,CL:1000838,normal,male,6caaa3d2-bed4-4253-bd57-dca17e93598e,European,75-year-old human stage,Slide-seq,renal medulla|UBERON:0000362
1,kidney collecting duct intercalated cell,CL:1001432,normal,male,6caaa3d2-bed4-4253-bd57-dca17e93598e,European,75-year-old human stage,Slide-seq,renal medulla|UBERON:0000362
2,endothelial cell,CL:0000115,normal,male,6caaa3d2-bed4-4253-bd57-dca17e93598e,European,75-year-old human stage,Slide-seq,renal medulla|UBERON:0000362
3,glomerular visceral epithelial cell,CL:0000653,normal,male,6caaa3d2-bed4-4253-bd57-dca17e93598e,European,75-year-old human stage,Slide-seq,renal medulla|UBERON:0000362
4,blood vessel smooth muscle cell,CL:0019018,normal,male,6caaa3d2-bed4-4253-bd57-dca17e93598e,European,75-year-old human stage,Slide-seq,renal medulla|UBERON:0000362
...,...,...,...,...,...,...,...,...,...
13777,kidney collecting duct intercalated cell,CL:1001432,normal,male,6caaa3d2-bed4-4253-bd57-dca17e93598e,European,75-year-old human stage,Slide-seq,renal medulla|UBERON:0000362
13778,leukocyte,CL:0000738,normal,male,6caaa3d2-bed4-4253-bd57-dca17e93598e,European,75-year-old human stage,Slide-seq,renal medulla|UBERON:0000362
13779,endothelial cell,CL:0000115,normal,male,6caaa3d2-bed4-4253-bd57-dca17e93598e,European,75-year-old human stage,Slide-seq,renal medulla|UBERON:0000362
13780,endothelial cell,CL:0000115,normal,male,6caaa3d2-bed4-4253-bd57-dca17e93598e,European,75-year-old human stage,Slide-seq,renal medulla|UBERON:0000362


In [37]:
pd.set_option('max_rows', 80)
pd.set_option('max_columns', None)


In [38]:

refined_data.groupby([ 
'sex', 
'age_group', 
'development_stage',
'tissues',
#'tissue',
#'tissue_ontology_term_id',
'cell_type_ontology_term_id',
'cell_type',
]).count().head(50)

KeyError: 'age_group'

In [101]:
refined_data.groupby([ 
'sex', 
'age_group', 
'development_stage',
'tissues',
#'tissue',
#'tissue_ontology_term_id',
'cell_type_ontology_term_id',
'cell_type',
]).agg({'region':['count']}).head(50)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,region
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,count
sex,age_group,development_stage,tissues,cell_type_ontology_term_id,cell_type,Unnamed: 6_level_2
female,40-45,40-year-old human stage,apex of heart|UBERON:0002098,CL:1000309,epicardial adipocyte,10
female,40-45,40-year-old human stage,heart left ventricle|UBERON:0002084,CL:1000309,epicardial adipocyte,22
female,40-45,40-year-old human stage,heart right ventricle|UBERON:0002080,CL:1000309,epicardial adipocyte,81
female,40-45,40-year-old human stage,interventricular septum|UBERON:0002094,CL:1000309,epicardial adipocyte,7
female,40-45,40-year-old human stage,left cardiac atrium|UBERON:0002079,CL:1000309,epicardial adipocyte,298
female,40-45,40-year-old human stage,right cardiac atrium|UBERON:0002078,CL:1000309,epicardial adipocyte,34
female,40-45,45-year-old human stage,apex of heart|UBERON:0002098,CL:1000309,epicardial adipocyte,0
female,40-45,45-year-old human stage,heart left ventricle|UBERON:0002084,CL:1000309,epicardial adipocyte,0
female,40-45,45-year-old human stage,heart right ventricle|UBERON:0002080,CL:1000309,epicardial adipocyte,0
female,40-45,45-year-old human stage,interventricular septum|UBERON:0002094,CL:1000309,epicardial adipocyte,0


In [18]:
ds = refined_data[['cell_type', 'n_counts']]

In [19]:
ds

Unnamed: 0,cell_type,n_counts
0,epicardial adipocyte,6881.0
1,epicardial adipocyte,5889.0
2,epicardial adipocyte,2143.0
3,epicardial adipocyte,5080.0
4,epicardial adipocyte,12483.0
...,...,...
3794,epicardial adipocyte,3669.0
3795,epicardial adipocyte,2858.0
3796,epicardial adipocyte,2672.0
3797,epicardial adipocyte,1910.0
