In [1]:
import h5py
import numpy as np
import pandas as pd
import scanpy as sc
from scipy.cluster.hierarchy import linkage,leaves_list, dendrogram
from scipy.spatial.distance import pdist, euclidean
from scipy.special import comb
import re
import json 

In [3]:
h5_data = h5py.File('./output/condensed_lung_atlas_in_cpm.h5',"r")

In [8]:
h5_data.keys()

<KeysViewHDF5 ['celltype', 'celltype_dataset', 'celltype_dataset_timepoint', 'celltype_timepoint']>

In [29]:
h5_data['celltype'].keys()

<KeysViewHDF5 ['cell_count', 'gene_expression_average', 'gene_proportion_expression']>

In [15]:
np.array(h5_data['celltype']['gene_proportion_expression']['block0_values'])

array([[0.011765 , 0.2018   , 0.1095   , ..., 0.       , 0.       ,
        0.000905 ],
       [0.01349  , 0.0501   , 0.01927  , ..., 0.       , 0.       ,
        0.       ],
       [0.0101   , 0.1685   , 0.06198  , ..., 0.       , 0.0004592,
        0.       ],
       ...,
       [0.02563  , 0.       , 0.       , ..., 0.       , 0.       ,
        0.00641  ],
       [0.125    , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.002802 , 0.007282 , 0.00616  , ..., 0.002802 , 0.       ,
        0.0005603]], dtype=float16)

In [20]:
# gives the name of dataset we want as an input
# celltype / celltype_dataset / celltype_dataset_timepoint
def read_file_average(df_type):
    with h5py.File('./output/condensed_lung_atlas_in_cpm.h5',"r") as h5_data:
    
        df = pd.DataFrame(
                data=np.array(h5_data[df_type]['gene_expression_average']['block0_values']).astype(np.float32),
                index=np.array(h5_data[df_type]['gene_expression_average']['axis1'].asstr()),
                columns=np.array(h5_data[df_type]['gene_expression_average']['axis0'].asstr()),
            ).T
    return df

In [17]:
def read_file(df_type):
    with h5py.File('./output/condensed_lung_atlas_in_cpm.h5',"r") as h5_data:
    
        df = pd.DataFrame(
                data=np.array(h5_data[df_type]['gene_proportion_expression']['block0_values']).astype(np.float32),
                index=np.array(h5_data[df_type]['gene_proportion_expression']['axis1'].asstr()),
                columns=np.array(h5_data[df_type]['gene_proportion_expression']['axis0'].asstr()),
            ).T
    return df

In [18]:
df = read_file('celltype')

In [21]:
df_average = read_file_average('celltype')

In [28]:
df.loc['Car4']

Adventitial fibroblast                       0.025345
Airway smooth muscle                         0.021194
Alveolar fibroblast                          0.036743
Alveolar type I                              0.018936
Alveolar type II                             0.027313
Arterial EC I                                0.031860
Arterial EC II                               0.039215
B cell                                       0.013855
Car4+ capillaries                            0.987305
DC I                                         0.025284
DC II                                        0.059998
DC III                                       0.010307
Early Car4- capillaries                      0.319336
Early adventitial fibroblast                 0.018616
Early airway smooth muscle                   0.017517
Early alveolar fibroblast                    0.032715
Fibroblast precursor                         0.007835
IL cell                                      0.027771
Late Car4- capillaries      

In [23]:
df_average

Unnamed: 0,Adventitial fibroblast,Airway smooth muscle,Alveolar fibroblast,Alveolar type I,Alveolar type II,Arterial EC I,Arterial EC II,B cell,Car4+ capillaries,DC I,...,Proliferating myofibroblast,Proliferating pericyte,Proliferative EC,Striated muscle,T cell,Vascular smooth muscle,Venous EC,basophil,mast cell,neutrophil
0610005C13Rik,0.379639,2.013672,1.431641,0.364014,0.726562,3.193359,0.014755,0.176514,0.235596,0.061218,...,0.099731,0.250244,0.550781,0.000000,0.683105,0.023193,0.110229,0.091736,0.090271,0.006596
0610007C21Rik,84.812500,25.203125,97.062500,1.244141,14.835938,4.570312,70.062500,32.718750,70.750000,110.625000,...,0.000000,0.000000,0.000000,25.562500,17.546875,2.630859,3.621094,0.000000,0.000000,5.058594
0610007L01Rik,34.062500,8.250000,23.437500,0.421875,7.386719,0.000000,4.519531,17.718750,27.343750,68.875000,...,0.000000,0.000000,0.000000,3.824219,11.242188,0.776367,0.000000,0.000000,0.000000,5.898438
0610007N19Rik,48.437500,8.929688,48.156250,0.000000,3.205078,0.000000,3.279297,0.793945,2.554688,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,1.345703,0.000000,0.000000,0.000000,0.000000
0610007P08Rik,7.171875,7.187500,7.972656,0.000000,2.726562,0.000000,7.105469,2.048828,6.804688,14.195312,...,0.000000,0.000000,0.000000,0.000000,1.659180,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
n-R5s54,0.000000,0.004101,0.015205,0.000000,0.000000,0.000000,0.000000,0.000386,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000585,0.030884,0.011482,0.340088,0.000000,0.000202
n-R5s67,0.007317,0.117249,0.035767,0.000000,0.000000,0.008865,0.073853,0.125854,0.322998,0.059479,...,0.022858,0.000000,0.116882,0.000000,0.066895,0.173096,0.096375,0.000000,0.000000,0.000000
n-R5s68,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.001127,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.460938
n-R5s8,0.000000,0.000000,0.013527,0.000000,0.000000,0.000000,0.000000,0.000000,0.056854,0.000000,...,0.000000,0.000000,0.043182,0.000000,0.000000,0.048553,0.091675,0.000000,0.000000,0.000000


In [8]:
gene="Car4"
mylist = ["Idjd,HHH,shi"]
lower(gene)

NameError: name 'lower' is not defined

In [5]:
i = 'ACZ'
columns_with_this_dataset = [col_name for col_name in df.columns if i in col_name]
df.loc[[gene]][['Adventitial fibroblast Hurskainen2021_P7']]

KeyError: "None of [Index(['Adventitial fibroblast Hurskainen2021_P7'], dtype='object')] are in the [columns]"

In [4]:
df = read_file('celltype_dataset_timepoint')
# gene = 'Car4'
# filtered_df = df.filter(items=[gene],axis=0)
# filtered_df
df

Unnamed: 0,Adventitial fibroblast_ACZ_P21,Adventitial fibroblast_ACZ_P7,Adventitial fibroblast_Hurskainen2021_P14,Adventitial fibroblast_Hurskainen2021_P3,Adventitial fibroblast_Hurskainen2021_P7,Adventitial fibroblast_TMS_18m,Adventitial fibroblast_TMS_24m,Adventitial fibroblast_TMS_3m,Airway smooth muscle_ACZ_P21,Airway smooth muscle_Hurskainen2021_P14,...,neutrophil_ACZ_E18.5,neutrophil_ACZ_P1,neutrophil_ACZ_P21,neutrophil_ACZ_P7,neutrophil_Hurskainen2021_P14,neutrophil_Hurskainen2021_P3,neutrophil_Hurskainen2021_P7,neutrophil_TMS_18m,neutrophil_TMS_24m,neutrophil_TMS_3m
0610005C13Rik,0.074402,0.0,0.0,2.119141,1.183594,0.000000,0.000000,0.00000,1.085938,2.802734,...,0.309814,0.010681,0.0,0.008118,0.0,0.0,0.0,0.0000,0.0000,0.000000
0610007C21Rik,0.000000,0.0,0.0,0.000000,0.000000,325.000000,375.750000,350.25000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,109.1875,54.6875,31.390625
0610007L01Rik,0.000000,0.0,0.0,0.000000,0.000000,136.000000,129.000000,143.87500,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,79.6250,169.0000,0.000000
0610007N19Rik,0.000000,0.0,0.0,0.000000,0.000000,189.500000,211.750000,198.25000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0000,0.0000,0.000000
0610007P08Rik,0.000000,0.0,0.0,0.000000,0.000000,5.925781,5.886719,51.71875,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0000,0.0000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
n-R5s54,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.018829,0.000000,...,0.015053,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0000,0.0000,0.000000
n-R5s67,0.033966,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.538574,0.000000,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0000,0.0000,0.000000
n-R5s68,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.000000,1.734375,0.0,2.335938,0.0,0.0,0.0,0.0000,0.0000,0.000000
n-R5s8,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0000,0.0000,0.000000


In [7]:
all_celltypes = []
dt_combinations = []  # dataset and timepoint combinations

for column_name in filtered_df.columns:
    celltype = column_name.split("_")[0]
    dataset_timepoint = column_name.split(celltype+"_")[1]
    if celltype not in all_celltypes:
        all_celltypes.append(celltype)
    if dataset_timepoint not in dt_combinations:
        dt_combinations.append(dataset_timepoint)

In [8]:
all_celltypes
dt_combinations

['ACZ_P21',
 'ACZ_P7',
 'Hurskainen2021_P14',
 'Hurskainen2021_P3',
 'Hurskainen2021_P7',
 'TMS_18m',
 'TMS_24m',
 'TMS_3m',
 'ACZ_E18.5',
 'ACZ_P1']

In [58]:
expression = {}
for dt in dt_combinations:
    expression[dt] = {}
    for ct in all_celltypes:
        name = "_".join([ct,dt])
        if name not in filtered_df.columns:
            exp_value = -1
        else:
            exp_value = filtered_df[name].values[0]
        expression[dt][ct] = exp_value
        

In [64]:
json.loads(expression)

TypeError: the JSON object must be str, bytes or bytearray, not dict