In [1]:
import h5py
import numpy as np
import pandas as pd
import scanpy as sc
from scipy.cluster.hierarchy import linkage,leaves_list, dendrogram
from scipy.spatial.distance import pdist, euclidean
from scipy.special import comb
import re

In [2]:
# Load one averaged gene-expression table from the condensed lung atlas file.
# dataset_name is one of: celltype / celltype_dataset / celltype_dataset_timepoint
def read_file(dataset_name, path='./output/condensed_lung_atlas_in_cpm.h5'):
    """Read the 'gene_expression_average' table of one dataset group.

    Parameters
    ----------
    dataset_name : str
        Top-level group in the HDF5 file, e.g. 'celltype',
        'celltype_dataset' or 'celltype_dataset_timepoint'.
    path : str, optional
        Location of the HDF5 file. Defaults to the condensed lung atlas
        produced under ./output.

    Returns
    -------
    pandas.DataFrame
        float32 frame built from 'block0_values' with 'axis1' as index and
        'axis0' as columns, then transposed, so the returned rows are
        labelled by 'axis0' and the columns by 'axis1'. All labels are
        plain Python strings.

    Raises
    ------
    KeyError
        If the group or one of its expected members is missing.
    """
    def _as_text(labels):
        # Labels come back from HDF5 as bytes; leave already-decoded ones alone.
        return [
            label.decode('utf-8') if isinstance(label, bytes) else str(label)
            for label in labels
        ]

    # The context manager guarantees the file handle is released even when a
    # lookup fails; np.array copies the data out before the file is closed.
    with h5py.File(path, "r") as h5_data:
        group = h5_data[dataset_name]['gene_expression_average']
        values = np.array(group['block0_values'])
        axis0 = np.array(group['axis0'])
        axis1 = np.array(group['axis1'])

    df = pd.DataFrame(data=values, index=axis1, columns=axis0).T

    # Convert row and column labels from binary to string.
    df.index = _as_text(df.index)
    df.columns = _as_text(df.columns)
    df = df.astype(np.float32)

    return df

#### DataFrame for the landing page

In [3]:
# Load the 'celltype' group of the atlas file and display the resulting frame.
df_cell = read_file('celltype')
df_cell

Unnamed: 0,Adventitial fibroblast,Airway smooth muscle,Alveolar fibroblast,Alveolar type I,Alveolar type II,Arterial EC I,Arterial EC II,B cell,Car4+ capillaries,DC I,...,Proliferating myofibroblast,Proliferating pericyte,Proliferative EC,Striated muscle,T cell,Vascular smooth muscle,Venous EC,basophil,mast cell,neutrophil
0610005C13Rik,0.379639,2.013672,1.431641,0.364014,0.726562,3.193359,0.014755,0.176514,0.235596,0.061218,...,0.099731,0.250244,0.550781,0.000000,0.683105,0.023193,0.110229,0.091736,0.090271,0.006596
0610007C21Rik,84.812500,25.203125,97.062500,1.244141,14.835938,4.570312,70.062500,32.718750,70.750000,110.625000,...,0.000000,0.000000,0.000000,25.562500,17.546875,2.630859,3.621094,0.000000,0.000000,5.058594
0610007L01Rik,34.062500,8.250000,23.437500,0.421875,7.386719,0.000000,4.519531,17.718750,27.343750,68.875000,...,0.000000,0.000000,0.000000,3.824219,11.242188,0.776367,0.000000,0.000000,0.000000,5.898438
0610007N19Rik,48.437500,8.929688,48.156250,0.000000,3.205078,0.000000,3.279297,0.793945,2.554688,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,1.345703,0.000000,0.000000,0.000000,0.000000
0610007P08Rik,7.171875,7.187500,7.972656,0.000000,2.726562,0.000000,7.105469,2.048828,6.804688,14.195312,...,0.000000,0.000000,0.000000,0.000000,1.659180,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
n-R5s54,0.000000,0.004101,0.015205,0.000000,0.000000,0.000000,0.000000,0.000386,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000585,0.030884,0.011482,0.340088,0.000000,0.000202
n-R5s67,0.007317,0.117249,0.035767,0.000000,0.000000,0.008865,0.073853,0.125854,0.322998,0.059479,...,0.022858,0.000000,0.116882,0.000000,0.066895,0.173096,0.096375,0.000000,0.000000,0.000000
n-R5s68,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.001127,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.460938
n-R5s8,0.000000,0.000000,0.013527,0.000000,0.000000,0.000000,0.000000,0.000000,0.056854,0.000000,...,0.000000,0.000000,0.043182,0.000000,0.000000,0.048553,0.091675,0.000000,0.000000,0.000000


In [4]:
# Write the 'celltype' frame to out.csv in the current working directory.
df_cell.to_csv('./out.csv')

In [6]:
# Keep only the row whose index label is 'Gal' (empty result if it is absent).
df_cell.filter(items=['Gal'],axis=0)


Unnamed: 0,Adventitial fibroblast,Airway smooth muscle,Alveolar fibroblast,Alveolar type I,Alveolar type II,Arterial EC I,Arterial EC II,B cell,Car4+ capillaries,DC I,...,Proliferating myofibroblast,Proliferating pericyte,Proliferative EC,Striated muscle,T cell,Vascular smooth muscle,Venous EC,basophil,mast cell,neutrophil
Gal,0.847168,0.384521,0.462646,0.0,1.045898,0.0,1.616211,0.206055,0.003513,1.670898,...,0.0,0.0,0.0,0.0,0.820312,2.505859,0.0,0.061707,0.0,1.420898


#### DataFrames for the second page (heatmaps by dataset and timepoint)

In [9]:
# Load the 'celltype_dataset' group of the atlas file and display the frame.
df_datasets = read_file('celltype_dataset')
df_datasets

Unnamed: 0,Adventitial fibroblast_ACZ,Adventitial fibroblast_Hurskainen2021,Adventitial fibroblast_TMS,Airway smooth muscle_ACZ,Airway smooth muscle_Hurskainen2021,Airway smooth muscle_TMS,Alveolar fibroblast_ACZ,Alveolar fibroblast_Hurskainen2021,Alveolar fibroblast_TMS,Alveolar type II_ACZ,...,Vascular smooth muscle_TMS,Venous EC_ACZ,Venous EC_Hurskainen2021,Venous EC_TMS,basophil_ACZ,basophil_Hurskainen2021,mast cell_ACZ,neutrophil_ACZ,neutrophil_Hurskainen2021,neutrophil_TMS
0610005C13Rik,0.073792,0.676758,0.000000,1.085938,2.546875,0.0000,0.164185,1.270508,2.496094,0.222046,...,0.000,0.222656,0.0,0.0,0.292236,0.0,0.090271,0.023636,0.0,0.0000
0610007C21Rik,0.000000,0.000000,345.750000,0.000000,0.000000,297.2500,0.000000,0.000000,412.750000,0.000000,...,402.250,0.000000,0.0,548.5,0.000000,0.0,0.000000,0.000000,0.0,76.5625
0610007L01Rik,0.000000,0.000000,138.875000,0.000000,0.000000,97.3125,0.000000,0.000000,99.687500,0.000000,...,118.625,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,89.2500
0610007N19Rik,0.000000,0.000000,197.500000,0.000000,0.000000,105.3125,0.000000,0.000000,204.875000,0.000000,...,205.500,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0000
0610007P08Rik,0.000000,0.000000,29.234375,0.000000,0.000000,84.8125,0.000000,0.000000,33.906250,0.000000,...,0.000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
n-R5s54,0.000000,0.000000,0.000000,0.018829,0.000000,0.0000,0.132446,0.000000,0.000000,0.000000,...,0.000,0.023193,0.0,0.0,1.083008,0.0,0.000000,0.000725,0.0,0.0000
n-R5s67,0.033691,0.000000,0.000000,0.538574,0.000000,0.0000,0.311523,0.000000,0.000000,0.000000,...,0.000,0.194702,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0000
n-R5s68,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,...,0.000,0.000000,0.0,0.0,0.000000,0.0,0.000000,1.651367,0.0,0.0000
n-R5s8,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.117859,0.000000,0.000000,0.000000,...,0.000,0.185181,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0000


In [10]:
# Keep only the row whose index label is 'Gal' (empty result if it is absent).
df_datasets.filter(items=['Gal'],axis=0)

Unnamed: 0,Adventitial fibroblast_ACZ,Adventitial fibroblast_Hurskainen2021,Adventitial fibroblast_TMS,Airway smooth muscle_ACZ,Airway smooth muscle_Hurskainen2021,Airway smooth muscle_TMS,Alveolar fibroblast_ACZ,Alveolar fibroblast_Hurskainen2021,Alveolar fibroblast_TMS,Alveolar type II_ACZ,...,Vascular smooth muscle_TMS,Venous EC_ACZ,Venous EC_Hurskainen2021,Venous EC_TMS,basophil_ACZ,basophil_Hurskainen2021,mast cell_ACZ,neutrophil_ACZ,neutrophil_Hurskainen2021,neutrophil_TMS
Gal,0.0,1.514648,0.135498,0.0,0.55127,0.0,0.295654,0.312744,0.958984,0.0,...,0.0,0.0,0.0,0.0,0.196411,0.0,0.0,1.30957,1.611328,0.0


In [11]:
# Load the 'celltype_dataset_timepoint' group of the atlas file and display the frame.
df_datasets_time = read_file('celltype_dataset_timepoint')
df_datasets_time

Unnamed: 0,Adventitial fibroblast_ACZ_P21,Adventitial fibroblast_ACZ_P7,Adventitial fibroblast_Hurskainen2021_P14,Adventitial fibroblast_Hurskainen2021_P3,Adventitial fibroblast_Hurskainen2021_P7,Adventitial fibroblast_TMS_18m,Adventitial fibroblast_TMS_24m,Adventitial fibroblast_TMS_3m,Airway smooth muscle_ACZ_P21,Airway smooth muscle_Hurskainen2021_P14,...,neutrophil_ACZ_E18.5,neutrophil_ACZ_P1,neutrophil_ACZ_P21,neutrophil_ACZ_P7,neutrophil_Hurskainen2021_P14,neutrophil_Hurskainen2021_P3,neutrophil_Hurskainen2021_P7,neutrophil_TMS_18m,neutrophil_TMS_24m,neutrophil_TMS_3m
0610005C13Rik,0.074402,0.0,0.0,2.119141,1.183594,0.000000,0.000000,0.00000,1.085938,2.802734,...,0.309814,0.010681,0.0,0.008118,0.0,0.0,0.0,0.0000,0.0000,0.000000
0610007C21Rik,0.000000,0.0,0.0,0.000000,0.000000,325.000000,375.750000,350.25000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,109.1875,54.6875,31.390625
0610007L01Rik,0.000000,0.0,0.0,0.000000,0.000000,136.000000,129.000000,143.87500,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,79.6250,169.0000,0.000000
0610007N19Rik,0.000000,0.0,0.0,0.000000,0.000000,189.500000,211.750000,198.25000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0000,0.0000,0.000000
0610007P08Rik,0.000000,0.0,0.0,0.000000,0.000000,5.925781,5.886719,51.71875,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0000,0.0000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
n-R5s54,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.018829,0.000000,...,0.015053,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0000,0.0000,0.000000
n-R5s67,0.033966,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.538574,0.000000,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0000,0.0000,0.000000
n-R5s68,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.000000,1.734375,0.0,2.335938,0.0,0.0,0.0,0.0000,0.0000,0.000000
n-R5s8,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0000,0.0000,0.000000


In [12]:
# Keep only the row whose index label is 'Gal' (empty result if it is absent).
df_datasets_time.filter(items=['Gal'],axis=0)

Unnamed: 0,Adventitial fibroblast_ACZ_P21,Adventitial fibroblast_ACZ_P7,Adventitial fibroblast_Hurskainen2021_P14,Adventitial fibroblast_Hurskainen2021_P3,Adventitial fibroblast_Hurskainen2021_P7,Adventitial fibroblast_TMS_18m,Adventitial fibroblast_TMS_24m,Adventitial fibroblast_TMS_3m,Airway smooth muscle_ACZ_P21,Airway smooth muscle_Hurskainen2021_P14,...,neutrophil_ACZ_E18.5,neutrophil_ACZ_P1,neutrophil_ACZ_P21,neutrophil_ACZ_P7,neutrophil_Hurskainen2021_P14,neutrophil_Hurskainen2021_P3,neutrophil_Hurskainen2021_P7,neutrophil_TMS_18m,neutrophil_TMS_24m,neutrophil_TMS_3m
Gal,0.0,0.0,0.0,0.0,3.84375,0.407959,0.0,0.0,0.0,0.0,...,0.0,0.023209,0.0,5.558594,0.0,4.496094,0.0,0.0,0.0,0.0
