In [1]:
import h5py
import os
import zlib
import msgpack
import numpy as np

def read_compressed_json(path):
	"""Load a zlib-compressed, msgpack-encoded object from ``path``.

	Despite the name, the payload is decoded with ``msgpack`` (not ``json``)
	after zlib decompression. Non-string map keys are accepted
	(``strict_map_key=False``).

	Returns an empty dict when ``path`` is not an existing regular file.
	"""
	if os.path.isfile(path):
		with open(path, 'rb') as handle:
			packed = zlib.decompress(handle.read())
		return msgpack.unpackb(packed, strict_map_key=False)
	# Missing file is treated as "no data" rather than an error.
	return {}

In [2]:
# Study identifiers used for training, one per line of the index file.
# They are kept as strings exactly as read (no numeric conversion).
# The context manager closes the file handle deterministically instead of
# leaving it to the garbage collector.
with open('/home/ub-sonvo-25d094476064960/celltype_prediction/studies_idx_HVG.txt') as studies_file:
    TRAINED_STUDIES = studies_file.read().splitlines()

In [3]:
# Root directory that holds one sub-directory per study (named by study index).
ROOT_DIR = '/home/ub-sonvo-25d094476064960/celltype_prediction/camel'
# Term dictionary; only its 'name2idx' table (term name -> integer index) is
# used in this notebook. read_compressed_json returns {} if the file is
# missing, in which case the lookups below raise KeyError.
TERM_MAPPING = read_compressed_json('/home/ub-sonvo-25d094476064960/term_mapping.json.gz')

# Index of the 'fibroblast' term.
FIBROBLAST_IDX = TERM_MAPPING['name2idx']['fibroblast']
# NOTE: this bare expression is not the last line of the cell, so its value
# is not displayed; only SUBCELLTYPE_IDX below is echoed.
FIBROBLAST_IDX

# Index of the 'cell type - subgroup' metadata field.
SUBCELLTYPE_IDX = TERM_MAPPING['name2idx']['cell type - subgroup']
SUBCELLTYPE_IDX

226

In [4]:
# Index of the 'connective tissue cell' term; get_celltype_arr compares the
# major cell-type labels against this value.
CONNECTIVE_TISSUE_CELL_IDX = TERM_MAPPING['name2idx']['connective tissue cell']
CONNECTIVE_TISSUE_CELL_IDX

278

In [5]:
# Maps each listed term index to the index it is collapsed into. Every entry
# (including 387 itself) maps to 387.
# NOTE(review): 387 is presumably the 'fibroblast' term index (FIBROBLAST_IDX)
# and the other keys its sub-types - TODO confirm against TERM_MAPPING.
SUB_FIBROBLAST_2_FIBROBLAST = dict.fromkeys(
    (446, 709, 621, 794, 1076, 387),
    387,
)

In [6]:
def get_metadata_fields(study_idx):
    """Return the ``'fields'`` entry of a study's standardized metalist.

    The metalist is read from
    ``ROOT_DIR/<study_idx>/standardized_metalist.json.gz``.

    Raises:
        KeyError: if the loaded object has no ``'fields'`` key. This includes
            a missing file, because ``read_compressed_json`` then returns
            an empty dict.
    """
    metalist_path = os.path.join(ROOT_DIR, str(study_idx), 'standardized_metalist.json.gz')
    return read_compressed_json(metalist_path)['fields']

def get_metadata_arr(study_idx, meta_field):
    """Read one standardized metadata field of a study from its HDF5 file.

    Args:
        study_idx: Study identifier; converted with ``str()`` to form the
            study's directory name under ``ROOT_DIR``.
        meta_field: Field name, looked up in ``TERM_MAPPING['name2idx']``.

    Returns:
        The full contents of the matching dataset, read into memory.

    Raises:
        KeyError: if ``meta_field`` is not in ``TERM_MAPPING['name2idx']``
            or the dataset is absent from the HDF5 file.
        ValueError: if the study's field list does not contain the field.
    """
    meta_idx = TERM_MAPPING['name2idx'][meta_field]

    # Datasets in the HDF5 file are named by the position of the field in
    # the study's field list, not by the term index itself.
    selected_idx = get_metadata_fields(study_idx).index(meta_idx)

    metadata_path = os.path.join(ROOT_DIR, str(study_idx), 'standardized_metadata.hdf5.new')
    # Open read-only explicitly: h5py releases before 3.0 default to append
    # mode, which needs write access and would create an empty file when the
    # path does not exist.
    with h5py.File(metadata_path, 'r') as f:
        meta_arr = f[str(selected_idx)][()]

    return meta_arr

In [7]:
def get_celltype_arr(study_idx):
    """Return a study's cell-type labels with the fibroblast sub-types merged.

    Two passes are applied in place to the 'cell type' array:

    1. Entries labelled ``CONNECTIVE_TISSUE_CELL_IDX`` whose
       'cell type - subgroup' label is a key of
       ``SUB_FIBROBLAST_2_FIBROBLAST`` take that subgroup label.
    2. Every label that is a key of ``SUB_FIBROBLAST_2_FIBROBLAST`` is
       replaced by its mapped value.
    """
    celltypes = get_metadata_arr(study_idx, 'cell type')
    subgroups = get_metadata_arr(study_idx, 'cell type - subgroup')

    # Pass 1: promote the subgroup label where the major label is the generic
    # connective-tissue term and the subgroup is one of the mapped terms.
    mapped_terms = np.array(list(SUB_FIBROBLAST_2_FIBROBLAST))
    promote = (celltypes == CONNECTIVE_TISSUE_CELL_IDX) & np.isin(subgroups, mapped_terms)
    celltypes[promote] = subgroups[promote]

    # Pass 2: collapse every mapped term onto its target index.
    for child, parent in SUB_FIBROBLAST_2_FIBROBLAST.items():
        celltypes[celltypes == child] = parent

    return celltypes

In [14]:
# One merged cell-type label array per trained study, in TRAINED_STUDIES order.
res = [get_celltype_arr(study_idx) for study_idx in TRAINED_STUDIES]

In [16]:
# Flatten the per-study arrays into one array. This rebinds `res` from a list
# to an ndarray, so the cell is not idempotent: re-run the cell that builds
# the list before running this one again.
res = np.concatenate(res)

In [17]:
# Sanity check: total number of labels after concatenation.
res.shape

(5033871,)

In [18]:
# Preview of the concatenated label array (values and dtype).
res

array([108, 108, 108, ..., 826, 826, 826], dtype=uint16)

In [19]:
# Persist the concatenated labels. The output directory (relative to the
# current working directory) is created first, because np.save does not create
# missing parent directories and would otherwise fail after the whole
# collection step has already run.
os.makedirs('trained_data', exist_ok=True)
np.save('trained_data/trained_meta_celltypes.npy', res)