In [7]:
import h5py
import os
import zlib
import msgpack
import numpy as np

def read_compressed_json(path):
    """Read a zlib-compressed msgpack file and return the decoded object.

    Returns an empty dict when `path` is not an existing regular file.
    """
    if os.path.isfile(path):
        with open(path, 'rb') as handle:
            # The payload is zlib-compressed msgpack; non-string map keys are allowed.
            packed = zlib.decompress(handle.read())
            return msgpack.unpackb(packed, strict_map_key=False)
    return {}

In [2]:
# One entry per line of the studies listing file.
# Read it inside a `with` block so the file handle is closed deterministically
# instead of being left to the garbage collector.
with open('/home/ub-sonvo-25d094476064960/celltype_prediction/studies_idx_HVG.txt') as studies_file:
    TRAINED_STUDIES = studies_file.read().splitlines()

In [8]:
# Directory holding one sub-directory per study, and the compressed term mapping.
ROOT_DIR = '/home/ub-sonvo-25d094476064960/celltype_prediction/camel_2'
TERM_MAPPING = read_compressed_json('/home/ub-sonvo-25d094476064960/term_mapping.json.gz')

# Resolve the term indices used by the cells below from the 'name2idx' table.
FIBROBLAST_IDX = TERM_MAPPING['name2idx']['fibroblast']
SUBCELLTYPE_IDX = TERM_MAPPING['name2idx']['cell type - subgroup']
CONNECTIVE_TISSUE_CELL_IDX = TERM_MAPPING['name2idx']['connective tissue cell']

# Show the three resolved indices, one per line.
print(FIBROBLAST_IDX, SUBCELLTYPE_IDX, CONNECTIVE_TISSUE_CELL_IDX, sep='\n')

387
226
278


In [9]:
# Every cell-type index listed here is relabelled as 387, which is the value
# printed for FIBROBLAST_IDX when the term mapping was loaded.
# NOTE(review): the non-387 indices are presumably fibroblast sub-types in the
# term mapping; confirm against TERM_MAPPING.
SUB_FIBROBLAST_2_FIBROBLAST = {
    child_idx: 387
    for child_idx in (446, 709, 621, 794, 1076, 387)
}

In [10]:
def get_metadata_fields(study_idx):
    """Return the 'fields' entry of a study's standardized metalist.

    The metalist is read from
    ``<ROOT_DIR>/<study_idx>/standardized_metalist.json.gz``.
    """
    metalist_path = os.path.join(ROOT_DIR, str(study_idx), 'standardized_metalist.json.gz')
    return read_compressed_json(metalist_path)['fields']

def get_metadata_arr(study_idx, meta_field):
    """Load one metadata column of a study into memory.

    Args:
        study_idx: Study identifier; used as the sub-directory name under ROOT_DIR.
        meta_field: Term name of the metadata field (e.g. 'cell type'), looked up
            in TERM_MAPPING['name2idx'].

    Returns:
        The full contents of the HDF5 dataset stored for that field.

    Raises:
        KeyError: if `meta_field` is not present in TERM_MAPPING['name2idx'].
        ValueError: if the field's index is not in the study's field list
            (raised by `.index`, assuming the field list is a list).
    """
    meta_idx = TERM_MAPPING['name2idx'][meta_field]
    all_fields = get_metadata_fields(study_idx)

    # Datasets are keyed by the position of the field in the study's field list.
    selected_idx = all_fields.index(meta_idx)
    hdf5_path = os.path.join(ROOT_DIR, str(study_idx), 'standardized_metadata.hdf5.new')
    # Open read-only explicitly: h5py releases before 3.0 default to a
    # read/write mode that can create the file when it is missing.
    with h5py.File(hdf5_path, 'r') as f:
        meta_arr = f[str(selected_idx)][()]

    return meta_arr

In [11]:
def get_celltype_arr(study_idx):
    """Return a study's 'cell type' labels with the mapped sub-types collapsed.

    Entries whose major label equals CONNECTIVE_TISSUE_CELL_IDX and whose
    subgroup label is a key of SUB_FIBROBLAST_2_FIBROBLAST first take the
    subgroup label. Afterwards every label that is a key of that mapping is
    replaced by its mapped value. The array loaded for 'cell type' is modified
    in place and returned.
    """
    major_labels = get_metadata_arr(study_idx, 'cell type')
    sub_labels = get_metadata_arr(study_idx, 'cell type - subgroup')

    mapped_children = np.array(list(SUB_FIBROBLAST_2_FIBROBLAST.keys()))
    is_connective_tissue = major_labels == CONNECTIVE_TISSUE_CELL_IDX
    has_mapped_subgroup = np.isin(sub_labels, mapped_children)
    promote_mask = np.logical_and(is_connective_tissue, has_mapped_subgroup)

    # Step 1: pull the subgroup label up into the major label where both conditions hold.
    major_labels[promote_mask] = sub_labels[promote_mask]

    # Step 2: rewrite every mapped child label to its parent label.
    for child, parent in SUB_FIBROBLAST_2_FIBROBLAST.items():
        major_labels[major_labels == child] = parent

    return major_labels

In [12]:
# Collect the saved 'new_celltype_arr.npy' array of every entry under ROOT_DIR.
# NOTE(review): sorted() orders the directory names as strings (lexicographic,
# not numeric); confirm this matches the order of any array this is aligned with.
# NOTE(review): TRAINED_STUDIES is loaded earlier but not used here; confirm
# whether the loop should be restricted to those studies.
res = []
for study_dir in sorted(os.listdir(ROOT_DIR)):
    arr_path = '{}/{}/{}'.format(ROOT_DIR, study_dir, 'new_celltype_arr.npy')
    res.append(np.load(arr_path))

FileNotFoundError: [Errno 2] No such file or directory: '1640/new_celltype_arr.npy'

In [21]:
# Join the per-study arrays into one array.
# The guard keeps this cell idempotent: once `res` is already an ndarray,
# re-running the cell is a no-op instead of raising inside np.concatenate.
if not isinstance(res, np.ndarray):
    res = np.concatenate(res)

In [22]:
# Shape of the concatenated array.
res.shape

(5033871,)

In [23]:
# Display the array; NumPy abbreviates the repr of large arrays.
res

array([108, 108, 108, ..., 826, 826, 826], dtype=uint16)

In [24]:
# Number of entries labelled 387 (the value printed for FIBROBLAST_IDX).
(res == 387).sum()

525640

In [25]:
# Persist the concatenated labels; the path is relative to the current working directory.
np.save('trained_data/trained_meta_celltypes_HVG.npy', res)

In [12]:
# Load a previously saved label array for comparison with `res`.
# NOTE(review): this file name lacks the '_HVG' suffix used when saving `res`;
# presumably it comes from an earlier run. Confirm its provenance.
test = np.load('trained_data/trained_meta_celltypes.npy')

In [14]:
# Indices at which `test` and `res` agree; the displayed shape gives their count.
# NOTE(review): assumes `test` and `res` have the same length; confirm.
np.nonzero(test == res)[0].shape

(4357865,)

In [6]:
# Print the last ten entries of the 'features' dataset of every entry under ROOT_DIR.
for i in os.listdir(ROOT_DIR):
    # Open read-only explicitly: h5py releases before 3.0 default to a
    # read/write mode that can create the file when it is missing.
    with h5py.File('{}/{}/{}'.format(ROOT_DIR, i, 'raw.hdf5'), 'r') as f:
        # Read the whole dataset into memory and convert its entries to str.
        fe = f['features'][()].astype('str')
    print(i, ':', fe[-10:])

2706 : ['ENSG00000288616' 'ENSG00000288631' 'ENSG00000288642' 'ENSG00000288649'
 'ENSG00000288675' 'ENSG00000288701' 'ENSG00000288702' 'ENSG00000288705'
 'ENSG00000288709' 'ENSG00000288722']
2846 : ['RP11-15K19.2' 'RP11-722G7.1' 'TIMM10B-1' 'CCDC7-1' 'ABC7-42404400C24.1'
 'KIF1BP' 'OVOS2' 'TMEM256-PLSCR3-1' 'TBC1D26-1' 'RP5-1187M17.10']
1958 : ['AC109135.1' 'AL592170.1' 'AL603926.1' 'AL162851.1' 'AL590523.1'
 'CT476828.1' 'AC145205.1' 'CU459201.1' 'AC002321.2' 'AC002321.1']
2694 : ['ZZZ3' 'RP11-442N24--B.1' 'AC013461.1' 'KIF1BP' 'OVOS2' 'CH17-212P11.5'
 'RP5-1187M17.10' 'TRAPPC2P1' 'LLNLR-245B6.1' 'LL22NC03-N95F10.1']
2344 : ['ENSG00000261841' 'ENSG00000212857' 'ENSG00000268953' 'ENSG00000197414'
 'ENSG00000241481' 'ENSG00000234972' 'ENSG00000267161' 'ENSG00000215616'
 'ENSG00000206114' 'ENSG00000267893']
2411 : ['Z97987.1' 'AL133417.1' 'AC091534.1' 'AL136088.1' 'AP003438.1'
 'AL663023.1' 'AL096772.1' 'AC114400.1' 'AC002384.1' 'AL136454.1']
3071 : ['RAMMET' 'LINC02001' 'FLJ45513' 'AC01