In [30]:
import h5py
import os
import zlib
import msgpack
import numpy as np
import json

def read_compressed_json(path):
    """Decode a zlib-compressed, msgpack-encoded file.

    Despite the ``.json.gz`` suffix callers pass in, the payload is decoded
    with ``zlib.decompress`` followed by ``msgpack.unpackb``
    (``strict_map_key=False``, so map keys need not be strings).

    Returns:
        The decoded object, or an empty dict when ``path`` is not an
        existing regular file.
    """
    if os.path.isfile(path):
        with open(path, 'rb') as handle:
            packed = zlib.decompress(handle.read())
            return msgpack.unpackb(packed, strict_map_key=False)
    return {}

In [25]:
# Lines of studies_idx_HVG.txt (presumably one study index per line — TODO confirm).
# Read through a context manager so the file handle is closed right away
# instead of being left open for the lifetime of the kernel.
with open('/home/ub-sonvo-25d094476064960/celltype_prediction/studies_idx_HVG.txt') as _studies_file:
    TRAINED_STUDIES = _studies_file.read().splitlines()

In [32]:
# Directory holding one sub-directory per study, named by study index.
ROOT_DIR = '/home/ub-sonvo-25d094476064960/celltype_prediction/camel_2'
# Term dictionary; this notebook reads its 'name2idx' and 'idx2name' entries.
TERM_MAPPING = read_compressed_json('/home/ub-sonvo-25d094476064960/term_mapping.json.gz')

# Resolve the term indices used further down and echo each one.
_name2idx = TERM_MAPPING['name2idx']

FIBROBLAST_IDX = _name2idx['fibroblast']
print(FIBROBLAST_IDX)

SUBCELLTYPE_IDX = _name2idx['cell type - subgroup']
print(SUBCELLTYPE_IDX)

CONNECTIVE_TISSUE_CELL_IDX = _name2idx['connective tissue cell']
print(CONNECTIVE_TISSUE_CELL_IDX)

387
226
278


In [9]:
# Term indices that get relabelled to 387 (the value printed for
# FIBROBLAST_IDX). 387 maps to itself so it also counts as a key.
# NOTE(review): the other ids are presumably fibroblast sub-types — confirm
# against TERM_MAPPING['idx2name'].
SUB_FIBROBLAST_2_FIBROBLAST = {
    term_idx: 387 for term_idx in (446, 709, 621, 794, 1076, 387)
}

In [33]:
def get_metadata_fields(study_idx):
    """Return the ``'fields'`` entry of a study's standardized metalist.

    The metalist is read from
    ``<ROOT_DIR>/<study_idx>/standardized_metalist.json.gz``.
    ``get_metadata_arr`` searches the returned sequence for term indices.
    """
    study_dir = os.path.join(ROOT_DIR, str(study_idx))
    metalist_path = os.path.join(study_dir, 'standardized_metalist.json.gz')
    return read_compressed_json(metalist_path)['fields']

def get_metadata_arr(study_idx, meta_field):
    """Load one metadata column of a study from ``standardized_metadata.hdf5``.

    Args:
        study_idx: Study identifier; ``str(study_idx)`` is the sub-directory
            name under ``ROOT_DIR``.
        meta_field: Term name, resolved through ``TERM_MAPPING['name2idx']``.

    Returns:
        The full contents of the HDF5 dataset whose key is the position of
        the field inside the study's field list.

    Raises:
        KeyError: ``meta_field`` is not in ``TERM_MAPPING['name2idx']``, or
            the HDF5 file has no dataset under the computed key.
        ValueError: the study's field list does not contain the field
            (raised by ``list.index``).
    """
    meta_idx = TERM_MAPPING['name2idx'][meta_field]
    all_fields = get_metadata_fields(study_idx)

    # Datasets are keyed by the field's position in the field list,
    # not by the term index itself.
    selected_idx = all_fields.index(meta_idx)
    metadata_path = os.path.join(ROOT_DIR, str(study_idx), 'standardized_metadata.hdf5')
    # Open read-only explicitly: the default mode depends on the h5py
    # version, and older releases opened files read/write.
    with h5py.File(metadata_path, 'r') as f:
        meta_arr = f[str(selected_idx)][()]

    return meta_arr

In [11]:
def get_celltype_arr(study_idx):
    """Return a study's 'cell type' labels with fibroblast sub-labels collapsed.

    Two passes over the array loaded for 'cell type':

    1. Cells labelled ``CONNECTIVE_TISSUE_CELL_IDX`` whose
       'cell type - subgroup' value is a key of
       ``SUB_FIBROBLAST_2_FIBROBLAST`` take their subgroup label.
    2. Every label that is a key of ``SUB_FIBROBLAST_2_FIBROBLAST`` is
       replaced by its mapped value.

    The loaded 'cell type' array is edited in place and returned.
    """
    labels = get_metadata_arr(study_idx, 'cell type')
    sub_labels = get_metadata_arr(study_idx, 'cell type - subgroup')

    # Pass 1: promote the subgroup label for connective-tissue cells.
    is_connective = labels == CONNECTIVE_TISSUE_CELL_IDX
    mapped_keys = np.array(list(SUB_FIBROBLAST_2_FIBROBLAST.keys()))
    promote = np.logical_and(is_connective, np.isin(sub_labels, mapped_keys))
    labels[promote] = sub_labels[promote]

    # Pass 2: apply the mapping key by key, in dict order.
    for child, parent in SUB_FIBROBLAST_2_FIBROBLAST.items():
        labels[labels == child] = parent

    return labels

## **--- Map cell type labels onto the new cell type tree ---**

In [39]:
def get_meta(study_idx):
    """Return the ``(major, sub)`` cell-type label arrays of a study.

    ``major`` is the 'cell type' column. ``sub`` is the
    'cell type - subgroup' column when it can be loaded; otherwise it is a
    copy of ``major``.

    Returns:
        Tuple ``(major_celltype_arr, sub_celltype_arr)``. The two arrays are
        always distinct objects.
    """
    major_celltype_arr = get_metadata_arr(study_idx, 'cell type')
    try:
        sub_celltype_arr = get_metadata_arr(study_idx, 'cell type - subgroup')
    except Exception:
        # Best-effort fallback for studies without a usable subgroup column.
        # `except Exception` (not a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate instead of being silently swallowed.
        # Copy so that in-place edits to one returned array cannot leak
        # into the other through aliasing.
        sub_celltype_arr = major_celltype_arr.copy()

    return major_celltype_arr, sub_celltype_arr

In [48]:
def convert_label(major_label, sub_label, new_tree, remove):
    """Map per-cell labels onto the new cell-type tree.

    Args:
        major_label: Integer array of major cell-type term indices.
        sub_label: Integer array (same length) of sub-type term indices;
            0 means "no sub-type assigned". The caller's array is not
            modified.
        new_tree: Mapping from term index (as str or int) to its new parent
            term index. A falsy parent (``None``/0) maps the term to itself.
        remove: Iterable of term indices that must map to -1.

    Returns:
        Tuple ``(label, sub_label)``:

        * ``label``: the new-tree label per cell, -1 for terms that are
          absent from ``new_tree``, listed in ``remove``, or equal to 193.
        * ``sub_label``: a copy of the input with every 0 replaced by the
          corresponding ``major_label`` entry.
    """
    major_label = np.asarray(major_label)
    # Work on a copy so the caller's array is not silently rewritten.
    sub_label = np.array(sub_label, copy=True)

    # Cells without a sub-type fall back to their major type.
    unassigned = sub_label == 0
    sub_label[unassigned] = major_label[unassigned]

    remove_idx = np.asarray(list(remove), dtype=np.intp)
    tree_ids = [int(k) for k in new_tree]

    # The lookup table must cover every index it is read or written at:
    # observed labels, tree nodes AND removed nodes. Sizing it without
    # `remove` raised IndexError for a removed id above the other maxima.
    # Empty inputs contribute 0 instead of making max() fail.
    highest = max(
        int(sub_label.max()) if sub_label.size else 0,
        max(tree_ids, default=0),
        int(remove_idx.max()) if remove_idx.size else 0,
    )
    node_map = np.full(highest + 1, -1)
    for key, parent in new_tree.items():
        node_map[int(key)] = parent if parent else int(key)
    node_map[remove_idx] = -1

    # Children map to their (new) parent; nodes without a parent keep
    # their own index.
    label = node_map[sub_label]
    label[sub_label == 193] = -1  # blood cell
    return label, sub_label

In [47]:
# Show the entries of ROOT_DIR in sorted order (wrapped in an outer list,
# so the displayed value is a single nested list).
[sorted(os.listdir(ROOT_DIR))]

[['1640',
  '1647',
  '1668',
  '1671',
  '1682',
  '1683',
  '1694',
  '1697',
  '1699',
  '1703',
  '1725',
  '1731',
  '1736',
  '1749',
  '1752',
  '1782',
  '1799',
  '1810',
  '1821',
  '1834',
  '1842',
  '1848',
  '1853',
  '1884',
  '1888',
  '1915',
  '1958',
  '1962',
  '1975',
  '1977',
  '1980',
  '2003',
  '2010',
  '2025',
  '2054',
  '2058',
  '2071',
  '2085',
  '2088',
  '2136',
  '2142',
  '2155',
  '2167',
  '2175',
  '2207',
  '2232',
  '2234',
  '2236',
  '2238',
  '2243',
  '2249',
  '2260',
  '2261',
  '2274',
  '2280',
  '2281',
  '2298',
  '2311',
  '2334',
  '2344',
  '2360',
  '2377',
  '2379',
  '2394',
  '2400',
  '2411',
  '2413',
  '2448',
  '2453',
  '2472',
  '2475',
  '2487',
  '2517',
  '2518',
  '2532',
  '2541',
  '2545',
  '2572',
  '2576',
  '2580',
  '2598',
  '2607',
  '2615',
  '2630',
  '2647',
  '2649',
  '2694',
  '2700',
  '2706',
  '2713',
  '2720',
  '2733',
  '2734',
  '2744',
  '2755',
  '2762',
  '2775',
  '2797',
  '2803',
  '2804',


In [52]:
# Per-study results, collected in sorted directory order.
final_major = []
final_sub = []

# Load the relabelling inputs through context managers so both JSON file
# handles are closed (a bare `json.load(open(...))` leaves them open).
with open('new_tree.json') as _tree_file:
    new_tree = json.load(_tree_file)
with open('remove.json') as _remove_file:
    # Only the values of remove.json are used (term indices to drop).
    remove = list(json.load(_remove_file).values())

for i in sorted(os.listdir(ROOT_DIR)):
    print('study', i)

    # Cells whose entry in new_celltype_arr.npy is 0 are blanked in both
    # label arrays before relabelling.
    tmp = np.load('{}/{}/{}'.format(ROOT_DIR, i, 'new_celltype_arr.npy'))
    tmp_bool = tmp == 0
    tmp_major, tmp_sub = get_meta(i)
    tmp_major[tmp_bool] = 0
    tmp_sub[tmp_bool] = 0

    label, sub_label = convert_label(tmp_major, tmp_sub, new_tree, remove)

    # Progress report: blanked cells, then cells with a positive
    # major label and with a positive sub label.
    print(np.sum(tmp_bool))
    print(np.sum(label > 0))
    print(np.sum(sub_label > 0))

    final_major.append(label)
    final_sub.append(sub_label)
    np.save('{}/{}/{}'.format(ROOT_DIR, i, 'major_new_tree.npy'), label)
    np.save('{}/{}/{}'.format(ROOT_DIR, i, 'sub_new_tree.npy'), sub_label)

study 1640
16952
200786
200786
study 1647
3793
6732
9510
study 1668
6774
13851
13851
study 1671
257
2713
2713
study 1682
953
708
4463
study 1683
258
3416
5132
study 1694
14950
4682
4818
study 1697
17
62150
62150
study 1699
755
18756
25261
study 1703
698
3826
3826
study 1725
1471
0
0
study 1731
12673
23575
23575
study 1736
726
0
31413
study 1749
291
4143
4143
study 1752
6119
15170
17604
study 1782
35045
17209
18106
study 1799
41
1349
1349
study 1810
951
5939
7571
study 1821
1882
6620
6620
study 1834
87
601
601
study 1842
3116
7658
15911
study 1848
13311
35689
44219
study 1853
8825
13812
13812
study 1884
34291
5977
5977
study 1888
85
8730
9196
study 1915
22
968
7908
study 1958
813
12762
12762
study 1962
381
9338
9338
study 1975
3734
19040
25331
study 1977
287
3139
3139
study 1980
230
1173
1173
study 2003
1936
2387
2387
study 2010
759
0
37900
study 2025
45156
0
0
study 2054
783
2040
3679
study 2058
2
0
601
study 2071
4999
2741
3885
study 2085
4799
0
0
study 2088
1172
4193
4721
study 2136


In [53]:
# Join the per-study arrays into one flat array each.
# NOTE(review): both names are rebound from a list of arrays to a single
# array, so re-running this cell would pass an array (not the list of
# per-study arrays) to np.concatenate.
final_major = np.concatenate(final_major)
final_sub = np.concatenate(final_sub)

In [62]:
# Replace every negative entry (convert_label emits -1) with 0, in place.
for _labels in (final_major, final_sub):
    _labels[_labels < 0] = 0

In [63]:
# Persist both concatenated label arrays, cast to uint16.
for _out_path, _labels in (
    ('trained_data_2/new_tree_major.npy', final_major),
    ('trained_data_2/new_tree_sub.npy', final_sub),
):
    np.save(_out_path, _labels.astype(np.uint16))

In [57]:
# Largest label in final_sub; the save cell casts this array to uint16.
final_sub.max()

1994

In [35]:
# Inspect `remove`.
# NOTE(review): the recorded output is a name -> index dict, while the
# relabelling cell binds `remove` to a list of values, so this output comes
# from a different kernel state than that cell.
remove

{'mature B cell': 568,
 'retinal bipolar neuron': 798,
 'T cell': 108,
 'CD8-positive, alpha-beta cytotoxic T cell': 15,
 'alpha-beta T cell': 142,
 'alpha-beta intraepithelial T cell': 143,
 'connective tissue cell': 278,
 'decidual cell': 298,
 'early pro-B cell': 321,
 'effector T cell': 325,
 'embryonic stem cell': 332,
 'enteroendocrine': 352,
 'erythroblast': 367,
 'exhausted T cell': 376,
 'extravillous trophoblast': 380,
 'heart valve cell': 437,
 'intermediate mesodermal cell': 493,
 'intraepithelial lymphocyte': 500,
 'leukocyte': 532,
 'lymphocyte': 554,
 'lymphoid tissueâ€“inducer cell': 555,
 'monoblast': 600,
 'mucus secreting cell': 606,
 'myeloblast': 616,
 'myeloid suppressor cell': 617,
 'naive T cell': 623,
 'mature T cell': 570,
 'memory T cell': 582,
 'hepatic stem cell': 447,
 'myeloid leukocyte': 616,
 'neural progenitor cell': 644,
 'oocyte': 682,
 'osteoclast': 686,
 'placental villous trophoblast': 730,
 'precursor B cell': 747,
 'primordial germ cell': 751,
 

## _____________

In [13]:
# One new_celltype_arr.npy array per entry of ROOT_DIR, in sorted order.
res = [
    np.load('{}/{}/{}'.format(ROOT_DIR, study_dir, 'new_celltype_arr.npy'))
    for study_dir in sorted(os.listdir(ROOT_DIR))
]

In [14]:
# Join the per-study arrays into one flat array.
# NOTE(review): `res` is rebound from a list to an array, so this cell is
# only valid right after the cell that builds the list.
res = np.concatenate(res)

In [15]:
# Shape of the concatenated array.
res.shape

(9165105,)

In [17]:
# x: sorted unique values in res; y: how many times each one occurs.
x, y = np.unique(res, return_counts=True)

In [21]:
# Occurrence count per term name (term indices translated via 'idx2name').
{TERM_MAPPING['idx2name'][term_idx]: count for term_idx, count in zip(x, y)}

{'Unassigned': 2485435,
 'B cell': 572953,
 'T cell': 2108257,
 'blood cell': 13577,
 'connective tissue cell': 215157,
 'endothelial cell': 124137,
 'epithelial cell': 343735,
 'fibroblast': 194238,
 'glial cell': 648023,
 'innate lymphoid cell': 214594,
 'muscle cell': 27104,
 'myeloid leukocyte': 885123,
 'neural cell': 966566,
 'progenitor cell': 40569,
 'retinal cell': 50972,
 'secretory cell': 90833,
 'stem cell': 973,
 'kidney epithelial cell': 7392,
 'somatic stem cell': 28646,
 'embryonic cell': 14836,
 'germ line cell': 4909,
 'neoplastic cell': 25304,
 'malignant cell': 57847,
 'hematopoietic cell': 2817,
 'hematopoietic precursor cell': 978,
 'kidney cell': 37900,
 'myeloid cell': 1128,
 'erythroid lineage cell': 1102}

In [19]:
# Raw occurrence counts, in the order of the sorted unique values in x.
y

array([2485435,  572953, 2108257,   13577,  215157,  124137,  343735,
        194238,  648023,  214594,   27104,  885123,  966566,   40569,
         50972,   90833,     973,    7392,   28646,   14836,    4909,
         25304,   57847,    2817,     978,   37900,    1128,    1102])

In [22]:
# Peek at the label array.
# NOTE(review): `res` is displayed twice in a row with different contents,
# so it was rebound between the two executions.
res

array([359, 359, 359, ...,   0,   0,   0], dtype=uint16)

In [23]:
# Peek at the label array again (see the recorded output for its state here).
res

array([108, 108, 108, ..., 826, 826, 826], dtype=uint16)

In [24]:
# Number of entries equal to 387 (the value printed for FIBROBLAST_IDX).
np.sum(res == 387)

525640

In [23]:
# Persist the label array (path is relative to the working directory).
np.save('trained_data_2/trained_meta_celltypes_HVG.npy', res)

In [12]:
# Load a label array saved under trained_data/ for comparison with `res`.
test = np.load('trained_data/trained_meta_celltypes.npy')

In [14]:
# Count the positions where `test` and `res` hold the same value
# (shape of the index array of matching positions).
np.nonzero(test == res)[0].shape

(4357865,)

In [6]:
# Print the last ten entries of the 'features' dataset of every study's
# raw.hdf5, to compare how features are named across studies.
for i in os.listdir(ROOT_DIR):
    # Open read-only explicitly: the default mode depends on the h5py
    # version, and older releases opened files read/write.
    with h5py.File('{}/{}/{}'.format(ROOT_DIR, i, 'raw.hdf5'), 'r') as f:
        fe = f['features'][()].astype('str')
    print(i, ':', fe[-10:])

2706 : ['ENSG00000288616' 'ENSG00000288631' 'ENSG00000288642' 'ENSG00000288649'
 'ENSG00000288675' 'ENSG00000288701' 'ENSG00000288702' 'ENSG00000288705'
 'ENSG00000288709' 'ENSG00000288722']
2846 : ['RP11-15K19.2' 'RP11-722G7.1' 'TIMM10B-1' 'CCDC7-1' 'ABC7-42404400C24.1'
 'KIF1BP' 'OVOS2' 'TMEM256-PLSCR3-1' 'TBC1D26-1' 'RP5-1187M17.10']
1958 : ['AC109135.1' 'AL592170.1' 'AL603926.1' 'AL162851.1' 'AL590523.1'
 'CT476828.1' 'AC145205.1' 'CU459201.1' 'AC002321.2' 'AC002321.1']
2694 : ['ZZZ3' 'RP11-442N24--B.1' 'AC013461.1' 'KIF1BP' 'OVOS2' 'CH17-212P11.5'
 'RP5-1187M17.10' 'TRAPPC2P1' 'LLNLR-245B6.1' 'LL22NC03-N95F10.1']
2344 : ['ENSG00000261841' 'ENSG00000212857' 'ENSG00000268953' 'ENSG00000197414'
 'ENSG00000241481' 'ENSG00000234972' 'ENSG00000267161' 'ENSG00000215616'
 'ENSG00000206114' 'ENSG00000267893']
2411 : ['Z97987.1' 'AL133417.1' 'AC091534.1' 'AL136088.1' 'AP003438.1'
 'AL663023.1' 'AL096772.1' 'AC114400.1' 'AC002384.1' 'AL136454.1']
3071 : ['RAMMET' 'LINC02001' 'FLJ45513' 'AC01