In [1]:
import json

import numpy as np
import scipy
import scipy.sparse

In [None]:
def _create_gene_idx_mapping(alias_json, features_arr):
	"""Map the index of each alias gene to the index of its replacement gene.

	Names are compared case-insensitively: feature names as well as the alias
	and replacement names read from the JSON file are upper-cased before
	matching, so a mixed-case name such as ``C1orf112`` still matches.

	Args:
		alias_json: Path to a JSON file holding an object of the form
			``{alias_name: replaced_name}``.
		features_arr: Sequence of gene-name strings; the position of a name
			in the sequence is its index.

	Returns:
		dict ``{alias_idx: replaced_idx}`` with one entry per alias pair for
		which both the alias and the replacement occur in ``features_arr``.
		Pairs with a missing name are skipped.
	"""
	# When a name occurs more than once, the last position wins.
	features_arr_dict = {name.upper(): idx for idx, name in enumerate(features_arr)}

	# Context manager so the file handle is closed even if parsing fails.
	with open(alias_json) as alias_file:
		gene_alias_dict = json.load(alias_file)

	# {alias_idx : replaced_idx}
	gene_idx_alias_dict = dict()
	for alias_name, replaced_name in gene_alias_dict.items():
		alias_idx = features_arr_dict.get(alias_name.upper())
		replaced_idx = features_arr_dict.get(replaced_name.upper())
		if alias_idx is None or replaced_idx is None:
			continue
		gene_idx_alias_dict[alias_idx] = replaced_idx

	return gene_idx_alias_dict


## Input: genes x cells sparse matrix holding raw (integer) data
## Output: matrix with the same shape as the input matrix
def merge_alias_gene(gxc_csr, features_arr, alias_json):
	"""Add the row of every alias gene onto the row of its replacement gene.

	Row ``i`` of the input is added to output row ``mapping[i]`` when ``i``
	is an alias index, and to output row ``i`` otherwise. The merge is a
	single level: if a replacement gene is itself listed as an alias, its
	own input row moves on to its replacement, but rows that were merged
	into it stay where they are. An alias row that receives nothing ends up
	empty.

	Args:
		gxc_csr: Sparse matrix with genes on axis 0 and cells on axis 1.
		features_arr: Sequence of gene-name strings, one per matrix row.
		alias_json: Path to a JSON file of ``{alias_name: replaced_name}``.

	Returns:
		``scipy.sparse.csr_matrix`` of dtype float64 with the same shape as
		the input, sorted column indices and no explicitly stored zeros.

	Raises:
		IndexError: If an alias row inside the matrix maps to a replacement
			index that lies outside the matrix rows.
	"""
	gene_idx_alias_dict = _create_gene_idx_mapping(alias_json, features_arr)

	n_genes, n_cells = gxc_csr.shape

	# target_row[i] is the output row that receives input row i.
	target_row = np.arange(n_genes)
	for alias_idx, replaced_idx in gene_idx_alias_dict.items():
		if alias_idx >= n_genes:
			# The name list is longer than the matrix; no such row to move.
			continue
		if replaced_idx >= n_genes:
			raise IndexError(
				"replacement gene index %d is out of range for %d matrix rows"
				% (replaced_idx, n_genes)
			)
		target_row[alias_idx] = replaced_idx

	# 0/1 operator with exactly one entry per input row: left-multiplying by
	# it sums all input rows that share a target row, without densifying.
	source_row = np.arange(n_genes)
	merge_op = scipy.sparse.csr_matrix(
		(np.ones(n_genes, dtype=np.float64), (target_row, source_row)),
		shape=(n_genes, n_genes)
	)

	merged = scipy.sparse.csr_matrix(merge_op.dot(gxc_csr), dtype=np.float64)
	# Drop entries that are (or summed to) zero and keep indices ordered.
	merged.eliminate_zeros()
	merged.sort_indices()
	return merged

In [None]:
# Load the gene-update JSON; the context manager closes the file handle
# as soon as parsing finishes (or fails).
with open('/data/sonvo/gene_mapping/update_human.json') as alias_file:
	alias_gene_dict = json.load(alias_file)