In [1]:
import numpy as np

In [None]:
## Build a per-cell signature array [n_expressed_genes, sum_counts] used to pre-screen duplicated cells.
## Comparing these two numbers first reduces the scanning time.
def _scan_duplications(gxc_csc):
	duplicated_idx = np.zeros(
		shape=(gxc_csc.shape[1], 2),
		dtype=np.uint64
	)

	for i in range(gxc_csc.indptr.shape[0] - 1):
		data = gxc_csc.data[
			gxc_csc.indptr[i] : gxc_csc.indptr[i+1]
		]
		duplicated_idx[i, 0] = data.shape[0]
		duplicated_idx[i, 1] = np.sum(data)

	return duplicated_idx


def _group_cells_arr(data_arr):
	_, tmp_idx, tmp_counts = np.unique(
		data_arr,
		return_counts=True,
		return_inverse=True,
		axis=0
	)
	res = []
	if np.sum(tmp_counts > 1) == 0:
		return res
	for i in np.nonzero(tmp_counts > 1)[0]:
		res.append(
			np.nonzero(tmp_idx==i)[0]
		)
	return res


def find_duplicated_cells(gxc_csc):
	"""Find groups of columns of a CSC matrix that store identical entries.

	Columns are first bucketed by the cheap signature from
	``_scan_duplications`` (entry count and sum); only columns sharing a
	signature are compared in full, first on their stored values and then on
	their stored row indices.

	Parameters
	----------
	gxc_csc
		Compressed-sparse-column matrix exposing ``shape``, ``indptr``,
		``indices`` and ``data`` (per the file's comment: genes x cells).

	Returns
	-------
	list of numpy.ndarray
		One array of column indices per group of identical columns; an empty
		list when there are no duplicates.

	Notes
	-----
	Values and row indices are compared position by position in storage
	order, so two equal columns whose entries are stored in a different order
	are not matched.
	NOTE(review): if the input may have unsorted indices, sort them upstream.
	"""
	# Fewer than two columns cannot contain a duplicate.
	if gxc_csc.shape[1] < 2:
		return []

	signatures = _scan_duplications(gxc_csc)
	candidate_groups = _group_cells_arr(signatures)

	indptr = gxc_csc.indptr
	values = gxc_csc.data
	row_idx = gxc_csc.indices

	res = []
	for candidate_cells in candidate_groups:
		# All candidates share the same entry count, so their slices stack
		# into rectangular arrays. Slicing through indptr avoids building a
		# one-column sparse matrix for every cell.
		spans = [(indptr[c], indptr[c + 1]) for c in candidate_cells]
		cell_values = np.stack([values[a:b] for a, b in spans])
		cell_rows = np.stack([row_idx[a:b] for a, b in spans])

		for same_values in _group_cells_arr(cell_values):
			for same_rows in _group_cells_arr(cell_rows[same_values]):
				# `same_rows` indexes into `same_values`, which in turn
				# indexes into `candidate_cells`; both hops are needed to get
				# back to real column numbers.
				res.append(candidate_cells[same_values[same_rows]])

	return res