# find duplicate cells

This notebook uses the X counts to search for potentially duplicate cells in the Census.
Based upon that potential equality, it provides a variety of reports - currently it reports::
* duplicate primary: cells with more than one copy marked `is_primary_data==True`
* missing primary: report cells with zero copies marked `is_primary_data==True`

The cell equality method is simplisitic - a hash of the cell counts (expression) vector across all genes.
This will only detect exact copies.  Future enhancements may include a fuzzier definition of equality.

Caveat: CELLxGENE Census internal QC tool

In [1]:
import math
import xxhash  # https://github.com/ifduyue/python-xxhash
from typing import Literal

import ipywidgets

import cellxgene_census
import numpy as np
import pandas as pd

from _csr_iter import X_sparse_iter


"""
Configuration - pick the Census version and experiment to utilize
"""
census_version: str = (
    "latest"  # which Census version? Good options: latest, stable, or YYYY-MM-DD
)
experiment: Literal[
    "mus_musculus", "homo_sapiens"
] = "homo_sapiens"  # Which experiment? Pick one.

In [2]:
row_stride = 100_000  # row partition

with cellxgene_census.open_soma(census_version=census_version) as census:
    exp = census["census_data"][experiment]
    with exp.axis_query(measurement_name="RNA") as query:
        # Notebook progress bar
        display(
            prog := ipywidgets.IntProgress(
                value=0,
                min=0,
                max=math.ceil(query.n_obs / row_stride),
                description="Hashing:",
            )
        )

        obs_df = (
            query.obs(column_names=["dataset_id", "soma_joinid", "is_primary_data"])
            .concat()
            .to_pandas()
            .set_index("soma_joinid")
        )
        hashes = pd.Series(data=np.full((len(obs_df),), ""), index=obs_df.index)

        for (obs_soma_joinids_chunk, _), X_chunk in X_sparse_iter(
            query, X_name="raw", row_stride=row_stride
        ):
            for r, row_soma_joinid in enumerate(obs_soma_joinids_chunk):
                row_counts = X_chunk.getrow(r)
                hash = xxhash.xxh3_128(row_counts.data.tobytes())
                hash.update(row_counts.indptr)
                hashes.at[row_soma_joinid] = hash.hexdigest()

            prog.value += 1

obs_df.insert(0, "hash", hashes)

The "latest" release is currently 2023-06-20. Specify 'census_version="2023-06-20"' in future calls to open_soma() to ensure data consistency.


IntProgress(value=0, description='Hashing:', max=573)

In [3]:
"""Compute a summary pivot on the hash and is_primary_data"""
hash_primary_pivot = (
    obs_df.value_counts(subset=["hash", "is_primary_data"])
    .to_frame()
    .reset_index()
    .pivot_table(index="hash", columns="is_primary_data", values="count", fill_value=0)
)

In [4]:
"""
Case 1 - hashes lacking a cell marked primary. A hash/vector exists, but has no corresponding obs DataFrame record with is_primary_data == True.
"""
obs_missing_primary = (
    obs_df.reset_index()
    .set_index("hash")
    .loc[hash_primary_pivot[hash_primary_pivot.loc[:, True] == 0].index]
    .reset_index()
    .set_index("soma_joinid")
)
obs_missing_primary

Unnamed: 0_level_0,hash,dataset_id,is_primary_data
soma_joinid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7140178,0000024c59f0cfc3f81d49ab01846aad,9f222629-9e39-47d0-b83f-e08d610c7479,False
54711262,0000069e9322c5f54da7a90d10be4b00,572f3f3e-d3e4-4d13-8e2b-88215e508481,False
40024756,0000075334599b48fc57456ee324173a,066943a2-fdac-4b29-b348-40cede398e4e,False
47467096,0000075334599b48fc57456ee324173a,f72958f5-7f42-4ebb-98da-445b0c6de516,False
33031351,000008d404e59342106d535498dee929,1252c5fb-945f-42d6-b1a8-8a3bd864384b,False
...,...,...,...
32109753,fffff6bf5b0d4625e23b01301b239317,83b5e943-a1d5-4164-b3f2-f7a37f01b524,False
29819042,fffff9e56b8512de26c244c085cd0307,c888b684-6c51-431f-972a-6c963044cef0,False
45637022,fffffac078fa7c74d31146b63d1fe0d0,88c483bf-477d-4be5-90d3-4fb101dd601f,False
45663617,fffffac078fa7c74d31146b63d1fe0d0,8b2e5453-faf7-46ea-9073-aea69b283cb7,False


In [5]:
"""
Datasets containing a hash (cell) that lacks any copies with is_primary_data==True
I.e., all copies are marked "not primary"
"""
obs_missing_primary.value_counts(subset=["dataset_id"])

dataset_id                          
9f222629-9e39-47d0-b83f-e08d610c7479    1096207
066943a2-fdac-4b29-b348-40cede398e4e     584944
f72958f5-7f42-4ebb-98da-445b0c6de516     584884
fd072bc3-2dfb-46f8-b4e3-467cb3223182     562562
48101fa2-1a63-4514-b892-53ea1d3a8657     397255
                                         ...   
98113e7e-f586-4065-a26a-60aa702f8d1c          2
e6b2ce27-681b-4409-a053-2681875936e5          1
2190bd4d-3be0-4bf7-8ca8-8d6f71228936          1
04a23820-ffa8-4be5-9f65-64db15631d1e          1
c202b243-1aa1-4b16-bc9a-b36241f3b1e3          1
Name: count, Length: 152, dtype: int64

In [6]:
"""
Case 2 - hashes with more than one cell marked is_primary_data == True
"""
obs_duplicate_primary = (
    obs_df.reset_index()
    .set_index("hash")
    .loc[hash_primary_pivot[hash_primary_pivot.loc[:, True] > 1].index]
    .reset_index()
    .set_index("soma_joinid")
)
obs_duplicate_primary = obs_duplicate_primary[
    obs_duplicate_primary.is_primary_data == True
]
obs_duplicate_primary

Unnamed: 0_level_0,hash,dataset_id,is_primary_data
soma_joinid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
13983736,0000bf62f69ca9af0ddb06c35ac8a5e4,e6dad530-418b-47f9-af6e-472e56a7b314,True
14082223,0000bf62f69ca9af0ddb06c35ac8a5e4,389bfbb9-8ef1-4582-8c41-410131c3d0eb,True
13991172,0000d427423f870a79c05aaf8e4af4b9,e6dad530-418b-47f9-af6e-472e56a7b314,True
14085698,0000d427423f870a79c05aaf8e4af4b9,389bfbb9-8ef1-4582-8c41-410131c3d0eb,True
20489088,0000de56bf406faf7c29caa8dad3df61,715327a6-7978-4896-ba91-69d6b04dbbfb,True
...,...,...,...
14057182,ffff659a02a704e201e7a22654f75a5a,389bfbb9-8ef1-4582-8c41-410131c3d0eb,True
24049304,ffff844c31a8c0cb2e1c0f4703ba66b5,a43aa46b-bd16-47fe-bc3e-19a052624e79,True
42060013,ffff844c31a8c0cb2e1c0f4703ba66b5,ddb22b3d-a75c-4dd1-9730-dff7fc8ca530,True
13966367,ffffc9c6d73e4ad4efdc30eb864b7b41,e6dad530-418b-47f9-af6e-472e56a7b314,True


In [7]:
"""
Datasets with duplicate cells marked "primary"
"""
obs_duplicate_primary.value_counts(subset=["dataset_id"])

dataset_id                          
e6dad530-418b-47f9-af6e-472e56a7b314    98326
389bfbb9-8ef1-4582-8c41-410131c3d0eb    62509
4e38f019-f8e8-44ae-ad32-ba500de6f64c    16540
2ef8f3ce-bbff-447b-9e51-567e5d6c47bd    10411
715327a6-7978-4896-ba91-69d6b04dbbfb    10411
ab5b2256-b209-48b5-a801-c5d9a8c0de56     8193
f8c77961-67a7-4161-b8c2-61c3f917b54f     6101
b252b015-b488-4d5c-b16e-968c13e48a2c     4886
ddb22b3d-a75c-4dd1-9730-dff7fc8ca530     3123
a43aa46b-bd16-47fe-bc3e-19a052624e79     3123
c3d381b2-3104-444e-8ad5-d3524407bbb6     1875
cec9f9a5-8832-437d-99af-fb8237cde54b     1777
9cfee1e6-b24f-433d-a269-f01841655d6a     1635
d95ab381-2b7c-4885-b168-0097ed4e397f     1378
7970bd6b-f752-47a9-8643-2af16855ec49      441
44882825-0da1-4547-b721-2c6105d4a9d1      441
de17ac25-550a-4018-be75-bbb485a0636e      395
4b6af54a-4a21-46e0-bc8d-673c0561a836      262
214bf9eb-93db-48c8-8e3c-9bb22fa3bc63      262
d7dcfd8f-2ee7-4385-b9ac-e074c23ed190       12
6a270451-b4d9-43e0-aa89-e33aac1ac74b       