# find duplicate cells

This notebook uses the X counts to search for potentially duplicate cells in the Census.
Based upon that potential equality, it provides a variety of reports - currently it reports::
* duplicate primary: cells with more than one copy marked `is_primary_data==True`
* missing primary: report cells with zero copies marked `is_primary_data==True`

The cell equality method is simplisitic - a hash of the cell counts (expression) vector across all genes.
This will only detect exact copies.  Future enhancements may include a fuzzier definition of equality.

Caveat: CELLxGENE Census internal QC tool

In [1]:
import math
import xxhash  # https://github.com/ifduyue/python-xxhash
from typing import Literal

import ipywidgets

import cellxgene_census
import numpy as np
import pandas as pd

from _csr_iter import X_sparse_iter


"""
Configuration - pick the Census version and experiment to utilize
"""
census_version: str = (
    "latest"  # which Census version? Good options: latest, stable, or YYYY-MM-DD
)
experiment: Literal[
    "mus_musculus", "homo_sapiens"
] = "homo_sapiens"  # Which experiment? Pick one.

In [2]:
row_stride = 100_000  # row partition

with cellxgene_census.open_soma(census_version=census_version) as census:
    exp = census["census_data"][experiment]
    with exp.axis_query(measurement_name="RNA") as query:
        # Notebook progress bar
        display(
            prog := ipywidgets.IntProgress(
                value=0,
                min=0,
                max=math.ceil(query.n_obs / row_stride),
                description="Hashing:",
            )
        )

        obs_df = (
            query.obs(column_names=["dataset_id", "soma_joinid", "is_primary_data"])
            .concat()
            .to_pandas()
            .set_index("soma_joinid")
        )
        hashes = pd.Series(data=np.full((len(obs_df),), ""), index=obs_df.index)

        for (obs_soma_joinids_chunk, _), X_chunk in X_sparse_iter(
            query, X_name="raw", row_stride=row_stride
        ):
            for r, row_soma_joinid in enumerate(obs_soma_joinids_chunk):
                row_counts = X_chunk.getrow(r)
                hash = xxhash.xxh3_128(row_counts.data.tobytes())
                hash.update(row_counts.indptr)
                hashes.at[row_soma_joinid] = hash.hexdigest()

            prog.value += 1

obs_df.insert(0, "hash", hashes)

The "latest" release is currently 2023-06-20. Specify 'census_version="2023-06-20"' in future calls to open_soma() to ensure data consistency.


IntProgress(value=0, description='Hashing:', max=573)

In [None]:
"""Compute a summary pivot on the hash and is_primary_data"""
hash_primary_pivot = (
    obs_df.value_counts(subset=["hash", "is_primary_data"])
    .to_frame()
    .reset_index()
    .pivot_table(index="hash", columns="is_primary_data", values="count", fill_value=0)
)

In [None]:
"""
Case 1 - hashes lacking a cell marked primary. A hash/vector exists, but has no corresponding obs DataFrame record with is_primary_data == True.
"""
obs_missing_primary = (
    obs_df.reset_index()
    .set_index("hash")
    .loc[hash_primary_pivot[hash_primary_pivot.loc[:, True] == 0].index]
    .reset_index()
    .set_index("soma_joinid")
)
obs_missing_primary

Unnamed: 0_level_0,hash,dataset_id,is_primary_data
soma_joinid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2545877,0000946df15c23841112a0d2b3117f17,07f14e26-ff0d-43c4-bfe3-bf1a94dc73c3,False
2903707,0000b7a8b53b193148ffd8adee14102e,20634fa3-f3cf-44b5-8bc3-b825610bfe8c,False
2948140,0000b7a8b53b193148ffd8adee14102e,2aef80da-acb4-4e15-8f7d-6c0322b86b2f,False
4469682,0000dab9938013d8b0cd9a49f122259d,5e765f97-1cf1-407e-a86c-e28701f4749d,False
2692708,0000efe5d0c76816a8a76a62c003f091,cb5efdb0-f91c-4cbd-9ad4-9d4fa41c572d,False
...,...,...,...
2913367,ffff83f54c68d0e2a25db8202cdf60d4,20634fa3-f3cf-44b5-8bc3-b825610bfe8c,False
2959130,ffff83f54c68d0e2a25db8202cdf60d4,2aef80da-acb4-4e15-8f7d-6c0322b86b2f,False
4457786,ffffcbcd974730a5ebeb685c70a14442,5e765f97-1cf1-407e-a86c-e28701f4749d,False
4408292,ffffe45333d642f56e6edd9a986d9a75,50d79de5-bd17-4d14-a295-199d71ff56be,False


In [None]:
"""
Datasets containing a hash (cell) that lacks any copies with is_primary_data==True
I.e., all copies are marked "not primary"
"""
obs_missing_primary.value_counts(subset=["dataset_id"])

dataset_id                          
a9affc92-a291-4eb9-996f-147392132323    159738
cb5efdb0-f91c-4cbd-9ad4-9d4fa41c572d     93829
5e765f97-1cf1-407e-a86c-e28701f4749d     71183
2aef80da-acb4-4e15-8f7d-6c0322b86b2f     47435
20634fa3-f3cf-44b5-8bc3-b825610bfe8c     42905
50d79de5-bd17-4d14-a295-199d71ff56be     40166
dbb4e1ed-d820-4e83-981f-88ef7eb55a35     35945
c88e2a9c-72b8-4a88-a2f6-e428eada0c86     19093
9bb9596d-f23f-4558-912f-d4dc7d52721b     15511
42ff5b55-b848-4f4c-b7cb-b8aac107841c     12083
07f14e26-ff0d-43c4-bfe3-bf1a94dc73c3      7011
d622cee4-56e1-44ba-8b05-fd2f0f2032e6      6288
1304e107-0f06-4d33-b634-d95ed986d02b      6171
5695d556-974e-4d92-9e99-5f61b8695313      5488
a810e511-c18b-4b2a-8fdf-98a6a0d433a7      1647
be46dfdc-0f99-4731-8957-64ca37364985        22
Name: count, dtype: int64

In [None]:
"""
Case 2 - hashes with more than one cell marked is_primary_data == True
"""
obs_duplicate_primary = (
    obs_df.reset_index()
    .set_index("hash")
    .loc[hash_primary_pivot[hash_primary_pivot.loc[:, True] > 1].index]
    .reset_index()
    .set_index("soma_joinid")
)
obs_duplicate_primary = obs_duplicate_primary[
    obs_duplicate_primary.is_primary_data == True
]
obs_duplicate_primary

Unnamed: 0_level_0,is_primary_data,dataset_id,hash
soma_joinid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


In [None]:
"""
Datasets with duplicate cells marked "primary"
"""
obs_duplicate_primary.value_counts(subset=["dataset_id"])

Series([], Name: count, dtype: int64)