# find duplicate cells

This notebook uses the X counts to search for potentially duplicate cells in the Census.
Based upon that potential equality, it provides a variety of reports - currently it reports:
* duplicate primary: cells with more than one copy marked `is_primary_data==True`
* missing primary: report cells with zero copies marked `is_primary_data==True`

The cell equality method is simplisitic - a hash of the cell counts (expression) vector across all genes.
This will only detect exact copies.  Future enhancements may include a fuzzier definition of equality.

Caveat: CELLxGENE Census internal QC tool

In [2]:
import math
from typing import Literal

import cellxgene_census
import ipywidgets
import numpy as np
import pandas as pd
import xxhash  # https://github.com/ifduyue/python-xxhash

"""
Configuration - pick the Census version and experiment to utilize
"""
census_version: str = "2023-07-25"  # which Census version? Good options: latest, stable, or YYYY-MM-DD
experiment: Literal["mus_musculus", "homo_sapiens"] = "homo_sapiens"  # Which experiment? Pick one.

In [3]:
"""
Open Census, grab various data used for reports, and then calculate hashes for all cells.
"""

row_stride = 250_000  # row partition

with cellxgene_census.open_soma(census_version=census_version) as census:
    # Used for reporting
    datasets_df = census["census_info"]["datasets"].read().concat().to_pandas().drop(columns=["soma_joinid"])

    # Calculate all per-cell hashes
    exp = census["census_data"][experiment]
    with exp.axis_query(measurement_name="RNA") as query:
        # Notebook progress bar
        display(
            prog := ipywidgets.IntProgress(
                value=0,
                min=0,
                max=math.ceil(query.n_obs / row_stride),
                description="Hashing:",
            )
        )

        obs_df = (
            query.obs(column_names=["dataset_id", "soma_joinid", "is_primary_data"])
            .concat()
            .to_pandas()
            .set_index("soma_joinid")
        )
        hashes = pd.Series(data=np.full((len(obs_df),), ""), index=obs_df.index)

        for X_chunk, (obs_soma_joinids_chunk, _) in query.X("raw").blockwise(axis=0, size=row_stride):
            for r, row_soma_joinid in enumerate(obs_soma_joinids_chunk):
                X_row = X_chunk.getrow(r)
                hash = xxhash.xxh3_128(X_row.data.tobytes())
                hash.update(X_row.indptr)
                hashes.at[row_soma_joinid] = hash.hexdigest()

            prog.value += 1

        obs_df.insert(0, "hash", hashes)

IntProgress(value=0, description='Hashing:', max=226)

In [5]:
"""Compute a summary pivot on the hash and is_primary_data"""
hash_primary_pivot = (
    obs_df.value_counts(subset=["hash", "is_primary_data"]).to_frame().reset_index()
    # somehow values="count" retrieves an error since the "count" column gets rename to 0
    # .pivot_table(index="hash", columns="is_primary_data", values="count", fill_value=0)
)
hash_primary_pivot

Unnamed: 0,hash,is_primary_data,0
0,047759c09e89a3a24f7730a7d5a75e3f,False,7
1,6a2a56c933a1c4c8f169433084cfd447,False,7
2,c30248b10901ac310613dbdf62f226b2,False,7
3,4763a40e9b58af279223805424f97f4e,False,7
4,292eaa43f32ac8ec698effbf5f2426ee,False,7
...,...,...,...
49568988,5afc0fcbb6efe59eccaf510697f9f1d9,True,1
49568989,5afc106b42528d2409f08862dede1f47,False,1
49568990,5afc10fd59513a73cdf08e377ceb3bcb,True,1
49568991,5afc116fa6283f44bca5ac74fec10432,False,1


In [8]:
hash_primary_pivot = hash_primary_pivot.pivot_table(index="hash", columns="is_primary_data", values=0, fill_value=0)

In [9]:
"""
Case 1 - hashes lacking a cell marked primary. A hash/vector exists, but has no corresponding obs DataFrame record with is_primary_data == True.
"""
obs_missing_primary = (
    obs_df.reset_index()
    .set_index("hash")
    .loc[hash_primary_pivot[hash_primary_pivot.loc[:, True] == 0].index]
    .reset_index()
    .set_index("soma_joinid")
)
obs_missing_primary

Unnamed: 0_level_0,hash,dataset_id,is_primary_data
soma_joinid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7472165,00000516855c5c79b1469067b78b0669,9f222629-9e39-47d0-b83f-e08d610c7479,False
54066398,00000658dc3e5c666f4bc1a40ca2322d,78fd69d2-75e4-4207-819a-563139f273c6,False
32185451,00000cb7d5c9a285d5524ecbc655ccde,1252c5fb-945f-42d6-b1a8-8a3bd864384b,False
38760047,00000e87404cdf81c8cd634d78c892eb,105c7dad-0468-4628-a5be-2bb42c6a8ae4,False
26387480,00000ed73794bb609845ce81966b6626,80a2c5b6-02e7-4fc0-9f12-179f5247c1bc,False
...,...,...,...
39042730,ffffe039acc304261379960376515935,066943a2-fdac-4b29-b348-40cede398e4e,False
46485086,ffffe039acc304261379960376515935,f72958f5-7f42-4ebb-98da-445b0c6de516,False
14946384,ffffed20f0eb0663eda37218ee0da288,e067e5ca-e53e-485f-aa8e-efd5435229c8,False
7983322,fffff8eb5dfb82db7e59b15b24581d03,9f222629-9e39-47d0-b83f-e08d610c7479,False


In [10]:
"""
Datasets containing a hash (cell) that lacks any copies with is_primary_data==True
I.e., all copies are marked "not primary"
"""
datasets_with_missing_primary = (
    obs_missing_primary.value_counts(subset=["dataset_id"])
    .to_frame()
    .rename(columns={"count": "dup_cell_count"})
    .join(datasets_df.set_index("dataset_id"), on="dataset_id")
    .reset_index()
)
datasets_with_missing_primary

Unnamed: 0,dataset_id,0,collection_id,collection_name,collection_doi,dataset_title,dataset_h5ad_path,dataset_total_cell_count
0,9f222629-9e39-47d0-b83f-e08d610c7479,1096207,6f6d381a-7701-4781-935c-db10d30de293,The integrated Human Lung Cell Atlas,10.1038/s41591-023-02327-2,An integrated cell atlas of the human lung in ...,9f222629-9e39-47d0-b83f-e08d610c7479.h5ad,2282447
1,066943a2-fdac-4b29-b348-40cede398e4e,584944,6f6d381a-7701-4781-935c-db10d30de293,The integrated Human Lung Cell Atlas,10.1038/s41591-023-02327-2,An integrated cell atlas of the human lung in ...,066943a2-fdac-4b29-b348-40cede398e4e.h5ad,584944
2,f72958f5-7f42-4ebb-98da-445b0c6de516,584884,2f75d249-1bec-459b-bf2b-b86221097ced,Azimuth meta-analysis of human scRNA-seq datasets,,Human - Lung v2 (HLCA),f72958f5-7f42-4ebb-98da-445b0c6de516.h5ad,584884
3,fd072bc3-2dfb-46f8-b4e3-467cb3223182,562562,b1a879f6-5638-48d3-8f64-f6592c1b1561,Mapping the developing human immune system acr...,10.1126/science.abo0510,Full dataset of single-cell RNA-seq profiles f...,fd072bc3-2dfb-46f8-b4e3-467cb3223182.h5ad,908046
4,48101fa2-1a63-4514-b892-53ea1d3a8657,397255,b1a879f6-5638-48d3-8f64-f6592c1b1561,Mapping the developing human immune system acr...,10.1126/science.abo0510,HSC/immune cells (all hematopoietic-derived ce...,48101fa2-1a63-4514-b892-53ea1d3a8657.h5ad,589390
...,...,...,...,...,...,...,...,...
150,98113e7e-f586-4065-a26a-60aa702f8d1c,2,283d65eb-dd53-496d-adb7-7570c7caa443,Transcriptomic diversity of cell types across ...,10.1101/2022.10.12.511898,Supercluster: Deep-layer intratelencephalic,98113e7e-f586-4065-a26a-60aa702f8d1c.h5ad,228467
151,2190bd4d-3be0-4bf7-8ca8-8d6f71228936,1,283d65eb-dd53-496d-adb7-7570c7caa443,Transcriptomic diversity of cell types across ...,10.1101/2022.10.12.511898,Supercluster: Midbrain-derived inhibitory,2190bd4d-3be0-4bf7-8ca8-8d6f71228936.h5ad,126782
152,e6b2ce27-681b-4409-a053-2681875936e5,1,283d65eb-dd53-496d-adb7-7570c7caa443,Transcriptomic diversity of cell types across ...,10.1101/2022.10.12.511898,Supercluster: Eccentric medium spiny neuron,e6b2ce27-681b-4409-a053-2681875936e5.h5ad,40144
153,c202b243-1aa1-4b16-bc9a-b36241f3b1e3,1,283d65eb-dd53-496d-adb7-7570c7caa443,Transcriptomic diversity of cell types across ...,10.1101/2022.10.12.511898,Supercluster: Amygdala excitatory,c202b243-1aa1-4b16-bc9a-b36241f3b1e3.h5ad,109452


In [11]:
"""
Case 2 - hashes with more than one cell marked is_primary_data == True
"""
obs_duplicate_primary = (
    obs_df.reset_index()
    .set_index("hash")
    .loc[hash_primary_pivot[hash_primary_pivot.loc[:, True] > 1].index]
    .reset_index()
    .set_index("soma_joinid")
)
obs_duplicate_primary = obs_duplicate_primary[obs_duplicate_primary.is_primary_data is True]
obs_duplicate_primary

Unnamed: 0_level_0,hash,dataset_id,is_primary_data
soma_joinid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


In [12]:
"""
Datasets with duplicate cells marked "primary"
"""
datasets_with_dup_primary = (
    obs_duplicate_primary.value_counts(subset=["dataset_id"])
    .to_frame()
    .rename(columns={"count": "dup_cell_count"})
    .join(datasets_df.set_index("dataset_id"), on="dataset_id")
    .reset_index()
)
datasets_with_dup_primary

Unnamed: 0,dataset_id,0,collection_id,collection_name,collection_doi,dataset_title,dataset_h5ad_path,dataset_total_cell_count


In [13]:
"""
For each duplicated hash (cell), find the datasets that overlap/contain the duplicate.

In other words, these are all of the datasets which have intersecting duplicate/primary hashes.
"""
overlapping_dup_datasets = np.unique(
    obs_duplicate_primary.sort_values("dataset_id")[["hash", "dataset_id"]].groupby(by="hash").agg(list)
)

display(overlapping_dup_datasets)

for d in overlapping_dup_datasets:
    display(datasets_df.set_index("dataset_id").loc[d])

array([], dtype=object)