In [1]:
import os
import re

import pandas as pd
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
import anndata

In [5]:
import scrublet as scr

In [6]:
# Dependency: scrublet. If it is missing, install it into the kernel's environment with: %pip install scrublet

In [7]:
# Load the count-matrix AnnData object from disk; the bare name on the last
# line displays its summary as the cell output.
# Original author's note: "raw counts are raw.X".
# NOTE(review): the Scrublet cell further down reads `.X`, not `.raw.X` —
# confirm which of the two actually holds the raw (unnormalized) counts.
counts_adata = sc.read_h5ad('../data/single_cell/original/count_matrix/counts.h5ad')
counts_adata

AnnData object with n_obs × n_vars = 323120 × 34985
    obs: 'sample_id', 'dataset', 'tissue_type'

In [8]:
# Display the per-cell annotation table (sample_id, dataset, tissue_type).
counts_adata.obs

Unnamed: 0_level_0,sample_id,dataset,tissue_type
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
htan01_AAACCTGAGACCTAGG-1,htan01,htan_surgical,tumor_primary
htan01_AAACCTGAGTGCGTGA-1,htan01,htan_surgical,tumor_primary
htan01_AAACCTGCAATCGGTT-1,htan01,htan_surgical,tumor_primary
htan01_AAACCTGCACCGTTGG-1,htan01,htan_surgical,tumor_primary
htan01_AAACCTGCATCCCACT-1,htan01,htan_surgical,tumor_primary
...,...,...,...
G9903_TTTGTCAAGTTGTCGT-1,G9903,chan-seng-yue,tumor_primary
G9903_TTTGTCACAACTTGAC-1,G9903,chan-seng-yue,tumor_primary
G9903_TTTGTCACATATGGTC-1,G9903,chan-seng-yue,tumor_primary
G9903_TTTGTCATCCGAACGC-1,G9903,chan-seng-yue,tumor_primary


In [16]:
# List every distinct sample id, alphabetically sorted.
sorted(counts_adata.obs['sample_id'].unique())

['100070',
 '85948',
 '87235',
 '87784',
 '90209_CMP',
 '91412',
 '91610',
 '91706',
 '94930',
 '95092',
 '95373',
 '96460',
 '97727',
 'COMP_0158_P',
 'G9903',
 'MET01',
 'MET02',
 'MET03',
 'MET04',
 'MET05',
 'MET06',
 'N1',
 'N10',
 'N11',
 'N2',
 'N3',
 'N4',
 'N5',
 'N6',
 'N7',
 'N8',
 'N9',
 'P01',
 'P02',
 'P03',
 'P04',
 'P05',
 'P06',
 'P07',
 'P08',
 'P09',
 'P10',
 'T1',
 'T10',
 'T11',
 'T12',
 'T13',
 'T14',
 'T15',
 'T16',
 'T17',
 'T18',
 'T19',
 'T2',
 'T20',
 'T21',
 'T22',
 'T23',
 'T24',
 'T3',
 'T4',
 'T5',
 'T6',
 'T7',
 'T8',
 'T9',
 'htan01',
 'htan02',
 'htan03',
 'htan04',
 'htan05',
 'htan06',
 'wu01',
 'wu02',
 'wu03',
 'wu04',
 'wu05',
 'wu07',
 'wu09',
 'wu14',
 'wu15',
 'wu16',
 'wu18',
 'wu22',
 'wu23',
 'wu24',
 'wu25',
 'wu26',
 'wu27',
 'wu28',
 'wu29',
 'wu30',
 'wu31',
 'wu32',
 'wu38',
 'wu39',
 'wu42']

In [20]:
# Run Scrublet separately on every sample and record, for each cell id, a
# (doublet_score, predicted_doublet) tuple in `cell_to_doublet_score`.
#
# `scrub_doublets()` returns None for the predictions when no doublet calls
# were produced for a sample; those samples are called with the fixed
# fallback threshold below instead.
FALLBACK_DOUBLET_THRESHOLD = .5

cell_to_doublet_score = {}
for sample_id in sorted(set(counts_adata.obs['sample_id'])):
    print(sample_id)
    sample_adata = counts_adata[counts_adata.obs['sample_id'] == sample_id]
    # Pass the matrix as stored: Scrublet accepts a scipy sparse matrix or an
    # ndarray, so densifying with .toarray() only costs memory (and would
    # fail outright if X were already a dense array).
    scrub = scr.Scrublet(sample_adata.X)
    doublet_scores, predicted_doublets = scrub.scrub_doublets()
    if predicted_doublets is None:
        # No automatic calls for this sample: threshold the scores ourselves.
        predicted_doublets = [score > FALLBACK_DOUBLET_THRESHOLD for score in doublet_scores]
    for cell_id, score, is_predicted in zip(sample_adata.obs.index.to_list(),
                                            doublet_scores,
                                            predicted_doublets):
        cell_to_doublet_score[cell_id] = (score, is_predicted)

wu26
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Elapsed time: 2.1 seconds
wu27
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.60
Detected doublet rate = 0.2%
Estimated detectable doublet fraction = 6.4%
Overall doublet rate:
	Expected   = 10.0%
	Estimated  = 2.6%
Elapsed time: 2.4 seconds
wu28
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.64
Detected doublet rate = 0.1%
Estimated detectable doublet fraction = 6.6%
Overall doublet rate:
	Expected   = 10.0%
	Estimated  = 1.0%
Elapsed time: 3.0 seconds
wu29
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.58
Detected doublet rate = 0.3%
Estimated detec

In [21]:
# Attach the per-cell Scrublet results to the annotation table: the numeric
# score, plus a 'yes'/'no' label derived from the boolean prediction.
cell_ids = counts_adata.obs.index.to_list()
score_column = []
label_column = []
for cell_id in cell_ids:
    score, predicted = cell_to_doublet_score[cell_id]
    score_column.append(score)
    label_column.append('yes' if predicted else 'no')

counts_adata.obs['doublet_score'] = score_column
counts_adata.obs['is_doublet'] = label_column

In [22]:
# Sanity check: show which labels actually occur in the is_doublet column.
set(counts_adata.obs['is_doublet'].unique())

{'no', 'yes'}

In [23]:
# Keep only the two doublet columns (indexed by cell id) and display them.
doublet_columns = ['doublet_score', 'is_doublet']
df = counts_adata.obs.loc[:, doublet_columns]
df

Unnamed: 0_level_0,doublet_score,is_doublet
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1
htan01_AAACCTGAGACCTAGG-1,0.056356,no
htan01_AAACCTGAGTGCGTGA-1,0.126582,no
htan01_AAACCTGCAATCGGTT-1,0.019643,no
htan01_AAACCTGCACCGTTGG-1,0.074130,no
htan01_AAACCTGCATCCCACT-1,0.061728,no
...,...,...
G9903_TTTGTCAAGTTGTCGT-1,0.027542,no
G9903_TTTGTCACAACTTGAC-1,0.059507,no
G9903_TTTGTCACATATGGTC-1,0.109278,no
G9903_TTTGTCATCCGAACGC-1,0.098266,no


In [24]:
# Write the doublet score / label table as a tab-separated file; the cell-id
# index is written as the first column.
df.to_csv('../data/single_cell/original/count_matrix/doublets.txt', sep='\t')

In [25]:
# Save the AnnData object, now carrying the doublet_score and is_doublet obs
# columns, to a new .h5ad file next to the original counts file.
counts_adata.write_h5ad('../data/single_cell/original/count_matrix/counts_with_doublet_score.h5ad')