# Comparison of 10X Nucleus vs. Segger Monocyte Capture

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edited modules are
# re-imported automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [2]:
from requirements import *
from segger.data.parquet._utils import (
    read_parquet_region,
    filter_transcripts,
    load_settings,
)
from sg_utils.tl.xenium_utils import anndata_from_transcripts
from sg_utils.pp.preprocess_rapids import *
from sg_utils.pl.plot_embedding import plot_embedding
from sg_utils.pl.utils import lighten_color, get_color_palette
import celltypist as ct
import ast

In [3]:
# Dataset subdirectory under `data_dir`; the input/output paths below are built from it
dataset = 'xenium_colon'

## Aggregate Cell Type Markers from Literature

Because a Xenium panel contains a limited number of genes, cell type signatures from the literature are sparse and often missing genes critical to a signature's specificity. Below is a manually curated list of marker sets from three different sources, aggregated across cell types and filtered to remove meaningless signatures (i.e., those missing genes critical to a gene set's specificity). The notebook cells describe the process used to curate the marker sets; the final marker set is also available as a supplementary table.

## Create and Annotate AnnData from Transcripts

### Transcripts to AnnData

In [4]:
# Load the complete labeled transcript table for the selected dataset
transcripts_filepath = data_dir / dataset / 'labeled_transcripts.parquet'
transcripts = pd.read_parquet(transcripts_filepath)

# Drop control probes and low-QV transcripts. The label column and the
# substrings to filter on come from the settings loaded for 'xenium'; the
# QV cutoff is fixed at 25 here.
xe_settings = load_settings('xenium')
filter_kwargs = dict(
    label=xe_settings.transcripts.label,
    filter_substrings=xe_settings.transcripts.filter_substrings,
    min_qv=25,
)
transcripts = filter_transcripts(transcripts, **filter_kwargs)

In [7]:
# Build one AnnData per segmentation from the filtered transcripts, write the
# raw object, preprocess it, attach a 10X cell ID to every entry, and write
# the processed object.

# Make sure the output directory exists; the h5ad writes below fail otherwise
h5ad_dir = data_dir / dataset / 'h5ads'
h5ad_dir.mkdir(parents=True, exist_ok=True)

# Each entry: (output name, transcript column holding the cell ID, row mask)
for name, seg_col, mask in [
    ('10x_cell', 'cell_id', np.full(transcripts.shape[0], True)),
    ('10x_nucleus', 'cell_id', transcripts['overlaps_nucleus'].eq(1)),
    # Subset Segger data to high-confidence transcripts
    ('segger', 'segger_cell_id_cxg_k=20', transcripts['score_cxg_k=20'].gt(0.5))
]:

    # Transcripts to anndata
    ad = anndata_from_transcripts(
        transcripts[mask],
        cell_label=seg_col,
        gene_label='feature_name',
        coordinate_labels=['x_location', 'y_location'],
    )
    # Add raw counts before filtering. The row sums are taken on the matrix
    # itself instead of on a dense copy (`.A`), so the full count matrix is
    # never materialized just to compute per-observation totals.
    ad.uns['raw_counts'] = dict(
        index=ad.obs.index.tolist(),
        count=np.asarray(ad.raw.X.sum(axis=1)).ravel(),
    )
    ad.write_h5ad(h5ad_dir / f'{name}_raw.h5ad')

    # Preprocess (return value is unused, so this presumably updates `ad`
    # in place -- confirm against preprocess_rapids)
    preprocess_rapids(
        ad,
        filter_min_counts=1,
        pca_total_var=0.95,
        umap_min_dist=0.2,
        umap_n_epochs=1000,
        pca_layer='norm',
        knn_neighbors=20,
        phenograph_resolution=4,
        umap_kwargs=dict(local_connectivity=2, init_pos='random'),
    )

    # Map 10X IDs across anndatas
    if name in ['10x_nucleus', '10x_cell', '10x_nucleus_lo_thresh']:
        # Built with cell_label='cell_id', so the obs index is reused as-is
        ad.obs['10x_id'] = ad.obs.index
    else:
        # For each Segger cell, pick the 10X cell_id it shares the most
        # transcripts with.
        # NOTE(review): the counts use all filtered transcripts, not only
        # transcripts[mask] -- confirm that including low-score rows here
        # is intended.
        counts = transcripts.value_counts([seg_col, 'cell_id'])
        nuc_map = counts.groupby(seg_col).idxmax().str[1]
        ad.obs['10x_id'] = ad.obs.index.astype(float).map(nuc_map)

    ad.write_h5ad(h5ad_dir / f'{name}_processed.h5ad')

Done: 100%|██████████| 6/6 [01:06<00:00, 11.09s/it]         


### Build Celltypist Model on 10X Nucleus AnnData

In [79]:
# Read in AnnData
name = '10x_nucleus'
ad_nuc = sc.read_h5ad(data_dir / dataset / f'h5ads/{name}_processed.h5ad')

# Read in cell type markers from literature
filepath = data_dir / dataset / 'tables/cell_type_markers.csv'
markers = pd.read_csv(filepath)

# Z-score data column-wise (axis=0) before gene set scoring. `toarray()`
# returns the ndarray directly instead of going through np.matrix.
X = ad_nuc.layers['lognorm'].toarray()
ad_nuc.layers['z-score'] = sp.stats.zscore(X, axis=0)

# Ignore performance warnings from calling insert
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

# Score every marker group on the z-scored matrix, using the group's name
# as the score name. The loop variable is deliberately not called `name`,
# so the dataset name assigned above is not overwritten.
ad_nuc.X = ad_nuc.layers['z-score']
col = 'group'
for group_name, group in tqdm(markers.groupby(col)):
    sc.tl.score_genes(
        adata=ad_nuc,
        gene_list=group['gene'].unique(),
        score_name=group_name,
        use_raw=False,
    )

100%|██████████| 33/33 [00:19<00:00,  1.74it/s]


In [80]:
# Collect the per-group score columns from .obs into one .obsm table
score_columns = markers[col].dropna().unique()
ad_nuc.obsm['X_cell_type'] = ad_nuc.obs[score_columns]

# Average the scores within each cluster and keep the top-scoring group
gb, fn = 'phenograph_cluster', 'mean'
cluster_scores = sc.get.aggregate(ad_nuc, gb, fn, obsm='X_cell_type')
ct_agg = cluster_scores.to_df(layer=fn)
ct_agg.index = ct_agg.index.astype(int)
ct_map = ct_agg.idxmax(axis=1)

# Compartment per marker group (first occurrence wins for repeated groups).
# NOTE(review): cp_map is built here but not applied within this cell.
cp_map = markers.dropna().set_index(col)['compartment']
cp_map = cp_map.loc[~cp_map.index.duplicated(keep='first')]

# Give every cell the preliminary cell type of its cluster
ad_nuc.obs['prelim_cell_type'] = ad_nuc.obs[gb].map(ct_map)

In [81]:
# Expose the raw counts as a layer so the model builder can read them by name
ad_nuc.layers['raw'] = ad_nuc.raw.X.copy()

# Train a Celltypist model on the nuclear AnnData, using the preliminary
# cluster-derived labels as the target
model_kwargs = dict(
    celltype_col='prelim_cell_type',
    raw_layer='raw',
    target_sum=100,
    sample_size=1000,
)
ct_model = build_celltypist_model(ad_nuc, **model_kwargs)

# Save the trained model alongside the dataset
model_path = data_dir / dataset / 'celltypist/colon_celltypist_model.pkl'
ct_model.write(model_path)

### Cell Type All AnnDatas

In [214]:
# Neutrophil IDs: 10X IDs of the Segger cells that fall in the phenograph
# cluster hand-picked as neutrophils
NEUTROPHIL_CLUSTER = 62
filepath = data_dir / dataset / 'h5ads/segger_processed.h5ad'
ad = sc.read_h5ad(filepath)
is_neutrophil = ad.obs['phenograph_cluster'].eq(NEUTROPHIL_CLUSTER)
# Drop missing IDs: `Series.isin` treats NaN as matching NaN, so a single
# neutrophil without a 10X ID would otherwise cause every cell that lacks a
# 10X ID to be relabeled as a neutrophil downstream.
neutrophil_ids = ad.obs.loc[is_neutrophil, '10x_id'].dropna().values

In [215]:
# Annotate using celltypist model
filepath = data_dir / dataset / 'celltypist/colon_celltypist_model.pkl'
ct_model = ct.Model.load(str(filepath))

# Read in cell type markers from literature
filepath = data_dir / dataset / 'tables/cell_type_markers.csv'
markers = pd.read_csv(filepath)

# Coarse label maps for plotting. They depend only on the marker table, so
# they are built once here rather than once per AnnData.
coarse_maps = {}
for level in [1, 2]:
    counts = markers.groupby('group')[f'level_{level}'].value_counts()
    ct_map = counts.unstack().idxmax(1)  # most frequent coarse label per group
    coarse_maps[level] = ct_map

for name in ['10x_nucleus', '10x_cell', 'segger']:

    # Label anndata
    filepath = data_dir / dataset / f'h5ads/{name}_processed.h5ad'
    ad = sc.read_h5ad(filepath)
    annotate_cell_types(ad, ct_model, target_sum=100)
    ad.obs['cell_type'] = ad.obs['celltypist_label'].astype(str)

    # Override the predicted label for cells whose 10X ID is in the
    # neutrophil ID list
    mask = ad.obs['10x_id'].isin(neutrophil_ids)
    ad.obs.loc[mask, 'cell_type'] = 'Neutrophil'

    # Map to coarse labels for plotting
    for level, level_map in coarse_maps.items():
        ad.obs[f'cell_type_level_{level}'] = ad.obs['cell_type'].map(level_map)

    # Overwrite the processed file with the annotated version
    ad.write_h5ad(filepath)

... storing 'cell_type' as categorical
... storing 'cell_type_level_1' as categorical
... storing 'cell_type_level_2' as categorical
... storing 'cell_type' as categorical
... storing 'cell_type_level_1' as categorical
... storing 'cell_type_level_2' as categorical
... storing 'cell_type' as categorical
... storing 'cell_type_level_1' as categorical
... storing 'cell_type_level_2' as categorical
... storing 'cell_type' as categorical
... storing 'cell_type_level_1' as categorical
... storing 'cell_type_level_2' as categorical
