In [40]:
import numpy as np
import scanpy as sc
import pandas as pd

from gitk import scembed

# Load the PBMC dataset from an .h5ad file (path is relative to the
# notebook's working directory).
pbmc = sc.read_h5ad("pbmc/pbmc.h5ad")

In [41]:
# Sanity check: dimensions of the loaded object as (n_obs, n_vars).
pbmc.shape

(10246, 165434)

In [42]:
# Discard the existing "barcode" column, then move the current index into a
# regular column so it can be used as a merge key.
# NOTE(review): presumably the obs index is itself named "barcode" (the later
# merge joins on that column) -- confirm.
# WARNING: both operations mutate pbmc.obs in place, so this cell is not
# idempotent; re-running it drops whichever "barcode" column exists at that
# point. Reload pbmc before re-running.
pbmc.obs.drop(columns=["barcode"], inplace=True)
pbmc.obs.reset_index(drop=False, inplace=True)

In [43]:
# Read the Cellcano annotation table; the first row holds the column names
# and the first column becomes the index.
annotations = pd.read_csv(
    "pbmc/cellcano_annotations.csv",
    header=0,
    index_col=0,
)

In [44]:
# Derive a "barcode" column from the index: split each index label on "#"
# and keep the second piece (element 1). Labels without a "#" yield NaN.
annotations["barcode"] = annotations.index.str.split("#").str.get(1)

In [45]:
# Attach the annotation columns to the cell metadata by joining on "barcode".
# A left join keeps every row of pbmc.obs; cells with no matching annotation
# get NaN in the annotation columns.
pbmc.obs = pd.merge(pbmc.obs, annotations, how="left", on="barcode")

In [39]:
# Initialise the scembed projector so the clusters can be recomputed.
# NOTE(review): "databio/multiome" is presumably the identifier of a
# pretrained model that scembed resolves -- confirm against scembed docs.
projector = scembed.Projector("databio/multiome")

In [None]:
# Run the projector on the PBMC object.
# NOTE(review): the return value is discarded, so this presumably writes its
# result into pbmc (e.g. the obsm["embedding"] that the clustering cell reads)
# -- confirm. This cell has no execution count in the saved notebook.
projector.project(pbmc)

In [51]:
# Display the AnnData summary to check which obs/var/uns/obsm/obsp keys exist.
pbmc

AnnData object with n_obs × n_vars = 10246 × 165434
    obs: 'barcode', 'pred_celltype', 'firstround_pred_celltype', 'entropy', 'leiden'
    var: 'chr', 'start', 'end'
    uns: 'neighbors', 'leiden'
    obsm: 'embedding'
    obsp: 'distances', 'connectivities'

In [62]:
# Cluster on the stored embeddings (the same ones used for the previous
# clustering): first build the neighbour graph from obsm["embedding"], then
# run Leiden at a low resolution with a fixed seed.
sc.pp.neighbors(
    pbmc,
    use_rep="embedding",
)
sc.tl.leiden(
    pbmc,
    random_state=42,
    resolution=0.1,
)

In [63]:
# For each Leiden cluster, find the most common Cellcano label
# ("pred_celltype") among its cells and record it as the cluster's label.
cluster_to_cell_type = {}
for cluster in pbmc.obs["leiden"].unique():
    in_cluster = pbmc.obs["leiden"] == cluster
    label_counts = pbmc.obs.loc[in_cluster, "pred_celltype"].value_counts()
    # value_counts() drops NaN, so a cluster whose cells all lack an
    # annotation (unmatched rows from the left merge) gives an empty Series.
    # Record NaN for such a cluster instead of failing with IndexError on
    # .index[0].
    if label_counts.empty:
        cluster_to_cell_type[cluster] = np.nan
    else:
        # value_counts() sorts by descending count, so the first index entry
        # is the majority label.
        cluster_to_cell_type[cluster] = label_counts.index[0]

In [69]:
# Translate every cell's Leiden cluster id into that cluster's majority label
# and store the result as a new obs column.
consensus_labels = pbmc.obs["leiden"].map(cluster_to_cell_type)
pbmc.obs["cellcano_consensus_celltype"] = consensus_labels

In [70]:
# Number of cells assigned to each consensus cell type.
pbmc.obs['cellcano_consensus_celltype'].value_counts()

cellcano_consensus_celltype
CD4 T cells        4427
Monocytes          3218
CD8 T cells        1241
NK cells            675
B cells             551
Dendritic cells     134
Name: count, dtype: int64

In [72]:
# Remove a stale "consensus_celltype" column if one is present.
# No visible cell creates this column (the mapping cell writes
# "cellcano_consensus_celltype"), so on a fresh kernel a plain drop raises
# KeyError. errors="ignore" turns the drop into a no-op when the column is
# absent, so the notebook survives Restart & Run All.
pbmc.obs.drop(columns=["consensus_celltype"], inplace=True, errors="ignore")

In [73]:
# Show the cluster id -> majority cell type mapping; several clusters may
# share the same label.
cluster_to_cell_type

{'1': 'Monocytes',
 '0': 'CD4 T cells',
 '5': 'B cells',
 '3': 'NK cells',
 '2': 'CD8 T cells',
 '4': 'Monocytes',
 '6': 'Dendritic cells',
 '7': 'Monocytes'}