Test whether CCF coordinates alone can predict cell types (supertype). Accuracy is modest: around 0.36 with a default MLP, compared with about 0.15 for a majority-class baseline.

In [5]:
from graphFeatureSelect.utils import get_adata
import scanpy as sc
import numpy as np

In [6]:
# Load the AnnData object from the .h5ad file via the project's get_adata helper.
adata = get_adata('../data/VISp_nhood.h5ad')

In [7]:
# Show the AnnData summary (dimensions plus the available obs/var/obsm/... fields).
adata

AnnData object with n_obs × n_vars = 61884 × 550
    obs: 'cell_label', 'brain_section_label', 'cluster_alias', 'average_correlation_score', 'feature_matrix_label', 'donor_label', 'donor_genotype', 'donor_sex', 'x_section', 'y_section', 'z_section', 'neurotransmitter', 'class', 'subclass', 'supertype', 'cluster', 'neurotransmitter_color', 'class_color', 'subclass_color', 'supertype_color', 'cluster_color', 'x_reconstructed', 'y_reconstructed', 'z_reconstructed', 'parcellation_index', 'x_ccf', 'y_ccf', 'z_ccf', 'parcellation_organ', 'parcellation_category', 'parcellation_division', 'parcellation_structure', 'parcellation_substructure', 'parcellation_organ_color', 'parcellation_category_color', 'parcellation_division_color', 'parcellation_structure_color', 'parcellation_substructure_color'
    var: 'gene_symbol', 'transcript_identifier'
    uns: 'accessed_on', 'neighbors', 'pca', 'src'
    obsm: 'X_pca', 'ccf'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [8]:
# Stack the per-cell section coordinates column-wise into one 2-D array
# (columns ordered x, y, z) and store it alongside the other embeddings.
adata.obsm["section"] = np.column_stack(
    [np.array(adata.obs[col]) for col in ("x_section", "y_section", "z_section")]
)


In [9]:
# Stack the per-cell reconstructed coordinates column-wise into one 2-D array
# (columns ordered x, y, z) and store it alongside the other embeddings.
adata.obsm["reconstructed"] = np.column_stack(
    [
        np.array(adata.obs[col])
        for col in ("x_reconstructed", "y_reconstructed", "z_reconstructed")
    ]
)


In [10]:
# Sanity check: (n_obs, n_vars) is unchanged after adding the obsm entries.
adata.shape

(61884, 550)

In [11]:
from sklearn.model_selection import StratifiedKFold

# Hold out a single stratified fold (1 of 5) as the test set; stratifying on
# supertype keeps the label proportions similar in both partitions.
n_cells = adata.shape[0]
cell_ids = np.arange(n_cells)
supertype_labels = adata.obs['supertype'].values

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_iter = skf.split(cell_ids, supertype_labels)
train_idx, test_idx = next(fold_iter)  # only the first fold is used

# Boolean masks over all cells, True where the cell belongs to the partition.
train_mask = np.isin(cell_ids, train_idx)
test_mask = np.isin(cell_ids, test_idx)




In [20]:
# Number of cells in the held-out test fold.
adata.obs['supertype'][test_mask].shape[0]

12377

In [21]:
# Majority-class baseline: fraction of test cells that belong to the most
# frequent supertype. value_counts() sorts by count in descending order, so
# the first entry is the largest class.
# Use .iloc[0] for positional access: plain [0] on a label-indexed Series
# relies on the deprecated positional fallback and emits a FutureWarning.
test_supertypes = adata.obs['supertype'][test_mask]
int(test_supertypes.value_counts().iloc[0]) / len(test_supertypes)

  int(adata.obs['supertype'][test_mask].value_counts()[0]) / len(adata.obs['supertype'][test_mask])


0.14737012200048477

In [22]:
# ccf: predict supertype from CCF coordinates
from sklearn.neural_network import MLPClassifier

# Fix the seed: MLPClassifier draws its initial weights and mini-batches at
# random, so without random_state the reported accuracy changes on every run.
clf = MLPClassifier(random_state=42)
clf.fit(adata.obsm['ccf'][train_mask], adata.obs['supertype'][train_mask])
# Mean accuracy on the held-out fold.
print(clf.score(adata.obsm['ccf'][test_mask], adata.obs['supertype'][test_mask]))

0.3609113678597398




In [23]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.dummy import DummyClassifier

# Chance-level baselines: dummy classifiers ignore the features and predict
# from the training label distribution only, giving a floor for the MLP scores.
ccf_train = adata.obsm['ccf'][train_mask]
ccf_test = adata.obsm['ccf'][test_mask]
supertype_train = adata.obs['supertype'][train_mask]
supertype_test = adata.obs['supertype'][test_mask]

strategies = ["prior", "stratified", "uniform"]
test_scores = {}
for s in strategies:
    dclf = DummyClassifier(strategy=s, random_state=0).fit(ccf_train, supertype_train)
    score = dclf.score(ccf_test, supertype_test)
    # Keep two decimals for a compact summary.
    test_scores[s] = np.round(score, 2)


In [24]:
# Show the test accuracy of each dummy baseline, keyed by strategy name.
print(test_scores)

{'prior': np.float64(0.15), 'stratified': np.float64(0.07), 'uniform': np.float64(0.01)}


In [46]:
# section: predict supertype from the section coordinates
# Fix the seed: MLPClassifier draws its initial weights and mini-batches at
# random, so without random_state the reported accuracy changes on every run.
clf = MLPClassifier(random_state=42)
clf.fit(adata.obsm['section'][train_mask], adata.obs['supertype'][train_mask])
# Mean accuracy on the held-out fold.
print(clf.score(adata.obsm['section'][test_mask], adata.obs['supertype'][test_mask]))

0.3168780803102529




In [47]:
# reconstructed: predict supertype from the reconstructed coordinates
# Fix the seed: MLPClassifier draws its initial weights and mini-batches at
# random, so without random_state the reported accuracy changes on every run.
clf = MLPClassifier(random_state=42)
clf.fit(adata.obsm['reconstructed'][train_mask], adata.obs['supertype'][train_mask])
# Mean accuracy on the held-out fold.
print(clf.score(adata.obsm['reconstructed'][test_mask], adata.obs['supertype'][test_mask]))

0.36276965338935124




In [56]:
# Number of cells per class, most frequent first.
adata.obs['class'].value_counts()

class
01 IT-ET Glut        34168
02 NP-CT-L6b Glut     8821
30 Astro-Epen         5307
33 Vascular           5169
07 CTX-MGE GABA       3597
31 OPC-Oligo          2664
06 CTX-CGE GABA       1363
34 Immune              756
08 CNU-MGE GABA         28
05 OB-IMN GABA           4
03 OB-CR Glut            3
29 CB Glut               3
28 CB GABA               1
Name: count, dtype: int64

In [54]:
# CCF coordinates of every cell labelled with this supertype.
is_l45_it_glut_6 = (adata.obs['supertype'] == '0028 L4/5 IT CTX Glut_6').to_numpy()
adata.obsm['ccf'][is_l45_it_glut_6]

array([[9.8129949 , 1.19931551, 8.08794042],
       [9.09990644, 1.34543557, 2.80519387],
       [8.57365051, 0.82416691, 7.7414858 ],
       ...,
       [9.94499592, 1.95682617, 9.03688049],
       [9.45167514, 1.45998209, 2.8503031 ],
       [8.7625237 , 0.68943085, 7.54256732]])

In [55]:
# CCF coordinates of every cell labelled with this supertype.
is_l6_ct_glut_1 = (adata.obs['supertype'] == '0114 L6 CT CTX Glut_1').to_numpy()
adata.obsm['ccf'][is_l6_ct_glut_1]

array([[9.04415392, 1.89627485, 2.81099339],
       [9.23744584, 1.54314281, 8.17194066],
       [8.35460389, 1.27536708, 8.00697448],
       ...,
       [8.38780223, 1.34206439, 3.20357213],
       [9.44565677, 1.65248622, 3.31083417],
       [9.79159104, 1.72164381, 3.00037199]])