In [10]:
import scanpy as sc
import numpy as np

In [2]:
# Location of the annotated dataset on the mounted processing volume.
H5AD_PATH = '/Volumes/processing2/mana/mana_updated.h5ad'

# Read the AnnData object that every later cell operates on.
ad = sc.read_h5ad(H5AD_PATH)

In [3]:
# Show the AnnData summary (matrix dimensions and the available annotation keys).
ad

AnnData object with n_obs × n_vars = 107228 × 5101
    obs: 'cell', 'centroid_x', 'centroid_y', 'centroid_z', 'component', 'volume', 'surface_area', 'scale', 'region', 'sample_id', 'proseg_cluster', 'output_folder', 'Num', 'n_genes', 'n_counts', 'louvain_0.5', 'louvain_1', 'louvain_1.5', 'louvain_2', 'louvain_2.5', 'louvain_3', 'louvain_3.5', 'Cluster', 'Level1', 'Level2', 'Level3', 'Level3.1', 'grid_label', 'rbd_domain', 'rbd_domain_0.1', 'rbd_domain_0.2', 'rbd_domain_0.3', 'rbd_domain_0.5', 'rbd_domain_0.6', 'rbd_domain_0.7', 'rbd_domain_0.8', 'rbd_domain_0.9', 'rbd_domain_1.1', 'rbd_domain_1', 'rbd_domain_1.25', 'rbd_domain_1.4', 'rbd_domain_1.5', 'leiden_0.5', 'leiden_1', 'leiden_1.5', 'leiden_2', 'leiden_2.5', 'leiden_3', 'leiden_3.5', 'sample_name', 'course', 'condition', 'model', 'cytetype_annotation_louvain_3.5', 'cytetype_cellOntologyTerm_louvain_3.5', 'cluster_id', 'author_label', 'annotation', 'Class', 'state', 'CL_term', 'CL_term_id', 'confidence', 'author_label_similarity_

In [7]:
# Pairwise matrix stored under obsp['spatial_connectivities']; local_purity
# reads the non-zero entries of row i as the neighbours of cell i.
# NOTE(review): presumably written by a spatial-neighbour graph step
# (e.g. squidpy's spatial_neighbors) before this file was saved — confirm.
conn = ad.obsp['spatial_connectivities']

In [8]:
def local_purity(cluster_key, adata=None, connectivities=None):
    """Mean fraction of each cell's neighbours that carry the cell's own label.

    For every cell with at least one neighbour, the purity is the share of
    its neighbours whose ``cluster_key`` label equals the cell's label. The
    function returns the average of those per-cell purities; cells without
    any neighbour are left out of the average.

    Parameters
    ----------
    cluster_key : str
        Column of ``adata.obs`` holding the cluster labels.
    adata : optional
        Object exposing ``.obs`` (and ``.obsp`` when ``connectivities`` is
        not given). Defaults to the notebook-level ``ad``.
    connectivities : optional
        Square (n_cells x n_cells) matrix whose non-zero entries in row ``i``
        mark the neighbours of cell ``i``. Defaults to the notebook-level
        ``conn`` when ``adata`` is omitted, otherwise to
        ``adata.obsp['spatial_connectivities']``.

    Returns
    -------
    float
        Mean local purity, or NaN when no cell has any neighbour.

    Raises
    ------
    ValueError
        If ``connectivities`` is not n_cells x n_cells.

    Notes
    -----
    Labels are compared through their categorical codes. Cells with a
    missing label all receive code -1, so two unlabeled neighbours count as
    a match (same as comparing the codes cell by cell).
    """
    # Resolve the graph first so an explicitly passed `adata` never gets
    # paired with the global graph of a different dataset.
    if connectivities is None:
        connectivities = conn if adata is None else adata.obsp['spatial_connectivities']
    if adata is None:
        adata = ad

    labels = adata.obs[cluster_key].astype('category').cat.codes.values
    n_cells = labels.shape[0]
    if tuple(connectivities.shape) != (n_cells, n_cells):
        raise ValueError(
            f"connectivities has shape {tuple(connectivities.shape)}, "
            f"expected ({n_cells}, {n_cells})"
        )

    # All edges at once: rows[k] is a cell, cols[k] one of its neighbours.
    # This replaces one sparse row slice per cell with a single pass.
    rows, cols = connectivities.nonzero()
    if rows.size == 0:
        # No edges at all: the mean over zero cells is undefined.
        return float('nan')

    same_label = (labels[rows] == labels[cols]).astype(float)
    degree = np.bincount(rows, minlength=n_cells)
    matches = np.bincount(rows, weights=same_label, minlength=n_cells)

    # Only cells that actually have neighbours contribute to the average.
    has_neighbors = degree > 0
    return np.mean(matches[has_neighbors] / degree[has_neighbors])

In [11]:
# For each layer-count setting, report the embedding width, the number of
# Leiden clusters, and the mean local purity of those clusters.
for l in range(2, 9):
    key, cluster_key = f'X_weighted_d0.2_l{l}', f'leiden_d0.2_l{l}'

    embedding = ad.obsm[key]
    n_features = embedding.shape[1]

    cluster_labels = ad.obs[cluster_key]
    n_clusters = cluster_labels.nunique()

    print(f"n_layers={l}: {n_features} features, {n_clusters} clusters, purity={local_purity(cluster_key):.3f}")

n_layers=2: 30 features, 24 clusters, purity=0.482
n_layers=3: 40 features, 26 clusters, purity=0.535
n_layers=4: 50 features, 29 clusters, purity=0.588
n_layers=5: 60 features, 29 clusters, purity=0.648
n_layers=6: 70 features, 33 clusters, purity=0.773
n_layers=7: 80 features, 40 clusters, purity=0.801
n_layers=8: 90 features, 39 clusters, purity=0.857


In [12]:
# 1. Transcriptional coherence — are cells in a cluster similar in expression?
from sklearn.metrics import silhouette_score

# silhouette_score draws a random subsample when sample_size is set. A fixed
# seed makes the scores reproducible across re-runs and scores every n_layers
# setting on the same cells, so differences between rows reflect the
# clustering rather than the draw.
SILHOUETTE_SEED = 0
SILHOUETTE_SAMPLE_SIZE = 10000

for l in [2, 3, 4, 5, 6, 7, 8]:
    # Silhouette in EXPRESSION space (X_scVI), not spatial
    sil = silhouette_score(
        ad.obsm['X_scVI'],
        ad.obs[f'leiden_d0.2_l{l}'],
        sample_size=SILHOUETTE_SAMPLE_SIZE,
        random_state=SILHOUETTE_SEED,
    )
    print(f"n_layers={l}: expression silhouette={sil:.3f}")

# 2. Do clusters have clear marker genes?
# (compare DEG clarity between n_layers=3 vs 5 vs 7)

# 3. Do the clusters make biological sense?
# (this is the real test — look at them!)

n_layers=2: expression silhouette=0.064
n_layers=3: expression silhouette=0.037
n_layers=4: expression silhouette=-0.018
n_layers=5: expression silhouette=-0.033
n_layers=6: expression silhouette=-0.067
n_layers=7: expression silhouette=-0.093
n_layers=8: expression silhouette=-0.139
