In [2]:
import os

import numpy as np
import pandas as pd
import scanpy as sc
import scipy.sparse
import tensorflow as tf
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler

# Seed NumPy's and TensorFlow's global random generators with one shared value
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

2025-04-27 13:46:01.647364: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-27 13:46:01.802542: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-27 13:46:01.802592: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-27 13:46:01.826082: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-27 13:46:01.875769: I tensorflow/core/platform/cpu_feature_guar

In [3]:
# Directory holding the ABC-atlas input files. Defaults to the original location;
# set the ABC_ATLAS_DIR environment variable to run the notebook on another machine.
ABC_ATLAS_DIR = os.environ.get('ABC_ATLAS_DIR', '/home/frankfurt/LDL/data/abc_atlas')

# Raw expression data (AnnData) plus the per-cell metadata and coordinate tables
adata = sc.read_h5ad(os.path.join(ABC_ATLAS_DIR, 'Zhuang-ABCA-1-raw.h5ad'))
meta = pd.read_csv(os.path.join(ABC_ATLAS_DIR, 'cell_metadata_with_cluster_annotation.csv'))
coords = pd.read_csv(os.path.join(ABC_ATLAS_DIR, 'ccf_coordinates.csv'))

In [4]:
# 3. Explore Columns
# ===========================================
# Show the column index of each table so the join keys and name clashes are visible
for heading, frame in (
    ("adata.obs columns:", adata.obs),
    ("meta columns:", meta),
    ("coords columns:", coords),
):
    print(heading, frame.columns)

adata.obs columns: Index(['brain_section_label'], dtype='object')
meta columns: Index(['cell_label', 'brain_section_label', 'feature_matrix_label',
       'donor_label', 'donor_genotype', 'donor_sex', 'cluster_alias', 'x', 'y',
       'z', 'subclass_confidence_score', 'cluster_confidence_score',
       'high_quality_transfer', 'neurotransmitter', 'class', 'subclass',
       'supertype', 'cluster', 'neurotransmitter_color', 'class_color',
       'subclass_color', 'supertype_color', 'cluster_color'],
      dtype='object')
coords columns: Index(['cell_label', 'x', 'y', 'z', 'parcellation_index'], dtype='object')


In [5]:
# ===========================================
# 4. Make sure Metadata and Coords are Indexed by 'cell_label'
# ===========================================
# set_index consumes the 'cell_label' column, so calling it a second time raises
# KeyError. Guard on the index name so this cell can be re-run safely.
if meta.index.name != 'cell_label':
    meta = meta.set_index('cell_label')
if coords.index.name != 'cell_label':
    coords = coords.set_index('cell_label')


In [6]:
# ======================================
# 5. Subset AnnData to Metadata Cells
# ======================================
# Boolean mask over observations: True where the cell's label also appears in meta's index
has_metadata = adata.obs_names.isin(meta.index)
adata = adata[has_metadata].copy()
print(f"✅ Shape after subsetting: {adata.shape} (cells, genes)")

✅ Shape after subsetting: (2846908, 1122) (cells, genes)


In [7]:
# Count the cells in each brain section, then order the counts by section label
cells_per_section = adata.obs['brain_section_label'].value_counts()
section_summary = cells_per_section.sort_index()

print("\n✅ Number of cells per brain section:")
print(section_summary)


✅ Number of cells per brain section:
brain_section_label
Zhuang-ABCA-1.001     5893
Zhuang-ABCA-1.002     5505
Zhuang-ABCA-1.003     8706
Zhuang-ABCA-1.004     9597
Zhuang-ABCA-1.005    12097
                     ...  
Zhuang-ABCA-1.146    14027
Zhuang-ABCA-1.147    13729
Zhuang-ABCA-1.148    14388
Zhuang-ABCA-1.149     8829
Zhuang-ABCA-1.150    10096
Name: count, Length: 147, dtype: int64


In [9]:
# ======================================
# 4. Join Metadata
# ======================================
# adata.obs already carries 'brain_section_label', so drop meta's copy of it
meta = meta.drop(columns=['brain_section_label'], errors='ignore')

# Join only the columns adata.obs does not have yet. On the first run that is every
# remaining meta column; on a re-run the list is empty, so the cell is a no-op
# instead of raising "columns overlap but no suffix specified".
meta_cols_to_add = [col for col in meta.columns if col not in adata.obs.columns]
adata.obs = adata.obs.join(meta[meta_cols_to_add], how='left')


In [12]:
# ======================================
# 5. Join Coordinates Safely
# ======================================
# The coordinate table uses the column names x / y / z, which the metadata table also
# uses. Dropping the clashing columns would discard the coordinate values from
# ccf_coordinates.csv and join only 'parcellation_index', so rename them instead.
# NOTE(review): presumably these are CCF-registered coordinates that differ from the
# metadata x / y / z — confirm against the dataset documentation.
CCF_COORD_COLS = ('x', 'y', 'z')
coords_ccf = coords.rename(columns={col: f'{col}_ccf' for col in CCF_COORD_COLS})

# Join only columns not already present so that re-running the cell is a no-op.
# `coords` itself is left untouched.
coords_cols_to_add = [col for col in coords_ccf.columns if col not in adata.obs.columns]
adata.obs = adata.obs.join(coords_ccf[coords_cols_to_add], how='left')



In [16]:
# ======================================
# 7. Save the Final Clean AnnData
# ======================================
# Write the merged AnnData (expression + joined obs columns) to a single .h5ad file
output_path = '/home/frankfurt/LDL/data/abc_atlas/Zhuang-ABCA-1-merged-final-CLEAN-v2.h5ad'
adata.write_h5ad(output_path)

print(f"✅ Saved successfully to: {output_path}")

✅ Saved successfully to: /home/frankfurt/LDL/data/abc_atlas/Zhuang-ABCA-1-merged-final-CLEAN-v2.h5ad


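Old code starts here. These legacy cells were last run on a different, smaller dataset (1196 cells × 32285 genes, per the model summary below). They reference `adata.obs` columns such as `cell_type` and `clusters` that are not among the columns joined above, so they need adapting before they can run on the merged atlas data.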

In [12]:

# Persist the full annotated spatial AnnData
adata.write_h5ad("full_spatial_data.h5ad")

# Build a pseudo scRNA-seq AnnData: a copy of the spatial data whose obs table
# is trimmed to the cell-type and cluster columns
obs_columns_to_keep = ["cell_type", "clusters"]
sc_data = adata.copy()
sc_data.obs = sc_data.obs[obs_columns_to_keep]
sc_data.write_h5ad("pseudo_scRNAseq_data.h5ad")

# Modelling inputs: expression matrix as a NumPy array (converted via .toarray()
# when it is not one already), spatial coordinates, and integer-coded cell types
expression = adata.X
X = expression if isinstance(expression, np.ndarray) else expression.toarray()
coords = adata.obsm["spatial"]
cell_type_categorical = adata.obs["cell_type"].astype("category")
labels = cell_type_categorical.cat.codes.to_numpy()

# Write each array to its own .npy file for the GNN cells below
for array_file, array in (
    ("features_X.npy", X),
    ("spatial_coords.npy", coords),
    ("labels_y.npy", labels),
):
    np.save(array_file, array)

print("✅ All data saved: pseudo scRNA-seq file and arrays for GNN input.")

✅ All data saved: pseudo scRNA-seq file and arrays for GNN input.


In [None]:

# Reload the three arrays written by the export cell, in the order
# features -> coordinates -> labels.
# Expected shapes (TODO confirm): (n_cells, n_genes), (n_cells, 2), (n_cells,)
cell_features, coords, labels = (
    np.load(array_file)
    for array_file in ("features_X.npy", "spatial_coords.npy", "labels_y.npy")
)

In [13]:
# ===========================================
# 2. Load Preprocessed Spatial Data
# ===========================================
# Arrays written earlier with np.save for the GNN
cell_features = np.load("features_X.npy")         # (num_cells, num_genes)
coords = np.load("spatial_coords.npy")            # (num_cells, 2) — TODO confirm
labels = np.load("labels_y.npy")                  # (num_cells,)

# Standardise each feature column (gene) to zero mean and unit variance.
# StandardScaler is imported with the other dependencies in the top import cell,
# so the notebook's requirements are visible in one place.
cell_features = StandardScaler().fit_transform(cell_features)

In [14]:
# ===========================================
# 3. Build Adjacency Matrix from Coordinates
# ===========================================
N_NEIGHBORS = 6  # spatial neighbours per cell, not counting the cell itself

# Directed kNN graph: row i holds a 1 in the column of each nearest neighbour of cell i.
# By default kneighbors_graph does not mark a sample as its own neighbour.
adj_matrix = kneighbors_graph(coords, n_neighbors=N_NEIGHBORS, mode='connectivity')

# Add self-loops so a cell's own expression contributes to its own output; without
# them the graph layer sees only the neighbours' features.
n_cells = adj_matrix.shape[0]
adj_with_self = adj_matrix + scipy.sparse.identity(n_cells, format='csr', dtype=adj_matrix.dtype)

# Row-normalise so each row sums to 1: the layer then averages over the neighbourhood
# instead of summing, keeping activations on the same scale as the input features.
row_degree = np.asarray(adj_with_self.sum(axis=1)).ravel()   # >= 1 because of the self-loop
adj_norm = scipy.sparse.diags(1.0 / row_degree) @ adj_with_self

# NOTE: the dense tensor is n_cells x n_cells, which is only feasible for small datasets.
cell_adj = tf.convert_to_tensor(adj_norm.toarray(), dtype=tf.float32)


2025-04-23 11:30:08.684294: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2256] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [31]:

# ===========================================
# 4. Define Model Architecture
# ===========================================
class SimpleGraphConv(tf.keras.layers.Layer):
    """Minimal graph-convolution layer computing ``relu(adj @ (x @ w))``.

    Args:
        units: Width of the layer's output (number of columns of the weight matrix).
        adj: Adjacency tensor that left-multiplies the projected features.
            Presumably square with one row per cell — confirm against the caller.
        **kwargs: Extra keyword arguments forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, units, adj, **kwargs):
        # The base-class constructor runs first and receives the remaining kwargs
        super().__init__(**kwargs)
        self.units = units
        self.adj = adj

    def build(self, input_shape):
        """Create the trainable projection matrix of shape (input width, units)."""
        n_in_features = input_shape[-1]
        self.w = self.add_weight(
            shape=(n_in_features, self.units),
            initializer="glorot_uniform",
            trainable=True,
        )

    def call(self, x):
        """Project the features, aggregate them through ``adj``, then apply ReLU."""
        projected = tf.matmul(x, self.w)             # (num_cells, units)
        aggregated = tf.matmul(self.adj, projected)  # (num_cells, units)
        return tf.nn.relu(aggregated)


In [32]:
# Sizes taken from the loaded arrays: input width and number of distinct label values
n_genes = cell_features.shape[1]
n_classes = np.unique(labels).shape[0]

# Graph convolution -> dense hidden layer -> softmax over the label values
inputs = tf.keras.Input(shape=(n_genes,), name="cell_input")
graph_conv = SimpleGraphConv(128, cell_adj)
x = graph_conv(inputs)
hidden_dense = tf.keras.layers.Dense(64, activation='relu')
x = hidden_dense(x)
classifier = tf.keras.layers.Dense(n_classes, activation='softmax')
output = classifier(x)

model = tf.keras.Model(inputs=inputs, outputs=output)

In [35]:
# Configure training (Adam optimiser, sparse categorical cross-entropy on the
# integer labels, accuracy metric), then print the layer/parameter summary
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 cell_input (InputLayer)     [(None, 32285)]           0         
                                                                 
 simple_graph_conv_2 (Simpl  (1196, 128)               4132480   
 eGraphConv)                                                     
                                                                 
 dense_4 (Dense)             (1196, 64)                8256      
                                                                 
 dense_5 (Dense)             (1196, 15)                975       
                                                                 
Total params: 4141711 (15.80 MB)
Trainable params: 4141711 (15.80 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [36]:
# ===========================================
# 5. Train Model (Corrected)
# ===========================================
# The graph layer multiplies by a fixed adjacency matrix whose rows and columns follow
# the original cell order, so every batch must contain all cells in that same order.
model.fit(
    cell_features,
    labels,
    epochs=50,
    batch_size=cell_features.shape[0],  # Process all cells in one batch
    shuffle=False,                      # fit() shuffles rows by default, which would misalign them with cell_adj
    validation_split=0.0,               # Disable validation split for now
    verbose=1
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x782c549b3d90>