# Data loading

## Evaluation dataset

In [1]:
import os

# Limit the number of threads used by various linear algebra libraries
# to avoid oversubscription and improve efficiency in multiprocessing environments
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"

import numpy as np
import pandas as pd

# Set patch size used for processing image tiles
patch_size = 128

# Load evaluation metadata (downsampled version)
eval_csv = "/home/lbh/projects_dir/BigSlice/dataset/test_set_with_coo.csv"
eval_df = pd.read_csv(eval_csv, index_col=0)
eval_df

Unnamed: 0_level_0,spot,organ,subregion,level1_annotation,level2_annotation,level0_annotation,celltype_prediction_1,score_prediction_1,celltype_prediction_2,score_prediction_2,...,x_scaled_image,y_scaled_image,x_scaled_image_organ,y_scaled_image_organ,batch,organ_encoded,subregion_encoded,level0_annotation_encoded,level1_annotation_encoded,level2_annotation_encoded
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,AACAACGACAGCCTACAA_0_0_0,LU,Alveoli,blood & immune cell,B cell,B cell,B cell,0.4480,epithelial cell,0.2924,...,2030.518557,3122.431157,804.518557,659.431157,CTRL_1,8,0,0,1,0
11,AACACACGGTACAGAGCT_0_0_0,LU,Alveoli,blood & immune cell,B cell,B cell,B cell,0.5057,epithelial cell,0.2555,...,2077.235895,3373.985738,851.235895,910.985738,CTRL_1,8,0,0,1,0
12,AACACACGGTACGGCATA_0_0_0,LU,Alveoli,epithelial cell,specialized epithelial cell,epithelial cell,epithelial cell,0.4006,T cell,0.3183,...,1659.375258,3342.541415,433.375258,879.541415,CTRL_1,8,0,104,6,68
20,AACACTACGCGAACGGTA_0_0_0,LU,Alveoli,epithelial cell,pneumocyte,type I pneumocyte,type I pneumocyte,0.8655,endothelial cell,0.0980,...,2282.273102,3522.223258,1056.273102,1059.223258,CTRL_1,8,0,339,6,58
24,AACACTACGGTCCACCAA_0_0_0,LU,Alveoli,epithelial cell,pneumocyte,type I pneumocyte,type I pneumocyte,1.0000,,0.0000,...,2061.663449,3266.176632,835.663449,803.176632,CTRL_1,8,0,339,6,58
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2330867,TTGTTGACCGTGTACACT_1,HE,Heart Tissue,muscle cell,cardiac muscle cell,cardiac muscle cell,cardiac muscle cell,0.5973,fibroblast,0.2106,...,747.736645,3558.503127,386.736645,821.503127,LPS_2,4,17,61,13,7
2330869,TTGTTGACCTCTTGACAG_1,HE,Heart Tissue,muscle cell,cardiac muscle cell,cardiac muscle cell,cardiac muscle cell,0.8586,regular atrial cardiac myocyte,0.1414,...,1426.432052,3603.338014,1065.432052,866.338014,LPS_2,4,17,61,13,7
2330870,TTGTTGCGAAGATGCGTC_1,HE,Heart Tissue,muscle cell,cardiac muscle cell,cardiac muscle cell,cardiac muscle cell,0.6812,fibroblast,0.1929,...,1550.773196,3728.875699,1189.773196,991.875699,LPS_2,4,17,61,13,7
2330872,TTGTTGCGAGTCTTCACA_1,HE,Heart Tissue,muscle cell,cardiac muscle cell,cardiac muscle cell,cardiac muscle cell,0.4873,fibroblast of cardiac tissue,0.2563,...,1188.111528,3917.182227,827.111528,1180.182227,LPS_2,4,17,61,13,7


In [2]:
coo_eval = eval_df[['x_scaled_image', 'y_scaled_image']].values.astype(np.float32)

In [3]:
import numpy as np

# Load extracted image features for the evaluation set
# Features are saved as .npy file and converted to float32 for compatibility and efficiency
X_eval = np.load(f"/home/lbh/projects_dir/BigSlice/dataset/uni_validation_feature_{patch_size}_all.npy").astype(np.float32)

# Print the shape of the loaded feature array (expected: [N, 1536])
X_eval.shape

(466176, 1536)

## Train set

In [4]:
import numpy as np
import pandas as pd

# Load full annotation table with 'spot' as index
all_df = pd.read_csv('/home/lbh/projects_dir/BigSlice/Celltype_Annotations/all_annotations.csv', index_col=0)

# Load sampled training set with 'filename' as index
train_df = pd.read_csv("/home/lbh/projects_dir/BigSlice/dataset/train_8k_samples.csv", index_col=0)

# Filter annotations to only include rows present in the training set
subset_df = all_df[all_df['spot'].isin(train_df['filename'])]

# Reorder subset_df to match the order in train_df['filename']
# This ensures that labels align with extracted image features
subset_df_sorted = subset_df.set_index('spot').loc[train_df['filename']].reset_index()

# Display the sorted subset of annotation DataFrame
subset_df_sorted

Unnamed: 0,spot,organ,subregion,level1_annotation,level2_annotation,level0_annotation,celltype_prediction_1,score_prediction_1,celltype_prediction_2,score_prediction_2,...,x_scaled_image,y_scaled_image,x_scaled_image_organ,y_scaled_image_organ,batch,organ_encoded,subregion_encoded,level0_annotation_encoded,level1_annotation_encoded,level2_annotation_encoded
0,CAGTTCCAGATTGGTGCT_0_0_0,BR,Fiber Tracts,glial cell,oligodendrocyte,oligodendrocyte,oligodendrocyte,0.9458,tanycyte,0.0542,...,2123.953233,714.694460,932.953233,714.694460,CTRL_1,2,13,257,9,52
1,CGACGAGTAAGCATCTAC_1,BM,Bone Marrow Tissue,blood & immune cell,hematopoietic stem cell,hematopoietic stem cell,hematopoietic stem cell,0.4734,hematopoietic precursor cell,0.4092,...,1804.636364,3092.220296,1804.636364,3092.220296,LPS_2,1,2,145,1,31
2,GATTCTGACCGCATAAGC_0_0_0,BR,Cerebellar Cortex (Gran. Layer),mesenchymal cell,granule cell,granule cell,granule cell,0.4037,inhibitory neuron,0.3906,...,2648.225586,1056.089962,1457.225586,1056.089962,CTRL_1,2,7,136,12,27
3,CGCATCTCATGGTTCACT_1_0,MU,Muscle Tissue,muscle cell,muscle cell,muscle cell,muscle cell,0.3445,multi-potent skeletal muscle stem cell,0.3324,...,2083.518463,6556.506308,2083.518463,6556.506308,LPS_1,9,27,227,13,46
4,AGTCAGACTGCATACACA_1,MU,Muscle Tissue,muscle cell,skeletal muscle cell,multi-potent skeletal muscle stem cell,multi-potent skeletal muscle stem cell,0.4516,cell of skeletal muscle,0.3373,...,2387.485473,2854.595392,2387.485473,2854.595392,LPS_2,9,27,226,13,66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79995,GTCTTCACAGTAGAACCT_0_0_0,MU,Muscle Tissue,muscle cell,skeletal muscle cell,multi-potent skeletal muscle stem cell,multi-potent skeletal muscle stem cell,0.4073,cell of skeletal muscle,0.3209,...,1013.118744,2071.292375,1013.118744,2071.292375,CTRL_1,9,27,226,13,66
79996,GCTTCCTAAGTCGATTCG_1_0,LU,Alveoli,epithelial cell,pneumocyte,type II pneumocyte,type II pneumocyte,0.3775,alveolar macrophage,0.3690,...,2236.420806,3806.917170,975.420806,856.917170,LPS_1,8,0,341,6,58
79997,GTACTTGGTTCTCTAGCG_1_0,MU,Muscle Tissue,muscle cell,skeletal muscle cell,multi-potent skeletal muscle stem cell,multi-potent skeletal muscle stem cell,0.5797,muscle cell,0.2503,...,2459.295408,5690.811849,2459.295408,5690.811849,LPS_1,9,27,226,13,66
79998,TTGTCTCGTGAAGCAACG_1_0_0,BF,Brown Fat Tissue,adipocytes,adipocyte,brown fat cell,brown fat cell,0.7443,mesenchymal stem cell of adipose tissue,0.1687,...,1655.630740,610.229951,1655.630740,610.229951,CTRL_2,0,5,55,0,3


### Update annotations

In [5]:
import numpy as np

def update_skin_subregion(df):
    """
    Update subregion annotations for 'SK' (skin) organ entries.
    Replaces missing or 'other'-like subregion values with 'Skin_other'.

    Args:
        df (pd.DataFrame): DataFrame with 'organ' and 'subregion' columns.

    Returns:
        pd.DataFrame: Updated DataFrame.
        int: Number of rows modified.
    """
    # Condition: rows where organ is 'SK' and subregion is NA or 'other'
    condition = (df['organ'] == 'SK') & (
        df['subregion'].isna() | (df['subregion'].astype(str).str.lower() == 'other')
    )
    num_updated = condition.sum()

    # Apply update: set subregion to 'Skin_other' for matching rows
    df.loc[condition, 'subregion'] = 'Skin_other'
    return df, num_updated

# Apply the update to training annotations
subset_df_sorted, num_updated_train = update_skin_subregion(subset_df_sorted)

# Print the number of updated rows
print(f"Train set updated: {num_updated_train} rows")

Train set updated: 7632 rows


In [6]:
np.unique(subset_df_sorted['subregion'])

array(['Alveoli', 'Blood Vessel', 'Bone Marrow Tissue', 'Bronchi',
       'Bronchioles', 'Brown Fat Tissue', 'Caudoputamen',
       'Cerebellar Cortex (Gran. Layer)',
       'Cerebellar Cortex (Mol. Layer)', 'Cerebral Cortex',
       'Colon Tissue', 'Dermis', 'Epidermis', 'Fiber Tracts',
       'Foveolar Epithelium', 'Gastric Glands (Chief Cell Rich)',
       'Gastric Glands (Parietal Cell Rich)', 'Heart Tissue',
       'Hypodermis', 'Hyppocampus', 'Inner Medulla',
       'Inner Stripe of Outer Medulla', 'Lymph Node Tissue',
       'Marginal Zone', 'Meninges', 'Midbrain', 'Midbrain/Hindbrain',
       'Muscle Tissue', 'Other', 'Outer Stripe of Outer Medulla',
       'Pancreas Tissue', 'Pericentral', 'Periportal', 'Red Pulp',
       'Renal Cortex', 'Skin_other', 'Small Intestine Tissue',
       'Spinal Cord', 'Thalamus', 'Thymic Cortex', 'Thymic Medulla',
       'Ventricle', 'White Pulp'], dtype=object)

### Data preparation

In [7]:
import os
from tqdm import tqdm

# Initialize storage for each of the 5 label levels
y_train_levels = [[] for _ in range(5)]

batch_size = 64  # Batch size for label processing (adjust if needed)
label_columns = ["organ", "subregion", "level1_annotation", "level2_annotation", "level0_annotation"]

# Iterate through the annotation DataFrame in batches
for i in tqdm(range(0, len(subset_df_sorted), batch_size), desc="Extracting train features"):
    batch_df = subset_df_sorted.iloc[i:i+batch_size]
    batch_labels = [[] for _ in range(5)]  # Temporary storage for this batch

    # Column names to extract (can be modified for encoded labels)
    encoded_columns = [f"{col}" for col in label_columns]

    # Extract labels for each level in this batch
    for _, row in batch_df.iterrows():
        for j, col in enumerate(encoded_columns):
            batch_labels[j].append(row[col])

    # Append batch labels to the overall label lists
    for j in range(5):
        y_train_levels[j].extend(batch_labels[j])

Extracting train features:   0%|          | 0/1250 [00:00<?, ?it/s]

Extracting train features: 100%|██████████| 1250/1250 [00:03<00:00, 398.75it/s]


In [8]:
# Extract scaled spatial coordinates ('x_scaled_image', 'y_scaled_image') for training samples
# Convert to float32 NumPy array for downstream tasks (e.g., KNN, spatial clustering)
coo_train = subset_df_sorted[['x_scaled_image', 'y_scaled_image']].values.astype(np.float32)

# Check the shape of the coordinate array (should match the number of training samples)
coo_train.shape

(80000, 2)

In [9]:
patch_size = 128  # Define patch size used during feature extraction

# Load precomputed feature vectors for the training set
# Features are stored in a .npy file and converted to float32 for consistency
X_train = np.load(f"/home/lbh/projects_dir/BigSlice/dataset/uni_feature_{patch_size}.npy").astype(np.float32)

# Display the shape of the feature array (expected shape: [N, 1536])
X_train.shape

(80000, 1536)

In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np

# 1. Concatenate spatial coordinates and image features
# Shape: [N, D], where D = 2 (coords) + 1536 (features)
x_all = np.hstack((coo_train, X_train))  # shape: [20000, D]

# 2. Normalize features using StandardScaler (zero mean, unit variance)
scaler = StandardScaler()
x_all_normalized = scaler.fit_transform(x_all)

# 3. Apply PCA to reduce dimensionality to 200 components
# This helps improve efficiency and reduce noise for downstream tasks
pca = PCA(n_components=200, random_state=42)
x_all_reduced = pca.fit_transform(x_all_normalized)

In [11]:
# Concatenate spatial coordinates and extracted image features for the evaluation set
# Shape: [N, D], where D = 2 (coords) + 1536 (features)
x_all_eval = np.hstack((coo_eval, X_eval))  # shape: [N, D]

# WARNING: This re-fits the scaler on the evaluation set — not recommended!
# In practice, you should reuse the `scaler` fitted on the training set for consistency

scaler = StandardScaler()
x_eval_normalized = scaler.fit_transform(x_all_eval)  # <-- should ideally use scaler.transform()

# Apply PCA transformation using the model fitted on training data
X_eval_reduced = pca.transform(x_eval_normalized)  # shape: [N, 200]

In [12]:
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import time

# Set batch size for inference
batch_size = 1000

# Dictionary to store predictions
predictions = {}

# Define the label levels to be predicted
label_columns = ['organ', 'subregion', 'level1_annotation', 'level2_annotation', 'level0_annotation']

# Loop over each label level
for i, y_train in enumerate(y_train_levels):
    print(f"\n=== Processing label: {label_columns[i]} ===")
    
    # --- Train KNN classifier ---
    start_train = time.time()
    knn = KNeighborsClassifier(n_neighbors=5, n_jobs=16)  # Use 16 threads for parallel search
    knn.fit(x_all_reduced, y_train)  # Fit on reduced training features
    end_train = time.time()
    print(f"Training time: {end_train - start_train:.2f} seconds")

    # --- Batch prediction ---
    print(f"Predicting {label_columns[i]} in batches...")
    y_preds = []
    start_predict = time.time()

    for j in range(0, len(X_eval_reduced), batch_size):
        batch = X_eval_reduced[j:j+batch_size]
        batch_pred = knn.predict(batch)
        y_preds.append(batch_pred)

        # Log progress every 10 batches with estimated remaining time
        if (j // batch_size) % 10 == 0 and j > 0:
            elapsed = time.time() - start_predict
            done_batches = j // batch_size
            total_batches = len(X_eval_reduced) // batch_size + 1
            eta = elapsed / done_batches * (total_batches - done_batches)
            print(f"Processed {j}/{len(X_eval_reduced)} samples "
                  f"({done_batches}/{total_batches} batches), "
                  f"elapsed: {elapsed:.1f}s, ETA: {eta:.1f}s")

    # --- Store and record predictions ---
    y_pred = np.concatenate(y_preds)  # Flatten batched predictions
    eval_df[f"{label_columns[i]}_pred"] = y_pred  # Save predictions to DataFrame

    total_predict_time = time.time() - start_predict
    print(f"Prediction time: {total_predict_time:.2f} seconds")

print("\n✅ All predictions completed.")


=== Processing label: organ ===
Training time: 0.05 seconds
Predicting organ in batches...
Processed 10000/466176 samples (10/467 batches), elapsed: 9.6s, ETA: 440.3s
Processed 20000/466176 samples (20/467 batches), elapsed: 18.4s, ETA: 410.4s
Processed 30000/466176 samples (30/467 batches), elapsed: 27.1s, ETA: 394.7s
Processed 40000/466176 samples (40/467 batches), elapsed: 35.9s, ETA: 382.9s
Processed 50000/466176 samples (50/467 batches), elapsed: 44.7s, ETA: 372.4s
Processed 60000/466176 samples (60/467 batches), elapsed: 53.5s, ETA: 362.6s
Processed 70000/466176 samples (70/467 batches), elapsed: 62.2s, ETA: 352.7s
Processed 80000/466176 samples (80/467 batches), elapsed: 70.9s, ETA: 343.0s
Processed 90000/466176 samples (90/467 batches), elapsed: 79.7s, ETA: 333.8s
Processed 100000/466176 samples (100/467 batches), elapsed: 88.5s, ETA: 324.6s
Processed 110000/466176 samples (110/467 batches), elapsed: 97.2s, ETA: 315.5s
Processed 120000/466176 samples (120/467 batches), elapsed

In [13]:
eval_df

Unnamed: 0_level_0,spot,organ,subregion,level1_annotation,level2_annotation,level0_annotation,celltype_prediction_1,score_prediction_1,celltype_prediction_2,score_prediction_2,...,organ_encoded,subregion_encoded,level0_annotation_encoded,level1_annotation_encoded,level2_annotation_encoded,organ_pred,subregion_pred,level1_annotation_pred,level2_annotation_pred,level0_annotation_pred
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,AACAACGACAGCCTACAA_0_0_0,LU,Alveoli,blood & immune cell,B cell,B cell,B cell,0.4480,epithelial cell,0.2924,...,8,0,0,1,0,LU,Alveoli,epithelial cell,endothelial cell,endothelial cell
11,AACACACGGTACAGAGCT_0_0_0,LU,Alveoli,blood & immune cell,B cell,B cell,B cell,0.5057,epithelial cell,0.2555,...,8,0,0,1,0,LU,Alveoli,epithelial cell,specialized epithelial cell,club cell
12,AACACACGGTACGGCATA_0_0_0,LU,Alveoli,epithelial cell,specialized epithelial cell,epithelial cell,epithelial cell,0.4006,T cell,0.3183,...,8,0,104,6,68,LU,Alveoli,blood & immune cell,macrophage,alveolar macrophage
20,AACACTACGCGAACGGTA_0_0_0,LU,Alveoli,epithelial cell,pneumocyte,type I pneumocyte,type I pneumocyte,0.8655,endothelial cell,0.0980,...,8,0,339,6,58,LU,Alveoli,epithelial cell,pneumocyte,T cell
24,AACACTACGGTCCACCAA_0_0_0,LU,Alveoli,epithelial cell,pneumocyte,type I pneumocyte,type I pneumocyte,1.0000,,0.0000,...,8,0,339,6,58,LU,Alveoli,blood & immune cell,B cell,B cell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2330867,TTGTTGACCGTGTACACT_1,HE,Heart Tissue,muscle cell,cardiac muscle cell,cardiac muscle cell,cardiac muscle cell,0.5973,fibroblast,0.2106,...,4,17,61,13,7,HE,Heart Tissue,muscle cell,cardiac muscle cell,cardiac muscle cell
2330869,TTGTTGACCTCTTGACAG_1,HE,Heart Tissue,muscle cell,cardiac muscle cell,cardiac muscle cell,cardiac muscle cell,0.8586,regular atrial cardiac myocyte,0.1414,...,4,17,61,13,7,HE,Heart Tissue,muscle cell,cardiac muscle cell,cardiac muscle cell
2330870,TTGTTGCGAAGATGCGTC_1,HE,Heart Tissue,muscle cell,cardiac muscle cell,cardiac muscle cell,cardiac muscle cell,0.6812,fibroblast,0.1929,...,4,17,61,13,7,HE,Heart Tissue,muscle cell,cardiac muscle cell,cardiac muscle cell
2330872,TTGTTGCGAGTCTTCACA_1,HE,Heart Tissue,muscle cell,cardiac muscle cell,cardiac muscle cell,cardiac muscle cell,0.4873,fibroblast of cardiac tissue,0.2563,...,4,17,61,13,7,HE,Heart Tissue,muscle cell,cardiac muscle cell,cardiac muscle cell


In [14]:
# Save
eval_df.to_csv(f"/home/lbh/projects_dir/BigSlice/evalset/uni_prediction_{patch_size}_for_validation_coo_norm_fin.csv")