# Data loading

## Evaluation dataset

In [3]:
import os

# Limit the number of threads used by various linear algebra libraries
# to avoid oversubscription and improve efficiency in multiprocessing environments
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"

import numpy as np
import pandas as pd

# Set patch size used for processing image tiles
patch_size = 128

# Load evaluation metadata (downsampled version)
eval_csv = "/home/lbh/projects_dir/BigSlice/evalset/evallist_down.csv"
eval_df = pd.read_csv(eval_csv)

# Display the evaluation DataFrame
eval_df

Unnamed: 0,filename,batch,x,y
0,Image1_DS_52X_1096_256,Image1_DS_52X,1096,256
1,Image1_DS_52X_1126_256,Image1_DS_52X,1126,256
2,Image1_DS_52X_1156_256,Image1_DS_52X,1156,256
3,Image1_DS_52X_1186_256,Image1_DS_52X,1186,256
4,Image1_DS_52X_1216_256,Image1_DS_52X,1216,256
...,...,...,...,...
121051,Image7_DS_52X_1816_6136,Image7_DS_52X,1816,6136
121052,Image7_DS_52X_1846_6136,Image7_DS_52X,1846,6136
121053,Image7_DS_52X_1876_6136,Image7_DS_52X,1876,6136
121054,Image7_DS_52X_1906_6136,Image7_DS_52X,1906,6136


In [4]:
# Extract spatial coordinates ('x', 'y') from the evaluation DataFrame
# and convert to float32 NumPy array for downstream processing (e.g., KNN or clustering)
coo_eval = eval_df[['x', 'y']].values.astype(np.float32)


In [5]:
import numpy as np

# Load extracted image features for the evaluation set
# Features are saved as .npy file and converted to float32 for compatibility and efficiency
X_eval = np.load(f"/home/lbh/projects_dir/BigSlice/dataset/uni_test_feature_{patch_size}.npy").astype(np.float32)

# Print the shape of the loaded feature array (expected: [N, 1536])
X_eval.shape

(121056, 1536)

## Train set

In [6]:
import numpy as np
import pandas as pd

# Load full annotation table with 'spot' as index
all_df = pd.read_csv('/home/lbh/projects_dir/BigSlice/Celltype_Annotations/all_annotations.csv', index_col=0)

# Load sampled training set with 'filename' as index
train_df = pd.read_csv("/home/lbh/projects_dir/BigSlice/dataset/train_8k_samples.csv", index_col=0)

# Filter annotations to only include rows present in the training set
subset_df = all_df[all_df['spot'].isin(train_df['filename'])]

# Reorder subset_df to match the order in train_df['filename']
# This ensures that labels align with extracted image features
subset_df_sorted = subset_df.set_index('spot').loc[train_df['filename']].reset_index()

# Display the sorted subset of annotation DataFrame
subset_df_sorted

Unnamed: 0,spot,organ,subregion,level1_annotation,level2_annotation,level0_annotation,celltype_prediction_1,score_prediction_1,celltype_prediction_2,score_prediction_2,...,x_scaled_image,y_scaled_image,x_scaled_image_organ,y_scaled_image_organ,batch,organ_encoded,subregion_encoded,level0_annotation_encoded,level1_annotation_encoded,level2_annotation_encoded
0,CAGTTCCAGATTGGTGCT_0_0_0,BR,Fiber Tracts,glial cell,oligodendrocyte,oligodendrocyte,oligodendrocyte,0.9458,tanycyte,0.0542,...,2123.953233,714.694460,932.953233,714.694460,CTRL_1,2,13,257,9,52
1,CGACGAGTAAGCATCTAC_1,BM,Bone Marrow Tissue,blood & immune cell,hematopoietic stem cell,hematopoietic stem cell,hematopoietic stem cell,0.4734,hematopoietic precursor cell,0.4092,...,1804.636364,3092.220296,1804.636364,3092.220296,LPS_2,1,2,145,1,31
2,GATTCTGACCGCATAAGC_0_0_0,BR,Cerebellar Cortex (Gran. Layer),mesenchymal cell,granule cell,granule cell,granule cell,0.4037,inhibitory neuron,0.3906,...,2648.225586,1056.089962,1457.225586,1056.089962,CTRL_1,2,7,136,12,27
3,CGCATCTCATGGTTCACT_1_0,MU,Muscle Tissue,muscle cell,muscle cell,muscle cell,muscle cell,0.3445,multi-potent skeletal muscle stem cell,0.3324,...,2083.518463,6556.506308,2083.518463,6556.506308,LPS_1,9,27,227,13,46
4,AGTCAGACTGCATACACA_1,MU,Muscle Tissue,muscle cell,skeletal muscle cell,multi-potent skeletal muscle stem cell,multi-potent skeletal muscle stem cell,0.4516,cell of skeletal muscle,0.3373,...,2387.485473,2854.595392,2387.485473,2854.595392,LPS_2,9,27,226,13,66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79995,GTCTTCACAGTAGAACCT_0_0_0,MU,Muscle Tissue,muscle cell,skeletal muscle cell,multi-potent skeletal muscle stem cell,multi-potent skeletal muscle stem cell,0.4073,cell of skeletal muscle,0.3209,...,1013.118744,2071.292375,1013.118744,2071.292375,CTRL_1,9,27,226,13,66
79996,GCTTCCTAAGTCGATTCG_1_0,LU,Alveoli,epithelial cell,pneumocyte,type II pneumocyte,type II pneumocyte,0.3775,alveolar macrophage,0.3690,...,2236.420806,3806.917170,975.420806,856.917170,LPS_1,8,0,341,6,58
79997,GTACTTGGTTCTCTAGCG_1_0,MU,Muscle Tissue,muscle cell,skeletal muscle cell,multi-potent skeletal muscle stem cell,multi-potent skeletal muscle stem cell,0.5797,muscle cell,0.2503,...,2459.295408,5690.811849,2459.295408,5690.811849,LPS_1,9,27,226,13,66
79998,TTGTCTCGTGAAGCAACG_1_0_0,BF,Brown Fat Tissue,adipocytes,adipocyte,brown fat cell,brown fat cell,0.7443,mesenchymal stem cell of adipose tissue,0.1687,...,1655.630740,610.229951,1655.630740,610.229951,CTRL_2,0,5,55,0,3


### Update annotations

In [7]:
import numpy as np

def update_skin_subregion(df):
    """
    Update subregion annotations for 'SK' (skin) organ entries.
    Replaces missing or 'other'-like subregion values with 'Skin_other'.

    Args:
        df (pd.DataFrame): DataFrame with 'organ' and 'subregion' columns.

    Returns:
        pd.DataFrame: Updated DataFrame.
        int: Number of rows modified.
    """
    # Condition: rows where organ is 'SK' and subregion is NA or 'other'
    condition = (df['organ'] == 'SK') & (
        df['subregion'].isna() | (df['subregion'].astype(str).str.lower() == 'other')
    )
    num_updated = condition.sum()

    # Apply update: set subregion to 'Skin_other' for matching rows
    df.loc[condition, 'subregion'] = 'Skin_other'
    return df, num_updated

# Apply the update to training annotations
subset_df_sorted, num_updated_train = update_skin_subregion(subset_df_sorted)

# Print the number of updated rows
print(f"Train set updated: {num_updated_train} rows")

Train set updated: 7632 rows


In [23]:
np.unique(subset_df_sorted['subregion'])

array(['Alveoli', 'Blood Vessel', 'Bone Marrow Tissue', 'Bronchi',
       'Bronchioles', 'Brown Fat Tissue', 'Caudoputamen',
       'Cerebellar Cortex (Gran. Layer)',
       'Cerebellar Cortex (Mol. Layer)', 'Cerebral Cortex',
       'Colon Tissue', 'Dermis', 'Epidermis', 'Fiber Tracts',
       'Foveolar Epithelium', 'Gastric Glands (Chief Cell Rich)',
       'Gastric Glands (Parietal Cell Rich)', 'Heart Tissue',
       'Hypodermis', 'Hyppocampus', 'Inner Medulla',
       'Inner Stripe of Outer Medulla', 'Lymph Node Tissue',
       'Marginal Zone', 'Meninges', 'Midbrain', 'Midbrain/Hindbrain',
       'Muscle Tissue', 'Other', 'Outer Stripe of Outer Medulla',
       'Pancreas Tissue', 'Pericentral', 'Periportal', 'Red Pulp',
       'Renal Cortex', 'Skin_other', 'Small Intestine Tissue',
       'Spinal Cord', 'Thalamus', 'Thymic Cortex', 'Thymic Medulla',
       'Ventricle', 'White Pulp'], dtype=object)

### Data preparation

In [20]:
import os
from tqdm import tqdm

# Initialize storage for each of the 5 label levels
y_train_levels = [[] for _ in range(5)]

batch_size = 64  # Batch size for label processing (adjust if needed)
label_columns = ["organ", "subregion", "level1_annotation", "level2_annotation", "level0_annotation"]

# Iterate through the annotation DataFrame in batches
for i in tqdm(range(0, len(subset_df_sorted), batch_size), desc="Extracting train features"):
    batch_df = subset_df_sorted.iloc[i:i+batch_size]
    batch_labels = [[] for _ in range(5)]  # Temporary storage for this batch

    # Column names to extract (can be modified for encoded labels)
    encoded_columns = [f"{col}" for col in label_columns]

    # Extract labels for each level in this batch
    for _, row in batch_df.iterrows():
        for j, col in enumerate(encoded_columns):
            batch_labels[j].append(row[col])

    # Append batch labels to the overall label lists
    for j in range(5):
        y_train_levels[j].extend(batch_labels[j])


Extracting train features: 100%|██████████| 1250/1250 [00:03<00:00, 398.95it/s]


In [9]:
# Extract scaled spatial coordinates ('x_scaled_image', 'y_scaled_image') for training samples
# Convert to float32 NumPy array for downstream tasks (e.g., KNN, spatial clustering)
coo_train = subset_df_sorted[['x_scaled_image', 'y_scaled_image']].values.astype(np.float32)

# Check the shape of the coordinate array (should match the number of training samples)
coo_train.shape


(80000, 2)

In [10]:
patch_size = 128  # Define patch size used during feature extraction

# Load precomputed feature vectors for the training set
# Features are stored in a .npy file and converted to float32 for consistency
X_train = np.load(f"/home/lbh/projects_dir/BigSlice/dataset/uni_feature_{patch_size}.npy").astype(np.float32)

# Display the shape of the feature array (expected shape: [N, 1536])
X_train.shape

(80000, 1536)

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np

# 1. Concatenate spatial coordinates and image features
# Shape: [N, D], where D = 2 (coords) + 1536 (features)
x_all = np.hstack((coo_train, X_train))  # shape: [20000, D]

# 2. Normalize features using StandardScaler (zero mean, unit variance)
scaler = StandardScaler()
x_all_normalized = scaler.fit_transform(x_all)

# 3. Apply PCA to reduce dimensionality to 200 components
# This helps improve efficiency and reduce noise for downstream tasks
pca = PCA(n_components=200, random_state=42)
x_all_reduced = pca.fit_transform(x_all_normalized)


In [12]:
# Concatenate spatial coordinates and extracted image features for the evaluation set
# Shape: [N, D], where D = 2 (coords) + 1536 (features)
x_all_eval = np.hstack((coo_eval, X_eval))  # shape: [N, D]

scaler = StandardScaler()
x_eval_normalized = scaler.fit_transform(x_all_eval)

# Apply PCA transformation using the model fitted on training data
X_eval_reduced = pca.transform(x_eval_normalized)  # shape: [N, 200]

In [21]:
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import time

# Set batch size for inference
batch_size = 1000

# Dictionary to store predictions
predictions = {}

# Define the label levels to be predicted
label_columns = ['organ', 'subregion', 'level1_annotation', 'level2_annotation', 'level0_annotation']

# Loop over each label level
for i, y_train in enumerate(y_train_levels):
    print(f"\n=== Processing label: {label_columns[i]} ===")
    
    # --- Train KNN classifier ---
    start_train = time.time()
    knn = KNeighborsClassifier(n_neighbors=5, n_jobs=16)  # Use 16 threads for parallel search
    knn.fit(x_all_reduced, y_train)  # Fit on reduced training features
    end_train = time.time()
    print(f"Training time: {end_train - start_train:.2f} seconds")

    # --- Batch prediction ---
    print(f"Predicting {label_columns[i]} in batches...")
    y_preds = []
    start_predict = time.time()

    for j in range(0, len(X_eval_reduced), batch_size):
        batch = X_eval_reduced[j:j+batch_size]
        batch_pred = knn.predict(batch)
        y_preds.append(batch_pred)

        # Log progress every 10 batches with estimated remaining time
        if (j // batch_size) % 10 == 0 and j > 0:
            elapsed = time.time() - start_predict
            done_batches = j // batch_size
            total_batches = len(X_eval_reduced) // batch_size + 1
            eta = elapsed / done_batches * (total_batches - done_batches)
            print(f"Processed {j}/{len(X_eval_reduced)} samples "
                  f"({done_batches}/{total_batches} batches), "
                  f"elapsed: {elapsed:.1f}s, ETA: {eta:.1f}s")

    # --- Store and record predictions ---
    y_pred = np.concatenate(y_preds)  # Flatten batched predictions
    eval_df[f"{label_columns[i]}_pred"] = y_pred  # Save predictions to DataFrame

    total_predict_time = time.time() - start_predict
    print(f"Prediction time: {total_predict_time:.2f} seconds")

print("\n✅ All predictions completed.")




=== Processing label: organ ===
Training time: 0.03 seconds
Predicting organ in batches...
Processed 10000/121056 samples (10/122 batches), elapsed: 9.5s, ETA: 106.6s
Processed 20000/121056 samples (20/122 batches), elapsed: 18.2s, ETA: 92.8s
Processed 30000/121056 samples (30/122 batches), elapsed: 26.9s, ETA: 82.5s
Processed 40000/121056 samples (40/122 batches), elapsed: 35.6s, ETA: 72.9s
Processed 50000/121056 samples (50/122 batches), elapsed: 44.2s, ETA: 63.7s
Processed 60000/121056 samples (60/122 batches), elapsed: 52.9s, ETA: 54.7s
Processed 70000/121056 samples (70/122 batches), elapsed: 61.6s, ETA: 45.7s
Processed 80000/121056 samples (80/122 batches), elapsed: 70.2s, ETA: 36.9s
Processed 90000/121056 samples (90/122 batches), elapsed: 78.9s, ETA: 28.1s
Processed 100000/121056 samples (100/122 batches), elapsed: 87.6s, ETA: 19.3s
Processed 110000/121056 samples (110/122 batches), elapsed: 96.3s, ETA: 10.5s
Processed 120000/121056 samples (120/122 batches), elapsed: 104.9s, 

In [22]:
eval_df

Unnamed: 0,filename,batch,x,y,organ_pred,subregion_pred,level1_annotation_pred,level2_annotation_pred,level0_annotation_pred
0,Image1_DS_52X_1096_256,Image1_DS_52X,1096,256,MU,Muscle Tissue,muscle cell,skeletal muscle cell,cell of skeletal muscle
1,Image1_DS_52X_1126_256,Image1_DS_52X,1126,256,MU,Muscle Tissue,muscle cell,skeletal muscle cell,cell of skeletal muscle
2,Image1_DS_52X_1156_256,Image1_DS_52X,1156,256,MU,Muscle Tissue,muscle cell,skeletal muscle cell,cell of skeletal muscle
3,Image1_DS_52X_1186_256,Image1_DS_52X,1186,256,MU,Muscle Tissue,muscle cell,skeletal muscle cell,cell of skeletal muscle
4,Image1_DS_52X_1216_256,Image1_DS_52X,1216,256,MU,Muscle Tissue,muscle cell,skeletal muscle cell,cell of skeletal muscle
...,...,...,...,...,...,...,...,...,...
121051,Image7_DS_52X_1816_6136,Image7_DS_52X,1816,6136,BR,Midbrain/Hindbrain,glial cell,neuronal cell,neuron
121052,Image7_DS_52X_1846_6136,Image7_DS_52X,1846,6136,MU,Muscle Tissue,epithelial cell,skeletal muscle cell,cell of skeletal muscle
121053,Image7_DS_52X_1876_6136,Image7_DS_52X,1876,6136,SK,Skin_other,epithelial cell,epidermal cell,brown fat cell
121054,Image7_DS_52X_1906_6136,Image7_DS_52X,1906,6136,MU,Muscle Tissue,epithelial cell,epidermal cell,multi-potent skeletal muscle stem cell


In [24]:
# Save
eval_df.to_csv(f"/home/lbh/projects_dir/BigSlice/evalset/uni_prediction_{patch_size}_coo_norm_fin.csv")