# Data loading

In [None]:
import os

# Set the thread-count environment variables read by the numerical backends
# (OpenMP, OpenBLAS, MKL, numexpr) to avoid oversubscription in
# multiprocessing environments.
# They are assigned before numpy/pandas are imported below (presumably so the
# libraries see them when they initialise their thread pools).
# NOTE(review): OMP_NUM_THREADS is "8" while the other three are "1" -- confirm
# this asymmetry is intended.
os.environ["OMP_NUM_THREADS"] = "8"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"

import numpy as np
import pandas as pd

## Train set

In [None]:
# Patch size; in this cell it is only used to select the feature file by name.
patch_size = 128
# Load the precomputed feature matrix from disk.
# NOTE(review): absolute, machine-specific path -- presumably one row per
# annotated spot, in the same order as the annotation table; confirm.
x_train = np.load(f'/home/lbh/projects_dir/BigSlice/dataset/uni_feature_{patch_size}_all.npy')
# Notebook display: show the array dimensions.
x_train.shape

In [None]:
# Load the full annotation table; the first CSV column becomes the index
# (presumably the 'spot' identifier -- confirm against the file).
# NOTE(review): absolute, machine-specific path.
all_df = pd.read_csv('/home/lbh/projects_dir/BigSlice/Celltype_Annotations/all_annotations.csv', index_col=0)
# Notebook display: show the table.
all_df

### Update annotations

In [None]:
import numpy as np

def update_skin_subregion(df):
    """
    Relabel unspecific skin subregions as 'Skin_other'.

    A row is rewritten when its 'organ' is exactly 'SK' and its 'subregion'
    is either missing or equal to 'other' (compared case-insensitively).
    The DataFrame is modified in place; the returned frame is the same
    object that was passed in.

    Args:
        df (pd.DataFrame): DataFrame with 'organ' and 'subregion' columns.

    Returns:
        pd.DataFrame: The same DataFrame, after the in-place update.
        int: Number of rows that matched and were rewritten.
    """
    subregion = df['subregion']

    # Build the row selector from its three ingredients.
    is_skin = df['organ'] == 'SK'
    is_missing = subregion.isna()
    is_other = subregion.astype(str).str.lower() == 'other'
    needs_update = is_skin & (is_missing | is_other)

    # Count before writing so the number reflects the pre-update state.
    n_rows = needs_update.sum()

    df.loc[needs_update, 'subregion'] = 'Skin_other'
    return df, n_rows

# Apply the update to the annotation table.
# update_skin_subregion modifies its argument in place and returns it, so
# `subset_df_sorted` is the same object as `all_df` (not a sorted subset or copy).
subset_df_sorted, num_updated_train = update_skin_subregion(all_df)

# Print the number of updated rows
print(f"Train set updated: {num_updated_train} rows")

In [None]:
# Notebook display: distinct subregion labels after the in-place update above.
# NOTE(review): np.unique sorts its input; if non-skin rows still hold missing
# subregions next to strings this may fail -- confirm the column has no NaN here.
np.unique(all_df['subregion'])

### Data preparation

In [None]:
import numpy as np
import pandas as pd

def leave_one_batch_out(features: np.ndarray, df_labels: pd.DataFrame, key='batch'):
    """
    Generator for leave-one-batch-out cross validation.

    Each distinct value of ``df_labels[key]`` is held out once, in order of
    first appearance; all remaining rows form the training split.

    Parameters
    ----------
    features : np.ndarray
        Feature matrix whose first axis is row-aligned with ``df_labels``.
    df_labels : pd.DataFrame
        DataFrame containing the grouping column named by ``key``.
    key : str, default 'batch'
        Name of the column whose values define the batches.

    Yields
    ------
    batch_id : the held-out batch identifier
    train_df : DataFrame for training (index reset to 0..n-1, original index dropped)
    val_df   : DataFrame for validation (index reset to 0..n-1, original index dropped)
    train_features : np.ndarray for training
    val_features   : np.ndarray for validation

    Notes
    -----
    Membership is decided with ``==`` / ``!=``. A missing (NaN) key therefore
    never equals any batch id: such rows always land in the training split,
    and a NaN batch id produces an empty validation split.
    """
    assert len(features) == len(df_labels), "Features and df_labels must align row-wise."

    batch_col = df_labels[key]

    for batch_id in batch_col.unique():
        # Compute each mask once per fold and reuse it for both the label
        # frame and the feature matrix.
        val_mask = batch_col == batch_id
        train_mask = batch_col != batch_id

        train_df = df_labels.loc[train_mask].reset_index(drop=True)
        val_df = df_labels.loc[val_mask].reset_index(drop=True)

        # Positional boolean indexing: the masks share df_labels' row order,
        # which the assertion above ties to the rows of `features`.
        train_features = features[train_mask.values]
        val_features = features[val_mask.values]

        yield batch_id, train_df, val_df, train_features, val_features

In [None]:
import os
import time
import joblib
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import List
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors

# ==============================
# Config
# ==============================
# Number of neighbours requested per validation sample.
BASE_K = 5  # global K (will be clipped to [1, n_train])
# Label columns that each receive a prediction from the shared neighbour set.
LABEL_COLUMNS: List[str] = ["organ", "subregion", "level1_annotation", "level2_annotation", "level0_annotation"]

# When True, build_features_with_optional_coords places the two coordinate
# columns in front of the feature columns before scaling.
USE_COORDINATES = True        # whether to append (x_scaled_image, y_scaled_image) to features
# Passed unchanged to PCA(n_components=...); unlike BASE_K it is not clipped.
# NOTE(review): presumably the training matrix always has at least this many
# rows and columns -- confirm for small folds.
N_COMPONENTS_PCA = 200        # PCA dims
# Seed handed to PCA(random_state=...).
RANDOM_STATE = 42

# Paths
# Directory that receives the per-batch prediction CSVs (machine-specific).
EVAL_DIR = "/home/lbh/projects_dir/BigSlice/evalset"

# ==============================
# Utils
# ==============================
def safe_name(s: str) -> str:
    """Return str(s) with every character that is neither alphanumeric nor one of '-', '_', '.' replaced by '_'."""
    keep = "-_."
    text = str(s)
    return "".join(
        "_" if not (ch.isalnum() or ch in keep) else ch
        for ch in text
    )

def build_features_with_optional_coords(df: pd.DataFrame, feat: np.ndarray) -> np.ndarray:
    """
    Return the feature matrix, optionally extended with image coordinates.

    When the module-level USE_COORDINATES flag is off, `feat` is returned
    unchanged. Otherwise the 'x_scaled_image' and 'y_scaled_image' columns of
    `df` are stacked horizontally in front of `feat`.

    Raises:
        KeyError: if coordinates are requested but either column is absent.
    """
    if not USE_COORDINATES:
        return feat

    coord_cols = ["x_scaled_image", "y_scaled_image"]
    if not set(coord_cols).issubset(df.columns):
        raise KeyError("Missing 'x_scaled_image' and/or 'y_scaled_image' in dataframe.")

    # Coordinates occupy the first two columns of the result.
    xy = df[coord_cols].values
    return np.hstack((xy, feat))

def fit_preprocessor(train_feat: np.ndarray):
    """
    Fit the standardisation + PCA pipeline on the training features.

    Returns a 3-tuple: the fitted StandardScaler, the fitted PCA (configured
    from N_COMPONENTS_PCA and RANDOM_STATE), and the training features after
    both transforms have been applied.
    """
    scaler = StandardScaler()
    pca = PCA(n_components=N_COMPONENTS_PCA, random_state=RANDOM_STATE)

    # Standardise first, then project the standardised data.
    standardized = scaler.fit_transform(train_feat)
    projected = pca.fit_transform(standardized)
    return scaler, pca, projected

def transform_features(scaler: StandardScaler, pca: PCA, feat: np.ndarray) -> np.ndarray:
    """Run `feat` through the already-fitted scaler, then through the already-fitted PCA."""
    standardized = scaler.transform(feat)
    return pca.transform(standardized)

def uniform_vote(neigh_labels: np.ndarray, n_classes: int) -> "tuple[np.ndarray, np.ndarray]":
    """
    Uniform voting (equivalent to KNN with weights='uniform').

    Parameters
    ----------
    neigh_labels : np.ndarray
        (n_val, k) integer array of encoded neighbor labels, each in
        [0, n_classes).
    n_classes : int
        Number of distinct classes (width of the vote table).

    Returns
    -------
    pred_enc : np.ndarray
        (n_val,) predicted class indices; ties go to the lowest class index.
    conf : np.ndarray
        (n_val,) fraction of the k neighbors that voted for the winner, in [0, 1].
    """
    n_val, k = neigh_labels.shape

    # Accumulate votes straight into an (n_val, n_classes) table instead of
    # building an (n_val, k, n_classes) one-hot tensor and summing it.
    votes = np.zeros((n_val, n_classes), dtype=np.int64)
    rows = np.arange(n_val)
    for j in range(k):
        # Within one neighbor column every row index occurs exactly once,
        # so the fancy-indexed in-place add cannot drop duplicate updates.
        votes[rows, neigh_labels[:, j]] += 1

    pred_enc = votes.argmax(axis=1)
    conf = votes.max(axis=1) / k
    return pred_enc, conf


# ==============================
# Main loop
# Assumes you already have: leave_one_batch_out(x_train, all_df, key='batch')
# which yields: (batch_id, train_df, val_df, train_feat, val_feat)
# ==============================
# One fold per held-out batch: fit the preprocessing on the training batches,
# find each validation sample's nearest training neighbors once, then let those
# same neighbors vote separately for every label column.
for batch_id, train_df, val_df, train_feat, val_feat in leave_one_batch_out(
    x_train, all_df, key="batch"
):
    print(f"Batch {batch_id} | train {train_feat.shape} | val {val_feat.shape}")

    # 1) Build features (optionally with coordinates), then fit scaler & PCA
    #    on the training split only and apply them to the validation split.
    train_feat_coo = build_features_with_optional_coords(train_df, train_feat)
    val_feat_coo   = build_features_with_optional_coords(val_df,   val_feat)

    scaler, pca, X_tr = fit_preprocessor(train_feat_coo)
    X_va = transform_features(scaler, pca, val_feat_coo)

    print(f"Data preprocessed. Train shape: {X_tr.shape}, Val shape: {X_va.shape}")

    # Prepare output directory (relative to the working directory) & persist preprocessors
    output_dir = f"hierarchical_knn_batch_{safe_name(batch_id)}"
    os.makedirs(output_dir, exist_ok=True)
    joblib.dump(pca,    os.path.join(output_dir, "pca_model.pkl"))
    joblib.dump(scaler, os.path.join(output_dir, "scaler.pkl"))

    # 2) Precompute neighbors ONCE and reuse for every level
    k = max(1, min(BASE_K, len(X_tr)))  # clip k to [1, n_train]
    nn = NearestNeighbors(n_neighbors=k, n_jobs=-1)
    t0 = time.time()
    nn.fit(X_tr)
    distances, indices = nn.kneighbors(X_va, return_distance=True)
    print(f"Neighbor graph built in {time.time() - t0:.2f}s | k={k}")

    # Optionally persist neighbor graph for reproducibility / analysis.
    # `indices` are row positions into this fold's training split.
    np.save(os.path.join(output_dir, "val_knn_indices.npy"), indices)
    np.save(os.path.join(output_dir, "val_knn_distances.npy"), distances)

    # 3) Prepare result frame. It starts as a copy of val_df, so every
    #    metadata column (including the coordinates, when present) is kept.
    predictions_df = val_df.copy()
    if "_row_id" not in predictions_df.columns:
        # NOTE(review): val_df's index was reset by leave_one_batch_out, so this
        # is the row position inside the held-out batch, not the index of the
        # original annotation table -- confirm that is what consumers expect.
        predictions_df["_row_id"] = predictions_df.index

    # 4) Vote per level using the SAME neighbors
    print("\n--- Step 2: Vote per level using shared neighbors ---")
    for level in tqdm(LABEL_COLUMNS, desc="Levels"):
        t1 = time.time()

        # Prepare training labels and encoder.
        # Missing labels must be replaced explicitly: after astype(str) they
        # are no longer missing values, so a trailing fillna would never fire
        # and they would be encoded as the literal class "nan".
        level_labels = train_df[level]
        y_tr_str = level_labels.astype(str).where(level_labels.notna(), "__UNK__").values
        le = LabelEncoder().fit(y_tr_str)
        y_tr_enc = le.transform(y_tr_str)            # shape: (n_train,)

        # Gather neighbor labels for each validation sample
        neigh_labels = y_tr_enc[indices]             # shape: (n_val, k)
        n_classes = len(le.classes_)

        # Voting
        pred_enc, conf = uniform_vote(neigh_labels, n_classes)

        y_pred = le.inverse_transform(pred_enc)

        # Save predictions and a simple confidence score
        predictions_df[f"{level}_pred"] = y_pred
        predictions_df[f"{level}_pred_conf"] = conf

        # Persist class list for this level to decode later if needed
        joblib.dump(le.classes_, os.path.join(output_dir, f"classes_{safe_name(level)}.pkl"))
        print(f"Level [{level}] done in {time.time() - t1:.2f}s | classes={n_classes}")

    # 5) Save predictions (the frame's index is not written to the CSV)
    os.makedirs(EVAL_DIR, exist_ok=True)
    out_csv = os.path.join(EVAL_DIR, f"uni_prediction_{safe_name(batch_id)}_LOOCV.csv")
    predictions_df.to_csv(out_csv, index=False)
    print(f"✅ Done. Saved predictions to: {out_csv}")
