# CatBoostRegressor training for State latent to gene counts (multi-target)

This notebook trains a multi-target CatBoostRegressor that maps State latent outputs (X) to gene counts (Y).

Workflow:
- Load latent features X and gene counts Y (NumPy/Parquet/CSV)
- Train/val split
- Train CatBoost with MultiRMSE (multi-target)
- Evaluate RMSE per-target and overall
- Save the model to a `.cbm` file and show how to enable it in `configs/model/pertsets.yaml`


In [None]:
# If running locally, uncomment to install dependencies into the active kernel's environment
# %pip install catboost numpy pandas pyarrow scikit-learn


In [None]:
import os
import json
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from catboost import CatBoostRegressor, Pool

# Config: each path and the seed can be overridden through an environment variable
latent_path = os.environ.get("STATE_LATENT_PATH", "latent.npy")  # latent features X, shape [N, D]
counts_path = os.environ.get("STATE_COUNTS_PATH", "counts.npy")  # gene counts Y, shape [N, G]
output_model = os.environ.get("CB_MODEL_PATH", "catboost_decoder.cbm")
random_state = int(os.environ.get("SEED", "42"))

# Load data
def _load_matrix(path):
    """Read a table from ``path`` and return it as a NumPy array.

    The reader is picked from the file extension (case-insensitive):
    ``.npy`` -> ``np.load``; ``.parquet`` / ``.pq`` -> ``pd.read_parquet``;
    anything else is read with ``pd.read_csv``.
    """
    lowered = str(path).lower()
    if lowered.endswith(".npy"):
        return np.load(path)
    if lowered.endswith((".parquet", ".pq")):
        return pd.read_parquet(path).to_numpy()
    return pd.read_csv(path).to_numpy()


X = _load_matrix(latent_path)
Y = _load_matrix(counts_path)

assert X.ndim == 2 and Y.ndim == 2, f"Expected 2D arrays, got X:{X.shape}, Y:{Y.shape}"

# Rows of X and Y are paired by position. A length mismatch is still handled by
# truncating to the shorter array, but it is reported because it usually means
# the two exports are not aligned.
N = min(len(X), len(Y))
if len(X) != len(Y):
    print(
        f"WARNING: row counts differ (X: {len(X)}, Y: {len(Y)}); "
        f"truncating both to the first {N} rows. Check that X and Y are aligned."
    )
X = X[:N]
Y = Y[:N]
print(f"Loaded X: {X.shape}, Y: {Y.shape}")

# Hold out 20% of the rows for validation; the seed keeps the split reproducible
X_train, X_val, Y_train, Y_val = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=random_state,
)
print(f"Train: {X_train.shape}, {Y_train.shape} | Val: {X_val.shape}, {Y_val.shape}")


In [None]:
# Fit a single multi-target CatBoost model with the MultiRMSE objective
params = dict(
    loss_function="MultiRMSE",
    eval_metric="MultiRMSE",
    learning_rate=0.05,
    depth=8,
    iterations=2000,
    random_seed=random_state,
    bootstrap_type="Bayesian",
    task_type="CPU",  # set to GPU if available and catboost built with CUDA
    # devices="0",
)

model = CatBoostRegressor(**params)

# Wrap the splits in Pools; the validation pool is passed as the eval set during fitting
train_pool = Pool(data=X_train, label=Y_train)
val_pool = Pool(data=X_val, label=Y_val)

model.fit(train_pool, eval_set=val_pool, use_best_model=True, verbose=200)

# Evaluate RMSE overall and per-target
Y_val_pred = model.predict(X_val)
if Y_val_pred.ndim == 1:
    # A flat prediction vector is turned into a single column so it lines up with the 2D Y_val
    Y_val_pred = Y_val_pred[:, None]

# Computed with NumPy instead of `mean_squared_error(..., squared=False)`:
# the `squared` argument was deprecated in scikit-learn 1.4 and removed in 1.6,
# so the old call raises TypeError on current releases.
rmse_per_target = np.sqrt(np.mean((Y_val - Y_val_pred) ** 2, axis=0))
# Overall score is the unweighted mean of the per-target RMSEs, i.e. the uniform
# average that `squared=False` produced in recent scikit-learn releases that
# still accepted it (very old releases may have averaged differently).
rmse_overall = float(np.mean(rmse_per_target))

print("Validation RMSE (overall):", float(rmse_overall))
print("Validation RMSE per-target (first 10):", rmse_per_target[:10])


In [None]:
# Save model and artifacts
Path(output_model).parent.mkdir(parents=True, exist_ok=True)
model.save_model(output_model)
print(f"Saved CatBoost model to: {output_model}")

# Optional: save metrics
metrics_path = os.getenv("CB_METRICS_PATH", "catboost_metrics.json")
# Create the metrics directory as well, mirroring the model path handling, so a
# custom CB_METRICS_PATH in a new directory does not fail after training is done.
Path(metrics_path).parent.mkdir(parents=True, exist_ok=True)
with open(metrics_path, "w", encoding="utf-8") as f:
    json.dump({
        "rmse_overall": float(rmse_overall),
        # Only the first 50 per-target values are stored to keep the file small
        "rmse_per_target_sample": [float(x) for x in rmse_per_target[:50]],
        "X_shape": list(X.shape),
        "Y_shape": list(Y.shape),
    }, f, indent=2)
print(f"Saved metrics to: {metrics_path}")


## Exporting latent features and counts

You can export latent outputs (X) and gene counts (Y) from the State model by running a prediction job and saving tensors:

- `X` should be the model's latent predictions (shape [N, D])
- `Y` should be the gene-count targets (shape [N, G]), aligned row-for-row with `X`

Once trained, enable the CatBoost decoder in `configs/model/pertsets.yaml`:

```yaml
kwargs:
  catboost_decoder:
    enable: true
    model_path: /path/to/catboost_decoder.cbm
    target_dim: ${model.kwargs.gene_dim}
```
