# TOXIPRED — SVM (Descriptors-Only) QSAR Notebook

**Scope:** Rebuild 2D descriptors from raw SMILES, use scaffold-aware validation, feature selection (KBest, LASSO, SHAP, RFE/Linear SVM), tune an **RBF-SVM**, calibrate probabilities, threshold for **balanced accuracy**, define a simple **Applicability Domain (AD)**, and export metrics & artifacts with `desc_svm_*` prefixes.

> **Primary metric for QSAR:** We optimize **Balanced Accuracy** (widely used in regulatory QSAR). We also report **ROC-AUC**, **PR-AUC**, **F1**, the **Brier** score, the confusion matrix, and reliability diagrams.

**You may need to adjust paths and column names in the Config below.**


In [None]:
# --- Optional: quick installs (uncomment if needed) ---
# %pip install rdkit scikit-learn xgboost shap pandas numpy matplotlib joblib tqdm
# %pip install imbalanced-learn

[31mERROR: Could not find a version that satisfies the requirement rdkit-pypi (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for rdkit-pypi[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os, json, math, warnings, joblib, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from dataclasses import dataclass
from typing import List, Tuple, Optional, Dict

from sklearn.model_selection import GroupKFold, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    f1_score,
    balanced_accuracy_score,
    confusion_matrix,
    brier_score_loss,
    precision_recall_curve,
    roc_curve,
)
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.calibration import CalibratedClassifierCV
from sklearn.utils import check_random_state

from tqdm import tqdm

# RDKit imports
try:
    from rdkit import Chem
    from rdkit.Chem import Descriptors
    from rdkit.Chem.Scaffolds import MurckoScaffold
    from rdkit.Chem.inchi import MolToInchiKey
except Exception as e:
    raise RuntimeError(
        "RDKit is required. Please install 'rdkit-pypi' and restart the kernel."
    ) from e

# Optional (for SHAP ranking)
_have_shap = False
try:
    import shap

    _have_shap = True
except Exception:
    warnings.warn("SHAP not available; SHAP-based ranking will be skipped.")
    _have_shap = False

# Optional (tree model for SHAP ranking)
_have_xgb = False
try:
    from xgboost import XGBClassifier

    _have_xgb = True
except Exception:
    warnings.warn("XGBoost not available; SHAP ranking will fall back or be skipped.")
    _have_xgb = False

warnings.filterwarnings("ignore")
np.set_printoptions(suppress=True, linewidth=120)
pd.set_option("display.max_columns", 200)



In [3]:
# ------------------------------------------------------------------
# Notebook configuration: every tunable knob lives in this one cell.
# ------------------------------------------------------------------

# Global seed and the metric the workflow optimizes.
RANDOM_STATE = 42
# QSAR-friendly; PR-AUC / ROC-AUC / F1 / Brier are reported alongside it.
PRIMARY_METRIC = "balanced_accuracy"
# Share of compounds reserved for the external scaffold holdout (~20%).
TEST_SIZE_FRACTION = 0.2

# ---- Input data (adjust to your own file and column names) ----
RAW_DATA_PATH = "in_chemico_dataset.xlsx"
SMILES_COL = "SMILES code"
# Binary label column: 1 = toxic, 0 = non-toxic.
TARGET_COL = "Phototoxicity"

# Prefix used for every exported artifact.
PREFIX = "desc_svm"

# Feature selection methods explored by the search; trim the list to
# speed things up, e.g. ["kbest", "rfe"].
USE_METHODS = ["kbest", "lasso", "shap", "rfe"]

# Candidate k values for top-k selection.
TOPK_CANDIDATES = [16, 32, 64, 128, 256]

# Thresholds for the correlation and near-constant column filters.
CORR_THRESH = 0.90
TOP_VALUE_FREQ_THRESH = 0.80

# Multiplier applied to the IQR when clipping outliers.
IQR_CLIP = 3.0

# SVM hyper-parameter ranges, sampled log-uniformly.
C_RANGE = (1e-3, 1e3)
GAMMA_RANGE = (1e-5, 1e1)

N_OUTER_SPLITS = 5
# Random-search iterations per outer fold (lower it for speed).
N_INNER_ITER = 30

# kNN applicability-domain parameters: a compound is flagged as
# out-of-domain when its distance exceeds this quantile of the
# training distances.
AD_K = 5
AD_THRESH_QUANTILE = 0.95

In [None]:
def largest_fragment(smiles: str) -> Optional[Chem.Mol]:
    """Parse *smiles* and return its largest fragment as a sanitized Mol.

    "Largest" means the fragment with the most heavy atoms. Returns None
    for non-string or blank input, for SMILES that RDKit cannot parse, and
    whenever any RDKit call raises.
    """
    if not (isinstance(smiles, str) and smiles.strip()):
        return None
    try:
        parsed = Chem.MolFromSmiles(smiles)
        if parsed is None:
            return None
        fragments = Chem.GetMolFrags(parsed, asMols=True, sanitizeFrags=True)
        if not fragments:
            return None
        # Keep the fragment carrying the most heavy atoms.
        biggest = max(fragments, key=lambda frag: frag.GetNumHeavyAtoms())
        Chem.SanitizeMol(biggest)
    except Exception:
        # Best-effort parsing: any RDKit failure means "no usable molecule".
        return None
    return biggest


DESC_LIST = Descriptors.descList  # list of (name, function)


def compute_descriptors(mol: Chem.Mol) -> Dict[str, float]:
    """Compute every RDKit 2D descriptor listed in ``DESC_LIST`` for *mol*.

    Returns a dict mapping descriptor name to a float. A descriptor is
    stored as NaN when its function raises, when it returns a non-numeric
    value, or when the value is not finite (inf / -inf), so that the
    resulting table contains only finite numbers and NaNs that an imputer
    can handle downstream.
    """
    # Accept numpy scalars as well as built-in numbers.
    numeric_types = (int, float, np.integer, np.floating)
    values = {}
    for name, func in DESC_LIST:
        try:
            raw = func(mol)
            value = float(raw) if isinstance(raw, numeric_types) else np.nan
        except Exception:
            # Best-effort: a single failing descriptor must not abort the row.
            value = np.nan
        # Infinite values would make downstream imputers/scalers raise.
        values[name] = value if np.isfinite(value) else np.nan
    return values


def murcko_scaffold_smiles(mol: Chem.Mol) -> str:
    """Return the SMILES of the Murcko scaffold of *mol*.

    Returns an empty string when no scaffold object is produced or when
    RDKit raises.
    """
    try:
        scaffold = MurckoScaffold.GetScaffoldForMol(mol)
        if scaffold is None:
            return ""
        return Chem.MolToSmiles(scaffold)
    except Exception:
        # Best-effort: a failed scaffold extraction maps to the empty key.
        return ""


def inchi_key(mol: Chem.Mol) -> str:
    """Return the InChIKey of *mol*, or an empty string if RDKit raises."""
    try:
        key = MolToInchiKey(mol)
    except Exception:
        key = ""
    return key

In [5]:
class TopValueFrequencyFilter(BaseEstimator, TransformerMixin):
    """Remove near-constant columns.

    A column is dropped when the relative frequency of its most common
    value (NaN counted as a value) is greater than or equal to
    ``threshold``. The surviving column labels are stored in
    ``keep_cols_`` during ``fit``.
    """

    def __init__(self, threshold=0.80):
        self.threshold = threshold
        self.keep_cols_ = None

    def fit(self, X, y=None):
        """Learn which columns to keep from the training data."""
        frame = pd.DataFrame(X).copy()

        def dominant_share(label):
            # value_counts sorts descending, so the first entry is the mode's share.
            shares = frame[label].value_counts(normalize=True, dropna=False)
            return 1.0 if shares.empty else shares.iloc[0]

        self.keep_cols_ = [
            label for label in frame.columns if dominant_share(label) < self.threshold
        ]
        return self

    def transform(self, X):
        """Return a DataFrame restricted to the columns kept at fit time."""
        frame = pd.DataFrame(X).copy()
        return frame[self.keep_cols_]


class CorrelationFilter(BaseEstimator, TransformerMixin):
    """Prune one member of every highly correlated column pair.

    The absolute correlation matrix is computed on the data passed to
    ``fit``; scanning its strict upper triangle, a column is dropped when
    its |r| with any earlier column exceeds ``threshold``.
    """

    def __init__(self, threshold=0.90):
        self.threshold = threshold
        self.keep_cols_ = None

    def fit(self, X, y=None):
        """Determine the columns that survive the correlation scan."""
        frame = pd.DataFrame(X).copy()
        abs_corr = frame.corr(numeric_only=True).abs()
        # Strict upper triangle: each pair is inspected exactly once.
        above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
        upper = abs_corr.where(above_diagonal)
        redundant = {
            label for label in upper.columns if (upper[label] > self.threshold).any()
        }
        self.keep_cols_ = [label for label in frame.columns if label not in redundant]
        return self

    def transform(self, X):
        """Return a DataFrame restricted to the columns kept at fit time."""
        frame = pd.DataFrame(X).copy()
        return frame[self.keep_cols_]


class IQRClipper(BaseEstimator, TransformerMixin):
    """Clip each feature to [Q1 - k*IQR, Q3 + k*IQR], learned on the fit data.

    Columns whose IQR is zero (or undefined) are left unclipped: with a
    zero IQR both bounds collapse to the same value, which would flatten
    the whole column to a constant and erase its information.
    """

    def __init__(self, k=3.0):
        self.k = k
        self.bounds_ = None  # dict col -> (low, high)

    def fit(self, X, y=None):
        """Compute the per-column clipping bounds."""
        X = pd.DataFrame(X).copy()
        bounds = {}
        for c in X.columns:
            q1 = X[c].quantile(0.25)
            q3 = X[c].quantile(0.75)
            iqr = q3 - q1
            if not np.isfinite(iqr) or iqr <= 0:
                # Degenerate spread: disable clipping for this column.
                bounds[c] = (-np.inf, np.inf)
            else:
                bounds[c] = (q1 - self.k * iqr, q3 + self.k * iqr)
        self.bounds_ = bounds
        return self

    def transform(self, X):
        """Return a copy of X with every column clipped to its fitted bounds."""
        X = pd.DataFrame(X).copy()
        for c, (low, high) in self.bounds_.items():
            X[c] = X[c].clip(lower=low, upper=high)
        return X


class LassoSelector(BaseEstimator, TransformerMixin):
    """Keep features with a non-zero coefficient in an L1 logistic regression.

    If the penalty is strong enough to zero out every coefficient, all
    features are kept (with a warning) instead of returning an empty
    matrix, which would make any downstream estimator fail.
    """

    def __init__(self, C=1.0, max_iter=5000):
        self.C = C
        self.max_iter = max_iter
        self.model_ = None
        self.keep_cols_ = None

    def fit(self, X, y):
        """Fit the L1 model and record the surviving column labels."""
        X = pd.DataFrame(X).copy()
        lr = LogisticRegression(
            penalty="l1",
            solver="liblinear",
            class_weight="balanced",
            C=self.C,
            max_iter=self.max_iter,
            random_state=42,
        )
        lr.fit(X, y)
        mask = np.abs(lr.coef_).sum(axis=0) > 1e-12
        if not mask.any():
            # Nothing survived the penalty: fall back to the full feature set.
            warnings.warn(
                f"LassoSelector(C={self.C}) zeroed every coefficient; "
                "keeping all features."
            )
            mask = np.ones_like(mask, dtype=bool)
        self.keep_cols_ = list(X.columns[mask])
        self.model_ = lr
        return self

    def transform(self, X):
        """Return a DataFrame restricted to the selected columns."""
        X = pd.DataFrame(X).copy()
        return X[self.keep_cols_]


class SHAPSelector(BaseEstimator, TransformerMixin):
    """Rank features by mean |SHAP| from an XGBoost model and keep the top k.

    Constructor arguments are stored untouched (no casting or defaulting in
    ``__init__``) so that ``sklearn.base.clone`` can round-trip the
    estimator; they are validated and resolved in ``fit``.

    Parameters
    ----------
    k : int
        Number of top-ranked features to keep.
    xgb_params : dict or None
        Extra keyword arguments for ``XGBClassifier``; they override the
        built-in defaults used here.
    """

    def __init__(self, k=64, xgb_params=None):
        self.k = k
        self.xgb_params = xgb_params
        self.keep_cols_ = None

    def fit(self, X, y):
        """Fit the tree model, compute SHAP values and record the top-k columns."""
        if not (_have_shap and _have_xgb):
            warnings.warn(
                "SHAPSelector skipped (missing shap/xgboost). Keeping all features."
            )
            self.keep_cols_ = list(pd.DataFrame(X).columns)
            return self
        X = pd.DataFrame(X).copy()
        # Defaults first, user overrides second: avoids duplicate-keyword errors.
        params = {
            "n_estimators": 300,
            "learning_rate": 0.05,
            "subsample": 0.8,
            "colsample_bytree": 0.8,
            "max_depth": 4,
            "random_state": 42,
            "n_jobs": -1,
        }
        params.update(self.xgb_params or {})
        model = XGBClassifier(**params)
        model.fit(X, y)
        explainer = shap.TreeExplainer(model)
        sv = explainer.shap_values(X)
        if isinstance(sv, list):  # binary-class returns list with 2 arrays in old shap
            sv = sv[1]
        shap_mean = np.mean(np.abs(sv), axis=0)
        order = np.argsort(shap_mean)[::-1]
        cols = np.array(X.columns)[order]
        self.keep_cols_ = list(cols[: int(self.k)])
        return self

    def transform(self, X):
        """Return a DataFrame restricted to the selected columns."""
        X = pd.DataFrame(X).copy()
        return X[self.keep_cols_]


class RFESelector(BaseEstimator, TransformerMixin):
    """Recursive feature elimination driven by a class-balanced LinearSVC.

    Keeps ``k`` features, or every feature when fewer than ``k`` are
    available. ``k`` is stored untouched in ``__init__`` (so
    ``sklearn.base.clone`` works with numpy integers) and cast in ``fit``.
    """

    def __init__(self, k=64, max_iter=5000):
        self.k = k
        self.max_iter = max_iter
        self.keep_cols_ = None

    def fit(self, X, y):
        """Run RFE on the training data and record the surviving columns."""
        from sklearn.feature_selection import RFE

        X = pd.DataFrame(X).copy()
        # Never ask RFE for more features than the input provides.
        n_keep = min(int(self.k), X.shape[1])
        base = LinearSVC(
            dual=False, class_weight="balanced", max_iter=self.max_iter, random_state=42
        )
        rfe = RFE(base, n_features_to_select=n_keep, step=0.1)
        rfe.fit(X, y)
        mask = rfe.support_
        self.keep_cols_ = list(X.columns[mask])
        return self

    def transform(self, X):
        """Return a DataFrame restricted to the selected columns."""
        X = pd.DataFrame(X).copy()
        return X[self.keep_cols_]


class GenericSelector(BaseEstimator, TransformerMixin):
    """Unified feature selector dispatching on ``method``.

    Parameters
    ----------
    method : {'kbest', 'lasso', 'shap', 'rfe'}
        Selection strategy (case-insensitive).
    k : int
        Number of features to keep for 'kbest', 'shap' and 'rfe'.
    lasso_C : float
        Inverse regularisation strength used by the 'lasso' strategy.

    Parameters are stored untouched in ``__init__`` so that
    ``sklearn.base.clone`` / ``set_params`` behave correctly; casting and
    validation happen in ``fit``.
    """

    def __init__(self, method="kbest", k=64, lasso_C=1.0):
        self.method = method
        self.k = k
        self.lasso_C = lasso_C
        self.selector_ = None

    def fit(self, X, y):
        """Build the concrete selector for ``method`` and fit it.

        Raises
        ------
        ValueError
            If ``method`` is not one of the supported strategies.
        """
        method = self.method.lower()
        k = int(self.k)
        if method == "kbest":
            # SelectKBest rejects k > n_features on older scikit-learn releases,
            # so cap k at the number of available columns.
            n_features = np.shape(X)[1]
            self.selector_ = SelectKBest(score_func=f_classif, k=min(k, n_features))
        elif method == "lasso":
            self.selector_ = LassoSelector(C=self.lasso_C)
        elif method == "shap":
            self.selector_ = SHAPSelector(k=k)
        elif method == "rfe":
            self.selector_ = RFESelector(k=k)
        else:
            raise ValueError(f"Unknown method: {self.method}")
        self.selector_.fit(X, y)
        return self

    def transform(self, X):
        """Delegate to the fitted concrete selector."""
        return self.selector_.transform(X)

In [6]:
def pick_threshold_max_bal_acc(y_true, p):
    """Scan candidate cut-offs and return the one maximising balanced accuracy.

    Candidates are the unique predicted probabilities plus 0.0 and 1.0; a
    sample is labelled positive when ``p >= cut``. On ties the smallest
    cut-off wins because candidates are visited in ascending order.

    Returns
    -------
    (float, float)
        The chosen threshold and the balanced accuracy it achieves.
    """
    candidates = np.unique(np.concatenate([[0.0], p, [1.0]]))
    winner = (0.5, -1.0)  # (threshold, score) seed values
    for cut in candidates:
        score = balanced_accuracy_score(y_true, (p >= cut).astype(int))
        if score > winner[1]:
            winner = (cut, score)
    chosen_cut, chosen_score = winner
    return float(chosen_cut), float(chosen_score)


def compute_metric_panel(y_true, p, threshold=0.5):
    """Assemble the evaluation metrics for probabilities *p* at *threshold*.

    Hard labels are obtained with ``p >= threshold``. ROC-AUC is reported
    as NaN when ``y_true`` contains a single class. The confusion matrix
    is returned as a nested list with label order [0, 1].
    """
    labels_hat = (p >= threshold).astype(int)

    panel = {}
    panel["balanced_accuracy"] = balanced_accuracy_score(y_true, labels_hat)
    if len(np.unique(y_true)) > 1:
        panel["roc_auc"] = roc_auc_score(y_true, p)
    else:
        panel["roc_auc"] = np.nan
    panel["pr_auc"] = average_precision_score(y_true, p)
    panel["f1"] = f1_score(y_true, labels_hat, zero_division=0)
    panel["brier"] = brier_score_loss(y_true, p)
    panel["confusion_matrix"] = confusion_matrix(
        y_true, labels_hat, labels=[0, 1]
    ).tolist()
    panel["threshold"] = threshold
    return panel


def knn_ad_distance(train_X, test_X, k=5):
    """Applicability-domain score: mean Euclidean distance to the k nearest
    training points (lower = more in-domain).

    The neighbour count is capped at the number of training rows.
    """
    from sklearn.neighbors import NearestNeighbors

    n_neighbors = min(k, len(train_X))
    index = NearestNeighbors(n_neighbors=n_neighbors, metric="euclidean").fit(train_X)
    distances = index.kneighbors(test_X)[0]
    return distances.mean(axis=1)


def choose_ad_threshold(train_dist, quantile=0.95):
    """Return the given quantile of the training AD distances as a float."""
    cutoff = np.quantile(train_dist, quantile)
    return float(cutoff)

In [None]:
# ===============
# External scaffold holdout split
# ===============
# Whole scaffolds are assigned to the test set so that no scaffold is shared
# between train and test. Expects `df`, `X`, `y` and `scaffolds` from an
# earlier cell, all row-aligned.

rng = check_random_state(RANDOM_STATE)

# Use one string representation everywhere so lookups and groups agree.
scaffold_labels = pd.Series(scaffolds).astype(str).to_numpy()
unique_scaffolds = pd.Series(scaffold_labels).unique().tolist()
rng.shuffle(unique_scaffolds)

# Row positions belonging to each scaffold.
scaf_to_idx = {}
for i, s in enumerate(scaffold_labels):
    scaf_to_idx.setdefault(s, []).append(i)

# Greedily add shuffled scaffolds until the test set reaches
# ~TEST_SIZE_FRACTION of the rows (the last scaffold may overshoot).
test_scaffs = []
test_idx = set()
target_test_size = int(math.ceil(TEST_SIZE_FRACTION * len(df)))
for scaf in unique_scaffolds:
    if len(test_idx) >= target_test_size:
        break
    test_scaffs.append(scaf)
    test_idx.update(scaf_to_idx[scaf])

# test_idx holds row *positions*, so build a positional mask rather than
# matching against df.index labels (which may not be 0..n-1).
mask_test = np.zeros(len(df), dtype=bool)
mask_test[sorted(test_idx)] = True
mask_train = ~mask_test

X_train_raw = X.loc[mask_train].reset_index(drop=True)
X_test_raw = X.loc[mask_test].reset_index(drop=True)

# Plain arrays keep later positional indexing (CV fold indices) unambiguous.
y_all = np.asarray(y)
y_train, y_test = y_all[mask_train], y_all[mask_test]
groups_train = scaffold_labels[mask_train]
groups_test = scaffold_labels[mask_test]

print(f"Train: {len(X_train_raw)} | Test (scaffold holdout): {len(X_test_raw)}")

NameError: name 'scaffolds' is not defined

In [None]:
# Preprocessing pipeline: impute -> drop near-constant -> drop correlated ->
# clip outliers -> standardise. Fitted on the training split only.
preproc = Pipeline(
    [
        ("impute", SimpleImputer(strategy="median")),
        ("topval", TopValueFrequencyFilter(threshold=TOP_VALUE_FREQ_THRESH)),
        ("corr", CorrelationFilter(threshold=CORR_THRESH)),
        ("iqr", IQRClipper(k=IQR_CLIP)),
        ("scale", StandardScaler(with_mean=True, with_std=True)),
    ]
)

# Fit on train, apply to both
X_train_p = preproc.fit_transform(X_train_raw, y_train)
X_test_p = preproc.transform(X_test_raw)

# Recover descriptor names after the drops. The imputer returns a bare array,
# so the filters label columns by integer position in the imputer output;
# map those positions back to the original column names.
keep_cols = list(preproc.named_steps["corr"].keep_cols_)
if all(isinstance(c, (int, np.integer)) for c in keep_cols):
    # Columns with no observed value get a NaN statistic and are dropped
    # by the imputer, which shifts positions.
    imputer_kept = ~np.isnan(preproc.named_steps["impute"].statistics_)
    imputed_names = np.asarray(X_train_raw.columns)[imputer_kept]
    keep_cols = [imputed_names[c] for c in keep_cols]

X_train = pd.DataFrame(X_train_p, columns=keep_cols)
X_test = pd.DataFrame(X_test_p, columns=keep_cols)

print(f"After preprocessing: d={X_train.shape[1]}")

In [None]:
# ===============
# Nested CV: outer GroupKFold, inner RandomizedSearch
# ===============
def loguniform(low, high, size=None, rng=None):
    """Draw log-uniformly distributed samples between *low* and *high*.

    *rng* may be None, an int seed or a RandomState instance (anything
    accepted by ``check_random_state``).
    """
    rng = check_random_state(rng)
    return np.exp(rng.uniform(np.log(low), np.log(high), size))


# One shared generator for the whole search: every hyper-parameter gets an
# independent draw and the run is reproducible. (Seeding each draw with the
# same integer would make C, gamma and lasso_C move in lock-step.)
search_rng = check_random_state(RANDOM_STATE)

# Positional indexing below requires a plain array for the labels.
y_train_arr = np.asarray(y_train)

outer_cv = GroupKFold(n_splits=N_OUTER_SPLITS)
results = []
best_pipelines = []

outer_splits = outer_cv.split(X_train, y_train_arr, groups=groups_train)
for fold_id, (train_idx, val_idx) in enumerate(outer_splits, start=1):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train_arr[train_idx], y_train_arr[val_idx]
    g_tr = groups_train[train_idx]

    # Pipeline: selector -> SVM
    pipe = Pipeline(
        [
            ("selector", GenericSelector(method="kbest", k=64, lasso_C=1.0)),
            (
                "svm",
                SVC(
                    kernel="rbf",
                    class_weight="balanced",
                    probability=True,
                    random_state=RANDOM_STATE,
                ),
            ),
        ]
    )

    # Random candidates, one dict per search iteration.
    param_candidates = [
        {
            "selector__method": str(search_rng.choice(USE_METHODS)),
            "selector__k": int(search_rng.choice(TOPK_CANDIDATES)),
            "selector__lasso_C": float(loguniform(1e-3, 1e3, rng=search_rng)),
            "svm__C": float(loguniform(*C_RANGE, rng=search_rng)),
            "svm__gamma": float(loguniform(*GAMMA_RANGE, rng=search_rng)),
        }
        for _ in range(N_INNER_ITER)
    ]

    # Custom random search loop (group-aware)
    best_score = -1.0
    best_est = None
    # GroupKFold needs n_splits <= number of groups; use 2..5 folds.
    inner_cv = GroupKFold(n_splits=max(2, min(5, len(np.unique(g_tr)))))
    for params in tqdm(param_candidates, desc=f"Outer fold {fold_id} inner search"):
        est = clone(pipe).set_params(**params)
        # group-aware CV score
        scores = []
        for it_tr_idx, it_va_idx in inner_cv.split(X_tr, y_tr, groups=g_tr):
            Xt_tr, Xt_va = X_tr.iloc[it_tr_idx], X_tr.iloc[it_va_idx]
            yt_tr, yt_va = y_tr[it_tr_idx], y_tr[it_va_idx]

            est.fit(Xt_tr, yt_tr)
            p_va = est.predict_proba(Xt_va)[:, 1]
            y_hat = (p_va >= 0.5).astype(int)  # default threshold for inner scoring
            scores.append(balanced_accuracy_score(yt_va, y_hat))
        score = float(np.mean(scores))
        if score > best_score:
            best_score = score
            best_est = est

    # Refit best on outer-train (later cells read its fitted selector).
    best_est.fit(X_tr, y_tr)
    # Calibrate (CV=5, isotonic). Note: groups not honored in sklearn's calibrator.
    calibrator = CalibratedClassifierCV(best_est, method="isotonic", cv=5)
    calibrator.fit(X_tr, y_tr)

    # Threshold tuning on outer-train via calibration predictions
    p_tr = calibrator.predict_proba(X_tr)[:, 1]
    t_star, _ = pick_threshold_max_bal_acc(y_tr, p_tr)

    # Evaluate on outer-val
    p_val = calibrator.predict_proba(X_val)[:, 1]
    panel = compute_metric_panel(y_true=y_val, p=p_val, threshold=t_star)
    panel["fold"] = fold_id
    # Read the winning settings from the pipeline itself: the calibrator's
    # attribute holding it was renamed across scikit-learn releases.
    selector_step = best_est.named_steps["selector"]
    svm_step = best_est.named_steps["svm"]
    panel["best_params"] = {
        "selector__method": selector_step.method,
        "selector__k": selector_step.k,
        "selector__lasso_C": selector_step.lasso_C,
        "svm__C": svm_step.C,
        "svm__gamma": svm_step.gamma,
    }
    results.append(panel)
    best_pipelines.append(calibrator)

# Aggregate outer-CV
cv_df = pd.DataFrame(results)
print(
    cv_df[
        ["fold", "balanced_accuracy", "roc_auc", "pr_auc", "f1", "brier", "threshold"]
    ]
)
print(
    "\nCV means:\n",
    cv_df[["balanced_accuracy", "roc_auc", "pr_auc", "f1", "brier"]].mean(),
)

# Save CV results
os.makedirs("artifacts", exist_ok=True)
cv_df.to_csv(f"artifacts/{PREFIX}_outer_cv_metrics.csv", index=False)

In [None]:
# Pick the calibrated model from the outer fold with the highest validation
# balanced accuracy.
best_idx = int(np.argmax(cv_df["balanced_accuracy"].values))
best_model = best_pipelines[best_idx]

# Plain arrays avoid index-alignment surprises in boolean masks / DataFrames.
y_train_arr = np.asarray(y_train)
y_test_arr = np.asarray(y_test)

# Threshold tuning on full train
p_train_full = best_model.predict_proba(X_train)[:, 1]
t_star_full, _ = pick_threshold_max_bal_acc(y_train_arr, p_train_full)

# External test evaluation
p_test = best_model.predict_proba(X_test)[:, 1]
panel_test = compute_metric_panel(y_true=y_test_arr, p=p_test, threshold=t_star_full)
print("\nExternal test panel:", json.dumps(panel_test, indent=2))

# Applicability Domain (AD): compute on scaled, selected features.
# The wrapped pipeline is exposed as `estimator` on newer scikit-learn and as
# `base_estimator` on older releases (where `estimator` does not exist).
fitted_pipe = getattr(best_model, "estimator", None)
if fitted_pipe is None or isinstance(fitted_pipe, str):
    fitted_pipe = best_model.base_estimator
prep_sel = fitted_pipe.named_steps["selector"]
inner_selector = prep_sel.selector_
pre_svm_cols = getattr(inner_selector, "keep_cols_", None)
if pre_svm_cols is None and hasattr(inner_selector, "get_support"):
    # SelectKBest has no keep_cols_; derive the columns from its support mask.
    pre_svm_cols = list(X_train.columns[inner_selector.get_support()])
Xtr_sel = pd.DataFrame(X_train, columns=X_train.columns)
Xte_sel = pd.DataFrame(X_test, columns=X_test.columns)
if pre_svm_cols is not None:
    Xtr_sel = Xtr_sel[pre_svm_cols]
    Xte_sel = Xte_sel[pre_svm_cols]

# Train-to-train distances must not count each point as its own neighbour
# (distance 0), otherwise the threshold is biased low. Calling kneighbors()
# without a query returns neighbours of the indexed points, self excluded.
from sklearn.neighbors import NearestNeighbors

n_ad_neighbors = min(AD_K, len(Xtr_sel) - 1)
train_nn = NearestNeighbors(n_neighbors=n_ad_neighbors, metric="euclidean")
train_nn.fit(Xtr_sel.values)
train_dist = train_nn.kneighbors()[0].mean(axis=1)
test_dist = knn_ad_distance(Xtr_sel.values, Xte_sel.values, k=AD_K)
ad_thresh = choose_ad_threshold(train_dist, quantile=AD_THRESH_QUANTILE)
ad_flag_test = (test_dist > ad_thresh).astype(int)

# Save AD info
os.makedirs("artifacts", exist_ok=True)
ad_df = pd.DataFrame(
    {
        "ad_distance": test_dist,
        "ad_flag": ad_flag_test,
        "y_true": y_test_arr,
        "p": p_test,
    }
)
ad_df.to_csv(f"artifacts/{PREFIX}_test_ad.csv", index=False)

# Reliability diagram
prob_true, prob_pred = [], []
bins = np.linspace(0, 1, 11)
# Clip so that p == 1.0 lands in the last bin instead of falling outside it.
inds = np.clip(np.digitize(p_test, bins) - 1, 0, len(bins) - 2)
for b in range(len(bins) - 1):
    mask = inds == b
    if mask.any():
        prob_true.append(np.mean(y_test_arr[mask]))
        prob_pred.append(np.mean(p_test[mask]))
plt.figure()
plt.plot([0, 1], [0, 1], "--", label="Perfect")
plt.plot(prob_pred, prob_true, marker="o", label="Calibrated SVM")
plt.xlabel("Predicted probability")
plt.ylabel("Empirical frequency")
plt.title("Reliability diagram (external test)")
plt.legend()
plt.tight_layout()
plt.savefig(f"artifacts/{PREFIX}_reliability.png", dpi=150)
plt.close()

# PR & ROC curves
prec, rec, _ = precision_recall_curve(y_test_arr, p_test)
fpr, tpr, _ = roc_curve(y_test_arr, p_test)
plt.figure()
plt.plot(rec, prec)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("PR curve (external test)")
plt.tight_layout()
plt.savefig(f"artifacts/{PREFIX}_pr_curve.png", dpi=150)
plt.close()

plt.figure()
plt.plot(fpr, tpr)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC curve (external test)")
plt.tight_layout()
plt.savefig(f"artifacts/{PREFIX}_roc_curve.png", dpi=150)
plt.close()

# Save metrics & model
with open(f"artifacts/{PREFIX}_test_metrics.json", "w") as f:
    json.dump(panel_test, f, indent=2)

joblib.dump(best_model, f"artifacts/{PREFIX}_best_model.joblib")

# Save processed splits for reproducibility
X_train.to_csv(f"artifacts/{PREFIX}_X_train.csv", index=False)
pd.Series(y_train_arr).to_csv(
    f"artifacts/{PREFIX}_y_train.csv", index=False, header=["y"]
)
X_test.to_csv(f"artifacts/{PREFIX}_X_test.csv", index=False)
pd.Series(y_test_arr).to_csv(
    f"artifacts/{PREFIX}_y_test.csv", index=False, header=["y"]
)

print("Artifacts saved under ./artifacts")

## Notes & Adjustments

- **Columns & paths:** Update `RAW_DATA_PATH`, `SMILES_COL`, `TARGET_COL` in the Config.
- **SHAP ranking:** requires `shap` + `xgboost`. If not installed, the selector gracefully keeps all features or you can drop `"shap"` from `USE_METHODS`.
- **Calibration:** we use isotonic CV=5. `CalibratedClassifierCV` is not group-aware; for strict group calibration, replace with group-based CV & per-fold isotonic fits.
- **Thresholding:** chosen to maximize **Balanced Accuracy** on the (outer) training data; applied to validation/test.
- **Applicability Domain:** simple kNN in descriptor space (post-scaling/selection). Adjust `AD_K` and `AD_THRESH_QUANTILE` as needed.
- **Performance reporting:** See `artifacts/desc_svm_test_metrics.json` (external test) and `artifacts/desc_svm_outer_cv_metrics.csv` (outer-CV per fold). Reliability and curves are saved as PNGs.
- **Reproducibility:** `random_state=42` applied broadly; nested CV uses **scaffold groups**.
