In [None]:
# ============================================================
# 15_build_pair_datasets
# Generate ligand-receptor pairwise datasets of pKi and selectivity (ΔpKi) for all receptor combinations
# ============================================================

import pandas as pd, numpy as np
from itertools import combinations
from pathlib import Path

# NOTE: despite its name, SCRIPT_DIR is the cleaned-data folder one level
# above the current working directory, not the folder holding this notebook.
SCRIPT_DIR = Path("..")/"Cleaned_data"          # ../Cleaned_data
LONG_PATH  = SCRIPT_DIR / "ki_long.parquet"     # ../Cleaned_data/ki_long.parquet

out_dir = SCRIPT_DIR / "pair_datasets"
out_dir.mkdir(exist_ok=True)

df = pd.read_parquet(LONG_PATH)

# log10 is undefined for Ki <= 0 (it yields ±inf / NaN, which would then be
# averaged into the pivot below). Mask such values to NaN so they are ignored.
ki = df["Standard Value"].where(df["Standard Value"] > 0)
# NOTE(review): this is a conventional pKi only if "Standard Value" is in molar
# units; for nM values pKi would be 9 - log10(Ki) — confirm against the source
# table. The constant offset cancels in the ΔpKi column either way.
df["pKi"] = -np.log10(ki)

all_receptors = ["5HT1A","5HT2A","5HT2B","5HT5A","5HT6","5HT7"]

# One wide table (rows = ligands, columns = receptors). Repeated measurements
# of the same ligand/receptor are averaged (pivot_table's default aggregation).
# It does not depend on the pair, so it is built once instead of once per pair.
pivot = df.pivot_table(index="Smiles", columns="Receptor", values="pKi")

for r1, r2 in combinations(all_receptors, 2):
    # keep only ligands measured on both receptors of the pair
    pair = pivot[[r1, r2]].dropna()
    pair[f"Sel_{r1}_vs_{r2}"] = pair[r1] - pair[r2]   # ΔpKi = pKi(r1) - pKi(r2)
    pair.to_csv(out_dir / f"pair_{r1}_{r2}.csv")
    print(f"  {r1}/{r2}: {len(pair)} Saved rows")


  5HT1A/5HT2A: 1166 Saved rows
  5HT1A/5HT2B: 317 Saved rows
  5HT1A/5HT5A: 147 Saved rows
  5HT1A/5HT6: 704 Saved rows
  5HT1A/5HT7: 1403 Saved rows
  5HT2A/5HT2B: 838 Saved rows
  5HT2A/5HT5A: 108 Saved rows
  5HT2A/5HT6: 730 Saved rows
  5HT2A/5HT7: 859 Saved rows
  5HT2B/5HT5A: 134 Saved rows
  5HT2B/5HT6: 238 Saved rows
  5HT2B/5HT7: 302 Saved rows
  5HT5A/5HT6: 104 Saved rows
  5HT5A/5HT7: 147 Saved rows
  5HT6/5HT7: 925 Saved rows


In [None]:
# ============================================================
# meta_sel_all_pairs
# Train LightGBM meta-models (ΔpKi) for all 15 receptor pairs
# and store:  *meta_<R1>_vs_<R2>.pkl*      in  models/sel/
#           meta_metrics_summary.csv     in  models/sel/
# ============================================================

import glob, joblib, numpy as np, pandas as pd
from itertools import combinations
from pathlib import Path
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors, Crippen, rdMolDescriptors as rdMD

# ── adjust paths here ───────────────────────────────────────────────
# All locations are relative to the current working directory.
# SCRIPT_DIR must be a Path (not a str), otherwise `.parent` below raises
# AttributeError.
# NOTE(review): the literal contains a space right after "../" — confirm that
# this matches the real folder name.
SCRIPT_DIR   = Path("../ Selectivity model")        # …/Selectivity model
PROJECT_ROOT = SCRIPT_DIR.parent                      # root

PKI_DIR   = Path("..")/"pKi regression model"/"models"/"pKi"
BIN_DIR   = Path("..")/"Binary activity models"/"models"
PAIR_DIR  = Path("..")/"Cleaned_data"/"pair_datasets"            # ../Cleaned_data/pair_datasets/
HOLD_PATH = Path("..")/"Cleaned_data"/"ligands_for_inference.csv"

# Make sure the directories exist so later globbing / reading does not fail on
# a missing folder; `parents=True` is used for all three for consistency.
PKI_DIR.mkdir(parents=True, exist_ok=True)
BIN_DIR.mkdir(parents=True, exist_ok=True)
PAIR_DIR.mkdir(parents=True, exist_ok=True)

# list of receptors used in C1 / C1_R
RECEPTORS = ["5HT1A", "5HT2A", "5HT2B", "5HT5A", "5HT6", "5HT7"]

# ── helper: full 26-descriptor phys-chem + ECFP-1024 ───────────────
FP_RADIUS = 2
N_BITS = 1024


def physchem(mol):
    """Return the 26 RDKit descriptors of *mol* as a float32 vector.

    The order of the entries is fixed by ``descriptor_fns`` below and must not
    be changed: downstream scalers and models index these columns by position.
    """
    descriptor_fns = (
        Descriptors.MolWt,
        Descriptors.MolLogP,
        Crippen.MolMR,
        Descriptors.NumHAcceptors,
        Descriptors.NumHDonors,
        Descriptors.NumRotatableBonds,
        Descriptors.RingCount,
        Descriptors.FractionCSP3,
        rdMD.CalcTPSA,
        rdMD.CalcNumAliphaticRings,
        rdMD.CalcNumAromaticRings,
        rdMD.CalcNumSaturatedRings,
        rdMD.CalcNumAliphaticCarbocycles,
        rdMD.CalcNumAromaticCarbocycles,
        rdMD.CalcNumSaturatedCarbocycles,
        rdMD.CalcNumAliphaticHeterocycles,
        rdMD.CalcNumAromaticHeterocycles,
        rdMD.CalcNumSaturatedHeterocycles,
        rdMD.CalcNumHeteroatoms,
        rdMD.CalcNumAmideBonds,
        lambda m: float(Chem.GetFormalCharge(m)),   # net formal charge
        rdMD.CalcExactMolWt,
        rdMD.CalcNumAtomStereoCenters,
        rdMD.CalcNumUnspecifiedAtomStereoCenters,
        Descriptors.NumValenceElectrons,
        rdMD.CalcLabuteASA,
    )
    values = [fn(mol) for fn in descriptor_fns]
    return np.array(values, dtype=np.float32)

def featurize(smiles):
    """Build the feature matrix for an iterable of SMILES strings.

    Each output row is the Morgan bit-vector fingerprint (radius ``FP_RADIUS``,
    ``N_BITS`` bits) followed by the ``physchem`` descriptor vector, in the
    same order as the input.

    Raises:
        ValueError: if RDKit cannot parse one of the SMILES strings. The row
            is not skipped silently because the result is expected to stay
            row-aligned with the input.
    """
    fps, desc = [], []
    for smi in smiles:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None; fail
            # here with the offending string instead of deep inside RDKit.
            raise ValueError(f"RDKit could not parse SMILES: {smi!r}")
        fp  = AllChem.GetMorganFingerprintAsBitVect(mol, FP_RADIUS, nBits=N_BITS)
        arr = np.zeros(N_BITS, dtype=np.uint8)
        DataStructs.ConvertToNumpyArray(fp, arr)
        fps.append(arr)
        desc.append(physchem(mol))
    # fingerprint bits first, descriptors last (scalers act on the tail columns)
    return np.hstack([np.vstack(fps), np.vstack(desc)])

# ── main training routine for a single pair ─────────────────────────
def _scale_descriptor_tail(X_all, scaler):
    """Return a copy of *X_all* with its last ``len(scaler.mean_)`` columns scaled."""
    n_scaled = scaler.mean_.shape[0]
    X_tmp = X_all.copy()
    X_tmp[:, -n_scaled:] = scaler.transform(X_tmp[:, -n_scaled:])
    return X_tmp


def train_meta_pair(r1: str, r2: str) -> "dict | None":
    """Train and save the ΔpKi meta-model for the receptor pair *r1* / *r2*.

    The meta-model's inputs are the outputs of the already-trained per-receptor
    models found in ``PKI_DIR`` (``<rec>_pKi`` columns) and ``BIN_DIR``
    (``<rec>_Pact`` columns), evaluated on the ligands of
    ``PAIR_DIR/pair_<r1>_<r2>.csv``. Ligands whose SMILES occur in
    ``HOLD_PATH`` form the hold-out set; all others are used for training.
    The fitted model is written to ``models/sel/meta_<r1>_vs_<r2>.pkl``.

    Returns:
        A dict with the keys ``pair``, ``features``, ``n_train``, ``n_hold``,
        ``R2_train``, ``R2_hold`` and ``MAE_hold``; the hold-out metrics are
        NaN when no hold-out ligand belongs to this pair. ``None`` is returned
        (after printing the reason) when the pair file is missing or empty,
        when no usable feature column remains, or when no training row is left.
    """
    pair_path = PAIR_DIR / f"pair_{r1}_{r2}.csv"
    if not pair_path.exists():
        print(f"pair file missing: {pair_path}")
        return None

    # columns: Smiles, <r1>, <r2>, Sel_<r1>_vs_<r2>
    df_pair = pd.read_csv(pair_path)
    if df_pair.empty:
        print(f" no rows in pair file: {pair_path}")
        return None
    X_all = featurize(df_pair["Smiles"])

    # pKi predictions ------------------------------------------------
    # sorted(): glob order is filesystem-dependent; sorting keeps the feature
    # column order of the saved meta-model reproducible.
    pki_pred = {}
    for mdl_path in sorted(glob.glob(f"{PKI_DIR}/LGBM_pKi_5HT*.pkl")):
        rec    = Path(mdl_path).stem.split("_")[-1]      # receptor tag from file name
        scaler = joblib.load(f"{PKI_DIR}/scaler_pKi_{rec}.pkl")
        model  = joblib.load(mdl_path)
        pki_pred[f"{rec}_pKi"] = model.predict(_scale_descriptor_tail(X_all, scaler))

    # P(active) predictions -----------------------------------------
    pact_pred = {}
    for mdl_path in sorted(glob.glob(f"{BIN_DIR}/LGBM_ECFP_RDKit_cleaned_5HT*.pkl")):
        rec   = Path(mdl_path).stem.split("_")[-1]
        model = joblib.load(mdl_path)
        try:
            scaler = joblib.load(f"{BIN_DIR}/scaler_{rec}.pkl")
        except FileNotFoundError:
            X_tmp = X_all                                # no scaler saved: use raw features
        else:
            X_tmp = _scale_descriptor_tail(X_all, scaler)
        if hasattr(model, "predict_proba"):
            # sklearn-style classifiers return hard labels from predict();
            # the feature is meant to be a probability, so take P(class 1).
            pact_pred[f"{rec}_Pact"] = model.predict_proba(X_tmp)[:, 1]
        else:
            pact_pred[f"{rec}_Pact"] = model.predict(X_tmp)

    # merge + feature matrix ----------------------------------------
    # Predictions are row-aligned with df_pair, so attach them by position;
    # a key-merge on "Smiles" would multiply rows if a SMILES appeared twice.
    merged = pd.concat(
        [df_pair,
         pd.DataFrame(pki_pred,  index=df_pair.index),
         pd.DataFrame(pact_pred, index=df_pair.index)],
        axis=1,
    )

    feat_cols = [c for c in merged.columns if c.endswith(("_pKi", "_Pact"))]
    X_raw = merged[feat_cols].astype(float).dropna(axis=1, how="all")
    X_raw = X_raw.fillna(X_raw.mean())
    X = X_raw.loc[:, X_raw.nunique() > 1]          # drop constant
    y = merged[f"Sel_{r1}_vs_{r2}"]

    if X.shape[1] == 0:
        print(f" no usable features for {r1}/{r2}")
        return None

    # hold-out split ------------------------------------------------
    hold_smiles = pd.read_csv(HOLD_PATH)["Smiles"]
    mask = merged["Smiles"].isin(hold_smiles)
    X_tr, X_te = X[~mask], X[mask]
    y_tr, y_te = y[~mask], y[mask]

    if len(X_tr) == 0:
        print(f" no training rows for {r1}/{r2} after hold-out split")
        return None

    # meta-model ----------------------------------------------------
    meta = LGBMRegressor(
        n_estimators=600, learning_rate=0.05,
        num_leaves=128, min_data_in_leaf=5,
        random_state=42
    ).fit(X_tr, y_tr)

    r2_tr = meta.score(X_tr, y_tr)
    if len(X_te) > 0:
        y_hat  = meta.predict(X_te)                # predict once, reuse for both metrics
        r2_te  = r2_score(y_te, y_hat)
        mae_te = mean_absolute_error(y_te, y_hat)
    else:
        # no hold-out ligand in this pair: metrics are undefined, not an error
        r2_te = mae_te = float("nan")

    # save model
    out_dir = Path("models/sel"); out_dir.mkdir(parents=True, exist_ok=True)
    model_path = out_dir / f"meta_{r1}_vs_{r2}.pkl"
    joblib.dump(meta, model_path)

    print(f"✓ {r1} vs {r2}:  R²_hold={r2_te:.3f},  MAE_hold={mae_te:.3f}")
    return {"pair": f"{r1}/{r2}", "features": X.shape[1],
            "n_train": len(X_tr), "n_hold": len(X_te),
            "R2_train": round(r2_tr,3), "R2_hold": round(r2_te,3),
            "MAE_hold": round(mae_te,3)}

# ── loop over all 15 pairs ─────────────────────────────────────────
summary_rows = []
for r1, r2 in combinations(RECEPTORS, 2):
    res = train_meta_pair(r1, r2)
    if res is not None:          # None means the pair was skipped
        summary_rows.append(res)

# save summary table
summary_path = Path("models/sel") / "meta_metrics_summary.csv"
# train_meta_pair only creates models/sel when it actually saves a model, so
# make sure the folder exists even if every pair was skipped.
summary_path.parent.mkdir(parents=True, exist_ok=True)
summary_df = pd.DataFrame(summary_rows)
summary_df.to_csv(summary_path, index=False)
print("\nSaved summary → models/sel/meta_metrics_summary.csv")
print(summary_df)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000119 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3057
[LightGBM] [Info] Number of data points in the train set: 1117, number of used features: 12
[LightGBM] [Info] Start training from score 0.309649
✓ 5HT1A vs 5HT2A:  R²_hold=0.830,  MAE_hold=0.257
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001107 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1064
[LightGBM] [Info] Number of data points in the train set: 268, number of used features: 12
[LightGBM] [Info] Start training from score 0.017945
✓ 5HT1A vs 5HT2B:  R²_hold=0.856,  MAE_hold=0.292
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000079 seconds.
You can set `force_col_wise=true` to remove the ove