In [None]:

"""
compute_Ki_and_selectivity.py
Predict Ki, P(active), and selectivity (ΔpKi) for new ligands
using pre-trained 5HT receptor regression and classification models.
"""

import pandas as pd
import numpy as np
import joblib
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors, Crippen, rdMolDescriptors as rdMD
from itertools import combinations
from pathlib import Path

# === Configuration ===

# Directory that holds this script (repo_root/Prediction for new SMILES/).
# `__file__` is not defined when the code is executed as a notebook cell or
# pasted into an interactive session, so fall back to the current working
# directory there instead of failing with NameError.
try:
    BASE_DIR = Path(__file__).resolve().parent
except NameError:
    BASE_DIR = Path.cwd()

INPUT_CSV = BASE_DIR / "5ht2c.csv"                             # or replace with your own file

# Model directories live next to BASE_DIR, one folder per model family.
PKI_DIR   = BASE_DIR.parent / "pKi regression model"   / "models" / "pKi"
BIN_DIR   = BASE_DIR.parent / "Binary activity models" / "models"
META_DIR  = BASE_DIR.parent / "Selectivity model"      / "models" / "sel"
META_DIR.mkdir(parents=True, exist_ok=True)                     # ensure it exists

# Morgan fingerprint settings passed to RDKit in featurize().
FP_RADIUS = 2
N_BITS     = 1024

# === Descriptor and fingerprint functions ===
def physchem(mol: Chem.Mol) -> np.ndarray:
    """Return the physicochemical descriptor vector of a molecule.

    Each calculator below is applied to `mol` in the listed order and the
    results are packed into a 1-D float32 array with one entry per
    calculator. The order is significant: featurize() appends this vector
    after the fingerprint bits, so it must not be changed.

    Args:
        mol: An RDKit molecule.

    Returns:
        1-D ``np.float32`` array of descriptor values.
    """
    calculators = (
        # Bulk properties
        Descriptors.MolWt,
        Descriptors.MolLogP,
        Crippen.MolMR,
        # H-bonding, flexibility and ring counts
        Descriptors.NumHAcceptors,
        Descriptors.NumHDonors,
        Descriptors.NumRotatableBonds,
        Descriptors.RingCount,
        Descriptors.FractionCSP3,
        rdMD.CalcTPSA,
        # Ring-type breakdown
        rdMD.CalcNumAliphaticRings,
        rdMD.CalcNumAromaticRings,
        rdMD.CalcNumSaturatedRings,
        rdMD.CalcNumAliphaticCarbocycles,
        rdMD.CalcNumAromaticCarbocycles,
        rdMD.CalcNumSaturatedCarbocycles,
        rdMD.CalcNumAliphaticHeterocycles,
        rdMD.CalcNumAromaticHeterocycles,
        rdMD.CalcNumSaturatedHeterocycles,
        # Composition, charge and stereochemistry
        rdMD.CalcNumHeteroatoms,
        rdMD.CalcNumAmideBonds,
        lambda m: float(Chem.GetFormalCharge(m)),
        rdMD.CalcExactMolWt,
        rdMD.CalcNumAtomStereoCenters,
        rdMD.CalcNumUnspecifiedAtomStereoCenters,
        Descriptors.NumValenceElectrons,
        rdMD.CalcLabuteASA,
    )
    values = [calc(mol) for calc in calculators]
    return np.array(values, dtype=np.float32)

def featurize(smiles_list: list) -> np.ndarray:
    """Build the model input matrix for a collection of SMILES strings.

    Each parsable SMILES yields one row: the N_BITS Morgan fingerprint bits
    (radius FP_RADIUS) followed by the physchem() descriptor vector.

    Entries that are not strings (e.g. NaN from an empty CSV cell) or that
    RDKit cannot parse are skipped, so the result may have FEWER rows than
    `smiles_list`. A warning is emitted when that happens; callers that need
    row-to-SMILES alignment must filter their input beforehand.

    Args:
        smiles_list: SMILES strings to featurize.

    Returns:
        2-D array with one row per successfully parsed SMILES.

    Raises:
        ValueError: If no entry of `smiles_list` could be parsed.
    """
    import warnings

    fps, descs = [], []
    n_total = 0
    for smi in smiles_list:
        n_total += 1
        # MolFromSmiles returns None for an unparsable string; non-string
        # input is treated the same way instead of raising inside RDKit.
        mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
        if mol is None:
            continue
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, FP_RADIUS, N_BITS)
        arr = np.zeros(N_BITS, dtype=np.uint8)
        DataStructs.ConvertToNumpyArray(fp, arr)
        fps.append(arr)
        descs.append(physchem(mol))

    if not fps:
        # np.vstack([]) would raise an opaque "need at least one array" error.
        raise ValueError('featurize: no valid SMILES to featurize')

    n_skipped = n_total - len(fps)
    if n_skipped:
        warnings.warn(
            f'featurize: skipped {n_skipped} of {n_total} entries that are '
            'missing or could not be parsed as SMILES; output rows no longer '
            'align one-to-one with the input',
            stacklevel=2,
        )
    return np.hstack([np.vstack(fps), np.vstack(descs)])

# === Main Execution ===
if __name__ == '__main__':
    # Pipeline: load SMILES -> featurize -> per-receptor Ki, P(active) and
    # pairwise selectivity tables, each written as a CSV in the current
    # working directory.

    # Load new SMILES (semicolon-separated file with a 'Smiles' column).
    df_in = pd.read_csv(INPUT_CSV, sep=';', usecols=['Smiles'])
    raw_smiles = df_in['Smiles'].tolist()

    # featurize() drops entries RDKit cannot parse, which would leave X with
    # fewer rows than the output tables. Filter up front so that row i of X
    # always corresponds to smiles[i].
    smiles = [
        smi for smi in raw_smiles
        if isinstance(smi, str) and Chem.MolFromSmiles(smi) is not None
    ]
    n_dropped = len(raw_smiles) - len(smiles)
    if n_dropped:
        print(f'WARNING: skipped {n_dropped} missing or unparsable SMILES')
    if not smiles:
        raise SystemExit(f'No valid SMILES found in {INPUT_CSV}')

    # Compute features
    X = featurize(smiles)
    if X.shape[0] != len(smiles):
        raise RuntimeError(
            f'featurize returned {X.shape[0]} rows for {len(smiles)} SMILES'
        )

    # NOTE: joblib.load unpickles arbitrary objects; only point PKI_DIR and
    # BIN_DIR at model files you trust.

    # 1. Ki predictions only
    ki_out = pd.DataFrame({'Smiles': smiles})
    pki_pred = {}  # receptor -> predicted pKi array, reused for selectivity
    for mdl in sorted(PKI_DIR.glob('LGBM_pKi_*.pkl')):
        rec = mdl.stem.split('_')[-1].upper()
        if rec in pki_pred:
            print(f'WARNING: multiple pKi models map to {rec}; using {mdl.name}')
        model  = joblib.load(mdl)
        scaler = joblib.load(PKI_DIR / f'scaler_pKi_{rec}.pkl')
        X_tmp  = X.copy()
        # The scaler covers only the trailing descriptor columns; the
        # fingerprint bits in front of them are left untouched.
        n_desc = scaler.mean_.shape[0]
        X_tmp[:, -n_desc:] = scaler.transform(X_tmp[:, -n_desc:])
        pki_vals = np.asarray(model.predict(X_tmp), dtype=float)
        pki_pred[rec] = pki_vals
        # Ki = 10**(-pKi), i.e. mol/L if pKi = -log10(Ki / M) (confirm against
        # the training data). Values are of order 1e-5..1e-10, so they must
        # not be rounded to two decimals (that turns every value into 0.0).
        ki_out[f'{rec}_Ki_pred'] = 10.0 ** (-pki_vals)
    if not pki_pred:
        print(f"WARNING: No pKi models found in {PKI_DIR}")
    # Three significant figures keep the file compact without zeroing values.
    ki_out.to_csv('new_predicted_Ki.csv', index=False, float_format='%.3g')
    print('Saved Ki predictions to new_predicted_Ki.csv')

    # 2. P(active) predictions
    pact_out = pd.DataFrame({'Smiles': smiles})
    model_files = sorted(BIN_DIR.glob('LGBM_ECFP_RDKit_*.pkl'))
    if not model_files:
        print(f"WARNING: No P(active) models found in {BIN_DIR}, skipping")
    else:
        for model_path in model_files:
            rec = model_path.stem.split('_')[-1]
            print(f'Processing receptor {rec}')
            col = f'{rec}_Pactive'
            if col in pact_out.columns:
                # Two model files share the same trailing receptor tag; the
                # later one replaces the earlier column.
                print(f'WARNING: overwriting {col} with {model_path.name}')
            model = joblib.load(model_path)

            # Descriptor scaling is optional for the classifiers.
            scaler_path = BIN_DIR / f'scaler_{rec}.pkl'
            X_tmp = X.copy()
            if scaler_path.exists():
                scaler = joblib.load(scaler_path)
                n_desc = scaler.mean_.shape[0]
                X_tmp[:, -n_desc:] = scaler.transform(X_tmp[:, -n_desc:])

            if hasattr(model, 'predict_proba'):
                probs = model.predict_proba(X_tmp)[:, 1]
            else:
                probs = model.predict(X_tmp)
            pact_out[col] = np.round(probs, 2)
    pact_out.to_csv('new_predicted_Pactive.csv', index=False)
    print('Saved P_active predictions to new_predicted_Pactive.csv')

    # 3. Selectivity ΔpKi
    # Differences are taken between predicted pKi values (not between Ki
    # values), so the columns really are ΔpKi = pKi(r1) - pKi(r2).
    sel_out = pd.DataFrame({'Smiles': smiles})
    for r1, r2 in combinations(pki_pred, 2):
        sel_out[f'Delta_pKi_{r1}_vs_{r2}'] = np.round(
            pki_pred[r1] - pki_pred[r2], 2
        )
    sel_out.to_csv('new_predicted_selectivity.csv', index=False)
    print('Saved selectivity matrix to new_predicted_selectivity.csv')


Saved Ki predictions to new_predicted_Ki.csv
Processing receptor 5HT1A
Processing receptor 5HT2A
Processing receptor 5HT2B
Processing receptor 5HT5A
Processing receptor 5HT6
Processing receptor 5HT7
Processing receptor 5HT5A
Saved P_active predictions to new_predicted_Pactive.csv
Saved selectivity matrix to new_predicted_selectivity.csv
