In [None]:
# ============================================================
#  Chemical space mapping via UMAP and separability metrics
#  (red = inactive, blue = active, opaque dots)
#  ─ Saves one UMAP PNG map per receptor and chem_space_metrics.csv
#  ─ Input: cleaned_5HT*.csv
# ============================================================

import os, glob, warnings
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.auto import tqdm
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors, Crippen, rdMolDescriptors

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, silhouette_score
from sklearn.neighbors import KNeighborsClassifier
from umap import UMAP

# Suppress every Python warning for the rest of the session; comment this out
# when debugging, since it also hides genuine problems.
warnings.filterwarnings("ignore")
sns.set_context("talk")

# Constants
try:
    # Script execution: two levels above this file (repo_root).
    _BASE_DIR = Path(__file__).resolve().parents[1]
except NameError:
    # Notebooks / interactive sessions do not define __file__.
    # NOTE(review): assumes the working directory is the folder this
    # notebook lives in, so its parent is repo_root - confirm.
    _BASE_DIR = Path.cwd().resolve().parent

CSV_DIR = _BASE_DIR / "Cleaned_data"      # repo_root / Cleaned_data
SAVE_FIGS = Path("chem_space_figs")       # relative to the working directory
SAVE_FIGS.mkdir(exist_ok=True)

# ACTIVE_THR: Ki < 1000 nM counts as active.
# FP_R / N_BITS: Morgan fingerprint radius and bit-vector length.
ACTIVE_THR, FP_R, N_BITS = 1000, 2, 1024

# Phys-chem descriptors (26) & ECFP4
def physchem(mol):
    """Evaluate 26 RDKit descriptors on *mol*.

    mol : RDKit molecule passed unchanged to every descriptor function.

    Returns a 1-D float32 ndarray whose entries follow the order of
    ``calculators`` below.
    """
    # Order matters: it fixes the column layout of the descriptor matrix.
    calculators = (
        Descriptors.MolWt,
        Descriptors.MolLogP,
        Crippen.MolMR,
        Descriptors.NumHAcceptors,
        Descriptors.NumHDonors,
        Descriptors.NumRotatableBonds,
        Descriptors.RingCount,
        Descriptors.FractionCSP3,
        rdMolDescriptors.CalcTPSA,
        rdMolDescriptors.CalcNumAliphaticRings,
        rdMolDescriptors.CalcNumAromaticRings,
        rdMolDescriptors.CalcNumSaturatedRings,
        rdMolDescriptors.CalcNumAliphaticCarbocycles,
        rdMolDescriptors.CalcNumAromaticCarbocycles,
        rdMolDescriptors.CalcNumSaturatedCarbocycles,
        rdMolDescriptors.CalcNumAliphaticHeterocycles,
        rdMolDescriptors.CalcNumAromaticHeterocycles,
        rdMolDescriptors.CalcNumSaturatedHeterocycles,
        rdMolDescriptors.CalcNumHeteroatoms,
        rdMolDescriptors.CalcNumAmideBonds,
        Chem.GetFormalCharge,  # integer; cast to float32 with the rest below
        rdMolDescriptors.CalcExactMolWt,
        rdMolDescriptors.CalcNumAtomStereoCenters,
        rdMolDescriptors.CalcNumUnspecifiedAtomStereoCenters,
        Descriptors.NumValenceElectrons,
        rdMolDescriptors.CalcLabuteASA,
    )
    values = [calc(mol) for calc in calculators]
    return np.array(values, dtype=np.float32)

def ecfp_arr(mol, radius=FP_R, n_bits=N_BITS):
    """Return the Morgan (ECFP) bit-vector fingerprint of *mol* as an int8 array.

    mol    : RDKit molecule to fingerprint
    radius : Morgan radius handed to RDKit
    n_bits : length of the bit vector
    """
    bit_vect = AllChem.GetMorganFingerprintAsBitVect(mol, radius, n_bits)
    # ConvertToNumpyArray fills a pre-allocated array in place.
    out = np.zeros(n_bits, dtype=np.int8)
    DataStructs.ConvertToNumpyArray(bit_vect, out)
    return out

# UMAP plot helper
def save_umap_png(emb, y, receptor, auc, silh, path_png):
    """
    Draw the 2-D embedding coloured by activity class and write it to disk.

    emb      : ndarray (n, 2) – UMAP coordinates
    y        : 0/1            – 1 = active (blue), 0 = inactive (red)
    receptor : label shown in the plot title
    auc      : value printed as "AUC" in the title (2 decimals)
    silh     : value printed as "Silh" in the title (2 decimals)
    path_png : destination of the PNG file (written at 300 dpi)
    """
    active_mask = y == 1
    inactive_mask = y == 0
    n_act = int(active_mask.sum())
    n_inact = int(inactive_mask.sum())

    fig, ax = plt.subplots(figsize=(6, 5))

    # (mask, colour, legend label, zorder) – inactives get the higher zorder,
    # so they are drawn on top of the actives.
    layers = (
        (active_mask, "blue", f"Active ({n_act})", 2),
        (inactive_mask, "red", f"Inactive ({n_inact})", 3),
    )
    for mask, colour, label, z in layers:
        ax.scatter(*emb[mask].T, c=colour, s=28, edgecolors="k",
                   linewidths=0.15, label=label, zorder=z)

    # UMAP axes carry no units, so tick marks are removed.
    ax.set_xticks([])
    ax.set_yticks([])
    title = f"{receptor}   AUC={auc:.2f}   Silh={silh:.2f}"
    ax.set_title(title, fontsize=11, pad=10)

    ax.grid(which="major", linestyle="--", alpha=0.25)
    ax.minorticks_on()
    ax.grid(which="minor", linestyle=":",  alpha=0.12)

    for border in ax.spines.values():
        border.set_edgecolor("k")
        border.set_linewidth(0.8)

    legend = ax.legend(loc="lower left", fontsize=9, frameon=True, borderpad=0.8)
    frame = legend.get_frame()
    frame.set_alpha(0.3)
    frame.set_facecolor("white")

    fig.tight_layout()
    fig.savefig(path_png, dpi=300)
    # Close explicitly so figures do not accumulate across the batch loop.
    plt.close(fig)

#  Main loop over CSV files
records = []   # one dict of summary metrics per successfully processed receptor file

for csv_path in tqdm(sorted(CSV_DIR.glob("cleaned_5HT*.csv")), desc="C3-batch"):
    # Second "_"-separated token of the file stem, e.g. "cleaned_5HT1A" -> "5HT1A".
    receptor = csv_path.stem.split("_")[1]

    df = (pd.read_csv(csv_path)
            .dropna(subset=["Smiles", "Standard Value"]))

    # Parse every SMILES up front; MolFromSmiles returns None when parsing fails.
    mols = [Chem.MolFromSmiles(smi) for smi in df["Smiles"]]
    parsed = np.array([mol is not None for mol in mols], dtype=bool)

    if not parsed.any():   # skip empty files after filtering
        print(f"  Skipping {csv_path.name}: no valid molecules.")
        continue

    # Drop unparsable rows BEFORE deriving the labels. Otherwise y keeps one
    # entry per CSV row while the feature matrix only has one row per parsed
    # molecule, and labels no longer line up with (or match the length of)
    # the embedding.
    df = df.loc[parsed].copy()
    mols = [mol for mol in mols if mol is not None]
    df["Active"] = (df["Standard Value"] < ACTIVE_THR).astype(int)
    y = df["Active"].values

    # ROC-AUC and silhouette both need two classes to be defined.
    if np.unique(y).size < 2:
        print(f"  Skipping {csv_path.name}: only one activity class present.")
        continue

    # Feature matrix: raw fingerprint bits next to z-scored descriptors.
    fp_mat = np.vstack([ecfp_arr(mol) for mol in mols]).astype(np.uint8)
    desc_mat = np.vstack([physchem(mol) for mol in mols]).astype(np.float32)
    X = np.hstack([fp_mat, StandardScaler().fit_transform(desc_mat)])

    emb = UMAP(n_neighbors=15, min_dist=0.1, random_state=42).fit_transform(X)

    # --- Quality metrics -------------------------------------------
    # NOTE: the k-NN classifier is fitted and scored on the same embedding,
    # so this is an in-sample AUC (a descriptive separability score, not an
    # estimate of predictive performance).
    knn = KNeighborsClassifier(15).fit(emb, y)
    knn_auc = roc_auc_score(y, knn.predict_proba(emb)[:, 1])
    silh    = silhouette_score(emb, y)

    png_path = SAVE_FIGS / f"UMAP_{receptor}.png"
    save_umap_png(emb, y, receptor, knn_auc, silh, png_path)
    print("  Saved:", png_path)

    records.append(dict(Receptor=receptor,
                        n=len(y),
                        Actives=int(y.sum()),
                        AUC_UMAP=round(knn_auc, 3),
                        Silhouette=round(silh, 3)))

#  Final report
# Naming the columns explicitly keeps this working when `records` is empty
# (no input files, or every file skipped): without them the frame has no
# "Receptor" column and set_index raises KeyError.
chem_df = (pd.DataFrame(records,
                        columns=["Receptor", "n", "Actives",
                                 "AUC_UMAP", "Silhouette"])
             .set_index("Receptor")
             .sort_index())

chem_df.to_csv("chem_space_metrics.csv")
display(chem_df)   # IPython/Jupyter rich display of the summary table
print(" chem_space_metrics.csv saved")


C3-batch:   0%|          | 0/6 [00:00<?, ?it/s]

  Saved: chem_space_figs/UMAP_5HT1A.png
  Saved: chem_space_figs/UMAP_5HT2A.png
  Saved: chem_space_figs/UMAP_5HT2B.png
  Saved: chem_space_figs/UMAP_5HT5A.png
  Saved: chem_space_figs/UMAP_5HT6.png
  Saved: chem_space_figs/UMAP_5HT7.png


Unnamed: 0_level_0,n,Actives,AUC_UMAP,Silhouette
Receptor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5HT1A,4342,3831,0.927,-0.079
5HT2A,3861,3205,0.883,-0.035
5HT2B,1394,1094,0.815,-0.01
5HT5A,338,219,0.86,-0.008
5HT6,3596,3136,0.941,-0.03
5HT7,2444,2039,0.902,-0.02


 chem_space_metrics.csv saved
