In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors, Crippen, rdMolDescriptors as rdMD
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from umap import UMAP
from pathlib import Path
import glob

# pathname
from pathlib import Path

# Resolve the working folder. `__file__` only exists when this code runs as a
# script/module; inside a Jupyter/IPython cell it is undefined and would raise
# NameError, so fall back to the current working directory in that case.
try:
    SCRIPT_DIR = Path(__file__).resolve().parent  # folder with this script
except NameError:
    SCRIPT_DIR = Path.cwd()                       # notebook / interactive run
PAIR_DIR = SCRIPT_DIR / "pair_datasets"           # ./pair_datasets/ (input CSVs)
OUT_DIR = SCRIPT_DIR / "umap_plots"               # ./umap_plots/   (output PNGs)
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Feature settings for the Morgan fingerprint computed in featurize().
FP_RADIUS = 2    # radius passed to the Morgan fingerprint call
N_BITS = 1024    # length of the fingerprint bit vector

def physchem(mol):
    """Return a float32 vector of 26 RDKit descriptors computed for *mol*.

    The calculators are applied in the order listed below, so the position of
    each value in the returned 1-D array follows that order.
    """
    calculators = (
        # size / lipophilicity / refractivity
        Descriptors.MolWt,
        Descriptors.MolLogP,
        Crippen.MolMR,
        # H-bonding, flexibility, ring count, sp3 fraction, polar surface
        Descriptors.NumHAcceptors,
        Descriptors.NumHDonors,
        Descriptors.NumRotatableBonds,
        Descriptors.RingCount,
        Descriptors.FractionCSP3,
        rdMD.CalcTPSA,
        # ring-type breakdown
        rdMD.CalcNumAliphaticRings,
        rdMD.CalcNumAromaticRings,
        rdMD.CalcNumSaturatedRings,
        rdMD.CalcNumAliphaticCarbocycles,
        rdMD.CalcNumAromaticCarbocycles,
        rdMD.CalcNumSaturatedCarbocycles,
        rdMD.CalcNumAliphaticHeterocycles,
        rdMD.CalcNumAromaticHeterocycles,
        rdMD.CalcNumSaturatedHeterocycles,
        # composition, charge, mass, stereo, electrons, surface area
        rdMD.CalcNumHeteroatoms,
        rdMD.CalcNumAmideBonds,
        lambda m: float(Chem.GetFormalCharge(m)),
        rdMD.CalcExactMolWt,
        rdMD.CalcNumAtomStereoCenters,
        rdMD.CalcNumUnspecifiedAtomStereoCenters,
        Descriptors.NumValenceElectrons,
        rdMD.CalcLabuteASA,
    )
    values = [calc(mol) for calc in calculators]
    return np.array(values, dtype=np.float32)

def featurize(smiles_list, return_mask=False):
    """Build a feature matrix (Morgan fingerprint bits + physchem descriptors).

    Each parsable SMILES contributes one row: ``N_BITS`` fingerprint bits
    followed by the descriptor vector returned by ``physchem``.

    Entries that are not strings (e.g. NaN from a CSV) or that RDKit cannot
    parse are skipped, so the result can have fewer rows than the input.
    Callers that pair rows with labels must therefore either pre-filter their
    input or pass ``return_mask=True`` and apply the mask to the labels.

    Args:
        smiles_list: Iterable of SMILES strings.
        return_mask: If True, also return a boolean array with one entry per
            input item, True where the item produced a row.

    Returns:
        The feature matrix, or ``(matrix, mask)`` when ``return_mask`` is True.

    Raises:
        ValueError: If no input entry could be parsed (including empty input).
    """
    fps, desc, mask = [], [], []
    for smi in smiles_list:
        # Non-string entries cannot be handed to RDKit; treat them as invalid.
        mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
        mask.append(mol is not None)
        if mol is None:
            continue
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, FP_RADIUS, nBits=N_BITS)
        arr = np.zeros(N_BITS, dtype=np.uint8)
        DataStructs.ConvertToNumpyArray(fp, arr)
        fps.append(arr)
        desc.append(physchem(mol))

    if not fps:
        # np.vstack([]) would fail with an unhelpful message; be explicit.
        raise ValueError("featurize: no valid SMILES to featurize")

    features = np.hstack([np.vstack(fps), np.vstack(desc)])
    if return_mask:
        return features, np.array(mask, dtype=bool)
    return features

# Every per-pair table under PAIR_DIR, processed in sorted (stable) order.
pairs = sorted(PAIR_DIR.glob("pair_*.csv"))


def _is_parsable(smi):
    """Return True if *smi* is a string that RDKit can parse into a molecule."""
    return isinstance(smi, str) and Chem.MolFromSmiles(smi) is not None


for pair_fp in pairs:
    df = pd.read_csv(pair_fp)

    # featurize() skips SMILES that RDKit cannot parse. Drop those rows here
    # first so that the feature matrix X and the labels y stay row-aligned.
    # (This parses each SMILES a second time; correctness over speed.)
    keep = df["Smiles"].map(_is_parsable).astype(bool)
    n_dropped = int((~keep).sum())
    if n_dropped:
        print(f"{pair_fp.stem}: skipped {n_dropped} unparsable SMILES")
    df = df.loc[keep]
    if df.empty:
        print(f"{pair_fp.stem}: no valid molecules, skipping")
        continue

    X = featurize(df["Smiles"])
    # |ΔpKi| > 1 → 1, other 0  (the last CSV column is used as the difference)
    y = (df.iloc[:, -1].abs() > 1).astype(int).to_numpy()

    # Standardise features, then embed into 2-D with a fixed seed.
    X_scaled = StandardScaler().fit_transform(X)
    umap_2d = UMAP(n_neighbors=15, min_dist=0.1, random_state=42)
    X_umap = umap_2d.fit_transform(X_scaled)

    # Both metrics are undefined when only one class is present; report NaN
    # instead of crashing so the remaining pairs are still processed.
    if np.unique(y).size == 2:
        sil = silhouette_score(X_umap, y)
        # NOTE(review): the k-NN is scored on the same points it was fitted
        # on, so this AUC is optimistic — consider cross-validated scores.
        knn = KNeighborsClassifier(n_neighbors=5).fit(X_umap, y)
        roc = roc_auc_score(y, knn.predict_proba(X_umap)[:, 1])
    else:
        sil = roc = float("nan")

    fig, ax = plt.subplots(figsize=(6, 6))
    for label_value, colour, label_name in (
        (0, 'red', 'Non-selective'),
        (1, 'blue', 'Selective'),
    ):
        members = y == label_value
        ax.scatter(
            X_umap[members, 0], X_umap[members, 1],
            c=colour, edgecolors='white', linewidths=0.15,
            alpha=0.7, label=label_name
        )
    ax.set_title(f"{pair_fp.stem} | Sil={sil:.2f}, AUC={roc:.2f}")
    # Embedding axes carry no meaningful units, so hide the ticks.
    ax.set_xticks([])
    ax.set_yticks([])
    ax.legend(frameon=True)
    fig.tight_layout()

    # Save
    out_path = OUT_DIR / f"{pair_fp.stem}_umap.png"
    fig.savefig(out_path, dpi=300)
    plt.close(fig)
    print(f"Saved {out_path}")


  warn(


Saved umap_plots/pair_5HT1A_5HT2A_umap.png


  warn(


Saved umap_plots/pair_5HT1A_5HT2B_umap.png


  warn(


Saved umap_plots/pair_5HT1A_5HT5A_umap.png


  warn(


Saved umap_plots/pair_5HT1A_5HT6_umap.png


  warn(


Saved umap_plots/pair_5HT1A_5HT7_umap.png


  warn(


Saved umap_plots/pair_5HT2A_5HT2B_umap.png


  warn(


Saved umap_plots/pair_5HT2A_5HT5A_umap.png


  warn(


Saved umap_plots/pair_5HT2A_5HT6_umap.png


  warn(


Saved umap_plots/pair_5HT2A_5HT7_umap.png


  warn(


Saved umap_plots/pair_5HT2B_5HT5A_umap.png


  warn(


Saved umap_plots/pair_5HT2B_5HT6_umap.png


  warn(


Saved umap_plots/pair_5HT2B_5HT7_umap.png


  warn(


Saved umap_plots/pair_5HT5A_5HT6_umap.png


  warn(


Saved umap_plots/pair_5HT5A_5HT7_umap.png


  warn(


Saved umap_plots/pair_5HT6_5HT7_umap.png
