In [None]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs 
from scipy.sparse import csr_matrix
import umap.umap_ as umap
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt


## UMAP

Combines molecular data from two datasets (Papyrus and TB), generates molecular fingerprints, and applies UMAP for dimensionality reduction using a Jaccard distance metric. The resulting 2D projection visualizes chemical similarity between molecules from both datasets.

In [None]:

# CONFIG
n_total = 100000         # upper bound on the number of molecules kept for the embedding
nBits, radius = 1024, 2  # fingerprint length / Morgan radius
perplexity = 100         # NOTE(review): not referenced by the code in this cell
seed = 42                # single seed shared by every sampling/shuffling step below

# Load both sources and tag every row with its origin.
df_pap = pd.read_csv("Papyrus_Barlow.csv")
df_pap["dataset"] = "Papyrus"
df_tb = pd.read_csv("TB_BARLOW.csv")
df_tb["dataset"] = "tb"

# Concatenate first so both subsets share the same (aligned) set of columns,
# then split them back out by their dataset tag.
df = pd.concat([df_pap, df_tb], ignore_index=True)
dataset_tag = df["dataset"].str.lower()
df_tb = df[dataset_tag == "tb"].copy()
df_pap = df[dataset_tag == "papyrus"].copy()

# Keep every TB molecule when possible and fill the remaining budget with Papyrus.
n_tb = len(df_tb)
if n_tb >= n_total:
    # TB alone fills the budget. `df_sample` must be assigned here as well:
    # the shuffle and the summary print below both read it.
    df_sample = df_tb.sample(n_total, random_state=seed).reset_index(drop=True)
else:
    n_pap_need = min(len(df_pap), n_total - n_tb)
    df_sample = pd.concat(
        [df_tb, df_pap.sample(n_pap_need, random_state=seed)],
        ignore_index=True,
    )

# Shuffle so the two datasets are interleaved rather than stacked.
df = df_sample.sample(frac=1.0, random_state=seed).reset_index(drop=True)

print(f"Select: {len(df_sample)} | TB={sum(df_sample['dataset'].str.lower()=='tb')}, "
      f"Papyrus={sum(df_sample['dataset'].str.lower()=='papyrus')}")



In [None]:

# Parse every SMILES once; rows RDKit cannot parse are dropped so that
# `df` and `mols` stay aligned row-for-row.
parsed = [Chem.MolFromSmiles(smi) for smi in df['smiles']]
parsed_ok = [mol is not None for mol in parsed]
df = df[parsed_ok].reset_index(drop=True)
mols = [mol for mol in parsed if mol is not None]

# NOTE: these values override the ones set in the config cell.
nBits = 2048
radius = 2

# Collect the on-bit indices per molecule, then flatten them into the
# (row, col, value) triplets used to build the sparse fingerprint matrix.
on_bits_per_mol = [
    list(AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits).GetOnBits())
    for mol in mols
]
rows = [row for row, bits in enumerate(on_bits_per_mol) for _ in bits]
cols = [bit for bits in on_bits_per_mol for bit in bits]
data = [1] * len(cols)
X = csr_matrix((data, (rows, cols)), shape=(len(mols), nBits))

# 2D UMAP projection using the Jaccard distance between fingerprints.
umap_kwargs = dict(
    n_neighbors=50,
    min_dist=0.1,
    metric="jaccard",
    low_memory=True,
    random_state=42,
    verbose=True,
)
um = umap.UMAP(**umap_kwargs)
emb = um.fit_transform(X)

# One scatter layer per dataset so each gets its own colour and legend entry.
fig, ax = plt.subplots(figsize=(7,6))
dataset_labels = df["dataset"]
for grp in dataset_labels.unique():
    idx = (dataset_labels == grp).values
    ax.scatter(emb[idx,0], emb[idx,1], s=6, alpha=0.6, label=grp)
ax.legend(title="Dataset")
ax.set_title("UMAP (Jaccard ≡ Tanimoto)")
plt.show()


# TB dataset analysis

Counts how many drug–target pairs exist for each protein and plots a horizontal bar chart of the distribution of interactions, highlighting the most frequent targets in the TB dataset.

In [None]:
# config
CSV_PATH = "tb_final_filtrado.csv"
COL_SMILES, COL_TARGET, COL_LABEL = "SMILES", "UniProt ID", "interaction"
ONLY_ACTIVE = False   # when True, keep only rows whose COL_LABEL equals 1
TOP_N = 155           # number of most frequent targets shown as individual bars
FIG_OUT = "fig_tb_target_distribution.png"

df = pd.read_csv(CSV_PATH)

# Optional filter on the interaction label.
if ONLY_ACTIVE and COL_LABEL is not None:
    df = df[df[COL_LABEL] == 1]

# Pairs per target, most frequent first.
counts = df[COL_TARGET].value_counts()
total = counts.sum()

# Keep the TOP_N targets and fold whatever remains into a single "Others" bar.
top = counts.head(TOP_N)
others = counts.iloc[TOP_N:].sum()
if others > 0:
    others_row = pd.Series({"Others": others})
    top = pd.concat([top, others_row])

# Share of all pairs, computed before re-sorting and then re-aligned to the
# plotting order (ascending, so the largest bar ends up at the top).
perc = (top / total * 100).round(2)
top = top.sort_values(ascending=True)
perc = perc.loc[top.index]

fig, ax = plt.subplots(figsize=(9, 8))
ypos = np.arange(len(top))
ax.barh(ypos, top.values, edgecolor="black")
ax.set_yticks(ypos, top.index)
ax.set_xlabel("Number of drug–target pairs")
ax.set_title("Distribution of interactions per target (TB dataset)")

# Annotate each bar with "count (percent%)", offset by 1% of the longest bar.
label_pad = top.max() * 0.01
for row, (n_pairs, share) in enumerate(zip(top.values, perc.values)):
    ax.text(n_pairs + label_pad, row, f"{int(n_pairs)} ({share:.1f}%)", va="center")

fig.tight_layout()
fig.savefig(FIG_OUT, dpi=300)
print(f"Saved: {FIG_OUT}")


Generates molecular fingerprints from SMILES strings and reduces their dimensionality to 2D using UMAP (or t-SNE when `USE_UMAP` is False). The resulting embedding is visualized with points colored either by target (the most frequent targets, with the remainder grouped as "Other") or, when `COLOR_BY` is set to `"kmeans"`, by k-means cluster, revealing chemical similarities in the TB dataset.

In [None]:
# config
# --- input table and its columns ---
CSV_PATH = "tb_final_filtrado.csv"
COL_SMILES, COL_TARGET, COL_LABEL = "SMILES", "UniProt ID", "interaction"
ONLY_ACTIVE = False          # when True, keep only rows whose COL_LABEL equals 1

# --- fingerprint settings ---
N_BITS, RADIUS = 1024, 2

# --- embedding / colouring ---
USE_UMAP = True              # False switches the projection to t-SNE
COLOR_BY = "UniProt ID"      # "kmeans" colours by cluster; anything else colours by target
N_CLUSTERS = 10              # number of k-means clusters (used only when COLOR_BY == "kmeans")
MAX_LEGEND = 15              # number of most frequent targets that keep their own colour

# --- outputs ---
EMB_OUT_CSV = "umap_embeddings_tb.csv"
FIG_OUT = "fig_tb_umap_by_target.png"

def ecfp_bits(smiles_list, n_bits=1024, radius=2):
    """Build a dense Morgan bit-fingerprint matrix from SMILES strings.

    Parameters
    ----------
    smiles_list : iterable
        SMILES strings. Missing values (None / NaN) are accepted.
    n_bits : int, default 1024
        Fingerprint length, i.e. the number of columns in the result.
    radius : int, default 2
        Morgan radius passed to RDKit.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array with one row per input entry and ``n_bits`` columns.
        Entries that are missing or that RDKit cannot parse yield an all-zero
        row, so the output stays aligned with the input. An empty input
        yields an array of shape ``(0, n_bits)``.
    """
    fps = []
    for smi in smiles_list:
        # Missing values are never handed to RDKit; they are treated as unparseable.
        mol = Chem.MolFromSmiles(str(smi)) if pd.notna(smi) else None
        if mol is None:
            # Placeholder row keeps row i of the output tied to entry i of the input.
            fps.append(np.zeros(n_bits, dtype=np.uint8))
            continue
        bv = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        arr = np.zeros((n_bits,), dtype=np.uint8)
        DataStructs.ConvertToNumpyArray(bv, arr)
        fps.append(arr)

    # np.vstack raises on an empty sequence; return a well-shaped empty matrix instead.
    if not fps:
        return np.zeros((0, n_bits), dtype=np.uint8)
    return np.vstack(fps)

# Load the TB table; optionally keep only the rows labelled as active.
df = pd.read_csv(CSV_PATH)
if COL_LABEL is not None and ONLY_ACTIVE:
    df = df[df[COL_LABEL] == 1].copy()

# One fingerprint row per table row, in the same order as `df`.
X = ecfp_bits(df[COL_SMILES].tolist(), n_bits=N_BITS, radius=RADIUS)

# --- 2D projection ---
if USE_UMAP:
    # `umap` is already bound by the notebook's import cell; re-importing it
    # here would only rebind that global name to a different module object.
    reducer = umap.UMAP(
        n_neighbors=10,
        min_dist=0.1,
        n_components=2,
        metric="jaccard",
        random_state=42
    )
    emb = reducer.fit_transform(X)
else:
    # Imported lazily so scikit-learn is only required when t-SNE is selected.
    from sklearn.manifold import TSNE
    emb = TSNE(n_components=2, perplexity=30, learning_rate="auto",
               init="pca", random_state=42).fit_transform(X)

# --- colour assignment ---
if COLOR_BY == "kmeans":
    # Clusters are computed on the fingerprints themselves, not on the 2D embedding.
    from sklearn.cluster import KMeans
    km = KMeans(n_clusters=N_CLUSTERS, n_init="auto", random_state=42)
    y = km.fit_predict(X)
    legend_values = list(range(N_CLUSTERS))
    labels_map = {i: f"Cluster {i}" for i in legend_values}
else:
    # Any other COLOR_BY value colours by target: the MAX_LEGEND most frequent
    # targets keep their own label, everything else is collapsed into "Other".
    y = df[COL_TARGET].astype(str).values
    top_classes = pd.Series(y).value_counts().head(MAX_LEGEND).index.tolist()
    y = np.where(np.isin(y, top_classes), y, "Other")
    legend_values = pd.unique(y)
    labels_map = {lv: lv for lv in legend_values}

# Persist the coordinates together with the colour label of each point.
pd.DataFrame({"x": emb[:,0], "y": emb[:,1], "color": y}).to_csv(EMB_OUT_CSV, index=False)
print(f"Saved embeddings: {EMB_OUT_CSV}")

# --- figure: one scatter layer per legend entry ---
fig, ax = plt.subplots(figsize=(8.5, 7.5))
for lv in legend_values:
    mask = (y == lv)
    ax.scatter(emb[mask,0], emb[mask,1], s=8, alpha=0.75, label=labels_map[lv])

# No trailing space in the algorithm name: the f-string below already adds the
# separator, so "UMAP " produced a double space in the title.
title_alg = "UMAP" if USE_UMAP else "t-SNE"
color_note = "by target" if COLOR_BY != "kmeans" else "by k-means clusters"
ax.set_title(f"{title_alg} of TB molecules {color_note}")
ax.set_xlabel("Dim 1")
ax.set_ylabel("Dim 2")
# Legend sits outside the axes; bbox_inches="tight" keeps it inside the saved image.
ax.legend(markerscale=2, frameon=False, bbox_to_anchor=(1.02, 1), loc="upper left")
fig.tight_layout()
fig.savefig(FIG_OUT, dpi=300, bbox_inches="tight")
print(f"Saved: {FIG_OUT}")
