In [1]:
import json
import os
import random
import re
from random import sample, seed, shuffle

import matplotlib as mpl
import numpy as np
import pandas as pd
import six
from rdkit import rdBase
from rdkit import RDLogger


# Suppress RDKit warnings: the 'rdApp.*' log pattern is disabled through both
# the rdBase and the RDLogger entry points.
rdBase.DisableLog('rdApp.*')
RDLogger.DisableLog('rdApp.*')

#utility functions : prepare the data 
from model_fp_selection.lib.utils import prepare_df_morgan, prepare_df_rdkit, swap_identical_ligands, prepare_df_chemeleon
from model_fp_selection.lib.utils import drop_duplicates, average_duplicates, calc_desc

#utility functions : CV and results 
from model_fp_selection.lib.utils import obtain_metrics, plot_cv_results
from model_fp_selection.lib.utils import df_split, get_indices, get_indices_doi, get_indices_scaff
from model_fp_selection.lib.utils import generate_scaffold, scaffold_to_smiles
from model_fp_selection.lib.utils import ligands_permutation, cross_validation, prepare_train_set


from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs, Draw
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, median_absolute_error, PredictionErrorDisplay
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.preprocessing import MinMaxScaler

#Encoding categorical Data
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Regressors
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor

#Pipelines and other model constructions
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer

# Visualization
import matplotlib.pyplot as plt
# Notebook-wide default: every figure created below uses a 20 pt base font.
plt.rcParams.update({'font.size': 20})

#np.random.seed(42)
#seed(42)

#Specific to Scaffold Splitting
from rdkit.Chem.Scaffolds import MurckoScaffold
from collections import defaultdict
import pickle as pkl
import time
from tqdm import tqdm
import seaborn as sns

from itertools import *

from model_fp_selection.lib.cross_val_both_models import cross_val_2_models

from model_fp_selection.chemeleon_fingerprint import CheMeleonFingerprint

In [2]:
import umap

In [3]:
# Pin the sources of randomness, then build the UMAP reducer shared by the notebook.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it
# here presumably only reaches child processes -- confirm it is still needed.
os.environ['PYTHONHASHSEED'] = '0'
np.random.seed(42)

# random_state=42 is what makes the embedding reproducible; init='spectral' still
# draws on that seed; n_jobs=1 trades speed for less thread-related nondeterminism.
mapper = umap.UMAP(n_neighbors=15, min_dist=0.1, metric='euclidean',
                   random_state=42, init='spectral', n_jobs=1)

## Preparing the dataset

In [None]:
# Raw complexes table; the three ligand columns L1/L2/L3 are read as strings.
metals = pd.read_csv(
    "ruthenium_complexes_dataset.csv",
    dtype={'L1': str, 'L2': str, 'L3': str},
)

# Two fingerprint views of the same table.
# NOTE(review): the positional arguments (2, 1024) are presumably radius and bit
# count -- confirm against prepare_df_morgan.
metals_morgan = prepare_df_morgan(metals, 2, 1024)
metals_rdkit = prepare_df_rdkit(metals, nbits=2048)

In [None]:
# CheMeleon featurisation is run on three consecutive row slices, one per cell.
# `l` (row count) is reused by the next two cells to compute their slice bounds.
l = len(metals)
metals_chemeleon_1 = prepare_df_chemeleon(metals.iloc[:int(0.33 * l)])

In [None]:
# Second slice: rows from 33 % up to (excluding) 66 % of the table.
metals_chemeleon_2 = prepare_df_chemeleon(metals.iloc[int(0.33 * l):int(0.66 * l)])

In [None]:
# Third slice: rows from 66 % to the end of the table.
metals_chemeleon_3 = prepare_df_chemeleon(metals.iloc[int(0.66 * l):l])

In [None]:
# Stitch the three CheMeleon slices back together (fresh 0..n-1 index), then pass
# the result through average_duplicates with the 'Ligands_Dict' and 'pIC50' columns.
metals_chemeleon = average_duplicates(
    pd.concat(
        [metals_chemeleon_1, metals_chemeleon_2, metals_chemeleon_3],
        ignore_index=True,
    ),
    'Ligands_Dict',
    'pIC50',
)

In [None]:
# Same duplicate handling as for the CheMeleon frame, applied to both fingerprint tables.
# (The second call was missing its closing parenthesis, which made the cell a SyntaxError.)
metals_morgan = average_duplicates(metals_morgan, 'Ligands_Dict', 'pIC50')
metals_rdkit = average_duplicates(metals_rdkit, 'Ligands_Dict', 'pIC50')

In [None]:
# Descriptor table derived from the de-duplicated RDKit frame; later cells read its
# 'Descriptors' and 'pIC50' columns.
metals_desc = calc_desc(metals_rdkit)

## Getting the UMAP vectors

In [None]:
# Descriptors: stack the per-complex vectors into a 2-D array and standardise columns.
scaler = StandardScaler()
X_desc = scaler.fit_transform(np.array(metals_desc['Descriptors'].tolist()))

# RDKit and Morgan fingerprints are stacked as they are (no scaling).
X_rdkit = np.array(metals_rdkit['Fingerprint'].tolist())
X_morgan = np.array(metals_morgan['Fingerprint'].tolist())

# CheMeleon fingerprints: stacked, then standardised with a fresh scaler.
scaler = StandardScaler()
X_chemeleon = scaler.fit_transform(np.array(metals_chemeleon['Fingerprint'].tolist()))

In [4]:
# Re-seed and rebuild the shared (still unfitted) UMAP reducer before the embeddings
# below are computed.
os.environ['PYTHONHASHSEED'] = '0'  # only needed in extreme cases
np.random.seed(42)
mapper = umap.UMAP(
    random_state=42,     # fixed seed: essential for reproducibility
    n_jobs=1,            # single thread lowers nondeterminism (slower)
    init='spectral',     # good deterministic-ish init (still uses random_state)
    metric='euclidean',
    min_dist=0.1,
    n_neighbors=15,
)

In [None]:
# Embed each training representation with its OWN reducer.
# `mapper.fit(...)` returns the mapper itself, so fitting the single shared object four
# times left umap_desc / umap_rdkit / umap_morgan / umap_chemeleon all pointing at one
# reducer that only remembered the last (CheMeleon) fit. Building a fresh UMAP from the
# mapper's hyper-parameters keeps the settings identical while giving every name an
# independent fitted model.
umap_desc = umap.UMAP(**mapper.get_params()).fit(X_desc)
X_desc_umap = umap_desc.transform(X_desc)

umap_rdkit = umap.UMAP(**mapper.get_params()).fit(X_rdkit)
X_rdkit_umap = umap_rdkit.transform(X_rdkit)

umap_morgan = umap.UMAP(**mapper.get_params()).fit(X_morgan)
X_morgan_umap = umap_morgan.transform(X_morgan)

umap_chemeleon = umap.UMAP(**mapper.get_params()).fit(X_chemeleon)
X_chemeleon_umap = umap_chemeleon.transform(X_chemeleon)

In [None]:
# Wrap each embedding array in a DataFrame (default integer index and column labels).
X_desc_umap_df, X_rdkit_umap_df, X_morgan_umap_df, X_chemeleon_umap_df = (
    pd.DataFrame(embedding)
    for embedding in (X_desc_umap, X_rdkit_umap, X_morgan_umap, X_chemeleon_umap)
)

In [None]:
# Name the two embedding axes identically in all four frames.
for _umap_frame in (X_desc_umap_df, X_rdkit_umap_df, X_morgan_umap_df, X_chemeleon_umap_df):
    _umap_frame.columns = ['UMAP1', 'UMAP2']

In [None]:
# Table of generated complexes, read from the current working directory.
# NOTE(review): array-like columns come back from CSV as plain strings and need parsing
# before numeric use.
generated = pd.read_csv('generated_200k.csv')

In [None]:
# Table of synthesised complexes; later cells add an 'IC50' column with 7 values and
# label its rows 'A'..'G', so it is expected to hold exactly 7 rows.
synth = pd.read_csv('synthesized_complexes.csv')

In [None]:
def parse_array_string(s):
    """Turn the text form of a 1-D numeric array back into a float NumPy array.

    Accepts the string written to CSV for an array cell, e.g. ``"[1.0 2.5\n 3.0]"``
    (NumPy style) or ``"[1.0, 2.5, 3.0]"`` (list style). Brackets, commas and
    newlines are treated as separators.

    Parameters
    ----------
    s : str or missing value
        Cell content. Anything for which ``pd.isna`` is true counts as missing.

    Returns
    -------
    numpy.ndarray
        1-D float array; empty when the cell is missing or contains no numbers.
    """
    if pd.isna(s):
        return np.array([])           # or return np.nan if you prefer
    # Brackets, commas and newlines all become whitespace. Commas are included because
    # np.fromstring(sep=" ") stops at the first comma and would silently return only
    # the leading value of a list-style string.
    clean = re.sub(r"[\[\],\n]+", " ", str(s)).strip()
    if clean == "":
        return np.array([])
    return np.fromstring(clean, sep=" ")

## Plotting UMAP

### Descriptors

In [None]:
# Generated complexes with a 'Descriptors' column (still text at this point; it is
# parsed into arrays in the next cell).
gen_desc = pd.read_csv('data/UMAP/generated_200k.csv')

In [None]:
# Convert every 'Descriptors' cell from its CSV text form into a NumPy array
# (missing cells become empty arrays, see parse_array_string).
gen_desc['Descriptors'] = gen_desc['Descriptors'].apply(parse_array_string)

In [None]:
# Keep untouched copies of both frames before later cells add or overwrite columns.
og_gen = gen_desc.copy()
og_synth = synth.copy()

In [None]:
# Descriptor matrix of the generated complexes.
# Read from gen_desc, whose 'Descriptors' cells were parsed into arrays above;
# `generated` still holds the raw CSV strings, on which np.isnan raises a TypeError.
X_gen = gen_desc['Descriptors']
# Keep only non-empty, NaN-free vectors: missing cells were parsed into empty arrays,
# which would otherwise make the stacked matrix ragged.
X_gen = X_gen[X_gen.apply(
    lambda arr: np.size(arr) > 0 and not np.isnan(np.asarray(arr, dtype=float)).any()
)]
X_gen = np.array(X_gen.tolist())
scaler = StandardScaler()
X_gen = scaler.fit_transform(X_gen)

In [None]:
# Fit the shared reducer on the generated descriptors, project the same rows, and
# store the two coordinates in a labelled frame.
umap_generated = mapper.fit(X_gen)
X_gen_umap = umap_generated.transform(X_gen)
X_gen_umap_df = pd.DataFrame(X_gen_umap, columns=['UMAP1', 'UMAP2'])

In [None]:
# Build one table of descriptor vectors for the training ('t'), synthesised ('A'..'G')
# and generated ('g') complexes, standardise them together and embed them in one UMAP fit.
train_desc = metals_desc.copy()
df_umap_desc = []

# Activity values of the 7 synthesised complexes (used for colouring in the plot cell).
synth['IC50'] = [5.25, 4.20, 4.15, 6.08, 5.90, 4.30, 5.69]
synth_desc = prepare_df_rdkit(synth)
synth_desc = calc_desc(synth_desc)

train_desc['Descriptors'] = train_desc['Descriptors'].apply(lambda x: x.tolist())
train_desc['Dataset'] = 't'
train_desc = train_desc[['Descriptors', 'Dataset']]
df_umap_desc.append(train_desc)

synth_desc['Descriptors'] = synth_desc['Descriptors'].apply(lambda x: x.tolist())
synth_desc['Dataset'] = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
synth_desc = synth_desc[['Descriptors', 'Dataset']]
df_umap_desc.append(synth_desc)

# .assign returns a new frame, so no column is written onto a slice of gen_desc.
gen_desc = gen_desc[['Descriptors']].assign(Dataset='g')
df_umap_desc.append(gen_desc)

# Training rows come first, so after ignore_index their labels are 0..n_train-1.
total_df_desc = pd.concat(df_umap_desc, ignore_index=True, axis=0)

# Drop unusable rows BEFORE scaling. DataFrame.dropna cannot see NaN hidden inside a
# list/array cell, and empty vectors (parsed from missing CSV cells) would make the
# stacked matrix ragged. The index is deliberately not reset so the surviving rows keep
# their original labels.
n_features = len(train_desc['Descriptors'].iloc[0])
usable = total_df_desc['Descriptors'].apply(
    lambda vec: np.size(vec) == n_features
    and bool(np.isfinite(np.asarray(vec, dtype=float)).all())
)
total_df_desc = total_df_desc[usable]

print(f'length of total_df_desc : {len(total_df_desc)}')

# Scale the stacked 2-D matrix directly; a (rows x features) array cannot be assigned
# back into the single 'Descriptors' column.
scaler = StandardScaler()
X = scaler.fit_transform(np.array(total_df_desc['Descriptors'].tolist(), dtype=float))
print(f'lenght of X : {len(X)}')

# Reuse total_df_desc's index so the 'Dataset' labels (and any index-aligned hue
# Series used when plotting) line up with the embedded rows even if rows were dropped.
X_umap = pd.DataFrame(
    mapper.fit_transform(X),
    columns=['UMAP1', 'UMAP2'],
    index=total_df_desc.index,
)
X_umap['Dataset'] = total_df_desc['Dataset']
print(f'length of X_umap : {len(X_umap)}')

X_umap_gen = X_umap[X_umap['Dataset'] == 'g']
X_umap_train = X_umap[X_umap['Dataset'] == 't']
X_umap_synth = X_umap[X_umap['Dataset'].isin(['A', 'B', 'C', 'D', 'E', 'F', 'G'])]

In [None]:
# UMAP of the descriptor space: generated complexes as a faint grey cloud, training
# complexes coloured by pIC50, synthesised complexes as labelled stars.
fig, axes = plt.subplots(1, 1, figsize=(10, 10))

fig.suptitle('UMAP plots for Descriptors')
cmap = sns.color_palette('rocket', as_cmap=True)
# One normalisation shared by both coloured layers and the colorbar. Without hue_norm
# seaborn rescales each layer to its own min/max, so point colours would not match the
# 0-7 colorbar (nor each other).
norm = mpl.colors.Normalize(vmin=0, vmax=7)

letters = ['A', 'B', 'C', 'D', 'E', 'F', 'G']

for (x, y), label in zip(X_umap_synth[['UMAP1','UMAP2']].values, letters):
    # 'F' uses a smaller horizontal shift than the other labels.
    dx = 0.05 if label == 'F' else 0.2
    axes.text(
        x + dx,
        y + 1.6,
        label,
        fontsize=30,
        fontweight='bold',
        ha='center',
        va='center',
        color='black',
        zorder=10
    )


sns.scatterplot(ax=axes, data=X_umap_gen, x='UMAP1', y='UMAP2', s=8, alpha=0.05, color='lightgray', legend=False)
sns.scatterplot(ax=axes, data=X_umap_train, x='UMAP1', y='UMAP2', hue=metals_desc['pIC50'], hue_norm=norm, palette=cmap, legend=False, s=30)
sns.scatterplot(ax=axes, data = X_umap_synth, x='UMAP1', y='UMAP2', marker='*', s=600, edgecolor='black', linewidth=0.5, hue=synth['IC50'].tolist(), hue_norm=norm, palette=cmap, legend=False)

#plt.xlim(-20,20)
#plt.ylim(-20,20)

sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([])  # required for colorbar
cbar = fig.colorbar(sm, ax=axes, orientation='vertical', fraction=0.05, pad=0.02)
cbar.set_label('pIC50')

#plt.savefig('UMAP_200k_descriptors.png', dpi=300)

### RDKit FP

In [None]:
# Pre-computed RDKit fingerprints of the generated complexes. Each 'Fingerprint' cell is
# stored as JSON text and decoded back into a Python object here.
# (`json` is imported with the other modules at the top of the notebook, so later
# sections no longer depend on this particular cell having been run.)
gen_rdkit = pd.read_csv('./data/UMAP/dataset_encoded_to_RDKit_FP.csv')
gen_rdkit['Fingerprint'] = gen_rdkit['Fingerprint'].apply(json.loads)

In [None]:
# Build one table of RDKit fingerprints for the training ('t'), synthesised ('A'..'G')
# and generated ('g') complexes and embed them in a single UMAP fit.
train_rdkit = metals_rdkit.copy()

df_umap_rdkit=[]

# Activity values of the 7 synthesised complexes (used for colouring in the plot cell).
synth['IC50'] = [5.25, 4.20, 4.15, 6.08, 5.90, 4.30, 5.69]
# Same bit count as the training fingerprints (prepare_df_rdkit(metals, nbits=2048)),
# so all rows stack into one rectangular matrix regardless of the helper's default.
synth_rdkit = prepare_df_rdkit(synth, nbits=2048)

train_rdkit['Dataset'] = 't'
train_rdkit = train_rdkit[['Fingerprint', 'Dataset']]
df_umap_rdkit.append(train_rdkit)

synth_rdkit['Dataset'] = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
synth_rdkit = synth_rdkit[['Fingerprint', 'Dataset']]
df_umap_rdkit.append(synth_rdkit)

# .assign returns a new frame, so no column is written onto a slice of gen_rdkit.
gen_rdkit = gen_rdkit[['Fingerprint']].assign(Dataset='g')
df_umap_rdkit.append(gen_rdkit)

# ignore_index gives a fresh 0..n-1 index that matches the embedding rows below.
total_df_rdkit = pd.concat(df_umap_rdkit, ignore_index=True, axis=0)

print(f'length of total_df : {len(total_df_rdkit)}')

# Fingerprints are embedded unscaled (the scaler created here before was never used).
X = np.array(total_df_rdkit['Fingerprint'].tolist())
print(f'length of X : {len(X)}')

X_umap = mapper.fit_transform(X)
X_umap = pd.DataFrame(X_umap)

X_umap.columns = ['UMAP1', 'UMAP2']

X_umap['Dataset'] = total_df_rdkit['Dataset']
print(f'length of X_umap : {len(X_umap)}')

In [None]:
# Split the RDKit embedding by origin: generated, training, synthesised (A-G).
X_umap_gen_rdkit = X_umap.loc[X_umap['Dataset'].eq('g')]
X_umap_train_rdkit = X_umap.loc[X_umap['Dataset'].eq('t')]
X_umap_synth_rdkit = X_umap.loc[X_umap['Dataset'].isin(['A', 'B', 'C', 'D', 'E', 'F', 'G'])]

In [None]:
# UMAP of the RDKit-fingerprint space: generated complexes as a faint grey cloud,
# training complexes coloured by pIC50, synthesised complexes as labelled stars.
fig, axes = plt.subplots(1, 1, figsize=(10, 10))

fig.suptitle('UMAP plot for RDKit fingerprints')
cmap = sns.color_palette('rocket', as_cmap=True)
# One normalisation shared by both coloured layers and the colorbar. Without hue_norm
# seaborn rescales each layer to its own min/max, so point colours would not match the
# 0-7 colorbar (nor each other).
norm = mpl.colors.Normalize(vmin=0, vmax=7)

# The 4th star carries the combined 'D, E' label and the 5th is left unlabelled.
letters = ['A', 'B', 'C', 'D, E', '', 'F', 'G']

for (x, y), label in zip(X_umap_synth_rdkit[['UMAP1','UMAP2']].values, letters):
    # 'B' is placed to the right of its star, every other label above it.
    dx, dy = (0.8, 0.1) if label == 'B' else (0, 1.5)
    axes.text(
        x + dx,
        y + dy,
        label,
        fontsize=30,
        fontweight='bold',
        ha='left',
        va='center',
        color='black',      # choose your color
        zorder=10
    )


sns.scatterplot(ax=axes, data=X_umap_gen_rdkit, x='UMAP1', y='UMAP2', s=8, alpha=0.05, color='lightgray', legend=False)
# Hue comes from metals_rdkit, the frame the embedded training rows were copied from.
sns.scatterplot(ax=axes, data=X_umap_train_rdkit, x='UMAP1', y='UMAP2', hue=metals_rdkit['pIC50'], hue_norm=norm, palette=cmap, legend=False, s=30)
sns.scatterplot(ax=axes, data = X_umap_synth_rdkit, x='UMAP1', y='UMAP2', marker='*', s=600, edgecolor='black', linewidth=0.5, hue=synth['IC50'].tolist(), hue_norm=norm, palette=cmap, legend=False)

#plt.xlim(-20,20)
#plt.ylim(-20,20)

sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)

sm.set_array([])  # required for colorbar
cbar = fig.colorbar(sm, ax=axes, orientation='vertical', fraction=0.05, pad=0.02)
cbar.set_label('pIC50')

#plt.savefig('UMAP_200k_rdkit.png', dpi=300)

### Morgan FP

In [None]:
# Working copy of the training Morgan table, plus the pre-computed Morgan fingerprints
# of the generated complexes (each 'Fingerprint' cell is decoded from JSON text).
train_morgan = metals_morgan.copy()
gen_morgan = pd.read_csv('./data/UMAP/dataset_encoded_to_Morgan_FP.csv')
gen_morgan['Fingerprint'] = gen_morgan['Fingerprint'].map(json.loads)

In [None]:
# Build one table of Morgan fingerprints for the training ('t'), synthesised ('A'..'G')
# and generated ('g') complexes and embed them in a single UMAP fit.
df_umap_morgan=[]

# Activity values of the 7 synthesised complexes (used for colouring in the plot cell).
synth['IC50'] = [5.25, 4.20, 4.15, 6.08, 5.90, 4.30, 5.69]
# NOTE(review): the training table was built with prepare_df_morgan(metals, 2, 1024)
# but the synthesised one uses (synth, 2, 2048). Confirm which size the generated CSV
# uses and align the other call; the check below fails loudly until they agree.
synth_morgan = prepare_df_morgan(synth, 2, 2048)

train_morgan['Dataset'] = 't'
train_morgan = train_morgan[['Fingerprint', 'Dataset']]
df_umap_morgan.append(train_morgan)

synth_morgan['Dataset'] = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
synth_morgan = synth_morgan[['Fingerprint', 'Dataset']]
df_umap_morgan.append(synth_morgan)

# .assign returns a new frame, so no column is written onto a slice of gen_morgan.
gen_morgan = gen_morgan[['Fingerprint']].assign(Dataset='g')
df_umap_morgan.append(gen_morgan)

# ignore_index gives a fresh 0..n-1 index that matches the embedding rows below.
total_df_morgan = pd.concat(df_umap_morgan, ignore_index=True, axis=0)

print(f'length of total_df : {len(total_df_morgan)}')

# All fingerprints must have one common length to form a rectangular matrix; report
# the per-dataset lengths instead of letting NumPy fail with an opaque shape error.
fp_lengths = total_df_morgan['Fingerprint'].apply(len)
if fp_lengths.nunique() != 1:
    raise ValueError(
        'Morgan fingerprints have inconsistent lengths per dataset: '
        f"{fp_lengths.groupby(total_df_morgan['Dataset']).unique().to_dict()}"
    )

# Fingerprints are embedded unscaled (the scaler created here before was never used).
X = np.array(total_df_morgan['Fingerprint'].tolist(), dtype=np.float32)
print(f'length of X : {len(X)}')

X_umap = mapper.fit_transform(X)
X_umap = pd.DataFrame(X_umap)

X_umap.columns = ['UMAP1', 'UMAP2']

X_umap['Dataset'] = total_df_morgan['Dataset']
print(f'length of X_umap : {len(X_umap)}')

In [None]:
# Split the Morgan embedding by origin: generated, training, synthesised (A-G).
X_umap_gen_morgan = X_umap.loc[X_umap['Dataset'].eq('g')]
X_umap_train_morgan = X_umap.loc[X_umap['Dataset'].eq('t')]
X_umap_synth_morgan = X_umap.loc[X_umap['Dataset'].isin(['A', 'B', 'C', 'D', 'E', 'F', 'G'])]

In [None]:
# UMAP of the Morgan-fingerprint space: generated complexes as a faint grey cloud,
# training complexes coloured by pIC50, synthesised complexes as labelled stars.
fig, axes = plt.subplots(1, 1, figsize=(10, 10))

fig.suptitle('UMAP plot for Morgan fingerprints')
cmap = sns.color_palette('rocket', as_cmap=True)
# One normalisation shared by both coloured layers and the colorbar. Without hue_norm
# seaborn rescales each layer to its own min/max, so point colours would not match the
# 0-7 colorbar (nor each other).
norm = mpl.colors.Normalize(vmin=0, vmax=7)

# The 4th star is left unlabelled and the 5th carries the combined 'D, E' label.
letters = ['A', 'B', 'C', '', 'D, E', 'F', 'G']

# Every label sits 0.8 units to the right of its star (the unused counter was removed).
for (x, y), label in zip(X_umap_synth_morgan[['UMAP1','UMAP2']].values, letters):
    axes.text(x+0.8, y+0, label, fontsize=30, fontweight='bold', ha='left', va='center', color='black', zorder=10)


sns.scatterplot(ax=axes, data=X_umap_gen_morgan, x='UMAP1', y='UMAP2', s=8, alpha=0.05, color='lightgray', legend=False)
# Hue comes from metals_morgan, the frame the embedded training rows were copied from
# (previously metals_rdkit, a separately de-duplicated table).
sns.scatterplot(ax=axes, data=X_umap_train_morgan, x='UMAP1', y='UMAP2', hue=metals_morgan['pIC50'], hue_norm=norm, palette=cmap, legend=False, s=30)
sns.scatterplot(ax=axes, data = X_umap_synth_morgan, x='UMAP1', y='UMAP2', marker='*', s=600, edgecolor='black', linewidth=0.5 , hue=synth['IC50'].tolist(), hue_norm=norm, palette=cmap, legend=False)

#plt.xlim(-20,20)
#plt.ylim(-20,20)

sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)

sm.set_array([])  # required for colorbar
cbar = fig.colorbar(sm, ax=axes, orientation='vertical', fraction=0.05, pad=0.02)
cbar.set_label('pIC50')

#plt.savefig('UMAP_200k_morgan.png', dpi=300)

### CheMeleon FP

In [None]:
# Pre-computed CheMeleon embedding; the next cell reads its 'UMAP1', 'UMAP2' and
# 'Dataset' columns.
total_df_chemeleon = pd.read_csv('./data/UMAP/UMAP_Chemeleon.csv')

In [None]:
# Split the pre-computed CheMeleon embedding by origin.
X_umap = total_df_chemeleon[['UMAP1', 'UMAP2', 'Dataset']]
X_umap_gen_chemeleon = X_umap[X_umap['Dataset'] == 'g']
X_umap_train_chemeleon = X_umap[X_umap['Dataset'] == 't']
# Synthesised rows: accept the per-complex letters used in every other section as well
# as the single literal string this cell compared against before. Matching only that
# literal selects nothing when the CSV stores 'A'..'G'.
# NOTE(review): confirm how the 'Dataset' column was written to UMAP_Chemeleon.csv.
X_umap_synth_chemeleon = X_umap[
    X_umap['Dataset'].isin(['A', 'B', 'C', 'D', 'E', 'F', 'G', '[A, B, C, D, E, F, G]'])
]

In [None]:
# UMAP of the CheMeleon space: generated complexes as a faint grey cloud, training
# complexes coloured by pIC50, synthesised complexes as labelled stars.
fig, axes = plt.subplots(1, 1, figsize=(10, 10))

fig.suptitle('UMAP plot for Chemeleon')
cmap = sns.color_palette('rocket', as_cmap=True)
# One normalisation shared by both coloured layers and the colorbar. Without hue_norm
# seaborn rescales each layer to its own min/max, so point colours would not match the
# 0-7 colorbar (nor each other).
norm = mpl.colors.Normalize(vmin=0, vmax=7)

letters = ['A', 'B', 'C', 'D', 'E', 'F', 'G']

# (dx, dy) shift of each label relative to its star; labels not listed here go
# 0.8 units to the right at the same height.
label_offsets = {'F': (-0.8, -1.7), 'G': (0.8, -0.4)}

for (x, y), label in zip(X_umap_synth_chemeleon[['UMAP1','UMAP2']].values, letters):
    dx, dy = label_offsets.get(label, (0.8, 0))
    axes.text(
        x + dx,
        y + dy,
        label,
        fontsize=30,
        fontweight='bold',
        ha='left',
        va='center',
        color='black',      # choose your color
        zorder=10
    )

sns.scatterplot(ax=axes, data=X_umap_gen_chemeleon, x='UMAP1', y='UMAP2', s=8, alpha=0.05, color='lightgray', legend=False)
# NOTE(review): hue is passed positionally from metals_rdkit; this presumes the 't' rows
# of the CSV have the same count and order as metals_rdkit -- confirm.
sns.scatterplot(ax=axes, data=X_umap_train_chemeleon, x='UMAP1', y='UMAP2', hue=metals_rdkit['pIC50'].tolist(), hue_norm=norm, palette=cmap, legend=False, s=30)
sns.scatterplot(ax=axes, data = X_umap_synth_chemeleon, x='UMAP1', y='UMAP2', marker='*', s=600, edgecolor='black', linewidth=0.5, hue=synth['IC50'].tolist(), hue_norm=norm, palette=cmap, legend=False)

#plt.xlim(-20,20)
#plt.ylim(-20,20)

sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)

sm.set_array([])  # required for colorbar
cbar = fig.colorbar(sm, ax=axes, orientation='vertical', fraction=0.05, pad=0.02)
cbar.set_label('pIC50')

#plt.savefig('UMAP_200k_chemeleon.png', dpi=300)