In [1]:
### If you run this notebook on Google Colab, uncomment these lines to install the required packages

#!pip install rdkit
#!pip install chemprop
#!pip install matplotlib
#!pip install seaborn xgboost hyperopt

In [10]:
import random
from random import sample, seed, shuffle
import numpy as np
import pandas as pd
import os
import six
from rdkit import rdBase
from rdkit import RDLogger

# Silence all RDKit log channels matching 'rdApp.*'; the call is issued through
# both the rdBase and the RDLogger entry points.
for _rdkit_log_api in (rdBase, RDLogger):
    _rdkit_log_api.DisableLog('rdApp.*')

#utility functions : prepare the data
from model_fp_selection.lib.utils import prepare_df_morgan, prepare_df_rdkit, swap_identical_ligands, prepare_df_chemeleon, convert_to_float, prepare_df
from model_fp_selection.lib.utils import drop_duplicates, average_duplicates, calc_desc, get_ligands_dict

#utility functions : CV and results
from model_fp_selection.lib.utils import obtain_metrics, plot_cv_results
from model_fp_selection.lib.utils import df_split, get_indices_doi, get_indices_scaff, get_indices_chemeleon, get_indices_chemeleon_DOI, get_indices_chemeleon_scaff
from model_fp_selection.lib.utils import generate_scaffold, scaffold_to_smiles
from model_fp_selection.lib.utils import ligands_permutation, cross_validation, prepare_train_set, cross_validation_chemeleon


from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs, Draw
from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, median_absolute_error, PredictionErrorDisplay
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.preprocessing import MinMaxScaler

#Encoding categorical Data
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Regressors
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor

#Pipelines and other model constructions
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer

# Visualization
import matplotlib.pyplot as plt
# Use a larger default font for every figure drawn in this notebook.
plt.rcParams['font.size'] = 20

#np.random.seed(42)
#seed(42)

#Specific to Scaffold Splitting
from rdkit.Chem.Scaffolds import MurckoScaffold
from collections import defaultdict
import pickle as pkl
import time
from tqdm import tqdm
import seaborn as sns

from itertools import *

from model_fp_selection.lib.cross_val_both_models import cross_val_2_models

from model_fp_selection.chemeleon_fingerprint import CheMeleonFingerprint

from pathlib import Path

from lightning import pytorch as pl

from lightning.pytorch.callbacks import ModelCheckpoint
import pandas as pd

import torch.nn as torch_nn

from chemprop import data, models, featurizers, nn

import time

import lightning.pytorch as pl
from lightning.pytorch.callbacks import Callback
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
from pathlib import Path
from urllib.request import urlretrieve

# Remote location and local file name of the CheMeleon message-passing
# checkpoint that the later cells load from "chemeleon_mp.pt".
chemeleon_url = r"https://zenodo.org/records/15460715/files/chemeleon_mp.pt"
chemeleon_file = Path("chemeleon_mp.pt")

# Download only when the checkpoint is not on disk yet, so that re-running the
# notebook does not fetch the same file again.
if not chemeleon_file.exists():
    # Fetch into a temporary sibling file and rename it once the transfer has
    # finished: an interrupted download then never leaves a truncated
    # "chemeleon_mp.pt" that the guard above would mistake for a valid file.
    partial_file = chemeleon_file.with_name(chemeleon_file.name + ".part")
    urlretrieve(chemeleon_url, str(partial_file))
    partial_file.replace(chemeleon_file)

('chemeleon_mp.pt', <http.client.HTTPMessage at 0x7fa871c47d90>)

## Correct workflow to save finetuned weights

In [53]:
# One seed for the whole workflow, handed to Lightning's seeding helper
# (workers=True is forwarded to the helper as well).
SEED = 42
pl.seed_everything(seed=SEED, workers=True)

# cuDNN settings: turn benchmark mode off and request deterministic behaviour.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

Seed set to 42


In [55]:
# Raw training table, read from a CSV file next to the notebook.
df_input = pd.read_csv(filepath_or_buffer='./ruthenium_complexes_dataset.csv')

In [56]:
# Column names used by this cell and by the datapoint-building cell below.
smiles_column = "SMILES"
target_columns = ["pIC50"]

# Clean the raw table, then collapse duplicated complexes (grouped on
# "Ligands_Dict") on the target column.
df = prepare_df(df_input)
df = average_duplicates(df, "Ligands_Dict", target_columns[0])

# One dot-separated SMILES string per complex, assembled from its three
# ligands. The column name comes from `smiles_column` so that this cell and the
# cell that reads the column back cannot drift apart.
df[smiles_column] = df.L1 + "." + df.L2 + "." + df.L3
df["ID"] = df.index

# A missing ligand turns the whole concatenation into NaN; fail here with a
# clear message instead of later, inside the featurisation.
assert df[smiles_column].notna().all(), "some complexes have a missing ligand SMILES"

Length of training dataset after cleaning duplicates, before adding permutations : 718


In [57]:
# Pull the SMILES strings and the target values out of the prepared frame.
smis = df[smiles_column].values
ys = df[target_columns].values

# One chemprop datapoint per (SMILES, target) pair, in dataframe order.
all_data = []
for train_smiles, train_target in zip(smis, ys):
    all_data.append(data.MoleculeDatapoint.from_smi(train_smiles, train_target))

In [58]:
# --- Dataset ---------------------------------------------------------------
# Statement order is kept as in the original cell: the modules built below are
# created after the global seed was set, so reordering could change their
# initial weights.
featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()
full_dset = data.MoleculeDataset(all_data, featurizer)

# Keep the scaler returned by the target normalisation: it is used for the
# output transform below and is saved to disk later in the notebook.
scaler = full_dset.normalize_targets()

train_loader = data.build_dataloader(full_dset, shuffle=True, num_workers=0)

# --- Message-passing backbone ----------------------------------------------
# Rebuilt from the hyper-parameters stored in the downloaded checkpoint, then
# initialised with the checkpoint's weights.
chemeleon_ckpt = torch.load("chemeleon_mp.pt", weights_only=True)
mp = nn.BondMessagePassing(**chemeleon_ckpt["hyper_parameters"])
mp.load_state_dict(chemeleon_ckpt["state_dict"])

agg = nn.MeanAggregation()

# --- Regression head ---------------------------------------------------------
# The head's output transform is derived from the target scaler.
output_transform = nn.UnscaleTransform.from_standard_scaler(scaler)
ffn = nn.RegressionFFN(
    output_transform=output_transform,
    dropout=0.1,
    hidden_dim=400,
    n_layers=3,
    input_dim=mp.output_dim,
)

# --- Full model --------------------------------------------------------------
mpnn = models.MPNN(mp, agg, ffn, batch_norm=False)

In [59]:
# Fine-tune on the full training set; logging and checkpointing are disabled
# because the weights are saved explicitly in a later cell.
# accelerator="auto" lets Lightning pick the available hardware, so the cell
# also runs on machines without a CUDA GPU instead of failing at start-up.
trainer = pl.Trainer(
    accelerator="auto",
    devices=1,
    max_epochs=20,
    logger=False,
    enable_checkpointing=False,
)

trainer.fit(mpnn, train_loader)

/opt/python/lib/python3.13/site-packages/lightning/pytorch/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
/opt/python/lib/python3.13/site-packages/lightning/pytorch/utilities/_pytree.py:21: `isinstance(treespec, LeafSpec)` is deprecated, use `isinstance(treespec, TreeSpec) and treespec.is_leaf()` instead.
/opt/python/lib/python3.13/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:434: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=71` in the `DataLoader` to improve performance.


In [60]:
# Persist the fine-tuned model's state dict.
torch.save(obj=mpnn.state_dict(), f="final_mpnn_weights.pth")

In [61]:
# Persist the target scaler. It is saved as a whole Python object rather than
# as tensors, so reading it back needs torch.load(..., weights_only=False).
torch.save(obj=scaler, f="target_scaler.pth")

## Inference 

In [80]:
# Files written by the training section of this notebook.
MODEL_PATH = "final_mpnn_weights.pth"
SCALER_PATH = "target_scaler.pth"

# SECURITY: weights_only=False unpickles arbitrary Python objects and can
# execute code while loading. Only load a scaler file that this notebook
# produced itself, never one obtained from an untrusted source.
scaler = torch.load(f=SCALER_PATH, weights_only=False)

UnpicklingError: Weights only load failed. This file can still be loaded, to do so you have two options, [1mdo those steps only if you trust the source of the checkpoint[0m. 
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL sklearn.preprocessing._data.StandardScaler was not an allowed global by default. Please use `torch.serialization.add_safe_globals([sklearn.preprocessing._data.StandardScaler])` or the `torch.serialization.safe_globals([sklearn.preprocessing._data.StandardScaler])` context manager to allowlist this global if you trust this class/function.

Check the documentation of torch.load to learn more about types accepted by default with weights_only https://pytorch.org/docs/stable/generated/torch.load.html.

In [76]:
# Table of complexes to predict, read from a CSV file next to the notebook.
df_test = pd.read_csv(filepath_or_buffer="synthesized_complexes.csv")

In [77]:
# One dot-separated SMILES string per complex, assembled from its three ligands.
ligand_separator = "."
df_test["SMILES"] = (
    df_test["L1"] + ligand_separator + df_test["L2"] + ligand_separator + df_test["L3"]
)
df_test["ID"] = df_test.index

# No targets are supplied for these complexes: every datapoint gets None.
ys = [None] * len(df_test)

test_data = []
for test_smiles, placeholder_target in zip(df_test["SMILES"], ys):
    test_data.append(data.MoleculeDatapoint.from_smi(test_smiles, placeholder_target))

In [78]:
# --- Inference dataset -------------------------------------------------------
featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()

test_dset = data.MoleculeDataset(test_data, featurizer)

# shuffle=False keeps the predictions in the same order as df_test.
test_loader = data.build_dataloader(
    test_dset,
    shuffle=False,
    num_workers=0
)

# --- Rebuild the architecture used for training --------------------------------
# Load CheMeleon MP backbone
chemeleon_ckpt = torch.load("chemeleon_mp.pt", weights_only=True)

mp = nn.BondMessagePassing(**chemeleon_ckpt["hyper_parameters"])
mp.load_state_dict(chemeleon_ckpt["state_dict"])

agg = nn.MeanAggregation()

output_transform = nn.UnscaleTransform.from_standard_scaler(scaler)

ffn = nn.RegressionFFN(
    input_dim=mp.output_dim,
    n_layers=3,
    hidden_dim=400,
    dropout=0.1,
    output_transform=output_transform
)

mpnn = models.MPNN(mp, agg, ffn, batch_norm=False)

# --- Restore the fine-tuned weights ----------------------------------------------
# map_location="cpu" makes the load independent of the device the tensors were
# saved from; the trainer moves the model to the selected device afterwards.
# weights_only=True is stated explicitly: the file holds a state dict, so the
# restricted loader is sufficient.
finetuned_state = torch.load(MODEL_PATH, map_location="cpu", weights_only=True)
mpnn.load_state_dict(finetuned_state)
mpnn.eval()

# --- Predict -------------------------------------------------------------------
# accelerator="auto" lets Lightning pick the available hardware, so inference
# also runs on machines without a CUDA GPU.
trainer = pl.Trainer(
    accelerator="auto",
    devices=1,
    logger=False
)

# trainer.predict returns one tensor per batch; concatenate them and drop the
# singleton dimension(s) to obtain one value per complex.
preds = trainer.predict(mpnn, test_loader)
preds = torch.cat(preds).cpu().numpy().squeeze()

results_df = pd.DataFrame({
    "ID": df_test["ID"],
    "SMILES": df_test["SMILES"],
    "pIC50_pred": preds
})

/opt/python/lib/python3.13/site-packages/lightning/pytorch/utilities/_pytree.py:21: `isinstance(treespec, LeafSpec)` is deprecated, use `isinstance(treespec, TreeSpec) and treespec.is_leaf()` instead.
/opt/python/lib/python3.13/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:434: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=71` in the `DataLoader` to improve performance.


In [79]:
# Display the prediction table (ID, SMILES, predicted pIC50) built in the previous cell.
results_df

Unnamed: 0,ID,SMILES,pIC50_pred
0,0,C1(C2=CC=CC=C2)=CC=NC3=C1C=CC4=C3N=CC=C4C5=CC=...,5.590017
1,1,C1(C2=NC=CC=C2)=NC=CC=C1.C1(C2=NC=CC=C2)=NC=CC...,4.516929
2,2,C1(C2=NC=CC=C2)=NC=CC=C1.C1(C2=NC=CC=C2)=NC=CC...,4.065144
3,3,C12=NC=CC=C1C=CC3=C2N=CC=C3.C12=NC=CC=C1C=CC3=...,6.331344
4,4,C12=NC=CC=C1C=CC3=C2N=CC=C3.C12=NC=CC=C1C=CC3=...,6.186633
5,5,C1(C2=CC=CC=N2)=NC=CC=C1.C1(C2=CC=CC=N2)=NC=CC...,4.147353
6,6,C12=NC=CC=C1C=CC3=C2N=CC=C3.C12=NC=CC=C1C=CC3=...,5.377989
