# MoleculeACE - ChEMBL cliff evaluation

Finally, once our models are trained on MoleculeACE data [1], we evaluate them on the accompanying MoleculeACE test sets for the following ChEMBL targets:

* ChEMBL234 - Dopamine D3 receptor
* ChEMBL4203 - Dual specificity protein kinase
* ChEMBL2047 - Farnesoid X receptor
* ChEMBL4616 - Ghrelin receptor
* ChEMBL264 - Histamine H3 receptor
* ChEMBL2835 - Janus kinase 1
* ChEMBL4792 - Orexin receptor 2

## Setup

In [1]:
import os.path

# Resolve the directory that contains the project checkout.
try:
    from google.colab import drive
except ImportError:
    # Not running on Colab: look under the user's home directory.
    # '~' is expanded explicitly because os.path.join does not expand it.
    _home = os.path.expanduser('~')
else:
    # Running on Colab: mount Google Drive and point at the project folder.
    # The path is absolute so that re-running this cell after `%cd` has
    # changed the working directory still yields the same location.
    drive.mount('/content/drive')
    _home = '/content/drive/MyDrive/tlacamr'

# Computed after the try block rather than inside `finally`, so an unexpected
# failure (for example drive.mount raising) is not masked by a NameError on
# `_home`.
project_root = os.path.join(_home, 'tlacamr')

print(project_root)

Mounted at /content/drive
drive/MyDrive/tlacamr/tlacamr


In [2]:
%cd $project_root
!pip install .
### install statement should look like this once repo is public
###!pip install git+https://github.com/my-user/my-repo

/content/drive/MyDrive/tlacamr/tlacamr
Processing /content/drive/MyDrive/tlacamr/tlacamr
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting lightning>=2.0.0 (from acsuite==0.1)
  Downloading lightning-2.1.3-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchmetrics>=0.11.4 (from acsuite==0.1)
  Downloading torchmetrics-1.3.0.post0-py3-none-any.whl (840 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m840.2/840.2 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting hydra-core==1.3.2 (from acsuite==0.1)
  Downloading hydra_core-1.3.2-py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.5/154.5 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting hydra-colorlog==1.2.


## Model imports

In [89]:
from MoleculeACE import calc_rmse, calc_cliff_rmse, Data
from src.models.acamodule import ACAModule
from src.data.property_prediction.moleculeace_dataset import MoleculeACEDataset
from src.data.property_prediction.moleculeace_datamodule import MoleculeACEDataModule
from torch.utils.data import ConcatDataset, DataLoader, Dataset, random_split

import datamol as dm
from molfeat.calc import FP_FUNCS, FPCalculator
from molfeat.trans.concat import FeatConcat
from molfeat.trans import MoleculeTransformer
from lightning import Trainer

import torch
import gc
import os

# Relative path to the pretrained property-prediction checkpoints; it is
# resolved against the current working directory.
pretrained_model_dir = os.path.join("src", "models", "pretrained", "property_prediction")

# Dataset names evaluated in this notebook (the same identifiers that are
# passed to MoleculeACE's `Data(...)`).
chembl_datasets = [
    "CHEMBL234_Ki",
    "CHEMBL264_Ki",
    "CHEMBL2047_EC50",
    "CHEMBL2835_Ki",
    "CHEMBL4203_Ki",
    "CHEMBL4616_EC50",
    "CHEMBL4792_Ki",
]
# Backward-compatible alias for the previous, misspelled variable name.
chembl_datsets = chembl_datasets

# Container for the MLP-2048 evaluation results; starts out empty.
mlp_2048_results = {}

## Evaluation

### MLP 2048

### Classification

#### Create eval datasets

In [7]:
# Instantiate the MoleculeACE `Data` object for the CHEMBL234_Ki dataset.
data = Data('CHEMBL234_Ki')

In [85]:
# Inspect the container type of the test labels (the recorded output is `list`).
type(data.y_test)

list

In [87]:
# Wrap the CHEMBL234_Ki classification test split, featurized with 'ecfp'
# fingerprints (length 2048, radius 4), in a DataLoader that keeps sample order.
x = DataLoader(
    MoleculeACEDataset(
        'CHEMBL234_Ki',
        data_split='test',
        task="classification",
        molfeat_featurizer=MoleculeTransformer(
            FPCalculator('ecfp', length=2048, radius=4)
        ),
    ),
    shuffle=False,
)

In [None]:
# Run the model over every batch and collect the predictions in a flat list.
# NOTE(review): `model` and `data_loader` are not defined in this cell; an
# earlier cell must create them before this one is run.
model.eval()  # inference behaviour for layers such as dropout / batch norm

# Use the device the model's parameters already live on, so that the inputs
# always end up on the same device as the model.
device = next(model.parameters()).device

predictions = []
with torch.no_grad():
    for batch in data_loader:
        # PyTorch's default collate function returns (inputs, targets) samples
        # as a list, so accept lists as well as tuples when extracting inputs.
        inputs = batch[0] if isinstance(batch, (tuple, list)) else batch

        # Align the inputs with the model's device.
        inputs = inputs.to(device)

        # Forward pass.
        batch_predictions = model(inputs)

        # Move the predictions to the CPU and convert them to a numpy array.
        batch_predictions = batch_predictions.cpu().numpy()

        # Append this batch's predictions, one entry per sample.
        predictions.extend(batch_predictions)


In [86]:
# CHEMBL234_Ki classification test split, featurized with 'ecfp'
# fingerprints (length 2048, radius 4).
y = MoleculeACEDataset(
    'CHEMBL234_Ki',
    data_split='test',
    task="classification",
    molfeat_featurizer=MoleculeTransformer(
        FPCalculator('ecfp', length=2048, radius=4)
    ),
)

In [92]:
# Data module for the CHEMBL234_Ki classification task, using the same
# 'ecfp' featurizer settings (length 2048, radius 4) and unshuffled batches of 64.
x = MoleculeACEDataModule(
    'CHEMBL234_Ki',
    'classification',
    molfeat_featurizer=MoleculeTransformer(
        FPCalculator('ecfp', length=2048, radius=4)
    ),
    batch_size=64,
    num_workers=1,
    pin_memory=False,
    shuffle=False,
)

In [104]:
# Run the data module's setup step; the recorded output shows it printing the
# dataset name and the train/test split sizes.
x.setup()

Setting up data...
CHEMBL234_Ki


Data object with molecules as: Nothing. 2923 train/734 test

In [107]:
# Number of batches in the training dataloader (recorded output: 46).
len(x.train_dataloader())

46

In [97]:
# NOTE(review): `model` is not defined in this cell; in this file it is only
# assigned further down, in the `ACAModule.load_from_checkpoint` cell, so it
# has to be loaded before this cell is run.
# NOTE(review): the recorded run of this cell ended with
# "TypeError: object of type 'NoneType' has no len()". Its execution count (97)
# is lower than that of the `x.setup()` cell (104), so the data module may not
# have been set up yet at that point - confirm.
trainer = Trainer()

predictions = trainer.predict(model, dataloaders=x.test_dataloader())

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.


TypeError: object of type 'NoneType' has no len()

#### MLP 2048

In [None]:
# Evaluate the MLP-2048 classification checkpoint of every dataset and print
# its overall RMSE and its activity-cliff RMSE.
for dataset in chembl_datasets:
    # Checkpoints are stored as "<dataset>.ckpt" inside the model directory.
    checkpoint_path = os.path.join(pretrained_model_dir, "classification", "mlp_2048",
                                   f"{dataset}.ckpt")

    # Load the model from the checkpoint onto the CPU, with the objective
    # passed explicitly.
    model = ACAModule.load_from_checkpoint(checkpoint_path,
                                           map_location=torch.device('cpu'),
                                           objective="classification")

    # Labels and SMILES of the train/test splits for this dataset.
    # NOTE(review): attribute names follow the MoleculeACE `Data` class -
    # confirm against the installed version.
    dataset_data = Data(dataset)
    y_test, smiles_test = dataset_data.y_test, dataset_data.smiles_test
    y_train, smiles_train = dataset_data.y_train, dataset_data.smiles_train

    # Run predictions
    # TODO(review): placeholder call - the prediction API of ACAModule is not
    # visible here; replace `...` with the featurized test inputs (or use a
    # Lightning `Trainer.predict` call) once confirmed.
    y_hat = model.predict(...)

    # Evaluate the model
    rmse = calc_rmse(y_test, y_hat)
    rmse_cliff = calc_cliff_rmse(y_test_pred=y_hat, y_test=y_test, smiles_test=smiles_test,
                                 y_train=y_train, smiles_train=smiles_train,
                                 in_log10=True, similarity=0.9, potency_fold=10)
    print(f"Dataset: {dataset}")
    print(f"rmse: {rmse}")
    print(f"rmse_cliff: {rmse_cliff}")

    # Release the model before loading the next checkpoint.
    del model
    gc.collect()

In [39]:
# Path to the MLP-2048 classification checkpoint for CHEMBL234_Ki.
checkpoint_path = os.path.join(
    pretrained_model_dir,
    "classification",
    "mlp_2048",
    "CHEMBL234_Ki.ckpt",
)

In [48]:
# Restore the checkpointed module on the CPU, passing the classification
# objective explicitly.
model = ACAModule.load_from_checkpoint(
    checkpoint_path,
    map_location=torch.device('cpu'),
    objective="classification",
)
# Lightning trainer with default settings.
trainer = Trainer()

/usr/local/lib/python3.10/dist-packages/lightning/pytorch/utilities/parsing.py:198: Attribute 'net' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['net'])`.
/usr/local/lib/python3.10/dist-packages/lightning/pytorch/utilities/parsing.py:198: Attribute 'criterion' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['criterion'])`.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


#### MLP 256

### Regression

## Refs

[1] Derek van Tilborg, Alisa Alenicheva, and Francesca Grisoni. “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs”. In: Journal of Chemical Information and Modeling 62.23 (Dec. 2022), pp. 5938–5951. DOI: 10.1021/acs.jcim.2c01073. URL: https://doi.org/10.1021/acs.jcim.2c01073.  
[2] César Miguel Valdez Córdova. Towards learning activity cliff-aware molecular representations. Publication pending.