### Chemical Property Prediction and Evaluation

This notebook is *designed to perform chemical property prediction* using a *pre-trained model*, evaluate the model's performance using various metrics, and visualize the ROC curve to assess its discriminatory power. The specific chemical property being predicted is 'CYP1A2-inhibitor', and the code is organized into sections for clarity.

In [1]:
# Installing chemprop (https://github.com/chemprop/chemprop) for chemical property prediction
!pip install chemprop==2.0.0



In [2]:
# Importing necessary modules
import pandas as pd
import numpy as np
import os
import torch
import pandas as pd
import numpy as np
from chemprop import data, featurizers, models
from lightning import pytorch as pl
import math
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from pathlib import Path
from typing import List, Optional

In [3]:
# Define the base paths
BASE_DIRS = {
    'absorption': Path('..', 'absorption'),
    'distribution': Path('..', 'distribution'),
    'metabolism': Path('..', 'metabolism'),
    'excretion': Path('..', 'excretion'),
}

# Define a dictionary for model paths
model_paths = {
    'caco2': BASE_DIRS['absorption'] / 'caco2' / 'caco2.ckpt',
    'solubility': BASE_DIRS['absorption'] / 'solubility' / 'solubility.ckpt',
    'lipophilicity': BASE_DIRS['absorption'] / 'lipophilicity' / 'lipophilicity.ckpt',
    'ppbr': BASE_DIRS['distribution'] / 'ppbr' / 'ppbr.ckpt',
    'vdss': BASE_DIRS['distribution'] / 'vdss' / 'vdss.ckpt',
    'cyp1a2-inhibitor': BASE_DIRS['metabolism'] / 'cyp1a2-inhibitor' / 'cyp1a2-inhibitor.ckpt',
    'cyp1a2-substrate': BASE_DIRS['metabolism'] / 'cyp1a2-substrate' / 'cyp1a2-substrate.ckpt',
    'cyp2c19-inhibitor': BASE_DIRS['metabolism'] / 'cyp2c19-inhibitor' / 'cyp2c19-inhibitor.ckpt',
    'cyp2c19-substrate': BASE_DIRS['metabolism'] / 'cyp2c19-substrate' / 'cyp2c19-substrate.ckpt',
    'cyp2c9-inhibitor': BASE_DIRS['metabolism'] / 'cyp2c9-inhibitor' / 'cyp2c9-inhibitor.ckpt',
    'cyp2c9-substrate': BASE_DIRS['metabolism'] / 'cyp2c9-substrate' / 'cyp2c9-substrate.ckpt',
    'cyp2d6-inhibitor': BASE_DIRS['metabolism'] / 'cyp2d6-inhibitor' / 'cyp2d6-inhibitor.ckpt',
    'cyp2d6-substrate': BASE_DIRS['metabolism'] / 'cyp2d6-substrate' / 'cyp2d6-substrate.ckpt',
    'cl-hepa': BASE_DIRS['excretion'] / 'cl-hepa' / 'cl-hepa.ckpt',
    'cl-micro': BASE_DIRS['excretion'] / 'cl-micro' / 'cl-micro.ckpt',
    'half-life': BASE_DIRS['excretion'] / 'half-life' / 'half-life.ckpt',
}

In [4]:
# Utility functions for loading and applying the model
def get_model_path(property_name: str) -> Optional[Path]:
    """Retrieve the path for a given property model."""
    model_path = model_paths.get(property_name.lower())
    if model_path and model_path.exists():
        return model_path
    return None

def load_model(checkpoint_path: Path):
    """Load a pre-trained model from a checkpoint path."""
    try:
        return models.MPNN.load_from_checkpoint(str(checkpoint_path))
    except Exception as e:
        print(f"Error loading model from {checkpoint_path}: {e}")
        return None

def featurize_smiles(smiles_list: List[str]):
    """Convert SMILES strings to a dataset and return the dataloader."""
    test_data = [data.MoleculeDatapoint.from_smi(smi) for smi in smiles_list]
    featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()
    test_dset = data.MoleculeDataset(test_data, featurizer=featurizer)
    return data.build_dataloader(test_dset, shuffle=False)

def predict_property(model, dataloader):
    """Run inference on the provided model and dataloader."""
    with torch.inference_mode():
        trainer = pl.Trainer(
            logger=False,
            enable_progress_bar=False,
            accelerator="cpu",
            devices=1
        )
        test_preds = trainer.predict(model, dataloader)
        test_preds_flat = [item for sublist in test_preds for item in sublist]
    return test_preds_flat

def handle_user_input() -> str:
    """Handle user input for property name and validate it."""
    while True:
        property_name = input("Enter the property to calculate (e.g., solubility) or 'exit' to quit: ").strip().lower()
        if property_name == 'exit':
            return property_name
        model_path = get_model_path(property_name)
        if not model_path:
            print(f"Model for property '{property_name}' not found.")
        else:
            return property_name

def process_smiles_input() -> List[str]:
    """Get and process SMILES input from the user."""
    smiles_input = input("Enter the SMILES list (comma-separated): ").strip()
    return [smi.strip() for smi in smiles_input.split(',')]

In [None]:
def main():
    """Main function providing a CLI for predicting molecular properties."""
    while True:
        property_name = handle_user_input()
        if property_name == 'exit':
            print("Exiting the interface.")
            break
        
        model_path = get_model_path(property_name)
        if not model_path:
            continue
        
        smiles_list = process_smiles_input()

        mpnn = load_model(model_path)
        if mpnn is None:
            continue

        test_loader = featurize_smiles(smiles_list)
        test_preds = predict_property(mpnn, test_loader)

        test_preds_list = [pred[0] for pred in test_preds]
        df = pd.DataFrame({
            'SMILES': smiles_list,
            'Prediction': test_preds_list
        })
        
        print("Predictions:")
        print(df)

if __name__ == "__main__":
    main()

Enter the property to calculate (e.g., solubility) or 'exit' to quit:  caco2
Enter the SMILES list (comma-separated):  CCCC(=O)Nc1ccc(N2CCN(CC)CC2)c(Cl)c1


/opt/anaconda3/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:208: Attribute 'output_transform' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['output_transform'])`.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/anaconda3/lib/python3.12/site-packages/lightning/pytorch/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
/opt/anaconda3/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Predictions:
                                SMILES       Prediction
0  CCCC(=O)Nc1ccc(N2CCN(CC)CC2)c(Cl)c1  tensor(-4.6893)


Enter the property to calculate (e.g., solubility) or 'exit' to quit:  cyp1a2-inhibitor
Enter the SMILES list (comma-separated):  CCCC(=O)Nc1ccc(N2CCN(CC)CC2)c(Cl)c1


GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/anaconda3/lib/python3.12/site-packages/lightning/pytorch/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
/opt/anaconda3/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Predictions:
                                SMILES      Prediction
0  CCCC(=O)Nc1ccc(N2CCN(CC)CC2)c(Cl)c1  tensor(0.0005)
