# A Simple Experiment 
in which we try to predict the difference in molecular weight from a set of features

In [2]:
import rdkit.Chem
import sklearn
import pandas as pd
import numpy as np
from pathlib import Path
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn import svm, metrics, clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold, train_test_split

## load the data

In [3]:
mol_path = Path("/data/chodera/asap-datasets/mpro_fragalysis-04-01-24_curated_cache/combined_2d.sdf")

In [4]:
mols = [mol for mol in Chem.SDMolSupplier(str(mol_path))]

# Calculate molecular weight

In [5]:
mol_weight = [Descriptors.MolWt(mol) for mol in mols]

# Training data

In [6]:
mol1 = [mol for mol in mols for _ in mols]
weight1 = [weight for weight in mol_weight for _ in mol_weight]
mol2 = [mol for _ in mols for mol in mols]
weight2 = [weight for _ in mol_weight for weight in mol_weight]

In [7]:
df = pd.DataFrame({"mol1": mol1, "mol2": mol2, "weight1": weight1, "weight2": weight2})

In [8]:
df["weight_diff"] = np.abs(df["weight1"] - df["weight2"])

# methods for extracting features

In [9]:
def mol_to_fp(mol: rdkit.Chem.Mol, method="maccs", n_bits=2048):
    from rdkit.Chem import MACCSkeys, rdFingerprintGenerator
    """
    Encode a molecule from a SMILES string into a fingerprint.

    Parameters
    ----------
    smiles : str
        The SMILES string defining the molecule.

    method : str
        The type of fingerprint to use. Default is MACCS keys.

    n_bits : int
        The length of the fingerprint.

    Returns
    -------
    array
        The fingerprint array.

    """

    if method == "maccs":
        return np.array(MACCSkeys.GenMACCSKeys(mol))
    if method == "morgan2":
        fpg = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=n_bits)
        return np.array(fpg.GetFingerprint(mol))
    if method == "morgan3":
        fpg = rdFingerprintGenerator.GetMorganGenerator(radius=3, fpSize=n_bits)
        return np.array(fpg.GetFingerprint(mol))
    else:
        # NBVAL_CHECK_OUTPUT
        print(f"Warning: Wrong method specified: {method}. Default will be used instead.")
        return np.array(MACCSkeys.GenMACCSKeys(mol))

In [10]:
fps = [mol_to_fp(mol) for mol in mols]

In [11]:
df["maccs1"] = [fp for fp in fps for _ in fps]
df["maccs2"] = [fp for _ in fps for fp in fps]

In [12]:
ecfp4 = [mol_to_fp(mol, method="morgan2") for mol in mols]

In [13]:
df["morgan2_1"] = [fp for fp in ecfp4 for _ in ecfp4]
df["morgan2_2"] = [fp for _ in ecfp4 for fp in ecfp4]

In [14]:
df["combined_maccs"] = df.apply(lambda x: np.concatenate([x["maccs1"], x["maccs2"]]), axis=1)

In [15]:
df["combined_morgan2"] = df.apply(lambda x: np.concatenate([x["morgan2_1"], x["morgan2_2"]]), axis=1)

# Define ML Models

In [16]:
# actually we can't use a RF classifier because the thing we're trying to predict is continuous
# # Set model parameter for random forest
# param = {
#     "n_estimators": 100,  # number of trees to grows
#     "criterion": "entropy",  # cost function to be optimized for a split
# }
# model_RF = RandomForestClassifier(**param)

In [17]:
model_SVR = svm.SVR()

# Train Models

In [18]:
def model_performance(ml_model, test_x, test_y, verbose=True):
    """
    Calculate the performance of a machine learning model on a test set.

    Parameters
    ----------
    ml_model: sklearn model object
        The machine learning model to evaluate.
    test_x: array
        The test set descriptors.
    test_y: array
        The test set labels.
    verbose: bool
        Print performance info (default = True)

    Returns
    -------
    tuple:
        Accuracy, sensitivity, specificity, auc on test set.

    """
    # Predict the test set
    pred_y = ml_model.predict(test_x)

    # Calculate the accuracy
    accuracy = sklearn.metrics.accuracy_score(test_y, pred_y)

    # Calculate the sensitivity
    sens = sklearn.metrics.recall_score(test_y, pred_y)

    # Calculate the specificity
    spec = sklearn.metrics.recall_score(test_y, pred_y, pos_label=0)

    # Calculate the AUC
    auc = sklearn.metrics.roc_auc_score(test_y, pred_y)

    if verbose:
        print(f"Accuracy: {accuracy:.2f}")
        print(f"Sensitivity: {sens:.2f}")
        print(f"Specificity: {spec:.2f}")
        print(f"AUC: {auc:.2f}")

    return accuracy, sens, spec, auc

In [19]:
def model_training_and_validation(ml_model, name, splits, verbose=True):
    """
    Fit a machine learning model on a random train-test split of the data
    and return the performance measures.

    Parameters
    ----------
    ml_model: sklearn model object
        The machine learning model to train.
    name: str
        Name of machine learning algorithm: RF, SVM, ANN
    splits: list
        List of desciptor and label data: train_x, test_x, train_y, test_y.
    verbose: bool
        Print performance info (default = True)

    Returns
    -------
    tuple:
        Accuracy, sensitivity, specificity, auc on test set.

    """
    train_x, test_x, train_y, test_y = splits

    # Fit the model
    ml_model.fit(train_x, train_y)

    # Calculate model performance results
    accuracy, sens, spec, auc = model_performance(ml_model, test_x, test_y, verbose)

    return accuracy, sens, spec, auc

## Train / Test Split

In [20]:
fingerprint_to_model = df.combined_morgan2.tolist()
label_to_model = df.weight_diff.tolist()

# Split data randomly in train and test set
# note that we use test/train_x for the respective fingerprint splits
# and test/train_y for the respective label splits
(
    static_train_x,
    static_test_x,
    static_train_y,
    static_test_y,
) = train_test_split(fingerprint_to_model, label_to_model, test_size=0.8)
splits = [static_train_x, static_test_x, static_train_y, static_test_y]
# NBVAL_CHECK_OUTPUT
print("Training data size:", len(static_train_x))
print("Test data size:", len(static_test_x))

Training data size: 8405
Test data size: 33620


In [21]:
# Fit model on single split
performance_measures = model_training_and_validation(model_SVR, "SVR", splits)

ValueError: continuous is not supported