# Message-passing neural network (MPNN) for molecular property prediction

In [None]:
#!pip -q install rdkit-pypi
#!pip -q install pandas
#!pip -q install Pillow
#!pip -q install matplotlib
#!pip -q install pydot
#!sudo apt-get -qq install graphviz

### Import packages

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
from pathlib import Path
# Temporarily suppress TensorFlow C++ logs (set before tensorflow is imported)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

import tensorflow as tf
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt

# Temporarily suppress warnings (currently disabled)
#warnings.filterwarnings("ignore")


np.random.seed(42)
tf.random.set_seed(42)

In [None]:
# NOTE(review): `get_fingerprint`, `X_train` and `X_test` are not defined or
# imported by any cell above this one, so this cell is expected to fail with
# NameError on a fresh kernel. It looks like a leftover from a fingerprint
# baseline — confirm and remove or supply the missing definitions.
X_train_fingerprint = [list(get_fingerprint(x)) for x in X_train]
X_test_fingerprint = [list(get_fingerprint(x)) for x in X_test]

In [46]:
from constants import *

## Data

In [None]:
# Read the raw CSV with a fresh 0..n-1 index, then preview the first rows.
df = pd.read_csv("../data/0_raw/data.csv").reset_index(drop=True)
df.head()

In [49]:
from data import validate_dataframe
# Rebinds `df` to the value returned by validate_dataframe; the captured log
# output shows the validator reporting the class proportions of the dataset.
df = validate_dataframe(df)

INFO:root: Data Validation | Dataset imbalance | Proportions: {1: 0.82, 0: 0.18}
INFO:root: Data Validation | Finished!


## Split to train/validation/test


Although scaffold splitting is recommended in our case (see
[here](https://www.blopig.com/blog/2021/06/out-of-distribution-generalisation-and-scaffold-splitting-in-molecular-property-prediction/)), for simplicity, random stratified splitting was
performed.

In [None]:
# NOTE(review): `get_fingerprint`, `X_train` and `X_test` are not defined by
# any visible cell (unless `from constants import *` supplies them — unlikely,
# confirm). This cell also repeats an identical earlier cell; it looks like a
# leftover that should be removed.
X_train_fingerprint = [list(get_fingerprint(x)) for x in X_train]
X_test_fingerprint = [list(get_fingerprint(x)) for x in X_test]

In [22]:
from data import split_data

# Split the input data into train/validation/test files; the cell output shows
# the tuple of file paths returned by split_data.
split_data(
    data_path=INPUT_DATA_PATH,
    output_path=INTERMEDIATE_DATA_PATH,
    test_only=False,
)

INFO:root: Data Validation | Dataset imbalance | Proportions: {1: 0.82, 0: 0.18}
INFO:root: Data Validation | Finished!
INFO:root: Data Splitting | Train: 0.7, Valid: 0.15, Test: 0.15
INFO:root: Data Splitting | Finished!


('../data/1_primary/data_train.csv',
 '../data/1_primary/data_valid.csv',
 '../data/1_primary/data_test.csv')

In [28]:
# Load the three split files in train / valid / test order and show their sizes.
df_train, df_valid, df_test = (
    pd.read_csv(INTERMEDIATE_DATA_PATH / filename)
    for filename in ("data_train.csv", "data_valid.csv", "data_test.csv")
)
tuple(map(len, (df_train, df_valid, df_test)))

(3499, 750, 750)

## Featurization, Graph Generation & DataSet Creation


In [30]:
from data import get_mpnn_dataset

# Only the training call requests the atom/bond feature dimensions
# (return_dims=True); the other two splits just need the dataset object.
train_dataset, atom_dim, bond_dim = get_mpnn_dataset(df_train, return_dims=True)
valid_dataset, test_dataset = (
    get_mpnn_dataset(frame) for frame in (df_valid, df_test)
)

In [32]:
# Display the feature-data directory. The name is not assigned in any visible
# cell; presumably it comes from `from constants import *` — confirm.
FEATURE_DATA_PATH

PosixPath('../data/2_feature')

In [34]:
# Persist the training dataset under the relative path "train_dataset".
# NOTE(review): FEATURE_DATA_PATH is displayed in the preceding cell but is not
# used here — confirm whether the dataset was meant to be written there.
tf.data.experimental.save(train_dataset, "train_dataset")

In [37]:
# Show the dataset repr to inspect its element shapes and dtypes.
train_dataset

<PrefetchDataset shapes: (((None, None), (None, None), (None, None), (None,)), (None,)), types: ((tf.float32, tf.float32, tf.int64, tf.int32), tf.int64)>

### Handle imbalance

In [35]:
# Reload the dataset from the same relative path it was saved to above.
# NOTE(review): `tr_ds` is not referenced by any other visible cell — confirm
# whether this round-trip check is still needed.
tr_ds = tf.data.experimental.load("train_dataset")

In [43]:
from modeling import get_imbalance_params, MPNNModel
from data import get_mpnn_dataset

# Compute the imbalance parameters from the training frame and display them;
# the captured output shows a one-element array and a {class: weight} dict.
initial_bias, class_weight = get_imbalance_params(df_train)
initial_bias, class_weight

(array([1.52765758]), {0: 2.8036858974358974, 1: 0.6085217391304348})

In [44]:
def train(data_train_path, data_valid_path, save_model_path="models/my_model", handle_imbalance=False):
    """Train an MPNN classifier from CSV files and save it to disk.

    Args:
        data_train_path: Path of the training CSV. It is read with
            ``pd.read_csv`` and passed through ``validate_dataframe``.
        data_valid_path: Path of the validation CSV, or ``None`` to train
            without validation data.
        save_model_path: Destination handed to ``model.save``.
        handle_imbalance: When truthy, the output bias and the per-class loss
            weights are taken from ``get_imbalance_params(df_train)``;
            otherwise both stay ``None``.

    Returns:
        Tuple ``(model, history)``: the fitted ``MPNNModel`` and the object
        returned by ``model.fit``.
    """
    df_train = validate_dataframe(pd.read_csv(data_train_path))
    train_dataset, atom_dim, bond_dim = get_mpnn_dataset(df_train, return_dims=True)

    # Bind the name unconditionally so fit() never sees an undefined (or
    # accidentally global) validation dataset when no path is given.
    valid_dataset = None
    if data_valid_path is not None:
        valid_dataset = get_mpnn_dataset(pd.read_csv(data_valid_path))

    # Both imbalance parameters default to None; the local must be spelled
    # `class_weight` because that is the name passed to fit() below.
    initial_bias = None
    class_weight = None
    if handle_imbalance:
        initial_bias, class_weight = get_imbalance_params(df_train)

    model = MPNNModel(
        atom_dim=atom_dim, bond_dim=bond_dim, output_bias=initial_bias,
    )

    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(),
        optimizer=tf.keras.optimizers.Adam(learning_rate=5e-4),
        metrics=[tf.keras.metrics.AUC(name="AUC")],
    )

    # `val_loss` only exists when validation data is supplied; fall back to the
    # training loss so both callbacks still have a metric to watch.
    monitor = "val_loss" if valid_dataset is not None else "loss"
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
        monitor=monitor, factor=0.1, patience=5, min_lr=1e-7,
    )
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor=monitor,
        patience=5,
    )

    # Warnings raised during fitting are silenced for this call only.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        history = model.fit(
            train_dataset,
            validation_data=valid_dataset,
            epochs=MAX_EPOCHS,
            verbose=2,
            callbacks=[reduce_lr, early_stopping],
            class_weight=class_weight,
        )
    model.save(save_model_path)
    return model, history

In [50]:
# Bind the returned objects to names: the AUC plotting cell further down reads
# `history.history`, which is otherwise never assigned at notebook level.
model, history = train(
    INTERMEDIATE_DATA_PATH / DATA_TRAIN_FILENAME,
    INTERMEDIATE_DATA_PATH / DATA_VALID_FILENAME,
    save_model_path=MODEL_DATA_PATH / "mpnn_model",
    handle_imbalance=True,
)

INFO:root: Data Validation | Dataset imbalance | Proportions: {1: 0.82, 0: 0.18}
INFO:root: Data Validation | Finished!


Epoch 1/2
110/110 - 9s - loss: 0.6875 - AUC: 0.6093 - val_loss: 0.5343 - val_AUC: 0.6568 - lr: 5.0000e-04 - 9s/epoch - 82ms/step
Epoch 2/2
110/110 - 8s - loss: 0.6366 - AUC: 0.6962 - val_loss: 0.5878 - val_AUC: 0.6742 - lr: 5.0000e-04 - 8s/epoch - 72ms/step




INFO:tensorflow:Assets written to: ../data/3_model/my_model/assets


INFO:tensorflow:Assets written to: ../data/3_model/my_model/assets


(<keras.engine.functional.Functional at 0x157124b20>,
 <keras.callbacks.History at 0x286ee6520>)

In [None]:
# NOTE(review): this cell repeats the callback/fit logic that also lives inside
# train(). `validation_dataset` is not assigned in any visible cell (the dataset
# built earlier is named `valid_dataset`) — confirm whether this cell is a
# leftover that should be removed.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1,
                              patience=5, min_lr=1e-7)
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
)

# NOTE(review): Keras documents `validation_split` as unsupported when the
# input is a tf.data dataset — confirm this branch can run at all. It also uses
# hard-coded class weights, unlike the else-branch which uses `class_weight`.
if validation_dataset is None:
    history = model.fit(
    train_dataset,
    validation_split=VALIDATION_SIZE,
    epochs=MAX_EPOCHS,
    verbose=2,
    callbacks=[reduce_lr, early_stopping],
    class_weight={0: 2.0, 1: 0.5},
    )
else:
    history = model.fit(
        train_dataset,
        validation_data=valid_dataset,
        epochs=MAX_EPOCHS,
        verbose=2,
        callbacks=[reduce_lr, early_stopping],
        class_weight=class_weight,
    )

In [None]:
def evaluate(data_path, model_path):
    """Evaluate a saved model on a CSV dataset.

    The split files used in this notebook are CSV files, so the data is read
    with ``pd.read_csv`` (not ``pd.read_pickle``) and validated the same way
    the training data is.

    Args:
        data_path: Path of the CSV file to evaluate on.
        model_path: Location passed to ``tf.keras.models.load_model``.

    Returns:
        Whatever ``model.evaluate`` returns for the generated dataset.
    """
    df_data = validate_dataframe(pd.read_csv(data_path))
    dataset = get_mpnn_dataset(df_data)
    model = tf.keras.models.load_model(model_path)
    return model.evaluate(dataset)

In [None]:
# NOTE(review): the second value is bound to `acc`, but train() compiles the
# model with AUC as its only metric, so this is presumably an AUC — confirm the
# name. The model is loaded from "models/my_model", whereas the train() call
# above saves to MODEL_DATA_PATH/"mpnn_model" — confirm the intended path.
loss, acc = evaluate(data_path=INTERMEDIATE_DATA_PATH/DATA_VALID_FILENAME,
        model_path="models/my_model",)

In [None]:
def predict(model_path, data_path=None, smiles=None):
    """Run a saved model on a CSV file or on raw SMILES input.

    Args:
        model_path: Location passed to ``tf.keras.models.load_model``.
        data_path: Optional path of a CSV file to predict on. Takes precedence
            over ``smiles`` when both are given.
        smiles: Optional SMILES input, forwarded unchanged to
            ``get_mpnn_dataset``.

    Returns:
        The result of ``model.predict`` on the generated dataset.

    Raises:
        ValueError: If neither ``data_path`` nor ``smiles`` is provided.
    """
    # Validate the arguments before the (expensive) model load, and fail loudly
    # instead of returning a sentinel string that callers could mistake for
    # predictions.
    if data_path is None and smiles is None:
        raise ValueError("No data input is given!")

    model = tf.keras.models.load_model(model_path)
    if data_path is not None:
        # The split files produced in this notebook are CSV, not pickles.
        dataset = get_mpnn_dataset(pd.read_csv(data_path))
    else:
        # NOTE(review): get_mpnn_dataset receives DataFrames elsewhere in this
        # notebook — confirm it also accepts a list of SMILES strings.
        dataset = get_mpnn_dataset(smiles)
    return model.predict(dataset)

In [None]:
# Predict on the held-out test split file.
# NOTE(review): loads "models/my_model", while the train() call above saves to
# MODEL_DATA_PATH/"mpnn_model" — confirm which model is intended.
predictions = predict(model_path="models/my_model",
        data_path=INTERMEDIATE_DATA_PATH/DATA_TEST_FILENAME,
       )

In [None]:
# Predict for a single molecule given directly as a SMILES string (one-element list).
preds = predict(model_path="models/my_model",
        smiles=['CC1=C(C(=O)Nc2cc(-c3cccc(F)c3)[nH]n2)C2(CCCCC2)OC1=O',],
       )

In [None]:
# Training vs. validation AUC per epoch, drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(history.history["AUC"], label="train AUC")
ax.plot(history.history["val_AUC"], label="valid AUC")
ax.set_xlabel("Epochs", fontsize=16)
ax.set_ylabel("AUC", fontsize=16)
ax.legend(fontsize=16)

### Predicting

In [None]:
# NOTE(review): `molecule_from_smiles`, `test_index`, `mpnn` and
# `MolsToGridImage` are not defined or imported in any visible cell, so this
# cell cannot run on a fresh kernel as written — confirm whether it is a
# leftover from an earlier version of the notebook.
molecules = [molecule_from_smiles(df.smiles.values[index]) for index in test_index]
y_true = [df.p_np.values[index] for index in test_index]
y_pred = tf.squeeze(mpnn.predict(test_dataset), axis=1)

# One legend per molecule: true label / predicted value with two decimals.
legends = [f"y_true/y_pred = {y_true[i]}/{y_pred[i]:.2f}" for i in range(len(y_true))]
MolsToGridImage(molecules, molsPerRow=4, legends=legends)

In [None]:
# Show the SMILES of the first test row.
# NOTE(review): `test_index` is not defined in any visible cell — confirm.
df.iloc[test_index].head(1).smiles.values

In [None]:
# Build graph inputs for the single row at position 134.
# NOTE(review): `graphs_from_smiles` is not defined or imported in any visible cell — confirm.
sample = graphs_from_smiles(df.iloc[134:135].smiles.values)
len(sample)

In [None]:
# NOTE(review): `MPNNDataset` is not defined or imported in any visible cell — confirm.
sample_dataset = MPNNDataset(sample, None)

In [None]:
# Drop axis 1 from the prediction output.
# NOTE(review): `mpnn` is not assigned in any visible cell — confirm.
sample_preds = tf.squeeze(mpnn.predict(sample_dataset), axis=1)

In [None]:
# Display the squeezed sample predictions.
sample_preds

In [None]:
# Display the length of `sample` (same expression as at the end of the cell that builds it).
len(sample)

In [None]:
# Tuple concatenation; evaluates to (1, 2, 3, 4).
# NOTE(review): uses no notebook data — looks like scratch work; confirm it can be removed.
(1, 2, 3) + (4,)

In [None]:
def evaluate(data_path, model_path):
    """Score a saved model on the data stored in a CSV file.

    Args:
        data_path: Path of the CSV file; its contents are passed through
            ``validate_dataframe`` before the dataset is built.
        model_path: Location passed to ``tf.keras.models.load_model``.

    Returns:
        The value returned by the loaded model's ``evaluate`` method.
    """
    validated = validate_dataframe(pd.read_csv(data_path))
    eval_dataset = get_mpnn_dataset(validated)
    return tf.keras.models.load_model(model_path).evaluate(eval_dataset)

In [None]:
# NOTE(review): the second value is bound to `acc`, but train() compiles the
# model with AUC as its only metric, so this is presumably an AUC — confirm the
# name. The model is loaded from "models/my_model", whereas the train() call
# above saves to MODEL_DATA_PATH/"mpnn_model" — confirm the intended path.
loss, acc = evaluate(data_path=INTERMEDIATE_DATA_PATH/DATA_VALID_FILENAME,
        model_path="models/my_model",)

In [None]:
def predict(model_path, data_path=None, smiles=None):
    """Run a saved model on a CSV file or on raw SMILES input.

    Args:
        model_path: Location passed to ``tf.keras.models.load_model``.
        data_path: Optional path of a CSV file to predict on. Takes precedence
            over ``smiles`` when both are given.
        smiles: Optional SMILES input, forwarded unchanged to
            ``get_mpnn_dataset``.

    Returns:
        The result of ``model.predict`` on the generated dataset.

    Raises:
        ValueError: If neither ``data_path`` nor ``smiles`` is provided.
            (``ValueError`` is a subclass of ``Exception``, so callers that
            catch ``Exception`` keep working.)
    """
    # Reject a call without input before paying for the model load.
    if data_path is None and smiles is None:
        raise ValueError('No data input is given!')

    model = tf.keras.models.load_model(model_path)
    if data_path is not None:
        df_data = pd.read_csv(data_path)
        df_data = validate_dataframe(df_data, predict=True)
        if COL_TARGET not in df_data:
            # Prediction files may have no label column; add a constant
            # placeholder so the dataset builder receives the column it reads.
            df_data[COL_TARGET] = 0
        dataset = get_mpnn_dataset(df_data)
    else:
        # NOTE(review): get_mpnn_dataset receives DataFrames elsewhere in this
        # notebook — confirm it also accepts a list of SMILES strings.
        dataset = get_mpnn_dataset(smiles)
    return model.predict(dataset)

In [None]:
# Predict on the held-out test split file.
# NOTE(review): loads "models/my_model", while the train() call above saves to
# MODEL_DATA_PATH/"mpnn_model" — confirm which model is intended.
predictions = predict(model_path="models/my_model",
        data_path=INTERMEDIATE_DATA_PATH/DATA_TEST_FILENAME,
       )

In [None]:
# Display the "smiles" column of the raw frame (pandas truncates long Series in the output).
df["smiles"]

In [None]:
# Display the predictions computed for the test split file.
predictions

In [None]:
# Predict directly from two SMILES strings instead of a CSV file.
predict(
    model_path="models/my_model",
    smiles=[
        "Cc1cccc(N2CCN(C(=O)C34CC5CC(CC(C5)C3)C4)CC2)c1C",
        "Cn1ccnc1SCC(=O)Nc1ccc(Oc2ccccc2)cc1",
    ],
)