# MPNNs for Predicting Partial Charges
This notebook builds message-passing neural networks (MPNNs) that predict the partial charge of each atom in a molecule.

In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
from nfp.preprocessing import SmilesPreprocessor
from nfp.models import GraphModel
from nfp.layers import (MessageLayer, GRUStep, Set2Set, ReduceAtomToMol, 
                        Embedding2D, Embedding2DCompressed, Squeeze)
from keras import backend as K
from keras.layers import (Add, Input, Dense, BatchNormalization, Reshape, Concatenate,
                          Activation, Dropout, Embedding, Lambda)
from jcesr_ml.benchmark import load_benchmark_data
from jcesr_ml.mpnn import save_model_files, AtomicPropertySequence, TotalChargeLayer
import numpy as np

Using TensorFlow backend.


## Load in Datasets
We need the training set to fit the preprocessor, which in turn determines the atom and bond classes (and therefore the input sizes) of the models.

In [2]:
# Keep only the first item returned by `load_benchmark_data`; the second is discarded
train_data, _ = load_benchmark_data()

## Make the Preprocessing Tools
These tools convert the SMILES representation of a molecule into a set of features needed for the graph training.

In [3]:
# Create the SMILES-to-graph preprocessor with `explicit_hs=True`
#  NOTE(review): presumably this makes hydrogens explicit graph nodes - confirm against the nfp docs
preprocessor = SmilesPreprocessor(explicit_hs=True)

In [4]:
# Fit the preprocessor on the SMILES strings of the training set
#  The trailing semicolon suppresses the cell's output repr
preprocessor.fit(train_data['smiles_0']);

100%|██████████| 117232/117232 [00:56<00:00, 2063.07it/s]


## Make Utility Functions
Define a model-building function. The resulting models are written to disk with the imported `save_model_files` helper.

In [5]:
def build_fn(preprocessor, embedding=128, dense_layers=(64, 32),
             message_steps=6, activation='softplus'):
    """Build a MPNN Keras model that produces one output value per atom

    Adapted from: https://github.com/NREL/nfp/blob/master/examples/run_2D_model_noatom_bn.py

    Args:
        preprocessor (SmilesPreprocessor): Tool to generate inputs from SMILES string.
            Its ``atom_classes`` and ``bond_classes`` attributes are used as the
            sizes of the atom and bond embedding tables
        embedding (int): Size of the atom/bond embedding
        dense_layers ([int]): Number of units in each hidden dense layer applied
            to the atom states after message passing
        message_steps (int): Number of message-passing steps
        activation (str): Activation function for the layers sized by ``dense_layers``
    Returns:
        (GraphModel): Model with the four graph inputs (``node_graph_indices``, ``atom``,
            ``bond``, ``connectivity``) and a single per-atom output
    """

    # Raw (integer) graph inputs
    #  node_graph_indices - Maps the atom index to which molecule it came from
    #  atom_types - Categorical type of each atom
    #  bond_types - Categorical type of each bond
    #  connectivity - Atoms on each end of each bond
    # Note: node_graph_indices is not consumed by any layer of this network. It is
    #  still declared and passed to GraphModel so the model's input signature is unchanged
    node_graph_indices = Input(shape=(1,), name='node_graph_indices', dtype='int32')
    atom_types = Input(shape=(1,), name='atom', dtype='int32')
    bond_types = Input(shape=(1,), name='bond', dtype='int32')
    connectivity = Input(shape=(2,), name='connectivity', dtype='int32')

    # The "type" inputs have 1 feature per "entry"
    #  The Squeeze layer removes this singleton dimension to make the data easier to use
    squeeze = Squeeze()
    satom_types = squeeze(atom_types)
    sbond_types = squeeze(bond_types)

    # Create the embedding for each atom type
    atom_state = Embedding(
        preprocessor.atom_classes,
        embedding, name='atom_embedding')(satom_types)

    # Create the embedding for each bond type
    bond_matrix = Embedding2DCompressed(
        preprocessor.bond_classes,
        embedding, name='bond_embedding')(sbond_types)

    # The core of the message passing framework: Recurrent and Message-passing layers
    #  The Message Layer computes an update message for each atom given the state of its neighbors
    #  The Recurrent Layer (GRUStep) computes how the state of the atom changes given a message
    # Both layers are created once, so every message-passing step shares the same weights
    atom_rnn_layer = GRUStep(embedding)
    message_layer = MessageLayer(reducer='sum')

    # Perform the message passing
    for _ in range(message_steps):

        # Get the message updates to each atom
        message = message_layer([atom_state, bond_matrix, connectivity])

        # Update memory and atom states
        atom_state = atom_rnn_layer([message, atom_state])

    # After the message passing steps, pass each atom's state through a stack of dense layers
    #  The first layer keeps the embedding width and uses a sigmoid activation
    atom_out = Dense(embedding, activation='sigmoid')(atom_state)

    for layer_size in dense_layers:
        atom_out = Dense(layer_size, activation=activation)(atom_out)

    # Reduce to one feature per atom
    atom_out = Dense(1, activation='linear')(atom_out)

    return GraphModel([node_graph_indices, atom_types, bond_types, connectivity], [atom_out])

In [6]:
# Build the standard (not charge-balanced) MPNN with the default hyperparameters of `build_fn`
model = build_fn(preprocessor)

Instructions for updating:
Colocations handled automatically by placer.


In [7]:
# Save the model and preprocessor under the name 'standard', with `mapped_charges` as the
#  output property and normalization turned off
#  NOTE(review): what is written, and when an existing output is skipped, is decided inside
#  `save_model_files` (jcesr_ml.mpnn) - confirm there
save_model_files('standard', preprocessor, model, output_props=['mapped_charges'], normalize=False)

Already output. Skipping


## Charge-Balanced MPNN
An MPNN that constrains its predicted atomic charges so that the total charge on each molecule is zero.

In [8]:
def build_fn(preprocessor, embedding=128, dense_layers=(64, 32),
             message_steps=6, activation='softplus'):
    """Assemble a charge-balanced MPNN as a Keras model

    The network matches the standard per-atom MPNN, with a final ``TotalChargeLayer``
    that combines the per-atom outputs with the atom-to-molecule index.
    Note that this definition reuses the name ``build_fn`` and therefore replaces
    the earlier function of the same name once this cell has run.

    Adapted from: https://github.com/NREL/nfp/blob/master/examples/run_2D_model_noatom_bn.py

    Args:
        preprocessor (SmilesPreprocessor): Tool to generate inputs from SMILES string.
            Its ``atom_classes`` and ``bond_classes`` size the embedding tables
        embedding (int): Size of the atom/bond embedding
        dense_layers ([int]): Units in each hidden dense layer of the per-atom readout
        message_steps (int): Number of message-passing steps
        activation (str): Activation function for the layers sized by ``dense_layers``
    Returns:
        (GraphModel): Model with inputs ``node_graph_indices``, ``atom``, ``bond`` and
            ``connectivity`` and one output: the result of the ``TotalChargeLayer``
    """

    # Integer-valued graph inputs:
    #  which molecule each atom belongs to, the categorical atom and bond types,
    #  and the pair of atoms at the ends of each bond
    mol_index_in = Input(shape=(1,), name='node_graph_indices', dtype='int32')
    atom_type_in = Input(shape=(1,), name='atom', dtype='int32')
    bond_type_in = Input(shape=(1,), name='bond', dtype='int32')
    connectivity = Input(shape=(2,), name='connectivity', dtype='int32')

    # Strip the singleton feature dimension from the index/type inputs with one shared layer
    drop_singleton = Squeeze()
    mol_index, atom_type, bond_type = [
        drop_singleton(tensor) for tensor in (mol_index_in, atom_type_in, bond_type_in)
    ]

    # Learned embeddings: a vector per atom type and a compressed 2D embedding per bond type
    atom_embedder = Embedding(preprocessor.atom_classes, embedding, name='atom_embedding')
    atom_state = atom_embedder(atom_type)
    bond_embedder = Embedding2DCompressed(preprocessor.bond_classes, embedding,
                                          name='bond_embedding')
    bond_matrix = bond_embedder(bond_type)

    # Message-passing machinery, created once so all steps share the same weights
    #  state_updater: GRU step that folds an incoming message into the atom state
    #  message_passer: sums the messages arriving at each atom from its bonded neighbors
    state_updater = GRUStep(embedding)
    message_passer = MessageLayer(reducer='sum')

    for _step in range(message_steps):
        incoming = message_passer([atom_state, bond_matrix, connectivity])
        atom_state = state_updater([incoming, atom_state])

    # Per-atom readout, described as (units, activation) pairs:
    #  a sigmoid layer of embedding width, the user-specified hidden layers,
    #  and finally a linear layer that leaves one feature per atom
    readout_spec = [(embedding, 'sigmoid')]
    readout_spec += [(units, activation) for units in dense_layers]
    readout_spec.append((1, 'linear'))

    per_atom = atom_state
    for units, layer_activation in readout_spec:
        per_atom = Dense(units, activation=layer_activation)(per_atom)

    # Make sure the molecules are charge balanced
    balanced = TotalChargeLayer()([mol_index, per_atom])

    return GraphModel([mol_index_in, atom_type_in, bond_type_in, connectivity], [balanced])

In [9]:
# Build the charge-balanced MPNN with default hyperparameters
#  `build_fn` here is the definition from the cell above, which shadows the earlier one
model = build_fn(preprocessor)

In [10]:
# Save the model and preprocessor under the name 'charge-balanced', with `mapped_charges` as
#  the output property and normalization turned off
#  NOTE(review): what is written, and when an existing output is skipped, is decided inside
#  `save_model_files` (jcesr_ml.mpnn) - confirm there
save_model_files('charge-balanced', preprocessor, model, output_props=['mapped_charges'], normalize=False)

Already output. Skipping
