# Neural Fingerprint Solubility result

In [1]:
# Example regression script using neural fingerprints.
#
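# Can compare Morgan fingerprints to neural fingerprints; as written, main() only
# runs the neural fingerprint experiment (the Morgan run is commented out).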

import autograd.numpy as np
import autograd.numpy.random as npr

from neuralfingerprint import load_data
from neuralfingerprint import build_morgan_deep_net
from neuralfingerprint import build_conv_deep_net
from neuralfingerprint import normalize_array, adam
from neuralfingerprint import build_batched_grad
from neuralfingerprint.util import rmse

from autograd import grad

# Dataset selection: the target column and the data file handed to load_data.
task_params = dict(target_name='measured log solubility in mols per litre',
                   data_file='delaney.csv')

# Split sizes handed to load_data as (train, validation, test).
N_train, N_val, N_test = 800, 20, 20

model_params = {
    'fp_length': 50,        # Neural fps usually need far fewer dimensions than morgan.
    'fp_depth': 6,          # Network depth equals the fingerprint radius.
    'conv_width': 30,       # Used by the neural fps only.
    'h1_size': 100,         # Hidden layer size of the network on top of the fps.
    'L2_reg': np.exp(-2),
}
train_params = {
    'num_iters': 750,
    'batch_size': 100,
    'init_scale': np.exp(-4),
    'step_size': np.exp(-6),
}

# Architecture of the network stacked on top of the fingerprints.
vanilla_net_params = {
    'layer_sizes': [model_params['fp_length'], model_params['h1_size']],  # One hidden layer.
    'normalize': True,
    'L2_reg': model_params['L2_reg'],
    'nll_func': rmse,
}

def train_nn(pred_fun, loss_fun, num_weights, train_smiles, train_raw_targets, train_params, seed=0,
             validation_smiles=None, validation_raw_targets=None):
    """loss_fun has inputs (weights, smiles, targets)"""
    print "Total number of weights in the network:", num_weights
    init_weights = npr.RandomState(seed).randn(num_weights) * train_params['init_scale']

    num_print_examples = 100
    train_targets, undo_norm = normalize_array(train_raw_targets)
    training_curve = []
    def callback(weights, iter):
        if iter % 10 == 0:
            print "max of weights", np.max(np.abs(weights))
            train_preds = undo_norm(pred_fun(weights, train_smiles[:num_print_examples]))
            cur_loss = loss_fun(weights, train_smiles[:num_print_examples], train_targets[:num_print_examples])
            training_curve.append(cur_loss)
            print "Iteration", iter, "loss", cur_loss,\
                  "train RMSE", rmse(train_preds, train_raw_targets[:num_print_examples]),
            if validation_smiles is not None:
                validation_preds = undo_norm(pred_fun(weights, validation_smiles))
                print "Validation RMSE", iter, ":", rmse(validation_preds, validation_raw_targets),

    # Build gradient using autograd.
    grad_fun = grad(loss_fun)
    grad_fun_with_data = build_batched_grad(grad_fun, train_params['batch_size'],
                                            train_smiles, train_targets)

    # Optimize weights.
    trained_weights = adam(grad_fun_with_data, init_weights, callback=callback,
                           num_iters=train_params['num_iters'], step_size=train_params['step_size'])

    def predict_func(new_smiles):
        """Returns to the original units that the raw targets were in."""
        return undo_norm(pred_fun(trained_weights, new_smiles))
    return predict_func, trained_weights, training_curve


def main():
    print "Loading data..."
    traindata, valdata, testdata = load_data(
        task_params['data_file'], (N_train, N_val, N_test),
        input_name='smiles', target_name=task_params['target_name'])
    train_inputs, train_targets = traindata
    val_inputs,   val_targets   = valdata
    test_inputs,  test_targets  = testdata

    def print_performance(pred_func):
        train_preds = pred_func(train_inputs)
        val_preds = pred_func(val_inputs)
        print "\nPerformance (RMSE) on " + task_params['target_name'] + ":"
        print "Train:", rmse(train_preds, train_targets)
        print "Test: ", rmse(val_preds,  val_targets)
        print "-" * 80
        return rmse(val_preds, val_targets)

    def run_morgan_experiment():
        loss_fun, pred_fun, net_parser = \
            build_morgan_deep_net(model_params['fp_length'],
                                  model_params['fp_depth'], vanilla_net_params)
        num_weights = len(net_parser)
        predict_func, trained_weights, conv_training_curve = \
            train_nn(pred_fun, loss_fun, num_weights, train_inputs, train_targets,
                     train_params, validation_smiles=val_inputs, validation_raw_targets=val_targets)
        return print_performance(predict_func)

    def run_conv_experiment():
        conv_layer_sizes = [model_params['conv_width']] * model_params['fp_depth']
        conv_arch_params = {'num_hidden_features' : conv_layer_sizes,
                            'fp_length' : model_params['fp_length'], 'normalize' : 1}
        loss_fun, pred_fun, conv_parser = \
            build_conv_deep_net(conv_arch_params, vanilla_net_params, model_params['L2_reg'])
        num_weights = len(conv_parser)
        predict_func, trained_weights, conv_training_curve = \
            train_nn(pred_fun, loss_fun, num_weights, train_inputs, train_targets,
                     train_params, validation_smiles=val_inputs, validation_raw_targets=val_targets)
        test_predictions = predict_func(test_inputs)
        return rmse(test_predictions, test_targets)

    print "Task params", task_params
    print
    #print "Starting Morgan fingerprint experiment..."
    #test_loss_morgan = run_morgan_experiment()
    print "Starting neural fingerprint experiment..."
    test_loss_neural = run_conv_experiment()
    print
    #print "Morgan test RMSE:", test_loss_morgan, "Neural test RMSE:", test_loss_neural
    print "Neural test RMSE:", test_loss_neural

# Run the experiment only when this code is executed as the main module.
if __name__ == '__main__':
    main()


Loading data...
Task params {'target_name': 'measured log solubility in mols per litre', 'data_file': 'delaney.csv'}

Starting neural fingerprint experiment...
Total number of weights in the network: 68831
max of weights 0.08886963478129936
Iteration 0 loss 1.0719451738994792 train RMSE 2.2340124410642304 Validation RMSE 0 : 2.5261672692217907 max of weights 0.09101903063397289
Iteration 10 loss 0.8723554254834599 train RMSE 1.817927278307013 Validation RMSE 10 : 1.9413312880371727 max of weights 0.11078477064514371
Iteration 20 loss 0.8536294059961078 train RMSE 1.7787770464188364 Validation RMSE 20 : 1.9544333731754195 max of weights 0.13763792901196806
Iteration 30 loss 0.7497380453642386 train RMSE 1.5620284153321324 Validation RMSE 30 : 1.7497666606754347 max of weights 0.16030121642932257
Iteration 40 loss 0.589444734418833 train RMSE 1.2277007656195902 Validation RMSE 40 : 1.4871873276793541 max of weights 0.17854017192571806
Iteration 50 loss 0.5316706205797443 train RMSE 1.107

Iteration 550 loss 0.29048458528884497 train RMSE 0.6022145996638145 Validation RMSE 550 : 0.7869607408011478 max of weights 0.7370442888662246
Iteration 560 loss 0.2842842032458141 train RMSE 0.5892681631196803 Validation RMSE 560 : 0.756760267204342 max of weights 0.7484396999247419
Iteration 570 loss 0.24843683568301336 train RMSE 0.5145227415313163 Validation RMSE 570 : 0.6996722674714357 max of weights 0.756093627471169
Iteration 580 loss 0.26534273040413947 train RMSE 0.5497345633975582 Validation RMSE 580 : 0.6622284480872511 max of weights 0.7610512795845409
Iteration 590 loss 0.2751201676693799 train RMSE 0.5700865865318614 Validation RMSE 590 : 0.6785113162181864 max of weights 0.7630715524633231
Iteration 600 loss 0.2702422910863891 train RMSE 0.5598863531263204 Validation RMSE 600 : 0.6634249659431887 max of weights 0.7714029022037596
Iteration 610 loss 0.24695178034687978 train RMSE 0.5113213920227726 Validation RMSE 610 : 0.6290468116358046 max of weights 0.77958010517654

# Neural Fingerprint photovoltaic efficiency result

In [7]:
# Example regression script using neural fingerprints.
#
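# Can compare Morgan fingerprints to neural fingerprints; as written, main() only
# runs the neural fingerprint experiment (the Morgan run is commented out).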

import autograd.numpy as np
import autograd.numpy.random as npr

from neuralfingerprint import load_data
from neuralfingerprint import build_morgan_deep_net
from neuralfingerprint import build_conv_deep_net
from neuralfingerprint import normalize_array, adam
from neuralfingerprint import build_batched_grad
from neuralfingerprint.util import rmse

from autograd import grad

# Dataset selection: the target column and the data file handed to load_data.
task_params = dict(target_name='PCE',
                   data_file='cep-processed.csv')

# Split sizes handed to load_data as (train, validation, test).
N_train, N_val, N_test = 19800, 100, 100

model_params = {
    'fp_length': 50,        # Neural fps usually need far fewer dimensions than morgan.
    'fp_depth': 6,          # Network depth equals the fingerprint radius.
    'conv_width': 40,       # Used by the neural fps only.
    'h1_size': 100,         # Hidden layer size of the network on top of the fps.
    'L2_reg': np.exp(-2),
}
train_params = {
    'num_iters': 900,
    'batch_size': 100,
    'init_scale': np.exp(-4),
    'step_size': np.exp(-6),
}

# Architecture of the network stacked on top of the fingerprints.
vanilla_net_params = {
    'layer_sizes': [model_params['fp_length'], model_params['h1_size']],  # One hidden layer.
    'normalize': True,
    'L2_reg': model_params['L2_reg'],
    'nll_func': rmse,
}

def train_nn(pred_fun, loss_fun, num_weights, train_smiles, train_raw_targets, train_params, seed=0,
             validation_smiles=None, validation_raw_targets=None):
    """loss_fun has inputs (weights, smiles, targets)"""
    print "Total number of weights in the network:", num_weights
    init_weights = npr.RandomState(seed).randn(num_weights) * train_params['init_scale']

    num_print_examples = 100
    train_targets, undo_norm = normalize_array(train_raw_targets)
    training_curve = []
    def callback(weights, iter):
        if iter % 10 == 0:
            print "max of weights", np.max(np.abs(weights))
            train_preds = undo_norm(pred_fun(weights, train_smiles[:num_print_examples]))
            cur_loss = loss_fun(weights, train_smiles[:num_print_examples], train_targets[:num_print_examples])
            training_curve.append(cur_loss)
            print "Iteration", iter, "loss", cur_loss,\
                  "train RMSE", rmse(train_preds, train_raw_targets[:num_print_examples]),
            if validation_smiles is not None:
                validation_preds = undo_norm(pred_fun(weights, validation_smiles))
                print "Validation RMSE", iter, ":", rmse(validation_preds, validation_raw_targets),

    # Build gradient using autograd.
    grad_fun = grad(loss_fun)
    grad_fun_with_data = build_batched_grad(grad_fun, train_params['batch_size'],
                                            train_smiles, train_targets)

    # Optimize weights.
    trained_weights = adam(grad_fun_with_data, init_weights, callback=callback,
                           num_iters=train_params['num_iters'], step_size=train_params['step_size'])

    def predict_func(new_smiles):
        """Returns to the original units that the raw targets were in."""
        return undo_norm(pred_fun(trained_weights, new_smiles))
    return predict_func, trained_weights, training_curve


def main():
    print "Loading data..."
    traindata, valdata, testdata = load_data(
        task_params['data_file'], (N_train, N_val, N_test),
        input_name='smiles', target_name=task_params['target_name'])
    train_inputs, train_targets = traindata
    val_inputs,   val_targets   = valdata
    test_inputs,  test_targets  = testdata

    def print_performance(pred_func):
        train_preds = pred_func(train_inputs)
        val_preds = pred_func(val_inputs)
        print "\nPerformance (RMSE) on " + task_params['target_name'] + ":"
        print "Train:", rmse(train_preds, train_targets)
        print "Test: ", rmse(val_preds,  val_targets)
        print "-" * 80
        return rmse(val_preds, val_targets)

    def run_morgan_experiment():
        loss_fun, pred_fun, net_parser = \
            build_morgan_deep_net(model_params['fp_length'],
                                  model_params['fp_depth'], vanilla_net_params)
        num_weights = len(net_parser)
        predict_func, trained_weights, conv_training_curve = \
            train_nn(pred_fun, loss_fun, num_weights, train_inputs, train_targets,
                     train_params, validation_smiles=val_inputs, validation_raw_targets=val_targets)
        return print_performance(predict_func)

    def run_conv_experiment():
        conv_layer_sizes = [model_params['conv_width']] * model_params['fp_depth']
        conv_arch_params = {'num_hidden_features' : conv_layer_sizes,
                            'fp_length' : model_params['fp_length'], 'normalize' : 1}
        loss_fun, pred_fun, conv_parser = \
            build_conv_deep_net(conv_arch_params, vanilla_net_params, model_params['L2_reg'])
        num_weights = len(conv_parser)
        predict_func, trained_weights, conv_training_curve = \
            train_nn(pred_fun, loss_fun, num_weights, train_inputs, train_targets,
                     train_params, validation_smiles=val_inputs, validation_raw_targets=val_targets)
        test_predictions = predict_func(test_inputs)
        return rmse(test_predictions, test_targets)

    print "Task params", task_params
    print
    #print "Starting Morgan fingerprint experiment..."
    #test_loss_morgan = run_morgan_experiment()
    print "Starting neural fingerprint experiment..."
    test_loss_neural = run_conv_experiment()
    print
    #print "Morgan test RMSE:", test_loss_morgan, "Neural test RMSE:", test_loss_neural
    print "Neural test RMSE:", test_loss_neural

# Run the experiment only when this code is executed as the main module.
if __name__ == '__main__':
    main()


Loading data...
Task params {'target_name': 'PCE', 'data_file': 'cep-processed.csv'}

Starting neural fingerprint experiment...
Total number of weights in the network: 102891
max of weights 0.08886963478129936
Iteration 0 loss 0.9409373354488176 train RMSE 2.382422934103587 Validation RMSE 0 : 2.3523152794061186 max of weights 0.08590965845178118
Iteration 10 loss 0.9321649813003298 train RMSE 2.3602039514325552 Validation RMSE 10 : 2.1881559058617785 max of weights 0.10318460249667184
Iteration 20 loss 0.9187027156022843 train RMSE 2.3260769141817734 Validation RMSE 20 : 2.1907101090913486 max of weights 0.1294722529595563
Iteration 30 loss 0.7927960732696413 train RMSE 2.006955776490843 Validation RMSE 30 : 1.6588675756833113 max of weights 0.14045331413412346
Iteration 40 loss 0.7339702671477818 train RMSE 1.8577070679457697 Validation RMSE 40 : 1.4163022716488234 max of weights 0.1651769062531167
Iteration 50 loss 0.7090922877865006 train RMSE 1.7947655659496542 Validation RMSE 50 

# Neural Fingerprint drug efficacy result

In [2]:
# Example regression script using neural fingerprints.
#
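# Can compare Morgan fingerprints to neural fingerprints; as written, main() only
# runs the neural fingerprint experiment (the Morgan run is commented out).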

import autograd.numpy as np
import autograd.numpy.random as npr

from neuralfingerprint import load_data
from neuralfingerprint import build_morgan_deep_net
from neuralfingerprint import build_conv_deep_net
from neuralfingerprint import normalize_array, adam
from neuralfingerprint import build_batched_grad
from neuralfingerprint.util import rmse

from autograd import grad

# Dataset selection: the target column and the data file handed to load_data.
task_params = dict(target_name='activity',
                   data_file='malaria-processed.csv')

# Split sizes handed to load_data as (train, validation, test).
N_train, N_val, N_test = 9700, 100, 100

model_params = {
    'fp_length': 50,        # Neural fps usually need far fewer dimensions than morgan.
    'fp_depth': 6,          # Network depth equals the fingerprint radius.
    'conv_width': 20,       # Used by the neural fps only.
    'h1_size': 100,         # Hidden layer size of the network on top of the fps.
    'L2_reg': np.exp(-2),
}
train_params = {
    'num_iters': 800,
    'batch_size': 100,
    'init_scale': np.exp(-4),
    'step_size': np.exp(-6),
}

# Architecture of the network stacked on top of the fingerprints.
vanilla_net_params = {
    'layer_sizes': [model_params['fp_length'], model_params['h1_size']],  # One hidden layer.
    'normalize': True,
    'L2_reg': model_params['L2_reg'],
    'nll_func': rmse,
}

def train_nn(pred_fun, loss_fun, num_weights, train_smiles, train_raw_targets, train_params, seed=0,
             validation_smiles=None, validation_raw_targets=None):
    """loss_fun has inputs (weights, smiles, targets)"""
    print "Total number of weights in the network:", num_weights
    init_weights = npr.RandomState(seed).randn(num_weights) * train_params['init_scale']

    num_print_examples = 100
    train_targets, undo_norm = normalize_array(train_raw_targets)
    training_curve = []
    def callback(weights, iter):
        if iter % 10 == 0:
            print "max of weights", np.max(np.abs(weights))
            train_preds = undo_norm(pred_fun(weights, train_smiles[:num_print_examples]))
            cur_loss = loss_fun(weights, train_smiles[:num_print_examples], train_targets[:num_print_examples])
            training_curve.append(cur_loss)
            print "Iteration", iter, "loss", cur_loss,\
                  "train RMSE", rmse(train_preds, train_raw_targets[:num_print_examples]),
            if validation_smiles is not None:
                validation_preds = undo_norm(pred_fun(weights, validation_smiles))
                print "Validation RMSE", iter, ":", rmse(validation_preds, validation_raw_targets),

    # Build gradient using autograd.
    grad_fun = grad(loss_fun)
    grad_fun_with_data = build_batched_grad(grad_fun, train_params['batch_size'],
                                            train_smiles, train_targets)

    # Optimize weights.
    trained_weights = adam(grad_fun_with_data, init_weights, callback=callback,
                           num_iters=train_params['num_iters'], step_size=train_params['step_size'])

    def predict_func(new_smiles):
        """Returns to the original units that the raw targets were in."""
        return undo_norm(pred_fun(trained_weights, new_smiles))
    return predict_func, trained_weights, training_curve


def main():
    print "Loading data..."
    traindata, valdata, testdata = load_data(
        task_params['data_file'], (N_train, N_val, N_test),
        input_name='smiles', target_name=task_params['target_name'])
    train_inputs, train_targets = traindata
    val_inputs,   val_targets   = valdata
    test_inputs,  test_targets  = testdata

    def print_performance(pred_func):
        train_preds = pred_func(train_inputs)
        val_preds = pred_func(val_inputs)
        print "\nPerformance (RMSE) on " + task_params['target_name'] + ":"
        print "Train:", rmse(train_preds, train_targets)
        print "Test: ", rmse(val_preds,  val_targets)
        print "-" * 80
        return rmse(val_preds, val_targets)

    def run_morgan_experiment():
        loss_fun, pred_fun, net_parser = \
            build_morgan_deep_net(model_params['fp_length'],
                                  model_params['fp_depth'], vanilla_net_params)
        num_weights = len(net_parser)
        predict_func, trained_weights, conv_training_curve = \
            train_nn(pred_fun, loss_fun, num_weights, train_inputs, train_targets,
                     train_params, validation_smiles=val_inputs, validation_raw_targets=val_targets)
        return print_performance(predict_func)

    def run_conv_experiment():
        conv_layer_sizes = [model_params['conv_width']] * model_params['fp_depth']
        conv_arch_params = {'num_hidden_features' : conv_layer_sizes,
                            'fp_length' : model_params['fp_length'], 'normalize' : 1}
        loss_fun, pred_fun, conv_parser = \
            build_conv_deep_net(conv_arch_params, vanilla_net_params, model_params['L2_reg'])
        num_weights = len(conv_parser)
        predict_func, trained_weights, conv_training_curve = \
            train_nn(pred_fun, loss_fun, num_weights, train_inputs, train_targets,
                     train_params, validation_smiles=val_inputs, validation_raw_targets=val_targets)
        test_predictions = predict_func(test_inputs)
        return rmse(test_predictions, test_targets)

    print "Task params", task_params
    print
    #print "Starting Morgan fingerprint experiment..."
    #test_loss_morgan = run_morgan_experiment()
    print "Starting neural fingerprint experiment..."
    test_loss_neural = run_conv_experiment()
    print
    #print "Morgan test RMSE:", test_loss_morgan, "Neural test RMSE:", test_loss_neural
    print "Neural test RMSE:", test_loss_neural

# Run the experiment only when this code is executed as the main module.
if __name__ == '__main__':
    main()


Loading data...
Task params {'target_name': 'activity', 'data_file': 'malaria-processed.csv'}

Starting neural fingerprint experiment...
Total number of weights in the network: 41771
max of weights 0.08535001578936458
Iteration 0 loss 0.8187127378654213 train RMSE 0.9956669173433814 Validation RMSE 0 : 1.20552164057607 max of weights 0.07438252802890713
Iteration 10 loss 0.8160962391088001 train RMSE 0.9925201814642607 Validation RMSE 10 : 1.2065253097589592 max of weights 0.0729394493817034
Iteration 20 loss 0.815652419516367 train RMSE 0.9919839752081421 Validation RMSE 20 : 1.2084289317500778 max of weights 0.07753474261385916
Iteration 30 loss 0.8151471304326946 train RMSE 0.9913712625474116 Validation RMSE 30 : 1.2083112679643948 max of weights 0.08506251631780189
Iteration 40 loss 0.8148917593464117 train RMSE 0.9910563908448425 Validation RMSE 40 : 1.2094255061680563 max of weights 0.10069228476059333
Iteration 50 loss 0.8153686514776366 train RMSE 0.9916278420309987 Validation 

Iteration 560 loss 0.7853168442709713 train RMSE 0.954005780245009 Validation RMSE 560 : 1.0456857361616934 max of weights 0.6712329881638819
Iteration 570 loss 0.7527061889067028 train RMSE 0.9143331923258904 Validation RMSE 570 : 1.0052514662885814 max of weights 0.6734677378312026
Iteration 580 loss 0.7413384132872121 train RMSE 0.9004846407538187 Validation RMSE 580 : 1.0206834259232513 max of weights 0.6850321682887021
Iteration 590 loss 0.7448763120606594 train RMSE 0.9047628291945121 Validation RMSE 590 : 1.0135108834598214 max of weights 0.6959057845606573
Iteration 600 loss 0.7513463767098557 train RMSE 0.9126229287377792 Validation RMSE 600 : 1.052179052885087 max of weights 0.7090725303824328
Iteration 610 loss 0.7584489380795324 train RMSE 0.9212696756931523 Validation RMSE 610 : 1.0270818631642775 max of weights 0.7192683607047061
Iteration 620 loss 0.7640650972374164 train RMSE 0.928089973673023 Validation RMSE 620 : 1.0090551394278238 max of weights 0.7367281013935237
It