Run the `conv_qsar_fast` model on the Hansen Ames data set [(Hansen, K. et al., 2009)](https://doi.org/10.1021/ci900161g)

In [1]:
# Select the Keras backend for this kernel.
# `!export ...` runs in a throwaway subshell, so the variable never reaches the
# kernel process; set it in the kernel's own environment instead. This must run
# before the first Keras import for it to be picked up.
import os
os.environ["KERAS_BACKEND"] = "theano"

In [2]:
from conv_qsar_fast.utils.parsing import input_to_bool
from conv_qsar_fast.utils.parse_cfg import read_config
import utils.reset_layers as reset_layers
import rdkit.Chem as Chem
import numpy as np
import datetime
import json
import os
import time

from conv_qsar_fast.main.core import build_model, train_model, save_model
from conv_qsar_fast.main.test import test_model, test_embeddings_demo
from conv_qsar_fast.main.data import get_data_full


Using Theano backend.


In [3]:
# Path of the configuration file that is read by the next cell.
config_file = './conv_qsar_fast/inputs/hansen-Ames.cfg'

In [4]:
# Load settings
config = read_config(config_file)

# Get model label: the path prefix that the later cells pass to save_model /
# test_model and extend with '.h5' for the weights file.
try:
    fpath = config['IO']['model_fpath']
except KeyError:
    print('Must specify model_fpath in IO in config')
    # Raise instead of calling quit(1): under IPython/Jupyter, quit() only
    # requests a shutdown, so the rest of the cell would keep running with
    # `fpath` undefined.
    raise SystemExit(1)


###################################################################################
### DEFINE DATA 
###################################################################################

# Keyword arguments for get_data_full(), taken from the DATA section and
# converted from their configured values to the types used below.
data_kwargs = config['DATA']
if '__name__' in data_kwargs:
    del data_kwargs['__name__'] #  from configparser
if 'batch_size' in config['TRAINING']:
    data_kwargs['batch_size'] = int(config['TRAINING']['batch_size'])
if 'use_fp' in config['ARCHITECTURE']:
    data_kwargs['use_fp'] = config['ARCHITECTURE']['use_fp']
if 'shuffle_seed' in data_kwargs:
    data_kwargs['shuffle_seed'] = int(data_kwargs['shuffle_seed'])
else:
    # No seed configured: derive one from the clock (not reproducible across runs).
    data_kwargs['shuffle_seed'] = int(time.time())
if 'truncate_to' in data_kwargs:
    data_kwargs['truncate_to'] = int(data_kwargs['truncate_to'])
if 'training_ratio' in data_kwargs:
    data_kwargs['training_ratio'] = float(data_kwargs['training_ratio'])
if 'molecular_attributes' in data_kwargs: 
    data_kwargs['molecular_attributes'] = input_to_bool(data_kwargs['molecular_attributes'])

# The training cell iterates over `all_cv_folds` unconditionally, so fail here
# with a clear message instead of a NameError later on.
if 'cv_folds' not in data_kwargs:
    print('Must specify cv_folds in DATA in config')
    raise SystemExit(1)

# Make sure the output folder exists. Only an "already exists" failure is
# ignored; any other OSError (e.g. permissions) is re-raised.
model_dir = os.path.dirname(fpath)
if model_dir:
    try:
        os.makedirs(model_dir)
    except OSError:
        if not os.path.isdir(model_dir):
            raise

if '<this_fold>' in data_kwargs['cv_folds']:
    # Placeholder form, e.g. '<this_fold>/5': expand to '1/5', '2/5', ..., '5/5'.
    cv_folds = data_kwargs['cv_folds']
    total_folds = int(cv_folds.split('/')[1])
    all_cv_folds = ['{}/{}'.format(i + 1, total_folds) for i in range(total_folds)]
else:
    # A single explicit fold was requested.
    all_cv_folds = [data_kwargs['cv_folds']]

In [5]:
# Iterate through all folds: for each cross-validation fold build a fresh model,
# load its data, optionally load existing weights, then train, save and test it.
#
# Fatal conditions raise SystemExit(1) instead of calling quit(1): under
# IPython/Jupyter, quit() only requests a shutdown and the remaining statements
# of the cell would keep running.
ref_fpath = fpath

# (key, converter) pairs applied to the ARCHITECTURE / TRAINING options when the
# key is present. Defined once because they do not change between folds.
architecture_casts = (
    ('embedding_size', int),
    ('hidden', int),
    ('hidden2', int),
    ('depth', int),
    ('scale_output', float),
    ('dr1', float),
    ('dr2', float),
    ('output_size', int),
    ('sum_after', input_to_bool),
)
training_casts = (
    ('nb_epoch', int),
    ('batch_size', int),
    ('patience', int),
)

for cv_fold in all_cv_folds:

    ###################################################################################
    ### BUILD MODEL
    ###################################################################################

    print('...building model')
    try:
        kwargs = config['ARCHITECTURE']
        if '__name__' in kwargs: del kwargs['__name__'] #  from configparser
        if 'batch_size' in config['TRAINING']:
            # Padding is enabled only when more than one sample is batched together.
            kwargs['padding'] = int(config['TRAINING']['batch_size']) > 1
        for key, cast in architecture_casts:
            if key in kwargs:
                kwargs[key] = cast(kwargs[key])

        if 'molecular_attributes' in config['DATA']:
            kwargs['molecular_attributes'] = config['DATA']['molecular_attributes']

        model = build_model(**kwargs)
        print('...built untrained model')
    except KeyboardInterrupt:
        print('User cancelled model building')
        raise SystemExit(1)


    print('Using CV fold {}'.format(cv_fold))
    data_kwargs['cv_folds'] = cv_fold
    # Per-fold output prefix: the placeholder becomes the fold index ('1' for '1/5').
    fpath = ref_fpath.replace('<this_fold>', cv_fold.split('/')[0])
    data = get_data_full(**data_kwargs)

    ###################################################################################
    ### LOAD WEIGHTS?
    ###################################################################################

    if 'weights_fpath' in config['IO']:
        weights_fpath = config['IO']['weights_fpath']
    else:
        weights_fpath = fpath + '.h5'

    try:
        use_old_weights = input_to_bool(config['IO']['use_existing_weights'])
    except KeyError:
        print('Must specify whether or not to use existing model weights')
        raise SystemExit(1)

    if use_old_weights:
        if not os.path.isfile(weights_fpath):
            print('Weights not found at specified path {}'.format(weights_fpath))
            raise SystemExit(1)

        model.load_weights(weights_fpath)
        print('...loaded weight information')

        # Reset final dense?
        # NOTE(review): this flag is matched against a literal list rather than
        # parsed with input_to_bool like the other flags - confirm that is intended.
        if 'reset_final' in config['IO']:
            if config['IO']['reset_final'] in ['true', 'y', 'Yes', 'True', '1']:
                # Re-initialise the last layer's weights and zero its bias.
                layer = model.layers[-1]
                layer.W.set_value((layer.init(layer.W.shape.eval()).eval()).astype(np.float32))
                layer.b.set_value(np.zeros(layer.b.shape.eval(), dtype=np.float32))

    ###################################################################################
    ### CHECK FOR TESTING CONDITIONS
    ###################################################################################

    # Testing embeddings?
    # Only the config lookup is guarded, so a KeyError raised inside the demo
    # itself is no longer silently swallowed.
    try:
        run_embedding_demo = input_to_bool(config['TESTING']['test_embedding'])
    except KeyError:
        run_embedding_demo = False
    if run_embedding_demo:
        test_embeddings_demo(model, fpath)
        # Demo mode stops here: no training, saving or testing.
        raise SystemExit(1)

    ###################################################################################
    ### TRAIN THE MODEL
    ###################################################################################

    # Train model
    try:
        print('...training model')
        kwargs = config['TRAINING']
        if '__name__' in kwargs:
            del kwargs['__name__'] #  from configparser
        for key, cast in training_casts:
            if key in kwargs:
                kwargs[key] = cast(kwargs[key])
        (model, loss, val_loss) = train_model(model, data, **kwargs)
        print('...trained model')
    except KeyboardInterrupt:
        # train_model did not return, so `loss` / `val_loss` are either undefined
        # (first fold) or left over from the previous fold. Stop instead of
        # saving this fold with the wrong history.
        print('User cancelled model training')
        raise SystemExit(1)

    ###################################################################################
    ### SAVE MODEL
    ###################################################################################

    # Get the current time
    tstamp = datetime.datetime.utcnow().strftime('%m-%d-%Y_%H-%M')
    print('...saving model')
    save_model(model, 
        loss,
        val_loss,
        fpath = fpath,
        config = config, 
        tstamp = tstamp)
    print('...saved model')

    ###################################################################################
    ### TEST MODEL
    ###################################################################################

    print('...testing model')
    # NOTE(review): batch_size is treated as optional everywhere above but is
    # required here (KeyError if absent from TRAINING) - confirm that is intended.
    data_withresiduals = test_model(model, data, fpath, tstamp = tstamp,
        batch_size = int(config['TRAINING']['batch_size']))
    print('...tested model')
    print('...tested model')

...building model


  name = "d{} multiply atom contribs and adj mat".format(d)
  name=name)
  name = 'd{} combine atom and bond contributions to new atom features'.format(d),
  FPs = merge(output_contribs, mode = 'sum', name = 'pool across depths')
  output = [ypred])


...built untrained model
Using CV fold 1/5
reading data...
done
processing data...
**** DUPLICATE ENTRY ****
[N-]=[N+]=NCCC(N)C(=O)O
Failed to generate graph for smiles, y: class
Python argument types in
    rdkit.Chem.rdmolops.SanitizeMol(NoneType)
did not match C++ signature:
    SanitizeMol(RDKit::ROMol {lvalue} mol, unsigned long sanitizeOps=rdkit.Chem.rdmolops.SanitizeFlags.SANITIZE_ALL, bool catchErrors=False)
Failed to generate graph for NC(COC(=O)\C=N/#N)C(=O)O , y: 1
Python argument types in
    rdkit.Chem.rdmolops.SanitizeMol(NoneType)
did not match C++ signature:
    SanitizeMol(RDKit::ROMol {lvalue} mol, unsigned long sanitizeOps=rdkit.Chem.rdmolops.SanitizeFlags.SANITIZE_ALL, bool catchErrors=False)
Failed to generate graph for O=C1NC(=O)\C(=N/#N)\C=N1 , y: 1
Python argument types in
    rdkit.Chem.rdmolops.SanitizeMol(NoneType)
did not match C++ signature:
    SanitizeMol(RDKit::ROMol {lvalue} mol, unsigned long sanitizeOps=rdkit.Chem.rdmolops.SanitizeFlags.SANITIZE_ALL, 

  0%|          | 0/5204 [00:00<?, ?it/s]

loss: 0.49980735778808594	val_loss: 0.4919602572917938
5 epochs without val_loss progress
stopping early!
...trained model
...saving model
...saved structural information
...saved weights
trained at 2017-09-08 01:54:43.161102
...saved history
...saved model to conv_qsar_fast/models/hansen-Ames/fold1.[json, h5, png, info]
...saved model
...testing model


100%|██████████| 5204/5204 [00:10<00:00, 474.31it/s]
100%|██████████| 1301/1301 [00:02<00:00, 576.59it/s]
0it [00:00, ?it/s]


train:
  AUC = 0.8498196432730194
  mse = 0.15598669862495657, mae = 0.2958067660187754
test:
  AUC = 0.8499762300927027
  mse = 0.1596937495439274, mae = 0.301655334339611
...tested model
...building model


  name = "d{} multiply atom contribs and adj mat".format(d)
  name=name)
  name = 'd{} combine atom and bond contributions to new atom features'.format(d),
  FPs = merge(output_contribs, mode = 'sum', name = 'pool across depths')
  output = [ypred])


...built untrained model
Using CV fold 2/5
reading data...
done
processing data...
**** DUPLICATE ENTRY ****
[N-]=[N+]=NCCC(N)C(=O)O
Failed to generate graph for smiles, y: class
Python argument types in
    rdkit.Chem.rdmolops.SanitizeMol(NoneType)
did not match C++ signature:
    SanitizeMol(RDKit::ROMol {lvalue} mol, unsigned long sanitizeOps=rdkit.Chem.rdmolops.SanitizeFlags.SANITIZE_ALL, bool catchErrors=False)
Failed to generate graph for NC(COC(=O)\C=N/#N)C(=O)O , y: 1
Python argument types in
    rdkit.Chem.rdmolops.SanitizeMol(NoneType)
did not match C++ signature:
    SanitizeMol(RDKit::ROMol {lvalue} mol, unsigned long sanitizeOps=rdkit.Chem.rdmolops.SanitizeFlags.SANITIZE_ALL, bool catchErrors=False)
Failed to generate graph for O=C1NC(=O)\C(=N/#N)\C=N1 , y: 1
Python argument types in
    rdkit.Chem.rdmolops.SanitizeMol(NoneType)
did not match C++ signature:
    SanitizeMol(RDKit::ROMol {lvalue} mol, unsigned long sanitizeOps=rdkit.Chem.rdmolops.SanitizeFlags.SANITIZE_ALL, 

  0%|          | 0/5204 [00:00<?, ?it/s]

loss: 0.48266053199768066	val_loss: 0.5447804927825928
5 epochs without val_loss progress
stopping early!
...trained model
...saving model
...saved structural information
...saved weights
trained at 2017-09-08 02:07:01.326212
...saved history
...saved model to conv_qsar_fast/models/hansen-Ames/fold2.[json, h5, png, info]
...saved model
...testing model


100%|██████████| 5204/5204 [00:10<00:00, 488.37it/s]
100%|██████████| 1301/1301 [00:02<00:00, 574.29it/s]
0it [00:00, ?it/s]


train:
  AUC = 0.850393837651533
  mse = 0.17391016830968054, mae = 0.29010406129783756
test:
  AUC = 0.8472641778284412
  mse = 0.17482517920878185, mae = 0.29174324469232815
...tested model
...building model
...built untrained model
Using CV fold 3/5
reading data...
done
processing data...
**** DUPLICATE ENTRY ****
[N-]=[N+]=NCCC(N)C(=O)O
Failed to generate graph for smiles, y: class
Python argument types in
    rdkit.Chem.rdmolops.SanitizeMol(NoneType)
did not match C++ signature:
    SanitizeMol(RDKit::ROMol {lvalue} mol, unsigned long sanitizeOps=rdkit.Chem.rdmolops.SanitizeFlags.SANITIZE_ALL, bool catchErrors=False)
Failed to generate graph for NC(COC(=O)\C=N/#N)C(=O)O , y: 1
Python argument types in
    rdkit.Chem.rdmolops.SanitizeMol(NoneType)
did not match C++ signature:
    SanitizeMol(RDKit::ROMol {lvalue} mol, unsigned long sanitizeOps=rdkit.Chem.rdmolops.SanitizeFlags.SANITIZE_ALL, bool catchErrors=False)
Failed to generate graph for O=C1NC(=O)\C(=N/#N)\C=N1 , y: 1
Python 

  0%|          | 0/5204 [00:00<?, ?it/s]

loss: 0.4608614146709442	val_loss: 0.5111563205718994
5 epochs without val_loss progress
stopping early!
...trained model
...saving model
...saved structural information
...saved weights
trained at 2017-09-08 02:19:46.794608
...saved history
...saved model to conv_qsar_fast/models/hansen-Ames/fold3.[json, h5, png, info]
...saved model
...testing model


100%|██████████| 5204/5204 [00:09<00:00, 524.75it/s]
100%|██████████| 1301/1301 [00:02<00:00, 615.49it/s]
0it [00:00, ?it/s]


train:
  AUC = 0.8755245618732194
  mse = 0.14430956285980961, mae = 0.29875163309888875
test:
  AUC = 0.8353423140977003
  mse = 0.1672086582230829, mae = 0.3233720299019711
...tested model
...building model
...built untrained model
Using CV fold 4/5
reading data...
done
processing data...
**** DUPLICATE ENTRY ****
[N-]=[N+]=NCCC(N)C(=O)O
Failed to generate graph for smiles, y: class
Python argument types in
    rdkit.Chem.rdmolops.SanitizeMol(NoneType)
did not match C++ signature:
    SanitizeMol(RDKit::ROMol {lvalue} mol, unsigned long sanitizeOps=rdkit.Chem.rdmolops.SanitizeFlags.SANITIZE_ALL, bool catchErrors=False)
Failed to generate graph for NC(COC(=O)\C=N/#N)C(=O)O , y: 1
Python argument types in
    rdkit.Chem.rdmolops.SanitizeMol(NoneType)
did not match C++ signature:
    SanitizeMol(RDKit::ROMol {lvalue} mol, unsigned long sanitizeOps=rdkit.Chem.rdmolops.SanitizeFlags.SANITIZE_ALL, bool catchErrors=False)
Failed to generate graph for O=C1NC(=O)\C(=N/#N)\C=N1 , y: 1
Python a

  0%|          | 0/5204 [00:00<?, ?it/s]

loss: 0.47008320689201355	val_loss: 0.5296939611434937
5 epochs without val_loss progress
stopping early!
...trained model
...saving model
...saved structural information
...saved weights
trained at 2017-09-08 02:30:36.961037
...saved history
...saved model to conv_qsar_fast/models/hansen-Ames/fold4.[json, h5, png, info]
...saved model
...testing model


100%|██████████| 5204/5204 [00:11<00:00, 471.97it/s]
100%|██████████| 1301/1301 [00:02<00:00, 544.21it/s]
0it [00:00, ?it/s]


train:
  AUC = 0.8678109541689016
  mse = 0.15062931206849797, mae = 0.2908935801675121
test:
  AUC = 0.8272171471394556
  mse = 0.17350550896215877, mae = 0.31542465538015557
...tested model
...building model
...built untrained model
Using CV fold 5/5
reading data...
done
processing data...
**** DUPLICATE ENTRY ****
[N-]=[N+]=NCCC(N)C(=O)O
Failed to generate graph for smiles, y: class
Python argument types in
    rdkit.Chem.rdmolops.SanitizeMol(NoneType)
did not match C++ signature:
    SanitizeMol(RDKit::ROMol {lvalue} mol, unsigned long sanitizeOps=rdkit.Chem.rdmolops.SanitizeFlags.SANITIZE_ALL, bool catchErrors=False)
Failed to generate graph for NC(COC(=O)\C=N/#N)C(=O)O , y: 1
Python argument types in
    rdkit.Chem.rdmolops.SanitizeMol(NoneType)
did not match C++ signature:
    SanitizeMol(RDKit::ROMol {lvalue} mol, unsigned long sanitizeOps=rdkit.Chem.rdmolops.SanitizeFlags.SANITIZE_ALL, bool catchErrors=False)
Failed to generate graph for O=C1NC(=O)\C(=N/#N)\C=N1 , y: 1
Python 

  0%|          | 0/5204 [00:00<?, ?it/s]

loss: 0.4760041832923889	val_loss: 0.4924120306968689
...trained model
...saving model
...saved structural information
...saved weights
trained at 2017-09-08 02:43:55.387555
...saved history
...saved model to conv_qsar_fast/models/hansen-Ames/fold5.[json, h5, png, info]
...saved model
...testing model


100%|██████████| 5204/5204 [00:11<00:00, 470.56it/s]
100%|██████████| 1301/1301 [00:02<00:00, 552.75it/s]
0it [00:00, ?it/s]


train:
  AUC = 0.8694150494793582
  mse = 0.14131755953004171, mae = 0.29469934280397664
test:
  AUC = 0.8390096629832681
  mse = 0.1588888187000075, mae = 0.3134540087678724
...tested model
