Run the `conv_qsar_fast` model on the Hansen Ames data set [(Hansen, K. et al., 2009)](https://doi.org/10.1021/ci900161g)

In [1]:
# Select the Keras backend for this kernel. This has to run before the cell
# that imports conv_qsar_fast (and, through it, Keras).
# A `!export ...` line only sets the variable inside a short-lived subshell, so
# it never reaches this Python process; writing to os.environ does.
import os
os.environ['KERAS_BACKEND'] = 'theano'

In [2]:
from conv_qsar_fast.utils.parsing import input_to_bool
from conv_qsar_fast.utils.parse_cfg import read_config
import conv_qsar_fast.utils.reset_layers as reset_layers
import rdkit.Chem as Chem
import numpy as np
import datetime
import json
import os
import time

from conv_qsar_fast.main.core import build_model, train_model, save_model
from conv_qsar_fast.main.test import test_model, test_embeddings_demo
from conv_qsar_fast.main.data import get_data_full


Using Theano backend.


In [3]:
# Path of the configuration file that is handed to read_config() below.
config_file = './conv_qsar_fast/inputs/hansen-Ames.cfg'

In [4]:
# Load settings
config = read_config(config_file)

# Get model label.
# Raise instead of calling quit(1): in a notebook an exception stops the cell
# with an ordinary traceback and leaves the kernel alone.
try:
    fpath = config['IO']['model_fpath']
except KeyError:
    raise KeyError('Must specify model_fpath in IO in config')


###################################################################################
### DEFINE DATA 
###################################################################################

# NOTE: data_kwargs is the config's DATA section itself, so every conversion
# below is also visible through config['DATA'].
data_kwargs = config['DATA']
if '__name__' in data_kwargs:
    del data_kwargs['__name__'] #  from configparser

# Settings borrowed from other sections
if 'batch_size' in config['TRAINING']:
    data_kwargs['batch_size'] = int(config['TRAINING']['batch_size'])
if 'use_fp' in config['ARCHITECTURE']:
    data_kwargs['use_fp'] = config['ARCHITECTURE']['use_fp']

# Fall back to a time-based seed when the config does not pin one
if 'shuffle_seed' in data_kwargs:
    data_kwargs['shuffle_seed'] = int(data_kwargs['shuffle_seed'])
else:
    data_kwargs['shuffle_seed'] = int(time.time())

# Convert the remaining raw config values to their proper types
for key, cast in (('truncate_to', int),
                  ('training_ratio', float),
                  ('molecular_attributes', input_to_bool)):
    if key in data_kwargs:
        data_kwargs[key] = cast(data_kwargs[key])

if 'cv_folds' not in data_kwargs:
    # The fold loop in the next cell needs all_cv_folds. Fail here rather than
    # hitting a NameError there (or silently reusing a value left in the kernel
    # by an earlier run with a different config).
    raise KeyError('Must specify cv_folds in DATA in config')

# Make sure the output folder exists. Only the "already exists" case is
# tolerated; other failures (e.g. permissions) are no longer swallowed.
model_dir = os.path.dirname(fpath)
if model_dir and not os.path.isdir(model_dir):
    os.makedirs(model_dir)

if '<this_fold>' in data_kwargs['cv_folds']:
    # Placeholder form '<this_fold>/N': expand into '1/N', '2/N', ..., 'N/N'
    cv_folds = data_kwargs['cv_folds']
    total_folds = int(cv_folds.split('/')[1])
    all_cv_folds = ['{}/{}'.format(i + 1, total_folds) for i in range(total_folds)]
else:
    # A single explicit fold was requested
    all_cv_folds = [data_kwargs['cv_folds']]

In [5]:
# Iterate through all folds: build a fresh model, optionally load weights,
# train, save and test it once per entry in all_cv_folds.

# Take the path template straight from the config rather than from `fpath`:
# the loop below overwrites `fpath` with a fold-specific path, so re-running
# this cell would otherwise start from a "template" whose '<this_fold>'
# placeholder has already been replaced.
ref_fpath = config['IO']['model_fpath']

###################################################################################
### FOLD-INDEPENDENT SETTINGS (parsed once, before the loop)
###################################################################################

# Architecture settings passed to build_model
arch_kwargs = config['ARCHITECTURE']
if '__name__' in arch_kwargs:
    del arch_kwargs['__name__'] #  from configparser
if 'batch_size' in config['TRAINING']:
    arch_kwargs['padding'] = int(config['TRAINING']['batch_size']) > 1
for key in ('embedding_size', 'hidden', 'hidden2', 'depth', 'output_size'):
    if key in arch_kwargs:
        arch_kwargs[key] = int(arch_kwargs[key])
for key in ('scale_output', 'dr1', 'dr2'):
    if key in arch_kwargs:
        arch_kwargs[key] = float(arch_kwargs[key])
if 'sum_after' in arch_kwargs:
    arch_kwargs['sum_after'] = input_to_bool(arch_kwargs['sum_after'])
if 'molecular_attributes' in config['DATA']:
    arch_kwargs['molecular_attributes'] = config['DATA']['molecular_attributes']

# Training settings passed to train_model
train_kwargs = config['TRAINING']
if '__name__' in train_kwargs:
    del train_kwargs['__name__'] #  from configparser
for key in ('nb_epoch', 'batch_size', 'patience'):
    if key in train_kwargs:
        train_kwargs[key] = int(train_kwargs[key])

# Whether to start from previously saved weights (required setting)
try:
    use_old_weights = input_to_bool(config['IO']['use_existing_weights'])
except KeyError:
    raise KeyError('Must specify whether or not to use existing model weights')

# Whether to run the embedding demo instead of training. Only the config
# lookup is guarded, so a KeyError raised inside the demo itself is not hidden.
try:
    run_embedding_demo = input_to_bool(config['TESTING']['test_embedding'])
except KeyError:
    run_embedding_demo = False

for cv_fold in all_cv_folds:

    ###################################################################################
    ### BUILD MODEL
    ###################################################################################

    print('...building model')
    try:
        model = build_model(**arch_kwargs)
        print('...built untrained model')
    except KeyboardInterrupt:
        print('User cancelled model building')
        raise

    print('Using CV fold {}'.format(cv_fold))
    data_kwargs['cv_folds'] = cv_fold
    # Fold-specific output path, e.g. '<this_fold>' -> '1' for fold '1/5'
    fpath = ref_fpath.replace('<this_fold>', cv_fold.split('/')[0])
    data = get_data_full(**data_kwargs)

    ###################################################################################
    ### LOAD WEIGHTS?
    ###################################################################################

    if 'weights_fpath' in config['IO']:
        weights_fpath = config['IO']['weights_fpath']
    else:
        weights_fpath = fpath + '.h5'

    if use_old_weights:
        if not os.path.isfile(weights_fpath):
            raise IOError('Weights not found at specified path {}'.format(weights_fpath))

        model.load_weights(weights_fpath)
        print('...loaded weight information')

        # Reset final dense?
        if 'reset_final' in config['IO']:
            if config['IO']['reset_final'] in ['true', 'y', 'Yes', 'True', '1']:
                # Re-initialise the weights and zero the bias of the last layer
                layer = model.layers[-1]
                layer.W.set_value((layer.init(layer.W.shape.eval()).eval()).astype(np.float32))
                layer.b.set_value(np.zeros(layer.b.shape.eval(), dtype=np.float32))

    ###################################################################################
    ### CHECK FOR TESTING CONDITIONS
    ###################################################################################

    # Testing embeddings? Run the demo and stop; nothing is trained or saved.
    if run_embedding_demo:
        test_embeddings_demo(model, fpath)
        break

    ###################################################################################
    ### TRAIN THE MODEL
    ###################################################################################

    print('...training model')
    try:
        (model, loss, val_loss) = train_model(model, data, **train_kwargs)
    except KeyboardInterrupt:
        # train_model never returned, so loss/val_loss are either undefined
        # (first fold) or left over from the previous fold. Stop here instead
        # of saving them under this fold's name.
        print('User cancelled training; skipping save/test and remaining folds')
        break
    print('...trained model')

    ###################################################################################
    ### SAVE MODEL
    ###################################################################################

    # Get the current time
    tstamp = datetime.datetime.utcnow().strftime('%m-%d-%Y_%H-%M')
    print('...saving model')
    save_model(model, 
        loss,
        val_loss,
        fpath = fpath,
        config = config, 
        tstamp = tstamp)
    print('...saved model')

    ###################################################################################
    ### TEST MODEL
    ###################################################################################

    print('...testing model')
    data_withresiduals = test_model(model, data, fpath, tstamp = tstamp,
        batch_size = int(config['TRAINING']['batch_size']))
    print('...tested model')

...building model


  name = "d{} multiply atom contribs and adj mat".format(d)
  name=name)
  name = 'd{} combine atom and bond contributions to new atom features'.format(d),
  FPs = merge(output_contribs, mode = 'sum', name = 'pool across depths')
  output = [ypred])


...built untrained model
Using CV fold 1/5
reading data...
done
processing data...
**** DUPLICATE ENTRY ****
[N-]=[N+]=NCCC(N)C(=O)O
Failed to generate graph for NC(=O)CNC(=O)\C=N\#N , y: 1
Python argument types in
    rdkit.Chem.rdmolops.SanitizeMol(NoneType)
did not match C++ signature:
    SanitizeMol(RDKit::ROMol {lvalue} mol, unsigned long sanitizeOps=rdkit.Chem.rdmolops.SanitizeFlags.SANITIZE_ALL, bool catchErrors=False)
Failed to generate graph for NC(COC(=O)\C=N/#N)C(=O)O , y: 1
Python argument types in
    rdkit.Chem.rdmolops.SanitizeMol(NoneType)
did not match C++ signature:
    SanitizeMol(RDKit::ROMol {lvalue} mol, unsigned long sanitizeOps=rdkit.Chem.rdmolops.SanitizeFlags.SANITIZE_ALL, bool catchErrors=False)
Failed to generate graph for CCCCN(CC(O)C1=C\C(=N/#N)\C(=O)C=C1)N=O , y: 1
Python argument types in
    rdkit.Chem.rdmolops.SanitizeMol(NoneType)
did not match C++ signature:
    SanitizeMol(RDKit::ROMol {lvalue} mol, unsigned long sanitizeOps=rdkit.Chem.rdmolops.San

  0%|          | 0/5204 [00:00<?, ?it/s]

loss: 0.498638778924942	val_loss: 0.5173741579055786
5 epochs without val_loss progress
stopping early!
...trained model
...saving model
...saved structural information
...saved weights
trained at 2017-09-09 00:38:24.626328
...saved history
...saved model to conv_qsar_fast/models/hansen-Ames/fold1.[json, h5, png, info]
...saved model
...testing model


100%|██████████| 5204/5204 [00:14<00:00, 365.28it/s]
100%|██████████| 1301/1301 [00:03<00:00, 429.52it/s]
0it [00:00, ?it/s]


train:
  AUC = 0.8587039280473991
  mse = 0.15828662369444696, mae = 0.3426138400246921
test:
  AUC = 0.8302915388402833
  mse = 0.1690138027724349, mae = 0.35347983048660403
...tested model
...building model


  name = "d{} multiply atom contribs and adj mat".format(d)
  name=name)
  name = 'd{} combine atom and bond contributions to new atom features'.format(d),
  FPs = merge(output_contribs, mode = 'sum', name = 'pool across depths')
  output = [ypred])


...built untrained model
Using CV fold 2/5
reading data...
done
processing data...
**** DUPLICATE ENTRY ****
[N-]=[N+]=NCCC(N)C(=O)O
Failed to generate graph for NC(=O)CNC(=O)\C=N\#N , y: 1
Python argument types in
    rdkit.Chem.rdmolops.SanitizeMol(NoneType)
did not match C++ signature:
    SanitizeMol(RDKit::ROMol {lvalue} mol, unsigned long sanitizeOps=rdkit.Chem.rdmolops.SanitizeFlags.SANITIZE_ALL, bool catchErrors=False)
Failed to generate graph for NC(COC(=O)\C=N/#N)C(=O)O , y: 1
Python argument types in
    rdkit.Chem.rdmolops.SanitizeMol(NoneType)
did not match C++ signature:
    SanitizeMol(RDKit::ROMol {lvalue} mol, unsigned long sanitizeOps=rdkit.Chem.rdmolops.SanitizeFlags.SANITIZE_ALL, bool catchErrors=False)
Failed to generate graph for CCCCN(CC(O)C1=C\C(=N/#N)\C(=O)C=C1)N=O , y: 1
Python argument types in
    rdkit.Chem.rdmolops.SanitizeMol(NoneType)
did not match C++ signature:
    SanitizeMol(RDKit::ROMol {lvalue} mol, unsigned long sanitizeOps=rdkit.Chem.rdmolops.San

  0%|          | 0/5204 [00:00<?, ?it/s]

loss: 0.5097170472145081	val_loss: 0.5798339247703552
5 epochs without val_loss progress
stopping early!
...trained model
...saving model
...saved structural information
...saved weights
trained at 2017-09-09 01:04:29.641791
...saved history
...saved model to conv_qsar_fast/models/hansen-Ames/fold2.[json, h5, png, info]
...saved model
...testing model


100%|██████████| 5204/5204 [00:15<00:00, 455.34it/s]
100%|██████████| 1301/1301 [00:02<00:00, 473.50it/s]
0it [00:00, ?it/s]


train:
  AUC = 0.8316375835220056
  mse = 0.17948653623089345, mae = 0.29789900872355285
test:
  AUC = 0.8153213698434261
  mse = 0.1913484318400043, mae = 0.3109090630191276
...tested model
...building model
...built untrained model
Using CV fold 3/5
reading data...
done
processing data...
**** DUPLICATE ENTRY ****
[N-]=[N+]=NCCC(N)C(=O)O
Failed to generate graph for NC(=O)CNC(=O)\C=N\#N , y: 1
Python argument types in
    rdkit.Chem.rdmolops.SanitizeMol(NoneType)
did not match C++ signature:
    SanitizeMol(RDKit::ROMol {lvalue} mol, unsigned long sanitizeOps=rdkit.Chem.rdmolops.SanitizeFlags.SANITIZE_ALL, bool catchErrors=False)
Failed to generate graph for NC(COC(=O)\C=N/#N)C(=O)O , y: 1
Python argument types in
    rdkit.Chem.rdmolops.SanitizeMol(NoneType)
did not match C++ signature:
    SanitizeMol(RDKit::ROMol {lvalue} mol, unsigned long sanitizeOps=rdkit.Chem.rdmolops.SanitizeFlags.SANITIZE_ALL, bool catchErrors=False)
Failed to generate graph for CCCCN(CC(O)C1=C\C(=N/#N)\C(=O

  0%|          | 0/5204 [00:00<?, ?it/s]

loss: 0.4891657531261444	val_loss: 0.5028460621833801
3 epochs without val_loss progress
...trained model
...saving model
...saved structural information
...saved weights
trained at 2017-09-09 01:44:40.639840
...saved history
...saved model to conv_qsar_fast/models/hansen-Ames/fold3.[json, h5, png, info]
...saved model
...testing model


100%|██████████| 5204/5204 [00:13<00:00, 388.89it/s]
100%|██████████| 1301/1301 [00:02<00:00, 440.30it/s]
0it [00:00, ?it/s]


train:
  AUC = 0.8596811846172137
  mse = 0.1471399608573553, mae = 0.2895528671906532
test:
  AUC = 0.8394292738024052
  mse = 0.16313136479820403, mae = 0.30630547801271757
...tested model
...building model
...built untrained model
Using CV fold 4/5
reading data...
done
processing data...
**** DUPLICATE ENTRY ****
[N-]=[N+]=NCCC(N)C(=O)O
Failed to generate graph for NC(=O)CNC(=O)\C=N\#N , y: 1
Python argument types in
    rdkit.Chem.rdmolops.SanitizeMol(NoneType)
did not match C++ signature:
    SanitizeMol(RDKit::ROMol {lvalue} mol, unsigned long sanitizeOps=rdkit.Chem.rdmolops.SanitizeFlags.SANITIZE_ALL, bool catchErrors=False)
Failed to generate graph for NC(COC(=O)\C=N/#N)C(=O)O , y: 1
Python argument types in
    rdkit.Chem.rdmolops.SanitizeMol(NoneType)
did not match C++ signature:
    SanitizeMol(RDKit::ROMol {lvalue} mol, unsigned long sanitizeOps=rdkit.Chem.rdmolops.SanitizeFlags.SANITIZE_ALL, bool catchErrors=False)
Failed to generate graph for CCCCN(CC(O)C1=C\C(=N/#N)\C(=O

  0%|          | 0/5204 [00:00<?, ?it/s]

loss: 0.4924558401107788	val_loss: 0.5089186429977417
5 epochs without val_loss progress
stopping early!
...trained model
...saving model
...saved structural information
...saved weights
trained at 2017-09-09 02:12:21.880313
...saved history
...saved model to conv_qsar_fast/models/hansen-Ames/fold4.[json, h5, png, info]
...saved model
...testing model


100%|██████████| 5204/5204 [00:15<00:00, 331.48it/s]
100%|██████████| 1301/1301 [00:03<00:00, 365.45it/s]
0it [00:00, ?it/s]


train:
  AUC = 0.8643174453265472
  mse = 0.15384877652747578, mae = 0.34947153959683874
test:
  AUC = 0.8363024037366142
  mse = 0.16447087511649094, mae = 0.35839185659286155
...tested model
...building model
...built untrained model
Using CV fold 5/5
reading data...
done
processing data...
**** DUPLICATE ENTRY ****
[N-]=[N+]=NCCC(N)C(=O)O
Failed to generate graph for NC(=O)CNC(=O)\C=N\#N , y: 1
Python argument types in
    rdkit.Chem.rdmolops.SanitizeMol(NoneType)
did not match C++ signature:
    SanitizeMol(RDKit::ROMol {lvalue} mol, unsigned long sanitizeOps=rdkit.Chem.rdmolops.SanitizeFlags.SANITIZE_ALL, bool catchErrors=False)
Failed to generate graph for NC(COC(=O)\C=N/#N)C(=O)O , y: 1
Python argument types in
    rdkit.Chem.rdmolops.SanitizeMol(NoneType)
did not match C++ signature:
    SanitizeMol(RDKit::ROMol {lvalue} mol, unsigned long sanitizeOps=rdkit.Chem.rdmolops.SanitizeFlags.SANITIZE_ALL, bool catchErrors=False)
Failed to generate graph for CCCCN(CC(O)C1=C\C(=N/#N)\C(

  0%|          | 0/5204 [00:00<?, ?it/s]

loss: 0.526729166507721	val_loss: 0.5466169118881226
5 epochs without val_loss progress
stopping early!
...trained model
...saving model
...saved structural information
...saved weights
trained at 2017-09-09 02:30:14.137325
...saved history
...saved model to conv_qsar_fast/models/hansen-Ames/fold5.[json, h5, png, info]
...saved model
...testing model


100%|██████████| 5204/5204 [00:15<00:00, 335.16it/s]
100%|██████████| 1301/1301 [00:03<00:00, 361.23it/s]
0it [00:00, ?it/s]


train:
  AUC = 0.8399541111300979
  mse = 0.17493107489763074, mae = 0.3604315434060676
test:
  AUC = 0.8133970719189558
  mse = 0.18229093716523592, mae = 0.36581967144540234
...tested model
