In [1]:
# Select the Keras backend for this kernel. This must run before the first
# import that pulls in Keras (the import cell below), because Keras reads
# KERAS_BACKEND when it is imported.
# The previous `!export KERAS_BACKEND="theano"` ran in a throw-away subshell, so
# the variable never reached the kernel process; setting it through os.environ
# changes the environment of the kernel itself.
import os

os.environ["KERAS_BACKEND"] = "theano"

In [2]:
from conv_qsar_fast.utils.parsing import input_to_bool
from conv_qsar_fast.utils.parse_cfg import read_config
import utils.reset_layers as reset_layers
import rdkit.Chem as Chem
import numpy as np
import datetime
import json
import os
import time

from conv_qsar_fast.main.core import build_model, train_model, save_model
from conv_qsar_fast.main.test import test_model, test_embeddings_demo
from conv_qsar_fast.main.data import get_data_full


Using Theano backend.


In [3]:
# Configuration file for this run; it is parsed by read_config() in the next cell.
# The path is relative, so the notebook has to be started from the directory
# that contains the conv_qsar_fast/ folder.
config_file='./conv_qsar_fast/inputs/tox21/tox21_ahr.cfg'

In [4]:
# Load settings
# The sections are indexed like a dict of dicts below.
# NOTE(review): presumably read_config returns the raw configparser sections
# (hence the '__name__' clean-up) -- confirm against parse_cfg.
config = read_config(config_file)

# Get model label
try:
    fpath = config['IO']['model_fpath']
except KeyError:
    # Raise instead of calling quit(): in a notebook an exception stops the cell
    # immediately, whereas quit() targets the kernel and would leave `fpath`
    # undefined for the code below.
    raise KeyError('Must specify model_fpath in IO in config') from None


###################################################################################
### DEFINE DATA
###################################################################################

# data_kwargs is deliberately the *same object* as config['DATA'] (not a copy):
# the type conversions below are therefore also visible to later code that
# reads config['DATA'] or receives `config` as a whole.
data_kwargs = config['DATA']
if '__name__' in data_kwargs:
    del data_kwargs['__name__'] #  from configparser

# Settings that live in other sections but are forwarded to the data loader.
if 'batch_size' in config['TRAINING']:
    data_kwargs['batch_size'] = int(config['TRAINING']['batch_size'])
if 'use_fp' in config['ARCHITECTURE']:
    data_kwargs['use_fp'] = config['ARCHITECTURE']['use_fp']

# Shuffle seed: use the configured value, otherwise fall back to the clock.
if 'shuffle_seed' in data_kwargs:
    data_kwargs['shuffle_seed'] = int(data_kwargs['shuffle_seed'])
else:
    data_kwargs['shuffle_seed'] = int(time.time())
    # Report the fallback so that this particular split can be reproduced later.
    print('No shuffle_seed in config, using {}'.format(data_kwargs['shuffle_seed']))

# Config values arrive as strings; convert the ones that are present.
data_casts = (
    ('truncate_to', int),
    ('training_ratio', float),
    ('molecular_attributes', input_to_bool),
)
for key, cast in data_casts:
    if key in data_kwargs:
        data_kwargs[key] = cast(data_kwargs[key])

# The fold loop in the next cell iterates over `all_cv_folds`, so a missing
# cv_folds entry is reported here instead of surfacing later as a NameError.
if 'cv_folds' not in data_kwargs:
    raise KeyError('Must specify cv_folds in DATA in config')

# Make sure the output folder exists. exist_ok only tolerates "already there";
# other failures (e.g. permissions) are no longer silently swallowed.
model_dir = os.path.dirname(fpath)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)

# '<this_fold>/N' expands to every fold '1/N' ... 'N/N'; anything else is
# treated as a single, explicitly chosen fold.
cv_folds = data_kwargs['cv_folds']
if '<this_fold>' in cv_folds:
    total_folds = int(cv_folds.split('/')[1])
    all_cv_folds = ['{}/{}'.format(i + 1, total_folds) for i in range(total_folds)]
else:
    all_cv_folds = [cv_folds]

In [5]:
# Iterate through all folds: for each CV fold build a fresh model, load that
# fold's data, optionally restore weights, then train, save and test.
ref_fpath = fpath

###################################################################################
### PARSE LOOP-INVARIANT SETTINGS
###################################################################################
# Everything in this section is identical for every fold, so it is parsed once
# up front. A bad config now fails before any model is built or data is loaded,
# and the string -> value casts are applied once instead of once per fold.
# Like data_kwargs, these dicts are the config sections themselves (not
# copies), so the `config` handed to save_model() contains the converted values.

arch_kwargs = config['ARCHITECTURE']
if '__name__' in arch_kwargs:
    del arch_kwargs['__name__'] #  from configparser
if 'batch_size' in config['TRAINING']:
    arch_kwargs['padding'] = int(config['TRAINING']['batch_size']) > 1
arch_casts = (
    ('embedding_size', int),
    ('hidden', int),
    ('hidden2', int),
    ('depth', int),
    ('scale_output', float),
    ('dr1', float),
    ('dr2', float),
    ('output_size', int),
    ('sum_after', input_to_bool),
)
for key, cast in arch_casts:
    if key in arch_kwargs:
        arch_kwargs[key] = cast(arch_kwargs[key])
# 'optimizer', when present, is passed through unchanged.
if 'molecular_attributes' in config['DATA']:
    arch_kwargs['molecular_attributes'] = config['DATA']['molecular_attributes']

train_kwargs = config['TRAINING']
if '__name__' in train_kwargs:
    del train_kwargs['__name__'] #  from configparser
for key in ('nb_epoch', 'batch_size', 'patience'):
    if key in train_kwargs:
        train_kwargs[key] = int(train_kwargs[key])

try:
    use_old_weights = input_to_bool(config['IO']['use_existing_weights'])
except KeyError:
    # Raise rather than quit(): an exception stops the cell without touching
    # the kernel.
    raise KeyError('Must specify whether or not to use existing model weights') from None

# Only the config lookup is guarded, so a KeyError raised *inside*
# test_embeddings_demo() is no longer silently swallowed.
try:
    test_embedding_flag = config['TESTING']['test_embedding']
except KeyError:
    test_embedding_flag = None
test_embedding = test_embedding_flag is not None and input_to_bool(test_embedding_flag)

for cv_fold in all_cv_folds:

    ###################################################################################
    ### BUILD MODEL
    ###################################################################################

    print('...building model')
    try:
        model = build_model(**arch_kwargs)
        print('...built untrained model')
    except KeyboardInterrupt:
        print('User cancelled model building')
        raise

    print('Using CV fold {}'.format(cv_fold))
    # NB: this also updates config['DATA']['cv_folds'], because data_kwargs is
    # that very dict.
    data_kwargs['cv_folds'] = cv_fold
    fpath = ref_fpath.replace('<this_fold>', cv_fold.split('/')[0])
    data = get_data_full(**data_kwargs)

    ###################################################################################
    ### LOAD WEIGHTS?
    ###################################################################################

    if 'weights_fpath' in config['IO']:
        weights_fpath = config['IO']['weights_fpath']
    else:
        weights_fpath = fpath + '.h5'

    if use_old_weights:
        if not os.path.isfile(weights_fpath):
            raise FileNotFoundError(
                'Weights not found at specified path {}'.format(weights_fpath))

        model.load_weights(weights_fpath)
        print('...loaded weight information')

        # Reset final dense?
        # NOTE(review): this compares against its own list of truthy strings
        # instead of using input_to_bool() like the other flags -- confirm
        # whether the difference is intentional.
        if 'reset_final' in config['IO']:
            if config['IO']['reset_final'] in ['true', 'y', 'Yes', 'True', '1']:
                # Re-initialise W with the layer's own initialiser and zero
                # the bias of the last layer.
                layer = model.layers[-1]
                layer.W.set_value((layer.init(layer.W.shape.eval()).eval()).astype(np.float32))
                layer.b.set_value(np.zeros(layer.b.shape.eval(), dtype=np.float32))

    ###################################################################################
    ### CHECK FOR TESTING CONDITIONS
    ###################################################################################

    # Testing embeddings? Run the demo and stop; no training/saving/testing and
    # no further folds. `break` replaces quit(1), which acts on the kernel
    # rather than ending the loop.
    if test_embedding:
        test_embeddings_demo(model, fpath)
        break

    ###################################################################################
    ### TRAIN THE MODEL
    ###################################################################################

    # Train model
    # NOTE(review): if a KeyboardInterrupt escapes train_model(), `loss` and
    # `val_loss` are not assigned for this fold: they are undefined on the
    # first fold and left over from the previous fold afterwards, yet
    # save_model() below still uses them. Behaviour is kept as-is because
    # train_model's own interrupt handling is not visible here -- confirm.
    try:
        print('...training model')
        (model, loss, val_loss) = train_model(model, data, **train_kwargs)
        print('...trained model')
    except KeyboardInterrupt:
        pass

    ###################################################################################
    ### SAVE MODEL
    ###################################################################################

    # Get the current time (UTC); the same stamp is passed to saving and testing
    tstamp = datetime.datetime.utcnow().strftime('%m-%d-%Y_%H-%M')
    print('...saving model')
    save_model(model,
        loss,
        val_loss,
        fpath = fpath,
        config = config,
        tstamp = tstamp)
    print('...saved model')

    ###################################################################################
    ### TEST MODEL
    ###################################################################################

    # NOTE(review): batch_size is treated as optional everywhere above but is
    # required here (KeyError if it is missing from TRAINING) -- confirm
    # whether test_model has a usable default.
    # NOTE(review): the recorded run printed "No module named 'sklearn'" for
    # both train and test, so part of test_model's output was presumably
    # skipped; check that scikit-learn is installed before re-running.
    print('...testing model')
    data_withresiduals = test_model(model, data, fpath, tstamp = tstamp,
        batch_size = int(config['TRAINING']['batch_size']))
    print('...tested model')

...building model


  name = "d{} multiply atom contribs and adj mat".format(d)
  name=name)
  name = 'd{} combine atom and bond contributions to new atom features'.format(d),
  FPs = merge(output_contribs, mode = 'sum', name = 'pool across depths')
  output = [ypred])


...built untrained model
Using CV fold 1/5
Assuming TOX21 data nr-ahr
reading data...
done
processing data...
Failed to generate graph for O.O.[Cl-].[Cl-].[Ba++], y: 0

Failed to generate graph for [Cl-].[Cl-].[Hg++], y: 1

Failed to generate graph for [K+].[I-], y: 0

Failed to generate graph for [Cl-].[Cl-].[Ca++], y: 0

Failed to generate graph for [Cl-].[Cl-].[Cl-].[Fe+3], y: 0

Failed to generate graph for [Cl-].[Cl-].[Cu++], y: 0

Failed to generate graph for [Cl-].[Cl-].[Fe++], y: 0

Failed to generate graph for [Cl-].[Cl-].[Cd++], y: 1

Failed to generate graph for [NH4+].[NH4+].[Cl-][Pt++]([Cl-])([Cl-])[Cl-], y: 0
Sanitization error: Explicit valence for atom # 2 Cl, 2, is greater than permitted
Failed to generate graph for [Na+].[Na+].F[Si--](F)(F)(F)(F)F, y: 0
Sanitization error: Explicit valence for atom # 3 Si, 8, is greater than permitted
Failed to generate graph for [Na+].[Br-], y: 0

Failed to generate graph for [Cl-].[Cl-].[SnH2++], y: 0

Failed to generate graph for O

  0%|          | 0/6523 [00:00<?, ?it/s]

loss: 0.2459268718957901	val_loss: 0.20821163058280945
5 epochs without val_loss progress
stopping early!
...trained model
...saving model
...saved structural information
...saved weights
trained at 2017-09-07 12:25:08.690275
...saved history
...saved model to conv_qsar_fast/models/tox21_test_ahr/fold1.[json, h5, png, info]
...saved model
...testing model


100%|██████████| 6523/6523 [00:18<00:00, 360.09it/s]
100%|██████████| 1631/1631 [00:04<00:00, 403.99it/s]
0it [00:00, ?it/s]


train:
No module named 'sklearn'
test:
No module named 'sklearn'
...tested model
...building model


  name = "d{} multiply atom contribs and adj mat".format(d)
  name=name)
  name = 'd{} combine atom and bond contributions to new atom features'.format(d),
  FPs = merge(output_contribs, mode = 'sum', name = 'pool across depths')
  output = [ypred])


...built untrained model
Using CV fold 2/5
Assuming TOX21 data nr-ahr
reading data...
done
processing data...
Failed to generate graph for O.O.[Cl-].[Cl-].[Ba++], y: 0

Failed to generate graph for [Cl-].[Cl-].[Hg++], y: 1

Failed to generate graph for [K+].[I-], y: 0

Failed to generate graph for [Cl-].[Cl-].[Ca++], y: 0

Failed to generate graph for [Cl-].[Cl-].[Cl-].[Fe+3], y: 0

Failed to generate graph for [Cl-].[Cl-].[Cu++], y: 0

Failed to generate graph for [Cl-].[Cl-].[Fe++], y: 0

Failed to generate graph for [Cl-].[Cl-].[Cd++], y: 1

Failed to generate graph for [NH4+].[NH4+].[Cl-][Pt++]([Cl-])([Cl-])[Cl-], y: 0
Sanitization error: Explicit valence for atom # 2 Cl, 2, is greater than permitted
Failed to generate graph for [Na+].[Na+].F[Si--](F)(F)(F)(F)F, y: 0
Sanitization error: Explicit valence for atom # 3 Si, 8, is greater than permitted
Failed to generate graph for [Na+].[Br-], y: 0

Failed to generate graph for [Cl-].[Cl-].[SnH2++], y: 0

Failed to generate graph for O

  0%|          | 0/6523 [00:00<?, ?it/s]

loss: 0.23540595173835754	val_loss: 0.23429061472415924
5 epochs without val_loss progress
stopping early!
...trained model
...saving model
...saved structural information
...saved weights
trained at 2017-09-07 12:38:16.884320
...saved history
...saved model to conv_qsar_fast/models/tox21_test_ahr/fold2.[json, h5, png, info]
...saved model
...testing model


100%|██████████| 6523/6523 [00:14<00:00, 457.41it/s]
100%|██████████| 1631/1631 [00:03<00:00, 465.01it/s]
0it [00:00, ?it/s]


train:
No module named 'sklearn'
test:
No module named 'sklearn'
...tested model
...building model
...built untrained model
Using CV fold 3/5
Assuming TOX21 data nr-ahr
reading data...
done
processing data...
Failed to generate graph for O.O.[Cl-].[Cl-].[Ba++], y: 0

Failed to generate graph for [Cl-].[Cl-].[Hg++], y: 1

Failed to generate graph for [K+].[I-], y: 0

Failed to generate graph for [Cl-].[Cl-].[Ca++], y: 0

Failed to generate graph for [Cl-].[Cl-].[Cl-].[Fe+3], y: 0

Failed to generate graph for [Cl-].[Cl-].[Cu++], y: 0

Failed to generate graph for [Cl-].[Cl-].[Fe++], y: 0

Failed to generate graph for [Cl-].[Cl-].[Cd++], y: 1

Failed to generate graph for [NH4+].[NH4+].[Cl-][Pt++]([Cl-])([Cl-])[Cl-], y: 0
Sanitization error: Explicit valence for atom # 2 Cl, 2, is greater than permitted
Failed to generate graph for [Na+].[Na+].F[Si--](F)(F)(F)(F)F, y: 0
Sanitization error: Explicit valence for atom # 3 Si, 8, is greater than permitted
Failed to generate graph for [Na+].[

  0%|          | 0/6523 [00:00<?, ?it/s]

loss: 0.2329334020614624	val_loss: 0.26553913950920105
5 epochs without val_loss progress
stopping early!
...trained model
...saving model
...saved structural information
...saved weights
trained at 2017-09-07 12:52:20.533134
...saved history
...saved model to conv_qsar_fast/models/tox21_test_ahr/fold3.[json, h5, png, info]
...saved model
...testing model


100%|██████████| 6523/6523 [00:18<00:00, 361.55it/s]
100%|██████████| 1631/1631 [00:04<00:00, 379.66it/s]
0it [00:00, ?it/s]


train:
No module named 'sklearn'
test:
No module named 'sklearn'
...tested model
...building model
...built untrained model
Using CV fold 4/5
Assuming TOX21 data nr-ahr
reading data...
done
processing data...
Failed to generate graph for O.O.[Cl-].[Cl-].[Ba++], y: 0

Failed to generate graph for [Cl-].[Cl-].[Hg++], y: 1

Failed to generate graph for [K+].[I-], y: 0

Failed to generate graph for [Cl-].[Cl-].[Ca++], y: 0

Failed to generate graph for [Cl-].[Cl-].[Cl-].[Fe+3], y: 0

Failed to generate graph for [Cl-].[Cl-].[Cu++], y: 0

Failed to generate graph for [Cl-].[Cl-].[Fe++], y: 0

Failed to generate graph for [Cl-].[Cl-].[Cd++], y: 1

Failed to generate graph for [NH4+].[NH4+].[Cl-][Pt++]([Cl-])([Cl-])[Cl-], y: 0
Sanitization error: Explicit valence for atom # 2 Cl, 2, is greater than permitted
Failed to generate graph for [Na+].[Na+].F[Si--](F)(F)(F)(F)F, y: 0
Sanitization error: Explicit valence for atom # 3 Si, 8, is greater than permitted
Failed to generate graph for [Na+].[

  0%|          | 0/6523 [00:00<?, ?it/s]

loss: 0.2416587620973587	val_loss: 0.2243814915418625
5 epochs without val_loss progress
stopping early!
...trained model
...saving model
...saved structural information
...saved weights
trained at 2017-09-07 13:03:37.195550
...saved history
...saved model to conv_qsar_fast/models/tox21_test_ahr/fold4.[json, h5, png, info]
...saved model
...testing model


100%|██████████| 6523/6523 [00:17<00:00, 365.34it/s]
100%|██████████| 1631/1631 [00:04<00:00, 400.45it/s]
0it [00:00, ?it/s]


train:
No module named 'sklearn'
test:
No module named 'sklearn'
...tested model
...building model
...built untrained model
Using CV fold 5/5
Assuming TOX21 data nr-ahr
reading data...
done
processing data...
Failed to generate graph for O.O.[Cl-].[Cl-].[Ba++], y: 0

Failed to generate graph for [Cl-].[Cl-].[Hg++], y: 1

Failed to generate graph for [K+].[I-], y: 0

Failed to generate graph for [Cl-].[Cl-].[Ca++], y: 0

Failed to generate graph for [Cl-].[Cl-].[Cl-].[Fe+3], y: 0

Failed to generate graph for [Cl-].[Cl-].[Cu++], y: 0

Failed to generate graph for [Cl-].[Cl-].[Fe++], y: 0

Failed to generate graph for [Cl-].[Cl-].[Cd++], y: 1

Failed to generate graph for [NH4+].[NH4+].[Cl-][Pt++]([Cl-])([Cl-])[Cl-], y: 0
Sanitization error: Explicit valence for atom # 2 Cl, 2, is greater than permitted
Failed to generate graph for [Na+].[Na+].F[Si--](F)(F)(F)(F)F, y: 0
Sanitization error: Explicit valence for atom # 3 Si, 8, is greater than permitted
Failed to generate graph for [Na+].[

  0%|          | 0/6524 [00:00<?, ?it/s]

loss: 0.24133722484111786	val_loss: 0.25836968421936035
5 epochs without val_loss progress
stopping early!
...trained model
...saving model
...saved structural information
...saved weights
trained at 2017-09-07 13:10:30.003201
...saved history
...saved model to conv_qsar_fast/models/tox21_test_ahr/fold5.[json, h5, png, info]
...saved model
...testing model


100%|██████████| 6524/6524 [00:14<00:00, 442.38it/s]
100%|██████████| 1630/1630 [00:03<00:00, 495.97it/s]
0it [00:00, ?it/s]

train:
No module named 'sklearn'
test:
No module named 'sklearn'
...tested model



