In [12]:
# Select the Keras backend for this kernel. This must run before Keras is
# imported. A `!export ...` shell escape only sets the variable inside a
# short-lived child shell, so it never reaches the Python process; writing to
# os.environ changes the kernel's own environment instead.
import os
os.environ['KERAS_BACKEND'] = 'theano'

In [13]:
from conv_qsar_fast.utils.parsing import input_to_bool
from conv_qsar_fast.utils.parse_cfg import read_config
import utils.reset_layers as reset_layers
import rdkit.Chem as Chem
import numpy as np
import datetime
import json
import os
import time

from conv_qsar_fast.main.core import build_model, train_model, save_model
from conv_qsar_fast.main.test import test_model, test_embeddings_demo
from conv_qsar_fast.main.data import get_data_full


In [14]:
# Location of the run configuration that read_config() parses in the next cell.
config_file = './conv_qsar_fast/inputs/Ab-oct.cfg'

In [15]:
# Load settings
config = read_config(config_file)

# Get model label.
# Raise instead of print + quit(1): quit() is a helper for the interactive
# interpreter and is not a dependable way to abort a notebook cell, so the
# code below could otherwise continue without `fpath` being defined.
try:
    fpath = config['IO']['model_fpath']
except KeyError:
    raise KeyError('Must specify model_fpath in IO in config')


###################################################################################
### DEFINE DATA
###################################################################################

# NOTE: data_kwargs is the same object as config['DATA'] (no copy is made), so
# every conversion below is also visible through `config`. The model-building
# cell reads config['DATA']['molecular_attributes'] and relies on that.
data_kwargs = config['DATA']
data_kwargs.pop('__name__', None)  # from configparser

# Settings that live in other config sections but are forwarded to the data loader.
if 'batch_size' in config['TRAINING']:
    data_kwargs['batch_size'] = int(config['TRAINING']['batch_size'])
if 'use_fp' in config['ARCHITECTURE']:
    data_kwargs['use_fp'] = config['ARCHITECTURE']['use_fp']

# Config values arrive as strings; coerce the ones that are present.
for key, convert in (
        ('shuffle_seed', int),
        ('truncate_to', int),
        ('training_ratio', float),
        ('molecular_attributes', input_to_bool),
):
    if key in data_kwargs:
        data_kwargs[key] = convert(data_kwargs[key])

# No seed configured: fall back to a time-based one (the run is then not
# reproducible, but the same seed is still used for every fold).
if 'shuffle_seed' not in data_kwargs:
    data_kwargs['shuffle_seed'] = int(time.time())

# The cross-validation cell iterates over `all_cv_folds`. Fail here with a clear
# message rather than with a NameError later (or, worse, by silently reusing a
# stale `all_cv_folds` left in the kernel by an earlier run).
if 'cv_folds' not in data_kwargs:
    raise KeyError('Must specify cv_folds in DATA in config')

# Make sure the output folder exists. Only "already exists" is tolerated; any
# other failure (e.g. permissions) is allowed to surface instead of being swallowed.
model_dir = os.path.dirname(fpath)
if model_dir and not os.path.isdir(model_dir):
    os.makedirs(model_dir)

cv_folds = data_kwargs['cv_folds']
if '<this_fold>' in cv_folds:
    # Template of the form '<this_fold>/N': expand into '1/N', '2/N', ..., 'N/N'.
    total_folds = int(cv_folds.split('/')[1])
    all_cv_folds = ['{}/{}'.format(i + 1, total_folds) for i in range(total_folds)]
else:
    # A single explicit fold was requested.
    all_cv_folds = [cv_folds]

In [16]:
# Iterate through all folds: build, (optionally) load weights, train, save, test.

# Take the path template straight from the config rather than from `fpath`:
# the loop below overwrites `fpath` with the per-fold path, so copying it here
# would make this cell behave differently when it is re-run on its own.
ref_fpath = config['IO']['model_fpath']


def _coerce(section, converters):
    """Convert, in place, every key of `section` that appears in `converters`.

    `converters` is an iterable of (key, callable) pairs; keys missing from
    `section` are skipped. Returns `section` for convenience.
    """
    for key, convert in converters:
        if key in section:
            section[key] = convert(section[key])
    return section


###################################################################################
### PARSE SETTINGS (once, not once per fold)
###################################################################################
# These dicts are the config sections themselves, so converting them inside the
# fold loop would re-convert already-converted values on every later fold.

train_kwargs = config['TRAINING']
train_kwargs.pop('__name__', None)  # from configparser
_coerce(train_kwargs, (
    ('nb_epoch', int),
    ('batch_size', int),
    ('patience', int),
))

arch_kwargs = config['ARCHITECTURE']
arch_kwargs.pop('__name__', None)  # from configparser
_coerce(arch_kwargs, (
    ('embedding_size', int),
    ('hidden', int),
    ('hidden2', int),
    ('depth', int),
    ('scale_output', float),
    ('dr1', float),
    ('dr2', float),
    ('output_size', int),
    ('sum_after', input_to_bool),
))
if 'batch_size' in train_kwargs:
    # Padding is only requested when more than one sample is batched together.
    arch_kwargs['padding'] = int(train_kwargs['batch_size']) > 1
if 'molecular_attributes' in config['DATA']:
    arch_kwargs['molecular_attributes'] = config['DATA']['molecular_attributes']

# Raise rather than quit(1): quit() is not a dependable way to stop a notebook cell.
try:
    use_old_weights = input_to_bool(config['IO']['use_existing_weights'])
except KeyError:
    raise KeyError('Must specify whether or not to use existing model weights')

# Resolve the flag up front so that a KeyError raised *inside*
# test_embeddings_demo is not mistaken for a missing config option.
try:
    test_embedding = input_to_bool(config['TESTING']['test_embedding'])
except KeyError:
    test_embedding = False


for cv_fold in all_cv_folds:

    ###################################################################################
    ### BUILD MODEL
    ###################################################################################

    print('...building model')
    try:
        model = build_model(**arch_kwargs)
        print('...built untrained model')
    except KeyboardInterrupt:
        print('User cancelled model building')
        raise  # stop the cell; there is no model to continue with

    print('Using CV fold {}'.format(cv_fold))
    data_kwargs['cv_folds'] = cv_fold
    # Per-fold output path: the placeholder becomes the fold index ('3' for '3/5').
    fpath = ref_fpath.replace('<this_fold>', cv_fold.split('/')[0])
    data = get_data_full(**data_kwargs)

    ###################################################################################
    ### LOAD WEIGHTS?
    ###################################################################################

    if 'weights_fpath' in config['IO']:
        weights_fpath = config['IO']['weights_fpath']
    else:
        weights_fpath = fpath + '.h5'

    if use_old_weights:
        if not os.path.isfile(weights_fpath):
            raise IOError('Weights not found at specified path {}'.format(weights_fpath))

        model.load_weights(weights_fpath)
        print('...loaded weight information')

        # Reset final dense?
        if 'reset_final' in config['IO']:
            if config['IO']['reset_final'] in ['true', 'y', 'Yes', 'True', '1']:
                # Re-initialise the last layer's weights and zero its bias.
                layer = model.layers[-1]
                layer.W.set_value((layer.init(layer.W.shape.eval()).eval()).astype(np.float32))
                layer.b.set_value(np.zeros(layer.b.shape.eval(), dtype=np.float32))

    ###################################################################################
    ### CHECK FOR TESTING CONDITIONS
    ###################################################################################

    # Testing embeddings? Run the demo and stop: no training, saving or testing.
    if test_embedding:
        test_embeddings_demo(model, fpath)
        break

    ###################################################################################
    ### TRAIN THE MODEL
    ###################################################################################

    # Start every fold with empty histories so that an interrupted run can never
    # save the previous fold's losses next to this fold's model.
    # NOTE(review): assumes save_model accepts empty loss histories -- confirm.
    loss, val_loss = [], []
    try:
        print('...training model')
        (model, loss, val_loss) = train_model(model, data, **train_kwargs)
        print('...trained model')
    except KeyboardInterrupt:
        # Best effort, as before: keep going and save/test whatever state the model is in.
        print('...training interrupted by user')

    ###################################################################################
    ### SAVE MODEL
    ###################################################################################

    # Get the current time
    tstamp = datetime.datetime.utcnow().strftime('%m-%d-%Y_%H-%M')
    print('...saving model')
    save_model(model,
        loss,
        val_loss,
        fpath = fpath,
        config = config,
        tstamp = tstamp)
    print('...saved model')

    ###################################################################################
    ### TEST MODEL
    ###################################################################################

    print('...testing model')
    data_withresiduals = test_model(model, data, fpath, tstamp = tstamp,
        batch_size = int(config['TRAINING']['batch_size']))
    print('...tested model')

...building model


  name = "d{} multiply atom contribs and adj mat".format(d)
  name=name)
  name = 'd{} combine atom and bond contributions to new atom features'.format(d),
  FPs = merge(output_contribs, mode = 'sum', name = 'pool across depths')
  output = [ypred])


...built untrained model
Using CV fold 1/5
reading data...
done
processing data...
Failed to generate graph for , y: -0.02

Failed to generate graph for , y: -2.57

Failed to generate graph for , y: -0.88

Failed to generate graph for , y: -4.92

Failed to generate graph for , y: -0.47

Failed to generate graph for , y: -0.612

Failed to generate graph for , y: -0.587

Failed to generate graph for , y: -2.367

Failed to generate graph for , y: -0.46

**** DUPLICATE ENTRY ****
COCC(=O)N(c1c(C)cccc1C)C(C)C(=O)OC
Failed to generate graph for , y: -1.23

Failed to generate graph for , y: -0.68

**** DUPLICATE ENTRY ****
COc1cc(Cl)c(Cl)c(Cl)c1Cl
Failed to generate graph for , y: -0.48

Failed to generate graph for , y: -2.28

**** DUPLICATE ENTRY ****
CCOC(=O)c1ccc(N)cc1
Failed to generate graph for , y: -0.05

Failed to generate graph for , y: -0.47

**** DUPLICATE ENTRY ****
ClC1=C(Cl)C2(Cl)C3C4CC(C5OC45)C3C1(Cl)C2(Cl)Cl
Failed to generate graph for , y: -0.689

Failed to generate graph f

  0%|          | 0/176 [00:00<?, ?it/s]

loss: 0.2225189208984375	val_loss: 0.2169792354106903
20 epochs without val_loss progress
stopping early!
...trained model
...saving model
...saved structural information
...saved weights
trained at 2017-09-05 03:44:02.211806
...saved history
...saved model to conv_qsar_fast/models/Ab-oct-nested-a/fold1_.[json, h5, png, info]
...saved model
...testing model


100%|██████████| 176/176 [00:01<00:00, 133.65it/s]
100%|██████████| 20/20 [00:00<00:00, 562.66it/s]
100%|██████████| 49/49 [00:00<00:00, 597.43it/s]


train:
  mse = 0.1650027255231589, mae = 0.3091950709054416
test:
  mse = 0.2169792254255638, mae = 0.38822677741050715
test:
  mse = 0.6129525471723841, mae = 0.603975672254757
...tested model
...building model
...built untrained model
Using CV fold 2/5
reading data...
done
processing data...
Failed to generate graph for , y: -0.02

Failed to generate graph for , y: -2.57

Failed to generate graph for , y: -0.88

Failed to generate graph for , y: -4.92

Failed to generate graph for , y: -0.47

Failed to generate graph for , y: -0.612

Failed to generate graph for , y: -0.587

Failed to generate graph for , y: -2.367

Failed to generate graph for , y: -0.46

**** DUPLICATE ENTRY ****
COCC(=O)N(c1c(C)cccc1C)C(C)C(=O)OC
Failed to generate graph for , y: -1.23

Failed to generate graph for , y: -0.68

**** DUPLICATE ENTRY ****
COc1cc(Cl)c(Cl)c(Cl)c1Cl
Failed to generate graph for , y: -0.48

Failed to generate graph for , y: -2.28

**** DUPLICATE ENTRY ****
CCOC(=O)c1ccc(N)cc1
Failed to g

loss: 0.2544502317905426	val_loss: 0.4522511065006256
19 epochs without val_loss progress
Epoch 48/100, lr = 0.00013071694411337376


  0%|          | 0/176 [00:00<?, ?it/s]

loss: 0.26477882266044617	val_loss: 0.48590293526649475
20 epochs without val_loss progress
stopping early!
...trained model
...saving model
...saved structural information
...saved weights
trained at 2017-09-05 03:44:50.558547
...saved history
...saved model to conv_qsar_fast/models/Ab-oct-nested-a/fold2_.[json, h5, png, info]
...saved model
...testing model


100%|██████████| 176/176 [00:01<00:00, 119.31it/s]
100%|██████████| 20/20 [00:00<00:00, 432.42it/s]
100%|██████████| 49/49 [00:00<00:00, 613.25it/s]


train:
  mse = 0.2082372616734215, mae = 0.35577562183277167
test:
  mse = 0.48590291044442385, mae = 0.5783629714488983
test:
  mse = 0.26905670815727284, mae = 0.3882163336982532
...tested model
...building model
...built untrained model
Using CV fold 3/5
reading data...
done
processing data...
Failed to generate graph for , y: -0.02

Failed to generate graph for , y: -2.57

Failed to generate graph for , y: -0.88

Failed to generate graph for , y: -4.92

Failed to generate graph for , y: -0.47

Failed to generate graph for , y: -0.612

Failed to generate graph for , y: -0.587

Failed to generate graph for , y: -2.367

Failed to generate graph for , y: -0.46

**** DUPLICATE ENTRY ****
COCC(=O)N(c1c(C)cccc1C)C(C)C(=O)OC
Failed to generate graph for , y: -1.23

Failed to generate graph for , y: -0.68

**** DUPLICATE ENTRY ****
COc1cc(Cl)c(Cl)c(Cl)c1Cl
Failed to generate graph for , y: -0.48

Failed to generate graph for , y: -2.28

**** DUPLICATE ENTRY ****
CCOC(=O)c1ccc(N)cc1
Failed t

  0%|          | 0/176 [00:00<?, ?it/s]

loss: 0.3364724814891815	val_loss: 0.5588032603263855
20 epochs without val_loss progress
stopping early!
...trained model
...saving model
...saved structural information
...saved weights
trained at 2017-09-05 03:45:28.851975
...saved history
...saved model to conv_qsar_fast/models/Ab-oct-nested-a/fold3_.[json, h5, png, info]
...saved model
...testing model


100%|██████████| 176/176 [00:01<00:00, 127.79it/s]
100%|██████████| 20/20 [00:00<00:00, 362.57it/s]
100%|██████████| 49/49 [00:00<00:00, 369.26it/s]


train:
  mse = 0.2208184757994874, mae = 0.36003746843947604
test:
  mse = 0.55880330718309, mae = 0.5981288817584515
test:
  mse = 0.27888842309661516, mae = 0.41057072887615287
...tested model
...building model
...built untrained model
Using CV fold 4/5
reading data...
done
processing data...
Failed to generate graph for , y: -0.02

Failed to generate graph for , y: -2.57

Failed to generate graph for , y: -0.88

Failed to generate graph for , y: -4.92

Failed to generate graph for , y: -0.47

Failed to generate graph for , y: -0.612

Failed to generate graph for , y: -0.587

Failed to generate graph for , y: -2.367

Failed to generate graph for , y: -0.46

**** DUPLICATE ENTRY ****
COCC(=O)N(c1c(C)cccc1C)C(C)C(=O)OC
Failed to generate graph for , y: -1.23

Failed to generate graph for , y: -0.68

**** DUPLICATE ENTRY ****
COc1cc(Cl)c(Cl)c(Cl)c1Cl
Failed to generate graph for , y: -0.48

Failed to generate graph for , y: -2.28

**** DUPLICATE ENTRY ****
CCOC(=O)c1ccc(N)cc1
Failed to 

  0%|          | 0/176 [00:00<?, ?it/s]

loss: 0.2812160849571228	val_loss: 0.4940413534641266
20 epochs without val_loss progress
stopping early!
...trained model
...saving model
...saved structural information
...saved weights
trained at 2017-09-05 03:46:13.659747
...saved history
...saved model to conv_qsar_fast/models/Ab-oct-nested-a/fold4_.[json, h5, png, info]
...saved model
...testing model


100%|██████████| 176/176 [00:01<00:00, 126.42it/s]
100%|██████████| 20/20 [00:00<00:00, 425.55it/s]
100%|██████████| 49/49 [00:00<00:00, 572.93it/s]


train:
  mse = 0.21356189276206017, mae = 0.36030218044329776
test:
  mse = 0.4940412862686516, mae = 0.5544037552297115
test:
  mse = 0.34053938151461116, mae = 0.4837160646343717
...tested model
...building model
...built untrained model
Using CV fold 5/5
reading data...
done
processing data...
Failed to generate graph for , y: -0.02

Failed to generate graph for , y: -2.57

Failed to generate graph for , y: -0.88

Failed to generate graph for , y: -4.92

Failed to generate graph for , y: -0.47

Failed to generate graph for , y: -0.612

Failed to generate graph for , y: -0.587

Failed to generate graph for , y: -2.367

Failed to generate graph for , y: -0.46

**** DUPLICATE ENTRY ****
COCC(=O)N(c1c(C)cccc1C)C(C)C(=O)OC
Failed to generate graph for , y: -1.23

Failed to generate graph for , y: -0.68

**** DUPLICATE ENTRY ****
COc1cc(Cl)c(Cl)c(Cl)c1Cl
Failed to generate graph for , y: -0.48

Failed to generate graph for , y: -2.28

**** DUPLICATE ENTRY ****
CCOC(=O)c1ccc(N)cc1
Failed t

  0%|          | 0/176 [00:00<?, ?it/s]

loss: 0.2676403820514679	val_loss: 0.554685115814209
20 epochs without val_loss progress
stopping early!
...trained model
...saving model
...saved structural information
...saved weights
trained at 2017-09-05 03:46:58.419445
...saved history
...saved model to conv_qsar_fast/models/Ab-oct-nested-a/fold5_.[json, h5, png, info]
...saved model
...testing model


100%|██████████| 176/176 [00:01<00:00, 133.51it/s]
100%|██████████| 20/20 [00:00<00:00, 364.93it/s]
100%|██████████| 49/49 [00:00<00:00, 394.42it/s]


train:
  mse = 0.20643267661145792, mae = 0.3665586893287572
test:
  mse = 0.5546850685679321, mae = 0.6235082466959954
test:
  mse = 0.4499360019730436, mae = 0.5411129069887862
...tested model
