In [1]:
import pandas as pd

from SMILESX import main, inference
%matplotlib inline

Using TensorFlow backend.


#### Definition of data

In [2]:
# Folder holding the benchmark CSV files. The trailing slash matters: the
# file path is later built by plain string concatenation.
validation_data_dir = './validation_data/'

In [3]:
# File extension (leading dot included) appended to the data file name.
extension = ".csv"

In [4]:
# data_name: benchmark to model -- 'FreeSolv', 'ESOL' or 'Lipophilicity'.
# prop_tag: target column name; only kept as given for a custom data set,
# because the next cell overrides it for the three known benchmarks.
data_name, prop_tag = 'FreeSolv', ''

In [5]:
# Known benchmarks mapped to (CSV base name, target column).
known_datasets = {
    'FreeSolv': ('FreeSolv_SAMPL', 'expt'),
    'ESOL': ('ESOL_delaney-processed', 'measured log solubility in mols per litre'),
    'Lipophilicity': ('Lipophilicity', 'exp'),
}
# Any other data_name is used directly as the file base name and keeps the
# prop_tag chosen by the user.
data_filename, prop_tag = known_datasets.get(data_name, (data_name, prop_tag))

In [6]:
# Path is <directory><file base name><extension>, joined without separators.
data_path = ''.join((validation_data_dir, data_filename, extension))
sol_data = pd.read_csv(data_path)

In [7]:
# Preview the first three rows of the table as loaded from the CSV file.
sol_data.head(3)

Unnamed: 0.1,Unnamed: 0,iupac,smiles,expt,calc
0,0,"4-methoxy-N,N-dimethyl-benzamide",COc1ccc(C(=O)N(C)C)cc1,-11.01,-9.625
1,1,methanesulfonyl chloride,CS(=O)(=O)Cl,-4.87,-6.219
2,2,3-methylbut-1-ene,C=CC(C)C,1.83,2.452


In [8]:
# Keep only the SMILES strings and the selected target property column.
kept_columns = ['smiles', prop_tag]
sol_data = sol_data[kept_columns]

In [9]:
# Preview the reduced table (SMILES and target property columns only).
sol_data.head()

Unnamed: 0,smiles,expt
0,COc1ccc(C(=O)N(C)C)cc1,-11.01
1,CS(=O)(=O)Cl,-4.87
2,C=CC(C)C,1.83
3,CCc1cnccn1,-5.45
4,CCCCCCCO,-4.21


In [10]:
# (number of rows, number of columns) of the reduced table.
sol_data.shape

(642, 2)

#### Hyperparameter optimization with GPyOpt (Bayesian optimization)

In [13]:
### Search space for the Bayesian optimisation
# Candidate sizes of the embedding layer
dembed_range = [2, 3, 4]
# Candidate batch sizes: powers of two from 2**3 = 8 up to 2**10 = 1024
dbatch_range = [2**exponent for exponent in range(3, 11)]
# Candidate exponents for Adam's learning rate (rate = 10^(-value)):
# 2.0, 2.1, ..., 3.9
dalpha_range = [tenths / 10. for tenths in range(20, 40)]

### Search space for the trainless geometry search
# A wider alternative, currently disabled:
# geom_bounds = [[2, 4, 8, 16, 32, 64, 128], # number of units in the LSTM layer
#                [2, 4, 8, 16, 32, 64, 128]] # number of units in the dense layer
lstm_units_options = [2]   # number of units in the LSTM layer
dense_units_options = [2]  # number of units in the dense layer
geom_bounds = [lstm_units_options, dense_units_options]
# Passed to main.Main as weight_range: -0.1 ... 0.1 in steps of 0.01, zero excluded
weight_range = [-0.1, -0.09, -0.08, -0.07, -0.06, -0.05, -0.04, -0.03, -0.02, -0.01,
                0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]

# Lipophilicity searches the embedding size only; its batch size and
# learning-rate exponent are pinned to a single value each.
if data_name == 'Lipophilicity':
    batchsize_domain = (1024, 1024)  # fixed
    lrate_domain = (3, 3)            # fixed
else:
    batchsize_domain = dbatch_range
    lrate_domain = dalpha_range

bayopt_bounds = [
    {'name': 'embedding', 'type': 'discrete', 'domain': dembed_range},
    {'name': 'batchsize', 'type': 'discrete', 'domain': batchsize_domain},
    {'name': 'lrate', 'type': 'discrete', 'domain': lrate_domain},
]

In [14]:
# All keyword arguments for SMILESX main.Main, gathered in one place so the
# run configuration can be read (and inspected) before the long run starts.
training_config = dict(
    # Data set and its labels
    data=sol_data,
    data_name=data_name,
    data_units='',
    # Search spaces defined in the previous cell
    bayopt_bounds=bayopt_bounds,
    geom_bounds=geom_bounds,
    weight_range=weight_range,
    n_opt_runs=5,
    # Cross-validation, augmentation and output location
    k_fold_number=8,
    augmentation=True,
    outdir="./data/",
    # Bayesian optimisation settings
    bayopt_n_epochs=30,
    bayopt_n_rounds=25,
    bayopt_it_factor=1,
    bayopt_on=True,
    # Hardware
    n_gpus=1,
    bridge_type='NVLink',
    # NOTE(review): the *_ref values are presumably the reference
    # hyperparameters used when a parameter is not optimised -- confirm
    # against the SMILESX main.Main signature.
    lstmunits_ref=16,
    denseunits_ref=16,
    embedding_ref=3,
    batch_size_ref=8,
    alpha_ref=2,
    # Final training length (patience: presumably early stopping -- confirm)
    patience=50,
    n_epochs=100,
)
main.Main(**training_config)

***SMILES_X starts...***


******
***Fold #0 initiated...***
******
***Sampling and splitting of the dataset.***

Scaler: RobustScaler(copy=True, quantile_range=(5.0, 95.0), with_centering=True,
             with_scaling=True)
Train/valid/test splits: 0.80/0.10/0.10


***Data augmentation to True***

Enumerated SMILES:
	Training set: 4284
	Validation set: 494
	Test set: 577

***Tokenization of SMILES.***

Examples of tokenized SMILES from a training set:
[[' ', 'C', 'Cl', ' '], [' ', 'Cl', 'C', ' '], [' ', 'N', 'c', '1', 'c', 'c', 'c', 'c', 'c', '1', 'Cl', ' '], [' ', 'c', '1', '(', 'N', ')', 'c', 'c', 'c', 'c', 'c', '1', 'Cl', ' '], [' ', 'c', '1', 'c', 'c', 'c', 'c', '(', 'Cl', ')', 'c', '1', 'N', ' ']]

Number of tokens only present in a training set: 31

Number of tokens only present in a validation set: 27
Is the validation set a subset of the training set: True
What are the tokens by which they differ: set()

Number of tokens only present in a test set: 29
Is the test set a subse

Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100


Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Best val_loss @ Epoch #56

***Predictions from the best model.***

For the training set:
MAE: 0.7610 RMSE: 1.0059 R^2: 0.9342

For the validation set:
MAE: 0.7569 RMSE: 1.0079 R^2: 0.9236

For the test set:
MAE: 0.9084 RMSE: 1.1955 R^2: 0.8540

******
***Fold #1 initiated...***
******
***Sampling and splitting of the dataset.***

Scaler: RobustScaler(copy=True, quantile_range=(5.0, 95.0), with_centering=True,
             with_scaling=True)
Train/valid/test splits: 0.80/0.10/0.10


***Data augmentation to True***

Enumerated SMILES:
	Training set: 4322
	Validation set: 479
	Test set: 554

***Tokenization of SMILES.***

Examples of tokenized SMILES from a training set:
[[' ', 'Cl', 'c', '1', 'c', 'c', 'c', 'c', 'c', '1', 'Cl', ' '], [' ', 'c',

Valid MAE: 0.0936, RMSE: 0.0235
Model: [[   2. 1024.    2.]]
Valid MAE: 0.1310, RMSE: 0.0276
Model: [[  3. 128.   2.]]
Valid MAE: 0.1092, RMSE: 0.0239
Model: [[ 3.  32.   2.2]]
Valid MAE: 0.1013, RMSE: 0.0224
Model: [[ 4. 32.  2.]]
Valid MAE: 0.0996, RMSE: 0.0206
Model: [[ 3.  16.   3.1]]
Valid MAE: 0.1313, RMSE: 0.0285
Model: [[ 2. 16.  2.]]
Valid MAE: 0.0995, RMSE: 0.0203
Model: [[ 2.  16.   2.6]]


KeyboardInterrupt: 

In [None]:
# Molecules to predict, given as SMILES strings.
# NOTE(review): 'ABC' and 'DEF' do not look like valid SMILES -- presumably
# included on purpose to show how invalid input is handled; confirm.
query_smiles = ['CC', 'CCC', 'C=O', 'ABC', 'DEF']

# NOTE(review): k_fold_number is 3 here while the training cell passes
# k_fold_number = 8 -- confirm which set of fold models should be used.
pred_from_ens = inference.Inference(
    data_name=data_name,
    smiles_list=query_smiles,
    data_units='',
    k_fold_number=3,
    augmentation=True,
    outdir="./data/",
)

In [None]:
# Display the value returned by inference.Inference in the previous cell.
pred_from_ens