In [1]:
# !pip install tensorflow==2.0.0
# !pip install tensorflow==1.15.0
# !pip install keras==2.3.1
# !pip install tensorflow-gpu
import keras
import keras.backend as K
from keras.layers import Dense, LSTM, concatenate, Input, CuDNNLSTM
from keras.models import Model
from keras.utils import plot_model, to_categorical
from keras.optimizers import Adam, SGD
from keras import metrics
from keras.callbacks import ModelCheckpoint
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [0]:
feature_matrix = pd.read_csv("descriptors.csv")
y_smiles_text = feature_matrix["molecule_smile"]
max_len = y_smiles_text.apply(len).max() #97
feature_matrix = feature_matrix.drop("molecule_smile", axis=1)
n_features = feature_matrix.shape[1]
X = y_smiles_text.apply(lambda x: '%' + x + '^')
X_padded = X.apply(lambda x: x + '~'*(max_len-len(x)+2)) #99
Y_padded = y_smiles_text.apply(lambda x: x + '^' + '~'*(max_len-len(x)+1))

In [0]:
X = pd.Series(list(X_padded.sum()))
Y = pd.Series(list(Y_padded.sum()))

In [6]:
repeated_feature_matrix = []
for idx, row in feature_matrix.iterrows():
  for j in range(99):
    repeated_feature_matrix.append(row)
repeated_feature_matrix = np.array(repeated_feature_matrix)
print(repeated_feature_matrix.shape)

(333927, 90)


In [15]:
data = pd.concat([X_padded, Y_padded, pd.DataFrame(feature_matrix)], axis=1)
data.columns = ['X', 'Y']+list(data.columns[2:])
data_train, data_test = train_test_split(data, test_size=0.2)
data_train, data_val = train_test_split(data_train, test_size=0.2)
X_train = data_train['X']
Y_train = data_train['Y']
F_train = data_train.drop(['X', 'Y'], axis=1)
X_val = data_val['X']
Y_val = data_val['Y']
F_val = data_val.drop(['X', 'Y'], axis=1)
X_test = data_test['X']
Y_test = data_test['Y']
F_test = data_test.drop(['X', 'Y'], axis=1)
print(X_train.shape, Y_train.shape, F_train.shape)
print(X_val.shape, Y_val.shape, F_val.shape)
print(X_test.shape, Y_test.shape, F_test.shape)

(2158,) (2158,) (2158, 90)
(540,) (540,) (540, 90)
(675,) (675,) (675, 90)


In [16]:
X_train_char = pd.Series(list(X_train.sum()))
Y_train_char = pd.Series(list(Y_train.sum()))
repeated_feature_matrix_train = []
for idx, row in F_train.iterrows():
  for j in range(99):
    repeated_feature_matrix_train.append(row)
repeated_feature_matrix_train = np.array(repeated_feature_matrix_train)
print(repeated_feature_matrix_train.shape)

(213642, 90)


In [19]:
X_val_char = pd.Series(list(X_val.sum()))
Y_val_char = pd.Series(list(Y_val.sum()))
repeated_feature_matrix_val = []
for idx, row in F_val.iterrows():
  for j in range(99):
    repeated_feature_matrix_val.append(row)
repeated_feature_matrix_val = np.array(repeated_feature_matrix_val)
print(repeated_feature_matrix_val.shape)

(53460, 90)


In [20]:
X_test_char = pd.Series(list(X_test.sum()))
Y_test_char = pd.Series(list(Y_test.sum()))
repeated_feature_matrix_test = []
for idx, row in F_test.iterrows():
  for j in range(99):
    repeated_feature_matrix_test.append(row)
repeated_feature_matrix_test = np.array(repeated_feature_matrix_test)
print(repeated_feature_matrix_test.shape)

(66825, 90)


In [11]:
vocab = list(set(list(list(X_train) + list(Y_train) + list(X_val))))
vocab_size = len(vocab)
vocab_size

38

In [0]:
list(X_train)

In [0]:
X_train_idx = []
for row in X_train_char:
  row_idx = vocab.index(row)
  X_train_idx.append(row_idx)
Y_train_idx = []
for row in Y_train_char:
  row_idx = vocab.index(row)
  Y_train_idx.append(row_idx)
X_val_idx = []
for row in X_val_char:
  row_idx = vocab.index(row)
  X_val_idx.append(row_idx)
Y_val_idx = []
for row in Y_val_char:
  row_idx = vocab.index(row)
  Y_val_idx.append(row_idx)
X_test_idx = []
for row in X_test_char:
  row_idx = vocab.index(row)
  X_test_idx.append(row_idx)
Y_test_idx = []
for row in Y_test_char:
  row_idx = vocab.index(row)
  Y_test_idx.append(row_idx)

In [25]:
X_train_onehot = to_categorical(X_train_idx)
X_val_onehot = to_categorical(X_val_idx)
X_test_onehot = to_categorical(X_test_idx)

Y_train_onehot = to_categorical(Y_train_idx)
Y_val_onehot = to_categorical(Y_val_idx)
Y_test_onehot = to_categorical(Y_test_idx)
print(X_train_onehot.shape, Y_train_onehot.shape)
print(X_val_onehot.shape, Y_val_onehot.shape)
print(X_test_onehot.shape, Y_test_onehot.shape)

(213642, 38) (213642, 38)
(53460, 38) (53460, 38)
(66825, 38) (66825, 38)


In [31]:
X_F_train = np.concatenate([X_train_onehot, repeated_feature_matrix_train], axis=1)
X_F_val = np.concatenate([X_val_onehot, repeated_feature_matrix_val], axis=1)
X_F_test = np.concatenate([X_test_onehot, repeated_feature_matrix_test], axis=1)
print(X_F_train.shape, Y_train_onehot.shape)
print(X_F_val.shape, Y_val_onehot.shape)
print(X_F_test.shape, Y_test_onehot.shape)

(213642, 128) (213642, 38)
(53460, 128) (53460, 38)
(66825, 128) (66825, 38)


In [43]:
encoder_inputs_train = X_F_train[:,np.newaxis,:]
decoder_inputs_train = X_train_onehot[:,np.newaxis,:]
decoder_outputs_train = Y_train_onehot[:,np.newaxis,:]
encoder_inputs_val = X_F_val[:,np.newaxis,:]
decoder_inputs_val = X_val_onehot[:,np.newaxis,:]
decoder_outputs_val = Y_val_onehot[:,np.newaxis,:]
encoder_inputs_test = X_F_test[:,np.newaxis,:]
decoder_inputs_test = X_test_onehot[:,np.newaxis,:]
decoder_outputs_test = Y_test_onehot[:,np.newaxis,:]
print(encoder_inputs_train.shape, decoder_inputs_train.shape, decoder_outputs_train.shape)
print(encoder_inputs_val.shape, decoder_inputs_val.shape, decoder_outputs_val.shape)
print(encoder_inputs_test.shape, decoder_inputs_test.shape, decoder_outputs_test.shape)

(213642, 1, 128) (213642, 1, 38) (213642, 1, 38)
(53460, 1, 128) (53460, 1, 38) (53460, 1, 38)
(66825, 1, 128) (66825, 1, 38) (66825, 1, 38)


In [0]:
latent_dim = 64
num_decoder_tokens = 38
max_decoder_seq_length = 99

In [44]:
encoder_inputs = Input(shape=(None, 128))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(None, 38))
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(38, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            (None, None, 128)    0                                            
__________________________________________________________________________________________________
input_10 (InputLayer)           (None, None, 38)     0                                            
__________________________________________________________________________________________________
lstm_9 (LSTM)                   [(None, 64), (None,  49408       input_9[0][0]                    
__________________________________________________________________________________________________
lstm_10 (LSTM)                  [(None, None, 64), ( 26368       input_10[0][0]                   
                                                                 lstm_9[0][1]               

In [65]:
optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=[metrics.categorical_accuracy])
checkpoint = ModelCheckpoint("model", monitor='val_categorical_accuracy', verbose=1, save_best_only=True, save_weights_only=False, mode='auto', period=5)
history = model.fit([encoder_inputs_train, decoder_inputs_train], decoder_outputs_train,
          batch_size=512,
          epochs=100,
          validation_data=([encoder_inputs_val, decoder_inputs_val], decoder_outputs_val),
          callbacks=[checkpoint])

Train on 213642 samples, validate on 53460 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100

Epoch 00005: val_categorical_accuracy improved from -inf to 0.73827, saving model to model
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100

Epoch 00010: val_categorical_accuracy did not improve from 0.73827
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100

Epoch 00015: val_categorical_accuracy improved from 0.73827 to 0.73835, saving model to model
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100

Epoch 00020: val_categorical_accuracy improved from 0.73835 to 0.73838, saving model to model
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100

Epoch 00025: val_categorical_accuracy improved from 0.73838 to 0.73906, saving model to model
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100

Epoch 00030: val_categorical_accuracy improved from 0.73906 to 0.73979, saving model to model
Epoch 31/100
Epoch 32/100
Ep

In [0]:
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

In [63]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, vocab.index('%')] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = vocab[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '^' or
           len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [h, c]

    return decoded_sentence

start_char = np.zeros(38)
start_char[vocab.index('%')] = 1
start_seq = np.concatenate([start_char[np.newaxis,:], repeated_feature_matrix_train[0][np.newaxis,:]], axis=1)[np.newaxis,:,:]
decode_sequence(start_seq)

'CCCCCCCCOCOCCCCCCCCCCC@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'