In [51]:
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense, LSTM, TimeDistributed, BatchNormalization, Dropout, Reshape
from keras.optimizers import Adam
from keras.models import Sequential, load_model, Model
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
import time
import pandas as pd 

In [59]:
class CLM():
    def __init__(self, layers, n_chars, dropouts, trainables,lr):
        self.layers = layers
        self.n_chars = n_chars
        self.dropouts = dropouts
        self.trainables = trainables
        self.lr = lr
        self.model = None
        self.build_model()
    def build_model(self):
        # create a squential model and assigned to self.model
        self.model = Sequential()
        # Add a BatchNormalization layer input layer, input shape should be None, number of characters
        self.model.add(BatchNormalization(input_shape = (None,self.n_chars)))
        
        # Add n LSTM layers, where n is defined by the variable layers. This will be a list of ints where each int is the number of units in the LSTM layer. i.e [256, 512] will add two layers with 256 and 512 neurons respectively
        # This loop should also include the dropout values contained in the dropouts variable (list of floats). 
        # And it should also include the trainable parameter from the trainables variable (list of booleans). 
        print(self.layers)
        for i in range(len(self.layers)):
            print(self.layers[i])
            self.model.add(LSTM(units=self.layers[i], trainable=self.trainables[i]))
            
            self.model.add(Dropout(self.dropouts[i]))
        # add another BatchNormalization layer, this time no need to specify input shape
        self.model.add(BatchNormalization())
        # Finally add an output layer, it should be a Dense layer with n_chars units and softmax activation. Also, this layer should be containd in a TimeDistributed layer, so that it can be applied to each character in the sequence.
        self.model.add(TimeDistributed(Dense(self.n_chars, activation='softmax')))
        # compile the model with Adam optimizer and categorical_crossentropy loss. Don't forget to set the learning rate to the value contained in the lr variable with optimizer = Adam(learning_rate=self.lr)
        optimizer = Adam(learning_rate=self.lr)
        self.model.compile(optimizer=optimizer, loss='categorical_crossentropy')

In [53]:
#Create a checkpointer callback that saves the model weights to a file every epoch. The filename should be '{epoch:03d}.weights.h5'
checkpointer = ModelCheckpoint(
    filepath='{epoch:03d}.weights.h5',
    save_weights_only=True,
    save_freq='epoch'
)

In [54]:
#Import the generators
from data_gen import onehotencoder, DataGenerator, TOKEN_INDICES

In [55]:
#load the data from data/us_pharma_patent_data_lowe_smiles_can_unique_stereochem.txt
data = pd.read_csv('data/us_pharma_patent_data_lowe_smiles_can_unique_stereochem.txt', sep='\t', header=None)
#Remember to drop missing values and duplicates
data = data.dropna().drop_duplicates()
#Also, remove any smiles string that contains a character NOT in our vocabulary (excluding pad, start and end chars). Hint: allowed_chars = [t for t in TOKEN_INDICES.keys()][:-3]
allowed_chars = [t for t in TOKEN_INDICES.keys()][:-3]
data = data[data[0].apply(lambda x: all(char in allowed_chars for char in x))]
print(data.head())
#Split the data into train and test sets with a 80/20 split. Don't forget to reset the index of the dataframes before splitting, so then we can use the train.index and test.index to create the generators
data = data.reset_index(drop=True)
train_data, test_data = train_test_split(data, test_size=0.2)


                                                   0
1  COc1ccc2c(c1)CC[C@@H]1[C@@H]2CC[C@]2(C)[C@H](O...
2                  CC1(C)COP(=O)(O)OC1c1ccc2ccccc2c1
3  Nc1cc(F)ccc1C(=O)NCCN1CCC2(CC1)C(=O)NCN2c1ccc(...
4  COc1ccccc1-c1ccc(C(=O)OC(C)(C)C)c(NC(=O)c2cncc...
5                COc1ccccc1-c1cc(NC(=O)C2CCNCC2)ncn1


In [56]:
#Initialize the generators, we need one for training and one for validation. Use batch size of 256 for both. Remember to pass all the data to them, but the corresponding indices. Also the data should be passed as a list of strings, so don't forget to use the .tolist() method on the dataframe column
train_smiles = train_data[0].tolist()
test_smiles = test_data[0].tolist()
#confused about variables
train_generator = DataGenerator(
    data = train_smiles,
    max_len = 90,
    batch_size=256,
    list_IDs=train_data.index,
    num_chars=35
)

validation_generator = DataGenerator(
    data = test_smiles,
    max_len = 90,
    batch_size=256,
    list_IDs=test_data.index,
    num_chars=35
)

In [57]:
#Here are some predefine hyperparameters (taken from the paper which are already optimize, lucky us!)
layers = [1024,256]
dropouts = [0.40, 0.40]
trainables = [True, True]
lr = 0.001
epochs = 2

In [60]:
#Create the model object from the CLM class
chem_model = CLM(
    layers=layers,
    dropouts=dropouts,
    trainables=trainables,
    lr=lr,
    n_chars=35
)
#not sure why I am getting dimension error?

[1024, 256]
1024
256


ValueError: Input 0 of layer "lstm_13" is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 1024)

In [None]:
#Do training! (pass the generators instead of the data directly) 
chem_model.fit(train_generator, validation_generator, epochs=2, callbacks=[checkpointer])