In [1]:
import re
import numpy as np
import pandas as pd

import torch
from torch import nn
import torch.nn.functional as F

import warnings
# ignore some deprecation warnings
warnings.filterwarnings('ignore')

# Load main data set

In [2]:
df = pd.read_csv('retrosynthesis-all', header=None)
df['source'] = df[0].apply(lambda x: x.split('>>')[0])
df['target'] = df[0].apply(lambda x: x.split('>>')[1])
df.drop(0, axis=1, inplace=True)
df.head()

Unnamed: 0,source,target
0,O=C1CC[C@H](CN2CCN(CCOc3cc4ncnc(Nc5ccc(F)c(Cl)...,CS(=O)(=O)OC[C@H]1CCC(=O)O1.Fc1ccc(Nc2ncnc3cc...
1,Nc1nc2[nH]c(CCCc3csc(C(=O)O)c3)cc2c(=O)[nH]1,COC(=O)c1cc(CCCc2cc3c(=O)[nH]c(N)nc3[nH]2)cs1
2,CC1(C)OB(c2cccc(Nc3nccc(C(F)(F)F)n3)c2)OC1(C)C,CC1(C)OB(B2OC(C)(C)C(C)(C)O2)OC1(C)C.FC(F)(F)...
3,CC(C)(C)OC(=O)NCC(=O)CCC(=O)OCCCC(=O)O,CC(C)(C)OC(=O)NCC(=O)CCC(=O)OCCCC(=O)OCc1ccccc1
4,Fc1cc2c(NC3CCCCCC3)ncnc2cn1,Fc1cc2c(Cl)ncnc2cn1.NC1CCCCCC1


# SMILES Vocabulary: How is it generated?

The model's Vocabulary handles the transformation of SMILES strings into a sequence of tokens. Tokens are the pre-defined lowest and indivisible unit of string text. In Natural Language Processing (NLP), tokens are typically defined on the word or character level. The level of tokenization dictates *what* the model can output, e.g., if tokenization on the character level is used, then the model outputs individual characters.

For generative SMILES models, tokenization is performed on the character level where each token *loosely* maps to a unique atom type (brackets, "(" for example indicate branching and thus, do not map to an atom but rather gives connectivity information).


In [3]:
import sys
sys.path.append('src/')
from smiles_lstm.model.smiles_vocabulary import SMILESTokenizer, Vocabulary, create_vocabulary

tk = SMILESTokenizer()
vocab = Vocabulary()

# create a vocabulary using all SMILES in df
smiles_dataset = df['source'].unique().tolist()+ df['target'].unique().tolist()
smiles_dataset = np.unique(smiles_dataset).tolist()

vocabulary = create_vocabulary(smiles_list=smiles_dataset, tokenizer=tk)
print(f'There are {len(vocabulary)} unique tokens in the vocabulary.\n')

There are 86 unique tokens in the vocabulary.



# RNN section

This section describes *how* the numerical representation of tokens are transformed into an input vector known as the embedding that will act as the input to the RNN.

An Embedding Layer is essentially a look-up table. In the constructor above, `num_embeddings` refers to the Vocabulary size. 

`num_embeddings` denotes how many vectors to initialize. 

Since we have n unique tokens, we need _ different vectors: 1 for each unique token. This is why `num_embeddings` is _ in this example. `embedding_dim` denotes the dimension of the embedding vector. 5 is arbitrarily chosen here just for easy visualization.



In [21]:
# construct an "Embedding layer"
EMBEDDING_DIM = 5
NUM_EMBEDDING = len(vocabulary)

embedding_layer = nn.Embedding(num_embeddings=NUM_EMBEDDING,
                               embedding_dim=EMBEDDING_DIM)

# only 1 layer of LSTM cells is initialized here for the sake of illustration
# input_size = 5 because we previously defined the "embedding_dim" of the Embedding layer to be 5
# hidden_size = 5 is arbitrarily chosen for easy visualization
recurrent_layer = nn.LSTM(input_size=EMBEDDING_DIM,
                          hidden_size=5,
                          num_layers=1,
                          dropout=0,
                          batch_first=True)

# Train / validation / test split

In [24]:
from sklearn.model_selection import train_test_split

print(df.shape)

# Splitting the data into train and combined val/test sets
train_data, val_test_data = train_test_split(df, test_size=0.10, random_state=42)

# Splitting the combined val/test set into separate val and test sets
val_data, test_data = train_test_split(val_test_data, test_size=0.2, random_state=42)

# Printing the sizes of the resulting splits
print("Train data size:", len(train_data))
print("Validation data size:", len(val_data))
print("Test data size:", len(test_data))

(45033, 2)
Train data size: 40529
Validation data size: 3603
Test data size: 901


# Build the NMT 

In [71]:
import sys
import pandas as pd
sys.path.append('src/')
import argparse
from pathlib import Path
from smiles_lstm.model.smiles_lstm import SmilesLSTM
from smiles_lstm.model.smiles_trainer import SmilesTrainer
from smiles_lstm.model.smiles_vocabulary import SMILESTokenizer, create_vocabulary
from smiles_lstm.utils import load
from smiles_lstm.utils.misc import suppress_warnings
from torch.nn.utils.rnn import pad_sequence
import string

In [79]:
train     = train_data.copy()
test      = test_data.copy()
valid     = val_data.copy()

# create a vocabulary using all SMILES in df
dataset = df['source'].unique().tolist()+ df['target'].unique().tolist()
dataset = np.unique(dataset).tolist()

tokenizer = SMILESTokenizer()
vocab     = create_vocabulary(smiles_list=dataset,
                                    tokenizer=tokenizer,
                                    canonical=False)

MAX_LENGTH = max(len(v) for v in dataset)

print(f'There are {len(vocabulary)} unique tokens in the vocabulary.\n')
print(f'Max length: {MAX_LENGTH}.\n')

There are 86 unique tokens in the vocabulary.

Max length: 198.



### Function for pad sequencing

In [82]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def pad_sequence(tokenizer_array, desired_length):
    padded_sequence = pad_sequences([tokenizer_array], maxlen=desired_length, padding='post')[0]
    return padded_sequence

#### Tokenize and pad sequencing datasets

In [83]:
for d in [train, test, valid]:
    for c in d.columns:
        d[c] = d[c].apply(lambda x: tk.tokenize(x, with_begin_and_end=False))
        d[c] = d[c].apply(lambda x: vocabulary.encode(x).astype(int))
        d[c] = d[c].apply(lambda x: pad_sequence(x, MAX_LENGTH))

# Model Building

In [84]:
# Convert the source and target columns into numpy arrays
trainX = np.array(train['source'].tolist())
trainY = np.array(train['target'].tolist())

print(trainX.shape, trainY.shape)

(40529, 198) (40529, 198)


In [85]:
trainX = torch.LongTensor(trainX)
trainY = torch.LongTensor(trainY)

tensor = trainX[0]
tensor = torch.reshape(tensor, (1, trainX.shape[1]))

embedding = embedding_layer(tensor)

embedding_layer = nn.Embedding(num_embeddings=NUM_EMBEDDING,
                               embedding_dim=5)
recurrent_layer = nn.LSTM(input_size=5,
                          hidden_size=5,
                          num_layers=1,
                          dropout=0,
                          batch_first=True)

embedding = embedding_layer(tensor)
# let's run the embedding through the recurrent layer
rnn_output, (hidden_state, cell_state) = recurrent_layer(embedding)

# initialize the linear layer
# in_features = 5 as that is the hidden_size defined in the recurrent layer above
# out_features = 20 as that is the size of the Vocabulary
linear_layer = nn.Linear(in_features=5,
                         out_features=NUM_EMBEDDING)

linear_output = linear_layer(rnn_output)
softmax = linear_output.softmax(dim=2)
log_softmax = linear_output.log_softmax(dim=2)

In [87]:
print(tensor.shape)
print(log_softmax.sum(dim=2).shape)

torch.Size([1, 198])
torch.Size([1, 198])


In [95]:
most_probable_tokens = log_softmax.argmax(dim=2).flatten().tolist()

# we now extract the max value in each tensor of the log-softmax output above and the corresponding token
for idx, (correct_token, most_probable_token) in enumerate(zip(smiles, most_probable_tokens)):
    print(f"At time step {idx+1}, the generative model proposes {vocabulary.tokens()[most_probable_token]} as the most probable token and the correct token is {correct_token}")

At time step 1, the generative model proposes ( as the most probable token and the correct token is O
At time step 2, the generative model proposes [N-] as the most probable token and the correct token is =
At time step 3, the generative model proposes [N-] as the most probable token and the correct token is C
At time step 4, the generative model proposes 1 as the most probable token and the correct token is (
At time step 5, the generative model proposes ( as the most probable token and the correct token is O
At time step 6, the generative model proposes ( as the most probable token and the correct token is )
At time step 7, the generative model proposes [N-] as the most probable token and the correct token is C
At time step 8, the generative model proposes [N-] as the most probable token and the correct token is N
At time step 9, the generative model proposes [N-] as the most probable token and the correct token is C
At time step 10, the generative model proposes 1 as the most probab

## Make the model learn

In [None]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding

# Define the input shape
input_shape = (trainX.shape[1], 1)  # Assuming you want to feed one feature at a time

# Build the LSTM model
model = Sequential()
model.add(Embedding(input_dim=NUM_EMBEDDING, output_dim=EMBEDDING_DIM, input_length=MAX_LENGTH))
model.add(LSTM(units=128, input_shape=input_shape))
model.add(Dense(units=trainY.shape[1], activation='log_softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(trainX, trainY, epochs=10, batch_size=2048)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fad2056b9a0>

In [None]:
pred = model.predict(trainY)
pred = pd.DataFrame(pred)
pred

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,188,189,190,191,192,193,194,195,196,197
0,-5.624979,-31.864088,-35.137905,-32.202782,-35.519382,-34.402882,-36.823887,-35.278248,-35.409874,-36.401161,...,-4.857732,-4.775615,-4.705916,-4.737956,-4.692621,-4.749136,-4.767376,-4.761227,-4.735643,-4.782259
1,-5.624979,-31.864088,-35.137905,-32.202782,-35.519382,-34.402882,-36.823887,-35.278248,-35.409874,-36.401161,...,-4.857732,-4.775615,-4.705916,-4.737956,-4.692621,-4.749136,-4.767376,-4.761227,-4.735643,-4.782259
2,-5.624979,-31.864088,-35.137905,-32.202782,-35.519382,-34.402882,-36.823887,-35.278248,-35.409874,-36.401161,...,-4.857732,-4.775615,-4.705916,-4.737956,-4.692621,-4.749136,-4.767376,-4.761227,-4.735643,-4.782259
3,-5.624979,-31.864088,-35.137905,-32.202782,-35.519382,-34.402882,-36.823887,-35.278248,-35.409874,-36.401161,...,-4.857732,-4.775615,-4.705916,-4.737956,-4.692621,-4.749136,-4.767376,-4.761227,-4.735643,-4.782259
4,-5.624979,-31.864088,-35.137905,-32.202782,-35.519382,-34.402882,-36.823887,-35.278248,-35.409874,-36.401161,...,-4.857732,-4.775615,-4.705916,-4.737956,-4.692621,-4.749136,-4.767376,-4.761227,-4.735643,-4.782259
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40524,-5.624981,-31.864100,-35.137920,-32.202793,-35.519398,-34.402893,-36.823902,-35.278263,-35.409885,-36.401176,...,-4.857732,-4.775614,-4.705914,-4.737955,-4.692621,-4.749136,-4.767377,-4.761227,-4.735642,-4.782259
40525,-5.624981,-31.864100,-35.137920,-32.202793,-35.519398,-34.402893,-36.823902,-35.278263,-35.409885,-36.401176,...,-4.857732,-4.775614,-4.705914,-4.737955,-4.692621,-4.749136,-4.767377,-4.761227,-4.735642,-4.782259
40526,-5.624981,-31.864100,-35.137920,-32.202793,-35.519398,-34.402893,-36.823902,-35.278263,-35.409885,-36.401176,...,-4.857732,-4.775614,-4.705914,-4.737955,-4.692621,-4.749136,-4.767377,-4.761227,-4.735642,-4.782259
40527,-5.624981,-31.864100,-35.137920,-32.202793,-35.519398,-34.402893,-36.823902,-35.278263,-35.409885,-36.401176,...,-4.857732,-4.775614,-4.705914,-4.737955,-4.692621,-4.749136,-4.767377,-4.761227,-4.735642,-4.782259


In [None]:
tk.untokenize(vocab.decode(testY[0]))

' C=C(C)C(=O)Cl.CCCCCOC(CN)OCCCCC'

In [None]:
pred

array([[0.99877197, 0.9998901 , 0.9999237 , ..., 0.01189647, 0.00572667,
        0.00950348],
       [0.99877197, 0.9998901 , 0.9999237 , ..., 0.01189647, 0.00572667,
        0.00950348],
       [0.99877197, 0.9998901 , 0.9999237 , ..., 0.01189647, 0.00572667,
        0.00950348],
       ...,
       [0.99877197, 0.9998901 , 0.9999237 , ..., 0.01189647, 0.00572667,
        0.00950348],
       [0.99877197, 0.9998901 , 0.9999237 , ..., 0.01189647, 0.00572667,
        0.00950348],
       [0.99877197, 0.9998901 , 0.9999237 , ..., 0.01189647, 0.00572667,
        0.00950348]], dtype=float32)

In [None]:
tk.untokenize(vocab.decode(pred[0]))

KeyError: 0.99877197

In [None]:
def get_word(n, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == n:
            return word
    return None

