In [1]:
import keras_nlp
import tensorflow
import keras_tuner
import keras
import pandas as pd
from keras import layers

2024-08-23 08:37:38.970787: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-23 08:37:38.986035: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-23 08:37:38.990578: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-23 08:37:39.002222: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_

In [2]:
#Fixed parameters
PROCESSING_FIXED = {'start_char': "<s>",
                    'end_char': "</s>", 
                    'pad_char': "<pad>",
                    'mas_char': "<mask>"}

INDICES_TOKEN = {0: 'c',
                 1: 'C',
                 2: '(',
                 3: ')',
                 4: 'O',
                 5: '1',
                 6: '2',
                 7: '=',
                 8: 'N',
                 9: '@',
                 10: '[',
                 11: ']',
                 12: 'n',
                 13: '3',
                 14: 'H',
                 15: 'F',
                 16: '4',
                 17: '-',
                 18: 'S',
                 19: 'Cl',
                 20: '/',
                 21: 's',
                 22: 'o',
                 23: '5',
                 24: '+',
                 25: '#',
                 26: '\\',
                 27: 'Br',
                 28: 'P',
                 29: '6',
                 30: 'I',
                 31: '7',
                 32: PROCESSING_FIXED['start_char'],
                 33: PROCESSING_FIXED['end_char'],
                 34: PROCESSING_FIXED['pad_char'],
                 35: PROCESSING_FIXED['mas_char']}                
TOKEN_INDICES = {v: k for k, v in INDICES_TOKEN.items()}

In [3]:
tokenizer = keras_nlp.models.RobertaTokenizer(vocabulary=TOKEN_INDICES, merges=['C l', 'B r'])


2024-08-23 08:37:46.532151: I tensorflow/core/common_runtime/gpu/gpu_device.cc:2021] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6187 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:65:00.0, compute capability: 7.5


In [4]:

preprocessor = keras_nlp.models.RobertaMaskedLMPreprocessor(
    tokenizer,
    sequence_length=90,
    mask_selection_rate=0.15,
    mask_selection_length=16,
    mask_token_rate=0.8,
    random_token_rate=0
)

In [5]:
class RobertaHyperModel(keras_tuner.HyperModel):
    def __init__(self, vocabulary_size, max_sequence_length, preprocessor):
        self.vocabulary_size = vocabulary_size
        self.max_sequence_length = max_sequence_length
        self.preprocessor = preprocessor

    def build(self, hp):
        hidden_dim = hp.Int('hidden_dim', min_value=128, max_value=512, step=32)
        num_heads = hp.Int('num_heads', min_value=2, max_value=8, step=1)
        dropout_rate = hp.Float('dropout_rate', min_value=0.1, max_value=0.3, step=0.05)
        intermediate_dim = hp.Int('intermediate_dim', min_value=256, max_value=1024, step=64)
        num_layers = hp.Int('num_layers', min_value=2, max_value=6, step=1)
        learning_rate = hp.Float('learning_rate', min_value=1e-5, max_value=5e-5, sampling='log')

        backbone = keras_nlp.models.RobertaBackbone(
            vocabulary_size=self.vocabulary_size,
            num_layers=num_layers,
            num_heads=num_heads,
            hidden_dim=hidden_dim,
            intermediate_dim=intermediate_dim,
            max_sequence_length=self.max_sequence_length,
            dropout=dropout_rate,
        )
        model = keras_nlp.models.RobertaMaskedLM(
            backbone=backbone,
            preprocessor=self.preprocessor,
        )
        # dropouts?
        #model.add(layers.Dropout(rate=dropout_rate))
        model.compile(
            optimizer=tensorflow.keras.optimizers.Adam(learning_rate=learning_rate),
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        return model

In [6]:
tuner = keras_tuner.RandomSearch(
    hypermodel=RobertaHyperModel(
        vocabulary_size=36,
        max_sequence_length=90,
        preprocessor=preprocessor,
    ),
    objective="val_loss",
    max_trials=20,
    executions_per_trial=1,
    overwrite=True,
    directory="hyper_tuning",
    project_name="roberta_smiles",
)

In [7]:
tuner.search_space_summary()


Search space summary
Default search space size: 6
hidden_dim (Int)
{'default': None, 'conditions': [], 'min_value': 128, 'max_value': 512, 'step': 32, 'sampling': 'linear'}
num_heads (Int)
{'default': None, 'conditions': [], 'min_value': 2, 'max_value': 8, 'step': 1, 'sampling': 'linear'}
dropout_rate (Float)
{'default': 0.1, 'conditions': [], 'min_value': 0.1, 'max_value': 0.3, 'step': 0.05, 'sampling': 'linear'}
intermediate_dim (Int)
{'default': None, 'conditions': [], 'min_value': 256, 'max_value': 1024, 'step': 64, 'sampling': 'linear'}
num_layers (Int)
{'default': None, 'conditions': [], 'min_value': 2, 'max_value': 6, 'step': 1, 'sampling': 'linear'}
learning_rate (Float)
{'default': 1e-05, 'conditions': [], 'min_value': 1e-05, 'max_value': 5e-05, 'step': None, 'sampling': 'log'}


In [8]:
#load the data from data/us_pharma_patent_data_lowe_smiles_can_unique_stereochem.txt
data = pd.read_csv('data/us_pharma_patent_data_lowe_smiles_can_unique_stereochem.txt', sep='\t', header=None)
#Remember to drop missing values and duplicates
data = data.dropna().drop_duplicates()
#Also, remove any smiles string that contains a character NOT in our vocabulary (excluding pad, start and end chars). Hint: allowed_chars = [t for t in TOKEN_INDICES.keys()][:-3]
allowed_chars = [t for t in TOKEN_INDICES.keys()][:-3]
data = data[data[0].apply(lambda x: all(char in allowed_chars for char in x))]
#drop data longer than 90 characters
data = data[data[0].apply(lambda x: len(x)<=90)]
#Split the data into train and test sets with a 80/20 split. Don't forget to reset the index of the dataframes before splitting, so then we can use the train.index and test.index to create the generators
data = data.reset_index(drop=True)
#train_data, test_data = train_test_split(data, test_size=0.2)

In [9]:
import numpy as np
train_idx = np.loadtxt('data/train_data_idx.txt')
test_idx = np.loadtxt('data/test_data_idx.txt')

In [10]:
train_data = data[0].iloc[train_idx]
test_data = data[0].iloc[test_idx]

In [None]:
tuner.search(x=train_data, validation_data=test_data, epochs=10, callbacks=[keras.callbacks.TensorBoard("tensorboard/tb_logs")])

Trial 18 Complete [02h 03m 03s]
val_loss: 0.717340350151062

Best val_loss So Far: 0.45886924862861633
Total elapsed time: 21h 56m 20s

Search: Running Trial #19

Value             |Best Value So Far |Hyperparameter
288               |512               |hidden_dim
3                 |6                 |num_heads
0.15              |0.2               |dropout_rate
832               |256               |intermediate_dim
3                 |4                 |num_layers
1.445e-05         |2.3983e-05        |learning_rate

Epoch 1/10
[1m 9758/15288[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m1:36[0m 17ms/step - accuracy: 0.1198 - loss: 1.4357 - sparse_categorical_accuracy: 0.2660

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[1m15288/15288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m312s[0m 20ms/step - accuracy: 0.3684 - loss: 0.8711 - sparse_categorical_accuracy: 0.4341 - val_accuracy: 0.4373 - val_loss: 0.8347 - val_sparse_categorical_accuracy: 0.4484
Epoch 4/10
[1m 3698/15288[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m3:37[0m 19ms/step - accuracy: 0.3700 - loss: 0.8346 - sparse_categorical_accuracy: 0.4475

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[1m15288/15288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m311s[0m 20ms/step - accuracy: 0.4908 - loss: 0.7890 - sparse_categorical_accuracy: 0.4806 - val_accuracy: 0.4878 - val_loss: 0.7412 - val_sparse_categorical_accuracy: 0.5134
Epoch 7/10
[1m 3420/15288[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m3:41[0m 19ms/step - accuracy: 0.4802 - loss: 0.7581 - sparse_categorical_accuracy: 0.5016

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[1m15288/15288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m311s[0m 20ms/step - accuracy: 0.5004 - loss: 0.7093 - sparse_categorical_accuracy: 0.5273 - val_accuracy: 0.5114 - val_loss: 0.6401 - val_sparse_categorical_accuracy: 0.5581
Epoch 10/10
[1m 3606/15288[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m3:39[0m 19ms/step - accuracy: 0.4670 - loss: 0.6828 - sparse_categorical_accuracy: 0.5482

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[1m15288/15288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m270s[0m 18ms/step - accuracy: 0.1568 - loss: 0.9242 - sparse_categorical_accuracy: 0.3108 - val_accuracy: 0.4349 - val_loss: 0.8502 - val_sparse_categorical_accuracy: 0.3384
Epoch 3/10
[1m 6086/15288[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m2:25[0m 16ms/step - accuracy: 0.4407 - loss: 0.8610 - sparse_categorical_accuracy: 0.3449

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[1m12841/15288[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m38s[0m 16ms/step - accuracy: 0.5770 - loss: 0.7531 - sparse_categorical_accuracy: 0.5004

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[1m15288/15288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m267s[0m 17ms/step - accuracy: 0.5690 - loss: 0.6571 - sparse_categorical_accuracy: 0.5675 - val_accuracy: 0.5812 - val_loss: 0.5493 - val_sparse_categorical_accuracy: 0.6379
Epoch 10/10
[1m 1740/15288[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m3:34[0m 16ms/step - accuracy: 0.5604 - loss: 0.6233 - sparse_categorical_accuracy: 0.5955

In [None]:
best_model = tuner.get_best_models(num_models=1)

In [None]:
best_model[0].summary(expand_nested=True)

In [None]:
best_model[0].get_config()

In [None]:
best_model[0].save('best_roberta.keras')