In [20]:
import keras_nlp
import tensorflow
import keras_tuner
import keras
from keras import layers

In [21]:
# Fixed parameters: special marker tokens, looked up by role.
PROCESSING_FIXED = dict(
    start_char="<s>",
    end_char="</s>",
    pad_char="<pad>",
    mas_char="<mask>",
)

# Index -> token table. A token's position in the list below is its index;
# the four special tokens occupy the last four slots (32-35).
INDICES_TOKEN = dict(enumerate([
    'c', 'C', '(', ')', 'O', '1', '2', '=',        # 0-7
    'N', '@', '[', ']', 'n', '3', 'H', 'F',        # 8-15
    '4', '-', 'S', 'Cl', '/', 's', 'o', '5',       # 16-23
    '+', '#', '\\', 'Br', 'P', '6', 'I', '7',      # 24-31
    PROCESSING_FIXED['start_char'],                # 32
    PROCESSING_FIXED['end_char'],                  # 33
    PROCESSING_FIXED['pad_char'],                  # 34
    PROCESSING_FIXED['mas_char'],                  # 35
]))

# Reverse lookup: token -> index.
TOKEN_INDICES = {token: index for index, token in INDICES_TOKEN.items()}


In [22]:
# Tokenizer built on the fixed token -> index table; the merges list covers
# the two-character vocabulary entries 'Cl' and 'Br'.
tokenizer = keras_nlp.models.RobertaTokenizer(
    vocabulary=TOKEN_INDICES,
    merges=['C l', 'B r'],
)

In [23]:
# Masked-LM preprocessor wrapping the tokenizer above. sequence_length (90)
# is the same value given to the backbone as max_sequence_length.
preprocessor = keras_nlp.models.RobertaMaskedLMPreprocessor(
    tokenizer=tokenizer,
    sequence_length=90,
    mask_selection_rate=0.15,
    mask_selection_length=16,
    mask_token_rate=0.8,
    random_token_rate=0,
)

In [24]:
# Baseline backbone with hand-picked (untuned) hyperparameters.
# vocabulary_size=36 equals the number of entries in INDICES_TOKEN.
# No dropout argument is passed, so the library default applies.
back = keras_nlp.models.RobertaBackbone(
    vocabulary_size=36,
    max_sequence_length=90,
    hidden_dim=256,
    intermediate_dim=512,
    num_heads=4,
    num_layers=4,
)

In [8]:
# Baseline masked-LM task: fixed backbone plus the shared preprocessor.
model = keras_nlp.models.RobertaMaskedLM(
    backbone=back,
    preprocessor=preprocessor,
)

In [9]:
# Display the summary of the baseline masked-LM model.
model.summary()

In [13]:
#model.fit()

In [27]:
class RobertaHyperModel(keras_tuner.HyperModel):
    """Keras Tuner hypermodel that builds and compiles a RoBERTa masked-LM.

    Each call to ``build`` samples the architecture and optimizer
    hyperparameters from ``hp``, constructs a ``RobertaBackbone`` wrapped in a
    ``RobertaMaskedLM`` task, compiles it, and returns it.

    Args:
        vocabulary_size: Size of the token vocabulary given to the backbone.
        max_sequence_length: Maximum sequence length given to the backbone.
        preprocessor: Masked-LM preprocessor attached to every built model.
        name: Optional hypermodel name, forwarded to ``keras_tuner.HyperModel``.
        tunable: Forwarded to ``keras_tuner.HyperModel``.
    """

    def __init__(self, vocabulary_size, max_sequence_length, preprocessor,
                 name=None, tunable=True):
        # Initialise the base HyperModel so its own state is set up.
        super().__init__(name=name, tunable=tunable)
        self.vocabulary_size = vocabulary_size
        self.max_sequence_length = max_sequence_length
        self.preprocessor = preprocessor

    def build(self, hp):
        """Sample hyperparameters from ``hp`` and return a compiled model.

        Args:
            hp: ``keras_tuner.HyperParameters`` instance to sample from.

        Returns:
            A compiled ``keras_nlp.models.RobertaMaskedLM``.
        """
        # NOTE(review): hidden_dim is not constrained to be a multiple of
        # num_heads -- confirm the encoder layers accept every combination.
        hidden_dim = hp.Int('hidden_dim', min_value=128, max_value=512, step=32)
        num_heads = hp.Int('num_heads', min_value=2, max_value=8, step=1)
        dropout_rate = hp.Float('dropout_rate', min_value=0.1, max_value=0.3, step=0.05)
        intermediate_dim = hp.Int('intermediate_dim', min_value=256, max_value=1024, step=64)
        num_layers = hp.Int('num_layers', min_value=2, max_value=6, step=1)
        learning_rate = hp.Float('learning_rate', min_value=1e-5, max_value=5e-5, sampling='log')

        backbone = keras_nlp.models.RobertaBackbone(
            vocabulary_size=self.vocabulary_size,
            num_layers=num_layers,
            num_heads=num_heads,
            hidden_dim=hidden_dim,
            intermediate_dim=intermediate_dim,
            # The backbone's keyword is `dropout` (not `dropout_rate`); passing
            # it here is what makes the sampled dropout value take effect.
            dropout=dropout_rate,
            max_sequence_length=self.max_sequence_length,
        )
        model = keras_nlp.models.RobertaMaskedLM(
            backbone=backbone,
            preprocessor=self.preprocessor,
        )
        model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
            # The masked-LM head emits logits and the labels are integer token
            # ids, so the sparse, from-logits loss is required.
            loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            # Weighted so that padded mask slots (sample weight 0) are ignored.
            # Named 'accuracy' so the tuner objective 'val_accuracy' resolves.
            weighted_metrics=[keras.metrics.SparseCategoricalAccuracy(name='accuracy')],
        )
        return model

In [28]:
# Random search over the RobertaHyperModel search space:
# 3 trials, each executed twice, ranked by validation accuracy.
tuner = keras_tuner.RandomSearch(
    RobertaHyperModel(
        vocabulary_size=36,
        max_sequence_length=90,
        preprocessor=preprocessor,
    ),
    objective="val_accuracy",
    max_trials=3,
    executions_per_trial=2,
    # Where results are stored.
    directory="my_dir",
    project_name="helloworld",
    overwrite=True,
)

In [29]:
# Show the hyperparameters registered by RobertaHyperModel.build and their ranges.
tuner.search_space_summary()

Search space summary
Default search space size: 6
hidden_dim (Int)
{'default': None, 'conditions': [], 'min_value': 128, 'max_value': 512, 'step': 32, 'sampling': 'linear'}
num_heads (Int)
{'default': None, 'conditions': [], 'min_value': 2, 'max_value': 8, 'step': 1, 'sampling': 'linear'}
dropout_rate (Float)
{'default': 0.1, 'conditions': [], 'min_value': 0.1, 'max_value': 0.3, 'step': 0.05, 'sampling': 'linear'}
intermediate_dim (Int)
{'default': None, 'conditions': [], 'min_value': 256, 'max_value': 1024, 'step': 64, 'sampling': 'linear'}
num_layers (Int)
{'default': None, 'conditions': [], 'min_value': 2, 'max_value': 6, 'step': 1, 'sampling': 'linear'}
learning_rate (Float)
{'default': 1e-05, 'conditions': [], 'min_value': 1e-05, 'max_value': 5e-05, 'step': None, 'sampling': 'log'}


In [None]:
#Create a class for the model using what we have above. It is important to put the model in a hypermodel class in order to be able to tune hyperparameters with keras-tuner. 
#Follow this tutorial until the Tune model training part: https://keras.io/guides/keras_tuner/getting_started/#tune-model-training
#The parameters to tune are the following:
#hidden dimension, number of attention heads, dropout, intermediate dimension, number of hidden layers, and the learning rate.
#The ranges to use are:
# hidden_dim: 128 to 512
# num_heads: 2 to 8
# dropout: 0.1 to 0.3
# intermediate_dim: 256 to 1024
# num_layers: 2 to 6
# learning_rate: 1e-5 to 5e-5