In [None]:
#For the RoBERTa model there are several ways to derive
#the embeddings of the tokens. The most common way is to use the
#last layer of the model to get the embeddings. However, the last
#layer of the model is not always the best layer to get the embeddings.
#I recommend reading this blog post: https://towardsdatascience.com/nlp-extract-contextualized-word-embeddings-from-bert-keras-tf-67ef29f60a7b
#which shows that the best way to get embeddings from BERT
#models is to use the concatenation of the last four layers.
#The concatenated vector is very large (four times the hidden size), so instead we will use the sum of the last four layers,
#which is the second best way to get embeddings from BERT models.

In [1]:
#First I will copy some things we need from the ROBERTA notebook
import keras_nlp
import tensorflow
import keras_tuner
import keras
import pandas as pd
from keras import layers
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential, load_model, Model
import numpy as np

2024-09-17 14:22:28.530571: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-17 14:22:28.652883: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-17 14:22:28.693763: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-17 14:22:28.923558: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Fixed parameters
# Special tokens: sequence start, sequence end, padding and mask.
PROCESSING_FIXED = dict(
    start_char="<s>",
    end_char="</s>",
    pad_char="<pad>",
    mas_char="<mask>",
)

# Index -> token map. The position of each entry in the list below is its
# index: 0-31 are the SMILES tokens, 32-35 are the special tokens above.
INDICES_TOKEN = dict(enumerate([
    'c', 'C', '(', ')', 'O', '1', '2', '=',      # 0-7
    'N', '@', '[', ']', 'n', '3', 'H', 'F',      # 8-15
    '4', '-', 'S', 'Cl', '/', 's', 'o', '5',     # 16-23
    '+', '#', '\\', 'Br', 'P', '6', 'I', '7',    # 24-31
    PROCESSING_FIXED['start_char'],              # 32
    PROCESSING_FIXED['end_char'],                # 33
    PROCESSING_FIXED['pad_char'],                # 34
    PROCESSING_FIXED['mas_char'],                # 35
]))

# Inverse map: token -> index.
TOKEN_INDICES = {token: index for index, token in INDICES_TOKEN.items()}

In [154]:
# Tokenizer + preprocessor used for embedding extraction.
# We need a preprocessor that won't mask the tokens, so the plain
# RobertaPreprocessor is used here.
tokenizer = keras_nlp.models.RobertaTokenizer(
    vocabulary=TOKEN_INDICES,
    merges=['C l', 'B r'],  # merge rules matching the 'Cl' and 'Br' vocabulary entries
)

# Every input is processed to a fixed sequence length of 90.
preprocessor = keras_nlp.models.RobertaPreprocessor(tokenizer=tokenizer, sequence_length=90)

In [19]:
# Load the best model from the ROBERTA pretraining (epoch 58/60) and export
# it as a preset named 'roberta_pretrained'.
# The path is a plain string: the original f-string had no placeholders.
chem_model = load_model('../pretraining/ROBERTA/058.keras')
chem_model.save_to_preset('roberta_pretrained')


  instance.compile_from_config(compile_config)
  saveable.load_own_variables(weights_store.get(inner_path))


In [27]:
# We will use the backbone of the model to get the embeddings.
# See https://keras.io/api/keras_nlp/base_classes/backbone/#backbone-class
pretrained = chem_model.backbone

In [166]:
# Print the backbone's layer summary with nested sub-models expanded, so the
# layers available for embedding extraction can be inspected.
pretrained.summary(expand_nested=True)

In [None]:
#Treat the data in a similar way as in the fine-tuning notebook


In [192]:
#Generate the token inputs by passing the SMILES strings through the preprocessor see https://keras.io/api/keras_nlp/models/roberta/roberta_preprocessor/

In [207]:
# To get the last four layers, wrap the backbone in a new Model that shares
# the backbone's inputs and exposes the output of each of its last four
# layers (one output tensor per layer).
# NOTE(review): assumes the final four entries of pretrained.layers are the
# layers whose outputs we want — confirm against pretrained.summary().
embedder = Model(
    inputs=pretrained.inputs,
    outputs=[layer.output for layer in pretrained.layers[-4:]],
)

In [215]:
#Then we need to pass the token inputs through the embedder model to get the embeddings

In [219]:
#The embedder returns one array per layer; stacked together the results have the shape (4, samples, sequence_length, 512), where 512 is the hidden size of the model and 4 is the number of layers we are using
#We need to sum the results along the first axis (the layer axis) to get the embeddings of the tokens of each sample, with shape (samples, sequence_length, 512)

In [None]:
#Then we need to generate a simple classification model that takes the embeddings and outputs the class of activity, similar to the fine-tuning notebook