In [2]:
#In the case of the roberta model there are several ways to derive
#the embeddings of the tokens. The most common way is to use the
#last layer of the model to get the embeddings. However, the last
#layer of the model is not always the best layer to get the embeddings.
#I recommend you to read this blog post https://towardsdatascience.com/nlp-extract-contextualized-word-embeddings-from-bert-keras-tf-67ef29f60a7b
#where they showcase that the best way to get embeddings from BERT
#models is to use the concatenation of the last four layers.
#This is very big, so instead we will use the sum of the last four layers
#which is the second best way to get embeddings from BERT models.

In [3]:
#First I will copy some thing we need from the ROBERTA notebook
import keras_nlp
import tensorflow
import keras_tuner
import keras
import pandas as pd
from keras import layers
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential, load_model, Model
import numpy as np

2024-09-19 20:55:05.374273: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-19 20:55:05.376009: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-19 20:55:05.384843: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-19 20:55:05.402025: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-19 20:55:05.420204: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been 

In [4]:
#Fixed parameters
PROCESSING_FIXED = {'start_char': "<s>",
                    'end_char': "</s>", 
                    'pad_char': "<pad>",
                    'mas_char': "<mask>"}

INDICES_TOKEN = {0: 'c',
                 1: 'C',
                 2: '(',
                 3: ')',
                 4: 'O',
                 5: '1',
                 6: '2',
                 7: '=',
                 8: 'N',
                 9: '@',
                 10: '[',
                 11: ']',
                 12: 'n',
                 13: '3',
                 14: 'H',
                 15: 'F',
                 16: '4',
                 17: '-',
                 18: 'S',
                 19: 'Cl',
                 20: '/',
                 21: 's',
                 22: 'o',
                 23: '5',
                 24: '+',
                 25: '#',
                 26: '\\',
                 27: 'Br',
                 28: 'P',
                 29: '6',
                 30: 'I',
                 31: '7',
                 32: PROCESSING_FIXED['start_char'],
                 33: PROCESSING_FIXED['end_char'],
                 34: PROCESSING_FIXED['pad_char'],
                 35: PROCESSING_FIXED['mas_char']}                
TOKEN_INDICES = {v: k for k, v in INDICES_TOKEN.items()}



In [5]:
#We need a preprocessor that won't mask the tokens!
tokenizer = keras_nlp.models.RobertaTokenizer(vocabulary=TOKEN_INDICES, merges=['C l', 'B r'])

preprocessor = keras_nlp.models.RobertaPreprocessor(
    tokenizer,
    sequence_length=90
)

I0000 00:00:1726793707.986421   33594 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-09-19 20:55:07.986703: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2343] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [7]:
#Let's load the best model from the ROBERTA pretraining (epoch 58/60)
chem_model = load_model(f'../pretraining/best_roberta.keras')
chem_model.save_to_preset('roberta_pretrained')


  instance.compile_from_config(compile_config)


In [8]:
#We will use the backbone of the model to get the embeddings check https://keras.io/api/keras_nlp/base_classes/backbone/#backbone-class
pretrained = chem_model.backbone

In [9]:
pretrained.summary(expand_nested=True)

In [10]:
#Treat the data in a similar way as in the fine-tuning notebook
beta = pd.read_csv('beta_activity_class.csv') #Clean CSV file with beta secretase smiles and activity
beta["activity_class"].value_counts()
#dropna of activity_class
beta = beta.dropna(subset=["activity_class"])
#transfor activity_class to 0,1,2
beta["activity_class"] = beta["activity_class"].replace("moderately_active", 1)
beta["activity_class"] = beta["activity_class"].replace("inactive", 0)
beta["activity_class"] = beta["activity_class"].replace("very_active", 2)
#Also, remove any smiles string that contains a character NOT in our vocabulary (excluding pad, start and end chars). Hint: allowed_chars = [t for t in TOKEN_INDICES.keys()][:-3]
allowed_chars = [t for t in TOKEN_INDICES.keys()][:-3]
beta = beta[beta['Ligand SMILES'].apply(lambda x: all(char in allowed_chars for char in x))]
#drop data longer than 90 characters
beta = beta[beta['Ligand SMILES'].apply(lambda x: len(x)<=90)]


  beta["activity_class"] = beta["activity_class"].replace("very_active", 2)


In [11]:
#Generate the token inputs by passing the SMILES strings through the preprocessor see https://keras.io/api/keras_nlp/models/roberta/roberta_preprocessor/
token_input = preprocessor(beta['Ligand SMILES'])

2024-09-19 20:56:18.145604: E tensorflow/core/util/util.cc:131] oneDNN supports DT_INT64 only on platforms with AVX-512. Falling back to the default Eigen-based implementation if present.


In [12]:
#To get the last four layers we will use the following code, this creates a model that takes the input of the pretrained model and outputs the last four layers
embedder = Model(inputs=pretrained.inputs, outputs=[pret_layers.output for pret_layers in pretrained.layers[-4:]])

In [215]:
#Then we need to pass the token inputs through the embedder model to get the embeddings
embedder

In [219]:
#The results will have the shape (4, samples, sequence_length, 512) where 512 is the hidden size of the model and 4 is the number of layers we are using
#We need to sum the results along the first axis to get the embeddings of the tokens of each sample

In [None]:
#Then we need to generate a simple classification model that takes the embeddings and outputs the class of activity, similar to the fine-tuning notebook