In [1]:



# --- Imports: kept together at the top of the cell, in their original order ---
import os
import tensorflow as tf
import pandas as pd
from tensorflow import keras
from proteinbert import (
    OutputType,
    OutputSpec,
    FinetuningModelGenerator,
    load_pretrained_model,
    finetune,
)
from proteinbert.conv_and_global_attention_model import get_model_with_hidden_layers_as_outputs

# --- Environment report: TensorFlow version and the GPUs visible to it ---
print(tf.__version__)
print(tf.config.list_physical_devices('GPU'))

# --- Configuration shared by the cells below ---
# Base name of the dataset CSV files (used as '<name>.train.csv' / '<name>.val.csv').
BENCHMARK_NAME = 'bass_pb40'

# Binary classification over the two labels 0 and 1.
UNIQUE_LABELS = [0, 1]
# NOTE(review): the original comment described this as "a local (non-global) binary output",
# but the first positional argument is False. In ProteinBERT that argument appears to be
# `is_seq`, which would make this a global (one label per sequence) output -- confirm.
OUTPUT_TYPE = OutputType(False, 'binary')
OUTPUT_SPEC = OutputSpec(OUTPUT_TYPE, UNIQUE_LABELS)

2.5.3
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
# Fine-tune one ProteinBERT classifier per numbered training split and save each
# trained model under ./models/<model_no>.
# NOTE: range(5, 7) trains splits 5 and 6 only (the upper bound is exclusive);
# change it to range(1, 7) to train every split from 1 to 6.

# Single source of truth for the encoded sequence length, used both for fine-tuning
# and for the exported model so the two can never drift apart.
SEQ_LEN = 42

for model_no in range(5, 7):
    BENCHMARKS_DIR = os.path.join('./data/training', str(model_no))

    # Loading the datasets; rows with missing values and exact duplicate rows are discarded.
    train_set_file_path = os.path.join(BENCHMARKS_DIR, '%s.train.csv' % BENCHMARK_NAME)
    valid_set_file_path = os.path.join(BENCHMARKS_DIR, '%s.val.csv' % BENCHMARK_NAME)
    train_set = pd.read_csv(train_set_file_path).dropna().drop_duplicates()
    valid_set = pd.read_csv(valid_set_file_path).dropna().drop_duplicates()

    print(f'{len(train_set)} training set records, {len(valid_set)} validation set records.')

    # Loading the pre-trained model and fine-tuning it on the loaded dataset.
    # This is done inside the loop so every split starts from a freshly loaded pretrained model.
    pretrained_model_generator, input_encoder = load_pretrained_model()

    # get_model_with_hidden_layers_as_outputs gives the model output access to the hidden layers (on top of the output)
    model_generator = FinetuningModelGenerator(
        pretrained_model_generator,
        OUTPUT_SPEC,
        pretraining_model_manipulation_function = get_model_with_hidden_layers_as_outputs,
        dropout_rate = 0.5,
    )

    training_callbacks = [
        # Multiply the learning rate by 0.25 after 1 epoch without improvement, down to 1e-05.
        keras.callbacks.ReduceLROnPlateau(patience = 1, factor = 0.25, min_lr = 1e-05, verbose = 1),
        # Stop after 3 epochs without improvement and roll back to the best weights seen.
        keras.callbacks.EarlyStopping(patience = 3, restore_best_weights = True),
        # Each model gets its own log sub-directory; a single shared './logs' would mix the
        # event files of every split into the same TensorBoard run.
        keras.callbacks.TensorBoard(
            log_dir = os.path.join('./logs', str(model_no)), histogram_freq = 1, update_freq = 100
        ),
    ]

    # NOTE(review): n_final_epochs = 0, so final_seq_len / final_lr presumably have no
    # effect here -- confirm against the ProteinBERT finetune implementation.
    finetune(
        model_generator, input_encoder, OUTPUT_SPEC,
        train_set['seq'], train_set['label'], valid_set['seq'], valid_set['label'],
        seq_len = SEQ_LEN, batch_size = 64, max_epochs_per_stage = 40, lr = 1e-04,
        begin_with_frozen_pretrained_layers = True, lr_with_frozen_pretrained_layers = 1e-02,
        n_final_epochs = 0, final_seq_len = 1024, final_lr = 5e-06,
        callbacks = training_callbacks,
    )

    # Build a model from the generator at the same sequence length used for fine-tuning
    # and write it to disk.
    # NOTE(review): this assumes finetune() leaves the trained weights in model_generator -- confirm.
    model = model_generator.create_model(seq_len = SEQ_LEN)

    model.save("./models/" + str(model_no))

365187 training set records, 73039 validation set records.
[2024_05_10-07:48:38] Training set: Filtered out 0 of 365187 (0.0%) records of lengths exceeding 40.
[2024_05_10-07:48:39] Validation set: Filtered out 0 of 73039 (0.0%) records of lengths exceeding 40.
[2024_05_10-07:48:40] Training the entire fine-tuned model...








Epoch 1/40
Epoch 2/40

Epoch 00002: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 3/40
Epoch 4/40

Epoch 00004: ReduceLROnPlateau reducing learning rate to 1e-05.
Epoch 5/40
Epoch 6/40




INFO:tensorflow:Assets written to: ./models/5\assets


INFO:tensorflow:Assets written to: ./models/5\assets


365194 training set records, 73032 validation set records.
[2024_05_10-08:10:11] Training set: Filtered out 0 of 365194 (0.0%) records of lengths exceeding 40.
[2024_05_10-08:10:13] Validation set: Filtered out 0 of 73032 (0.0%) records of lengths exceeding 40.
[2024_05_10-08:10:14] Training the entire fine-tuned model...








Epoch 1/40
Epoch 2/40
Epoch 3/40

Epoch 00003: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 4/40

Epoch 00004: ReduceLROnPlateau reducing learning rate to 1e-05.
Epoch 5/40




INFO:tensorflow:Assets written to: ./models/6\assets


INFO:tensorflow:Assets written to: ./models/6\assets
